@@ -586,4 +586,69 @@ mod tests {
586586 eprintln ! ( " → These shifts = what 'visual grounding' looks like in LLM weight space" ) ;
587587 }
588588 }
589+
590+ // ── Reader LM 1.5B: HTML→Markdown local model ──
591+
/// Streams the single-shard `jinaai/reader-lm-1.5b` safetensors checkpoint
/// through `index_safetensors_shards`, targeting `/tmp/reader_lm_1_5b`.
///
/// Ignored by default because it pulls the full checkpoint over the network.
#[test]
#[ignore] // Streams ~3.1 GB from HuggingFace
fn test_stream_index_reader_lm() {
    // jinaai/reader-lm-1.5b: 1 shard, 1.54B params, 3.1 GB BF16
    // Produces ~30 MB bgz7 for local HTML→Markdown palette routing
    const REPO: &str = "jinaai/reader-lm-1.5b";
    const SHARDS: [&str; 1] = ["model.safetensors"];
    const OUT_PREFIX: &str = "/tmp/reader_lm_1_5b";

    // NOTE(review): the trailing `16` is forwarded unchanged; its meaning is
    // defined by `index_safetensors_shards`, which is not visible here.
    index_safetensors_shards(REPO, &SHARDS, OUT_PREFIX, 16);
}
604+
605+ // ── BGE-M3: multilingual embedding model (GGUF path) ──
606+
/// Streams the BGE-M3 F16 GGUF file over HTTP range requests and indexes it
/// into `/tmp/bge_m3_f16.bgz7`.
///
/// The run is skipped when the output file already exists. Output is first
/// written to a `.partial` sibling and renamed into place only after the
/// indexer has succeeded and the buffer has been flushed, so an aborted run
/// cannot leave a truncated file that later runs would mistake for a finished
/// index.
///
/// The remote size is probed with `curl -sI -L`; if that fails, a 1.5 GB guess
/// is used and a warning is printed.
///
/// # Panics
/// Panics if the output file cannot be created, flushed or renamed, or if the
/// GGUF indexer returns an error.
#[test]
#[ignore] // Streams ~1.2 GB GGUF from HuggingFace
fn test_stream_index_bge_m3() {
    use super::super::http_reader::HttpRangeReader;
    use std::io::{BufWriter, Write};

    // Size assumed when the HEAD probe yields no usable Content-Length.
    const FALLBACK_SIZE: u64 = 1_500_000_000;

    let url = "https://huggingface.co/CompendiumLabs/bge-m3-gguf/resolve/main/bge-m3-f16.gguf";
    let out_path = "/tmp/bge_m3_f16.bgz7";
    // Same directory as `out_path`, so the final rename stays on one filesystem.
    let tmp_path = format!("{}.partial", out_path);

    if std::fs::metadata(out_path).is_ok() {
        eprintln!("SKIP {} (exists)", out_path);
        return;
    }

    // HEAD for size. `-L` follows redirects, so several header blocks may be
    // printed; the last Content-Length belongs to the final response.
    let probed: Option<u64> = std::process::Command::new("curl")
        .args(&["-sI", "-L", url])
        .output()
        .ok()
        .and_then(|o| {
            String::from_utf8_lossy(&o.stdout)
                .lines()
                .rev()
                .find(|l| l.to_lowercase().starts_with("content-length:"))
                .and_then(|l| l.split_once(':'))
                .and_then(|(_, v)| v.trim().parse().ok())
        });
    let size = probed.unwrap_or_else(|| {
        eprintln!(
            "WARN could not determine Content-Length for {}; assuming {} bytes",
            url, FALLBACK_SIZE
        );
        FALLBACK_SIZE
    });

    eprintln!("Indexing BGE-M3 F16 GGUF ({:.1} GB)...", size as f64 / 1e9);
    let mut reader = HttpRangeReader::with_chunk_size(url.to_string(), size, 256 * 1024 * 1024);
    let out = std::fs::File::create(&tmp_path).expect("create output");
    let mut writer = BufWriter::new(out);

    // NOTE(review): the `16` is forwarded unchanged; its meaning is defined by
    // `stream_index_gguf_bf16`, which is not visible here.
    let stats = super::super::gguf_indexer::stream_index_gguf_bf16(
        &mut reader,
        &mut writer,
        16,
        Some(&|name, _lt, orig, comp| {
            // Guard against division by zero when nothing was emitted for a tensor.
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:50} {:>12} → {:>8} ({:.0}×)", name, orig, comp, ratio);
        }),
    )
    .expect("GGUF indexing failed");

    // `BufWriter`'s Drop discards flush errors, so flush explicitly before
    // publishing the file under its final name.
    writer.flush().expect("flush output");
    drop(writer);
    std::fs::rename(&tmp_path, out_path).expect("rename output into place");

    let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);
    eprintln!(
        " → {:.2} MB, {} tensors, {:.0}×",
        out_size as f64 / 1e6,
        stats.tensors_indexed,
        stats.overall_ratio()
    );
}
589654}
0 commit comments