Skip to content

Commit dd7fa49

Browse files
committed
feat: index Reader LM 1.5B + BGE-M3 for OSINT pipeline
jinaai/reader-lm-1.5b (safetensors, 1 shard, 3.1 GB): HTML→Markdown local model. No Jina API needed. bgz7 → palette → O(1) HTML structure recognition. CompendiumLabs/bge-m3-gguf (GGUF F16, ~1.2 GB): Multilingual embedding model. Replaces DeepNSM for non-English. bgz7 → palette → O(1) semantic similarity. Together: Reader LM reads the web, BGE-M3 embeds it, AriGraph stores it as SPO triplets, AutocompleteCache routes it at 17K tok/sec. https://claude.ai/code/session_01M3at4EuHVvQ8S95mSnKgtK
1 parent 2dd417a commit dd7fa49

1 file changed

Lines changed: 65 additions & 0 deletions

File tree

src/hpc/safetensors.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,4 +586,69 @@ mod tests {
586586
eprintln!(" → These shifts = what 'visual grounding' looks like in LLM weight space");
587587
}
588588
}
589+
590+
// ── Reader LM 1.5B: HTML→Markdown local model ──
591+
592+
#[test]
#[ignore] // Streams ~3.1 GB from HuggingFace
fn test_stream_index_reader_lm() {
    // Model under test: jinaai/reader-lm-1.5b, published as a single
    // safetensors shard (1.54B params, 3.1 GB in BF16). The resulting bgz7
    // index is expected to be ~30 MB and is used for local HTML→Markdown
    // palette routing.
    let repo = "jinaai/reader-lm-1.5b";
    let out_dir = "/tmp/reader_lm_1_5b";

    // NOTE(review): the trailing `16` is forwarded unchanged to the shared
    // helper; its meaning is defined there, not visible from this test.
    index_safetensors_shards(repo, &["model.safetensors"], out_dir, 16);
}
604+
605+
// ── BGE-M3: multilingual embedding model (GGUF path) ──
606+
607+
/// Streams the BGE-M3 F16 GGUF from HuggingFace and writes its bgz7 index to
/// `/tmp/bge_m3_f16.bgz7`.
///
/// The index is first written to a `.partial` sibling file and only renamed
/// to its final path after indexing and flushing succeed. This keeps the
/// "skip if exists" guard honest: a run that panics halfway no longer leaves
/// a truncated file that makes every later run skip.
///
/// # Panics
///
/// Panics if the output file cannot be created, indexing fails, the buffered
/// output cannot be flushed, or the final rename fails.
#[test]
#[ignore] // Streams ~1.2 GB GGUF from HuggingFace
fn test_stream_index_bge_m3() {
    use super::super::http_reader::HttpRangeReader;
    use std::io::{BufWriter, Write};

    /// Size assumed when the HEAD probe yields no usable `Content-Length`.
    const FALLBACK_SIZE: u64 = 1_500_000_000;

    /// Returns the value of the last `Content-Length` header in `headers`.
    ///
    /// `curl -L` prints one header block per redirect hop, so the last
    /// occurrence is the one belonging to the final response. Returns `None`
    /// when no such header exists or its value is not a valid `u64`.
    fn last_content_length(headers: &str) -> Option<u64> {
        headers
            .lines()
            .rev()
            .filter_map(|line| line.split_once(':'))
            .find(|(name, _)| name.eq_ignore_ascii_case("content-length"))
            .and_then(|(_, value)| value.trim().parse().ok())
    }

    let url = "https://huggingface.co/CompendiumLabs/bge-m3-gguf/resolve/main/bge-m3-f16.gguf";
    let out_path = "/tmp/bge_m3_f16.bgz7";
    let partial_path = format!("{}.partial", out_path);

    if std::fs::metadata(out_path).is_ok() {
        eprintln!("SKIP {} (exists)", out_path);
        return;
    }

    // HEAD for size. Best effort: if curl is missing or the header cannot be
    // parsed we fall back to a rough estimate, but say so instead of hiding it.
    let probed_size = std::process::Command::new("curl")
        .args(&["-sI", "-L", url])
        .output()
        .ok()
        .and_then(|o| last_content_length(&String::from_utf8_lossy(&o.stdout)));
    let size: u64 = probed_size.unwrap_or_else(|| {
        eprintln!(
            "WARN could not determine Content-Length for {}; assuming {} bytes",
            url, FALLBACK_SIZE
        );
        FALLBACK_SIZE
    });

    eprintln!("Indexing BGE-M3 F16 GGUF ({:.1} GB)...", size as f64 / 1e9);
    let mut reader = HttpRangeReader::with_chunk_size(url.to_string(), size, 256 * 1024 * 1024);
    // Write to the partial path; `File::create` truncates any leftover from a
    // previous failed run.
    let out = std::fs::File::create(&partial_path).expect("create output");
    let mut writer = BufWriter::new(out);

    // NOTE(review): the `16` is forwarded unchanged to the indexer; its
    // meaning is defined there, not visible from this test.
    let stats = super::super::gguf_indexer::stream_index_gguf_bf16(
        &mut reader,
        &mut writer,
        16,
        Some(&|name, _lt, orig, comp| {
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:50} {:>12} → {:>8} ({:.0}×)", name, orig, comp, ratio);
        }),
    )
    .expect("GGUF indexing failed");

    // Dropping a `BufWriter` flushes but swallows errors; flush explicitly so
    // a short write is reported rather than producing a silently corrupt index.
    writer.flush().expect("flush output");
    drop(writer);
    // Publish the finished index under its final name only now.
    std::fs::rename(&partial_path, out_path).expect("rename output into place");

    let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);
    eprintln!(
        " → {:.2} MB, {} tensors, {:.0}×",
        out_size as f64 / 1e6,
        stats.tensors_indexed,
        stats.overall_ratio()
    );
}
589654
}

0 commit comments

Comments
 (0)