Skip to content

Commit 6cdfa9b

Browse files
authored
Merge pull request #49 from AdaWorldAPI/claude/transcode-deepnsm-rust-oNa1Z
data: Llama 4 Scout BF16 shard 5 → bgz17 (18.2 GB → 7.7 MB, 4735×) Streamed from HuggingFace via HTTP range reader. Zero disk for source. MoE expert FFN: 15,420× compression. Shared expert: 964×. Attention: 2,162×. Full model estimate: ~215 GB BF16 → ~40 MB bgz7. https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7
2 parents a97d162 + 92cde14 commit 6cdfa9b

5 files changed

Lines changed: 400 additions & 6 deletions

File tree

src/hpc/gguf.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -215,12 +215,17 @@ pub fn read_tensor_f32<R: Read + Seek>(
215215
GgmlType::BF16 => {
216216
let mut buf = vec![0u8; n_elements * 2];
217217
reader.read_exact(&mut buf).map_err(|e| e.to_string())?;
218-
Ok(buf.chunks_exact(2)
219-
.map(|c| {
220-
let bits = u16::from_le_bytes([c[0], c[1]]);
221-
bf16_to_f32(bits)
222-
})
223-
.collect())
218+
// Reinterpret u8 pairs as BF16 (same repr) and batch-convert via quantized.rs
219+
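// SAFETY: BF16 is #[repr(transparent)] over u16, same layout as [u8; 2] LE pairs.
// NOTE(review): `buf` is a Vec<u8>, which only guarantees 1-byte alignment, while
// `slice::from_raw_parts` requires the pointer to be aligned for the element type
// (2 bytes if BF16 wraps u16) — confirm alignment is actually ensured here.
// NOTE(review): this reinterpretation is native-endian, whereas the removed code decoded
// with `u16::from_le_bytes`; the two only agree on little-endian targets — verify.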
220+
let bf16_slice: &[super::quantized::BF16] = unsafe {
221+
std::slice::from_raw_parts(
222+
buf.as_ptr() as *const super::quantized::BF16,
223+
n_elements,
224+
)
225+
};
226+
let mut result = vec![0.0f32; n_elements];
227+
super::quantized::bf16_to_f32_slice(bf16_slice, &mut result);
228+
Ok(result)
224229
}
225230
GgmlType::Q8_0 => {
226231
dequantize_q8_0(reader, n_elements)

src/hpc/gguf_indexer.rs

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,4 +574,126 @@ mod tests {
574574
assert!(stats.tensors_indexed > 0, "should index at least some tensors");
575575
assert!(stats.overall_ratio() > 10.0, "ratio should be significant: {:.1}", stats.overall_ratio());
576576
}
577+
578+
/// Network-dependent smoke test: resolves a GGUF file on HuggingFace, streams it
/// through `stream_index_gguf` via an `HttpRangeReader`, writes the result to
/// `/tmp/llama4_scout.bgz7`, and prints per-tensor and per-type statistics.
///
/// If `resolve_hf_url` fails, the test prints `SKIP: …` and returns without failing.
/// The only assertion is that at least one tensor was indexed.
#[test]
579+
#[ignore] // Streams from HuggingFace — requires network + time
580+
fn test_stream_index_llama4_scout_from_hf() {
581+
use super::super::http_reader::{HttpRangeReader, resolve_hf_url};
582+
use std::io::BufWriter;
583+
584+
let repo = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF";
585+
let filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf";
586+
587+
eprintln!("Resolving {} / {} ...", repo, filename);
588+
// Resolution failure is treated as a skip (early return), not a test failure.
let (url, size) = match resolve_hf_url(repo, filename) {
589+
Ok(r) => r,
590+
Err(e) => { eprintln!("SKIP: {}", e); return; }
591+
};
592+
eprintln!(" URL resolved, size: {:.2} GB", size as f64 / 1e9);
593+
594+
let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024); // 256 MiB chunks (assuming the chunk-size argument is in bytes)
595+
596+
let out_path = "/tmp/llama4_scout.bgz7";
597+
let out = std::fs::File::create(out_path).expect("create output");
598+
let mut writer = BufWriter::new(out);
599+
600+
eprintln!("Streaming index...");
601+
// The callback logs one line per tensor: name, layer type, original size,
// compressed size, and their ratio (0.0 when the compressed size is zero).
let stats = stream_index_gguf(
602+
&mut reader,
603+
&mut writer,
604+
Some(&|name, layer_type, orig, comp| {
605+
let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
606+
eprintln!(" {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
607+
name, layer_type, orig, comp, ratio);
608+
}),
609+
).expect("stream_index_gguf");
610+
611+
// Drop the BufWriter before reading the file's metadata so buffered bytes are
// flushed to disk first; a metadata error is reported as size 0 rather than failing.
drop(writer);
612+
let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);
613+
614+
eprintln!();
615+
eprintln!("=== Llama 4 Scout → bgz17 (streamed from HF) ===");
616+
eprintln!(" Source: {:.2} GB ({})", size as f64 / 1e9, filename);
617+
eprintln!(" Output: {:.2} MB ({})", out_size as f64 / 1e6, out_path);
618+
eprintln!(" Downloaded: {:.2} GB", reader.bytes_downloaded() as f64 / 1e9);
619+
eprintln!(" Tensors: {} indexed, {} skipped",
620+
stats.tensors_indexed, stats.tensors_skipped);
621+
eprintln!(" Original (f32): {:.2} GB", stats.original_bytes as f64 / 1e9);
622+
eprintln!(" Compressed: {:.2} MB", stats.compressed_bytes as f64 / 1e6);
623+
eprintln!(" Ratio: {:.1}×", stats.overall_ratio());
624+
eprintln!(" Peak tensor: {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);
625+
626+
// NOTE(review): indexes `stats.by_type` with 0..6, so it must hold at least six
// entries; presumably they are ordered as in `type_names` — confirm against the
// stats definition.
let type_names = ["Attention", "FeedForward", "Conv2D", "Norm", "Embedding", "Skip"];
627+
for (i, name) in type_names.iter().enumerate() {
628+
let (count, orig, comp) = stats.by_type[i];
629+
if count > 0 {
630+
let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
631+
eprintln!(" {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.1}×)",
632+
name, count, orig as f64 / 1e9, comp as f64 / 1e6, ratio);
633+
}
634+
}
635+
636+
assert!(stats.tensors_indexed > 0);
637+
}
638+
639+
/// Network-dependent smoke test: streams the fifth BF16 GGUF shard from a
/// hard-coded HuggingFace URL through `stream_index_gguf`, writes the result to
/// `/tmp/llama4_scout_shard5.bgz7`, and prints per-tensor and per-type statistics.
///
/// Unlike the URL-resolving variant, the file size is a hard-coded constant and
/// the URL is built directly with `format!`. The assertions only check that at
/// least one tensor was indexed and that `original_bytes` is non-zero.
#[test]
640+
#[ignore] // Streams BF16 shard 5 (18.2 GB) from HuggingFace
641+
fn test_stream_index_llama4_bf16_shard5() {
642+
use super::super::http_reader::HttpRangeReader;
643+
use std::io::BufWriter;
644+
645+
let repo = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF";
646+
let filename = "BF16/Llama-4-Scout-17B-16E-Instruct-BF16-00005-of-00005.gguf";
647+
// NOTE(review): this is a rounded, hard-coded size rather than one obtained from
// the server — confirm HttpRangeReader tolerates a size that is not exact.
let size: u64 = 18_220_000_000; // ~18.2 GB from metadata
648+
649+
let url = format!("https://huggingface.co/{}/resolve/main/{}", repo, filename);
650+
eprintln!("Streaming shard 5: {:.2} GB", size as f64 / 1e9);
651+
eprintln!(" URL: {}", url);
652+
653+
// 256 MiB chunks (assuming the chunk-size argument is in bytes) for fewer HTTP round-trips
654+
let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);
655+
656+
let out_path = "/tmp/llama4_scout_shard5.bgz7";
657+
let out = std::fs::File::create(out_path).expect("create output");
658+
let mut writer = BufWriter::new(out);
659+
660+
// The callback logs one line per tensor: name, layer type, original size,
// compressed size, and their ratio (0.0 when the compressed size is zero).
let stats = stream_index_gguf(
661+
&mut reader,
662+
&mut writer,
663+
Some(&|name, layer_type, orig, comp| {
664+
let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
665+
eprintln!(" {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
666+
name, layer_type, orig, comp, ratio);
667+
}),
668+
).expect("stream_index_gguf");
669+
670+
// Drop the BufWriter before reading the file's metadata so buffered bytes are
// flushed to disk first; a metadata error is reported as size 0 rather than failing.
drop(writer);
671+
let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);
672+
673+
eprintln!();
674+
eprintln!("=== Llama 4 Scout BF16 Shard 5 → bgz17 ===");
675+
eprintln!(" Source: {:.2} GB (BF16, streamed from HF)", size as f64 / 1e9);
676+
eprintln!(" Output: {:.2} MB", out_size as f64 / 1e6);
677+
eprintln!(" Downloaded: {:.2} GB", reader.bytes_downloaded() as f64 / 1e9);
678+
eprintln!(" Tensors: {} indexed, {} skipped",
679+
stats.tensors_indexed, stats.tensors_skipped);
680+
eprintln!(" Original (f32): {:.2} GB", stats.original_bytes as f64 / 1e9);
681+
eprintln!(" Compressed: {:.2} MB", stats.compressed_bytes as f64 / 1e6);
682+
eprintln!(" Ratio: {:.1}×", stats.overall_ratio());
683+
eprintln!(" Peak tensor: {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);
684+
685+
// NOTE(review): indexes `stats.by_type` with 0..6, so it must hold at least six
// entries; presumably they are ordered as in `type_names` — confirm against the
// stats definition.
let type_names = ["Attention", "FeedForward", "Conv2D", "Norm", "Embedding", "Skip"];
686+
for (i, name) in type_names.iter().enumerate() {
687+
let (count, orig, comp) = stats.by_type[i];
688+
if count > 0 {
689+
let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
690+
eprintln!(" {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.1}×)",
691+
name, count, orig as f64 / 1e9, comp as f64 / 1e6, ratio);
692+
}
693+
}
694+
695+
assert!(stats.tensors_indexed > 0);
696+
// BF16 dequant to f32 doubles the size, so original_bytes is expected to exceed the source size.
// NOTE(review): the assertion below only checks that original_bytes is non-zero; it does
// not compare against `size`.
697+
assert!(stats.original_bytes > 0);
698+
}
577699
}

0 commit comments

Comments
 (0)