Skip to content

Commit b4ef4d8

Browse files
committed
rebase Scout shard runner to BF16-direct + F64x8 SIMD path
run_llama4_shard() now uses stream_index_gguf_bf16() instead of stream_index_gguf().

Changes:
- BF16-direct: no f32 intermediate allocation (saves 283 MB/tensor)
- F64x8 SIMD: 8 rows projected in parallel per zmm register
- Strided octave (stride=16): 97% fewer BF16→f64 conversions
- Halftone drop: 9 of 17 golden positions, odd bins interpolated
- Exact shard sizes: SCOUT_SHARD_SIZES const replaces 44 GB estimate
- Reusable u16 buffer inside indexer (no per-tensor alloc)

Both Scout shard tests and Maverick test now use the same BF16-direct pipeline. The old f32 path remains for non-BF16 formats (IQ1_S, Q8_0, etc.).
1 parent a993794 commit b4ef4d8

1 file changed

Lines changed: 22 additions & 9 deletions

File tree

src/hpc/gguf_indexer.rs

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,8 +1126,19 @@ mod tests {
11261126
assert!(stats.tensors_indexed > 0);
11271127
}
11281128

1129-
/// Run one shard of Llama 4 Scout BF16 through the streaming indexer.
1130-
/// Returns the output path on success.
1129+
/// Exact Scout BF16 shard sizes in bytes, indexed by `shard - 1` (verified via HuggingFace HEAD).
1130+
const SCOUT_SHARD_SIZES: [u64; 5] = [
1131+
48_940_000_000, // shard 1: layers 0-10 + embeddings
1132+
49_960_000_000, // shard 2: layers 11-21
1133+
48_660_000_000, // shard 3: layers 22-32
1134+
49_790_000_000, // shard 4: layers 33-43
1135+
18_220_000_000, // shard 5: layers 44-47 + output
1136+
];
1137+
1138+
/// Run one shard of Llama 4 Scout BF16 through the BF16-direct indexer.
1139+
///
1140+
/// Uses stream_index_gguf_bf16 with F64x8 SIMD and strided octave sampling.
1141+
/// No f32 intermediate allocation. Reusable u16 buffer inside the indexer.
11311142
fn run_llama4_shard(shard: u32) -> Option<(String, IndexStats)> {
11321143
use super::super::http_reader::HttpRangeReader;
11331144
use std::io::BufWriter;
@@ -1136,41 +1147,43 @@ mod tests {
11361147
let filename = format!(
11371148
"BF16/Llama-4-Scout-17B-16E-Instruct-BF16-{:05}-of-00005.gguf", shard
11381149
);
1139-
// Shards are ~18-44 GB each; use conservative 44 GB estimate
1140-
let size: u64 = 44_000_000_000;
1150+
let size = SCOUT_SHARD_SIZES[(shard - 1) as usize];
1151+
let octave_stride: usize = 16; // 4 octaves higher + halftone drop
11411152

11421153
let url = format!("https://huggingface.co/{}/resolve/main/{}", repo, filename);
1143-
eprintln!("Streaming shard {}/5: {}", shard, filename);
1154+
eprintln!("Streaming shard {}/5: {} ({:.2} GB)", shard, filename, size as f64 / 1e9);
1155+
eprintln!(" BF16-direct, octave_stride={}, F64x8 SIMD", octave_stride);
11441156

11451157
let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);
11461158

11471159
let out_path = format!("/tmp/llama4_scout_shard{}.bgz7", shard);
11481160
let out = std::fs::File::create(&out_path).expect("create output");
11491161
let mut writer = BufWriter::new(out);
11501162

1151-
let stats = stream_index_gguf(
1163+
let stats = stream_index_gguf_bf16(
11521164
&mut reader,
11531165
&mut writer,
1166+
octave_stride,
11541167
Some(&|name, layer_type, orig, comp| {
11551168
let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
11561169
eprintln!(" {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
11571170
name, layer_type, orig, comp, ratio);
11581171
}),
1159-
).expect("stream_index_gguf");
1172+
).expect("stream_index_gguf_bf16");
11601173

11611174
drop(writer);
11621175
let out_size = std::fs::metadata(&out_path).map(|m| m.len()).unwrap_or(0);
11631176

11641177
eprintln!();
1165-
eprintln!("=== Llama 4 Scout BF16 Shard {}/5 → bgz17 ===", shard);
1178+
eprintln!("=== Llama 4 Scout BF16 Shard {}/5 → bgz17 (BF16-direct) ===", shard);
11661179
eprintln!(" Output: {:.2} MB ({})", out_size as f64 / 1e6, out_path);
11671180
eprintln!(" Downloaded: {:.2} GB", reader.bytes_downloaded() as f64 / 1e9);
11681181
eprintln!(" Tensors: {} indexed, {} skipped",
11691182
stats.tensors_indexed, stats.tensors_skipped);
11701183
eprintln!(" Original (f32): {:.2} GB", stats.original_bytes as f64 / 1e9);
11711184
eprintln!(" Compressed: {:.2} MB", stats.compressed_bytes as f64 / 1e6);
11721185
eprintln!(" Ratio: {:.1}×", stats.overall_ratio());
1173-
eprintln!(" Peak tensor: {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);
1186+
eprintln!(" Peak buf (BF16): {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);
11741187

11751188
let type_names = ["Attention", "FeedForward", "Conv2D", "Norm", "Embedding", "Skip"];
11761189
for (i, name) in type_names.iter().enumerate() {

0 commit comments

Comments
 (0)