Fix rebase duplicates, clean BF16-direct implementation

claude · claude · commit 8d2d37205c44 · 2026-03-30T07:16:04.000Z
https://claude.ai/code/session_01HmdXNPit7QsTCfhJFef3Ee
diff --git a/src/hpc/gguf_indexer.rs b/src/hpc/gguf_indexer.rs
@@ -630,329 +630,6 @@ pub fn stream_index_gguf<R: Read + Seek, W: Write>(
     Ok(stats)
 }
 
-// ============================================================================
-// BF16-DIRECT OPTIMIZATIONS
-// ============================================================================
-//
-// Skip f32 intermediate entirely for BF16 tensors:
-//   Old: alloc Vec<u8> + alloc Vec<f32> + batch dequant + project  (424 MB peak)
-//   New: alloc Vec<u16> (reused) + inline BF16→f64 at sample sites (141 MB peak)
-//   CPU: 97% fewer BF16→f64 conversions with octave stride + halftone drop
-// ============================================================================
-
-/// Halftone-dropped golden positions: every other step from GOLDEN_POS.
-/// 9 positions, still well-distributed across 0..16.
-const HALFTONE_POS: [u8; 9] = {
-    let mut t = [0u8; 9];
-    let mut i = 0;
-    let mut j = 0;
-    while i < BASE_DIM {
-        if i % 2 == 0 {
-            t[j] = ((i * GOLDEN_STEP) % BASE_DIM) as u8;
-            j += 1;
-        }
-        i += 1;
-    }
-    t
-};
-
-/// Which Base17 bin each halftone sample maps to (even-indexed bins).
-const HALFTONE_TO_BIN: [u8; 9] = [0, 2, 4, 6, 8, 10, 12, 14, 16];
-
-// ── Core: inline BF16 → f64 (zero allocation) ──
-
-/// Convert one BF16 u16 to f64. Zero allocation. 2 instructions.
-///
-/// BF16 = upper 16 bits of IEEE 754 f32.
-/// Shift left 16 → f32 bit pattern → extend to f64.
-#[inline(always)]
-fn bf16_to_f64(bits: u16) -> f64 {
-    f32::from_bits((bits as u32) << 16) as f64
-}
-
-// ── BF16-direct projection (full octave, no f32 intermediate) ──
-
-/// Project a BF16 row directly to Base17. No f32 Vec allocated.
-///
-/// Same golden-step octave averaging as project_row_to_base17(),
-/// but reads u16 BF16 values and converts inline to f64 accumulator.
-///
-/// Memory: 17 × f64 accumulators = 136 bytes stack. That's it.
-pub fn project_row_bf16_direct(row: &[u16]) -> Base17 {
-    let d = row.len();
-    let n_octaves = (d + BASE_DIM - 1) / BASE_DIM;
-    let mut sum = [0.0f64; BASE_DIM];
-    let mut count = [0u32; BASE_DIM];
-
-    for octave in 0..n_octaves {
-        for bi in 0..BASE_DIM {
-            let dim = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
-            if dim < d {
-                sum[bi] += bf16_to_f64(row[dim]);
-                count[bi] += 1;
-            }
-        }
-    }
-
-    let mut dims = [0i16; BASE_DIM];
-    for i in 0..BASE_DIM {
-        if count[i] > 0 {
-            let mean = sum[i] / count[i] as f64;
-            dims[i] = (mean * FP_SCALE).round().clamp(-32768.0, 32767.0) as i16;
-        }
-    }
-    Base17 { dims }
-}
-
-// ── Strided octave + halftone drop (the big win) ──
-
-/// Project a BF16 row with octave stride and halftone dropping.
-///
-/// For a 5120-element row at stride=16:
-///   302 octaves / 16 = 19 sampled octaves
-///   19 octaves × 9 halftone positions = 171 BF16→f64 conversions
-///   vs 5120 conversions in the full path (97% reduction)
-///
-/// Odd bins are interpolated as average of their two neighbors.
-pub fn project_row_bf16_strided(row: &[u16], octave_stride: usize) -> Base17 {
-    let d = row.len();
-    let n_octaves = (d + BASE_DIM - 1) / BASE_DIM;
-
-    // Phase 1: accumulate halftone samples into 9 bins
-    let mut half_sum = [0.0f64; 9];
-    let mut half_count = [0u32; 9];
-
-    let mut octave = 0;
-    while octave < n_octaves {
-        for hi in 0..9 {
-            let dim = octave * BASE_DIM + HALFTONE_POS[hi] as usize;
-            if dim < d {
-                half_sum[hi] += bf16_to_f64(row[dim]);
-                half_count[hi] += 1;
-            }
-        }
-        octave += octave_stride;
-    }
-
-    // Phase 2: fill 17 bins — sampled bins from data, gaps interpolated
-    let mut dims = [0i16; BASE_DIM];
-
-    // Even bins: direct from halftone samples
-    for hi in 0..9 {
-        let bin = HALFTONE_TO_BIN[hi] as usize;
-        if half_count[hi] > 0 {
-            let mean = half_sum[hi] / half_count[hi] as f64;
-            dims[bin] = (mean * FP_SCALE).round().clamp(-32768.0, 32767.0) as i16;
-        }
-    }
-
-    // Odd bins: interpolate from neighbors (circular)
-    for odd in (1..BASE_DIM).step_by(2) {
-        let left = dims[odd - 1] as i32;
-        let right = dims[(odd + 1) % BASE_DIM] as i32;
-        dims[odd] = ((left + right) / 2) as i16;
-    }
-
-    Base17 { dims }
-}
-
-// ── Read tensor as raw u16 (skip f32 allocation entirely) ──
-
-/// Read a BF16 tensor as raw u16 values. NO f32 conversion.
-///
-/// `buf` is a REUSABLE buffer — caller allocates once, passes to every tensor.
-/// Grows to max tensor, never shrinks. Saves 283 MB per tensor vs f32 path.
-pub fn read_tensor_bf16_raw<R: Read + Seek>(
-    reader: &mut R,
-    gguf: &GgufFile,
-    tensor: &TensorInfo,
-    buf: &mut Vec<u16>,
-) -> Result<usize, String> {
-    let abs_offset = gguf.tensor_data_offset + tensor.offset;
-    reader.seek(SeekFrom::Start(abs_offset)).map_err(|e| e.to_string())?;
-
-    let n_elements = tensor.element_count() as usize;
-
-    if buf.len() < n_elements {
-        buf.resize(n_elements, 0);
-    }
-
-    // SAFETY: u16 and [u8; 2] have the same layout on little-endian.
-    // GGUF BF16 tensors are stored as little-endian u16 pairs.
-    let byte_slice = unsafe {
-        std::slice::from_raw_parts_mut(
-            buf.as_mut_ptr() as *mut u8,
-            n_elements * 2,
-        )
-    };
-    reader.read_exact(byte_slice).map_err(|e| e.to_string())?;
-
-    Ok(n_elements)
-}
-
-// ── Helper: tensor_to_rows from dimensions only (no data needed for BF16 path) ──
-
-fn tensor_to_rows_dims(dims: &[u64], layer_type: &LayerType) -> (usize, usize) {
-    match layer_type {
-        LayerType::Conv2D if dims.len() == 4 => {
-            (dims[0] as usize, (dims[1] * dims[2] * dims[3]) as usize)
-        }
-        _ if dims.len() >= 2 => {
-            let rows = dims[0] as usize;
-            let cols: usize = dims[1..].iter().map(|&d| d as usize).product();
-            (rows, cols)
-        }
-        _ => {
-            let total: usize = dims.iter().map(|&d| d as usize).product();
-            (1, total)
-        }
-    }
-}
-
-/// Helper: LayerType → array index.
-fn layer_type_index(lt: &LayerType) -> usize {
-    match lt {
-        LayerType::Attention => 0,
-        LayerType::FeedForward => 1,
-        LayerType::Conv2D => 2,
-        LayerType::Norm => 3,
-        LayerType::Embedding => 4,
-        LayerType::Skip => 5,
-    }
-}
-
-// ── Combined BF16-direct streaming indexer ──
-
-/// Stream-index a BF16 GGUF file with all optimizations.
-///
-/// - No f32 Vec allocation (saves 283 MB per tensor)
-/// - Reusable u16 buffer (one alloc for entire shard)
-/// - Strided octave projection (97% fewer conversions when stride>1)
-/// - Direct BF16→f64 inline conversion (no batch bf16_to_f32_slice)
-///
-/// `octave_stride`: 1 = full (identical to original), 16 = 4 octaves higher
-pub fn stream_index_gguf_bf16<R: Read + Seek, W: Write>(
-    reader: &mut R,
-    writer: &mut W,
-    octave_stride: usize,
-    callback: Option<&dyn Fn(&str, &LayerType, usize, usize)>,
-) -> Result<IndexStats, String> {
-    let gguf = gguf::read_gguf_header(reader)?;
-    let mut stats = IndexStats::default();
-    stats.tensors_total = gguf.tensors.len();
-
-    writer.write_all(b"BGZ7").map_err(|e| e.to_string())?;
-    writer.write_all(&(gguf.tensors.len() as u32).to_le_bytes()).map_err(|e| e.to_string())?;
-
-    // ONE reusable buffer — grows to largest tensor, never shrinks
-    let mut bf16_buf: Vec<u16> = Vec::new();
-
-    for tensor in &gguf.tensors {
-        let layer_type = classify_tensor(&tensor.name, &tensor.dimensions);
-
-        if matches!(layer_type, LayerType::Skip | LayerType::Norm) {
-            stats.tensors_skipped += 1;
-            continue;
-        }
-
-        let is_bf16 = matches!(tensor.dtype, GgmlType::BF16);
-
-        if is_bf16 {
-            // FAST PATH: BF16 direct — no f32 intermediate
-            let n_elements = read_tensor_bf16_raw(reader, &gguf, tensor, &mut bf16_buf)?;
-
-            let (n_rows, n_cols) = tensor_to_rows_dims(&tensor.dimensions, &layer_type);
-            let orig_bytes = (n_rows * n_cols * 4) as u64; // f32 equivalent
-
-            let mut rows = Vec::with_capacity(n_rows);
-            for r in 0..n_rows {
-                let start = r * n_cols;
-                let end = (start + n_cols).min(n_elements);
-                let row_slice = &bf16_buf[start..end];
-
-                let b17 = if octave_stride > 1 {
-                    project_row_bf16_strided(row_slice, octave_stride)
-                } else {
-                    project_row_bf16_direct(row_slice)
-                };
-                rows.push(b17);
-            }
-
-            let comp_bytes = (rows.len() * Base17::BYTE_SIZE) as u64;
-
-            let ct = CompressedTensor {
-                name: tensor.name.clone(),
-                layer_type: layer_type.clone(),
-                original_shape: tensor.dimensions.clone(),
-                n_rows,
-                n_cols,
-                rows,
-            };
-            ct.write_to(writer)?;
-
-            let lt_idx = layer_type_index(&layer_type);
-            stats.by_type[lt_idx].0 += 1;
-            stats.by_type[lt_idx].1 += orig_bytes;
-            stats.by_type[lt_idx].2 += comp_bytes;
-            stats.original_bytes += orig_bytes;
-            stats.compressed_bytes += comp_bytes;
-            stats.tensors_indexed += 1;
-
-            if n_elements as u64 * 2 > stats.peak_tensor_bytes {
-                stats.peak_tensor_bytes = n_elements as u64 * 2;
-            }
-
-            if let Some(cb) = callback {
-                cb(&tensor.name, &layer_type, orig_bytes as usize, comp_bytes as usize);
-            }
-        } else {
-            // FALLBACK: non-BF16 dtype — use original f32 path
-            let data = gguf::read_tensor_f32(reader, &gguf, tensor)?;
-
-            let tensor_bytes = data.len() as u64 * 4;
-            if tensor_bytes > stats.peak_tensor_bytes {
-                stats.peak_tensor_bytes = tensor_bytes;
-            }
-
-            let (n_rows, n_cols) = tensor_to_rows(&data, &tensor.dimensions, &layer_type);
-
-            let mut rows = Vec::with_capacity(n_rows);
-            for r in 0..n_rows {
-                let start = r * n_cols;
-                let end = (start + n_cols).min(data.len());
-                rows.push(project_row_to_base17(&data[start..end]));
-            }
-
-            let orig_bytes = (n_rows * n_cols * 4) as u64;
-            let comp_bytes = (rows.len() * Base17::BYTE_SIZE) as u64;
-
-            let ct = CompressedTensor {
-                name: tensor.name.clone(),
-                layer_type: layer_type.clone(),
-                original_shape: tensor.dimensions.clone(),
-                n_rows,
-                n_cols,
-                rows,
-            };
-            ct.write_to(writer)?;
-
-            let lt_idx = layer_type_index(&layer_type);
-            stats.by_type[lt_idx].0 += 1;
-            stats.by_type[lt_idx].1 += orig_bytes;
-            stats.by_type[lt_idx].2 += comp_bytes;
-            stats.original_bytes += orig_bytes;
-            stats.compressed_bytes += comp_bytes;
-            stats.tensors_indexed += 1;
-
-            if let Some(cb) = callback {
-                cb(&tensor.name, &layer_type, orig_bytes as usize, comp_bytes as usize);
-            }
-        }
-    }
-
-    Ok(stats)
-}
-
 // ============================================================================
 // Tests
 // ============================================================================
@@ -1372,35 +1049,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_halftone_positions_coverage() {
-        let positions: Vec<u8> = HALFTONE_POS.to_vec();
-        let mut sorted = positions.clone();
-        sorted.sort();
-        assert_eq!(sorted, vec![0, 1, 3, 5, 6, 8, 10, 13, 15]);
-    }
-
-    #[test]
-    fn test_bf16_to_f64_accuracy() {
-        assert_eq!(bf16_to_f64(0x3F80), 1.0);
-        assert_eq!(bf16_to_f64(0x0000), 0.0);
-        assert_eq!(bf16_to_f64(0xBF80), -1.0);
-        let v = bf16_to_f64(0x4049);
-        assert!((v - 3.140625).abs() < 0.01);
-    }
-
-    #[test]
-    fn test_strided_vs_full_agreement() {
-        let row: Vec<u16> = vec![0x3F80; 5120]; // all 1.0 in BF16
-        let full = project_row_bf16_direct(&row);
-        let strided = project_row_bf16_strided(&row, 16);
-        for i in 0..BASE_DIM {
-            let diff = (full.dims[i] as i32 - strided.dims[i] as i32).abs();
-            assert!(diff <= 1, "bin {} differs by {}: full={}, strided={}",
-                i, diff, full.dims[i], strided.dims[i]);
-        }
-    }
-
     #[test]
     #[ignore] // Streams ~801 GB from HuggingFace
     fn test_stream_index_llama4_maverick_bf16_all_shards() {