@@ -283,58 +283,28 @@ pub fn project_8rows_bf16_simd(
283283 use crate :: simd:: F64x8 ;
284284
285285 let n_octaves = ( n_cols + BASE_DIM - 1 ) / BASE_DIM ;
286- let use_halftone = octave_stride > 1 ;
287286
288287 let mut sums: [ F64x8 ; BASE_DIM ] = [ F64x8 :: splat ( 0.0 ) ; BASE_DIM ] ;
289288 let mut counts: [ u32 ; BASE_DIM ] = [ 0 ; BASE_DIM ] ;
290289
291- if use_halftone {
292- let mut octave = 0 ;
293- while octave < n_octaves {
294- for hi in 0 ..9 {
295- let col = octave * BASE_DIM + HALFTONE_POS [ hi] as usize ;
296- if col < n_cols {
297- let bin = HALFTONE_TO_BIN [ hi] as usize ;
298- let offsets: [ usize ; 8 ] = [
299- row_starts[ 0 ] + col, row_starts[ 1 ] + col,
300- row_starts[ 2 ] + col, row_starts[ 3 ] + col,
301- row_starts[ 4 ] + col, row_starts[ 5 ] + col,
302- row_starts[ 6 ] + col, row_starts[ 7 ] + col,
303- ] ;
304- sums[ bin] += gather_bf16_x8 ( buf, & offsets) ;
305- counts[ bin] += 1 ;
306- }
307- }
308- octave += octave_stride;
309- }
310-
311- // Interpolate odd bins from even neighbors (per-lane, still SIMD)
312- for odd in ( 1 ..BASE_DIM ) . step_by ( 2 ) {
313- let left = sums[ odd - 1 ] ;
314- let right = sums[ ( odd + 1 ) % BASE_DIM ] ;
315- let left_c = counts[ odd - 1 ] . max ( 1 ) ;
316- let right_c = counts[ ( odd + 1 ) % BASE_DIM ] . max ( 1 ) ;
317- let left_mean = left * F64x8 :: splat ( 1.0 / left_c as f64 ) ;
318- let right_mean = right * F64x8 :: splat ( 1.0 / right_c as f64 ) ;
319- sums[ odd] = ( left_mean + right_mean) * F64x8 :: splat ( 0.5 ) ;
320- counts[ odd] = 1 ;
321- }
322- } else {
323- for octave in 0 ..n_octaves {
324- for bi in 0 ..BASE_DIM {
325- let col = octave * BASE_DIM + GOLDEN_POS [ bi] as usize ;
326- if col < n_cols {
327- let offsets: [ usize ; 8 ] = [
328- row_starts[ 0 ] + col, row_starts[ 1 ] + col,
329- row_starts[ 2 ] + col, row_starts[ 3 ] + col,
330- row_starts[ 4 ] + col, row_starts[ 5 ] + col,
331- row_starts[ 6 ] + col, row_starts[ 7 ] + col,
332- ] ;
333- sums[ bi] += gather_bf16_x8 ( buf, & offsets) ;
334- counts[ bi] += 1 ;
335- }
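290+ // Sample all 17 golden-step positions in every visited octave: `octave_stride` skips whole octaves,
291+ // NOT positions — every bin gets real data from actual weight values. NOTE(review): a stride of 0 would never advance `octave`; confirm callers pass >= 1.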
292+ let mut octave = 0 ;
293+ while octave < n_octaves {
294+ for bi in 0 ..BASE_DIM {
295+ let col = octave * BASE_DIM + GOLDEN_POS [ bi] as usize ;
296+ if col < n_cols {
297+ let offsets: [ usize ; 8 ] = [
298+ row_starts[ 0 ] + col, row_starts[ 1 ] + col,
299+ row_starts[ 2 ] + col, row_starts[ 3 ] + col,
300+ row_starts[ 4 ] + col, row_starts[ 5 ] + col,
301+ row_starts[ 6 ] + col, row_starts[ 7 ] + col,
302+ ] ;
303+ sums[ bi] += gather_bf16_x8 ( buf, & offsets) ;
304+ counts[ bi] += 1 ;
336305 }
337306 }
307+ octave += octave_stride;
338308 }
339309
340310 // Finalize: mean → scale → clamp → i16, all 8 lanes parallel
@@ -365,39 +335,21 @@ pub fn project_8rows_bf16_simd(
365335pub fn project_1row_bf16_strided ( row : & [ u16 ] , octave_stride : usize ) -> Base17 {
366336 let d = row. len ( ) ;
367337 let n_octaves = ( d + BASE_DIM - 1 ) / BASE_DIM ;
368- let use_halftone = octave_stride > 1 ;
369338
370339 let mut sum = [ 0.0f64 ; BASE_DIM ] ;
371340 let mut count = [ 0u32 ; BASE_DIM ] ;
372341
373- if use_halftone {
374- let mut octave = 0 ;
375- while octave < n_octaves {
376- for hi in 0 ..9 {
377- let col = octave * BASE_DIM + HALFTONE_POS [ hi] as usize ;
378- if col < d {
379- sum[ HALFTONE_TO_BIN [ hi] as usize ] += bf16_to_f64 ( row[ col] ) ;
380- count[ HALFTONE_TO_BIN [ hi] as usize ] += 1 ;
381- }
382- }
383- octave += octave_stride;
384- }
385- for odd in ( 1 ..BASE_DIM ) . step_by ( 2 ) {
386- let lc = count[ odd - 1 ] . max ( 1 ) as f64 ;
387- let rc = count[ ( odd + 1 ) % BASE_DIM ] . max ( 1 ) as f64 ;
388- sum[ odd] = ( sum[ odd - 1 ] / lc + sum[ ( odd + 1 ) % BASE_DIM ] / rc) * 0.5 ;
389- count[ odd] = 1 ;
390- }
391- } else {
392- for octave in 0 ..n_octaves {
393- for bi in 0 ..BASE_DIM {
394- let col = octave * BASE_DIM + GOLDEN_POS [ bi] as usize ;
395- if col < d {
396- sum[ bi] += bf16_to_f64 ( row[ col] ) ;
397- count[ bi] += 1 ;
398- }
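342+ // All 17 positions per sampled octave — no halftone, so every bin holds real data. NOTE(review): a stride of 0 would never advance `octave`; confirm callers pass >= 1.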
343+ let mut octave = 0 ;
344+ while octave < n_octaves {
345+ for bi in 0 ..BASE_DIM {
346+ let col = octave * BASE_DIM + GOLDEN_POS [ bi] as usize ;
347+ if col < d {
348+ sum[ bi] += bf16_to_f64 ( row[ col] ) ;
349+ count[ bi] += 1 ;
399350 }
400351 }
352+ octave += octave_stride;
401353 }
402354
403355 let mut dims = [ 0i16 ; BASE_DIM ] ;
0 commit comments