Skip to content

Commit 4fdce79

Browse files
authored
Merge pull request #74 from AdaWorldAPI/claude/index-llama-shards-A2Qzr
fix: HttpRangeReader stall detection, CDN re-resolve, HF API resolve

Root cause: 21 GB tensor reads stall silently when the HuggingFace CDN drops the TCP connection; curl then waits forever without receiving any data.

Fixes:
- --speed-limit 100000 --speed-time 30: abort if throughput stays below 100 KB/s for 30 s
- Re-resolve the URL on 403 (CDN token expiry after ~1 hour)
- Segment-aligned fetches (no overlapping reads on sequential access)
- 6 retries (was 4) with capped exponential backoff (max 32 s)
- from_hf() constructor: resolves via the huggingface_hub Python API first, falls back to curl HEAD, then the HF REST API. Stores repo/filename for automatic re-resolution on token expiry.
- resolve_hf_url() tries 3 methods: Python HF API → curl HEAD → REST API

https://claude.ai/code/session_01HmdXNPit7QsTCfhJFef3Ee
2 parents 9462291 + 4d09df0 commit 4fdce79

2 files changed

Lines changed: 252 additions & 144 deletions

File tree

src/hpc/gguf_indexer.rs

Lines changed: 25 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -283,58 +283,28 @@ pub fn project_8rows_bf16_simd(
283283
use crate::simd::F64x8;
284284

285285
let n_octaves = (n_cols + BASE_DIM - 1) / BASE_DIM;
286-
let use_halftone = octave_stride > 1;
287286

288287
let mut sums: [F64x8; BASE_DIM] = [F64x8::splat(0.0); BASE_DIM];
289288
let mut counts: [u32; BASE_DIM] = [0; BASE_DIM];
290289

291-
if use_halftone {
292-
let mut octave = 0;
293-
while octave < n_octaves {
294-
for hi in 0..9 {
295-
let col = octave * BASE_DIM + HALFTONE_POS[hi] as usize;
296-
if col < n_cols {
297-
let bin = HALFTONE_TO_BIN[hi] as usize;
298-
let offsets: [usize; 8] = [
299-
row_starts[0] + col, row_starts[1] + col,
300-
row_starts[2] + col, row_starts[3] + col,
301-
row_starts[4] + col, row_starts[5] + col,
302-
row_starts[6] + col, row_starts[7] + col,
303-
];
304-
sums[bin] += gather_bf16_x8(buf, &offsets);
305-
counts[bin] += 1;
306-
}
307-
}
308-
octave += octave_stride;
309-
}
310-
311-
// Interpolate odd bins from even neighbors (per-lane, still SIMD)
312-
for odd in (1..BASE_DIM).step_by(2) {
313-
let left = sums[odd - 1];
314-
let right = sums[(odd + 1) % BASE_DIM];
315-
let left_c = counts[odd - 1].max(1);
316-
let right_c = counts[(odd + 1) % BASE_DIM].max(1);
317-
let left_mean = left * F64x8::splat(1.0 / left_c as f64);
318-
let right_mean = right * F64x8::splat(1.0 / right_c as f64);
319-
sums[odd] = (left_mean + right_mean) * F64x8::splat(0.5);
320-
counts[odd] = 1;
321-
}
322-
} else {
323-
for octave in 0..n_octaves {
324-
for bi in 0..BASE_DIM {
325-
let col = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
326-
if col < n_cols {
327-
let offsets: [usize; 8] = [
328-
row_starts[0] + col, row_starts[1] + col,
329-
row_starts[2] + col, row_starts[3] + col,
330-
row_starts[4] + col, row_starts[5] + col,
331-
row_starts[6] + col, row_starts[7] + col,
332-
];
333-
sums[bi] += gather_bf16_x8(buf, &offsets);
334-
counts[bi] += 1;
335-
}
290+
// All 17 golden-step positions per sampled octave. Stride skips octaves,
291+
// NOT positions — every bin gets real data from actual weight values.
292+
let mut octave = 0;
293+
while octave < n_octaves {
294+
for bi in 0..BASE_DIM {
295+
let col = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
296+
if col < n_cols {
297+
let offsets: [usize; 8] = [
298+
row_starts[0] + col, row_starts[1] + col,
299+
row_starts[2] + col, row_starts[3] + col,
300+
row_starts[4] + col, row_starts[5] + col,
301+
row_starts[6] + col, row_starts[7] + col,
302+
];
303+
sums[bi] += gather_bf16_x8(buf, &offsets);
304+
counts[bi] += 1;
336305
}
337306
}
307+
octave += octave_stride;
338308
}
339309

340310
// Finalize: mean → scale → clamp → i16, all 8 lanes parallel
@@ -365,39 +335,21 @@ pub fn project_8rows_bf16_simd(
365335
pub fn project_1row_bf16_strided(row: &[u16], octave_stride: usize) -> Base17 {
366336
let d = row.len();
367337
let n_octaves = (d + BASE_DIM - 1) / BASE_DIM;
368-
let use_halftone = octave_stride > 1;
369338

370339
let mut sum = [0.0f64; BASE_DIM];
371340
let mut count = [0u32; BASE_DIM];
372341

373-
if use_halftone {
374-
let mut octave = 0;
375-
while octave < n_octaves {
376-
for hi in 0..9 {
377-
let col = octave * BASE_DIM + HALFTONE_POS[hi] as usize;
378-
if col < d {
379-
sum[HALFTONE_TO_BIN[hi] as usize] += bf16_to_f64(row[col]);
380-
count[HALFTONE_TO_BIN[hi] as usize] += 1;
381-
}
382-
}
383-
octave += octave_stride;
384-
}
385-
for odd in (1..BASE_DIM).step_by(2) {
386-
let lc = count[odd - 1].max(1) as f64;
387-
let rc = count[(odd + 1) % BASE_DIM].max(1) as f64;
388-
sum[odd] = (sum[odd - 1] / lc + sum[(odd + 1) % BASE_DIM] / rc) * 0.5;
389-
count[odd] = 1;
390-
}
391-
} else {
392-
for octave in 0..n_octaves {
393-
for bi in 0..BASE_DIM {
394-
let col = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
395-
if col < d {
396-
sum[bi] += bf16_to_f64(row[col]);
397-
count[bi] += 1;
398-
}
342+
// All 17 positions per sampled octave — no halftone, all bins real
343+
let mut octave = 0;
344+
while octave < n_octaves {
345+
for bi in 0..BASE_DIM {
346+
let col = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
347+
if col < d {
348+
sum[bi] += bf16_to_f64(row[col]);
349+
count[bi] += 1;
399350
}
400351
}
352+
octave += octave_stride;
401353
}
402354

403355
let mut dims = [0i16; BASE_DIM];

0 commit comments

Comments
 (0)