Skip to content

Commit db3669e

Browse files
committed
feat(simd): SimdProfile enum + detect() implements dispatch matrix
Phase 3 T3.1 of the SIMD integration plan: introduce crate::hpc::simd_profile::SimdProfile, the silicon-grained dispatch identity that replaces the coarse three-Tier collapse called out in audit findings TD-T12/T13/T14. The decision tree in SimdProfile::detect() implements .claude/knowledge/td-simd-cpu-dispatch-matrix.md lines 271-305 verbatim, preserving the four load-bearing invariants from the "Detection invariants" section: GraniteRapids-before-SapphireRapids, Zen4-vs-SPR via amx_tile, CooperLake-vs-IceLakeSp via the mutually exclusive BF16/VBMI bit pattern, and TigerLakeU-vs-IceLakeSp via VP2INTERSECT. Risk #4 of the integration plan (no GNR detection without leaf 7,1 reader) closed in the same change: SimdCaps gains avx512fp16, avx512vp2intersect, and amx_fp16 fields, with the x86 detect() arm adding a __cpuid_count(7, 1) read gated on the leaf 7,0 EAX max subleaf advertising support. has_amx_fp16() requires amx_tile in addition to the FP16 bit, mirroring the defense-in-depth pattern in simd_amx::amx_available(). Surface follows cognitive-shader-foundation.md: SimdProfile + simd_profile() re-exported through crate::simd::* so consumers import a single public path. The existing private Tier / tier() machinery in src/simd.rs is untouched; this lands alongside, with incremental migration deferred to T3.5/T3.6. Tests: 7 new in simd_profile (detection determinism, arch partitioning, AVX-512 subset invariant, x86_64-only Scalar fallback, name uniqueness), 2 new in simd_caps (FP16 fields false on non-x86, has_amx_fp16 requires amx_tile). 2075/2075 lib tests pass, clippy -D warnings clean.
1 parent eb6444f commit db3669e

4 files changed

Lines changed: 471 additions & 4 deletions

File tree

src/hpc/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
pub mod simd_caps;
1818
// LazyLock frozen SIMD dispatch — function pointers selected once at startup
1919
pub mod simd_dispatch;
20+
// Silicon-grained profile (Sapphire Rapids vs Zen 4 vs Ice Lake-SP, etc.) —
21+
// implements the dispatch matrix from `.claude/knowledge/td-simd-cpu-dispatch-matrix.md`
22+
pub mod simd_profile;
2023

2124
pub mod blas_level1;
2225
pub mod blas_level2;

src/hpc/simd_caps.rs

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,21 @@ pub struct SimdCaps {
7171
/// (`is_x86_feature_detected!("avxvnniint8")`).
7272
/// Present on Arrow Lake, Lunar Lake, NUC 14 (Meteor Lake-H).
7373
pub avxvnniint8: bool,
74+
/// AVX-512 FP16 arithmetic (CPUID.07H.0H:EDX bit 23).
75+
/// Native `__m512h` operations (`_mm512_*_ph`). Distinguishes Sapphire
76+
/// Rapids / Granite Rapids / Zen 4+ from earlier AVX-512 silicon.
77+
/// Required by the matrix doc to discriminate SPR (yes) from CLX (no).
78+
pub avx512fp16: bool,
79+
/// AVX-512 VP2INTERSECT (CPUID.07H.0H:EDX bit 8). Present only on
80+
/// Tiger Lake mobile silicon; absent from Ice Lake-SP and every later
81+
/// server part. Sole discriminator between `TigerLakeU` and
82+
/// `IceLakeSp` profiles, which otherwise share an identical feature set.
83+
pub avx512vp2intersect: bool,
84+
/// AMX-FP16 (CPUID.07H.1H:EAX bit 21). `TDPFP16PS` FP16 tile dot
85+
/// product, present on Granite Rapids only. Sole discriminator between
86+
/// `SapphireRapids` and `GraniteRapids` profiles. Lives at CPUID leaf
87+
/// 7,1, not leaf 7,0 — separate cpuid_count call required.
88+
pub amx_fp16: bool,
7489

7590
// ── aarch64 (ARM) ──
7691
/// NEON 128-bit SIMD (mandatory on aarch64, always true).
@@ -124,6 +139,9 @@ impl SimdCaps {
124139
amx_bf16: false,
125140
avx512bf16: false,
126141
avxvnniint8: false,
142+
avx512fp16: false,
143+
avx512vp2intersect: false,
144+
amx_fp16: false,
127145
neon: false,
128146
asimd_dotprod: false,
129147
fp16: false,
@@ -139,10 +157,24 @@ impl SimdCaps {
139157
// `__cpuid_count` is safe on x86_64 (Rust 1.87+): CPUID is always
140158
// available on x86_64 (guaranteed by the ABI) and has no side effects
141159
// beyond reading CPU registers.
142-
let cpuid7 = core::arch::x86_64::__cpuid_count(7, 0);
143-
let amx_tile = (cpuid7.edx >> 24) & 1 == 1;
144-
let amx_int8 = (cpuid7.edx >> 25) & 1 == 1;
145-
let amx_bf16 = (cpuid7.edx >> 22) & 1 == 1;
160+
let cpuid7_0 = core::arch::x86_64::__cpuid_count(7, 0);
161+
let amx_tile = (cpuid7_0.edx >> 24) & 1 == 1;
162+
let amx_int8 = (cpuid7_0.edx >> 25) & 1 == 1;
163+
let amx_bf16 = (cpuid7_0.edx >> 22) & 1 == 1;
164+
let avx512fp16 = (cpuid7_0.edx >> 23) & 1 == 1;
165+
let avx512vp2intersect = (cpuid7_0.edx >> 8) & 1 == 1;
166+
167+
// Leaf 7,1 EAX bit 21 = AMX-FP16. Per the dispatch matrix this is
168+
// the sole discriminator between Granite Rapids and Sapphire Rapids.
169+
// Leaf 7,1 only exists when leaf 7,0 EAX (max subleaf) >= 1; on
170+
// older silicon this returns zero, which is the correct answer.
171+
let leaf7_max_sub = cpuid7_0.eax;
172+
let amx_fp16 = if leaf7_max_sub >= 1 {
173+
let cpuid7_1 = core::arch::x86_64::__cpuid_count(7, 1);
174+
(cpuid7_1.eax >> 21) & 1 == 1
175+
} else {
176+
false
177+
};
146178

147179
Self {
148180
avx2: is_x86_feature_detected!("avx2"),
@@ -160,6 +192,9 @@ impl SimdCaps {
160192
amx_bf16,
161193
avx512bf16: is_x86_feature_detected!("avx512bf16"),
162194
avxvnniint8: is_x86_feature_detected!("avxvnniint8"),
195+
avx512fp16,
196+
avx512vp2intersect,
197+
amx_fp16,
163198
// ARM fields: all false on x86
164199
neon: false,
165200
asimd_dotprod: false,
@@ -192,6 +227,9 @@ impl SimdCaps {
192227
amx_bf16: false,
193228
avx512bf16: false,
194229
avxvnniint8: false,
230+
avx512fp16: false,
231+
avx512vp2intersect: false,
232+
amx_fp16: false,
195233
// ARM fields: runtime detection
196234
neon: true, // mandatory on aarch64
197235
asimd_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"),
@@ -221,6 +259,9 @@ impl SimdCaps {
221259
amx_bf16: false,
222260
avx512bf16: false,
223261
avxvnniint8: false,
262+
avx512fp16: false,
263+
avx512vp2intersect: false,
264+
amx_fp16: false,
224265
neon: false,
225266
asimd_dotprod: false,
226267
fp16: false,
@@ -275,6 +316,20 @@ impl SimdCaps {
275316
self.avxvnniint8
276317
}
277318

319+
/// True if AVX-512 FP16 (`__m512h`) is available. Required to
320+
/// discriminate `SapphireRapids` from `CascadeLake`-class profiles.
321+
#[inline(always)]
322+
pub fn has_avx512_fp16(self) -> bool {
323+
self.avx512fp16
324+
}
325+
326+
/// True if AMX-FP16 (`TDPFP16PS`) is available. Only Granite Rapids
327+
/// silicon advertises this bit; sole discriminator between GNR and SPR.
328+
#[inline(always)]
329+
pub fn has_amx_fp16(self) -> bool {
330+
self.amx_fp16 && self.amx_tile
331+
}
332+
278333
// ── ARM convenience methods ──
279334

280335
/// True if running on aarch64 with NEON (always true on aarch64).
@@ -408,6 +463,42 @@ mod tests {
408463
let _ = caps.amx_bf16;
409464
let _ = caps.avx512bf16;
410465
let _ = caps.avxvnniint8;
466+
// PR-#181 follow-up fields (matrix doc lines 240, 247-248).
467+
let _ = caps.avx512fp16;
468+
let _ = caps.avx512vp2intersect;
469+
let _ = caps.amx_fp16;
470+
}
471+
472+
#[test]
473+
fn fp16_fields_consistent_on_non_x86() {
474+
// Non-x86 targets must never report x86 AMX/AVX-512 FP16 capabilities.
475+
#[cfg(not(target_arch = "x86_64"))]
476+
{
477+
let caps = simd_caps();
478+
assert!(!caps.avx512fp16);
479+
assert!(!caps.avx512vp2intersect);
480+
assert!(!caps.amx_fp16);
481+
assert!(!caps.has_avx512_fp16());
482+
assert!(!caps.has_amx_fp16());
483+
}
484+
}
485+
486+
#[test]
487+
fn has_amx_fp16_requires_amx_tile() {
488+
// Even if `amx_fp16` were spuriously true without `amx_tile`,
489+
// the convenience method must require both bits. Matches
490+
// simd_amx.rs::amx_available()'s defense-in-depth pattern.
491+
let synthetic = SimdCaps {
492+
avx2: false, avx512f: false, avx512bw: false, avx512vl: false,
493+
avx512vpopcntdq: false, sse41: false, sse2: false, fma: false,
494+
avx512vnni: false, avx512vbmi: false,
495+
amx_tile: false, amx_int8: false, amx_bf16: false,
496+
avx512bf16: false, avxvnniint8: false,
497+
avx512fp16: false, avx512vp2intersect: false, amx_fp16: true,
498+
neon: false, asimd_dotprod: false, fp16: false,
499+
aes: false, sha2: false, crc32: false,
500+
};
501+
assert!(!synthetic.has_amx_fp16(), "amx_fp16 without amx_tile must report false");
411502
}
412503

413504
#[test]

0 commit comments

Comments
 (0)