Skip to content

Commit f68fefa

Browse files
committed
Cherry-pick: fix(amx) prevent SIGILL — use _xgetbv(0) + prctl for detection
Root cause: amx_available() used CPUID leaf 0xD (what CPU supports) instead of _xgetbv(0) (what OS actually enabled). Hypervisors that advertise AMX in CPUID but don't enable tile state caused SIGILL. Fix — 4-step detection: 1. CPUID.07H bits 24+25 → CPU has AMX-TILE + AMX-INT8? 2. CPUID.01H bit 27 → OS supports XSAVE? 3. _xgetbv(0) bits 17+18 → OS ACTUALLY enabled tile state? 4. prctl(ARCH_REQ_XCOMP_PERM, 18) → process has tile permission? VNNI hierarchy documented: avx512vnni (EVEX zmm, 64 MACs) → checked first avxvnniint8 (VEX ymm, 32 MACs) → only if avx512vnni absent Before: 1612 pass + SIGILL crash After: 1612 pass, 0 fail, 36 ignored Cherry-picked from 282daf7 (claude/continue-lance-graph-ndarray-Ld786) https://claude.ai/code/session_017ZN5PNEf8boFBgorUZVrFU
1 parent a42d999 commit f68fefa

2 files changed

Lines changed: 107 additions & 19 deletions

File tree

.claude/AMX_GOTCHAS.md

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,18 +66,41 @@ For CPUID leaf 7 (AMX detection): use `__cpuid_count()`, not inline asm.
6666

6767
---
6868

69-
## Gotcha 4: OS must enable AMX via XSETBV
69+
## Gotcha 4: OS must enable AMX via XSETBV + process must request permission
7070

71-
AMX tiles are large (8 KB of state). The OS must opt in via XCR0 bits 17+18.
72-
Linux 5.19+ enables AMX by default. Older kernels: SIGILL on tile instructions.
71+
AMX tiles are large (8 KB of state). Two levels of OS enablement required:
72+
73+
1. **Kernel enables tile state in XCR0** (bits 17+18). Linux 5.19+ does this.
74+
2. **Process requests XCOMP_PERM** via `arch_prctl(ARCH_REQ_XCOMP_PERM, 18)` — the x86-specific `arch_prctl(2)` syscall, not `prctl(2)`.
75+
Without this, LDTILECFG will SIGILL even if XCR0 bits are set.
7376

7477
**Detection (stable)**:
7578
```rust
76-
let xcr0 = core::arch::x86_64::__cpuid_count(0xD, 0);
77-
let tilecfg = (xcr0.eax >> 17) & 1; // bit 17 = XTILECFG
78-
let tiledata = (xcr0.eax >> 18) & 1; // bit 18 = XTILEDATA
79-
// Both must be 1
80-
```
79+
// Step 1: CPUID — does CPU support AMX?
80+
let cpuid = core::arch::x86_64::__cpuid_count(7, 0);
81+
let amx_tile = (cpuid.edx >> 24) & 1;
82+
let amx_int8 = (cpuid.edx >> 25) & 1;
83+
84+
// Step 2: OSXSAVE — does OS support XSAVE?
85+
let cpuid_01 = core::arch::x86_64::__cpuid(1);
86+
let osxsave = (cpuid_01.ecx >> 27) & 1;
87+
88+
// Step 3: _xgetbv(0) — did OS ACTUALLY enable tile state?
89+
// ⚠ Do NOT use __cpuid_count(0xD, 0) — that reports what CPU SUPPORTS,
90+
// not what the OS ENABLED. _xgetbv(0) reads the actual XCR0 register.
91+
let xcr0: u64 = unsafe { core::arch::x86_64::_xgetbv(0) };
92+
let tilecfg = (xcr0 >> 17) & 1; // bit 17 = XTILECFG
93+
let tiledata = (xcr0 >> 18) & 1; // bit 18 = XTILEDATA
94+
95+
// Step 4: arch_prctl — request tile permission for this process
96+
// SYS_arch_prctl = 158 (NOT SYS_prctl = 157, which rejects this option with EINVAL),
// ARCH_REQ_XCOMP_PERM = 0x1023, XFEATURE_XTILEDATA = 18
97+
// Returns 0 on success, -errno on failure. Idempotent.
98+
```
99+
100+
**Previous bug**: `__cpuid_count(0xD, 0)` reports XSAVE state component bitmap
101+
(what the CPU *supports*), NOT the actual XCR0 value (what the OS *enabled*).
102+
On hypervisors that advertise AMX in CPUID but don't enable tile state,
103+
the old check returned `true` → SIGILL on LDTILECFG.
81104

82105
---
83106

src/simd_amx.rs

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,74 @@
3131
// ═══════════════════════════════════════════════════════════════════════════
3232

3333
/// Check whether AMX can be used by this process: the CPU supports it, the OS
/// has enabled tile state, and (on Linux) tile-data permission was granted.
///
/// Four checks, evaluated in order; the first failure returns `false`:
/// 1. CPUID.07H.0H:EDX bits 24 (AMX-TILE) + 25 (AMX-INT8) = CPU supports it
/// 2. CPUID.01H:ECX bit 27 (OSXSAVE) = OS uses XSAVE, so XGETBV is legal
/// 3. XCR0 bits 17 (TILECFG) + 18 (TILEDATA) = OS has enabled tile state
/// 4. Linux only: `arch_prctl(ARCH_REQ_XCOMP_PERM, 18)` succeeds
///
/// The XCR0 check is critical: even if CPUID reports AMX, the hypervisor
/// may not have enabled the XSTATE for tiles. XCR0 must be read with
/// `_xgetbv(0)` (what the OS actually enabled), not CPUID leaf 0xD (what the
/// CPU supports for XSAVE).
///
/// Side effect: on Linux a successful call leaves the process with TILEDATA
/// permission. On other operating systems no permission step is performed and
/// the result of the XCR0 check is returned.
#[cfg(target_arch = "x86_64")]
pub fn amx_available() -> bool {
    const AMX_TILE: u32 = 1 << 24; // CPUID.07H.0H:EDX
    const AMX_INT8: u32 = 1 << 25; // CPUID.07H.0H:EDX
    const OSXSAVE: u32 = 1 << 27; // CPUID.01H:ECX
    const XCR0_TILE_STATE: u64 = (1 << 17) | (1 << 18); // TILECFG | TILEDATA

    // Step 1: CPU supports AMX-TILE + AMX-INT8?
    let required = AMX_TILE | AMX_INT8;
    if amx_cpuid(7, 0).edx & required != required {
        return false;
    }

    // Step 2: OS enabled XSAVE? Without OSXSAVE, XGETBV itself would fault.
    if amx_cpuid(1, 0).ecx & OSXSAVE == 0 {
        return false;
    }

    // Step 3: OS actually enabled tile state in XCR0?
    // SAFETY: OSXSAVE was confirmed above, so XGETBV with index 0 is a valid
    // instruction in this context and only reads a register.
    let xcr0: u64 = unsafe { core::arch::x86_64::_xgetbv(0) };
    if xcr0 & XCR0_TILE_STATE != XCR0_TILE_STATE {
        return false;
    }

    // Step 4: process-level permission for TILEDATA (no-op off Linux).
    amx_request_tile_permission()
}

/// Executes CPUID for the given leaf / sub-leaf.
#[cfg(target_arch = "x86_64")]
// The CPUID intrinsics are `unsafe fn` on older toolchains and safe on newer
// ones; the allow keeps this helper warning-free on both.
#[allow(unused_unsafe)]
fn amx_cpuid(leaf: u32, sub_leaf: u32) -> core::arch::x86_64::CpuidResult {
    // SAFETY: CPUID is available on every x86_64 CPU and has no memory effects.
    unsafe { core::arch::x86_64::__cpuid_count(leaf, sub_leaf) }
}

/// Asks the Linux kernel for permission to use AMX tile data in this process.
///
/// Returns `true` when the kernel reports success (0), `false` on any error.
#[cfg(all(target_arch = "x86_64", target_os = "linux"))]
fn amx_request_tile_permission() -> bool {
    // ARCH_REQ_XCOMP_PERM is an *arch_prctl* option. On x86_64 arch_prctl is
    // syscall 158; syscall 157 is prctl, which does not know this option and
    // fails with EINVAL — that would make detection report "no AMX" everywhere.
    const SYS_ARCH_PRCTL: i64 = 158;
    const ARCH_REQ_XCOMP_PERM: i64 = 0x1023;
    const XFEATURE_XTILEDATA: i64 = 18;

    let ret: i64;
    // SAFETY: arch_prctl(ARCH_REQ_XCOMP_PERM, 18) takes two integer arguments
    // and touches no user memory. The `syscall` instruction clobbers rcx and
    // r11, which are declared as outputs; rax carries the return value
    // (0 on success, -errno on failure).
    unsafe {
        core::arch::asm!(
            "syscall",
            inlateout("rax") SYS_ARCH_PRCTL => ret,
            in("rdi") ARCH_REQ_XCOMP_PERM,
            in("rsi") XFEATURE_XTILEDATA,
            lateout("rcx") _,
            lateout("r11") _,
            options(nostack),
        );
    }
    ret == 0
}

/// Non-Linux x86_64: no permission request is made; report success so the
/// XCR0 check alone decides availability.
#[cfg(all(target_arch = "x86_64", not(target_os = "linux")))]
fn amx_request_tile_permission() -> bool {
    true
}
46103

47104
#[cfg(not(target_arch = "x86_64"))]
@@ -203,17 +260,25 @@ pub fn vnni_matvec_scalar(
203260

204261
/// Runtime-dispatched VNNI MatVec: avx512vnni → avxvnniint8 → scalar i32.
205262
///
206-
/// Three tiers, mutually exclusive by hardware generation:
263+
/// Three tiers, checked in order (first match wins):
207264
/// avx512vnni — 64 MACs/instr (zmm, Cascade Lake+, Zen 4+)
208265
/// avxvnniint8 — 32 MACs/instr (ymm, Arrow Lake, NUC 14 i9-185H)
209-
/// scalar i32 — only for non-x86 or testing (caller should prefer F32x16 FMA)
266+
/// scalar i32 — only for non-x86 or testing
267+
///
268+
/// IMPORTANT: avxvnniint8 (VNNI2, 256-bit) is NEVER reached when
269+
/// avx512vnni (VNNI512) is present. This is correct:
270+
/// - CPUs with avx512vnni always have 512-bit VPDPBUSD (faster)
271+
/// - avxvnniint8 exists ONLY for CPUs that dropped AVX-512
272+
/// but added 256-bit VNNI (Arrow Lake, Meteor Lake U-series)
273+
/// - The two instructions have DIFFERENT encodings:
274+
/// avx512vnni: EVEX-encoded VPDPBUSD zmm (512-bit)
275+
/// avxvnniint8: VEX-encoded VPDPBUSD ymm (256-bit)
276+
/// - Running EVEX VPDPBUSD on a VEX-only CPU = SIGILL
277+
/// - Running VEX VPDPBUSD on an EVEX CPU = works but wastes half the width
210278
///
211-
/// NOTE: The scalar path here does i32 multiply-accumulate, NOT f32.
212-
/// For the thinking engine, F32x16 FMA (16 MACs/instr) is the true floor.
213-
/// This scalar path exists only for correctness on non-x86 targets.
214279
/// The thinking engine's cycle_auto() dispatches:
215280
/// VNNI detected → cycle_vnni() → this function
216-
/// No VNNI → cycle() → F32x16 (never reaches here)
281+
/// No VNNI → cycle() → F32x16 FMA (never reaches here)
217282
pub fn matvec_dispatch(
218283
table: &[u8],
219284
energy_i8: &[i8],

0 commit comments

Comments
 (0)