From 1702b2a8c829a7cf8b9e491976705671f2860c03 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 25 Feb 2026 16:29:42 +0000 Subject: [PATCH 1/5] Sve2 Scatters need a temp register for indices Fixes #124750 There are three forms of scatter instructions supported by CoreCLR * Vector of addresses * A single address plus a vector of indices (vector length offsets) * A single address plus a vector of byte offsets There are encodings for all of these in SVE1. SVE2 duplicates all the scatter instructions, providing non-temporal versions of them. The encodings all match SVE1, except for the indices version, which is missing. This can be replicated by simply shifting the offsets before calling the instruction (and is exactly what happens in the equivalent C++ intrinsics). Therefore, ensure there is a temp register to hold the shifted value. --- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 29 ++++++++++++++------- src/coreclr/jit/lsraarm64.cpp | 13 +++++++++ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 61ed96590534f6..1d1a7dc8ae4d39 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2429,37 +2429,34 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } case NI_Sve2_Scatter16BitNarrowingNonTemporal: - case NI_Sve2_Scatter16BitWithByteOffsetsNarrowingNonTemporal: case NI_Sve2_Scatter32BitNarrowingNonTemporal: - case NI_Sve2_Scatter32BitWithByteOffsetsNarrowingNonTemporal: case NI_Sve2_Scatter8BitNarrowingNonTemporal: - case NI_Sve2_Scatter8BitWithByteOffsetsNarrowingNonTemporal: case NI_Sve2_ScatterNonTemporal: - case NI_Sve2_ScatterWithByteOffsetsNonTemporal: { if (!varTypeIsSIMD(intrin.op2->gtType)) { - // Scatter...(Vector mask, T* address, Vector offsets, Vector data) + // Scatter...(Vector mask, T* address, Vector indices, Vector data) assert(intrin.numOperands == 4); - // Calculate the 
byte offsets if using indices. + // SVE2 instruction only directly support byte offsets. Convert indices to bytes. + regNumber tempReg = internalRegisters.GetSingle(node, RBM_ALLFLOAT); if (intrin.id == NI_Sve2_Scatter16BitNarrowingNonTemporal) { - GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, op3Reg, op3Reg, 1, opt); + GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, tempReg, op3Reg, 1, opt); } else if (intrin.id == NI_Sve2_Scatter32BitNarrowingNonTemporal) { - GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, op3Reg, op3Reg, 2, opt); + GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, tempReg, op3Reg, 2, opt); } else if (intrin.id == NI_Sve2_ScatterNonTemporal) { assert(emitActualTypeSize(intrin.baseType) == 8); - GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, op3Reg, op3Reg, 3, opt); + GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, tempReg, op3Reg, 3, opt); } // op2Reg and op3Reg are swapped - GetEmitter()->emitIns_R_R_R_R(ins, emitSize, op4Reg, op1Reg, op3Reg, op2Reg, opt); + GetEmitter()->emitIns_R_R_R_R(ins, emitSize, op4Reg, op1Reg, tempReg, op2Reg, opt); } else { @@ -2471,6 +2468,18 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Sve2_Scatter16BitWithByteOffsetsNarrowingNonTemporal: + case NI_Sve2_Scatter32BitWithByteOffsetsNarrowingNonTemporal: + case NI_Sve2_Scatter8BitWithByteOffsetsNarrowingNonTemporal: + case NI_Sve2_ScatterWithByteOffsetsNonTemporal: + // Scatter...(Vector mask, T* address, Vector offsets, Vector data) + assert(!varTypeIsSIMD(intrin.op2->gtType)); + assert(intrin.numOperands == 4); + + // op2Reg and op3Reg are swapped + GetEmitter()->emitIns_R_R_R_R(ins, emitSize, op4Reg, op1Reg, op3Reg, op2Reg, opt); + break; + case NI_Sve_StoreNarrowing: opt = emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType)); GetEmitter()->emitIns_R_R_R_I(ins, emitSize, op3Reg, op1Reg, op2Reg, 0, opt); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 89c0f3c69e982c..999ae592e68987 
100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1365,6 +1365,19 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Build any immediates BuildHWIntrinsicImmediate(intrinsicTree, intrin); + // Build any additional special cases + switch (intrin.id) + { + case NI_Sve2_Scatter16BitNarrowingNonTemporal: + case NI_Sve2_Scatter32BitNarrowingNonTemporal: + case NI_Sve2_ScatterNonTemporal: + buildInternalFloatRegisterDefForNode(intrinsicTree, internalFloatRegCandidates()); + break; + + default: + break; + } + // Build all Operands for (size_t opNum = 1; opNum <= intrin.numOperands; opNum++) { From 62b949f40ee5ccb6a44c935b2e4bf96ad7c5fad5 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 26 Feb 2026 10:28:18 +0000 Subject: [PATCH 2/5] Fix up 8bit scatter --- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 33 ++++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 1d1a7dc8ae4d39..3b0f5970cd29e3 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2430,7 +2430,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve2_Scatter16BitNarrowingNonTemporal: case NI_Sve2_Scatter32BitNarrowingNonTemporal: - case NI_Sve2_Scatter8BitNarrowingNonTemporal: case NI_Sve2_ScatterNonTemporal: { if (!varTypeIsSIMD(intrin.op2->gtType)) @@ -2439,23 +2438,26 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) assert(intrin.numOperands == 4); - // SVE2 instruction only directly support byte offsets. Convert indices to bytes. 
+ ssize_t shift = 0; regNumber tempReg = internalRegisters.GetSingle(node, RBM_ALLFLOAT); + if (intrin.id == NI_Sve2_Scatter16BitNarrowingNonTemporal) { - GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, tempReg, op3Reg, 1, opt); + shift = 1; } else if (intrin.id == NI_Sve2_Scatter32BitNarrowingNonTemporal) { - GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, tempReg, op3Reg, 2, opt); + shift = 2; } - else if (intrin.id == NI_Sve2_ScatterNonTemporal) + else { - assert(emitActualTypeSize(intrin.baseType) == 8); - GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, tempReg, op3Reg, 3, opt); + assert(intrin.id == NI_Sve2_ScatterNonTemporal); + shift = 3; } - // op2Reg and op3Reg are swapped + // SVE2 instruction only directly support byte offsets. Convert indices to bytes. + GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, tempReg, op3Reg, shift, opt); + GetEmitter()->emitIns_R_R_R_R(ins, emitSize, op4Reg, op1Reg, tempReg, op2Reg, opt); } else @@ -2468,6 +2470,21 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Sve2_Scatter8BitNarrowingNonTemporal: + if (!varTypeIsSIMD(intrin.op2->gtType)) + { + // Scatter...(Vector mask, T* address, Vector indices, Vector data) + assert(intrin.numOperands == 4); + GetEmitter()->emitIns_R_R_R_R(ins, emitSize, op4Reg, op1Reg, op3Reg, op2Reg, opt); + } + else + { + // Scatter...(Vector mask, Vector addresses, Vector data) + assert(intrin.numOperands == 3); + GetEmitter()->emitIns_R_R_R_R(ins, emitSize, op3Reg, op1Reg, op2Reg, REG_ZR, opt); + } + break; + case NI_Sve2_Scatter16BitWithByteOffsetsNarrowingNonTemporal: case NI_Sve2_Scatter32BitWithByteOffsetsNarrowingNonTemporal: case NI_Sve2_Scatter8BitWithByteOffsetsNarrowingNonTemporal: From df3ddaac88668df73c96a526f0faf43bea366399 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 26 Feb 2026 10:30:41 +0000 Subject: [PATCH 3/5] don't reserve a register when not required --- src/coreclr/jit/lsraarm64.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 999ae592e68987..9e3979ac161201 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1371,7 +1371,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou case NI_Sve2_Scatter16BitNarrowingNonTemporal: case NI_Sve2_Scatter32BitNarrowingNonTemporal: case NI_Sve2_ScatterNonTemporal: - buildInternalFloatRegisterDefForNode(intrinsicTree, internalFloatRegCandidates()); + if (!varTypeIsSIMD(intrin.op2->gtType)) + { + buildInternalFloatRegisterDefForNode(intrinsicTree, internalFloatRegCandidates()); + } break; default: From 7a0a8da1be64b8c1b31ff34dfa36c5f00073fa19 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 26 Feb 2026 10:34:43 +0000 Subject: [PATCH 4/5] fix comment --- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 3b0f5970cd29e3..478e5011ab3934 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2455,7 +2455,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) shift = 3; } - // SVE2 instruction only directly support byte offsets. Convert indices to bytes. + // The SVE2 instructions only support byte offsets. Convert indices to bytes. 
GetEmitter()->emitIns_R_R_I(INS_sve_lsl, emitSize, tempReg, op3Reg, shift, opt); GetEmitter()->emitIns_R_R_R_R(ins, emitSize, op4Reg, op1Reg, tempReg, op2Reg, opt); From b918db53746da4cbf66b17845fe0cb9f0eb3500d Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 26 Feb 2026 10:57:47 +0000 Subject: [PATCH 5/5] formatting --- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 478e5011ab3934..e7c5a098870bb9 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2438,7 +2438,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) assert(intrin.numOperands == 4); - ssize_t shift = 0; + ssize_t shift = 0; regNumber tempReg = internalRegisters.GetSingle(node, RBM_ALLFLOAT); if (intrin.id == NI_Sve2_Scatter16BitNarrowingNonTemporal)