Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/coreclr/inc/clrconfigvalues.h
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableHWIntrinsic, W("EnableHWIntri
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX, W("EnableAVX"), 1, "Allows AVX and dependent hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX2, W("EnableAVX2"), 1, "Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512, W("EnableAVX512"), 1, "Allows AVX512 F+BW+CD+DQ+VL and depdendent hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512BMM, W("EnableAVX512BMM"), 1, "Allows AVX512BMM and depdendent hardware intrinsics to be disabled")

RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512v2, W("EnableAVX512v2"), 1, "Allows AVX512 IFMA+VBMI and depdendent hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512v3, W("EnableAVX512v3"), 1, "Allows AVX512 BITALG+VBMI2+VNNI+VPOPCNTDQ and depdendent hardware intrinsics to be disabled")
Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/inc/corinfoinstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ enum CORINFO_InstructionSet
InstructionSet_SHA_X64=43,
InstructionSet_WAITPKG_X64=44,
InstructionSet_X86Serialize_X64=45,
InstructionSet_AVX512BMM=46,
InstructionSet_AVX512BMM_X64=47,
Comment on lines +100 to +101
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this may have been added manually.

Instead, you want to modify src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt and then run src/coreclr/tools/Common/JitInterface/ThunkGenerator/gen.bat.

#endif // TARGET_AMD64
#ifdef TARGET_X86
InstructionSet_X86Base=1,
Expand Down Expand Up @@ -144,6 +146,8 @@ enum CORINFO_InstructionSet
InstructionSet_SHA_X64=43,
InstructionSet_WAITPKG_X64=44,
InstructionSet_X86Serialize_X64=45,
InstructionSet_AVX512BMM=46,
InstructionSet_AVX512BMM_X64=47,
#endif // TARGET_X86

};
Expand Down Expand Up @@ -295,6 +299,8 @@ struct CORINFO_InstructionSetFlags
AddInstructionSet(InstructionSet_WAITPKG_X64);
if (HasInstructionSet(InstructionSet_X86Serialize))
AddInstructionSet(InstructionSet_X86Serialize_X64);
if (HasInstructionSet(InstructionSet_AVX512BMM))
AddInstructionSet(InstructionSet_AVX512BMM);
#endif // TARGET_AMD64
#ifdef TARGET_X86
#endif // TARGET_X86
Expand Down
23 changes: 20 additions & 3 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ bool emitter::Is3OpRmwInstruction(instruction ins)
{
return ((ins >= FIRST_FMA_INSTRUCTION) && (ins <= LAST_FMA_INSTRUCTION)) ||
(IsAVXVNNIFamilyInstruction(ins)) ||
((ins >= FIRST_AVX512BMM_INSTRUCTION) && (ins <= LAST_AVX512BMM_INSTRUCTION)) ||
((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION));
}
}
Expand Down Expand Up @@ -3104,8 +3105,9 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
// 0x0000RM11.
leadingBytes = (code >> 16) & 0xFF;
assert(leadingBytes == 0x0F ||
(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) && leadingBytes >= 0x00 &&
leadingBytes <= 0x07) ||
((m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) ||
m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512BMM)) &&
leadingBytes >= 0x00 && leadingBytes <= 0x07) ||
(IsApxExtendedEvexInstruction(ins) && leadingBytes == 0));
code &= 0xFFFF;
}
Expand Down Expand Up @@ -3164,10 +3166,16 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
break;
}

case 0x06:
{
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512BMM));
evexPrefix |= (0x6 << 16);
break;
}

case 0x01:
case 0x02:
case 0x03:
case 0x06:
case 0x07:
default:
{
Expand Down Expand Up @@ -21377,6 +21385,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;
}

case INS_vbmacor16x16x16:
case INS_vbmacxor16x16x16:
case INS_vbitrev:
{
result.insLatency = PERFSCORE_LATENCY_1C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}

default:
{
assert((unsigned)ins < ArrLen(insThroughputInfos));
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,7 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = {
{ NI_Illegal, NI_Illegal }, // SHA_X64
{ NI_Illegal, NI_Illegal }, // WAITPKG_X64
{ NI_Illegal, NI_Illegal }, // X86Serialize_X64
{ FIRST_NI_AVX512BMM, LAST_NI_AVX512BMM }, // AVX512BMM
#elif defined (TARGET_ARM64)
{ FIRST_NI_ArmBase, LAST_NI_ArmBase }, // ArmBase
{ FIRST_NI_AdvSimd, LAST_NI_AdvSimd }, // AdvSimd
Expand Down
7 changes: 7 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction:
case NI_AVX512BMM_BitMultiplyMatrix16x16WithXorReduction:
Comment on lines +914 to +915
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These could be put in the same grouping as AvxVnni just above, right?

Looks like they're identical and should also have the same assertions.

{
genHWIntrinsic_R_R_R_RM(ins, simdSize, targetReg, op1Reg, op2Reg, op3, instOptions);
break;
}

default:
{
unreached();
Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -1092,6 +1092,12 @@ HARDWARE_INTRINSIC(AVX10v2, MultipleSumAbsoluteDifferences,
HARDWARE_INTRINSIC(AVX10v2, StoreScalar, 16, 2, {INS_invalid, INS_invalid, INS_vmovw_simd, INS_vmovw_simd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
#define LAST_NI_AVX10v2 NI_AVX10v2_StoreScalar

#define FIRST_NI_AVX512BMM NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction
HARDWARE_INTRINSIC(AVX512BMM, BitMultiplyMatrix16x16WithOrReduction, -1, -1, {INS_invalid, INS_invalid, INS_vbmacor16x16x16, INS_vbmacor16x16x16, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX512BMM, BitMultiplyMatrix16x16WithXorReduction, -1, -1, {INS_invalid, INS_invalid, INS_vbmacxor16x16x16, INS_vbmacxor16x16x16, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
Comment on lines +1096 to +1097
Copy link
Member

@tannergooding tannergooding Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two are "RMW" (read-modify-write) and so should be marked as IsRmwIntrinsic and have some specialized handling like the Fma and AvxVnni intrinsics have (in lsra and lower). This will ensure better codegen, since the first operand is both a source and a destination as far as the register allocator is concerned.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd also expect some minimal handling in lower to handle the fact that left and right are commutative and so either can be the "from memory" operand.

HARDWARE_INTRINSIC(AVX512BMM, ReverseBits, -1, -1, {INS_invalid, INS_vbitrev, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
#define LAST_NI_AVX512BMM NI_AVX512BMM_ReverseBits

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,10 @@ CORINFO_InstructionSet Compiler::lookupInstructionSet(const char* className)
{
return InstructionSet_AVX512;
}
else if (strcmp(className + 7, "mm") == 0)
{
return InstructionSet_AVX512BMM;
}
}
else if ((strcmp(className + 6, "CD") == 0) || (strcmp(className + 6, "DQ") == 0))
{
Expand Down
7 changes: 7 additions & 0 deletions src/coreclr/jit/instrsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,12 @@ INST3(vucomish, "vucomish", IUM_RD, BAD_CODE, BAD_
INST3(vp2intersectd, "vp2intersectd", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0x68), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Compute Intersection Between DWORDS to a Pair of Mask Registers
INST3(vp2intersectq, "vp2intersectq", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0x68), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Compute Intersection Between QWORDS to a Pair of Mask Registers

#define FIRST_AVX512BMM_INSTRUCTION INS_vbmacor16x16x16
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably put these below the AvxVnni group, just so it's not splitting the Avx10v2 and Avx10v2.x64 group.

INST3(vbmacor16x16x16, "vbmacor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Bit matrix multiply (16x16) with OR reduction
INST3(vbmacxor16x16x16, "vbmacxor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Bit matrix multiply (16x16) with XOR reduction
INST3(vbitrev, "vbitrev", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x81), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX ) // Reverse bits
#define LAST_AVX512BMM_INSTRUCTION INS_vbitrev

// Instructions for AVX10v2
INST3(vcomxsd, "vcomxsd", IUM_RD, BAD_CODE, BAD_CODE, SSEFLT(0x2f), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Compare double precision floating point values and set flags
INST3(vcomxss, "vcomxss", IUM_RD, BAD_CODE, BAD_CODE, SSEDBL(0x2f), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Compare single precision floating point values and set flags
Expand Down Expand Up @@ -1131,6 +1137,7 @@ INST3(vucomxsd, "vucomxsd", IUM_RD, BAD_CODE, BAD_
INST3(vucomxss, "vucomxss", IUM_RD, BAD_CODE, BAD_CODE, SSEDBL(0x2E), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Perform an unordered compare of single precision floating point values and set flags
#define LAST_AVX512_INSTRUCTION INS_vucomxss


// id nm um mr mi rm lat tp tt flags
#define FIRST_APX_INSTRUCTION INS_ccmpo
#define FIRST_CCMP_INSTRUCTION INS_ccmpo
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/jitconfigvalues.h
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,7 @@ RELEASE_CONFIG_INTEGER(EnableHWIntrinsic, "EnableHWIntrinsic",
RELEASE_CONFIG_INTEGER(EnableAVX, "EnableAVX", 1) // Allows AVX and dependent hardware intrinsics to be disabled
RELEASE_CONFIG_INTEGER(EnableAVX2, "EnableAVX2", 1) // Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled
RELEASE_CONFIG_INTEGER(EnableAVX512, "EnableAVX512", 1) // Allows AVX512 F+BW+CD+DQ+VL and depdendent hardware intrinsics to be disabled
RELEASE_CONFIG_INTEGER(EnableAVX512BMM, "EnableAVX512BMM", 1) // Allows AVX512BMM and dependent hardware intrinsics to be disabled

RELEASE_CONFIG_INTEGER(EnableAVX512v2, "EnableAVX512v2", 1) // Allows AVX512 IFMA+VBMI and depdendent hardware intrinsics to be disabled
RELEASE_CONFIG_INTEGER(EnableAVX512v3, "EnableAVX512v3", 1) // Allows AVX512 BITALG+VBMI2+VNNI+VPOPCNTDQ and depdendent hardware intrinsics to be disabled
Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10281,7 +10281,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
default:
{
assert((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem) ||
(intrinsicId >= FIRST_NI_AVXVNNI && intrinsicId <= LAST_NI_AVXVNNIINT_V512));
(intrinsicId >= FIRST_NI_AVXVNNI && intrinsicId <= LAST_NI_AVXVNNIINT_V512) ||
(intrinsicId >= FIRST_NI_AVX512BMM && intrinsicId <= LAST_NI_AVX512BMM));
TryMakeSrcContainedOrRegOptional(node, op3);
break;
}
Expand Down
5 changes: 5 additions & 0 deletions src/coreclr/vm/codeman.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,11 @@ void EEJitManager::SetCpuInfo()
CPUCompileFlags.Set(InstructionSet_AVX512v3);
}

if (((cpuFeatures & XArchIntrinsicConstants_AVX512Bmm) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX512BMM))
{
CPUCompileFlags.Set(InstructionSet_AVX512BMM);
}

if (((cpuFeatures & XArchIntrinsicConstants_Avx10v1) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX10v1))
{
CPUCompileFlags.Set(InstructionSet_AVX10v1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2729,6 +2729,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx2$(NotSupportedOnMono).cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx10v1$(NotSupportedOnMono).cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx10v2$(NotSupportedOnMono).cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512Bmm$(NotSupportedOnMono).cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\AvxVnniInt8$(NotSupportedOnMono).cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\AvxVnniInt16$(NotSupportedOnMono).cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512BW$(NotSupportedOnMono).cs" />
Expand Down Expand Up @@ -2765,6 +2766,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512F.PlatformNotSupported.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx10v1.PlatformNotSupported.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx10v2.PlatformNotSupported.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512Bmm.PlatformNotSupported.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\AvxVnniInt8.PlatformNotSupported.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\AvxVnniInt16.PlatformNotSupported.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512Vbmi.PlatformNotSupported.cs" />
Expand Down
Loading
Loading