dotnet · alexcovington · Sep 29, 2025 · Feb 2, 2026 · Feb 19, 2026 · Feb 19, 2026
diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h
@@ -678,6 +678,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableHWIntrinsic,            W("EnableHWIntri
 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX,                    W("EnableAVX"),                 1, "Allows AVX and dependent hardware intrinsics to be disabled")
 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX2,                   W("EnableAVX2"),                1, "Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled")
 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512,                 W("EnableAVX512"),              1, "Allows AVX512 F+BW+CD+DQ+VL and depdendent hardware intrinsics to be disabled")
+RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512BMM,              W("EnableAVX512BMM"),           1, "Allows AVX512BMM and depdendent hardware intrinsics to be disabled")
 
 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512v2,               W("EnableAVX512v2"),            1, "Allows AVX512 IFMA+VBMI and depdendent hardware intrinsics to be disabled")
 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512v3,               W("EnableAVX512v3"),            1, "Allows AVX512 BITALG+VBMI2+VNNI+VPOPCNTDQ and depdendent hardware intrinsics to be disabled")

diff --git a/src/coreclr/inc/corinfoinstructionset.h b/src/coreclr/inc/corinfoinstructionset.h
@@ -97,6 +97,8 @@ enum CORINFO_InstructionSet
     InstructionSet_SHA_X64=43,
     InstructionSet_WAITPKG_X64=44,
     InstructionSet_X86Serialize_X64=45,
+    InstructionSet_AVX512BMM=46,
+    InstructionSet_AVX512BMM_X64=47,
 #endif // TARGET_AMD64
 #ifdef TARGET_X86
     InstructionSet_X86Base=1,
@@ -144,6 +146,8 @@ enum CORINFO_InstructionSet
     InstructionSet_SHA_X64=43,
     InstructionSet_WAITPKG_X64=44,
     InstructionSet_X86Serialize_X64=45,
+    InstructionSet_AVX512BMM=46,
+    InstructionSet_AVX512BMM_X64=47,
 #endif // TARGET_X86
 
 };
@@ -295,6 +299,8 @@ struct CORINFO_InstructionSetFlags
             AddInstructionSet(InstructionSet_WAITPKG_X64);
         if (HasInstructionSet(InstructionSet_X86Serialize))
             AddInstructionSet(InstructionSet_X86Serialize_X64);
+        if (HasInstructionSet(InstructionSet_AVX512BMM))
+            AddInstructionSet(InstructionSet_AVX512BMM_X64);
 #endif // TARGET_AMD64
 #ifdef TARGET_X86
 #endif // TARGET_X86

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
@@ -128,6 +128,7 @@ bool emitter::Is3OpRmwInstruction(instruction ins)
         {
             return ((ins >= FIRST_FMA_INSTRUCTION) && (ins <= LAST_FMA_INSTRUCTION)) ||
                    (IsAVXVNNIFamilyInstruction(ins)) ||
+                   ((ins >= FIRST_AVX512BMM_INSTRUCTION) && (ins <= LAST_AVX512BMM_INSTRUCTION)) ||
                    ((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION));
         }
     }
@@ -3104,8 +3105,9 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
         // 0x0000RM11.
         leadingBytes = (code >> 16) & 0xFF;
         assert(leadingBytes == 0x0F ||
-               (m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) && leadingBytes >= 0x00 &&
-                leadingBytes <= 0x07) ||
+               ((m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) ||
+                 m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512BMM)) &&
+                leadingBytes >= 0x00 && leadingBytes <= 0x07) ||
                (IsApxExtendedEvexInstruction(ins) && leadingBytes == 0));
         code &= 0xFFFF;
     }
@@ -3164,10 +3166,16 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
             break;
         }
 
+        case 0x06:
+        {
+            assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512BMM));
+            evexPrefix |= (0x6 << 16);
+            break;
+        }
+
         case 0x01:
         case 0x02:
         case 0x03:
-        case 0x06:
         case 0x07:
         default:
         {
@@ -21377,6 +21385,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
         }
 
+        case INS_vbmacor16x16x16:
+        case INS_vbmacxor16x16x16:
+        case INS_vbitrev:
+        {
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+        }
+
         default:
         {
             assert((unsigned)ins < ArrLen(insThroughputInfos));

diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp
@@ -987,6 +987,8 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = {
     { NI_Illegal, NI_Illegal },                                 //      SHA_X64
     { NI_Illegal, NI_Illegal },                                 //      WAITPKG_X64
     { NI_Illegal, NI_Illegal },                                 //      X86Serialize_X64
+    { FIRST_NI_AVX512BMM, LAST_NI_AVX512BMM },                  // AVX512BMM
+    { NI_Illegal, NI_Illegal },                                 //      AVX512BMM_X64
 #elif defined (TARGET_ARM64)
     { FIRST_NI_ArmBase, LAST_NI_ArmBase },                      // ArmBase
     { FIRST_NI_AdvSimd, LAST_NI_AdvSimd },                      // AdvSimd

diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -911,6 +911,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                             break;
                         }
 
+                        case NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction:
+                        case NI_AVX512BMM_BitMultiplyMatrix16x16WithXorReduction:
+                        {
+                            genHWIntrinsic_R_R_R_RM(ins, simdSize, targetReg, op1Reg, op2Reg, op3, instOptions);
+                            break;
+                        }
+
                         default:
                         {
                             unreached();

diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -1092,6 +1092,12 @@ HARDWARE_INTRINSIC(AVX10v2,         MultipleSumAbsoluteDifferences,
 HARDWARE_INTRINSIC(AVX10v2,         StoreScalar,                                                     16,              2,     {INS_invalid,           INS_invalid,            INS_vmovw_simd,         INS_vmovw_simd,         INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},               HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
 #define LAST_NI_AVX10v2             NI_AVX10v2_StoreScalar
 
+#define FIRST_NI_AVX512BMM          NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction
+HARDWARE_INTRINSIC(AVX512BMM,       BitMultiplyMatrix16x16WithOrReduction,                           -1,             -1,     {INS_invalid,           INS_invalid,            INS_vbmacor16x16x16,    INS_vbmacor16x16x16,    INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},               HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512BMM,       BitMultiplyMatrix16x16WithXorReduction,                          -1,             -1,     {INS_invalid,           INS_invalid,            INS_vbmacxor16x16x16,   INS_vbmacxor16x16x16,   INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},               HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512BMM,       ReverseBits,                                                     -1,             -1,     {INS_invalid,           INS_vbitrev,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},               HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+#define LAST_NI_AVX512BMM           NI_AVX512BMM_ReverseBits
+
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //                 ISA              Function name                                                    SIMD size        NumArg                                                                                                                            Instructions                                                                                                                  Category                            Flags
 //                                                                                                                           {TYP_BYTE,              TYP_UBYTE,              TYP_SHORT,              TYP_USHORT,             TYP_INT,                TYP_UINT,               TYP_LONG,               TYP_ULONG,              TYP_FLOAT,              TYP_DOUBLE}

diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -215,6 +215,10 @@ CORINFO_InstructionSet Compiler::lookupInstructionSet(const char* className)
                     {
                         return InstructionSet_AVX512;
                     }
+                    else if (strcmp(className + 7, "mm") == 0)
+                    {
+                        return InstructionSet_AVX512BMM;
+                    }
                 }
                 else if ((strcmp(className + 6, "CD") == 0) || (strcmp(className + 6, "DQ") == 0))
                 {

diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
@@ -1097,6 +1097,12 @@ INST3(vucomish,         "vucomish",         IUM_RD, BAD_CODE,               BAD_
 INST3(vp2intersectd,    "vp2intersectd",    IUM_WR, BAD_CODE,               BAD_CODE,     PSSE38(0xF2, 0x68),            ILLEGAL,           ILLEGAL,    INS_TT_FULL,                         Input_32Bit    | KMask_Base4     | REX_W0                       | Encoding_EVEX)                                                                                                                                  // Compute Intersection Between DWORDS to a Pair of Mask Registers
 INST3(vp2intersectq,    "vp2intersectq",    IUM_WR, BAD_CODE,               BAD_CODE,     PSSE38(0xF2, 0x68),            ILLEGAL,           ILLEGAL,    INS_TT_FULL,                         Input_64Bit    | KMask_Base2     | REX_W1                       | Encoding_EVEX)                                                                                                                                  // Compute Intersection Between QWORDS to a Pair of Mask Registers
 
+#define FIRST_AVX512BMM_INSTRUCTION INS_vbmacor16x16x16
+INST3(vbmacor16x16x16,  "vbmacor16x16x16",  IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLTMAP(0x06, 0x80),         ILLEGAL,           ILLEGAL,    INS_TT_FULL_MEM,                     Input_16Bit                      | REX_W0                       | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                                                                  // 16x16 Bit Matrix Multiply with OR Reduction
+INST3(vbmacxor16x16x16, "vbmacxor16x16x16", IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLTMAP(0x06, 0x80),         ILLEGAL,           ILLEGAL,    INS_TT_FULL_MEM,                     Input_16Bit                      | REX_W1                       | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                                                                  // 16x16 Bit Matrix Multiply with XOR Reduction
+INST3(vbitrev,          "vbitrev",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLTMAP(0x06, 0x81),         ILLEGAL,           ILLEGAL,    INS_TT_FULL_MEM,                     Input_8Bit     | KMask_Base16    | REX_W0                       | Encoding_EVEX )                                                                                                                                  // Reverse Bits (8-bit input elements)
+#define LAST_AVX512BMM_INSTRUCTION INS_vbitrev
+
 // Instructions for AVX10v2
 INST3(vcomxsd,          "vcomxsd",          IUM_RD, BAD_CODE,               BAD_CODE,     SSEFLT(0x2f),                  3C,                1C,         INS_TT_TUPLE1_SCALAR,                Input_64Bit                      | REX_W1                       | Encoding_EVEX    | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF)                                                       // Compare double precision floating point values and set flags
 INST3(vcomxss,          "vcomxss",          IUM_RD, BAD_CODE,               BAD_CODE,     SSEDBL(0x2f),                  3C,                1C,         INS_TT_TUPLE1_SCALAR,                Input_32Bit                      | REX_W0                       | Encoding_EVEX    | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF)                                                       // Compare single precision floating point values and set flags
@@ -1131,6 +1137,7 @@ INST3(vucomxsd,         "vucomxsd",         IUM_RD, BAD_CODE,               BAD_
 INST3(vucomxss,         "vucomxss",         IUM_RD, BAD_CODE,               BAD_CODE,     SSEDBL(0x2E),                  3C,                1C,         INS_TT_TUPLE1_SCALAR,                Input_32Bit                      | REX_W0                       | Encoding_EVEX    | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF)                                                       // Perform an unordered compare of single precision floating point values and set flags
 #define LAST_AVX512_INSTRUCTION INS_vucomxss
 
+
 //    id                nm                  um      mr            mi            rm                                       lat                tp          tt              flags
 #define FIRST_APX_INSTRUCTION INS_ccmpo
 #define FIRST_CCMP_INSTRUCTION INS_ccmpo

diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
@@ -400,6 +400,7 @@ RELEASE_CONFIG_INTEGER(EnableHWIntrinsic,           "EnableHWIntrinsic",
 RELEASE_CONFIG_INTEGER(EnableAVX,                   "EnableAVX",                 1) // Allows AVX and dependent hardware intrinsics to be disabled
 RELEASE_CONFIG_INTEGER(EnableAVX2,                  "EnableAVX2",                1) // Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled
 RELEASE_CONFIG_INTEGER(EnableAVX512,                "EnableAVX512",              1) // Allows AVX512 F+BW+CD+DQ+VL and depdendent hardware intrinsics to be disabled
+RELEASE_CONFIG_INTEGER(EnableAVX512BMM,             "EnableAVX512BMM",           1) // Allows AVX512BMM and dependent hardware intrinsics to be disabled
 
 RELEASE_CONFIG_INTEGER(EnableAVX512v2,              "EnableAVX512v2",            1) // Allows AVX512 IFMA+VBMI and depdendent hardware intrinsics to be disabled
 RELEASE_CONFIG_INTEGER(EnableAVX512v3,              "EnableAVX512v3",            1) // Allows AVX512 BITALG+VBMI2+VNNI+VPOPCNTDQ and depdendent hardware intrinsics to be disabled

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
@@ -10281,7 +10281,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                             default:
                             {
                                 assert((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem) ||
-                                       (intrinsicId >= FIRST_NI_AVXVNNI && intrinsicId <= LAST_NI_AVXVNNIINT_V512));
+                                       (intrinsicId >= FIRST_NI_AVXVNNI && intrinsicId <= LAST_NI_AVXVNNIINT_V512) ||
+                                       (intrinsicId >= FIRST_NI_AVX512BMM && intrinsicId <= LAST_NI_AVX512BMM));
                                 TryMakeSrcContainedOrRegOptional(node, op3);
                                 break;
                             }

diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp
@@ -1297,6 +1297,11 @@ void EEJitManager::SetCpuInfo()
         CPUCompileFlags.Set(InstructionSet_AVX512v3);
     }
 
+    if (((cpuFeatures & XArchIntrinsicConstants_AVX512Bmm) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX512BMM))
+    {
+        CPUCompileFlags.Set(InstructionSet_AVX512BMM);
+    }
+
     if (((cpuFeatures & XArchIntrinsicConstants_Avx10v1) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX10v1))
     {
         CPUCompileFlags.Set(InstructionSet_AVX10v1);

diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
@@ -2729,6 +2729,7 @@
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx2$(NotSupportedOnMono).cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx10v1$(NotSupportedOnMono).cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx10v2$(NotSupportedOnMono).cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512Bmm$(NotSupportedOnMono).cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\AvxVnniInt8$(NotSupportedOnMono).cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\AvxVnniInt16$(NotSupportedOnMono).cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512BW$(NotSupportedOnMono).cs" />
@@ -2765,6 +2766,7 @@
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512F.PlatformNotSupported.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx10v1.PlatformNotSupported.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx10v2.PlatformNotSupported.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512Bmm.PlatformNotSupported.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\AvxVnniInt8.PlatformNotSupported.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\AvxVnniInt16.PlatformNotSupported.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Intrinsics\X86\Avx512Vbmi.PlatformNotSupported.cs" />