-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Add AVX512 BMM API #124804
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add AVX512 BMM API #124804
Changes from all commits
02cd4a8
e1a3773
bc2b3c2
7d6fd4d
f377ada
1fc5374
876458c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -911,6 +911,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) | |
| break; | ||
| } | ||
|
|
||
| case NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction: | ||
| case NI_AVX512BMM_BitMultiplyMatrix16x16WithXorReduction: | ||
|
Comment on lines
+914
to
+915
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These could be put in the same grouping as an existing case (the case name was lost in extraction). It looks like they're identical, and they should also have the same assertions. |
||
| { | ||
| genHWIntrinsic_R_R_R_RM(ins, simdSize, targetReg, op1Reg, op2Reg, op3, instOptions); | ||
| break; | ||
| } | ||
|
|
||
| default: | ||
| { | ||
| unreached(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1092,6 +1092,12 @@ HARDWARE_INTRINSIC(AVX10v2, MultipleSumAbsoluteDifferences, | |
| HARDWARE_INTRINSIC(AVX10v2, StoreScalar, 16, 2, {INS_invalid, INS_invalid, INS_vmovw_simd, INS_vmovw_simd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) | ||
| #define LAST_NI_AVX10v2 NI_AVX10v2_StoreScalar | ||
|
|
||
| #define FIRST_NI_AVX512BMM NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction | ||
| HARDWARE_INTRINSIC(AVX512BMM, BitMultiplyMatrix16x16WithOrReduction, -1, -1, {INS_invalid, INS_invalid, INS_vbmacor16x16x16, INS_vbmacor16x16x16, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) | ||
| HARDWARE_INTRINSIC(AVX512BMM, BitMultiplyMatrix16x16WithXorReduction, -1, -1, {INS_invalid, INS_invalid, INS_vbmacxor16x16x16, INS_vbmacxor16x16x16, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) | ||
|
Comment on lines
+1096
to
+1097
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These two are "RMW" (read-modify-write) and so should be marked as such (the flag name was lost in extraction).
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd also expect some minimal handling in lowering to handle the fact that […] (the remainder of this comment was lost in extraction) |
||
| HARDWARE_INTRINSIC(AVX512BMM, ReverseBits, -1, -1, {INS_invalid, INS_vbitrev, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) | ||
| #define LAST_NI_AVX512BMM NI_AVX512BMM_ReverseBits | ||
|
|
||
| // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** | ||
| // ISA Function name SIMD size NumArg Instructions Category Flags | ||
| // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1097,6 +1097,12 @@ INST3(vucomish, "vucomish", IUM_RD, BAD_CODE, BAD_ | |
| INST3(vp2intersectd, "vp2intersectd", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0x68), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Compute Intersection Between DWORDS to a Pair of Mask Registers | ||
| INST3(vp2intersectq, "vp2intersectq", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0x68), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Compute Intersection Between QWORDS to a Pair of Mask Registers | ||
|
|
||
| #define FIRST_AVX512BMM_INSTRUCTION INS_vbmacor16x16x16 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should probably put these below the […] (the remainder of this comment was lost in extraction) |
||
| INST3(vbmacor16x16x16, "vbmacor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Bit Multiply 16x16 Matrices with OR Reduction | ||
| INST3(vbmacxor16x16x16, "vbmacxor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Bit Multiply 16x16 Matrices with XOR Reduction | ||
| INST3(vbitrev, "vbitrev", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x81), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX ) // Reverse Bits | ||
| #define LAST_AVX512BMM_INSTRUCTION INS_vbitrev | ||
|
|
||
| // Instructions for AVX10v2 | ||
| INST3(vcomxsd, "vcomxsd", IUM_RD, BAD_CODE, BAD_CODE, SSEFLT(0x2f), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Compare double precision floating point values and set flags | ||
| INST3(vcomxss, "vcomxss", IUM_RD, BAD_CODE, BAD_CODE, SSEDBL(0x2f), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Compare single precision floating point values and set flags | ||
|
|
@@ -1131,6 +1137,7 @@ INST3(vucomxsd, "vucomxsd", IUM_RD, BAD_CODE, BAD_ | |
| INST3(vucomxss, "vucomxss", IUM_RD, BAD_CODE, BAD_CODE, SSEDBL(0x2E), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Perform an unordered compare of single precision floating point values and set flags | ||
| #define LAST_AVX512_INSTRUCTION INS_vucomxss | ||
|
|
||
|
|
||
| // id nm um mr mi rm lat tp tt flags | ||
| #define FIRST_APX_INSTRUCTION INS_ccmpo | ||
| #define FIRST_CCMP_INSTRUCTION INS_ccmpo | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like this may have been added manually.
Instead, you should modify
`src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt` and then run `src/coreclr/tools/Common/JitInterface/ThunkGenerator/gen.bat`.