Skip to content

Latest commit

 

History

History
205 lines (192 loc) · 53.3 KB

File metadata and controls

205 lines (192 loc) · 53.3 KB

KleidiAI micro-kernel tables

This page provides an overview of the micro-kernels offered by KleidiAI.

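Micro-kernel categories:

- Matmul micro-kernels
- Indirect matmul micro-kernels
- Depthwise convolution micro-kernels
- Packing micro-kernels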

Matmul micro-kernels

Output type Output quantization LHS type RHS type LHS quantization RHS quantization Block size SIMD Feature Uarch Micro-kernel Packing micro-kernels
bf16 - i8 i4 dimensionwise asymmetric blockwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_bf16_neon
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
bf16 - i8 i4 dimensionwise asymmetric channelwise symmetric 1x8 Advanced SIMD dotprod - kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_bf16_neon
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0, kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0
bf16 - i8 i4 dimensionwise asymmetric blockwise symmetric 16x4 Advanced SIMD i8mm - kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_bf16_neon
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
bf16 - i8 i4 dimensionwise asymmetric channelwise symmetric 8x8 Advanced SIMD i8mm - kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_bf16_neon
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0, kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0
f16 - bf16 bf16 - - 8x12 Advanced SIMD mmla - kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla LHS: kai_lhs_pack_bf16p8x4_f16_neon, kai_lhs_quant_pack_bf16p8x4_f32_neon
RHS: kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon, kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon
f16 - f16 f16 - - 6x16x8 Advanced SIMD mla - kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla RHS: kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon
f16 - f16 f16 - - 1x16vl SME2 dot - kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot RHS: kai_rhs_pack_kxn_x16p2vlx2b_x16_x16_sme
f16 - f16 f16 - - 1x8vl SME mla - kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla RHS: kai_rhs_pack_kxn_x16p2vlx2b_x16_x16_sme, kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme
f16 - f16 f16 - - 6x32 Advanced SIMD mla - kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla RHS: kai_rhs_pack_kxn_x16p32x1b_x16_x16_neon
f16 - f16 f16 - - 6x32 Advanced SIMD mla cortexa55 kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55 RHS: kai_rhs_pack_kxn_x16p32x1b_x16_x16_neon
f16 - f16 f16 - - 2vlx2vl SME2 mopa - kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa LHS: kai_lhs_pack_x16p2vlx2_x16_sme
RHS: kai_rhs_pack_kxn_x16p2vlx2b_x16_x16_sme, kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme
f16 - f16 f16 - - 2vlx2vl SME mopa - kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa LHS: kai_lhs_pack_x16p2vlx2_x16_sme
RHS: kai_rhs_pack_kxn_x16p2vlx2b_x16_x16_sme, kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme
f16 - i8 i4 dimensionwise asymmetric channelwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f16_neon
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0, kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0
f16 - i8 i8 dimensionwise asymmetric channelwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f16_neon
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f16 - i8 i4 dimensionwise asymmetric channelwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f16_neon
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0, kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0
f16 - i8 i8 dimensionwise asymmetric channelwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f16_neon
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f16 - i8 i4 dimensionwise asymmetric channelwise symmetric 16x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f16_neon
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0, kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0
f16 - i8 i8 dimensionwise asymmetric channelwise symmetric 16x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f16_neon
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f16 - i8 i4 dimensionwise asymmetric channelwise symmetric 16x4 Advanced SIMD i8mm - kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f16_neon
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0, kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0
f16 - i8 i8 dimensionwise asymmetric channelwise symmetric 16x4 Advanced SIMD i8mm - kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f16_neon
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f16 - i8 i8 dimensionwise asymmetric channelwise symmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f16_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f16 - i8 i8 dimensionwise asymmetric channelwise symmetric 1x4vl SME2 dot - kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f16 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon
RHS: kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s1s0_f32_f32_f32_neon, kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s0s1_f32_f32_f32_neon
f16 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 1x4vl SME2 dot - kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon
RHS: kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s1s0_f32_f32_f32_neon, kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s0s1_f32_f32_f32_neon
f16 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon
RHS: kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
f16 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon
RHS: kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
f16 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 8x4 Advanced SIMD dotprod - kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon
RHS: kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
f16 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 8x4 Advanced SIMD i8mm - kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon
RHS: kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
f32 - bf16 bf16 - - 1x36 Advanced SIMD dot - kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot LHS: kai_lhs_quant_pack_bf16p1x4_f32_neon
RHS: kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon
f32 - bf16 bf16 - - 2vlx2vl SME2 mopa - kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa LHS: kai_lhs_pack_bf16p2vlx2_f32_sme
RHS: kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme
f32 - bf16 bf16 - - 8x12 Advanced SIMD mmla - kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla -
f32 - f16 i4 - blockwise symmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa LHS: kai_lhs_pack_f16pmrx2_f32_neon
RHS: kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon
f32 - f32 f32 - - 1x16vl SME2 mla - kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla RHS: kai_rhs_pack_kxn_f32p16vlx1b_f32_f32_sme
f32 - f32 f32 - - 6x16 Advanced SIMD mla - kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla RHS: kai_rhs_pack_kxn_x32p16x1b_x32_x32_neon
f32 - f32 f32 - - 6x16 Advanced SIMD mla cortexa55 kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55 RHS: kai_rhs_pack_kxn_x32p16x1b_x32_x32_neon
f32 - f32 f32 - - 1x16vl SME2 mla - kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla RHS: kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme
f32 - f32 f32 - - 1x8vl SME mla - kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla RHS: kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme, kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme
f32 - f32 f32 - - 6x4vl SVE mla - kai_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla RHS: kai_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve
f32 - f32 f32 - - 6x8x4 Advanced SIMD mla - kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla RHS: kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon
f32 - f32 f32 - - 2vlx2vl SME mopa - kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa LHS: kai_lhs_pack_f32p2vlx1_f32_sme
RHS: kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme, kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme
f32 - f32 f32 - - - SME2 mopa - kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa LHS: kai_lhs_pack_f32p2vlx1_f32_sme
RHS: kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme
f32 - f32 f32 - - 4vsx4vs SME2 mopa - kai_matmul_clamp_f32_f32p4vsx1_f32p4vsx1bf32_4vsx4vs_sme2_mopa LHS: kai_matmul_pack_lhs_mxk_x32p4vsx1_x32_sme
RHS: kai_matmul_pack_rhs_kxn_x32p4vsx1bx32_x32_x32_sme, kai_matmul_pack_rhs_nxk_x32p4vsx1bx32_x32_x32_sme
f32 - f32 f32 - - 4vsx8vs SME2 mopa - kai_matmul_clamp_f32_f32p4vsx1_f32p4vsx1bf32_4vsx8vs_sme2_mopa LHS: kai_matmul_pack_lhs_mxk_x32p4vsx1_x32_sme
RHS: kai_matmul_pack_rhs_kxn_x32p4vsx1bx32_x32_x32_sme, kai_matmul_pack_rhs_nxk_x32p4vsx1bx32_x32_x32_sme
f32 - f32 f32 - - 4vsx16vs SME2 mopa - kai_matmul_clamp_f32_f32p4vsx1_f32p4vsx1bf32_4vsx16vs_sme2_mopa LHS: kai_matmul_pack_lhs_mxk_x32p4vsx1_x32_sme
RHS: kai_matmul_pack_rhs_kxn_x32p4vsx1bx32_x32_x32_sme, kai_matmul_pack_rhs_nxk_x32p4vsx1bx32_x32_x32_sme
f32 - f32 f32 - - 8vsx4vs SME2 mopa - kai_matmul_clamp_f32_f32p4vsx1_f32p4vsx1bf32_8vsx4vs_sme2_mopa LHS: kai_matmul_pack_lhs_mxk_x32p4vsx1_x32_sme
RHS: kai_matmul_pack_rhs_kxn_x32p4vsx1bx32_x32_x32_sme, kai_matmul_pack_rhs_nxk_x32p4vsx1bx32_x32_x32_sme
f32 - f32 f32 - - 8vsx8vs SME2 mopa - kai_matmul_clamp_f32_f32p4vsx1_f32p4vsx1bf32_8vsx8vs_sme2_mopa LHS: kai_matmul_pack_lhs_mxk_x32p4vsx1_x32_sme
RHS: kai_matmul_pack_rhs_kxn_x32p4vsx1bx32_x32_x32_sme, kai_matmul_pack_rhs_nxk_x32p4vsx1bx32_x32_x32_sme
f32 - f32 f32 - - 16vsx4vs SME2 mopa - kai_matmul_clamp_f32_f32p4vsx1_f32p4vsx1bf32_16vsx4vs_sme2_mopa LHS: kai_matmul_pack_lhs_mxk_x32p4vsx1_x32_sme
RHS: kai_matmul_pack_rhs_kxn_x32p4vsx1bx32_x32_x32_sme, kai_matmul_pack_rhs_nxk_x32p4vsx1bx32_x32_x32_sme
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f32_qai8dxp1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32ps1s0nrx4_qsu4c32s1s0_neon, kai_rhs_pack_kxn_qsi4c32ps1s0nrx4_qsu4c32s1s0_neon
f32 - i8 i8 dimensionwise asymmetric channelwise symmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f32 - i8 i8 dimensionwise asymmetric channelwise symmetric 1vlx4vl SME mopa - kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f32 - i8 u2 dimensionwise asymmetric channelwise symmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 1vlx4vl SME mopa - kai_matmul_clamp_f32_qai8dxp1vlx4_qsi4cxp4vlx4_1vlx4vl_sme_mopa LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 1x4vl SME2 dot - kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4vlx4_1x4vl_sme2_dot LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32ps1s0nrx4_qsu4c32s1s0_neon, kai_rhs_pack_kxn_qsi4c32ps1s0nrx4_qsu4c32s1s0_neon
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 1x8 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 1x4vl SME2 sdot - kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 1x4vl SME dot - kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme_dot LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0, kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0
f32 - i8 i8 dimensionwise asymmetric channelwise symmetric 1x4vl SME2 dot - kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f32 - i8 i8 dimensionwise asymmetric channelwise symmetric 1x4vl SME dot - kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f32 - i8 i8 dimensionwise asymmetric channelwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f32 - i8 u2 dimensionwise asymmetric channelwise symmetric 1x4vl SME2 dot - kai_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 1x4x32 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 1x8 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 1x8x32 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 1x4x32 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0, kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 1x8x32 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0, kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i8 dimensionwise asymmetric channelwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 16x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 4x8 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 8x8x32 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0, kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i8 dimensionwise asymmetric channelwise symmetric 16x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 16x4x32 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 8x4x32 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 4x8 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric blockwise symmetric 4x8x32 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0, kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 16x4x32 Advanced SIMD dotprod - kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0, kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 4x4x32 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0, kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 8x4x32 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0, kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 4x8x32 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0, kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i4 dimensionwise asymmetric channelwise symmetric 8x8x32 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0, kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
f32 - i8 i8 dimensionwise asymmetric channelwise symmetric 16x4 Advanced SIMD i8mm - kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm LHS: kai_lhs_quant_pack_qai8dxp_f32
RHS: kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon, kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon
RHS: kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s1s0_f32_f32_f32_neon, kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s0s1_f32_f32_f32_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 1vlx4vl SME2 mopa - kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa LHS: kai_lhs_quant_pack_qsi8d32p_f32_neon
RHS: kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 1x4vl SME2 dot - kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon
RHS: kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s1s0_f32_f32_f32_neon, kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s0s1_f32_f32_f32_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon
RHS: kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 1x4vl SME2 sdot - kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot LHS: kai_lhs_quant_pack_qsi8d32p_f32_neon
RHS: kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32p_f32
RHS: kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 1x8 SVE dotprod - kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p8x4_1x8_sve_dotprod LHS: kai_lhs_quant_pack_qsi8d32p_f32_neon
RHS: kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
f32 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 1x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon
RHS: kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 1x4x32 Advanced SIMD dotprod - kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32p_f32
RHS: kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 1x8 SVE dotprod - kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod LHS: kai_lhs_quant_pack_qsi8d32p_f32_neon
RHS: kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
f32 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 8x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon
RHS: kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 16x4 Advanced SIMD dotprod - kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod LHS: kai_lhs_quant_pack_qsi8d32p_f32
RHS: kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
f32 - i8 i4 blockwise per dimension symmetric blockwise asymmetric 8x4 Advanced SIMD i8mm - kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm LHS: kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon
RHS: kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 16x4 Advanced SIMD i8mm - kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm LHS: kai_lhs_quant_pack_qsi8d32p_f32
RHS: kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 8x4x32 Advanced SIMD i8mm - kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm LHS: kai_lhs_quant_pack_qsi8d32p_f32
RHS: kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
f32 - i8 i4 blockwise per dimension symmetric blockwise symmetric 16x8 SVE i8mm - kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm LHS: kai_lhs_quant_pack_qsi8d32p_f32_neon
RHS: kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
i8 tensorwise asymmetric i8 i8 tensorwise asymmetric channelwise symmetric 1x16vl SME2 dot - kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot RHS: kai_rhs_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme
i8 tensorwise asymmetric i8 i8 tensorwise asymmetric channelwise symmetric 2vlx2vl SME mopa - kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa LHS: kai_lhs_pack_x8p2vlx4_x8_sme
RHS: kai_rhs_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme
i8 tensorwise asymmetric i8 i8 tensorwise asymmetric channelwise symmetric 2vlx2vl SME2 mopa - kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa LHS: kai_lhs_pack_x8p2vlx4_x8_sme
RHS: kai_rhs_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme

Indirect matmul micro-kernels

Output type Output quantization LHS type RHS type LHS quantization RHS quantization Block size SIMD Feature Uarch Micro-kernel Packing micro-kernels
f16 - f16 f16 - - 2vlx2vl SME2 mopa - kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa LHS: kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme
RHS: kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme
f16 - f16 f16 - - 2vlx2vl SME mopa - kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa LHS: kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme
RHS: kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme
f32 - f32 f32 - - 6x4vl SVE mla - kai_imatmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla RHS: kai_rhs_imatmul_pack_kxn_x32p4vlx1b_x32_x32_sve
f32 - f32 f32 - - 2vlx2vl SME2 mopa - kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa LHS: kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme
RHS: kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme
f32 - f32 f32 - - 2vlx2vl SME mopa - kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa LHS: kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme
RHS: kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme
i8 tensorwise asymmetric i8 i8 tensorwise asymmetric channelwise symmetric 2vlx2vl SME mopa - kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa LHS: kai_lhs_imatmul_pack_x8p2vlx4_x8p_sme
RHS: kai_rhs_imatmul_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme
i8 tensorwise asymmetric i8 i8 tensorwise asymmetric channelwise symmetric 2vlx2vl SME2 mopa - kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa LHS: kai_lhs_imatmul_pack_x8p2vlx4_x8p_sme
RHS: kai_rhs_imatmul_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme

Depthwise convolution micro-kernels

Output type LHS type RHS type LHS quantization RHS quantization Filter size Block size SIMD Feature Uarch Micro-kernel Packing micro-kernels
f32 f32 f32 - - 3x3 stride 1 4 rows, planar SME2 mla - kai_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla RHS: kai_rhs_dwconv_pack_x32p1vlx1b_x32_x32_sme

Packing micro-kernels

Output type Output quantization Input type Input quantization Bias type Scale type Zero type SIMD Micro-kernel
bf16 - f16 - f16 - - Advanced SIMD kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon
bf16 - f16 - f32 - - Advanced SIMD kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon
bf16 - f32 - f32 - - Advanced SIMD kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon
bf16 - f32 - - - - Advanced SIMD kai_lhs_quant_pack_bf16p1x4_f32_neon
bf16 - f32 - - - - SME kai_lhs_pack_bf16p2vlx2_f32_sme
bf16 - f32 - x32 - - SME kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme
bf16 - f16 - - - - Advanced SIMD kai_lhs_pack_bf16p8x4_f16_neon
bf16 - f32 - - - - Advanced SIMD kai_lhs_quant_pack_bf16p8x4_f32_neon
f16 - f16 - f16 - - Advanced SIMD kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon
f16 - f32 - - - - Advanced SIMD kai_lhs_pack_f16pmrx2_f32_neon
f32 - f32 - f32 - - SME kai_rhs_pack_kxn_f32p16vlx1b_f32_f32_sme
f32 - f32 - - - - SME kai_lhs_pack_f32p2vlx1_f32_sme
f32 - f32 - f32 - - SME kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme
f32 - f32 - f32 - - SME kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme
f32 - f32 - f32 - - Advanced SIMD kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon
i4 blockwise asymmetric u4 blockwise asymmetric f32 f32 f32 Advanced SIMD kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon
i4 blockwise asymmetric u4 blockwise asymmetric f32 f32 f32 Advanced SIMD kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s0s1_f32_f32_f32_neon
i4 blockwise asymmetric u4 blockwise asymmetric f32 f32 f32 Advanced SIMD kai_rhs_pack_nxk_qai4c32ps1s0nrx4_qau4c32s1s0_f32_f32_f32_neon
i8 dimensionwise asymmetric bf16 - - - - Advanced SIMD kai_lhs_quant_pack_qai8dxp_bf16_neon
i8 dimensionwise asymmetric f16 - - - - Advanced SIMD kai_lhs_quant_pack_qai8dxp_f16_neon
i8 dimensionwise asymmetric f32 - - - - - kai_lhs_quant_pack_qai8dxp_f32
i4 blockwise symmetric u4 blockwise symmetric - bf16 - - kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0
i4 blockwise symmetric u4 blockwise symmetric - bf16 - - kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0
i4 blockwise symmetric u4 blockwise symmetric f32 bf16 - Advanced SIMD kai_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon
i4 blockwise symmetric u4 blockwise symmetric f32 bf16 - Advanced SIMD kai_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon
i4 blockwise symmetric u4 blockwise symmetric f32 bf16 - Advanced SIMD kai_rhs_pack_kxn_qsi4c32ps1s0nrx4_qsu4c32s1s0_neon
i4 blockwise symmetric u4 blockwise symmetric f32 bf16 - Advanced SIMD kai_rhs_pack_nxk_qsi4c32ps1s0nrx4_qsu4c32s1s0_neon
i4 blockwise symmetric u4 blockwise symmetric - f16 - Advanced SIMD kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon
i4 blockwise symmetric u4 blockwise symmetric - f16 - - kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0
i4 channelwise symmetric i4 channelwise symmetric - f32 - - kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0
i4 channelwise symmetric i4 channelwise symmetric - f32 - - kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0
i4 channelwise symmetric u4 channelwise symmetric f32 f32 - Advanced SIMD kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon
i8 channelwise symmetric i8 channelwise symmetric f32 f32 - Advanced SIMD kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon
i8 channelwise symmetric i8 channelwise symmetric f32 f32 - Advanced SIMD kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon
i8 channelwise symmetric i8 channelwise symmetric i32 f32 - SME kai_rhs_imatmul_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme
i8 channelwise symmetric i8 channelwise symmetric i32 f32 - SME kai_rhs_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme
i8 blockwise per dimension symmetric f32 - - - - - kai_lhs_quant_pack_qsi8d32p_f32
i8 blockwise per dimension symmetric f32 - - - - Advanced SIMD kai_lhs_quant_pack_qsi8d32p_f32_neon
i8 blockwise per dimension symmetric f32 - - - - Advanced SIMD kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon
i8 blockwise per dimension symmetric f16 - - - - Advanced SIMD kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon
i8 blockwise per dimension symmetric f32 - - - - Advanced SIMD kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon
u2 channelwise symmetric u2 channelwise symmetric f32 f32 - Advanced SIMD kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon
x16 - x16 - - - - SME kai_lhs_pack_x16p2vlx2_x16_sme
x16 - x16 - - - - SME kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme
x16 - x16 - x16 - - SME kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme
x16 - x16 - x16 - - SME kai_rhs_pack_kxn_x16p2vlx2b_x16_x16_sme
x16 - x16 - x16 - - SME kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme
x16 - x16 - x16 - - Advanced SIMD kai_rhs_pack_kxn_x16p32x1b_x16_x16_neon
x32 - x32 - x32 - - Advanced SIMD kai_rhs_pack_kxn_x32p16x1b_x32_x32_neon
x32 - x32 - x32 - - SME kai_rhs_dwconv_pack_x32p1vlx1b_x32_x32_sme
x32 - x32 - - - - SME kai_matmul_pack_lhs_mxk_x32p4vsx1_x32_sme
x32 - x32 - x32 - - SME kai_matmul_pack_rhs_kxn_x32p4vsx1bx32_x32_x32_sme
x32 - x32 - x32 - - SME kai_matmul_pack_rhs_nxk_x32p4vsx1bx32_x32_x32_sme
x32 - x32 - - - - SME kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme
x32 - x32 - x32 - - SME kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme
x32 - x32 - x32 - - SVE kai_rhs_imatmul_pack_kxn_x32p4vlx1b_x32_x32_sve
x32 - x32 - x32 - - SVE kai_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve
x8 - x8 - - - - SME kai_lhs_pack_x8p2vlx4_x8_sme
x8 - x8 - - - - SME kai_lhs_imatmul_pack_x8p2vlx4_x8p_sme