diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d34e2fe..d3986e16 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,8 +143,22 @@ endif()
 # PowerPC
 if(ARCH_ID STREQUAL "ppc64" OR ARCH_ID STREQUAL "ppc64le")
-  if(ARCH STREQUAL "native")
-    add_flag("-mcpu=native")
+  list(APPEND randomx_sources
+    src/jit_compiler_ppc64_static.S
+    src/jit_compiler_ppc64.cpp)
+
+  set_property(SOURCE src/jit_compiler_ppc64_static.S PROPERTY LANGUAGE C)
+
+  if(ARCH STREQUAL "default")
+    if(ARCH_ID STREQUAL "ppc64le")
+      # Little-endian defaults to POWER8
+      add_flag("-mcpu=power8")
+    else()
+      # Big-endian defaults to POWER7
+      add_flag("-mcpu=power7")
+    endif()
+  else()
+    add_flag("-mcpu=${ARCH}")
   endif()
   # PowerPC AES requires ALTIVEC (POWER7+), so it cannot be enabled in the default build
 endif()
diff --git a/src/common.hpp b/src/common.hpp
index 579752d9..9b92d08a 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -138,6 +138,11 @@ namespace randomx {
 #define RANDOMX_COMPILER_RV64
 	class JitCompilerRV64;
 	using JitCompiler = JitCompilerRV64;
+#elif defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
 	#define RANDOMX_HAVE_COMPILER 1
 	#define RANDOMX_COMPILER_PPC64
 	class JitCompilerPPC64;
 	using JitCompiler = JitCompilerPPC64;
 #else
 	#define RANDOMX_HAVE_COMPILER 0
 	class JitCompilerFallback;
diff --git a/src/cpu.cpp b/src/cpu.cpp
index 3178d037..3faa0f45 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -47,6 +47,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include
 #endif
 
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+	#include <sys/auxv.h>
+	// From asm/cputable.h:
+	#ifndef PPC_FEATURE2_VEC_CRYPTO
+		#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
+	#endif
+	#ifndef PPC_FEATURE2_ARCH_3_00
+		#define PPC_FEATURE2_ARCH_3_00 0x00800000
+	#endif
+#endif
+
 #ifdef __riscv
 #include
 #include
@@ -120,8 +131,11 @@ namespace randomx {
 		sigaction(SIGILL, &old_action, nullptr);
 	}
+#elif defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+		unsigned long hwcaps2 = getauxval(AT_HWCAP2);
+		aes_ = (hwcaps2 & PPC_FEATURE2_VEC_CRYPTO) != 0;
+		v3p0_ = (hwcaps2 & PPC_FEATURE2_ARCH_3_00) != 0;
 #endif
-		//TODO POWER8 AES
 	}
 
 	const Cpu cpu;
diff --git a/src/cpu.hpp b/src/cpu.hpp
index 7db03311..0c5058d6 100644
--- a/src/cpu.hpp
+++ b/src/cpu.hpp
@@ -41,6 +41,9 @@ namespace randomx {
 		inline bool hasRVV() const { return rvv_; }
 		inline int getRVV_Length() const { return rvv_length; }
 #endif
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+		inline bool hasV3P0() const { return v3p0_; }
+#endif
 
 	private:
 		bool aes_ = false;
@@ -49,6 +52,9 @@ namespace randomx {
 #ifdef __riscv
 		bool rvv_ = false;
 		int rvv_length = 0;
+#endif
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+		bool v3p0_ = false;
 #endif
 	};
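For reference, the two HWCAP2 bits tested above can be probed from userspace directly. A minimal sketch (Linux-only; the `PPC_FEATURE2_*` fallback values mirror the hunk above), handy for checking what `Cpu::Cpu()` will see on a given machine:

```cpp
// Standalone probe for the HWCAP2 bits the new detection code reads.
// getauxval/AT_HWCAP2 are standard glibc APIs on Linux.
#include <cstdio>
#include <sys/auxv.h>

#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 // POWER8 vector crypto (AES)
#endif
#ifndef PPC_FEATURE2_ARCH_3_00
#define PPC_FEATURE2_ARCH_3_00 0x00800000 // ISA v3.0 (POWER9)
#endif

int main() {
    unsigned long hwcaps2 = getauxval(AT_HWCAP2);
    std::printf("vec_crypto: %s\n", (hwcaps2 & PPC_FEATURE2_VEC_CRYPTO) ? "yes" : "no");
    std::printf("arch_3_00:  %s\n", (hwcaps2 & PPC_FEATURE2_ARCH_3_00) ? "yes" : "no");
}
```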
diff --git a/src/intrin_portable.h b/src/intrin_portable.h
index 10530656..e1a06b12 100644
--- a/src/intrin_portable.h
+++ b/src/intrin_portable.h
@@ -277,11 +277,19 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
 }
 
 FORCE_INLINE rx_vec_f128 rx_cast_vec_i2f(rx_vec_i128 a) {
+#if defined(NATIVE_LITTLE_ENDIAN)
 	return (rx_vec_f128)a;
+#else
+	return (rx_vec_f128)vec_perm((__m128i)a, (__m128i)a, (__m128i){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11});
+#endif
 }
 
 FORCE_INLINE rx_vec_i128 rx_cast_vec_f2i(rx_vec_f128 a) {
+#if defined(NATIVE_LITTLE_ENDIAN)
 	return (rx_vec_i128)a;
+#else
+	return (rx_vec_i128)vec_perm((__m128i)a, (__m128i)a, (__m128i){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11});
+#endif
 }
 
 FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
diff --git a/src/jit_compiler.hpp b/src/jit_compiler.hpp
index 56c0655c..52fce1db 100644
--- a/src/jit_compiler.hpp
+++ b/src/jit_compiler.hpp
@@ -70,6 +70,8 @@ namespace randomx {
 #include "jit_compiler_a64.hpp"
 #elif defined(RANDOMX_COMPILER_RV64)
 #include "jit_compiler_rv64.hpp"
+#elif defined(RANDOMX_COMPILER_PPC64)
+#include "jit_compiler_ppc64.hpp"
 #else
 #include "jit_compiler_fallback.hpp"
 #endif
diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp
new file mode 100644
index 00000000..32b0ef9e
--- /dev/null
+++ b/src/jit_compiler_ppc64.cpp
@@ -0,0 +1,1622 @@
+/*
+Copyright (c) 2023 tevador
+Copyright (c) 2026, Forest Crossman
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <cstdint>
+#include <stdexcept>
+
+#include <unistd.h>
+
+#include "cpu.hpp"
+#include "program.hpp"
+#include "reciprocal.h"
+#include "superscalar.hpp"
+#include "virtual_memory.h"
+
+#include "jit_compiler_ppc64.hpp"
+
+namespace {
+#define HANDLER_ARGS randomx::JitCompilerPPC64* jit, randomx::CompilerState& state, randomx::Instruction isn, int i, randomx_flags flags
+	using InstructionHandler = void(HANDLER_ARGS);
+	extern InstructionHandler* opcodeMap1[256];
+}
+
+namespace PPC64 {
+
+	static inline uint32_t A_form(uint32_t po, uint32_t frt, uint32_t fra, uint32_t frb, uint32_t frc, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(frt <= 0x1F)) throw std::runtime_error("frt <= 0x1F");
+		if (!(fra <= 0x1F)) throw std::runtime_error("fra <= 0x1F");
+		if (!(frb <= 0x1F)) throw std::runtime_error("frb <= 0x1F");
+		if (!(frc <= 0x1F)) throw std::runtime_error("frc <= 0x1F");
+		if (!(xo <= 0x1F)) throw std::runtime_error("xo <= 0x1F");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (frt << 21) | (fra << 16) | (frb << 11) | (frc << 6) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t B_form(uint32_t po, uint32_t bo, uint32_t bi, uint32_t bd, uint32_t aa, uint32_t lk) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(bo <= 0x1F)) throw std::runtime_error("bo <= 0x1F");
+		if (!(bi <= 0x1F)) throw std::runtime_error("bi <= 0x1F");
+		if (!(bd <= 0x3FFF)) throw std::runtime_error("bd <= 0x3FFF");
+		if (!(aa <= 0x1)) throw std::runtime_error("aa <= 0x1");
+		if (!(lk <= 0x1)) throw std::runtime_error("lk <= 0x1");
+		return (po << 26) | (bo << 21) | (bi << 16) | (bd << 2) | (aa << 1) | lk;
+	}
+
+	static inline uint32_t D_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t d) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(d <= 0xFFFF)) throw std::runtime_error("d <= 0xFFFF");
+		return (po << 26) | (rt << 21) | (ra << 16) | d;
+	}
+
+	static inline uint32_t DQ_form(uint32_t po, uint32_t s, uint32_t ra, uint32_t dq, uint32_t sx, uint32_t xo) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(s <= 0x1F)) throw std::runtime_error("s <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(dq <= 0xFFF)) throw std::runtime_error("dq <= 0xFFF");
+		if (!(sx <= 0x1)) throw std::runtime_error("sx <= 0x1");
+		if (!(xo <= 0x7)) throw std::runtime_error("xo <= 0x7");
+		return (po << 26) | (s << 21) | (ra << 16) | (dq << 4) | (sx << 3) | xo;
+	}
+
+	static inline uint32_t DS_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t ds, uint32_t xo) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(ds <= 0x3FFF)) throw std::runtime_error("ds <= 0x3FFF");
+		if (!(xo <= 0x3)) throw std::runtime_error("xo <= 0x3");
+		return (po << 26) | (rt << 21) | (ra << 16) | (ds << 2) | xo;
+	}
+
+	static inline uint32_t I_form(uint32_t po, uint32_t li, uint32_t aa, uint32_t lk) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(li <= 0xFFFFFF)) throw std::runtime_error("li <= 0xFFFFFF");
+		if (!(aa <= 0x1)) throw std::runtime_error("aa <= 0x1");
+		if (!(lk <= 0x1)) throw std::runtime_error("lk <= 0x1");
+		return (po << 26) | (li << 2) | (aa << 1) | lk;
+	}
+
+	static inline uint32_t M_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t sh, uint32_t mb, uint32_t me, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(sh <= 0x1F)) throw std::runtime_error("sh <= 0x1F");
+		if (!(mb <= 0x1F)) throw std::runtime_error("mb <= 0x1F");
+		if (!(me <= 0x1F)) throw std::runtime_error("me <= 0x1F");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (rs << 21) | (ra << 16) | (sh << 11) | (mb << 6) | (me << 1) | rc;
+	}
+
+	static inline uint32_t MD_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t sh, uint32_t mb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(sh <= 0x3F)) throw std::runtime_error("sh <= 0x3F");
+		if (!(mb <= 0x3F)) throw std::runtime_error("mb <= 0x3F");
+		if (!(xo <= 0x7)) throw std::runtime_error("xo <= 0x7");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		uint32_t sh0_4 = sh & 0x1F;
+		uint32_t sh5 = (sh >> 5) & 0x1;
+		uint32_t mb0_4 = mb & 0x1F;
+		uint32_t mb5 = (mb >> 5) & 0x1;
+		return (po << 26) | (rs << 21) | (ra << 16) | (sh0_4 << 11) | (mb0_4 << 6) | (mb5 << 5) | (xo << 2) | (sh5 << 1) | rc;
+	}
+
+	static inline uint32_t MDS_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t rb, uint32_t mb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F");
+		if (!(mb <= 0x3F)) throw std::runtime_error("mb <= 0x3F");
+		if (!(xo <= 0xF)) throw std::runtime_error("xo <= 0xF");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		uint32_t mb0_4 = mb & 0x1F;
+		uint32_t mb5 = (mb >> 5) & 0x1;
+		return (po << 26) | (rs << 21) | (ra << 16) | (rb << 11) | (mb0_4 << 6) | (mb5 << 5) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t VA_form(uint32_t po, uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc, uint32_t xo) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(vrt <= 0x1F)) throw std::runtime_error("vrt <= 0x1F");
+		if (!(vra <= 0x1F)) throw std::runtime_error("vra <= 0x1F");
+		if (!(vrb <= 0x1F)) throw std::runtime_error("vrb <= 0x1F");
+		if (!(vrc <= 0x1F)) throw std::runtime_error("vrc <= 0x1F");
+		if (!(xo <= 0x3F)) throw std::runtime_error("xo <= 0x3F");
+		return (po << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | (vrc << 6) | xo;
+	}
+
+	static inline uint32_t VX_form(uint32_t po, uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t xo) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(vrt <= 0x1F)) throw std::runtime_error("vrt <= 0x1F");
+		if (!(vra <= 0x1F)) throw std::runtime_error("vra <= 0x1F");
+		if (!(vrb <= 0x1F)) throw std::runtime_error("vrb <= 0x1F");
+		if (!(xo <= 0x7FF)) throw std::runtime_error("xo <= 0x7FF");
+		return (po << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | xo;
+	}
+
+	static inline uint32_t X_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t rb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F");
+		if (!(xo <= 0x3FF)) throw std::runtime_error("xo <= 0x3FF");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (rt << 21) | (ra << 16) | (rb << 11) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t XFL_form(uint32_t po, uint32_t l, uint32_t flm, uint32_t w, uint32_t frb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(l <= 0x1)) throw std::runtime_error("l <= 0x1");
+		if (!(flm <= 0xFF)) throw std::runtime_error("flm <= 0xFF");
+		if (!(w <= 0x1)) throw std::runtime_error("w <= 0x1");
+		if (!(frb <= 0x1F)) throw std::runtime_error("frb <= 0x1F");
+		if (!(xo <= 0x3FF)) throw std::runtime_error("xo <= 0x3FF");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (l << 25) | (flm << 17) | (w << 16) | (frb << 11) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t XO_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t rb, uint32_t oe, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F");
+		if (!(oe <= 0x1)) throw std::runtime_error("oe <= 0x1");
+		if (!(xo <= 0x1FF)) throw std::runtime_error("xo <= 0x1FF");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (rt << 21) | (ra << 16) | (rb << 11) | (oe << 10) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t XX2_form(uint32_t po, uint32_t t, uint32_t a, uint32_t b, uint32_t xo, uint32_t bx, uint32_t tx) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(t <= 0x1F)) throw std::runtime_error("t <= 0x1F");
+		if (!(a <= 0x1F)) throw std::runtime_error("a <= 0x1F");
+		if (!(b <= 0x1F)) throw std::runtime_error("b <= 0x1F");
+		if (!(xo <= 0x1FF)) throw std::runtime_error("xo <= 0x1FF");
+		if (!(bx <= 0x1)) throw std::runtime_error("bx <= 0x1");
+		if (!(tx <= 0x1)) throw std::runtime_error("tx <= 0x1");
+		return (po << 26) | (t << 21) | (a << 16) | (b << 11) | (xo << 2) | (bx << 1) | tx;
+	}
+
+	static inline uint32_t XX3_form(uint32_t po, uint32_t t, uint32_t a, uint32_t b, uint32_t xo, uint32_t ax, uint32_t bx, uint32_t tx) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(t <= 0x1F)) throw std::runtime_error("t <= 0x1F");
+		if (!(a <= 0x1F)) throw std::runtime_error("a <= 0x1F");
+		if (!(b <= 0x1F)) throw std::runtime_error("b <= 0x1F");
+		if (!(xo <= 0xFF)) throw std::runtime_error("xo <= 0xFF");
+		if (!(ax <= 0x1)) throw std::runtime_error("ax <= 0x1");
+		if (!(bx <= 0x1)) throw std::runtime_error("bx <= 0x1");
+		if (!(tx <= 0x1)) throw std::runtime_error("tx <= 0x1");
+		return (po << 26) | (t << 21) | (a << 16) | (b << 11) | (xo << 3) | (ax << 2) | (bx << 1) | tx;
+	}
+
+	static inline uint32_t b(int32_t offset) {
+		if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned");
+		if (offset < -(1 << 25) || offset >= (1 << 25)) throw std::runtime_error("offset out of range");
+		return I_form(18, (offset >> 2) & 0xFFFFFF, 0, 0);
+	}
+
+	static inline uint32_t bl(int32_t offset) {
+		if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned");
+		if (offset < -(1 << 25) || offset >= (1 << 25)) throw std::runtime_error("offset out of range");
+		return I_form(18, (offset >> 2) & 0xFFFFFF, 0, 1);
+	}
+
+	static inline uint32_t bc(uint32_t bo, uint32_t bi, int32_t offset) {
+		if (!(bo <= 0x1F)) throw std::runtime_error("bo <= 0x1F");
+		if (!(bi <= 0x1F)) throw std::runtime_error("bi <= 0x1F");
+		if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned");
+		if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range");
+		return B_form(16, bo, bi, (offset >> 2) & 0x3FFF, 0, 0);
+	}
+
+	static inline uint32_t beq(int32_t offset) { return bc(0x0C, 2, offset); }
+	static inline uint32_t beq_predict_not_taken(int32_t offset) { return bc(0x0E, 2, offset); }
+	static inline uint32_t bne(int32_t offset) { return bc(0x04, 2, offset); }
+	static inline uint32_t bne_predict_taken(int32_t offset) { return bc(0x07, 2, offset); }
+
+	static inline uint32_t cmpi(uint32_t bf, uint32_t l, uint32_t ra, int32_t si) {
+		if (!(bf <= 0x7)) throw std::runtime_error("bf <= 0x7");
+		if (!(l <= 0x1)) throw std::runtime_error("l <= 0x1");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (si < -(1 << 15) || si >= (1 << 15)) throw std::runtime_error("si out of range");
+		return D_form(11, (bf << 2) | l, ra, (uint32_t)si & 0xFFFF); // mask to 16 bits; D_form rejects sign-extended immediates
+	}
+
+	static inline uint32_t addi(uint32_t rt, uint32_t ra, uint32_t si) { return D_form(14, rt, ra, si); }
+	static inline uint32_t addis(uint32_t rt, uint32_t ra, uint32_t si) { return D_form(15, rt, ra, si); }
+	static inline uint32_t ori(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(24, rs, ra, ui); }
+	static inline uint32_t oris(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(25, rs, ra, ui); }
+	static inline uint32_t xori(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(26, rs, ra, ui); }
+	static inline uint32_t xoris(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(27, rs, ra, ui); }
+	static inline uint32_t andi_dot(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(28, rs, ra, ui); }
+
+	static inline uint32_t add(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 266, 0); }
+	static inline uint32_t subf(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 40, 0); }
+	static inline uint32_t neg(uint32_t rt, uint32_t ra) { return XO_form(31, rt, ra, 0, 0, 104, 0); }
+	static inline uint32_t and_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 28, 0); }
+	static inline uint32_t and_dot(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 28, 1); }
+	static inline uint32_t xor_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 316, 0); }
+	static inline uint32_t or_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 444, 0); }
+
+	static inline uint32_t mulld(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 233, 0); }
+	static inline uint32_t mulhdu(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 9, 0); }
+	static inline uint32_t mulhd(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 73, 0); }
+
+	static inline uint32_t rlwinm(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb, uint32_t me) { return M_form(21, rs, ra, sh, mb, me, 0); }
+	static inline uint32_t rldicl(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 0, 0); }
+	static inline uint32_t rldicl_dot(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 0, 1); }
+	static inline uint32_t rldicr(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t me) { return MD_form(30, rs, ra, sh, me, 1, 0); }
+	static inline uint32_t rldic(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 2, 0); }
+	static inline uint32_t rldcl(uint32_t ra, uint32_t rs, uint32_t rb, uint32_t mb) { return MDS_form(30, rs, ra, rb, mb, 8, 0); }
+
+	static inline uint32_t cmpdi(uint32_t rx, int32_t si) { return cmpi(0, 1, rx, si); }
+
+	static inline uint32_t li(uint32_t rx, int32_t si) { return addi(rx, 0, (uint32_t)si & 0xFFFF); } // mask so li(reg, -1) passes D_form's range check
+	static inline uint32_t lis(uint32_t rx, int32_t si) { return addis(rx, 0, (uint32_t)si & 0xFFFF); }
+	static inline uint32_t mr(uint32_t rx, uint32_t ry) { return or_(rx, ry, ry); }
+	static inline uint32_t rotldi(uint32_t ra, uint32_t rs, uint32_t n) { return rldicl(ra, rs, n, 0); }
+	static inline uint32_t rotrdi(uint32_t ra, uint32_t rs, uint32_t n) { return rldicl(ra, rs, 64-n, 0); }
+	static inline uint32_t sldi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicr(rx, ry, n, 63-n); }
+	static inline uint32_t srdi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicl(rx, ry, 64-n, n); }
+
+	static inline uint32_t ld(uint32_t rt, int32_t offset, uint32_t ra) {
+		if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned");
+		if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range");
+		return DS_form(58, rt, ra, (offset >> 2) & 0x3FFF, 0);
+	}
+
+	static inline uint32_t ldx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 21, 0); }
+	static inline uint32_t ldbrx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 532, 0); }
+	static inline uint32_t stdx(uint32_t rs, uint32_t ra, uint32_t rb) { return X_form(31, rs, ra, rb, 149, 0); }
+	static inline uint32_t stdbrx(uint32_t rs, uint32_t ra, uint32_t rb) { return X_form(31, rs, ra, rb, 660, 0); }
+
+	static inline uint32_t lfd(uint32_t frt, uint32_t ra, uint32_t d) { return D_form(50, frt, ra, d); }
+	static inline uint32_t lfdx(uint32_t frt, uint32_t ra, uint32_t rb) { return X_form(31, frt, ra, rb, 599, 0); }
+	static inline uint32_t mtfsf(uint32_t flm, uint32_t frb, uint32_t l, uint32_t w) { return XFL_form(63, l, flm, w, frb, 711, 0); }
+	static inline uint32_t mffscrn(uint32_t frt, uint32_t frb) { return X_form(63, frt, 22, frb, 583, 0); } // Only v3.0B and later
+
+	static inline uint32_t lxsdx(uint32_t xt, uint32_t ra, uint32_t rb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		return X_form(31, t, ra, rb, 588, tx);
+	}
+
+	static inline uint32_t lxvd2x(uint32_t xt, uint32_t ra, uint32_t rb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		return X_form(31, t, ra, rb, 844, tx);
+	}
+
+	static inline uint32_t stvx(uint32_t vrs, uint32_t ra, uint32_t rb) { return X_form(31, vrs, ra, rb, 231, 0); }
+
+	static inline uint32_t stxv(uint32_t xs, int32_t offset, uint32_t ra) { // Only v3.0B and later
+		if (!(xs <= 0x3F)) throw std::runtime_error("xs <= 0x3F");
+		if (offset & 0xF) throw std::runtime_error("offset must be 16-byte aligned");
+		if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range");
+		uint32_t s = xs & 0x1F;
+		uint32_t sx = xs >> 5;
+		return DQ_form(61, s, ra, (offset >> 4) & 0xFFF, sx, 5);
+	}
+
+	static inline uint32_t vperm(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 43); }
+	static inline uint32_t vsel(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 42); }
+
+	static inline uint32_t vand(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1028); }
+	static inline uint32_t vor(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1156); }
+	static inline uint32_t vxor(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1220); }
+
+	static inline uint32_t xxmrghw(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 18, ax, bx, tx);
+	}
+
+	static inline uint32_t xvadddp(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 96, ax, bx, tx);
+	}
+
+	static inline uint32_t xvsubdp(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 104, ax, bx, tx);
+	}
+
+	static inline uint32_t xvmuldp(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 112, ax, bx, tx);
+	}
+
+	static inline uint32_t xvdivdp(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 120, ax, bx, tx);
+	}
+
+	static inline uint32_t xvsqrtdp(uint32_t xt, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX2_form(60, t, 0, b, 203, bx, tx);
+	}
+
+	static inline uint32_t xvcvsxwdp(uint32_t xt, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX2_form(60, t, 0, b, 248, bx, tx);
+	}
+
+	static inline uint32_t xxpermdi(uint32_t xt, uint32_t xa, uint32_t xb, uint32_t dm) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, (dm << 5) | 10, ax, bx, tx);
+	}
+
+	static inline uint32_t xxswapd(uint32_t xt, uint32_t xa) { return xxpermdi(xt, xa, xa, 2); }
+
+	static inline uint32_t xxland(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 130, ax, bx, tx);
+	}
+
+	static inline uint32_t xxlor(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 146, ax, bx, tx);
+	}
+
+	static inline uint32_t xxlxor(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 154, ax, bx, tx);
+	}
+
+}
+
+namespace randomx {
+
+	static const uint8_t* codeConstants = (uint8_t*)&randomx_ppc64_constants;
+	static const uint8_t* codeConstantLutFprcToFpscr = (uint8_t*)&randomx_ppc64_constant_lut_fprc_to_fpscr;
+	static const uint8_t* codeConstantsEnd = (uint8_t*)&randomx_ppc64_constants_end;
+
+	static const uint8_t* codeDatasetInit = (uint8_t*)&randomx_ppc64_dataset_init;
+	static const uint8_t* codeDatasetInitFixCall = (uint8_t*)&randomx_ppc64_dataset_init_fix_call;
+	static const uint8_t* codeDatasetInitEnd = (uint8_t*)&randomx_ppc64_dataset_init_end;
+
+	static const uint8_t* codeSshashSingleItemPrologue = (uint8_t*)&randomx_ppc64_sshash_single_item_prologue;
+	static const uint8_t* codeSshashSingleItemPrologueEnd = (uint8_t*)&randomx_ppc64_sshash_single_item_prologue_end;
+	static const uint8_t* codeSshashSingleItemEpilogue = (uint8_t*)&randomx_ppc64_sshash_single_item_epilogue;
+	static const uint8_t* codeSshashSingleItemEpilogueEnd = (uint8_t*)&randomx_ppc64_sshash_single_item_epilogue_end;
+	static const uint8_t* codeSshashCachePrefetch = (uint8_t*)&randomx_ppc64_sshash_cache_prefetch;
+	static const uint8_t* codeSshashCachePrefetchEnd = (uint8_t*)&randomx_ppc64_sshash_cache_prefetch_end;
+	static const uint8_t* codeSshashXor = (uint8_t*)&randomx_ppc64_sshash_xor;
+	static const uint8_t* codeSshashXorEnd = (uint8_t*)&randomx_ppc64_sshash_xor_end;
+
+	static const uint8_t* codeVmPrologue = (uint8_t*)&randomx_ppc64_vm_prologue;
+	static const uint8_t* codeVmPrologueEnd = (uint8_t*)&randomx_ppc64_vm_prologue_end;
+	static const uint8_t* codeVmEpilogue = (uint8_t*)&randomx_ppc64_vm_epilogue;
+	static const uint8_t* codeVmFixLoop = (uint8_t*)&randomx_ppc64_vm_fix_loop;
+	static const uint8_t* codeVmEpilogueEnd = (uint8_t*)&randomx_ppc64_vm_epilogue_end;
+	static const uint8_t* codeVmLoopPrologue = (uint8_t*)&randomx_ppc64_vm_loop_prologue;
+	static const uint8_t* codeVmLoopPrologueEnd = (uint8_t*)&randomx_ppc64_vm_loop_prologue_end;
+	static const uint8_t* codeVmDataRead = (uint8_t*)&randomx_ppc64_vm_data_read;
+	static const uint8_t* codeVmDataReadEnd = (uint8_t*)&randomx_ppc64_vm_data_read_end;
+	static const uint8_t* codeVmDataReadLight = (uint8_t*)&randomx_ppc64_vm_data_read_light;
+	static const uint8_t* codeVmDataReadLightFixCall = (uint8_t*)&randomx_ppc64_vm_data_read_light_fix_call;
+	static const uint8_t* codeVmDataReadLightEnd = (uint8_t*)&randomx_ppc64_vm_data_read_light_end;
+	static const uint8_t* codeVmSpadStoreGroupR = (uint8_t*)&randomx_ppc64_vm_spad_store_group_r;
+	static const uint8_t* codeVmSpadStoreGroupREnd = (uint8_t*)&randomx_ppc64_vm_spad_store_group_r_end;
+	static const uint8_t* codeVmSpadStoreMixV1 = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1;
+	static const uint8_t* codeVmSpadStoreMixV1End = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1_end;
+	static const uint8_t* codeVmSpadStoreMixV2HardAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes;
+	static const uint8_t* codeVmSpadStoreMixV2HardAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end;
+	static const uint8_t* codeVmSpadStoreMixV2SoftAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_soft_aes;
+	static const uint8_t* codeVmSpadStoreMixV2SoftAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end;
+
+	static const int32_t sizeConstants = codeConstantsEnd - codeConstants;
+
+	static const int32_t sizeDatasetInit = codeDatasetInitEnd - codeDatasetInit;
+
+	static const int32_t sizeSshashSingleItemPrologue = codeSshashSingleItemPrologueEnd - codeSshashSingleItemPrologue;
+	static const int32_t sizeSshashSingleItemEpilogue = codeSshashSingleItemEpilogueEnd - codeSshashSingleItemEpilogue;
+	static const int32_t sizeSshashCachePrefetch = codeSshashCachePrefetchEnd - codeSshashCachePrefetch;
+	static const int32_t sizeSshashXor = codeSshashXorEnd - codeSshashXor;
+
+	static const int32_t sizeVmPrologue = codeVmPrologueEnd - codeVmPrologue;
+	static const int32_t sizeVmEpilogue = codeVmEpilogueEnd - codeVmEpilogue;
+	static const int32_t sizeVmLoopPrologue = codeVmLoopPrologueEnd - codeVmLoopPrologue;
+	static const int32_t sizeVmDataRead = codeVmDataReadEnd - codeVmDataRead;
+	static const int32_t sizeVmDataReadLight = codeVmDataReadLightEnd - codeVmDataReadLight;
+	static const int32_t sizeVmSpadStoreGroupR = codeVmSpadStoreGroupREnd - codeVmSpadStoreGroupR;
+	static const int32_t sizeVmSpadStoreMixV1 = codeVmSpadStoreMixV1End - codeVmSpadStoreMixV1;
+	static const int32_t sizeVmSpadStoreMixV2HardAes = codeVmSpadStoreMixV2HardAesEnd - codeVmSpadStoreMixV2HardAes;
+	static const int32_t sizeVmSpadStoreMixV2SoftAes = codeVmSpadStoreMixV2SoftAesEnd - codeVmSpadStoreMixV2SoftAes;
+	constexpr size_t sizeVmSpadStoreGroupF = 4*12; // Worst case size is 12 instructions
+
+	static const int32_t offsetConstantLutFprcToFpscr = codeConstantLutFprcToFpscr - codeConstants;
+
+	static const int32_t offsetDatasetInitFixCall = codeDatasetInitFixCall - codeDatasetInit;
+
+	static const int32_t offsetVmFixLoop = codeVmFixLoop - codeVmEpilogue;
+	static const int32_t offsetVmDataReadLightFixCall = codeVmDataReadLightFixCall - codeVmDataReadLight;
+
+	constexpr size_t CodeAlign = 64*1024; // 64 kB, to ensure alignment on systems with a page size <= 64 kB
+	constexpr size_t ReciprocalPoolSize = 8 * RANDOMX_PROGRAM_MAX_SIZE; // RANDOMX_PROGRAM_MAX_SIZE 64-bit reciprocals
+	static const size_t ReciprocalPoolPos = sizeConstants + 16; // Add 16 bytes for the Group E OR vector mask
+	static const size_t ConstantPoolSize = alignSize(sizeConstants + 16 + ReciprocalPoolSize, CodeAlign); // Add 16 bytes for the Group E OR vector mask
+	static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStoreGroupR + sizeVmSpadStoreMixV2SoftAes + sizeVmSpadStoreGroupF, CodeAlign);
+	constexpr size_t MaxRandomXInstrCodeSize = 4*9; // FDIV_M and CFROUND require at most 9 instructions
+	constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 instructions
+	static const size_t SuperscalarProgramHeaders = sizeSshashSingleItemPrologue + sizeSshashSingleItemEpilogue;
+
+	static const size_t RandomXCodeSize = alignSize(ConstantPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_MAX_SIZE, CodeAlign);
+	static const size_t SuperscalarSize = alignSize(sizeDatasetInit + SuperscalarProgramHeaders + (sizeSshashCachePrefetch + sizeSshashXor + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign);
+
+	static const uint32_t CodeSize = RandomXCodeSize + SuperscalarSize;
+
+	constexpr uint32_t ConstantsBaseAddressRegisterGPR2 = 2;
+	constexpr uint32_t ConstantVectorByteReverseMaskVR15 = 15;
+	constexpr uint32_t ConstantVectorByteReverseMaskVSR47 = 32 + ConstantVectorByteReverseMaskVR15;
+	constexpr uint32_t ConstantVectorBePermutationMaskVR16 = 16;
+	constexpr uint32_t ConstantVectorBePermutationMaskVSR48 = 32 + ConstantVectorBePermutationMaskVR16;
+	constexpr uint32_t ConstantVectorGroupEAndMaskVR17 = 17;
+	constexpr uint32_t ConstantVectorGroupEAndMaskVSR49 = 32 + ConstantVectorGroupEAndMaskVR17;
+	constexpr uint32_t ConstantVectorFscalXorMaskVR18 = 18;
+	constexpr uint32_t ConstantVectorFscalXorMaskVSR50 = 32 + ConstantVectorFscalXorMaskVR18;
+	constexpr uint32_t ConstantVectorGroupEOrMaskVR19 = 19;
+	constexpr uint32_t ConstantVectorGroupEOrMaskVSR51 = 32 + ConstantVectorGroupEOrMaskVR19;
+
+	constexpr uint32_t MaGPR24 = 24;
+	constexpr uint32_t MxGPR25 = 25;
+	constexpr uint32_t SpAddr0GPR26 = 26;
+	constexpr uint32_t SpAddr1GPR27 = 27;
+	constexpr uint32_t ScratchpadPointerGPR30 = 30;
+
+	template <size_t N>
+	struct GprMap {
+		uint32_t regs[N];
+		uint32_t getPpcGprNum(uint8_t idx) const {
+			return regs[idx % N];
+		}
+	};
+
+	template <size_t N>
+	struct VsrMap {
+		uint32_t regs[N];
+		uint32_t getPpcVrNum(uint8_t idx) const {
+			return regs[idx % N];
+		}
+		uint32_t getPpcVsrNum(uint8_t idx) const {
+			return regs[idx % N] + 32;
+		}
+	};
+
+	static const GprMap<8> RegisterMapR = {{ 14, 15, 16, 17, 18, 19, 20, 21 }};
+	static const VsrMap<4> RegisterMapF = {{ 0, 1, 2, 3 }};
+	static const VsrMap<4> RegisterMapE = {{ 4, 5, 6, 7 }};
+	static const VsrMap<4> RegisterMapA = {{ 8, 9, 10, 11 }};
+	static const VsrMap<8> RegisterMapFE = {{ 0, 1, 2, 3, 4, 5, 6, 7 }};
+
+	static const GprMap<8> RegisterMapSsh = {{ 4, 6, 7, 9, 10, 11, 12, 22 }};
+
+	template <typename T> static constexpr size_t Log2(T value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; }
+
+	constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
+		return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
+	}
+
+	static void syncInstructionCache(void* start_ptr, void* end_ptr) {
+		// Apparently GCC compiles __builtin___clear_cache to nothing, so we use LLVM's implementation instead.
+		//
+		// This code has been modified from compiler-rt/lib/builtins/clear_cache.c, found at
+		// https://github.com/llvm/llvm-project revision 7459e10f34aa86952b1620d0cb48b40be112ebe9.
+		//
+		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+		// See https://llvm.org/LICENSE.txt for license information.
+		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+		char* start = (char*)start_ptr;
+		char* end = (char*)end_ptr;
+		const size_t len = (uintptr_t)end - (uintptr_t)start;
+		if (len == 0) return;
+
+		// Query data and instruction cache line sizes
+		long dcache_val = 0;
+		long icache_val = 0;
+
+#ifdef _SC_LEVEL1_DCACHE_LINESIZE
+		dcache_val = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+#endif
+#ifdef _SC_LEVEL1_ICACHE_LINESIZE
+		icache_val = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+#endif
+
+		// Default to 32 bytes if querying the line size fails
+		const size_t d_line_size = (dcache_val > 0) ? dcache_val : 32;
+		const size_t i_line_size = (icache_val > 0) ? icache_val : 32;
+
+		// Flush Data Cache
+		const uintptr_t d_mask = ~(d_line_size - 1);
+		const uintptr_t d_start_line = ((uintptr_t)start) & d_mask;
+		const uintptr_t d_end_line = ((uintptr_t)start + len + d_line_size - 1) & d_mask;
+
+		for (uintptr_t line = d_start_line; line < d_end_line; line += d_line_size)
+			__asm__ volatile("dcbst 0, %0" : : "r"(line));
+
+		// Wait for memory writes to complete
+		__asm__ volatile("sync");
+
+		// Invalidate Instruction Cache
+		const uintptr_t i_mask = ~(i_line_size - 1);
+		const uintptr_t i_start_line = ((uintptr_t)start) & i_mask;
+		const uintptr_t i_end_line = ((uintptr_t)start + len + i_line_size - 1) & i_mask;
+
+		for (uintptr_t line = i_start_line; line < i_end_line; line += i_line_size)
+			__asm__ volatile("icbi 0, %0" : : "r"(line));
+
+		// Flush the local instruction pipeline
+		__asm__ volatile("isync");
+	}
+
+	static void emitLoadGpr64(CompilerState& state, uint32_t rt, uint32_t ra, uint32_t rb) {
+		if (PPC_BIG_ENDIAN) {
+			state.emit(PPC64::ldbrx(rt, ra, rb));
+		} else {
+			state.emit(PPC64::ldx(rt, ra, rb));
+		}
+	}
+
+	static void emitStoreGpr64(CompilerState& state, uint32_t rs, uint32_t ra, uint32_t rb) {
+		if (PPC_BIG_ENDIAN) {
+			state.emit(PPC64::stdbrx(rs, ra, rb));
+		} else {
+			state.emit(PPC64::stdx(rs, ra, rb));
+		}
+	}
+
+	static void emitLoadVr64(CompilerState& state, uint32_t vrt, uint32_t ra, uint32_t rb) {
+		// We need to load the two packed little-endian signed 32-bit integers into a VSR, then we need to
+		// shuffle them so they're in the correct halves of the VSR register and in the correct byte order,
+		// and then we need to convert the signed 32-bit ints to doubles.
+		uint32_t xt = 32 + vrt;
+		state.emit(PPC64::lxsdx(xt, ra, rb));
+		if (PPC_BIG_ENDIAN) {
+			// Register XT contains the value as [ 0123 4567 zzzz zzzz ]
+			state.emit(PPC64::vperm(vrt, vrt, vrt, ConstantVectorBePermutationMaskVR16)); // Shuffles values in XT to be [ 7654 7654 3210 3210 ]
+		} else {
+			// Register XT contains the value as [ 7654 3210 zzzz zzzz ]
+			state.emit(PPC64::xxmrghw(xt, xt, xt)); // Shuffles values in XT to be [ 7654 7654 3210 3210 ]
+		}
+		state.emit(PPC64::xvcvsxwdp(xt, xt)); // Needs values in XT as [ 7654 zzzz 3210 zzzz ]
+	}
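Since RandomX memory semantics are little-endian, the helpers above pick the byte-reversing forms (`ldbrx`/`stdbrx`) on big-endian hosts so the generated code still observes the scratchpad in little-endian layout. A host-side C++ sketch of the same idea (hypothetical helper, not part of the patch):

```cpp
// What emitLoadGpr64 amounts to: a plain 64-bit load on little-endian
// hosts (ldx), a byte-reversed load on big-endian hosts (ldbrx), so both
// read the same little-endian scratchpad value.
#include <cstdint>
#include <cstring>

static uint64_t loadScratchpad64(const void* p) {
    uint64_t v;
    std::memcpy(&v, p, sizeof v);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    v = __builtin_bswap64(v); // the ldbrx path
#endif
    return v;
}
```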
+
+	static void emitMovImm32(CompilerState& state, int reg, uint32_t imm) {
+		// Move signed 32-bit immediate into 64-bit register.
+		// Note that `imm` is a `uint32_t` and not an `int32_t` for type compatibility--it has no effect on
+		// functionality because `lis` will automatically sign-extend the 16-bit value.
+		int32_t simm = (int32_t)imm;
+		if (simm >= -32768 && simm <= 32767) {
+			state.emit(PPC64::li(reg, simm & 0xFFFF));
+		} else {
+			uint16_t upper = (imm >> 16) & 0xFFFF;
+			uint16_t lower = (imm >> 0) & 0xFFFF;
+
+			state.emit(PPC64::lis(reg, upper));
+			if (lower)
+				state.emit(PPC64::ori(reg, reg, lower));
+		}
+	}
+
+	static void emitAddImm32(CompilerState& state, uint32_t tmpReg, int dstReg, int srcReg, uint32_t imm) {
+		int32_t simm = (int32_t)imm;
+		if (simm >= -32768 && simm <= 32767) {
+			state.emit(PPC64::addi(dstReg, srcReg, simm & 0xFFFF));
+		} else if ((imm & 0xFFFF) == 0) {
+			state.emit(PPC64::addis(dstReg, srcReg, (imm >> 16) & 0xFFFF));
+		} else {
+			// Notes on optimization:
+			//
+			// 1. Performing an `addis` -> `addi` is not a complete replacement for `lis` -> `ori` -> `add`, as constants in the
+			//    range 0x7FFF8000 to 0x7FFFFFFF cannot be handled by `addis` -> `addi`. So to be able to handle all constants,
+			//    `lis` -> `ori` -> `add` must always be available as a fallback.
+			// 2. In the context of RandomX, `addis` -> `addi` is almost always slower than `lis` -> `ori` -> `add`. The reason
+			//    for this is subtle--with `addis` -> `addi`, execution blocks at the `addis` as the CPU waits for the source
+			//    register to become ready, and `addi` can't be executed because it depends on the result of `addis`. In
+			//    contrast, `lis` -> `ori` to a temporary register can almost always be executed while the CPU waits for the
+			//    source register to become ready, and so execution will usually only block on the single `add` instruction. So
+			//    despite significantly reducing the total number of instructions executed, using `addis` -> `addi` instead of
+			//    `lis` -> `ori` -> `add` results in a significant reduction in IPC (-5%) and a small overall reduction in
+			//    performance (-0.5%).
+			emitMovImm32(state, tmpReg, imm);
+			state.emit(PPC64::add(dstReg, srcReg, tmpReg));
+		}
+	}
+
+	static void emitMovImm64(CompilerState& state, int reg, uint64_t imm) {
+		if (imm == (uint64_t)(int64_t)(int32_t)imm) {
+			// Values that can be represented by loading a 32-bit signed immediate
+			emitMovImm32(state, reg, (uint32_t)imm);
+		} else {
+			uint64_t lowestBit = imm & -(int64_t)imm;
+			uint64_t added = imm + lowestBit;
+			if (imm != 0 && imm != ~0ULL && (added & (added - 1)) == 0) {
+				// Values that are a contiguous sequence of 1s
+				uint32_t mb = added == 0 ? 0 : __builtin_clzll(added) + 1; // +1: clz(added) is the IBM bit one above the run's top
+				uint32_t me = 63 - __builtin_ctzll(lowestBit);
+				state.emit(PPC64::li(reg, -1));
+				if (mb == 0) {
+					state.emit(PPC64::rldicr(reg, reg, 0, me));
+				} else if (me == 63) {
+					state.emit(PPC64::rldicl(reg, reg, 0, mb));
+				} else {
+					state.emit(PPC64::rldic(reg, reg, 63 - me, mb));
+				}
+				return;
+			}
+
+			// Values that can be generated by loading a <=32-bit immediate and rotating it
+			for (int i = 1; i < 64; ++i) {
+				uint64_t rot = (imm << i) | (imm >> (64 - i));
+				if (rot == (uint64_t)(int64_t)(int32_t)rot) {
+					emitMovImm32(state, reg, (uint32_t)rot);
+					state.emit(PPC64::rotldi(reg, reg, 64 - i));
+					return;
+				}
+			}
+
+			// All other values
+			uint32_t high = imm >> 32;
+			uint32_t low = imm & 0xFFFFFFFF;
+
+			if (high) {
+				emitMovImm32(state, reg, high);
+				state.emit(PPC64::sldi(reg, reg, 32));
+			} else {
+				state.emit(PPC64::li(reg, 0));
+			}
+
+			uint16_t lower = (low >> 16) & 0xFFFF;
+			uint16_t lowest = low & 0xFFFF;
+
+			if (lower)
+				state.emit(PPC64::oris(reg, reg, lower));
+
+			if (lowest)
+				state.emit(PPC64::ori(reg, reg, lowest));
+		}
+	}
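The run-of-ones path deserves a worked example: for imm = 0x0000FFFFFFFF0000, lowestBit = 2^16 and added = 2^48, giving mb = clz(added) + 1 = 16 and me = 63 - 16 = 47 in the IBM bit numbering used by `rldic`, so the constant is built in just two instructions (`li reg, -1` plus one rotate-and-mask). A host-side sketch of the same mask arithmetic, useful for unit-testing the derivation:

```cpp
// Recompute the (mb, me) pair chosen above and rebuild the constant the
// way li -1 followed by rldic/rldicl/rldicr would. Pure host-side C++.
#include <cassert>
#include <cstdint>

static uint64_t rebuildContiguousOnes(uint64_t imm) {
    uint64_t lowestBit = imm & (0 - imm);
    uint64_t added = imm + lowestBit;
    assert(imm != 0 && imm != ~0ULL && (added & (added - 1)) == 0);
    unsigned mb = added == 0 ? 0 : __builtin_clzll(added) + 1; // IBM bit of the run's top
    unsigned me = 63 - __builtin_ctzll(lowestBit);             // IBM bit of the run's bottom
    // MASK(mb, me): ones from IBM bit mb through me (bit 0 = MSB);
    // rotating an all-ones register is a no-op, so the mask is the constant.
    return (~0ULL >> mb) & (~0ULL << (63 - me));
}
// rebuildContiguousOnes(0x0000FFFFFFFF0000ULL) == 0x0000FFFFFFFF0000ULL
```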
+
+	static void emitLoadGprFromScratchpad(CompilerState& state, uint32_t tmp_gpr, uint32_t dst, uint32_t src, Instruction& instr) {
+		uint32_t imm = instr.getImm32();
+
+		if (src != dst) {
+			uint32_t size = instr.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2;
+			imm &= size - 1;
+			emitAddImm32(state, tmp_gpr, tmp_gpr, src, imm);
+
+			uint32_t mb = 32 - Log2(size);
+			state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28));
+		} else {
+			imm &= ScratchpadL3Mask;
+			emitMovImm64(state, tmp_gpr, imm);
+		}
+
+		emitLoadGpr64(state, tmp_gpr, ScratchpadPointerGPR30, tmp_gpr);
+	}
+
+	static void emitLoadVsrFromScratchpad(CompilerState& state, uint32_t tmp_gpr, uint32_t tmp_vr, Instruction& instr) {
+		int src = RegisterMapR.getPpcGprNum(instr.src);
+
+		uint32_t imm = instr.getImm32();
+		uint32_t size = instr.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2;
+		imm &= size - 1;
+		emitAddImm32(state, tmp_gpr, tmp_gpr, src, imm);
+
+		uint32_t mb = 32 - Log2(size);
+		state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28));
+
+		emitLoadVr64(state, tmp_vr, ScratchpadPointerGPR30, tmp_gpr);
+	}
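The single `rlwinm` in these two loaders implements the usual RandomX scratchpad mask in one instruction: with mb = 32 - log2(size) and me = 28 it is exactly `(addr & (size - 1)) & ~7`, i.e. an 8-byte-aligned offset into L1 or L2. A quick host-side check of that equivalence:

```cpp
// Verify that the rlwinm mask (sh = 0, mb = 32 - log2(size), me = 28)
// equals (size - 1) & ~7 for a scratchpad level size.
#include <cassert>
#include <cstdint>

static uint32_t rlwinmMask(unsigned mb, unsigned me) { // IBM32 numbering, bit 0 = MSB
    uint32_t m = 0;
    for (unsigned i = mb; i <= me; ++i)
        m |= 1u << (31 - i);
    return m;
}
// e.g. a 16 KiB L1: rlwinmMask(32 - 14, 28) == ((16384u - 1) & ~7u)
```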
+
+	static void emitVmSpadStoreGroupF(CompilerState& state) {
+		// Store F registers to scratchpad at spAddr0
+		if (randomx::cpu.hasV3P0()) {
+			if (PPC_BIG_ENDIAN) {
+				state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stxv(32 + 12, 16 * 0, SpAddr0GPR26));
+				state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stxv(32 + 13, 16 * 1, SpAddr0GPR26));
+				state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stxv(32 + 14, 16 * 2, SpAddr0GPR26));
+				state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stxv(32 + 12, 16 * 3, SpAddr0GPR26));
+			} else {
+				state.emit(PPC64::stxv(32 + 0, 16 * 0, SpAddr0GPR26));
+				state.emit(PPC64::stxv(32 + 1, 16 * 1, SpAddr0GPR26));
+				state.emit(PPC64::stxv(32 + 2, 16 * 2, SpAddr0GPR26));
+				state.emit(PPC64::stxv(32 + 3, 16 * 3, SpAddr0GPR26));
+			}
+		} else {
+			if (PPC_BIG_ENDIAN) {
+				state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stvx(12, 0, SpAddr0GPR26)); // RA=0 for zero offset
+				state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::li(9, 16 * 1));
+				state.emit(PPC64::stvx(13, SpAddr0GPR26, 9));
+				state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::li(10, 16 * 2));
+				state.emit(PPC64::stvx(14, SpAddr0GPR26, 10));
+				state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::li(11, 16 * 3));
+				state.emit(PPC64::stvx(12, SpAddr0GPR26, 11));
+			} else {
+				state.emit(PPC64::stvx(0, 0, SpAddr0GPR26)); // RA=0 for zero offset
+				state.emit(PPC64::li(9, 16 * 1));
+				state.emit(PPC64::stvx(1, SpAddr0GPR26, 9));
+				state.emit(PPC64::li(10, 16 * 2));
+				state.emit(PPC64::stvx(2, SpAddr0GPR26, 10));
+				state.emit(PPC64::li(11, 16 * 3));
+				state.emit(PPC64::stvx(3, SpAddr0GPR26, 11));
+			}
+		}
+	}
+
+	uint32_t JitCompilerPPC64::getTempGpr() {
+		static const uint32_t gprs[] = {6, 7, 8, 9, 10, 11, 12};
+		uint32_t reg = gprs[tempGprIndex];
+		tempGprIndex = (tempGprIndex + 1) % 7;
+		return reg;
+	}
+
+	uint32_t JitCompilerPPC64::getTempVr() {
+		static const uint32_t vrs[] = {12, 13, 14};
+		uint32_t reg = vrs[tempVrIndex];
+		tempVrIndex = (tempVrIndex + 1) % 3;
+		return reg;
+	}
+
+	void JitCompilerPPC64::emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags) {
+		// Set the Group E OR vector mask
+		state.emitAt(sizeConstants, pcfg.eMask[0]);
+		state.emitAt(sizeConstants + 8, pcfg.eMask[1]);
+
+		state.codePos = RandomXCodePos;
+
+		state.emit(codeVmPrologue, sizeVmPrologue);
+		// Mask mx and ma with Scratchpad L3 mask
+		uint32_t mask_begin = 32 - Log2(RANDOMX_SCRATCHPAD_L3);
+		uint32_t mask_end = 31 - Log2(RANDOMX_DATASET_ITEM_SIZE);
+		state.emit(PPC64::rlwinm(SpAddr0GPR26, MxGPR25, 0, mask_begin, mask_end));
+		state.emit(PPC64::rlwinm(SpAddr1GPR27, MaGPR24, 0, mask_begin, mask_end));
+		// Init spAddr0 to masked mx + scratchpad base
+		state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30));
+		// Init spAddr1 to masked ma + scratchpad base
+		state.emit(PPC64::add(SpAddr1GPR27, SpAddr1GPR27, ScratchpadPointerGPR30));
+
+		LoopBeginPos = state.codePos;
+
+		state.emit(codeVmLoopPrologue, sizeVmLoopPrologue);
+
+		// Reset the reciprocal pool counter
+		state.rcpCount = 0;
+
+		// Step 4: The 256 instructions stored in the Program Buffer are executed.
+		for (unsigned i = 0; i < RegistersCount; ++i) {
+			state.registerUsage[i] = -1;
+		}
+		for (unsigned i = 0; i < prog.getSize(flags); ++i) {
+			Instruction instr = prog(i);
+			instr.src %= RegistersCount;
+			instr.dst %= RegistersCount;
+			state.instructionOffsets[i] = state.codePos;
+			opcodeMap1[instr.opcode](this, state, instr, i, flags);
+		}
+	}
+
+	void JitCompilerPPC64::emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags) {
+		state.emit(codeVmSpadStoreGroupR, sizeVmSpadStoreGroupR);
+
+		if (flags & RANDOMX_FLAG_V2) {
+			if (flags & RANDOMX_FLAG_HARD_AES) {
+				if (!randomx::cpu.hasAes()) {
+					throw std::runtime_error("This CPU is missing support for hardware AES!");
+				}
+				state.emit(codeVmSpadStoreMixV2HardAes, sizeVmSpadStoreMixV2HardAes);
+			} else {
+				state.emit(codeVmSpadStoreMixV2SoftAes, sizeVmSpadStoreMixV2SoftAes);
+			}
+		} else {
+			state.emit(codeVmSpadStoreMixV1, sizeVmSpadStoreMixV1);
+		}
+
+		emitVmSpadStoreGroupF(state);
+
+		state.emit(PPC64::xor_(SpAddr0GPR26, RegisterMapR.getPpcGprNum(pcfg.readReg0), RegisterMapR.getPpcGprNum(pcfg.readReg1)));
+
+		// spAddr1 (r27) = r26 >> 32
+		state.emit(PPC64::srdi(SpAddr1GPR27, SpAddr0GPR26, 32));
+		// spAddr0 (r26) = r26 & 0xFFFFFFFF
+		state.emit(PPC64::rldicl(SpAddr0GPR26, SpAddr0GPR26, 0, 32));
+
+		// Apply Scratchpad L3 mask
+		uint32_t mb = 32 - Log2(RANDOMX_SCRATCHPAD_L3);
+		uint32_t me = 31 - Log2(RANDOMX_DATASET_ITEM_SIZE);
+		state.emit(PPC64::rlwinm(SpAddr0GPR26, SpAddr0GPR26, 0, mb, me));
+		state.emit(PPC64::rlwinm(SpAddr1GPR27, SpAddr1GPR27, 0, mb, me));
+
+		// Add scratchpad base pointer (r30)
+		state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30));
+		state.emit(PPC64::add(SpAddr1GPR27, SpAddr1GPR27, ScratchpadPointerGPR30));
+
+		int32_t fixPos = state.codePos;
+		state.emit(codeVmEpilogue, sizeVmEpilogue);
+
+		int32_t fixContinuePos = fixPos + offsetVmFixLoop;
+		state.emitAt(fixContinuePos, PPC64::b(LoopBeginPos - fixContinuePos));
+	}
+
+	JitCompilerPPC64::JitCompilerPPC64() {
+		state.code = (uint8_t*) allocMemoryPages(CodeSize);
+		if (state.code == nullptr)
+			throw std::runtime_error("allocMemoryPages");
+
+		state.codePos = 0;
+		state.emit(codeConstants, sizeConstants);
+
+		state.codePos = ConstantPoolSize;
+		entryProgram = state.code + state.codePos;
+		if (PPC_ABI_V2) {
+			// Load r2 with the base address of the constant pool
+			emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast<uint64_t>(state.code));
+		}
+		RandomXCodePos = state.codePos;
+
+		state.codePos = RandomXCodeSize;
+		entryDataInit = state.code + state.codePos;
+		if (PPC_ABI_V2) {
+			// Load r2 with the base address of the constant pool
+			emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast<uint64_t>(state.code));
+		}
+		int32_t datasetInitFixCallPos = state.codePos + offsetDatasetInitFixCall;
+		state.emit(codeDatasetInit, sizeDatasetInit);
+		SshashSingleItemPos = alignSize(state.codePos, 128);
+		// Patch in the call to the SuperScalar Hash single item function
+		state.emitAt(datasetInitFixCallPos, PPC64::bl(SshashSingleItemPos - datasetInitFixCallPos));
+
+#if !PPC_ABI_V2
+		// Initialize the ABI V1 function descriptors
+		descriptorProgram[0] = reinterpret_cast<uint64_t>(entryProgram);
+		descriptorProgram[1] = reinterpret_cast<uint64_t>(state.code);
+		descriptorProgram[2] = 0;
+
+		descriptorDataInit[0] = reinterpret_cast<uint64_t>(entryDataInit);
+		descriptorDataInit[1] = reinterpret_cast<uint64_t>(state.code);
+		descriptorDataInit[2] = 0;
+#endif
+	}
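On big-endian targets the ELFv1 ABI represents a function pointer as a three-doubleword descriptor rather than a raw code address, which is why the constructor fills in these triples for the two generated entry points. Schematically (a sketch of the standard layout; the actual arrays are class members):

```cpp
// ELFv1 "function descriptor": callers fetch the entry address and the
// TOC pointer (r2) from this triple. The JIT reuses the r2 slot for the
// base of its constant pool, and leaves the environment word at zero.
#include <cstdint>

struct FunctionDescriptor {
    uint64_t entry; // descriptorProgram[0]: address of the first instruction
    uint64_t toc;   // descriptorProgram[1]: loaded into r2 (constant-pool base)
    uint64_t env;   // descriptorProgram[2]: unused, 0
};
```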
+
+	JitCompilerPPC64::~JitCompilerPPC64() {
+		freePagedMemory(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableWriting() {
+		setPagesRW(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableExecution() {
+		setPagesRX(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableAll() {
+		setPagesRWX(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
+		emitProgramPrefix(state, prog, pcfg, flags);
+
+		int mtReg = MaGPR24;
+		int mpReg = MxGPR25;
+
+		if (flags & RANDOMX_FLAG_V2) {
+			// Step 5a: Save ma in mt (r9, temporary)
+			mtReg = 9;
+			state.emit(PPC64::mr(mtReg, MaGPR24));
+
+			mpReg = MaGPR24;
+		}
+
+		// Step 5b: the mp register is XORed with the low 32 bits of registers readReg2 and readReg3
+		state.emit(PPC64::xor_(8, RegisterMapR.getPpcGprNum(pcfg.readReg2), RegisterMapR.getPpcGprNum(pcfg.readReg3)));
+		// Zero-extend r8 to 32 bits (clear upper 32 bits)
+		state.emit(PPC64::rldicl(8, 8, 0, 32));
+		// mp ^= (readReg2 ^ readReg3)
+		state.emit(PPC64::xor_(mpReg, mpReg, 8));
+
+		int32_t dataReadPos = state.codePos;
+		state.emit(codeVmDataRead, sizeVmDataRead);
+
+		uint32_t mask_begin = 32 - Log2(RANDOMX_DATASET_BASE_SIZE);
+		uint32_t mask_end = 31 - Log2(CacheLineSize);
+
+		// Patch prefetch address calculation (offset 0)
+		state.emitAt(dataReadPos, PPC64::rlwinm(8, mpReg, 0, mask_begin, mask_end));
+
+		// Patch read address calculation (offset 12)
+		state.emitAt(dataReadPos + 12, PPC64::rlwinm(8, mtReg, 0, mask_begin, mask_end));
+
+		emitProgramSuffix(state, pcfg, flags);
+
+		syncInstructionCache(entryProgram, state.code + state.codePos);
+	}
+
+	void JitCompilerPPC64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
+		emitProgramPrefix(state, prog, pcfg, flags);
+
+		int mtReg = MaGPR24;
+		int mpReg = MxGPR25;
+
+		if (flags & RANDOMX_FLAG_V2) {
+			// Step 5a: Save ma in mt (r9, temporary)
+			mtReg = 9;
+			state.emit(PPC64::mr(mtReg, MaGPR24));
+
+			mpReg = MaGPR24;
+		}
+
+		// Step 5b: the mp register is XORed with the low 32 bits of registers readReg2 and readReg3
+		state.emit(PPC64::xor_(8, RegisterMapR.getPpcGprNum(pcfg.readReg2), RegisterMapR.getPpcGprNum(pcfg.readReg3)));
+		// Zero-extend r8 to 32 bits (clear upper 32 bits)
+		state.emit(PPC64::rldicl(8, 8, 0, 32));
+		// mp ^= (readReg2 ^ readReg3)
+		state.emit(PPC64::xor_(mpReg, mpReg, 8));
+
+		// Calculate itemNumber = (mt & datasetMask) / CacheLineSize
+		uint32_t datasetMask = (RANDOMX_DATASET_BASE_SIZE - 1) & ~63;
+		emitMovImm32(state, 8, datasetMask);
+		state.emit(PPC64::and_(5, mtReg, 8)); // r5 = mt & datasetMask
+		state.emit(PPC64::srdi(5, 5, Log2(CacheLineSize))); // r5 = r5 >> 6
+
+		emitAddImm32(state, 8, 5, 5, datasetOffset / CacheLineSize);
+
+		int32_t callPos = state.codePos + offsetVmDataReadLightFixCall;
+		state.emit(codeVmDataReadLight, sizeVmDataReadLight);
+		state.emitAt(callPos, PPC64::bl(SshashSingleItemPos - callPos));
+
+		emitProgramSuffix(state, pcfg, flags);
+
+		syncInstructionCache(entryProgram, state.code + state.codePos);
+	}
+
+	static void generateSuperscalarCode(CompilerState& state, Instruction instr, const std::vector<uint64_t>& reciprocalCache) {
+		int dst = RegisterMapSsh.getPpcGprNum(instr.dst);
+		int src = RegisterMapSsh.getPpcGprNum(instr.src);
+		uint32_t rotation = instr.getImm32() & 63;
+
+		switch ((SuperscalarInstructionType)instr.opcode) {
+		case SuperscalarInstructionType::ISUB_R:
+			// subf dst, src, dst
+			state.emit(PPC64::subf(dst, src, dst));
+			break;
+		case SuperscalarInstructionType::IXOR_R:
+			// xor dst, dst, src
+			state.emit(PPC64::xor_(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IADD_RS:
+			// sldi r8, src, shift
+			state.emit(PPC64::sldi(8, src, instr.getModShift()));
+			// add dst, dst, r8
+			state.emit(PPC64::add(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IMUL_R:
+			// mulld dst, dst, src
+			state.emit(PPC64::mulld(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IROR_C:
+			if (rotation) {
+				// rotrdi dst, dst, imm
+				state.emit(PPC64::rotrdi(dst, dst, rotation));
+			}
+			break;
+		case SuperscalarInstructionType::IADD_C7:
+		case SuperscalarInstructionType::IADD_C8:
+		case SuperscalarInstructionType::IADD_C9:
+			emitMovImm32(state, 8, instr.getImm32());
+			// add dst, dst, r8
+			state.emit(PPC64::add(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IXOR_C7:
+		case SuperscalarInstructionType::IXOR_C8:
+		case SuperscalarInstructionType::IXOR_C9:
+			emitMovImm32(state, 8, instr.getImm32());
+			// xor dst, dst, r8
+			state.emit(PPC64::xor_(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IMULH_R:
+			// mulhdu dst, dst, src
+			state.emit(PPC64::mulhdu(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::ISMULH_R:
+			// mulhd dst, dst, src
+			state.emit(PPC64::mulhd(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IMUL_RCP:
+			emitMovImm64(state, 8, reciprocalCache[instr.getImm32()]);
+			// mulld dst, dst, r8
+			state.emit(PPC64::mulld(dst, dst, 8));
+			break;
+		default:
+			UNREACHABLE;
+		}
+	}
+
+	void JitCompilerPPC64::generateSuperscalarHash(SuperscalarProgramList& programs, std::vector<uint64_t> &reciprocalCache) {
+		state.codePos = SshashSingleItemPos;
+
+		// Steps 1 and 2
+		state.emit(codeSshashSingleItemPrologue, sizeSshashSingleItemPrologue);
+
+		for (size_t i = 0; i < programs.size(); ++i) {
+			SuperscalarProgram& prog = programs[i];
+
+			// Step 4
+			// rldic r8, r5, Log2(CacheLineSize), 64 - Log2(CacheSize / CacheLineSize) - Log2(CacheLineSize)
+			state.emit(PPC64::rldic(8, 5, Log2(CacheLineSize), 64 - Log2(CacheSize / CacheLineSize) - Log2(CacheLineSize)));
+			state.emit(codeSshashCachePrefetch + 4, sizeSshashCachePrefetch - 4);
+
+			// Step 5
+			for (uint32_t j = 0; j < prog.getSize(); ++j) {
+				Instruction& instr = prog(j);
+				generateSuperscalarCode(state, instr, reciprocalCache);
+			}
+
+			// Step 6
+			state.emit(codeSshashXor, sizeSshashXor);
+
+			uint32_t addrReg = RegisterMapSsh.getPpcGprNum(prog.getAddressRegister());
+			state.emit(PPC64::mr(5, addrReg));
+
+		}
+
+		// Return
+		state.emit(codeSshashSingleItemEpilogue, sizeSshashSingleItemEpilogue);
+
+		syncInstructionCache(entryDataInit, state.code + state.codePos);
+	}
+
+	size_t JitCompilerPPC64::getCodeSize() {
+		return CodeSize;
+	}
+
+	static void h_IADD_RS(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		int shift = isn.getModShift();
+
+		if (shift) {
+			uint32_t tmp_gpr = jit->getTempGpr();
+			state.emit(PPC64::sldi(tmp_gpr, src, shift));
+			state.emit(PPC64::add(dst, dst, tmp_gpr));
+		} else {
+			state.emit(PPC64::add(dst, dst, src));
+		}
+
+		if (isn.dst == RegisterNeedsDisplacement) {
+			emitAddImm32(state, jit->getTempGpr(), dst, dst, isn.getImm32());
+		}
+	}
+	static void h_IADD_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::add(dst, dst, tmp_gpr));
+	}
+	static void h_ISUB_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		if (isn.src != isn.dst) {
+			int src = RegisterMapR.getPpcGprNum(isn.src);
+			state.emit(PPC64::subf(dst, src, dst));
+		} else {
+			int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32());
+			emitAddImm32(state, jit->getTempGpr(), dst, dst, imm);
+		}
+	}
+	static void h_ISUB_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::subf(dst, tmp_gpr, dst));
+	}
+	static void h_IMUL_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		if (isn.src != isn.dst) {
+			int src = RegisterMapR.getPpcGprNum(isn.src);
+			state.emit(PPC64::mulld(dst, dst, src));
+		} else {
+			uint32_t tmp_gpr = jit->getTempGpr();
+			emitMovImm32(state, tmp_gpr, isn.getImm32());
+			state.emit(PPC64::mulld(dst, dst, tmp_gpr));
+		}
+	}
+	static void h_IMUL_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::mulld(dst, dst, tmp_gpr));
+	}
+	static void h_IMULH_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		state.emit(PPC64::mulhdu(dst, dst, src));
+	}
+	static void h_IMULH_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::mulhdu(dst, dst, tmp_gpr));
+	}
+	static void h_ISMULH_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		state.emit(PPC64::mulhd(dst, dst, src));
+	}
+	static void h_ISMULH_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::mulhd(dst, dst, tmp_gpr));
+	}
+	static void h_IMUL_RCP(HANDLER_ARGS) {
+		uint32_t divisor = isn.getImm32();
+		if (!isZeroOrPowerOf2(divisor)) {
+			state.registerUsage[isn.dst] = i;
+			int dst = RegisterMapR.getPpcGprNum(isn.dst);
+			uint32_t tmp_gpr = jit->getTempGpr();
+
+			// Calculate and cache the reciprocal
+			int32_t offset = ReciprocalPoolPos + 8 * state.rcpCount++;
+			uint64_t rcp = randomx_reciprocal_fast(divisor);
+			state.emitAt(offset, rcp);
+
+			state.emit(PPC64::ld(tmp_gpr, offset, ConstantsBaseAddressRegisterGPR2));
+			state.emit(PPC64::mulld(dst, dst, tmp_gpr));
+		}
+	}
+	static void h_INEG_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		state.emit(PPC64::neg(dst, dst));
+	}
+	static void h_IXOR_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		if (isn.src != isn.dst)
{ + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::xor_(dst, dst, src)); + } else { + // Note: RandomX 32-bit immediates are sign-extended to 64 bits. + // xori/xoris zero-extend their 16-bit immediate, so they only match + // the sign-extended semantics when the imm32 is non-negative as a + // signed 32-bit value (i.e., <= 0x7FFFFFFF). + uint32_t imm = isn.getImm32(); + if (imm <= 0xFFFF) { + // Fits in unsigned 16 bits; XOR of upper bits is a no-op. + state.emit(PPC64::xori(dst, dst, imm)); + } else if ((imm & 0xFFFF) == 0 && imm <= 0x7FFFFFFF) { + // Only the high 16 bits are nonzero, and the value is non-negative. + state.emit(PPC64::xoris(dst, dst, (imm >> 16) & 0xFFFF)); + } else { + uint32_t tmp_gpr = jit->getTempGpr(); + emitMovImm32(state, tmp_gpr, imm); + state.emit(PPC64::xor_(dst, dst, tmp_gpr)); + } + } + } + static void h_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t tmp_gpr = jit->getTempGpr(); + emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn); + state.emit(PPC64::xor_(dst, dst, tmp_gpr)); + } + static void h_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t tmp_gpr = jit->getTempGpr(); + state.emit(PPC64::neg(tmp_gpr, src)); + state.emit(PPC64::rldcl(dst, dst, tmp_gpr, 0)); + } else { + uint32_t imm = isn.getImm32() & 63; + if (imm) + state.emit(PPC64::rotrdi(dst, dst, imm)); + } + } + static void h_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::rldcl(dst, dst, src, 0)); + } else { + uint32_t imm = isn.getImm32() & 63; + if (imm) + state.emit(PPC64::rotldi(dst, dst, imm)); + } + } + static void h_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t tmp_gpr = jit->getTempGpr(); + state.emit(PPC64::mr(tmp_gpr, dst)); + state.emit(PPC64::mr(dst, src)); + state.emit(PPC64::mr(src, tmp_gpr)); + } + } + static void h_FSWAP_R(HANDLER_ARGS) { + int dst = RegisterMapFE.getPpcVsrNum(isn.dst); + state.emit(PPC64::xxswapd(dst, dst)); + } + static void h_FADD_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvadddp(dst, dst, src)); + } + static void h_FADD_M(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::xvadddp(dst, dst, 32 + tmp_vr)); + } + static void h_FSUB_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvsubdp(dst, dst, src)); + } + static void h_FSUB_M(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::xvsubdp(dst, dst, 32 + tmp_vr)); + } + static void h_FSCAL_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVrNum(isn.dst); + state.emit(PPC64::vxor(dst, dst, 
ConstantVectorFscalXorMaskVR18)); + } + static void h_FMUL_R(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvmuldp(dst, dst, src)); + } + static void h_FDIV_M(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::vsel(tmp_vr, ConstantVectorGroupEOrMaskVR19, tmp_vr, ConstantVectorGroupEAndMaskVR17)); + state.emit(PPC64::xvdivdp(dst, dst, 32 + tmp_vr)); + } + static void h_FSQRT_R(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + state.emit(PPC64::xvsqrtdp(dst, dst)); + } + static void h_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + ConditionOffset; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + + int dst = RegisterMapR.getPpcGprNum(reg); + emitAddImm32(state, jit->getTempGpr(), dst, dst, imm); + + // Calculate the Mask Begin (MB) parameter + uint32_t mb = 64 - RANDOMX_JUMP_BITS; + + // rldicl. tmp_gpr, dst, 64 - shift, mb + state.emit(PPC64::rldicl_dot(jit->getTempGpr(), dst, (64 - shift) & 63, mb)); + + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + + if (offset >= -(1 << 15) && offset < (1 << 15)) { + state.emit(PPC64::beq_predict_not_taken(offset)); + } else { + // Branch over the jump if not equal + state.emit(PPC64::bne_predict_taken(8)); + state.emit(PPC64::b(offset - 4)); + } + + for (unsigned j = 0; j < RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + static void h_CFROUND(HANDLER_ARGS) { + int src = RegisterMapR.getPpcGprNum(isn.src); + int32_t rotateBits = isn.getImm32() & 63; + + // Operate directly on src by default + int rot_src = src; + + // Rotate right by rotateBits + if (rotateBits) { + uint32_t tmp_gpr = jit->getTempGpr(); + + // rotrdi tmp_gpr, src, rotateBits + state.emit(PPC64::rotrdi(tmp_gpr, src, rotateBits)); + + // We rotated src and put the new value in tmp_gpr + rot_src = tmp_gpr; + } + + int32_t patch_pos = 0; + if (flags & RANDOMX_FLAG_V2) { + // Skip the rest of the code if bits 5:2 are not zero. Use GPR0 as a discard register. + // andi. r0, rot_src, 0x003C + state.emit(PPC64::andi_dot(0, rot_src, 0x003C)); + + // Get position to patch with conditional branch. + patch_pos = state.codePos; + + // Emit invalid instruction now and patch later once we have the code length. 
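+		// Note: a word of all zeros is an invalid instruction form on Power,
+		// so if this placeholder were ever left unpatched, executing it would
+		// trap rather than silently do the wrong thing.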
+ state.emit(0); // bne skip_update + } + + uint32_t offset_gpr = jit->getTempGpr(); + + // Isolate bits 1:0 and multiply by 8 (shift left by 3) to get the table byte offset (0, 8, 16, 24) + // rldic offset_gpr, rot_src, 3, 59 + state.emit(PPC64::rldic(offset_gpr, rot_src, 3, 59)); + + uint32_t address_gpr = jit->getTempGpr(); + + // Load table address into scratch address_gpr + emitAddImm32(state, jit->getTempGpr(), address_gpr, ConstantsBaseAddressRegisterGPR2, offsetConstantLutFprcToFpscr); + + // Load value from fprc-to-FPSCR table into temporary FPR0 + // lfdx f0, offset_gpr, address_gpr + state.emit(PPC64::lfdx(0, offset_gpr, address_gpr)); + + if (randomx::cpu.hasV3P0()) { + // Move the RN value from scratch FPR0 to FPSCR field RN + // mffscrn f0, f0 + state.emit(PPC64::mffscrn(0, 0)); + } else { + // Move the RN value from scratch FPR0 to FPSCR (masked) + // mtfsf 0x01, f0, 0, 0 + state.emit(PPC64::mtfsf(0x01, 0, 0, 0)); + } + + if (flags & RANDOMX_FLAG_V2) { + // Patch in the conditional branch instruction. We predict that the branch is taken because + // there's only a 1-in-16 chance of bits 5:2 of the rotated value being equal to zero and + // falling through to the RN-update code. + int32_t branch_offset = state.codePos - patch_pos; + state.emitAt(patch_pos, PPC64::bne_predict_taken(branch_offset)); + } + } + static void h_ISTORE(HANDLER_ARGS) { + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t imm = isn.getImm32(); + uint32_t tmp_gpr = jit->getTempGpr(); + + uint32_t size; + if (isn.getModCond() < StoreL3Condition) { + size = isn.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2; + } else { + size = RANDOMX_SCRATCHPAD_L3; + } + imm &= size - 1; + + emitAddImm32(state, jit->getTempGpr(), tmp_gpr, dst, imm); + + uint32_t mb = 32 - Log2(size); + state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28)); + + emitStoreGpr64(state, src, ScratchpadPointerGPR30, tmp_gpr); + } + static void h_NOP(HANDLER_ARGS) { + } +} + +#include "instruction_weights.hpp" + +namespace { + +#define INST_HANDLE(x) REPN(&randomx::h_##x, WT(x)) + + InstructionHandler* opcodeMap1[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) + }; + +#undef INST_HANDLE +} + +#define INST_HANDLE(x) REPN(static_cast<uint8_t>(randomx::InstructionType::x), WT(x)) + +alignas(128) uint8_t randomx::JitCompilerPPC64::instMap[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + 
INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) +}; diff --git a/src/jit_compiler_ppc64.hpp b/src/jit_compiler_ppc64.hpp new file mode 100644 index 00000000..b9392d92 --- /dev/null +++ b/src/jit_compiler_ppc64.hpp @@ -0,0 +1,120 @@ +/* +Copyright (c) 2023 tevador +Copyright (c) 2026, Forest Crossman + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include <vector> + +#include "common.hpp" +#include "jit_compiler.hpp" + +#include "jit_compiler_ppc64_static.hpp" + +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + #define PPC_BIG_ENDIAN 1 +#else + #define PPC_BIG_ENDIAN 0 +#endif + +#if (defined(_CALL_ELF) && _CALL_ELF == 2) || (!defined(_CALL_ELF) && !PPC_BIG_ENDIAN) + #define PPC_ABI_V2 1 +#else + #define PPC_ABI_V2 0 +#endif + +namespace randomx { + + class Program; + struct ProgramConfiguration; + class SuperscalarProgram; + class Instruction; + + class JitCompilerPPC64 { + public: + JitCompilerPPC64(); + ~JitCompilerPPC64(); + + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + + void generateSuperscalarHash(SuperscalarProgramList& programs, std::vector<uint64_t> &); + + void generateDatasetInitCode() {} + + ProgramFunc* getProgramFunc() { +#if PPC_ABI_V2 + return reinterpret_cast<ProgramFunc*>(entryProgram); +#else + return reinterpret_cast<ProgramFunc*>(descriptorProgram); +#endif + } + DatasetInitFunc* getDatasetInitFunc() { +#if PPC_ABI_V2 + return reinterpret_cast<DatasetInitFunc*>(entryDataInit); +#else + return reinterpret_cast<DatasetInitFunc*>(descriptorDataInit); +#endif + } + uint8_t* getCode() { return state.code; } + size_t getCodeSize(); + + void enableWriting(); + void enableExecution(); + void enableAll(); + + void setFlags(randomx_flags f) { flags = f; } + + uint32_t getTempGpr(); + uint32_t getTempVr(); + + static uint8_t instMap[256]; + + private: + void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags); + void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags); + + CompilerState state; + randomx_flags flags; + + void* entryDataInit = nullptr; + void* entryProgram = nullptr; +#if !PPC_ABI_V2 + uint64_t descriptorProgram[3]; + uint64_t descriptorDataInit[3]; +#endif + + int32_t RandomXCodePos; + int32_t SshashSingleItemPos; + int32_t LoopBeginPos; + + uint32_t tempGprIndex = 0; + uint32_t tempVrIndex = 0; + }; + +} diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S new file mode 100644 index 00000000..ad3666fc --- /dev/null +++ b/src/jit_compiler_ppc64_static.S @@ -0,0 +1,1389 @@ +/* +Copyright (c) 2026, Forest Crossman + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + .machine power7 + .machine altivec + .section ".rodata" // Not .text because it's not meant to be executed in-place. + +#include "configuration.h" + +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + #define PPC_BIG_ENDIAN 1 +#else + #define PPC_BIG_ENDIAN 0 +#endif + +#if (defined(_CALL_ELF) && _CALL_ELF == 2) || (!defined(_CALL_ELF) && !PPC_BIG_ENDIAN) + #define PPC_ABI_V2 1 +#else + #define PPC_ABI_V2 0 +#endif + +#if PPC_ABI_V2 + .abiversion 2 + #define C_FUNCTION(name) \ + .global name; \ + name: +#else + .abiversion 1 + #define C_FUNCTION(name) \ + .section ".opd","aw"; \ + .align 3; \ + .global name; \ + name: \ + .quad .name, .TOC.@tocbase, 0; \ + .previous; \ + .global .name; \ + .name: +#endif + + .global randomx_ppc64_constants + .global randomx_ppc64_constant_lut_fprc_to_fpscr + .global randomx_ppc64_constants_end + + .global randomx_ppc64_dataset_init + .global randomx_ppc64_dataset_init_fix_call + .global randomx_ppc64_dataset_init_end + + .global randomx_ppc64_sshash_single_item_prologue + .global randomx_ppc64_sshash_single_item_prologue_end + .global randomx_ppc64_sshash_single_item_epilogue + .global randomx_ppc64_sshash_single_item_epilogue_end + .global randomx_ppc64_sshash_cache_prefetch + .global randomx_ppc64_sshash_cache_prefetch_end + .global randomx_ppc64_sshash_xor + .global randomx_ppc64_sshash_xor_end + + .global randomx_ppc64_vm_prologue + .global randomx_ppc64_vm_prologue_end + .global randomx_ppc64_vm_epilogue + .global randomx_ppc64_vm_fix_loop + .global randomx_ppc64_vm_epilogue_end + .global randomx_ppc64_vm_loop_prologue + .global randomx_ppc64_vm_loop_prologue_end + .global randomx_ppc64_vm_data_read + .global randomx_ppc64_vm_data_read_end + .global randomx_ppc64_vm_data_read_light + .global randomx_ppc64_vm_data_read_light_fix_call + .global randomx_ppc64_vm_data_read_light_end + .global randomx_ppc64_vm_spad_store_group_r + .global randomx_ppc64_vm_spad_store_group_r_end + .global randomx_ppc64_vm_spad_store_mix_v1 + .global randomx_ppc64_vm_spad_store_mix_v1_end + .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes + .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end + .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes + .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end + +// Macro to shuffle a VR after being loaded with lxsdx. 
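+// lxsdx places the 8 loaded bytes in the most-significant doubleword of the
+// target VSR, while xvcvsxwdp (used in the loop prologue below) takes its two
+// signed-word inputs from word elements 0 and 2. The shuffle therefore
+// replicates the two 32-bit halves of the loaded doubleword into those word
+// elements; on big-endian it also byte-swaps the little-endian scratchpad
+// words via the permutation mask in v16.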
+.macro SHUFFLE_VR vr_reg +#if PPC_BIG_ENDIAN + vperm \vr_reg, \vr_reg, \vr_reg, %v16 +#else + xxmrghw \vr_reg + 32, \vr_reg + 32, \vr_reg + 32 +#endif +.endm + +// Macro to load a GPR from little-endian bytes in memory +// Clobbers (BE only): r0 +.macro LOAD_LE_GPR reg, offset, base_reg +#if PPC_BIG_ENDIAN + li %r0, \offset + ldbrx \reg, \base_reg, %r0 +#else + ld \reg, \offset(\base_reg) +#endif +.endm + +// Macro to store a GPR to memory as little-endian bytes +// Clobbers (BE only): r0 +.macro STORE_LE_GPR reg, offset, base_reg +#if PPC_BIG_ENDIAN + li %r0, \offset + stdbrx \reg, \base_reg, %r0 +#else + std \reg, \offset(\base_reg) +#endif +.endm + + // Align constants to 128 bytes (lowest 7 bits masked) + .align 7 + +randomx_ppc64_constants: + +sshash_constant_0: .8byte 6364136223846793005 +sshash_constant_1: .8byte 9298411001130361340 +sshash_constant_2: .8byte 12065312585734608966 +sshash_constant_3: .8byte 9306329213124626780 +sshash_constant_4: .8byte 5281919268842080866 +sshash_constant_5: .8byte 10536153434571861004 +sshash_constant_6: .8byte 3398623926847679864 +sshash_constant_7: .8byte 9549104520008361294 + +randomx_ppc64_constant_lut_fprc_to_fpscr: + // RandomX fprc to PPC64 FPSCR lookup table + .8byte 0 /* 00 Round to Nearest */ + .8byte 3 /* 11 Round toward -Infinity */ + .8byte 2 /* 10 Round toward +Infinity */ + .8byte 1 /* 01 Round toward Zero */ + + // Align vector constants to 16 bytes (lowest 4 bits masked) + .align 4 + +constant_vector_group_e_and_mask: + .8byte 0x00FFFFFFFFC00000 + .8byte 0x00FFFFFFFFC00000 + +constant_vector_fscal_xor_mask: + .8byte 0x80F0000000000000 + .8byte 0x80F0000000000000 + +constant_vector_byte_reverse_mask: + // Vector byte reverse mask + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#if PPC_BIG_ENDIAN +constant_vector_be_permutation_mask: + // Big-endian vector permutation mask + .byte 7, 6, 5, 4, 7, 6, 5, 4 + .byte 3, 2, 1, 0, 3, 2, 1, 0 +#endif + +constant_vector_soft_aes_galois_field_inversion_lo: + .octa 0xf001080d0f06050e020c0b0a09030704 +constant_vector_soft_aes_galois_field_inversion_hi: + .octa 0xf0070b0f060a0401090805020c0e0d03 +constant_vector_soft_aes_mixcolumns_forward: + .octa 0x03000102070405060b08090a0f0c0d0e +constant_vector_soft_aes_mixcolumns_backward: + .octa 0x0102030005060704090a0b080d0e0f0c + +constant_vector_soft_aes_shiftrows: + .octa 0x04090e03080d02070c01060b00050a0f +constant_vector_soft_aes_encrypt_input_transform_lo: + .octa 0x00702a5a98e8b2c20878225290e0baca +constant_vector_soft_aes_encrypt_input_transform_hi: + .octa 0x004d7c317d30014c81ccfdb0fcb180cd +constant_vector_soft_aes_subbytes_mul1_lo: + .octa 0x0023e2fa15d41836efd92e0dc1ccf73b +constant_vector_soft_aes_subbytes_mul1_hi: + .octa 0x003e50cb8fe19bb144f52a146e7adfa5 +constant_vector_soft_aes_subbytes_mul2_lo: + .octa 0x0029e10a4088eb694a2382abc863a1c2 +constant_vector_soft_aes_subbytes_mul2_hi: + .octa 0x0024710bc6937ae2cd2f98bc55e9b75e +constant_vector_soft_aes_encrypt_63: + .octa 0x5b5b5b5b5b5b5b5b5b5b5b5b5b5b5b5b +constant_vector_soft_aes_encrypt_output_transform_lo: + .octa 0x0060b6d629499fff0868bede214197f7 +constant_vector_soft_aes_encrypt_output_transform_hi: + .octa 0x00ecbc5051bded01e00c5cb0b15d0de1 + +constant_vector_soft_aes_invshiftrows: + .octa 0x0c090603000d0a0704010e0b0805020f +constant_vector_soft_aes_decrypt_input_transform_lo: + .octa 0x005f540b045b500f1a454e111e414a15 +constant_vector_soft_aes_decrypt_input_transform_hi: + .octa 0x00650560e683e38694f191f472177712 +constant_vector_soft_aes_invsubbytes_mul9_lo: + .octa 
0x00d6869a53031c85c94c994f501fd5ca +constant_vector_soft_aes_invsubbytes_mul9_hi: + .octa 0x0049d7ec89173bc065a5fbb29e2c5e72 +constant_vector_soft_aes_invsubbytes_mulD_lo: + .octa 0x00a2b1e6dfcc577d39442a88139b6ef5 +constant_vector_soft_aes_invsubbytes_mulD_hi: + .octa 0x00cbc624f7fae23cd3efde150d183129 +constant_vector_soft_aes_invsubbytes_mulB_lo: + .octa 0x0042b496926422d004d4f2b0f6462660 +constant_vector_soft_aes_invsubbytes_mulB_hi: + .octa 0x006759cda69894c16baa55323e0cfff3 +constant_vector_soft_aes_invsubbytes_mulE_lo: + .octa 0x00d0d4269692f246b0f6b46404604222 +constant_vector_soft_aes_invsubbytes_mulE_hi: + .octa 0x00c1aaffcda6550c323e59986bf36794 +constant_vector_soft_aes_decrypt_63: + .octa 0xe8e8e8e8e8e8e8e8e8e8e8e8e8e8e8e8 +constant_vector_soft_aes_decrypt_output_transform_lo: + .octa 0x0024dffb0420dbfff8dc2703fcd82307 +constant_vector_soft_aes_decrypt_output_transform_hi: + .octa 0x002f19362906301fab84b29d82ad9bb4 + +randomx_ppc64_constants_end: + +literal_vector_group_e_or_mask: + // Program generator will write the vector here + + +// Register allocations: dataset_init +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to randomx_cache +// r4 (volatile) -> arg1, pointer to dataset (uint8_t *) +// r5 (volatile) -> arg2, uint32_t startBlock / itemNumber / initial cacheIndex +// r6 (volatile) -> arg3, uint32_t endBlock +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r4 (volatile) -> arg1, pointer to dataset (uint8_t *) +// r5 (volatile) -> arg2, uint32_t startBlock / itemNumber / initial cacheIndex +// r6 (volatile) -> arg3, uint32_t endBlock +// r7-r12 (volatile) -> scratch registers +// r14 (non-volatile) -> saved pointer to dataset (uint8_t *) +// r15 (non-volatile) -> saved itemNumber +// r16-r31 (non-volatile) -> unused + +randomx_ppc64_dataset_init: + // JIT compiler MUST emit immediate load to r2 before this code (ABI v2 only) + + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -128(%r1) + std %r14, 112(%r1) + std %r15, 120(%r1) + + // Load cache->memory pointer + ld %r3, 0(%r3) + + // Save the dataset pointer (r4) to r14 + mr %r14, %r4 + + // Save the itemNumber (r5) to r15 + mr %r15, %r5 + + // Loop setup + // for (size_t itemNumber = startBlock; itemNumber < endBlock; itemNumber++) { ... } + sub %r8, %r6, %r5 + mtctr %r8 + +1: + // r5 gets clobbered by the item-hashing function, so we need to restore it + // from r15 before calling the function again. 
+ mr %r5, %r15 + +randomx_ppc64_dataset_init_fix_call: + // JIT compiler MUST patch this to bl to the item hashing function + b 0 + + // Store the 64 computed bytes back in the dataset + std %r4, 8*0(%r14) + std %r6, 8*1(%r14) + std %r7, 8*2(%r14) + std %r9, 8*3(%r14) + std %r10, 8*4(%r14) + std %r11, 8*5(%r14) + std %r12, 8*6(%r14) + std %r5, 8*7(%r14) + + // Increment the dataset pointer by 64 bytes + addi %r14, %r14, 8*8 + + // Increment the itemNumber by one + addi %r15, %r15, 1 + + // Loop + bdnz 1b + + // Standard function epilogue + ld %r14, 112(%r1) + ld %r15, 120(%r1) + addi %r1, %r1, 128 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_dataset_init_end: + + +// Register allocations: sshash_single_item +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r5 (volatile) -> arg2, uint32_t itemNumber +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r4 (volatile) -> SuperscalarHash r0 +// r5 (volatile) -> cacheIndex, set to SuperscalarHash r7 on return +// r6-r7 (volatile) -> SuperscalarHash r1-r2 +// r8 (volatile) -> scratch register +// r9-r12 (volatile) -> SuperscalarHash r3-r6 +// r14-r21 (non-volatile) -> unused +// r22 (non-volatile) -> SuperscalarHash r7 +// r23 (non-volatile) -> cache line address +// r24-r31 (non-volatile) -> unused + +randomx_ppc64_sshash_single_item_prologue: + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -128(%r1) + std %r22, 112(%r1) + std %r23, 120(%r1) + + // Step 1. Initialize registers + + // r0 = (itemNumber + 1) * 6364136223846793005 + ld %r8, (sshash_constant_0-randomx_ppc64_constants)(%r2) + addi %r0, %r5, 1 + mulld %r4, %r8, %r0 + + // r1 = r0 ^ 9298411001130361340 + ld %r8, (sshash_constant_1-randomx_ppc64_constants)(%r2) + xor %r6, %r4, %r8 + + // r2 = r0 ^ 12065312585734608966 + ld %r8, (sshash_constant_2-randomx_ppc64_constants)(%r2) + xor %r7, %r4, %r8 + + // r3 = r0 ^ 9306329213124626780 + ld %r8, (sshash_constant_3-randomx_ppc64_constants)(%r2) + xor %r9, %r4, %r8 + + // r4 = r0 ^ 5281919268842080866 + ld %r8, (sshash_constant_4-randomx_ppc64_constants)(%r2) + xor %r10, %r4, %r8 + + // r5 = r0 ^ 10536153434571861004 + ld %r8, (sshash_constant_5-randomx_ppc64_constants)(%r2) + xor %r11, %r4, %r8 + + // r6 = r0 ^ 3398623926847679864 + ld %r8, (sshash_constant_6-randomx_ppc64_constants)(%r2) + xor %r12, %r4, %r8 + + // r7 = r0 ^ 9549104520008361294 + ld %r8, (sshash_constant_7-randomx_ppc64_constants)(%r2) + xor %r22, %r4, %r8 + + // Step 2. Use r5 (itemNumber) as cacheIndex so it can be used to generate the initial cache line mask + +randomx_ppc64_sshash_single_item_prologue_end: + +randomx_ppc64_sshash_single_item_epilogue: + // Return SuperscalarHash r7 in GPR5 + mr %r5, %r22 + + // Standard function epilogue + ld %r22, 112(%r1) + ld %r23, 120(%r1) + addi %r1, %r1, 128 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_sshash_single_item_epilogue_end: + + +// Step 4. Load a 64-byte item from the Cache. The item index is given by cacheIndex modulo the total number of 64-byte items in Cache. 
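+// As a worked example (assuming the default RandomX parameters: a 256 MiB
+// cache of 64-byte items, i.e. 2^22 items), the JIT patches the rldic below
+// to:
+//
+//   rldic %r8, %r5, 6, 36   // r8 = (cacheIndex << 6) & (CacheSize - 64)
+//
+// which is the byte offset of item (cacheIndex mod 2^22) within the cache.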
+randomx_ppc64_sshash_cache_prefetch: + // Actual mask MUST be inserted by JIT compiler + rldic %r8, %r5, 0, 63 + add %r23, %r3, %r8 + dcbt 0, %r23, 0 + // If TH=0b00000, the dcbt/dcbtst instruction provides a + // hint that the program will probably soon access the + // block containing the byte addressed by EA. + +randomx_ppc64_sshash_cache_prefetch_end: + +// Step 6. XOR all registers with data loaded from randomx cache +randomx_ppc64_sshash_xor: + ld %r8, 0(%r23) + ld %r0, 8(%r23) + xor %r4, %r4, %r8 + xor %r6, %r6, %r0 + ld %r8, 16(%r23) + ld %r0, 24(%r23) + xor %r7, %r7, %r8 + xor %r9, %r9, %r0 + ld %r8, 32(%r23) + ld %r0, 40(%r23) + xor %r10, %r10, %r8 + xor %r11, %r11, %r0 + ld %r8, 48(%r23) + ld %r0, 56(%r23) + xor %r12, %r12, %r8 + xor %r22, %r22, %r0 + +randomx_ppc64_sshash_xor_end: + + +// Register allocations: vm +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to RegisterFile +// r4 (volatile) -> arg1, pointer to MemoryRegisters +// r5 (volatile) -> arg2, pointer to scratchpad (uint8_t *) +// r6 (volatile) -> arg3, uint64_t loop iterations +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0 for SuperScalarHash (dataset pointer) +// r4 (volatile) -> scratch register +// r5 (volatile) -> arg2 for SuperScalarHash (loop iteration) +// r6-r12 (volatile) -> scratch registers +// r14-r21 (non-volatile) -> RandomX integer registers r0-r7 +// r22 (non-volatile) -> dataset pointer (memory) +// r23 (non-volatile) -> unused +// r24 (non-volatile) -> ma +// r25 (non-volatile) -> mx +// r26 (non-volatile) -> spAddr0 +// r27 (non-volatile) -> spAddr1 +// r28 (non-volatile) -> Saved pointer to RegisterFile +// r29 (non-volatile) -> Saved pointer to MemoryRegisters +// r30 (non-volatile) -> Saved pointer to scratchpad (uint8_t *) +// r31 (non-volatile) -> unused +// f0-f13 / vs0–vs13 (volatile) -> scratch registers +// f14-f31 / vs14–vs31 (non-volatile) -> unused +// v0-v3 / vs32-vs35 (volatile) -> RandomX floating point registers f0-f3 +// v4-v7 / vs36-vs39 (volatile) -> RandomX floating point registers e0-e3 +// v8-v11 / vs40-vs43 (volatile) -> RandomX floating point registers a0-a3 +// v12-v14 / vs44-vs46 (volatile) -> scratch registers +// v15 / vs47 (volatile) -> constant_vector_byte_reverse_mask +// v16 / vs48 (volatile) -> constant_vector_be_permutation_mask +// v17 / vs49 (volatile) -> constant_vector_group_e_and_mask +// v18 / vs50 (volatile) -> constant_vector_fscal_xor_mask +// v19 / vs51 (volatile) -> literal_vector_group_e_or_mask +// v20-v31 / vs52-vs63 (non-volatile) -> unused + +randomx_ppc64_vm_prologue: + // JIT compiler MUST emit immediate load to r2 before this code (ABI v2 only) + + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -256(%r1) + std %r14, 112(%r1) + std %r15, 120(%r1) + std %r16, 128(%r1) + std %r17, 136(%r1) + std %r18, 144(%r1) + std %r19, 152(%r1) + std %r20, 160(%r1) + std %r21, 168(%r1) + std %r22, 176(%r1) + std %r23, 184(%r1) + std %r24, 192(%r1) + std %r25, 200(%r1) + std %r26, 208(%r1) + std %r27, 216(%r1) + std %r28, 224(%r1) + std %r29, 232(%r1) + std %r30, 240(%r1) + //std %r31, 248(%r1) + + // Save arguments + mr %r28, %r3 + mr %r29, 
%r4 + mr %r30, %r5 + + // Move the loop iterations into the counter + mtctr %r6 + + // Load the vector constants/literals + li %r8, constant_vector_group_e_and_mask-randomx_ppc64_constants + lxvd2x %vs49, %r2, %r8 + li %r9, constant_vector_fscal_xor_mask-randomx_ppc64_constants + lxvd2x %vs50, %r2, %r9 + li %r10, literal_vector_group_e_or_mask-randomx_ppc64_constants + lvx %v19, %r2, %r10 // Use lvx to load the vector since it's written [ low word, high word ] in memory +#if PPC_BIG_ENDIAN + li %r11, constant_vector_be_permutation_mask-randomx_ppc64_constants + lxvd2x %vs48, %r2, %r11 // Load the BE permutation mask (not needed for LE) +#endif + li %r12, constant_vector_byte_reverse_mask-randomx_ppc64_constants + lvx %v15, %r2, %r12 +#if PPC_BIG_ENDIAN + vperm %v19, %v19, %v19, %v15 // Swap the byte order of the Group E OR mask vector +#endif + + // Zero the RandomX integer registers + li %r14, 0 + li %r15, 0 + li %r16, 0 + li %r17, 0 + li %r18, 0 + li %r19, 0 + li %r20, 0 + li %r21, 0 + + // Load MemoryRegisters (r29) + lwz %r25, 0(%r29) // mx + lwz %r24, 4(%r29) // ma + ld %r22, 8(%r29) // memory (dataset pointer) + + // Load a0-a3 from RegisterFile + .equ registers_a_base, 8*8+16*4+16*4 + li %r8, registers_a_base + 16*0 + lvx %v8, %r28, %r8 + li %r9, registers_a_base + 16*1 + lvx %v9, %r28, %r9 + li %r10, registers_a_base + 16*2 + lvx %v10, %r28, %r10 + li %r11, registers_a_base + 16*3 + lvx %v11, %r28, %r11 +#if PPC_BIG_ENDIAN + vperm %v8, %v8, %v8, %v15 + vperm %v9, %v9, %v9, %v15 + vperm %v10, %v10, %v10, %v15 + vperm %v11, %v11, %v11, %v15 +#endif + + // Instructions to mask mx and ma with Scratchpad L3 mask and set the + // initial values of spAddr0 and spAddr1 are appended here by the JIT + +randomx_ppc64_vm_prologue_end: + +randomx_ppc64_vm_epilogue: + // Loop + bdz 1f +randomx_ppc64_vm_fix_loop: + // JIT compiler MUST patch this to b to vm_loop_prologue + b 0 +1: + + // Store RandomX registers back into register file + STORE_LE_GPR %r14, 8*0, %r28 + STORE_LE_GPR %r15, 8*1, %r28 + STORE_LE_GPR %r16, 8*2, %r28 + STORE_LE_GPR %r17, 8*3, %r28 + STORE_LE_GPR %r18, 8*4, %r28 + STORE_LE_GPR %r19, 8*5, %r28 + STORE_LE_GPR %r20, 8*6, %r28 + STORE_LE_GPR %r21, 8*7, %r28 + +#if PPC_BIG_ENDIAN + // Reverse the Group F/E register bytes so they're arranged as [ 0123 4567 ] + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 +#endif + + .equ registers_f_base, 8*8 + li %r8, registers_f_base + 16*0 + stvx %v0, %r28, %r8 + li %r9, registers_f_base + 16*1 + stvx %v1, %r28, %r9 + li %r10, registers_f_base + 16*2 + stvx %v2, %r28, %r10 + li %r11, registers_f_base + 16*3 + stvx %v3, %r28, %r11 + + .equ registers_e_base, 8*8+16*4 + li %r8, registers_e_base + 16*0 + stvx %v4, %r28, %r8 + li %r9, registers_e_base + 16*1 + stvx %v5, %r28, %r9 + li %r10, registers_e_base + 16*2 + stvx %v6, %r28, %r10 + li %r11, registers_e_base + 16*3 + stvx %v7, %r28, %r11 + + // Standard function epilogue + ld %r14, 112(%r1) + ld %r15, 120(%r1) + ld %r16, 128(%r1) + ld %r17, 136(%r1) + ld %r18, 144(%r1) + ld %r19, 152(%r1) + ld %r20, 160(%r1) + ld %r21, 168(%r1) + ld %r22, 176(%r1) + ld %r23, 184(%r1) + ld %r24, 192(%r1) + ld %r25, 200(%r1) + ld %r26, 208(%r1) + ld %r27, 216(%r1) + ld %r28, 224(%r1) + ld %r29, 232(%r1) + ld %r30, 240(%r1) + //ld %r31, 248(%r1) + addi %r1, %r1,256 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_vm_epilogue_end: + 
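+// Rough shape of the generated loop, as the JIT stitches these fragments
+// together (a sketch only; the spAddr0/spAddr1 updates and the stores of the
+// mixed F/E registers are emitted separately by the JIT and not shown here):
+//
+//   vm_loop_prologue       XOR r0-r7 with 64 bytes at spAddr0, load and
+//                          convert f0-f3/e0-e3 from spAddr1
+//   <translated program>   one handler per RandomX instruction
+//   vm_data_read[_light]   XOR one 64-byte dataset item into r0-r7, swap mx/ma
+//   vm_spad_store_group_r  store r0-r7 to the scratchpad at spAddr1
+//   vm_spad_store_mix_*    mix the F and E register groups
+//   vm_epilogue            decrement CTR and loop back to vm_loop_prologue via
+//                          the patched branch, or fall through to the
+//                          register-file stores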
+randomx_ppc64_vm_loop_prologue: + // Main loop start + + // Load scratchpad data, mix registers, etc. + LOAD_LE_GPR %r8, 0, %r26 + LOAD_LE_GPR %r9, 8, %r26 + LOAD_LE_GPR %r10, 16, %r26 + LOAD_LE_GPR %r11, 24, %r26 + xor %r14, %r14, %r8 + xor %r15, %r15, %r9 + xor %r16, %r16, %r10 + xor %r17, %r17, %r11 + LOAD_LE_GPR %r8, 32, %r26 + LOAD_LE_GPR %r9, 40, %r26 + LOAD_LE_GPR %r10, 48, %r26 + LOAD_LE_GPR %r11, 56, %r26 + xor %r18, %r18, %r8 + xor %r19, %r19, %r9 + xor %r20, %r20, %r10 + xor %r21, %r21, %r11 + + // Load F registers (v0-v3 / vs32-vs35) from spAddr1 (r27) + lxsdx %vs32, 0, %r27 // Use base address directly to avoid an immediate load + li %r9, 8*1 + lxsdx %vs33, %r27, %r9 + li %r10, 8*2 + lxsdx %vs34, %r27, %r10 + li %r11, 8*3 + lxsdx %vs35, %r27, %r11 + SHUFFLE_VR 0 + SHUFFLE_VR 1 + SHUFFLE_VR 2 + SHUFFLE_VR 3 + xvcvsxwdp %vs32, %vs32 + xvcvsxwdp %vs33, %vs33 + xvcvsxwdp %vs34, %vs34 + xvcvsxwdp %vs35, %vs35 + + // Load E registers (v4-v7 / vs36-vs39) from spAddr1 (r27) and fixup + li %r8, 8*4 + lxsdx %vs36, %r27, %r8 + li %r9, 8*5 + lxsdx %vs37, %r27, %r9 + li %r10, 8*6 + lxsdx %vs38, %r27, %r10 + li %r11, 8*7 + lxsdx %vs39, %r27, %r11 + SHUFFLE_VR 4 + SHUFFLE_VR 5 + SHUFFLE_VR 6 + SHUFFLE_VR 7 + xvcvsxwdp %vs36, %vs36 + xvcvsxwdp %vs37, %vs37 + xvcvsxwdp %vs38, %vs38 + xvcvsxwdp %vs39, %vs39 + vsel %v4, %v19, %v4, %v17 + vsel %v5, %v19, %v5, %v17 + vsel %v6, %v19, %v6, %v17 + vsel %v7, %v19, %v7, %v17 + +randomx_ppc64_vm_loop_prologue_end: + +randomx_ppc64_vm_data_read: + // Read dataset logic + + // Calculate prefetch address (JIT compiler MUST patch) + .long 0 // Placeholder for: rlwinm %r8, %mpReg, 0, mask_begin, mask_end + add %r8, %r8, %r22 // r22 holds dataset base pointer + + // Prefetch the next block with dcbt. This is extremely important--without this + // we lose >20% performance in V1 and >16% in V2. + // Setting TH=0b10000 (dcbtt 0, %r8) didn't make any measurable difference in + // performance. 
+ dcbt 0, %r8, 0 + + // Calculate read address (JIT compiler MUST patch) + .long 0 // Placeholder for: rlwinm %r8, %mtReg, 0, mask_begin, mask_end + add %r8, %r8, %r22 + + // Read 64 bytes and XOR with integer registers + ld %r9, 0(%r8) + ld %r10, 8(%r8) + ld %r11, 16(%r8) + ld %r12, 24(%r8) + xor %r14, %r14, %r9 + xor %r15, %r15, %r10 + xor %r16, %r16, %r11 + xor %r17, %r17, %r12 + ld %r9, 32(%r8) + ld %r10, 40(%r8) + ld %r11, 48(%r8) + ld %r12, 56(%r8) + xor %r18, %r18, %r9 + xor %r19, %r19, %r10 + xor %r20, %r20, %r11 + xor %r21, %r21, %r12 + + // Swap mx and ma + mr %r8, %r25 + mr %r25, %r24 + mr %r24, %r8 + +randomx_ppc64_vm_data_read_end: + +randomx_ppc64_vm_data_read_light: + // Light mode read dataset logic + // (Similar to data_read but uses sshash_single_item) + + // Copy dataset pointer argument for sshash_single_item + mr %r3, %r22 + +randomx_ppc64_vm_data_read_light_fix_call: + // JIT compiler MUST patch this to bl to sshash_single_item + b 0 + + // XOR the result from sshash_single_item with the VM registers + xor %r14, %r14, %r4 + xor %r15, %r15, %r6 + xor %r16, %r16, %r7 + xor %r17, %r17, %r9 + xor %r18, %r18, %r10 + xor %r19, %r19, %r11 + xor %r20, %r20, %r12 + xor %r21, %r21, %r5 + + // Swap mx and ma + mr %r8, %r25 + mr %r25, %r24 + mr %r24, %r8 + +randomx_ppc64_vm_data_read_light_end: + +randomx_ppc64_vm_spad_store_group_r: + // Store to scratchpad at spAddr1 + STORE_LE_GPR %r14, 8*0, %r27 + STORE_LE_GPR %r15, 8*1, %r27 + STORE_LE_GPR %r16, 8*2, %r27 + STORE_LE_GPR %r17, 8*3, %r27 + STORE_LE_GPR %r18, 8*4, %r27 + STORE_LE_GPR %r19, 8*5, %r27 + STORE_LE_GPR %r20, 8*6, %r27 + STORE_LE_GPR %r21, 8*7, %r27 + +randomx_ppc64_vm_spad_store_group_r_end: + +randomx_ppc64_vm_spad_store_mix_v1: + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + vxor %v0, %v0, %v4 + vxor %v1, %v1, %v5 + vxor %v2, %v2, %v6 + vxor %v3, %v3, %v7 + +randomx_ppc64_vm_spad_store_mix_v1_end: + +randomx_ppc64_vm_spad_store_mix_v2_hard_aes: + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + + // We need a zero vector to bypass vncipher's internal key XOR + vxor %v12, %v12, %v12 + + // Byte-reverse f0-f3 and e0-e3 + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 + + vcipher %v0, %v0, %v4 + vncipher %v1, %v1, %v12 // Pass 0 as the key + vcipher %v2, %v2, %v4 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v4 // XOR the actual key afterwards + vxor %v3, %v3, %v4 + + vcipher %v0, %v0, %v5 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v5 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v5 + vxor %v3, %v3, %v5 + + vcipher %v0, %v0, %v6 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v6 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v6 + vxor %v3, %v3, %v6 + + vcipher %v0, %v0, %v7 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v7 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v7 + vxor %v3, %v3, %v7 + + // Byte-reverse f0-f3 and e0-e3 + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 + +randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end: + + +// The following software AES code is based heavily on public-domain work by +// Mike Hamburg of Stanford University. 
More information on that work can be +// found here: https://crypto.stanford.edu/vpaes/ +// +// This port of that code is not particularly well-optimized, partly because I +// didn't really understand all the math behind it, and partly because I don't +// yet have a POWER7 system to benchmark on. Possible areas for improvement +// include: +// +// - Folding constants into other constants (to do this would require +// understanding the math behind this algorithm). +// - Using functions and loops instead of unrolling everything with macros. +// - Using VSX instructions to move values that are exclusively used in XOR +// operations to vs0-vs31 in order to reduce the number of registers we need +// to save to the stack. +// - Using VSX loads to load constants into vs0-vs31 just once at the start of +// VM execution, then moving those constants into vector registers with xxlor +// during software AES. Might be faster if it means we can avoid having to +// save vector registers to the stack. Might also be slower if loading a +// series of constants from d-cache is faster than a bunch of xxlor +// operations to move them from vs0-vs31 into the vector registers. +// - Restoring the overwritten constant registers (v15-v19) from the constant +// pool instead of the stack. +// - Ordering the software AES constants in the constant pool based on the order +// they get loaded in, which could help with prefetching and reduce load +// latency. +// - Loading the zero/4/0x0F vectors from the constant pool might be faster than +// generating them with vector instructions. +// - Reordering instructions to keep the vector pipeline full and avoid false +// dependencies. + +// Macro: VPAES_TRANSFORM +// Converts standard bytes to custom basis (using Lk_ipt tables) +// OR custom basis back to standard bytes (using Lk_opt tables). +// +// Arguments: +// v_out : Destination vector register +// v_in : Source vector register (can be same as v_out) +// v_tmp : Temporary vector register +// v_tab_lo : Vector register loaded with the low table (Lk_ipt lo / Lk_opt lo) +// v_tab_hi : Vector register loaded with the high table (Lk_ipt hi / Lk_opt hi) +// v_splat4 : Vector register pre-loaded with byte values of 0x04 (for shifting) + +.macro VPAES_TRANSFORM v_out, v_in, v_tmp, v_tab_lo, v_tab_hi, v_splat4 + // Shift right by 4 to isolate the high nybbles into v_tmp + vsrb \v_tmp, \v_in, \v_splat4 + + // Lookup the low nybbles (vperm ignores the upper bits of the index) + vperm \v_out, \v_tab_lo, \v_tab_lo, \v_in + + // Lookup the high nybbles + vperm \v_tmp, \v_tab_hi, \v_tab_hi, \v_tmp + + // Combine the results + vxor \v_out, \v_out, \v_tmp +.endm + +// Macro: VPAES_INVERSION +// Performs Galois Field inversion in the custom composite field basis. 
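+// The 256-entry GF(2^8) inversion is decomposed into 16-entry GF(16) table
+// lookups performed with vperm, following the vpaes construction referenced
+// above.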
+// +// Arguments: +// v_io : Output vector 1 (io) +// v_jo : Output vector 2 (jo) +// v_in : Input vector (state in custom basis) +// v_invlo : Lk_inv low table (first 16 bytes) +// v_invhi : Lk_inv high table (second 16 bytes) +// v_splat4 : Vector pre-loaded with 0x04 (for shifting) +// v_splat0f : Vector pre-loaded with 0x0F (for masking) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector +// v_tmp3 : Temporary vector + +.macro VPAES_INVERSION v_io, v_jo, v_in, v_invlo, v_invhi, v_splat4, v_splat0f, v_zero, v_tmp1, v_tmp2, v_tmp3 + // v_tmp1 = i (high nybbles) + vsrb \v_tmp1, \v_in, \v_splat4 + + // v_tmp3 = a/k + vperm \v_tmp3, \v_invhi, \v_invhi, \v_in + + // v_tmp2 = j (low nybbles) + vxor \v_tmp2, \v_in, \v_tmp1 + + // v_io = 1/i + vperm \v_io, \v_invlo, \v_invlo, \v_tmp1 + + // v_jo = 1/j + vperm \v_jo, \v_invlo, \v_invlo, \v_tmp2 + + // mask j with 0x0F + vand \v_tmp2, \v_tmp2, \v_splat0f + + // iak = 1/i + a/k + vxor \v_io, \v_io, \v_tmp3 + + // jak = 1/j + a/k + vxor \v_jo, \v_jo, \v_tmp3 + + // 1/iak (Note: v_zero is used for the second half of the table) + vperm \v_io, \v_invlo, \v_zero, \v_io + + // 1/jak + vperm \v_jo, \v_invlo, \v_zero, \v_jo + + // io = 1/iak + j + vxor \v_io, \v_io, \v_tmp2 + + // jo = 1/jak + i + vxor \v_jo, \v_jo, \v_tmp1 +.endm + +// Macro: VPAES_SB_MC +// Performs combined SubBytes affine transform and MixColumns. +// Output remains in the custom composite field basis. +// +// Arguments: +// v_out : Output vector (custom basis) +// v_io : Input vector 1 (inverted high nybbles from VPAES_INVERSION) +// v_jo : Input vector 2 (inverted low nybbles from VPAES_INVERSION) +// v_sb1u : Lk_sb1 low table (first 16 bytes) +// v_sb1t : Lk_sb1 high table (second 16 bytes) +// v_sb2u : Lk_sb2 low table (first 16 bytes) +// v_sb2t : Lk_sb2 high table (second 16 bytes) +// v_mc_fwd : Lk_mc_forward base table (first 16 bytes) +// v_mc_bwd : Lk_mc_backward base table (first 16 bytes) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector +// v_tmp3 : Temporary vector + +.macro VPAES_SB_MC v_out, v_io, v_jo, v_sb1u, v_sb1t, v_sb2u, v_sb2t, v_mc_fwd, v_mc_bwd, v_zero, v_tmp1, v_tmp2, v_tmp3 + // 1. Calculate A = sb1u(jo) ^ sb1t(io) + vperm \v_tmp1, \v_sb1u, \v_zero, \v_jo + vperm \v_out, \v_sb1t, \v_zero, \v_io + vxor \v_out, \v_out, \v_tmp1 // v_out = A + + // 2. Calculate 2A = sb2u(jo) ^ sb2t(io) + vperm \v_tmp2, \v_sb2u, \v_zero, \v_jo + vperm \v_tmp3, \v_sb2t, \v_zero, \v_io + vxor \v_tmp2, \v_tmp2, \v_tmp3 // v_tmp2 = 2A + + // 3. Calculate B = rot(A, 1) and D = rot(A, 3) + vperm \v_tmp1, \v_out, \v_zero, \v_mc_fwd // v_tmp1 = B + vperm \v_tmp3, \v_out, \v_zero, \v_mc_bwd // v_tmp3 = D + + // 4. Calculate 2A + B + vxor \v_tmp1, \v_tmp1, \v_tmp2 // v_tmp1 = 2A + B + + // 5. Calculate 2B + C = rot(2A + B, 1) + vperm \v_tmp2, \v_tmp1, \v_zero, \v_mc_fwd // v_tmp2 = 2B + C + + // 6. Calculate 2A + B + D + vxor \v_out, \v_tmp1, \v_tmp3 // v_out = 2A + B + D + + // 7. Final Result = (2A + B + D) ^ (2B + C) + vxor \v_out, \v_out, \v_tmp2 +.endm + +// Macro: VPAES_INVSB_INVMC +// Performs combined InvSubBytes affine transform and InvMixColumns on the inverted custom basis state. 
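+// (InvMixColumns is folded in by accumulating the 0x09, 0x0D, 0x0B, and 0x0E
+// multiples of the state, with a rotation of the accumulator between steps.)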
+// +// Arguments: +// v_out : Output vector (custom basis) +// v_io : Input vector 1 (inverted high nybbles from VPAES_INVERSION) +// v_jo : Input vector 2 (inverted low nybbles from VPAES_INVERSION) +// v_sb9u, v_sb9t : Lk_dsb9 tables (low and high) +// v_sbdu, v_sbdt : Lk_dsbd tables (low and high) +// v_sbbu, v_sbbt : Lk_dsbb tables (low and high) +// v_sbeu, v_sbet : Lk_dsbe tables (low and high) +// v_mc_fwd : Lk_mc_forward base table (first 16 bytes) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector + +.macro VPAES_INVSB_INVMC v_out, v_io, v_jo, v_sb9u, v_sb9t, v_sbdu, v_sbdt, v_sbbu, v_sbbt, v_sbeu, v_sbet, v_mc_fwd, v_zero, v_tmp1, v_tmp2 + // 1. Multiply by 0x09 + vperm \v_tmp1, \v_sb9u, \v_zero, \v_io + vperm \v_tmp2, \v_sb9t, \v_zero, \v_jo + vxor \v_out, \v_tmp1, \v_tmp2 // Acc = 0x09 * State + + // 2. Rotate and add 0x0D + vperm \v_tmp1, \v_sbdu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbdt, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0D * State) + + // 3. Rotate and add 0x0B + vperm \v_tmp1, \v_sbbu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbbt, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0B * State) + + // 4. Rotate and add 0x0E + vperm \v_tmp1, \v_sbeu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbet, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0E * State) +.endm + +// Register allocations: Software AES +// +// v0-v3 / vs32-vs35 (in/out / volatile) -> RandomX floating point registers f0-f3 +// v4-v7 / vs36-vs39 (input / non-volatile) -> RandomX floating point registers e0-e3 +// v8 / vs40 (non-volatile) -> io (must reload: RandomX floating point registers a0) +// v9 / vs41 (non-volatile) -> jo (must reload: RandomX floating point registers a1) +// v10 / vs42 (non-volatile) -> 0x5b... / 0xe8... 
(must reload: RandomX floating point registers a2) +// v11 / vs43 (non-volatile) -> invsubbytes_mulE_hi (must reload: RandomX floating point registers a3) +// v12-v14 / vs44-vs46 (volatile) -> scratch registers +// v15 / vs47 (non-volatile) -> shiftrows / invshiftrows (must reload: constant_vector_byte_reverse_mask) +// v16 / vs48 (non-volatile) -> enc/dec input/output transform lo (must reload (BE only): constant_vector_be_permutation_mask) +// v17 / vs49 (non-volatile) -> enc/dec input/output transform hi (must reload: constant_vector_group_e_and_mask) +// v18 / vs50 (non-volatile) -> mixcolumns forward (must reload: constant_vector_fscal_xor_mask) +// v19 / vs51 (non-volatile) -> mixcolumns backward / invsubbytes_mulE_lo (must reload: literal_vector_group_e_or_mask) +// v20 / vs52 (non-volatile) -> Zero +// v21 / vs53 (non-volatile) -> Shift amount (4) +// v22 / vs54 (non-volatile) -> Low nybble mask (0x0F) +// v23 / vs55 (non-volatile) -> Transformed round key +// v24 / vs56 (non-volatile) -> galois_field_inversion_lo +// v25 / vs57 (non-volatile) -> galois_field_inversion_hi +// v26 / vs58 (non-volatile) -> subbytes_mul1_lo / invsubbytes_mul9_lo +// v27 / vs59 (non-volatile) -> subbytes_mul1_hi / invsubbytes_mul9_hi +// v28 / vs60 (non-volatile) -> subbytes_mul2_lo / invsubbytes_mulD_lo +// v29 / vs61 (non-volatile) -> subbytes_mul2_hi / invsubbytes_mulD_hi +// v30 / vs62 (non-volatile) -> invsubbytes_mulB_lo +// v31 / vs63 (non-volatile) -> invsubbytes_mulB_hi + +randomx_ppc64_vm_spad_store_mix_v2_soft_aes: + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + + // Save v8-v11 and v15-v31 to the stack + addi %r6, %r1, -(16 * 21) + stvx %v8, 0, %r6 + li %r7, 16*1 + stvx %v9, %r7, %r6 + li %r8, 16*2 + stvx %v10, %r8, %r6 + li %r9, 16*3 + stvx %v11, %r9, %r6 + li %r10, 16*4 + stvx %v15, %r10, %r6 + li %r11, 16*5 + stvx %v16, %r11, %r6 + li %r12, 16*6 + stvx %v17, %r12, %r6 + li %r7, 16*7 + stvx %v18, %r7, %r6 + li %r8, 16*8 + stvx %v19, %r8, %r6 + li %r9, 16*9 + stvx %v20, %r9, %r6 + li %r10, 16*10 + stvx %v21, %r10, %r6 + li %r11, 16*11 + stvx %v22, %r11, %r6 + li %r12, 16*12 + stvx %v23, %r12, %r6 + li %r7, 16*13 + stvx %v24, %r7, %r6 + li %r8, 16*14 + stvx %v25, %r8, %r6 + li %r9, 16*15 + stvx %v26, %r9, %r6 + li %r10, 16*16 + stvx %v27, %r10, %r6 + li %r11, 16*17 + stvx %v28, %r11, %r6 + li %r12, 16*18 + stvx %v29, %r12, %r6 + li %r7, 16*19 + stvx %v30, %r7, %r6 + li %r8, 16*20 + stvx %v31, %r8, %r6 + + // Zero vector v20 + vxor %v20, %v20, %v20 + + // Splat the shift amount to v21 + vspltisb %v21, 4 + + // Splat the low nybble mask to v22 + vspltisb %v22, 0x0F + + // Load initial encryption constants + li %r6, constant_vector_soft_aes_encrypt_input_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_encrypt_input_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + li %r8, constant_vector_soft_aes_mixcolumns_forward-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_mixcolumns_backward-randomx_ppc64_constants + lvx %v18, %r8, %r2 + lvx %v19, %r9, %r2 + li %r6, constant_vector_soft_aes_galois_field_inversion_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_galois_field_inversion_hi-randomx_ppc64_constants + lvx %v24, %r6, %r2 + lvx %v25, %r7, %r2 + li %r8, constant_vector_soft_aes_subbytes_mul1_lo-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_subbytes_mul1_hi-randomx_ppc64_constants + lvx %v26, %r8, %r2 + lvx %v27, %r9, %r2 + li %r6, 
constant_vector_soft_aes_subbytes_mul2_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_subbytes_mul2_hi-randomx_ppc64_constants + lvx %v28, %r6, %r2 + lvx %v29, %r7, %r2 + li %r8, constant_vector_soft_aes_shiftrows-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_encrypt_63-randomx_ppc64_constants + lvx %v15, %r8, %r2 + lvx %v10, %r9, %r2 + + // Transform inputs to composite field representation + VPAES_TRANSFORM %v0, %v0, %v12, %v16, %v17, %v21 + VPAES_TRANSFORM %v2, %v2, %v12, %v16, %v17, %v21 + + // Round 0 (key v4) + VPAES_TRANSFORM %v23, %v4, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 1 (key v5) + VPAES_TRANSFORM %v23, %v5, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 2 (key v6) + VPAES_TRANSFORM %v23, %v6, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 3 (key v7) + VPAES_TRANSFORM %v23, %v7, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Load encryption output transform constants + li %r6, constant_vector_soft_aes_encrypt_output_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_encrypt_output_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + + // Transform output from composite field representation + VPAES_TRANSFORM %v0, %v0, %v12, %v16, %v17, %v21 + VPAES_TRANSFORM %v2, %v2, %v12, %v16, %v17, %v21 + + // Load initial decryption constants + li %r6, constant_vector_soft_aes_decrypt_input_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_decrypt_input_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + li %r8, constant_vector_soft_aes_invsubbytes_mul9_lo-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_invsubbytes_mul9_hi-randomx_ppc64_constants + lvx %v26, %r8, %r2 + lvx %v27, %r9, %r2 + 
+
+    // Load initial decryption constants
+    li %r6, constant_vector_soft_aes_decrypt_input_transform_lo-randomx_ppc64_constants
+    li %r7, constant_vector_soft_aes_decrypt_input_transform_hi-randomx_ppc64_constants
+    lvx %v16, %r6, %r2
+    lvx %v17, %r7, %r2
+    li %r8, constant_vector_soft_aes_invsubbytes_mul9_lo-randomx_ppc64_constants
+    li %r9, constant_vector_soft_aes_invsubbytes_mul9_hi-randomx_ppc64_constants
+    lvx %v26, %r8, %r2
+    lvx %v27, %r9, %r2
+    li %r6, constant_vector_soft_aes_invsubbytes_mulD_lo-randomx_ppc64_constants
+    li %r7, constant_vector_soft_aes_invsubbytes_mulD_hi-randomx_ppc64_constants
+    lvx %v28, %r6, %r2
+    lvx %v29, %r7, %r2
+    li %r8, constant_vector_soft_aes_invsubbytes_mulB_lo-randomx_ppc64_constants
+    li %r9, constant_vector_soft_aes_invsubbytes_mulB_hi-randomx_ppc64_constants
+    lvx %v30, %r8, %r2
+    lvx %v31, %r9, %r2
+    li %r6, constant_vector_soft_aes_invsubbytes_mulE_lo-randomx_ppc64_constants
+    li %r7, constant_vector_soft_aes_invsubbytes_mulE_hi-randomx_ppc64_constants
+    lvx %v19, %r6, %r2
+    lvx %v11, %r7, %r2
+    li %r8, constant_vector_soft_aes_invshiftrows-randomx_ppc64_constants
+    li %r9, constant_vector_soft_aes_decrypt_63-randomx_ppc64_constants
+    lvx %v15, %r8, %r2
+    lvx %v10, %r9, %r2
+
+    // Transform inputs to composite field representation
+    VPAES_TRANSFORM %v1, %v1, %v12, %v16, %v17, %v21
+    VPAES_TRANSFORM %v3, %v3, %v12, %v16, %v17, %v21
+
+    // Round 0 (key v4)
+    VPAES_TRANSFORM %v23, %v4, %v12, %v16, %v17, %v21
+    vperm %v1, %v1, %v1, %v15
+    vxor %v1, %v1, %v10
+    VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v1, %v1, %v23
+    vperm %v3, %v3, %v3, %v15
+    vxor %v3, %v3, %v10
+    VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v3, %v3, %v23
+
+    // Round 1 (key v5)
+    VPAES_TRANSFORM %v23, %v5, %v12, %v16, %v17, %v21
+    vperm %v1, %v1, %v1, %v15
+    vxor %v1, %v1, %v10
+    VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v1, %v1, %v23
+    vperm %v3, %v3, %v3, %v15
+    vxor %v3, %v3, %v10
+    VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v3, %v3, %v23
+
+    // Round 2 (key v6)
+    VPAES_TRANSFORM %v23, %v6, %v12, %v16, %v17, %v21
+    vperm %v1, %v1, %v1, %v15
+    vxor %v1, %v1, %v10
+    VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v1, %v1, %v23
+    vperm %v3, %v3, %v3, %v15
+    vxor %v3, %v3, %v10
+    VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v3, %v3, %v23
+
+    // Round 3 (key v7)
+    VPAES_TRANSFORM %v23, %v7, %v12, %v16, %v17, %v21
+    vperm %v1, %v1, %v1, %v15
+    vxor %v1, %v1, %v10
+    VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v1, %v1, %v23
+    vperm %v3, %v3, %v3, %v15
+    vxor %v3, %v3, %v10
+    VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v3, %v3, %v23
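The decryption rounds mirror the encryption path: InvShiftRows via the v15 permutation, an XOR with the 0x63-derived constant (decrypt_63), the same GF(2^4) inversion, and then VPAES_INVSB_INVMC, which fuses InvSubBytes with InvMixColumns by baking the matrix coefficients {0E, 0B, 0D, 09} into the ×9/×B/×D/×E nibble tables loaded above. For reference, the plain GF(2^8) arithmetic those tables encode (illustrative only, not part of the patch):

    #include <cstdint>

    static uint8_t xtime(uint8_t x) {            // multiply by 2 in GF(2^8)
        return (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1B : 0x00));
    }

    static uint8_t gfmul(uint8_t x, uint8_t m) { // multiply by a small constant
        uint8_t r = 0;
        for (uint8_t p = x; m != 0; m >>= 1, p = xtime(p))
            if (m & 1) r ^= p;
        return r;
    }

    // AES InvMixColumns on one 4-byte column: multiply by {0E,0B,0D,09}.
    static void inv_mix_column(uint8_t c[4]) {
        uint8_t s0 = c[0], s1 = c[1], s2 = c[2], s3 = c[3];
        c[0] = gfmul(s0, 0x0E) ^ gfmul(s1, 0x0B) ^ gfmul(s2, 0x0D) ^ gfmul(s3, 0x09);
        c[1] = gfmul(s0, 0x09) ^ gfmul(s1, 0x0E) ^ gfmul(s2, 0x0B) ^ gfmul(s3, 0x0D);
        c[2] = gfmul(s0, 0x0D) ^ gfmul(s1, 0x09) ^ gfmul(s2, 0x0E) ^ gfmul(s3, 0x0B);
        c[3] = gfmul(s0, 0x0B) ^ gfmul(s1, 0x0D) ^ gfmul(s2, 0x09) ^ gfmul(s3, 0x0E);
    }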
+
+    // Load decryption output transform constants
+    li %r6, constant_vector_soft_aes_decrypt_output_transform_lo-randomx_ppc64_constants
+    li %r7, constant_vector_soft_aes_decrypt_output_transform_hi-randomx_ppc64_constants
+    lvx %v16, %r6, %r2
+    lvx %v17, %r7, %r2
+
+    // Transform output from composite field representation
+    VPAES_TRANSFORM %v1, %v1, %v12, %v16, %v17, %v21
+    VPAES_TRANSFORM %v3, %v3, %v12, %v16, %v17, %v21
+
+    // Restore v8-v11 and v15-v31 from the stack
+    addi %r6, %r1, -(16 * 21)
+    lvx %v8, 0, %r6
+    li %r7, 16*1
+    lvx %v9, %r7, %r6
+    li %r8, 16*2
+    lvx %v10, %r8, %r6
+    li %r9, 16*3
+    lvx %v11, %r9, %r6
+    li %r10, 16*4
+    lvx %v15, %r10, %r6
+    li %r11, 16*5
+    lvx %v16, %r11, %r6
+    li %r12, 16*6
+    lvx %v17, %r12, %r6
+    li %r7, 16*7
+    lvx %v18, %r7, %r6
+    li %r8, 16*8
+    lvx %v19, %r8, %r6
+    li %r9, 16*9
+    lvx %v20, %r9, %r6
+    li %r10, 16*10
+    lvx %v21, %r10, %r6
+    li %r11, 16*11
+    lvx %v22, %r11, %r6
+    li %r12, 16*12
+    lvx %v23, %r12, %r6
+    li %r7, 16*13
+    lvx %v24, %r7, %r6
+    li %r8, 16*14
+    lvx %v25, %r8, %r6
+    li %r9, 16*15
+    lvx %v26, %r9, %r6
+    li %r10, 16*16
+    lvx %v27, %r10, %r6
+    li %r11, 16*17
+    lvx %v28, %r11, %r6
+    li %r12, 16*18
+    lvx %v29, %r12, %r6
+    li %r7, 16*19
+    lvx %v30, %r7, %r6
+    li %r8, 16*20
+    lvx %v31, %r8, %r6
+
+randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end:
+
+
+    .section ".text"
+C_FUNCTION(randomx_reciprocal_fast)
+    cntlzd %r4, %r3      // r4 = 63 - k (count leading zeros)
+    li %r5, 1            // r5 = 1
+    subfic %r4, %r4, 63  // r4 = 63 - (63 - k) = k
+    sld %r4, %r5, %r4    // r4 = 1 << k (this is the upper 64 bits of the dividend)
+    divdeu %r3, %r4, %r3 // r3 = (r4 || 0x0000000000000000) / divisor
+    blr
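randomx_reciprocal_fast computes 2^(64+k) / divisor with k = floor(log2(divisor)); divdeu divides the 128-bit value r4·2^64 by r3 in a single instruction. A minimal C sketch of the same computation, assuming GCC/Clang's unsigned __int128 and RandomX's existing contract that the divisor is neither zero nor a power of two (so the quotient fits in 64 bits; divdeu's result is undefined on overflow):

    #include <cstdint>

    // Hypothetical reference model of randomx_reciprocal_fast above.
    static inline uint64_t reciprocal_ref(uint64_t divisor) {
        unsigned k = 63u - (unsigned)__builtin_clzll(divisor);         // k = floor(log2(divisor))
        unsigned __int128 dividend = (unsigned __int128)1 << (64 + k); // (1 << k) || 64 zero bits
        return (uint64_t)(dividend / divisor);
    }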
diff --git a/src/jit_compiler_ppc64_static.hpp b/src/jit_compiler_ppc64_static.hpp
new file mode 100644
index 00000000..7909a81b
--- /dev/null
+++ b/src/jit_compiler_ppc64_static.hpp
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2026, Forest Crossman
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+extern "C" {
+    void randomx_ppc64_constants();
+    void randomx_ppc64_constant_lut_fprc_to_fpscr();
+    void randomx_ppc64_constants_end();
+
+    void randomx_ppc64_dataset_init();
+    void randomx_ppc64_dataset_init_fix_call();
+    void randomx_ppc64_dataset_init_end();
+
+    void randomx_ppc64_sshash_single_item_prologue();
+    void randomx_ppc64_sshash_single_item_prologue_end();
+    void randomx_ppc64_sshash_single_item_epilogue();
+    void randomx_ppc64_sshash_single_item_epilogue_end();
+    void randomx_ppc64_sshash_cache_prefetch();
+    void randomx_ppc64_sshash_cache_prefetch_end();
+    void randomx_ppc64_sshash_xor();
+    void randomx_ppc64_sshash_xor_end();
+
+    void randomx_ppc64_vm_prologue();
+    void randomx_ppc64_vm_prologue_end();
+    void randomx_ppc64_vm_epilogue();
+    void randomx_ppc64_vm_fix_loop();
+    void randomx_ppc64_vm_epilogue_end();
+    void randomx_ppc64_vm_loop_prologue();
+    void randomx_ppc64_vm_loop_prologue_end();
+    void randomx_ppc64_vm_data_read();
+    void randomx_ppc64_vm_data_read_end();
+    void randomx_ppc64_vm_data_read_light();
+    void randomx_ppc64_vm_data_read_light_fix_call();
+    void randomx_ppc64_vm_data_read_light_end();
+    void randomx_ppc64_vm_spad_store_group_r();
+    void randomx_ppc64_vm_spad_store_group_r_end();
+    void randomx_ppc64_vm_spad_store_mix_v1();
+    void randomx_ppc64_vm_spad_store_mix_v1_end();
+    void randomx_ppc64_vm_spad_store_mix_v2_hard_aes();
+    void randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end();
+    void randomx_ppc64_vm_spad_store_mix_v2_soft_aes();
+    void randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end();
+}
diff --git a/src/reciprocal.h b/src/reciprocal.h
index 90bd9b6b..57f3985f 100644
--- a/src/reciprocal.h
+++ b/src/reciprocal.h
@@ -30,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <stdint.h>
 
-#if defined(_M_X64) || defined(__x86_64__)
+#if defined(_M_X64) || defined(__x86_64__) || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__))
 #define RANDOMX_HAVE_FAST_RECIPROCAL 1
 #else
 #define RANDOMX_HAVE_FAST_RECIPROCAL 0
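Each symbol above is paired with an *_end label so the compiler can treat the hand-written fragments in jit_compiler_ppc64_static.S as byte ranges to splice into the generated program, the pattern the existing RandomX back ends follow. A sketch of that consumption pattern, ignoring ABI details such as ELFv1 function descriptors (emitCodeSpan and codePos are hypothetical names, not part of this patch):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copy the machine code between two linker-visible labels into the
    // JIT buffer and advance the write cursor.
    static uint8_t* emitCodeSpan(uint8_t* codePos, void (*begin)(), void (*end)()) {
        const uint8_t* src = reinterpret_cast<const uint8_t*>(begin);
        size_t size = (size_t)(reinterpret_cast<const uint8_t*>(end) - src);
        std::memcpy(codePos, src, size);
        return codePos + size;
    }

With RANDOMX_HAVE_FAST_RECIPROCAL now also set on ppc64, callers can use the randomx_reciprocal_fast routine defined in the assembly above instead of the portable bit-by-bit division.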