From aba79effc9dd33fa226c4c4c3e05a4e43f7d190a Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Fri, 27 Mar 2026 23:43:34 -0500
Subject: [PATCH 01/50] Add JIT code generator for PPC64

Adds a JIT backend for POWER8 and later Power ISA CPUs. Assembly
instructions were restricted to those available in Power ISA v2.06 in
order to facilitate adding support for POWER7, but currently only
RandomX V1 is supported on those chips due to their lack of AES
instructions.

Support has been added for both little-endian and big-endian CPUs, but
only little-endian has been tested.

Fixes #132
---
 CMakeLists.txt                    |    8 +
 src/common.hpp                    |    5 +
 src/cpu.cpp                       |    9 +-
 src/jit_compiler.hpp              |    2 +
 src/jit_compiler_ppc64.cpp        | 1354 +++++++++++++++++++++++++++++
 src/jit_compiler_ppc64.hpp        |   86 ++
 src/jit_compiler_ppc64_static.S   |  826 ++++++++++++++++++
 src/jit_compiler_ppc64_static.hpp |   65 ++
 src/reciprocal.h                  |    2 +-
 9 files changed, 2355 insertions(+), 2 deletions(-)
 create mode 100644 src/jit_compiler_ppc64.cpp
 create mode 100644 src/jit_compiler_ppc64.hpp
 create mode 100644 src/jit_compiler_ppc64_static.S
 create mode 100644 src/jit_compiler_ppc64_static.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d34e2fe..4768b1af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,8 +143,16 @@ endif()
 # PowerPC
 if(ARCH_ID STREQUAL "ppc64" OR ARCH_ID STREQUAL "ppc64le")
+  list(APPEND randomx_sources
+    src/jit_compiler_ppc64_static.S
+    src/jit_compiler_ppc64.cpp)
+
+  set_property(SOURCE src/jit_compiler_ppc64_static.S PROPERTY LANGUAGE C)
+
   if(ARCH STREQUAL "native")
     add_flag("-mcpu=native")
+  else()
+    add_flag("-mcpu=power8")
   endif()
   # PowerPC AES requires ALTIVEC (POWER7+), so it cannot be enabled in the default build
 endif()

diff --git a/src/common.hpp b/src/common.hpp
index 579752d9..9b92d08a 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -138,6 +138,11 @@ namespace randomx {
 	#define RANDOMX_COMPILER_RV64
 	class JitCompilerRV64;
 	using JitCompiler = JitCompilerRV64;
+#elif defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+	#define RANDOMX_HAVE_COMPILER 1
+	#define RANDOMX_COMPILER_PPC64
+	class JitCompilerPPC64;
+	using JitCompiler = JitCompilerPPC64;
 #else
 	#define RANDOMX_HAVE_COMPILER 0
 	class JitCompilerFallback;
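The common.hpp hunk above is what lets machine-independent callers stay unchanged on the new architecture; a minimal sketch of how the alias is consumed (the wrapper function is hypothetical, but the members it calls are declared in jit_compiler_ppc64.hpp later in this patch):

    #include "jit_compiler.hpp"

    static void compileFor(randomx::Program& prog, randomx::ProgramConfiguration& cfg, randomx_flags flags) {
        randomx::JitCompiler jit;       // resolves to JitCompilerPPC64 on ppc64/ppc64le builds
        jit.setFlags(flags);
        jit.generateProgram(prog, cfg); // same entry point as the x86/a64/rv64 backends
    }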
diff --git a/src/cpu.cpp b/src/cpu.cpp
index 3178d037..d20b6ec6 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -47,6 +47,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	#include
 #endif
 
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+	#include <sys/auxv.h>
+	#include <asm/cputable.h>
+#endif
+
 #ifdef __riscv
 	#include <csignal>
 	#include <csetjmp>
@@ -120,8 +125,10 @@ namespace randomx {
 			sigaction(SIGILL, &old_action, nullptr);
 		}
+#elif defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+		unsigned long hwcaps2 = getauxval(AT_HWCAP2);
+		aes_ = (hwcaps2 & PPC_FEATURE2_VEC_CRYPTO) != 0;
 #endif
-		//TODO POWER8 AES
 	}
 
 	const Cpu cpu;

diff --git a/src/jit_compiler.hpp b/src/jit_compiler.hpp
index 56c0655c..52fce1db 100644
--- a/src/jit_compiler.hpp
+++ b/src/jit_compiler.hpp
@@ -70,6 +70,8 @@ namespace randomx {
 #include "jit_compiler_a64.hpp"
 #elif defined(RANDOMX_COMPILER_RV64)
 #include "jit_compiler_rv64.hpp"
+#elif defined(RANDOMX_COMPILER_PPC64)
+#include "jit_compiler_ppc64.hpp"
 #else
 #include "jit_compiler_fallback.hpp"
 #endif
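A standalone sketch of the same runtime probe the cpu.cpp hunk above performs (Linux-only; AT_HWCAP2 comes from sys/auxv.h and PPC_FEATURE2_VEC_CRYPTO from the kernel's asm/cputable.h):

    #include <sys/auxv.h>
    #include <asm/cputable.h>
    #include <cstdio>

    int main() {
        unsigned long hwcaps2 = getauxval(AT_HWCAP2);
        // PPC_FEATURE2_VEC_CRYPTO advertises the POWER8 in-core crypto instructions
        // (vcipher/vcipherlast etc.) that the hard-AES path in this patch relies on.
        bool aes = (hwcaps2 & PPC_FEATURE2_VEC_CRYPTO) != 0;
        std::printf("vector crypto: %s\n", aes ? "available" : "unavailable");
        return 0;
    }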
diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp
new file mode 100644
index 00000000..75a0e05f
--- /dev/null
+++ b/src/jit_compiler_ppc64.cpp
@@ -0,0 +1,1354 @@
+/*
+Copyright (c) 2023 tevador
+Copyright (c) 2026, Forest Crossman
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	notice, this list of conditions and the following disclaimer in the
+	documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	names of its contributors may be used to endorse or promote products
+	derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdexcept>
+#include <vector>
+
+#include <sys/auxv.h>
+#include <asm/cputable.h>
+
+#include "cpu.hpp"
+#include "program.hpp"
+#include "reciprocal.h"
+#include "superscalar.hpp"
+#include "virtual_memory.h"
+
+#include "jit_compiler_ppc64.hpp"
+
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	#define PPC_BIG_ENDIAN 1
+#else
+	#define PPC_BIG_ENDIAN 0
+#endif
+
+namespace {
+#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i, randomx_flags flags
+	using InstructionHandler = void(HANDLER_ARGS);
+	extern InstructionHandler* opcodeMap1[256];
+}
+
+namespace PPC64 {
+
+	static inline uint32_t A_form(uint32_t po, uint32_t frt, uint32_t fra, uint32_t frb, uint32_t frc, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(frt <= 0x1F)) throw std::runtime_error("frt <= 0x1F");
+		if (!(fra <= 0x1F)) throw std::runtime_error("fra <= 0x1F");
+		if (!(frb <= 0x1F)) throw std::runtime_error("frb <= 0x1F");
+		if (!(frc <= 0x1F)) throw std::runtime_error("frc <= 0x1F");
+		if (!(xo <= 0x1F)) throw std::runtime_error("xo <= 0x1F");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (frt << 21) | (fra << 16) | (frb << 11) | (frc << 6) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t B_form(uint32_t po, uint32_t bo, uint32_t bi, uint32_t bd, uint32_t aa, uint32_t lk) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(bo <= 0x1F)) throw std::runtime_error("bo <= 0x1F");
+		if (!(bi <= 0x1F)) throw std::runtime_error("bi <= 0x1F");
+		if (!(bd <= 0x3FFF)) throw std::runtime_error("bd <= 0x3FFF");
+		if (!(aa <= 0x1)) throw std::runtime_error("aa <= 0x1");
+		if (!(lk <= 0x1)) throw std::runtime_error("lk <= 0x1");
+		return (po << 26) | (bo << 21) | (bi << 16) | (bd << 2) | (aa << 1) | lk;
+	}
+
+	static inline uint32_t D_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t d) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(d <= 0xFFFF)) throw std::runtime_error("d <= 0xFFFF");
+		return (po << 26) | (rt << 21) | (ra << 16) | d;
+	}
+
+	static inline uint32_t I_form(uint32_t po, uint32_t li, uint32_t aa, uint32_t lk) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(li <= 0xFFFFFF)) throw std::runtime_error("li <= 0xFFFFFF");
+		if (!(aa <= 0x1)) throw std::runtime_error("aa <= 0x1");
+		if (!(lk <= 0x1)) throw std::runtime_error("lk <= 0x1");
+		return (po << 26) | (li << 2) | (aa << 1) | lk;
+	}
+
+	static inline uint32_t M_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t sh, uint32_t mb, uint32_t me, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(sh <= 0x1F)) throw std::runtime_error("sh <= 0x1F");
+		if (!(mb <= 0x1F)) throw std::runtime_error("mb <= 0x1F");
+		if (!(me <= 0x1F)) throw std::runtime_error("me <= 0x1F");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (rs << 21) | (ra << 16) | (sh << 11) | (mb << 6) | (me << 1) | rc;
+	}
+
+	static inline uint32_t MD_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t sh, uint32_t mb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if
(!(sh <= 0x3F)) throw std::runtime_error("sh <= 0x3F"); + if (!(mb <= 0x3F)) throw std::runtime_error("mb <= 0x3F"); + if (!(xo <= 0x7)) throw std::runtime_error("xo <= 0x7"); + if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1"); + uint32_t sh0_4 = sh & 0x1F; + uint32_t sh5 = (sh >> 5) & 0x1; + uint32_t mb0_4 = mb & 0x1F; + uint32_t mb5 = (mb >> 5) & 0x1; + return (po << 26) | (rs << 21) | (ra << 16) | (sh0_4 << 11) | (mb0_4 << 6) | (mb5 << 5) | (xo << 2) | (sh5 << 1) | rc; + } + + static inline uint32_t MDS_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t rb, uint32_t mb, uint32_t xo, uint32_t rc) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F"); + if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F"); + if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F"); + if (!(mb <= 0x3F)) throw std::runtime_error("mb <= 0x3F"); + if (!(xo <= 0xF)) throw std::runtime_error("xo <= 0xF"); + if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1"); + uint32_t mb0_4 = mb & 0x1F; + uint32_t mb5 = (mb >> 5) & 0x1; + return (po << 26) | (rs << 21) | (ra << 16) | (rb << 11) | (mb0_4 << 6) | (mb5 << 5) | (xo << 1) | rc; + } + + static inline uint32_t VA_form(uint32_t po, uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc, uint32_t xo) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(vrt <= 0x1F)) throw std::runtime_error("vrt <= 0x1F"); + if (!(vra <= 0x1F)) throw std::runtime_error("vra <= 0x1F"); + if (!(vrb <= 0x1F)) throw std::runtime_error("vrb <= 0x1F"); + if (!(vrc <= 0x1F)) throw std::runtime_error("vrc <= 0x1F"); + if (!(xo <= 0x3F)) throw std::runtime_error("xo <= 0x3F"); + return (po << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | (vrc << 6) | xo; + } + + static inline uint32_t X_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t rb, uint32_t xo, uint32_t rc) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F"); + if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F"); + if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F"); + if (!(xo <= 0x3FF)) throw std::runtime_error("xo <= 0x3FF"); + if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1"); + return (po << 26) | (rt << 21) | (ra << 16) | (rb << 11) | (xo << 1) | rc; + } + + static inline uint32_t XFL_form(uint32_t po, uint32_t l, uint32_t flm, uint32_t w, uint32_t frb, uint32_t xo, uint32_t rc) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(l <= 0x1)) throw std::runtime_error("l <= 0x1"); + if (!(flm <= 0xFF)) throw std::runtime_error("flm <= 0xFF"); + if (!(w <= 0x1)) throw std::runtime_error("w <= 0x1"); + if (!(frb <= 0x1F)) throw std::runtime_error("frb <= 0x1F"); + if (!(xo <= 0x3FF)) throw std::runtime_error("xo <= 0x3FF"); + if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1"); + return (po << 26) | (l << 25) | (flm << 17) | (w << 16) | (frb << 11) | (xo << 1) | rc; + } + + static inline uint32_t XO_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t rb, uint32_t oe, uint32_t xo, uint32_t rc) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F"); + if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F"); + if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F"); + if (!(oe <= 0x1)) throw std::runtime_error("oe <= 0x1"); + if (!(xo <= 0x1FF)) throw std::runtime_error("xo <= 0x1FF"); + if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1"); + 
return (po << 26) | (rt << 21) | (ra << 16) | (rb << 11) | (oe << 10) | (xo << 1) | rc; + } + + static inline uint32_t XX2_form(uint32_t po, uint32_t t, uint32_t a, uint32_t b, uint32_t xo, uint32_t bx, uint32_t tx) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(t <= 0x1F)) throw std::runtime_error("t <= 0x1F"); + if (!(a <= 0x1F)) throw std::runtime_error("a <= 0x1F"); + if (!(b <= 0x1F)) throw std::runtime_error("b <= 0x1F"); + if (!(xo <= 0x1FF)) throw std::runtime_error("xo <= 0x1FF"); + if (!(bx <= 0x1)) throw std::runtime_error("bx <= 0x1"); + if (!(tx <= 0x1)) throw std::runtime_error("tx <= 0x1"); + return (po << 26) | (t << 21) | (a << 16) | (b << 11) | (xo << 2) | (bx << 1) | tx; + } + + static inline uint32_t XX3_form(uint32_t po, uint32_t t, uint32_t a, uint32_t b, uint32_t xo, uint32_t ax, uint32_t bx, uint32_t tx) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(t <= 0x1F)) throw std::runtime_error("t <= 0x1F"); + if (!(a <= 0x1F)) throw std::runtime_error("a <= 0x1F"); + if (!(b <= 0x1F)) throw std::runtime_error("b <= 0x1F"); + if (!(xo <= 0xFF)) throw std::runtime_error("xo <= 0xFF"); + if (!(ax <= 0x1)) throw std::runtime_error("ax <= 0x1"); + if (!(bx <= 0x1)) throw std::runtime_error("bx <= 0x1"); + if (!(tx <= 0x1)) throw std::runtime_error("tx <= 0x1"); + return (po << 26) | (t << 21) | (a << 16) | (b << 11) | (xo << 3) | (ax << 2) | (bx << 1) | tx; + } + + static inline uint32_t b(int32_t offset) { + if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned"); + if (offset < -(1 << 25) || offset >= (1 << 25)) throw std::runtime_error("offset out of range"); + return I_form(18, (offset >> 2) & 0xFFFFFF, 0, 0); + } + + static inline uint32_t bl(int32_t offset) { + if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned"); + if (offset < -(1 << 25) || offset >= (1 << 25)) throw std::runtime_error("offset out of range"); + return I_form(18, (offset >> 2) & 0xFFFFFF, 0, 1); + } + + static inline uint32_t bc(uint32_t bo, uint32_t bi, int32_t offset) { + if (!(bo <= 0x1F)) throw std::runtime_error("bo <= 0x1F"); + if (!(bi <= 0x1F)) throw std::runtime_error("bi <= 0x1F"); + if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned"); + if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range"); + return B_form(16, bo, bi, (offset >> 2) & 0x3FFF, 0, 0); + } + + static inline uint32_t beq(int32_t offset) { + return bc(12, 2, offset); + } + + static inline uint32_t bne(int32_t offset) { + return bc(4, 2, offset); + } + + static inline uint32_t cmpi(uint32_t bf, uint32_t l, uint32_t ra, int32_t si) { + if (!(bf <= 0x7)) throw std::runtime_error("bf <= 0x7"); + if (!(l <= 0x1)) throw std::runtime_error("l <= 0x1"); + if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F"); + if (si < -(1 << 15) || si >= (1 << 15)) throw std::runtime_error("si out of range"); + return D_form(11, (bf << 2) | l, ra, si); + } + + static inline uint32_t addi(uint32_t rt, uint32_t ra, uint32_t si) { return D_form(14, rt, ra, si); } + static inline uint32_t addis(uint32_t rt, uint32_t ra, uint32_t si) { return D_form(15, rt, ra, si); } + static inline uint32_t ori(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(24, rs, ra, ui); } + static inline uint32_t oris(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(25, rs, ra, ui); } + static inline uint32_t andi_dot(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(28, rs, ra, ui); } + + static inline uint32_t 
add(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 266, 0); } + static inline uint32_t subf(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 40, 0); } + static inline uint32_t neg(uint32_t rt, uint32_t ra) { return XO_form(31, rt, ra, 0, 0, 104, 0); } + static inline uint32_t and_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 28, 0); } + static inline uint32_t and_dot(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 28, 1); } + static inline uint32_t xor_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 316, 0); } + static inline uint32_t or_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 444, 0); } + + static inline uint32_t mulld(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 233, 0); } + static inline uint32_t mulhdu(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 9, 0); } + static inline uint32_t mulhd(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 73, 0); } + + static inline uint32_t rlwinm(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb, uint32_t me) { return M_form(21, rs, ra, sh, mb, me, 0); } + static inline uint32_t rldicl(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 0, 0); } + static inline uint32_t rldicr(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t me) { return MD_form(30, rs, ra, sh, me, 1, 0); } + static inline uint32_t rldic(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 2, 0); } + static inline uint32_t rldcl(uint32_t ra, uint32_t rs, uint32_t rb, uint32_t mb) { return MDS_form(30, rs, ra, rb, mb, 8, 0); } + + static inline uint32_t cmpdi(uint32_t rx, int32_t si) { return cmpi(0, 1, rx, si); } + + static inline uint32_t li(uint32_t rx, int32_t si) { return addi(rx, 0, si); } + static inline uint32_t lis(uint32_t rx, int32_t si) { return addis(rx, 0, si); } + static inline uint32_t mr(uint32_t rx, uint32_t ry) { return or_(rx, ry, ry); } + static inline uint32_t rotldi(uint32_t ra, uint32_t rs, uint32_t n) { return rldicl(ra, rs, n, 0); } + static inline uint32_t rotrdi(uint32_t ra, uint32_t rs, uint32_t n) { return rldicl(ra, rs, 64-n, 0); } + static inline uint32_t sldi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicr(rx, ry, n, 63-n); } + static inline uint32_t srdi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicl(rx, ry, 64-n, n); } + + static inline uint32_t ldx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 21, 0); } + static inline uint32_t ldbrx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 532, 0); } + static inline uint32_t stdx(uint32_t rs, uint32_t ra, uint32_t rb) { return X_form(31, rs, ra, rb, 149, 0); } + + static inline uint32_t lfd(uint32_t frt, uint32_t ra, uint32_t d) { return D_form(50, frt, ra, d); } + static inline uint32_t lfdx(uint32_t frt, uint32_t ra, uint32_t rb) { return X_form(31, frt, ra, rb, 599, 0); } + static inline uint32_t mtfsf(uint32_t flm, uint32_t frb, uint32_t l, uint32_t w) { return XFL_form(63, l, flm, w, frb, 711, 0); } + + static inline uint32_t lxsdx(uint32_t xt, uint32_t ra, uint32_t rb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + return X_form(31, t, ra, rb, 588, tx); + } + + static inline uint32_t lxvd2x(uint32_t xt, uint32_t ra, uint32_t rb) { + if (!(xt <= 0x3F)) throw 
std::runtime_error("xt <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + return X_form(31, t, ra, rb, 844, tx); + } + + static inline uint32_t vperm(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 43); } + + static inline uint32_t xxmrghw(uint32_t xt, uint32_t xa, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, 18, ax, bx, tx); + } + + static inline uint32_t xvadddp(uint32_t xt, uint32_t xa, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, 96, ax, bx, tx); + } + + static inline uint32_t xvsubdp(uint32_t xt, uint32_t xa, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, 104, ax, bx, tx); + } + + static inline uint32_t xvmuldp(uint32_t xt, uint32_t xa, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, 112, ax, bx, tx); + } + + static inline uint32_t xvdivdp(uint32_t xt, uint32_t xa, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, 120, ax, bx, tx); + } + + static inline uint32_t xvsqrtdp(uint32_t xt, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX2_form(60, t, 0, b, 203, bx, tx); + } + + static inline uint32_t xvcvsxwdp(uint32_t xt, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX2_form(60, t, 0, b, 248, bx, tx); + } + + static inline uint32_t xxpermdi(uint32_t xt, uint32_t xa, uint32_t xb, uint32_t dm) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt 
& 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, (dm << 5) | 10, ax, bx, tx); + } + + static inline uint32_t xxswapd(uint32_t xt, uint32_t xa) { return xxpermdi(xt, xa, xa, 2); } + + static inline uint32_t xxland(uint32_t xt, uint32_t xa, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, 130, ax, bx, tx); + } + + static inline uint32_t xxlor(uint32_t xt, uint32_t xa, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, 146, ax, bx, tx); + } + + static inline uint32_t xxlxor(uint32_t xt, uint32_t xa, uint32_t xb) { + if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); + if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); + if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F"); + uint32_t t = xt & 0x1F; + uint32_t tx = xt >> 5; + uint32_t a = xa & 0x1F; + uint32_t ax = xa >> 5; + uint32_t b = xb & 0x1F; + uint32_t bx = xb >> 5; + return XX3_form(60, t, a, b, 154, ax, bx, tx); + } + +} + +namespace randomx { + + static const uint8_t* codeConstants = (uint8_t*)&randomx_ppc64_constants; + static const uint8_t* codeConstantLutFprcToFpscr = (uint8_t*)&randomx_ppc64_constant_lut_fprc_to_fpscr; + static const uint8_t* codeConstantsEnd = (uint8_t*)&randomx_ppc64_constants_end; + + static const uint8_t* codeDatasetInit = (uint8_t*)&randomx_ppc64_dataset_init; + static const uint8_t* codeDatasetInitFixCall = (uint8_t*)&randomx_ppc64_dataset_init_fix_call; + static const uint8_t* codeDatasetInitEnd = (uint8_t*)&randomx_ppc64_dataset_init_end; + + static const uint8_t* codeSshashSingleItemPrologue = (uint8_t*)&randomx_ppc64_sshash_single_item_prologue; + static const uint8_t* codeSshashSingleItemPrologueEnd = (uint8_t*)&randomx_ppc64_sshash_single_item_prologue_end; + static const uint8_t* codeSshashSingleItemEpilogue = (uint8_t*)&randomx_ppc64_sshash_single_item_epilogue; + static const uint8_t* codeSshashSingleItemEpilogueEnd = (uint8_t*)&randomx_ppc64_sshash_single_item_epilogue_end; + static const uint8_t* codeSshashCachePrefetch = (uint8_t*)&randomx_ppc64_sshash_cache_prefetch; + static const uint8_t* codeSshashCachePrefetchEnd = (uint8_t*)&randomx_ppc64_sshash_cache_prefetch_end; + static const uint8_t* codeSshashXor = (uint8_t*)&randomx_ppc64_sshash_xor; + static const uint8_t* codeSshashXorEnd = (uint8_t*)&randomx_ppc64_sshash_xor_end; + + static const uint8_t* codeVmPrologue = (uint8_t*)&randomx_ppc64_vm_prologue; + static const uint8_t* codeVmPrologueEnd = (uint8_t*)&randomx_ppc64_vm_prologue_end; + static const uint8_t* codeVmEpilogue = (uint8_t*)&randomx_ppc64_vm_epilogue; + static const uint8_t* codeVmFixLoop = (uint8_t*)&randomx_ppc64_vm_fix_loop; + static const uint8_t* codeVmEpilogueEnd = (uint8_t*)&randomx_ppc64_vm_epilogue_end; + static const uint8_t* codeVmLoopPrologue = 
(uint8_t*)&randomx_ppc64_vm_loop_prologue; + static const uint8_t* codeVmLoopPrologueEnd = (uint8_t*)&randomx_ppc64_vm_loop_prologue_end; + static const uint8_t* codeVmDataRead = (uint8_t*)&randomx_ppc64_vm_data_read; + static const uint8_t* codeVmDataReadEnd = (uint8_t*)&randomx_ppc64_vm_data_read_end; + static const uint8_t* codeVmDataReadLight = (uint8_t*)&randomx_ppc64_vm_data_read_light; + static const uint8_t* codeVmDataReadLightFixCall = (uint8_t*)&randomx_ppc64_vm_data_read_light_fix_call; + static const uint8_t* codeVmDataReadLightEnd = (uint8_t*)&randomx_ppc64_vm_data_read_light_end; + static const uint8_t* codeVmSpadStore = (uint8_t*)&randomx_ppc64_vm_spad_store; + static const uint8_t* codeVmSpadStoreEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_end; + static const uint8_t* codeVmSpadStoreHardAes = (uint8_t*)&randomx_ppc64_vm_spad_store_hard_aes; + static const uint8_t* codeVmSpadStoreHardAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_hard_aes_end; + + static const int32_t sizeConstants = codeConstantsEnd - codeConstants; + + static const int32_t sizeDatasetInit = codeDatasetInitEnd - codeDatasetInit; + + static const int32_t sizeSshashSingleItemPrologue = codeSshashSingleItemPrologueEnd - codeSshashSingleItemPrologue; + static const int32_t sizeSshashSingleItemEpilogue = codeSshashSingleItemEpilogueEnd - codeSshashSingleItemEpilogue; + static const int32_t sizeSshashCachePrefetch = codeSshashCachePrefetchEnd - codeSshashCachePrefetch; + static const int32_t sizeSshashXor = codeSshashXorEnd - codeSshashXor; + + static const int32_t sizeVmPrologue = codeVmPrologueEnd - codeVmPrologue; + static const int32_t sizeVmEpilogue = codeVmEpilogueEnd - codeVmEpilogue; + static const int32_t sizeVmLoopPrologue = codeVmLoopPrologueEnd - codeVmLoopPrologue; + static const int32_t sizeVmDataRead = codeVmDataReadEnd - codeVmDataRead; + static const int32_t sizeVmDataReadLight = codeVmDataReadLightEnd - codeVmDataReadLight; + static const int32_t sizeVmSpadStore = codeVmSpadStoreEnd - codeVmSpadStore; + static const int32_t sizeVmSpadStoreHardAes = codeVmSpadStoreHardAesEnd - codeVmSpadStoreHardAes; + + static const int32_t offsetConstantLutFprcToFpscr = codeConstantLutFprcToFpscr - codeConstants; + + static const int32_t offsetDatasetInitFixCall = codeDatasetInitFixCall - codeDatasetInit; + + static const int32_t offsetVmFixLoop = codeVmFixLoop - codeVmEpilogue; + static const int32_t offsetVmDataReadLightFixCall = codeVmDataReadLightFixCall - codeVmDataReadLight; + + constexpr size_t CodeAlign = 64*1024; // 64 kB, to ensure alignment on systems with a page size <= 64 kB + static const size_t ConstantPoolSize = alignSize(sizeConstants + 16, CodeAlign); // Add 16 bytes for the Group E OR vector mask + static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStore + sizeVmSpadStoreHardAes, CodeAlign); + constexpr size_t MaxRandomXInstrCodeSize = 4*11; // CBRANCH requires at most 11 instructions + constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 instructions + static const size_t SuperscalarProgramHeaders = sizeSshashSingleItemPrologue + sizeSshashSingleItemEpilogue; + + static const size_t RandomXCodeSize = alignSize(ConstantPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_MAX_SIZE, CodeAlign); + static const size_t SuperscalarSize = alignSize(sizeDatasetInit + SuperscalarProgramHeaders + (sizeSshashCachePrefetch + sizeSshashXor + 
MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign);
+
+	static const uint32_t CodeSize = RandomXCodeSize + SuperscalarSize;
+
+	constexpr uint32_t ConstantsBaseAddressRegisterGPR2 = 2;
+	constexpr uint32_t ConstantVectorBePermutationMaskVR16 = 16;
+	constexpr uint32_t ConstantVectorBePermutationMaskVSR48 = 32 + ConstantVectorBePermutationMaskVR16;
+	constexpr uint32_t ConstantVectorGroupEAndMaskVR17 = 17;
+	constexpr uint32_t ConstantVectorGroupEAndMaskVSR49 = 32 + ConstantVectorGroupEAndMaskVR17;
+	constexpr uint32_t ConstantVectorFscalXorMaskVR18 = 18;
+	constexpr uint32_t ConstantVectorFscalXorMaskVSR50 = 32 + ConstantVectorFscalXorMaskVR18;
+	constexpr uint32_t ConstantVectorGroupEOrMaskVR19 = 19;
+	constexpr uint32_t ConstantVectorGroupEOrMaskVSR51 = 32 + ConstantVectorGroupEOrMaskVR19;
+
+	constexpr uint32_t MaGPR24 = 24;
+	constexpr uint32_t MxGPR25 = 25;
+	constexpr uint32_t SpAddr0GPR26 = 26;
+	constexpr uint32_t SpAddr1GPR27 = 27;
+	constexpr uint32_t ScratchpadPointerGPR30 = 30;
+
+	template <size_t N>
+	struct GprMap {
+		uint32_t regs[N];
+		uint32_t getPpcGprNum(uint8_t idx) const {
+			return regs[idx % N];
+		}
+	};
+
+	template <size_t N>
+	struct VsrMap {
+		uint32_t regs[N];
+		uint32_t getPpcVrNum(uint8_t idx) const {
+			return regs[idx % N];
+		}
+		uint32_t getPpcVsrNum(uint8_t idx) const {
+			return regs[idx % N] + 32;
+		}
+	};
+
+	static const GprMap<8> RegisterMapR = {{ 14, 15, 16, 17, 18, 19, 20, 21 }};
+	static const VsrMap<4> RegisterMapF = {{ 0, 1, 2, 3 }};
+	static const VsrMap<4> RegisterMapE = {{ 4, 5, 6, 7 }};
+	static const VsrMap<4> RegisterMapA = {{ 8, 9, 10, 11 }};
+	static const VsrMap<8> RegisterMapFE = {{ 0, 1, 2, 3, 4, 5, 6, 7 }};
+
+	static const GprMap<8> RegisterMapSsh = {{ 4, 6, 7, 9, 10, 11, 12, 22 }};
+
+	template <typename T> static constexpr size_t Log2(T value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; }
+
+	constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
+		return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
+	}
+
+	static void clearCache(CodeBuffer& buf) {
+#ifdef __GNUC__
+		__builtin___clear_cache((char*)buf.code, (char*)(buf.code + CodeSize));
+#endif
+	}
+
+	static void emitLoadGpr64(CompilerState& state, uint32_t rt, uint32_t ra, uint32_t rb) {
+		if (PPC_BIG_ENDIAN) {
+			state.emit(PPC64::ldbrx(rt, ra, rb));
+		} else {
+			state.emit(PPC64::ldx(rt, ra, rb));
+		}
+	}
+
+	static void emitLoadVr64(CompilerState& state, uint32_t vrt, uint32_t ra, uint32_t rb) {
+		// We need to load the two packed little-endian signed 32-bit integers into a VSR, shuffle them
+		// into the correct halves of the register in the correct byte order, and then convert the
+		// signed 32-bit ints to doubles.
+		uint32_t xt = 32 + vrt;
+		state.emit(PPC64::lxsdx(xt, ra, rb));
+		if (PPC_BIG_ENDIAN) {
+			// Register XT contains the value as [ 0123 4567 zzzz zzzz ]
+			state.emit(PPC64::vperm(vrt, vrt, vrt, ConstantVectorBePermutationMaskVR16)); // Shuffles values in XT to be [ 7654 7654 3210 3210 ]
+		} else {
+			// Register XT contains the value as [ 7654 3210 zzzz zzzz ]
+			state.emit(PPC64::xxmrghw(xt, xt, xt)); // Shuffles values in XT to be [ 7654 7654 3210 3210 ]
+		}
+		state.emit(PPC64::xvcvsxwdp(xt, xt)); // Needs values in XT as [ 7654 zzzz 3210 zzzz ]
+	}
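+
+	// (note) End-to-end effect of emitLoadVr64, modeled in scalar code. After the converted pair
+	// is later written back with the STORE_LE_VR macro, the in-memory register holds, on either
+	// endianness:
+	//     out[0] = (double)(int32_t)load32le(src + 0);
+	//     out[1] = (double)(int32_t)load32le(src + 4);
+	// i.e. each packed little-endian signed word becomes one double lane (a sketch of the intended
+	// result, not of the emitted instructions).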
+
+	static void emitMovImm32(CompilerState& state, int reg, uint32_t imm) {
+		// Move signed 32-bit immediate into 64-bit register.
+		// Note that `imm` is a `uint32_t` and not an `int32_t` for type compatibility--it has no effect on
+		// functionality because `lis` will automatically sign-extend the 16-bit value.
+		int32_t simm = (int32_t)imm;
+		if (simm >= -32768 && simm <= 32767) {
+			state.emit(PPC64::li(reg, simm & 0xFFFF));
+		} else {
+			uint16_t upper = (imm >> 16) & 0xFFFF;
+			uint16_t lower = (imm >> 0) & 0xFFFF;
+
+			state.emit(PPC64::lis(reg, upper));
+			if (lower)
+				state.emit(PPC64::ori(reg, reg, lower));
+		}
+	}
+
+	static void emitAddImm32(CompilerState& state, int dstReg, int srcReg, uint32_t imm) {
+		int32_t simm = (int32_t)imm;
+		if (simm >= -32768 && simm <= 32767) {
+			state.emit(PPC64::addi(dstReg, srcReg, simm & 0xFFFF));
+		} else {
+			emitMovImm32(state, 8, imm);
+			state.emit(PPC64::add(dstReg, srcReg, 8));
+		}
+	}
+
+	static void emitMovImm64(CompilerState& state, int reg, uint64_t imm) {
+		if (imm == (uint64_t)(int64_t)(int32_t)imm) {
+			// Values that can be represented by loading a 32-bit signed immediate
+			emitMovImm32(state, reg, (uint32_t)imm);
+		} else {
+			uint64_t lowestBit = imm & -(int64_t)imm;
+			uint64_t added = imm + lowestBit;
+			if (imm != 0 && imm != ~0ULL && (added & (added - 1)) == 0) {
+				// Values that are a contiguous sequence of 1s.
+				// mb is the big-endian bit number of the first 1 bit of the run; clz(added) lands
+				// one bit above the run, so add 1 (added == 0 means the run reaches bit 0).
+				uint32_t mb = added == 0 ? 0 : __builtin_clzll(added) + 1;
+				uint32_t me = 63 - __builtin_ctzll(lowestBit);
+				state.emit(PPC64::li(reg, -1));
+				if (mb == 0) {
+					state.emit(PPC64::rldicr(reg, reg, 0, me));
+				} else if (me == 63) {
+					state.emit(PPC64::rldicl(reg, reg, 0, mb));
+				} else {
+					state.emit(PPC64::rldic(reg, reg, 63 - me, mb));
+				}
+				return;
+			}
+
+			// All other values
+			uint32_t high = imm >> 32;
+			uint32_t low = imm & 0xFFFFFFFF;
+
+			if (high) {
+				emitMovImm32(state, reg, high);
+				state.emit(PPC64::sldi(reg, reg, 32));
+			} else {
+				state.emit(PPC64::li(reg, 0));
+			}
+
+			uint16_t lower = (low >> 16) & 0xFFFF;
+			uint16_t lowest = low & 0xFFFF;
+
+			if (lower)
+				state.emit(PPC64::oris(reg, reg, lower));
+
+			if (lowest)
+				state.emit(PPC64::ori(reg, reg, lowest));
+		}
+	}
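+
+	// (note) Worked example of the contiguous-run path above, for imm = 0x0000FFFFFFFF0000:
+	//     lowestBit = 0x0000000000010000, added = 0x0001000000000000 (a power of two -> run found)
+	//     mb = clz(added) + 1 = 16, me = 63 - ctz(lowestBit) = 47 (big-endian bit numbers)
+	// emitted code: li r8,-1 ; rldic r8,r8,16,16 -- the rotate of all-ones is a no-op and the
+	// mask keeps bits 16..47, materializing the immediate in 2 instructions instead of up to 5.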
+
+	template <uint32_t tmp_gpr>
+	static void emitLoadGprFromScratchpad(CompilerState& state, uint32_t dst, uint32_t src, Instruction& instr) {
+		uint32_t imm = instr.getImm32();
+
+		if (src != dst) {
+			uint32_t size = instr.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2;
+			imm &= size - 1;
+			emitAddImm32(state, tmp_gpr, src, imm);
+
+			uint32_t mb = 32 - Log2(size);
+			state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28));
+
+			emitLoadGpr64(state, tmp_gpr, ScratchpadPointerGPR30, tmp_gpr);
+		} else {
+			imm = (imm & ScratchpadL3Mask) >> 3;
+			emitMovImm32(state, tmp_gpr, imm);
+			state.emit(PPC64::sldi(tmp_gpr, tmp_gpr, 3));
+
+			emitLoadGpr64(state, tmp_gpr, ScratchpadPointerGPR30, tmp_gpr);
+		}
+	}
+
+	template <uint32_t tmp_vr>
+	static void emitLoadVsrFromScratchpad(CompilerState& state, Instruction& instr) {
+		int src = RegisterMapR.getPpcGprNum(instr.src);
+
+		uint32_t imm = instr.getImm32();
+		uint32_t size = instr.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2;
+		imm &= size - 1;
+		emitAddImm32(state, 8, src, imm);
+
+		uint32_t mb = 32 - Log2(size);
+		state.emit(PPC64::rlwinm(8, 8, 0, mb, 28));
+
+		emitLoadVr64(state, tmp_vr, ScratchpadPointerGPR30, 8);
+	}
+
+	void JitCompilerPPC64::emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags) {
+		state.codePos = RandomXCodePos;
+
+		// Load the Group E OR vector mask (high word in offset 1, low word in offset 0--enables loading with lxvd2x)
+		state.emitAt(sizeConstants, pcfg.eMask[1]);
+		state.emitAt(sizeConstants + 8, pcfg.eMask[0]);
+
+		LoopBeginPos = state.codePos;
+		state.emit(codeVmLoopPrologue, sizeVmLoopPrologue);
+
+		// Step 4: The 256 instructions stored in the Program Buffer are executed.
+		for (unsigned i = 0; i < RegistersCount; ++i) {
+			state.registerUsage[i] = -1;
+		}
+		for (unsigned i = 0; i < prog.getSize(flags); ++i) {
+			Instruction instr = prog(i);
+			instr.src %= RegistersCount;
+			instr.dst %= RegistersCount;
+			state.instructionOffsets[i] = state.codePos;
+			opcodeMap1[instr.opcode](state, instr, i, flags);
+		}
+	}
+
+	void JitCompilerPPC64::emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags) {
+		if (flags & RANDOMX_FLAG_V2) {
+			if (true || (flags & RANDOMX_FLAG_HARD_AES)) { // TODO: Remove the "true" once software AES is working
+				unsigned long hwcaps2 = getauxval(AT_HWCAP2);
+				if (!(hwcaps2 & PPC_FEATURE2_VEC_CRYPTO)) {
+					throw std::runtime_error("This CPU is missing support for hardware AES!");
+				}
+				state.emit(codeVmSpadStoreHardAes, sizeVmSpadStoreHardAes);
+			} else {
+				throw std::runtime_error("Software AES is not yet implemented for PPC64!");
+			}
+		} else {
+			state.emit(codeVmSpadStore, sizeVmSpadStore);
+		}
+
+		state.emit(PPC64::xor_(SpAddr0GPR26, RegisterMapR.getPpcGprNum(pcfg.readReg0), RegisterMapR.getPpcGprNum(pcfg.readReg1)));
+
+		// spAddr1 (r27) = r26 >> 32
+		state.emit(PPC64::srdi(SpAddr1GPR27, SpAddr0GPR26, 32));
+		// spAddr0 (r26) = r26 & 0xFFFFFFFF
+		state.emit(PPC64::rldicl(SpAddr0GPR26, SpAddr0GPR26, 0, 32));
+
+		// Load Scratchpad L3 mask into r8
+		uint32_t l3Mask = (RANDOMX_SCRATCHPAD_L3 - 1) & ~63;
+		emitMovImm32(state, 8, l3Mask);
+
+		// Apply mask
+		state.emit(PPC64::and_(SpAddr0GPR26, SpAddr0GPR26, 8));
+		state.emit(PPC64::and_(SpAddr1GPR27, SpAddr1GPR27, 8));
+
+		// Add scratchpad base pointer (r30)
+		state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30));
+		state.emit(PPC64::add(SpAddr1GPR27, SpAddr1GPR27, ScratchpadPointerGPR30));
+
+		int32_t fixPos = state.codePos;
+		state.emit(codeVmEpilogue, sizeVmEpilogue);
+
+		int32_t fixContinuePos = fixPos + offsetVmFixLoop;
+		state.emitAt(fixContinuePos, PPC64::b(LoopBeginPos - fixContinuePos));
+	}
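+
+	// (note) Scalar model of the loop-end address update emitted above (a sketch):
+	//     spMix   = r[readReg0] ^ r[readReg1];
+	//     spAddr0 = scratchpad + ((spMix & 0xFFFFFFFF) & ((RANDOMX_SCRATCHPAD_L3 - 1) & ~63));
+	//     spAddr1 = scratchpad + ((spMix >> 32)        & ((RANDOMX_SCRATCHPAD_L3 - 1) & ~63));
+	// keeping both pointers 64-byte aligned inside the L3 scratchpad.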
+
+	JitCompilerPPC64::JitCompilerPPC64() {
+		state.code = (uint8_t*) allocMemoryPages(CodeSize);
+		if (state.code == nullptr)
+			throw std::runtime_error("allocMemoryPages");
+
+		state.codePos = 0;
+		state.emit(codeConstants, sizeConstants);
+
+		state.codePos = ConstantPoolSize;
+		entryProgram = state.code + state.codePos;
+		// Load r2 with the base address of the constant pool
+		emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast<uint64_t>(state.code));
+		state.emit(codeVmPrologue, sizeVmPrologue);
+		// Mask mx and ma with Scratchpad L3 mask
+		uint32_t mask_begin = 32 - Log2(RANDOMX_SCRATCHPAD_L3);
+		uint32_t mask_end = 31 - Log2(RANDOMX_DATASET_ITEM_SIZE);
+		state.emit(PPC64::rlwinm(SpAddr0GPR26, MxGPR25, 0, mask_begin, mask_end));
+		state.emit(PPC64::rlwinm(SpAddr1GPR27, MaGPR24, 0, mask_begin, mask_end));
+		// Init spAddr0 to masked mx + scratchpad base
+		state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30));
+		// Init spAddr1 to masked ma + scratchpad base
+		state.emit(PPC64::add(SpAddr1GPR27, SpAddr1GPR27, ScratchpadPointerGPR30));
+		RandomXCodePos = state.codePos;
+
+		state.codePos = RandomXCodeSize;
+		entryDataInit = state.code + state.codePos;
+		// Load r2 with the base address of the constant pool
+		emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast<uint64_t>(state.code));
+		int32_t datasetInitFixCallPos = state.codePos + offsetDatasetInitFixCall;
+		state.emit(codeDatasetInit, sizeDatasetInit);
+		SshashSingleItemPos = alignSize(state.codePos, 128);
+		// Patch in the call to the SuperScalar Hash single item function
+		state.emitAt(datasetInitFixCallPos, PPC64::bl(SshashSingleItemPos - datasetInitFixCallPos));
+
+		clearCache(state);
+	}
+
+	JitCompilerPPC64::~JitCompilerPPC64() {
+		freePagedMemory(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableWriting() {
+		setPagesRW(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableExecution() {
+		setPagesRX(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableAll() {
+		setPagesRWX(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
+		emitProgramPrefix(state, prog, pcfg, flags);
+
+		// Step 5a: Save ma in mt (r9, temporary)
+		int mtReg = 9;
+		state.emit(PPC64::mr(mtReg, MaGPR24));
+
+		// Step 5b: the mp register is XORed with the low 32 bits of registers readReg2 and readReg3
+		int mpReg = (flags & RANDOMX_FLAG_V2) ? MaGPR24 : MxGPR25; // r24 = ma, r25 = mx
+		state.emit(PPC64::xor_(8, RegisterMapR.getPpcGprNum(pcfg.readReg2), RegisterMapR.getPpcGprNum(pcfg.readReg3)));
+		// Zero-extend r8 to 32 bits (clear upper 32 bits)
+		state.emit(PPC64::rldicl(8, 8, 0, 32));
+		// mp ^= (readReg2 ^ readReg3)
+		state.emit(PPC64::xor_(mpReg, mpReg, 8));
+
+		int32_t dataReadPos = state.codePos;
+		state.emit(codeVmDataRead, sizeVmDataRead);
+
+		uint32_t mask_begin = 32 - Log2(RANDOMX_DATASET_BASE_SIZE);
+		uint32_t mask_end = 31 - Log2(CacheLineSize);
+
+		// Patch prefetch address calculation (offset 0)
+		state.emitAt(dataReadPos, PPC64::rlwinm(8, mpReg, 0, mask_begin, mask_end));
+
+		// Patch read address calculation (offset 12)
+		state.emitAt(dataReadPos + 12, PPC64::rlwinm(8, mtReg, 0, mask_begin, mask_end));
+
+		emitProgramSuffix(state, pcfg, flags);
+
+		clearCache(state);
+	}
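+
+	// (note) The two patched rlwinm instructions above each implement, in one instruction (sketch):
+	//     addr = reg & (RANDOMX_DATASET_BASE_SIZE - 1) & ~(CacheLineSize - 1);
+	// rlwinm dst,src,0,mb,me is a 32-bit AND with the contiguous mask whose first/last set bits
+	// (big-endian numbering) are mb = 32 - Log2(RANDOMX_DATASET_BASE_SIZE) and
+	// me = 31 - Log2(CacheLineSize), i.e. a cache-line-aligned offset inside the dataset.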
+
+	void JitCompilerPPC64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
+		emitProgramPrefix(state, prog, pcfg, flags);
+
+		// Step 5a: Save ma in mt (r9, temporary)
+		int mtReg = 9;
+		state.emit(PPC64::mr(mtReg, MaGPR24));
+
+		// Step 5b: the mp register is XORed with the low 32 bits of registers readReg2 and readReg3
+		int mpReg = (flags & RANDOMX_FLAG_V2) ? MaGPR24 : MxGPR25; // r24 = ma, r25 = mx
+		state.emit(PPC64::xor_(8, RegisterMapR.getPpcGprNum(pcfg.readReg2), RegisterMapR.getPpcGprNum(pcfg.readReg3)));
+		// Zero-extend r8 to 32 bits (clear upper 32 bits)
+		state.emit(PPC64::rldicl(8, 8, 0, 32));
+		// mp ^= (readReg2 ^ readReg3)
+		state.emit(PPC64::xor_(mpReg, mpReg, 8));
+
+		// Calculate itemNumber = (mt & datasetMask) / CacheLineSize
+		uint32_t datasetMask = (RANDOMX_DATASET_BASE_SIZE - 1) & ~63;
+		emitMovImm32(state, 8, datasetMask);
+		state.emit(PPC64::and_(5, mtReg, 8)); // r5 = mt & datasetMask
+		state.emit(PPC64::srdi(5, 5, Log2(CacheLineSize))); // r5 = r5 >> 6
+
+		emitAddImm32(state, 5, 5, datasetOffset / CacheLineSize);
+
+		int32_t callPos = state.codePos + offsetVmDataReadLightFixCall;
+		state.emit(codeVmDataReadLight, sizeVmDataReadLight);
+		state.emitAt(callPos, PPC64::bl(SshashSingleItemPos - callPos));
+
+		emitProgramSuffix(state, pcfg, flags);
+
+		clearCache(state);
+	}
+
+	static void generateSuperscalarCode(CompilerState& state, Instruction instr, const std::vector<uint64_t>& reciprocalCache) {
+		int dst = RegisterMapSsh.getPpcGprNum(instr.dst);
+		int src = RegisterMapSsh.getPpcGprNum(instr.src);
+		uint32_t rotation = instr.getImm32() & 63;
+
+		switch ((SuperscalarInstructionType)instr.opcode) {
+		case SuperscalarInstructionType::ISUB_R:
+			// subf dst, src, dst
+			state.emit(PPC64::subf(dst, src, dst));
+			break;
+		case SuperscalarInstructionType::IXOR_R:
+			// xor dst, dst, src
+			state.emit(PPC64::xor_(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IADD_RS:
+			// sldi r8, src, shift
+			state.emit(PPC64::sldi(8, src, instr.getModShift()));
+			// add dst, dst, r8
+			state.emit(PPC64::add(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IMUL_R:
+			// mulld dst, dst, src
+			state.emit(PPC64::mulld(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IROR_C:
+			if (rotation) {
+				// rotrdi dst, dst, imm
+				state.emit(PPC64::rotrdi(dst, dst, rotation));
+			}
+			break;
+		case SuperscalarInstructionType::IADD_C7:
+		case SuperscalarInstructionType::IADD_C8:
+		case SuperscalarInstructionType::IADD_C9:
+			emitMovImm32(state, 8, instr.getImm32());
+			// add dst, dst, r8
+			state.emit(PPC64::add(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IXOR_C7:
+		case SuperscalarInstructionType::IXOR_C8:
+		case SuperscalarInstructionType::IXOR_C9:
+			emitMovImm32(state, 8, instr.getImm32());
+			// xor dst, dst, r8
+			state.emit(PPC64::xor_(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IMULH_R:
+			// mulhdu dst, dst, src
+			state.emit(PPC64::mulhdu(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::ISMULH_R:
+			// mulhd dst, dst, src
+			state.emit(PPC64::mulhd(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IMUL_RCP:
+			emitMovImm64(state, 8, reciprocalCache[instr.getImm32()]);
+			// mulld dst, dst, r8
+			state.emit(PPC64::mulld(dst, dst, 8));
+			break;
+		default:
+			UNREACHABLE;
+		}
+	}
+
+	void JitCompilerPPC64::generateSuperscalarHash(SuperscalarProgramList& programs, std::vector<uint64_t> &reciprocalCache) {
+		state.codePos = SshashSingleItemPos;
+
+		// Steps 1 and 2
+		state.emit(codeSshashSingleItemPrologue, sizeSshashSingleItemPrologue);
+
+		for (size_t i = 0; i < programs.size(); ++i) {
+			SuperscalarProgram& prog = programs[i];
+
+			// Step 4
+			// rldic r8, r5, Log2(CacheLineSize), 64 - Log2(CacheSize / CacheLineSize) - Log2(CacheLineSize)
+			state.emit(PPC64::rldic(8, 5, Log2(CacheLineSize), 64 - Log2(CacheSize / CacheLineSize) - Log2(CacheLineSize)));
+			state.emit(codeSshashCachePrefetch + 4,
sizeSshashCachePrefetch - 4); + + // Step 5 + for (uint32_t j = 0; j < prog.getSize(); ++j) { + Instruction& instr = prog(j); + generateSuperscalarCode(state, instr, reciprocalCache); + } + + // Step 6 + state.emit(codeSshashXor, sizeSshashXor); + + uint32_t addrReg = RegisterMapSsh.getPpcGprNum(prog.getAddressRegister()); + state.emit(PPC64::mr(5, addrReg)); + + } + + // Return + state.emit(codeSshashSingleItemEpilogue, sizeSshashSingleItemEpilogue); + + clearCache(state); + } + + size_t JitCompilerPPC64::getCodeSize() { + return CodeSize; + } + + static void h_IADD_RS(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + int shift = isn.getModShift(); + + if (shift) { + state.emit(PPC64::sldi(8, src, shift)); + state.emit(PPC64::add(dst, dst, 8)); + } else { + state.emit(PPC64::add(dst, dst, src)); + } + + if (isn.dst == RegisterNeedsDisplacement) { + emitAddImm32(state, dst, dst, isn.getImm32()); + } + } + static void h_IADD_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + emitLoadGprFromScratchpad<8>(state, dst, src, isn); + state.emit(PPC64::add(dst, dst, 8)); + } + static void h_ISUB_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::subf(dst, src, dst)); + } else { + int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); + emitAddImm32(state, dst, dst, imm); + } + } + static void h_ISUB_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + emitLoadGprFromScratchpad<8>(state, dst, src, isn); + state.emit(PPC64::subf(dst, 8, dst)); + } + static void h_IMUL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::mulld(dst, dst, src)); + } else { + emitMovImm32(state, 8, isn.getImm32()); + state.emit(PPC64::mulld(dst, dst, 8)); + } + } + static void h_IMUL_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + emitLoadGprFromScratchpad<8>(state, dst, src, isn); + state.emit(PPC64::mulld(dst, dst, 8)); + } + static void h_IMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::mulhdu(dst, dst, src)); + } + static void h_IMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + emitLoadGprFromScratchpad<8>(state, dst, src, isn); + state.emit(PPC64::mulhdu(dst, dst, 8)); + } + static void h_ISMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::mulhd(dst, dst, src)); + } + static void h_ISMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + emitLoadGprFromScratchpad<8>(state, dst, src, isn); + state.emit(PPC64::mulhd(dst, dst, 8)); + } + static void h_IMUL_RCP(HANDLER_ARGS) { + 
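+		// (note) IMUL_RCP replaces division by a constant with a single multiply:
+		// randomx_reciprocal_fast(divisor) is a fixed-point 2^x / divisor with its top bit set,
+		// and only the low 64 bits of the product are kept, matching the x86 and a64 backends.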
uint32_t divisor = isn.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + uint64_t rcp = randomx_reciprocal_fast(divisor); + emitMovImm64(state, 8, rcp); + state.emit(PPC64::mulld(dst, dst, 8)); + } + } + static void h_INEG_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + state.emit(PPC64::neg(dst, dst)); + } + static void h_IXOR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::xor_(dst, dst, src)); + } else { + emitMovImm32(state, 8, isn.getImm32()); + state.emit(PPC64::xor_(dst, dst, 8)); + } + } + static void h_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + emitLoadGprFromScratchpad<8>(state, dst, src, isn); + state.emit(PPC64::xor_(dst, dst, 8)); + } + static void h_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::neg(8, src)); + state.emit(PPC64::rldcl(dst, dst, 8, 0)); + } else { + uint32_t imm = isn.getImm32() & 63; + if (imm) + state.emit(PPC64::rotrdi(dst, dst, imm)); + } + } + static void h_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::rldcl(dst, dst, src, 0)); + } else { + uint32_t imm = isn.getImm32() & 63; + if (imm) + state.emit(PPC64::rotldi(dst, dst, imm)); + } + } + static void h_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::mr(8, dst)); + state.emit(PPC64::mr(dst, src)); + state.emit(PPC64::mr(src, 8)); + } + } + static void h_FSWAP_R(HANDLER_ARGS) { + int dst = RegisterMapFE.getPpcVsrNum(isn.dst); + state.emit(PPC64::xxswapd(dst, dst)); + } + static void h_FADD_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvadddp(dst, dst, src)); + } + static void h_FADD_M(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + emitLoadVsrFromScratchpad<12>(state, isn); + state.emit(PPC64::xvadddp(dst, dst, 32 + 12)); + } + static void h_FSUB_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvsubdp(dst, dst, src)); + } + static void h_FSUB_M(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + emitLoadVsrFromScratchpad<12>(state, isn); + state.emit(PPC64::xvsubdp(dst, dst, 32 + 12)); + } + static void h_FSCAL_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + state.emit(PPC64::xxlxor(dst, dst, ConstantVectorFscalXorMaskVSR50)); + } + static void h_FMUL_R(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvmuldp(dst, dst, src)); + } + static void h_FDIV_M(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + uint32_t temp_vsr = 32 + 12; + emitLoadVsrFromScratchpad<12>(state, isn); + 
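+		// (note) Group E conversion per the RandomX spec: the AND mask clears the divisor's sign
+		// and top exponent bits, and the OR mask (the per-program eMask patched into the constant
+		// pool) sets a fixed exponent, so the divisor is always a positive normal double.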
state.emit(PPC64::xxland(temp_vsr, temp_vsr, ConstantVectorGroupEAndMaskVSR49)); + state.emit(PPC64::xxlor(temp_vsr, temp_vsr, ConstantVectorGroupEOrMaskVSR51)); + state.emit(PPC64::xvdivdp(dst, dst, temp_vsr)); + } + static void h_FSQRT_R(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + state.emit(PPC64::xvsqrtdp(dst, dst)); + } + static void h_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + ConditionOffset; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + + int dst = RegisterMapR.getPpcGprNum(reg); + emitAddImm32(state, dst, dst, imm); + + uint64_t mask = (uint64_t)ConditionMask << shift; + emitMovImm64(state, 8, mask); + state.emit(PPC64::and_dot(8, dst, 8)); + + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + + if (offset >= -(1 << 15) && offset < (1 << 15)) { + state.emit(PPC64::beq(offset)); + } else { + // Branch over the jump if not equal + state.emit(PPC64::bne(8)); + state.emit(PPC64::b(offset - 4)); + } + + for (unsigned j = 0; j < RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + static void h_CFROUND(HANDLER_ARGS) { + int src = RegisterMapR.getPpcGprNum(isn.src); + int32_t rotateBits = isn.getImm32() & 63; + + // Operate directly on src by default + int rot_src = src; + + // Rotate right by rotateBits + if (rotateBits) { + // rotrdi r8, src, rotateBits + state.emit(PPC64::rotrdi(8, src, rotateBits)); + + // We rotated src and put the new value in r8 + rot_src = 8; + } + + int32_t patch_pos = 0; + if (flags & RANDOMX_FLAG_V2) { + // Skip the rest of the code if bits 5:2 are not zero. Use GPR0 as a discard register. + // andi. r0, rot_src, 0x003C + state.emit(PPC64::andi_dot(0, rot_src, 0x003C)); + + // Get position to patch with conditional branch. + patch_pos = state.codePos; + + // Emit invalid instruction now and patch later once we have the code length. + state.emit(0); // bne skip_update + } + + // Mask out bits 1:0 and multiply by 8 (shift left by 3) to get the table word offset (0, 8, 16, 24) + // rldic r8, rot_src, 3, 59 + state.emit(PPC64::rldic(8, rot_src, 3, 59)); + + // Load table address into scratch GPR0 + emitAddImm32(state, 0, ConstantsBaseAddressRegisterGPR2, offsetConstantLutFprcToFpscr); + + // Load value from fprc-to-FPSCR table into temporary FPR0 + // lfdx f0, r8, r0 + state.emit(PPC64::lfdx(0, 8, 0)); + + // Move the RN value from scratch FPR0 to FPSCR (masked) + // mtfsf 0x01, f0, 0, 0 + state.emit(PPC64::mtfsf(0x01, 0, 0, 0)); + + if (flags & RANDOMX_FLAG_V2) { + // Patch in the conditional branch instruction. + int32_t branch_offset = state.codePos - patch_pos; + state.emitAt(patch_pos, PPC64::bne(branch_offset)); + } + } + static void h_ISTORE(HANDLER_ARGS) { + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t imm = isn.getImm32(); + + uint32_t size; + if (isn.getModCond() < StoreL3Condition) { + size = isn.getModMem() ? 
RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2;
+		} else {
+			size = RANDOMX_SCRATCHPAD_L3;
+		}
+		imm &= size - 1;
+
+		emitAddImm32(state, 8, dst, imm);
+
+		uint32_t mb = 32 - Log2(size);
+		state.emit(PPC64::rlwinm(8, 8, 0, mb, 28));
+
+		state.emit(PPC64::stdx(src, ScratchpadPointerGPR30, 8));
+	}
+	static void h_NOP(HANDLER_ARGS) {
+	}
+}
+
+#include "instruction_weights.hpp"
+
+namespace {
+
+#define INST_HANDLE(x) REPN(&randomx::h_##x, WT(x))
+
+	InstructionHandler* opcodeMap1[256] = {
+		INST_HANDLE(IADD_RS)
+		INST_HANDLE(IADD_M)
+		INST_HANDLE(ISUB_R)
+		INST_HANDLE(ISUB_M)
+		INST_HANDLE(IMUL_R)
+		INST_HANDLE(IMUL_M)
+		INST_HANDLE(IMULH_R)
+		INST_HANDLE(IMULH_M)
+		INST_HANDLE(ISMULH_R)
+		INST_HANDLE(ISMULH_M)
+		INST_HANDLE(IMUL_RCP)
+		INST_HANDLE(INEG_R)
+		INST_HANDLE(IXOR_R)
+		INST_HANDLE(IXOR_M)
+		INST_HANDLE(IROR_R)
+		INST_HANDLE(IROL_R)
+		INST_HANDLE(ISWAP_R)
+		INST_HANDLE(FSWAP_R)
+		INST_HANDLE(FADD_R)
+		INST_HANDLE(FADD_M)
+		INST_HANDLE(FSUB_R)
+		INST_HANDLE(FSUB_M)
+		INST_HANDLE(FSCAL_R)
+		INST_HANDLE(FMUL_R)
+		INST_HANDLE(FDIV_M)
+		INST_HANDLE(FSQRT_R)
+		INST_HANDLE(CBRANCH)
+		INST_HANDLE(CFROUND)
+		INST_HANDLE(ISTORE)
+		INST_HANDLE(NOP)
+	};
+
+#undef INST_HANDLE
+}
+
+#define INST_HANDLE(x) REPN(static_cast<uint8_t>(randomx::InstructionType::x), WT(x))
+
+alignas(128) uint8_t randomx::JitCompilerPPC64::instMap[256] = {
+	INST_HANDLE(IADD_RS)
+	INST_HANDLE(IADD_M)
+	INST_HANDLE(ISUB_R)
+	INST_HANDLE(ISUB_M)
+	INST_HANDLE(IMUL_R)
+	INST_HANDLE(IMUL_M)
+	INST_HANDLE(IMULH_R)
+	INST_HANDLE(IMULH_M)
+	INST_HANDLE(ISMULH_R)
+	INST_HANDLE(ISMULH_M)
+	INST_HANDLE(IMUL_RCP)
+	INST_HANDLE(INEG_R)
+	INST_HANDLE(IXOR_R)
+	INST_HANDLE(IXOR_M)
+	INST_HANDLE(IROR_R)
+	INST_HANDLE(IROL_R)
+	INST_HANDLE(ISWAP_R)
+	INST_HANDLE(FSWAP_R)
+	INST_HANDLE(FADD_R)
+	INST_HANDLE(FADD_M)
+	INST_HANDLE(FSUB_R)
+	INST_HANDLE(FSUB_M)
+	INST_HANDLE(FSCAL_R)
+	INST_HANDLE(FMUL_R)
+	INST_HANDLE(FDIV_M)
+	INST_HANDLE(FSQRT_R)
+	INST_HANDLE(CBRANCH)
+	INST_HANDLE(CFROUND)
+	INST_HANDLE(ISTORE)
+	INST_HANDLE(NOP)
+};
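For reference, h_CFROUND above translates the RandomX rounding mode (the two low bits of the rotated source) into PPC FPSCR.RN bits through an 8-byte-stride table in the constant pool. The table itself is defined in the assembly constants, which this excerpt truncates; a sketch of the lookup with assumed table contents:

    // Assumed LUT contents: RandomX fprc 0..3 = {nearest, toward -inf, toward +inf, toward zero},
    // PPC FPSCR.RN          0..3 = {nearest, toward zero, toward +inf, toward -inf}.
    static const uint64_t lutFprcToFpscr[4] = { 0, 3, 2, 1 };
    uint64_t rn = lutFprcToFpscr[rotated & 3]; // the handler computes the byte offset as (rotated & 3) * 8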
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "common.hpp"
+#include "jit_compiler.hpp"
+
+#include "jit_compiler_ppc64_static.hpp"
+
+namespace randomx {
+
+	class Program;
+	struct ProgramConfiguration;
+	class SuperscalarProgram;
+	class Instruction;
+
+	class JitCompilerPPC64 {
+	public:
+		JitCompilerPPC64();
+		~JitCompilerPPC64();
+
+		void generateProgram(Program&, ProgramConfiguration&);
+		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
+
+		void generateSuperscalarHash(SuperscalarProgramList& programs, std::vector<uint64_t> &);
+
+		void generateDatasetInitCode() {}
+
+		ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(entryProgram); }
+		DatasetInitFunc* getDatasetInitFunc() { return reinterpret_cast<DatasetInitFunc*>(entryDataInit); }
+		uint8_t* getCode() { return state.code; }
+		size_t getCodeSize();
+
+		void enableWriting();
+		void enableExecution();
+		void enableAll();
+
+		void setFlags(randomx_flags f) { flags = f; }
+
+		static uint8_t instMap[256];
+
+	private:
+		void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags);
+		void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags);
+
+		CompilerState state;
+		randomx_flags flags;
+
+		void* entryDataInit = nullptr;
+		void* entryProgram = nullptr;
+
+		int32_t RandomXCodePos;
+		int32_t SshashSingleItemPos;
+		int32_t LoopBeginPos;
+	};
+
+}
diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S
new file mode 100644
index 00000000..26ecef7c
--- /dev/null
+++ b/src/jit_compiler_ppc64_static.S
@@ -0,0 +1,826 @@
+/*
+Copyright (c) 2026, Forest Crossman
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + .machine power7 + .machine altivec + .abiversion 2 + .section ".rodata" // Not .text because it's not meant to be executed in-place. + +#include "configuration.h" + +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + #define PPC_BIG_ENDIAN 1 +#else + #define PPC_BIG_ENDIAN 0 +#endif + + .global randomx_ppc64_constants + .global randomx_ppc64_constant_lut_fprc_to_fpscr + .global randomx_ppc64_constants_end + + .global randomx_ppc64_dataset_init + .global randomx_ppc64_dataset_init_fix_call + .global randomx_ppc64_dataset_init_end + + .global randomx_ppc64_sshash_single_item_prologue + .global randomx_ppc64_sshash_single_item_prologue_end + .global randomx_ppc64_sshash_single_item_epilogue + .global randomx_ppc64_sshash_single_item_epilogue_end + .global randomx_ppc64_sshash_cache_prefetch + .global randomx_ppc64_sshash_cache_prefetch_end + .global randomx_ppc64_sshash_xor + .global randomx_ppc64_sshash_xor_end + + .global randomx_ppc64_vm_prologue + .global randomx_ppc64_vm_prologue_end + .global randomx_ppc64_vm_epilogue + .global randomx_ppc64_vm_fix_loop + .global randomx_ppc64_vm_epilogue_end + .global randomx_ppc64_vm_loop_prologue + .global randomx_ppc64_vm_loop_prologue_end + .global randomx_ppc64_vm_data_read + .global randomx_ppc64_vm_data_read_end + .global randomx_ppc64_vm_data_read_light + .global randomx_ppc64_vm_data_read_light_fix_call + .global randomx_ppc64_vm_data_read_light_end + .global randomx_ppc64_vm_spad_store + .global randomx_ppc64_vm_spad_store_end + .global randomx_ppc64_vm_spad_store_hard_aes + .global randomx_ppc64_vm_spad_store_hard_aes_end + + .global randomx_reciprocal_fast + +// Macro to store a VR containing a RandomX Group F/E/A register to memory +// Clobbers: v12 / vs44 +.macro STORE_LE_VR vr_src, vr_temp, offset_reg, base_reg +#if PPC_BIG_ENDIAN + vperm \vr_temp, \vr_src, \vr_src, %v15 // Reverse the bytes so they're arranged as [ 0123 4567 ] +#else + xxswapd \vr_temp + 32, \vr_src + 32 // Swap the doubles so they're arranged as [ 3210 7654 ] +#endif + stxvd2x \vr_temp + 32, \offset_reg, \base_reg // Store the two doubles to memory +.endm + +// Macro to shuffle a VR after being loaded with lxsdx. 
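+// (lxsdx loads one doubleword, i.e. two packed little-endian 32-bit integers,
+// so the shuffle places the two words in the correct halves of the register
+// and in the correct byte order for the xvcvsxwdp conversions that follow in
+// the VM loop prologue.)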
+.macro SHUFFLE_VR vr_reg +#if PPC_BIG_ENDIAN + vperm \vr_reg, \vr_reg, \vr_reg, %v16 +#else + xxmrghw \vr_reg + 32, \vr_reg + 32, \vr_reg + 32 +#endif +.endm + +// Macro to load a GPR from little-endian bytes in memory +// Clobbers (BE only): r0 +.macro LOAD_LE_GPR reg, offset, base_reg +#if PPC_BIG_ENDIAN + li %r0, \offset + ldbrx \reg, \base_reg, %r0 +#else + ld \reg, \offset(\base_reg) +#endif +.endm + +// Macro to store a GPR to memory as little-endian bytes +// Clobbers (BE only): r0 +.macro STORE_LE_GPR reg, offset, base_reg +#if PPC_BIG_ENDIAN + li %r0, \offset + stdbrx \reg, \base_reg, %r0 +#else + std \reg, \offset(\base_reg) +#endif +.endm + + // Align constants to 128 bytes (lowest 7 bits masked) + .align 7 + +randomx_ppc64_constants: + +sshash_constant_0: .8byte 6364136223846793005 +sshash_constant_1: .8byte 9298411001130361340 +sshash_constant_2: .8byte 12065312585734608966 +sshash_constant_3: .8byte 9306329213124626780 +sshash_constant_4: .8byte 5281919268842080866 +sshash_constant_5: .8byte 10536153434571861004 +sshash_constant_6: .8byte 3398623926847679864 +sshash_constant_7: .8byte 9549104520008361294 + +randomx_ppc64_constant_lut_fprc_to_fpscr: + // RandomX fprc to PPC64 FPSCR lookup table + .8byte 0 /* 00 Round to Nearest */ + .8byte 3 /* 11 Round toward -Infinity */ + .8byte 2 /* 10 Round toward +Infinity */ + .8byte 1 /* 01 Round toward Zero */ + + // Align vector constants to 16 bytes (lowest 4 bits masked) + .align 4 + +constant_vector_group_e_and_mask: + .8byte 0x00FFFFFFFFFFFFFF + .8byte 0x00FFFFFFFFFFFFFF + +constant_vector_fscal_xor_mask: + .8byte 0x80F0000000000000 + .8byte 0x80F0000000000000 + +constant_vector_be_byte_reverse_mask: + // Big-endian vector byte reverse mask + .8byte 0x0F0E0D0C0B0A0908 + .8byte 0x0706050403020100 + +#if PPC_BIG_ENDIAN +constant_vector_be_permutation_mask: + // Big-endian vector permutation mask + .byte 7, 6, 5, 4, 7, 6, 5, 4 + .byte 3, 2, 1, 0, 3, 2, 1, 0 +#endif + +randomx_ppc64_constants_end: + +literal_vector_group_e_or_mask: + // Program generator will write the vector here + + +// Register allocations: dataset_init +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to randomx_cache +// r4 (volatile) -> arg1, pointer to dataset (uint8_t *) +// r5 (volatile) -> arg2, uint32_t startBlock / itemNumber / initial cacheIndex +// r6 (volatile) -> arg3, uint32_t endBlock +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r4 (volatile) -> arg1, pointer to dataset (uint8_t *) +// r5 (volatile) -> arg2, uint32_t startBlock / itemNumber / initial cacheIndex +// r6 (volatile) -> arg3, uint32_t endBlock +// r7-r12 (volatile) -> scratch registers +// r14 (non-volatile) -> saved pointer to dataset (uint8_t *) +// r15 (non-volatile) -> saved itemNumber +// r16-r31 (non-volatile) -> unused + +randomx_ppc64_dataset_init: + // JIT compiler MUST emit immediate load to r2 before this code + + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -48(%r1) + std %r14, 32(%r1) + std %r15, 40(%r1) + + // Load cache->memory pointer + ld %r3, 0(%r3) + + // Save the dataset pointer (r4) to r14 + mr %r14, %r4 + + // Save 
the itemNumber (r5) to r15 + mr %r15, %r5 + + // Loop setup + // for (size_t itemNumber = startBlock; itemNumber < endBlock; itemNumber++) { ... } + sub %r8, %r6, %r5 + mtctr %r8 + +1: + // r5 gets clobbered by the item-hashing function, so we need to restore it + // from r15 before calling the function again. + mr %r5, %r15 + +randomx_ppc64_dataset_init_fix_call: + // JIT compiler MUST patch this to bl to the item hashing function + b 0 + + // Store the 64 computed bytes back in the dataset + STORE_LE_GPR %r4, 8*0, %r14 + STORE_LE_GPR %r6, 8*1, %r14 + STORE_LE_GPR %r7, 8*2, %r14 + STORE_LE_GPR %r9, 8*3, %r14 + STORE_LE_GPR %r10, 8*4, %r14 + STORE_LE_GPR %r11, 8*5, %r14 + STORE_LE_GPR %r12, 8*6, %r14 + STORE_LE_GPR %r5, 8*7, %r14 + + // Increment the dataset pointer by 64 bytes + addi %r14, %r14, 8*8 + + // Increment the itemNumber by one + addi %r15, %r15, 1 + + // Loop + bdnz 1b + + // Standard function epilogue + ld %r14, 32(%r1) + ld %r15, 40(%r1) + addi %r1, %r1, 48 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_dataset_init_end: + + +// Register allocations: sshash_single_item +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r5 (volatile) -> arg2, uint32_t itemNumber +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r4 (volatile) -> SuperscalarHash r0 +// r5 (volatile) -> cacheIndex, set to SuperscalarHash r7 on return +// r6-r7 (volatile) -> SuperscalarHash r1-r2 +// r8 (volatile) -> scratch register +// r9-r12 (volatile) -> SuperscalarHash r3-r6 +// r14-r21 (non-volatile) -> unused +// r22 (non-volatile) -> SuperscalarHash r7 +// r23 (non-volatile) -> cache line address +// r24-r31 (non-volatile) -> unused + +randomx_ppc64_sshash_single_item_prologue: + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -48(%r1) + std %r22, 32(%r1) + std %r23, 40(%r1) + + // Step 1. Initialize registers + + // r0 = (itemNumber + 1) * 6364136223846793005 + ld %r8, (sshash_constant_0-randomx_ppc64_constants)(%r2) + addi %r0, %r5, 1 + mulld %r4, %r8, %r0 + + // r1 = r0 ^ 9298411001130361340 + ld %r8, (sshash_constant_1-randomx_ppc64_constants)(%r2) + xor %r6, %r4, %r8 + + // r2 = r0 ^ 12065312585734608966 + ld %r8, (sshash_constant_2-randomx_ppc64_constants)(%r2) + xor %r7, %r4, %r8 + + // r3 = r0 ^ 9306329213124626780 + ld %r8, (sshash_constant_3-randomx_ppc64_constants)(%r2) + xor %r9, %r4, %r8 + + // r4 = r0 ^ 5281919268842080866 + ld %r8, (sshash_constant_4-randomx_ppc64_constants)(%r2) + xor %r10, %r4, %r8 + + // r5 = r0 ^ 10536153434571861004 + ld %r8, (sshash_constant_5-randomx_ppc64_constants)(%r2) + xor %r11, %r4, %r8 + + // r6 = r0 ^ 3398623926847679864 + ld %r8, (sshash_constant_6-randomx_ppc64_constants)(%r2) + xor %r12, %r4, %r8 + + // r7 = r0 ^ 9549104520008361294 + ld %r8, (sshash_constant_7-randomx_ppc64_constants)(%r2) + xor %r22, %r4, %r8 + + // Step 2. 
Use r5 (itemNumber) as cacheIndex so it can be used to generate the initial cache line mask + +randomx_ppc64_sshash_single_item_prologue_end: + +randomx_ppc64_sshash_single_item_epilogue: + // Return SuperscalarHash r7 in GPR5 + mr %r5, %r22 + + // Standard function epilogue + ld %r22, 32(%r1) + ld %r23, 40(%r1) + addi %r1, %r1, 48 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_sshash_single_item_epilogue_end: + + +// Step 4. Load a 64-byte item from the Cache. The item index is given by cacheIndex modulo the total number of 64-byte items in Cache. +randomx_ppc64_sshash_cache_prefetch: + // Actual mask MUST be inserted by JIT compiler + rldic %r8, %r5, 0, 63 + add %r23, %r3, %r8 + dcbt 0, %r23, 0 + // If TH=0b00000, the dcbt/dcbtst instruction provides a + // hint that the program will probably soon access the + // block containing the byte addressed by EA. + +randomx_ppc64_sshash_cache_prefetch_end: + +// Step 6. XOR all registers with data loaded from randomx cache +randomx_ppc64_sshash_xor: + LOAD_LE_GPR %r8, 0, %r23 + LOAD_LE_GPR %r0, 8, %r23 + xor %r4, %r4, %r8 + xor %r6, %r6, %r0 + LOAD_LE_GPR %r8, 16, %r23 + LOAD_LE_GPR %r0, 24, %r23 + xor %r7, %r7, %r8 + xor %r9, %r9, %r0 + LOAD_LE_GPR %r8, 32, %r23 + LOAD_LE_GPR %r0, 40, %r23 + xor %r10, %r10, %r8 + xor %r11, %r11, %r0 + LOAD_LE_GPR %r8, 48, %r23 + LOAD_LE_GPR %r0, 56, %r23 + xor %r12, %r12, %r8 + xor %r22, %r22, %r0 + +randomx_ppc64_sshash_xor_end: + + +// Register allocations: vm +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to RegisterFile +// r4 (volatile) -> arg1, pointer to MemoryRegisters +// r5 (volatile) -> arg2, pointer to scratchpad (uint8_t *) +// r6 (volatile) -> arg3, uint64_t loop iterations +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0 for SuperScalarHash (dataset pointer) +// r4 (volatile) -> scratch register +// r5 (volatile) -> arg2 for SuperScalarHash (loop iteration) +// r6-r12 (volatile) -> scratch registers +// r14-r21 (non-volatile) -> RandomX integer registers r0-r7 +// r22 (non-volatile) -> dataset pointer (memory) +// r24 (non-volatile) -> ma +// r25 (non-volatile) -> mx +// r26 (non-volatile) -> spAddr0 +// r27 (non-volatile) -> spAddr1 +// r28 (non-volatile) -> Saved pointer to RegisterFile +// r29 (non-volatile) -> Saved pointer to MemoryRegisters +// r30 (non-volatile) -> Saved pointer to scratchpad (uint8_t *) +// r31 (non-volatile) -> unused +// vsr0–vsr13 (volatile) -> scratch registers +// vsr14–vsr31 (non-volatile) -> unused +// v0-v3 / vs32-vs35 (volatile) -> RandomX floating point registers f0-f3 +// v4-v7 / vs36-vs39 (volatile) -> RandomX floating point registers e0-e3 +// v8-v11 / vs40-vs43 (volatile) -> RandomX floating point registers a0-a3 +// v12-v14 / vs44-vs46 (volatile) -> scratch registers +// v15 / vs47 (volatile) -> constant_vector_be_byte_reverse_mask +// v16 / vs48 (volatile) -> constant_vector_be_permutation_mask +// v17 / vs49 (volatile) -> constant_vector_group_e_and_mask +// v18 / vs50 (volatile) -> constant_vector_fscal_xor_mask +// v19 / vs51 (volatile) -> literal_vector_group_e_or_mask +// v20-v31 / vs52-vs63 (non-volatile) -> unused + +randomx_ppc64_vm_prologue: + // JIT 
compiler MUST emit immediate load to r2 before this code + + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -176(%r1) + std %r14, 32(%r1) + std %r15, 40(%r1) + std %r16, 48(%r1) + std %r17, 56(%r1) + std %r18, 64(%r1) + std %r19, 72(%r1) + std %r20, 80(%r1) + std %r21, 88(%r1) + std %r22, 96(%r1) + std %r23, 104(%r1) + std %r24, 112(%r1) + std %r25, 120(%r1) + std %r26, 128(%r1) + std %r27, 136(%r1) + std %r28, 144(%r1) + std %r29, 152(%r1) + std %r30, 160(%r1) + //std %r31, 168(%r1) + + // Save arguments + mr %r28, %r3 + mr %r29, %r4 + mr %r30, %r5 + + // Move the loop iterations into the counter + mtctr %r6 + + // Load the vector constants/literals + li %r8, constant_vector_group_e_and_mask-randomx_ppc64_constants + li %r9, constant_vector_fscal_xor_mask-randomx_ppc64_constants + li %r10, literal_vector_group_e_or_mask-randomx_ppc64_constants +#if PPC_BIG_ENDIAN + li %r11, constant_vector_be_permutation_mask-randomx_ppc64_constants +#endif + li %r12, constant_vector_be_byte_reverse_mask-randomx_ppc64_constants + lxvd2x %vs49, %r8, %r2 + lxvd2x %vs50, %r9, %r2 + lxvd2x %vs51, %r10, %r2 +#if PPC_BIG_ENDIAN + lxvd2x %vs48, %r11, %r2 // Load the BE permutation mask (not needed for LE) +#endif + lxvd2x %vs47, %r12, %r2 + + // Zero the RandomX integer registers + li %r14, 0 + li %r15, 0 + li %r16, 0 + li %r17, 0 + li %r18, 0 + li %r19, 0 + li %r20, 0 + li %r21, 0 + + // Load MemoryRegisters (r29) + lwz %r25, 0(%r29) // mx + lwz %r24, 4(%r29) // ma + ld %r22, 8(%r29) // memory (dataset pointer) + + // Load a0-a3 from RegisterFile (we have to swap doubles because lxvd2x always loads them in big-endian word order) + .equ registers_a_base, 8*8+16*4+16*4 + addi %r8, %r28, registers_a_base + 16*0 + addi %r9, %r28, registers_a_base + 16*1 + addi %r10, %r28, registers_a_base + 16*2 + addi %r11, %r28, registers_a_base + 16*3 + lxvd2x %vs40, 0, %r8 + lxvd2x %vs41, 0, %r9 + lxvd2x %vs42, 0, %r10 + lxvd2x %vs43, 0, %r11 + xxswapd %vs40, %vs40 + xxswapd %vs41, %vs41 + xxswapd %vs42, %vs42 + xxswapd %vs43, %vs43 + + // Instructions to mask mx and ma with Scratchpad L3 mask and set the + // initial values of spAddr0 and spAddr1 are appended here by the JIT + +randomx_ppc64_vm_prologue_end: + +randomx_ppc64_vm_epilogue: + // Loop + bdz 1f +randomx_ppc64_vm_fix_loop: + // JIT compiler MUST patch this to b to vm_loop_prologue + b 0 +1: + + // Store RandomX registers back into register file (no endian swaps needed) + std %r14, 8*0(%r28) + std %r15, 8*1(%r28) + std %r16, 8*2(%r28) + std %r17, 8*3(%r28) + std %r18, 8*4(%r28) + std %r19, 8*5(%r28) + std %r20, 8*6(%r28) + std %r21, 8*7(%r28) + + .equ registers_f_base, 8*8 + addi %r8, %r28, registers_f_base + 16*0 + addi %r9, %r28, registers_f_base + 16*1 + addi %r10, %r28, registers_f_base + 16*2 + addi %r11, %r28, registers_f_base + 16*3 + xxswapd %vs0, %vs32 + xxswapd %vs1, %vs33 + xxswapd %vs2, %vs34 + xxswapd %vs3, %vs35 + stxvd2x %vs0, 0, %r8 + stxvd2x %vs1, 0, %r9 + stxvd2x %vs2, 0, %r10 + stxvd2x %vs3, 0, %r11 + + .equ registers_e_base, 8*8+16*4 + addi %r8, %r28, registers_e_base + 16*0 + addi %r9, %r28, registers_e_base + 16*1 + addi %r10, %r28, registers_e_base + 16*2 + addi %r11, %r28, registers_e_base + 16*3 + xxswapd %vs4, %vs36 + xxswapd %vs5, %vs37 + xxswapd %vs6, %vs38 + xxswapd %vs7, %vs39 + stxvd2x %vs4, 0, %r8 + stxvd2x %vs5, 0, %r9 + stxvd2x %vs6, 0, %r10 + stxvd2x %vs7, 0, %r11 + + // Standard function epilogue + ld %r14, 32(%r1) + ld %r15, 40(%r1) + ld %r16, 48(%r1) + ld %r17, 56(%r1) + ld %r18, 64(%r1) + ld %r19, 
72(%r1) + ld %r20, 80(%r1) + ld %r21, 88(%r1) + ld %r22, 96(%r1) + ld %r23, 104(%r1) + ld %r24, 112(%r1) + ld %r25, 120(%r1) + ld %r26, 128(%r1) + ld %r27, 136(%r1) + ld %r28, 144(%r1) + ld %r29, 152(%r1) + ld %r30, 160(%r1) + //ld %r31, 168(%r1) + addi %r1, %r1,176 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_vm_epilogue_end: + +randomx_ppc64_vm_loop_prologue: + // Main loop start + + // Load scratchpad data, mix registers, etc. + LOAD_LE_GPR %r8, 0, %r26 + LOAD_LE_GPR %r9, 8, %r26 + xor %r14, %r14, %r8 + xor %r15, %r15, %r9 + LOAD_LE_GPR %r8, 16, %r26 + LOAD_LE_GPR %r9, 24, %r26 + xor %r16, %r16, %r8 + xor %r17, %r17, %r9 + LOAD_LE_GPR %r8, 32, %r26 + LOAD_LE_GPR %r9, 40, %r26 + xor %r18, %r18, %r8 + xor %r19, %r19, %r9 + LOAD_LE_GPR %r8, 48, %r26 + LOAD_LE_GPR %r9, 56, %r26 + xor %r20, %r20, %r8 + xor %r21, %r21, %r9 + + // Load F registers (v0-v3 / vs32-vs35) from spAddr1 (r27) + //addi %r8, %r27, 8*0 + addi %r9, %r27, 8*1 + addi %r10, %r27, 8*2 + addi %r11, %r27, 8*3 + lxsdx %vs32, 0, %r27 // Use base address directly to avoid an `addi` + lxsdx %vs33, 0, %r9 + lxsdx %vs34, 0, %r10 + lxsdx %vs35, 0, %r11 + SHUFFLE_VR 0 + SHUFFLE_VR 1 + SHUFFLE_VR 2 + SHUFFLE_VR 3 + xvcvsxwdp %vs32, %vs32 + xvcvsxwdp %vs33, %vs33 + xvcvsxwdp %vs34, %vs34 + xvcvsxwdp %vs35, %vs35 + + // Load E registers (v4-v7 / vs36-vs39) from spAddr1 (r27) and fixup + addi %r8, %r27, 8*4 + addi %r9, %r27, 8*5 + addi %r10, %r27, 8*6 + addi %r11, %r27, 8*7 + lxsdx %vs36, 0, %r8 + lxsdx %vs37, 0, %r9 + lxsdx %vs38, 0, %r10 + lxsdx %vs39, 0, %r11 + SHUFFLE_VR 4 + SHUFFLE_VR 5 + SHUFFLE_VR 6 + SHUFFLE_VR 7 + xvcvsxwdp %vs36, %vs36 + xvcvsxwdp %vs37, %vs37 + xvcvsxwdp %vs38, %vs38 + xvcvsxwdp %vs39, %vs39 + xxland %vs36, %vs36, %vs49 + xxland %vs37, %vs37, %vs49 + xxland %vs38, %vs38, %vs49 + xxland %vs39, %vs39, %vs49 + xxlor %vs36, %vs36, %vs51 + xxlor %vs37, %vs37, %vs51 + xxlor %vs38, %vs38, %vs51 + xxlor %vs39, %vs39, %vs51 + +randomx_ppc64_vm_loop_prologue_end: + +randomx_ppc64_vm_data_read: + // Read dataset logic + + // Calculate prefetch address (JIT compiler MUST patch) + .long 0 // Placeholder for: rlwinm %r8, %mpReg, 0, mask_begin, mask_end + add %r8, %r8, %r22 // r22 holds dataset base pointer + + // Prefetch + dcbt 0, %r8, 0 + + // Calculate read address (JIT compiler MUST patch) + .long 0 // Placeholder for: rlwinm %r8, %mtReg, 0, mask_begin, mask_end + add %r8, %r8, %r22 + + // Read 64 bytes and XOR with integer registers + LOAD_LE_GPR %r9, 0, %r8 + LOAD_LE_GPR %r10, 8, %r8 + xor %r14, %r14, %r9 + xor %r15, %r15, %r10 + LOAD_LE_GPR %r9, 16, %r8 + LOAD_LE_GPR %r10, 24, %r8 + xor %r16, %r16, %r9 + xor %r17, %r17, %r10 + LOAD_LE_GPR %r9, 32, %r8 + LOAD_LE_GPR %r10, 40, %r8 + xor %r18, %r18, %r9 + xor %r19, %r19, %r10 + LOAD_LE_GPR %r9, 48, %r8 + LOAD_LE_GPR %r10, 56, %r8 + xor %r20, %r20, %r9 + xor %r21, %r21, %r10 + + // Swap mx and ma + mr %r8, %r25 + mr %r25, %r24 + mr %r24, %r8 + +randomx_ppc64_vm_data_read_end: + +randomx_ppc64_vm_data_read_light: + // Light mode read dataset logic + // (Similar to data_read but uses sshash_single_item) + + // Copy dataset pointer argument for sshash_single_item + mr %r3, %r22 + +randomx_ppc64_vm_data_read_light_fix_call: + // JIT compiler MUST patch this to bl to sshash_single_item + b 0 + + // XOR the result from sshash_single_item with the VM registers + xor %r14, %r14, %r4 + xor %r15, %r15, %r6 + xor %r16, %r16, %r7 + xor %r17, %r17, %r9 + xor %r18, %r18, %r10 + xor %r19, %r19, %r11 + xor %r20, %r20, %r12 + xor %r21, %r21, %r5 + + // Swap mx and ma + mr %r8, %r25 
+ mr %r25, %r24 + mr %r24, %r8 + +randomx_ppc64_vm_data_read_light_end: + +randomx_ppc64_vm_spad_store: + // Store to scratchpad at spAddr1 + STORE_LE_GPR %r14, 8*0, %r27 + STORE_LE_GPR %r15, 8*1, %r27 + STORE_LE_GPR %r16, 8*2, %r27 + STORE_LE_GPR %r17, 8*3, %r27 + STORE_LE_GPR %r18, 8*4, %r27 + STORE_LE_GPR %r19, 8*5, %r27 + STORE_LE_GPR %r20, 8*6, %r27 + STORE_LE_GPR %r21, 8*7, %r27 + + // Mix F and E registers (f0-f3 are vs32-vs35, e0-e3 are vs36-vs39) + xxlxor %vs32, %vs32, %vs36 + xxlxor %vs33, %vs33, %vs37 + xxlxor %vs34, %vs34, %vs38 + xxlxor %vs35, %vs35, %vs39 + + // Store F registers to scratchpad at spAddr0 + li %r8, 16*0 + li %r9, 16*1 + li %r10, 16*2 + li %r11, 16*3 + STORE_LE_VR 0, 12, %r8, %r26 + STORE_LE_VR 1, 13, %r9, %r26 + STORE_LE_VR 2, 14, %r10, %r26 + STORE_LE_VR 3, 12, %r11, %r26 + +randomx_ppc64_vm_spad_store_end: + +randomx_ppc64_vm_spad_store_hard_aes: + // Store to scratchpad at spAddr1 + STORE_LE_GPR %r14, 8*0, %r27 + STORE_LE_GPR %r15, 8*1, %r27 + STORE_LE_GPR %r16, 8*2, %r27 + STORE_LE_GPR %r17, 8*3, %r27 + STORE_LE_GPR %r18, 8*4, %r27 + STORE_LE_GPR %r19, 8*5, %r27 + STORE_LE_GPR %r20, 8*6, %r27 + STORE_LE_GPR %r21, 8*7, %r27 + + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + + // Byte-reverse f0-f3 and e0-e3 + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 + + // We need a zero vector to bypass vncipher's internal key XOR + vxor %v12, %v12, %v12 + + vcipher %v0, %v0, %v4 + vncipher %v1, %v1, %v12 // Pass 0 as the key + vcipher %v2, %v2, %v4 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v4 // XOR the actual key afterwards + vxor %v3, %v3, %v4 + + vcipher %v0, %v0, %v5 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v5 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v5 + vxor %v3, %v3, %v5 + + vcipher %v0, %v0, %v6 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v6 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v6 + vxor %v3, %v3, %v6 + + vcipher %v0, %v0, %v7 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v7 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v7 + vxor %v3, %v3, %v7 + + // Byte-reverse f0-f3 and e0-e3 + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 + + // Store F registers to scratchpad at spAddr0 + li %r8, 16*0 + li %r9, 16*1 + li %r10, 16*2 + li %r11, 16*3 + STORE_LE_VR 0, 12, %r8, %r26 + STORE_LE_VR 1, 13, %r9, %r26 + STORE_LE_VR 2, 14, %r10, %r26 + STORE_LE_VR 3, 12, %r11, %r26 + +randomx_ppc64_vm_spad_store_hard_aes_end: + + + .section ".text" +randomx_reciprocal_fast: + cntlzd %r4, %r3 // r4 = 63 - k (count leading zeros) + li %r5, 1 // r5 = 1 + subfic %r4, %r4, 63 // r4 = 63 - (63 - k) = k + sld %r4, %r5, %r4 // r4 = 1 << k (this is the upper 64 bits of the dividend) + divdeu %r3, %r4, %r3 // r3 = (r4 || 0x0000000000000000) / divisor + blr diff --git a/src/jit_compiler_ppc64_static.hpp b/src/jit_compiler_ppc64_static.hpp new file mode 100644 index 00000000..7fe7afd6 --- /dev/null +++ b/src/jit_compiler_ppc64_static.hpp @@ -0,0 +1,65 @@ +/* +Copyright (c) 2026, Forest Crossman + +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_ppc64_constants(); + void randomx_ppc64_constant_lut_fprc_to_fpscr(); + void randomx_ppc64_constants_end(); + + void randomx_ppc64_dataset_init(); + void randomx_ppc64_dataset_init_fix_call(); + void randomx_ppc64_dataset_init_end(); + + void randomx_ppc64_sshash_single_item_prologue(); + void randomx_ppc64_sshash_single_item_prologue_end(); + void randomx_ppc64_sshash_single_item_epilogue(); + void randomx_ppc64_sshash_single_item_epilogue_end(); + void randomx_ppc64_sshash_cache_prefetch(); + void randomx_ppc64_sshash_cache_prefetch_end(); + void randomx_ppc64_sshash_xor(); + void randomx_ppc64_sshash_xor_end(); + + void randomx_ppc64_vm_prologue(); + void randomx_ppc64_vm_prologue_end(); + void randomx_ppc64_vm_epilogue(); + void randomx_ppc64_vm_fix_loop(); + void randomx_ppc64_vm_epilogue_end(); + void randomx_ppc64_vm_loop_prologue(); + void randomx_ppc64_vm_loop_prologue_end(); + void randomx_ppc64_vm_data_read(); + void randomx_ppc64_vm_data_read_end(); + void randomx_ppc64_vm_data_read_light(); + void randomx_ppc64_vm_data_read_light_fix_call(); + void randomx_ppc64_vm_data_read_light_end(); + void randomx_ppc64_vm_spad_store(); + void randomx_ppc64_vm_spad_store_end(); + void randomx_ppc64_vm_spad_store_hard_aes(); + void randomx_ppc64_vm_spad_store_hard_aes_end(); +} diff --git a/src/reciprocal.h b/src/reciprocal.h index 90bd9b6b..57f3985f 100644 --- a/src/reciprocal.h +++ b/src/reciprocal.h @@ -30,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #include <stdint.h>
 
-#if defined(_M_X64) || defined(__x86_64__)
+#if defined(_M_X64) || defined(__x86_64__) || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__))
 #define RANDOMX_HAVE_FAST_RECIPROCAL 1
 #else
 #define RANDOMX_HAVE_FAST_RECIPROCAL 0

From 65ba514598ac6ec6846a74cc7bc1a5a4311f1a49 Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Sun, 5 Apr 2026 10:40:07 -0500
Subject: [PATCH 02/50] Correct comment for STORE_LE_VR

---
 src/jit_compiler_ppc64_static.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S
index 26ecef7c..23bf0177 100644
--- a/src/jit_compiler_ppc64_static.S
+++ b/src/jit_compiler_ppc64_static.S
@@ -76,7 +76,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	.global randomx_reciprocal_fast
 
 // Macro to store a VR containing a RandomX Group F/E/A register to memory
-// Clobbers: v12 / vs44
 .macro STORE_LE_VR vr_src, vr_temp, offset_reg, base_reg
 #if PPC_BIG_ENDIAN
 	vperm \vr_temp, \vr_src, \vr_src, %v15	// Reverse the bytes so they're arranged as [ 0123 4567 ]

From 05ff7ddd8ef102d8b2ad8e7f5bab09688e83acf3 Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Sun, 5 Apr 2026 10:45:49 -0500
Subject: [PATCH 03/50] Optimize STORE_LE_VR on little-endian POWER

The vector permutation is unnecessary on little-endian systems when
using `stvx`.

---
 src/jit_compiler_ppc64_static.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S
index 23bf0177..a4b4da20 100644
--- a/src/jit_compiler_ppc64_static.S
+++ b/src/jit_compiler_ppc64_static.S
@@ -78,11 +78,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // Macro to store a VR containing a RandomX Group F/E/A register to memory
 .macro STORE_LE_VR vr_src, vr_temp, offset_reg, base_reg
 #if PPC_BIG_ENDIAN
-	vperm \vr_temp, \vr_src, \vr_src, %v15	// Reverse the bytes so they're arranged as [ 0123 4567 ]
+	vperm \vr_temp, \vr_src, \vr_src, %v15	// Reverse the bytes so they're arranged as [ 0123 4567 ]
+	stvx \vr_temp, \offset_reg, \base_reg	// Store the two doubles to memory
 #else
-	xxswapd \vr_temp + 32, \vr_src + 32	// Swap the doubles so they're arranged as [ 3210 7654 ]
+	stvx \vr_src, \offset_reg, \base_reg	// Store the two doubles to memory
 #endif
-	stxvd2x \vr_temp + 32, \offset_reg, \base_reg	// Store the two doubles to memory
 .endm
 
 // Macro to shuffle a VR after being loaded with lxsdx.
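For context on the RANDOMX_HAVE_FAST_RECIPROCAL gate enabled earlier in this
series: randomx_ppc64_static.S's randomx_reciprocal_fast divides the 128-bit
value (1 << k) || 64 zero bits by the divisor in a single divdeu, where k is
the position of the divisor's highest set bit. A rough C sketch of the same
computation follows; the function name and the use of unsigned __int128 are
illustrative assumptions, not code from this series.

    #include <stdint.h>

    /* Sketch only: assumes GCC/Clang unsigned __int128 and a divisor that is
     * neither 0 nor a power of 2, as randomx_reciprocal requires. */
    static uint64_t reciprocal_sketch(uint64_t divisor) {
        int k = 63 - __builtin_clzll(divisor);                         /* cntlzd + subfic */
        unsigned __int128 dividend = (unsigned __int128)1 << (64 + k); /* li + sld */
        return (uint64_t)(dividend / divisor);                         /* divdeu */
    }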
From dcc710a1c96a4770f6194aa897f6b8b2309fbaec Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sun, 5 Apr 2026 11:14:40 -0500 Subject: [PATCH 04/50] Make it clear that f0-f31 are aliased by vs0-vs31 --- src/jit_compiler_ppc64_static.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index a4b4da20..1356650a 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -405,8 +405,8 @@ randomx_ppc64_sshash_xor_end: // r29 (non-volatile) -> Saved pointer to MemoryRegisters // r30 (non-volatile) -> Saved pointer to scratchpad (uint8_t *) // r31 (non-volatile) -> unused -// vsr0–vsr13 (volatile) -> scratch registers -// vsr14–vsr31 (non-volatile) -> unused +// f0-f13 / vs0–vs13 (volatile) -> scratch registers +// f14-f31 / vs14–vs31 (non-volatile) -> unused // v0-v3 / vs32-vs35 (volatile) -> RandomX floating point registers f0-f3 // v4-v7 / vs36-vs39 (volatile) -> RandomX floating point registers e0-e3 // v8-v11 / vs40-vs43 (volatile) -> RandomX floating point registers a0-a3 From 279514b59b86d8a840d7661449b81a20a04fc683 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sun, 5 Apr 2026 11:51:04 -0500 Subject: [PATCH 05/50] Mark r23 as unused --- src/jit_compiler_ppc64_static.S | 1 + 1 file changed, 1 insertion(+) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 1356650a..46d20f03 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -397,6 +397,7 @@ randomx_ppc64_sshash_xor_end: // r6-r12 (volatile) -> scratch registers // r14-r21 (non-volatile) -> RandomX integer registers r0-r7 // r22 (non-volatile) -> dataset pointer (memory) +// r23 (non-volatile) -> unused // r24 (non-volatile) -> ma // r25 (non-volatile) -> mx // r26 (non-volatile) -> spAddr0 From 712da1d3e569de478b4ba5390d6a57f7b18e7654 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sun, 5 Apr 2026 14:15:46 -0500 Subject: [PATCH 06/50] Optimize CBRANCH This only saves one or two instructions, but there are no drawbacks to how this optimization is implemented so there's no reason not to do it. 
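As a sanity check, the old and new sequences test the same condition. A
minimal C sketch of the equivalence, assuming ConditionMask equals
(1 << RANDOMX_JUMP_BITS) - 1 as the new mask-begin calculation implies
(the jumpBits value below is illustrative):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const unsigned jumpBits = 8; /* illustrative RANDOMX_JUMP_BITS */
        const uint64_t conditionMask = (1ULL << jumpBits) - 1;
        const uint64_t dst = 0x0123456789ABCDEFULL;
        for (unsigned shift = 0; shift <= 64 - jumpBits; shift++) {
            /* Old sequence: emitMovImm64 of the shifted mask, then and. */
            int old_eq = (dst & (conditionMask << shift)) == 0;
            /* New sequence: rldicl. rotates right by shift (SH = 64 - shift)
             * and keeps only the low jumpBits bits (MB = 64 - jumpBits);
             * CR0.EQ comes out the same. */
            uint64_t rotated = (dst >> shift) | (shift ? dst << (64 - shift) : 0);
            int new_eq = (rotated & conditionMask) == 0;
            assert(old_eq == new_eq);
        }
        return 0;
    }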
--- src/jit_compiler_ppc64.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 75a0e05f..e91df37a 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -252,6 +252,7 @@ namespace PPC64 { static inline uint32_t rlwinm(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb, uint32_t me) { return M_form(21, rs, ra, sh, mb, me, 0); } static inline uint32_t rldicl(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 0, 0); } + static inline uint32_t rldicl_dot(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 0, 1); } static inline uint32_t rldicr(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t me) { return MD_form(30, rs, ra, sh, me, 1, 0); } static inline uint32_t rldic(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 2, 0); } static inline uint32_t rldcl(uint32_t ra, uint32_t rs, uint32_t rb, uint32_t mb) { return MDS_form(30, rs, ra, rb, mb, 8, 0); } @@ -494,7 +495,7 @@ namespace randomx { constexpr size_t CodeAlign = 64*1024; // 64 kB, to ensure alignment on systems with a page size <= 64 kB static const size_t ConstantPoolSize = alignSize(sizeConstants + 16, CodeAlign); // Add 16 bytes for the Group E OR vector mask static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStore + sizeVmSpadStoreHardAes, CodeAlign); - constexpr size_t MaxRandomXInstrCodeSize = 4*11; // CBRANCH requires at most 11 instructions + constexpr size_t MaxRandomXInstrCodeSize = 4*10; // FDIV_M requires at most 10 instructions constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 instructions static const size_t SuperscalarProgramHeaders = sizeSshashSingleItemPrologue + sizeSshashSingleItemEpilogue; @@ -1183,9 +1184,11 @@ namespace randomx { int dst = RegisterMapR.getPpcGprNum(reg); emitAddImm32(state, dst, dst, imm); - uint64_t mask = (uint64_t)ConditionMask << shift; - emitMovImm64(state, 8, mask); - state.emit(PPC64::and_dot(8, dst, 8)); + // Calculate the Mask Begin (MB) parameter + uint32_t mb = 64 - RANDOMX_JUMP_BITS; + + // rldicl. r8, dst, 64 - shift, mb + state.emit(PPC64::rldicl_dot(8, dst, (64 - shift) & 63, mb)); int32_t targetPos = state.instructionOffsets[target]; int offset = targetPos - state.codePos; From 09ddaf3b74d648aee2bd9ed2977403d638387111 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sun, 5 Apr 2026 23:25:13 -0500 Subject: [PATCH 07/50] Add a comment on the importance of using dcbt to prefetch the next block --- src/jit_compiler_ppc64_static.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 46d20f03..2f0911b6 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -650,7 +650,10 @@ randomx_ppc64_vm_data_read: .long 0 // Placeholder for: rlwinm %r8, %mpReg, 0, mask_begin, mask_end add %r8, %r8, %r22 // r22 holds dataset base pointer - // Prefetch + // Prefetch the next block with dcbt. This is extremely important--without this + // we lose >20% performance in V1 and >16% in V2. + // Setting TH=0b10000 (dcbtt 0, %r8) didn't make any measurable difference in + // performance. 
dcbt 0, %r8, 0 // Calculate read address (JIT compiler MUST patch) From c103a1ba11ce90e5b30973878d549163d053028b Mon Sep 17 00:00:00 2001 From: cyrozap Date: Mon, 6 Apr 2026 10:08:57 -0500 Subject: [PATCH 08/50] Optimize scratchpad address calculation in program suffix This only saves one or two instructions in a very cold path in the code, but there are no drawbacks to implementing this optimization so there's no reason not to do it. --- src/jit_compiler_ppc64.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index e91df37a..f2a34f5b 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -736,13 +736,11 @@ namespace randomx { // spAddr0 (r26) = r26 & 0xFFFFFFFF state.emit(PPC64::rldicl(SpAddr0GPR26, SpAddr0GPR26, 0, 32)); - // Load Scratchpad L3 mask into r8 - uint32_t l3Mask = (RANDOMX_SCRATCHPAD_L3 - 1) & ~63; - emitMovImm32(state, 8, l3Mask); - - // Apply mask - state.emit(PPC64::and_(SpAddr0GPR26, SpAddr0GPR26, 8)); - state.emit(PPC64::and_(SpAddr1GPR27, SpAddr1GPR27, 8)); + // Apply Scratchpad L3 mask + uint32_t mb = 32 - Log2(RANDOMX_SCRATCHPAD_L3); + uint32_t me = 31 - Log2(RANDOMX_DATASET_ITEM_SIZE); + state.emit(PPC64::rlwinm(SpAddr0GPR26, SpAddr0GPR26, 0, mb, me)); + state.emit(PPC64::rlwinm(SpAddr1GPR27, SpAddr1GPR27, 0, mb, me)); // Add scratchpad base pointer (r30) state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30)); From 2241c3680bba4458ccb0c13d30ac8aec15bdfdc6 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Mon, 6 Apr 2026 10:41:19 -0500 Subject: [PATCH 09/50] Optimize emitMovImm64 for rotated 32-bit immediates This optimization can save one or two instructions for some immediates. --- src/jit_compiler_ppc64.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index f2a34f5b..e602ed56 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -632,6 +632,16 @@ namespace randomx { return; } + // Values that can be generated by loading a <=32-bit immediate and rotating it + for (int i = 1; i < 64; ++i) { + uint64_t rot = (imm << i) | (imm >> (64 - i)); + if (rot == (uint64_t)(int64_t)(int32_t)rot) { + emitMovImm32(state, reg, (uint32_t)rot); + state.emit(PPC64::rotldi(reg, reg, 64 - i)); + return; + } + } + // All other values uint32_t high = imm >> 32; uint32_t low = imm & 0xFFFFFFFF; From cf9ca50e3766e8ee7b0ec7b5865ed1300c12a67c Mon Sep 17 00:00:00 2001 From: cyrozap Date: Mon, 6 Apr 2026 15:35:07 -0500 Subject: [PATCH 10/50] Use cpu.hasAes() instead of getauxval on PPC64 We already query the CPU feature support in cpu.cpp, so there's no need to do it again. --- src/jit_compiler_ppc64.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index e602ed56..46b10a4c 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -30,9 +30,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #include
 #include
-#include <sys/auxv.h>
-#include <asm/cputable.h>
 
 #include "cpu.hpp"
 #include "program.hpp"
 #include "reciprocal.h"
@@ -727,8 +724,7 @@ namespace randomx {
 	void JitCompilerPPC64::emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags) {
 		if (flags & RANDOMX_FLAG_V2) {
 			if (true || (flags & RANDOMX_FLAG_HARD_AES)) { // TODO: Remove the "true" once software AES is working
-				unsigned long hwcaps2 = getauxval(AT_HWCAP2);
-				if (!(hwcaps2 & PPC_FEATURE2_VEC_CRYPTO)) {
+				if (!randomx::cpu.hasAes()) {
 					throw std::runtime_error("This CPU is missing support for hardware AES!");
 				}
 				state.emit(codeVmSpadStoreHardAes, sizeVmSpadStoreHardAes);

From 65950a89195977117d7a1e8880a81acbe0173e82 Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Mon, 6 Apr 2026 16:20:06 -0500
Subject: [PATCH 11/50] Set default PPC64 CPU based on whether the system is BE or LE

This is the same split in Debian--the ppc64el port is only supported on
POWER8 and later, so POWER7 and earlier can only run Debian ppc64
(big-endian 64-bit PowerPC). Because of this, we set the default
little-endian architecture to POWER8. And since the RandomX JIT backend
for PPC64 requires VSX, which is only supported by POWER7 and later, the
lowest we can set the default big-endian architecture to is POWER7.

---
 CMakeLists.txt | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4768b1af..d3986e16 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -149,10 +149,16 @@ if(ARCH_ID STREQUAL "ppc64" OR ARCH_ID STREQUAL "ppc64le")
 
   set_property(SOURCE src/jit_compiler_ppc64_static.S PROPERTY LANGUAGE C)
 
-  if(ARCH STREQUAL "native")
-    add_flag("-mcpu=native")
+  if(ARCH STREQUAL "default")
+    if(ARCH_ID STREQUAL "ppc64le")
+      # Little-endian defaults to POWER8
+      add_flag("-mcpu=power8")
+    else()
+      # Big-endian defaults to POWER7
+      add_flag("-mcpu=power7")
+    endif()
   else()
-    add_flag("-mcpu=power8")
+    add_flag("-mcpu=${ARCH}")
   endif()
   # PowerPC AES requires ALTIVEC (POWER7+), so it cannot be enabled in the default build
 endif()

From bb57fcda3d0098ac0cb5f0b9756ef66de056f784 Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Tue, 7 Apr 2026 00:20:37 -0500
Subject: [PATCH 12/50] Enable compatibility with PPC64 ELF ABI V1

Most big-endian Power systems use V1 of the Power ELF ABI, so in order
for this code to run on those systems we need to make a few changes.

- We increase the size of the stack frames by 80 bytes to avoid
  overwriting any of the information callers will write there. These
  larger stack frames are compatible with both ABI V1 and V2.
- We add a macro to declare a C function with metadata that the linker
  is able to use when building for an ABI V1 system.
- We generate function descriptors for JIT-generated functions, since
  pointers to functions in the V1 ABI actually point to function
  descriptors, not the functions themselves.

---
 src/jit_compiler_ppc64.cpp      |  17 +++--
 src/jit_compiler_ppc64.hpp      |  32 +++++++-
 src/jit_compiler_ppc64_static.S | 129 +++++++++++++++++++------------
 3 files changed, 116 insertions(+), 62 deletions(-)

diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp
index 46b10a4c..4f256e3e 100644
--- a/src/jit_compiler_ppc64.cpp
+++ b/src/jit_compiler_ppc64.cpp
@@ -38,12 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "jit_compiler_ppc64.hpp"
 
-#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-	#define PPC_BIG_ENDIAN 1
-#else
-	#define PPC_BIG_ENDIAN 0
-#endif
-
 namespace {
 
 #define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i, randomx_flags flags
 	using InstructionHandler = void(HANDLER_ARGS);
@@ -793,6 +787,17 @@ namespace randomx {
 		// Patch in the call to the SuperScalar Hash single item function
 		state.emitAt(datasetInitFixCallPos, PPC64::bl(SshashSingleItemPos - datasetInitFixCallPos));
 
+#if !PPC_ABI_V2
+		// Initialize the ABI V1 function descriptors
+		descriptorProgram[0] = reinterpret_cast<uint64_t>(entryProgram);
+		descriptorProgram[1] = 0;
+		descriptorProgram[2] = 0;
+
+		descriptorDataInit[0] = reinterpret_cast<uint64_t>(entryDataInit);
+		descriptorDataInit[1] = 0;
+		descriptorDataInit[2] = 0;
+#endif
+
 		clearCache(state);
 	}
 
diff --git a/src/jit_compiler_ppc64.hpp b/src/jit_compiler_ppc64.hpp
index edea12bd..9107cdc4 100644
--- a/src/jit_compiler_ppc64.hpp
+++ b/src/jit_compiler_ppc64.hpp
@@ -36,6 +36,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "jit_compiler_ppc64_static.hpp"
 
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	#define PPC_BIG_ENDIAN 1
+#else
+	#define PPC_BIG_ENDIAN 0
+#endif
+
+#if (defined(_CALL_ELF) && _CALL_ELF == 2) || (!defined(_CALL_ELF) && !PPC_BIG_ENDIAN)
+	#define PPC_ABI_V2 1
+#else
+	#define PPC_ABI_V2 0
+#endif
+
 namespace randomx {
 
 	class Program;
@@ -55,8 +67,20 @@ namespace randomx {
 
 		void generateDatasetInitCode() {}
 
-		ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(entryProgram); }
-		DatasetInitFunc* getDatasetInitFunc() { return reinterpret_cast<DatasetInitFunc*>(entryDataInit); }
+		ProgramFunc* getProgramFunc() {
+#if PPC_ABI_V2
+			return reinterpret_cast<ProgramFunc*>(entryProgram);
+#else
+			return reinterpret_cast<ProgramFunc*>(descriptorProgram);
+#endif
+		}
+		DatasetInitFunc* getDatasetInitFunc() {
+#if PPC_ABI_V2
+			return reinterpret_cast<DatasetInitFunc*>(entryDataInit);
+#else
+			return reinterpret_cast<DatasetInitFunc*>(descriptorDataInit);
+#endif
+		}
 		uint8_t* getCode() { return state.code; }
 		size_t getCodeSize();
 
@@ -77,6 +101,10 @@ namespace randomx {
 		CompilerState state;
 		randomx_flags flags;
 
 		void* entryDataInit = nullptr;
 		void* entryProgram = nullptr;
+#if !PPC_ABI_V2
+		uint64_t descriptorProgram[3];
+		uint64_t descriptorDataInit[3];
+#endif
 
 		int32_t RandomXCodePos;
 		int32_t SshashSingleItemPos;
diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S
index 2f0911b6..81b68aa8 100644
--- a/src/jit_compiler_ppc64_static.S
+++ b/src/jit_compiler_ppc64_static.S
@@ -28,7 +28,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	.machine power7
 	.machine altivec
-	.abiversion 2
 	.section ".rodata"	// Not .text because it's not meant to be executed in-place.
 
 #include "configuration.h"
 
@@ -39,6 +38,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PPC_BIG_ENDIAN 0 #endif +#if (defined(_CALL_ELF) && _CALL_ELF == 2) || (!defined(_CALL_ELF) && !PPC_BIG_ENDIAN) + #define PPC_ABI_V2 1 +#else + #define PPC_ABI_V2 0 +#endif + +#if PPC_ABI_V2 + .abiversion 2 + #define C_FUNCTION(name) \ + .global name; \ + name: +#else + .abiversion 1 + #define C_FUNCTION(name) \ + .section ".opd","aw"; \ + .align 3; \ + .global name; \ + name: \ + .quad .name, .TOC.@tocbase, 0; \ + .previous; \ + .global .name; \ + .name: +#endif + .global randomx_ppc64_constants .global randomx_ppc64_constant_lut_fprc_to_fpscr .global randomx_ppc64_constants_end @@ -73,8 +96,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .global randomx_ppc64_vm_spad_store_hard_aes .global randomx_ppc64_vm_spad_store_hard_aes_end - .global randomx_reciprocal_fast - // Macro to store a VR containing a RandomX Group F/E/A register to memory .macro STORE_LE_VR vr_src, vr_temp, offset_reg, base_reg #if PPC_BIG_ENDIAN @@ -197,9 +218,9 @@ randomx_ppc64_dataset_init: // Standard function prologue mflr %r0 std %r0, 16(%r1) - stdu %r1, -48(%r1) - std %r14, 32(%r1) - std %r15, 40(%r1) + stdu %r1, -128(%r1) + std %r14, 112(%r1) + std %r15, 120(%r1) // Load cache->memory pointer ld %r3, 0(%r3) @@ -244,9 +265,9 @@ randomx_ppc64_dataset_init_fix_call: bdnz 1b // Standard function epilogue - ld %r14, 32(%r1) - ld %r15, 40(%r1) - addi %r1, %r1, 48 + ld %r14, 112(%r1) + ld %r15, 120(%r1) + addi %r1, %r1, 128 ld %r0, 16(%r1) mtlr %r0 blr @@ -283,9 +304,9 @@ randomx_ppc64_sshash_single_item_prologue: // Standard function prologue mflr %r0 std %r0, 16(%r1) - stdu %r1, -48(%r1) - std %r22, 32(%r1) - std %r23, 40(%r1) + stdu %r1, -128(%r1) + std %r22, 112(%r1) + std %r23, 120(%r1) // Step 1. Initialize registers @@ -331,9 +352,9 @@ randomx_ppc64_sshash_single_item_epilogue: mr %r5, %r22 // Standard function epilogue - ld %r22, 32(%r1) - ld %r23, 40(%r1) - addi %r1, %r1, 48 + ld %r22, 112(%r1) + ld %r23, 120(%r1) + addi %r1, %r1, 128 ld %r0, 16(%r1) mtlr %r0 blr @@ -425,25 +446,25 @@ randomx_ppc64_vm_prologue: // Standard function prologue mflr %r0 std %r0, 16(%r1) - stdu %r1, -176(%r1) - std %r14, 32(%r1) - std %r15, 40(%r1) - std %r16, 48(%r1) - std %r17, 56(%r1) - std %r18, 64(%r1) - std %r19, 72(%r1) - std %r20, 80(%r1) - std %r21, 88(%r1) - std %r22, 96(%r1) - std %r23, 104(%r1) - std %r24, 112(%r1) - std %r25, 120(%r1) - std %r26, 128(%r1) - std %r27, 136(%r1) - std %r28, 144(%r1) - std %r29, 152(%r1) - std %r30, 160(%r1) - //std %r31, 168(%r1) + stdu %r1, -256(%r1) + std %r14, 112(%r1) + std %r15, 120(%r1) + std %r16, 128(%r1) + std %r17, 136(%r1) + std %r18, 144(%r1) + std %r19, 152(%r1) + std %r20, 160(%r1) + std %r21, 168(%r1) + std %r22, 176(%r1) + std %r23, 184(%r1) + std %r24, 192(%r1) + std %r25, 200(%r1) + std %r26, 208(%r1) + std %r27, 216(%r1) + std %r28, 224(%r1) + std %r29, 232(%r1) + std %r30, 240(%r1) + //std %r31, 248(%r1) // Save arguments mr %r28, %r3 @@ -551,25 +572,25 @@ randomx_ppc64_vm_fix_loop: stxvd2x %vs7, 0, %r11 // Standard function epilogue - ld %r14, 32(%r1) - ld %r15, 40(%r1) - ld %r16, 48(%r1) - ld %r17, 56(%r1) - ld %r18, 64(%r1) - ld %r19, 72(%r1) - ld %r20, 80(%r1) - ld %r21, 88(%r1) - ld %r22, 96(%r1) - ld %r23, 104(%r1) - ld %r24, 112(%r1) - ld %r25, 120(%r1) - ld %r26, 128(%r1) - ld %r27, 136(%r1) - ld %r28, 144(%r1) - ld %r29, 152(%r1) - ld %r30, 160(%r1) - //ld %r31, 168(%r1) - addi %r1, %r1,176 + ld %r14, 112(%r1) + ld %r15, 120(%r1) + ld %r16, 128(%r1) + ld %r17, 136(%r1) + ld %r18, 144(%r1) + ld %r19, 152(%r1) + ld %r20, 160(%r1) 
+ ld %r21, 168(%r1) + ld %r22, 176(%r1) + ld %r23, 184(%r1) + ld %r24, 192(%r1) + ld %r25, 200(%r1) + ld %r26, 208(%r1) + ld %r27, 216(%r1) + ld %r28, 224(%r1) + ld %r29, 232(%r1) + ld %r30, 240(%r1) + //ld %r31, 248(%r1) + addi %r1, %r1,256 ld %r0, 16(%r1) mtlr %r0 blr @@ -820,7 +841,7 @@ randomx_ppc64_vm_spad_store_hard_aes_end: .section ".text" -randomx_reciprocal_fast: +C_FUNCTION(randomx_reciprocal_fast) cntlzd %r4, %r3 // r4 = 63 - k (count leading zeros) li %r5, 1 // r5 = 1 subfic %r4, %r4, 63 // r4 = 63 - (63 - k) = k From 807bdf5e167cac2276dc0e539a7485b726b305a9 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 7 Apr 2026 14:26:21 -0500 Subject: [PATCH 13/50] Fix BE PPC64 cache and dataset endianness The Argon2 implementation writes to the cache in native endianness, so we need to read it in native endianness. And since nothing that reads the dataset cares about its byte order, we can keep that in native endianness as well despite the spec saying that it should be in little-endian byte order. --- src/jit_compiler_ppc64_static.S | 48 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 81b68aa8..a2ac532b 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -246,14 +246,14 @@ randomx_ppc64_dataset_init_fix_call: b 0 // Store the 64 computed bytes back in the dataset - STORE_LE_GPR %r4, 8*0, %r14 - STORE_LE_GPR %r6, 8*1, %r14 - STORE_LE_GPR %r7, 8*2, %r14 - STORE_LE_GPR %r9, 8*3, %r14 - STORE_LE_GPR %r10, 8*4, %r14 - STORE_LE_GPR %r11, 8*5, %r14 - STORE_LE_GPR %r12, 8*6, %r14 - STORE_LE_GPR %r5, 8*7, %r14 + std %r4, 8*0(%r14) + std %r6, 8*1(%r14) + std %r7, 8*2(%r14) + std %r9, 8*3(%r14) + std %r10, 8*4(%r14) + std %r11, 8*5(%r14) + std %r12, 8*6(%r14) + std %r5, 8*7(%r14) // Increment the dataset pointer by 64 bytes addi %r14, %r14, 8*8 @@ -376,20 +376,20 @@ randomx_ppc64_sshash_cache_prefetch_end: // Step 6. XOR all registers with data loaded from randomx cache randomx_ppc64_sshash_xor: - LOAD_LE_GPR %r8, 0, %r23 - LOAD_LE_GPR %r0, 8, %r23 + ld %r8, 0(%r23) + ld %r0, 8(%r23) xor %r4, %r4, %r8 xor %r6, %r6, %r0 - LOAD_LE_GPR %r8, 16, %r23 - LOAD_LE_GPR %r0, 24, %r23 + ld %r8, 16(%r23) + ld %r0, 24(%r23) xor %r7, %r7, %r8 xor %r9, %r9, %r0 - LOAD_LE_GPR %r8, 32, %r23 - LOAD_LE_GPR %r0, 40, %r23 + ld %r8, 32(%r23) + ld %r0, 40(%r23) xor %r10, %r10, %r8 xor %r11, %r11, %r0 - LOAD_LE_GPR %r8, 48, %r23 - LOAD_LE_GPR %r0, 56, %r23 + ld %r8, 48(%r23) + ld %r0, 56(%r23) xor %r12, %r12, %r8 xor %r22, %r22, %r0 @@ -682,20 +682,20 @@ randomx_ppc64_vm_data_read: add %r8, %r8, %r22 // Read 64 bytes and XOR with integer registers - LOAD_LE_GPR %r9, 0, %r8 - LOAD_LE_GPR %r10, 8, %r8 + ld %r9, 0(%r8) + ld %r10, 8(%r8) xor %r14, %r14, %r9 xor %r15, %r15, %r10 - LOAD_LE_GPR %r9, 16, %r8 - LOAD_LE_GPR %r10, 24, %r8 + ld %r9, 16(%r8) + ld %r10, 24(%r8) xor %r16, %r16, %r9 xor %r17, %r17, %r10 - LOAD_LE_GPR %r9, 32, %r8 - LOAD_LE_GPR %r10, 40, %r8 + ld %r9, 32(%r8) + ld %r10, 40(%r8) xor %r18, %r18, %r9 xor %r19, %r19, %r10 - LOAD_LE_GPR %r9, 48, %r8 - LOAD_LE_GPR %r10, 56, %r8 + ld %r9, 48(%r8) + ld %r10, 56(%r8) xor %r20, %r20, %r9 xor %r21, %r21, %r10 From 205f4a5ac30ace5ea9b7dc93199254f169f13ec9 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 7 Apr 2026 19:20:28 -0500 Subject: [PATCH 14/50] Fix BE PPC64 scratchpad and register endianness The scratchpad and register file must both be read from and written to in little-endian byte order. 
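In C terms, the contract is that every 64-bit slot in the register file
and scratchpad holds little-endian bytes regardless of host endianness.
A hedged sketch of what the STORE_LE_GPR path does (the helper name is
illustrative, not from this tree):

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: store v at p in little-endian byte order on either host. */
    static void store_le_u64(uint8_t *p, uint64_t v) {
    #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        v = __builtin_bswap64(v); /* what stdbrx does in hardware */
    #endif
        memcpy(p, &v, sizeof v);  /* a plain std suffices on little-endian */
    }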
--- src/jit_compiler_ppc64.cpp | 17 +++++-- src/jit_compiler_ppc64_static.S | 83 ++++++++++++++++----------------- 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 4f256e3e..86fc976a 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -261,6 +261,7 @@ namespace PPC64 { static inline uint32_t ldx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 21, 0); } static inline uint32_t ldbrx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 532, 0); } static inline uint32_t stdx(uint32_t rs, uint32_t ra, uint32_t rb) { return X_form(31, rs, ra, rb, 149, 0); } + static inline uint32_t stdbrx(uint32_t rs, uint32_t ra, uint32_t rb) { return X_form(31, rs, ra, rb, 660, 0); } static inline uint32_t lfd(uint32_t frt, uint32_t ra, uint32_t d) { return D_form(50, frt, ra, d); } static inline uint32_t lfdx(uint32_t frt, uint32_t ra, uint32_t rb) { return X_form(31, frt, ra, rb, 599, 0); } @@ -558,6 +559,14 @@ namespace randomx { } } + static void emitStoreGpr64(CompilerState& state, uint32_t rs, uint32_t ra, uint32_t rb) { + if (PPC_BIG_ENDIAN) { + state.emit(PPC64::stdbrx(rs, ra, rb)); + } else { + state.emit(PPC64::stdx(rs, ra, rb)); + } + } + static void emitLoadVr64(CompilerState& state, uint32_t vrt, uint32_t ra, uint32_t rb) { // We need to load the two packed little-endian signed 32-bit integers into a VSR, then we need to // shuffle them so they're in the correct halves of the VSR register and in the correct byte order, @@ -695,9 +704,9 @@ namespace randomx { void JitCompilerPPC64::emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags) { state.codePos = RandomXCodePos; - // Load the Group E OR vector mask (high word in offset 1, low word in offset 0--enables loading with lxvd2x) - state.emitAt(sizeConstants, pcfg.eMask[1]); - state.emitAt(sizeConstants + 8, pcfg.eMask[0]); + // Set the Group E OR vector mask + state.emitAt(sizeConstants, pcfg.eMask[0]); + state.emitAt(sizeConstants + 8, pcfg.eMask[1]); LoopBeginPos = state.codePos; state.emit(codeVmLoopPrologue, sizeVmLoopPrologue); @@ -1282,7 +1291,7 @@ namespace randomx { uint32_t mb = 32 - Log2(size); state.emit(PPC64::rlwinm(8, 8, 0, mb, 28)); - state.emit(PPC64::stdx(src, ScratchpadPointerGPR30, 8)); + emitStoreGpr64(state, src, ScratchpadPointerGPR30, 8); } static void h_NOP(HANDLER_ARGS) { } diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index a2ac532b..1dfbe219 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -484,11 +484,14 @@ randomx_ppc64_vm_prologue: li %r12, constant_vector_be_byte_reverse_mask-randomx_ppc64_constants lxvd2x %vs49, %r8, %r2 lxvd2x %vs50, %r9, %r2 - lxvd2x %vs51, %r10, %r2 + lvx %v19, %r10, %r2 // Use lvx to load the vector since it's written [ low word, high word ] in memory #if PPC_BIG_ENDIAN lxvd2x %vs48, %r11, %r2 // Load the BE permutation mask (not needed for LE) #endif lxvd2x %vs47, %r12, %r2 +#if PPC_BIG_ENDIAN + vperm %v19, %v19, %v19, %v15 // Swap the byte order of the Group E OR mask vector +#endif // Zero the RandomX integer registers li %r14, 0 @@ -505,20 +508,22 @@ randomx_ppc64_vm_prologue: lwz %r24, 4(%r29) // ma ld %r22, 8(%r29) // memory (dataset pointer) - // Load a0-a3 from RegisterFile (we have to swap doubles because lxvd2x always loads them in big-endian word order) + // Load a0-a3 from RegisterFile .equ registers_a_base, 8*8+16*4+16*4 addi 
%r8, %r28, registers_a_base + 16*0 addi %r9, %r28, registers_a_base + 16*1 addi %r10, %r28, registers_a_base + 16*2 addi %r11, %r28, registers_a_base + 16*3 - lxvd2x %vs40, 0, %r8 - lxvd2x %vs41, 0, %r9 - lxvd2x %vs42, 0, %r10 - lxvd2x %vs43, 0, %r11 - xxswapd %vs40, %vs40 - xxswapd %vs41, %vs41 - xxswapd %vs42, %vs42 - xxswapd %vs43, %vs43 + lvx %v8, 0, %r8 + lvx %v9, 0, %r9 + lvx %v10, 0, %r10 + lvx %v11, 0, %r11 +#if PPC_BIG_ENDIAN + vperm %v8, %v8, %v8, %v15 + vperm %v9, %v9, %v9, %v15 + vperm %v10, %v10, %v10, %v15 + vperm %v11, %v11, %v11, %v15 +#endif // Instructions to mask mx and ma with Scratchpad L3 mask and set the // initial values of spAddr0 and spAddr1 are appended here by the JIT @@ -533,43 +538,35 @@ randomx_ppc64_vm_fix_loop: b 0 1: - // Store RandomX registers back into register file (no endian swaps needed) - std %r14, 8*0(%r28) - std %r15, 8*1(%r28) - std %r16, 8*2(%r28) - std %r17, 8*3(%r28) - std %r18, 8*4(%r28) - std %r19, 8*5(%r28) - std %r20, 8*6(%r28) - std %r21, 8*7(%r28) + // Store RandomX registers back into register file + STORE_LE_GPR %r14, 8*0, %r28 + STORE_LE_GPR %r15, 8*1, %r28 + STORE_LE_GPR %r16, 8*2, %r28 + STORE_LE_GPR %r17, 8*3, %r28 + STORE_LE_GPR %r18, 8*4, %r28 + STORE_LE_GPR %r19, 8*5, %r28 + STORE_LE_GPR %r20, 8*6, %r28 + STORE_LE_GPR %r21, 8*7, %r28 .equ registers_f_base, 8*8 - addi %r8, %r28, registers_f_base + 16*0 - addi %r9, %r28, registers_f_base + 16*1 - addi %r10, %r28, registers_f_base + 16*2 - addi %r11, %r28, registers_f_base + 16*3 - xxswapd %vs0, %vs32 - xxswapd %vs1, %vs33 - xxswapd %vs2, %vs34 - xxswapd %vs3, %vs35 - stxvd2x %vs0, 0, %r8 - stxvd2x %vs1, 0, %r9 - stxvd2x %vs2, 0, %r10 - stxvd2x %vs3, 0, %r11 + li %r8, registers_f_base + 16*0 + li %r9, registers_f_base + 16*1 + li %r10, registers_f_base + 16*2 + li %r11, registers_f_base + 16*3 + STORE_LE_VR 0, 12, %r8, %r28 + STORE_LE_VR 1, 13, %r9, %r28 + STORE_LE_VR 2, 14, %r10, %r28 + STORE_LE_VR 3, 12, %r11, %r28 .equ registers_e_base, 8*8+16*4 - addi %r8, %r28, registers_e_base + 16*0 - addi %r9, %r28, registers_e_base + 16*1 - addi %r10, %r28, registers_e_base + 16*2 - addi %r11, %r28, registers_e_base + 16*3 - xxswapd %vs4, %vs36 - xxswapd %vs5, %vs37 - xxswapd %vs6, %vs38 - xxswapd %vs7, %vs39 - stxvd2x %vs4, 0, %r8 - stxvd2x %vs5, 0, %r9 - stxvd2x %vs6, 0, %r10 - stxvd2x %vs7, 0, %r11 + li %r8, registers_e_base + 16*0 + li %r9, registers_e_base + 16*1 + li %r10, registers_e_base + 16*2 + li %r11, registers_e_base + 16*3 + STORE_LE_VR 4, 12, %r8, %r28 + STORE_LE_VR 5, 13, %r9, %r28 + STORE_LE_VR 6, 14, %r10, %r28 + STORE_LE_VR 7, 12, %r11, %r28 // Standard function epilogue ld %r14, 112(%r1) From 1865a43660b049ae8d3f00e07b9d8c5d5cc11958 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 7 Apr 2026 20:06:11 -0500 Subject: [PATCH 15/50] Fix interpreter v2 tests on big-endian PPC64 We need to byte-swap 128-bit vectors for AES mixing on big-endian PPC64. Otherwise, the interpreter v2 hash tests will fail. 
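Note that the swap below is not a full 16-byte reversal: the vec_perm
index pattern {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11} exchanges the
two 32-bit words within each 64-bit half of the vector. A scalar model
of the same shuffle, with a hypothetical helper name, purely for
illustration:

    #include <cstdint>
    #include <cstring>

    // Swap the two 32-bit words inside each 64-bit half of a 16-byte
    // vector image, mirroring the vec_perm pattern used below.
    static void swap_words_in_doublewords(uint8_t v[16]) {
        static const uint8_t perm[16] =
            { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
        uint8_t out[16];
        for (int i = 0; i < 16; ++i)
            out[i] = v[perm[i]];
        std::memcpy(v, out, sizeof out);
    }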
--- src/intrin_portable.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 10530656..e1a06b12 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -277,11 +277,19 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { } FORCE_INLINE rx_vec_f128 rx_cast_vec_i2f(rx_vec_i128 a) { +#if defined(NATIVE_LITTLE_ENDIAN) return (rx_vec_f128)a; +#else + return (rx_vec_f128)vec_perm((__m128i)a, (__m128i)a, (__m128i){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}); +#endif } FORCE_INLINE rx_vec_i128 rx_cast_vec_f2i(rx_vec_f128 a) { +#if defined(NATIVE_LITTLE_ENDIAN) return (rx_vec_i128)a; +#else + return (rx_vec_i128)vec_perm((__m128i)a, (__m128i)a, (__m128i){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}); +#endif } FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { From b7a3154dfcf11f99e8be6fd8af453c2e8b351274 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 7 Apr 2026 20:23:14 -0500 Subject: [PATCH 16/50] Remove unnecessary immediate load on PPC64 BE with v1 ABI In ABI v1, register GPR2 is loaded by the caller from the function descriptor, so we don't need to emit instructions to load it ourselves. --- src/jit_compiler_ppc64.cpp | 16 ++++++++++------ src/jit_compiler_ppc64_static.S | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 86fc976a..cc106fff 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -772,8 +772,10 @@ namespace randomx { state.codePos = ConstantPoolSize; entryProgram = state.code + state.codePos; - // Load r2 with the base address of the constant pool - emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast(state.code)); + if (PPC_ABI_V2) { + // Load r2 with the base address of the constant pool + emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast(state.code)); + } state.emit(codeVmPrologue, sizeVmPrologue); // Mask mx and ma with Scratchpad L3 mask uint32_t mask_begin = 32 - Log2(RANDOMX_SCRATCHPAD_L3); @@ -788,8 +790,10 @@ namespace randomx { state.codePos = RandomXCodeSize; entryDataInit = state.code + state.codePos; - // Load r2 with the base address of the constant pool - emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast(state.code)); + if (PPC_ABI_V2) { + // Load r2 with the base address of the constant pool + emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast(state.code)); + } int32_t datasetInitFixCallPos = state.codePos + offsetDatasetInitFixCall; state.emit(codeDatasetInit, sizeDatasetInit); SshashSingleItemPos = alignSize(state.codePos, 128); @@ -799,11 +803,11 @@ namespace randomx { #if !PPC_ABI_V2 // Initialize the ABI V1 function descriptors descriptorProgram[0] = reinterpret_cast(entryProgram); - descriptorProgram[1] = 0; + descriptorProgram[1] = reinterpret_cast(state.code); descriptorProgram[2] = 0; descriptorDataInit[0] = reinterpret_cast(entryDataInit); - descriptorDataInit[1] = 0; + descriptorDataInit[1] = reinterpret_cast(state.code); descriptorDataInit[2] = 0; #endif diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 1dfbe219..66b32032 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -213,7 +213,7 @@ literal_vector_group_e_or_mask: // r16-r31 (non-volatile) -> unused randomx_ppc64_dataset_init: - // JIT compiler MUST emit immediate load to r2 before this code + // JIT compiler MUST emit immediate load to r2 before this 
code (ABI v2 only) // Standard function prologue mflr %r0 @@ -441,7 +441,7 @@ randomx_ppc64_sshash_xor_end: // v20-v31 / vs52-vs63 (non-volatile) -> unused randomx_ppc64_vm_prologue: - // JIT compiler MUST emit immediate load to r2 before this code + // JIT compiler MUST emit immediate load to r2 before this code (ABI v2 only) // Standard function prologue mflr %r0 From 778a58fbe1120780a0d70fb05c5be7f7a7e94a93 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Thu, 9 Apr 2026 22:51:50 -0500 Subject: [PATCH 17/50] Move PPC64 VM prologue generation into prefix generation function This should make the code a little bit easier to reason about. --- src/jit_compiler_ppc64.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index cc106fff..7cb8ba49 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -702,12 +702,23 @@ namespace randomx { } void JitCompilerPPC64::emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags) { - state.codePos = RandomXCodePos; - // Set the Group E OR vector mask state.emitAt(sizeConstants, pcfg.eMask[0]); state.emitAt(sizeConstants + 8, pcfg.eMask[1]); + state.codePos = RandomXCodePos; + + state.emit(codeVmPrologue, sizeVmPrologue); + // Mask mx and ma with Scratchpad L3 mask + uint32_t mask_begin = 32 - Log2(RANDOMX_SCRATCHPAD_L3); + uint32_t mask_end = 31 - Log2(RANDOMX_DATASET_ITEM_SIZE); + state.emit(PPC64::rlwinm(SpAddr0GPR26, MxGPR25, 0, mask_begin, mask_end)); + state.emit(PPC64::rlwinm(SpAddr1GPR27, MaGPR24, 0, mask_begin, mask_end)); + // Init spAddr0 to masked mx + scratchpad base + state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30)); + // Init spAddr1 to masked ma + scratchpad base + state.emit(PPC64::add(SpAddr1GPR27, SpAddr1GPR27, ScratchpadPointerGPR30)); + LoopBeginPos = state.codePos; state.emit(codeVmLoopPrologue, sizeVmLoopPrologue); @@ -776,16 +787,6 @@ namespace randomx { // Load r2 with the base address of the constant pool emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast(state.code)); } - state.emit(codeVmPrologue, sizeVmPrologue); - // Mask mx and ma with Scratchpad L3 mask - uint32_t mask_begin = 32 - Log2(RANDOMX_SCRATCHPAD_L3); - uint32_t mask_end = 31 - Log2(RANDOMX_DATASET_ITEM_SIZE); - state.emit(PPC64::rlwinm(SpAddr0GPR26, MxGPR25, 0, mask_begin, mask_end)); - state.emit(PPC64::rlwinm(SpAddr1GPR27, MaGPR24, 0, mask_begin, mask_end)); - // Init spAddr0 to masked mx + scratchpad base - state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30)); - // Init spAddr1 to masked ma + scratchpad base - state.emit(PPC64::add(SpAddr1GPR27, SpAddr1GPR27, ScratchpadPointerGPR30)); RandomXCodePos = state.codePos; state.codePos = RandomXCodeSize; From c2e43559fba31645dadf9ea087902c16b99cda86 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Thu, 9 Apr 2026 23:07:58 -0500 Subject: [PATCH 18/50] Factor out the common parts of the scratchpad store --- src/jit_compiler_ppc64.cpp | 28 +++++++++++++------ src/jit_compiler_ppc64_static.S | 46 ++++++++++++------------------- src/jit_compiler_ppc64_static.hpp | 12 +++++--- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 7cb8ba49..e70eb4c7 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -455,10 +455,14 @@ namespace randomx { static const uint8_t* codeVmDataReadLight = 
(uint8_t*)&randomx_ppc64_vm_data_read_light; static const uint8_t* codeVmDataReadLightFixCall = (uint8_t*)&randomx_ppc64_vm_data_read_light_fix_call; static const uint8_t* codeVmDataReadLightEnd = (uint8_t*)&randomx_ppc64_vm_data_read_light_end; - static const uint8_t* codeVmSpadStore = (uint8_t*)&randomx_ppc64_vm_spad_store; - static const uint8_t* codeVmSpadStoreEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_end; - static const uint8_t* codeVmSpadStoreHardAes = (uint8_t*)&randomx_ppc64_vm_spad_store_hard_aes; - static const uint8_t* codeVmSpadStoreHardAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_hard_aes_end; + static const uint8_t* codeVmSpadStorePrologue = (uint8_t*)&randomx_ppc64_vm_spad_store_prologue; + static const uint8_t* codeVmSpadStorePrologueEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_prologue_end; + static const uint8_t* codeVmSpadStoreMixV1 = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1; + static const uint8_t* codeVmSpadStoreMixV1End = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1_end; + static const uint8_t* codeVmSpadStoreEpilogue = (uint8_t*)&randomx_ppc64_vm_spad_store_epilogue; + static const uint8_t* codeVmSpadStoreEpilogueEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_epilogue_end; + static const uint8_t* codeVmSpadStoreMixV2HardAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes; + static const uint8_t* codeVmSpadStoreMixV2HardAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end; static const int32_t sizeConstants = codeConstantsEnd - codeConstants; @@ -474,8 +478,10 @@ namespace randomx { static const int32_t sizeVmLoopPrologue = codeVmLoopPrologueEnd - codeVmLoopPrologue; static const int32_t sizeVmDataRead = codeVmDataReadEnd - codeVmDataRead; static const int32_t sizeVmDataReadLight = codeVmDataReadLightEnd - codeVmDataReadLight; - static const int32_t sizeVmSpadStore = codeVmSpadStoreEnd - codeVmSpadStore; - static const int32_t sizeVmSpadStoreHardAes = codeVmSpadStoreHardAesEnd - codeVmSpadStoreHardAes; + static const int32_t sizeVmSpadStorePrologue = codeVmSpadStorePrologueEnd - codeVmSpadStorePrologue; + static const int32_t sizeVmSpadStoreMixV1 = codeVmSpadStoreMixV1End - codeVmSpadStoreMixV1; + static const int32_t sizeVmSpadStoreEpilogue = codeVmSpadStoreEpilogueEnd - codeVmSpadStoreEpilogue; + static const int32_t sizeVmSpadStoreMixV2HardAes = codeVmSpadStoreMixV2HardAesEnd - codeVmSpadStoreMixV2HardAes; static const int32_t offsetConstantLutFprcToFpscr = codeConstantLutFprcToFpscr - codeConstants; @@ -486,7 +492,7 @@ namespace randomx { constexpr size_t CodeAlign = 64*1024; // 64 kB, to ensure alignment on systems with a page size <= 64 kB static const size_t ConstantPoolSize = alignSize(sizeConstants + 16, CodeAlign); // Add 16 bytes for the Group E OR vector mask - static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStore + sizeVmSpadStoreHardAes, CodeAlign); + static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStorePrologue + sizeVmSpadStoreMixV2HardAes + sizeVmSpadStoreEpilogue, CodeAlign); constexpr size_t MaxRandomXInstrCodeSize = 4*10; // FDIV_M requires at most 10 instructions constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 instructions static const size_t SuperscalarProgramHeaders = sizeSshashSingleItemPrologue + sizeSshashSingleItemEpilogue; @@ -736,19 +742,23 @@ namespace randomx { } void 
JitCompilerPPC64::emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags) { + state.emit(codeVmSpadStorePrologue, sizeVmSpadStorePrologue); + if (flags & RANDOMX_FLAG_V2) { if (true || (flags & RANDOMX_FLAG_HARD_AES)) { // TODO: Remove the "true" once software AES is working if (!randomx::cpu.hasAes()) { throw std::runtime_error("This CPU is missing support for hardware AES!"); } - state.emit(codeVmSpadStoreHardAes, sizeVmSpadStoreHardAes); + state.emit(codeVmSpadStoreMixV2HardAes, sizeVmSpadStoreMixV2HardAes); } else { throw std::runtime_error("Software AES is not yet implemented for PPC64!"); } } else { - state.emit(codeVmSpadStore, sizeVmSpadStore); + state.emit(codeVmSpadStoreMixV1, sizeVmSpadStoreMixV1); } + state.emit(codeVmSpadStoreEpilogue, sizeVmSpadStoreEpilogue); + state.emit(PPC64::xor_(SpAddr0GPR26, RegisterMapR.getPpcGprNum(pcfg.readReg0), RegisterMapR.getPpcGprNum(pcfg.readReg1))); // spAddr1 (r27) = r26 >> 32 diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 66b32032..8ae7cab9 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -91,10 +91,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .global randomx_ppc64_vm_data_read_light .global randomx_ppc64_vm_data_read_light_fix_call .global randomx_ppc64_vm_data_read_light_end - .global randomx_ppc64_vm_spad_store - .global randomx_ppc64_vm_spad_store_end - .global randomx_ppc64_vm_spad_store_hard_aes - .global randomx_ppc64_vm_spad_store_hard_aes_end + .global randomx_ppc64_vm_spad_store_prologue + .global randomx_ppc64_vm_spad_store_prologue_end + .global randomx_ppc64_vm_spad_store_mix_v1 + .global randomx_ppc64_vm_spad_store_mix_v1_end + .global randomx_ppc64_vm_spad_store_epilogue + .global randomx_ppc64_vm_spad_store_epilogue_end + .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes + .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end // Macro to store a VR containing a RandomX Group F/E/A register to memory .macro STORE_LE_VR vr_src, vr_temp, offset_reg, base_reg @@ -731,7 +735,7 @@ randomx_ppc64_vm_data_read_light_fix_call: randomx_ppc64_vm_data_read_light_end: -randomx_ppc64_vm_spad_store: +randomx_ppc64_vm_spad_store_prologue: // Store to scratchpad at spAddr1 STORE_LE_GPR %r14, 8*0, %r27 STORE_LE_GPR %r15, 8*1, %r27 @@ -742,12 +746,18 @@ randomx_ppc64_vm_spad_store: STORE_LE_GPR %r20, 8*6, %r27 STORE_LE_GPR %r21, 8*7, %r27 +randomx_ppc64_vm_spad_store_prologue_end: + +randomx_ppc64_vm_spad_store_mix_v1: // Mix F and E registers (f0-f3 are vs32-vs35, e0-e3 are vs36-vs39) xxlxor %vs32, %vs32, %vs36 xxlxor %vs33, %vs33, %vs37 xxlxor %vs34, %vs34, %vs38 xxlxor %vs35, %vs35, %vs39 +randomx_ppc64_vm_spad_store_mix_v1_end: + +randomx_ppc64_vm_spad_store_epilogue: // Store F registers to scratchpad at spAddr0 li %r8, 16*0 li %r9, 16*1 @@ -758,19 +768,9 @@ randomx_ppc64_vm_spad_store: STORE_LE_VR 2, 14, %r10, %r26 STORE_LE_VR 3, 12, %r11, %r26 -randomx_ppc64_vm_spad_store_end: - -randomx_ppc64_vm_spad_store_hard_aes: - // Store to scratchpad at spAddr1 - STORE_LE_GPR %r14, 8*0, %r27 - STORE_LE_GPR %r15, 8*1, %r27 - STORE_LE_GPR %r16, 8*2, %r27 - STORE_LE_GPR %r17, 8*3, %r27 - STORE_LE_GPR %r18, 8*4, %r27 - STORE_LE_GPR %r19, 8*5, %r27 - STORE_LE_GPR %r20, 8*6, %r27 - STORE_LE_GPR %r21, 8*7, %r27 +randomx_ppc64_vm_spad_store_epilogue_end: +randomx_ppc64_vm_spad_store_mix_v2_hard_aes: // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) // Byte-reverse f0-f3 and e0-e3 @@ -824,17 +824,7 @@ 
randomx_ppc64_vm_spad_store_hard_aes: vperm %v6, %v6, %v6, %v15 vperm %v7, %v7, %v7, %v15 - // Store F registers to scratchpad at spAddr0 - li %r8, 16*0 - li %r9, 16*1 - li %r10, 16*2 - li %r11, 16*3 - STORE_LE_VR 0, 12, %r8, %r26 - STORE_LE_VR 1, 13, %r9, %r26 - STORE_LE_VR 2, 14, %r10, %r26 - STORE_LE_VR 3, 12, %r11, %r26 - -randomx_ppc64_vm_spad_store_hard_aes_end: +randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end: .section ".text" diff --git a/src/jit_compiler_ppc64_static.hpp b/src/jit_compiler_ppc64_static.hpp index 7fe7afd6..8b2bff93 100644 --- a/src/jit_compiler_ppc64_static.hpp +++ b/src/jit_compiler_ppc64_static.hpp @@ -58,8 +58,12 @@ extern "C" { void randomx_ppc64_vm_data_read_light(); void randomx_ppc64_vm_data_read_light_fix_call(); void randomx_ppc64_vm_data_read_light_end(); - void randomx_ppc64_vm_spad_store(); - void randomx_ppc64_vm_spad_store_end(); - void randomx_ppc64_vm_spad_store_hard_aes(); - void randomx_ppc64_vm_spad_store_hard_aes_end(); + void randomx_ppc64_vm_spad_store_prologue(); + void randomx_ppc64_vm_spad_store_prologue_end(); + void randomx_ppc64_vm_spad_store_mix_v1(); + void randomx_ppc64_vm_spad_store_mix_v1_end(); + void randomx_ppc64_vm_spad_store_epilogue(); + void randomx_ppc64_vm_spad_store_epilogue_end(); + void randomx_ppc64_vm_spad_store_mix_v2_hard_aes(); + void randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end(); } From 2bbb740b6c596801acc31e902e515e59bf8cfd86 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Fri, 10 Apr 2026 10:55:11 -0500 Subject: [PATCH 19/50] Flush the cache on PPC64 for real GCC's built-in cache clearing function didn't do anything on Power, so we use our own code, borrowed from LLVM and modified to detect the cache line size at runtime. Also, to avoid a huge hit to performance (~13%), rather than clearing the cache for the whole 128 KiB of constants and code, we clear the cache just for the bytes of the program that was just written, and we only do that cache clearing after the whole program has been written. And since the constants aren't used as instructions, we can skip clearing any caches for that data. So now instead of clearing caches for 128 KiB of memory, we're only doing that for about 4+ KiB of just the program memory. Total hit to performance from actually clearing the cache seems to be in the range of 0.2%-0.3%, which is more than acceptable considering the alternative results in wasted cycles and random crashes. --- src/jit_compiler_ppc64.cpp | 56 ++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index e70eb4c7..12b0a7fd 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -30,6 +30,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include + #include "cpu.hpp" #include "program.hpp" #include "reciprocal.h" @@ -551,10 +553,48 @@ namespace randomx { return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); } - static void clearCache(CodeBuffer& buf) { -#ifdef __GNUC__ - __builtin___clear_cache((char*)buf.code, (char*)(buf.code + CodeSize)); -#endif + static void syncInstructionCache(void* start_ptr, void* end_ptr) { + // Apparently GCC compiles __builtin___clear_cache to nothing, so we use LLVM's implementation instead. + // + // This code has been modified from compiler-rt/lib/builtins/clear_cache.c, found at + // https://github.com/llvm/llvm-project revision 7459e10f34aa86952b1620d0cb48b40be112ebe9. 
+ // + // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + // See https://llvm.org/LICENSE.txt for license information. + // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + char* start = (char*)start_ptr; + char* end = (char*)end_ptr; + const size_t len = (uintptr_t)end - (uintptr_t)start; + if (len == 0) return; + + // Query data and instruction cache line sizes + long dcache_val = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + long icache_val = sysconf(_SC_LEVEL1_ICACHE_LINESIZE); + + const size_t d_line_size = (dcache_val > 0) ? dcache_val : 32; + const size_t i_line_size = (icache_val > 0) ? icache_val : 32; + + // Flush Data Cache + const uintptr_t d_mask = ~(d_line_size - 1); + const uintptr_t d_start_line = ((uintptr_t)start) & d_mask; + const uintptr_t d_end_line = ((uintptr_t)start + len + d_line_size - 1) & d_mask; + + for (uintptr_t line = d_start_line; line < d_end_line; line += d_line_size) + __asm__ volatile("dcbst 0, %0" : : "r"(line)); + + // Wait for memory writes to complete + __asm__ volatile("sync"); + + // Invalidate Instruction Cache + const uintptr_t i_mask = ~(i_line_size - 1); + const uintptr_t i_start_line = ((uintptr_t)start) & i_mask; + const uintptr_t i_end_line = ((uintptr_t)start + len + i_line_size - 1) & i_mask; + + for (uintptr_t line = i_start_line; line < i_end_line; line += i_line_size) + __asm__ volatile("icbi 0, %0" : : "r"(line)); + + // Flush the local instruction pipeline + __asm__ volatile("isync"); } static void emitLoadGpr64(CompilerState& state, uint32_t rt, uint32_t ra, uint32_t rb) { @@ -821,8 +861,6 @@ namespace randomx { descriptorDataInit[1] = reinterpret_cast(state.code); descriptorDataInit[2] = 0; #endif - - clearCache(state); } JitCompilerPPC64::~JitCompilerPPC64() { @@ -870,7 +908,7 @@ namespace randomx { emitProgramSuffix(state, pcfg, flags); - clearCache(state); + syncInstructionCache(entryProgram, state.code + state.codePos); } void JitCompilerPPC64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { @@ -902,7 +940,7 @@ namespace randomx { emitProgramSuffix(state, pcfg, flags); - clearCache(state); + syncInstructionCache(entryProgram, state.code + state.codePos); } static void generateSuperscalarCode(CompilerState& state, Instruction instr, const std::vector& reciprocalCache) { @@ -998,7 +1036,7 @@ namespace randomx { // Return state.emit(codeSshashSingleItemEpilogue, sizeSshashSingleItemEpilogue); - clearCache(state); + syncInstructionCache(entryDataInit, state.code + state.codePos); } size_t JitCompilerPPC64::getCodeSize() { From d8f508deb72df5bf3501b83f2d38934a9cf08d7d Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sat, 11 Apr 2026 00:38:10 -0500 Subject: [PATCH 20/50] Optimize CFROUND for POWER9 (ISA v3.0B) On Power ISA processors that support v3.0B or later, use mffscrn instead of mtfsf to avoid a pipeline flush. This gives us an extra 0.5%-1.0% performance on RandomX V1 and a negligible performance increase (much less than 0.1%) on RandomX V2. 
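For context, the two paths differ only in how the RN value reaches the
FPSCR. A standalone C++ sketch of the same selection follows; it is
illustrative only, not the JIT-emitted sequence, the helper name
set_fpscr_rn is made up, and assembling the mffscrn mnemonic requires
targeting POWER9 or later:

    #include <cstdint>
    #include <cstring>

    // Set the FPSCR rounding-mode (RN) field from a 2-bit value that
    // travels through an FPR as a raw bit image.
    static inline void set_fpscr_rn(uint64_t rn, bool has_v3p0) {
        double image;
        std::memcpy(&image, &rn, sizeof image);  // raw bits, no int->fp conversion
        if (has_v3p0) {
            // mffscrn replaces only the RN field, avoiding the flush
            __asm__ volatile("mffscrn %0, %0" : "+f"(image));
        } else {
            // mtfsf with FLM=1 rewrites FPSCR field 7, which holds RN
            __asm__ volatile("mtfsf 1, %0" : : "f"(image));
        }
    }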
--- src/cpu.cpp | 1 + src/cpu.hpp | 6 ++++++ src/jit_compiler_ppc64.cpp | 13 ++++++++++--- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index d20b6ec6..6800e8ca 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -128,6 +128,7 @@ namespace randomx { #elif defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__) unsigned long hwcaps2 = getauxval(AT_HWCAP2); aes_ = (hwcaps2 & PPC_FEATURE2_VEC_CRYPTO) != 0; + v3p0_ = (hwcaps2 & PPC_FEATURE2_ARCH_3_00) != 0; #endif } diff --git a/src/cpu.hpp b/src/cpu.hpp index 7db03311..0c5058d6 100644 --- a/src/cpu.hpp +++ b/src/cpu.hpp @@ -41,6 +41,9 @@ namespace randomx { inline bool hasRVV() const { return rvv_; } inline int getRVV_Length() const { return rvv_length; } #endif +#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__) + inline bool hasV3P0() const { return v3p0_; } +#endif private: bool aes_ = false; @@ -49,6 +52,9 @@ namespace randomx { #ifdef __riscv bool rvv_ = false; int rvv_length = 0; +#endif +#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__) + bool v3p0_ = false; #endif }; diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 12b0a7fd..2011a298 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -268,6 +268,7 @@ namespace PPC64 { static inline uint32_t lfd(uint32_t frt, uint32_t ra, uint32_t d) { return D_form(50, frt, ra, d); } static inline uint32_t lfdx(uint32_t frt, uint32_t ra, uint32_t rb) { return X_form(31, frt, ra, rb, 599, 0); } static inline uint32_t mtfsf(uint32_t flm, uint32_t frb, uint32_t l, uint32_t w) { return XFL_form(63, l, flm, w, frb, 711, 0); } + static inline uint32_t mffscrn(uint32_t frt, uint32_t frb) { return X_form(63, frt, 22, frb, 583, 0); } // Only v3.0B and later static inline uint32_t lxsdx(uint32_t xt, uint32_t ra, uint32_t rb) { if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); @@ -1316,9 +1317,15 @@ namespace randomx { // lfdx f0, r8, r0 state.emit(PPC64::lfdx(0, 8, 0)); - // Move the RN value from scratch FPR0 to FPSCR (masked) - // mtfsf 0x01, f0, 0, 0 - state.emit(PPC64::mtfsf(0x01, 0, 0, 0)); + if (randomx::cpu.hasV3P0()) { + // Move the RN value from scratch FPR0 to FPSCR field RN + // mffscrn f0, f0 + state.emit(PPC64::mffscrn(0, 0)); + } else { + // Move the RN value from scratch FPR0 to FPSCR (masked) + // mtfsf 0x01, f0, 0, 0 + state.emit(PPC64::mtfsf(0x01, 0, 0, 0)); + } if (flags & RANDOMX_FLAG_V2) { // Patch in the conditional branch instruction. From 31ff28d4dd892fd5161cf461513c007ec94c05fe Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sat, 11 Apr 2026 10:54:33 -0500 Subject: [PATCH 21/50] Cache reciprocals in PPC64 JIT compiler This gives us an extra 2.0% performance on RandomX V1 and V2. 
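The mechanism, sketched in plain C++ with hypothetical names (the real
pool sits at ReciprocalPoolPos inside the constant pool and is addressed
relative to GPR2):

    #include <cstdint>
    #include <vector>

    // Intern each IMUL_RCP reciprocal once per program; the generated
    // code then replaces a multi-instruction 64-bit immediate sequence
    // with a single d-form load from the pool.
    struct ReciprocalPool {
        std::vector<uint64_t> slots;

        // Returns the byte offset of the cached value within the pool.
        int32_t intern(uint64_t rcp) {
            slots.push_back(rcp);
            return static_cast<int32_t>((slots.size() - 1) * sizeof(uint64_t));
        }
    };

Each IMUL_RCP then costs one ld and one mulld instead of a full 64-bit
immediate materialization followed by the mulld.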
--- src/jit_compiler_ppc64.cpp | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 2011a298..b18253da 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -77,6 +77,15 @@ namespace PPC64 { return (po << 26) | (rt << 21) | (ra << 16) | d; } + static inline uint32_t DS_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t ds, uint32_t xo) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F"); + if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F"); + if (!(ds <= 0x3FFF)) throw std::runtime_error("ds <= 0x3FFF"); + if (!(xo <= 0x3)) throw std::runtime_error("xo <= 0x3"); + return (po << 26) | (rt << 21) | (ra << 16) | (ds << 2) | xo; + } + static inline uint32_t I_form(uint32_t po, uint32_t li, uint32_t aa, uint32_t lk) { if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); if (!(li <= 0xFFFFFF)) throw std::runtime_error("li <= 0xFFFFFF"); @@ -260,6 +269,12 @@ namespace PPC64 { static inline uint32_t sldi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicr(rx, ry, n, 63-n); } static inline uint32_t srdi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicl(rx, ry, 64-n, n); } + static inline uint32_t ld(uint32_t rt, uint32_t ra, int32_t offset) { + if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned"); + if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range"); + return DS_form(58, rt, ra, (offset >> 2) & 0x3FFF, 0); + } + static inline uint32_t ldx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 21, 0); } static inline uint32_t ldbrx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 532, 0); } static inline uint32_t stdx(uint32_t rs, uint32_t ra, uint32_t rb) { return X_form(31, rs, ra, rb, 149, 0); } @@ -494,7 +509,9 @@ namespace randomx { static const int32_t offsetVmDataReadLightFixCall = codeVmDataReadLightFixCall - codeVmDataReadLight; constexpr size_t CodeAlign = 64*1024; // 64 kB, to ensure alignment on systems with a page size <= 64 kB - static const size_t ConstantPoolSize = alignSize(sizeConstants + 16, CodeAlign); // Add 16 bytes for the Group E OR vector mask + constexpr size_t ReciprocalPoolSize = 8 * RANDOMX_PROGRAM_MAX_SIZE; // RANDOMX_PROGRAM_MAX_SIZE 64-bit reciprocals + static const size_t ReciprocalPoolPos = sizeConstants + 16; // Add 16 bytes for the Group E OR vector mask + static const size_t ConstantPoolSize = alignSize(sizeConstants + 16 + ReciprocalPoolSize, CodeAlign); // Add 16 bytes for the Group E OR vector mask static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStorePrologue + sizeVmSpadStoreMixV2HardAes + sizeVmSpadStoreEpilogue, CodeAlign); constexpr size_t MaxRandomXInstrCodeSize = 4*10; // FDIV_M requires at most 10 instructions constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 instructions @@ -769,6 +786,9 @@ namespace randomx { LoopBeginPos = state.codePos; state.emit(codeVmLoopPrologue, sizeVmLoopPrologue); + // Initialize the reciprocal pool to zero + state.rcpCount = 0; + // Step 4: The 256 instructions stored in the Program Buffer are executed. 
for (unsigned i = 0; i < RegistersCount; ++i) { state.registerUsage[i] = -1; @@ -1135,8 +1155,13 @@ namespace randomx { if (!isZeroOrPowerOf2(divisor)) { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); + + // Calculate and cache the reciprocal + int32_t offset = ReciprocalPoolPos + 8 * state.rcpCount++; uint64_t rcp = randomx_reciprocal_fast(divisor); - emitMovImm64(state, 8, rcp); + state.emitAt(offset, rcp); + + state.emit(PPC64::ld(8, ConstantsBaseAddressRegisterGPR2, offset)); state.emit(PPC64::mulld(dst, dst, 8)); } } From 8abf44d38568fc7c62fcc632112828d9a386e710 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sat, 11 Apr 2026 11:13:29 -0500 Subject: [PATCH 22/50] Simplify scratchpad loading code This saves maybe one instruction every once in a while. --- src/jit_compiler_ppc64.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index b18253da..2ec98f96 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -739,15 +739,12 @@ namespace randomx { uint32_t mb = 32 - Log2(size); state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28)); - - emitLoadGpr64(state, tmp_gpr, ScratchpadPointerGPR30, tmp_gpr); } else { - imm = (imm & ScratchpadL3Mask) >> 3; - emitMovImm32(state, tmp_gpr, imm); - state.emit(PPC64::sldi(tmp_gpr, tmp_gpr, 3)); - - emitLoadGpr64(state, tmp_gpr, ScratchpadPointerGPR30, tmp_gpr); + imm &= ScratchpadL3Mask; + emitMovImm64(state, tmp_gpr, imm); } + + emitLoadGpr64(state, tmp_gpr, ScratchpadPointerGPR30, tmp_gpr); } template From 7f815b9abf8d02ab48bf5396569b0e420ca485e9 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sat, 11 Apr 2026 15:44:13 -0500 Subject: [PATCH 23/50] Move the creation of the zero vector further from where it's used This should help slightly with pipelining. --- src/jit_compiler_ppc64_static.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 8ae7cab9..81668fd0 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -773,6 +773,9 @@ randomx_ppc64_vm_spad_store_epilogue_end: randomx_ppc64_vm_spad_store_mix_v2_hard_aes: // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + // We need a zero vector to bypass vncipher's internal key XOR + vxor %v12, %v12, %v12 + // Byte-reverse f0-f3 and e0-e3 vperm %v0, %v0, %v0, %v15 vperm %v1, %v1, %v1, %v15 @@ -783,9 +786,6 @@ randomx_ppc64_vm_spad_store_mix_v2_hard_aes: vperm %v6, %v6, %v6, %v15 vperm %v7, %v7, %v7, %v15 - // We need a zero vector to bypass vncipher's internal key XOR - vxor %v12, %v12, %v12 - vcipher %v0, %v0, %v4 vncipher %v1, %v1, %v12 // Pass 0 as the key vcipher %v2, %v2, %v4 From 1d6d26cf53f9e769e7b7e4b25aee8fbae77747c2 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sun, 12 Apr 2026 19:10:30 -0500 Subject: [PATCH 24/50] Use AltiVec instructions instead of VSX instructions where possible The only benefit to using the VSX versions of these instructions is that they have access to more registers. But we don't use those extra registers, so in case anyone wants to port this code to processors without VSX we can make their job easier by removing the VSX-only instructions where they're not strictly needed. 
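As a concrete illustration of what changes at the encoding level, the
AltiVec logicals are plain VX-form instructions. The sketch below
mirrors the VX_form helper this patch adds; the encoded word shown was
computed by hand from the field layout, so treat it as illustrative:

    #include <cstdint>

    // VX-form: PO | VRT | VRA | VRB | XO (no register extension bits)
    static uint32_t vx_form(uint32_t po, uint32_t vrt, uint32_t vra,
                            uint32_t vrb, uint32_t xo) {
        return (po << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | xo;
    }

    // vxor %v0, %v0, %v4: PO=4, XO=1220 -> 0x100024C4
    static const uint32_t vxor_v0_v0_v4 = vx_form(4, 0, 0, 4, 1220);

Unlike the XX3-form VSX equivalents (xxlxor and friends), VX-form has no
extension bits for registers beyond v31, which is why it only reaches
v0-v31 and why it also assembles for pre-VSX AltiVec processors.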
--- src/jit_compiler_ppc64.cpp | 24 ++++++++++++++++++------ src/jit_compiler_ppc64_static.S | 26 +++++++++++++------------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 2ec98f96..321af639 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -143,6 +143,15 @@ namespace PPC64 { return (po << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | (vrc << 6) | xo; } + static inline uint32_t VX_form(uint32_t po, uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t xo) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(vrt <= 0x1F)) throw std::runtime_error("vrt <= 0x1F"); + if (!(vra <= 0x1F)) throw std::runtime_error("vra <= 0x1F"); + if (!(vrb <= 0x1F)) throw std::runtime_error("vrb <= 0x1F"); + if (!(xo <= 0x7FF)) throw std::runtime_error("xo <= 0x7FF"); + return (po << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | xo; + } + static inline uint32_t X_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t rb, uint32_t xo, uint32_t rc) { if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F"); @@ -301,6 +310,10 @@ namespace PPC64 { static inline uint32_t vperm(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 43); } + static inline uint32_t vand(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1028); } + static inline uint32_t vor(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1156); } + static inline uint32_t vxor(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1220); } + static inline uint32_t xxmrghw(uint32_t xt, uint32_t xa, uint32_t xb) { if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F"); if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F"); @@ -1246,8 +1259,8 @@ namespace randomx { state.emit(PPC64::xvsubdp(dst, dst, 32 + 12)); } static void h_FSCAL_R(HANDLER_ARGS) { - int dst = RegisterMapF.getPpcVsrNum(isn.dst); - state.emit(PPC64::xxlxor(dst, dst, ConstantVectorFscalXorMaskVSR50)); + int dst = RegisterMapF.getPpcVrNum(isn.dst); + state.emit(PPC64::vxor(dst, dst, ConstantVectorFscalXorMaskVR18)); } static void h_FMUL_R(HANDLER_ARGS) { int dst = RegisterMapE.getPpcVsrNum(isn.dst); @@ -1256,11 +1269,10 @@ namespace randomx { } static void h_FDIV_M(HANDLER_ARGS) { int dst = RegisterMapE.getPpcVsrNum(isn.dst); - uint32_t temp_vsr = 32 + 12; emitLoadVsrFromScratchpad<12>(state, isn); - state.emit(PPC64::xxland(temp_vsr, temp_vsr, ConstantVectorGroupEAndMaskVSR49)); - state.emit(PPC64::xxlor(temp_vsr, temp_vsr, ConstantVectorGroupEOrMaskVSR51)); - state.emit(PPC64::xvdivdp(dst, dst, temp_vsr)); + state.emit(PPC64::vand(12, 12, ConstantVectorGroupEAndMaskVR17)); + state.emit(PPC64::vor(12, 12, ConstantVectorGroupEOrMaskVR19)); + state.emit(PPC64::xvdivdp(dst, dst, 32 + 12)); } static void h_FSQRT_R(HANDLER_ARGS) { int dst = RegisterMapE.getPpcVsrNum(isn.dst); diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 81668fd0..fb8d5502 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -654,14 +654,14 @@ randomx_ppc64_vm_loop_prologue: xvcvsxwdp %vs37, %vs37 xvcvsxwdp %vs38, %vs38 xvcvsxwdp %vs39, %vs39 - xxland %vs36, %vs36, %vs49 - xxland %vs37, %vs37, %vs49 - xxland %vs38, %vs38, %vs49 - xxland %vs39, %vs39, %vs49 - xxlor %vs36, %vs36, %vs51 - xxlor %vs37, %vs37, %vs51 - xxlor %vs38, %vs38, %vs51 - xxlor %vs39, 
%vs39, %vs51 + vand %v4, %v4, %v17 + vand %v5, %v5, %v17 + vand %v6, %v6, %v17 + vand %v7, %v7, %v17 + vor %v4, %v4, %v19 + vor %v5, %v5, %v19 + vor %v6, %v6, %v19 + vor %v7, %v7, %v19 randomx_ppc64_vm_loop_prologue_end: @@ -749,11 +749,11 @@ randomx_ppc64_vm_spad_store_prologue: randomx_ppc64_vm_spad_store_prologue_end: randomx_ppc64_vm_spad_store_mix_v1: - // Mix F and E registers (f0-f3 are vs32-vs35, e0-e3 are vs36-vs39) - xxlxor %vs32, %vs32, %vs36 - xxlxor %vs33, %vs33, %vs37 - xxlxor %vs34, %vs34, %vs38 - xxlxor %vs35, %vs35, %vs39 + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + vxor %v0, %v0, %v4 + vxor %v1, %v1, %v5 + vxor %v2, %v2, %v6 + vxor %v3, %v3, %v7 randomx_ppc64_vm_spad_store_mix_v1_end: From 2016660af22ac77eb3a6ad9cdd0ae02cd631f561 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sun, 12 Apr 2026 21:41:25 -0500 Subject: [PATCH 25/50] Implement software AES for PPC64 --- src/jit_compiler_ppc64.cpp | 9 +- src/jit_compiler_ppc64_static.S | 573 ++++++++++++++++++++++++++++++ src/jit_compiler_ppc64_static.hpp | 2 + 3 files changed, 581 insertions(+), 3 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 321af639..7353527f 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -494,6 +494,8 @@ namespace randomx { static const uint8_t* codeVmSpadStoreEpilogueEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_epilogue_end; static const uint8_t* codeVmSpadStoreMixV2HardAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes; static const uint8_t* codeVmSpadStoreMixV2HardAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end; + static const uint8_t* codeVmSpadStoreMixV2SoftAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_soft_aes; + static const uint8_t* codeVmSpadStoreMixV2SoftAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end; static const int32_t sizeConstants = codeConstantsEnd - codeConstants; @@ -513,6 +515,7 @@ namespace randomx { static const int32_t sizeVmSpadStoreMixV1 = codeVmSpadStoreMixV1End - codeVmSpadStoreMixV1; static const int32_t sizeVmSpadStoreEpilogue = codeVmSpadStoreEpilogueEnd - codeVmSpadStoreEpilogue; static const int32_t sizeVmSpadStoreMixV2HardAes = codeVmSpadStoreMixV2HardAesEnd - codeVmSpadStoreMixV2HardAes; + static const int32_t sizeVmSpadStoreMixV2SoftAes = codeVmSpadStoreMixV2SoftAesEnd - codeVmSpadStoreMixV2SoftAes; static const int32_t offsetConstantLutFprcToFpscr = codeConstantLutFprcToFpscr - codeConstants; @@ -525,7 +528,7 @@ namespace randomx { constexpr size_t ReciprocalPoolSize = 8 * RANDOMX_PROGRAM_MAX_SIZE; // RANDOMX_PROGRAM_MAX_SIZE 64-bit reciprocals static const size_t ReciprocalPoolPos = sizeConstants + 16; // Add 16 bytes for the Group E OR vector mask static const size_t ConstantPoolSize = alignSize(sizeConstants + 16 + ReciprocalPoolSize, CodeAlign); // Add 16 bytes for the Group E OR vector mask - static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStorePrologue + sizeVmSpadStoreMixV2HardAes + sizeVmSpadStoreEpilogue, CodeAlign); + static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStorePrologue + sizeVmSpadStoreMixV2SoftAes + sizeVmSpadStoreEpilogue, CodeAlign); constexpr size_t MaxRandomXInstrCodeSize = 4*10; // FDIV_M requires at most 10 instructions constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 
instructions static const size_t SuperscalarProgramHeaders = sizeSshashSingleItemPrologue + sizeSshashSingleItemEpilogue; @@ -816,13 +819,13 @@ namespace randomx { state.emit(codeVmSpadStorePrologue, sizeVmSpadStorePrologue); if (flags & RANDOMX_FLAG_V2) { - if (true || (flags & RANDOMX_FLAG_HARD_AES)) { // TODO: Remove the "true" once software AES is working + if (flags & RANDOMX_FLAG_HARD_AES) { if (!randomx::cpu.hasAes()) { throw std::runtime_error("This CPU is missing support for hardware AES!"); } state.emit(codeVmSpadStoreMixV2HardAes, sizeVmSpadStoreMixV2HardAes); } else { - throw std::runtime_error("Software AES is not yet implemented for PPC64!"); + state.emit(codeVmSpadStoreMixV2SoftAes, sizeVmSpadStoreMixV2SoftAes); } } else { state.emit(codeVmSpadStoreMixV1, sizeVmSpadStoreMixV1); diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index fb8d5502..dee9bb28 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -99,6 +99,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .global randomx_ppc64_vm_spad_store_epilogue_end .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end + .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes + .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end // Macro to store a VR containing a RandomX Group F/E/A register to memory .macro STORE_LE_VR vr_src, vr_temp, offset_reg, base_reg @@ -185,6 +187,65 @@ constant_vector_be_permutation_mask: .byte 3, 2, 1, 0, 3, 2, 1, 0 #endif +constant_vector_soft_aes_galois_field_inversion_lo: + .octa 0xf001080d0f06050e020c0b0a09030704 +constant_vector_soft_aes_galois_field_inversion_hi: + .octa 0xf0070b0f060a0401090805020c0e0d03 +constant_vector_soft_aes_mixcolumns_forward: + .octa 0x03000102070405060b08090a0f0c0d0e +constant_vector_soft_aes_mixcolumns_backward: + .octa 0x0102030005060704090a0b080d0e0f0c + +constant_vector_soft_aes_shiftrows: + .octa 0x04090e03080d02070c01060b00050a0f +constant_vector_soft_aes_encrypt_input_transform_lo: + .octa 0x00702a5a98e8b2c20878225290e0baca +constant_vector_soft_aes_encrypt_input_transform_hi: + .octa 0x004d7c317d30014c81ccfdb0fcb180cd +constant_vector_soft_aes_subbytes_mul1_lo: + .octa 0x0023e2fa15d41836efd92e0dc1ccf73b +constant_vector_soft_aes_subbytes_mul1_hi: + .octa 0x003e50cb8fe19bb144f52a146e7adfa5 +constant_vector_soft_aes_subbytes_mul2_lo: + .octa 0x0029e10a4088eb694a2382abc863a1c2 +constant_vector_soft_aes_subbytes_mul2_hi: + .octa 0x0024710bc6937ae2cd2f98bc55e9b75e +constant_vector_soft_aes_encrypt_63: + .octa 0x5b5b5b5b5b5b5b5b5b5b5b5b5b5b5b5b +constant_vector_soft_aes_encrypt_output_transform_lo: + .octa 0x0060b6d629499fff0868bede214197f7 +constant_vector_soft_aes_encrypt_output_transform_hi: + .octa 0x00ecbc5051bded01e00c5cb0b15d0de1 + +constant_vector_soft_aes_invshiftrows: + .octa 0x0c090603000d0a0704010e0b0805020f +constant_vector_soft_aes_decrypt_input_transform_lo: + .octa 0x005f540b045b500f1a454e111e414a15 +constant_vector_soft_aes_decrypt_input_transform_hi: + .octa 0x00650560e683e38694f191f472177712 +constant_vector_soft_aes_invsubbytes_mul9_lo: + .octa 0x00d6869a53031c85c94c994f501fd5ca +constant_vector_soft_aes_invsubbytes_mul9_hi: + .octa 0x0049d7ec89173bc065a5fbb29e2c5e72 +constant_vector_soft_aes_invsubbytes_mulD_lo: + .octa 0x00a2b1e6dfcc577d39442a88139b6ef5 +constant_vector_soft_aes_invsubbytes_mulD_hi: + .octa 0x00cbc624f7fae23cd3efde150d183129 +constant_vector_soft_aes_invsubbytes_mulB_lo: + .octa 
0x0042b496926422d004d4f2b0f6462660 +constant_vector_soft_aes_invsubbytes_mulB_hi: + .octa 0x006759cda69894c16baa55323e0cfff3 +constant_vector_soft_aes_invsubbytes_mulE_lo: + .octa 0x00d0d4269692f246b0f6b46404604222 +constant_vector_soft_aes_invsubbytes_mulE_hi: + .octa 0x00c1aaffcda6550c323e59986bf36794 +constant_vector_soft_aes_decrypt_63: + .octa 0xe8e8e8e8e8e8e8e8e8e8e8e8e8e8e8e8 +constant_vector_soft_aes_decrypt_output_transform_lo: + .octa 0x0024dffb0420dbfff8dc2703fcd82307 +constant_vector_soft_aes_decrypt_output_transform_hi: + .octa 0x002f19362906301fab84b29d82ad9bb4 + randomx_ppc64_constants_end: literal_vector_group_e_or_mask: @@ -827,6 +888,518 @@ randomx_ppc64_vm_spad_store_mix_v2_hard_aes: randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end: +// The following software AES code is based heavily on public-domain work by +// Mike Hamburg of Stanford University. More information on that work can be +// found here: https://crypto.stanford.edu/vpaes/ +// +// This port of that code is not particularly well-optimized, partly because I +// didn't really understand all the math behind it, and partly because I don't +// yet have a POWER7 system to benchmark on. Possible areas for improvement +// include: +// +// - Folding constants into other constants (to do this would require +// understanding the math behind this algorithm). +// - Using functions and loops instead of unrolling everything with macros. +// - Using VSX instructions to move values that are exclusively used in XOR +// operations to vs0-vs31 in order to reduce the number of registers we need +// to save to the stack. +// - Using VSX loads to load constants into vs0-vs31 just once at the start of +// VM execution, then moving those constants into vector registers with xxlor +// during software AES. Might be faster if it means we can avoid having to +// save vector registers to the stack. Might also be slower if loading a +// series of constants from d-cache is faster than a bunch of xxlor +// operations to move them from vs0-vs31 into the vector registers. +// - Restoring the overwritten constant registers (v15-v19) from the constant +// pool instead of the stack. +// - Ordering the software AES constants in the constant pool based on the order +// they get loaded in, which could help with prefetching and reduce load +// latency. +// - Loading the zero/4/0x0F vectors from the constant pool might be faster than +// generating them with vector instructions. +// - Reordering instructions to keep the vector pipeline full and avoid false +// dependencies. + +// Macro: VPAES_TRANSFORM +// Converts standard bytes to custom basis (using Lk_ipt tables) +// OR custom basis back to standard bytes (using Lk_opt tables). 
+// +// Arguments: +// v_out : Destination vector register +// v_in : Source vector register (can be same as v_out) +// v_tmp : Temporary vector register +// v_tab_lo : Vector register loaded with the low table (Lk_ipt lo / Lk_opt lo) +// v_tab_hi : Vector register loaded with the high table (Lk_ipt hi / Lk_opt hi) +// v_splat4 : Vector register pre-loaded with byte values of 0x04 (for shifting) + +.macro VPAES_TRANSFORM v_out, v_in, v_tmp, v_tab_lo, v_tab_hi, v_splat4 + // Shift right by 4 to isolate the high nybbles into v_tmp + vsrb \v_tmp, \v_in, \v_splat4 + + // Lookup the low nybbles (vperm ignores the upper bits of the index) + vperm \v_out, \v_tab_lo, \v_tab_lo, \v_in + + // Lookup the high nybbles + vperm \v_tmp, \v_tab_hi, \v_tab_hi, \v_tmp + + // Combine the results + vxor \v_out, \v_out, \v_tmp +.endm + +// Macro: VPAES_INVERSION +// Performs Galois Field inversion in the custom composite field basis. +// +// Arguments: +// v_io : Output vector 1 (io) +// v_jo : Output vector 2 (jo) +// v_in : Input vector (state in custom basis) +// v_invlo : Lk_inv low table (first 16 bytes) +// v_invhi : Lk_inv high table (second 16 bytes) +// v_splat4 : Vector pre-loaded with 0x04 (for shifting) +// v_splat0f : Vector pre-loaded with 0x0F (for masking) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector +// v_tmp3 : Temporary vector + +.macro VPAES_INVERSION v_io, v_jo, v_in, v_invlo, v_invhi, v_splat4, v_splat0f, v_zero, v_tmp1, v_tmp2, v_tmp3 + // v_tmp1 = i (high nybbles) + vsrb \v_tmp1, \v_in, \v_splat4 + + // v_tmp3 = a/k + vperm \v_tmp3, \v_invhi, \v_invhi, \v_in + + // v_tmp2 = j (low nybbles) + vxor \v_tmp2, \v_in, \v_tmp1 + + // v_io = 1/i + vperm \v_io, \v_invlo, \v_invlo, \v_tmp1 + + // v_jo = 1/j + vperm \v_jo, \v_invlo, \v_invlo, \v_tmp2 + + // mask j with 0x0F + vand \v_tmp2, \v_tmp2, \v_splat0f + + // iak = 1/i + a/k + vxor \v_io, \v_io, \v_tmp3 + + // jak = 1/j + a/k + vxor \v_jo, \v_jo, \v_tmp3 + + // 1/iak (Note: v_zero is used for the second half of the table) + vperm \v_io, \v_invlo, \v_zero, \v_io + + // 1/jak + vperm \v_jo, \v_invlo, \v_zero, \v_jo + + // io = 1/iak + j + vxor \v_io, \v_io, \v_tmp2 + + // jo = 1/jak + i + vxor \v_jo, \v_jo, \v_tmp1 +.endm + +// Macro: VPAES_SB_MC +// Performs combined SubBytes affine transform and MixColumns. +// Output remains in the custom composite field basis. +// +// Arguments: +// v_out : Output vector (custom basis) +// v_io : Input vector 1 (inverted high nybbles from VPAES_INVERSION) +// v_jo : Input vector 2 (inverted low nybbles from VPAES_INVERSION) +// v_sb1u : Lk_sb1 low table (first 16 bytes) +// v_sb1t : Lk_sb1 high table (second 16 bytes) +// v_sb2u : Lk_sb2 low table (first 16 bytes) +// v_sb2t : Lk_sb2 high table (second 16 bytes) +// v_mc_fwd : Lk_mc_forward base table (first 16 bytes) +// v_mc_bwd : Lk_mc_backward base table (first 16 bytes) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector +// v_tmp3 : Temporary vector + +.macro VPAES_SB_MC v_out, v_io, v_jo, v_sb1u, v_sb1t, v_sb2u, v_sb2t, v_mc_fwd, v_mc_bwd, v_zero, v_tmp1, v_tmp2, v_tmp3 + // 1. Calculate A = sb1u(jo) ^ sb1t(io) + vperm \v_tmp1, \v_sb1u, \v_zero, \v_jo + vperm \v_out, \v_sb1t, \v_zero, \v_io + vxor \v_out, \v_out, \v_tmp1 // v_out = A + + // 2. Calculate 2A = sb2u(jo) ^ sb2t(io) + vperm \v_tmp2, \v_sb2u, \v_zero, \v_jo + vperm \v_tmp3, \v_sb2t, \v_zero, \v_io + vxor \v_tmp2, \v_tmp2, \v_tmp3 // v_tmp2 = 2A + + // 3. 
Calculate B = rot(A, 1) and D = rot(A, 3) + vperm \v_tmp1, \v_out, \v_zero, \v_mc_fwd // v_tmp1 = B + vperm \v_tmp3, \v_out, \v_zero, \v_mc_bwd // v_tmp3 = D + + // 4. Calculate 2A + B + vxor \v_tmp1, \v_tmp1, \v_tmp2 // v_tmp1 = 2A + B + + // 5. Calculate 2B + C = rot(2A + B, 1) + vperm \v_tmp2, \v_tmp1, \v_zero, \v_mc_fwd // v_tmp2 = 2B + C + + // 6. Calculate 2A + B + D + vxor \v_out, \v_tmp1, \v_tmp3 // v_out = 2A + B + D + + // 7. Final Result = (2A + B + D) ^ (2B + C) + vxor \v_out, \v_out, \v_tmp2 +.endm + +// Macro: VPAES_INVSB_INVMC +// Performs combined InvSubBytes affine transform and InvMixColumns on the inverted custom basis state. +// +// Arguments: +// v_out : Output vector (custom basis) +// v_io : Input vector 1 (inverted high nybbles from VPAES_INVERSION) +// v_jo : Input vector 2 (inverted low nybbles from VPAES_INVERSION) +// v_sb9u, v_sb9t : Lk_dsb9 tables (low and high) +// v_sbdu, v_sbdt : Lk_dsbd tables (low and high) +// v_sbbu, v_sbbt : Lk_dsbb tables (low and high) +// v_sbeu, v_sbet : Lk_dsbe tables (low and high) +// v_mc_fwd : Lk_mc_forward base table (first 16 bytes) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector + +.macro VPAES_INVSB_INVMC v_out, v_io, v_jo, v_sb9u, v_sb9t, v_sbdu, v_sbdt, v_sbbu, v_sbbt, v_sbeu, v_sbet, v_mc_fwd, v_zero, v_tmp1, v_tmp2 + // 1. Multiply by 0x09 + vperm \v_tmp1, \v_sb9u, \v_zero, \v_io + vperm \v_tmp2, \v_sb9t, \v_zero, \v_jo + vxor \v_out, \v_tmp1, \v_tmp2 // Acc = 0x09 * State + + // 2. Rotate and add 0x0D + vperm \v_tmp1, \v_sbdu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbdt, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0D * State) + + // 3. Rotate and add 0x0B + vperm \v_tmp1, \v_sbbu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbbt, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0B * State) + + // 4. Rotate and add 0x0E + vperm \v_tmp1, \v_sbeu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbet, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0E * State) +.endm + +// Register allocations: Software AES +// +// v0-v3 / vs32-vs35 (in/out / volatile) -> RandomX floating point registers f0-f3 +// v4-v7 / vs36-vs39 (input / non-volatile) -> RandomX floating point registers e0-e3 +// v8 / vs40 (non-volatile) -> io (must reload: RandomX floating point registers a0) +// v9 / vs41 (non-volatile) -> jo (must reload: RandomX floating point registers a1) +// v10 / vs42 (non-volatile) -> 0x5b... / 0xe8... 
(must reload: RandomX floating point registers a2) +// v11 / vs43 (non-volatile) -> invsubbytes_mulE_hi (must reload: RandomX floating point registers a3) +// v12-v14 / vs44-vs46 (volatile) -> scratch registers +// v15 / vs47 (non-volatile) -> shiftrows / invshiftrows (must reload: constant_vector_be_byte_reverse_mask) +// v16 / vs48 (non-volatile) -> enc/dec input/output transform lo (must reload (BE only): constant_vector_be_permutation_mask) +// v17 / vs49 (non-volatile) -> enc/dec input/output transform hi (must reload: constant_vector_group_e_and_mask) +// v18 / vs50 (non-volatile) -> mixcolumns forward (must reload: constant_vector_fscal_xor_mask) +// v19 / vs51 (non-volatile) -> mixcolumns backward / invsubbytes_mulE_lo (must reload: literal_vector_group_e_or_mask) +// v20 / vs52 (non-volatile) -> Zero +// v21 / vs53 (non-volatile) -> Shift amount (4) +// v22 / vs54 (non-volatile) -> Low nybble mask (0x0F) +// v23 / vs55 (non-volatile) -> Transformed round key +// v24 / vs56 (non-volatile) -> galois_field_inversion_lo +// v25 / vs57 (non-volatile) -> galois_field_inversion_hi +// v26 / vs58 (non-volatile) -> subbytes_mul1_lo / invsubbytes_mul9_lo +// v27 / vs59 (non-volatile) -> subbytes_mul1_hi / invsubbytes_mul9_hi +// v28 / vs60 (non-volatile) -> subbytes_mul2_lo / invsubbytes_mulD_lo +// v29 / vs61 (non-volatile) -> subbytes_mul2_lo / invsubbytes_mulD_hi +// v30 / vs62 (non-volatile) -> invsubbytes_mulB_lo +// v31 / vs63 (non-volatile) -> invsubbytes_mulB_hi + +randomx_ppc64_vm_spad_store_mix_v2_soft_aes: + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + + // Save v8-v11 and v15-v31 to the stack + addi %r6, %r1, -(16 * 21) + stvx %v8, 0, %r6 + li %r7, 16*1 + stvx %v9, %r7, %r6 + li %r8, 16*2 + stvx %v10, %r8, %r6 + li %r9, 16*3 + stvx %v11, %r9, %r6 + li %r10, 16*4 + stvx %v15, %r10, %r6 + li %r11, 16*5 + stvx %v16, %r11, %r6 + li %r12, 16*6 + stvx %v17, %r12, %r6 + li %r7, 16*7 + stvx %v18, %r7, %r6 + li %r8, 16*8 + stvx %v19, %r8, %r6 + li %r9, 16*9 + stvx %v20, %r9, %r6 + li %r10, 16*10 + stvx %v21, %r10, %r6 + li %r11, 16*11 + stvx %v22, %r11, %r6 + li %r12, 16*12 + stvx %v23, %r12, %r6 + li %r7, 16*13 + stvx %v24, %r7, %r6 + li %r8, 16*14 + stvx %v25, %r8, %r6 + li %r9, 16*15 + stvx %v26, %r9, %r6 + li %r10, 16*16 + stvx %v27, %r10, %r6 + li %r11, 16*17 + stvx %v28, %r11, %r6 + li %r12, 16*18 + stvx %v29, %r12, %r6 + li %r7, 16*19 + stvx %v30, %r7, %r6 + li %r8, 16*20 + stvx %v31, %r8, %r6 + + // Zero vector v20 + vxor %v20, %v20, %v20 + + // Splat the shift amount to v21 + vspltisb %v21, 4 + + // Splat the low nybble mask to v22 + vspltisb %v22, 0x0F + + // Load initial encryption constants + li %r6, constant_vector_soft_aes_encrypt_input_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_encrypt_input_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + li %r8, constant_vector_soft_aes_mixcolumns_forward-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_mixcolumns_backward-randomx_ppc64_constants + lvx %v18, %r8, %r2 + lvx %v19, %r9, %r2 + li %r6, constant_vector_soft_aes_galois_field_inversion_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_galois_field_inversion_hi-randomx_ppc64_constants + lvx %v24, %r6, %r2 + lvx %v25, %r7, %r2 + li %r8, constant_vector_soft_aes_subbytes_mul1_lo-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_subbytes_mul1_hi-randomx_ppc64_constants + lvx %v26, %r8, %r2 + lvx %v27, %r9, %r2 + li %r6, 
constant_vector_soft_aes_subbytes_mul2_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_subbytes_mul2_hi-randomx_ppc64_constants + lvx %v28, %r6, %r2 + lvx %v29, %r7, %r2 + li %r8, constant_vector_soft_aes_shiftrows-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_encrypt_63-randomx_ppc64_constants + lvx %v15, %r8, %r2 + lvx %v10, %r9, %r2 + + // Transform inputs to composite field representation + VPAES_TRANSFORM %v0, %v0, %v12, %v16, %v17, %v21 + VPAES_TRANSFORM %v2, %v2, %v12, %v16, %v17, %v21 + + // Round 0 (key v4) + VPAES_TRANSFORM %v23, %v4, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 1 (key v5) + VPAES_TRANSFORM %v23, %v5, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 2 (key v6) + VPAES_TRANSFORM %v23, %v6, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 3 (key v7) + VPAES_TRANSFORM %v23, %v7, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Load encryption output transform constants + li %r6, constant_vector_soft_aes_encrypt_output_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_encrypt_output_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + + // Transform output from composite field representation + VPAES_TRANSFORM %v0, %v0, %v12, %v16, %v17, %v21 + VPAES_TRANSFORM %v2, %v2, %v12, %v16, %v17, %v21 + + // Load initial decryption constants + li %r6, constant_vector_soft_aes_decrypt_input_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_decrypt_input_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + li %r8, constant_vector_soft_aes_invsubbytes_mul9_lo-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_invsubbytes_mul9_hi-randomx_ppc64_constants + lvx %v26, %r8, %r2 + lvx %v27, %r9, %r2 + 
li %r6, constant_vector_soft_aes_invsubbytes_mulD_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_invsubbytes_mulD_hi-randomx_ppc64_constants + lvx %v28, %r6, %r2 + lvx %v29, %r7, %r2 + li %r8, constant_vector_soft_aes_invsubbytes_mulB_lo-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_invsubbytes_mulB_hi-randomx_ppc64_constants + lvx %v30, %r8, %r2 + lvx %v31, %r9, %r2 + li %r6, constant_vector_soft_aes_invsubbytes_mulE_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_invsubbytes_mulE_hi-randomx_ppc64_constants + lvx %v19, %r6, %r2 + lvx %v11, %r7, %r2 + li %r8, constant_vector_soft_aes_invshiftrows-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_decrypt_63-randomx_ppc64_constants + lvx %v15, %r8, %r2 + lvx %v10, %r9, %r2 + + // Transform inputs to composite field representation + VPAES_TRANSFORM %v1, %v1, %v12, %v16, %v17, %v21 + VPAES_TRANSFORM %v3, %v3, %v12, %v16, %v17, %v21 + + // Round 0 (key v4) + VPAES_TRANSFORM %v23, %v4, %v12, %v16, %v17, %v21 + vperm %v1, %v1, %v1, %v15 + vxor %v1, %v1, %v10 + VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13 + vxor %v1, %v1, %v23 + vperm %v3, %v3, %v3, %v15 + vxor %v3, %v3, %v10 + VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13 + vxor %v3, %v3, %v23 + + // Round 1 (key v5) + VPAES_TRANSFORM %v23, %v5, %v12, %v16, %v17, %v21 + vperm %v1, %v1, %v1, %v15 + vxor %v1, %v1, %v10 + VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13 + vxor %v1, %v1, %v23 + vperm %v3, %v3, %v3, %v15 + vxor %v3, %v3, %v10 + VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13 + vxor %v3, %v3, %v23 + + // Round 2 (key v6) + VPAES_TRANSFORM %v23, %v6, %v12, %v16, %v17, %v21 + vperm %v1, %v1, %v1, %v15 + vxor %v1, %v1, %v10 + VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13 + vxor %v1, %v1, %v23 + vperm %v3, %v3, %v3, %v15 + vxor %v3, %v3, %v10 + VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13 + vxor %v3, %v3, %v23 + + // Round 3 (key v7) + VPAES_TRANSFORM %v23, %v7, %v12, %v16, %v17, %v21 + vperm %v1, %v1, %v1, %v15 + vxor %v1, %v1, %v10 + VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13 + vxor %v1, %v1, %v23 + vperm %v3, %v3, %v3, %v15 + vxor %v3, %v3, %v10 + VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13 + vxor %v3, %v3, %v23 + + // Load decryption output transform constants + li %r6, constant_vector_soft_aes_decrypt_output_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_decrypt_output_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 
+ lvx %v17, %r7, %r2 + + // Transform output from composite field representation + VPAES_TRANSFORM %v1, %v1, %v12, %v16, %v17, %v21 + VPAES_TRANSFORM %v3, %v3, %v12, %v16, %v17, %v21 + + // Restore v8-v11 and v15-v31 from the stack + addi %r6, %r1, -(16 * 21) + lvx %v8, 0, %r6 + li %r7, 16*1 + lvx %v9, %r7, %r6 + li %r8, 16*2 + lvx %v10, %r8, %r6 + li %r9, 16*3 + lvx %v11, %r9, %r6 + li %r10, 16*4 + lvx %v15, %r10, %r6 + li %r11, 16*5 + lvx %v16, %r11, %r6 + li %r12, 16*6 + lvx %v17, %r12, %r6 + li %r7, 16*7 + lvx %v18, %r7, %r6 + li %r8, 16*8 + lvx %v19, %r8, %r6 + li %r9, 16*9 + lvx %v20, %r9, %r6 + li %r10, 16*10 + lvx %v21, %r10, %r6 + li %r11, 16*11 + lvx %v22, %r11, %r6 + li %r12, 16*12 + lvx %v23, %r12, %r6 + li %r7, 16*13 + lvx %v24, %r7, %r6 + li %r8, 16*14 + lvx %v25, %r8, %r6 + li %r9, 16*15 + lvx %v26, %r9, %r6 + li %r10, 16*16 + lvx %v27, %r10, %r6 + li %r11, 16*17 + lvx %v28, %r11, %r6 + li %r12, 16*18 + lvx %v29, %r12, %r6 + li %r7, 16*19 + lvx %v30, %r7, %r6 + li %r8, 16*20 + lvx %v31, %r8, %r6 + +randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end: + + .section ".text" C_FUNCTION(randomx_reciprocal_fast) cntlzd %r4, %r3 // r4 = 63 - k (count leading zeros) diff --git a/src/jit_compiler_ppc64_static.hpp b/src/jit_compiler_ppc64_static.hpp index 8b2bff93..d99c4529 100644 --- a/src/jit_compiler_ppc64_static.hpp +++ b/src/jit_compiler_ppc64_static.hpp @@ -66,4 +66,6 @@ extern "C" { void randomx_ppc64_vm_spad_store_epilogue_end(); void randomx_ppc64_vm_spad_store_mix_v2_hard_aes(); void randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end(); + void randomx_ppc64_vm_spad_store_mix_v2_soft_aes(); + void randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end(); } From 9a77acfb2c8a2bfb339d91a9de127f4d3cc73e30 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Mon, 6 Apr 2026 15:04:17 -0500 Subject: [PATCH 26/50] Avoid dependency on Linux kernel headers We only need these two definitions, so we can avoid adding a new dependency by copying them from the headers. --- src/cpu.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index 6800e8ca..3faa0f45 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -49,7 +49,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__) #include - #include + // From asm/cputable.h: + #ifndef PPC_FEATURE2_VEC_CRYPTO + #define PPC_FEATURE2_VEC_CRYPTO 0x02000000 + #endif + #ifndef PPC_FEATURE2_ARCH_3_00 + #define PPC_FEATURE2_ARCH_3_00 0x00800000 + #endif #endif #ifdef __riscv From 62e945700dffcc0c656bfcc8464f0c06a2b7a250 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Thu, 16 Apr 2026 23:07:34 -0500 Subject: [PATCH 27/50] Use round-robin temporary register allocator in PPC64 JIT compiler This gives us an extra ~2% performance on RandomX V1 and V2. --- src/jit_compiler_ppc64.cpp | 165 +++++++++++++++++++++++-------------- src/jit_compiler_ppc64.hpp | 6 ++ 2 files changed, 107 insertions(+), 64 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 7353527f..868db377 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "jit_compiler_ppc64.hpp" namespace { -#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i, randomx_flags flags +#define HANDLER_ARGS randomx::JitCompilerPPC64* jit, randomx::CompilerState& state, randomx::Instruction isn, int i, randomx_flags flags using InstructionHandler = void(HANDLER_ARGS); extern InstructionHandler* opcodeMap1[256]; } @@ -680,13 +680,13 @@ namespace randomx { } } - static void emitAddImm32(CompilerState& state, int dstReg, int srcReg, uint32_t imm) { + static void emitAddImm32(CompilerState& state, uint32_t tmpReg, int dstReg, int srcReg, uint32_t imm) { int32_t simm = (int32_t)imm; if (simm >= -32768 && simm <= 32767) { state.emit(PPC64::addi(dstReg, srcReg, simm & 0xFFFF)); } else { - emitMovImm32(state, 8, imm); - state.emit(PPC64::add(dstReg, srcReg, 8)); + emitMovImm32(state, tmpReg, imm); + state.emit(PPC64::add(dstReg, srcReg, tmpReg)); } } @@ -744,14 +744,13 @@ namespace randomx { } } - template - static void emitLoadGprFromScratchpad(CompilerState& state, uint32_t dst, uint32_t src, Instruction& instr) { + static void emitLoadGprFromScratchpad(CompilerState& state, uint32_t tmp_gpr, uint32_t dst, uint32_t src, Instruction& instr) { uint32_t imm = instr.getImm32(); if (src != dst) { uint32_t size = instr.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2; imm &= size - 1; - emitAddImm32(state, tmp_gpr, src, imm); + emitAddImm32(state, tmp_gpr, tmp_gpr, src, imm); uint32_t mb = 32 - Log2(size); state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28)); @@ -763,19 +762,32 @@ namespace randomx { emitLoadGpr64(state, tmp_gpr, ScratchpadPointerGPR30, tmp_gpr); } - template - static void emitLoadVsrFromScratchpad(CompilerState& state, Instruction& instr) { + static void emitLoadVsrFromScratchpad(CompilerState& state, uint32_t tmp_gpr, uint32_t tmp_vr, Instruction& instr) { int src = RegisterMapR.getPpcGprNum(instr.src); uint32_t imm = instr.getImm32(); uint32_t size = instr.getModMem() ? 
RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2; imm &= size - 1; - emitAddImm32(state, 8, src, imm); + emitAddImm32(state, tmp_gpr, tmp_gpr, src, imm); uint32_t mb = 32 - Log2(size); - state.emit(PPC64::rlwinm(8, 8, 0, mb, 28)); + state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28)); - emitLoadVr64(state, tmp_vr, ScratchpadPointerGPR30, 8); + emitLoadVr64(state, tmp_vr, ScratchpadPointerGPR30, tmp_gpr); + } + + uint32_t JitCompilerPPC64::getTempGpr() { + static const uint32_t gprs[] = {6, 7, 8, 9, 10, 11, 12}; + uint32_t reg = gprs[tempGprIndex]; + tempGprIndex = (tempGprIndex + 1) % 7; + return reg; + } + + uint32_t JitCompilerPPC64::getTempVr() { + static const uint32_t vrs[] = {12, 13, 14}; + uint32_t reg = vrs[tempVrIndex]; + tempVrIndex = (tempVrIndex + 1) % 3; + return reg; } void JitCompilerPPC64::emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags) { @@ -811,7 +823,7 @@ namespace randomx { instr.src %= RegistersCount; instr.dst %= RegistersCount; state.instructionOffsets[i] = state.codePos; - opcodeMap1[instr.opcode](state, instr, i, flags); + opcodeMap1[instr.opcode](this, state, instr, i, flags); } } @@ -966,7 +978,7 @@ namespace randomx { state.emit(PPC64::and_(5, mtReg, 8)); // r5 = mt & datasetMask state.emit(PPC64::srdi(5, 5, Log2(CacheLineSize))); // r5 = r5 >> 6 - emitAddImm32(state, 5, 5, datasetOffset / CacheLineSize); + emitAddImm32(state, 8, 5, 5, datasetOffset / CacheLineSize); int32_t callPos = state.codePos + offsetVmDataReadLightFixCall; state.emit(codeVmDataReadLight, sizeVmDataReadLight); @@ -1084,22 +1096,24 @@ namespace randomx { int shift = isn.getModShift(); if (shift) { - state.emit(PPC64::sldi(8, src, shift)); - state.emit(PPC64::add(dst, dst, 8)); + uint32_t tmp_gpr = jit->getTempGpr(); + state.emit(PPC64::sldi(tmp_gpr, src, shift)); + state.emit(PPC64::add(dst, dst, tmp_gpr)); } else { state.emit(PPC64::add(dst, dst, src)); } if (isn.dst == RegisterNeedsDisplacement) { - emitAddImm32(state, dst, dst, isn.getImm32()); + emitAddImm32(state, jit->getTempGpr(), dst, dst, isn.getImm32()); } } static void h_IADD_M(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); int src = RegisterMapR.getPpcGprNum(isn.src); - emitLoadGprFromScratchpad<8>(state, dst, src, isn); - state.emit(PPC64::add(dst, dst, 8)); + uint32_t tmp_gpr = jit->getTempGpr(); + emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn); + state.emit(PPC64::add(dst, dst, tmp_gpr)); } static void h_ISUB_R(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; @@ -1109,15 +1123,16 @@ namespace randomx { state.emit(PPC64::subf(dst, src, dst)); } else { int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); - emitAddImm32(state, dst, dst, imm); + emitAddImm32(state, jit->getTempGpr(), dst, dst, imm); } } static void h_ISUB_M(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); int src = RegisterMapR.getPpcGprNum(isn.src); - emitLoadGprFromScratchpad<8>(state, dst, src, isn); - state.emit(PPC64::subf(dst, 8, dst)); + uint32_t tmp_gpr = jit->getTempGpr(); + emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn); + state.emit(PPC64::subf(dst, tmp_gpr, dst)); } static void h_IMUL_R(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; @@ -1126,16 +1141,18 @@ namespace randomx { int src = RegisterMapR.getPpcGprNum(isn.src); state.emit(PPC64::mulld(dst, dst, src)); } else { - emitMovImm32(state, 8, isn.getImm32()); - state.emit(PPC64::mulld(dst, dst, 8)); + uint32_t tmp_gpr = 
jit->getTempGpr(); + emitMovImm32(state, tmp_gpr, isn.getImm32()); + state.emit(PPC64::mulld(dst, dst, tmp_gpr)); } } static void h_IMUL_M(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); int src = RegisterMapR.getPpcGprNum(isn.src); - emitLoadGprFromScratchpad<8>(state, dst, src, isn); - state.emit(PPC64::mulld(dst, dst, 8)); + uint32_t tmp_gpr = jit->getTempGpr(); + emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn); + state.emit(PPC64::mulld(dst, dst, tmp_gpr)); } static void h_IMULH_R(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; @@ -1147,8 +1164,9 @@ namespace randomx { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); int src = RegisterMapR.getPpcGprNum(isn.src); - emitLoadGprFromScratchpad<8>(state, dst, src, isn); - state.emit(PPC64::mulhdu(dst, dst, 8)); + uint32_t tmp_gpr = jit->getTempGpr(); + emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn); + state.emit(PPC64::mulhdu(dst, dst, tmp_gpr)); } static void h_ISMULH_R(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; @@ -1160,22 +1178,24 @@ namespace randomx { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); int src = RegisterMapR.getPpcGprNum(isn.src); - emitLoadGprFromScratchpad<8>(state, dst, src, isn); - state.emit(PPC64::mulhd(dst, dst, 8)); + uint32_t tmp_gpr = jit->getTempGpr(); + emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn); + state.emit(PPC64::mulhd(dst, dst, tmp_gpr)); } static void h_IMUL_RCP(HANDLER_ARGS) { uint32_t divisor = isn.getImm32(); if (!isZeroOrPowerOf2(divisor)) { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); + uint32_t tmp_gpr = jit->getTempGpr(); // Calculate and cache the reciprocal int32_t offset = ReciprocalPoolPos + 8 * state.rcpCount++; uint64_t rcp = randomx_reciprocal_fast(divisor); state.emitAt(offset, rcp); - state.emit(PPC64::ld(8, ConstantsBaseAddressRegisterGPR2, offset)); - state.emit(PPC64::mulld(dst, dst, 8)); + state.emit(PPC64::ld(tmp_gpr, ConstantsBaseAddressRegisterGPR2, offset)); + state.emit(PPC64::mulld(dst, dst, tmp_gpr)); } } static void h_INEG_R(HANDLER_ARGS) { @@ -1190,24 +1210,27 @@ namespace randomx { int src = RegisterMapR.getPpcGprNum(isn.src); state.emit(PPC64::xor_(dst, dst, src)); } else { - emitMovImm32(state, 8, isn.getImm32()); - state.emit(PPC64::xor_(dst, dst, 8)); + uint32_t tmp_gpr = jit->getTempGpr(); + emitMovImm32(state, tmp_gpr, isn.getImm32()); + state.emit(PPC64::xor_(dst, dst, tmp_gpr)); } } static void h_IXOR_M(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); int src = RegisterMapR.getPpcGprNum(isn.src); - emitLoadGprFromScratchpad<8>(state, dst, src, isn); - state.emit(PPC64::xor_(dst, dst, 8)); + uint32_t tmp_gpr = jit->getTempGpr(); + emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn); + state.emit(PPC64::xor_(dst, dst, tmp_gpr)); } static void h_IROR_R(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); if (isn.src != isn.dst) { int src = RegisterMapR.getPpcGprNum(isn.src); - state.emit(PPC64::neg(8, src)); - state.emit(PPC64::rldcl(dst, dst, 8, 0)); + uint32_t tmp_gpr = jit->getTempGpr(); + state.emit(PPC64::neg(tmp_gpr, src)); + state.emit(PPC64::rldcl(dst, dst, tmp_gpr, 0)); } else { uint32_t imm = isn.getImm32() & 63; if (imm) @@ -1232,9 +1255,10 @@ namespace randomx { state.registerUsage[isn.src] = i; int dst = RegisterMapR.getPpcGprNum(isn.dst); int src = 
RegisterMapR.getPpcGprNum(isn.src); - state.emit(PPC64::mr(8, dst)); + uint32_t tmp_gpr = jit->getTempGpr(); + state.emit(PPC64::mr(tmp_gpr, dst)); state.emit(PPC64::mr(dst, src)); - state.emit(PPC64::mr(src, 8)); + state.emit(PPC64::mr(src, tmp_gpr)); } } static void h_FSWAP_R(HANDLER_ARGS) { @@ -1248,8 +1272,10 @@ namespace randomx { } static void h_FADD_M(HANDLER_ARGS) { int dst = RegisterMapF.getPpcVsrNum(isn.dst); - emitLoadVsrFromScratchpad<12>(state, isn); - state.emit(PPC64::xvadddp(dst, dst, 32 + 12)); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::xvadddp(dst, dst, 32 + tmp_vr)); } static void h_FSUB_R(HANDLER_ARGS) { int dst = RegisterMapF.getPpcVsrNum(isn.dst); @@ -1258,8 +1284,10 @@ namespace randomx { } static void h_FSUB_M(HANDLER_ARGS) { int dst = RegisterMapF.getPpcVsrNum(isn.dst); - emitLoadVsrFromScratchpad<12>(state, isn); - state.emit(PPC64::xvsubdp(dst, dst, 32 + 12)); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::xvsubdp(dst, dst, 32 + tmp_vr)); } static void h_FSCAL_R(HANDLER_ARGS) { int dst = RegisterMapF.getPpcVrNum(isn.dst); @@ -1272,10 +1300,12 @@ namespace randomx { } static void h_FDIV_M(HANDLER_ARGS) { int dst = RegisterMapE.getPpcVsrNum(isn.dst); - emitLoadVsrFromScratchpad<12>(state, isn); - state.emit(PPC64::vand(12, 12, ConstantVectorGroupEAndMaskVR17)); - state.emit(PPC64::vor(12, 12, ConstantVectorGroupEOrMaskVR19)); - state.emit(PPC64::xvdivdp(dst, dst, 32 + 12)); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::vand(tmp_vr, tmp_vr, ConstantVectorGroupEAndMaskVR17)); + state.emit(PPC64::vor(tmp_vr, tmp_vr, ConstantVectorGroupEOrMaskVR19)); + state.emit(PPC64::xvdivdp(dst, dst, 32 + tmp_vr)); } static void h_FSQRT_R(HANDLER_ARGS) { int dst = RegisterMapE.getPpcVsrNum(isn.dst); @@ -1291,13 +1321,13 @@ namespace randomx { imm &= ~(1UL << (shift - 1)); int dst = RegisterMapR.getPpcGprNum(reg); - emitAddImm32(state, dst, dst, imm); + emitAddImm32(state, jit->getTempGpr(), dst, dst, imm); // Calculate the Mask Begin (MB) parameter uint32_t mb = 64 - RANDOMX_JUMP_BITS; - // rldicl. r8, dst, 64 - shift, mb - state.emit(PPC64::rldicl_dot(8, dst, (64 - shift) & 63, mb)); + // rldicl. 
tmp_gpr, dst, 64 - shift, mb + state.emit(PPC64::rldicl_dot(jit->getTempGpr(), dst, (64 - shift) & 63, mb)); int32_t targetPos = state.instructionOffsets[target]; int offset = targetPos - state.codePos; @@ -1323,11 +1353,13 @@ namespace randomx { // Rotate right by rotateBits if (rotateBits) { - // rotrdi r8, src, rotateBits - state.emit(PPC64::rotrdi(8, src, rotateBits)); + uint32_t tmp_gpr = jit->getTempGpr(); - // We rotated src and put the new value in r8 - rot_src = 8; + // rotrdi tmp_gpr, src, rotateBits + state.emit(PPC64::rotrdi(tmp_gpr, src, rotateBits)); + + // We rotated src and put the new value in tmp_gpr + rot_src = tmp_gpr; } int32_t patch_pos = 0; @@ -1343,16 +1375,20 @@ namespace randomx { state.emit(0); // bne skip_update } + uint32_t offset_gpr = jit->getTempGpr(); + // Mask out bits 1:0 and multiply by 8 (shift left by 3) to get the table word offset (0, 8, 16, 24) - // rldic r8, rot_src, 3, 59 - state.emit(PPC64::rldic(8, rot_src, 3, 59)); + // rldic offset_gpr, rot_src, 3, 59 + state.emit(PPC64::rldic(offset_gpr, rot_src, 3, 59)); + + uint32_t address_gpr = jit->getTempGpr(); - // Load table address into scratch GPR0 - emitAddImm32(state, 0, ConstantsBaseAddressRegisterGPR2, offsetConstantLutFprcToFpscr); + // Load table address into scratch address_gpr + emitAddImm32(state, jit->getTempGpr(), address_gpr, ConstantsBaseAddressRegisterGPR2, offsetConstantLutFprcToFpscr); // Load value from fprc-to-FPSCR table into temporary FPR0 - // lfdx f0, r8, r0 - state.emit(PPC64::lfdx(0, 8, 0)); + // lfdx f0, offset_gpr, address_gpr + state.emit(PPC64::lfdx(0, offset_gpr, address_gpr)); if (randomx::cpu.hasV3P0()) { // Move the RN value from scratch FPR0 to FPSCR field RN @@ -1374,6 +1410,7 @@ namespace randomx { int dst = RegisterMapR.getPpcGprNum(isn.dst); int src = RegisterMapR.getPpcGprNum(isn.src); uint32_t imm = isn.getImm32(); + uint32_t tmp_gpr = jit->getTempGpr(); uint32_t size; if (isn.getModCond() < StoreL3Condition) { @@ -1383,12 +1420,12 @@ namespace randomx { } imm &= size - 1; - emitAddImm32(state, 8, dst, imm); + emitAddImm32(state, jit->getTempGpr(), tmp_gpr, dst, imm); uint32_t mb = 32 - Log2(size); - state.emit(PPC64::rlwinm(8, 8, 0, mb, 28)); + state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28)); - emitStoreGpr64(state, src, ScratchpadPointerGPR30, 8); + emitStoreGpr64(state, src, ScratchpadPointerGPR30, tmp_gpr); } static void h_NOP(HANDLER_ARGS) { } diff --git a/src/jit_compiler_ppc64.hpp b/src/jit_compiler_ppc64.hpp index 9107cdc4..b9392d92 100644 --- a/src/jit_compiler_ppc64.hpp +++ b/src/jit_compiler_ppc64.hpp @@ -90,6 +90,9 @@ namespace randomx { void setFlags(randomx_flags f) { flags = f; } + uint32_t getTempGpr(); + uint32_t getTempVr(); + static uint8_t instMap[256]; private: @@ -109,6 +112,9 @@ namespace randomx { int32_t RandomXCodePos; int32_t SshashSingleItemPos; int32_t LoopBeginPos; + + uint32_t tempGprIndex = 0; + uint32_t tempVrIndex = 0; }; } From 087edd164db8d031ee5af14db32e7ee65fe993d3 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Fri, 17 Apr 2026 11:11:13 -0500 Subject: [PATCH 28/50] Fix PPC64 build for musl libc musl doesn't define _SC_LEVEL1_DCACHE_LINESIZE or (I assume) _SC_LEVEL1_ICACHE_LINESIZE, so attempting to use those values with sysconf to get the cache line sizes causes the build to fail on Alpine Linux and other distros that use musl libc. To fix this, we just check if those values are defined and if they're not, we skip querying the cache line size and fall back to the safe default of 32 bytes. 
While we could probably get away with assuming a minimum cache line size of
64 bytes or even 128 bytes [1] for 64-bit systems, bugs from an incompletely
flushed instruction cache can be extremely difficult to catch, so I think
it's best to play it safe and avoid those issues entirely. This will cause a
small performance penalty on systems that use musl libc. That said, if any
users of those systems need this fixed, it would be straightforward to
modify the code to use whatever mechanism those systems normally use to
detect the cache line sizes.

[1]: The lowest version of the Power ISA we target is v2.06, and we also
require VSX. The only chips I'm aware of that meet both of those
requirements are POWER7 and later, and every one of those chips has 128-byte
cache lines.
---
 src/jit_compiler_ppc64.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp
index 868db377..615ae429 100644
--- a/src/jit_compiler_ppc64.cpp
+++ b/src/jit_compiler_ppc64.cpp
@@ -602,9 +602,17 @@ namespace randomx {
 if (len == 0)
 return;

 // Query data and instruction cache line sizes
- long dcache_val = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
- long icache_val = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+ long dcache_val = 0;
+ long icache_val = 0;
+
+#ifdef _SC_LEVEL1_DCACHE_LINESIZE
+ dcache_val = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+#endif
+#ifdef _SC_LEVEL1_ICACHE_LINESIZE
+ icache_val = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+#endif
+
+ // Default to 32 bytes if querying the line size fails
 const size_t d_line_size = (dcache_val > 0) ? dcache_val : 32;
 const size_t i_line_size = (icache_val > 0) ? icache_val : 32;

From 23b22fbd52a4074c01870238b198dbdbbb166793 Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Fri, 24 Apr 2026 10:19:33 -0500
Subject: [PATCH 29/50] Rename the PPC64 byte-reverse mask to better reflect
 its purpose

We don't exclusively use it on big-endian systems anymore, as little-endian
systems need it for the RandomX V2 AES mixing process.
--- src/jit_compiler_ppc64_static.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index dee9bb28..ceb6a3b2 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -175,8 +175,8 @@ constant_vector_fscal_xor_mask: .8byte 0x80F0000000000000 .8byte 0x80F0000000000000 -constant_vector_be_byte_reverse_mask: - // Big-endian vector byte reverse mask +constant_vector_byte_reverse_mask: + // Vector byte reverse mask .8byte 0x0F0E0D0C0B0A0908 .8byte 0x0706050403020100 @@ -498,7 +498,7 @@ randomx_ppc64_sshash_xor_end: // v4-v7 / vs36-vs39 (volatile) -> RandomX floating point registers e0-e3 // v8-v11 / vs40-vs43 (volatile) -> RandomX floating point registers a0-a3 // v12-v14 / vs44-vs46 (volatile) -> scratch registers -// v15 / vs47 (volatile) -> constant_vector_be_byte_reverse_mask +// v15 / vs47 (volatile) -> constant_vector_byte_reverse_mask // v16 / vs48 (volatile) -> constant_vector_be_permutation_mask // v17 / vs49 (volatile) -> constant_vector_group_e_and_mask // v18 / vs50 (volatile) -> constant_vector_fscal_xor_mask @@ -546,7 +546,7 @@ randomx_ppc64_vm_prologue: #if PPC_BIG_ENDIAN li %r11, constant_vector_be_permutation_mask-randomx_ppc64_constants #endif - li %r12, constant_vector_be_byte_reverse_mask-randomx_ppc64_constants + li %r12, constant_vector_byte_reverse_mask-randomx_ppc64_constants lxvd2x %vs49, %r8, %r2 lxvd2x %vs50, %r9, %r2 lvx %v19, %r10, %r2 // Use lvx to load the vector since it's written [ low word, high word ] in memory @@ -1099,7 +1099,7 @@ randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end: // v10 / vs42 (non-volatile) -> 0x5b... / 0xe8... (must reload: RandomX floating point registers a2) // v11 / vs43 (non-volatile) -> invsubbytes_mulE_hi (must reload: RandomX floating point registers a3) // v12-v14 / vs44-vs46 (volatile) -> scratch registers -// v15 / vs47 (non-volatile) -> shiftrows / invshiftrows (must reload: constant_vector_be_byte_reverse_mask) +// v15 / vs47 (non-volatile) -> shiftrows / invshiftrows (must reload: constant_vector_byte_reverse_mask) // v16 / vs48 (non-volatile) -> enc/dec input/output transform lo (must reload (BE only): constant_vector_be_permutation_mask) // v17 / vs49 (non-volatile) -> enc/dec input/output transform hi (must reload: constant_vector_group_e_and_mask) // v18 / vs50 (non-volatile) -> mixcolumns forward (must reload: constant_vector_fscal_xor_mask) From 2e4c98641f82fab6be3c90579a77b09746f2baa6 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Fri, 24 Apr 2026 10:39:20 -0500 Subject: [PATCH 30/50] Use .octa for vector byte-reverse mask to avoid confusion --- src/jit_compiler_ppc64_static.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index ceb6a3b2..05994854 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -177,8 +177,7 @@ constant_vector_fscal_xor_mask: constant_vector_byte_reverse_mask: // Vector byte reverse mask - .8byte 0x0F0E0D0C0B0A0908 - .8byte 0x0706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 #if PPC_BIG_ENDIAN constant_vector_be_permutation_mask: @@ -553,7 +552,7 @@ randomx_ppc64_vm_prologue: #if PPC_BIG_ENDIAN lxvd2x %vs48, %r11, %r2 // Load the BE permutation mask (not needed for LE) #endif - lxvd2x %vs47, %r12, %r2 + lvx %v15, %r12, %r2 #if PPC_BIG_ENDIAN vperm %v19, %v19, %v19, %v15 // Swap the byte order of the Group E OR mask vector #endif From 
bb7633a131346015f6a2bd70889c41d270cbbaf8 Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Fri, 24 Apr 2026 10:45:15 -0500
Subject: [PATCH 31/50] Correct the Group E AND mask

The new mask more accurately describes what is being done to the Group E
registers: the upper eight bits and lower 22 bits of each double come
directly from the Group E OR mask values.

That said, despite clearing the lower 22 bits of the mask, this change is
effectively a no-op. Because the Group E registers are loaded exclusively by
converting signed 32-bit integers into doubles, the lower 22 bits of each
double are always zero before they get set by the OR mask. (A 32-bit integer
has at most 31 significant bits, so at most 30 explicit fraction bits land
in the 52-bit significand, leaving the low 22 bits clear.) So, clearing
those bits does not change their values.
---
 src/jit_compiler_ppc64_static.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S
index 05994854..e535bb65 100644
--- a/src/jit_compiler_ppc64_static.S
+++ b/src/jit_compiler_ppc64_static.S
@@ -168,8 +168,8 @@ randomx_ppc64_constant_lut_fprc_to_fpscr:
 .align 4

 constant_vector_group_e_and_mask:
- .8byte 0x00FFFFFFFFFFFFFF
- .8byte 0x00FFFFFFFFFFFFFF
+ .8byte 0x00FFFFFFFFC00000
+ .8byte 0x00FFFFFFFFC00000

 constant_vector_fscal_xor_mask:
 .8byte 0x80F0000000000000

From 8cd6435064dd677256ad064e5daa4791dd6710bf Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Fri, 24 Apr 2026 11:08:43 -0500
Subject: [PATCH 32/50] Optimize Group E register conversion on PPC64

With the Group E AND mask properly set to include just the bits we want to
keep from the original register, we can combine the `vand` and `vor`
operations into a single `vsel` operation, saving us one vector instruction
every time we need to load a Group E register from memory.

This gives us an extra ~1.3% boost to performance on both V1 and V2.
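For a single 64-bit lane, the replacement computes the same value, since
vsel(vt, va, vb, vc) selects a bit from vb where the mask vc is 1 and from
va where it is 0. A per-lane sketch in C (here x stands for the loaded
value, and and_mask/or_mask for the Group E AND/OR vectors):

    // Before: two vector ops
    uint64_t r_old = (x & and_mask) | or_mask;
    // After: one vsel
    uint64_t r_new = (or_mask & ~and_mask) | (x & and_mask);

The two expressions agree exactly when (or_mask & and_mask) == 0, which the
corrected AND mask from the previous commit guarantees.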
--- src/jit_compiler_ppc64.cpp | 4 ++-- src/jit_compiler_ppc64_static.S | 12 ++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 615ae429..ebea7706 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -309,6 +309,7 @@ namespace PPC64 { } static inline uint32_t vperm(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 43); } + static inline uint32_t vsel(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 42); } static inline uint32_t vand(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1028); } static inline uint32_t vor(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1156); } @@ -1311,8 +1312,7 @@ namespace randomx { uint32_t tmp_gpr = jit->getTempGpr(); uint32_t tmp_vr = jit->getTempVr(); emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); - state.emit(PPC64::vand(tmp_vr, tmp_vr, ConstantVectorGroupEAndMaskVR17)); - state.emit(PPC64::vor(tmp_vr, tmp_vr, ConstantVectorGroupEOrMaskVR19)); + state.emit(PPC64::vsel(tmp_vr, ConstantVectorGroupEOrMaskVR19, tmp_vr, ConstantVectorGroupEAndMaskVR17)); state.emit(PPC64::xvdivdp(dst, dst, 32 + tmp_vr)); } static void h_FSQRT_R(HANDLER_ARGS) { diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index e535bb65..2d7b605f 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -714,14 +714,10 @@ randomx_ppc64_vm_loop_prologue: xvcvsxwdp %vs37, %vs37 xvcvsxwdp %vs38, %vs38 xvcvsxwdp %vs39, %vs39 - vand %v4, %v4, %v17 - vand %v5, %v5, %v17 - vand %v6, %v6, %v17 - vand %v7, %v7, %v17 - vor %v4, %v4, %v19 - vor %v5, %v5, %v19 - vor %v6, %v6, %v19 - vor %v7, %v7, %v19 + vsel %v4, %v19, %v4, %v17 + vsel %v5, %v19, %v5, %v17 + vsel %v6, %v19, %v6, %v17 + vsel %v7, %v19, %v7, %v17 randomx_ppc64_vm_loop_prologue_end: From 694dd0043ea81888f3b03520ae4fde18ca0dbf15 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sun, 26 Apr 2026 17:54:34 -0500 Subject: [PATCH 33/50] PPC64 JIT: Correct maximum RandomX instruction code size Fixes: 8cd6435 ("Optimize Group E register conversion on PPC64") --- src/jit_compiler_ppc64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index ebea7706..adff7532 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -530,7 +530,7 @@ namespace randomx { static const size_t ReciprocalPoolPos = sizeConstants + 16; // Add 16 bytes for the Group E OR vector mask static const size_t ConstantPoolSize = alignSize(sizeConstants + 16 + ReciprocalPoolSize, CodeAlign); // Add 16 bytes for the Group E OR vector mask static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStorePrologue + sizeVmSpadStoreMixV2SoftAes + sizeVmSpadStoreEpilogue, CodeAlign); - constexpr size_t MaxRandomXInstrCodeSize = 4*10; // FDIV_M requires at most 10 instructions + constexpr size_t MaxRandomXInstrCodeSize = 4*9; // FDIV_M and CFROUND require at most 9 instructions constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 instructions static const size_t SuperscalarProgramHeaders = sizeSshashSingleItemPrologue + sizeSshashSingleItemEpilogue; From 2bb1ab292cc570729ddb42bf99a7ccd6dac6b0f6 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Sun, 26 Apr 2026 19:43:16 
-0500
Subject: [PATCH 34/50] PPC64 JIT: Optimize emitAddImm32 by using `addis` for
 supported values

This will sometimes save us one instruction.
---
 src/jit_compiler_ppc64.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp
index adff7532..b3dd9547 100644
--- a/src/jit_compiler_ppc64.cpp
+++ b/src/jit_compiler_ppc64.cpp
@@ -693,6 +693,8 @@ namespace randomx {
 int32_t simm = (int32_t)imm;
 if (simm >= -32768 && simm <= 32767) {
 state.emit(PPC64::addi(dstReg, srcReg, simm & 0xFFFF));
+ } else if ((imm & 0xFFFF) == 0) {
+ state.emit(PPC64::addis(dstReg, srcReg, (imm >> 16) & 0xFFFF));
 } else {
 emitMovImm32(state, tmpReg, imm);
 state.emit(PPC64::add(dstReg, srcReg, tmpReg));

From f633ec1ae404667ebb173df668458ad194cf9f80 Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Sun, 26 Apr 2026 21:50:47 -0500
Subject: [PATCH 35/50] PPC64 JIT: Add some notes on optimizing emitAddImm32

---
 src/jit_compiler_ppc64.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp
index b3dd9547..1f0c3ce0 100644
--- a/src/jit_compiler_ppc64.cpp
+++ b/src/jit_compiler_ppc64.cpp
@@ -696,6 +696,19 @@ namespace randomx {
 } else if ((imm & 0xFFFF) == 0) {
 state.emit(PPC64::addis(dstReg, srcReg, (imm >> 16) & 0xFFFF));
 } else {
+ // Notes on optimization:
+ //
+ // 1. An `addis` -> `addi` sequence is not a complete replacement for `lis` -> `ori` -> `add`, as constants in the
+ //    range 0x7FFF8000 to 0x7FFFFFFF cannot be handled by `addis` -> `addi`. So to be able to handle all constants,
+ //    `lis` -> `ori` -> `add` must always be available as a fallback.
+ // 2. In the context of RandomX, `addis` -> `addi` is almost always slower than `lis` -> `ori` -> `add`. The reason
+ //    for this is subtle--with `addis` -> `addi`, execution blocks at the `addis` while the CPU waits for the source
+ //    register to become ready, and `addi` can't be executed because it depends on the result of `addis`. In
+ //    contrast, `lis` -> `ori` to a temporary register can almost always be executed while the CPU waits for the
+ //    source register to become ready, so execution will usually only block on the single `add` instruction. So
+ //    despite reducing the total number of instructions executed, using `addis` -> `addi` instead of
+ //    `lis` -> `ori` -> `add` results in a significant reduction in IPC (-5%) and a small overall reduction in
+ //    performance (-0.5%).
 emitMovImm32(state, tmpReg, imm);
 state.emit(PPC64::add(dstReg, srcReg, tmpReg));

From bf0b5cd296277547a92b0dc925a7cca85c4e04ca Mon Sep 17 00:00:00 2001
From: cyrozap
Date: Mon, 27 Apr 2026 11:01:58 -0500
Subject: [PATCH 36/50] PPC64 JIT: Make sure groups of four loads use
 different temporary regs

On POWER9 the LSU can perform four loads at a time, and we can make it
easier for the CPU to do this by using different destination registers for
each load.
--- src/jit_compiler_ppc64_static.S | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 2d7b605f..c4840e7f 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -666,18 +666,18 @@ randomx_ppc64_vm_loop_prologue: LOAD_LE_GPR %r9, 8, %r26 xor %r14, %r14, %r8 xor %r15, %r15, %r9 - LOAD_LE_GPR %r8, 16, %r26 - LOAD_LE_GPR %r9, 24, %r26 - xor %r16, %r16, %r8 - xor %r17, %r17, %r9 + LOAD_LE_GPR %r10, 16, %r26 + LOAD_LE_GPR %r11, 24, %r26 + xor %r16, %r16, %r10 + xor %r17, %r17, %r11 LOAD_LE_GPR %r8, 32, %r26 LOAD_LE_GPR %r9, 40, %r26 xor %r18, %r18, %r8 xor %r19, %r19, %r9 - LOAD_LE_GPR %r8, 48, %r26 - LOAD_LE_GPR %r9, 56, %r26 - xor %r20, %r20, %r8 - xor %r21, %r21, %r9 + LOAD_LE_GPR %r10, 48, %r26 + LOAD_LE_GPR %r11, 56, %r26 + xor %r20, %r20, %r10 + xor %r21, %r21, %r11 // Load F registers (v0-v3 / vs32-vs35) from spAddr1 (r27) //addi %r8, %r27, 8*0 @@ -743,18 +743,18 @@ randomx_ppc64_vm_data_read: ld %r10, 8(%r8) xor %r14, %r14, %r9 xor %r15, %r15, %r10 - ld %r9, 16(%r8) - ld %r10, 24(%r8) - xor %r16, %r16, %r9 - xor %r17, %r17, %r10 + ld %r11, 16(%r8) + ld %r12, 24(%r8) + xor %r16, %r16, %r11 + xor %r17, %r17, %r12 ld %r9, 32(%r8) ld %r10, 40(%r8) xor %r18, %r18, %r9 xor %r19, %r19, %r10 - ld %r9, 48(%r8) - ld %r10, 56(%r8) - xor %r20, %r20, %r9 - xor %r21, %r21, %r10 + ld %r11, 48(%r8) + ld %r12, 56(%r8) + xor %r20, %r20, %r11 + xor %r21, %r21, %r12 // Swap mx and ma mr %r8, %r25 From 578ada3e7b9b0fbcbefce7a39b833ddfd82f58ba Mon Sep 17 00:00:00 2001 From: cyrozap Date: Mon, 27 Apr 2026 11:10:50 -0500 Subject: [PATCH 37/50] PPC64 JIT: Group loads four at a time The POWER9 LSU can perform four loads at a time and a POWER9 core can perform four ALU operations at a time, so if we group four loads followed by four ALU operations we can better hide the load latency and get a very small performance boost. --- src/jit_compiler_ppc64_static.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index c4840e7f..30d36387 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -664,18 +664,18 @@ randomx_ppc64_vm_loop_prologue: // Load scratchpad data, mix registers, etc. 
LOAD_LE_GPR %r8, 0, %r26 LOAD_LE_GPR %r9, 8, %r26 - xor %r14, %r14, %r8 - xor %r15, %r15, %r9 LOAD_LE_GPR %r10, 16, %r26 LOAD_LE_GPR %r11, 24, %r26 + xor %r14, %r14, %r8 + xor %r15, %r15, %r9 xor %r16, %r16, %r10 xor %r17, %r17, %r11 LOAD_LE_GPR %r8, 32, %r26 LOAD_LE_GPR %r9, 40, %r26 - xor %r18, %r18, %r8 - xor %r19, %r19, %r9 LOAD_LE_GPR %r10, 48, %r26 LOAD_LE_GPR %r11, 56, %r26 + xor %r18, %r18, %r8 + xor %r19, %r19, %r9 xor %r20, %r20, %r10 xor %r21, %r21, %r11 @@ -741,18 +741,18 @@ randomx_ppc64_vm_data_read: // Read 64 bytes and XOR with integer registers ld %r9, 0(%r8) ld %r10, 8(%r8) - xor %r14, %r14, %r9 - xor %r15, %r15, %r10 ld %r11, 16(%r8) ld %r12, 24(%r8) + xor %r14, %r14, %r9 + xor %r15, %r15, %r10 xor %r16, %r16, %r11 xor %r17, %r17, %r12 ld %r9, 32(%r8) ld %r10, 40(%r8) - xor %r18, %r18, %r9 - xor %r19, %r19, %r10 ld %r11, 48(%r8) ld %r12, 56(%r8) + xor %r18, %r18, %r9 + xor %r19, %r19, %r10 xor %r20, %r20, %r11 xor %r21, %r21, %r12 From 279a960439b794e1dc361cc8ff5f8f15a89b9133 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 28 Apr 2026 00:40:49 -0500 Subject: [PATCH 38/50] PPC64 JIT: Reorder ld arguments to match the assembly instruction Fixes: 31ff28d ("Cache reciprocals in PPC64 JIT compiler") --- src/jit_compiler_ppc64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 1f0c3ce0..78268654 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -278,7 +278,7 @@ namespace PPC64 { static inline uint32_t sldi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicr(rx, ry, n, 63-n); } static inline uint32_t srdi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicl(rx, ry, 64-n, n); } - static inline uint32_t ld(uint32_t rt, uint32_t ra, int32_t offset) { + static inline uint32_t ld(uint32_t rt, int32_t offset, uint32_t ra) { if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned"); if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range"); return DS_form(58, rt, ra, (offset >> 2) & 0x3FFF, 0); @@ -1218,7 +1218,7 @@ namespace randomx { uint64_t rcp = randomx_reciprocal_fast(divisor); state.emitAt(offset, rcp); - state.emit(PPC64::ld(tmp_gpr, ConstantsBaseAddressRegisterGPR2, offset)); + state.emit(PPC64::ld(tmp_gpr, offset, ConstantsBaseAddressRegisterGPR2)); state.emit(PPC64::mulld(dst, dst, tmp_gpr)); } } From 8f1c2f3e60a0fce878a551273622c84037ffb867 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Mon, 27 Apr 2026 19:19:37 -0500 Subject: [PATCH 39/50] PPC64 JIT: Rename scratchpad store prologue/epilogue This will better indicate what each part is doing. 
--- src/jit_compiler_ppc64.cpp | 18 +++++++++--------- src/jit_compiler_ppc64_static.S | 16 ++++++++-------- src/jit_compiler_ppc64_static.hpp | 8 ++++---- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 78268654..637b0f38 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -487,12 +487,12 @@ namespace randomx { static const uint8_t* codeVmDataReadLight = (uint8_t*)&randomx_ppc64_vm_data_read_light; static const uint8_t* codeVmDataReadLightFixCall = (uint8_t*)&randomx_ppc64_vm_data_read_light_fix_call; static const uint8_t* codeVmDataReadLightEnd = (uint8_t*)&randomx_ppc64_vm_data_read_light_end; - static const uint8_t* codeVmSpadStorePrologue = (uint8_t*)&randomx_ppc64_vm_spad_store_prologue; - static const uint8_t* codeVmSpadStorePrologueEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_prologue_end; + static const uint8_t* codeVmSpadStoreGroupR = (uint8_t*)&randomx_ppc64_vm_spad_store_group_r; + static const uint8_t* codeVmSpadStoreGroupREnd = (uint8_t*)&randomx_ppc64_vm_spad_store_group_r_end; static const uint8_t* codeVmSpadStoreMixV1 = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1; static const uint8_t* codeVmSpadStoreMixV1End = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1_end; - static const uint8_t* codeVmSpadStoreEpilogue = (uint8_t*)&randomx_ppc64_vm_spad_store_epilogue; - static const uint8_t* codeVmSpadStoreEpilogueEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_epilogue_end; + static const uint8_t* codeVmSpadStoreGroupF = (uint8_t*)&randomx_ppc64_vm_spad_store_group_f; + static const uint8_t* codeVmSpadStoreGroupFEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_group_f_end; static const uint8_t* codeVmSpadStoreMixV2HardAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes; static const uint8_t* codeVmSpadStoreMixV2HardAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end; static const uint8_t* codeVmSpadStoreMixV2SoftAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_soft_aes; @@ -512,9 +512,9 @@ namespace randomx { static const int32_t sizeVmLoopPrologue = codeVmLoopPrologueEnd - codeVmLoopPrologue; static const int32_t sizeVmDataRead = codeVmDataReadEnd - codeVmDataRead; static const int32_t sizeVmDataReadLight = codeVmDataReadLightEnd - codeVmDataReadLight; - static const int32_t sizeVmSpadStorePrologue = codeVmSpadStorePrologueEnd - codeVmSpadStorePrologue; + static const int32_t sizeVmSpadStoreGroupR = codeVmSpadStoreGroupREnd - codeVmSpadStoreGroupR; static const int32_t sizeVmSpadStoreMixV1 = codeVmSpadStoreMixV1End - codeVmSpadStoreMixV1; - static const int32_t sizeVmSpadStoreEpilogue = codeVmSpadStoreEpilogueEnd - codeVmSpadStoreEpilogue; + static const int32_t sizeVmSpadStoreGroupF = codeVmSpadStoreGroupFEnd - codeVmSpadStoreGroupF; static const int32_t sizeVmSpadStoreMixV2HardAes = codeVmSpadStoreMixV2HardAesEnd - codeVmSpadStoreMixV2HardAes; static const int32_t sizeVmSpadStoreMixV2SoftAes = codeVmSpadStoreMixV2SoftAesEnd - codeVmSpadStoreMixV2SoftAes; @@ -529,7 +529,7 @@ namespace randomx { constexpr size_t ReciprocalPoolSize = 8 * RANDOMX_PROGRAM_MAX_SIZE; // RANDOMX_PROGRAM_MAX_SIZE 64-bit reciprocals static const size_t ReciprocalPoolPos = sizeConstants + 16; // Add 16 bytes for the Group E OR vector mask static const size_t ConstantPoolSize = alignSize(sizeConstants + 16 + ReciprocalPoolSize, CodeAlign); // Add 16 bytes for the Group E OR vector mask - static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + 
sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStorePrologue + sizeVmSpadStoreMixV2SoftAes + sizeVmSpadStoreEpilogue, CodeAlign); + static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStoreGroupR + sizeVmSpadStoreMixV2SoftAes + sizeVmSpadStoreGroupF, CodeAlign); constexpr size_t MaxRandomXInstrCodeSize = 4*9; // FDIV_M and CFROUND require at most 9 instructions constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 instructions static const size_t SuperscalarProgramHeaders = sizeSshashSingleItemPrologue + sizeSshashSingleItemEpilogue; @@ -852,7 +852,7 @@ namespace randomx { } void JitCompilerPPC64::emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags) { - state.emit(codeVmSpadStorePrologue, sizeVmSpadStorePrologue); + state.emit(codeVmSpadStoreGroupR, sizeVmSpadStoreGroupR); if (flags & RANDOMX_FLAG_V2) { if (flags & RANDOMX_FLAG_HARD_AES) { @@ -867,7 +867,7 @@ namespace randomx { state.emit(codeVmSpadStoreMixV1, sizeVmSpadStoreMixV1); } - state.emit(codeVmSpadStoreEpilogue, sizeVmSpadStoreEpilogue); + state.emit(codeVmSpadStoreGroupF, sizeVmSpadStoreGroupF); state.emit(PPC64::xor_(SpAddr0GPR26, RegisterMapR.getPpcGprNum(pcfg.readReg0), RegisterMapR.getPpcGprNum(pcfg.readReg1))); diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 30d36387..3e55d4f2 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -91,12 +91,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .global randomx_ppc64_vm_data_read_light .global randomx_ppc64_vm_data_read_light_fix_call .global randomx_ppc64_vm_data_read_light_end - .global randomx_ppc64_vm_spad_store_prologue - .global randomx_ppc64_vm_spad_store_prologue_end + .global randomx_ppc64_vm_spad_store_group_r + .global randomx_ppc64_vm_spad_store_group_r_end .global randomx_ppc64_vm_spad_store_mix_v1 .global randomx_ppc64_vm_spad_store_mix_v1_end - .global randomx_ppc64_vm_spad_store_epilogue - .global randomx_ppc64_vm_spad_store_epilogue_end + .global randomx_ppc64_vm_spad_store_group_f + .global randomx_ppc64_vm_spad_store_group_f_end .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes @@ -791,7 +791,7 @@ randomx_ppc64_vm_data_read_light_fix_call: randomx_ppc64_vm_data_read_light_end: -randomx_ppc64_vm_spad_store_prologue: +randomx_ppc64_vm_spad_store_group_r: // Store to scratchpad at spAddr1 STORE_LE_GPR %r14, 8*0, %r27 STORE_LE_GPR %r15, 8*1, %r27 @@ -802,7 +802,7 @@ randomx_ppc64_vm_spad_store_prologue: STORE_LE_GPR %r20, 8*6, %r27 STORE_LE_GPR %r21, 8*7, %r27 -randomx_ppc64_vm_spad_store_prologue_end: +randomx_ppc64_vm_spad_store_group_r_end: randomx_ppc64_vm_spad_store_mix_v1: // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) @@ -813,7 +813,7 @@ randomx_ppc64_vm_spad_store_mix_v1: randomx_ppc64_vm_spad_store_mix_v1_end: -randomx_ppc64_vm_spad_store_epilogue: +randomx_ppc64_vm_spad_store_group_f: // Store F registers to scratchpad at spAddr0 li %r8, 16*0 li %r9, 16*1 @@ -824,7 +824,7 @@ randomx_ppc64_vm_spad_store_epilogue: STORE_LE_VR 2, 14, %r10, %r26 STORE_LE_VR 3, 12, %r11, %r26 -randomx_ppc64_vm_spad_store_epilogue_end: +randomx_ppc64_vm_spad_store_group_f_end: randomx_ppc64_vm_spad_store_mix_v2_hard_aes: // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) 
diff --git a/src/jit_compiler_ppc64_static.hpp b/src/jit_compiler_ppc64_static.hpp index d99c4529..9c2024b8 100644 --- a/src/jit_compiler_ppc64_static.hpp +++ b/src/jit_compiler_ppc64_static.hpp @@ -58,12 +58,12 @@ extern "C" { void randomx_ppc64_vm_data_read_light(); void randomx_ppc64_vm_data_read_light_fix_call(); void randomx_ppc64_vm_data_read_light_end(); - void randomx_ppc64_vm_spad_store_prologue(); - void randomx_ppc64_vm_spad_store_prologue_end(); + void randomx_ppc64_vm_spad_store_group_r(); + void randomx_ppc64_vm_spad_store_group_r_end(); void randomx_ppc64_vm_spad_store_mix_v1(); void randomx_ppc64_vm_spad_store_mix_v1_end(); - void randomx_ppc64_vm_spad_store_epilogue(); - void randomx_ppc64_vm_spad_store_epilogue_end(); + void randomx_ppc64_vm_spad_store_group_f(); + void randomx_ppc64_vm_spad_store_group_f_end(); void randomx_ppc64_vm_spad_store_mix_v2_hard_aes(); void randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end(); void randomx_ppc64_vm_spad_store_mix_v2_soft_aes(); From 7144c2d80084cbf27fbb467b3752f10c543cb50b Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 28 Apr 2026 00:58:19 -0500 Subject: [PATCH 40/50] PPC64 JIT: Move the Group F scratchpad store into the code generator This doesn't affect performance since it's the same sequence of instructions, but doing this enables us to more easily optimize the code for each supported architecture. --- src/jit_compiler_ppc64.cpp | 34 +++++++++++++++++++++++++++---- src/jit_compiler_ppc64_static.S | 15 -------------- src/jit_compiler_ppc64_static.hpp | 2 -- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 637b0f38..609ee610 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -308,6 +308,8 @@ namespace PPC64 { return X_form(31, t, ra, rb, 844, tx); } + static inline uint32_t stvx(uint32_t vrs, uint32_t ra, uint32_t rb) { return X_form(31, vrs, ra, rb, 231, 0); } + static inline uint32_t vperm(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 43); } static inline uint32_t vsel(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 42); } @@ -491,8 +493,6 @@ namespace randomx { static const uint8_t* codeVmSpadStoreGroupREnd = (uint8_t*)&randomx_ppc64_vm_spad_store_group_r_end; static const uint8_t* codeVmSpadStoreMixV1 = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1; static const uint8_t* codeVmSpadStoreMixV1End = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1_end; - static const uint8_t* codeVmSpadStoreGroupF = (uint8_t*)&randomx_ppc64_vm_spad_store_group_f; - static const uint8_t* codeVmSpadStoreGroupFEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_group_f_end; static const uint8_t* codeVmSpadStoreMixV2HardAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes; static const uint8_t* codeVmSpadStoreMixV2HardAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end; static const uint8_t* codeVmSpadStoreMixV2SoftAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_soft_aes; @@ -514,9 +514,9 @@ namespace randomx { static const int32_t sizeVmDataReadLight = codeVmDataReadLightEnd - codeVmDataReadLight; static const int32_t sizeVmSpadStoreGroupR = codeVmSpadStoreGroupREnd - codeVmSpadStoreGroupR; static const int32_t sizeVmSpadStoreMixV1 = codeVmSpadStoreMixV1End - codeVmSpadStoreMixV1; - static const int32_t sizeVmSpadStoreGroupF = codeVmSpadStoreGroupFEnd - codeVmSpadStoreGroupF; static const int32_t sizeVmSpadStoreMixV2HardAes = 
codeVmSpadStoreMixV2HardAesEnd - codeVmSpadStoreMixV2HardAes; static const int32_t sizeVmSpadStoreMixV2SoftAes = codeVmSpadStoreMixV2SoftAesEnd - codeVmSpadStoreMixV2SoftAes; + constexpr size_t sizeVmSpadStoreGroupF = 4*12; // Worst case size is 12 instructions static const int32_t offsetConstantLutFprcToFpscr = codeConstantLutFprcToFpscr - codeConstants; @@ -540,6 +540,8 @@ namespace randomx { static const uint32_t CodeSize = RandomXCodeSize + SuperscalarSize; constexpr uint32_t ConstantsBaseAddressRegisterGPR2 = 2; + constexpr uint32_t ConstantVectorByteReverseMaskVR15 = 15; + constexpr uint32_t ConstantVectorByteReverseMaskVSR47 = 32 + ConstantVectorByteReverseMaskVR15; constexpr uint32_t ConstantVectorBePermutationMaskVR16 = 16; constexpr uint32_t ConstantVectorBePermutationMaskVSR48 = 32 + ConstantVectorBePermutationMaskVR16; constexpr uint32_t ConstantVectorGroupEAndMaskVR17 = 17; @@ -800,6 +802,30 @@ namespace randomx { emitLoadVr64(state, tmp_vr, ScratchpadPointerGPR30, tmp_gpr); } + static void emitVmSpadStoreGroupF(CompilerState& state) { + // Store F registers to scratchpad at spAddr0 + state.emit(PPC64::li(8, 16 * 0)); + state.emit(PPC64::li(9, 16 * 1)); + state.emit(PPC64::li(10, 16 * 2)); + state.emit(PPC64::li(11, 16 * 3)); + + if (PPC_BIG_ENDIAN) { + state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stvx(12, 8, SpAddr0GPR26)); + state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stvx(13, 9, SpAddr0GPR26)); + state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stvx(14, 10, SpAddr0GPR26)); + state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stvx(12, 11, SpAddr0GPR26)); + } else { + state.emit(PPC64::stvx(0, 8, SpAddr0GPR26)); + state.emit(PPC64::stvx(1, 9, SpAddr0GPR26)); + state.emit(PPC64::stvx(2, 10, SpAddr0GPR26)); + state.emit(PPC64::stvx(3, 11, SpAddr0GPR26)); + } + } + uint32_t JitCompilerPPC64::getTempGpr() { static const uint32_t gprs[] = {6, 7, 8, 9, 10, 11, 12}; uint32_t reg = gprs[tempGprIndex]; @@ -867,7 +893,7 @@ namespace randomx { state.emit(codeVmSpadStoreMixV1, sizeVmSpadStoreMixV1); } - state.emit(codeVmSpadStoreGroupF, sizeVmSpadStoreGroupF); + emitVmSpadStoreGroupF(state); state.emit(PPC64::xor_(SpAddr0GPR26, RegisterMapR.getPpcGprNum(pcfg.readReg0), RegisterMapR.getPpcGprNum(pcfg.readReg1))); diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 3e55d4f2..3f3ca8ca 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -95,8 +95,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.global randomx_ppc64_vm_spad_store_group_r_end .global randomx_ppc64_vm_spad_store_mix_v1 .global randomx_ppc64_vm_spad_store_mix_v1_end - .global randomx_ppc64_vm_spad_store_group_f - .global randomx_ppc64_vm_spad_store_group_f_end .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes @@ -813,19 +811,6 @@ randomx_ppc64_vm_spad_store_mix_v1: randomx_ppc64_vm_spad_store_mix_v1_end: -randomx_ppc64_vm_spad_store_group_f: - // Store F registers to scratchpad at spAddr0 - li %r8, 16*0 - li %r9, 16*1 - li %r10, 16*2 - li %r11, 16*3 - STORE_LE_VR 0, 12, %r8, %r26 - STORE_LE_VR 1, 13, %r9, %r26 - STORE_LE_VR 2, 14, %r10, %r26 - STORE_LE_VR 3, 12, %r11, %r26 - -randomx_ppc64_vm_spad_store_group_f_end: - randomx_ppc64_vm_spad_store_mix_v2_hard_aes: // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) diff --git a/src/jit_compiler_ppc64_static.hpp b/src/jit_compiler_ppc64_static.hpp index 9c2024b8..7909a81b 100644 --- a/src/jit_compiler_ppc64_static.hpp +++ b/src/jit_compiler_ppc64_static.hpp @@ -62,8 +62,6 @@ extern "C" { void randomx_ppc64_vm_spad_store_group_r_end(); void randomx_ppc64_vm_spad_store_mix_v1(); void randomx_ppc64_vm_spad_store_mix_v1_end(); - void randomx_ppc64_vm_spad_store_group_f(); - void randomx_ppc64_vm_spad_store_group_f_end(); void randomx_ppc64_vm_spad_store_mix_v2_hard_aes(); void randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end(); void randomx_ppc64_vm_spad_store_mix_v2_soft_aes(); From 69f019f1b9a945ed235f8afdfe7b3c9f7b75ede0 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 28 Apr 2026 01:17:02 -0500 Subject: [PATCH 41/50] PPC64 JIT: Optimize Group F scratchpad store on v3.0 and later On Power ISA v3.0 and later, we can use stxv to perform the Group F scratchpad stores using immediate offsets instead of register offsets. This saves us from having to load the offsets into registers before performing the stores. 
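For one F register on little-endian, the difference looks like this (a
sketch using the emitter helpers from jit_compiler_ppc64.cpp; the full
emitVmSpadStoreGroupF below also handles the big-endian vperm path):

    // Pre-v3.0: materialize the offset in a GPR, then use an indexed store
    state.emit(PPC64::li(9, 16 * 1));
    state.emit(PPC64::stvx(1, 9, SpAddr0GPR26));

    // v3.0+: a single DQ-form store with an immediate offset
    // (VSX register 32 + n aliases vector register n)
    state.emit(PPC64::stxv(32 + 1, 16 * 1, SpAddr0GPR26));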
--- src/jit_compiler_ppc64.cpp | 73 ++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 609ee610..6dd94012 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -77,6 +77,16 @@ namespace PPC64 { return (po << 26) | (rt << 21) | (ra << 16) | d; } + static inline uint32_t DQ_form(uint32_t po, uint32_t s, uint32_t ra, uint32_t dq, uint32_t sx, uint32_t xo) { + if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); + if (!(s <= 0x1F)) throw std::runtime_error("s <= 0x1F"); + if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F"); + if (!(dq <= 0xFFF)) throw std::runtime_error("dq <= 0xFFF"); + if (!(sx <= 0x1)) throw std::runtime_error("sx <= 0x1"); + if (!(xo <= 0x7)) throw std::runtime_error("xo <= 0x7"); + return (po << 26) | (s << 21) | (ra << 16) | (dq << 4) | (sx << 3) | xo; + } + static inline uint32_t DS_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t ds, uint32_t xo) { if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F"); if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F"); @@ -310,6 +320,15 @@ namespace PPC64 { static inline uint32_t stvx(uint32_t vrs, uint32_t ra, uint32_t rb) { return X_form(31, vrs, ra, rb, 231, 0); } + static inline uint32_t stxv(uint32_t xs, int32_t offset, uint32_t ra) { // Only v3.0B and later + if (!(xs <= 0x3F)) throw std::runtime_error("xs <= 0x3F"); + if (offset & 0xF) throw std::runtime_error("offset must be 16-byte aligned"); + if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range"); + uint32_t s = xs & 0x1F; + uint32_t sx = xs >> 5; + return DQ_form(61, s, ra, (offset >> 4) & 0xFFF, sx, 5); + } + static inline uint32_t vperm(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 43); } static inline uint32_t vsel(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 42); } @@ -804,25 +823,43 @@ namespace randomx { static void emitVmSpadStoreGroupF(CompilerState& state) { // Store F registers to scratchpad at spAddr0 - state.emit(PPC64::li(8, 16 * 0)); - state.emit(PPC64::li(9, 16 * 1)); - state.emit(PPC64::li(10, 16 * 2)); - state.emit(PPC64::li(11, 16 * 3)); - - if (PPC_BIG_ENDIAN) { - state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15)); - state.emit(PPC64::stvx(12, 8, SpAddr0GPR26)); - state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15)); - state.emit(PPC64::stvx(13, 9, SpAddr0GPR26)); - state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15)); - state.emit(PPC64::stvx(14, 10, SpAddr0GPR26)); - state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15)); - state.emit(PPC64::stvx(12, 11, SpAddr0GPR26)); + if (randomx::cpu.hasV3P0()) { + if (PPC_BIG_ENDIAN) { + state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stxv(32 + 12, 16 * 0, SpAddr0GPR26)); + state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stxv(32 + 13, 16 * 1, SpAddr0GPR26)); + state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stxv(32 + 14, 16 * 2, SpAddr0GPR26)); + state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stxv(32 + 12, 16 * 3, SpAddr0GPR26)); + } else { + state.emit(PPC64::stxv(32 + 0, 16 * 0, SpAddr0GPR26)); + state.emit(PPC64::stxv(32 + 1, 16 * 1, SpAddr0GPR26)); + 
state.emit(PPC64::stxv(32 + 2, 16 * 2, SpAddr0GPR26)); + state.emit(PPC64::stxv(32 + 3, 16 * 3, SpAddr0GPR26)); + } } else { - state.emit(PPC64::stvx(0, 8, SpAddr0GPR26)); - state.emit(PPC64::stvx(1, 9, SpAddr0GPR26)); - state.emit(PPC64::stvx(2, 10, SpAddr0GPR26)); - state.emit(PPC64::stvx(3, 11, SpAddr0GPR26)); + state.emit(PPC64::li(8, 16 * 0)); + state.emit(PPC64::li(9, 16 * 1)); + state.emit(PPC64::li(10, 16 * 2)); + state.emit(PPC64::li(11, 16 * 3)); + + if (PPC_BIG_ENDIAN) { + state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stvx(12, 8, SpAddr0GPR26)); + state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stvx(13, 9, SpAddr0GPR26)); + state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stvx(14, 10, SpAddr0GPR26)); + state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15)); + state.emit(PPC64::stvx(12, 11, SpAddr0GPR26)); + } else { + state.emit(PPC64::stvx(0, 8, SpAddr0GPR26)); + state.emit(PPC64::stvx(1, 9, SpAddr0GPR26)); + state.emit(PPC64::stvx(2, 10, SpAddr0GPR26)); + state.emit(PPC64::stvx(3, 11, SpAddr0GPR26)); + } } } From 7b4844352820c62dfc094e6e4c5cf59f342aa807 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 28 Apr 2026 10:44:56 -0500 Subject: [PATCH 42/50] PPC64 JIT: Avoid moving register ma when we don't need to --- src/jit_compiler_ppc64.cpp | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 6dd94012..0d3202dd 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -1015,12 +1015,18 @@ namespace randomx { void JitCompilerPPC64::generateProgram(Program& prog, ProgramConfiguration& pcfg) { emitProgramPrefix(state, prog, pcfg, flags); - // Step 5a: Save ma in mt (r9, temporary) - int mtReg = 9; - state.emit(PPC64::mr(mtReg, MaGPR24)); + int mtReg = MaGPR24; + int mpReg = MxGPR25; + + if (flags & RANDOMX_FLAG_V2) { + // Step 5a: Save ma in mt (r9, temporary) + mtReg = 9; + state.emit(PPC64::mr(mtReg, MaGPR24)); + + mpReg = MaGPR24; + } // Step 5b: the mp register is XORed with the low 32 bits of registers readReg2 and readReg3 - int mpReg = (flags & RANDOMX_FLAG_V2) ? MaGPR24 : MxGPR25; // r24 = ma, r25 = mx state.emit(PPC64::xor_(8, RegisterMapR.getPpcGprNum(pcfg.readReg2), RegisterMapR.getPpcGprNum(pcfg.readReg3))); // Zero-extend r8 to 32 bits (clear upper 32 bits) state.emit(PPC64::rldicl(8, 8, 0, 32)); @@ -1047,12 +1053,18 @@ namespace randomx { void JitCompilerPPC64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { emitProgramPrefix(state, prog, pcfg, flags); - // Step 5a: Save ma in mt (r9, temporary) - int mtReg = 9; - state.emit(PPC64::mr(mtReg, MaGPR24)); + int mtReg = MaGPR24; + int mpReg = MxGPR25; + + if (flags & RANDOMX_FLAG_V2) { + // Step 5a: Save ma in mt (r9, temporary) + mtReg = 9; + state.emit(PPC64::mr(mtReg, MaGPR24)); + + mpReg = MaGPR24; + } // Step 5b: the mp register is XORed with the low 32 bits of registers readReg2 and readReg3 - int mpReg = (flags & RANDOMX_FLAG_V2) ? 
MaGPR24 : MxGPR25; // r24 = ma, r25 = mx state.emit(PPC64::xor_(8, RegisterMapR.getPpcGprNum(pcfg.readReg2), RegisterMapR.getPpcGprNum(pcfg.readReg3))); // Zero-extend r8 to 32 bits (clear upper 32 bits) state.emit(PPC64::rldicl(8, 8, 0, 32)); From 047320ab01454ba687f9d3032eb5f68413726b2c Mon Sep 17 00:00:00 2001 From: cyrozap Date: Wed, 29 Apr 2026 00:00:55 -0500 Subject: [PATCH 43/50] PPC64 JIT: Optimize Group F register scratchpad stores on pre-v3.0 We don't need to load zero into a register to use it as an offset--we can just set RA in the instruction word to zero. This saves one ALU operation per loop iteration. --- src/jit_compiler_ppc64.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 0d3202dd..2600b387 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -840,14 +840,14 @@ namespace randomx { state.emit(PPC64::stxv(32 + 3, 16 * 3, SpAddr0GPR26)); } } else { - state.emit(PPC64::li(8, 16 * 0)); + //state.emit(PPC64::li(8, 16 * 0)); state.emit(PPC64::li(9, 16 * 1)); state.emit(PPC64::li(10, 16 * 2)); state.emit(PPC64::li(11, 16 * 3)); if (PPC_BIG_ENDIAN) { state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15)); - state.emit(PPC64::stvx(12, 8, SpAddr0GPR26)); + state.emit(PPC64::stvx(12, 0, SpAddr0GPR26)); // RA=0 for zero offset state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15)); state.emit(PPC64::stvx(13, 9, SpAddr0GPR26)); state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15)); @@ -855,7 +855,7 @@ namespace randomx { state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15)); state.emit(PPC64::stvx(12, 11, SpAddr0GPR26)); } else { - state.emit(PPC64::stvx(0, 8, SpAddr0GPR26)); + state.emit(PPC64::stvx(0, 0, SpAddr0GPR26)); // RA=0 for zero offset state.emit(PPC64::stvx(1, 9, SpAddr0GPR26)); state.emit(PPC64::stvx(2, 10, SpAddr0GPR26)); state.emit(PPC64::stvx(3, 11, SpAddr0GPR26)); From 8b87ee8250909271ed7ec65dbf2c9634f8358a7d Mon Sep 17 00:00:00 2001 From: cyrozap Date: Wed, 29 Apr 2026 01:27:55 -0500 Subject: [PATCH 44/50] PPC64 JIT: Optimize IXOR_R for 16-bit and shifted unsigned 16-bit values This doesn't really improve execution latency because the lis/ori immediate load is executed in parallel with other instructions before the xor, but for cases where the upper 16 bits are zero or where the sign bit and lower 16 bits are zero, this will save a tiny bit of instruction cache and maybe sometimes speed things up when the ALU pipelines are very busy before the xor. 
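As a sketch, for imm32 = 0x12340000 (high half only, sign bit clear),
the old path materializes the constant in a temporary (assuming the
lis/ori sequence mentioned above, with the ori omitted for a zero low
half; rD here is just a placeholder for the destination GPR):

    lis  %r8, 0x1234
    xor  rD, rD, %r8

while the new path is simply

    xoris rD, rD, 0x1234

and immediates that fit in 16 unsigned bits get the equivalent xori
form.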
--- src/jit_compiler_ppc64.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 2600b387..f564fb32 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -257,6 +257,8 @@ namespace PPC64 { static inline uint32_t addis(uint32_t rt, uint32_t ra, uint32_t si) { return D_form(15, rt, ra, si); } static inline uint32_t ori(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(24, rs, ra, ui); } static inline uint32_t oris(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(25, rs, ra, ui); } + static inline uint32_t xori(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(26, rs, ra, ui); } + static inline uint32_t xoris(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(27, rs, ra, ui); } static inline uint32_t andi_dot(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(28, rs, ra, ui); } static inline uint32_t add(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 266, 0); } @@ -1309,9 +1311,22 @@ namespace randomx { int src = RegisterMapR.getPpcGprNum(isn.src); state.emit(PPC64::xor_(dst, dst, src)); } else { - uint32_t tmp_gpr = jit->getTempGpr(); - emitMovImm32(state, tmp_gpr, isn.getImm32()); - state.emit(PPC64::xor_(dst, dst, tmp_gpr)); + // Note: RandomX 32-bit immediates are sign-extended to 64 bits. + // xori/xoris zero-extend their 16-bit immediate, so they only match + // the sign-extended semantics when the imm32 is non-negative as a + // signed 32-bit value (i.e., <= 0x7FFFFFFF). + uint32_t imm = isn.getImm32(); + if (imm <= 0xFFFF) { + // Fits in unsigned 16 bits; XOR of upper bits is a no-op. + state.emit(PPC64::xori(dst, dst, imm)); + } else if ((imm & 0xFFFF) == 0 && imm <= 0x7FFFFFFF) { + // Only the high 16 bits are nonzero, and the value is non-negative. + state.emit(PPC64::xoris(dst, dst, (imm >> 16) & 0xFFFF)); + } else { + uint32_t tmp_gpr = jit->getTempGpr(); + emitMovImm32(state, tmp_gpr, imm); + state.emit(PPC64::xor_(dst, dst, tmp_gpr)); + } } } static void h_IXOR_M(HANDLER_ARGS) { From 50b065866ccb6975e66188a96defa0493f1c955e Mon Sep 17 00:00:00 2001 From: cyrozap Date: Wed, 29 Apr 2026 01:58:12 -0500 Subject: [PATCH 45/50] PPC64 JIT: Rearrange the beq/bne instruction formatters They're short enough to use one line for each. --- src/jit_compiler_ppc64.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index f564fb32..4a74baab 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -237,13 +237,8 @@ namespace PPC64 { return B_form(16, bo, bi, (offset >> 2) & 0x3FFF, 0, 0); } - static inline uint32_t beq(int32_t offset) { - return bc(12, 2, offset); - } - - static inline uint32_t bne(int32_t offset) { - return bc(4, 2, offset); - } + static inline uint32_t beq(int32_t offset) { return bc(12, 2, offset); } + static inline uint32_t bne(int32_t offset) { return bc(4, 2, offset); } static inline uint32_t cmpi(uint32_t bf, uint32_t l, uint32_t ra, int32_t si) { if (!(bf <= 0x7)) throw std::runtime_error("bf <= 0x7"); From 78be8c875b12ec25c430c15d9c5bace48723a14b Mon Sep 17 00:00:00 2001 From: cyrozap Date: Wed, 29 Apr 2026 02:21:23 -0500 Subject: [PATCH 46/50] PPC64 JIT: Add branch hint for CFROUND in V2 mode We know for a fact the branch to skip the rounding mode update is taken 93.75% of the time, so we might as well add the hint to indicate that to the CPU. 
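The hint lives in the low two "at" bits of the BO field: plain bne is
BO=0b00100, and setting at=0b11 gives BO=0b00111, i.e. "statically
predict taken". As a sketch:

    bc 4, 2, target    # bne  target (no static hint)
    bc 7, 2, target    # bne+ target (predicted taken)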
Adding the hint reduces the number of branch mispredictions by about 5%, from ~1.25% to ~1.19%, in RandomX V2 on POWER9. --- src/jit_compiler_ppc64.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 4a74baab..2d09883a 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -239,6 +239,7 @@ namespace PPC64 { static inline uint32_t beq(int32_t offset) { return bc(12, 2, offset); } static inline uint32_t bne(int32_t offset) { return bc(4, 2, offset); } + static inline uint32_t bne_predict_taken(int32_t offset) { return bc(7, 2, offset); } static inline uint32_t cmpi(uint32_t bf, uint32_t l, uint32_t ra, int32_t si) { if (!(bf <= 0x7)) throw std::runtime_error("bf <= 0x7"); @@ -1509,9 +1510,11 @@ namespace randomx { } if (flags & RANDOMX_FLAG_V2) { - // Patch in the conditional branch instruction. + // Patch in the conditional branch instruction. We predict that the branch is taken because + // there's only a 1-in-16 chance of bits 5:2 of the rotated value being equal to zero and + // falling through to the RN-update code. int32_t branch_offset = state.codePos - patch_pos; - state.emitAt(patch_pos, PPC64::bne(branch_offset)); + state.emitAt(patch_pos, PPC64::bne_predict_taken(branch_offset)); } } static void h_ISTORE(HANDLER_ARGS) { From ef71a761ae00861b3b186b46f2f3e862b87a513f Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 5 May 2026 20:53:23 -0500 Subject: [PATCH 47/50] PPC64 JIT: Convert the BO field values to hexadecimal This just makes it easier to see at a glance what bits are set in the field. This change has no performance impact. --- src/jit_compiler_ppc64.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 2d09883a..61c18f27 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -237,9 +237,9 @@ namespace PPC64 { return B_form(16, bo, bi, (offset >> 2) & 0x3FFF, 0, 0); } - static inline uint32_t beq(int32_t offset) { return bc(12, 2, offset); } - static inline uint32_t bne(int32_t offset) { return bc(4, 2, offset); } - static inline uint32_t bne_predict_taken(int32_t offset) { return bc(7, 2, offset); } + static inline uint32_t beq(int32_t offset) { return bc(0x0C, 2, offset); } + static inline uint32_t bne(int32_t offset) { return bc(0x04, 2, offset); } + static inline uint32_t bne_predict_taken(int32_t offset) { return bc(0x07, 2, offset); } static inline uint32_t cmpi(uint32_t bf, uint32_t l, uint32_t ra, int32_t si) { if (!(bf <= 0x7)) throw std::runtime_error("bf <= 0x7"); From b4d31689983e390aa18ca4373ec7e709ec83d59c Mon Sep 17 00:00:00 2001 From: cyrozap Date: Tue, 5 May 2026 21:06:17 -0500 Subject: [PATCH 48/50] PPC64 JIT: Add branch hint for CBRANCH We know for a fact the branch is only taken 0.390625% of the time, so we might as well add the hint to indicate that to the CPU. Adding the hint reduces the number of branch mispredictions on POWER9 by about 6.9% in V1 (from ~0.961% to ~0.895%) and by about 7.5% in V2 (from ~1.19% to ~1.10%). 
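For targets out of bc range, the sense is inverted: a short hinted
branch skips over an unconditional jump. As a sketch ("skip" is just
a label for the fall-through point):

    bne+ skip          # taken 255/256 of the time
    b    target
skip: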
This increases V1 performance by about 0.1% and V2 performance by <0.05% --- src/jit_compiler_ppc64.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 61c18f27..36915574 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -238,6 +238,7 @@ namespace PPC64 { } static inline uint32_t beq(int32_t offset) { return bc(0x0C, 2, offset); } + static inline uint32_t beq_predict_not_taken(int32_t offset) { return bc(0x0E, 2, offset); } static inline uint32_t bne(int32_t offset) { return bc(0x04, 2, offset); } static inline uint32_t bne_predict_taken(int32_t offset) { return bc(0x07, 2, offset); } @@ -1442,10 +1443,10 @@ namespace randomx { int offset = targetPos - state.codePos; if (offset >= -(1 << 15) && offset < (1 << 15)) { - state.emit(PPC64::beq(offset)); + state.emit(PPC64::beq_predict_not_taken(offset)); } else { // Branch over the jump if not equal - state.emit(PPC64::bne(8)); + state.emit(PPC64::bne_predict_taken(8)); state.emit(PPC64::b(offset - 4)); } From e9a2c8ea250664b965bd28fe9e98bba09f63fcf7 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Thu, 7 May 2026 16:20:33 -0500 Subject: [PATCH 49/50] PPC64 JIT: Remove STORE_LE_VR macro Removing the macro enables us to group all the permutation operations together, which should reduce stalls on big-endian systems and makes the code slightly easier to read. This change has no effect on little-endian systems. --- src/jit_compiler_ppc64_static.S | 38 +++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index 3f3ca8ca..e00b4904 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -100,16 +100,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end -// Macro to store a VR containing a RandomX Group F/E/A register to memory -.macro STORE_LE_VR vr_src, vr_temp, offset_reg, base_reg -#if PPC_BIG_ENDIAN - vperm \vr_temp, \vr_src, \vr_src, %v15 // Reverse the bytes so they're arranged as [ 0123 4567 ] - stvx \vr_temp, \offset_reg, \base_reg // Store the two doubles to memory -#else - stvx \vr_src, \offset_reg, \base_reg // Store the two doubles to memory -#endif -.endm - // Macro to shuffle a VR after being loaded with lxsdx. 
.macro SHUFFLE_VR vr_reg #if PPC_BIG_ENDIAN @@ -610,25 +600,37 @@ randomx_ppc64_vm_fix_loop: STORE_LE_GPR %r20, 8*6, %r28 STORE_LE_GPR %r21, 8*7, %r28 +#if PPC_BIG_ENDIAN + // Reverse the Group F/E register bytes so they're arranged as [ 0123 4567 ] + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 +#endif + .equ registers_f_base, 8*8 li %r8, registers_f_base + 16*0 li %r9, registers_f_base + 16*1 li %r10, registers_f_base + 16*2 li %r11, registers_f_base + 16*3 - STORE_LE_VR 0, 12, %r8, %r28 - STORE_LE_VR 1, 13, %r9, %r28 - STORE_LE_VR 2, 14, %r10, %r28 - STORE_LE_VR 3, 12, %r11, %r28 + stvx %v0, %r8, %r28 + stvx %v1, %r9, %r28 + stvx %v2, %r10, %r28 + stvx %v3, %r11, %r28 .equ registers_e_base, 8*8+16*4 li %r8, registers_e_base + 16*0 li %r9, registers_e_base + 16*1 li %r10, registers_e_base + 16*2 li %r11, registers_e_base + 16*3 - STORE_LE_VR 4, 12, %r8, %r28 - STORE_LE_VR 5, 13, %r9, %r28 - STORE_LE_VR 6, 14, %r10, %r28 - STORE_LE_VR 7, 12, %r11, %r28 + stvx %v4, %r8, %r28 + stvx %v5, %r9, %r28 + stvx %v6, %r10, %r28 + stvx %v7, %r11, %r28 // Standard function epilogue ld %r14, 112(%r1) From 7eacafeff20f7afadc092a61162aa596ad238927 Mon Sep 17 00:00:00 2001 From: cyrozap Date: Thu, 7 May 2026 16:41:17 -0500 Subject: [PATCH 50/50] PPC64 JIT: Interleave immediate loads with vector loads and stores According to the ISA manual, loading the offset value into a register using `li rX, simm` and then performing the vector load or store immediately after that using `rX` as the third argument will optimize the memory access on POWER9. Presumably, this sequence gets fused into a single micro-op in the CPU. Changing the code to use this sequence for the vector loads and stores shows a small but measurable 0.05%-0.1% performance increase for RandomX V1 on POWER9. 
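As a sketch, taken from the Group F register stores below, the paired
form looks like:

    li   %r8, registers_f_base + 16*0
    stvx %v0, %r28, %r8
    li   %r9, registers_f_base + 16*1
    stvx %v1, %r28, %r9

i.e. each offset load sits immediately before the vector store that
consumes it, with the offset register in the RB position, instead of
hoisting all four li instructions ahead of the block of stores.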
--- src/jit_compiler_ppc64.cpp | 23 +++++----- src/jit_compiler_ppc64_static.S | 75 ++++++++++++++++----------------- 2 files changed, 48 insertions(+), 50 deletions(-) diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp index 36915574..32b0ef9e 100644 --- a/src/jit_compiler_ppc64.cpp +++ b/src/jit_compiler_ppc64.cpp @@ -839,25 +839,26 @@ namespace randomx { state.emit(PPC64::stxv(32 + 3, 16 * 3, SpAddr0GPR26)); } } else { - //state.emit(PPC64::li(8, 16 * 0)); - state.emit(PPC64::li(9, 16 * 1)); - state.emit(PPC64::li(10, 16 * 2)); - state.emit(PPC64::li(11, 16 * 3)); - if (PPC_BIG_ENDIAN) { state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15)); state.emit(PPC64::stvx(12, 0, SpAddr0GPR26)); // RA=0 for zero offset state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15)); - state.emit(PPC64::stvx(13, 9, SpAddr0GPR26)); + state.emit(PPC64::li(9, 16 * 1)); + state.emit(PPC64::stvx(13, SpAddr0GPR26, 9)); state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15)); - state.emit(PPC64::stvx(14, 10, SpAddr0GPR26)); + state.emit(PPC64::li(10, 16 * 2)); + state.emit(PPC64::stvx(14, SpAddr0GPR26, 10)); state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15)); - state.emit(PPC64::stvx(12, 11, SpAddr0GPR26)); + state.emit(PPC64::li(11, 16 * 3)); + state.emit(PPC64::stvx(12, SpAddr0GPR26, 11)); } else { state.emit(PPC64::stvx(0, 0, SpAddr0GPR26)); // RA=0 for zero offset - state.emit(PPC64::stvx(1, 9, SpAddr0GPR26)); - state.emit(PPC64::stvx(2, 10, SpAddr0GPR26)); - state.emit(PPC64::stvx(3, 11, SpAddr0GPR26)); + state.emit(PPC64::li(9, 16 * 1)); + state.emit(PPC64::stvx(1, SpAddr0GPR26, 9)); + state.emit(PPC64::li(10, 16 * 2)); + state.emit(PPC64::stvx(2, SpAddr0GPR26, 10)); + state.emit(PPC64::li(11, 16 * 3)); + state.emit(PPC64::stvx(3, SpAddr0GPR26, 11)); } } } diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S index e00b4904..ad3666fc 100644 --- a/src/jit_compiler_ppc64_static.S +++ b/src/jit_compiler_ppc64_static.S @@ -528,19 +528,17 @@ randomx_ppc64_vm_prologue: // Load the vector constants/literals li %r8, constant_vector_group_e_and_mask-randomx_ppc64_constants + lxvd2x %vs49, %r2, %r8 li %r9, constant_vector_fscal_xor_mask-randomx_ppc64_constants + lxvd2x %vs50, %r2, %r9 li %r10, literal_vector_group_e_or_mask-randomx_ppc64_constants + lvx %v19, %r2, %r10 // Use lvx to load the vector since it's written [ low word, high word ] in memory #if PPC_BIG_ENDIAN li %r11, constant_vector_be_permutation_mask-randomx_ppc64_constants + lxvd2x %vs48, %r2, %r11 // Load the BE permutation mask (not needed for LE) #endif li %r12, constant_vector_byte_reverse_mask-randomx_ppc64_constants - lxvd2x %vs49, %r8, %r2 - lxvd2x %vs50, %r9, %r2 - lvx %v19, %r10, %r2 // Use lvx to load the vector since it's written [ low word, high word ] in memory -#if PPC_BIG_ENDIAN - lxvd2x %vs48, %r11, %r2 // Load the BE permutation mask (not needed for LE) -#endif - lvx %v15, %r12, %r2 + lvx %v15, %r2, %r12 #if PPC_BIG_ENDIAN vperm %v19, %v19, %v19, %v15 // Swap the byte order of the Group E OR mask vector #endif @@ -562,14 +560,14 @@ randomx_ppc64_vm_prologue: // Load a0-a3 from RegisterFile .equ registers_a_base, 8*8+16*4+16*4 - addi %r8, %r28, registers_a_base + 16*0 - addi %r9, %r28, registers_a_base + 16*1 - addi %r10, %r28, registers_a_base + 16*2 - addi %r11, %r28, registers_a_base + 16*3 - lvx %v8, 0, %r8 - lvx %v9, 0, %r9 - lvx %v10, 0, %r10 - lvx %v11, 0, %r11 + li %r8, registers_a_base + 16*0 + lvx %v8, %r28, %r8 + li %r9, 
registers_a_base + 16*1 + lvx %v9, %r28, %r9 + li %r10, registers_a_base + 16*2 + lvx %v10, %r28, %r10 + li %r11, registers_a_base + 16*3 + lvx %v11, %r28, %r11 #if PPC_BIG_ENDIAN vperm %v8, %v8, %v8, %v15 vperm %v9, %v9, %v9, %v15 @@ -614,23 +612,23 @@ randomx_ppc64_vm_fix_loop: .equ registers_f_base, 8*8 li %r8, registers_f_base + 16*0 + stvx %v0, %r28, %r8 li %r9, registers_f_base + 16*1 + stvx %v1, %r28, %r9 li %r10, registers_f_base + 16*2 + stvx %v2, %r28, %r10 li %r11, registers_f_base + 16*3 - stvx %v0, %r8, %r28 - stvx %v1, %r9, %r28 - stvx %v2, %r10, %r28 - stvx %v3, %r11, %r28 + stvx %v3, %r28, %r11 .equ registers_e_base, 8*8+16*4 li %r8, registers_e_base + 16*0 + stvx %v4, %r28, %r8 li %r9, registers_e_base + 16*1 + stvx %v5, %r28, %r9 li %r10, registers_e_base + 16*2 + stvx %v6, %r28, %r10 li %r11, registers_e_base + 16*3 - stvx %v4, %r8, %r28 - stvx %v5, %r9, %r28 - stvx %v6, %r10, %r28 - stvx %v7, %r11, %r28 + stvx %v7, %r28, %r11 // Standard function epilogue ld %r14, 112(%r1) @@ -680,14 +678,13 @@ randomx_ppc64_vm_loop_prologue: xor %r21, %r21, %r11 // Load F registers (v0-v3 / vs32-vs35) from spAddr1 (r27) - //addi %r8, %r27, 8*0 - addi %r9, %r27, 8*1 - addi %r10, %r27, 8*2 - addi %r11, %r27, 8*3 - lxsdx %vs32, 0, %r27 // Use base address directly to avoid an `addi` - lxsdx %vs33, 0, %r9 - lxsdx %vs34, 0, %r10 - lxsdx %vs35, 0, %r11 + lxsdx %vs32, 0, %r27 // Use base address directly to avoid an immediate load + li %r9, 8*1 + lxsdx %vs33, %r27, %r9 + li %r10, 8*2 + lxsdx %vs34, %r27, %r10 + li %r11, 8*3 + lxsdx %vs35, %r27, %r11 SHUFFLE_VR 0 SHUFFLE_VR 1 SHUFFLE_VR 2 @@ -698,14 +695,14 @@ randomx_ppc64_vm_loop_prologue: xvcvsxwdp %vs35, %vs35 // Load E registers (v4-v7 / vs36-vs39) from spAddr1 (r27) and fixup - addi %r8, %r27, 8*4 - addi %r9, %r27, 8*5 - addi %r10, %r27, 8*6 - addi %r11, %r27, 8*7 - lxsdx %vs36, 0, %r8 - lxsdx %vs37, 0, %r9 - lxsdx %vs38, 0, %r10 - lxsdx %vs39, 0, %r11 + li %r8, 8*4 + lxsdx %vs36, %r27, %r8 + li %r9, 8*5 + lxsdx %vs37, %r27, %r9 + li %r10, 8*6 + lxsdx %vs38, %r27, %r10 + li %r11, 8*7 + lxsdx %vs39, %r27, %r11 SHUFFLE_VR 4 SHUFFLE_VR 5 SHUFFLE_VR 6