diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d34e2fe..d3986e16 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,8 +143,22 @@ endif()
 # PowerPC
 if(ARCH_ID STREQUAL "ppc64" OR ARCH_ID STREQUAL "ppc64le")
-  if(ARCH STREQUAL "native")
-    add_flag("-mcpu=native")
+  list(APPEND randomx_sources
+    src/jit_compiler_ppc64_static.S
+    src/jit_compiler_ppc64.cpp)
+
+  set_property(SOURCE src/jit_compiler_ppc64_static.S PROPERTY LANGUAGE C)
+
+  if(ARCH STREQUAL "default")
+    if(ARCH_ID STREQUAL "ppc64le")
+      # Little-endian defaults to POWER8
+      add_flag("-mcpu=power8")
+    else()
+      # Big-endian defaults to POWER7
+      add_flag("-mcpu=power7")
+    endif()
+  else()
+    add_flag("-mcpu=${ARCH}")
   endif()
   # PowerPC AES requires ALTIVEC (POWER7+), so it cannot be enabled in the default build
 endif()
diff --git a/src/common.hpp b/src/common.hpp
index 579752d9..9b92d08a 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -138,6 +138,11 @@ namespace randomx {
 #define RANDOMX_COMPILER_RV64
 	class JitCompilerRV64;
 	using JitCompiler = JitCompilerRV64;
+#elif defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
 	#define RANDOMX_HAVE_COMPILER 1
 	#define RANDOMX_COMPILER_PPC64
 	class JitCompilerPPC64;
 	using JitCompiler = JitCompilerPPC64;
 #else
 	#define RANDOMX_HAVE_COMPILER 0
 	class JitCompilerFallback;
diff --git a/src/cpu.cpp b/src/cpu.cpp
index 3178d037..3faa0f45 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -47,6 +47,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include
 #endif
 
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+	#include <sys/auxv.h>
+	// From asm/cputable.h:
+	#ifndef PPC_FEATURE2_VEC_CRYPTO
+		#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
+	#endif
+	#ifndef PPC_FEATURE2_ARCH_3_00
+		#define PPC_FEATURE2_ARCH_3_00 0x00800000
+	#endif
+#endif
+
 #ifdef __riscv
 #include
 #include
@@ -120,8 +131,11 @@ namespace randomx {
 		sigaction(SIGILL, &old_action, nullptr);
 	}
+#elif defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+		unsigned long hwcaps2 = getauxval(AT_HWCAP2);
+		aes_ = (hwcaps2 & PPC_FEATURE2_VEC_CRYPTO) != 0;
+		v3p0_ = (hwcaps2 & PPC_FEATURE2_ARCH_3_00) != 0;
 #endif
-		//TODO POWER8 AES
 	}
 
 	const Cpu cpu;
diff --git a/src/cpu.hpp b/src/cpu.hpp
index 7db03311..0c5058d6 100644
--- a/src/cpu.hpp
+++ b/src/cpu.hpp
@@ -41,6 +41,9 @@ namespace randomx {
 		inline bool hasRVV() const { return rvv_; }
 		inline int getRVV_Length() const { return rvv_length; }
 #endif
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+		inline bool hasV3P0() const { return v3p0_; }
+#endif
 
 	private:
 		bool aes_ = false;
@@ -49,6 +52,9 @@ namespace randomx {
 #ifdef __riscv
 		bool rvv_ = false;
 		int rvv_length = 0;
+#endif
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)
+		bool v3p0_ = false;
 #endif
 	};
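For reference, the two HWCAP2 bits tested above can be probed from userspace directly. A minimal sketch (Linux-only; the `PPC_FEATURE2_*` fallback values mirror the hunk above), handy for checking what `Cpu::Cpu()` will see on a given machine:

```cpp
// Standalone probe for the HWCAP2 bits the new detection code reads.
// getauxval/AT_HWCAP2 are standard glibc APIs on Linux.
#include <cstdio>
#include <sys/auxv.h>

#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 // POWER8 vector crypto (AES)
#endif
#ifndef PPC_FEATURE2_ARCH_3_00
#define PPC_FEATURE2_ARCH_3_00 0x00800000 // ISA v3.0 (POWER9)
#endif

int main() {
    unsigned long hwcaps2 = getauxval(AT_HWCAP2);
    std::printf("vec_crypto: %s\n", (hwcaps2 & PPC_FEATURE2_VEC_CRYPTO) ? "yes" : "no");
    std::printf("arch_3_00:  %s\n", (hwcaps2 & PPC_FEATURE2_ARCH_3_00) ? "yes" : "no");
}
```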
diff --git a/src/intrin_portable.h b/src/intrin_portable.h
index 10530656..e1a06b12 100644
--- a/src/intrin_portable.h
+++ b/src/intrin_portable.h
@@ -277,11 +277,19 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
 }
 
 FORCE_INLINE rx_vec_f128 rx_cast_vec_i2f(rx_vec_i128 a) {
+#if defined(NATIVE_LITTLE_ENDIAN)
 	return (rx_vec_f128)a;
+#else
+	return (rx_vec_f128)vec_perm((__m128i)a, (__m128i)a, (__m128i){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11});
+#endif
 }
 
 FORCE_INLINE rx_vec_i128 rx_cast_vec_f2i(rx_vec_f128 a) {
+#if defined(NATIVE_LITTLE_ENDIAN)
 	return (rx_vec_i128)a;
+#else
+	return (rx_vec_i128)vec_perm((__m128i)a, (__m128i)a, (__m128i){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11});
+#endif
 }
 
 FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
diff --git a/src/jit_compiler.hpp b/src/jit_compiler.hpp
index 56c0655c..52fce1db 100644
--- a/src/jit_compiler.hpp
+++ b/src/jit_compiler.hpp
@@ -70,6 +70,8 @@ namespace randomx {
 #include "jit_compiler_a64.hpp"
 #elif defined(RANDOMX_COMPILER_RV64)
 #include "jit_compiler_rv64.hpp"
+#elif defined(RANDOMX_COMPILER_PPC64)
+#include "jit_compiler_ppc64.hpp"
 #else
 #include "jit_compiler_fallback.hpp"
 #endif
diff --git a/src/jit_compiler_ppc64.cpp b/src/jit_compiler_ppc64.cpp
new file mode 100644
index 00000000..32b0ef9e
--- /dev/null
+++ b/src/jit_compiler_ppc64.cpp
@@ -0,0 +1,1622 @@
+/*
+Copyright (c) 2023 tevador
+Copyright (c) 2026, Forest Crossman
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <cstdint>
+#include <stdexcept>
+
+#include <unistd.h>
+
+#include "cpu.hpp"
+#include "program.hpp"
+#include "reciprocal.h"
+#include "superscalar.hpp"
+#include "virtual_memory.h"
+
+#include "jit_compiler_ppc64.hpp"
+
+namespace {
+#define HANDLER_ARGS randomx::JitCompilerPPC64* jit, randomx::CompilerState& state, randomx::Instruction isn, int i, randomx_flags flags
+	using InstructionHandler = void(HANDLER_ARGS);
+	extern InstructionHandler* opcodeMap1[256];
+}
+
+namespace PPC64 {
+
+	static inline uint32_t A_form(uint32_t po, uint32_t frt, uint32_t fra, uint32_t frb, uint32_t frc, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(frt <= 0x1F)) throw std::runtime_error("frt <= 0x1F");
+		if (!(fra <= 0x1F)) throw std::runtime_error("fra <= 0x1F");
+		if (!(frb <= 0x1F)) throw std::runtime_error("frb <= 0x1F");
+		if (!(frc <= 0x1F)) throw std::runtime_error("frc <= 0x1F");
+		if (!(xo <= 0x1F)) throw std::runtime_error("xo <= 0x1F");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (frt << 21) | (fra << 16) | (frb << 11) | (frc << 6) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t B_form(uint32_t po, uint32_t bo, uint32_t bi, uint32_t bd, uint32_t aa, uint32_t lk) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(bo <= 0x1F)) throw std::runtime_error("bo <= 0x1F");
+		if (!(bi <= 0x1F)) throw std::runtime_error("bi <= 0x1F");
+		if (!(bd <= 0x3FFF)) throw std::runtime_error("bd <= 0x3FFF");
+		if (!(aa <= 0x1)) throw std::runtime_error("aa <= 0x1");
+		if (!(lk <= 0x1)) throw std::runtime_error("lk <= 0x1");
+		return (po << 26) | (bo << 21) | (bi << 16) | (bd << 2) | (aa << 1) | lk;
+	}
+
+	static inline uint32_t D_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t d) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(d <= 0xFFFF)) throw std::runtime_error("d <= 0xFFFF");
+		return (po << 26) | (rt << 21) | (ra << 16) | d;
+	}
+
+	static inline uint32_t DQ_form(uint32_t po, uint32_t s, uint32_t ra, uint32_t dq, uint32_t sx, uint32_t xo) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(s <= 0x1F)) throw std::runtime_error("s <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(dq <= 0xFFF)) throw std::runtime_error("dq <= 0xFFF");
+		if (!(sx <= 0x1)) throw std::runtime_error("sx <= 0x1");
+		if (!(xo <= 0x7)) throw std::runtime_error("xo <= 0x7");
+		return (po << 26) | (s << 21) | (ra << 16) | (dq << 4) | (sx << 3) | xo;
+	}
+
+	static inline uint32_t DS_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t ds, uint32_t xo) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(ds <= 0x3FFF)) throw std::runtime_error("ds <= 0x3FFF");
+		if (!(xo <= 0x3)) throw std::runtime_error("xo <= 0x3");
+		return (po << 26) | (rt << 21) | (ra << 16) | (ds << 2) | xo;
+	}
+
+	static inline uint32_t I_form(uint32_t po, uint32_t li, uint32_t aa, uint32_t lk) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(li <= 0xFFFFFF)) throw std::runtime_error("li <= 0xFFFFFF");
+		if (!(aa <= 0x1)) throw std::runtime_error("aa <= 0x1");
+		if (!(lk <= 0x1)) throw std::runtime_error("lk <= 0x1");
+		return (po << 26) | (li << 2) | (aa << 1) | lk;
+	}
+
+	static inline uint32_t M_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t sh, uint32_t mb, uint32_t me, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(sh <= 0x1F)) throw std::runtime_error("sh <= 0x1F");
+		if (!(mb <= 0x1F)) throw std::runtime_error("mb <= 0x1F");
+		if (!(me <= 0x1F)) throw std::runtime_error("me <= 0x1F");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (rs << 21) | (ra << 16) | (sh << 11) | (mb << 6) | (me << 1) | rc;
+	}
+
+	static inline uint32_t MD_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t sh, uint32_t mb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(sh <= 0x3F)) throw std::runtime_error("sh <= 0x3F");
+		if (!(mb <= 0x3F)) throw std::runtime_error("mb <= 0x3F");
+		if (!(xo <= 0x7)) throw std::runtime_error("xo <= 0x7");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		uint32_t sh0_4 = sh & 0x1F;
+		uint32_t sh5 = (sh >> 5) & 0x1;
+		uint32_t mb0_4 = mb & 0x1F;
+		uint32_t mb5 = (mb >> 5) & 0x1;
+		return (po << 26) | (rs << 21) | (ra << 16) | (sh0_4 << 11) | (mb0_4 << 6) | (mb5 << 5) | (xo << 2) | (sh5 << 1) | rc;
+	}
+
+	static inline uint32_t MDS_form(uint32_t po, uint32_t rs, uint32_t ra, uint32_t rb, uint32_t mb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rs <= 0x1F)) throw std::runtime_error("rs <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F");
+		if (!(mb <= 0x3F)) throw std::runtime_error("mb <= 0x3F");
+		if (!(xo <= 0xF)) throw std::runtime_error("xo <= 0xF");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		uint32_t mb0_4 = mb & 0x1F;
+		uint32_t mb5 = (mb >> 5) & 0x1;
+		return (po << 26) | (rs << 21) | (ra << 16) | (rb << 11) | (mb0_4 << 6) | (mb5 << 5) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t VA_form(uint32_t po, uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc, uint32_t xo) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(vrt <= 0x1F)) throw std::runtime_error("vrt <= 0x1F");
+		if (!(vra <= 0x1F)) throw std::runtime_error("vra <= 0x1F");
+		if (!(vrb <= 0x1F)) throw std::runtime_error("vrb <= 0x1F");
+		if (!(vrc <= 0x1F)) throw std::runtime_error("vrc <= 0x1F");
+		if (!(xo <= 0x3F)) throw std::runtime_error("xo <= 0x3F");
+		return (po << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | (vrc << 6) | xo;
+	}
+
+	static inline uint32_t VX_form(uint32_t po, uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t xo) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(vrt <= 0x1F)) throw std::runtime_error("vrt <= 0x1F");
+		if (!(vra <= 0x1F)) throw std::runtime_error("vra <= 0x1F");
+		if (!(vrb <= 0x1F)) throw std::runtime_error("vrb <= 0x1F");
+		if (!(xo <= 0x7FF)) throw std::runtime_error("xo <= 0x7FF");
+		return (po << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | xo;
+	}
+
+	static inline uint32_t X_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t rb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F");
+		if (!(xo <= 0x3FF)) throw std::runtime_error("xo <= 0x3FF");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (rt << 21) | (ra << 16) | (rb << 11) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t XFL_form(uint32_t po, uint32_t l, uint32_t flm, uint32_t w, uint32_t frb, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(l <= 0x1)) throw std::runtime_error("l <= 0x1");
+		if (!(flm <= 0xFF)) throw std::runtime_error("flm <= 0xFF");
+		if (!(w <= 0x1)) throw std::runtime_error("w <= 0x1");
+		if (!(frb <= 0x1F)) throw std::runtime_error("frb <= 0x1F");
+		if (!(xo <= 0x3FF)) throw std::runtime_error("xo <= 0x3FF");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (l << 25) | (flm << 17) | (w << 16) | (frb << 11) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t XO_form(uint32_t po, uint32_t rt, uint32_t ra, uint32_t rb, uint32_t oe, uint32_t xo, uint32_t rc) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(rt <= 0x1F)) throw std::runtime_error("rt <= 0x1F");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (!(rb <= 0x1F)) throw std::runtime_error("rb <= 0x1F");
+		if (!(oe <= 0x1)) throw std::runtime_error("oe <= 0x1");
+		if (!(xo <= 0x1FF)) throw std::runtime_error("xo <= 0x1FF");
+		if (!(rc <= 0x1)) throw std::runtime_error("rc <= 0x1");
+		return (po << 26) | (rt << 21) | (ra << 16) | (rb << 11) | (oe << 10) | (xo << 1) | rc;
+	}
+
+	static inline uint32_t XX2_form(uint32_t po, uint32_t t, uint32_t a, uint32_t b, uint32_t xo, uint32_t bx, uint32_t tx) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(t <= 0x1F)) throw std::runtime_error("t <= 0x1F");
+		if (!(a <= 0x1F)) throw std::runtime_error("a <= 0x1F");
+		if (!(b <= 0x1F)) throw std::runtime_error("b <= 0x1F");
+		if (!(xo <= 0x1FF)) throw std::runtime_error("xo <= 0x1FF");
+		if (!(bx <= 0x1)) throw std::runtime_error("bx <= 0x1");
+		if (!(tx <= 0x1)) throw std::runtime_error("tx <= 0x1");
+		return (po << 26) | (t << 21) | (a << 16) | (b << 11) | (xo << 2) | (bx << 1) | tx;
+	}
+
+	static inline uint32_t XX3_form(uint32_t po, uint32_t t, uint32_t a, uint32_t b, uint32_t xo, uint32_t ax, uint32_t bx, uint32_t tx) {
+		if (!(po <= 0x3F)) throw std::runtime_error("po <= 0x3F");
+		if (!(t <= 0x1F)) throw std::runtime_error("t <= 0x1F");
+		if (!(a <= 0x1F)) throw std::runtime_error("a <= 0x1F");
+		if (!(b <= 0x1F)) throw std::runtime_error("b <= 0x1F");
+		if (!(xo <= 0xFF)) throw std::runtime_error("xo <= 0xFF");
+		if (!(ax <= 0x1)) throw std::runtime_error("ax <= 0x1");
+		if (!(bx <= 0x1)) throw std::runtime_error("bx <= 0x1");
+		if (!(tx <= 0x1)) throw std::runtime_error("tx <= 0x1");
+		return (po << 26) | (t << 21) | (a << 16) | (b << 11) | (xo << 3) | (ax << 2) | (bx << 1) | tx;
+	}
+
+	static inline uint32_t b(int32_t offset) {
+		if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned");
+		if (offset < -(1 << 25) || offset >= (1 << 25)) throw std::runtime_error("offset out of range");
+		return I_form(18, (offset >> 2) & 0xFFFFFF, 0, 0);
+	}
+
+	static inline uint32_t bl(int32_t offset) {
+		if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned");
+		if (offset < -(1 << 25) || offset >= (1 << 25)) throw std::runtime_error("offset out of range");
+		return I_form(18, (offset >> 2) & 0xFFFFFF, 0, 1);
+	}
+
+	static inline uint32_t bc(uint32_t bo, uint32_t bi, int32_t offset) {
+		if (!(bo <= 0x1F)) throw std::runtime_error("bo <= 0x1F");
+		if (!(bi <= 0x1F)) throw std::runtime_error("bi <= 0x1F");
+		if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned");
+		if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range");
+		return B_form(16, bo, bi, (offset >> 2) & 0x3FFF, 0, 0);
+	}
+
+	static inline uint32_t beq(int32_t offset) { return bc(0x0C, 2, offset); }
+	static inline uint32_t beq_predict_not_taken(int32_t offset) { return bc(0x0E, 2, offset); }
+	static inline uint32_t bne(int32_t offset) { return bc(0x04, 2, offset); }
+	static inline uint32_t bne_predict_taken(int32_t offset) { return bc(0x07, 2, offset); }
+
+	static inline uint32_t cmpi(uint32_t bf, uint32_t l, uint32_t ra, int32_t si) {
+		if (!(bf <= 0x7)) throw std::runtime_error("bf <= 0x7");
+		if (!(l <= 0x1)) throw std::runtime_error("l <= 0x1");
+		if (!(ra <= 0x1F)) throw std::runtime_error("ra <= 0x1F");
+		if (si < -(1 << 15) || si >= (1 << 15)) throw std::runtime_error("si out of range");
+		return D_form(11, (bf << 2) | l, ra, (uint32_t)si & 0xFFFF); // mask to 16 bits; D_form rejects sign-extended immediates
+	}
+
+	static inline uint32_t addi(uint32_t rt, uint32_t ra, uint32_t si) { return D_form(14, rt, ra, si); }
+	static inline uint32_t addis(uint32_t rt, uint32_t ra, uint32_t si) { return D_form(15, rt, ra, si); }
+	static inline uint32_t ori(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(24, rs, ra, ui); }
+	static inline uint32_t oris(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(25, rs, ra, ui); }
+	static inline uint32_t xori(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(26, rs, ra, ui); }
+	static inline uint32_t xoris(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(27, rs, ra, ui); }
+	static inline uint32_t andi_dot(uint32_t ra, uint32_t rs, uint32_t ui) { return D_form(28, rs, ra, ui); }
+
+	static inline uint32_t add(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 266, 0); }
+	static inline uint32_t subf(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 40, 0); }
+	static inline uint32_t neg(uint32_t rt, uint32_t ra) { return XO_form(31, rt, ra, 0, 0, 104, 0); }
+	static inline uint32_t and_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 28, 0); }
+	static inline uint32_t and_dot(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 28, 1); }
+	static inline uint32_t xor_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 316, 0); }
+	static inline uint32_t or_(uint32_t ra, uint32_t rs, uint32_t rb) { return X_form(31, rs, ra, rb, 444, 0); }
+
+	static inline uint32_t mulld(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 233, 0); }
+	static inline uint32_t mulhdu(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 9, 0); }
+	static inline uint32_t mulhd(uint32_t rt, uint32_t ra, uint32_t rb) { return XO_form(31, rt, ra, rb, 0, 73, 0); }
+
+	static inline uint32_t rlwinm(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb, uint32_t me) { return M_form(21, rs, ra, sh, mb, me, 0); }
+	static inline uint32_t rldicl(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 0, 0); }
+	static inline uint32_t rldicl_dot(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 0, 1); }
+	static inline uint32_t rldicr(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t me) { return MD_form(30, rs, ra, sh, me, 1, 0); }
+	static inline uint32_t rldic(uint32_t ra, uint32_t rs, uint32_t sh, uint32_t mb) { return MD_form(30, rs, ra, sh, mb, 2, 0); }
+	static inline uint32_t rldcl(uint32_t ra, uint32_t rs, uint32_t rb, uint32_t mb) { return MDS_form(30, rs, ra, rb, mb, 8, 0); }
+
+	static inline uint32_t cmpdi(uint32_t rx, int32_t si) { return cmpi(0, 1, rx, si); }
+
+	static inline uint32_t li(uint32_t rx, int32_t si) { return addi(rx, 0, (uint32_t)si & 0xFFFF); } // mask so li(reg, -1) passes D_form's range check
+	static inline uint32_t lis(uint32_t rx, int32_t si) { return addis(rx, 0, (uint32_t)si & 0xFFFF); }
+	static inline uint32_t mr(uint32_t rx, uint32_t ry) { return or_(rx, ry, ry); }
+	static inline uint32_t rotldi(uint32_t ra, uint32_t rs, uint32_t n) { return rldicl(ra, rs, n, 0); }
+	static inline uint32_t rotrdi(uint32_t ra, uint32_t rs, uint32_t n) { return rldicl(ra, rs, 64-n, 0); }
+	static inline uint32_t sldi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicr(rx, ry, n, 63-n); }
+	static inline uint32_t srdi(uint32_t rx, uint32_t ry, uint32_t n) { return rldicl(rx, ry, 64-n, n); }
+
+	static inline uint32_t ld(uint32_t rt, int32_t offset, uint32_t ra) {
+		if (offset & 3) throw std::runtime_error("offset must be 4-byte aligned");
+		if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range");
+		return DS_form(58, rt, ra, (offset >> 2) & 0x3FFF, 0);
+	}
+
+	static inline uint32_t ldx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 21, 0); }
+	static inline uint32_t ldbrx(uint32_t rt, uint32_t ra, uint32_t rb) { return X_form(31, rt, ra, rb, 532, 0); }
+	static inline uint32_t stdx(uint32_t rs, uint32_t ra, uint32_t rb) { return X_form(31, rs, ra, rb, 149, 0); }
+	static inline uint32_t stdbrx(uint32_t rs, uint32_t ra, uint32_t rb) { return X_form(31, rs, ra, rb, 660, 0); }
+
+	static inline uint32_t lfd(uint32_t frt, uint32_t ra, uint32_t d) { return D_form(50, frt, ra, d); }
+	static inline uint32_t lfdx(uint32_t frt, uint32_t ra, uint32_t rb) { return X_form(31, frt, ra, rb, 599, 0); }
+	static inline uint32_t mtfsf(uint32_t flm, uint32_t frb, uint32_t l, uint32_t w) { return XFL_form(63, l, flm, w, frb, 711, 0); }
+	static inline uint32_t mffscrn(uint32_t frt, uint32_t frb) { return X_form(63, frt, 22, frb, 583, 0); } // Only v3.0B and later
+
+	static inline uint32_t lxsdx(uint32_t xt, uint32_t ra, uint32_t rb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		return X_form(31, t, ra, rb, 588, tx);
+	}
+
+	static inline uint32_t lxvd2x(uint32_t xt, uint32_t ra, uint32_t rb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		return X_form(31, t, ra, rb, 844, tx);
+	}
+
+	static inline uint32_t stvx(uint32_t vrs, uint32_t ra, uint32_t rb) { return X_form(31, vrs, ra, rb, 231, 0); }
+
+	static inline uint32_t stxv(uint32_t xs, int32_t offset, uint32_t ra) { // Only v3.0B and later
+		if (!(xs <= 0x3F)) throw std::runtime_error("xs <= 0x3F");
+		if (offset & 0xF) throw std::runtime_error("offset must be 16-byte aligned");
+		if (offset < -(1 << 15) || offset >= (1 << 15)) throw std::runtime_error("offset out of range");
+		uint32_t s = xs & 0x1F;
+		uint32_t sx = xs >> 5;
+		return DQ_form(61, s, ra, (offset >> 4) & 0xFFF, sx, 5);
+	}
+
+	static inline uint32_t vperm(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 43); }
+	static inline uint32_t vsel(uint32_t vrt, uint32_t vra, uint32_t vrb, uint32_t vrc) { return VA_form(4, vrt, vra, vrb, vrc, 42); }
+
+	static inline uint32_t vand(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1028); }
+	static inline uint32_t vor(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1156); }
+	static inline uint32_t vxor(uint32_t vrt, uint32_t vra, uint32_t vrb) { return VX_form(4, vrt, vra, vrb, 1220); }
+
+	static inline uint32_t xxmrghw(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 18, ax, bx, tx);
+	}
+
+	static inline uint32_t xvadddp(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 96, ax, bx, tx);
+	}
+
+	static inline uint32_t xvsubdp(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 104, ax, bx, tx);
+	}
+
+	static inline uint32_t xvmuldp(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 112, ax, bx, tx);
+	}
+
+	static inline uint32_t xvdivdp(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 120, ax, bx, tx);
+	}
+
+	static inline uint32_t xvsqrtdp(uint32_t xt, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX2_form(60, t, 0, b, 203, bx, tx);
+	}
+
+	static inline uint32_t xvcvsxwdp(uint32_t xt, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX2_form(60, t, 0, b, 248, bx, tx);
+	}
+
+	static inline uint32_t xxpermdi(uint32_t xt, uint32_t xa, uint32_t xb, uint32_t dm) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, (dm << 5) | 10, ax, bx, tx);
+	}
+
+	static inline uint32_t xxswapd(uint32_t xt, uint32_t xa) { return xxpermdi(xt, xa, xa, 2); }
+
+	static inline uint32_t xxland(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 130, ax, bx, tx);
+	}
+
+	static inline uint32_t xxlor(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 146, ax, bx, tx);
+	}
+
+	static inline uint32_t xxlxor(uint32_t xt, uint32_t xa, uint32_t xb) {
+		if (!(xt <= 0x3F)) throw std::runtime_error("xt <= 0x3F");
+		if (!(xa <= 0x3F)) throw std::runtime_error("xa <= 0x3F");
+		if (!(xb <= 0x3F)) throw std::runtime_error("xb <= 0x3F");
+		uint32_t t = xt & 0x1F;
+		uint32_t tx = xt >> 5;
+		uint32_t a = xa & 0x1F;
+		uint32_t ax = xa >> 5;
+		uint32_t b = xb & 0x1F;
+		uint32_t bx = xb >> 5;
+		return XX3_form(60, t, a, b, 154, ax, bx, tx);
+	}
+
+}
+
+namespace randomx {
+
+	static const uint8_t* codeConstants = (uint8_t*)&randomx_ppc64_constants;
+	static const uint8_t* codeConstantLutFprcToFpscr = (uint8_t*)&randomx_ppc64_constant_lut_fprc_to_fpscr;
+	static const uint8_t* codeConstantsEnd = (uint8_t*)&randomx_ppc64_constants_end;
+
+	static const uint8_t* codeDatasetInit = (uint8_t*)&randomx_ppc64_dataset_init;
+	static const uint8_t* codeDatasetInitFixCall = (uint8_t*)&randomx_ppc64_dataset_init_fix_call;
+	static const uint8_t* codeDatasetInitEnd = (uint8_t*)&randomx_ppc64_dataset_init_end;
+
+	static const uint8_t* codeSshashSingleItemPrologue = (uint8_t*)&randomx_ppc64_sshash_single_item_prologue;
+	static const uint8_t* codeSshashSingleItemPrologueEnd = (uint8_t*)&randomx_ppc64_sshash_single_item_prologue_end;
+	static const uint8_t* codeSshashSingleItemEpilogue = (uint8_t*)&randomx_ppc64_sshash_single_item_epilogue;
+	static const uint8_t* codeSshashSingleItemEpilogueEnd = (uint8_t*)&randomx_ppc64_sshash_single_item_epilogue_end;
+	static const uint8_t* codeSshashCachePrefetch = (uint8_t*)&randomx_ppc64_sshash_cache_prefetch;
+	static const uint8_t* codeSshashCachePrefetchEnd = (uint8_t*)&randomx_ppc64_sshash_cache_prefetch_end;
+	static const uint8_t* codeSshashXor = (uint8_t*)&randomx_ppc64_sshash_xor;
+	static const uint8_t* codeSshashXorEnd = (uint8_t*)&randomx_ppc64_sshash_xor_end;
+
+	static const uint8_t* codeVmPrologue = (uint8_t*)&randomx_ppc64_vm_prologue;
+	static const uint8_t* codeVmPrologueEnd = (uint8_t*)&randomx_ppc64_vm_prologue_end;
+	static const uint8_t* codeVmEpilogue = (uint8_t*)&randomx_ppc64_vm_epilogue;
+	static const uint8_t* codeVmFixLoop = (uint8_t*)&randomx_ppc64_vm_fix_loop;
+	static const uint8_t* codeVmEpilogueEnd = (uint8_t*)&randomx_ppc64_vm_epilogue_end;
+	static const uint8_t* codeVmLoopPrologue = (uint8_t*)&randomx_ppc64_vm_loop_prologue;
+	static const uint8_t* codeVmLoopPrologueEnd = (uint8_t*)&randomx_ppc64_vm_loop_prologue_end;
+	static const uint8_t* codeVmDataRead = (uint8_t*)&randomx_ppc64_vm_data_read;
+	static const uint8_t* codeVmDataReadEnd = (uint8_t*)&randomx_ppc64_vm_data_read_end;
+	static const uint8_t* codeVmDataReadLight = (uint8_t*)&randomx_ppc64_vm_data_read_light;
+	static const uint8_t* codeVmDataReadLightFixCall = (uint8_t*)&randomx_ppc64_vm_data_read_light_fix_call;
+	static const uint8_t* codeVmDataReadLightEnd = (uint8_t*)&randomx_ppc64_vm_data_read_light_end;
+	static const uint8_t* codeVmSpadStoreGroupR = (uint8_t*)&randomx_ppc64_vm_spad_store_group_r;
+	static const uint8_t* codeVmSpadStoreGroupREnd = (uint8_t*)&randomx_ppc64_vm_spad_store_group_r_end;
+	static const uint8_t* codeVmSpadStoreMixV1 = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1;
+	static const uint8_t* codeVmSpadStoreMixV1End = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v1_end;
+	static const uint8_t* codeVmSpadStoreMixV2HardAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes;
+	static const uint8_t* codeVmSpadStoreMixV2HardAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end;
+	static const uint8_t* codeVmSpadStoreMixV2SoftAes = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_soft_aes;
+	static const uint8_t* codeVmSpadStoreMixV2SoftAesEnd = (uint8_t*)&randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end;
+
+	static const int32_t sizeConstants = codeConstantsEnd - codeConstants;
+
+	static const int32_t sizeDatasetInit = codeDatasetInitEnd - codeDatasetInit;
+
+	static const int32_t sizeSshashSingleItemPrologue = codeSshashSingleItemPrologueEnd - codeSshashSingleItemPrologue;
+	static const int32_t sizeSshashSingleItemEpilogue = codeSshashSingleItemEpilogueEnd - codeSshashSingleItemEpilogue;
+	static const int32_t sizeSshashCachePrefetch = codeSshashCachePrefetchEnd - codeSshashCachePrefetch;
+	static const int32_t sizeSshashXor = codeSshashXorEnd - codeSshashXor;
+
+	static const int32_t sizeVmPrologue = codeVmPrologueEnd - codeVmPrologue;
+	static const int32_t sizeVmEpilogue = codeVmEpilogueEnd - codeVmEpilogue;
+	static const int32_t sizeVmLoopPrologue = codeVmLoopPrologueEnd - codeVmLoopPrologue;
+	static const int32_t sizeVmDataRead = codeVmDataReadEnd - codeVmDataRead;
+	static const int32_t sizeVmDataReadLight = codeVmDataReadLightEnd - codeVmDataReadLight;
+	static const int32_t sizeVmSpadStoreGroupR = codeVmSpadStoreGroupREnd - codeVmSpadStoreGroupR;
+	static const int32_t sizeVmSpadStoreMixV1 = codeVmSpadStoreMixV1End - codeVmSpadStoreMixV1;
+	static const int32_t sizeVmSpadStoreMixV2HardAes = codeVmSpadStoreMixV2HardAesEnd - codeVmSpadStoreMixV2HardAes;
+	static const int32_t sizeVmSpadStoreMixV2SoftAes = codeVmSpadStoreMixV2SoftAesEnd - codeVmSpadStoreMixV2SoftAes;
+	constexpr size_t sizeVmSpadStoreGroupF = 4*12; // Worst case size is 12 instructions
+
+	static const int32_t offsetConstantLutFprcToFpscr = codeConstantLutFprcToFpscr - codeConstants;
+
+	static const int32_t offsetDatasetInitFixCall = codeDatasetInitFixCall - codeDatasetInit;
+
+	static const int32_t offsetVmFixLoop = codeVmFixLoop - codeVmEpilogue;
+	static const int32_t offsetVmDataReadLightFixCall = codeVmDataReadLightFixCall - codeVmDataReadLight;
+
+	constexpr size_t CodeAlign = 64*1024; // 64 kB, to ensure alignment on systems with a page size <= 64 kB
+	constexpr size_t ReciprocalPoolSize = 8 * RANDOMX_PROGRAM_MAX_SIZE; // RANDOMX_PROGRAM_MAX_SIZE 64-bit reciprocals
+	static const size_t ReciprocalPoolPos = sizeConstants + 16; // Add 16 bytes for the Group E OR vector mask
+	static const size_t ConstantPoolSize = alignSize(sizeConstants + 16 + ReciprocalPoolSize, CodeAlign); // Add 16 bytes for the Group E OR vector mask
+	static const size_t ReserveCodeSize = alignSize(sizeVmPrologue + sizeVmEpilogue + sizeVmLoopPrologue + sizeVmDataRead + sizeVmDataReadLight + sizeVmSpadStoreGroupR + sizeVmSpadStoreMixV2SoftAes + sizeVmSpadStoreGroupF, CodeAlign);
+	constexpr size_t MaxRandomXInstrCodeSize = 4*9; // FDIV_M and CFROUND require at most 9 instructions
+	constexpr size_t MaxSuperscalarInstrSize = 4*6; // IMUL_RCP requires at most 6 instructions
+	static const size_t SuperscalarProgramHeaders = sizeSshashSingleItemPrologue + sizeSshashSingleItemEpilogue;
+
+	static const size_t RandomXCodeSize = alignSize(ConstantPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_MAX_SIZE, CodeAlign);
+	static const size_t SuperscalarSize = alignSize(sizeDatasetInit + SuperscalarProgramHeaders + (sizeSshashCachePrefetch + sizeSshashXor + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign);
+
+	static const uint32_t CodeSize = RandomXCodeSize + SuperscalarSize;
+
+	constexpr uint32_t ConstantsBaseAddressRegisterGPR2 = 2;
+	constexpr uint32_t ConstantVectorByteReverseMaskVR15 = 15;
+	constexpr uint32_t ConstantVectorByteReverseMaskVSR47 = 32 + ConstantVectorByteReverseMaskVR15;
+	constexpr uint32_t ConstantVectorBePermutationMaskVR16 = 16;
+	constexpr uint32_t ConstantVectorBePermutationMaskVSR48 = 32 + ConstantVectorBePermutationMaskVR16;
+	constexpr uint32_t ConstantVectorGroupEAndMaskVR17 = 17;
+	constexpr uint32_t ConstantVectorGroupEAndMaskVSR49 = 32 + ConstantVectorGroupEAndMaskVR17;
+	constexpr uint32_t ConstantVectorFscalXorMaskVR18 = 18;
+	constexpr uint32_t ConstantVectorFscalXorMaskVSR50 = 32 + ConstantVectorFscalXorMaskVR18;
+	constexpr uint32_t ConstantVectorGroupEOrMaskVR19 = 19;
+	constexpr uint32_t ConstantVectorGroupEOrMaskVSR51 = 32 + ConstantVectorGroupEOrMaskVR19;
+
+	constexpr uint32_t MaGPR24 = 24;
+	constexpr uint32_t MxGPR25 = 25;
+	constexpr uint32_t SpAddr0GPR26 = 26;
+	constexpr uint32_t SpAddr1GPR27 = 27;
+	constexpr uint32_t ScratchpadPointerGPR30 = 30;
+
+	template <size_t N>
+	struct GprMap {
+		uint32_t regs[N];
+		uint32_t getPpcGprNum(uint8_t idx) const {
+			return regs[idx % N];
+		}
+	};
+
+	template <size_t N>
+	struct VsrMap {
+		uint32_t regs[N];
+		uint32_t getPpcVrNum(uint8_t idx) const {
+			return regs[idx % N];
+		}
+		uint32_t getPpcVsrNum(uint8_t idx) const {
+			return regs[idx % N] + 32;
+		}
+	};
+
+	static const GprMap<8> RegisterMapR = {{ 14, 15, 16, 17, 18, 19, 20, 21 }};
+	static const VsrMap<4> RegisterMapF = {{ 0, 1, 2, 3 }};
+	static const VsrMap<4> RegisterMapE = {{ 4, 5, 6, 7 }};
+	static const VsrMap<4> RegisterMapA = {{ 8, 9, 10, 11 }};
+	static const VsrMap<8> RegisterMapFE = {{ 0, 1, 2, 3, 4, 5, 6, 7 }};
+
+	static const GprMap<8> RegisterMapSsh = {{ 4, 6, 7, 9, 10, 11, 12, 22 }};
+
+	template <typename T> static constexpr size_t Log2(T value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; }
+
+	constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
+		return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
+	}
+
+	static void syncInstructionCache(void* start_ptr, void* end_ptr) {
+		// Apparently GCC compiles __builtin___clear_cache to nothing, so we use LLVM's implementation instead.
+		//
+		// This code has been modified from compiler-rt/lib/builtins/clear_cache.c, found at
+		// https://github.com/llvm/llvm-project revision 7459e10f34aa86952b1620d0cb48b40be112ebe9.
+		//
+		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+		// See https://llvm.org/LICENSE.txt for license information.
+		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+		char* start = (char*)start_ptr;
+		char* end = (char*)end_ptr;
+		const size_t len = (uintptr_t)end - (uintptr_t)start;
+		if (len == 0) return;
+
+		// Query data and instruction cache line sizes
+		long dcache_val = 0;
+		long icache_val = 0;
+
+#ifdef _SC_LEVEL1_DCACHE_LINESIZE
+		dcache_val = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+#endif
+#ifdef _SC_LEVEL1_ICACHE_LINESIZE
+		icache_val = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+#endif
+
+		// Default to 32 bytes if querying the line size fails
+		const size_t d_line_size = (dcache_val > 0) ? dcache_val : 32;
+		const size_t i_line_size = (icache_val > 0) ? icache_val : 32;
+
+		// Flush Data Cache
+		const uintptr_t d_mask = ~(d_line_size - 1);
+		const uintptr_t d_start_line = ((uintptr_t)start) & d_mask;
+		const uintptr_t d_end_line = ((uintptr_t)start + len + d_line_size - 1) & d_mask;
+
+		for (uintptr_t line = d_start_line; line < d_end_line; line += d_line_size)
+			__asm__ volatile("dcbst 0, %0" : : "r"(line));
+
+		// Wait for memory writes to complete
+		__asm__ volatile("sync");
+
+		// Invalidate Instruction Cache
+		const uintptr_t i_mask = ~(i_line_size - 1);
+		const uintptr_t i_start_line = ((uintptr_t)start) & i_mask;
+		const uintptr_t i_end_line = ((uintptr_t)start + len + i_line_size - 1) & i_mask;
+
+		for (uintptr_t line = i_start_line; line < i_end_line; line += i_line_size)
+			__asm__ volatile("icbi 0, %0" : : "r"(line));
+
+		// Flush the local instruction pipeline
+		__asm__ volatile("isync");
+	}
+
+	static void emitLoadGpr64(CompilerState& state, uint32_t rt, uint32_t ra, uint32_t rb) {
+		if (PPC_BIG_ENDIAN) {
+			state.emit(PPC64::ldbrx(rt, ra, rb));
+		} else {
+			state.emit(PPC64::ldx(rt, ra, rb));
+		}
+	}
+
+	static void emitStoreGpr64(CompilerState& state, uint32_t rs, uint32_t ra, uint32_t rb) {
+		if (PPC_BIG_ENDIAN) {
+			state.emit(PPC64::stdbrx(rs, ra, rb));
+		} else {
+			state.emit(PPC64::stdx(rs, ra, rb));
+		}
+	}
+
+	static void emitLoadVr64(CompilerState& state, uint32_t vrt, uint32_t ra, uint32_t rb) {
+		// We need to load the two packed little-endian signed 32-bit integers into a VSR, then we need to
+		// shuffle them so they're in the correct halves of the VSR register and in the correct byte order,
+		// and then we need to convert the signed 32-bit ints to doubles.
+		uint32_t xt = 32 + vrt;
+		state.emit(PPC64::lxsdx(xt, ra, rb));
+		if (PPC_BIG_ENDIAN) {
+			// Register XT contains the value as [ 0123 4567 zzzz zzzz ]
+			state.emit(PPC64::vperm(vrt, vrt, vrt, ConstantVectorBePermutationMaskVR16)); // Shuffles values in XT to be [ 7654 7654 3210 3210 ]
+		} else {
+			// Register XT contains the value as [ 7654 3210 zzzz zzzz ]
+			state.emit(PPC64::xxmrghw(xt, xt, xt)); // Shuffles values in XT to be [ 7654 7654 3210 3210 ]
+		}
+		state.emit(PPC64::xvcvsxwdp(xt, xt)); // Needs values in XT as [ 7654 zzzz 3210 zzzz ]
+	}
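Since RandomX memory semantics are little-endian, the helpers above pick the byte-reversing forms (`ldbrx`/`stdbrx`) on big-endian hosts so the generated code still observes the scratchpad in little-endian layout. A host-side C++ sketch of the same idea (hypothetical helper, not part of the patch):

```cpp
// What emitLoadGpr64 amounts to: a plain 64-bit load on little-endian
// hosts (ldx), a byte-reversed load on big-endian hosts (ldbrx), so both
// read the same little-endian scratchpad value.
#include <cstdint>
#include <cstring>

static uint64_t loadScratchpad64(const void* p) {
    uint64_t v;
    std::memcpy(&v, p, sizeof v);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    v = __builtin_bswap64(v); // the ldbrx path
#endif
    return v;
}
```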
+
+	static void emitMovImm32(CompilerState& state, int reg, uint32_t imm) {
+		// Move signed 32-bit immediate into 64-bit register.
+		// Note that `imm` is a `uint32_t` and not an `int32_t` for type compatibility--it has no effect on
+		// functionality because `lis` will automatically sign-extend the 16-bit value.
+		int32_t simm = (int32_t)imm;
+		if (simm >= -32768 && simm <= 32767) {
+			state.emit(PPC64::li(reg, simm & 0xFFFF));
+		} else {
+			uint16_t upper = (imm >> 16) & 0xFFFF;
+			uint16_t lower = (imm >> 0) & 0xFFFF;
+
+			state.emit(PPC64::lis(reg, upper));
+			if (lower)
+				state.emit(PPC64::ori(reg, reg, lower));
+		}
+	}
+
+	static void emitAddImm32(CompilerState& state, uint32_t tmpReg, int dstReg, int srcReg, uint32_t imm) {
+		int32_t simm = (int32_t)imm;
+		if (simm >= -32768 && simm <= 32767) {
+			state.emit(PPC64::addi(dstReg, srcReg, simm & 0xFFFF));
+		} else if ((imm & 0xFFFF) == 0) {
+			state.emit(PPC64::addis(dstReg, srcReg, (imm >> 16) & 0xFFFF));
+		} else {
+			// Notes on optimization:
+			//
+			// 1. Performing an `addis` -> `addi` is not a complete replacement for `lis` -> `ori` -> `add`, as constants in the
+			//    range 0x7FFF8000 to 0x7FFFFFFF cannot be handled by `addis` -> `addi`. So to be able to handle all constants,
+			//    `lis` -> `ori` -> `add` must always be available as a fallback.
+			// 2. In the context of RandomX, `addis` -> `addi` is almost always slower than `lis` -> `ori` -> `add`. The reason
+			//    for this is subtle--with `addis` -> `addi`, execution blocks at the `addis` as the CPU waits for the source
+			//    register to become ready, and `addi` can't be executed because it depends on the result of `addis`. In
+			//    contrast, `lis` -> `ori` to a temporary register can almost always be executed while the CPU waits for the
+			//    source register to become ready, and so execution will usually only block on the single `add` instruction. So
+			//    despite significantly reducing the total number of instructions executed, using `addis` -> `addi` instead of
+			//    `lis` -> `ori` -> `add` results in a significant reduction in IPC (-5%) and a small overall reduction in
+			//    performance (-0.5%).
+			emitMovImm32(state, tmpReg, imm);
+			state.emit(PPC64::add(dstReg, srcReg, tmpReg));
+		}
+	}
+
+	static void emitMovImm64(CompilerState& state, int reg, uint64_t imm) {
+		if (imm == (uint64_t)(int64_t)(int32_t)imm) {
+			// Values that can be represented by loading a 32-bit signed immediate
+			emitMovImm32(state, reg, (uint32_t)imm);
+		} else {
+			uint64_t lowestBit = imm & -(int64_t)imm;
+			uint64_t added = imm + lowestBit;
+			if (imm != 0 && imm != ~0ULL && (added & (added - 1)) == 0) {
+				// Values that are a contiguous sequence of 1s
+				uint32_t mb = added == 0 ? 0 : __builtin_clzll(added) + 1; // +1: clz(added) is the IBM bit one above the run's top
+				uint32_t me = 63 - __builtin_ctzll(lowestBit);
+				state.emit(PPC64::li(reg, -1));
+				if (mb == 0) {
+					state.emit(PPC64::rldicr(reg, reg, 0, me));
+				} else if (me == 63) {
+					state.emit(PPC64::rldicl(reg, reg, 0, mb));
+				} else {
+					state.emit(PPC64::rldic(reg, reg, 63 - me, mb));
+				}
+				return;
+			}
+
+			// Values that can be generated by loading a <=32-bit immediate and rotating it
+			for (int i = 1; i < 64; ++i) {
+				uint64_t rot = (imm << i) | (imm >> (64 - i));
+				if (rot == (uint64_t)(int64_t)(int32_t)rot) {
+					emitMovImm32(state, reg, (uint32_t)rot);
+					state.emit(PPC64::rotldi(reg, reg, 64 - i));
+					return;
+				}
+			}
+
+			// All other values
+			uint32_t high = imm >> 32;
+			uint32_t low = imm & 0xFFFFFFFF;
+
+			if (high) {
+				emitMovImm32(state, reg, high);
+				state.emit(PPC64::sldi(reg, reg, 32));
+			} else {
+				state.emit(PPC64::li(reg, 0));
+			}
+
+			uint16_t lower = (low >> 16) & 0xFFFF;
+			uint16_t lowest = low & 0xFFFF;
+
+			if (lower)
+				state.emit(PPC64::oris(reg, reg, lower));
+
+			if (lowest)
+				state.emit(PPC64::ori(reg, reg, lowest));
+		}
+	}
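The run-of-ones path deserves a worked example: for imm = 0x0000FFFFFFFF0000, lowestBit = 2^16 and added = 2^48, giving mb = clz(added) + 1 = 16 and me = 63 - 16 = 47 in the IBM bit numbering used by `rldic`, so the constant is built in just two instructions (`li reg, -1` plus one rotate-and-mask). A host-side sketch of the same mask arithmetic, useful for unit-testing the derivation:

```cpp
// Recompute the (mb, me) pair chosen above and rebuild the constant the
// way li -1 followed by rldic/rldicl/rldicr would. Pure host-side C++.
#include <cassert>
#include <cstdint>

static uint64_t rebuildContiguousOnes(uint64_t imm) {
    uint64_t lowestBit = imm & (0 - imm);
    uint64_t added = imm + lowestBit;
    assert(imm != 0 && imm != ~0ULL && (added & (added - 1)) == 0);
    unsigned mb = added == 0 ? 0 : __builtin_clzll(added) + 1; // IBM bit of the run's top
    unsigned me = 63 - __builtin_ctzll(lowestBit);             // IBM bit of the run's bottom
    // MASK(mb, me): ones from IBM bit mb through me (bit 0 = MSB);
    // rotating an all-ones register is a no-op, so the mask is the constant.
    return (~0ULL >> mb) & (~0ULL << (63 - me));
}
// rebuildContiguousOnes(0x0000FFFFFFFF0000ULL) == 0x0000FFFFFFFF0000ULL
```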
+
+	static void emitLoadGprFromScratchpad(CompilerState& state, uint32_t tmp_gpr, uint32_t dst, uint32_t src, Instruction& instr) {
+		uint32_t imm = instr.getImm32();
+
+		if (src != dst) {
+			uint32_t size = instr.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2;
+			imm &= size - 1;
+			emitAddImm32(state, tmp_gpr, tmp_gpr, src, imm);
+
+			uint32_t mb = 32 - Log2(size);
+			state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28));
+		} else {
+			imm &= ScratchpadL3Mask;
+			emitMovImm64(state, tmp_gpr, imm);
+		}
+
+		emitLoadGpr64(state, tmp_gpr, ScratchpadPointerGPR30, tmp_gpr);
+	}
+
+	static void emitLoadVsrFromScratchpad(CompilerState& state, uint32_t tmp_gpr, uint32_t tmp_vr, Instruction& instr) {
+		int src = RegisterMapR.getPpcGprNum(instr.src);
+
+		uint32_t imm = instr.getImm32();
+		uint32_t size = instr.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2;
+		imm &= size - 1;
+		emitAddImm32(state, tmp_gpr, tmp_gpr, src, imm);
+
+		uint32_t mb = 32 - Log2(size);
+		state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28));
+
+		emitLoadVr64(state, tmp_vr, ScratchpadPointerGPR30, tmp_gpr);
+	}
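The single `rlwinm` in these two loaders implements the usual RandomX scratchpad mask in one instruction: with mb = 32 - log2(size) and me = 28 it is exactly `(addr & (size - 1)) & ~7`, i.e. an 8-byte-aligned offset into L1 or L2. A quick host-side check of that equivalence:

```cpp
// Verify that the rlwinm mask (sh = 0, mb = 32 - log2(size), me = 28)
// equals (size - 1) & ~7 for a scratchpad level size.
#include <cassert>
#include <cstdint>

static uint32_t rlwinmMask(unsigned mb, unsigned me) { // IBM32 numbering, bit 0 = MSB
    uint32_t m = 0;
    for (unsigned i = mb; i <= me; ++i)
        m |= 1u << (31 - i);
    return m;
}
// e.g. a 16 KiB L1: rlwinmMask(32 - 14, 28) == ((16384u - 1) & ~7u)
```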
+
+	static void emitVmSpadStoreGroupF(CompilerState& state) {
+		// Store F registers to scratchpad at spAddr0
+		if (randomx::cpu.hasV3P0()) {
+			if (PPC_BIG_ENDIAN) {
+				state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stxv(32 + 12, 16 * 0, SpAddr0GPR26));
+				state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stxv(32 + 13, 16 * 1, SpAddr0GPR26));
+				state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stxv(32 + 14, 16 * 2, SpAddr0GPR26));
+				state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stxv(32 + 12, 16 * 3, SpAddr0GPR26));
+			} else {
+				state.emit(PPC64::stxv(32 + 0, 16 * 0, SpAddr0GPR26));
+				state.emit(PPC64::stxv(32 + 1, 16 * 1, SpAddr0GPR26));
+				state.emit(PPC64::stxv(32 + 2, 16 * 2, SpAddr0GPR26));
+				state.emit(PPC64::stxv(32 + 3, 16 * 3, SpAddr0GPR26));
+			}
+		} else {
+			if (PPC_BIG_ENDIAN) {
+				state.emit(PPC64::vperm(12, 0, 0, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::stvx(12, 0, SpAddr0GPR26)); // RA=0 for zero offset
+				state.emit(PPC64::vperm(13, 1, 1, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::li(9, 16 * 1));
+				state.emit(PPC64::stvx(13, SpAddr0GPR26, 9));
+				state.emit(PPC64::vperm(14, 2, 2, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::li(10, 16 * 2));
+				state.emit(PPC64::stvx(14, SpAddr0GPR26, 10));
+				state.emit(PPC64::vperm(12, 3, 3, ConstantVectorByteReverseMaskVR15));
+				state.emit(PPC64::li(11, 16 * 3));
+				state.emit(PPC64::stvx(12, SpAddr0GPR26, 11));
+			} else {
+				state.emit(PPC64::stvx(0, 0, SpAddr0GPR26)); // RA=0 for zero offset
+				state.emit(PPC64::li(9, 16 * 1));
+				state.emit(PPC64::stvx(1, SpAddr0GPR26, 9));
+				state.emit(PPC64::li(10, 16 * 2));
+				state.emit(PPC64::stvx(2, SpAddr0GPR26, 10));
+				state.emit(PPC64::li(11, 16 * 3));
+				state.emit(PPC64::stvx(3, SpAddr0GPR26, 11));
+			}
+		}
+	}
+
+	uint32_t JitCompilerPPC64::getTempGpr() {
+		static const uint32_t gprs[] = {6, 7, 8, 9, 10, 11, 12};
+		uint32_t reg = gprs[tempGprIndex];
+		tempGprIndex = (tempGprIndex + 1) % 7;
+		return reg;
+	}
+
+	uint32_t JitCompilerPPC64::getTempVr() {
+		static const uint32_t vrs[] = {12, 13, 14};
+		uint32_t reg = vrs[tempVrIndex];
+		tempVrIndex = (tempVrIndex + 1) % 3;
+		return reg;
+	}
+
+	void JitCompilerPPC64::emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags) {
+		// Set the Group E OR vector mask
+		state.emitAt(sizeConstants, pcfg.eMask[0]);
+		state.emitAt(sizeConstants + 8, pcfg.eMask[1]);
+
+		state.codePos = RandomXCodePos;
+
+		state.emit(codeVmPrologue, sizeVmPrologue);
+		// Mask mx and ma with Scratchpad L3 mask
+		uint32_t mask_begin = 32 - Log2(RANDOMX_SCRATCHPAD_L3);
+		uint32_t mask_end = 31 - Log2(RANDOMX_DATASET_ITEM_SIZE);
+		state.emit(PPC64::rlwinm(SpAddr0GPR26, MxGPR25, 0, mask_begin, mask_end));
+		state.emit(PPC64::rlwinm(SpAddr1GPR27, MaGPR24, 0, mask_begin, mask_end));
+		// Init spAddr0 to masked mx + scratchpad base
+		state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30));
+		// Init spAddr1 to masked ma + scratchpad base
+		state.emit(PPC64::add(SpAddr1GPR27, SpAddr1GPR27, ScratchpadPointerGPR30));
+
+		LoopBeginPos = state.codePos;
+
+		state.emit(codeVmLoopPrologue, sizeVmLoopPrologue);
+
+		// Reset the reciprocal pool counter
+		state.rcpCount = 0;
+
+		// Step 4: The 256 instructions stored in the Program Buffer are executed.
+		for (unsigned i = 0; i < RegistersCount; ++i) {
+			state.registerUsage[i] = -1;
+		}
+		for (unsigned i = 0; i < prog.getSize(flags); ++i) {
+			Instruction instr = prog(i);
+			instr.src %= RegistersCount;
+			instr.dst %= RegistersCount;
+			state.instructionOffsets[i] = state.codePos;
+			opcodeMap1[instr.opcode](this, state, instr, i, flags);
+		}
+	}
+
+	void JitCompilerPPC64::emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags) {
+		state.emit(codeVmSpadStoreGroupR, sizeVmSpadStoreGroupR);
+
+		if (flags & RANDOMX_FLAG_V2) {
+			if (flags & RANDOMX_FLAG_HARD_AES) {
+				if (!randomx::cpu.hasAes()) {
+					throw std::runtime_error("This CPU is missing support for hardware AES!");
+				}
+				state.emit(codeVmSpadStoreMixV2HardAes, sizeVmSpadStoreMixV2HardAes);
+			} else {
+				state.emit(codeVmSpadStoreMixV2SoftAes, sizeVmSpadStoreMixV2SoftAes);
+			}
+		} else {
+			state.emit(codeVmSpadStoreMixV1, sizeVmSpadStoreMixV1);
+		}
+
+		emitVmSpadStoreGroupF(state);
+
+		state.emit(PPC64::xor_(SpAddr0GPR26, RegisterMapR.getPpcGprNum(pcfg.readReg0), RegisterMapR.getPpcGprNum(pcfg.readReg1)));
+
+		// spAddr1 (r27) = r26 >> 32
+		state.emit(PPC64::srdi(SpAddr1GPR27, SpAddr0GPR26, 32));
+		// spAddr0 (r26) = r26 & 0xFFFFFFFF
+		state.emit(PPC64::rldicl(SpAddr0GPR26, SpAddr0GPR26, 0, 32));
+
+		// Apply Scratchpad L3 mask
+		uint32_t mb = 32 - Log2(RANDOMX_SCRATCHPAD_L3);
+		uint32_t me = 31 - Log2(RANDOMX_DATASET_ITEM_SIZE);
+		state.emit(PPC64::rlwinm(SpAddr0GPR26, SpAddr0GPR26, 0, mb, me));
+		state.emit(PPC64::rlwinm(SpAddr1GPR27, SpAddr1GPR27, 0, mb, me));
+
+		// Add scratchpad base pointer (r30)
+		state.emit(PPC64::add(SpAddr0GPR26, SpAddr0GPR26, ScratchpadPointerGPR30));
+		state.emit(PPC64::add(SpAddr1GPR27, SpAddr1GPR27, ScratchpadPointerGPR30));
+
+		int32_t fixPos = state.codePos;
+		state.emit(codeVmEpilogue, sizeVmEpilogue);
+
+		int32_t fixContinuePos = fixPos + offsetVmFixLoop;
+		state.emitAt(fixContinuePos, PPC64::b(LoopBeginPos - fixContinuePos));
+	}
+
+	JitCompilerPPC64::JitCompilerPPC64() {
+		state.code = (uint8_t*) allocMemoryPages(CodeSize);
+		if (state.code == nullptr)
+			throw std::runtime_error("allocMemoryPages");
+
+		state.codePos = 0;
+		state.emit(codeConstants, sizeConstants);
+
+		state.codePos = ConstantPoolSize;
+		entryProgram = state.code + state.codePos;
+		if (PPC_ABI_V2) {
+			// Load r2 with the base address of the constant pool
+			emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast<uint64_t>(state.code));
+		}
+		RandomXCodePos = state.codePos;
+
+		state.codePos = RandomXCodeSize;
+		entryDataInit = state.code + state.codePos;
+		if (PPC_ABI_V2) {
+			// Load r2 with the base address of the constant pool
+			emitMovImm64(state, ConstantsBaseAddressRegisterGPR2, reinterpret_cast<uint64_t>(state.code));
+		}
+		int32_t datasetInitFixCallPos = state.codePos + offsetDatasetInitFixCall;
+		state.emit(codeDatasetInit, sizeDatasetInit);
+		SshashSingleItemPos = alignSize(state.codePos, 128);
+		// Patch in the call to the SuperScalar Hash single item function
+		state.emitAt(datasetInitFixCallPos, PPC64::bl(SshashSingleItemPos - datasetInitFixCallPos));
+
+#if !PPC_ABI_V2
+		// Initialize the ABI V1 function descriptors
+		descriptorProgram[0] = reinterpret_cast<uint64_t>(entryProgram);
+		descriptorProgram[1] = reinterpret_cast<uint64_t>(state.code);
+		descriptorProgram[2] = 0;
+
+		descriptorDataInit[0] = reinterpret_cast<uint64_t>(entryDataInit);
+		descriptorDataInit[1] = reinterpret_cast<uint64_t>(state.code);
+		descriptorDataInit[2] = 0;
+#endif
+	}
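On big-endian targets the ELFv1 ABI represents a function pointer as a three-doubleword descriptor rather than a raw code address, which is why the constructor fills in these triples for the two generated entry points. Schematically (a sketch of the standard layout; the actual arrays are class members):

```cpp
// ELFv1 "function descriptor": callers fetch the entry address and the
// TOC pointer (r2) from this triple. The JIT reuses the r2 slot for the
// base of its constant pool, and leaves the environment word at zero.
#include <cstdint>

struct FunctionDescriptor {
    uint64_t entry; // descriptorProgram[0]: address of the first instruction
    uint64_t toc;   // descriptorProgram[1]: loaded into r2 (constant-pool base)
    uint64_t env;   // descriptorProgram[2]: unused, 0
};
```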
+
+	JitCompilerPPC64::~JitCompilerPPC64() {
+		freePagedMemory(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableWriting() {
+		setPagesRW(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableExecution() {
+		setPagesRX(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::enableAll() {
+		setPagesRWX(state.code, CodeSize);
+	}
+
+	void JitCompilerPPC64::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
+		emitProgramPrefix(state, prog, pcfg, flags);
+
+		int mtReg = MaGPR24;
+		int mpReg = MxGPR25;
+
+		if (flags & RANDOMX_FLAG_V2) {
+			// Step 5a: Save ma in mt (r9, temporary)
+			mtReg = 9;
+			state.emit(PPC64::mr(mtReg, MaGPR24));
+
+			mpReg = MaGPR24;
+		}
+
+		// Step 5b: the mp register is XORed with the low 32 bits of registers readReg2 and readReg3
+		state.emit(PPC64::xor_(8, RegisterMapR.getPpcGprNum(pcfg.readReg2), RegisterMapR.getPpcGprNum(pcfg.readReg3)));
+		// Zero-extend r8 to 32 bits (clear upper 32 bits)
+		state.emit(PPC64::rldicl(8, 8, 0, 32));
+		// mp ^= (readReg2 ^ readReg3)
+		state.emit(PPC64::xor_(mpReg, mpReg, 8));
+
+		int32_t dataReadPos = state.codePos;
+		state.emit(codeVmDataRead, sizeVmDataRead);
+
+		uint32_t mask_begin = 32 - Log2(RANDOMX_DATASET_BASE_SIZE);
+		uint32_t mask_end = 31 - Log2(CacheLineSize);
+
+		// Patch prefetch address calculation (offset 0)
+		state.emitAt(dataReadPos, PPC64::rlwinm(8, mpReg, 0, mask_begin, mask_end));
+
+		// Patch read address calculation (offset 12)
+		state.emitAt(dataReadPos + 12, PPC64::rlwinm(8, mtReg, 0, mask_begin, mask_end));
+
+		emitProgramSuffix(state, pcfg, flags);
+
+		syncInstructionCache(entryProgram, state.code + state.codePos);
+	}
+
+	void JitCompilerPPC64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
+		emitProgramPrefix(state, prog, pcfg, flags);
+
+		int mtReg = MaGPR24;
+		int mpReg = MxGPR25;
+
+		if (flags & RANDOMX_FLAG_V2) {
+			// Step 5a: Save ma in mt (r9, temporary)
+			mtReg = 9;
+			state.emit(PPC64::mr(mtReg, MaGPR24));
+
+			mpReg = MaGPR24;
+		}
+
+		// Step 5b: the mp register is XORed with the low 32 bits of registers readReg2 and readReg3
+		state.emit(PPC64::xor_(8, RegisterMapR.getPpcGprNum(pcfg.readReg2), RegisterMapR.getPpcGprNum(pcfg.readReg3)));
+		// Zero-extend r8 to 32 bits (clear upper 32 bits)
+		state.emit(PPC64::rldicl(8, 8, 0, 32));
+		// mp ^= (readReg2 ^ readReg3)
+		state.emit(PPC64::xor_(mpReg, mpReg, 8));
+
+		// Calculate itemNumber = (mt & datasetMask) / CacheLineSize
+		uint32_t datasetMask = (RANDOMX_DATASET_BASE_SIZE - 1) & ~63;
+		emitMovImm32(state, 8, datasetMask);
+		state.emit(PPC64::and_(5, mtReg, 8)); // r5 = mt & datasetMask
+		state.emit(PPC64::srdi(5, 5, Log2(CacheLineSize))); // r5 = r5 >> 6
+
+		emitAddImm32(state, 8, 5, 5, datasetOffset / CacheLineSize);
+
+		int32_t callPos = state.codePos + offsetVmDataReadLightFixCall;
+		state.emit(codeVmDataReadLight, sizeVmDataReadLight);
+		state.emitAt(callPos, PPC64::bl(SshashSingleItemPos - callPos));
+
+		emitProgramSuffix(state, pcfg, flags);
+
+		syncInstructionCache(entryProgram, state.code + state.codePos);
+	}
+
+	static void generateSuperscalarCode(CompilerState& state, Instruction instr, const std::vector<uint64_t>& reciprocalCache) {
+		int dst = RegisterMapSsh.getPpcGprNum(instr.dst);
+		int src = RegisterMapSsh.getPpcGprNum(instr.src);
+		uint32_t rotation = instr.getImm32() & 63;
+
+		switch ((SuperscalarInstructionType)instr.opcode) {
+		case SuperscalarInstructionType::ISUB_R:
+			// subf dst, src, dst
+			state.emit(PPC64::subf(dst, src, dst));
+			break;
+		case SuperscalarInstructionType::IXOR_R:
+			// xor dst, dst, src
+			state.emit(PPC64::xor_(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IADD_RS:
+			// sldi r8, src, shift
+			state.emit(PPC64::sldi(8, src, instr.getModShift()));
+			// add dst, dst, r8
+			state.emit(PPC64::add(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IMUL_R:
+			// mulld dst, dst, src
+			state.emit(PPC64::mulld(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IROR_C:
+			if (rotation) {
+				// rotrdi dst, dst, imm
+				state.emit(PPC64::rotrdi(dst, dst, rotation));
+			}
+			break;
+		case SuperscalarInstructionType::IADD_C7:
+		case SuperscalarInstructionType::IADD_C8:
+		case SuperscalarInstructionType::IADD_C9:
+			emitMovImm32(state, 8, instr.getImm32());
+			// add dst, dst, r8
+			state.emit(PPC64::add(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IXOR_C7:
+		case SuperscalarInstructionType::IXOR_C8:
+		case SuperscalarInstructionType::IXOR_C9:
+			emitMovImm32(state, 8, instr.getImm32());
+			// xor dst, dst, r8
+			state.emit(PPC64::xor_(dst, dst, 8));
+			break;
+		case SuperscalarInstructionType::IMULH_R:
+			// mulhdu dst, dst, src
+			state.emit(PPC64::mulhdu(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::ISMULH_R:
+			// mulhd dst, dst, src
+			state.emit(PPC64::mulhd(dst, dst, src));
+			break;
+		case SuperscalarInstructionType::IMUL_RCP:
+			emitMovImm64(state, 8, reciprocalCache[instr.getImm32()]);
+			// mulld dst, dst, r8
+			state.emit(PPC64::mulld(dst, dst, 8));
+			break;
+		default:
+			UNREACHABLE;
+		}
+	}
+
+	void JitCompilerPPC64::generateSuperscalarHash(SuperscalarProgramList& programs, std::vector<uint64_t> &reciprocalCache) {
+		state.codePos = SshashSingleItemPos;
+
+		// Steps 1 and 2
+		state.emit(codeSshashSingleItemPrologue, sizeSshashSingleItemPrologue);
+
+		for (size_t i = 0; i < programs.size(); ++i) {
+			SuperscalarProgram& prog = programs[i];
+
+			// Step 4
+			// rldic r8, r5, Log2(CacheLineSize), 64 - Log2(CacheSize / CacheLineSize) - Log2(CacheLineSize)
+			state.emit(PPC64::rldic(8, 5, Log2(CacheLineSize), 64 - Log2(CacheSize / CacheLineSize) - Log2(CacheLineSize)));
+			state.emit(codeSshashCachePrefetch + 4, sizeSshashCachePrefetch - 4);
+
+			// Step 5
+			for (uint32_t j = 0; j < prog.getSize(); ++j) {
+				Instruction& instr = prog(j);
+				generateSuperscalarCode(state, instr, reciprocalCache);
+			}
+
+			// Step 6
+			state.emit(codeSshashXor, sizeSshashXor);
+
+			uint32_t addrReg = RegisterMapSsh.getPpcGprNum(prog.getAddressRegister());
+			state.emit(PPC64::mr(5, addrReg));
+
+		}
+
+		// Return
+		state.emit(codeSshashSingleItemEpilogue, sizeSshashSingleItemEpilogue);
+
+		syncInstructionCache(entryDataInit, state.code + state.codePos);
+	}
+
+	size_t JitCompilerPPC64::getCodeSize() {
+		return CodeSize;
+	}
+
+	static void h_IADD_RS(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		int shift = isn.getModShift();
+
+		if (shift) {
+			uint32_t tmp_gpr = jit->getTempGpr();
+			state.emit(PPC64::sldi(tmp_gpr, src, shift));
+			state.emit(PPC64::add(dst, dst, tmp_gpr));
+		} else {
+			state.emit(PPC64::add(dst, dst, src));
+		}
+
+		if (isn.dst == RegisterNeedsDisplacement) {
+			emitAddImm32(state, jit->getTempGpr(), dst, dst, isn.getImm32());
+		}
+	}
+	static void h_IADD_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::add(dst, dst, tmp_gpr));
+	}
+	static void h_ISUB_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		if (isn.src != isn.dst) {
+			int src = RegisterMapR.getPpcGprNum(isn.src);
+			state.emit(PPC64::subf(dst, src, dst));
+		} else {
+			int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32());
+			emitAddImm32(state, jit->getTempGpr(), dst, dst, imm);
+		}
+	}
+	static void h_ISUB_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::subf(dst, tmp_gpr, dst));
+	}
+	static void h_IMUL_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		if (isn.src != isn.dst) {
+			int src = RegisterMapR.getPpcGprNum(isn.src);
+			state.emit(PPC64::mulld(dst, dst, src));
+		} else {
+			uint32_t tmp_gpr = jit->getTempGpr();
+			emitMovImm32(state, tmp_gpr, isn.getImm32());
+			state.emit(PPC64::mulld(dst, dst, tmp_gpr));
+		}
+	}
+	static void h_IMUL_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::mulld(dst, dst, tmp_gpr));
+	}
+	static void h_IMULH_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		state.emit(PPC64::mulhdu(dst, dst, src));
+	}
+	static void h_IMULH_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::mulhdu(dst, dst, tmp_gpr));
+	}
+	static void h_ISMULH_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		state.emit(PPC64::mulhd(dst, dst, src));
+	}
+	static void h_ISMULH_M(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		int src = RegisterMapR.getPpcGprNum(isn.src);
+		uint32_t tmp_gpr = jit->getTempGpr();
+		emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn);
+		state.emit(PPC64::mulhd(dst, dst, tmp_gpr));
+	}
+	static void h_IMUL_RCP(HANDLER_ARGS) {
+		uint32_t divisor = isn.getImm32();
+		if (!isZeroOrPowerOf2(divisor)) {
+			state.registerUsage[isn.dst] = i;
+			int dst = RegisterMapR.getPpcGprNum(isn.dst);
+			uint32_t tmp_gpr = jit->getTempGpr();
+
+			// Calculate and cache the reciprocal
+			int32_t offset = ReciprocalPoolPos + 8 * state.rcpCount++;
+			uint64_t rcp = randomx_reciprocal_fast(divisor);
+			state.emitAt(offset, rcp);
+
+			state.emit(PPC64::ld(tmp_gpr, offset, ConstantsBaseAddressRegisterGPR2));
+			state.emit(PPC64::mulld(dst, dst, tmp_gpr));
+		}
+	}
+	static void h_INEG_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		state.emit(PPC64::neg(dst, dst));
+	}
+	static void h_IXOR_R(HANDLER_ARGS) {
+		state.registerUsage[isn.dst] = i;
+		int dst = RegisterMapR.getPpcGprNum(isn.dst);
+		if (isn.src != isn.dst)
{ + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::xor_(dst, dst, src)); + } else { + // Note: RandomX 32-bit immediates are sign-extended to 64 bits. + // xori/xoris zero-extend their 16-bit immediate, so they only match + // the sign-extended semantics when the imm32 is non-negative as a + // signed 32-bit value (i.e., <= 0x7FFFFFFF). + uint32_t imm = isn.getImm32(); + if (imm <= 0xFFFF) { + // Fits in unsigned 16 bits; XOR of upper bits is a no-op. + state.emit(PPC64::xori(dst, dst, imm)); + } else if ((imm & 0xFFFF) == 0 && imm <= 0x7FFFFFFF) { + // Only the high 16 bits are nonzero, and the value is non-negative. + state.emit(PPC64::xoris(dst, dst, (imm >> 16) & 0xFFFF)); + } else { + uint32_t tmp_gpr = jit->getTempGpr(); + emitMovImm32(state, tmp_gpr, imm); + state.emit(PPC64::xor_(dst, dst, tmp_gpr)); + } + } + } + static void h_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t tmp_gpr = jit->getTempGpr(); + emitLoadGprFromScratchpad(state, tmp_gpr, dst, src, isn); + state.emit(PPC64::xor_(dst, dst, tmp_gpr)); + } + static void h_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t tmp_gpr = jit->getTempGpr(); + state.emit(PPC64::neg(tmp_gpr, src)); + state.emit(PPC64::rldcl(dst, dst, tmp_gpr, 0)); + } else { + uint32_t imm = isn.getImm32() & 63; + if (imm) + state.emit(PPC64::rotrdi(dst, dst, imm)); + } + } + static void h_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + if (isn.src != isn.dst) { + int src = RegisterMapR.getPpcGprNum(isn.src); + state.emit(PPC64::rldcl(dst, dst, src, 0)); + } else { + uint32_t imm = isn.getImm32() & 63; + if (imm) + state.emit(PPC64::rotldi(dst, dst, imm)); + } + } + static void h_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t tmp_gpr = jit->getTempGpr(); + state.emit(PPC64::mr(tmp_gpr, dst)); + state.emit(PPC64::mr(dst, src)); + state.emit(PPC64::mr(src, tmp_gpr)); + } + } + static void h_FSWAP_R(HANDLER_ARGS) { + int dst = RegisterMapFE.getPpcVsrNum(isn.dst); + state.emit(PPC64::xxswapd(dst, dst)); + } + static void h_FADD_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvadddp(dst, dst, src)); + } + static void h_FADD_M(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::xvadddp(dst, dst, 32 + tmp_vr)); + } + static void h_FSUB_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvsubdp(dst, dst, src)); + } + static void h_FSUB_M(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVsrNum(isn.dst); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::xvsubdp(dst, dst, 32 + tmp_vr)); + } + static void h_FSCAL_R(HANDLER_ARGS) { + int dst = RegisterMapF.getPpcVrNum(isn.dst); + state.emit(PPC64::vxor(dst, dst, 
ConstantVectorFscalXorMaskVR18)); + } + static void h_FMUL_R(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + int src = RegisterMapA.getPpcVsrNum(isn.src); + state.emit(PPC64::xvmuldp(dst, dst, src)); + } + static void h_FDIV_M(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + uint32_t tmp_gpr = jit->getTempGpr(); + uint32_t tmp_vr = jit->getTempVr(); + emitLoadVsrFromScratchpad(state, tmp_gpr, tmp_vr, isn); + state.emit(PPC64::vsel(tmp_vr, ConstantVectorGroupEOrMaskVR19, tmp_vr, ConstantVectorGroupEAndMaskVR17)); + state.emit(PPC64::xvdivdp(dst, dst, 32 + tmp_vr)); + } + static void h_FSQRT_R(HANDLER_ARGS) { + int dst = RegisterMapE.getPpcVsrNum(isn.dst); + state.emit(PPC64::xvsqrtdp(dst, dst)); + } + static void h_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + ConditionOffset; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + + int dst = RegisterMapR.getPpcGprNum(reg); + emitAddImm32(state, jit->getTempGpr(), dst, dst, imm); + + // Calculate the Mask Begin (MB) parameter + uint32_t mb = 64 - RANDOMX_JUMP_BITS; + + // rldicl. tmp_gpr, dst, 64 - shift, mb + state.emit(PPC64::rldicl_dot(jit->getTempGpr(), dst, (64 - shift) & 63, mb)); + + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + + if (offset >= -(1 << 15) && offset < (1 << 15)) { + state.emit(PPC64::beq_predict_not_taken(offset)); + } else { + // Branch over the jump if not equal + state.emit(PPC64::bne_predict_taken(8)); + state.emit(PPC64::b(offset - 4)); + } + + for (unsigned j = 0; j < RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + static void h_CFROUND(HANDLER_ARGS) { + int src = RegisterMapR.getPpcGprNum(isn.src); + int32_t rotateBits = isn.getImm32() & 63; + + // Operate directly on src by default + int rot_src = src; + + // Rotate right by rotateBits + if (rotateBits) { + uint32_t tmp_gpr = jit->getTempGpr(); + + // rotrdi tmp_gpr, src, rotateBits + state.emit(PPC64::rotrdi(tmp_gpr, src, rotateBits)); + + // We rotated src and put the new value in tmp_gpr + rot_src = tmp_gpr; + } + + int32_t patch_pos = 0; + if (flags & RANDOMX_FLAG_V2) { + // Skip the rest of the code if bits 5:2 are not zero. Use GPR0 as a discard register. + // andi. r0, rot_src, 0x003C + state.emit(PPC64::andi_dot(0, rot_src, 0x003C)); + + // Get position to patch with conditional branch. + patch_pos = state.codePos; + + // Emit invalid instruction now and patch later once we have the code length. 
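+		// Note: a word of all zeros is an invalid instruction form on Power,
+		// so if this placeholder were ever left unpatched, executing it would
+		// trap rather than silently do the wrong thing.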
+ state.emit(0); // bne skip_update + } + + uint32_t offset_gpr = jit->getTempGpr(); + + // Isolate bits 1:0 and multiply by 8 (shift left by 3) to get the table byte offset (0, 8, 16, 24) + // rldic offset_gpr, rot_src, 3, 59 + state.emit(PPC64::rldic(offset_gpr, rot_src, 3, 59)); + + uint32_t address_gpr = jit->getTempGpr(); + + // Load table address into scratch address_gpr + emitAddImm32(state, jit->getTempGpr(), address_gpr, ConstantsBaseAddressRegisterGPR2, offsetConstantLutFprcToFpscr); + + // Load value from fprc-to-FPSCR table into temporary FPR0 + // lfdx f0, offset_gpr, address_gpr + state.emit(PPC64::lfdx(0, offset_gpr, address_gpr)); + + if (randomx::cpu.hasV3P0()) { + // Move the RN value from scratch FPR0 to FPSCR field RN + // mffscrn f0, f0 + state.emit(PPC64::mffscrn(0, 0)); + } else { + // Move the RN value from scratch FPR0 to FPSCR (masked) + // mtfsf 0x01, f0, 0, 0 + state.emit(PPC64::mtfsf(0x01, 0, 0, 0)); + } + + if (flags & RANDOMX_FLAG_V2) { + // Patch in the conditional branch instruction. We predict that the branch is taken because + // there's only a 1-in-16 chance of bits 5:2 of the rotated value being equal to zero and + // falling through to the RN-update code. + int32_t branch_offset = state.codePos - patch_pos; + state.emitAt(patch_pos, PPC64::bne_predict_taken(branch_offset)); + } + } + static void h_ISTORE(HANDLER_ARGS) { + int dst = RegisterMapR.getPpcGprNum(isn.dst); + int src = RegisterMapR.getPpcGprNum(isn.src); + uint32_t imm = isn.getImm32(); + uint32_t tmp_gpr = jit->getTempGpr(); + + uint32_t size; + if (isn.getModCond() < StoreL3Condition) { + size = isn.getModMem() ? RANDOMX_SCRATCHPAD_L1 : RANDOMX_SCRATCHPAD_L2; + } else { + size = RANDOMX_SCRATCHPAD_L3; + } + imm &= size - 1; + + emitAddImm32(state, jit->getTempGpr(), tmp_gpr, dst, imm); + + uint32_t mb = 32 - Log2(size); + state.emit(PPC64::rlwinm(tmp_gpr, tmp_gpr, 0, mb, 28)); + + emitStoreGpr64(state, src, ScratchpadPointerGPR30, tmp_gpr); + } + static void h_NOP(HANDLER_ARGS) { + } +} + +#include "instruction_weights.hpp" + +namespace { + +#define INST_HANDLE(x) REPN(&randomx::h_##x, WT(x)) + + InstructionHandler* opcodeMap1[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) + }; + +#undef INST_HANDLE +} + +#define INST_HANDLE(x) REPN(static_cast<uint8_t>(randomx::InstructionType::x), WT(x)) + +alignas(128) uint8_t randomx::JitCompilerPPC64::instMap[256] = { + INST_HANDLE(IADD_RS) + INST_HANDLE(IADD_M) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IMUL_RCP) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) + INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) + INST_HANDLE(FSCAL_R) + 
INST_HANDLE(FMUL_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) + INST_HANDLE(CBRANCH) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(NOP) +}; diff --git a/src/jit_compiler_ppc64.hpp b/src/jit_compiler_ppc64.hpp new file mode 100644 index 00000000..b9392d92 --- /dev/null +++ b/src/jit_compiler_ppc64.hpp @@ -0,0 +1,120 @@ +/* +Copyright (c) 2023 tevador +Copyright (c) 2026, Forest Crossman + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include <vector> + +#include "common.hpp" +#include "jit_compiler.hpp" + +#include "jit_compiler_ppc64_static.hpp" + +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + #define PPC_BIG_ENDIAN 1 +#else + #define PPC_BIG_ENDIAN 0 +#endif + +#if (defined(_CALL_ELF) && _CALL_ELF == 2) || (!defined(_CALL_ELF) && !PPC_BIG_ENDIAN) + #define PPC_ABI_V2 1 +#else + #define PPC_ABI_V2 0 +#endif + +namespace randomx { + + class Program; + struct ProgramConfiguration; + class SuperscalarProgram; + class Instruction; + + class JitCompilerPPC64 { + public: + JitCompilerPPC64(); + ~JitCompilerPPC64(); + + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + + void generateSuperscalarHash(SuperscalarProgramList& programs, std::vector<uint64_t> &); + + void generateDatasetInitCode() {} + + ProgramFunc* getProgramFunc() { +#if PPC_ABI_V2 + return reinterpret_cast<ProgramFunc*>(entryProgram); +#else + return reinterpret_cast<ProgramFunc*>(descriptorProgram); +#endif + } + DatasetInitFunc* getDatasetInitFunc() { +#if PPC_ABI_V2 + return reinterpret_cast<DatasetInitFunc*>(entryDataInit); +#else + return reinterpret_cast<DatasetInitFunc*>(descriptorDataInit); +#endif + } + uint8_t* getCode() { return state.code; } + size_t getCodeSize(); + + void enableWriting(); + void enableExecution(); + void enableAll(); + + void setFlags(randomx_flags f) { flags = f; } + + uint32_t getTempGpr(); + uint32_t getTempVr(); + + static uint8_t instMap[256]; + + private: + void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg, randomx_flags flags); + void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg, randomx_flags flags); + + CompilerState state; + randomx_flags flags; + + void* entryDataInit = nullptr; + void* entryProgram = nullptr; +#if !PPC_ABI_V2 + uint64_t descriptorProgram[3]; + uint64_t descriptorDataInit[3]; +#endif + + int32_t RandomXCodePos; + int32_t SshashSingleItemPos; + int32_t LoopBeginPos; + + uint32_t tempGprIndex = 0; + uint32_t tempVrIndex = 0; + }; + +} diff --git a/src/jit_compiler_ppc64_static.S b/src/jit_compiler_ppc64_static.S new file mode 100644 index 00000000..ad3666fc --- /dev/null +++ b/src/jit_compiler_ppc64_static.S @@ -0,0 +1,1389 @@ +/* +Copyright (c) 2026, Forest Crossman + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + .machine power7 + .machine altivec + .section ".rodata" // Not .text because it's not meant to be executed in-place. + +#include "configuration.h" + +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + #define PPC_BIG_ENDIAN 1 +#else + #define PPC_BIG_ENDIAN 0 +#endif + +#if (defined(_CALL_ELF) && _CALL_ELF == 2) || (!defined(_CALL_ELF) && !PPC_BIG_ENDIAN) + #define PPC_ABI_V2 1 +#else + #define PPC_ABI_V2 0 +#endif + +#if PPC_ABI_V2 + .abiversion 2 + #define C_FUNCTION(name) \ + .global name; \ + name: +#else + .abiversion 1 + #define C_FUNCTION(name) \ + .section ".opd","aw"; \ + .align 3; \ + .global name; \ + name: \ + .quad .name, .TOC.@tocbase, 0; \ + .previous; \ + .global .name; \ + .name: +#endif + + .global randomx_ppc64_constants + .global randomx_ppc64_constant_lut_fprc_to_fpscr + .global randomx_ppc64_constants_end + + .global randomx_ppc64_dataset_init + .global randomx_ppc64_dataset_init_fix_call + .global randomx_ppc64_dataset_init_end + + .global randomx_ppc64_sshash_single_item_prologue + .global randomx_ppc64_sshash_single_item_prologue_end + .global randomx_ppc64_sshash_single_item_epilogue + .global randomx_ppc64_sshash_single_item_epilogue_end + .global randomx_ppc64_sshash_cache_prefetch + .global randomx_ppc64_sshash_cache_prefetch_end + .global randomx_ppc64_sshash_xor + .global randomx_ppc64_sshash_xor_end + + .global randomx_ppc64_vm_prologue + .global randomx_ppc64_vm_prologue_end + .global randomx_ppc64_vm_epilogue + .global randomx_ppc64_vm_fix_loop + .global randomx_ppc64_vm_epilogue_end + .global randomx_ppc64_vm_loop_prologue + .global randomx_ppc64_vm_loop_prologue_end + .global randomx_ppc64_vm_data_read + .global randomx_ppc64_vm_data_read_end + .global randomx_ppc64_vm_data_read_light + .global randomx_ppc64_vm_data_read_light_fix_call + .global randomx_ppc64_vm_data_read_light_end + .global randomx_ppc64_vm_spad_store_group_r + .global randomx_ppc64_vm_spad_store_group_r_end + .global randomx_ppc64_vm_spad_store_mix_v1 + .global randomx_ppc64_vm_spad_store_mix_v1_end + .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes + .global randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end + .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes + .global randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end + +// Macro to shuffle a VR after being loaded with lxsdx. 
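+// lxsdx places the 8 loaded bytes in the most-significant doubleword of the
+// target VSR, while xvcvsxwdp (used in the loop prologue below) takes its two
+// signed-word inputs from word elements 0 and 2. The shuffle therefore
+// replicates the two 32-bit halves of the loaded doubleword into those word
+// elements; on big-endian it also byte-swaps the little-endian scratchpad
+// words via the permutation mask in v16.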
+.macro SHUFFLE_VR vr_reg +#if PPC_BIG_ENDIAN + vperm \vr_reg, \vr_reg, \vr_reg, %v16 +#else + xxmrghw \vr_reg + 32, \vr_reg + 32, \vr_reg + 32 +#endif +.endm + +// Macro to load a GPR from little-endian bytes in memory +// Clobbers (BE only): r0 +.macro LOAD_LE_GPR reg, offset, base_reg +#if PPC_BIG_ENDIAN + li %r0, \offset + ldbrx \reg, \base_reg, %r0 +#else + ld \reg, \offset(\base_reg) +#endif +.endm + +// Macro to store a GPR to memory as little-endian bytes +// Clobbers (BE only): r0 +.macro STORE_LE_GPR reg, offset, base_reg +#if PPC_BIG_ENDIAN + li %r0, \offset + stdbrx \reg, \base_reg, %r0 +#else + std \reg, \offset(\base_reg) +#endif +.endm + + // Align constants to 128 bytes (lowest 7 bits masked) + .align 7 + +randomx_ppc64_constants: + +sshash_constant_0: .8byte 6364136223846793005 +sshash_constant_1: .8byte 9298411001130361340 +sshash_constant_2: .8byte 12065312585734608966 +sshash_constant_3: .8byte 9306329213124626780 +sshash_constant_4: .8byte 5281919268842080866 +sshash_constant_5: .8byte 10536153434571861004 +sshash_constant_6: .8byte 3398623926847679864 +sshash_constant_7: .8byte 9549104520008361294 + +randomx_ppc64_constant_lut_fprc_to_fpscr: + // RandomX fprc to PPC64 FPSCR lookup table + .8byte 0 /* 00 Round to Nearest */ + .8byte 3 /* 11 Round toward -Infinity */ + .8byte 2 /* 10 Round toward +Infinity */ + .8byte 1 /* 01 Round toward Zero */ + + // Align vector constants to 16 bytes (lowest 4 bits masked) + .align 4 + +constant_vector_group_e_and_mask: + .8byte 0x00FFFFFFFFC00000 + .8byte 0x00FFFFFFFFC00000 + +constant_vector_fscal_xor_mask: + .8byte 0x80F0000000000000 + .8byte 0x80F0000000000000 + +constant_vector_byte_reverse_mask: + // Vector byte reverse mask + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#if PPC_BIG_ENDIAN +constant_vector_be_permutation_mask: + // Big-endian vector permutation mask + .byte 7, 6, 5, 4, 7, 6, 5, 4 + .byte 3, 2, 1, 0, 3, 2, 1, 0 +#endif + +constant_vector_soft_aes_galois_field_inversion_lo: + .octa 0xf001080d0f06050e020c0b0a09030704 +constant_vector_soft_aes_galois_field_inversion_hi: + .octa 0xf0070b0f060a0401090805020c0e0d03 +constant_vector_soft_aes_mixcolumns_forward: + .octa 0x03000102070405060b08090a0f0c0d0e +constant_vector_soft_aes_mixcolumns_backward: + .octa 0x0102030005060704090a0b080d0e0f0c + +constant_vector_soft_aes_shiftrows: + .octa 0x04090e03080d02070c01060b00050a0f +constant_vector_soft_aes_encrypt_input_transform_lo: + .octa 0x00702a5a98e8b2c20878225290e0baca +constant_vector_soft_aes_encrypt_input_transform_hi: + .octa 0x004d7c317d30014c81ccfdb0fcb180cd +constant_vector_soft_aes_subbytes_mul1_lo: + .octa 0x0023e2fa15d41836efd92e0dc1ccf73b +constant_vector_soft_aes_subbytes_mul1_hi: + .octa 0x003e50cb8fe19bb144f52a146e7adfa5 +constant_vector_soft_aes_subbytes_mul2_lo: + .octa 0x0029e10a4088eb694a2382abc863a1c2 +constant_vector_soft_aes_subbytes_mul2_hi: + .octa 0x0024710bc6937ae2cd2f98bc55e9b75e +constant_vector_soft_aes_encrypt_63: + .octa 0x5b5b5b5b5b5b5b5b5b5b5b5b5b5b5b5b +constant_vector_soft_aes_encrypt_output_transform_lo: + .octa 0x0060b6d629499fff0868bede214197f7 +constant_vector_soft_aes_encrypt_output_transform_hi: + .octa 0x00ecbc5051bded01e00c5cb0b15d0de1 + +constant_vector_soft_aes_invshiftrows: + .octa 0x0c090603000d0a0704010e0b0805020f +constant_vector_soft_aes_decrypt_input_transform_lo: + .octa 0x005f540b045b500f1a454e111e414a15 +constant_vector_soft_aes_decrypt_input_transform_hi: + .octa 0x00650560e683e38694f191f472177712 +constant_vector_soft_aes_invsubbytes_mul9_lo: + .octa 
0x00d6869a53031c85c94c994f501fd5ca +constant_vector_soft_aes_invsubbytes_mul9_hi: + .octa 0x0049d7ec89173bc065a5fbb29e2c5e72 +constant_vector_soft_aes_invsubbytes_mulD_lo: + .octa 0x00a2b1e6dfcc577d39442a88139b6ef5 +constant_vector_soft_aes_invsubbytes_mulD_hi: + .octa 0x00cbc624f7fae23cd3efde150d183129 +constant_vector_soft_aes_invsubbytes_mulB_lo: + .octa 0x0042b496926422d004d4f2b0f6462660 +constant_vector_soft_aes_invsubbytes_mulB_hi: + .octa 0x006759cda69894c16baa55323e0cfff3 +constant_vector_soft_aes_invsubbytes_mulE_lo: + .octa 0x00d0d4269692f246b0f6b46404604222 +constant_vector_soft_aes_invsubbytes_mulE_hi: + .octa 0x00c1aaffcda6550c323e59986bf36794 +constant_vector_soft_aes_decrypt_63: + .octa 0xe8e8e8e8e8e8e8e8e8e8e8e8e8e8e8e8 +constant_vector_soft_aes_decrypt_output_transform_lo: + .octa 0x0024dffb0420dbfff8dc2703fcd82307 +constant_vector_soft_aes_decrypt_output_transform_hi: + .octa 0x002f19362906301fab84b29d82ad9bb4 + +randomx_ppc64_constants_end: + +literal_vector_group_e_or_mask: + // Program generator will write the vector here + + +// Register allocations: dataset_init +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to randomx_cache +// r4 (volatile) -> arg1, pointer to dataset (uint8_t *) +// r5 (volatile) -> arg2, uint32_t startBlock / itemNumber / initial cacheIndex +// r6 (volatile) -> arg3, uint32_t endBlock +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r4 (volatile) -> arg1, pointer to dataset (uint8_t *) +// r5 (volatile) -> arg2, uint32_t startBlock / itemNumber / initial cacheIndex +// r6 (volatile) -> arg3, uint32_t endBlock +// r7-r12 (volatile) -> scratch registers +// r14 (non-volatile) -> saved pointer to dataset (uint8_t *) +// r15 (non-volatile) -> saved itemNumber +// r16-r31 (non-volatile) -> unused + +randomx_ppc64_dataset_init: + // JIT compiler MUST emit immediate load to r2 before this code (ABI v2 only) + + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -128(%r1) + std %r14, 112(%r1) + std %r15, 120(%r1) + + // Load cache->memory pointer + ld %r3, 0(%r3) + + // Save the dataset pointer (r4) to r14 + mr %r14, %r4 + + // Save the itemNumber (r5) to r15 + mr %r15, %r5 + + // Loop setup + // for (size_t itemNumber = startBlock; itemNumber < endBlock; itemNumber++) { ... } + sub %r8, %r6, %r5 + mtctr %r8 + +1: + // r5 gets clobbered by the item-hashing function, so we need to restore it + // from r15 before calling the function again. 
+ mr %r5, %r15 + +randomx_ppc64_dataset_init_fix_call: + // JIT compiler MUST patch this to bl to the item hashing function + b 0 + + // Store the 64 computed bytes back in the dataset + std %r4, 8*0(%r14) + std %r6, 8*1(%r14) + std %r7, 8*2(%r14) + std %r9, 8*3(%r14) + std %r10, 8*4(%r14) + std %r11, 8*5(%r14) + std %r12, 8*6(%r14) + std %r5, 8*7(%r14) + + // Increment the dataset pointer by 64 bytes + addi %r14, %r14, 8*8 + + // Increment the itemNumber by one + addi %r15, %r15, 1 + + // Loop + bdnz 1b + + // Standard function epilogue + ld %r14, 112(%r1) + ld %r15, 120(%r1) + addi %r1, %r1, 128 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_dataset_init_end: + + +// Register allocations: sshash_single_item +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r5 (volatile) -> arg2, uint32_t itemNumber +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to cache memory +// r4 (volatile) -> SuperscalarHash r0 +// r5 (volatile) -> cacheIndex, set to SuperscalarHash r7 on return +// r6-r7 (volatile) -> SuperscalarHash r1-r2 +// r8 (volatile) -> scratch register +// r9-r12 (volatile) -> SuperscalarHash r3-r6 +// r14-r21 (non-volatile) -> unused +// r22 (non-volatile) -> SuperscalarHash r7 +// r23 (non-volatile) -> cache line address +// r24-r31 (non-volatile) -> unused + +randomx_ppc64_sshash_single_item_prologue: + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -128(%r1) + std %r22, 112(%r1) + std %r23, 120(%r1) + + // Step 1. Initialize registers + + // r0 = (itemNumber + 1) * 6364136223846793005 + ld %r8, (sshash_constant_0-randomx_ppc64_constants)(%r2) + addi %r0, %r5, 1 + mulld %r4, %r8, %r0 + + // r1 = r0 ^ 9298411001130361340 + ld %r8, (sshash_constant_1-randomx_ppc64_constants)(%r2) + xor %r6, %r4, %r8 + + // r2 = r0 ^ 12065312585734608966 + ld %r8, (sshash_constant_2-randomx_ppc64_constants)(%r2) + xor %r7, %r4, %r8 + + // r3 = r0 ^ 9306329213124626780 + ld %r8, (sshash_constant_3-randomx_ppc64_constants)(%r2) + xor %r9, %r4, %r8 + + // r4 = r0 ^ 5281919268842080866 + ld %r8, (sshash_constant_4-randomx_ppc64_constants)(%r2) + xor %r10, %r4, %r8 + + // r5 = r0 ^ 10536153434571861004 + ld %r8, (sshash_constant_5-randomx_ppc64_constants)(%r2) + xor %r11, %r4, %r8 + + // r6 = r0 ^ 3398623926847679864 + ld %r8, (sshash_constant_6-randomx_ppc64_constants)(%r2) + xor %r12, %r4, %r8 + + // r7 = r0 ^ 9549104520008361294 + ld %r8, (sshash_constant_7-randomx_ppc64_constants)(%r2) + xor %r22, %r4, %r8 + + // Step 2. Use r5 (itemNumber) as cacheIndex so it can be used to generate the initial cache line mask + +randomx_ppc64_sshash_single_item_prologue_end: + +randomx_ppc64_sshash_single_item_epilogue: + // Return SuperscalarHash r7 in GPR5 + mr %r5, %r22 + + // Standard function epilogue + ld %r22, 112(%r1) + ld %r23, 120(%r1) + addi %r1, %r1, 128 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_sshash_single_item_epilogue_end: + + +// Step 4. Load a 64-byte item from the Cache. The item index is given by cacheIndex modulo the total number of 64-byte items in Cache. 
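+// As a worked example (assuming the default RandomX parameters: a 256 MiB
+// cache of 64-byte items, i.e. 2^22 items), the JIT patches the rldic below
+// to:
+//
+//   rldic %r8, %r5, 6, 36   // r8 = (cacheIndex << 6) & (CacheSize - 64)
+//
+// which is the byte offset of item (cacheIndex mod 2^22) within the cache.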
+randomx_ppc64_sshash_cache_prefetch: + // Actual mask MUST be inserted by JIT compiler + rldic %r8, %r5, 0, 63 + add %r23, %r3, %r8 + dcbt 0, %r23, 0 + // If TH=0b00000, the dcbt/dcbtst instruction provides a + // hint that the program will probably soon access the + // block containing the byte addressed by EA. + +randomx_ppc64_sshash_cache_prefetch_end: + +// Step 6. XOR all registers with data loaded from randomx cache +randomx_ppc64_sshash_xor: + ld %r8, 0(%r23) + ld %r0, 8(%r23) + xor %r4, %r4, %r8 + xor %r6, %r6, %r0 + ld %r8, 16(%r23) + ld %r0, 24(%r23) + xor %r7, %r7, %r8 + xor %r9, %r9, %r0 + ld %r8, 32(%r23) + ld %r0, 40(%r23) + xor %r10, %r10, %r8 + xor %r11, %r11, %r0 + ld %r8, 48(%r23) + ld %r0, 56(%r23) + xor %r12, %r12, %r8 + xor %r22, %r22, %r0 + +randomx_ppc64_sshash_xor_end: + + +// Register allocations: vm +// +// Passed on entry: +// +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0, pointer to RegisterFile +// r4 (volatile) -> arg1, pointer to MemoryRegisters +// r5 (volatile) -> arg2, pointer to scratchpad (uint8_t *) +// r6 (volatile) -> arg3, uint64_t loop iterations +// +// After prologue: +// +// r0 (volatile) -> scratch register +// r1 (non-volatile) -> stack pointer (sp) +// r2 (volatile) -> constants base address (randomx_ppc64_constants), patched in by JIT, restored from stack frame by caller +// r3 (volatile) -> arg0 for SuperScalarHash (dataset pointer) +// r4 (volatile) -> scratch register +// r5 (volatile) -> arg2 for SuperScalarHash (loop iteration) +// r6-r12 (volatile) -> scratch registers +// r14-r21 (non-volatile) -> RandomX integer registers r0-r7 +// r22 (non-volatile) -> dataset pointer (memory) +// r23 (non-volatile) -> unused +// r24 (non-volatile) -> ma +// r25 (non-volatile) -> mx +// r26 (non-volatile) -> spAddr0 +// r27 (non-volatile) -> spAddr1 +// r28 (non-volatile) -> Saved pointer to RegisterFile +// r29 (non-volatile) -> Saved pointer to MemoryRegisters +// r30 (non-volatile) -> Saved pointer to scratchpad (uint8_t *) +// r31 (non-volatile) -> unused +// f0-f13 / vs0–vs13 (volatile) -> scratch registers +// f14-f31 / vs14–vs31 (non-volatile) -> unused +// v0-v3 / vs32-vs35 (volatile) -> RandomX floating point registers f0-f3 +// v4-v7 / vs36-vs39 (volatile) -> RandomX floating point registers e0-e3 +// v8-v11 / vs40-vs43 (volatile) -> RandomX floating point registers a0-a3 +// v12-v14 / vs44-vs46 (volatile) -> scratch registers +// v15 / vs47 (volatile) -> constant_vector_byte_reverse_mask +// v16 / vs48 (volatile) -> constant_vector_be_permutation_mask +// v17 / vs49 (volatile) -> constant_vector_group_e_and_mask +// v18 / vs50 (volatile) -> constant_vector_fscal_xor_mask +// v19 / vs51 (volatile) -> literal_vector_group_e_or_mask +// v20-v31 / vs52-vs63 (non-volatile) -> unused + +randomx_ppc64_vm_prologue: + // JIT compiler MUST emit immediate load to r2 before this code (ABI v2 only) + + // Standard function prologue + mflr %r0 + std %r0, 16(%r1) + stdu %r1, -256(%r1) + std %r14, 112(%r1) + std %r15, 120(%r1) + std %r16, 128(%r1) + std %r17, 136(%r1) + std %r18, 144(%r1) + std %r19, 152(%r1) + std %r20, 160(%r1) + std %r21, 168(%r1) + std %r22, 176(%r1) + std %r23, 184(%r1) + std %r24, 192(%r1) + std %r25, 200(%r1) + std %r26, 208(%r1) + std %r27, 216(%r1) + std %r28, 224(%r1) + std %r29, 232(%r1) + std %r30, 240(%r1) + //std %r31, 248(%r1) + + // Save arguments + mr %r28, %r3 + mr %r29, 
%r4 + mr %r30, %r5 + + // Move the loop iterations into the counter + mtctr %r6 + + // Load the vector constants/literals + li %r8, constant_vector_group_e_and_mask-randomx_ppc64_constants + lxvd2x %vs49, %r2, %r8 + li %r9, constant_vector_fscal_xor_mask-randomx_ppc64_constants + lxvd2x %vs50, %r2, %r9 + li %r10, literal_vector_group_e_or_mask-randomx_ppc64_constants + lvx %v19, %r2, %r10 // Use lvx to load the vector since it's written [ low word, high word ] in memory +#if PPC_BIG_ENDIAN + li %r11, constant_vector_be_permutation_mask-randomx_ppc64_constants + lxvd2x %vs48, %r2, %r11 // Load the BE permutation mask (not needed for LE) +#endif + li %r12, constant_vector_byte_reverse_mask-randomx_ppc64_constants + lvx %v15, %r2, %r12 +#if PPC_BIG_ENDIAN + vperm %v19, %v19, %v19, %v15 // Swap the byte order of the Group E OR mask vector +#endif + + // Zero the RandomX integer registers + li %r14, 0 + li %r15, 0 + li %r16, 0 + li %r17, 0 + li %r18, 0 + li %r19, 0 + li %r20, 0 + li %r21, 0 + + // Load MemoryRegisters (r29) + lwz %r25, 0(%r29) // mx + lwz %r24, 4(%r29) // ma + ld %r22, 8(%r29) // memory (dataset pointer) + + // Load a0-a3 from RegisterFile + .equ registers_a_base, 8*8+16*4+16*4 + li %r8, registers_a_base + 16*0 + lvx %v8, %r28, %r8 + li %r9, registers_a_base + 16*1 + lvx %v9, %r28, %r9 + li %r10, registers_a_base + 16*2 + lvx %v10, %r28, %r10 + li %r11, registers_a_base + 16*3 + lvx %v11, %r28, %r11 +#if PPC_BIG_ENDIAN + vperm %v8, %v8, %v8, %v15 + vperm %v9, %v9, %v9, %v15 + vperm %v10, %v10, %v10, %v15 + vperm %v11, %v11, %v11, %v15 +#endif + + // Instructions to mask mx and ma with Scratchpad L3 mask and set the + // initial values of spAddr0 and spAddr1 are appended here by the JIT + +randomx_ppc64_vm_prologue_end: + +randomx_ppc64_vm_epilogue: + // Loop + bdz 1f +randomx_ppc64_vm_fix_loop: + // JIT compiler MUST patch this to b to vm_loop_prologue + b 0 +1: + + // Store RandomX registers back into register file + STORE_LE_GPR %r14, 8*0, %r28 + STORE_LE_GPR %r15, 8*1, %r28 + STORE_LE_GPR %r16, 8*2, %r28 + STORE_LE_GPR %r17, 8*3, %r28 + STORE_LE_GPR %r18, 8*4, %r28 + STORE_LE_GPR %r19, 8*5, %r28 + STORE_LE_GPR %r20, 8*6, %r28 + STORE_LE_GPR %r21, 8*7, %r28 + +#if PPC_BIG_ENDIAN + // Reverse the Group F/E register bytes so they're arranged as [ 0123 4567 ] + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 +#endif + + .equ registers_f_base, 8*8 + li %r8, registers_f_base + 16*0 + stvx %v0, %r28, %r8 + li %r9, registers_f_base + 16*1 + stvx %v1, %r28, %r9 + li %r10, registers_f_base + 16*2 + stvx %v2, %r28, %r10 + li %r11, registers_f_base + 16*3 + stvx %v3, %r28, %r11 + + .equ registers_e_base, 8*8+16*4 + li %r8, registers_e_base + 16*0 + stvx %v4, %r28, %r8 + li %r9, registers_e_base + 16*1 + stvx %v5, %r28, %r9 + li %r10, registers_e_base + 16*2 + stvx %v6, %r28, %r10 + li %r11, registers_e_base + 16*3 + stvx %v7, %r28, %r11 + + // Standard function epilogue + ld %r14, 112(%r1) + ld %r15, 120(%r1) + ld %r16, 128(%r1) + ld %r17, 136(%r1) + ld %r18, 144(%r1) + ld %r19, 152(%r1) + ld %r20, 160(%r1) + ld %r21, 168(%r1) + ld %r22, 176(%r1) + ld %r23, 184(%r1) + ld %r24, 192(%r1) + ld %r25, 200(%r1) + ld %r26, 208(%r1) + ld %r27, 216(%r1) + ld %r28, 224(%r1) + ld %r29, 232(%r1) + ld %r30, 240(%r1) + //ld %r31, 248(%r1) + addi %r1, %r1,256 + ld %r0, 16(%r1) + mtlr %r0 + blr + +randomx_ppc64_vm_epilogue_end: + 
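+// Rough shape of the generated loop, as the JIT stitches these fragments
+// together (a sketch only; the spAddr0/spAddr1 updates and the stores of the
+// mixed F/E registers are emitted separately by the JIT and not shown here):
+//
+//   vm_loop_prologue       XOR r0-r7 with 64 bytes at spAddr0, load and
+//                          convert f0-f3/e0-e3 from spAddr1
+//   <translated program>   one handler per RandomX instruction
+//   vm_data_read[_light]   XOR one 64-byte dataset item into r0-r7, swap mx/ma
+//   vm_spad_store_group_r  store r0-r7 to the scratchpad at spAddr1
+//   vm_spad_store_mix_*    mix the F and E register groups
+//   vm_epilogue            decrement CTR and loop back to vm_loop_prologue via
+//                          the patched branch, or fall through to the
+//                          register-file stores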
+randomx_ppc64_vm_loop_prologue: + // Main loop start + + // Load scratchpad data, mix registers, etc. + LOAD_LE_GPR %r8, 0, %r26 + LOAD_LE_GPR %r9, 8, %r26 + LOAD_LE_GPR %r10, 16, %r26 + LOAD_LE_GPR %r11, 24, %r26 + xor %r14, %r14, %r8 + xor %r15, %r15, %r9 + xor %r16, %r16, %r10 + xor %r17, %r17, %r11 + LOAD_LE_GPR %r8, 32, %r26 + LOAD_LE_GPR %r9, 40, %r26 + LOAD_LE_GPR %r10, 48, %r26 + LOAD_LE_GPR %r11, 56, %r26 + xor %r18, %r18, %r8 + xor %r19, %r19, %r9 + xor %r20, %r20, %r10 + xor %r21, %r21, %r11 + + // Load F registers (v0-v3 / vs32-vs35) from spAddr1 (r27) + lxsdx %vs32, 0, %r27 // Use base address directly to avoid an immediate load + li %r9, 8*1 + lxsdx %vs33, %r27, %r9 + li %r10, 8*2 + lxsdx %vs34, %r27, %r10 + li %r11, 8*3 + lxsdx %vs35, %r27, %r11 + SHUFFLE_VR 0 + SHUFFLE_VR 1 + SHUFFLE_VR 2 + SHUFFLE_VR 3 + xvcvsxwdp %vs32, %vs32 + xvcvsxwdp %vs33, %vs33 + xvcvsxwdp %vs34, %vs34 + xvcvsxwdp %vs35, %vs35 + + // Load E registers (v4-v7 / vs36-vs39) from spAddr1 (r27) and fixup + li %r8, 8*4 + lxsdx %vs36, %r27, %r8 + li %r9, 8*5 + lxsdx %vs37, %r27, %r9 + li %r10, 8*6 + lxsdx %vs38, %r27, %r10 + li %r11, 8*7 + lxsdx %vs39, %r27, %r11 + SHUFFLE_VR 4 + SHUFFLE_VR 5 + SHUFFLE_VR 6 + SHUFFLE_VR 7 + xvcvsxwdp %vs36, %vs36 + xvcvsxwdp %vs37, %vs37 + xvcvsxwdp %vs38, %vs38 + xvcvsxwdp %vs39, %vs39 + vsel %v4, %v19, %v4, %v17 + vsel %v5, %v19, %v5, %v17 + vsel %v6, %v19, %v6, %v17 + vsel %v7, %v19, %v7, %v17 + +randomx_ppc64_vm_loop_prologue_end: + +randomx_ppc64_vm_data_read: + // Read dataset logic + + // Calculate prefetch address (JIT compiler MUST patch) + .long 0 // Placeholder for: rlwinm %r8, %mpReg, 0, mask_begin, mask_end + add %r8, %r8, %r22 // r22 holds dataset base pointer + + // Prefetch the next block with dcbt. This is extremely important--without this + // we lose >20% performance in V1 and >16% in V2. + // Setting TH=0b10000 (dcbtt 0, %r8) didn't make any measurable difference in + // performance. 
+ dcbt 0, %r8, 0 + + // Calculate read address (JIT compiler MUST patch) + .long 0 // Placeholder for: rlwinm %r8, %mtReg, 0, mask_begin, mask_end + add %r8, %r8, %r22 + + // Read 64 bytes and XOR with integer registers + ld %r9, 0(%r8) + ld %r10, 8(%r8) + ld %r11, 16(%r8) + ld %r12, 24(%r8) + xor %r14, %r14, %r9 + xor %r15, %r15, %r10 + xor %r16, %r16, %r11 + xor %r17, %r17, %r12 + ld %r9, 32(%r8) + ld %r10, 40(%r8) + ld %r11, 48(%r8) + ld %r12, 56(%r8) + xor %r18, %r18, %r9 + xor %r19, %r19, %r10 + xor %r20, %r20, %r11 + xor %r21, %r21, %r12 + + // Swap mx and ma + mr %r8, %r25 + mr %r25, %r24 + mr %r24, %r8 + +randomx_ppc64_vm_data_read_end: + +randomx_ppc64_vm_data_read_light: + // Light mode read dataset logic + // (Similar to data_read but uses sshash_single_item) + + // Copy dataset pointer argument for sshash_single_item + mr %r3, %r22 + +randomx_ppc64_vm_data_read_light_fix_call: + // JIT compiler MUST patch this to bl to sshash_single_item + b 0 + + // XOR the result from sshash_single_item with the VM registers + xor %r14, %r14, %r4 + xor %r15, %r15, %r6 + xor %r16, %r16, %r7 + xor %r17, %r17, %r9 + xor %r18, %r18, %r10 + xor %r19, %r19, %r11 + xor %r20, %r20, %r12 + xor %r21, %r21, %r5 + + // Swap mx and ma + mr %r8, %r25 + mr %r25, %r24 + mr %r24, %r8 + +randomx_ppc64_vm_data_read_light_end: + +randomx_ppc64_vm_spad_store_group_r: + // Store to scratchpad at spAddr1 + STORE_LE_GPR %r14, 8*0, %r27 + STORE_LE_GPR %r15, 8*1, %r27 + STORE_LE_GPR %r16, 8*2, %r27 + STORE_LE_GPR %r17, 8*3, %r27 + STORE_LE_GPR %r18, 8*4, %r27 + STORE_LE_GPR %r19, 8*5, %r27 + STORE_LE_GPR %r20, 8*6, %r27 + STORE_LE_GPR %r21, 8*7, %r27 + +randomx_ppc64_vm_spad_store_group_r_end: + +randomx_ppc64_vm_spad_store_mix_v1: + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + vxor %v0, %v0, %v4 + vxor %v1, %v1, %v5 + vxor %v2, %v2, %v6 + vxor %v3, %v3, %v7 + +randomx_ppc64_vm_spad_store_mix_v1_end: + +randomx_ppc64_vm_spad_store_mix_v2_hard_aes: + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + + // We need a zero vector to bypass vncipher's internal key XOR + vxor %v12, %v12, %v12 + + // Byte-reverse f0-f3 and e0-e3 + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 + + vcipher %v0, %v0, %v4 + vncipher %v1, %v1, %v12 // Pass 0 as the key + vcipher %v2, %v2, %v4 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v4 // XOR the actual key afterwards + vxor %v3, %v3, %v4 + + vcipher %v0, %v0, %v5 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v5 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v5 + vxor %v3, %v3, %v5 + + vcipher %v0, %v0, %v6 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v6 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v6 + vxor %v3, %v3, %v6 + + vcipher %v0, %v0, %v7 + vncipher %v1, %v1, %v12 + vcipher %v2, %v2, %v7 + vncipher %v3, %v3, %v12 + vxor %v1, %v1, %v7 + vxor %v3, %v3, %v7 + + // Byte-reverse f0-f3 and e0-e3 + vperm %v0, %v0, %v0, %v15 + vperm %v1, %v1, %v1, %v15 + vperm %v2, %v2, %v2, %v15 + vperm %v3, %v3, %v3, %v15 + vperm %v4, %v4, %v4, %v15 + vperm %v5, %v5, %v5, %v15 + vperm %v6, %v6, %v6, %v15 + vperm %v7, %v7, %v7, %v15 + +randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end: + + +// The following software AES code is based heavily on public-domain work by +// Mike Hamburg of Stanford University. 
More information on that work can be +// found here: https://crypto.stanford.edu/vpaes/ +// +// This port of that code is not particularly well-optimized, partly because I +// didn't really understand all the math behind it, and partly because I don't +// yet have a POWER7 system to benchmark on. Possible areas for improvement +// include: +// +// - Folding constants into other constants (to do this would require +// understanding the math behind this algorithm). +// - Using functions and loops instead of unrolling everything with macros. +// - Using VSX instructions to move values that are exclusively used in XOR +// operations to vs0-vs31 in order to reduce the number of registers we need +// to save to the stack. +// - Using VSX loads to load constants into vs0-vs31 just once at the start of +// VM execution, then moving those constants into vector registers with xxlor +// during software AES. Might be faster if it means we can avoid having to +// save vector registers to the stack. Might also be slower if loading a +// series of constants from d-cache is faster than a bunch of xxlor +// operations to move them from vs0-vs31 into the vector registers. +// - Restoring the overwritten constant registers (v15-v19) from the constant +// pool instead of the stack. +// - Ordering the software AES constants in the constant pool based on the order +// they get loaded in, which could help with prefetching and reduce load +// latency. +// - Loading the zero/4/0x0F vectors from the constant pool might be faster than +// generating them with vector instructions. +// - Reordering instructions to keep the vector pipeline full and avoid false +// dependencies. + +// Macro: VPAES_TRANSFORM +// Converts standard bytes to custom basis (using Lk_ipt tables) +// OR custom basis back to standard bytes (using Lk_opt tables). +// +// Arguments: +// v_out : Destination vector register +// v_in : Source vector register (can be same as v_out) +// v_tmp : Temporary vector register +// v_tab_lo : Vector register loaded with the low table (Lk_ipt lo / Lk_opt lo) +// v_tab_hi : Vector register loaded with the high table (Lk_ipt hi / Lk_opt hi) +// v_splat4 : Vector register pre-loaded with byte values of 0x04 (for shifting) + +.macro VPAES_TRANSFORM v_out, v_in, v_tmp, v_tab_lo, v_tab_hi, v_splat4 + // Shift right by 4 to isolate the high nybbles into v_tmp + vsrb \v_tmp, \v_in, \v_splat4 + + // Lookup the low nybbles (vperm ignores the upper bits of the index) + vperm \v_out, \v_tab_lo, \v_tab_lo, \v_in + + // Lookup the high nybbles + vperm \v_tmp, \v_tab_hi, \v_tab_hi, \v_tmp + + // Combine the results + vxor \v_out, \v_out, \v_tmp +.endm + +// Macro: VPAES_INVERSION +// Performs Galois Field inversion in the custom composite field basis. 
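+// The 256-entry GF(2^8) inversion is decomposed into 16-entry GF(16) table
+// lookups performed with vperm, following the vpaes construction referenced
+// above.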
+// +// Arguments: +// v_io : Output vector 1 (io) +// v_jo : Output vector 2 (jo) +// v_in : Input vector (state in custom basis) +// v_invlo : Lk_inv low table (first 16 bytes) +// v_invhi : Lk_inv high table (second 16 bytes) +// v_splat4 : Vector pre-loaded with 0x04 (for shifting) +// v_splat0f : Vector pre-loaded with 0x0F (for masking) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector +// v_tmp3 : Temporary vector + +.macro VPAES_INVERSION v_io, v_jo, v_in, v_invlo, v_invhi, v_splat4, v_splat0f, v_zero, v_tmp1, v_tmp2, v_tmp3 + // v_tmp1 = i (high nybbles) + vsrb \v_tmp1, \v_in, \v_splat4 + + // v_tmp3 = a/k + vperm \v_tmp3, \v_invhi, \v_invhi, \v_in + + // v_tmp2 = j (low nybbles) + vxor \v_tmp2, \v_in, \v_tmp1 + + // v_io = 1/i + vperm \v_io, \v_invlo, \v_invlo, \v_tmp1 + + // v_jo = 1/j + vperm \v_jo, \v_invlo, \v_invlo, \v_tmp2 + + // mask j with 0x0F + vand \v_tmp2, \v_tmp2, \v_splat0f + + // iak = 1/i + a/k + vxor \v_io, \v_io, \v_tmp3 + + // jak = 1/j + a/k + vxor \v_jo, \v_jo, \v_tmp3 + + // 1/iak (Note: v_zero is used for the second half of the table) + vperm \v_io, \v_invlo, \v_zero, \v_io + + // 1/jak + vperm \v_jo, \v_invlo, \v_zero, \v_jo + + // io = 1/iak + j + vxor \v_io, \v_io, \v_tmp2 + + // jo = 1/jak + i + vxor \v_jo, \v_jo, \v_tmp1 +.endm + +// Macro: VPAES_SB_MC +// Performs combined SubBytes affine transform and MixColumns. +// Output remains in the custom composite field basis. +// +// Arguments: +// v_out : Output vector (custom basis) +// v_io : Input vector 1 (inverted high nybbles from VPAES_INVERSION) +// v_jo : Input vector 2 (inverted low nybbles from VPAES_INVERSION) +// v_sb1u : Lk_sb1 low table (first 16 bytes) +// v_sb1t : Lk_sb1 high table (second 16 bytes) +// v_sb2u : Lk_sb2 low table (first 16 bytes) +// v_sb2t : Lk_sb2 high table (second 16 bytes) +// v_mc_fwd : Lk_mc_forward base table (first 16 bytes) +// v_mc_bwd : Lk_mc_backward base table (first 16 bytes) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector +// v_tmp3 : Temporary vector + +.macro VPAES_SB_MC v_out, v_io, v_jo, v_sb1u, v_sb1t, v_sb2u, v_sb2t, v_mc_fwd, v_mc_bwd, v_zero, v_tmp1, v_tmp2, v_tmp3 + // 1. Calculate A = sb1u(jo) ^ sb1t(io) + vperm \v_tmp1, \v_sb1u, \v_zero, \v_jo + vperm \v_out, \v_sb1t, \v_zero, \v_io + vxor \v_out, \v_out, \v_tmp1 // v_out = A + + // 2. Calculate 2A = sb2u(jo) ^ sb2t(io) + vperm \v_tmp2, \v_sb2u, \v_zero, \v_jo + vperm \v_tmp3, \v_sb2t, \v_zero, \v_io + vxor \v_tmp2, \v_tmp2, \v_tmp3 // v_tmp2 = 2A + + // 3. Calculate B = rot(A, 1) and D = rot(A, 3) + vperm \v_tmp1, \v_out, \v_zero, \v_mc_fwd // v_tmp1 = B + vperm \v_tmp3, \v_out, \v_zero, \v_mc_bwd // v_tmp3 = D + + // 4. Calculate 2A + B + vxor \v_tmp1, \v_tmp1, \v_tmp2 // v_tmp1 = 2A + B + + // 5. Calculate 2B + C = rot(2A + B, 1) + vperm \v_tmp2, \v_tmp1, \v_zero, \v_mc_fwd // v_tmp2 = 2B + C + + // 6. Calculate 2A + B + D + vxor \v_out, \v_tmp1, \v_tmp3 // v_out = 2A + B + D + + // 7. Final Result = (2A + B + D) ^ (2B + C) + vxor \v_out, \v_out, \v_tmp2 +.endm + +// Macro: VPAES_INVSB_INVMC +// Performs combined InvSubBytes affine transform and InvMixColumns on the inverted custom basis state. 
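+// (InvMixColumns is folded in by accumulating the 0x09, 0x0D, 0x0B, and 0x0E
+// multiples of the state, with a rotation of the accumulator between steps.)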
+// +// Arguments: +// v_out : Output vector (custom basis) +// v_io : Input vector 1 (inverted high nybbles from VPAES_INVERSION) +// v_jo : Input vector 2 (inverted low nybbles from VPAES_INVERSION) +// v_sb9u, v_sb9t : Lk_dsb9 tables (low and high) +// v_sbdu, v_sbdt : Lk_dsbd tables (low and high) +// v_sbbu, v_sbbt : Lk_dsbb tables (low and high) +// v_sbeu, v_sbet : Lk_dsbe tables (low and high) +// v_mc_fwd : Lk_mc_forward base table (first 16 bytes) +// v_zero : Vector pre-loaded with 0x00 +// v_tmp1 : Temporary vector +// v_tmp2 : Temporary vector + +.macro VPAES_INVSB_INVMC v_out, v_io, v_jo, v_sb9u, v_sb9t, v_sbdu, v_sbdt, v_sbbu, v_sbbt, v_sbeu, v_sbet, v_mc_fwd, v_zero, v_tmp1, v_tmp2 + // 1. Multiply by 0x09 + vperm \v_tmp1, \v_sb9u, \v_zero, \v_io + vperm \v_tmp2, \v_sb9t, \v_zero, \v_jo + vxor \v_out, \v_tmp1, \v_tmp2 // Acc = 0x09 * State + + // 2. Rotate and add 0x0D + vperm \v_tmp1, \v_sbdu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbdt, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0D * State) + + // 3. Rotate and add 0x0B + vperm \v_tmp1, \v_sbbu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbbt, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0B * State) + + // 4. Rotate and add 0x0E + vperm \v_tmp1, \v_sbeu, \v_zero, \v_io + vperm \v_out, \v_out, \v_zero, \v_mc_fwd // rot(Acc) + vperm \v_tmp2, \v_sbet, \v_zero, \v_jo + vxor \v_out, \v_out, \v_tmp1 + vxor \v_out, \v_out, \v_tmp2 // Acc = rot(Acc) ^ (0x0E * State) +.endm + +// Register allocations: Software AES +// +// v0-v3 / vs32-vs35 (in/out / volatile) -> RandomX floating point registers f0-f3 +// v4-v7 / vs36-vs39 (input / non-volatile) -> RandomX floating point registers e0-e3 +// v8 / vs40 (non-volatile) -> io (must reload: RandomX floating point registers a0) +// v9 / vs41 (non-volatile) -> jo (must reload: RandomX floating point registers a1) +// v10 / vs42 (non-volatile) -> 0x5b... / 0xe8... 
(must reload: RandomX floating point registers a2) +// v11 / vs43 (non-volatile) -> invsubbytes_mulE_hi (must reload: RandomX floating point registers a3) +// v12-v14 / vs44-vs46 (volatile) -> scratch registers +// v15 / vs47 (non-volatile) -> shiftrows / invshiftrows (must reload: constant_vector_byte_reverse_mask) +// v16 / vs48 (non-volatile) -> enc/dec input/output transform lo (must reload (BE only): constant_vector_be_permutation_mask) +// v17 / vs49 (non-volatile) -> enc/dec input/output transform hi (must reload: constant_vector_group_e_and_mask) +// v18 / vs50 (non-volatile) -> mixcolumns forward (must reload: constant_vector_fscal_xor_mask) +// v19 / vs51 (non-volatile) -> mixcolumns backward / invsubbytes_mulE_lo (must reload: literal_vector_group_e_or_mask) +// v20 / vs52 (non-volatile) -> Zero +// v21 / vs53 (non-volatile) -> Shift amount (4) +// v22 / vs54 (non-volatile) -> Low nybble mask (0x0F) +// v23 / vs55 (non-volatile) -> Transformed round key +// v24 / vs56 (non-volatile) -> galois_field_inversion_lo +// v25 / vs57 (non-volatile) -> galois_field_inversion_hi +// v26 / vs58 (non-volatile) -> subbytes_mul1_lo / invsubbytes_mul9_lo +// v27 / vs59 (non-volatile) -> subbytes_mul1_hi / invsubbytes_mul9_hi +// v28 / vs60 (non-volatile) -> subbytes_mul2_lo / invsubbytes_mulD_lo +// v29 / vs61 (non-volatile) -> subbytes_mul2_hi / invsubbytes_mulD_hi +// v30 / vs62 (non-volatile) -> invsubbytes_mulB_lo +// v31 / vs63 (non-volatile) -> invsubbytes_mulB_hi + +randomx_ppc64_vm_spad_store_mix_v2_soft_aes: + // Mix F and E registers (f0-f3 are v0-v3, e0-e3 are v4-v7) + + // Save v8-v11 and v15-v31 to the stack + addi %r6, %r1, -(16 * 21) + stvx %v8, 0, %r6 + li %r7, 16*1 + stvx %v9, %r7, %r6 + li %r8, 16*2 + stvx %v10, %r8, %r6 + li %r9, 16*3 + stvx %v11, %r9, %r6 + li %r10, 16*4 + stvx %v15, %r10, %r6 + li %r11, 16*5 + stvx %v16, %r11, %r6 + li %r12, 16*6 + stvx %v17, %r12, %r6 + li %r7, 16*7 + stvx %v18, %r7, %r6 + li %r8, 16*8 + stvx %v19, %r8, %r6 + li %r9, 16*9 + stvx %v20, %r9, %r6 + li %r10, 16*10 + stvx %v21, %r10, %r6 + li %r11, 16*11 + stvx %v22, %r11, %r6 + li %r12, 16*12 + stvx %v23, %r12, %r6 + li %r7, 16*13 + stvx %v24, %r7, %r6 + li %r8, 16*14 + stvx %v25, %r8, %r6 + li %r9, 16*15 + stvx %v26, %r9, %r6 + li %r10, 16*16 + stvx %v27, %r10, %r6 + li %r11, 16*17 + stvx %v28, %r11, %r6 + li %r12, 16*18 + stvx %v29, %r12, %r6 + li %r7, 16*19 + stvx %v30, %r7, %r6 + li %r8, 16*20 + stvx %v31, %r8, %r6 + + // Zero vector v20 + vxor %v20, %v20, %v20 + + // Splat the shift amount to v21 + vspltisb %v21, 4 + + // Splat the low nybble mask to v22 + vspltisb %v22, 0x0F + + // Load initial encryption constants + li %r6, constant_vector_soft_aes_encrypt_input_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_encrypt_input_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + li %r8, constant_vector_soft_aes_mixcolumns_forward-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_mixcolumns_backward-randomx_ppc64_constants + lvx %v18, %r8, %r2 + lvx %v19, %r9, %r2 + li %r6, constant_vector_soft_aes_galois_field_inversion_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_galois_field_inversion_hi-randomx_ppc64_constants + lvx %v24, %r6, %r2 + lvx %v25, %r7, %r2 + li %r8, constant_vector_soft_aes_subbytes_mul1_lo-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_subbytes_mul1_hi-randomx_ppc64_constants + lvx %v26, %r8, %r2 + lvx %v27, %r9, %r2 + li %r6, 
constant_vector_soft_aes_subbytes_mul2_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_subbytes_mul2_hi-randomx_ppc64_constants + lvx %v28, %r6, %r2 + lvx %v29, %r7, %r2 + li %r8, constant_vector_soft_aes_shiftrows-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_encrypt_63-randomx_ppc64_constants + lvx %v15, %r8, %r2 + lvx %v10, %r9, %r2 + + // Transform inputs to composite field representation + VPAES_TRANSFORM %v0, %v0, %v12, %v16, %v17, %v21 + VPAES_TRANSFORM %v2, %v2, %v12, %v16, %v17, %v21 + + // Round 0 (key v4) + VPAES_TRANSFORM %v23, %v4, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 1 (key v5) + VPAES_TRANSFORM %v23, %v5, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 2 (key v6) + VPAES_TRANSFORM %v23, %v6, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Round 3 (key v7) + VPAES_TRANSFORM %v23, %v7, %v12, %v16, %v17, %v21 + vxor %v23, %v23, %v10 + vperm %v0, %v0, %v0, %v15 + VPAES_INVERSION %v8, %v9, %v0, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v0, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v0, %v0, %v23 + vperm %v2, %v2, %v2, %v15 + VPAES_INVERSION %v8, %v9, %v2, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14 + VPAES_SB_MC %v2, %v8, %v9, %v26, %v27, %v28, %v29, %v18, %v19, %v20, %v12, %v13, %v14 + vxor %v2, %v2, %v23 + + // Load encryption output transform constants + li %r6, constant_vector_soft_aes_encrypt_output_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_encrypt_output_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + + // Transform output from composite field representation + VPAES_TRANSFORM %v0, %v0, %v12, %v16, %v17, %v21 + VPAES_TRANSFORM %v2, %v2, %v12, %v16, %v17, %v21 + + // Load initial decryption constants + li %r6, constant_vector_soft_aes_decrypt_input_transform_lo-randomx_ppc64_constants + li %r7, constant_vector_soft_aes_decrypt_input_transform_hi-randomx_ppc64_constants + lvx %v16, %r6, %r2 + lvx %v17, %r7, %r2 + li %r8, constant_vector_soft_aes_invsubbytes_mul9_lo-randomx_ppc64_constants + li %r9, constant_vector_soft_aes_invsubbytes_mul9_hi-randomx_ppc64_constants + lvx %v26, %r8, %r2 + lvx %v27, %r9, %r2 + 
+
+    // Load initial decryption constants
+    li %r6, constant_vector_soft_aes_decrypt_input_transform_lo-randomx_ppc64_constants
+    li %r7, constant_vector_soft_aes_decrypt_input_transform_hi-randomx_ppc64_constants
+    lvx %v16, %r6, %r2
+    lvx %v17, %r7, %r2
+    li %r8, constant_vector_soft_aes_invsubbytes_mul9_lo-randomx_ppc64_constants
+    li %r9, constant_vector_soft_aes_invsubbytes_mul9_hi-randomx_ppc64_constants
+    lvx %v26, %r8, %r2
+    lvx %v27, %r9, %r2
+    li %r6, constant_vector_soft_aes_invsubbytes_mulD_lo-randomx_ppc64_constants
+    li %r7, constant_vector_soft_aes_invsubbytes_mulD_hi-randomx_ppc64_constants
+    lvx %v28, %r6, %r2
+    lvx %v29, %r7, %r2
+    li %r8, constant_vector_soft_aes_invsubbytes_mulB_lo-randomx_ppc64_constants
+    li %r9, constant_vector_soft_aes_invsubbytes_mulB_hi-randomx_ppc64_constants
+    lvx %v30, %r8, %r2
+    lvx %v31, %r9, %r2
+    li %r6, constant_vector_soft_aes_invsubbytes_mulE_lo-randomx_ppc64_constants
+    li %r7, constant_vector_soft_aes_invsubbytes_mulE_hi-randomx_ppc64_constants
+    lvx %v19, %r6, %r2
+    lvx %v11, %r7, %r2
+    li %r8, constant_vector_soft_aes_invshiftrows-randomx_ppc64_constants
+    li %r9, constant_vector_soft_aes_decrypt_63-randomx_ppc64_constants
+    lvx %v15, %r8, %r2
+    lvx %v10, %r9, %r2
+
+    // Transform inputs to composite field representation
+    VPAES_TRANSFORM %v1, %v1, %v12, %v16, %v17, %v21
+    VPAES_TRANSFORM %v3, %v3, %v12, %v16, %v17, %v21
+
+    // Round 0 (key v4)
+    VPAES_TRANSFORM %v23, %v4, %v12, %v16, %v17, %v21
+    vperm %v1, %v1, %v1, %v15
+    vxor %v1, %v1, %v10
+    VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v1, %v1, %v23
+    vperm %v3, %v3, %v3, %v15
+    vxor %v3, %v3, %v10
+    VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v3, %v3, %v23
+
+    // Round 1 (key v5)
+    VPAES_TRANSFORM %v23, %v5, %v12, %v16, %v17, %v21
+    vperm %v1, %v1, %v1, %v15
+    vxor %v1, %v1, %v10
+    VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v1, %v1, %v23
+    vperm %v3, %v3, %v3, %v15
+    vxor %v3, %v3, %v10
+    VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v3, %v3, %v23
+
+    // Round 2 (key v6)
+    VPAES_TRANSFORM %v23, %v6, %v12, %v16, %v17, %v21
+    vperm %v1, %v1, %v1, %v15
+    vxor %v1, %v1, %v10
+    VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v1, %v1, %v23
+    vperm %v3, %v3, %v3, %v15
+    vxor %v3, %v3, %v10
+    VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v3, %v3, %v23
+
+    // Round 3 (key v7)
+    VPAES_TRANSFORM %v23, %v7, %v12, %v16, %v17, %v21
+    vperm %v1, %v1, %v1, %v15
+    vxor %v1, %v1, %v10
+    VPAES_INVERSION %v8, %v9, %v1, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v1, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v1, %v1, %v23
+    vperm %v3, %v3, %v3, %v15
+    vxor %v3, %v3, %v10
+    VPAES_INVERSION %v8, %v9, %v3, %v24, %v25, %v21, %v22, %v20, %v12, %v13, %v14
+    VPAES_INVSB_INVMC %v3, %v8, %v9, %v26, %v27, %v28, %v29, %v30, %v31, %v19, %v11, %v18, %v20, %v12, %v13
+    vxor %v3, %v3, %v23
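The decryption rounds mirror the encryption path: InvShiftRows via the v15 permutation, an XOR with the 0x63-derived constant (decrypt_63), the same GF(2^4) inversion, and then VPAES_INVSB_INVMC, which fuses InvSubBytes with InvMixColumns by baking the matrix coefficients {0E, 0B, 0D, 09} into the ×9/×B/×D/×E nibble tables loaded above. For reference, the plain GF(2^8) arithmetic those tables encode (illustrative only, not part of the patch):

    #include <cstdint>

    static uint8_t xtime(uint8_t x) {            // multiply by 2 in GF(2^8)
        return (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1B : 0x00));
    }

    static uint8_t gfmul(uint8_t x, uint8_t m) { // multiply by a small constant
        uint8_t r = 0;
        for (uint8_t p = x; m != 0; m >>= 1, p = xtime(p))
            if (m & 1) r ^= p;
        return r;
    }

    // AES InvMixColumns on one 4-byte column: multiply by {0E,0B,0D,09}.
    static void inv_mix_column(uint8_t c[4]) {
        uint8_t s0 = c[0], s1 = c[1], s2 = c[2], s3 = c[3];
        c[0] = gfmul(s0, 0x0E) ^ gfmul(s1, 0x0B) ^ gfmul(s2, 0x0D) ^ gfmul(s3, 0x09);
        c[1] = gfmul(s0, 0x09) ^ gfmul(s1, 0x0E) ^ gfmul(s2, 0x0B) ^ gfmul(s3, 0x0D);
        c[2] = gfmul(s0, 0x0D) ^ gfmul(s1, 0x09) ^ gfmul(s2, 0x0E) ^ gfmul(s3, 0x0B);
        c[3] = gfmul(s0, 0x0B) ^ gfmul(s1, 0x0D) ^ gfmul(s2, 0x09) ^ gfmul(s3, 0x0E);
    }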
+
+    // Load decryption output transform constants
+    li %r6, constant_vector_soft_aes_decrypt_output_transform_lo-randomx_ppc64_constants
+    li %r7, constant_vector_soft_aes_decrypt_output_transform_hi-randomx_ppc64_constants
+    lvx %v16, %r6, %r2
+    lvx %v17, %r7, %r2
+
+    // Transform output from composite field representation
+    VPAES_TRANSFORM %v1, %v1, %v12, %v16, %v17, %v21
+    VPAES_TRANSFORM %v3, %v3, %v12, %v16, %v17, %v21
+
+    // Restore v8-v11 and v15-v31 from the stack
+    addi %r6, %r1, -(16 * 21)
+    lvx %v8, 0, %r6
+    li %r7, 16*1
+    lvx %v9, %r7, %r6
+    li %r8, 16*2
+    lvx %v10, %r8, %r6
+    li %r9, 16*3
+    lvx %v11, %r9, %r6
+    li %r10, 16*4
+    lvx %v15, %r10, %r6
+    li %r11, 16*5
+    lvx %v16, %r11, %r6
+    li %r12, 16*6
+    lvx %v17, %r12, %r6
+    li %r7, 16*7
+    lvx %v18, %r7, %r6
+    li %r8, 16*8
+    lvx %v19, %r8, %r6
+    li %r9, 16*9
+    lvx %v20, %r9, %r6
+    li %r10, 16*10
+    lvx %v21, %r10, %r6
+    li %r11, 16*11
+    lvx %v22, %r11, %r6
+    li %r12, 16*12
+    lvx %v23, %r12, %r6
+    li %r7, 16*13
+    lvx %v24, %r7, %r6
+    li %r8, 16*14
+    lvx %v25, %r8, %r6
+    li %r9, 16*15
+    lvx %v26, %r9, %r6
+    li %r10, 16*16
+    lvx %v27, %r10, %r6
+    li %r11, 16*17
+    lvx %v28, %r11, %r6
+    li %r12, 16*18
+    lvx %v29, %r12, %r6
+    li %r7, 16*19
+    lvx %v30, %r7, %r6
+    li %r8, 16*20
+    lvx %v31, %r8, %r6
+
+randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end:
+
+
+    .section ".text"
+C_FUNCTION(randomx_reciprocal_fast)
+    cntlzd %r4, %r3      // r4 = 63 - k (count leading zeros)
+    li %r5, 1            // r5 = 1
+    subfic %r4, %r4, 63  // r4 = 63 - (63 - k) = k
+    sld %r4, %r5, %r4    // r4 = 1 << k (this is the upper 64 bits of the dividend)
+    divdeu %r3, %r4, %r3 // r3 = (r4 || 0x0000000000000000) / divisor
+    blr
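randomx_reciprocal_fast computes 2^(64+k) / divisor with k = floor(log2(divisor)); divdeu divides the 128-bit value r4·2^64 by r3 in a single instruction. A minimal C sketch of the same computation, assuming GCC/Clang's unsigned __int128 and RandomX's existing contract that the divisor is neither zero nor a power of two (so the quotient fits in 64 bits; divdeu's result is undefined on overflow):

    #include <cstdint>

    // Hypothetical reference model of randomx_reciprocal_fast above.
    static inline uint64_t reciprocal_ref(uint64_t divisor) {
        unsigned k = 63u - (unsigned)__builtin_clzll(divisor);         // k = floor(log2(divisor))
        unsigned __int128 dividend = (unsigned __int128)1 << (64 + k); // (1 << k) || 64 zero bits
        return (uint64_t)(dividend / divisor);
    }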
diff --git a/src/jit_compiler_ppc64_static.hpp b/src/jit_compiler_ppc64_static.hpp
new file mode 100644
index 00000000..7909a81b
--- /dev/null
+++ b/src/jit_compiler_ppc64_static.hpp
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2026, Forest Crossman
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+extern "C" {
+    void randomx_ppc64_constants();
+    void randomx_ppc64_constant_lut_fprc_to_fpscr();
+    void randomx_ppc64_constants_end();
+
+    void randomx_ppc64_dataset_init();
+    void randomx_ppc64_dataset_init_fix_call();
+    void randomx_ppc64_dataset_init_end();
+
+    void randomx_ppc64_sshash_single_item_prologue();
+    void randomx_ppc64_sshash_single_item_prologue_end();
+    void randomx_ppc64_sshash_single_item_epilogue();
+    void randomx_ppc64_sshash_single_item_epilogue_end();
+    void randomx_ppc64_sshash_cache_prefetch();
+    void randomx_ppc64_sshash_cache_prefetch_end();
+    void randomx_ppc64_sshash_xor();
+    void randomx_ppc64_sshash_xor_end();
+
+    void randomx_ppc64_vm_prologue();
+    void randomx_ppc64_vm_prologue_end();
+    void randomx_ppc64_vm_epilogue();
+    void randomx_ppc64_vm_fix_loop();
+    void randomx_ppc64_vm_epilogue_end();
+    void randomx_ppc64_vm_loop_prologue();
+    void randomx_ppc64_vm_loop_prologue_end();
+    void randomx_ppc64_vm_data_read();
+    void randomx_ppc64_vm_data_read_end();
+    void randomx_ppc64_vm_data_read_light();
+    void randomx_ppc64_vm_data_read_light_fix_call();
+    void randomx_ppc64_vm_data_read_light_end();
+    void randomx_ppc64_vm_spad_store_group_r();
+    void randomx_ppc64_vm_spad_store_group_r_end();
+    void randomx_ppc64_vm_spad_store_mix_v1();
+    void randomx_ppc64_vm_spad_store_mix_v1_end();
+    void randomx_ppc64_vm_spad_store_mix_v2_hard_aes();
+    void randomx_ppc64_vm_spad_store_mix_v2_hard_aes_end();
+    void randomx_ppc64_vm_spad_store_mix_v2_soft_aes();
+    void randomx_ppc64_vm_spad_store_mix_v2_soft_aes_end();
+}
diff --git a/src/reciprocal.h b/src/reciprocal.h
index 90bd9b6b..57f3985f 100644
--- a/src/reciprocal.h
+++ b/src/reciprocal.h
@@ -30,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <stdint.h>
 
-#if defined(_M_X64) || defined(__x86_64__)
+#if defined(_M_X64) || defined(__x86_64__) || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__))
 #define RANDOMX_HAVE_FAST_RECIPROCAL 1
 #else
 #define RANDOMX_HAVE_FAST_RECIPROCAL 0
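Each symbol above is paired with an *_end label so the compiler can treat the hand-written fragments in jit_compiler_ppc64_static.S as byte ranges to splice into the generated program, the pattern the existing RandomX back ends follow. A sketch of that consumption pattern, ignoring ABI details such as ELFv1 function descriptors (emitCodeSpan and codePos are hypothetical names, not part of this patch):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copy the machine code between two linker-visible labels into the
    // JIT buffer and advance the write cursor.
    static uint8_t* emitCodeSpan(uint8_t* codePos, void (*begin)(), void (*end)()) {
        const uint8_t* src = reinterpret_cast<const uint8_t*>(begin);
        size_t size = (size_t)(reinterpret_cast<const uint8_t*>(end) - src);
        std::memcpy(codePos, src, size);
        return codePos + size;
    }

With RANDOMX_HAVE_FAST_RECIPROCAL now also set on ppc64, callers can use the randomx_reciprocal_fast routine defined in the assembly above instead of the portable bit-by-bit division.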