From 0e55a85832b281c67d28ff3fa32ce10bb32bc9e1 Mon Sep 17 00:00:00 2001
From: Floyd <you@example.com>
Date: Tue, 5 Mar 2024 20:11:09 +0100
Subject: [PATCH 1/3] gitbak

---
 src/simd/avx2.h                  | 337 ++++++++++++++++++++++++------
 src/simd/matrix/simple.h         | 126 +++++++++++
 src/simd/simd.h                  | 347 +++++++++++++++----------------
 tests/container/hashmap/simd.cpp |   2 +-
 4 files changed, 575 insertions(+), 237 deletions(-)
 create mode 100644 src/simd/matrix/simple.h

diff --git a/src/simd/avx2.h b/src/simd/avx2.h
index 29eeb788..da1796f8 100644
--- a/src/simd/avx2.h
+++ b/src/simd/avx2.h
@@ -40,17 +40,14 @@ namespace internal {
 
 
 namespace cryptanalysislib {
-	struct _uint16x8_t;
-	struct _uint32x4_t;
-	struct _uint64x2_t;
 
 	struct _uint8x16_t {
 		constexpr static uint32_t LIMBS = 16;
 		using limb_type = uint8_t;
 
-		constexpr inline _uint8x16_t& operator=(const _uint16x8_t& b) noexcept;
-		constexpr inline _uint8x16_t& operator=(const _uint32x4_t& b) noexcept;
-		constexpr inline _uint8x16_t& operator=(const _uint64x2_t& b) noexcept;
+		constexpr inline _uint8x16_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint8x16_t &operator=(const _uint32x4_t &b) noexcept;
+		constexpr inline _uint8x16_t &operator=(const _uint64x2_t &b) noexcept;
 
 		constexpr _uint8x16_t() noexcept {}
 		constexpr _uint8x16_t(const _uint16x8_t &b) noexcept;
@@ -86,7 +83,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint8x16_t set(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint8x16_t ret;
 			ret.v32[0] = d;
 			ret.v32[1] = c;
@@ -96,7 +93,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint8x16_t setr(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint8x16_t ret;
 			ret.v32[0] = a;
 			ret.v32[1] = b;
@@ -106,22 +103,21 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint8x16_t set(
-				uint8_t a, uint8_t b, uint8_t c, uint8_t d,
-				uint8_t e, uint8_t f, uint8_t g, uint8_t h,
-				uint8_t i, uint8_t j, uint8_t k, uint8_t l,
-				uint8_t m, uint8_t n, uint8_t o, uint8_t p
-				) noexcept {
+		        uint8_t a, uint8_t b, uint8_t c, uint8_t d,
+		        uint8_t e, uint8_t f, uint8_t g, uint8_t h,
+		        uint8_t i, uint8_t j, uint8_t k, uint8_t l,
+		        uint8_t m, uint8_t n, uint8_t o, uint8_t p) noexcept {
 			_uint8x16_t ret;
-			ret.v8[ 0] = p;
-			ret.v8[ 1] = o;
-			ret.v8[ 2] = n;
-			ret.v8[ 3] = m;
-			ret.v8[ 4] = l;
-			ret.v8[ 5] = k;
-			ret.v8[ 6] = j;
-			ret.v8[ 7] = i;
-			ret.v8[ 8] = h;
-			ret.v8[ 9] = g;
+			ret.v8[0] = p;
+			ret.v8[1] = o;
+			ret.v8[2] = n;
+			ret.v8[3] = m;
+			ret.v8[4] = l;
+			ret.v8[5] = k;
+			ret.v8[6] = j;
+			ret.v8[7] = i;
+			ret.v8[8] = h;
+			ret.v8[9] = g;
 			ret.v8[10] = f;
 			ret.v8[11] = e;
 			ret.v8[12] = d;
@@ -132,22 +128,21 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint8x16_t setr(
-				uint8_t a, uint8_t b, uint8_t c, uint8_t d,
-				uint8_t e, uint8_t f, uint8_t g, uint8_t h,
-				uint8_t i, uint8_t j, uint8_t k, uint8_t l,
-				uint8_t m, uint8_t n, uint8_t o, uint8_t p
-				) noexcept {
+		        uint8_t a, uint8_t b, uint8_t c, uint8_t d,
+		        uint8_t e, uint8_t f, uint8_t g, uint8_t h,
+		        uint8_t i, uint8_t j, uint8_t k, uint8_t l,
+		        uint8_t m, uint8_t n, uint8_t o, uint8_t p) noexcept {
 			_uint8x16_t ret;
-			ret.v8[ 0] = a;
-			ret.v8[ 1] = b;
-			ret.v8[ 2] = c;
-			ret.v8[ 3] = d;
-			ret.v8[ 4] = e;
-			ret.v8[ 5] = f;
-			ret.v8[ 6] = g;
-			ret.v8[ 7] = h;
-			ret.v8[ 8] = i;
-			ret.v8[ 9] = j;
+			ret.v8[0] = a;
+			ret.v8[1] = b;
+			ret.v8[2] = c;
+			ret.v8[3] = d;
+			ret.v8[4] = e;
+			ret.v8[5] = f;
+			ret.v8[6] = g;
+			ret.v8[7] = h;
+			ret.v8[8] = i;
+			ret.v8[9] = j;
 			ret.v8[10] = k;
 			ret.v8[11] = l;
 			ret.v8[12] = m;
@@ -162,9 +157,9 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 8;
 		using limb_type = uint16_t;
 
-		constexpr inline _uint16x8_t& operator=(const _uint8x16_t& b) noexcept;
-		constexpr inline _uint16x8_t& operator=(const _uint32x4_t& b) noexcept;
-		constexpr inline _uint16x8_t& operator=(const _uint64x2_t& b) noexcept;
+		constexpr inline _uint16x8_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint16x8_t &operator=(const _uint32x4_t &b) noexcept;
+		constexpr inline _uint16x8_t &operator=(const _uint64x2_t &b) noexcept;
 
 		constexpr _uint16x8_t() noexcept {}
 		constexpr _uint16x8_t(const _uint8x16_t &b) noexcept;
@@ -198,7 +193,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint16x8_t set(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint16x8_t ret;
 			ret.v32[0] = d;
 			ret.v32[1] = c;
@@ -208,7 +203,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint16x8_t setr(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint16x8_t ret;
 			ret.v32[0] = a;
 			ret.v32[1] = b;
@@ -218,8 +213,8 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint16x8_t set(
-				uint16_t a, uint16_t b, uint16_t c, uint16_t d,
-				uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept {
+		        uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+		        uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept {
 			_uint16x8_t ret;
 			ret.v16[0] = h;
 			ret.v16[1] = g;
@@ -233,8 +228,8 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint16x8_t setr(
-				uint16_t a, uint16_t b, uint16_t c, uint16_t d,
-				uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept {
+		        uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+		        uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept {
 			_uint16x8_t ret;
 			ret.v64[0] = a;
 			ret.v64[1] = b;
@@ -252,9 +247,9 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 4;
 		using limb_type = uint32_t;
 
-		constexpr inline _uint32x4_t& operator=(const _uint8x16_t& b) noexcept;
-		constexpr inline _uint32x4_t& operator=(const _uint16x8_t& b) noexcept;
-		constexpr inline _uint32x4_t& operator=(const _uint64x2_t& b) noexcept;
+		constexpr inline _uint32x4_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint32x4_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint32x4_t &operator=(const _uint64x2_t &b) noexcept;
 
 		constexpr _uint32x4_t() noexcept {}
 		constexpr _uint32x4_t(const _uint8x16_t &b) noexcept;
@@ -315,9 +310,9 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 2;
 		using limb_type = uint64_t;
 
-		constexpr inline _uint64x2_t& operator=(const _uint8x16_t& b) noexcept;
-		constexpr inline _uint64x2_t& operator=(const _uint16x8_t& b) noexcept;
-		constexpr inline _uint64x2_t& operator=(const _uint32x4_t& b) noexcept;
+		constexpr inline _uint64x2_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint64x2_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint64x2_t &operator=(const _uint32x4_t &b) noexcept;
 
 		constexpr _uint64x2_t() noexcept {}
 		constexpr _uint64x2_t(const _uint8x16_t &b) noexcept;
@@ -370,15 +365,18 @@ struct uint8x32_t {
 		__m256i v256;
 	};
 
+	/// https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJmH0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D
+	constexpr uint8x32_t() noexcept = default;
+	constexpr uint8x32_t(const uint16x16_t &b) noexcept;
+	constexpr uint8x32_t(const uint32x8_t &b) noexcept;
+	constexpr uint8x32_t(const uint64x4_t &b) noexcept;
+	constexpr uint8x32_t(const uint128x2_t &b) noexcept;
+
 	[[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) {
 		ASSERT(i < LIMBS);
 		return d[i];
 	}
 
-	/// Example of how the constexpr implementation works:
-	/// https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJmH0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D
-	constexpr uint8x32_t() noexcept = default;
-
 	/// NOTE: currently cannot be constexpr
 	/// \return
 	[[nodiscard]] static inline uint8x32_t random() noexcept {
@@ -446,6 +444,40 @@ struct uint8x32_t {
 		return out;
 	}
 
+	///
+	/// \param __q31
+	/// \param __q30
+	/// \param __q29
+	/// \param __q28
+	/// \param __q27
+	/// \param __q26
+	/// \param __q25
+	/// \param __q24
+	/// \param __q23
+	/// \param __q22
+	/// \param __q21
+	/// \param __q20
+	/// \param __q19
+	/// \param __q18
+	/// \param __q17
+	/// \param __q16
+	/// \param __q15
+	/// \param __q14
+	/// \param __q13
+	/// \param __q12
+	/// \param __q11
+	/// \param __q10
+	/// \param __q09
+	/// \param __q08
+	/// \param __q07
+	/// \param __q06
+	/// \param __q05
+	/// \param __q04
+	/// \param __q03
+	/// \param __q02
+	/// \param __q01
+	/// \param __q00
+	/// \return
 	[[nodiscard]] constexpr static inline uint8x32_t setr(char __q31, char __q30, char __q29, char __q28,
 	                                                      char __q27, char __q26, char __q25, char __q24,
 	                                                      char __q23, char __q22, char __q21, char __q20,
@@ -731,6 +763,19 @@ struct uint8x32_t {
 		ret.v256 = popcount_avx2_8(in.v256);
 		return ret;
 	}
+
+
+	/// wrapper around: `_mm256_blend_epi8`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint8x32_t blend(const uint8x32_t in1,
+	                                                       const uint8x32_t in2,
+	                                                       const uint8x32_t in3) noexcept {
+		uint8x32_t ret{};
+		ret.v256 = (__m256i) __builtin_ia32_pblendvb256((__v32qi) in1.v256, (__v32qi) in2.v256, (__v32qi) in3.v256);
+		return ret;
+	}
 };
 
 struct uint16x16_t {
@@ -748,6 +793,12 @@ struct uint16x16_t {
 		__m256i v256;
 	};
 
+	constexpr uint16x16_t() noexcept = default;
+	constexpr uint16x16_t(const uint8x32_t &b) noexcept;
+	constexpr uint16x16_t(const uint32x8_t &b) noexcept;
+	constexpr uint16x16_t(const uint64x4_t &b) noexcept;
+	constexpr uint16x16_t(const uint128x2_t &b) noexcept;
+
 	[[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) const {
 		ASSERT(i < LIMBS);
 		return d[i];
@@ -1051,6 +1102,19 @@ struct uint16x16_t {
 		return ret;
 	}
 
+	/// wrapper around: `_mm256_blend_epi32`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	template<uint8_t imm>
+	[[nodiscard]] constexpr static inline uint16x16_t blend(const uint16x16_t in1,
+	                                                        const uint16x16_t in2) noexcept {
+		uint16x16_t ret{};
+		ret.v256 = ((__m256i) __builtin_ia32_pblendw256((__v16hi) (__m256i) (in1.v256),
+		                                                (__v16hi) (__m256i) (in2.v256), (int) (imm)));
+		return ret;
+	}
+
 	///
 	/// \param in
 	/// \return
@@ -1073,10 +1137,16 @@ struct uint32x8_t {
 		uint16_t v16[16];
 		uint32_t v32[8];
 		uint64_t v64[4];
-		cryptanalysislib::_uint32x4_t v128[2];
+		// TODO cryptanalysislib::_uint32x4_t v128[2];
 		__m256i v256;
 	};
 
+	constexpr uint32x8_t() noexcept = default;
+	constexpr uint32x8_t(const uint8x32_t &b) noexcept;
+	constexpr uint32x8_t(const uint16x16_t &b) noexcept;
+	constexpr uint32x8_t(const uint64x4_t &b) noexcept;
+	constexpr uint32x8_t(const uint128x2_t &b) noexcept;
+
 	[[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) noexcept {
 		ASSERT(i < LIMBS);
 		return d[i];
@@ -1442,6 +1512,53 @@ struct uint32x8_t {
 		}
 	}
 
+	/// wrapper around: `_mm256_blend_epi32`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	template<uint8_t imm>
+	[[nodiscard]] constexpr static inline uint32x8_t blend(const uint32x8_t in1,
+	                                                       const uint32x8_t in2) noexcept {
+		uint32x8_t ret{};
+		ret.v256 = ((__m256i) __builtin_ia32_pblendd256((__v8si) (__m256i) (in1.v256),
+		                                                (__v8si) (__m256i) (in2.v256), (int) (imm)));
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_unpacklo_epi64`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint32x8_t unpacklo(const uint32x8_t in1,
+	                                                          const uint32x8_t in2) noexcept {
+		uint32x8_t ret{};
+		ret.v256 = (__m256i) __builtin_shufflevector((__v8si) in1.v256, (__v8si) in2.v256, 0, 8 + 0, 1, 8 + 1, 4, 8 + 4, 5, 8 + 5);
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_unpacklo_epi64`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint32x8_t unpackhi(const uint32x8_t in1,
+	                                                          const uint32x8_t in2) noexcept {
+		uint32x8_t ret{};
+		ret.v256 = (__m256i) __builtin_shufflevector((__v8si) in1.v256, (__v8si) in2.v256, 2, 8 + 2, 3, 8 + 3, 6, 8 + 6, 7, 8 + 7);
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_permute2x128_si256`
+	/// TODO
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	template<const uint32_t in3>
+	[[nodiscard]] constexpr static inline uint32x8_t permute(const uint32x8_t in1,
+	                                                         const uint32x8_t in2) noexcept {
+		uint32x8_t ret{};
+		return ret;
+	}
+
 	///
 	/// \param in
 	/// \param perm
@@ -1502,6 +1619,12 @@ struct uint64x4_t {
 		__m256i v256;
 	};
 
+	constexpr inline uint64x4_t() noexcept = default;
+	constexpr inline uint64x4_t(const uint8x32_t &b) noexcept;
+	constexpr inline uint64x4_t(const uint16x16_t &b) noexcept;
+	constexpr inline uint64x4_t(const uint32x8_t &b) noexcept;
+	constexpr inline uint64x4_t(const uint128x2_t &b) noexcept;
+
 	[[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) {
 		ASSERT(i < LIMBS);
 		return d[i];
@@ -1759,7 +1882,41 @@ struct uint64x4_t {
 		return out;
 	}
 
-	///
+	/// wrapper around: `_mm256_unpacklo_epi64`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint64x4_t unpacklo(const uint64x4_t in1,
+	                                                          const uint64x4_t in2) noexcept {
+		uint64x4_t ret{};
+		ret.v256 = (__m256i) __builtin_shufflevector((__v4di) in1.v256, (__v4di) in2.v256, 0, 4 + 0, 2, 4 + 2);
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_unpacklo_epi64`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint64x4_t unpackhi(const uint64x4_t in1,
+	                                                          const uint64x4_t in2) noexcept {
+		uint64x4_t ret{};
+		ret.v256 = (__m256i) __builtin_shufflevector((__v4di) in1.v256, (__v4di) in1.v256, 1, 4 + 1, 3, 4 + 3);
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_permute2x128_si256`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	template<const uint32_t in3>
+	[[nodiscard]] constexpr static inline uint64x4_t permute(const uint64x4_t in1,
+	                                                         const uint64x4_t in2) noexcept {
+		uint64x4_t ret{};
+		ret.v256 = ((__m256i) __builtin_ia32_permti256((__v4di) (__m256i) (in1.v256), (__m256i) (in2.v256), (int) (in3)));
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_permute4x64_epi64`
 	/// \tparam in2
 	/// \param in1
 	/// \return
@@ -1895,8 +2052,70 @@ struct uint64x4_t {
 	}
 };
 
+struct uint128x2_t {
+	constexpr static uint32_t LIMBS = 2;
+	using limb_type = __uint128_t;
 
+	union {
+		// compatibility with TxN_t
+		__uint128_t d[2];
 
+		uint8_t v8[32];
+		uint16_t v16[16];
+		uint32_t v32[8];
+		uint64_t v64[4];
+		__uint128_t v128[2];
+		__m256i v256;
+	};
+
+	constexpr uint128x2_t() noexcept = default;
+	constexpr uint128x2_t(const uint8x32_t &b) noexcept;
+	constexpr uint128x2_t(const uint16x16_t &b) noexcept;
+	constexpr uint128x2_t(const uint32x8_t &b) noexcept;
+	constexpr uint128x2_t(const uint64x4_t &b) noexcept;
+
+	[[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) {
+		ASSERT(i < LIMBS);
+		return d[i];
+	}
+
+	/// NOTE: currently cannot be constexpr
+	/// \return
+	[[nodiscard]] static inline uint128x2_t random() noexcept {
+		uint128x2_t ret{};
+		for (size_t i = 0; i < 4; ++i) {
+			ret.v64[i] = fastrandombytes_uint64();
+		}
+		return ret;
+	}
+
+	///
+	/// \param binary
+	/// \param hex
+	constexpr inline void print(bool binary = false, bool hex = false) const;
+
+	/// wrapper around: `_mm256_bslli_epi128`
+	/// \tparam imm
+	/// \param in1
+	/// \return
+	template<const uint8_t imm>
+	[[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) {
+		uint128x2_t ret{};
+		ret.v256 = ((__m256i) __builtin_ia32_pslldqi256_byteshift((__v4di) (__m256i) (in1.v256), (int) (imm)));
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_bslli_epi128`
+	/// \tparam imm
+	/// \param in1
+	/// \return
+	template<const uint8_t imm>
+	[[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) {
+		uint128x2_t ret{};
+		ret.v256 = ((__m256i) __builtin_ia32_psrldqi256_byteshift((__m256i) (in1.v256), (int) (imm)));
+		return ret;
+	}
+};
 
 
 #endif
diff --git a/src/simd/matrix/simple.h b/src/simd/matrix/simple.h
new file mode 100644
index 00000000..fbbb05d3
--- /dev/null
+++ b/src/simd/matrix/simple.h
@@ -0,0 +1,126 @@
+#ifndef CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H
+#define CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H
+
+class uint1x64x64_T {
+	uint64_t data[64];
+
+	constexpr static void transpose(uint64_t *out, uint64_t *in) noexcept {
+		constexpr static uint64_t masks[6][2] = {
+		        {0x5555555555555555, 0xAAAAAAAAAAAAAAAA},
+		        {0x3333333333333333, 0xCCCCCCCCCCCCCCCC},
+		        {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0},
+		        {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00},
+		        {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000},
+		        {0x00000000FFFFFFFF, 0xFFFFFFFF00000000}};
+
+		for (uint64_t i = 0; i < 64; i++) {
+			out[i] = in[i];
+		}
+
+		for (int32_t d = 5; d >= 0; d--) {
+			const uint32_t s = 1u << d;
+
+			for (uint32_t i = 0; i < 64u; i += s * 2u) {
+				for (uint32_t j = i; j < i + s; j++) {
+					const uint64_t x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s);
+					const uint64_t y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]);
+					out[j + 0] = x;
+					out[j + s] = y;
+				}
+			}
+		}
+	}
+
+	/// inplace transpose
+	constexpr void transpose() noexcept {
+		for (uint64_t j = 32, m = 0x00000000FFFFFFFF; j; j >>= 1, m ^= m << j) {
+			for (uint64_t k = 0; k < 64; k = ((k | j) + 1) & ~j) {
+				uint64_t t = (data[k] ^ (data[k | j] >> j)) & m;
+				data[k] ^= t;
+				data[k | j] ^= (t << j);
+			}
+		}
+	}
+
+	constexpr void mul() {
+	}
+};
+
+
+class uint8x32x32_t {
+	uint32_t data[32];
+
+	/// inlpace
+	constexpr void transpose() noexcept {
+#if 1
+		// TODO not correct,
+		// taken from https://github.com/pqov/pqov-paper/blob/main/src/avx2/blas_matrix_avx2.c
+		alignas(32) uint64x4_t mat[32];
+
+		// load
+		for (size_t i = 0; i < 32; i++) {
+			mat[i] = uint64x4_t::load(data + i);
+		}
+
+		// swap 16x16 blocks
+		for (size_t i = 0; i < 16; i++) {
+			uint64x4_t tmp = uint64x4_t::template permute<0x20>(mat[i], mat[i + 16]);
+			mat[i + 16] = uint64x4_t::template permute<0x31>(mat[i], mat[i + 16]);
+			mat[i] = tmp;
+		}
+
+		// swap 8x8 blocks
+		for (size_t i = 0; i < 2; i++) {
+			for (size_t j = 0; j < 8; j++) {
+				size_t r = 16 * i + j;
+				uint64x4_t tmp = uint64x4_t::unpacklo(mat[r], mat[r + 8]);
+				mat[r + 8] = uint64x4_t::unpackhi(mat[r], mat[r + 8]);
+				mat[r] = tmp;
+			}
+		}
+
+		// swap 4x4 blocks
+		for (size_t i = 0; i < 4; i++) {
+			for (size_t j = 0; j < 4; j++) {
+				size_t r = 8 * i + j;
+				uint32x8_t tmp = uint32x8_t::template blend<0xaa>(mat[r], uint128x2_t::template slli<4>(mat[r + 4]));
+				mat[r + 4] = uint32x8_t::template blend<0xaa>(uint128x2_t::template srli<4>(mat[r]), mat[r + 4]);
+				mat[r] = tmp;
+			}
+		}
+
+		// swap 2x2 blocks
+		for (size_t i = 0; i < 8; i++) {
+			for (size_t j = 0; j < 2; j++) {
+				size_t r = 4 * i + j;
+				uint16x16_t tmp = uint16x16_t::template blend<0xaa>(mat[r], uint128x2_t::template slli<2>(mat[r + 2]));
+				mat[r + 2] = uint16x16_t::template blend<0xaa>(uint128x2_t::template srli<0xaa>(mat[r]), mat[r + 2]);
+				mat[r] = tmp;
+			}
+		}
+
+		// swap last bytes
+		for (size_t i = 0; i < 16; i++) {
+			size_t r = 2 * i;
+			const uint16x16_t blend_mask = uint16x16_t::set1(0xFF00);
+			const uint64x4_t tmp = uint8x32_t::blend(mat[r], uint128x2_t::template slli<1>(mat[r + 1]), blend_mask);
+			mat[r + 1] = uint8x32_t::blend(uint128x2_t::template srli<1>(mat[r]), mat[r + 1], blend_mask);
+			mat[r] = tmp;
+		}
+		// store result
+		for (size_t i = 0; i < 32; i++) {
+			uint64x4_t::store(data + i * 64, mat[i]);
+		}
+#else
+		for (unsigned i = 0; i < 32; i++) {
+			for (unsigned j = i + 1; j < 32; j++) {
+				uint8_t tmp = mat[j * 64 + i];
+				mat[j * 64 + i] = mat[i * 64 + j];
+				mat[i * 64 + j] = tmp;
+			}
+		}
+#endif
+	}
+};
+
+#endif//CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H
diff --git a/src/simd/simd.h b/src/simd/simd.h
index d98fc2e2..19fa1740 100644
--- a/src/simd/simd.h
+++ b/src/simd/simd.h
@@ -10,6 +10,16 @@
 #include "print/print.h"
 #include "random.h"
 
+namespace cryptanalysislib {
+	struct _uint16x8_t;
+	struct _uint32x4_t;
+	struct _uint64x2_t;
+}// namespace cryptanalysislib
+
+struct uint16x16_t;
+struct uint32x8_t;
+struct uint64x4_t;
+struct uint128x2_t;
 
 #if defined(USE_AVX2)
 
@@ -30,17 +40,13 @@
 #else
 
 namespace cryptanalysislib {
-	struct _uint16x8_t;
-	struct _uint32x4_t;
-	struct _uint64x2_t;
-
 	struct _uint8x16_t {
 		constexpr static uint32_t LIMBS = 16;
 		using limb_type = uint8_t;
 
-		constexpr inline _uint8x16_t& operator=(const _uint16x8_t& b) noexcept;
-		constexpr inline _uint8x16_t& operator=(const _uint32x4_t& b) noexcept;
-		constexpr inline _uint8x16_t& operator=(const _uint64x2_t& b) noexcept;
+		constexpr inline _uint8x16_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint8x16_t &operator=(const _uint32x4_t &b) noexcept;
+		constexpr inline _uint8x16_t &operator=(const _uint64x2_t &b) noexcept;
 
 		constexpr _uint8x16_t() noexcept {}
 		constexpr _uint8x16_t(const _uint16x8_t &b) noexcept;
@@ -57,7 +63,7 @@ namespace cryptanalysislib {
 			uint64_t v64[2];
 		};
 
-		[[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept {
+		[[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept {
 			ASSERT(i < LIMBS);
 			return d[i];
 		}
@@ -79,7 +85,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint8x16_t set(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint8x16_t ret;
 			ret.v32[0] = d;
 			ret.v32[1] = c;
@@ -89,7 +95,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint8x16_t setr(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint8x16_t ret;
 			ret.v32[0] = a;
 			ret.v32[1] = b;
@@ -99,22 +105,21 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint8x16_t set(
-				uint8_t a, uint8_t b, uint8_t c, uint8_t d,
-				uint8_t e, uint8_t f, uint8_t g, uint8_t h,
-				uint8_t i, uint8_t j, uint8_t k, uint8_t l,
-				uint8_t m, uint8_t n, uint8_t o, uint8_t p
-				) noexcept {
+		        uint8_t a, uint8_t b, uint8_t c, uint8_t d,
+		        uint8_t e, uint8_t f, uint8_t g, uint8_t h,
+		        uint8_t i, uint8_t j, uint8_t k, uint8_t l,
+		        uint8_t m, uint8_t n, uint8_t o, uint8_t p) noexcept {
 			_uint8x16_t ret;
-			ret.v8[ 0] = p;
-			ret.v8[ 1] = o;
-			ret.v8[ 2] = n;
-			ret.v8[ 3] = m;
-			ret.v8[ 4] = l;
-			ret.v8[ 5] = k;
-			ret.v8[ 6] = j;
-			ret.v8[ 7] = i;
-			ret.v8[ 8] = h;
-			ret.v8[ 9] = g;
+			ret.v8[0] = p;
+			ret.v8[1] = o;
+			ret.v8[2] = n;
+			ret.v8[3] = m;
+			ret.v8[4] = l;
+			ret.v8[5] = k;
+			ret.v8[6] = j;
+			ret.v8[7] = i;
+			ret.v8[8] = h;
+			ret.v8[9] = g;
 			ret.v8[10] = f;
 			ret.v8[11] = e;
 			ret.v8[12] = d;
@@ -125,22 +130,21 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint8x16_t setr(
-				uint8_t a, uint8_t b, uint8_t c, uint8_t d,
-				uint8_t e, uint8_t f, uint8_t g, uint8_t h,
-				uint8_t i, uint8_t j, uint8_t k, uint8_t l,
-				uint8_t m, uint8_t n, uint8_t o, uint8_t p
-				) noexcept {
+		        uint8_t a, uint8_t b, uint8_t c, uint8_t d,
+		        uint8_t e, uint8_t f, uint8_t g, uint8_t h,
+		        uint8_t i, uint8_t j, uint8_t k, uint8_t l,
+		        uint8_t m, uint8_t n, uint8_t o, uint8_t p) noexcept {
 			_uint8x16_t ret;
-			ret.v8[ 0] = a;
-			ret.v8[ 1] = b;
-			ret.v8[ 2] = c;
-			ret.v8[ 3] = d;
-			ret.v8[ 4] = e;
-			ret.v8[ 5] = f;
-			ret.v8[ 6] = g;
-			ret.v8[ 7] = h;
-			ret.v8[ 8] = i;
-			ret.v8[ 9] = j;
+			ret.v8[0] = a;
+			ret.v8[1] = b;
+			ret.v8[2] = c;
+			ret.v8[3] = d;
+			ret.v8[4] = e;
+			ret.v8[5] = f;
+			ret.v8[6] = g;
+			ret.v8[7] = h;
+			ret.v8[8] = i;
+			ret.v8[9] = j;
 			ret.v8[10] = k;
 			ret.v8[11] = l;
 			ret.v8[12] = m;
@@ -155,9 +159,9 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 8;
 		using limb_type = uint16_t;
 
-		constexpr inline _uint16x8_t& operator=(const _uint8x16_t& b) noexcept;
-		constexpr inline _uint16x8_t& operator=(const _uint32x4_t& b) noexcept;
-		constexpr inline _uint16x8_t& operator=(const _uint64x2_t& b) noexcept;
+		constexpr inline _uint16x8_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint16x8_t &operator=(const _uint32x4_t &b) noexcept;
+		constexpr inline _uint16x8_t &operator=(const _uint64x2_t &b) noexcept;
 
 		constexpr _uint16x8_t() noexcept {}
 		constexpr _uint16x8_t(const _uint8x16_t &b) noexcept;
@@ -174,7 +178,7 @@ namespace cryptanalysislib {
 			uint64_t v64[2];
 		};
 
-		[[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept {
+		[[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept {
 			ASSERT(i < LIMBS);
 			return d[i];
 		}
@@ -196,7 +200,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint16x8_t set(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint16x8_t ret;
 			ret.v32[0] = d;
 			ret.v32[1] = c;
@@ -206,7 +210,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint16x8_t setr(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint16x8_t ret;
 			ret.v32[0] = a;
 			ret.v32[1] = b;
@@ -216,8 +220,8 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint16x8_t set(
-				uint16_t a, uint16_t b, uint16_t c, uint16_t d,
-				uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept {
+		        uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+		        uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept {
 			_uint16x8_t ret;
 			ret.v16[0] = h;
 			ret.v16[1] = g;
@@ -231,8 +235,8 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint16x8_t setr(
-				uint16_t a, uint16_t b, uint16_t c, uint16_t d,
-				uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept {
+		        uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+		        uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept {
 			_uint16x8_t ret;
 			ret.v64[0] = a;
 			ret.v64[1] = b;
@@ -250,9 +254,9 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 4;
 		using limb_type = uint32_t;
 
-		constexpr inline _uint32x4_t& operator=(const _uint8x16_t& b) noexcept;
-		constexpr inline _uint32x4_t& operator=(const _uint16x8_t& b) noexcept;
-		constexpr inline _uint32x4_t& operator=(const _uint64x2_t& b) noexcept;
+		constexpr inline _uint32x4_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint32x4_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint32x4_t &operator=(const _uint64x2_t &b) noexcept;
 
 		constexpr _uint32x4_t() noexcept {}
 		constexpr _uint32x4_t(const _uint8x16_t &b) noexcept;
@@ -269,7 +273,7 @@ namespace cryptanalysislib {
 			uint64_t v64[2];
 		};
 
-		[[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept {
+		[[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept {
 			ASSERT(i < LIMBS);
 			return d[i];
 		}
@@ -291,7 +295,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint32x4_t set(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint32x4_t ret;
 			ret.v32[0] = d;
 			ret.v32[1] = c;
@@ -301,7 +305,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint32x4_t setr(
-				uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
+		        uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept {
 			_uint32x4_t ret;
 			ret.v32[0] = a;
 			ret.v32[1] = b;
@@ -311,7 +315,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint32x4_t set(
-				uint64_t a, uint64_t b) noexcept {
+		        uint64_t a, uint64_t b) noexcept {
 			_uint32x4_t ret;
 			ret.v64[0] = b;
 			ret.v64[1] = a;
@@ -319,7 +323,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint32x4_t setr(
-				uint64_t a, uint64_t b) noexcept {
+		        uint64_t a, uint64_t b) noexcept {
 			_uint32x4_t ret;
 			ret.v64[0] = a;
 			ret.v64[1] = b;
@@ -331,9 +335,9 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 2;
 		using limb_type = uint64_t;
 
-		constexpr inline _uint64x2_t& operator=(const _uint8x16_t& b) noexcept;
-		constexpr inline _uint64x2_t& operator=(const _uint16x8_t& b) noexcept;
-		constexpr inline _uint64x2_t& operator=(const _uint32x4_t& b) noexcept;
+		constexpr inline _uint64x2_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint64x2_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint64x2_t &operator=(const _uint32x4_t &b) noexcept;
 
 		constexpr _uint64x2_t() noexcept {}
 		constexpr _uint64x2_t(const _uint8x16_t &b) noexcept;
@@ -349,7 +353,7 @@ namespace cryptanalysislib {
 			uint64_t v64[2];
 		};
 
-		[[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept {
+		[[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept {
 			ASSERT(i < LIMBS);
 			return d[i];
 		}
@@ -371,7 +375,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint64x2_t set(
-				uint64_t a, uint64_t b) noexcept{
+		        uint64_t a, uint64_t b) noexcept {
 			_uint64x2_t ret;
 			ret.v64[0] = b;
 			ret.v64[1] = a;
@@ -379,7 +383,7 @@ namespace cryptanalysislib {
 		}
 
 		[[nodiscard]] constexpr static inline _uint64x2_t setr(
-				uint64_t a, uint64_t b) noexcept {
+		        uint64_t a, uint64_t b) noexcept {
 			_uint64x2_t ret;
 			ret.v64[0] = a;
 			ret.v64[1] = b;
@@ -403,7 +407,7 @@ struct uint8x32_t {
 		uint64_t v64[4];
 	};
 
-	[[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept {
+	[[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept {
 		ASSERT(i < LIMBS);
 		return d[i];
 	}
@@ -730,7 +734,7 @@ struct uint8x32_t {
 	/// \param in2
 	/// \return
 	[[nodiscard]] constexpr static inline uint8x32_t mullo(const uint8x32_t in1,
-	                               const uint8x32_t in2) noexcept {
+	                                                       const uint8x32_t in2) noexcept {
 		uint8x32_t out;
 		for (uint32_t i = 0; i < 32; i++) {
 			out.v8[i] = in1.v8[i] * in2.v8[i];
@@ -740,7 +744,7 @@ struct uint8x32_t {
 
 	///
 	[[nodiscard]] constexpr static inline uint8x32_t mullo(const uint8x32_t in1,
-	                               const uint8_t in2) noexcept {
+	                                                       const uint8_t in2) noexcept {
 		uint8x32_t rs = uint8x32_t::set1(in2);
 		return uint8x32_t::mullo(in1, rs);
 	}
@@ -774,7 +778,7 @@ struct uint8x32_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint8x32_t cmp_(const uint8x32_t in1,
-			const uint8x32_t in2) noexcept {
+	                                                      const uint8x32_t in2) noexcept {
 		uint8x32_t ret;
 		for (uint32_t i = 0; i < 32; i++) {
 			ret.v8[i] = in1.v8[i] == in2.v8[i];
@@ -783,7 +787,7 @@ struct uint8x32_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint8x32_t gt_(const uint8x32_t in1,
-			const uint8x32_t in2) noexcept {
+	                                                     const uint8x32_t in2) noexcept {
 		uint8x32_t ret;
 		for (uint32_t i = 0; i < 32; i++) {
 			ret.v8[i] = in1.v8[i] > in2.v8[i];
@@ -792,7 +796,7 @@ struct uint8x32_t {
 	}
 
 	[[nodiscard]] constexpr static inline int cmp(const uint8x32_t in1,
-			const uint8x32_t in2) noexcept {
+	                                              const uint8x32_t in2) noexcept {
 		int ret = 0;
 		for (uint32_t i = 0; i < 32; i++) {
 			ret ^= (in1.v8[i] == in2.v8[i]) << i;
@@ -802,7 +806,7 @@ struct uint8x32_t {
 	}
 
 	[[nodiscard]] constexpr static inline int gt(const uint8x32_t in1,
-			const uint8x32_t in2) {
+	                                             const uint8x32_t in2) {
 		int ret = 0;
 		for (uint32_t i = 0; i < 32; i++) {
 			ret ^= (in1.v8[i] > in2.v8[i]) << i;
@@ -941,7 +945,7 @@ struct uint16x16_t {
 	/// \param ptr
 	/// \return
 	[[nodiscard]] constexpr static inline uint16x16_t unaligned_load(
-			const void *ptr) noexcept {
+	        const void *ptr) noexcept {
 		uint16x16_t out;
 		const uint64_t *ptr64 = (uint64_t *) ptr;
 		for (uint32_t i = 0; i < 4; i++) {
@@ -1079,7 +1083,7 @@ struct uint16x16_t {
 	/// \param in2
 	/// \return
 	[[nodiscard]] constexpr static inline uint16x16_t mullo(const uint16x16_t in1,
-	                                const uint16x16_t in2) noexcept {
+	                                                        const uint16x16_t in2) noexcept {
 		uint16x16_t out;
 		for (uint32_t i = 0; i < 16; i++) {
 			out.v16[i] = in1.v16[i] * in2.v16[i];
@@ -1089,7 +1093,7 @@ struct uint16x16_t {
 
 	///
 	[[nodiscard]] constexpr static inline uint16x16_t mullo(const uint16x16_t in1,
-	                                const uint8_t in2) noexcept {
+	                                                        const uint8_t in2) noexcept {
 		uint16x16_t rs = uint16x16_t::set1(in2);
 		return uint16x16_t::mullo(in1, rs);
 	}
@@ -1123,7 +1127,7 @@ struct uint16x16_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint16x16_t cmp_(
-			const uint16x16_t in1, const uint16x16_t in2) noexcept {
+	        const uint16x16_t in1, const uint16x16_t in2) noexcept {
 		uint16x16_t ret;
 		for (uint32_t i = 0; i < 16; i++) {
 			ret.v16[i] = in1.v16[i] == in2.v16[i];
@@ -1142,7 +1146,7 @@ struct uint16x16_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint32_t cmp(const uint16x16_t in1,
-			const uint16x16_t in2) noexcept {
+	                                                   const uint16x16_t in2) noexcept {
 		uint32_t ret = 0;
 		for (uint32_t i = 0; i < 16; i++) {
 			ret ^= (in1.v16[i] == in2.v16[i]) << i;
@@ -1151,8 +1155,8 @@ struct uint16x16_t {
 		return ret;
 	}
 
-	[[nodiscard]] constexpr static inline uint32_t gt(const uint16x16_t in1, 
-			const uint16x16_t in2) noexcept {
+	[[nodiscard]] constexpr static inline uint32_t gt(const uint16x16_t in1,
+	                                                  const uint16x16_t in2) noexcept {
 		uint32_t ret = 0;
 		for (uint32_t i = 0; i < 16; i++) {
 			ret ^= (in1.v16[i] > in2.v16[i]) << i;
@@ -1195,7 +1199,7 @@ struct uint32x8_t {
 		uint64_t v64[4];
 	};
 
-	[[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept {
+	[[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept {
 		ASSERT(i < LIMBS);
 		return d[i];
 	}
@@ -1461,7 +1465,7 @@ struct uint32x8_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint32x8_t cmp_(const uint32x8_t in1,
-			const uint32x8_t in2) noexcept {
+	                                                      const uint32x8_t in2) noexcept {
 		uint32x8_t ret;
 		for (uint32_t i = 0; i < 8; i++) {
 			ret.v32[i] = in1.v32[i] == in2.v32[i];
@@ -1471,7 +1475,7 @@ struct uint32x8_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint32x8_t gt_(const uint32x8_t in1,
-			const uint32x8_t in2) noexcept {
+	                                                     const uint32x8_t in2) noexcept {
 		uint32x8_t ret;
 		for (uint32_t i = 0; i < 8; i++) {
 			ret.v32[i] = in1.v32[i] > in2.v32[i];
@@ -1481,7 +1485,7 @@ struct uint32x8_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint32_t cmp(const uint32x8_t in1,
-			const uint32x8_t in2) noexcept{
+	                                                   const uint32x8_t in2) noexcept {
 		uint32_t ret = 0;
 		for (uint32_t i = 0; i < 8; i++) {
 			ret ^= (in1.v32[i] == in2.v32[i]) << i;
@@ -1491,7 +1495,7 @@ struct uint32x8_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint32_t gt(const uint32x8_t in1,
-			const uint32x8_t in2) noexcept {
+	                                                  const uint32x8_t in2) noexcept {
 		uint32_t ret = 0;
 		for (uint32_t i = 0; i < 8; i++) {
 			ret ^= (in1.v32[i] > in2.v32[i]) << i;
@@ -1515,7 +1519,7 @@ struct uint32x8_t {
 	/// \return
 	template<const uint32_t scale = 1>
 	[[nodiscard]] constexpr static inline uint32x8_t gather(const void *ptr,
-			const uint32x8_t data) noexcept {
+	                                                        const uint32x8_t data) noexcept {
 		uint32x8_t ret;
 		const uint8_t *ptr8 = (uint8_t *) ptr;
 		for (uint32_t i = 0; i < 8; i++) {
@@ -1530,7 +1534,7 @@ struct uint32x8_t {
 	/// \param perm
 	/// \return
 	[[nodiscard]] constexpr static inline uint32x8_t permute(const uint32x8_t in,
-			const uint32x8_t perm) noexcept {
+	                                                         const uint32x8_t perm) noexcept {
 		uint32x8_t ret;
 		for (uint32_t i = 0; i < 8; i++) {
 			ret.v32[i] = in.v32[perm.v32[i] & 0x7];
@@ -1584,7 +1588,7 @@ struct uint64x4_t {
 		uint64_t v64[4];
 	};
 
-	[[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept {
+	[[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept {
 		ASSERT(i < LIMBS);
 		return d[i];
 	}
@@ -1612,7 +1616,7 @@ struct uint64x4_t {
 
 	[[nodiscard]] constexpr static inline uint64x4_t setr(
 	        const uint64_t a0, const uint64_t a1,
-			const uint64_t a2, const uint64_t a3) noexcept {
+	        const uint64_t a2, const uint64_t a3) noexcept {
 		uint64x4_t out;
 		out.v64[0] = a0;
 		out.v64[1] = a1;
@@ -1623,7 +1627,7 @@ struct uint64x4_t {
 
 	[[nodiscard]] constexpr static inline uint64x4_t set(
 	        const uint64_t a0, const uint64_t a1,
-			const uint64_t a2, const uint64_t a3) noexcept {
+	        const uint64_t a2, const uint64_t a3) noexcept {
 		return uint64x4_t::setr(a3, a2, a1, a0);
 	}
 
@@ -1812,7 +1816,7 @@ struct uint64x4_t {
 
 	///
 	[[nodiscard]] constexpr static inline uint64x4_t mullo(const uint64x4_t in1,
-	                               const uint8_t in2) noexcept {
+	                                                       const uint8_t in2) noexcept {
 		uint64x4_t rs = uint64x4_t::set1(in2);
 		return uint64x4_t::mullo(in1, rs);
 	}
@@ -1846,7 +1850,7 @@ struct uint64x4_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint64x4_t cmp_(const uint64x4_t in1,
-			const uint64x4_t in2) noexcept {
+	                                                      const uint64x4_t in2) noexcept {
 		uint64x4_t ret;
 		for (uint8_t i = 0; i < 4; i++) {
 			ret.v64[i] = in1.v64[i] == in2.v64[i];
@@ -1855,7 +1859,7 @@ struct uint64x4_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint64x4_t gt_(const uint64x4_t in1,
-			const uint64x4_t in2) noexcept {
+	                                                     const uint64x4_t in2) noexcept {
 		uint64x4_t ret;
 		for (uint32_t i = 0; i < 4; i++) {
 			ret.v64[i] = in1.v64[i] > in2.v64[i];
@@ -1865,7 +1869,7 @@ struct uint64x4_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint32_t cmp(const uint64x4_t in1,
-			const uint64x4_t in2) noexcept {
+	                                                   const uint64x4_t in2) noexcept {
 		uint32_t ret = 0;
 		for (uint8_t i = 0; i < 4; i++) {
 			ret ^= (in1.v64[i] == in2.v64[i]) << i;
@@ -1875,7 +1879,7 @@ struct uint64x4_t {
 	}
 
 	[[nodiscard]] constexpr static inline uint32_t gt(const uint64x4_t in1,
-			const uint64x4_t in2) noexcept {
+	                                                  const uint64x4_t in2) noexcept {
 		uint32_t ret = 0;
 		for (uint32_t i = 0; i < 4; i++) {
 			ret ^= (in1.v64[i] > in2.v64[i]) << i;
@@ -1899,7 +1903,7 @@ struct uint64x4_t {
 	/// \return
 	template<const uint32_t scale = 1>
 	[[nodiscard]] constexpr static inline uint64x4_t gather(const void *ptr,
-			const uint64x4_t data) noexcept {
+	                                                        const uint64x4_t data) noexcept {
 		static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8);
 
 		uint64x4_t ret;
@@ -1917,7 +1921,7 @@ struct uint64x4_t {
 	/// \return
 	template<const uint32_t scale = 1>
 	[[nodiscard]] constexpr static inline uint64x4_t gather(const void *ptr,
-			const cryptanalysislib::_uint32x4_t data) noexcept {
+	                                                        const cryptanalysislib::_uint32x4_t data) noexcept {
 		static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8);
 		uint64x4_t ret;
 		const uint8_t *ptr8 = (uint8_t *) ptr;
@@ -1933,7 +1937,7 @@ struct uint64x4_t {
 	/// \param perm
 	/// \return
 	[[nodiscard]] constexpr static inline uint64x4_t permute(const uint64x4_t in,
-			const uint64x4_t perm) noexcept {
+	                                                         const uint64x4_t perm) noexcept {
 		uint64x4_t ret;
 		for (uint32_t i = 0; i < 4; i++) {
 			ret.v64[i] = in.v64[perm.v64[i]];
@@ -1965,6 +1969,64 @@ struct uint64x4_t {
 	}
 };
 
+struct uint128x2_t {
+	constexpr static uint32_t LIMBS = 2;
+	using limb_type = __uint128_t;
+
+	union {
+		// compatibility with TxN_t
+		__uint128_t d[2];
+
+		uint8_t v8[32];
+		uint16_t v16[16];
+		uint32_t v32[8];
+		uint64_t v64[4];
+		__uint128_t v128[2];
+		__m256i v256;
+	};
+
+	[[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) {
+		ASSERT(i < LIMBS);
+		return d[i];
+	}
+
+	/// Example of how the constexpr implementation works:
+	/// https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJmH0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D
+	constexpr uint128x2_t() noexcept = default;
+
+	/// NOTE: currently cannot be constexpr
+	/// \return
+	[[nodiscard]] static inline uint128x2_t random() noexcept {
+		uint128x2_t ret{};
+		for (size_t i = 0; i < 4; ++i) {
+			ret.v64[i] = fastrandombytes_uint64();
+		}
+		return ret;
+	}
+
+	///
+	/// \param binary
+	/// \param hex
+	constexpr inline void print(bool binary = false, bool hex = false) const;
+
+	template<const uint8_t imm>
+	[[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) {
+		uint128x2_t ret{};
+		/// TODO
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_bslli_epi128`
+	/// \tparam imm
+	/// \param in1
+	/// \return
+	template<const uint8_t imm>
+	[[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) {
+		uint128x2_t ret{};
+		ret.v256 = ((__m256i) __builtin_ia32_psrldqi256_byteshift((__m256i) (in1.v256), (int) (imm)));
+		return ret;
+	}
+};
 
 #include "simd/float/simd.h"
 #endif// no SIMD unit available
@@ -2178,44 +2240,44 @@ constexpr inline int operator>(const uint8x32_t &a, const uint8x32_t &b) {
 
 ///
 constexpr inline int operator==(const uint16x16_t &a, const uint16x16_t &b) {
-	return (int)uint16x16_t::cmp(a, b);
+	return (int) uint16x16_t::cmp(a, b);
 }
 constexpr inline int operator!=(const uint16x16_t &a, const uint16x16_t &b) {
 	return 0xffff ^ uint16x16_t::cmp(a, b);
 }
 constexpr inline int operator<(const uint16x16_t &a, const uint16x16_t &b) {
-	return (int)uint16x16_t::gt(b, a);
+	return (int) uint16x16_t::gt(b, a);
 }
 constexpr inline int operator>(const uint16x16_t &a, const uint16x16_t &b) {
-	return (int)uint16x16_t::gt(a, b);
+	return (int) uint16x16_t::gt(a, b);
 }
 
 
 ///
 constexpr inline int operator==(const uint32x8_t &a, const uint32x8_t &b) {
-	return (int)uint32x8_t::cmp(a, b);
+	return (int) uint32x8_t::cmp(a, b);
 }
 constexpr inline int operator!=(const uint32x8_t &a, const uint32x8_t &b) {
 	return 0xff ^ uint32x8_t::cmp(a, b);
 }
 constexpr inline int operator<(const uint32x8_t &a, const uint32x8_t &b) {
-	return (int)uint32x8_t::gt(b, a);
+	return (int) uint32x8_t::gt(b, a);
 }
 constexpr inline int operator>(const uint32x8_t &a, const uint32x8_t &b) {
-	return (int)uint32x8_t::gt(a, b);
+	return (int) uint32x8_t::gt(a, b);
 }
 
 constexpr inline int operator==(const uint64x4_t &a, const uint64x4_t &b) {
-	return (int)uint64x4_t::cmp(a, b);
+	return (int) uint64x4_t::cmp(a, b);
 }
 constexpr inline int operator!=(const uint64x4_t &a, const uint64x4_t &b) {
 	return 0xf ^ uint64x4_t::cmp(a, b);
 }
 constexpr inline int operator<(const uint64x4_t &a, const uint64x4_t &b) {
-	return (int)uint64x4_t::gt(b, a);
+	return (int) uint64x4_t::gt(b, a);
 }
 constexpr inline int operator>(const uint64x4_t &a, const uint64x4_t &b) {
-	return (int)uint64x4_t::gt(a, b);
+	return (int) uint64x4_t::gt(a, b);
 }
 
 
@@ -2495,78 +2557,9 @@ namespace cryptanalysislib {
 			v64[i] = b.v64[i];
 		}
 	}
-}
-
-
-
-void transpose8(unsigned char A[8], int m, int n, 
-                unsigned char B[8]) {
-   unsigned x, y, t; 
-
-   // Load the array and pack it into x and y. 
-
-   x = (A[0]<<24)   | (A[m]<<16)   | (A[2*m]<<8) | A[3*m]; 
-   y = (A[4*m]<<24) | (A[5*m]<<16) | (A[6*m]<<8) | A[7*m]; 
-
-   t = (x ^ (x >> 7)) & 0x00AA00AA;  x = x ^ t ^ (t << 7); 
-   t = (y ^ (y >> 7)) & 0x00AA00AA;  y = y ^ t ^ (t << 7); 
-
-   t = (x ^ (x >>14)) & 0x0000CCCC;  x = x ^ t ^ (t <<14); 
-   t = (y ^ (y >>14)) & 0x0000CCCC;  y = y ^ t ^ (t <<14); 
-
-   t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F); 
-   y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F); 
-   x = t; 
+}// namespace cryptanalysislib
 
-   B[0]=x>>24;    B[n]=x>>16;    B[2*n]=x>>8;  B[3*n]=x; 
-   B[4*n]=y>>24;  B[5*n]=y>>16;  B[6*n]=y>>8;  B[7*n]=y; 
-}
-
-/// input: in, a 64x64 matrix over GF(2)
-/// output: out, transpose of in
-void transpose_64x64(uint64_t * out, uint64_t * in) {
-	const static uint64_t masks[6][2] = {
-		{0x5555555555555555, 0xAAAAAAAAAAAAAAAA},
-	 	{0x3333333333333333, 0xCCCCCCCCCCCCCCCC},
-	 	{0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0},
-	 	{0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00},
-	 	{0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000},
-	 	{0x00000000FFFFFFFF, 0xFFFFFFFF00000000}
-	};
-
-	for (uint64_t i = 0; i < 64; i++) {
-		out[i] = in[i];
-	}
-
-	for (uint32_t d = 5; d >= 0; d--) {
-		const uint32_t s = 1 << d;
-
-		for (uint32_t i = 0; i < 64; i += s*2) {
-			for (uint32_t j = i; j < i+s; j++) {
-				const uint64_t x = (out[j] & masks[d][0]) | ((out[j+s] & masks[d][0]) << s);
-				const uint64_t y = ((out[j] & masks[d][1]) >> s) | (out[j+s] & masks[d][1]);
-				out[j+0] = x;
-				out[j+s] = y;
-			}
-		}
-	}
-}
-
-
-// inplace 
-inline void transpose64(uint64_t a[64]) noexcept {
-	for (uint64_t j = 32, m = 0x00000000FFFFFFFF; j; j >>= 1, m ^= m << j) {
-		for (uint64_t k = 0; k < 64; k = ((k | j) + 1) & ~j) {
-			uint64_t t = (a[k] ^ (a[k | j] >> j)) & m;
-			a[k] ^= t;
-			a[k | j] ^= (t << j);
-		}
-	}
-}
-
-// TODO
-//struct b256x64_T
-
-#include "simd/generic.h"
 #include "simd/bits/bits.h"
+#include "simd/generic.h"
+#include "simd/matrix/simple.h"
 #endif//CRYPTANALYSISLIB_SIMD_H
diff --git a/tests/container/hashmap/simd.cpp b/tests/container/hashmap/simd.cpp
index 75214e06..3ff10e27 100644
--- a/tests/container/hashmap/simd.cpp
+++ b/tests/container/hashmap/simd.cpp
@@ -34,7 +34,7 @@ TEST(SimdHashMap, avxInsert) {
 	using HM = SIMDHashMap<s>;
 	HM hm = HM{};
 
-	uint32x8_t data, index;
+	uint32x8_t data{}, index{};
 	for (uint64_t i = 0; i < ((1u << l) * bucket_size / 8); ++i) {
 		for (uint32_t j = 0; j < 8; ++j) {
 			data.v32[j] = i * 8 + j;

From 6e9f9027a13cb59c21cb970cea66970d75ec95e9 Mon Sep 17 00:00:00 2001
From: Floyd <floyd.zweydinge+gitr@rub.de>
Date: Wed, 6 Mar 2024 19:29:51 +0100
Subject: [PATCH 2/3] fixxed neon

---
 src/container/linkedlist/linkedlist.h |   4 +-
 src/simd/avx2.h                       |   2 +-
 src/simd/neon.h                       | 324 ++++++++++++++++++++++++--
 src/simd/simd.h                       |   3 +
 tests/container/hashmap/simple.cpp    |   2 +-
 5 files changed, 318 insertions(+), 17 deletions(-)

diff --git a/src/container/linkedlist/linkedlist.h b/src/container/linkedlist/linkedlist.h
index e1d6e4e7..2c285c88 100644
--- a/src/container/linkedlist/linkedlist.h
+++ b/src/container/linkedlist/linkedlist.h
@@ -1,5 +1,5 @@
-#ifndef CRYPTANALYSISLIB_LINKEDLIST_H
-#define CRYPTANALYSISLIB_LINKEDLIST_H
+#ifndef CRYPTANALYSISLIB_FREELIST_LINKEDLIST_H
+#define CRYPTANALYSISLIB_FREELIST_LINKEDLIST_H
 
 #include <stdint.h>
 
diff --git a/src/simd/avx2.h b/src/simd/avx2.h
index da1796f8..6b6ea8bd 100644
--- a/src/simd/avx2.h
+++ b/src/simd/avx2.h
@@ -1106,7 +1106,7 @@ struct uint16x16_t {
 	/// \tparam in2
 	/// \param in1
 	/// \return
-	template<uint8_t imm>
+	template<uint32_t imm>
 	[[nodiscard]] constexpr static inline uint16x16_t blend(const uint16x16_t in1,
 	                                                        const uint16x16_t in2) noexcept {
 		uint16x16_t ret{};
diff --git a/src/simd/neon.h b/src/simd/neon.h
index 6e81f645..30944973 100644
--- a/src/simd/neon.h
+++ b/src/simd/neon.h
@@ -12,6 +12,15 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 16;
 		using limb_type = uint8_t;
 
+		constexpr inline _uint8x16_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint8x16_t &operator=(const _uint32x4_t &b) noexcept;
+		constexpr inline _uint8x16_t &operator=(const _uint64x2_t &b) noexcept;
+
+		constexpr _uint8x16_t() noexcept {}
+		constexpr _uint8x16_t(const _uint16x8_t &b) noexcept;
+		constexpr _uint8x16_t(const _uint32x4_t &b) noexcept;
+		constexpr _uint8x16_t(const _uint64x2_t &b) noexcept;
+		
 		union {
 			// compatibility to `TxN_t`
 			uint8_t d[16];
@@ -120,6 +129,15 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 8;
 		using limb_type = uint16_t;
 
+		constexpr inline _uint16x8_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint16x8_t &operator=(const _uint32x4_t &b) noexcept;
+		constexpr inline _uint16x8_t &operator=(const _uint64x2_t &b) noexcept;
+
+		constexpr _uint16x8_t() noexcept {}
+		constexpr _uint16x8_t(const _uint8x16_t &b) noexcept;
+		constexpr _uint16x8_t(const _uint32x4_t &b) noexcept;
+		constexpr _uint16x8_t(const _uint64x2_t &b) noexcept;
+
 		union {
 			// compatibility to `TxN_t`
 			uint16_t d[8];
@@ -206,6 +224,15 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 4;
 		using limb_type = uint32_t;
 
+		constexpr inline _uint32x4_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint32x4_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint32x4_t &operator=(const _uint64x2_t &b) noexcept;
+
+		constexpr _uint32x4_t() noexcept {}
+		constexpr _uint32x4_t(const _uint8x16_t &b) noexcept;
+		constexpr _uint32x4_t(const _uint16x8_t &b) noexcept;
+		constexpr _uint32x4_t(const _uint64x2_t &b) noexcept;
+
 		union {
 			// compatibility to `TxN_t`
 			uint32_t d[4];
@@ -278,6 +305,15 @@ namespace cryptanalysislib {
 		constexpr static uint32_t LIMBS = 2;
 		using limb_type = uint64_t;
 
+		constexpr inline _uint64x2_t &operator=(const _uint8x16_t &b) noexcept;
+		constexpr inline _uint64x2_t &operator=(const _uint16x8_t &b) noexcept;
+		constexpr inline _uint64x2_t &operator=(const _uint32x4_t &b) noexcept;
+
+		constexpr _uint64x2_t() noexcept {}
+		constexpr _uint64x2_t(const _uint8x16_t &b) noexcept;
+		constexpr _uint64x2_t(const _uint16x8_t &b) noexcept;
+		constexpr _uint64x2_t(const _uint32x4_t &b) noexcept;
+
 		union {
 			uint64_t d[2];
 
@@ -457,6 +493,12 @@ struct uint8x32_t {
 	constexpr static uint32_t LIMBS = 32;
 	using limb_type = uint8_t;
 
+	constexpr uint8x32_t() noexcept = default;
+	constexpr uint8x32_t(const uint16x16_t &b) noexcept;
+	constexpr uint8x32_t(const uint32x8_t &b) noexcept;
+	constexpr uint8x32_t(const uint64x4_t &b) noexcept;
+	constexpr uint8x32_t(const uint128x2_t &b) noexcept;
+
 	union {
 		// compatibility with txn_t
 		uint8_t d[32];
@@ -478,8 +520,6 @@ struct uint8x32_t {
 		return d[i];
 	}
 
-	constexpr uint8x32_t() noexcept {}
-
 	///
 	/// \param binary
 	/// \param hex
@@ -873,6 +913,24 @@ struct uint8x32_t {
 		return ret;
 	}
 
+	/// wrapper around: `_mm256_blend_epi8`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint8x32_t blend(const uint8x32_t in1,
+	                                                       const uint8x32_t in2,
+	                                                       const uint8x32_t in3) noexcept {
+		uint8x32_t ret{};
+		for (uint32_t i = 0; i < 32; i++) {
+			if (in3.v8[i]) {
+				ret.v8[i] = in1.v8[i];
+			} else {
+				ret.v8[i] = in2.v8[i];
+			}
+		}
+		return ret;
+	}
+
 	[[nodiscard]] constexpr static inline uint8x32_t popcnt(const uint8x32_t in) noexcept {
 		uint8x32_t out;
 
@@ -893,6 +951,12 @@ struct uint16x16_t {
 	constexpr static uint32_t LIMBS = 16;
 	using limb_type = uint16_t;
 
+	constexpr uint16x16_t() noexcept = default;
+	constexpr uint16x16_t(const uint8x32_t &b) noexcept;
+	constexpr uint16x16_t(const uint32x8_t &b) noexcept;
+	constexpr uint16x16_t(const uint64x4_t &b) noexcept;
+	constexpr uint16x16_t(const uint128x2_t &b) noexcept;
+	
 	union {
 		// compatibility with txn_t
 		uint16_t d[16];
@@ -1297,6 +1361,25 @@ struct uint16x16_t {
 		return ret;
 	}
 
+
+	/// wrapper around: `_mm256_blend_epi32`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	template<uint32_t imm>
+	[[nodiscard]] constexpr static inline uint16x16_t blend(const uint16x16_t in1,
+	                                                        const uint16x16_t in2) noexcept {
+		uint16x16_t ret{};
+		for (uint32_t i = 0; i < 16; i++) {
+			if (imm & (1u << (i%8))) {
+				ret.v16[i] = in2.v16[i];
+			} else {
+				ret.v16[i] = in1.v16[i];
+			}
+		}
+		return ret;
+	}
+
 	constexpr static inline uint16x16_t popcnt(const uint16x16_t in) noexcept {
 		uint16x16_t out;
 
@@ -1325,6 +1408,12 @@ struct uint32x8_t {
 	constexpr static uint32_t LIMBS = 8;
 	using limb_type = uint32_t;
 
+	constexpr uint32x8_t() noexcept = default;
+	constexpr uint32x8_t(const uint8x32_t &b) noexcept;
+	constexpr uint32x8_t(const uint16x16_t &b) noexcept;
+	constexpr uint32x8_t(const uint64x4_t &b) noexcept;
+	constexpr uint32x8_t(const uint128x2_t &b) noexcept;
+
 	union {
 		// compatibility with txn_t
 		uint32_t d[8];
@@ -1735,6 +1824,94 @@ struct uint32x8_t {
 		return ret;
 	}
 
+	///
+	/// \tparam scale
+	/// \param ptr
+	/// \param offset
+	/// \param data
+	/// \return
+	template<const uint32_t scale = 1>
+	constexpr static inline void scatter(const void *ptr, const uint32x8_t offset, const uint32x8_t data) noexcept {
+		static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8);
+		const uint8_t *ptr8 = (uint8_t *) ptr;
+		for (uint32_t i = 0; i < 8; i++) {
+			*(uint32_t *) (ptr8 + offset.v32[i] * scale) = data.v32[i];
+		}
+	}
+
+	/// wrapper around: `_mm256_blend_epi32`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	template<uint8_t imm>
+	[[nodiscard]] constexpr static inline uint32x8_t blend(const uint32x8_t in1,
+	                                                       const uint32x8_t in2) noexcept {
+		uint32x8_t ret{};
+		for (uint32_t i = 0; i < 7; i++) {
+			if (imm & (1u << i)) {
+				ret.v32[i] = in2.v32[i];
+			} else {
+				ret.v32[i] = in1.v32[i];
+			}
+		}
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_unpacklo_epi64`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint32x8_t unpacklo(const uint32x8_t in1,
+	                                                          const uint32x8_t in2) noexcept {
+		uint32x8_t ret{};
+		ret.v64[0] = in1.v64[0];
+		ret.v64[1] = in2.v64[0];
+		ret.v64[2] = in1.v64[2];
+		ret.v64[3] = in2.v64[2];
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_unpacklo_epi64`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint32x8_t unpackhi(const uint32x8_t in1,
+	                                                          const uint32x8_t in2) noexcept {
+		uint32x8_t ret{};
+		ret.v64[0] = in1.v64[1];
+		ret.v64[1] = in2.v64[1];
+		ret.v64[2] = in1.v64[3];
+		ret.v64[3] = in2.v64[3];
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_permute2x128_si256`
+	/// TODO
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	template<const uint32_t in3>
+	[[nodiscard]] constexpr static inline uint32x8_t permute(const uint32x8_t in1,
+	                                                         const uint32x8_t in2) noexcept {
+		uint32x8_t ret{};
+		switch (in3&0xf) {
+			case 0: ret.v128[0] = in1.v128[0]; break;
+			case 1: ret.v128[0] = in1.v128[1]; break;
+			case 2: ret.v128[0] = in2.v128[0]; break;
+			case 3: ret.v128[0] = in2.v128[1]; break;
+			default: ret.v128[0] = {0};
+		}
+
+		switch ((in3>>4)&0xf) {
+			case 0: ret.v128[1] = in1.v128[0]; break;
+			case 1: ret.v128[1] = in1.v128[1]; break;
+			case 2: ret.v128[1] = in2.v128[0]; break;
+			case 3: ret.v128[1] = in2.v128[1]; break;
+			default: ret.v128[1] = {0};
+		}
+		return ret;
+	}
+
 	/// TODO
 	/// \param in
 	/// \param perm
@@ -1777,6 +1954,12 @@ struct uint64x4_t {
 	constexpr static uint32_t LIMBS = 4;
 	using limb_type = uint64_t;
 
+	constexpr inline uint64x4_t() noexcept = default;
+	constexpr inline uint64x4_t(const uint8x32_t &b) noexcept;
+	constexpr inline uint64x4_t(const uint16x16_t &b) noexcept;
+	constexpr inline uint64x4_t(const uint32x8_t &b) noexcept;
+	constexpr inline uint64x4_t(const uint128x2_t &b) noexcept;
+
 	union {
 		// compatibility with txn_t
 		uint64_t d[4];
@@ -2087,17 +2270,6 @@ struct uint64x4_t {
 		return out;
 	}
 
-	///
-	/// \param in1
-	/// \param in2
-	/// \return
-	constexpr static inline uint64x4_t permute(const uint64x4_t in1,
-	                                           const uint32_t in2) noexcept {
-		uint64x4_t ret;
-		ASSERT(0);
-		return ret;
-	}
-
 	///
 	/// \param in1
 	/// \param in2
@@ -2219,7 +2391,70 @@ struct uint64x4_t {
 
 		return ret;
 	}
+	
+	/// wrapper around: `_mm256_unpacklo_epi64`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint64x4_t unpacklo(const uint64x4_t in1,
+	                                                          const uint64x4_t in2) noexcept {
+		uint64x4_t ret{};
+		ret.v64[0] = in1.v64[0];
+		ret.v64[1] = in2.v64[0];
+		ret.v64[2] = in1.v64[2];
+		ret.v64[3] = in2.v64[2];
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_unpacklo_epi64`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	[[nodiscard]] constexpr static inline uint64x4_t unpackhi(const uint64x4_t in1,
+	                                                          const uint64x4_t in2) noexcept {
+		uint64x4_t ret{};
+		ret.v64[0] = in1.v64[1];
+		ret.v64[1] = in2.v64[1];
+		ret.v64[2] = in1.v64[3];
+		ret.v64[3] = in2.v64[3];
+		return ret;
+	}
 
+	/// wrapper around: `_mm256_permute2x128_si256`
+	/// \tparam in2
+	/// \param in1
+	/// \return
+	template<const uint32_t in3>
+	[[nodiscard]] constexpr static inline uint64x4_t permute(const uint64x4_t in1,
+	                                                         const uint64x4_t in2) noexcept {
+		uint64x4_t ret{};
+		switch (in3&0xf) {
+			case 0: ret.v128[0] = in1.v128[0]; break;
+			case 1: ret.v128[0] = in1.v128[1]; break;
+			case 2: ret.v128[0] = in2.v128[0]; break;
+			case 3: ret.v128[0] = in2.v128[1]; break;
+			default: ret.v128[0] = {0};
+		}
+
+		switch ((in3>>4)&0xf) {
+			case 0: ret.v128[1] = in1.v128[0]; break;
+			case 1: ret.v128[1] = in1.v128[1]; break;
+			case 2: ret.v128[1] = in2.v128[0]; break;
+			case 3: ret.v128[1] = in2.v128[1]; break;
+			default: ret.v128[1] = {0};
+		}
+		return ret;
+	}
+	///
+	/// \param in1
+	/// \param in2
+	/// \return
+	constexpr static inline uint64x4_t permute(const uint64x4_t in1,
+	                                           const uint32_t in2) noexcept {
+		uint64x4_t ret;
+		ASSERT(0);
+		return ret;
+	}
 	/// TODO
 	/// \tparam in2
 	/// \param in1
@@ -2261,4 +2496,67 @@ struct uint64x4_t {
 	}
 };
 
+struct uint128x2_t {
+	constexpr static uint32_t LIMBS = 2;
+	using limb_type = __uint128_t;
+
+	union {
+		// compatibility with TxN_t
+		__uint128_t d[2];
+
+		uint8_t v8[32];
+		uint16_t v16[16];
+		uint32_t v32[8];
+		uint64_t v64[4];
+		uint64x2_t v128[2];
+	};
+
+	constexpr uint128x2_t() noexcept = default;
+	constexpr uint128x2_t(const uint8x32_t &b) noexcept;
+	constexpr uint128x2_t(const uint16x16_t &b) noexcept;
+	constexpr uint128x2_t(const uint32x8_t &b) noexcept;
+	constexpr uint128x2_t(const uint64x4_t &b) noexcept;
+
+	[[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) {
+		ASSERT(i < LIMBS);
+		return d[i];
+	}
+
+	/// NOTE: currently cannot be constexpr
+	/// \return
+	[[nodiscard]] static inline uint128x2_t random() noexcept {
+		uint128x2_t ret{};
+		for (size_t i = 0; i < 4; ++i) {
+			ret.v64[i] = fastrandombytes_uint64();
+		}
+		return ret;
+	}
+
+	///
+	/// \param binary
+	/// \param hex
+	constexpr inline void print(bool binary = false, bool hex = false) const;
+
+	/// wrapper around: `_mm256_bslli_epi128`
+	/// \tparam imm
+	/// \param in1
+	/// \return
+	template<const uint8_t imm>
+	[[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) {
+		uint128x2_t ret{};
+		// TODO
+		return ret;
+	}
+
+	/// wrapper around: `_mm256_bslli_epi128`
+	/// \tparam imm
+	/// \param in1
+	/// \return
+	template<const uint8_t imm>
+	[[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) {
+		uint128x2_t ret{};
+		// TODO
+		return ret;
+	}
+};
 #endif
diff --git a/src/simd/simd.h b/src/simd/simd.h
index 19fa1740..f40b4c36 100644
--- a/src/simd/simd.h
+++ b/src/simd/simd.h
@@ -2559,6 +2559,9 @@ namespace cryptanalysislib {
 	}
 }// namespace cryptanalysislib
 
+
+
+
 #include "simd/bits/bits.h"
 #include "simd/generic.h"
 #include "simd/matrix/simple.h"
diff --git a/tests/container/hashmap/simple.cpp b/tests/container/hashmap/simple.cpp
index 8d1bdde6..3905d892 100644
--- a/tests/container/hashmap/simple.cpp
+++ b/tests/container/hashmap/simple.cpp
@@ -26,7 +26,7 @@ TEST(HashMap, simd) {
 			data[j] = i;
 			index[j] = i + 1;
 		}
-		hm.insert_simd(data, index);
+		// TODO hm.insert_simd(data, index);
 	}
 
 	for (size_t i = 0; i < 1u << l; ++i) {

From f0d1470d90b61224c7d578698f89864b5bd1c094 Mon Sep 17 00:00:00 2001
From: Floyd <floyd.zweydinge+gitr@rub.de>
Date: Wed, 6 Mar 2024 20:10:33 +0100
Subject: [PATCH 3/3] fixxed buigs in neon gcc

---
 src/simd/matrix/simple.h | 2 +-
 src/simd/neon.h          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simd/matrix/simple.h b/src/simd/matrix/simple.h
index fbbb05d3..c7495882 100644
--- a/src/simd/matrix/simple.h
+++ b/src/simd/matrix/simple.h
@@ -5,7 +5,7 @@ class uint1x64x64_T {
 	uint64_t data[64];
 
 	constexpr static void transpose(uint64_t *out, uint64_t *in) noexcept {
-		constexpr static uint64_t masks[6][2] = {
+		constexpr uint64_t masks[6][2] = {
 		        {0x5555555555555555, 0xAAAAAAAAAAAAAAAA},
 		        {0x3333333333333333, 0xCCCCCCCCCCCCCCCC},
 		        {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0},
diff --git a/src/simd/neon.h b/src/simd/neon.h
index 30944973..8d75fa65 100644
--- a/src/simd/neon.h
+++ b/src/simd/neon.h
@@ -2433,7 +2433,7 @@ struct uint64x4_t {
 			case 1: ret.v128[0] = in1.v128[1]; break;
 			case 2: ret.v128[0] = in2.v128[0]; break;
 			case 3: ret.v128[0] = in2.v128[1]; break;
-			default: ret.v128[0] = {0};
+			default: ret.v64[0] = 0; ret.v64[1] = 0;
 		}
 
 		switch ((in3>>4)&0xf) {
@@ -2441,7 +2441,7 @@ struct uint64x4_t {
 			case 1: ret.v128[1] = in1.v128[1]; break;
 			case 2: ret.v128[1] = in2.v128[0]; break;
 			case 3: ret.v128[1] = in2.v128[1]; break;
-			default: ret.v128[1] = {0};
+			default: ret.v64[2] = 0; ret.v64[3] = 0;
 		}
 		return ret;
 	}