diff --git a/deps/b63 b/deps/b63 index 6efb14ce..150522d7 160000 --- a/deps/b63 +++ b/deps/b63 @@ -1 +1 @@ -Subproject commit 6efb14ce64132847fc990c805ac906bec59fe20f +Subproject commit 150522d7066189d31c3685bb5ea16cef1673c501 diff --git a/deps/cmake_optimize_for_architecture b/deps/cmake_optimize_for_architecture index 45cc0b06..24af3d05 160000 --- a/deps/cmake_optimize_for_architecture +++ b/deps/cmake_optimize_for_architecture @@ -1 +1 @@ -Subproject commit 45cc0b06b472758d3fdce509aa58e18818926f5e +Subproject commit 24af3d05c5514698d5482b8783126bfe30ffa324 diff --git a/src/simd/avx2.h b/src/simd/avx2.h index 262e75f8..426b2f0d 100644 --- a/src/simd/avx2.h +++ b/src/simd/avx2.h @@ -40,9 +40,6 @@ namespace internal { namespace cryptanalysislib { - struct _uint16x8_t; - struct _uint32x4_t; - struct _uint64x2_t; struct _uint8x16_t { constexpr static uint32_t LIMBS = 16; @@ -231,8 +228,8 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t setr( - uint8_t a, uint8_t b, uint8_t c, uint8_t d, - uint8_t e, uint8_t f, uint8_t g, uint8_t h) noexcept { + uint16_t a, uint16_t b, uint16_t c, uint16_t d, + uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { _uint16x8_t ret; ret.v8[0] = a; ret.v8[1] = b; @@ -368,15 +365,18 @@ struct uint8x32_t { __m256i v256; }; + /// 
https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJm
H0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D + constexpr uint8x32_t() noexcept = default; + constexpr uint8x32_t(const uint16x16_t &b) noexcept; + constexpr uint8x32_t(const uint32x8_t &b) noexcept; + constexpr uint8x32_t(const uint64x4_t &b) noexcept; + constexpr uint8x32_t(const uint128x2_t &b) noexcept; + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { ASSERT(i < LIMBS); return d[i]; } - /// Example of how the constexpr implementation works: - /// https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2
Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJmH0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D - constexpr uint8x32_t() noexcept = default; - /// NOTE: currently cannot be constexpr /// \return [[nodiscard]] static inline uint8x32_t random() noexcept { @@ -444,6 +444,40 @@ struct uint8x32_t { return out; } + /// + /// \param __q31 + /// \param __q30 + /// \param __q29 + /// \param __q28 + /// \param __q27 + /// \param __q26 + /// \param __q25 + /// \param __q24 + /// \param __q23 + /// \param __q22 + /// \param __q21 + /// \param __q20 + /// \param __q19 + /// \param __q18 + /// \param __q17 + /// \param __q16 + /// \param __q15 + /// \param __q14 + /// \param __q13 + /// \param __q12 + /// \param __q11 + /// \param __q10 + /// \param __q09 + /// \param __q08 + /// \param __q07 + /// \param __q06 + /// \param __q05 + /// \param __q04 + /// \param __q03 + /// \param __q02 + /// \param __q01 + /// \param __q00 + /// \return [[nodiscard]] constexpr static inline uint8x32_t setr(char __q31, char __q30, char __q29, char __q28, char 
__q27, char __q26, char __q25, char __q24, char __q23, char __q22, char __q21, char __q20, @@ -730,6 +764,19 @@ struct uint8x32_t { return ret; } + + /// wrapper around: `_mm256_blend_epi8` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint8x32_t blend(const uint8x32_t in1, + const uint8x32_t in2, + const uint8x32_t in3) noexcept { + uint8x32_t ret{}; + ret.v256 = (__m256i) __builtin_ia32_pblendvb256((__v32qi) in1.v256, (__v32qi) in2.v256, (__v32qi) in3.v256); + return ret; + } + /// checks if all bytes are equal /// source: https://github.com/WojciechMula/toys/tree/master/simd-all-bytes-equal /// \param in @@ -799,6 +846,12 @@ struct uint16x16_t { __m256i v256; }; + constexpr uint16x16_t() noexcept = default; + constexpr uint16x16_t(const uint8x32_t &b) noexcept; + constexpr uint16x16_t(const uint32x8_t &b) noexcept; + constexpr uint16x16_t(const uint64x4_t &b) noexcept; + constexpr uint16x16_t(const uint128x2_t &b) noexcept; + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) const { ASSERT(i < LIMBS); return d[i]; @@ -1102,6 +1155,19 @@ struct uint16x16_t { return ret; } + /// wrapper around: `_mm256_blend_epi32` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint16x16_t blend(const uint16x16_t in1, + const uint16x16_t in2) noexcept { + uint16x16_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_pblendw256((__v16hi) (__m256i) (in1.v256), + (__v16hi) (__m256i) (in2.v256), (int) (imm))); + return ret; + } + /// /// \param in /// \return @@ -1124,10 +1190,16 @@ struct uint32x8_t { uint16_t v16[16]; uint32_t v32[8]; uint64_t v64[4]; - cryptanalysislib::_uint32x4_t v128[2]; + // TODO cryptanalysislib::_uint32x4_t v128[2]; __m256i v256; }; + constexpr uint32x8_t() noexcept = default; + constexpr uint32x8_t(const uint8x32_t &b) noexcept; + constexpr uint32x8_t(const uint16x16_t &b) noexcept; + constexpr uint32x8_t(const uint64x4_t &b) noexcept; + constexpr 
uint32x8_t(const uint128x2_t &b) noexcept; + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; @@ -1493,6 +1565,53 @@ struct uint32x8_t { } } + /// wrapper around: `_mm256_blend_epi32` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint32x8_t blend(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_pblendd256((__v8si) (__m256i) (in1.v256), + (__v8si) (__m256i) (in2.v256), (int) (imm))); + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint32x8_t unpacklo(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v256 = (__m256i) __builtin_shufflevector((__v8si) in1.v256, (__v8si) in2.v256, 0, 8 + 0, 1, 8 + 1, 4, 8 + 4, 5, 8 + 5); + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint32x8_t unpackhi(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v256 = (__m256i) __builtin_shufflevector((__v8si) in1.v256, (__v8si) in2.v256, 2, 8 + 2, 3, 8 + 3, 6, 8 + 6, 7, 8 + 7); + return ret; + } + + /// wrapper around: `_mm256_permute2x128_si256` + /// TODO + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint32x8_t permute(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + return ret; + } + /// /// \param in /// \param perm @@ -1565,6 +1684,12 @@ struct uint64x4_t { __m256i v256; }; + constexpr inline uint64x4_t() noexcept = default; + constexpr inline uint64x4_t(const uint8x32_t &b) noexcept; + constexpr inline uint64x4_t(const uint16x16_t &b) noexcept; + constexpr inline uint64x4_t(const uint32x8_t &b) noexcept; + constexpr inline uint64x4_t(const 
uint128x2_t &b) noexcept; + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { ASSERT(i < LIMBS); return d[i]; @@ -1822,7 +1947,41 @@ struct uint64x4_t { return out; } - /// + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint64x4_t unpacklo(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v256 = (__m256i) __builtin_shufflevector((__v4di) in1.v256, (__v4di) in2.v256, 0, 4 + 0, 2, 4 + 2); + return ret; + } + + /// wrapper around: `_mm256_unpackhi_epi64`, interleaves the odd 64-bit limbs of both inputs + /// \param in1 supplies result limbs 0 and 2 (taken from its limbs 1 and 3) + /// \param in2 supplies result limbs 1 and 3 (taken from its limbs 1 and 3) + /// \return {in1[1], in2[1], in1[3], in2[3]} + [[nodiscard]] constexpr static inline uint64x4_t unpackhi(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v256 = (__m256i) __builtin_shufflevector((__v4di) in1.v256, (__v4di) in2.v256, 1, 4 + 1, 3, 4 + 3); + return ret; + } + + /// wrapper around: `_mm256_permute2x128_si256` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint64x4_t permute(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_permti256((__v4di) (__m256i) (in1.v256), (__m256i) (in2.v256), (int) (in3))); + return ret; + } + + /// wrapper around: `_mm256_permute4x64_epi64` /// \tparam in2 /// \param in1 /// \return @@ -1958,5 +2117,70 @@ struct uint64x4_t { } }; +struct uint128x2_t { + constexpr static uint32_t LIMBS = 2; + using limb_type = __uint128_t; + + union { + // compatibility with TxN_t + __uint128_t d[2]; + + uint8_t v8[32]; + uint16_t v16[16]; + uint32_t v32[8]; + uint64_t v64[4]; + __uint128_t v128[2]; + __m256i v256; + }; + + constexpr uint128x2_t() noexcept = default; + constexpr uint128x2_t(const uint8x32_t &b) noexcept; + constexpr uint128x2_t(const uint16x16_t &b) noexcept; + constexpr uint128x2_t(const uint32x8_t &b) noexcept; + constexpr uint128x2_t(const uint64x4_t &b) noexcept; + + 
[[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { + ASSERT(i < LIMBS); + return d[i]; + } + + /// NOTE: currently cannot be constexpr + /// \return + [[nodiscard]] static inline uint128x2_t random() noexcept { + uint128x2_t ret{}; + for (size_t i = 0; i < 4; ++i) { + ret.v64[i] = fastrandombytes_uint64(); + } + return ret; + } + + /// + /// \param binary + /// \param hex + constexpr inline void print(bool binary = false, bool hex = false) const; + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) { + uint128x2_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_pslldqi256_byteshift((__v4di) (__m256i) (in1.v256), (int) (imm))); + return ret; + } + + /// wrapper around: `_mm256_bsrli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) { + uint128x2_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_psrldqi256_byteshift((__m256i) (in1.v256), (int) (imm))); + return ret; + } +}; + #endif diff --git a/src/simd/matrix/simple.h b/src/simd/matrix/simple.h new file mode 100644 index 00000000..c7495882 --- /dev/null +++ b/src/simd/matrix/simple.h @@ -0,0 +1,126 @@ +#ifndef CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H +#define CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H + +class uint1x64x64_T { + uint64_t data[64]; + + constexpr static void transpose(uint64_t *out, uint64_t *in) noexcept { + constexpr uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000}}; + + for (uint64_t i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (int32_t d = 5; d >= 0; d--) { + const uint32_t s = 1u << d; + + for (uint32_t i = 
0; i < 64u; i += s * 2u) { + for (uint32_t j = i; j < i + s; j++) { + const uint64_t x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + const uint64_t y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + out[j + 0] = x; + out[j + s] = y; + } + } + } + } + + /// inplace transpose + constexpr void transpose() noexcept { + for (uint64_t j = 32, m = 0x00000000FFFFFFFF; j; j >>= 1, m ^= m << j) { + for (uint64_t k = 0; k < 64; k = ((k | j) + 1) & ~j) { + uint64_t t = (data[k] ^ (data[k | j] >> j)) & m; + data[k] ^= t; + data[k | j] ^= (t << j); + } + } + } + + constexpr void mul() { + } +}; + + +class uint8x32x32_t { + uint32_t data[32]; + + /// inplace + constexpr void transpose() noexcept { +#if 1 + // TODO not correct, + // taken from https://github.com/pqov/pqov-paper/blob/main/src/avx2/blas_matrix_avx2.c + alignas(32) uint64x4_t mat[32]; + + // load + for (size_t i = 0; i < 32; i++) { + mat[i] = uint64x4_t::load(data + i); + } + + // swap 16x16 blocks + for (size_t i = 0; i < 16; i++) { + uint64x4_t tmp = uint64x4_t::template permute<0x20>(mat[i], mat[i + 16]); + mat[i + 16] = uint64x4_t::template permute<0x31>(mat[i], mat[i + 16]); + mat[i] = tmp; + } + + // swap 8x8 blocks + for (size_t i = 0; i < 2; i++) { + for (size_t j = 0; j < 8; j++) { + size_t r = 16 * i + j; + uint64x4_t tmp = uint64x4_t::unpacklo(mat[r], mat[r + 8]); + mat[r + 8] = uint64x4_t::unpackhi(mat[r], mat[r + 8]); + mat[r] = tmp; + } + } + + // swap 4x4 blocks + for (size_t i = 0; i < 4; i++) { + for (size_t j = 0; j < 4; j++) { + size_t r = 8 * i + j; + uint32x8_t tmp = uint32x8_t::template blend<0xaa>(mat[r], uint128x2_t::template slli<4>(mat[r + 4])); + mat[r + 4] = uint32x8_t::template blend<0xaa>(uint128x2_t::template srli<4>(mat[r]), mat[r + 4]); + mat[r] = tmp; + } + } + + // swap 2x2 blocks + for (size_t i = 0; i < 8; i++) { + for (size_t j = 0; j < 2; j++) { + size_t r = 4 * i + j; + uint16x16_t tmp = uint16x16_t::template blend<0xaa>(mat[r], 
uint128x2_t::template slli<2>(mat[r + 2])); + mat[r + 2] = uint16x16_t::template blend<0xaa>(uint128x2_t::template srli<2>(mat[r]), mat[r + 2]); + mat[r] = tmp; + } + } + + // swap last bytes + for (size_t i = 0; i < 16; i++) { + size_t r = 2 * i; + const uint16x16_t blend_mask = uint16x16_t::set1(0xFF00); + const uint64x4_t tmp = uint8x32_t::blend(mat[r], uint128x2_t::template slli<1>(mat[r + 1]), blend_mask); + mat[r + 1] = uint8x32_t::blend(uint128x2_t::template srli<1>(mat[r]), mat[r + 1], blend_mask); + mat[r] = tmp; + } + // store result + for (size_t i = 0; i < 32; i++) { + uint64x4_t::store(data + i * 64, mat[i]); + } +#else + for (unsigned i = 0; i < 32; i++) { + for (unsigned j = i + 1; j < 32; j++) { + uint8_t tmp = mat[j * 64 + i]; + mat[j * 64 + i] = mat[i * 64 + j]; + mat[i * 64 + j] = tmp; + } + } +#endif + } +}; + +#endif//CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H diff --git a/src/simd/neon.h b/src/simd/neon.h index e7d17fed..6c91a184 100644 --- a/src/simd/neon.h +++ b/src/simd/neon.h @@ -16,11 +16,14 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 16; using limb_type = uint8_t; + constexpr inline _uint8x16_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint32x4_t &b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint64x2_t &b) noexcept; + constexpr _uint8x16_t() noexcept {} constexpr _uint8x16_t(const _uint16x8_t &b) noexcept; constexpr _uint8x16_t(const _uint32x4_t &b) noexcept; constexpr _uint8x16_t(const _uint64x2_t &b) noexcept; - union { // compatibility to `TxN_t` uint8_t d[16]; @@ -129,6 +132,9 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 8; using limb_type = uint16_t; + constexpr inline _uint16x8_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint32x4_t &b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint64x2_t &b) noexcept; constexpr _uint16x8_t() noexcept {} constexpr 
_uint16x8_t(const _uint8x16_t &b) noexcept; constexpr _uint16x8_t(const _uint32x4_t &b) noexcept; @@ -220,6 +226,10 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 4; using limb_type = uint32_t; + constexpr inline _uint32x4_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint64x2_t &b) noexcept; + constexpr _uint32x4_t() noexcept {} constexpr _uint32x4_t(const _uint8x16_t &b) noexcept; constexpr _uint32x4_t(const _uint16x8_t &b) noexcept; @@ -297,6 +307,10 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 2; using limb_type = uint64_t; + constexpr inline _uint64x2_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint32x4_t &b) noexcept; + constexpr _uint64x2_t() noexcept {} constexpr _uint64x2_t(const _uint8x16_t &b) noexcept; constexpr _uint64x2_t(const _uint16x8_t &b) noexcept; @@ -481,6 +495,12 @@ struct uint8x32_t { constexpr static uint32_t LIMBS = 32; using limb_type = uint8_t; + constexpr uint8x32_t() noexcept = default; + constexpr uint8x32_t(const uint16x16_t &b) noexcept; + constexpr uint8x32_t(const uint32x8_t &b) noexcept; + constexpr uint8x32_t(const uint64x4_t &b) noexcept; + constexpr uint8x32_t(const uint128x2_t &b) noexcept; + union { // compatibility with txn_t uint8_t d[32]; @@ -502,8 +522,6 @@ struct uint8x32_t { return d[i]; } - constexpr uint8x32_t() noexcept {} - /// /// \param binary /// \param hex @@ -897,6 +915,24 @@ struct uint8x32_t { return ret; } + /// emulates `_mm256_blendv_epi8`: per byte, the top bit of the mask `in3` selects the source + /// \param in1 source of every byte whose mask byte has its top bit clear + /// \param in2 source of every byte whose mask byte has its top bit set + /// \return the byte-wise blend of in1 and in2 + [[nodiscard]] constexpr static inline uint8x32_t blend(const uint8x32_t in1, + const uint8x32_t in2, + const uint8x32_t in3) noexcept { + uint8x32_t ret{}; + for (uint32_t i = 0; i < 32; i++) { + if (in3.v8[i] & 0x80u) { + ret.v8[i] = 
in2.v8[i]; + } else { + ret.v8[i] = in1.v8[i]; + } + } + return ret; + } + [[nodiscard]] constexpr static inline uint8x32_t popcnt(const uint8x32_t in) noexcept { uint8x32_t out; @@ -917,6 +953,12 @@ struct uint16x16_t { constexpr static uint32_t LIMBS = 16; using limb_type = uint16_t; + constexpr uint16x16_t() noexcept = default; + constexpr uint16x16_t(const uint8x32_t &b) noexcept; + constexpr uint16x16_t(const uint32x8_t &b) noexcept; + constexpr uint16x16_t(const uint64x4_t &b) noexcept; + constexpr uint16x16_t(const uint128x2_t &b) noexcept; + union { // compatibility with txn_t uint16_t d[16]; @@ -1321,6 +1363,25 @@ struct uint16x16_t { return ret; } + + /// emulates `_mm256_blend_epi16`: bit (i % 8) of `imm` selects the source of 16-bit limb i + /// \param in1 source of every limb whose selector bit is clear + /// \param in2 source of every limb whose selector bit is set + /// \return the limb-wise blend of in1 and in2 + template + [[nodiscard]] constexpr static inline uint16x16_t blend(const uint16x16_t in1, + const uint16x16_t in2) noexcept { + uint16x16_t ret{}; + for (uint32_t i = 0; i < 16; i++) { + if (imm & (1u << (i%8))) { + ret.v16[i] = in2.v16[i]; + } else { + ret.v16[i] = in1.v16[i]; + } + } + return ret; + } + constexpr static inline uint16x16_t popcnt(const uint16x16_t in) noexcept { uint16x16_t out; @@ -1349,6 +1410,12 @@ struct uint32x8_t { constexpr static uint32_t LIMBS = 8; using limb_type = uint32_t; + constexpr uint32x8_t() noexcept = default; + constexpr uint32x8_t(const uint8x32_t &b) noexcept; + constexpr uint32x8_t(const uint16x16_t &b) noexcept; + constexpr uint32x8_t(const uint64x4_t &b) noexcept; + constexpr uint32x8_t(const uint128x2_t &b) noexcept; + union { // compatibility with txn_t uint32_t d[8]; @@ -1759,6 +1826,94 @@ struct uint32x8_t { return ret; } + /// + /// \tparam scale + /// \param ptr + /// \param offset + /// \param data + /// \return + template + constexpr static inline void scatter(const void *ptr, const uint32x8_t offset, const uint32x8_t data) noexcept { + static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); + const uint8_t *ptr8 = (uint8_t *) ptr; + for (uint32_t i = 0; 
i < 8; i++) { + *(uint32_t *) (ptr8 + offset.v32[i] * scale) = data.v32[i]; + } + } + + /// emulates `_mm256_blend_epi32`: bit i of `imm` selects the source of 32-bit limb i + /// \param in1 source of every limb whose selector bit is clear + /// \param in2 source of every limb whose selector bit is set + /// \return the limb-wise blend of in1 and in2 (all 8 limbs) + template + [[nodiscard]] constexpr static inline uint32x8_t blend(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + for (uint32_t i = 0; i < 8; i++) { + if (imm & (1u << i)) { + ret.v32[i] = in2.v32[i]; + } else { + ret.v32[i] = in1.v32[i]; + } + } + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint32x8_t unpacklo(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v64[0] = in1.v64[0]; + ret.v64[1] = in2.v64[0]; + ret.v64[2] = in1.v64[2]; + ret.v64[3] = in2.v64[2]; + return ret; + } + + /// wrapper around: `_mm256_unpackhi_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint32x8_t unpackhi(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v64[0] = in1.v64[1]; + ret.v64[1] = in2.v64[1]; + ret.v64[2] = in1.v64[3]; + ret.v64[3] = in2.v64[3]; + return ret; + } + + /// wrapper around: `_mm256_permute2x128_si256` + /// TODO + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint32x8_t permute(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + switch (in3&0xf) { + case 0: ret.v128[0] = in1.v128[0]; break; + case 1: ret.v128[0] = in1.v128[1]; break; + case 2: ret.v128[0] = in2.v128[0]; break; + case 3: ret.v128[0] = in2.v128[1]; break; + default: ret.v128[0] = {0}; + } + + switch ((in3>>4)&0xf) { + case 0: ret.v128[1] = in1.v128[0]; break; + case 1: ret.v128[1] = in1.v128[1]; break; + case 2: ret.v128[1] = in2.v128[0]; break; + case 3: ret.v128[1] = in2.v128[1]; break; + default: ret.v128[1] = {0}; + } + return ret; + } + /// TODO /// \param in /// \param perm @@ -1801,6 
+1956,12 @@ struct uint64x4_t { constexpr static uint32_t LIMBS = 4; using limb_type = uint64_t; + constexpr inline uint64x4_t() noexcept = default; + constexpr inline uint64x4_t(const uint8x32_t &b) noexcept; + constexpr inline uint64x4_t(const uint16x16_t &b) noexcept; + constexpr inline uint64x4_t(const uint32x8_t &b) noexcept; + constexpr inline uint64x4_t(const uint128x2_t &b) noexcept; + union { // compatibility with txn_t uint64_t d[4]; @@ -2111,17 +2272,6 @@ struct uint64x4_t { return out; } - /// - /// \param in1 - /// \param in2 - /// \return - constexpr static inline uint64x4_t permute(const uint64x4_t in1, - const uint32_t in2) noexcept { - uint64x4_t ret; - ASSERT(0); - return ret; - } - /// /// \param in1 /// \param in2 @@ -2243,7 +2393,70 @@ struct uint64x4_t { return ret; } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint64x4_t unpacklo(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v64[0] = in1.v64[0]; + ret.v64[1] = in2.v64[0]; + ret.v64[2] = in1.v64[2]; + ret.v64[3] = in2.v64[2]; + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint64x4_t unpackhi(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v64[0] = in1.v64[1]; + ret.v64[1] = in2.v64[1]; + ret.v64[2] = in1.v64[3]; + ret.v64[3] = in2.v64[3]; + return ret; + } + /// wrapper around: `_mm256_permute2x128_si256` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint64x4_t permute(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + switch (in3&0xf) { + case 0: ret.v128[0] = in1.v128[0]; break; + case 1: ret.v128[0] = in1.v128[1]; break; + case 2: ret.v128[0] = in2.v128[0]; break; + case 3: ret.v128[0] = in2.v128[1]; break; + default: ret.v64[0] = 0; 
ret.v64[1] = 0; + } + + switch ((in3>>4)&0xf) { + case 0: ret.v128[1] = in1.v128[0]; break; + case 1: ret.v128[1] = in1.v128[1]; break; + case 2: ret.v128[1] = in2.v128[0]; break; + case 3: ret.v128[1] = in2.v128[1]; break; + default: ret.v64[2] = 0; ret.v64[3] = 0; + } + return ret; + } + /// + /// \param in1 + /// \param in2 + /// \return + constexpr static inline uint64x4_t permute(const uint64x4_t in1, + const uint32_t in2) noexcept { + uint64x4_t ret; + ASSERT(0); + return ret; + } /// TODO /// \tparam in2 /// \param in1 @@ -2285,4 +2498,67 @@ struct uint64x4_t { } }; +struct uint128x2_t { + constexpr static uint32_t LIMBS = 2; + using limb_type = __uint128_t; + + union { + // compatibility with TxN_t + __uint128_t d[2]; + + uint8_t v8[32]; + uint16_t v16[16]; + uint32_t v32[8]; + uint64_t v64[4]; + uint64x2_t v128[2]; + }; + + constexpr uint128x2_t() noexcept = default; + constexpr uint128x2_t(const uint8x32_t &b) noexcept; + constexpr uint128x2_t(const uint16x16_t &b) noexcept; + constexpr uint128x2_t(const uint32x8_t &b) noexcept; + constexpr uint128x2_t(const uint64x4_t &b) noexcept; + + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { + ASSERT(i < LIMBS); + return d[i]; + } + + /// NOTE: currently cannot be constexpr + /// \return + [[nodiscard]] static inline uint128x2_t random() noexcept { + uint128x2_t ret{}; + for (size_t i = 0; i < 4; ++i) { + ret.v64[i] = fastrandombytes_uint64(); + } + return ret; + } + + /// + /// \param binary + /// \param hex + constexpr inline void print(bool binary = false, bool hex = false) const; + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) { + uint128x2_t ret{}; + // TODO + return ret; + } + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) { + 
uint128x2_t ret{}; + // TODO + return ret; + } +}; #endif diff --git a/src/simd/simd.h b/src/simd/simd.h index 445c50e0..d99f0314 100644 --- a/src/simd/simd.h +++ b/src/simd/simd.h @@ -10,6 +10,16 @@ #include "print/print.h" #include "random.h" +namespace cryptanalysislib { + struct _uint16x8_t; + struct _uint32x4_t; + struct _uint64x2_t; +}// namespace cryptanalysislib + +struct uint16x16_t; +struct uint32x8_t; +struct uint64x4_t; +struct uint128x2_t; #define bit_shuffle_const(b0, b1, b2, b3, b4, b5, b6, b7) \ ((uint64_t(uint8_t(1 << b0)) << (7 * 8)) | \ (uint64_t(uint8_t(1 << b1)) << (6 * 8)) | \ @@ -39,10 +49,6 @@ #else namespace cryptanalysislib { - struct _uint16x8_t; - struct _uint32x4_t; - struct _uint64x2_t; - struct _uint8x16_t { constexpr static uint32_t LIMBS = 16; using limb_type = uint8_t; @@ -1976,6 +1982,64 @@ struct uint64x4_t { } }; +struct uint128x2_t { + constexpr static uint32_t LIMBS = 2; + using limb_type = __uint128_t; + + union { + // compatibility with TxN_t + __uint128_t d[2]; + + uint8_t v8[32]; + uint16_t v16[16]; + uint32_t v32[8]; + uint64_t v64[4]; + __uint128_t v128[2]; + __m256i v256; + }; + + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { + ASSERT(i < LIMBS); + return d[i]; + } + + /// Example of how the constexpr implementation works: + /// 
https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJm
H0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D + constexpr uint128x2_t() noexcept = default; + + /// NOTE: currently cannot be constexpr + /// \return + [[nodiscard]] static inline uint128x2_t random() noexcept { + uint128x2_t ret{}; + for (size_t i = 0; i < 4; ++i) { + ret.v64[i] = fastrandombytes_uint64(); + } + return ret; + } + + /// + /// \param binary + /// \param hex + constexpr inline void print(bool binary = false, bool hex = false) const; + + template + [[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) { + uint128x2_t ret{}; + /// TODO + return ret; + } + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) { + uint128x2_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_psrldqi256_byteshift((__m256i) (in1.v256), (int) (imm))); + return ret; + } +}; #include "simd/float/simd.h" #endif// no SIMD unit available @@ -2509,6 +2573,11 @@ namespace cryptanalysislib { }// namespace cryptanalysislib + + +#include "simd/bits/bits.h" +#include "simd/generic.h" +#include "simd/matrix/simple.h" void transpose8(unsigned char A[8], int m, int n, unsigned char B[8]) { unsigned x, y, t; diff --git a/tests/container/hashmap/simple.cpp b/tests/container/hashmap/simple.cpp index 8a72c66f..3905d892 100644 --- a/tests/container/hashmap/simple.cpp +++ b/tests/container/hashmap/simple.cpp @@ -26,7 +26,6 @@ TEST(HashMap, simd) { data[j] = i; index[j] = i + 1; } - // TODO hm.insert_simd(data, index); }