From 0e55a85832b281c67d28ff3fa32ce10bb32bc9e1 Mon Sep 17 00:00:00 2001 From: Floyd Date: Tue, 5 Mar 2024 20:11:09 +0100 Subject: [PATCH 1/3] gitbak --- src/simd/avx2.h | 337 ++++++++++++++++++++++++------ src/simd/matrix/simple.h | 126 +++++++++++ src/simd/simd.h | 347 +++++++++++++++---------------- tests/container/hashmap/simd.cpp | 2 +- 4 files changed, 575 insertions(+), 237 deletions(-) create mode 100644 src/simd/matrix/simple.h diff --git a/src/simd/avx2.h b/src/simd/avx2.h index 29eeb788..da1796f8 100644 --- a/src/simd/avx2.h +++ b/src/simd/avx2.h @@ -40,17 +40,14 @@ namespace internal { namespace cryptanalysislib { - struct _uint16x8_t; - struct _uint32x4_t; - struct _uint64x2_t; struct _uint8x16_t { constexpr static uint32_t LIMBS = 16; using limb_type = uint8_t; - constexpr inline _uint8x16_t& operator=(const _uint16x8_t& b) noexcept; - constexpr inline _uint8x16_t& operator=(const _uint32x4_t& b) noexcept; - constexpr inline _uint8x16_t& operator=(const _uint64x2_t& b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint32x4_t &b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint64x2_t &b) noexcept; constexpr _uint8x16_t() noexcept {} constexpr _uint8x16_t(const _uint16x8_t &b) noexcept; @@ -86,7 +83,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint8x16_t set( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint8x16_t ret; ret.v32[0] = d; ret.v32[1] = c; @@ -96,7 +93,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint8x16_t setr( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint8x16_t ret; ret.v32[0] = a; ret.v32[1] = b; @@ -106,22 +103,21 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint8x16_t set( - uint8_t a, uint8_t b, uint8_t c, uint8_t d, - uint8_t e, uint8_t f, uint8_t g, uint8_t h, - uint8_t i, uint8_t j, uint8_t k, uint8_t l, - uint8_t m, uint8_t n, uint8_t o, uint8_t p - ) noexcept { + uint8_t a, uint8_t b, uint8_t c, uint8_t d, + uint8_t e, uint8_t f, uint8_t g, uint8_t h, + uint8_t i, uint8_t j, uint8_t k, uint8_t l, + uint8_t m, uint8_t n, uint8_t o, uint8_t p) noexcept { _uint8x16_t ret; - ret.v8[ 0] = p; - ret.v8[ 1] = o; - ret.v8[ 2] = n; - ret.v8[ 3] = m; - ret.v8[ 4] = l; - ret.v8[ 5] = k; - ret.v8[ 6] = j; - ret.v8[ 7] = i; - ret.v8[ 8] = h; - ret.v8[ 9] = g; + ret.v8[0] = p; + ret.v8[1] = o; + ret.v8[2] = n; + ret.v8[3] = m; + ret.v8[4] = l; + ret.v8[5] = k; + ret.v8[6] = j; + ret.v8[7] = i; + ret.v8[8] = h; + ret.v8[9] = g; ret.v8[10] = f; ret.v8[11] = e; ret.v8[12] = d; @@ -132,22 +128,21 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint8x16_t setr( - uint8_t a, uint8_t b, uint8_t c, uint8_t d, - uint8_t e, uint8_t f, uint8_t g, uint8_t h, - uint8_t i, uint8_t j, uint8_t k, uint8_t l, - uint8_t m, uint8_t n, uint8_t o, uint8_t p - ) noexcept { + uint8_t a, uint8_t b, uint8_t c, uint8_t d, + uint8_t e, uint8_t f, uint8_t g, uint8_t h, + uint8_t i, uint8_t j, uint8_t k, uint8_t l, + uint8_t m, uint8_t n, uint8_t o, uint8_t p) noexcept { _uint8x16_t ret; - ret.v8[ 0] = a; - ret.v8[ 1] = b; - ret.v8[ 2] = c; - ret.v8[ 3] = d; - ret.v8[ 4] = e; - ret.v8[ 5] = f; - ret.v8[ 6] = g; - ret.v8[ 7] = h; - ret.v8[ 8] = i; - ret.v8[ 9] = j; + ret.v8[0] = a; + ret.v8[1] = b; + ret.v8[2] = c; + ret.v8[3] = d; + ret.v8[4] = e; + ret.v8[5] = f; + ret.v8[6] = g; + ret.v8[7] = h; + ret.v8[8] = i; + ret.v8[9] = j; ret.v8[10] = k; ret.v8[11] = l; ret.v8[12] = m; @@ -162,9 +157,9 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 8; using limb_type = uint16_t; - constexpr inline _uint16x8_t& operator=(const _uint8x16_t& b) noexcept; - constexpr inline _uint16x8_t& operator=(const _uint32x4_t& b) noexcept; - constexpr inline _uint16x8_t& operator=(const _uint64x2_t& b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint32x4_t &b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint64x2_t &b) noexcept; constexpr _uint16x8_t() noexcept {} constexpr _uint16x8_t(const _uint8x16_t &b) noexcept; @@ -198,7 +193,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t set( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint16x8_t ret; ret.v32[0] = d; ret.v32[1] = c; @@ -208,7 +203,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t setr( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint16x8_t ret; ret.v32[0] = a; ret.v32[1] = b; @@ -218,8 +213,8 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t set( - uint16_t a, uint16_t b, uint16_t c, uint16_t d, - uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { + uint16_t a, uint16_t b, uint16_t c, uint16_t d, + uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { _uint16x8_t ret; ret.v16[0] = h; ret.v16[1] = g; @@ -233,8 +228,8 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t setr( - uint16_t a, uint16_t b, uint16_t c, uint16_t d, - uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { + uint16_t a, uint16_t b, uint16_t c, uint16_t d, + uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { _uint16x8_t ret; ret.v64[0] = a; ret.v64[1] = b; @@ -252,9 +247,9 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 4; using limb_type = uint32_t; - constexpr inline _uint32x4_t& operator=(const _uint8x16_t& b) noexcept; - constexpr inline _uint32x4_t& operator=(const _uint16x8_t& b) noexcept; - constexpr inline _uint32x4_t& operator=(const _uint64x2_t& b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint64x2_t &b) noexcept; constexpr _uint32x4_t() noexcept {} constexpr _uint32x4_t(const _uint8x16_t &b) noexcept; @@ -315,9 +310,9 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 2; using limb_type = uint64_t; - constexpr inline _uint64x2_t& operator=(const _uint8x16_t& b) noexcept; - constexpr inline _uint64x2_t& operator=(const _uint16x8_t& b) noexcept; - constexpr inline _uint64x2_t& operator=(const _uint32x4_t& b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint32x4_t &b) noexcept; constexpr _uint64x2_t() noexcept {} constexpr _uint64x2_t(const _uint8x16_t &b) noexcept; @@ -370,15 +365,18 @@ struct uint8x32_t { __m256i v256; }; + /// https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJmH0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D + constexpr uint8x32_t() noexcept = default; + constexpr uint8x32_t(const uint16x16_t &b) noexcept; + constexpr uint8x32_t(const uint32x8_t &b) noexcept; + constexpr uint8x32_t(const uint64x4_t &b) noexcept; + constexpr uint8x32_t(const uint128x2_t &b) noexcept; + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { ASSERT(i < LIMBS); return d[i]; } - /// Example of how the constexpr implementation works: - /// https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJmH0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D - constexpr uint8x32_t() noexcept = default; - /// NOTE: currently cannot be constexpr /// \return [[nodiscard]] static inline uint8x32_t random() noexcept { @@ -446,6 +444,40 @@ struct uint8x32_t { return out; } + /// + /// \param __q31 + /// \param __q30 + /// \param __q29 + /// \param __q28 + /// \param __q27 + /// \param __q26 + /// \param __q25 + /// \param __q24 + /// \param __q23 + /// \param __q22 + /// \param __q21 + /// \param __q20 + /// \param __q19 + /// \param __q18 + /// \param __q17 + /// \param __q16 + /// \param __q15 + /// \param __q14 + /// \param __q13 + /// \param __q12 + /// \param __q11 + /// \param __q10 + /// \param __q09 + /// \param __q08 + /// \param __q07 + /// \param __q06 + /// \param __q05 + /// \param __q04 + /// \param __q03 + /// \param __q02 + /// \param __q01 + /// \param __q00 + /// \return [[nodiscard]] constexpr static inline uint8x32_t setr(char __q31, char __q30, char __q29, char __q28, char __q27, char __q26, char __q25, char __q24, char __q23, char __q22, char __q21, char __q20, @@ -731,6 +763,19 @@ struct uint8x32_t { ret.v256 = popcount_avx2_8(in.v256); return ret; } + + + /// wrapper around: `_mm256_blend_epi8` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint8x32_t blend(const uint8x32_t in1, + const uint8x32_t in2, + const uint8x32_t in3) noexcept { + uint8x32_t ret{}; + ret.v256 = (__m256i) __builtin_ia32_pblendvb256((__v32qi) in1.v256, (__v32qi) in2.v256, (__v32qi) in3.v256); + return ret; + } }; struct uint16x16_t { @@ -748,6 +793,12 @@ struct uint16x16_t { __m256i v256; }; + constexpr uint16x16_t() noexcept = default; + constexpr uint16x16_t(const uint8x32_t &b) noexcept; + constexpr uint16x16_t(const uint32x8_t &b) noexcept; + constexpr uint16x16_t(const uint64x4_t &b) noexcept; + constexpr uint16x16_t(const uint128x2_t &b) noexcept; + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) const { ASSERT(i < LIMBS); return d[i]; @@ -1051,6 +1102,19 @@ struct uint16x16_t { return ret; } + /// wrapper around: `_mm256_blend_epi32` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint16x16_t blend(const uint16x16_t in1, + const uint16x16_t in2) noexcept { + uint16x16_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_pblendw256((__v16hi) (__m256i) (in1.v256), + (__v16hi) (__m256i) (in2.v256), (int) (imm))); + return ret; + } + /// /// \param in /// \return @@ -1073,10 +1137,16 @@ struct uint32x8_t { uint16_t v16[16]; uint32_t v32[8]; uint64_t v64[4]; - cryptanalysislib::_uint32x4_t v128[2]; + // TODO cryptanalysislib::_uint32x4_t v128[2]; __m256i v256; }; + constexpr uint32x8_t() noexcept = default; + constexpr uint32x8_t(const uint8x32_t &b) noexcept; + constexpr uint32x8_t(const uint16x16_t &b) noexcept; + constexpr uint32x8_t(const uint64x4_t &b) noexcept; + constexpr uint32x8_t(const uint128x2_t &b) noexcept; + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; @@ -1442,6 +1512,53 @@ struct uint32x8_t { } } + /// wrapper around: `_mm256_blend_epi32` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint32x8_t blend(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_pblendd256((__v8si) (__m256i) (in1.v256), + (__v8si) (__m256i) (in2.v256), (int) (imm))); + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint32x8_t unpacklo(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v256 = (__m256i) __builtin_shufflevector((__v8si) in1.v256, (__v8si) in2.v256, 0, 8 + 0, 1, 8 + 1, 4, 8 + 4, 5, 8 + 5); + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint32x8_t unpackhi(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v256 = (__m256i) __builtin_shufflevector((__v8si) in1.v256, (__v8si) in2.v256, 2, 8 + 2, 3, 8 + 3, 6, 8 + 6, 7, 8 + 7); + return ret; + } + + /// wrapper around: `_mm256_permute2x128_si256` + /// TODO + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint32x8_t permute(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + return ret; + } + /// /// \param in /// \param perm @@ -1502,6 +1619,12 @@ struct uint64x4_t { __m256i v256; }; + constexpr inline uint64x4_t() noexcept = default; + constexpr inline uint64x4_t(const uint8x32_t &b) noexcept; + constexpr inline uint64x4_t(const uint16x16_t &b) noexcept; + constexpr inline uint64x4_t(const uint32x8_t &b) noexcept; + constexpr inline uint64x4_t(const uint128x2_t &b) noexcept; + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { ASSERT(i < LIMBS); return d[i]; @@ -1759,7 +1882,41 @@ struct uint64x4_t { return out; } - /// + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint64x4_t unpacklo(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v256 = (__m256i) __builtin_shufflevector((__v4di) in1.v256, (__v4di) in2.v256, 0, 4 + 0, 2, 4 + 2); + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint64x4_t unpackhi(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v256 = (__m256i) __builtin_shufflevector((__v4di) in1.v256, (__v4di) in1.v256, 1, 4 + 1, 3, 4 + 3); + return ret; + } + + /// wrapper around: `_mm256_permute2x128_si256` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint64x4_t permute(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_permti256((__v4di) (__m256i) (in1.v256), (__m256i) (in2.v256), (int) (in3))); + return ret; + } + + /// wrapper around: `_mm256_permute4x64_epi64` /// \tparam in2 /// \param in1 /// \return @@ -1895,8 +2052,70 @@ struct uint64x4_t { } }; +struct uint128x2_t { + constexpr static uint32_t LIMBS = 2; + using limb_type = __uint128_t; + union { + // compatibility with TxN_t + __uint128_t d[2]; + uint8_t v8[32]; + uint16_t v16[16]; + uint32_t v32[8]; + uint64_t v64[4]; + __uint128_t v128[2]; + __m256i v256; + }; + + constexpr uint128x2_t() noexcept = default; + constexpr uint128x2_t(const uint8x32_t &b) noexcept; + constexpr uint128x2_t(const uint16x16_t &b) noexcept; + constexpr uint128x2_t(const uint32x8_t &b) noexcept; + constexpr uint128x2_t(const uint64x4_t &b) noexcept; + + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { + ASSERT(i < LIMBS); + return d[i]; + } + + /// NOTE: currently cannot be constexpr + /// \return + [[nodiscard]] static inline uint128x2_t random() noexcept { + uint128x2_t ret{}; + for (size_t i = 0; i < 4; ++i) { + ret.v64[i] = fastrandombytes_uint64(); + } + return ret; + } + + /// + /// \param binary + /// \param hex + constexpr inline void print(bool binary = false, bool hex = false) const; + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) { + uint128x2_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_pslldqi256_byteshift((__v4di) (__m256i) (in1.v256), (int) (imm))); + return ret; + } + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) { + uint128x2_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_psrldqi256_byteshift((__m256i) (in1.v256), (int) (imm))); + return ret; + } +}; #endif diff --git a/src/simd/matrix/simple.h b/src/simd/matrix/simple.h new file mode 100644 index 00000000..fbbb05d3 --- /dev/null +++ b/src/simd/matrix/simple.h @@ -0,0 +1,126 @@ +#ifndef CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H +#define CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H + +class uint1x64x64_T { + uint64_t data[64]; + + constexpr static void transpose(uint64_t *out, uint64_t *in) noexcept { + constexpr static uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000}}; + + for (uint64_t i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (int32_t d = 5; d >= 0; d--) { + const uint32_t s = 1u << d; + + for (uint32_t i = 0; i < 64u; i += s * 2u) { + for (uint32_t j = i; j < i + s; j++) { + const uint64_t x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + const uint64_t y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + out[j + 0] = x; + out[j + s] = y; + } + } + } + } + + /// inplace transpose + constexpr void transpose() noexcept { + for (uint64_t j = 32, m = 0x00000000FFFFFFFF; j; j >>= 1, m ^= m << j) { + for (uint64_t k = 0; k < 64; k = ((k | j) + 1) & ~j) { + uint64_t t = (data[k] ^ (data[k | j] >> j)) & m; + data[k] ^= t; + data[k | j] ^= (t << j); + } + } + } + + constexpr void mul() { + } +}; + + +class uint8x32x32_t { + uint32_t data[32]; + + /// inlpace + constexpr void transpose() noexcept { +#if 1 + // TODO not correct, + // taken from https://github.com/pqov/pqov-paper/blob/main/src/avx2/blas_matrix_avx2.c + alignas(32) uint64x4_t mat[32]; + + // load + for (size_t i = 0; i < 32; i++) { + mat[i] = uint64x4_t::load(data + i); + } + + // swap 16x16 blocks + for (size_t i = 0; i < 16; i++) { + uint64x4_t tmp = uint64x4_t::template permute<0x20>(mat[i], mat[i + 16]); + mat[i + 16] = uint64x4_t::template permute<0x31>(mat[i], mat[i + 16]); + mat[i] = tmp; + } + + // swap 8x8 blocks + for (size_t i = 0; i < 2; i++) { + for (size_t j = 0; j < 8; j++) { + size_t r = 16 * i + j; + uint64x4_t tmp = uint64x4_t::unpacklo(mat[r], mat[r + 8]); + mat[r + 8] = uint64x4_t::unpackhi(mat[r], mat[r + 8]); + mat[r] = tmp; + } + } + + // swap 4x4 blocks + for (size_t i = 0; i < 4; i++) { + for (size_t j = 0; j < 4; j++) { + size_t r = 8 * i + j; + uint32x8_t tmp = uint32x8_t::template blend<0xaa>(mat[r], uint128x2_t::template slli<4>(mat[r + 4])); + mat[r + 4] = uint32x8_t::template blend<0xaa>(uint128x2_t::template srli<4>(mat[r]), mat[r + 4]); + mat[r] = tmp; + } + } + + // swap 2x2 blocks + for (size_t i = 0; i < 8; i++) { + for (size_t j = 0; j < 2; j++) { + size_t r = 4 * i + j; + uint16x16_t tmp = uint16x16_t::template blend<0xaa>(mat[r], uint128x2_t::template slli<2>(mat[r + 2])); + mat[r + 2] = uint16x16_t::template blend<0xaa>(uint128x2_t::template srli<0xaa>(mat[r]), mat[r + 2]); + mat[r] = tmp; + } + } + + // swap last bytes + for (size_t i = 0; i < 16; i++) { + size_t r = 2 * i; + const uint16x16_t blend_mask = uint16x16_t::set1(0xFF00); + const uint64x4_t tmp = uint8x32_t::blend(mat[r], uint128x2_t::template slli<1>(mat[r + 1]), blend_mask); + mat[r + 1] = uint8x32_t::blend(uint128x2_t::template srli<1>(mat[r]), mat[r + 1], blend_mask); + mat[r] = tmp; + } + // store result + for (size_t i = 0; i < 32; i++) { + uint64x4_t::store(data + i * 64, mat[i]); + } +#else + for (unsigned i = 0; i < 32; i++) { + for (unsigned j = i + 1; j < 32; j++) { + uint8_t tmp = mat[j * 64 + i]; + mat[j * 64 + i] = mat[i * 64 + j]; + mat[i * 64 + j] = tmp; + } + } +#endif + } +}; + +#endif//CRYPTANALYSISLIB_SIMD_MATRIX_SIMPLE_H diff --git a/src/simd/simd.h b/src/simd/simd.h index d98fc2e2..19fa1740 100644 --- a/src/simd/simd.h +++ b/src/simd/simd.h @@ -10,6 +10,16 @@ #include "print/print.h" #include "random.h" +namespace cryptanalysislib { + struct _uint16x8_t; + struct _uint32x4_t; + struct _uint64x2_t; +}// namespace cryptanalysislib + +struct uint16x16_t; +struct uint32x8_t; +struct uint64x4_t; +struct uint128x2_t; #if defined(USE_AVX2) @@ -30,17 +40,13 @@ #else namespace cryptanalysislib { - struct _uint16x8_t; - struct _uint32x4_t; - struct _uint64x2_t; - struct _uint8x16_t { constexpr static uint32_t LIMBS = 16; using limb_type = uint8_t; - constexpr inline _uint8x16_t& operator=(const _uint16x8_t& b) noexcept; - constexpr inline _uint8x16_t& operator=(const _uint32x4_t& b) noexcept; - constexpr inline _uint8x16_t& operator=(const _uint64x2_t& b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint32x4_t &b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint64x2_t &b) noexcept; constexpr _uint8x16_t() noexcept {} constexpr _uint8x16_t(const _uint16x8_t &b) noexcept; @@ -57,7 +63,7 @@ namespace cryptanalysislib { uint64_t v64[2]; }; - [[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept { + [[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; } @@ -79,7 +85,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint8x16_t set( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint8x16_t ret; ret.v32[0] = d; ret.v32[1] = c; @@ -89,7 +95,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint8x16_t setr( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint8x16_t ret; ret.v32[0] = a; ret.v32[1] = b; @@ -99,22 +105,21 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint8x16_t set( - uint8_t a, uint8_t b, uint8_t c, uint8_t d, - uint8_t e, uint8_t f, uint8_t g, uint8_t h, - uint8_t i, uint8_t j, uint8_t k, uint8_t l, - uint8_t m, uint8_t n, uint8_t o, uint8_t p - ) noexcept { + uint8_t a, uint8_t b, uint8_t c, uint8_t d, + uint8_t e, uint8_t f, uint8_t g, uint8_t h, + uint8_t i, uint8_t j, uint8_t k, uint8_t l, + uint8_t m, uint8_t n, uint8_t o, uint8_t p) noexcept { _uint8x16_t ret; - ret.v8[ 0] = p; - ret.v8[ 1] = o; - ret.v8[ 2] = n; - ret.v8[ 3] = m; - ret.v8[ 4] = l; - ret.v8[ 5] = k; - ret.v8[ 6] = j; - ret.v8[ 7] = i; - ret.v8[ 8] = h; - ret.v8[ 9] = g; + ret.v8[0] = p; + ret.v8[1] = o; + ret.v8[2] = n; + ret.v8[3] = m; + ret.v8[4] = l; + ret.v8[5] = k; + ret.v8[6] = j; + ret.v8[7] = i; + ret.v8[8] = h; + ret.v8[9] = g; ret.v8[10] = f; ret.v8[11] = e; ret.v8[12] = d; @@ -125,22 +130,21 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint8x16_t setr( - uint8_t a, uint8_t b, uint8_t c, uint8_t d, - uint8_t e, uint8_t f, uint8_t g, uint8_t h, - uint8_t i, uint8_t j, uint8_t k, uint8_t l, - uint8_t m, uint8_t n, uint8_t o, uint8_t p - ) noexcept { + uint8_t a, uint8_t b, uint8_t c, uint8_t d, + uint8_t e, uint8_t f, uint8_t g, uint8_t h, + uint8_t i, uint8_t j, uint8_t k, uint8_t l, + uint8_t m, uint8_t n, uint8_t o, uint8_t p) noexcept { _uint8x16_t ret; - ret.v8[ 0] = a; - ret.v8[ 1] = b; - ret.v8[ 2] = c; - ret.v8[ 3] = d; - ret.v8[ 4] = e; - ret.v8[ 5] = f; - ret.v8[ 6] = g; - ret.v8[ 7] = h; - ret.v8[ 8] = i; - ret.v8[ 9] = j; + ret.v8[0] = a; + ret.v8[1] = b; + ret.v8[2] = c; + ret.v8[3] = d; + ret.v8[4] = e; + ret.v8[5] = f; + ret.v8[6] = g; + ret.v8[7] = h; + ret.v8[8] = i; + ret.v8[9] = j; ret.v8[10] = k; ret.v8[11] = l; ret.v8[12] = m; @@ -155,9 +159,9 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 8; using limb_type = uint16_t; - constexpr inline _uint16x8_t& operator=(const _uint8x16_t& b) noexcept; - constexpr inline _uint16x8_t& operator=(const _uint32x4_t& b) noexcept; - constexpr inline _uint16x8_t& operator=(const _uint64x2_t& b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint32x4_t &b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint64x2_t &b) noexcept; constexpr _uint16x8_t() noexcept {} constexpr _uint16x8_t(const _uint8x16_t &b) noexcept; @@ -174,7 +178,7 @@ namespace cryptanalysislib { uint64_t v64[2]; }; - [[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept { + [[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; } @@ -196,7 +200,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t set( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint16x8_t ret; ret.v32[0] = d; ret.v32[1] = c; @@ -206,7 +210,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t setr( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint16x8_t ret; ret.v32[0] = a; ret.v32[1] = b; @@ -216,8 +220,8 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t set( - uint16_t a, uint16_t b, uint16_t c, uint16_t d, - uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { + uint16_t a, uint16_t b, uint16_t c, uint16_t d, + uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { _uint16x8_t ret; ret.v16[0] = h; ret.v16[1] = g; @@ -231,8 +235,8 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint16x8_t setr( - uint16_t a, uint16_t b, uint16_t c, uint16_t d, - uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { + uint16_t a, uint16_t b, uint16_t c, uint16_t d, + uint16_t e, uint16_t f, uint16_t g, uint16_t h) noexcept { _uint16x8_t ret; ret.v64[0] = a; ret.v64[1] = b; @@ -250,9 +254,9 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 4; using limb_type = uint32_t; - constexpr inline _uint32x4_t& operator=(const _uint8x16_t& b) noexcept; - constexpr inline _uint32x4_t& operator=(const _uint16x8_t& b) noexcept; - constexpr inline _uint32x4_t& operator=(const _uint64x2_t& b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint64x2_t &b) noexcept; constexpr _uint32x4_t() noexcept {} constexpr _uint32x4_t(const _uint8x16_t &b) noexcept; @@ -269,7 +273,7 @@ namespace cryptanalysislib { uint64_t v64[2]; }; - [[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept { + [[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; } @@ -291,7 +295,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint32x4_t set( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint32x4_t ret; ret.v32[0] = d; ret.v32[1] = c; @@ -301,7 +305,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint32x4_t setr( - uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { + uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept { _uint32x4_t ret; ret.v32[0] = a; ret.v32[1] = b; @@ -311,7 +315,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint32x4_t set( - uint64_t a, uint64_t b) noexcept { + uint64_t a, uint64_t b) noexcept { _uint32x4_t ret; ret.v64[0] = b; ret.v64[1] = a; @@ -319,7 +323,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint32x4_t setr( - uint64_t a, uint64_t b) noexcept { + uint64_t a, uint64_t b) noexcept { _uint32x4_t ret; ret.v64[0] = a; ret.v64[1] = b; @@ -331,9 +335,9 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 2; using limb_type = uint64_t; - constexpr inline _uint64x2_t& operator=(const _uint8x16_t& b) noexcept; - constexpr inline _uint64x2_t& operator=(const _uint16x8_t& b) noexcept; - constexpr inline _uint64x2_t& operator=(const _uint32x4_t& b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint32x4_t &b) noexcept; constexpr _uint64x2_t() noexcept {} constexpr _uint64x2_t(const _uint8x16_t &b) noexcept; @@ -349,7 +353,7 @@ namespace cryptanalysislib { uint64_t v64[2]; }; - [[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept { + [[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; } @@ -371,7 +375,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint64x2_t set( - uint64_t a, uint64_t b) noexcept{ + uint64_t a, uint64_t b) noexcept { _uint64x2_t ret; ret.v64[0] = b; ret.v64[1] = a; @@ -379,7 +383,7 @@ namespace cryptanalysislib { } [[nodiscard]] constexpr static inline _uint64x2_t setr( - uint64_t a, uint64_t b) noexcept { + uint64_t a, uint64_t b) noexcept { _uint64x2_t ret; ret.v64[0] = a; ret.v64[1] = b; @@ -403,7 +407,7 @@ struct uint8x32_t { uint64_t v64[4]; }; - [[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept { + [[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; } @@ -730,7 +734,7 @@ struct uint8x32_t { /// \param in2 /// \return [[nodiscard]] constexpr static inline uint8x32_t mullo(const uint8x32_t in1, - const uint8x32_t in2) noexcept { + const uint8x32_t in2) noexcept { uint8x32_t out; for (uint32_t i = 0; i < 32; i++) { out.v8[i] = in1.v8[i] * in2.v8[i]; @@ -740,7 +744,7 @@ struct uint8x32_t { /// [[nodiscard]] constexpr static inline uint8x32_t mullo(const uint8x32_t in1, - const uint8_t in2) noexcept { + const uint8_t in2) noexcept { uint8x32_t rs = uint8x32_t::set1(in2); return uint8x32_t::mullo(in1, rs); } @@ -774,7 +778,7 @@ struct uint8x32_t { } [[nodiscard]] constexpr static inline uint8x32_t cmp_(const uint8x32_t in1, - const uint8x32_t in2) noexcept { + const uint8x32_t in2) noexcept { uint8x32_t ret; for (uint32_t i = 0; i < 32; i++) { ret.v8[i] = in1.v8[i] == in2.v8[i]; @@ -783,7 +787,7 @@ struct uint8x32_t { } [[nodiscard]] constexpr static inline uint8x32_t gt_(const uint8x32_t in1, - const uint8x32_t in2) noexcept { + const uint8x32_t in2) noexcept { uint8x32_t ret; for (uint32_t i = 0; i < 32; i++) { ret.v8[i] = in1.v8[i] > in2.v8[i]; @@ -792,7 +796,7 @@ struct uint8x32_t { } [[nodiscard]] constexpr static inline int cmp(const uint8x32_t in1, - const uint8x32_t in2) noexcept { + const uint8x32_t in2) noexcept { int ret = 0; for (uint32_t i = 0; i < 32; i++) { ret ^= (in1.v8[i] == in2.v8[i]) << i; @@ -802,7 +806,7 @@ struct uint8x32_t { } [[nodiscard]] constexpr static inline int gt(const uint8x32_t in1, - const uint8x32_t in2) { + const uint8x32_t in2) { int ret = 0; for (uint32_t i = 0; i < 32; i++) { ret ^= (in1.v8[i] > in2.v8[i]) << i; @@ -941,7 +945,7 @@ struct uint16x16_t { /// \param ptr /// \return [[nodiscard]] constexpr static inline uint16x16_t unaligned_load( - const void *ptr) noexcept { + const void *ptr) noexcept { uint16x16_t out; const uint64_t *ptr64 = (uint64_t *) ptr; for (uint32_t i = 0; i < 4; i++) { @@ -1079,7 +1083,7 @@ struct uint16x16_t { /// \param in2 /// \return [[nodiscard]] constexpr static inline uint16x16_t mullo(const uint16x16_t in1, - const uint16x16_t in2) noexcept { + const uint16x16_t in2) noexcept { uint16x16_t out; for (uint32_t i = 0; i < 16; i++) { out.v16[i] = in1.v16[i] * in2.v16[i]; @@ -1089,7 +1093,7 @@ struct uint16x16_t { /// [[nodiscard]] constexpr static inline uint16x16_t mullo(const uint16x16_t in1, - const uint8_t in2) noexcept { + const uint8_t in2) noexcept { uint16x16_t rs = uint16x16_t::set1(in2); return uint16x16_t::mullo(in1, rs); } @@ -1123,7 +1127,7 @@ struct uint16x16_t { } [[nodiscard]] constexpr static inline uint16x16_t cmp_( - const uint16x16_t in1, const uint16x16_t in2) noexcept { + const uint16x16_t in1, const uint16x16_t in2) noexcept { uint16x16_t ret; for (uint32_t i = 0; i < 16; i++) { ret.v16[i] = in1.v16[i] == in2.v16[i]; @@ -1142,7 +1146,7 @@ struct uint16x16_t { } [[nodiscard]] constexpr static inline uint32_t cmp(const uint16x16_t in1, - const uint16x16_t in2) noexcept { + const uint16x16_t in2) noexcept { uint32_t ret = 0; for (uint32_t i = 0; i < 16; i++) { ret ^= (in1.v16[i] == in2.v16[i]) << i; @@ -1151,8 +1155,8 @@ struct uint16x16_t { return ret; } - [[nodiscard]] constexpr static inline uint32_t gt(const uint16x16_t in1, - const uint16x16_t in2) noexcept { + [[nodiscard]] constexpr static inline uint32_t gt(const uint16x16_t in1, + const uint16x16_t in2) noexcept { uint32_t ret = 0; for (uint32_t i = 0; i < 16; i++) { ret ^= (in1.v16[i] > in2.v16[i]) << i; @@ -1195,7 +1199,7 @@ struct uint32x8_t { uint64_t v64[4]; }; - [[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept { + [[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; } @@ -1461,7 +1465,7 @@ struct uint32x8_t { } [[nodiscard]] constexpr static inline uint32x8_t cmp_(const uint32x8_t in1, - const uint32x8_t in2) noexcept { + const uint32x8_t in2) noexcept { uint32x8_t ret; for (uint32_t i = 0; i < 8; i++) { ret.v32[i] = in1.v32[i] == in2.v32[i]; @@ -1471,7 +1475,7 @@ struct uint32x8_t { } [[nodiscard]] constexpr static inline uint32x8_t gt_(const uint32x8_t in1, - const uint32x8_t in2) noexcept { + const uint32x8_t in2) noexcept { uint32x8_t ret; for (uint32_t i = 0; i < 8; i++) { ret.v32[i] = in1.v32[i] > in2.v32[i]; @@ -1481,7 +1485,7 @@ struct uint32x8_t { } [[nodiscard]] constexpr static inline uint32_t cmp(const uint32x8_t in1, - const uint32x8_t in2) noexcept{ + const uint32x8_t in2) noexcept { uint32_t ret = 0; for (uint32_t i = 0; i < 8; i++) { ret ^= (in1.v32[i] == in2.v32[i]) << i; @@ -1491,7 +1495,7 @@ struct uint32x8_t { } [[nodiscard]] constexpr static inline uint32_t gt(const uint32x8_t in1, - const uint32x8_t in2) noexcept { + const uint32x8_t in2) noexcept { uint32_t ret = 0; for (uint32_t i = 0; i < 8; i++) { ret ^= (in1.v32[i] > in2.v32[i]) << i; @@ -1515,7 +1519,7 @@ struct uint32x8_t { /// \return template [[nodiscard]] constexpr static inline uint32x8_t gather(const void *ptr, - const uint32x8_t data) noexcept { + const uint32x8_t data) noexcept { uint32x8_t ret; const uint8_t *ptr8 = (uint8_t *) ptr; for (uint32_t i = 0; i < 8; i++) { @@ -1530,7 +1534,7 @@ struct uint32x8_t { /// \param perm /// \return [[nodiscard]] constexpr static inline uint32x8_t permute(const uint32x8_t in, - const uint32x8_t perm) noexcept { + const uint32x8_t perm) noexcept { uint32x8_t ret; for (uint32_t i = 0; i < 8; i++) { ret.v32[i] = in.v32[perm.v32[i] & 0x7]; @@ -1584,7 +1588,7 @@ struct uint64x4_t { uint64_t v64[4]; }; - [[nodiscard]] constexpr inline limb_type& operator[](const uint32_t i) noexcept { + [[nodiscard]] constexpr inline limb_type &operator[](const uint32_t i) noexcept { ASSERT(i < LIMBS); return d[i]; } @@ -1612,7 +1616,7 @@ struct uint64x4_t { [[nodiscard]] constexpr static inline uint64x4_t setr( const uint64_t a0, const uint64_t a1, - const uint64_t a2, const uint64_t a3) noexcept { + const uint64_t a2, const uint64_t a3) noexcept { uint64x4_t out; out.v64[0] = a0; out.v64[1] = a1; @@ -1623,7 +1627,7 @@ struct uint64x4_t { [[nodiscard]] constexpr static inline uint64x4_t set( const uint64_t a0, const uint64_t a1, - const uint64_t a2, const uint64_t a3) noexcept { + const uint64_t a2, const uint64_t a3) noexcept { return uint64x4_t::setr(a3, a2, a1, a0); } @@ -1812,7 +1816,7 @@ struct uint64x4_t { /// [[nodiscard]] constexpr static inline uint64x4_t mullo(const uint64x4_t in1, - const uint8_t in2) noexcept { + const uint8_t in2) noexcept { uint64x4_t rs = uint64x4_t::set1(in2); return uint64x4_t::mullo(in1, rs); } @@ -1846,7 +1850,7 @@ struct uint64x4_t { } [[nodiscard]] constexpr static inline uint64x4_t cmp_(const uint64x4_t in1, - const uint64x4_t in2) noexcept { + const uint64x4_t in2) noexcept { uint64x4_t ret; for (uint8_t i = 0; i < 4; i++) { ret.v64[i] = in1.v64[i] == in2.v64[i]; @@ -1855,7 +1859,7 @@ struct uint64x4_t { } [[nodiscard]] constexpr static inline uint64x4_t gt_(const uint64x4_t in1, - const uint64x4_t in2) noexcept { + const uint64x4_t in2) noexcept { uint64x4_t ret; for (uint32_t i = 0; i < 4; i++) { ret.v64[i] = in1.v64[i] > in2.v64[i]; @@ -1865,7 +1869,7 @@ struct uint64x4_t { } [[nodiscard]] constexpr static inline uint32_t cmp(const uint64x4_t in1, - const uint64x4_t in2) noexcept { + const uint64x4_t in2) noexcept { uint32_t ret = 0; for (uint8_t i = 0; i < 4; i++) { ret ^= (in1.v64[i] == in2.v64[i]) << i; @@ -1875,7 +1879,7 @@ struct uint64x4_t { } [[nodiscard]] constexpr static inline uint32_t gt(const uint64x4_t in1, - const uint64x4_t in2) noexcept { + const uint64x4_t in2) noexcept { uint32_t ret = 0; for (uint32_t i = 0; i < 4; i++) { ret ^= (in1.v64[i] > in2.v64[i]) << i; @@ -1899,7 +1903,7 @@ struct uint64x4_t { /// \return template [[nodiscard]] constexpr static inline uint64x4_t gather(const void *ptr, - const uint64x4_t data) noexcept { + const uint64x4_t data) noexcept { static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); uint64x4_t ret; @@ -1917,7 +1921,7 @@ struct uint64x4_t { /// \return template [[nodiscard]] constexpr static inline uint64x4_t gather(const void *ptr, - const cryptanalysislib::_uint32x4_t data) noexcept { + const cryptanalysislib::_uint32x4_t data) noexcept { static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); uint64x4_t ret; const uint8_t *ptr8 = (uint8_t *) ptr; @@ -1933,7 +1937,7 @@ struct uint64x4_t { /// \param perm /// \return [[nodiscard]] constexpr static inline uint64x4_t permute(const uint64x4_t in, - const uint64x4_t perm) noexcept { + const uint64x4_t perm) noexcept { uint64x4_t ret; for (uint32_t i = 0; i < 4; i++) { ret.v64[i] = in.v64[perm.v64[i]]; @@ -1965,6 +1969,64 @@ struct uint64x4_t { } }; +struct uint128x2_t { + constexpr static uint32_t LIMBS = 2; + using limb_type = __uint128_t; + + union { + // compatibility with TxN_t + __uint128_t d[2]; + + uint8_t v8[32]; + uint16_t v16[16]; + uint32_t v32[8]; + uint64_t v64[4]; + __uint128_t v128[2]; + __m256i v256; + }; + + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { + ASSERT(i < LIMBS); + return d[i]; + } + + /// Example of how the constexpr implementation works: + /// https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGEgBykrgAyeAyYAHI%2BAEaYxCCSZqQADqgKhE4MHt6%2BASlpGQKh4VEssfGJtpj2jgJCBEzEBNk%2BflyBdpgOmfWNBMWRMXEJSQoNTS257bbj/WGDZcOJAJS2qF7EyOwc5gDMYcjeWADUJrtuY/iCAHQIZ9gmGgCCewdHmKfneCwsYQTEYVu90eLzM%2BwYhy8JzObmQlycwOeILGxC8DmOXj%2B/lUuzMAH0CKcAOxWJEaACcXgYmWJpJeFMelKxBOOxwAbv5TgBWCy4kxcgAiZzpjMZmMEXAAbCy2VL%2BRY5YLhSDRRTxQRcTK%2BTzjv5%2BULdiKGWq/pLJDKzfLjpJ9cqyeTGXi8SwzFzJXh2a7JXb6Q6iQa6azWSCg8c0AwxphVMliMcxkxHMhjmFaPMMVicfjCaoSHiIOGxunBNjNYSwlxSCqKaGa7W6/WC4T1SWs8mGGZlrSqw6TcXMyz1gQfaryYPrmyvZ8BccIE6XW68J2oE62ZJ0F5luXx5P%2BdgZyu1xuwmZt27lsPjeSAPRXscTt1T47O%2BfSnPEPHpL0QLf3yWkNsnr%2B56Gt2jLEJgBAbAwxyDhefpCmSvo3le3bIdybjJI0rDHMk/yoTe6HgZBxAMN2jZRjGcYNImbapuERYEC2LJUmIeDAOE6B4rQqBMOg%2BYCIWbKoHg6DHAAVLhxCdiYJKgRSTBeEQ4mSTuuzTrOzpeh6YnLJJcFihmpYwYp%2BkUneqnThJ/xeqZ163opp6So%2Bz5elxPHrh%2BeBfhpL7abp/zAUaPbkkRUHGUOIH2jJCFPKGIZBsh%2BFXuhmHENhen2mh/IYVhLBtmRAkEBRsbxjRKZpkJIkYswqbsZgnFjCQmAQJVolWWQYaFQxTFlgw0myVFFJzlpynWQ%2BZzqcNC7if5xC2e1Flto5Ppxf6KprZF9JPOR0axq1xxFWMEDNiyYmDv%2Bjbdad5YXV1J2EmJx63RGTZ/EZfVds8ob8LGx1va2HoTccGjCsm1i4p8bjHKRhpg5YE24v1gZ1pdzb9oSTCPmjpYgCALG1Rxbm8d%2BDBcKcljJmJiMrSjd2Ga20RY/TBK4/jbGE9xxPHuTFiU9Tm0NnTfZGUmQPY1muNvnmTD/tEgXxbW4ss3jNXs/VH5EOBEDnWG8tfUG0XrTFIJ/McLBMGEEBIwrnUvcVDHvY%2BXBmHqAtBjtlH3eF8qkYKj4yRYIMbcjrIe7GXvlj7%2Br%2BySQcBjbYdXb1ZhR37QMB3HF6xUGh0EFAXs6TrN0AaQfU06yMZ/FQEDmCnrroNlpFmEk3s8iDgp6y8/ocKstCcFyvB%2BBwWikKgnBuNY1hxusmwfHsPCkAQmg96sADWIC7Ls1waP4kiSkSkhEv4GhH0ff59xwkiD8vo%2BcLwCggBoi/L6scCwEgaAsMkdBxOQlCf9/eg8RgBcFxKQLAbI8BbAAGp4EwAAdwAPLJEYJwBeNBaBFWIA/CA0Qb7RDCI0AAnmg3gBDmDECIYg6I2guhL24LwT%2BbBBCIIYLQEhw9eBYGiF4YAbgxC0Afgw8BmBzZGHEJw8BeBwLdDZJgIRI8oxdEUtsBefxqg31TNENKlCPBYBvv8b4pDVhUAMMABQsCEHINQcI/gggRBiHYFIGQghFAqHUJI3QFYDBGBQJPSw%2Bg8DRAfpAVYqBcKZCEQAWnNmyVQZhjhRMQbsXgqA5HEABFgEJVsqg1EyC4Bg7hPCtAkESIIhSBilHKBIJ%2BqR0i1CyMUqYZS6mFAYJUoY8QuBP06N0OosxJhtDKb0hpvQmgdMWF0npAymlDJmH0CZ1TumrAUDPLYEhe792vpIseHBjiqH8JKKJZpjjAGQEmUB1wEkQFwIQEg5NdhcGWLwehWhlhrw3lvLk/guDknJO0fw3yj4/OkBfK%2Bpcb67Pvo/Z%2BnDX4wEQCAQcyRFJ/34l/H%2BxAIisG2Aco5JyzkXK3mYXg9U7mZL0HY4QohxDOKpW4tQN8vGkHgWlZIxj9BbIhTszgiDFIosJKgKg%2BzDnHMkKc85xxLnXI8BioBDynkvJfh8ze1xN7qo1Zqzll9tkjyhbYGFryV7apJdyvVd9YVvNWOk9IzhJBAA%3D + constexpr uint128x2_t() noexcept = default; + + /// NOTE: currently cannot be constexpr + /// \return + [[nodiscard]] static inline uint128x2_t random() noexcept { + uint128x2_t ret{}; + for (size_t i = 0; i < 4; ++i) { + ret.v64[i] = fastrandombytes_uint64(); + } + return ret; + } + + /// + /// \param binary + /// \param hex + constexpr inline void print(bool binary = false, bool hex = false) const; + + template + [[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) { + uint128x2_t ret{}; + /// TODO + return ret; + } + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) { + uint128x2_t ret{}; + ret.v256 = ((__m256i) __builtin_ia32_psrldqi256_byteshift((__m256i) (in1.v256), (int) (imm))); + return ret; + } +}; #include "simd/float/simd.h" #endif// no SIMD unit available @@ -2178,44 +2240,44 @@ constexpr inline int operator>(const uint8x32_t &a, const uint8x32_t &b) { /// constexpr inline int operator==(const uint16x16_t &a, const uint16x16_t &b) { - return (int)uint16x16_t::cmp(a, b); + return (int) uint16x16_t::cmp(a, b); } constexpr inline int operator!=(const uint16x16_t &a, const uint16x16_t &b) { return 0xffff ^ uint16x16_t::cmp(a, b); } constexpr inline int operator<(const uint16x16_t &a, const uint16x16_t &b) { - return (int)uint16x16_t::gt(b, a); + return (int) uint16x16_t::gt(b, a); } constexpr inline int operator>(const uint16x16_t &a, const uint16x16_t &b) { - return (int)uint16x16_t::gt(a, b); + return (int) uint16x16_t::gt(a, b); } /// constexpr inline int operator==(const uint32x8_t &a, const uint32x8_t &b) { - return (int)uint32x8_t::cmp(a, b); + return (int) uint32x8_t::cmp(a, b); } constexpr inline int operator!=(const uint32x8_t &a, const uint32x8_t &b) { return 0xff ^ uint32x8_t::cmp(a, b); } constexpr inline int operator<(const uint32x8_t &a, const uint32x8_t &b) { - return (int)uint32x8_t::gt(b, a); + return (int) uint32x8_t::gt(b, a); } constexpr inline int operator>(const uint32x8_t &a, const uint32x8_t &b) { - return (int)uint32x8_t::gt(a, b); + return (int) uint32x8_t::gt(a, b); } constexpr inline int operator==(const uint64x4_t &a, const uint64x4_t &b) { - return (int)uint64x4_t::cmp(a, b); + return (int) uint64x4_t::cmp(a, b); } constexpr inline int operator!=(const uint64x4_t &a, const uint64x4_t &b) { return 0xf ^ uint64x4_t::cmp(a, b); } constexpr inline int operator<(const uint64x4_t &a, const uint64x4_t &b) { - return (int)uint64x4_t::gt(b, a); + return (int) uint64x4_t::gt(b, a); } constexpr inline int operator>(const uint64x4_t &a, const uint64x4_t &b) { - return (int)uint64x4_t::gt(a, b); + return (int) uint64x4_t::gt(a, b); } @@ -2495,78 +2557,9 @@ namespace cryptanalysislib { v64[i] = b.v64[i]; } } -} - - - -void transpose8(unsigned char A[8], int m, int n, - unsigned char B[8]) { - unsigned x, y, t; - - // Load the array and pack it into x and y. - - x = (A[0]<<24) | (A[m]<<16) | (A[2*m]<<8) | A[3*m]; - y = (A[4*m]<<24) | (A[5*m]<<16) | (A[6*m]<<8) | A[7*m]; - - t = (x ^ (x >> 7)) & 0x00AA00AA; x = x ^ t ^ (t << 7); - t = (y ^ (y >> 7)) & 0x00AA00AA; y = y ^ t ^ (t << 7); - - t = (x ^ (x >>14)) & 0x0000CCCC; x = x ^ t ^ (t <<14); - t = (y ^ (y >>14)) & 0x0000CCCC; y = y ^ t ^ (t <<14); - - t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F); - y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F); - x = t; +}// namespace cryptanalysislib - B[0]=x>>24; B[n]=x>>16; B[2*n]=x>>8; B[3*n]=x; - B[4*n]=y>>24; B[5*n]=y>>16; B[6*n]=y>>8; B[7*n]=y; -} - -/// input: in, a 64x64 matrix over GF(2) -/// output: out, transpose of in -void transpose_64x64(uint64_t * out, uint64_t * in) { - const static uint64_t masks[6][2] = { - {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, - {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, - {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, - {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, - {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, - {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} - }; - - for (uint64_t i = 0; i < 64; i++) { - out[i] = in[i]; - } - - for (uint32_t d = 5; d >= 0; d--) { - const uint32_t s = 1 << d; - - for (uint32_t i = 0; i < 64; i += s*2) { - for (uint32_t j = i; j < i+s; j++) { - const uint64_t x = (out[j] & masks[d][0]) | ((out[j+s] & masks[d][0]) << s); - const uint64_t y = ((out[j] & masks[d][1]) >> s) | (out[j+s] & masks[d][1]); - out[j+0] = x; - out[j+s] = y; - } - } - } -} - - -// inplace -inline void transpose64(uint64_t a[64]) noexcept { - for (uint64_t j = 32, m = 0x00000000FFFFFFFF; j; j >>= 1, m ^= m << j) { - for (uint64_t k = 0; k < 64; k = ((k | j) + 1) & ~j) { - uint64_t t = (a[k] ^ (a[k | j] >> j)) & m; - a[k] ^= t; - a[k | j] ^= (t << j); - } - } -} - -// TODO -//struct b256x64_T - -#include "simd/generic.h" #include "simd/bits/bits.h" +#include "simd/generic.h" +#include "simd/matrix/simple.h" #endif//CRYPTANALYSISLIB_SIMD_H diff --git a/tests/container/hashmap/simd.cpp b/tests/container/hashmap/simd.cpp index 75214e06..3ff10e27 100644 --- a/tests/container/hashmap/simd.cpp +++ b/tests/container/hashmap/simd.cpp @@ -34,7 +34,7 @@ TEST(SimdHashMap, avxInsert) { using HM = SIMDHashMap; HM hm = HM{}; - uint32x8_t data, index; + uint32x8_t data{}, index{}; for (uint64_t i = 0; i < ((1u << l) * bucket_size / 8); ++i) { for (uint32_t j = 0; j < 8; ++j) { data.v32[j] = i * 8 + j; From 6e9f9027a13cb59c21cb970cea66970d75ec95e9 Mon Sep 17 00:00:00 2001 From: Floyd Date: Wed, 6 Mar 2024 19:29:51 +0100 Subject: [PATCH 2/3] fixxed neon --- src/container/linkedlist/linkedlist.h | 4 +- src/simd/avx2.h | 2 +- src/simd/neon.h | 324 ++++++++++++++++++++++++-- src/simd/simd.h | 3 + tests/container/hashmap/simple.cpp | 2 +- 5 files changed, 318 insertions(+), 17 deletions(-) diff --git a/src/container/linkedlist/linkedlist.h b/src/container/linkedlist/linkedlist.h index e1d6e4e7..2c285c88 100644 --- a/src/container/linkedlist/linkedlist.h +++ b/src/container/linkedlist/linkedlist.h @@ -1,5 +1,5 @@ -#ifndef CRYPTANALYSISLIB_LINKEDLIST_H -#define CRYPTANALYSISLIB_LINKEDLIST_H +#ifndef CRYPTANALYSISLIB_FREELIST_LINKEDLIST_H +#define CRYPTANALYSISLIB_FREELIST_LINKEDLIST_H #include diff --git a/src/simd/avx2.h b/src/simd/avx2.h index da1796f8..6b6ea8bd 100644 --- a/src/simd/avx2.h +++ b/src/simd/avx2.h @@ -1106,7 +1106,7 @@ struct uint16x16_t { /// \tparam in2 /// \param in1 /// \return - template + template [[nodiscard]] constexpr static inline uint16x16_t blend(const uint16x16_t in1, const uint16x16_t in2) noexcept { uint16x16_t ret{}; diff --git a/src/simd/neon.h b/src/simd/neon.h index 6e81f645..30944973 100644 --- a/src/simd/neon.h +++ b/src/simd/neon.h @@ -12,6 +12,15 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 16; using limb_type = uint8_t; + constexpr inline _uint8x16_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint32x4_t &b) noexcept; + constexpr inline _uint8x16_t &operator=(const _uint64x2_t &b) noexcept; + + constexpr _uint8x16_t() noexcept {} + constexpr _uint8x16_t(const _uint16x8_t &b) noexcept; + constexpr _uint8x16_t(const _uint32x4_t &b) noexcept; + constexpr _uint8x16_t(const _uint64x2_t &b) noexcept; + union { // compatibility to `TxN_t` uint8_t d[16]; @@ -120,6 +129,15 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 8; using limb_type = uint16_t; + constexpr inline _uint16x8_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint32x4_t &b) noexcept; + constexpr inline _uint16x8_t &operator=(const _uint64x2_t &b) noexcept; + + constexpr _uint16x8_t() noexcept {} + constexpr _uint16x8_t(const _uint8x16_t &b) noexcept; + constexpr _uint16x8_t(const _uint32x4_t &b) noexcept; + constexpr _uint16x8_t(const _uint64x2_t &b) noexcept; + union { // compatibility to `TxN_t` uint16_t d[8]; @@ -206,6 +224,15 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 4; using limb_type = uint32_t; + constexpr inline _uint32x4_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint32x4_t &operator=(const _uint64x2_t &b) noexcept; + + constexpr _uint32x4_t() noexcept {} + constexpr _uint32x4_t(const _uint8x16_t &b) noexcept; + constexpr _uint32x4_t(const _uint16x8_t &b) noexcept; + constexpr _uint32x4_t(const _uint64x2_t &b) noexcept; + union { // compatibility to `TxN_t` uint32_t d[4]; @@ -278,6 +305,15 @@ namespace cryptanalysislib { constexpr static uint32_t LIMBS = 2; using limb_type = uint64_t; + constexpr inline _uint64x2_t &operator=(const _uint8x16_t &b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint16x8_t &b) noexcept; + constexpr inline _uint64x2_t &operator=(const _uint32x4_t &b) noexcept; + + constexpr _uint64x2_t() noexcept {} + constexpr _uint64x2_t(const _uint8x16_t &b) noexcept; + constexpr _uint64x2_t(const _uint16x8_t &b) noexcept; + constexpr _uint64x2_t(const _uint32x4_t &b) noexcept; + union { uint64_t d[2]; @@ -457,6 +493,12 @@ struct uint8x32_t { constexpr static uint32_t LIMBS = 32; using limb_type = uint8_t; + constexpr uint8x32_t() noexcept = default; + constexpr uint8x32_t(const uint16x16_t &b) noexcept; + constexpr uint8x32_t(const uint32x8_t &b) noexcept; + constexpr uint8x32_t(const uint64x4_t &b) noexcept; + constexpr uint8x32_t(const uint128x2_t &b) noexcept; + union { // compatibility with txn_t uint8_t d[32]; @@ -478,8 +520,6 @@ struct uint8x32_t { return d[i]; } - constexpr uint8x32_t() noexcept {} - /// /// \param binary /// \param hex @@ -873,6 +913,24 @@ struct uint8x32_t { return ret; } + /// wrapper around: `_mm256_blend_epi8` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint8x32_t blend(const uint8x32_t in1, + const uint8x32_t in2, + const uint8x32_t in3) noexcept { + uint8x32_t ret{}; + for (uint32_t i = 0; i < 32; i++) { + if (in3.v8[i]) { + ret.v8[i] = in1.v8[i]; + } else { + ret.v8[i] = in2.v8[i]; + } + } + return ret; + } + [[nodiscard]] constexpr static inline uint8x32_t popcnt(const uint8x32_t in) noexcept { uint8x32_t out; @@ -893,6 +951,12 @@ struct uint16x16_t { constexpr static uint32_t LIMBS = 16; using limb_type = uint16_t; + constexpr uint16x16_t() noexcept = default; + constexpr uint16x16_t(const uint8x32_t &b) noexcept; + constexpr uint16x16_t(const uint32x8_t &b) noexcept; + constexpr uint16x16_t(const uint64x4_t &b) noexcept; + constexpr uint16x16_t(const uint128x2_t &b) noexcept; + union { // compatibility with txn_t uint16_t d[16]; @@ -1297,6 +1361,25 @@ struct uint16x16_t { return ret; } + + /// wrapper around: `_mm256_blend_epi32` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint16x16_t blend(const uint16x16_t in1, + const uint16x16_t in2) noexcept { + uint16x16_t ret{}; + for (uint32_t i = 0; i < 16; i++) { + if (imm & (1u << (i%8))) { + ret.v16[i] = in2.v16[i]; + } else { + ret.v16[i] = in1.v16[i]; + } + } + return ret; + } + constexpr static inline uint16x16_t popcnt(const uint16x16_t in) noexcept { uint16x16_t out; @@ -1325,6 +1408,12 @@ struct uint32x8_t { constexpr static uint32_t LIMBS = 8; using limb_type = uint32_t; + constexpr uint32x8_t() noexcept = default; + constexpr uint32x8_t(const uint8x32_t &b) noexcept; + constexpr uint32x8_t(const uint16x16_t &b) noexcept; + constexpr uint32x8_t(const uint64x4_t &b) noexcept; + constexpr uint32x8_t(const uint128x2_t &b) noexcept; + union { // compatibility with txn_t uint32_t d[8]; @@ -1735,6 +1824,94 @@ struct uint32x8_t { return ret; } + /// + /// \tparam scale + /// \param ptr + /// \param offset + /// \param data + /// \return + template + constexpr static inline void scatter(const void *ptr, const uint32x8_t offset, const uint32x8_t data) noexcept { + static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); + const uint8_t *ptr8 = (uint8_t *) ptr; + for (uint32_t i = 0; i < 8; i++) { + *(uint32_t *) (ptr8 + offset.v32[i] * scale) = data.v32[i]; + } + } + + /// wrapper around: `_mm256_blend_epi32` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint32x8_t blend(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + for (uint32_t i = 0; i < 7; i++) { + if (imm & (1u << i)) { + ret.v32[i] = in2.v32[i]; + } else { + ret.v32[i] = in1.v32[i]; + } + } + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint32x8_t unpacklo(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v64[0] = in1.v64[0]; + ret.v64[1] = in2.v64[0]; + ret.v64[2] = in1.v64[2]; + ret.v64[3] = in2.v64[2]; + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint32x8_t unpackhi(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + ret.v64[0] = in1.v64[1]; + ret.v64[1] = in2.v64[1]; + ret.v64[2] = in1.v64[3]; + ret.v64[3] = in2.v64[3]; + return ret; + } + + /// wrapper around: `_mm256_permute2x128_si256` + /// TODO + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint32x8_t permute(const uint32x8_t in1, + const uint32x8_t in2) noexcept { + uint32x8_t ret{}; + switch (in3&0xf) { + case 0: ret.v128[0] = in1.v128[0]; break; + case 1: ret.v128[0] = in1.v128[1]; break; + case 2: ret.v128[0] = in2.v128[0]; break; + case 3: ret.v128[0] = in2.v128[1]; break; + default: ret.v128[0] = {0}; + } + + switch ((in3>>4)&0xf) { + case 0: ret.v128[1] = in1.v128[0]; break; + case 1: ret.v128[1] = in1.v128[1]; break; + case 2: ret.v128[1] = in2.v128[0]; break; + case 3: ret.v128[1] = in2.v128[1]; break; + default: ret.v128[1] = {0}; + } + return ret; + } + /// TODO /// \param in /// \param perm @@ -1777,6 +1954,12 @@ struct uint64x4_t { constexpr static uint32_t LIMBS = 4; using limb_type = uint64_t; + constexpr inline uint64x4_t() noexcept = default; + constexpr inline uint64x4_t(const uint8x32_t &b) noexcept; + constexpr inline uint64x4_t(const uint16x16_t &b) noexcept; + constexpr inline uint64x4_t(const uint32x8_t &b) noexcept; + constexpr inline uint64x4_t(const uint128x2_t &b) noexcept; + union { // compatibility with txn_t uint64_t d[4]; @@ -2087,17 +2270,6 @@ struct uint64x4_t { return out; } - /// - /// \param in1 - /// \param in2 - /// \return - constexpr static inline uint64x4_t permute(const uint64x4_t in1, - const uint32_t in2) noexcept { - uint64x4_t ret; - ASSERT(0); - return ret; - } - /// /// \param in1 /// \param in2 @@ -2219,7 +2391,70 @@ struct uint64x4_t { return ret; } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint64x4_t unpacklo(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v64[0] = in1.v64[0]; + ret.v64[1] = in2.v64[0]; + ret.v64[2] = in1.v64[2]; + ret.v64[3] = in2.v64[2]; + return ret; + } + + /// wrapper around: `_mm256_unpacklo_epi64` + /// \tparam in2 + /// \param in1 + /// \return + [[nodiscard]] constexpr static inline uint64x4_t unpackhi(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + ret.v64[0] = in1.v64[1]; + ret.v64[1] = in2.v64[1]; + ret.v64[2] = in1.v64[3]; + ret.v64[3] = in2.v64[3]; + return ret; + } + /// wrapper around: `_mm256_permute2x128_si256` + /// \tparam in2 + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static inline uint64x4_t permute(const uint64x4_t in1, + const uint64x4_t in2) noexcept { + uint64x4_t ret{}; + switch (in3&0xf) { + case 0: ret.v128[0] = in1.v128[0]; break; + case 1: ret.v128[0] = in1.v128[1]; break; + case 2: ret.v128[0] = in2.v128[0]; break; + case 3: ret.v128[0] = in2.v128[1]; break; + default: ret.v128[0] = {0}; + } + + switch ((in3>>4)&0xf) { + case 0: ret.v128[1] = in1.v128[0]; break; + case 1: ret.v128[1] = in1.v128[1]; break; + case 2: ret.v128[1] = in2.v128[0]; break; + case 3: ret.v128[1] = in2.v128[1]; break; + default: ret.v128[1] = {0}; + } + return ret; + } + /// + /// \param in1 + /// \param in2 + /// \return + constexpr static inline uint64x4_t permute(const uint64x4_t in1, + const uint32_t in2) noexcept { + uint64x4_t ret; + ASSERT(0); + return ret; + } /// TODO /// \tparam in2 /// \param in1 @@ -2261,4 +2496,67 @@ struct uint64x4_t { } }; +struct uint128x2_t { + constexpr static uint32_t LIMBS = 2; + using limb_type = __uint128_t; + + union { + // compatibility with TxN_t + __uint128_t d[2]; + + uint8_t v8[32]; + uint16_t v16[16]; + uint32_t v32[8]; + uint64_t v64[4]; + uint64x2_t v128[2]; + }; + + constexpr uint128x2_t() noexcept = default; + constexpr uint128x2_t(const uint8x32_t &b) noexcept; + constexpr uint128x2_t(const uint16x16_t &b) noexcept; + constexpr uint128x2_t(const uint32x8_t &b) noexcept; + constexpr uint128x2_t(const uint64x4_t &b) noexcept; + + [[nodiscard]] constexpr inline limb_type operator[](const uint32_t i) { + ASSERT(i < LIMBS); + return d[i]; + } + + /// NOTE: currently cannot be constexpr + /// \return + [[nodiscard]] static inline uint128x2_t random() noexcept { + uint128x2_t ret{}; + for (size_t i = 0; i < 4; ++i) { + ret.v64[i] = fastrandombytes_uint64(); + } + return ret; + } + + /// + /// \param binary + /// \param hex + constexpr inline void print(bool binary = false, bool hex = false) const; + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t slli(const uint128x2_t in1) { + uint128x2_t ret{}; + // TODO + return ret; + } + + /// wrapper around: `_mm256_bslli_epi128` + /// \tparam imm + /// \param in1 + /// \return + template + [[nodiscard]] constexpr static uint128x2_t srli(const uint128x2_t in1) { + uint128x2_t ret{}; + // TODO + return ret; + } +}; #endif diff --git a/src/simd/simd.h b/src/simd/simd.h index 19fa1740..f40b4c36 100644 --- a/src/simd/simd.h +++ b/src/simd/simd.h @@ -2559,6 +2559,9 @@ namespace cryptanalysislib { } }// namespace cryptanalysislib + + + #include "simd/bits/bits.h" #include "simd/generic.h" #include "simd/matrix/simple.h" diff --git a/tests/container/hashmap/simple.cpp b/tests/container/hashmap/simple.cpp index 8d1bdde6..3905d892 100644 --- a/tests/container/hashmap/simple.cpp +++ b/tests/container/hashmap/simple.cpp @@ -26,7 +26,7 @@ TEST(HashMap, simd) { data[j] = i; index[j] = i + 1; } - hm.insert_simd(data, index); + // TODO hm.insert_simd(data, index); } for (size_t i = 0; i < 1u << l; ++i) { From f0d1470d90b61224c7d578698f89864b5bd1c094 Mon Sep 17 00:00:00 2001 From: Floyd Date: Wed, 6 Mar 2024 20:10:33 +0100 Subject: [PATCH 3/3] fixxed buigs in neon gcc --- src/simd/matrix/simple.h | 2 +- src/simd/neon.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/simd/matrix/simple.h b/src/simd/matrix/simple.h index fbbb05d3..c7495882 100644 --- a/src/simd/matrix/simple.h +++ b/src/simd/matrix/simple.h @@ -5,7 +5,7 @@ class uint1x64x64_T { uint64_t data[64]; constexpr static void transpose(uint64_t *out, uint64_t *in) noexcept { - constexpr static uint64_t masks[6][2] = { + constexpr uint64_t masks[6][2] = { {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, diff --git a/src/simd/neon.h b/src/simd/neon.h index 30944973..8d75fa65 100644 --- a/src/simd/neon.h +++ b/src/simd/neon.h @@ -2433,7 +2433,7 @@ struct uint64x4_t { case 1: ret.v128[0] = in1.v128[1]; break; case 2: ret.v128[0] = in2.v128[0]; break; case 3: ret.v128[0] = in2.v128[1]; break; - default: ret.v128[0] = {0}; + default: ret.v64[0] = 0; ret.v64[1] = 0; } switch ((in3>>4)&0xf) { @@ -2441,7 +2441,7 @@ struct uint64x4_t { case 1: ret.v128[1] = in1.v128[1]; break; case 2: ret.v128[1] = in2.v128[0]; break; case 3: ret.v128[1] = in2.v128[1]; break; - default: ret.v128[1] = {0}; + default: ret.v64[2] = 0; ret.v64[3] = 0; } return ret; }