Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 242 additions & 2 deletions src/encauth/gcm/gcm_gf_mult.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,227 @@
*/
#include "tomcrypt_private.h"

#if defined(LTC_GCM_PCLMUL)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <cpuid.h>
#endif
#include <wmmintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>

/* Runtime check for the PCLMULQDQ instruction (CPUID.01H:ECX[bit 1]).
 * The probe runs once; the result is cached in function-local statics.
 * NOTE(review): the lazy init is not thread-safe, but racing threads all
 * compute the same value, so the worst case is a redundant CPUID.
 * @return 1 if PCLMULQDQ is available, 0 otherwise
 */
static LTC_INLINE int s_pclmul_is_supported(void)
{
   static int initialized = 0, is_supported = 0;

   if (initialized == 0) {
      /* Test CPUID.1.0.ECX[1]
       * EAX = 1, ECX = 0 */
#if defined(_MSC_VER)
      int cpuInfo[4];
      __cpuid(cpuInfo, 1);
      is_supported = ((cpuInfo[2] >> 1) & 1);
#else
      /* use the <cpuid.h> helper instead of raw asm: it checks that leaf 1
       * exists and saves/restores %ebx, which raw cpuid asm would clobber
       * (a problem for PIC builds on 32-bit x86) */
      unsigned int a = 0, b = 0, c = 0, d = 0;
      if (__get_cpuid(1, &a, &b, &c, &d)) {
         is_supported = ((c >> 1) & 1);
      }
#endif
      /* set the flag on BOTH paths -- previously it was only set in the
       * non-MSVC branch, so MSVC builds re-ran CPUID on every call */
      initialized = 1;
   }

   return is_supported;
}

/*
* 128x128-bit binary polynomial multiplication for Intel x86 and x86_64
* Based on "Intel Carry-Less Multiplication Instruction and its Usage for
* Computing the GCM Mode", Shay Gueron, Michael E. Kounavis
* https://cdrdv2-public.intel.com/836172/clmul-wp-rev-2-02-2014-04-20.pdf
*/
LTC_GCM_PCLMUL_TARGET
static void gfmul_pclmul(__m128i a, __m128i b, __m128i *res){
   /* Page 25. Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
   __m128i /*tmp0, tmp1,*/ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

   /* Schoolbook 128x128 carry-less multiply as four 64x64 PCLMULQDQ products.
    * imm8 bit 0 selects the qword of 'a', bit 4 selects the qword of 'b'. */
   tmp3 = _mm_clmulepi64_si128(a, b, 0x00);   /* a_lo * b_lo */
   tmp4 = _mm_clmulepi64_si128(a, b, 0x10);   /* a_lo * b_hi */
   tmp5 = _mm_clmulepi64_si128(a, b, 0x01);   /* a_hi * b_lo */
   tmp6 = _mm_clmulepi64_si128(a, b, 0x11);   /* a_hi * b_hi */

   /* fold the combined middle term into the 256-bit product [tmp6:tmp3] */
   tmp4 = _mm_xor_si128(tmp4, tmp5);
   tmp5 = _mm_slli_si128(tmp4, 8);
   tmp4 = _mm_srli_si128(tmp4, 8);
   tmp3 = _mm_xor_si128(tmp3, tmp5);
   tmp6 = _mm_xor_si128(tmp6, tmp4);

   /* shift the whole 256-bit product left by one bit, carrying across the
    * 32-bit lanes; this compensates for the bit-reflected representation
    * of GCM operands (per the cited white paper) */
   tmp7 = _mm_srli_epi32(tmp3, 31);
   tmp8 = _mm_srli_epi32(tmp6, 31);
   tmp3 = _mm_slli_epi32(tmp3, 1);
   tmp6 = _mm_slli_epi32(tmp6, 1);
   tmp9 = _mm_srli_si128(tmp7, 12);
   tmp8 = _mm_slli_si128(tmp8, 4);
   tmp7 = _mm_slli_si128(tmp7, 4);
   tmp3 = _mm_or_si128(tmp3, tmp7);
   tmp6 = _mm_or_si128(tmp6, tmp8);
   tmp6 = _mm_or_si128(tmp6, tmp9);

   /* reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1; the shift
    * counts 31/30/25 and 1/2/7 implement the reflected reduction from the
    * white paper's Algorithm 5 */
   tmp7 = _mm_slli_epi32(tmp3, 31);
   tmp8 = _mm_slli_epi32(tmp3, 30);
   tmp9 = _mm_slli_epi32(tmp3, 25);
   tmp7 = _mm_xor_si128(tmp7, tmp8);
   tmp7 = _mm_xor_si128(tmp7, tmp9);
   tmp8 = _mm_srli_si128(tmp7, 4);
   tmp7 = _mm_slli_si128(tmp7, 12);
   tmp3 = _mm_xor_si128(tmp3, tmp7);
   tmp2 = _mm_srli_epi32(tmp3, 1);
   tmp4 = _mm_srli_epi32(tmp3, 2);
   tmp5 = _mm_srli_epi32(tmp3, 7);
   tmp2 = _mm_xor_si128(tmp2, tmp4);
   tmp2 = _mm_xor_si128(tmp2, tmp5);
   tmp2 = _mm_xor_si128(tmp2, tmp8);
   tmp3 = _mm_xor_si128(tmp3, tmp2);
   tmp6 = _mm_xor_si128(tmp6, tmp3);

   *res = tmp6;
}

LTC_GCM_PCLMUL_TARGET
static void gcm_gf_mult_pclmul(const unsigned char *a, const unsigned char *b, unsigned char *c)
{
   /* GCM stores field elements big-endian; SSE arithmetic wants them
    * little-endian, so byte-swap on load and again on store */
   const __m128i bswap = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   __m128i x, y, r;

   x = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) a), bswap);
   y = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) b), bswap);

   gfmul_pclmul(x, y, &r);

   r = _mm_shuffle_epi8(r, bswap);
   XMEMCPY(c, &r, sizeof(r));
}
#endif // defined(LTC_GCM_PCLMUL)

#if defined(LTC_GCM_PMULL)
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wbad-function-cast"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wmissing-braces"
#pragma GCC diagnostic ignored "-Wsign-compare"
#pragma GCC diagnostic ignored "-Wshadow"
#endif
#include <arm_neon.h>
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

#if defined(__APPLE__)
#include <sys/sysctl.h>
#elif defined(_WIN32)
#include <windows.h>
#else
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif

/* Runtime check for the ARMv8 PMULL/PMULL2 polynomial-multiply extension.
 * The probe runs once; the result is cached in function-local statics.
 * NOTE(review): the lazy init is not thread-safe, but racing threads all
 * compute the same value, so the worst case is a redundant probe.
 * @return 1 if PMULL is available, 0 otherwise
 */
static LTC_INLINE int s_pmull_is_supported(void)
{
   static int initialized = 0, is_supported = 0;

   if (initialized == 0) {
#if defined(__APPLE__)
      int val = 0;
      size_t len = sizeof(val);
      /* if the sysctl is missing (key unknown on this OS release),
       * is_supported stays 0 */
      if (sysctlbyname("hw.optional.arm.FEAT_PMULL", &val, &len, NULL, 0) == 0) {
         is_supported = (val != 0);
      }
#elif defined (_WIN32)
      /* normalize the Win32 BOOL to exactly 0/1 */
      is_supported = (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0);
#else
      /* Linux: the kernel reports crypto-extension bits via the ELF auxv;
       * normalize the bitmask to 0/1 instead of storing the raw bit value */
      unsigned long hwcaps = getauxval(AT_HWCAP);
      is_supported = ((hwcaps & HWCAP_PMULL) != 0);
#endif
      initialized = 1;
   }

   return is_supported;
}

/*
* 128x128-bit binary polynomial multiplication for AArch64 using PMULL/PMULL2
* Based on "Implementing GCM on ARMv8", Conrado P. L. Gouvea and Julio Lopez
* https://conradoplg.modp.net/files/2010/12/gcm14.pdf
*/
#if defined(_MSC_VER)
#define GET_LOW_P64(x) vreinterpret_p64_u64(vcreate_u64((uint64_t)vgetq_lane_p64((x), 0)))
#else
#define GET_LOW_P64(x) vgetq_lane_p64((x), 0)
#endif

LTC_GCM_PMULL_TARGET
static void gfmul_pmull(uint8x16_t a, uint8x16_t b, uint8x16_t *res) {
   uint8x16_t r0, r1, t0, t1, z, p;
   poly64x2_t pa, pb, pt0, pr1, pp;

   z = vdupq_n_u8(0);

   pa = vreinterpretq_p64_u8(a);
   pb = vreinterpretq_p64_u8(b);

   /* Page 7. Algorithm 3 128 × 128-bit binary polynomial multiplier for ARMv8 AArch64 (PMULL) */
   /* outer products of the schoolbook multiply: r0 = a_lo*b_lo, r1 = a_hi*b_hi */
   r0 = vreinterpretq_u8_p128(vmull_p64(GET_LOW_P64(pa), GET_LOW_P64(pb)));
   r1 = vreinterpretq_u8_p128(vmull_high_p64(pa, pb));
   /* swap the 64-bit halves of b so both cross products can be formed
    * with one PMULL and one PMULL2 */
   t0 = vextq_u8(b, b, 8);
   pt0 = vreinterpretq_p64_u8(t0);

   t1 = vreinterpretq_u8_p128(vmull_p64(GET_LOW_P64(pa), GET_LOW_P64(pt0)));
   t0 = vreinterpretq_u8_p128(vmull_high_p64(pa, pt0));
   t0 = veorq_u8(t0, t1);
   /* fold the combined middle term into the 256-bit product r1:r0 */
   t1 = vextq_u8(z, t0, 8);
   r0 = veorq_u8(r0, t1);
   t1 = vextq_u8(t0, z, 8);
   r1 = veorq_u8(r1, t1);

   /* Page 8. Algorithm 5 256-bit to 128-bit GCM polynomial reduction for ARMv8 AAarch64 using PMULL */
   /* 0x87 encodes the low terms x^7 + x^2 + x + 1 of the GCM polynomial
    * x^128 + x^7 + x^2 + x + 1 */
   p = vreinterpretq_u8_u64(vdupq_n_u64(0x0000000000000087ULL));
   pp = vreinterpretq_p64_u8(p);
   pr1 = vreinterpretq_p64_u8(r1);
   /* first folding step: multiply the high 128 bits by p and fold down */
   t0 = vreinterpretq_u8_p128(vmull_high_p64(pr1, pp));
   t1 = vextq_u8(t0, z, 8);
   r1 = veorq_u8(r1, t1);
   t1 = vextq_u8(z, t0, 8);
   r0 = veorq_u8(r0, t1);
   pr1 = vreinterpretq_p64_u8(r1);

   /* second folding step leaves the fully reduced 128-bit result */
   t0 = vreinterpretq_u8_p128(vmull_p64(GET_LOW_P64(pr1), GET_LOW_P64(pp)));
   a = veorq_u8(r0, t0);

   *res = a;
}

LTC_GCM_PMULL_TARGET
static void gcm_gf_mult_pmull(const unsigned char *a, const unsigned char *b, unsigned char *c)
{
   uint8x16_t x, y, r;

   /* GCM operands are bit-reflected; vrbitq_u8 reverses the bits of every
    * byte on the way in and out so PMULL can work in natural bit order */
   x = vrbitq_u8(vld1q_u8(a));
   y = vrbitq_u8(vld1q_u8(b));

   gfmul_pmull(x, y, &r);

   r = vrbitq_u8(r);
   XMEMCPY(c, &r, sizeof(r));
}

#endif // defined(LTC_GCM_PMULL)

#if defined(LTC_GCM_TABLES) || defined(LTC_LRW_TABLES) || (defined(LTC_GCM_MODE) && defined(LTC_FAST))

/* this is x*2^128 mod p(x) ... the results are 16 bytes each stored in a packed format. Since only the
Expand Down Expand Up @@ -50,6 +271,7 @@ const unsigned char gcm_shift_table[256*2] = {

#if defined(LTC_GCM_MODE) || defined(LTC_LRW_MODE)


#ifndef LTC_FAST
/* right shift */
static void s_gcm_rightshift(unsigned char *a)
Expand All @@ -72,7 +294,7 @@ static const unsigned char poly[] = { 0x00, 0xE1 };
@param b Second value
@param c Destination for a * b
*/
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
void gcm_gf_mult_sw(const unsigned char *a, const unsigned char *b, unsigned char *c)
{
unsigned char Z[16], V[16];
unsigned char x, y, z;
Expand Down Expand Up @@ -106,7 +328,7 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *
@param b Second value
@param c Destination for a * b
*/
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
static void gcm_gf_mult_sw(const unsigned char *a, const unsigned char *b, unsigned char *c)
{
int i, j, k, u;
LTC_FAST_TYPE B[16][WPV], tmp[32 / sizeof(LTC_FAST_TYPE)], pB[16 / sizeof(LTC_FAST_TYPE)], zz, z;
Expand Down Expand Up @@ -209,5 +431,23 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *

#endif

/**
  GCM GF multiply (runtime dispatcher)
  Multiplies a by b in GF(2^128), using a hardware-accelerated
  implementation (PCLMUL on x86, PMULL on AArch64) when one was compiled
  in and the CPU supports it, otherwise the portable software version.
  @param a   The first value (16 bytes)
  @param b   The second value (16 bytes)
  @param c   [out] The destination for a * b (16 bytes)
*/
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
{
#if defined(LTC_GCM_PCLMUL)
   if (s_pclmul_is_supported()) {
      gcm_gf_mult_pclmul(a, b, c);
      return;
   }
#endif
#if defined(LTC_GCM_PMULL)
   if (s_pmull_is_supported()) {
      gcm_gf_mult_pmull(a, b, c);
      return;
   }
#endif
   gcm_gf_mult_sw(a, b, c);
}


#endif

25 changes: 24 additions & 1 deletion src/headers/tomcrypt_cfg.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ LTC_EXPORT void LTC_CALL XFREE(void *p);

LTC_EXPORT void LTC_CALL XQSORT(void *base, size_t nmemb, size_t size, int(*compar)(const void *, const void *));


/* change the clock function too */
LTC_EXPORT clock_t LTC_CALL XCLOCK(void);

Expand Down Expand Up @@ -250,6 +249,8 @@ typedef unsigned long ltc_mp_digit;
#define LTC_NO_CTZL
#define LTC_NO_ROLC
#define LTC_NO_ROTATE
#define LTC_NO_GCM_PCLMUL
#define LTC_NO_GCM_PMULL
#endif

/* No LTC_FAST if: explicitly disabled OR non-gcc/non-clang compiler OR old gcc OR using -ansi -std=c99 */
Expand Down Expand Up @@ -367,4 +368,26 @@ typedef unsigned long ltc_mp_digit;
# define LTC_ATTRIBUTE(x)
#endif

#if !defined(LTC_NO_GCM_PCLMUL) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86))
#define LTC_GCM_PCLMUL
#undef LTC_GCM_TABLES
#endif

#if defined(__clang__) || defined(__GNUC__)
#define LTC_GCM_PCLMUL_TARGET __attribute__((target("pclmul,ssse3")))
#else
#define LTC_GCM_PCLMUL_TARGET
#endif

#if !defined(LTC_NO_GCM_PMULL) && (defined(__aarch64__) || defined(_M_ARM64))
#define LTC_GCM_PMULL
#undef LTC_GCM_TABLES
#endif

#if defined(LTC_GCM_PMULL) && (defined(__clang__) || defined(__GNUC__))
#define LTC_GCM_PMULL_TARGET __attribute__((target("+crypto")))
#else
#define LTC_GCM_PMULL_TARGET
#endif

#endif /* TOMCRYPT_CFG_H */