#ifndef CRYPTOPP_IMPORTS

#include "gf2n.h"
#if (CRYPTOPP_CLMUL_AVAILABLE)
# include <emmintrin.h>
# include <wmmintrin.h>
#endif
#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# include "arm_simd.h"
#endif
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

extern const char GF2N_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN
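// Each back end below (ARMv8 PMULL, x86 CLMUL, POWER8 VMULL) supplies the
// same set of helpers: a 128x128-bit carry-less multiply, a 256x256-bit
// Karatsuba multiply and a 256-bit square built on top of it, and a
// reduction that folds the double-width product back into 233 bits.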
#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
// c1:c0 = a * b
inline void
F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0,
    const uint64x2_t& a, const uint64x2_t& b)
{
    uint64x2_t t1, t2, z0={0};

    c0 = PMULL_00(a, b);
    c1 = PMULL_11(a, b);
    t1 = vmovq_n_u64(vgetq_lane_u64(a, 1));
    t1 = veorq_u64(a, t1);
    t2 = vmovq_n_u64(vgetq_lane_u64(b, 1));
    t2 = veorq_u64(b, t2);
    t1 = PMULL_00(t1, t2);
    t1 = veorq_u64(c0, t1);
    t1 = veorq_u64(c1, t1);
    t2 = t1;
    t1 = vextq_u64(z0, t1, 1);
    t2 = vextq_u64(t2, z0, 1);
    c0 = veorq_u64(c0, t1);
    c1 = veorq_u64(c1, t2);
}
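// c3:c2:c1:c0 = (a1:a0) * (b1:b0). One level of Karatsuba over GF(2)[x]:
// the low product a0*b0, the high product a1*b1, and the middle term
// (a0^a1)*(b0^b1) are combined with XOR only, since addition and
// subtraction both reduce to XOR in characteristic 2.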
inline void
F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0,
    const uint64x2_t& b1, const uint64x2_t& b0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    uint64x2_t c4, c5;
    uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0);
    F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1);

    x0 = veorq_u64(x0, x1);
    y0 = veorq_u64(y0, y1);

    F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0);

    c4 = veorq_u64(c4, c0);
    c4 = veorq_u64(c4, c2);
    c5 = veorq_u64(c5, c1);
    c5 = veorq_u64(c5, c3);
    c1 = veorq_u64(c1, c4);
    c2 = veorq_u64(c2, c5);
}
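// c3:c2:c1:c0 = (a1:a0)^2. Squaring is linear over GF(2), so the cross
// terms of the product cancel and the square is just the four 64x64
// carry-less self-products of the limbs, whose bits land in the even
// positions of the result.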
inline void
F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1,
    uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    c0 = PMULL_00(a0, a0);
    c1 = PMULL_11(a0, a0);
    c2 = PMULL_00(a1, a1);
    c3 = PMULL_11(a1, a1);
}
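// Left-shift the 128-bit value x by N bits (0 < N < 64), carrying the top
// N bits of the low 64-bit lane into the bottom of the high lane.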
template <unsigned int N>
inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x)
{
    uint64x2_t u=x, v, z={0};
    x = vshlq_n_u64(x, N);
    u = vshrq_n_u64(u, (64-N));
    v = vcombine_u64(vget_low_u64(z), vget_low_u64(u));
    x = vorrq_u64(x, v);
    return x;
}
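// Reduce the double-width product c3:c2:c1:c0 (degree at most 464) and
// leave the 233-bit result in c1:c0. Assuming the reduction polynomial is
// the NIST B-233/K-233 trinomial x^233 + x^74 + 1, which matches the
// GF2NT_233 name and the constants used here: the 23-bit shifts align bit
// 233 with a 256-bit boundary (256 - 233 = 23), the 10-bit shifts account
// for the x^74 term (74 = 64 + 10), and the final mask keeps the low
// 233 - 128 = 105 bits of c1.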
inline void
GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0)
{
    const unsigned int mask[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff,
    };

    uint64x2_t b3, b2, b1, a1, a0, m0, z0={0};
    m0 = vreinterpretq_u64_u32(vld1q_u32(mask));
    b1 = c1; a1 = c1;
    a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0));
    a1 = vshlq_n_u64(a1, 23);
    a1 = vshrq_n_u64(a1, 23);
    c1 = vorrq_u64(a1, a0);
    b2 = vshrq_n_u64(c2, (64-23));
    c3 = ShiftLeft128_ARMv8<23>(c3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    c3 = vorrq_u64(c3, a0);
    b1 = vshrq_n_u64(b1, (64-23));
    c2 = ShiftLeft128_ARMv8<23>(c2);
    a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0));
    c2 = vorrq_u64(c2, a0);

    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, a0);
    a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0));
    b3 = veorq_u64(b3, a0);
    b1 = vshrq_n_u64(b3, (64-23));
    b3 = ShiftLeft128_ARMv8<23>(b3);
    b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b1);
    c2 = veorq_u64(c2, b3);

    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b2);

    b2 = c2;
    b2 = ShiftLeft128_ARMv8<10>(b2);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2));
    c2 = veorq_u64(c2, a0);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3));
    a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    a0 = vorrq_u64(a0, a1);
    c3 = veorq_u64(c3, a0);
    c0 = veorq_u64(c0, c2);
    c1 = veorq_u64(c1, c3);
    c1 = vandq_u64(c1, m0);
}

#endif  // CRYPTOPP_ARM_PMULL_AVAILABLE
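// The SSE routines below are instruction-for-instruction analogues of the
// NEON routines above: PMULL_00/PMULL_11 correspond to
// _mm_clmulepi64_si128 with selectors 0x00/0x11, and the vcombine/vext
// lane moves correspond to the unpack and byte-shift intrinsics.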
#if (CRYPTOPP_CLMUL_AVAILABLE)
// c1:c0 = a * b
inline void
F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0,
    const __m128i& a, const __m128i& b)
{
    __m128i t1, t2;

    c0 = _mm_clmulepi64_si128(a, b, 0x00);
    c1 = _mm_clmulepi64_si128(a, b, 0x11);
    t1 = _mm_shuffle_epi32(a, 0xEE);
    t1 = _mm_xor_si128(a, t1);
    t2 = _mm_shuffle_epi32(b, 0xEE);
    t2 = _mm_xor_si128(b, t2);
    t1 = _mm_clmulepi64_si128(t1, t2, 0x00);
    t1 = _mm_xor_si128(c0, t1);
    t1 = _mm_xor_si128(c1, t1);
    t2 = t1;
    t1 = _mm_slli_si128(t1, 8);
    t2 = _mm_srli_si128(t2, 8);
    c0 = _mm_xor_si128(c0, t1);
    c1 = _mm_xor_si128(c1, t2);
}
// c3:c2:c1:c0 = (a1:a0) * (b1:b0), Karatsuba as above
inline void
F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0,
    const __m128i& b1, const __m128i& b0, const __m128i& a1, const __m128i& a0)
{
    __m128i c4, c5;
    __m128i x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0);
    F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1);

    x0 = _mm_xor_si128(x0, x1);
    y0 = _mm_xor_si128(y0, y1);

    F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0);

    c4 = _mm_xor_si128(c4, c0);
    c4 = _mm_xor_si128(c4, c2);
    c5 = _mm_xor_si128(c5, c1);
    c5 = _mm_xor_si128(c5, c3);
    c1 = _mm_xor_si128(c1, c4);
    c2 = _mm_xor_si128(c2, c5);
}
// c3:c2:c1:c0 = (a1:a0)^2
inline void
F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1,
    __m128i& c0, const __m128i& a1, const __m128i& a0)
{
    c0 = _mm_clmulepi64_si128(a0, a0, 0x00);
    c1 = _mm_clmulepi64_si128(a0, a0, 0x11);
    c2 = _mm_clmulepi64_si128(a1, a1, 0x00);
    c3 = _mm_clmulepi64_si128(a1, a1, 0x11);
}
// Left-shift the 128-bit value x by N bits (0 < N < 64); z must be zero
template <unsigned int N>
inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z)
{
    __m128i u=x, v;
    x = _mm_slli_epi64(x, N);
    u = _mm_srli_epi64(u, (64-N));
    v = _mm_unpacklo_epi64(z, u);
    x = _mm_or_si128(x, v);
    return x;
}
// Reduce c3:c2:c1:c0 into c1:c0, as in GF2NT_233_Reduce_ARMv8 above
inline void
GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0)
{
    const unsigned int m[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff
    };

    __m128i b3, b2, b1, a1, a0, m0, z0;
    m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
    z0 = _mm_setzero_si128();
    b1 = c1; a1 = c1;
    a0 = _mm_move_epi64(c1);
    a1 = _mm_slli_epi64(a1, 23);
    a1 = _mm_srli_epi64(a1, 23);
    c1 = _mm_or_si128(a1, a0);
    b2 = _mm_srli_epi64(c2, (64-23));
    c3 = ShiftLeft128_SSE<23>(c3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    c3 = _mm_or_si128(c3, a0);
    b1 = _mm_srli_epi64(b1, (64-23));
    c2 = ShiftLeft128_SSE<23>(c2, z0);
    a0 = _mm_unpackhi_epi64(b1, z0);
    c2 = _mm_or_si128(c2, a0);

    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, a0);
    a0 = _mm_unpackhi_epi64(c3, z0);
    b3 = _mm_xor_si128(b3, a0);
    b1 = _mm_srli_epi64(b3, (64-23));
    b3 = ShiftLeft128_SSE<23>(b3, z0);
    b3 = _mm_unpackhi_epi64(b3, z0);
    b3 = _mm_or_si128(b3, b1);
    c2 = _mm_xor_si128(c2, b3);

    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    b2 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, b2);

    b2 = c2;
    b2 = ShiftLeft128_SSE<10>(b2, z0);
    a0 = _mm_unpacklo_epi64(z0, b2);
    c2 = _mm_xor_si128(c2, a0);
    a0 = _mm_unpacklo_epi64(z0, b3);
    a1 = _mm_unpackhi_epi64(b2, z0);
    a0 = _mm_or_si128(a0, a1);
    c3 = _mm_xor_si128(c3, a0);
    c0 = _mm_xor_si128(c0, c2);
    c1 = _mm_xor_si128(c1, c3);
    c1 = _mm_and_si128(c1, m0);
}

#endif  // CRYPTOPP_CLMUL_AVAILABLE
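// A POWER8 VMULL implementation follows, but note the "&& 0" in its guard:
// the Power back end is currently compiled out even when
// CRYPTOPP_POWER8_VMULL_AVAILABLE is defined.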
#if (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0

using CryptoPP::uint8x16_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecLoad;
using CryptoPP::VecStore;

using CryptoPP::VecOr;
using CryptoPP::VecXor;
using CryptoPP::VecAnd;

using CryptoPP::VecPermute;
using CryptoPP::VecMergeLow;
using CryptoPP::VecMergeHigh;
using CryptoPP::VecShiftLeft;
using CryptoPP::VecShiftRight;

using CryptoPP::VecIntelMultiply00;
using CryptoPP::VecIntelMultiply11;
// c1:c0 = a * b
inline void
F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0,
    const uint64x2_p& a, const uint64x2_p& b)
{
    uint64x2_p t1, t2;
    const uint64x2_p z0={0};
    c0 = VecIntelMultiply00(a, b);
    c1 = VecIntelMultiply11(a, b);
    t1 = VecMergeLow(a, a);
    t1 = VecXor(a, t1);
    t2 = VecMergeLow(b, b);
    t2 = VecXor(b, t2);
    t1 = VecIntelMultiply00(t1, t2);
    t1 = VecXor(c0, t1);
    t1 = VecXor(c1, t1);
    t2 = t1;
    t1 = VecMergeHigh(z0, t1);
    t2 = VecMergeLow(t2, z0);
    c0 = VecXor(c0, t1);
    c1 = VecXor(c1, t2);
}
inline void
F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0,
    const uint64x2_p& b1, const uint64x2_p& b0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    uint64x2_p c4, c5;
    uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1;
    F2N_Multiply_128x128_POWER8(c1, c0, x0, y0);
    F2N_Multiply_128x128_POWER8(c3, c2, x1, y1);
    x0 = VecXor(x0, x1);
    y0 = VecXor(y0, y1);
    F2N_Multiply_128x128_POWER8(c5, c4, x0, y0);
    c4 = VecXor(c4, c0);
    c4 = VecXor(c4, c2);
    c5 = VecXor(c5, c1);
    c5 = VecXor(c5, c3);
    c1 = VecXor(c1, c4);
    c2 = VecXor(c2, c5);
}
// c3:c2:c1:c0 = (a1:a0)^2
inline void
F2N_Square_256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1,
    uint64x2_p& c0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    c0 = VecIntelMultiply00(a0, a0);
    c1 = VecIntelMultiply11(a0, a0);
    c2 = VecIntelMultiply00(a1, a1);
    c3 = VecIntelMultiply11(a1, a1);
}
// Left-shift the 128-bit value x by N bits (0 < N < 64)
template <unsigned int N>
inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x)
{
    const uint64x2_p z={0};
    uint64x2_p u=x, v;
    x = VecShiftLeft<N>(x);
    u = VecShiftRight<64-N>(u);
    v = VecMergeHigh(z, u);
    x = VecOr(x, v);
    return x;
}
inline void
GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0)
{
    const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)};
    const uint64x2_p m0 = (uint64x2_p)VecLoad(mod);

    uint64x2_p b3, b2, b1, a1, a0;
    const uint64x2_p z0={0};

    b1 = c1; a1 = c1;
    a0 = VecMergeHigh(c1, z0);
    a1 = VecShiftLeft<23>(a1);
    a1 = VecShiftRight<23>(a1);
    c1 = VecOr(a1, a0);
    b2 = VecShiftRight<64-23>(c2);
    c3 = ShiftLeft128_POWER8<23>(c3);
    a0 = VecMergeLow(b2, z0);
    c3 = VecOr(c3, a0);
    b1 = VecShiftRight<64-23>(b1);
    c2 = ShiftLeft128_POWER8<23>(c2);
    a0 = VecMergeLow(b1, z0);
    c2 = VecOr(c2, a0);

    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    a0 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, a0);
    a0 = VecMergeLow(c3, z0);
    b3 = VecXor(b3, a0);
    b1 = VecShiftRight<64-23>(b3);
    b3 = ShiftLeft128_POWER8<23>(b3);
    b3 = VecMergeLow(b3, z0);
    b3 = VecOr(b3, b1);
    c2 = VecXor(c2, b3);

    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    b2 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, b2);

    b2 = c2;
    b2 = ShiftLeft128_POWER8<10>(b2);
    a0 = VecMergeHigh(z0, b2);
    c2 = VecXor(c2, a0);
    a0 = VecMergeHigh(z0, b3);
    a1 = VecMergeLow(b2, z0);
    a0 = VecOr(a0, a1);
    c3 = VecXor(c3, a0);
    c0 = VecXor(c0, c2);
    c1 = VecXor(c1, c3);
    c1 = VecAnd(c1, m0);
}

#endif  // CRYPTOPP_POWER8_VMULL_AVAILABLE
ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
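// Exported routines. Each loads the 233-bit operands as two 128-bit
// vectors, forms the 256x256-bit carry-less product or the 256-bit
// square, reduces it, and stores only c1:c0, which hold the reduced
// 233-bit result.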
#if (CRYPTOPP_CLMUL_AVAILABLE)
void
GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC)
{
    enum {S=sizeof(__m128i)/sizeof(word)};
    __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S));
    __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S));
    __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+0*S));
    __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+1*S));

    __m128i c0, c1, c2, c3;
    F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1);
}
void
GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC)
{
    enum {S=sizeof(__m128i)/sizeof(word)};
    __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S));
    __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S));

    __m128i c0, c1, c2, c3;
    F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1);
}
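// The ARMv8 and POWER8 wrappers below follow the same pattern as the
// CLMUL wrappers above; they differ only in the load/store intrinsics
// and, for POWER8, in the byte permutes applied on big-endian targets.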
#elif (CRYPTOPP_ARM_PMULL_AVAILABLE)
void
GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC)
{
    // word is 32-bit or 64-bit depending on the platform; load through
    // 32-bit pointers and reinterpret the vectors as 64x2.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB);

    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
    uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0));
    uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}
void
GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC)
{
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}
#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0
void
GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC)
{
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    const byte* pBB = reinterpret_cast<const byte*>(pB);

    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
    uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0);
    uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16);
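    // The permute mask below swaps the two 32-bit words within each 64-bit
    // lane; on big-endian targets this appears to convert between the
    // stored word order and the lane order the VMULL helpers expect.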
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
    b0 = VecPermute(b0, m);
    b1 = VecPermute(b1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}
void
GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC)
{
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}

#endif

NAMESPACE_END

#endif  // CRYPTOPP_IMPORTS