#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
# if defined(__GNUC__)
#  include <x86intrin.h>
# endif
#endif

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <arm_acle.h>
#endif

#if (CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif
extern const char SIMON128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::vec_swap;
// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

// MSVC for 32-bit ARM lacks vld1q_dup_u64, so provide a shim.
#if defined(_MSC_VER) && !defined(_M_ARM64)
inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr)
{
    return vmovq_n_u64(*ptr);
}
#endif
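
// Descriptive note: UnpackHigh64/UnpackLow64 gather the upper and lower
// 64-bit halves of two adjacent blocks into one vector, so each uint64x2_t
// below carries the same SIMON word (x or y) from two blocks and every
// kernel works on two blocks per register.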
template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}
template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}
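
// On Aarch32/Aarch64 a rotate by 8 bits can be done as a single byte
// permute with vqtbl1q_u8, which is cheaper than the shift/shift/or
// sequence used by the generic templates above.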
#if defined(__aarch32__) || defined(__aarch64__)
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif
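
// SIMON-128 round function applied to both 64-bit lanes at once:
//   f(x) = ((x <<< 1) & (x <<< 8)) ^ (x <<< 2)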
inline uint64x2_t SIMON128_f(const uint64x2_t& val)
{
    return veorq_u64(RotateLeft64<2>(val),
        vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}
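
// Each loop iteration in the encryption kernels applies two SIMON rounds
// with the Feistel swap folded into the variable names: y takes the round
// keyed by subkeys[i], x takes the round keyed by subkeys[i+1]. The
// (rounds & 1) tail handles key schedules with an odd round count.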
inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // x1 gets the high 64-bit words of both blocks, y1 the low words.
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

    // Recombine the halves into two output blocks.
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
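
// Same round structure as SIMON128_Enc_Block, but three two-block pipes
// (six blocks total) are interleaved so more independent work is in flight
// per round-key load.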
inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1) - 1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}
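
// Decryption walks the key schedule backwards: the odd round (if any) is
// peeled off first using the last subkey, then the loop consumes the
// remaining keys in pairs down to subkeys[0].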
inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
    }

    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2));
        y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
    }

    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if (CRYPTOPP_SSSE3_AVAILABLE)

#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif

#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif
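
// CONST_DOUBLE_CAST exists for _mm_loaddup_pd in the decryption code below:
// loading a word64 subkey as a double and duplicating it broadcasts the key
// into both 64-bit lanes of an __m128i.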
inline void Swap128(__m128i& a,__m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // Early SunCC cannot std::swap an __m128i, so use the vec_swap helper.
    vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}
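
// With XOP (AMD) the rotates below compile to a single _mm_roti_epi64;
// otherwise they fall back to shift/shift/or, and rotate-by-8 to a
// PSHUFB byte shuffle.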
template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}
template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
inline __m128i SIMON128_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft64<2>(v),
        _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v)));
}
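
// Note the i*2 indexing in the encryption kernels: the forward key schedule
// is laid out with each round key duplicated in a 16-byte slot, so an
// aligned _mm_load_si128 already yields the key splatted across both lanes.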
inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*2));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);

        const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*2));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*2));

        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        Swap128(x1, y1);
    }

    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1) - 1; i += 2)
    {
        const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*2));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);

        const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*2));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*2));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}
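
// The decryption subkeys are read one word64 at a time (subkeys+i) and
// broadcast on the fly with _mm_loaddup_pd, rather than from the duplicated
// layout used by the encryption path.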
inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
    }

    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
    }

    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}
#endif  // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Altivec ***************************** //

#if (CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
#if defined(_ARCH_PWR8)
using CryptoPP::uint64x2_p;
#endif

using CryptoPP::VecAdd64;
using CryptoPP::VecSub64;
using CryptoPP::VecAnd64;
using CryptoPP::VecOr64;
using CryptoPP::VecXor64;
using CryptoPP::VecRotateLeft64;
using CryptoPP::VecRotateRight64;
using CryptoPP::VecSplatElement64;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadAligned;
using CryptoPP::VecPermute;
#if defined(_ARCH_PWR8)
#define simon128_t uint64x2_p
#else
#define simon128_t uint32x4_p
#endif
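
// On POWER8 and later simon128_t is a native 64x2 vector; on older Altivec
// targets it falls back to uint32x4_p, with the VecXor64/VecAnd64/
// VecRotateLeft64 helpers emulating the 64-bit lane operations.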
inline simon128_t SIMON128_f(const simon128_t val)
{
    return (simon128_t)VecXor64(VecRotateLeft64<2>(val),
        VecAnd64(VecRotateLeft64<1>(val), VecRotateLeft64<8>(val)));
}
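
// The m1/m2 permute masks play the role of the unpack intrinsics in the
// NEON and SSE paths: they gather the x and y words into one vector, with
// separate byte orders for big- and little-endian targets.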
inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    simon128_t x1 = (simon128_t)VecPermute(block, block, m1);
    simon128_t y1 = (simon128_t)VecPermute(block, block, m2);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
    {
        const word32* ptr1 = reinterpret_cast<const word32*>(subkeys+i*2);
        const simon128_t rk1 = (simon128_t)VecLoadAligned(ptr1);
        const word32* ptr2 = reinterpret_cast<const word32*>(subkeys+(i+1)*2);
        const simon128_t rk2 = (simon128_t)VecLoadAligned(ptr2);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk1);
        x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+(rounds-1)*2);
        const simon128_t rk = (simon128_t)VecLoadAligned(ptr);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
#endif

    block = (uint32x4_p)VecPermute(x1, y1, m3);
}
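
// For decryption each subkey is fetched with an unaligned VecLoad and then
// broadcast to both lanes with VecSplatElement64<0>/<1>.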
inline void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    simon128_t x1 = (simon128_t)VecPermute(block, block, m1);
    simon128_t y1 = (simon128_t)VecPermute(block, block, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+rounds-1);
        const simon128_t tk = (simon128_t)VecLoad(ptr);
        const simon128_t rk = (simon128_t)VecSplatElement64<0>(tk);

        y1 = VecXor64(VecXor64(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+i);
        const simon128_t tk = (simon128_t)VecLoad(ptr);
        const simon128_t rk1 = (simon128_t)VecSplatElement64<1>(tk);
        const simon128_t rk2 = (simon128_t)VecSplatElement64<0>(tk);

        x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk1);
        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
#endif

    block = (uint32x4_p)VecPermute(x1, y1, m3);
}
inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    simon128_t x1 = (simon128_t)VecPermute(block0, block1, m1);
    simon128_t y1 = (simon128_t)VecPermute(block0, block1, m2);
    simon128_t x2 = (simon128_t)VecPermute(block2, block3, m1);
    simon128_t y2 = (simon128_t)VecPermute(block2, block3, m2);
    simon128_t x3 = (simon128_t)VecPermute(block4, block5, m1);
    simon128_t y3 = (simon128_t)VecPermute(block4, block5, m2);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
    {
        const word32* ptr1 = reinterpret_cast<const word32*>(subkeys+i*2);
        const simon128_t rk1 = (simon128_t)VecLoadAligned(ptr1);

        const word32* ptr2 = reinterpret_cast<const word32*>(subkeys+(i+1)*2);
        const simon128_t rk2 = (simon128_t)VecLoadAligned(ptr2);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk1);
        y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk1);
        y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk1);

        x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk2);
        x2 = VecXor64(VecXor64(x2, SIMON128_f(y2)), rk2);
        x3 = VecXor64(VecXor64(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+(rounds-1)*2);
        const simon128_t rk = (simon128_t)VecLoadAligned(ptr);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk);
        y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk);
        y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk);

        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
inline void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    simon128_t x1 = (simon128_t)VecPermute(block0, block1, m1);
    simon128_t y1 = (simon128_t)VecPermute(block0, block1, m2);
    simon128_t x2 = (simon128_t)VecPermute(block2, block3, m1);
    simon128_t y2 = (simon128_t)VecPermute(block2, block3, m2);
    simon128_t x3 = (simon128_t)VecPermute(block4, block5, m1);
    simon128_t y3 = (simon128_t)VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);

        const word32* ptr = reinterpret_cast<const word32*>(subkeys+rounds-1);
        const simon128_t tk = (simon128_t)VecLoad(ptr);
        const simon128_t rk = (simon128_t)VecSplatElement64<0>(tk);

        y1 = VecXor64(VecXor64(y1, rk), SIMON128_f(x1));
        y2 = VecXor64(VecXor64(y2, rk), SIMON128_f(x2));
        y3 = VecXor64(VecXor64(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+i);
        const simon128_t tk = (simon128_t)VecLoad(ptr);
        const simon128_t rk1 = (simon128_t)VecSplatElement64<1>(tk);
        const simon128_t rk2 = (simon128_t)VecSplatElement64<0>(tk);

        x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk1);
        x2 = VecXor64(VecXor64(x2, SIMON128_f(y2)), rk1);
        x3 = VecXor64(VecXor64(x3, SIMON128_f(y3)), rk1);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk2);
        y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk2);
        y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
#endif  // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
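
// Exported wrappers. The AdvancedProcessBlocks128_6x2_NEON/_SSE templates
// dispatch bulk data to the 6-block kernels above and fall back to the
// 2-block kernels for the tail; the Altivec path uses the 6x1 dispatcher
// with its single-vector block kernels.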
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_SSSE3_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif  // CRYPTOPP_SSSE3_AVAILABLE
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif  // CRYPTOPP_ALTIVEC_AVAILABLE

NAMESPACE_END