speck128_simd.cpp
// speck128_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "speck.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both speck.cpp and speck_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
# if defined(__GNUC__)
#  include <x86intrin.h>
# endif
#endif

#if (CRYPTOPP_ARM_NEON_HEADER)
# include "adv_simd.h"
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(_M_ARM64)
# include "adv_simd.h"
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SPECK128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

// Missing from Microsoft's ARM A-32 implementation
#if defined(_MSC_VER) && !defined(_M_ARM64)
inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr)
{
    return vmovq_n_u64(*ptr);
}
#endif

template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}

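// Rotating a 64-bit lane by 8 bits moves whole bytes only, so the
// specializations below replace the shift/shift/or sequence with a single
// byte permute (vqtbl1q_u8) driven by a fixed index mask.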
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif

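// Each loop iteration below applies one SPECK-128 encryption round to the
// transposed word pairs: x = ((x >>> 8) + y) ^ k;  y = (y <<< 3) ^ x.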
inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x1 = vaddq_u64(x1, y1);
        x1 = veorq_u64(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = veorq_u64(y1, x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}

inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = vaddq_u64(x1, y1);
        x2 = vaddq_u64(x2, y2);
        x3 = vaddq_u64(x3, y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

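// Decryption walks the key schedule backwards and inverts each round:
// y = (y ^ x) >>> 3;  x = ((x ^ k) - y) <<< 8.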
inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = veorq_u64(x1, rk);
        x1 = vsubq_u64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}

inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        x1 = vsubq_u64(x1, y1);
        x2 = vsubq_u64(x2, y2);
        x3 = vsubq_u64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif

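// The casts above let a word64 subkey pointer be handed to _mm_loaddup_pd,
// which expects a const double*; going through void* avoids the GCC
// type-punning warning discussed in the linked thread.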
template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

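// Note: in the forward direction the key schedule is "pre-splated", i.e. each
// 64-bit round key is stored twice so the aligned 128-bit load at subkeys+i*2
// already has the key in both lanes. Decryption below instead broadcasts a
// single key per round with _mm_loaddup_pd.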
inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*2));

        x1 = RotateRight64<8>(x1);
        x1 = _mm_add_epi64(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*2));

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = _mm_add_epi64(x1, y1);
        x2 = _mm_add_epi64(x2, y2);
        x3 = _mm_add_epi64(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x2 = _mm_sub_epi64(x2, y2);
        x3 = _mm_sub_epi64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

// Altivec uses native 64-bit types on 64-bit environments, or 32-bit types
// in 32-bit environments. Speck128 will use the appropriate type for the
// environment. Functions like VecAdd64 have two overloads, one for each
// environment. The 32-bit overload treats uint32x4_p like a 64-bit type,
// and does things like perform an add with carry or subtract with borrow.

// Speck128 on Power8 performed as expected because of the 64-bit environment.
// Performance sucked on old PowerPC machines because of 32-bit environments.
// At Crypto++ 8.3 we added an implementation that operated on 32-bit words.
// Native 64-bit Speck128 performance dropped from about 4.1 to 6.3 cpb, but
// 32-bit Speck128 improved from 66.5 cpb to 10.4 cpb. Overall it was a
// good win even though we lost some performance in 64-bit environments.

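// For illustration only (not the ppc_simd.h code): emulating one 64-bit lane
// addition with 32-bit words, as described above, amounts to
//   lo = lo_a + lo_b;  carry = (lo < lo_a);  hi = hi_a + hi_b + carry;
// with the matching borrow for subtraction.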
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
#if defined(_ARCH_PWR8)
using CryptoPP::uint64x2_p;
#endif

using CryptoPP::VecAdd64;
using CryptoPP::VecSub64;
using CryptoPP::VecAnd64;
using CryptoPP::VecOr64;
using CryptoPP::VecXor64;
using CryptoPP::VecSplatWord64;
using CryptoPP::VecRotateLeft64;
using CryptoPP::VecRotateRight64;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadAligned;
using CryptoPP::VecPermute;

#if defined(_ARCH_PWR8)
#define speck128_t uint64x2_p
#else
#define speck128_t uint32x4_p
#endif

void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    speck128_t x1 = (speck128_t)VecPermute(block, block, m1);
    speck128_t y1 = (speck128_t)VecPermute(block, block, m2);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+i*2);
        const speck128_t rk = (speck128_t)VecLoadAligned(ptr);

        x1 = (speck128_t)VecRotateRight64<8>(x1);
        x1 = (speck128_t)VecAdd64(x1, y1);
        x1 = (speck128_t)VecXor64(x1, rk);

        y1 = (speck128_t)VecRotateLeft64<3>(y1);
        y1 = (speck128_t)VecXor64(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    speck128_t x1 = (speck128_t)VecPermute(block, block, m1);
    speck128_t y1 = (speck128_t)VecPermute(block, block, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const speck128_t rk = (speck128_t)VecSplatWord64(subkeys[i]);

        y1 = (speck128_t)VecXor64(y1, x1);
        y1 = (speck128_t)VecRotateRight64<3>(y1);
        x1 = (speck128_t)VecXor64(x1, rk);
        x1 = (speck128_t)VecSub64(x1, y1);
        x1 = (speck128_t)VecRotateLeft64<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    speck128_t x1 = (speck128_t)VecPermute(block0, block1, m1);
    speck128_t y1 = (speck128_t)VecPermute(block0, block1, m2);
    speck128_t x2 = (speck128_t)VecPermute(block2, block3, m1);
    speck128_t y2 = (speck128_t)VecPermute(block2, block3, m2);
    speck128_t x3 = (speck128_t)VecPermute(block4, block5, m1);
    speck128_t y3 = (speck128_t)VecPermute(block4, block5, m2);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+i*2);
        const speck128_t rk = (speck128_t)VecLoadAligned(ptr);

        x1 = (speck128_t)VecRotateRight64<8>(x1);
        x2 = (speck128_t)VecRotateRight64<8>(x2);
        x3 = (speck128_t)VecRotateRight64<8>(x3);
        x1 = (speck128_t)VecAdd64(x1, y1);
        x2 = (speck128_t)VecAdd64(x2, y2);
        x3 = (speck128_t)VecAdd64(x3, y3);
        x1 = (speck128_t)VecXor64(x1, rk);
        x2 = (speck128_t)VecXor64(x2, rk);
        x3 = (speck128_t)VecXor64(x3, rk);

        y1 = (speck128_t)VecRotateLeft64<3>(y1);
        y2 = (speck128_t)VecRotateLeft64<3>(y2);
        y3 = (speck128_t)VecRotateLeft64<3>(y3);
        y1 = (speck128_t)VecXor64(y1, x1);
        y2 = (speck128_t)VecXor64(y2, x2);
        y3 = (speck128_t)VecXor64(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    speck128_t x1 = (speck128_t)VecPermute(block0, block1, m1);
    speck128_t y1 = (speck128_t)VecPermute(block0, block1, m2);
    speck128_t x2 = (speck128_t)VecPermute(block2, block3, m1);
    speck128_t y2 = (speck128_t)VecPermute(block2, block3, m2);
    speck128_t x3 = (speck128_t)VecPermute(block4, block5, m1);
    speck128_t y3 = (speck128_t)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const speck128_t rk = (speck128_t)VecSplatWord64(subkeys[i]);

        y1 = (speck128_t)VecXor64(y1, x1);
        y2 = (speck128_t)VecXor64(y2, x2);
        y3 = (speck128_t)VecXor64(y3, x3);
        y1 = (speck128_t)VecRotateRight64<3>(y1);
        y2 = (speck128_t)VecRotateRight64<3>(y2);
        y3 = (speck128_t)VecRotateRight64<3>(y3);

        x1 = (speck128_t)VecXor64(x1, rk);
        x2 = (speck128_t)VecXor64(x2, rk);
        x3 = (speck128_t)VecXor64(x3, rk);
        x1 = (speck128_t)VecSub64(x1, y1);
        x2 = (speck128_t)VecSub64(x2, y2);
        x3 = (speck128_t)VecSub64(x3, y3);
        x1 = (speck128_t)VecRotateLeft64<8>(x1);
        x2 = (speck128_t)VecRotateLeft64<8>(x2);
        x3 = (speck128_t)VecRotateLeft64<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

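// The functions below are the entry points visible outside this translation
// unit. The AdvancedProcessBlocks128_6x2_* and _6x1_* templates from
// adv_simd.h drive the wide (6-block) and narrow block workers defined above
// across the input stream.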
// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if (CRYPTOPP_SSSE3_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Altivec ***************************** //

#if (CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE

NAMESPACE_END