cham_simd.cpp
// cham_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "cham.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both cham.cpp and cham_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
# if defined(__GNUC__)
#  include <x86intrin.h>
# endif
#endif

// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#define DOUBLE_CAST(x) ((double*)(void*)(x))
#define CONST_DOUBLE_CAST(x) ((const double*)(const void*)(x))
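
// The casts only satisfy the pointer type _mm_load_sd expects; no double
// arithmetic is performed. A minimal sketch of the intended use (the round
// functions below do exactly this to fetch two adjacent 32-bit subkeys as
// one 64-bit lane):
//
//   const CryptoPP::word32* rk = subkeys + (i & MASK);
//   const __m128i k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(rk)));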

// Squash MS LNK4221 and libtool warnings
extern const char CHAM_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word16;
using CryptoPP::word32;

#if (CRYPTOPP_SSSE3_AVAILABLE)

NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
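
// Sanity sketch for the two specializations above: rotating a 32-bit word
// by 8 bits is a pure byte permutation, so one PSHUFB replaces the
// shift/shift/or sequence in every lane at once. In scalar terms (using
// rotlConstant/rotrConstant from misc.h):
//
//   word32 x = 0xAABBCCDD;
//   word32 l = CryptoPP::rotlConstant<8>(x);  // 0xBBCCDDAA
//   word32 r = CryptoPP::rotrConstant<8>(x);  // 0xDDAABBCC
//
// The byte masks (14,13,12,15, ...) and (12,15,14,13, ...) apply the same
// cyclic reordering within each 32-bit lane of the XMM register.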

template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
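
// Taken together, UnpackXMM<0..3>(a,b,c,d) is a 4x4 transpose of 32-bit
// words followed by a byte swap within each word. A sketch of the data
// movement for four blocks A, B, C, D (words A1..A4 and so on):
//
//   input : a=[A1 A2 A3 A4]  b=[B1 B2 B3 B4]  c=[C1 C2 C3 C4]  d=[D1 D2 D3 D4]
//   output: <0>=[A1 B1 C1 D1]  <1>=[A2 B2 C2 D2]  <2>=[A3 B3 C3 D3]  <3>=[A4 B4 C4 D4]
//
// The byte swap moves between the cipher's big-endian word framing and
// SSE's little-endian arithmetic, as the comments above note.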

template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
}
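
// For a single block there is nothing to transpose: each 32-bit word is
// byte-swapped and broadcast into all four lanes, for example
//
//   v = [A1 A2 A3 A4]  ->  UnpackXMM<0>(v) = [A1 A1 A1 A1]
//
// so the round code below serves both the 1-block and 4-block paths; in the
// 1-block case three of the four lanes simply compute a discarded copy.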

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    return UnpackXMM<IDX>(a, b, c, d);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}
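
// RepackXMM can reuse UnpackXMM because the transformation is an involution:
// the word transpose and the per-word byte swap each undo themselves when
// applied a second time.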

inline void CHAM128_Enc_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+2) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
}
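
// Illustrative scalar reference for one four-round group of CHAM-128
// encryption. This is a sketch that mirrors the lane arithmetic above (it
// is not called by the SIMD code); x[] holds the four 32-bit words of one
// block, rk[] the round keys, i the round counter and mask the key mask.
inline void CHAM128_Enc_4_Rounds_Ref(word32 x[4], const word32 rk[],
    unsigned int i, unsigned int mask)
{
    using CryptoPP::rotlConstant;
    x[0] = rotlConstant<8>((x[0] ^ (i+0)) + (rotlConstant<1>(x[1]) ^ rk[(i+0) & mask]));
    x[1] = rotlConstant<1>((x[1] ^ (i+1)) + (rotlConstant<8>(x[2]) ^ rk[(i+1) & mask]));
    x[2] = rotlConstant<8>((x[2] ^ (i+2)) + (rotlConstant<1>(x[3]) ^ rk[(i+2) & mask]));
    x[3] = rotlConstant<1>((x[3] ^ (i+3)) + (rotlConstant<8>(x[0]) ^ rk[(i+3) & mask]));
}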

inline void CHAM128_Dec_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-1) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
}
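
// Illustrative scalar reference for one four-round group of CHAM-128
// decryption, mirroring the lane arithmetic above (not called by the SIMD
// code). The counter runs downward, and the two subkeys of each 64-bit
// load are consumed high word first.
inline void CHAM128_Dec_4_Rounds_Ref(word32 x[4], const word32 rk[],
    unsigned int i, unsigned int mask)
{
    using CryptoPP::rotlConstant;
    using CryptoPP::rotrConstant;
    x[3] = (rotrConstant<1>(x[3]) - (rotlConstant<8>(x[0]) ^ rk[(i-0) & mask])) ^ (i-0);
    x[2] = (rotrConstant<8>(x[2]) - (rotlConstant<1>(x[3]) ^ rk[(i-1) & mask])) ^ (i-1);
    x[1] = (rotrConstant<1>(x[1]) - (rotlConstant<8>(x[2]) ^ rk[(i-2) & mask])) ^ (i-2);
    x[0] = (rotrConstant<8>(x[0]) - (rotlConstant<1>(x[1]) ^ rk[(i-3) & mask])) ^ (i-3);
}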

inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+2) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}

inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-1) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}

NAMESPACE_END // W32

#endif // CRYPTOPP_SSSE3_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END
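
// Typical call path (an illustrative sketch, assuming the SSSE3 dispatch in
// cham.cpp): CHAM128's AdvancedProcessBlocks() forwards to the two helpers
// above when the CPU reports SSSE3, so ordinary mode code exercises them
// transparently. With hypothetical key/in/out buffers:
//
//   CryptoPP::ECB_Mode<CryptoPP::CHAM128>::Encryption enc(key, 16);
//   enc.ProcessData(out, in, 64);  // four 16-byte blocks per 4x1 SSE pass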