#if defined(CRYPTOPP_DISABLE_SHA_ASM)
# undef CRYPTOPP_X86_ASM_AVAILABLE
# undef CRYPTOPP_X32_ASM_AVAILABLE
# undef CRYPTOPP_X64_ASM_AVAILABLE
# undef CRYPTOPP_SSE2_ASM_AVAILABLE
#endif

#if (CRYPTOPP_SHANI_AVAILABLE)
# include <nmmintrin.h>
# include <immintrin.h>
#endif

#if (CRYPTOPP_BOOL_ARMV8)
# if (CRYPTOPP_ARM_NEON_HEADER)
#  include <arm_neon.h>
# endif
# if (CRYPTOPP_ARM_ACLE_HEADER)
#  include <arm_acle.h>
# endif
#endif

#if CRYPTOPP_POWER8_SHA_AVAILABLE
# include "ppc_simd.h"
#endif

#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif

#ifndef EXCEPTION_EXECUTE_HANDLER
# define EXCEPTION_EXECUTE_HANDLER 1
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SHA_SIMD_FNAME[] = __FILE__;

NAMESPACE_BEGIN(CryptoPP)
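// SHA-256 and SHA-512 round constant tables. Only the declarations are
// needed here; the definitions live with the portable SHA implementation.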
extern const word32 SHA256_K[64];
extern const word64 SHA512_K[80];
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
    typedef void (*SigHandler)(int);

    static jmp_buf s_jmpSIGILL;
    static void SigIllHandler(int)
    {
        longjmp(s_jmpSIGILL, 1);
    }
#endif
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
// Probe for ARMv8 SHA-1 instructions using a SIGILL/SEH guard.
bool CPU_ProbeSHA1()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
        uint32x4_t data1 = vld1q_u32(w+0);
        uint32x4_t data2 = vld1q_u32(w+4);
        uint32x4_t data3 = vld1q_u32(w+8);
        uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
        uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
        uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
        uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
        uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else
    // longjmp and clobber warnings. Volatile is required.
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        signal(SIGILL, oldHandler);
        return false;
    }

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
        uint32x4_t data1 = vld1q_u32(w+0);
        uint32x4_t data2 = vld1q_u32(w+4);
        uint32x4_t data3 = vld1q_u32(w+8);
        uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
        uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
        uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
        uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
        uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_SHA1_AVAILABLE
}
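// Probe for ARMv8 SHA-256 instructions. The strategy mirrors CPU_ProbeSHA1:
// execute the instructions and treat an illegal-instruction fault (SIGILL,
// or a structured exception under MSVC) as "not available".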
bool CPU_ProbeSHA256()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
        uint32x4_t data1 = vld1q_u32(w+0);
        uint32x4_t data2 = vld1q_u32(w+4);
        uint32x4_t data3 = vld1q_u32(w+8);
        uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
        uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
        uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
        uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else
    // longjmp and clobber warnings. Volatile is required.
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        signal(SIGILL, oldHandler);
        return false;
    }

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
        uint32x4_t data1 = vld1q_u32(w+0);
        uint32x4_t data2 = vld1q_u32(w+4);
        uint32x4_t data3 = vld1q_u32(w+8);
        uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
        uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
        uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
        uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_SHA2_AVAILABLE
}
#endif  // CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8
#if CRYPTOPP_SHANI_AVAILABLE
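// ***************** Intel x86 SHA ********************

// SHA-1 compression of full 64-byte blocks using the x86 SHA extensions
// (SHA-NI). The ByteOrder argument selects the byte-shuffle mask, so the
// caller can supply message words in either endian order.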
void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state);
    CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);

    __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
    __m128i MASK, MSG0, MSG1, MSG2, MSG3;

    // Load initial values
    ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
    E0 = _mm_set_epi32(state[4], 0, 0, 0);
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);

    // Data arrangement depends on the byte order of the message words
    MASK = (order == BIG_ENDIAN_ORDER) ?
        _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
        _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;

    while (length >= SHA1::BLOCKSIZE)
    {
        // Save current state
        ABCD_SAVE = ABCD;
        E0_SAVE = E0;

        // Rounds 0-3
        MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
        MSG0 = _mm_shuffle_epi8(MSG0, MASK);
        E0 = _mm_add_epi32(E0, MSG0);
        E1 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);

        // Rounds 4-7
        MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
        MSG1 = _mm_shuffle_epi8(MSG1, MASK);
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);

        // Rounds 8-11
        MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
        MSG2 = _mm_shuffle_epi8(MSG2, MASK);
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 12-15
        MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
        MSG3 = _mm_shuffle_epi8(MSG3, MASK);
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 16-19
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 20-23
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 24-27
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 28-31
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 32-35
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 36-39
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 40-43
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 44-47
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 48-51
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 52-55
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 56-59
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 60-63
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 64-67
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 68-71
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 72-75
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);

        // Rounds 76-79
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);

        // Add values back to state
        E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
        ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);

        data += SHA1::BLOCKSIZE/sizeof(word32);
        length -= SHA1::BLOCKSIZE;
    }

    // Save state
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
    _mm_storeu_si128(M128_CAST(state), ABCD);
    state[4] = _mm_extract_epi32(E0, 3);
}
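// SHA-256 compression using the x86 SHA extensions. The working variables
// are kept in the ABEF/CDGH arrangement expected by _mm_sha256rnds2_epu32,
// so the state is permuted on entry and permuted back before it is stored.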
void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state);
    CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);

    __m128i STATE0, STATE1;
    __m128i MSG, TMP, MASK;
    __m128i TMSG0, TMSG1, TMSG2, TMSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;

    // Load initial values
    TMP = _mm_loadu_si128(M128_CAST(&state[0]));
    STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));

    // Data arrangement depends on the byte order of the message words
    MASK = (order == BIG_ENDIAN_ORDER) ?
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
        _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;

    // Rearrange the state into the ABEF/CDGH form used by the SHA rounds
    TMP = _mm_shuffle_epi32(TMP, 0xB1);
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0);

    while (length >= SHA256::BLOCKSIZE)
    {
        // Save current state
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Rounds 0-3
        MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
        TMSG0 = _mm_shuffle_epi8(MSG, MASK);
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 4-7
        TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
        TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 8-11
        TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
        TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 12-15
        TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
        TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 16-19
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 20-23
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 24-27
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 28-31
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 32-35
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 36-39
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 40-43
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 44-47
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 48-51
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 52-55
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 56-59
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 60-63
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Add values back to state
        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);

        data += SHA256::BLOCKSIZE/sizeof(word32);
        length -= SHA256::BLOCKSIZE;
    }

    // Restore the ABCD/EFGH ordering and save state
    TMP = _mm_shuffle_epi32(STATE0, 0x1B);
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0);
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);

    _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
    _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
}
#endif  // CRYPTOPP_SHANI_AVAILABLE
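// ***************** ARMv8 SHA ********************

// SHA-1 compression using the ARMv8 Crypto extensions. The 80 rounds are
// processed four at a time by the vsha1cq/vsha1pq/vsha1mq instructions.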
#if CRYPTOPP_ARM_SHA1_AVAILABLE
void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state);
    CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);

    uint32x4_t C0, C1, C2, C3;
    uint32x4_t ABCD, ABCD_SAVED;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1;
    uint32_t E0, E0_SAVED, E1;

    // Load initial values
    C0 = vdupq_n_u32(0x5A827999);
    C1 = vdupq_n_u32(0x6ED9EBA1);
    C2 = vdupq_n_u32(0x8F1BBCDC);
    C3 = vdupq_n_u32(0xCA62C1D6);

    ABCD = vld1q_u32(&state[0]);
    E0 = state[4];

    while (length >= SHA1::BLOCKSIZE)
    {
        // Save current state
        ABCD_SAVED = ABCD;
        E0_SAVED = E0;

        MSG0 = vld1q_u32(data + 0);
        MSG1 = vld1q_u32(data + 4);
        MSG2 = vld1q_u32(data + 8);
        MSG3 = vld1q_u32(data + 12);

        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
        {
            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
        }

        TMP0 = vaddq_u32(MSG0, C0);
        TMP1 = vaddq_u32(MSG1, C0);

        // Rounds 0-3
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C0);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 4-7
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C0);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 8-11
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C0);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 12-15
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C1);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 16-19
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C1);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 20-23
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C1);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 24-27
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C1);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 28-31
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C1);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 32-35
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C2);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 36-39
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C2);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 40-43
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C2);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 44-47
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C2);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 48-51
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C2);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 52-55
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C3);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 56-59
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C3);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 60-63
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C3);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 64-67
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C3);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 68-71
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C3);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);

        // Rounds 72-75
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);

        // Rounds 76-79
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);

        // Add values back to state
        E0 += E0_SAVED;
        ABCD = vaddq_u32(ABCD_SAVED, ABCD);

        data += SHA1::BLOCKSIZE/sizeof(word32);
        length -= SHA1::BLOCKSIZE;
    }

    // Save state
    vst1q_u32(&state[0], ABCD);
    state[4] = E0;
}
#endif  // CRYPTOPP_ARM_SHA1_AVAILABLE
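// SHA-256 compression using the ARMv8 Crypto extensions. Each block below
// performs four rounds: a vsha256hq_u32/vsha256h2q_u32 pair consumes one
// vector of schedule words plus round constants, while TMP2 preserves the
// pre-round STATE0 that vsha256h2q_u32 requires.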
#if CRYPTOPP_ARM_SHA2_AVAILABLE
void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state);
    CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);

    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1, TMP2;

    // Load initial values
    STATE0 = vld1q_u32(&state[0]);
    STATE1 = vld1q_u32(&state[4]);

    while (length >= SHA256::BLOCKSIZE)
    {
        // Save current state
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        MSG0 = vld1q_u32(data + 0);
        MSG1 = vld1q_u32(data + 4);
        MSG2 = vld1q_u32(data + 8);
        MSG3 = vld1q_u32(data + 12);

        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
        {
            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
        }

        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));

        // Rounds 0-3
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 4-7
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 8-11
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 12-15
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 16-19
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 20-23
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 24-27
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 28-31
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 32-35
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 36-39
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 40-43
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 44-47
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 48-51
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 52-55
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

        // Rounds 56-59
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 60-63
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

        // Add values back to state
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);

        data += SHA256::BLOCKSIZE/sizeof(word32);
        length -= SHA256::BLOCKSIZE;
    }

    // Save state
    vst1q_u32(&state[0], STATE0);
    vst1q_u32(&state[4], STATE1);
}
#endif  // CRYPTOPP_ARM_SHA2_AVAILABLE
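// ***************** POWER8 SHA ********************

// The POWER8 implementations keep the SHA-256/SHA-512 schedule and working
// variables in vector registers and use the VecSHA256/VecSHA512 helpers from
// ppc_simd.h for the sigma functions.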
#if CRYPTOPP_POWER8_SHA_AVAILABLE

enum {A=0, B=1, C, D, E, F, G, H};
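// A..H index the working variables a..h in the S[] arrays used by the
// SHA-256 and SHA-512 round functions below.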
static inline
uint32x4_p VecLoad32(const word32* data, int offset)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    const uint32x4_p val = VecLoad(offset, data);
    return (uint32x4_p)VecPermute(val, val, mask);
#else
    return VecLoad(offset, data);
#endif
}

template<class T> inline
void VecStore32(const T data, word32 dest[4])
{
    VecStore(data, dest);
}

static inline
uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
{
    // Ch(x,y,z) = (x & y) ^ (~x & z), expressed as a vector select
    return vec_sel(z,y,x);
}

static inline
uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
{
    // Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z), expressed as a vector select
    return vec_sel(y, z, VecXor(x, y));
}

static inline
uint32x4_p Vector_sigma0(const uint32x4_p val)
{
    return VecSHA256<0,0>(val);
}

static inline
uint32x4_p Vector_sigma1(const uint32x4_p val)
{
    return VecSHA256<0,0xf>(val);
}

static inline
uint32x4_p VectorSigma0(const uint32x4_p val)
{
    return VecSHA256<1,0>(val);
}

static inline
uint32x4_p VectorSigma1(const uint32x4_p val)
{
    return VecSHA256<1,0xf>(val);
}

static inline
uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
                      const uint32x4_p c, const uint32x4_p d)
{
    const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
    const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
    return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
}

template <unsigned int R> inline
void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
{
    uint32x4_p T1, T2;

    W[R] = M;
    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}

template <unsigned int R> inline
void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
{
    // Indexes into the W[] array
    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};

    const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
    const uint32x4_p s1 = Vector_sigma1(W[IDX14]);

    uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
    uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}
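// SHA-256 compression of full 64-byte blocks for POWER8. The first sixteen
// rounds consume the message directly (SHA256_ROUND1); the remaining rounds
// extend the schedule held in W[] (SHA256_ROUND2).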
void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
    CRYPTOPP_UNUSED(order);

    const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
    const uint32_t* m = reinterpret_cast<const uint32_t*>(data);

    uint32x4_p abcd = VecLoad(state+0);
    uint32x4_p efgh = VecLoad(state+4);
    uint32x4_p W[16], S[8], vm, vk;

    size_t blocks = length / SHA256::BLOCKSIZE;
    while (blocks--)
    {
        unsigned int offset=0;

        S[A] = abcd; S[E] = efgh;
        S[B] = VecShiftLeftOctet<4>(S[A]);
        S[F] = VecShiftLeftOctet<4>(S[E]);
        S[C] = VecShiftLeftOctet<4>(S[B]);
        S[G] = VecShiftLeftOctet<4>(S[F]);
        S[D] = VecShiftLeftOctet<4>(S[C]);
        S[H] = VecShiftLeftOctet<4>(S[G]);

        // Rounds 0-15: key and message are loaded four words at a time,
        // then shifted so each round sees its word in element 0.
        vk = VecLoad(offset, k);
        vm = VecLoad32(m, offset);
        SHA256_ROUND1<0>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<1>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<2>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<3>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad32(m, offset);
        SHA256_ROUND1<4>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<5>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<6>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<7>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad32(m, offset);
        SHA256_ROUND1<8>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<9>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<10>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<11>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad32(m, offset);
        SHA256_ROUND1<12>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<13>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<14>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<15>(W,S, vk,vm);

        m += 16;

        // Rounds 16-63
        for (unsigned int i=16; i<64; i+=16)
        {
            vk = VecLoad(offset, k);
            SHA256_ROUND2<0>(W,S, vk);
            SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
            SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
            SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA256_ROUND2<4>(W,S, vk);
            SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
            SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
            SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA256_ROUND2<8>(W,S, vk);
            SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
            SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
            SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA256_ROUND2<12>(W,S, vk);
            SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
            SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
            SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
            offset+=16;
        }

        // Add values back to state
        abcd += VectorPack(S[A],S[B],S[C],S[D]);
        efgh += VectorPack(S[E],S[F],S[G],S[H]);
    }

    // Save state
    VecStore32(abcd, state+0);
    VecStore32(efgh, state+4);
}
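// SHA-512 follows the same pattern with 64-bit lanes: two words per vector,
// eighty rounds, and VecSHA512 for the sigma functions.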
static inline
void VecStore64(const uint64x2_p val, word64* data)
{
    VecStore(val, data);
}

static inline
uint64x2_p VecLoad64(const word64* data, int offset)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
    return VecPermute(VecLoad(offset, data), mask);
#else
    return VecLoad(offset, data);
#endif
}

static inline
uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
{
    // Ch(x,y,z) = (x & y) ^ (~x & z), expressed as a vector select
    return vec_sel(z,y,x);
}

static inline
uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
{
    // Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z), expressed as a vector select
    return vec_sel(y, z, VecXor(x, y));
}

static inline
uint64x2_p Vector_sigma0(const uint64x2_p val)
{
    return VecSHA512<0,0>(val);
}

static inline
uint64x2_p Vector_sigma1(const uint64x2_p val)
{
    return VecSHA512<0,0xf>(val);
}

static inline
uint64x2_p VectorSigma0(const uint64x2_p val)
{
    return VecSHA512<1,0>(val);
}

static inline
uint64x2_p VectorSigma1(const uint64x2_p val)
{
    return VecSHA512<1,0xf>(val);
}

static inline
uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
{
    const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
    return VecPermute(x,y,m);
}

template <unsigned int R> inline
void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
{
    uint64x2_p T1, T2;

    W[R] = M;
    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}

template <unsigned int R> inline
void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
{
    // Indexes into the W[] array
    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};

    const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
    const uint64x2_p s1 = Vector_sigma1(W[IDX14]);

    uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
    uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}
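// SHA-512 compression of full 128-byte blocks for POWER8. The first sixteen
// rounds consume the message (SHA512_ROUND1); rounds 16-79 extend the
// schedule held in W[] (SHA512_ROUND2).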
void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
    CRYPTOPP_UNUSED(order);

    const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
    const uint64_t* m = reinterpret_cast<const uint64_t*>(data);

    uint64x2_p ab = VecLoad(state+0);
    uint64x2_p cd = VecLoad(state+2);
    uint64x2_p ef = VecLoad(state+4);
    uint64x2_p gh = VecLoad(state+6);
    uint64x2_p W[16], S[8], vm, vk;

    size_t blocks = length / SHA512::BLOCKSIZE;
    while (blocks--)
    {
        unsigned int offset=0;

        S[A] = ab; S[C] = cd;
        S[E] = ef; S[G] = gh;
        S[B] = VecShiftLeftOctet<8>(S[A]);
        S[D] = VecShiftLeftOctet<8>(S[C]);
        S[F] = VecShiftLeftOctet<8>(S[E]);
        S[H] = VecShiftLeftOctet<8>(S[G]);

        // Rounds 0-15: key and message are loaded two words at a time,
        // then shifted so each round sees its word in element 0.
        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<0>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<1>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<2>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<3>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<4>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<5>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<6>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<7>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<8>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<9>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<10>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<11>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<12>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<13>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<14>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<15>(W,S, vk,vm);

        m += 16;

        // Rounds 16-79
        for (unsigned int i=16; i<80; i+=16)
        {
            vk = VecLoad(offset, k);
            SHA512_ROUND2<0>(W,S, vk);
            SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<2>(W,S, vk);
            SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<4>(W,S, vk);
            SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<6>(W,S, vk);
            SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<8>(W,S, vk);
            SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<10>(W,S, vk);
            SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<12>(W,S, vk);
            SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<14>(W,S, vk);
            SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;
        }

        // Add values back to state
        ab += VectorPack(S[A],S[B]);
        cd += VectorPack(S[C],S[D]);
        ef += VectorPack(S[E],S[F]);
        gh += VectorPack(S[G],S[H]);
    }

    // Save state
    VecStore64(ab, state+0);
    VecStore64(cd, state+2);
    VecStore64(ef, state+4);
    VecStore64(gh, state+6);
}

#endif  // CRYPTOPP_POWER8_SHA_AVAILABLE

NAMESPACE_END