ppc_simd.h
1// ppc_simd.h - written and placed in public domain by Jeffrey Walton
2
67
68// Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
69// actual availability of the feature for the source file being compiled.
70// The preprocessor macros depend on compiler options like -maltivec,
71// and not on compiler versions.
72
73// For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
74// For XLC see the Compiler Reference manual. For Clang you have to experiment.
75// Clang does not document the compiler options, does not reject options it does
76// not understand, and pretends to be other compilers even though it cannot
77// process the builtins and intrinsics. Clang will waste hours of your time.
78
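// A minimal sketch of the detection idiom described above (illustrative,
// not part of the original header): gate code on the feature macros the
// compiler actually sets for this translation unit, never on a version:
//
// #if defined(__ALTIVEC__)
//     // vec_ld, vec_st and vec_perm are available
// #endif
// #if defined(__VSX__) || defined(_ARCH_PWR8)
//     // 64-bit vector elements (uint64x2_p below) are available
// #endif
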
79// DO NOT USE this pattern in VecLoad and VecStore. We have to use the
80// code paths guarded by preprocessor macros because XLC 12 generates
81// bad code in some places. To verify the bad code generation, test on
82// GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
83//
84// inline uint32x4_p VecLoad(const byte src[16])
85// {
86// #if defined(__VSX__) || defined(_ARCH_PWR8)
87// return (uint32x4_p) *(uint8x16_p*)((byte*)src);
88// #else
89// return VecLoad_ALTIVEC(src);
90// #endif
91// }
92
93// We should be able to perform the load using inline asm on Power7 with
94// VSX or Power8. The inline asm will avoid C undefined behavior due to
95// casting from byte* to word32*. We are safe because our byte* are
96// 16-byte aligned for Altivec. Below is the big endian load. Little
97// endian would need to follow with xxpermdi for the reversal.
98//
99// __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );
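//
// A hedged sketch of the little endian follow-up mentioned above
// (untested here; xxpermdi with DM=2 swaps the two doublewords):
//
// __asm__ ("lxvw4x %x0, %1, %2\n\txxpermdi %x0, %x0, %x0, 2"
//          : "=wa"(v) : "r"(0), "r"(src) : );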
100
101// GCC and XLC use integer math for the address (D-form or byte-offset
102// in the ISA manual). LLVM uses pointer math for the address (DS-form
103// or indexed in the ISA manual). To keep them consistent we calculate
104// the address from the offset and pass it to a load or store function
105// using a 0 offset.
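//
// This is the idiom the functions below use, taken from their VSX
// paths: fold the offset into the address up front, then load or
// store with a 0 offset so all three compilers compute the same
// effective address.
//
// const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
// return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));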
106
107#ifndef CRYPTOPP_PPC_CRYPTO_H
108#define CRYPTOPP_PPC_CRYPTO_H
109
110#include "config.h"
111#include "misc.h"
112
113#if defined(__ALTIVEC__)
114# include <altivec.h>
115# undef vector
116# undef pixel
117# undef bool
118#endif
119
120// XL C++ on AIX does not define VSX and does not
121// provide an option to set it. We have to set it
122// for the code below. This define must stay in
123// sync with the define in test_ppc_power7.cpp.
124#ifndef CRYPTOPP_DISABLE_POWER7
125# if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
126# define __VSX__ 1
127# endif
128#endif
129
130// XL C++ on AIX does not define CRYPTO and does not
131// provide an option to set it. We have to set it
132// for the code below. This define must stay in
133// sync with the define in test_ppc_power8.cpp.
134#ifndef CRYPTOPP_DISABLE_POWER8
135# if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
136# define __CRYPTO__ 1
137# endif
138#endif
139
145#define CONST_V8_CAST(x) ((unsigned char*)(x))
151#define CONST_V32_CAST(x) ((unsigned int*)(x))
157#define CONST_V64_CAST(x) ((unsigned long long*)(x))
163#define NCONST_V8_CAST(x) ((unsigned char*)(x))
169#define NCONST_V32_CAST(x) ((unsigned int*)(x))
175#define NCONST_V64_CAST(x) ((unsigned long long*)(x))
176
177// VecLoad_ALTIVEC and VecStore_ALTIVEC are
178// too noisy on modern compilers
179#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
180# pragma GCC diagnostic push
181# pragma GCC diagnostic ignored "-Wdeprecated"
182#endif
183
184NAMESPACE_BEGIN(CryptoPP)
185
186#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
187
192typedef __vector unsigned char uint8x16_p;
197typedef __vector unsigned short uint16x8_p;
202typedef __vector unsigned int uint32x4_p;
203
204#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
212typedef __vector unsigned long long uint64x2_p;
213#endif // VSX or ARCH_PWR8
214
218inline uint32x4_p VecZero()
219{
220 const uint32x4_p v = {0,0,0,0};
221 return v;
222}
223
227inline uint32x4_p VecOne()
228{
229 const uint32x4_p v = {1,1,1,1};
230 return v;
231}
232
241template <class T>
242inline T VecReverse(const T data)
243{
244#if defined(CRYPTOPP_BIG_ENDIAN)
245 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
246 return (T)vec_perm(data, data, mask);
247#else
248 const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
249 return (T)vec_perm(data, data, mask);
250#endif
251}
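// For example, VecReverse turns the byte vector {0,1,2,...,15} into
// {15,14,13,...,0} on either endian.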
252
262template <class T>
263inline T VecReverseLE(const T data)
264{
265#if defined(CRYPTOPP_LITTLE_ENDIAN)
266 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
267 return (T)vec_perm(data, data, mask);
268#else
269 return data;
270#endif
271}
272
282template <class T>
283inline T VecReverseBE(const T data)
284{
285#if defined(CRYPTOPP_BIG_ENDIAN)
286 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
287 return (T)vec_perm(data, data, mask);
288#else
289 return data;
290#endif
291}
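// A hedged usage sketch (ptr is an illustrative byte pointer): because
// VecReverseLE is a no-op on big endian systems, a byte-order-sensitive
// load can be written once for both endians. This is exactly the
// pattern VecLoadBE uses below on pre-POWER9 hardware:
//
// const uint32x4_p v = VecReverseLE(VecLoad_ALTIVEC(ptr));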
292
294
295
308inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
309{
310 // Avoid IsAlignedOn for convenience.
311 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
312 if (addr % 16 == 0)
313 {
314 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
315 }
316 else
317 {
318 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
319 const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
320 const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
321 const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
322 return (uint32x4_p)vec_perm(low, high, perm);
323 }
324}
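// The unaligned path above is the classic AltiVec technique from the
// NXP/Motorola PEM referenced in the comment: vec_lvsl builds a permute
// control vector from the misaligned address, the two aligned vec_ld
// loads bracket the data, and vec_perm extracts the 16 unaligned bytes
// from the pair.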
325
339inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
340{
341 // Avoid IsAlignedOn for convenience.
342 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
343 if (addr % 16 == 0)
344 {
345 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
346 }
347 else
348 {
349 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
350 const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
351 const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
352 const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
353 return (uint32x4_p)vec_perm(low, high, perm);
354 }
355}
356
369inline uint32x4_p VecLoad(const byte src[16])
370{
371 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
372 // word pointers. The ISA lacks loads for short* and char*.
373 // Power9/ISA 3.0 provides vec_xl for all datatypes.
374
375 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
376 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
377 CRYPTOPP_UNUSED(addr);
378
379#if defined(_ARCH_PWR9)
380 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
381#else
382 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
383#endif
384}
385
399inline uint32x4_p VecLoad(int off, const byte src[16])
400{
401 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
402 // word pointers. The ISA lacks loads for short* and char*.
403 // Power9/ISA 3.0 provides vec_xl for all datatypes.
404
405 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
406 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
407 CRYPTOPP_UNUSED(addr);
408
409#if defined(_ARCH_PWR9)
410 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
411#else
412 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
413#endif
414}
415
428inline uint32x4_p VecLoad(const word32 src[4])
429{
430 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
431 // word pointers. The ISA lacks loads for short* and char*.
432 // Power9/ISA 3.0 provides vec_xl for all datatypes.
433
434 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
435 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
436 CRYPTOPP_UNUSED(addr);
437
438#if defined(_ARCH_PWR9)
439 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
440#elif defined(__VSX__) || defined(_ARCH_PWR8)
441 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
442#else
443 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
444#endif
445}
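// A hedged usage sketch (the buffer name is illustrative). VecLoad only
// requires natural element alignment, not 16-byte alignment:
//
// word32 state[4] = {1,2,3,4};
// const uint32x4_p v = VecLoad(state);   // load 4 words
// VecStore(VecAdd(v, VecOne()), state);  // store {2,3,4,5}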
446
460inline uint32x4_p VecLoad(int off, const word32 src[4])
461{
462 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
463 // word pointers. The ISA lacks loads for short* and char*.
464 // Power9/ISA 3.0 provides vec_xl for all datatypes.
465
466 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
467 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
468 CRYPTOPP_UNUSED(addr);
469
470#if defined(_ARCH_PWR9)
471 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
472#elif defined(__VSX__) || defined(_ARCH_PWR8)
473 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
474#else
475 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
476#endif
477}
478
479#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
480
494inline uint64x2_p VecLoad(const word64 src[2])
495{
496 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
497 // word pointers. The ISA lacks loads for short* and char*.
498 // Power9/ISA 3.0 provides vec_xl for all datatypes.
499
500 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
501 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
502 CRYPTOPP_UNUSED(addr);
503
504#if defined(_ARCH_PWR9)
505 return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
506#elif defined(__VSX__) || defined(_ARCH_PWR8)
507 // The 32-bit cast is not a typo. Compiler workaround.
508 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
509#else
510 return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
511#endif
512}
513
528inline uint64x2_p VecLoad(int off, const word64 src[2])
529{
530 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
531 // word pointers. The ISA lacks loads for short* and char*.
532 // Power9/ISA 3.0 provides vec_xl for all datatypes.
533
534 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
535 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
536 CRYPTOPP_UNUSED(addr);
537
538#if defined(_ARCH_PWR9)
539 return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
540#elif defined(__VSX__) || defined(_ARCH_PWR8)
541 // The 32-bit cast is not a typo. Compiler workaround.
542 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
543#else
544 return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
545#endif
546}
547
548#endif // VSX or ARCH_PWR8
549
560inline uint32x4_p VecLoadAligned(const byte src[16])
561{
562 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
563 // word pointers. The ISA lacks loads for short* and char*.
564 // Power9/ISA 3.0 provides vec_xl for all datatypes.
565
566 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
567 CRYPTOPP_ASSERT(addr % 16 == 0);
568 CRYPTOPP_UNUSED(addr);
569
570#if defined(_ARCH_PWR9)
571 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
572#else
573 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
574#endif
575}
576
588inline uint32x4_p VecLoadAligned(int off, const byte src[16])
589{
590 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
591 // word pointers. The ISA lacks loads for short* and char*.
592 // Power9/ISA 3.0 provides vec_xl for all datatypes.
593
594 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
595 CRYPTOPP_ASSERT(addr % 16 == 0);
596 CRYPTOPP_UNUSED(addr);
597
598#if defined(_ARCH_PWR9)
599 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
600#else
601 return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
602#endif
603}
604
615inline uint32x4_p VecLoadAligned(const word32 src[4])
616{
617 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
618 // word pointers. The ISA lacks loads for short* and char*.
619 // Power9/ISA 3.0 provides vec_xl for all datatypes.
620
621 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
622 CRYPTOPP_ASSERT(addr % 16 == 0);
623 CRYPTOPP_UNUSED(addr);
624
625#if defined(_ARCH_PWR9)
626 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
627#elif defined(__VSX__) || defined(_ARCH_PWR8)
628 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
629#else
630 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
631#endif
632}
633
645inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
646{
647 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
648 // word pointers. The ISA lacks loads for short* and char*.
649 // Power9/ISA 3.0 provides vec_xl for all datatypes.
650
651 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
652 CRYPTOPP_ASSERT(addr % 16 == 0);
653 CRYPTOPP_UNUSED(addr);
654
655#if defined(_ARCH_PWR9)
656 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
657#elif defined(__VSX__) || defined(_ARCH_PWR8)
658 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
659#else
660 return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
661#endif
662}
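// A hedged usage sketch: VecLoadAligned requires a 16-byte aligned
// effective address (asserted above in debug builds). The alignment
// macro is assumed to come from config.h:
//
// CRYPTOPP_ALIGN_DATA(16) byte buf[16];
// const uint32x4_p v = VecLoadAligned(buf);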
663
664#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
665
676inline uint64x2_p VecLoadAligned(const word64 src[2])
677{
678 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
679 // word pointers. The ISA lacks loads for short* and char*.
680 // Power9/ISA 3.0 provides vec_xl for all datatypes.
681
682 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
683 CRYPTOPP_ASSERT(addr % 16 == 0);
684 CRYPTOPP_UNUSED(addr);
685
686#if defined(_ARCH_PWR9)
687 return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
688#elif defined(__VSX__) || defined(_ARCH_PWR8)
689 // The 32-bit cast is not a typo. Compiler workaround.
690 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
691#else
692 return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
693#endif
694}
695
707inline uint64x2_p VecLoadAligned(int off, const word64 src[2])
708{
709 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
710 // word pointers. The ISA lacks loads for short* and char*.
711 // Power9/ISA 3.0 provides vec_xl for all datatypes.
712
713 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
714 CRYPTOPP_ASSERT(addr % 16 == 0);
715 CRYPTOPP_UNUSED(addr);
716
717#if defined(_ARCH_PWR9)
718 return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
719#elif defined(__VSX__) || defined(_ARCH_PWR8)
720 // The 32-bit cast is not a typo. Compiler workaround.
721 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
722#else
723 return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
724#endif
725}
726
727#endif
728
742inline uint32x4_p VecLoadBE(const byte src[16])
743{
744 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
745 // word pointers. The ISA lacks loads for short* and char*.
746 // Power9/ISA 3.0 provides vec_xl for all datatypes.
747
748 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
749 // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
750 CRYPTOPP_UNUSED(addr);
751
752#if defined(_ARCH_PWR9)
753 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
754 return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
755#elif defined(CRYPTOPP_BIG_ENDIAN)
756 return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
757#else
758 return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(src)));
759#endif
760}
761
776inline uint32x4_p VecLoadBE(int off, const byte src[16])
777{
778 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
779 // word pointers. The ISA lacks loads for short* and char*.
780 // Power9/ISA 3.0 provides vec_xl for all datatypes.
781
782 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
783 // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
784 CRYPTOPP_UNUSED(addr);
785
786#if defined(_ARCH_PWR9)
787 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
788 return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
789#elif defined(CRYPTOPP_BIG_ENDIAN)
790 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
791#else
792 return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(addr)));
793#endif
794}
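// A hedged usage sketch: VecLoadBE suits algorithms specified in big
// endian byte order, for example loading 16 bytes of a SHA message
// block:
//
// const byte block[16] = { /* big endian message bytes */ };
// const uint32x4_p w = VecLoadBE(block);  // words in big endian order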
795
797
799
800
816template<class T>
817inline void VecStore_ALTIVEC(const T data, byte dest[16])
818{
819 // Avoid IsAlignedOn for convenience.
820 uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
821 if (addr % 16 == 0)
822 {
823 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
824 }
825 else
826 {
827 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
828 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
829 vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
830 vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
831 vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
832 vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
833 vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
834 vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
835 vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
836 vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
837 }
838}
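// The unaligned path above is the companion store technique from the
// same PEM: vec_lvsr rotates the data into position, then the vec_ste
// sequence writes single bytes, halfwords and words that together
// cover all 16 bytes without touching the adjacent memory an aligned
// vec_st would clobber.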
839
856template<class T>
857inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
858{
859 // Avoid IsAlignedOn for convenience.
860 uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
861 if (addr % 16 == 0)
862 {
863 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
864 }
865 else
866 {
867 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
868 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
869 vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
870 vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
871 vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
872 vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
873 vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
874 vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
875 vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
876 vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
877 }
878}
879
894template<class T>
895inline void VecStore(const T data, byte dest[16])
896{
897 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
898 // word pointers. The ISA lacks loads for short* and char*.
899 // Power9/ISA 3.0 provides vec_xl for all datatypes.
900
901 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
902 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
903 CRYPTOPP_UNUSED(addr);
904
905#if defined(_ARCH_PWR9)
906 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
907#else
908 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(dest));
909#endif
910}
911
927template<class T>
928inline void VecStore(const T data, int off, byte dest[16])
929{
930 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
931 // word pointers. The ISA lacks loads for short* and char*.
932 // Power9/ISA 3.0 provides vec_xl for all datatypes.
933
934 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
935 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
936 CRYPTOPP_UNUSED(addr);
937
938#if defined(_ARCH_PWR9)
939 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
940#else
941 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
942#endif
943}
944
959template<class T>
960inline void VecStore(const T data, word32 dest[4])
961{
962 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
963 // word pointers. The ISA lacks stores for short* and char*.
964 // Power9/ISA 3.0 provides vec_xst for all datatypes.
965
966 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
967 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
968 CRYPTOPP_UNUSED(addr);
969
970#if defined(_ARCH_PWR9)
971 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
972#elif defined(__VSX__) || defined(_ARCH_PWR8)
973 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
974#else
975 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
976#endif
977}
978
994template<class T>
995inline void VecStore(const T data, int off, word32 dest[4])
996{
997 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
998 // word pointers. The ISA lacks stores for short* and char*.
999 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1000
1001 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1002 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1003 CRYPTOPP_UNUSED(addr);
1004
1005#if defined(_ARCH_PWR9)
1006 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1007#elif defined(__VSX__) || defined(_ARCH_PWR8)
1008 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1009#else
1010 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1011#endif
1012}
1013
1029template<class T>
1030inline void VecStore(const T data, word64 dest[2])
1031{
1032 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1033 // word pointers. The ISA lacks stores for short* and char*.
1034 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1035
1036 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1037 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1038 CRYPTOPP_UNUSED(addr);
1039
1040#if defined(_ARCH_PWR9)
1041 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1042#elif defined(__VSX__) || defined(_ARCH_PWR8)
1043 // 32-bit cast is not a typo. Compiler workaround.
1044 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1045#else
1046 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1047#endif
1048}
1049
1066template<class T>
1067inline void VecStore(const T data, int off, word64 dest[2])
1068{
1069 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1070 // word pointers. The ISA lacks stores for short* and char*.
1071 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1072
1073 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1074 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1075 CRYPTOPP_UNUSED(addr);
1076
1077#if defined(_ARCH_PWR9)
1078 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1079#elif defined(__VSX__) || defined(_ARCH_PWR8)
1080 // 32-bit cast is not a typo. Compiler workaround.
1081 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1082#else
1083 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1084#endif
1085}
1086
1099template<class T>
1100inline void VecStoreAligned(const T data, byte dest[16])
1101{
1102 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1103 // word pointers. The ISA lacks loads for short* and char*.
1104 // Power9/ISA 3.0 provides vec_xl for all datatypes.
1105
1106 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1107 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1108 CRYPTOPP_UNUSED(addr);
1109
1110#if defined(_ARCH_PWR9)
1111 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1112#else
1113 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1114#endif
1115}
1116
1130template<class T>
1131inline void VecStoreAligned(const T data, int off, byte dest[16])
1132{
1133 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1134 // word pointers. The ISA lacks loads for short* and char*.
1135 // Power9/ISA 3.0 provides vec_xl for all datatypes.
1136
1137 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1138 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1139 CRYPTOPP_UNUSED(addr);
1140
1141#if defined(_ARCH_PWR9)
1142 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1143#else
1144 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1145#endif
1146}
1147
1161template<class T>
1162inline void VecStoreAligned(const T data, word32 dest[4])
1163{
1164 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1165 // word pointers. The ISA lacks stores for short* and char*.
1166 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1167
1168 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1169 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1170 CRYPTOPP_UNUSED(addr);
1171
1172#if defined(_ARCH_PWR9)
1173 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1174#elif defined(__VSX__) || defined(_ARCH_PWR8)
1175 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1176#else
1177 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1178#endif
1179}
1180
1195template<class T>
1196inline void VecStoreAligned(const T data, int off, word32 dest[4])
1197{
1198 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1199 // word pointers. The ISA lacks stores for short* and char*.
1200 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1201
1202 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1203 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1204 CRYPTOPP_UNUSED(addr);
1205
1206#if defined(_ARCH_PWR9)
1207 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1208#elif defined(__VSX__) || defined(_ARCH_PWR8)
1209 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1210#else
1211 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1212#endif
1213}
1214
1230template <class T>
1231inline void VecStoreBE(const T data, byte dest[16])
1232{
1233 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1234 // word pointers. The ISA lacks stores for short* and char*.
1235 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1236
1237 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1238 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1239 CRYPTOPP_UNUSED(addr);
1240
1241#if defined(_ARCH_PWR9)
1242 vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1243#elif defined(CRYPTOPP_BIG_ENDIAN)
1244 VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1245#else
1246 VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1247#endif
1248}
1249
1266template <class T>
1267inline void VecStoreBE(const T data, int off, byte dest[16])
1268{
1269 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1270 // word pointers. The ISA lacks stores for short* and char*.
1271 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1272
1273 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1274 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1275 CRYPTOPP_UNUSED(addr);
1276
1277#if defined(_ARCH_PWR9)
1278 vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1279#elif defined(CRYPTOPP_BIG_ENDIAN)
1280 VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1281#else
1282 VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1283#endif
1284}
1285
1301template <class T>
1302inline void VecStoreBE(const T data, word32 dest[4])
1303{
1304 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1305 // word pointers. The ISA lacks stores for short* and char*.
1306 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1307
1308 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1309 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1310 CRYPTOPP_UNUSED(addr);
1311
1312#if defined(_ARCH_PWR9)
1313 vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1314#elif defined(CRYPTOPP_BIG_ENDIAN)
1315 VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1316#else
1317 VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1318#endif
1319}
1320
1337template <class T>
1338inline void VecStoreBE(const T data, int off, word32 dest[4])
1339{
1340 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1341 // word pointers. The ISA lacks stores for short* and char*.
1342 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1343
1344 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1345 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1346 CRYPTOPP_UNUSED(addr);
1347
1348#if defined(_ARCH_PWR9)
1349 vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1350#elif defined(CRYPTOPP_BIG_ENDIAN)
1351 VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1352#else
1353 VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1354#endif
1355}
1356
1358
1360
1361
1375template <class T1, class T2>
1376inline T1 VecAnd(const T1 vec1, const T2 vec2)
1377{
1378 return (T1)vec_and(vec1, (T1)vec2);
1379}
1380
1394template <class T1, class T2>
1395inline T1 VecOr(const T1 vec1, const T2 vec2)
1396{
1397 return (T1)vec_or(vec1, (T1)vec2);
1398}
1399
1413template <class T1, class T2>
1414inline T1 VecXor(const T1 vec1, const T2 vec2)
1415{
1416 return (T1)vec_xor(vec1, (T1)vec2);
1417}
1418
1420
1422
1423
1437template <class T1, class T2>
1438inline T1 VecAdd(const T1 vec1, const T2 vec2)
1439{
1440 return (T1)vec_add(vec1, (T1)vec2);
1441}
1442
1455template <class T1, class T2>
1456inline T1 VecSub(const T1 vec1, const T2 vec2)
1457{
1458 return (T1)vec_sub(vec1, (T1)vec2);
1459}
1460
1462
1464
1465
1477template <class T1, class T2>
1478inline T1 VecPermute(const T1 vec, const T2 mask)
1479{
1480 return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1481}
1482
1495template <class T1, class T2>
1496inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1497{
1498 return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1499}
1500
1502
1504
1505
1527template <unsigned int C, class T>
1528inline T VecShiftLeftOctet(const T vec)
1529{
1530 const T zero = {0};
1531 if (C >= 16)
1532 {
1533 // Out of range
1534 return zero;
1535 }
1536 else if (C == 0)
1537 {
1538 // Noop
1539 return vec;
1540 }
1541 else
1542 {
1543#if defined(CRYPTOPP_BIG_ENDIAN)
1544 enum { R=C&0xf };
1545 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1546#else
1547 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1548 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1549#endif
1550 }
1551}
1552
1574template <unsigned int C, class T>
1575inline T VecShiftRightOctet(const T vec)
1576{
1577 const T zero = {0};
1578 if (C >= 16)
1579 {
1580 // Out of range
1581 return zero;
1582 }
1583 else if (C == 0)
1584 {
1585 // Noop
1586 return vec;
1587 }
1588 else
1589 {
1590#if defined(CRYPTOPP_BIG_ENDIAN)
1591 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1592 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1593#else
1594 enum { R=C&0xf };
1595 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1596#endif
1597 }
1598}
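// Worked example for the octet shifts, shown on the byte vector
// v = {0,1,2,...,15}:
//
// VecShiftLeftOctet<3>(v);   // {3,4,...,15, 0,0,0}
// VecShiftRightOctet<3>(v);  // {0,0,0, 0,1,...,12}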
1599
1613template <unsigned int C, class T>
1614inline T VecRotateLeftOctet(const T vec)
1615{
1616#if defined(CRYPTOPP_BIG_ENDIAN)
1617 enum { R = C&0xf };
1618 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1619#else
1620 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1621 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1622#endif
1623}
1624
1638template <unsigned int C, class T>
1639inline T VecRotateRightOctet(const T vec)
1640{
1641#if defined(CRYPTOPP_BIG_ENDIAN)
1642 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1643 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1644#else
1645 enum { R = C&0xf };
1646 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1647#endif
1648}
1649
1659template<unsigned int C>
1660inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
1661{
1662 const uint32x4_p m = {C, C, C, C};
1663 return vec_rl(vec, m);
1664}
1665
1675template<unsigned int C>
1676inline uint32x4_p VecRotateRight(const uint32x4_p vec)
1677{
1678 const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1679 return vec_rl(vec, m);
1680}
1681
1691template<unsigned int C>
1692inline uint32x4_p VecShiftLeft(const uint32x4_p vec)
1693{
1694 const uint32x4_p m = {C, C, C, C};
1695 return vec_sl(vec, m);
1696}
1697
1707template<unsigned int C>
1708inline uint32x4_p VecShiftRight(const uint32x4_p vec)
1709{
1710 const uint32x4_p m = {C, C, C, C};
1711 return vec_sr(vec, m);
1712}
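// Note the difference from the octet functions above: VecRotateLeft
// and VecShiftLeft operate bitwise within each 32-bit element, e.g.
// VecRotateLeft<8>(v) rotates every element of v left by 8 bits.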
1713
1714// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
1715#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1716
1728template<unsigned int C>
1729inline uint64x2_p VecRotateLeft(const uint64x2_p vec)
1730{
1731 const uint64x2_p m = {C, C};
1732 return vec_rl(vec, m);
1733}
1734
1746template<unsigned int C>
1747inline uint64x2_p VecShiftLeft(const uint64x2_p vec)
1748{
1749 const uint64x2_p m = {C, C};
1750 return vec_sl(vec, m);
1751}
1752
1764template<unsigned int C>
1765inline uint64x2_p VecRotateRight(const uint64x2_p vec)
1766{
1767 const uint64x2_p m = {64-C, 64-C};
1768 return vec_rl(vec, m);
1769}
1770
1782template<unsigned int C>
1783inline uint64x2_p VecShiftRight(const uint64x2_p vec)
1784{
1785 const uint64x2_p m = {C, C};
1786 return vec_sr(vec, m);
1787}
1788
1789#endif // ARCH_PWR8
1790
1792
1794
1795
1804template <class T>
1805inline T VecMergeLow(const T vec1, const T vec2)
1806{
1807 return vec_mergel(vec1, vec2);
1808}
1809
1818template <class T>
1819inline T VecMergeHigh(const T vec1, const T vec2)
1820{
1821 return vec_mergeh(vec1, vec2);
1822}
1823
1830inline uint32x4_p VecSplatWord(word32 val)
1831{
1832 // Fix spurious GCC warning???
1833 CRYPTOPP_UNUSED(val);
1834
1835 // Apple Altivec and XL C++ do not offer vec_splats.
1836 // GCC offers vec_splats back to -mcpu=power4.
1837#if defined(_ARCH_PWR4) && defined(__GNUC__)
1838 return vec_splats(val);
1839#else
1840 //const word32 x[4] = {val,val,val,val};
1841 //return VecLoad(x);
1842 const word32 x[4] = {val};
1843 return vec_splat(VecLoad(x),0);
1844#endif
1845}
1846
1854template <unsigned int N>
1855inline uint32x4_p VecSplatElement(const uint32x4_p val)
1856{
1857 return vec_splat(val, N);
1858}
1859
1860#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1867inline uint64x2_p VecSplatWord(word64 val)
1868{
1869 // The PPC64 ABI says so.
1870 return vec_splats((unsigned long long)val);
1871}
1872
1880template <unsigned int N>
1881inline uint64x2_p VecSplatElement(const uint64x2_p val)
1882{
1883#if defined(__VSX__) || defined(_ARCH_PWR8)
1884 return vec_splat(val, N);
1885#else
1886 enum {E=N&1};
1887 if (E == 0)
1888 {
1889 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
1890 return vec_perm(val, val, m);
1891 }
1892 else // (E == 1)
1893 {
1894 const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
1895 return vec_perm(val, val, m);
1896 }
1897#endif
1898}
1899#endif
1900
1912template <class T>
1913inline T VecGetLow(const T val)
1914{
1915#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1916 const T zero = {0};
1917 return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1918#else
1919 return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1920#endif
1921}
1922
1934template <class T>
1935inline T VecGetHigh(const T val)
1936{
1937#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1938 const T zero = {0};
1939 return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1940#else
1941 return VecShiftRightOctet<8>(val);
1942#endif
1943}
1944
1952template <class T>
1953inline T VecSwapWords(const T vec)
1954{
1955 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1956}
1957
1959
1961
1962
1974template <class T1, class T2>
1975inline bool VecEqual(const T1 vec1, const T2 vec2)
1976{
1977 return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1978}
1979
1991template <class T1, class T2>
1992inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1993{
1994 return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1995}
1996
1998
2000
2002
2003
2014inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2015{
2016 // 64-bit elements available at POWER7 with VSX, but vaddudm requires POWER8
2017#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2018 return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
2019#else
2020 // The carry mask selects carries for elements 1 and 3 and sets
2021 // remaining elements to 0. The result is then shifted so the
2022 // carried values are added to elements 0 and 2.
2023#if defined(CRYPTOPP_BIG_ENDIAN)
2024 const uint32x4_p zero = {0, 0, 0, 0};
2025 const uint32x4_p mask = {0, 1, 0, 1};
2026#else
2027 const uint32x4_p zero = {0, 0, 0, 0};
2028 const uint32x4_p mask = {1, 0, 1, 0};
2029#endif
2030
2031 uint32x4_p cy = vec_addc(vec1, vec2);
2032 uint32x4_p res = vec_add(vec1, vec2);
2033 cy = vec_and(mask, cy);
2034 cy = vec_sld (cy, zero, 4);
2035 return vec_add(res, cy);
2036#endif
2037}
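// Worked example of the carry fix above, for one 64-bit element:
// 0x00000000FFFFFFFF + 1 overflows the low 32-bit lane. vec_addc
// captures that carry, the mask keeps only the low-lane carries, and
// the 4-byte vec_sld moves each carry onto its partner lane so the
// final vec_add yields 0x0000000100000000.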
2038
2039#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2050inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2051{
2052 // 64-bit elements available at POWER7 with VSX, but vaddudm requires POWER8
2053 const uint64x2_p res = vec_add(vec1, vec2);
2054
2055#if defined(CRYPTOPP_DEBUG)
2056 // Test 32-bit add in debug builds while we are here.
2057 const uint32x4_p x = (uint32x4_p)vec1;
2058 const uint32x4_p y = (uint32x4_p)vec2;
2059 const uint32x4_p r = VecAdd64(x, y);
2060
2061 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2062#endif
2063
2064 return res;
2065}
2066#endif
2067
2077inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2078{
2079#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2080 // 64-bit elements available at POWER7 with VSX, but vsubudm requires POWER8
2081 return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
2082#else
2083 // The borrow mask selects borrows for elements 1 and 3 and sets
2084 // remaining elements to 0. The result is then shifted so the
2085 // borrowed values are subtracted from elements 0 and 2.
2086#if defined(CRYPTOPP_BIG_ENDIAN)
2087 const uint32x4_p zero = {0, 0, 0, 0};
2088 const uint32x4_p mask = {0, 1, 0, 1};
2089#else
2090 const uint32x4_p zero = {0, 0, 0, 0};
2091 const uint32x4_p mask = {1, 0, 1, 0};
2092#endif
2093
2094 // subc sets the complement of borrow, so we have to
2095 // un-complement it using andc.
2096 uint32x4_p bw = vec_subc(vec1, vec2);
2097 uint32x4_p res = vec_sub(vec1, vec2);
2098 bw = vec_andc(mask, bw);
2099 bw = vec_sld (bw, zero, 4);
2100 return vec_sub(res, bw);
2101#endif
2102}
2103
2104#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2114inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2115{
2116 // 64-bit elements available at POWER7 with VSX, but vsubudm requires POWER8
2117 const uint64x2_p res = vec_sub(vec1, vec2);
2118
2119#if defined(CRYPTOPP_DEBUG)
2120 // Test 32-bit sub in debug builds while we are here.
2121 const uint32x4_p x = (uint32x4_p)vec1;
2122 const uint32x4_p y = (uint32x4_p)vec2;
2123 const uint32x4_p r = VecSub64(x, y);
2124
2125 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2126#endif
2127
2128 return res;
2129}
2130#endif
2131
2141template<unsigned int C>
2142inline uint32x4_p VecRotateLeft64(const uint32x4_p vec)
2143{
2144#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2145 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2146 return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
2147#else
2148 // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2149 enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2150
2151 // Get the low bits, shift them to high bits
2152 uint32x4_p t1 = VecShiftLeft<S32>(vec);
2153 // Get the high bits, shift them to low bits
2154 uint32x4_p t2 = VecShiftRight<32-S32>(vec);
2155
2156 if (S64 == 0)
2157 {
2158 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2159 return VecPermute(vec, m);
2160 }
2161 else if (S64 == 32)
2162 {
2163 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2164 return VecPermute(vec, m);
2165 }
2166 else if (BR) // Big rotate amount?
2167 {
2168 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2169 t1 = VecPermute(t1, m);
2170 }
2171 else
2172 {
2173 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2174 t2 = VecPermute(t2, m);
2175 }
2176
2177 return vec_or(t1, t2);
2178#endif
2179}
2180
2190template<>
2191inline uint32x4_p VecRotateLeft64<8>(const uint32x4_p vec)
2192{
2193#if defined(CRYPTOPP_BIG_ENDIAN)
2194 const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2195 return VecPermute(vec, m);
2196#else
2197 const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2198 return VecPermute(vec, m);
2199#endif
2200}
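// The specialization works because rotating a 64-bit element left by
// 8 bits is the same as rotating each 8-byte half of the vector by one
// byte, which a single vec_perm performs.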
2201
2202#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2212template<unsigned int C>
2213inline uint64x2_p VecRotateLeft64(const uint64x2_p vec)
2214{
2215 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2216 const uint64x2_p res = VecRotateLeft<C>(vec);
2217
2218#if defined(CRYPTOPP_DEBUG)
2219 // Test 32-bit rotate in debug builds while we are here.
2220 const uint32x4_p x = (uint32x4_p)vec;
2221 const uint32x4_p r = VecRotateLeft64<C>(x);
2222
2223 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2224#endif
2225
2226 return res;
2227}
2228#endif
2229
2239template<unsigned int C>
2240inline uint32x4_p VecRotateRight64(const uint32x4_p vec)
2241{
2242#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2243 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2244 return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
2245#else
2246 // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2247 enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2248
2249 // Get the low bits, shift them to high bits
2250 uint32x4_p t1 = VecShiftRight<S32>(vec);
2251 // Get the high bits, shift them to low bits
2252 uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
2253
2254 if (S64 == 0)
2255 {
2256 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2257 return VecPermute(vec, m);
2258 }
2259 else if (S64 == 32)
2260 {
2261 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2262 return VecPermute(vec, m);
2263 }
2264 else if (BR) // Big rotate amount?
2265 {
2266 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2267 t1 = VecPermute(t1, m);
2268 }
2269 else
2270 {
2271 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2272 t2 = VecPermute(t2, m);
2273 }
2274
2275 return vec_or(t1, t2);
2276#endif
2277}
2278
2289template<>
2290inline uint32x4_p VecRotateRight64<8>(const uint32x4_p vec)
2291{
2292#if defined(CRYPTOPP_BIG_ENDIAN)
2293 const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2294 return VecPermute(vec, m);
2295#else
2296 const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2297 return VecPermute(vec, m);
2298#endif
2299}
2300
2301#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2311template<unsigned int C>
2312inline uint64x2_p VecRotateRight64(const uint64x2_p vec)
2313{
2314 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2315 const uint64x2_p res = VecRotateRight<C>(vec);
2316
2317#if defined(CRYPTOPP_DEBUG)
2318 // Test 32-bit rotate in debug builds while we are here.
2319 const uint32x4_p x = (uint32x4_p)vec;
2320 const uint32x4_p r = VecRotateRight64<C>(x);
2321
2322 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2323#endif
2324
2325 return res;
2326}
2327#endif
2328
2342template <class T1, class T2>
2343inline T1 VecAnd64(const T1 vec1, const T2 vec2)
2344{
2345 return (T1)vec_and(vec1, (T1)vec2);
2346}
2347
2361template <class T1, class T2>
2362inline T1 VecOr64(const T1 vec1, const T2 vec2)
2363{
2364 return (T1)vec_or(vec1, (T1)vec2);
2365}
2366
2380template <class T1, class T2>
2381inline T1 VecXor64(const T1 vec1, const T2 vec2)
2382{
2383 return (T1)vec_xor(vec1, (T1)vec2);
2384}
2385
2392inline uint32x4_p VecSplatWord64(word64 val)
2393{
2394#if defined(_ARCH_PWR8)
2395 // The PPC64 ABI says so.
2396 return (uint32x4_p)vec_splats((unsigned long long)val);
2397#else
2398 const word64 x[2] = {val,val};
2399 return (uint32x4_p)VecLoad((const word32*)x);
2400#endif
2401}
2402
2410template <unsigned int N>
2411inline uint32x4_p VecSplatElement64(const uint32x4_p val)
2412{
2413#if defined(__VSX__) || defined(_ARCH_PWR8)
2414 return (uint32x4_p)vec_splat((uint64x2_p)val, N);
2415#else
2416 enum {E=N&1};
2417 if (E == 0)
2418 {
2419 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
2420 return (uint32x4_p)vec_perm(val, val, m);
2421 }
2422 else // (E == 1)
2423 {
2424 const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
2425 return (uint32x4_p)vec_perm(val, val, m);
2426 }
2427#endif
2428}
2429
2430#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2436template <unsigned int N>
2437inline uint64x2_p VecSplatElement64(const uint64x2_p val)
2438{
2439 return vec_splat(val, N);
2440}
2441#endif
2442
2444
2446
2447// __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
2448// when it is not available, like with Power7. Sigh...
2449#if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2450
2452
2453
2468inline uint32x4_p VecPolyMultiply(const uint32x4_p& a, const uint32x4_p& b)
2469{
2470#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2471 return __vpmsumw (a, b);
2472#elif defined(__clang__)
2473 return __builtin_altivec_crypto_vpmsumw (a, b);
2474#else
2475 return __builtin_crypto_vpmsumw (a, b);
2476#endif
2477}
2478
2493inline uint64x2_p VecPolyMultiply(const uint64x2_p& a, const uint64x2_p& b)
2494{
2495#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2496 return __vpmsumd (a, b);
2497#elif defined(__clang__)
2498 return __builtin_altivec_crypto_vpmsumd (a, b);
2499#else
2500 return __builtin_crypto_vpmsumd (a, b);
2501#endif
2502}
2503
2517inline uint64x2_p VecIntelMultiply00(const uint64x2_p& a, const uint64x2_p& b)
2518{
2519#if defined(CRYPTOPP_BIG_ENDIAN)
2520 return VecSwapWords(VecPolyMultiply(VecGetHigh(a), VecGetHigh(b)));
2521#else
2522 return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
2523#endif
2524}
2525
2539inline uint64x2_p VecIntelMultiply01(const uint64x2_p& a, const uint64x2_p& b)
2540{
2541#if defined(CRYPTOPP_BIG_ENDIAN)
2542 return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
2543#else
2544 return VecPolyMultiply(a, VecGetHigh(b));
2545#endif
2546}
2547
2561inline uint64x2_p VecIntelMultiply10(const uint64x2_p& a, const uint64x2_p& b)
2562{
2563#if defined(CRYPTOPP_BIG_ENDIAN)
2564 return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
2565#else
2566 return VecPolyMultiply(VecGetHigh(a), b);
2567#endif
2568}
2569
2583inline uint64x2_p VecIntelMultiply11(const uint64x2_p& a, const uint64x2_p& b)
2584{
2585#if defined(CRYPTOPP_BIG_ENDIAN)
2586 return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
2587#else
2588 return VecPolyMultiply(VecGetLow(a), b);
2589#endif
2590}
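// A hedged porting note: the VecIntelMultiply00/01/10/11 selectors are
// meant to line up with Intel's _mm_clmulepi64_si128 imm8 values 0x00,
// 0x01, 0x10 and 0x11, so GHASH-style code can be translated
// mechanically.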
2591
2593
2595
2596
2608template <class T1, class T2>
2609inline T1 VecEncrypt(const T1 state, const T2 key)
2610{
2611#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2612 return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
2613#elif defined(__clang__)
2614 return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2615#elif defined(__GNUC__)
2616 return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2617#else
2618 CRYPTOPP_ASSERT(0);
2619#endif
2620}
2621
2633template <class T1, class T2>
2634inline T1 VecEncryptLast(const T1 state, const T2 key)
2635{
2636#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2637 return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
2638#elif defined(__clang__)
2639 return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2640#elif defined(__GNUC__)
2641 return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2642#else
2643 CRYPTOPP_ASSERT(0);
2644#endif
2645}
2646
2658template <class T1, class T2>
2659inline T1 VecDecrypt(const T1 state, const T2 key)
2660{
2661#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2662 return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
2663#elif defined(__clang__)
2664 return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2665#elif defined(__GNUC__)
2666 return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2667#else
2668 CRYPTOPP_ASSERT(0);
2669#endif
2670}
2671
2683template <class T1, class T2>
2684inline T1 VecDecryptLast(const T1 state, const T2 key)
2685{
2686#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2687 return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
2688#elif defined(__clang__)
2689 return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2690#elif defined(__GNUC__)
2691 return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2692#else
2693 CRYPTOPP_ASSERT(0);
2694#endif
2695}
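// A hedged sketch of an AES-128 encryption flow built from these
// wrappers (ptxt and subkeys are illustrative; subkeys holds the 11
// round keys):
//
// uint8x16_p block = VecLoad(ptxt);
// block = VecXor(block, subkeys[0]);        // initial whitening
// for (unsigned int i=1; i<10; ++i)
//     block = VecEncrypt(block, subkeys[i]);
// block = VecEncryptLast(block, subkeys[10]);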
2696
2698
2700
2701
2713template <int func, int fmask, class T>
2714inline T VecSHA256(const T data)
2715{
2716#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2717 return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
2718#elif defined(__clang__)
2719 return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2720#elif defined(__GNUC__)
2721 return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2722#else
2723 CRYPTOPP_ASSERT(0);
2724#endif
2725}
2726
2738template <int func, int fmask, class T>
2739inline T VecSHA512(const T data)
2740{
2741#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2742 return (T)__vshasigmad((uint64x2_p)data, func, fmask);
2743#elif defined(__clang__)
2744 return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2745#elif defined(__GNUC__)
2746 return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2747#else
2748 CRYPTOPP_ASSERT(0);
2749#endif
2750}
2751
2753
2754#endif // __CRYPTO__
2755
2756#endif // _ALTIVEC_
2757
2758NAMESPACE_END
2759
2760#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
2761# pragma GCC diagnostic pop
2762#endif
2763
2764#endif // CRYPTOPP_PPC_CRYPTO_H