ppc_simd.h
1// ppc_simd.h - written and placed in public domain by Jeffrey Walton
2
67
68// Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
69// actual availability of the feature for the source file being compiled.
70// The preprocessor macros depend on compiler options like -maltivec,
71// and not on compiler versions.
72
73// For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
74// For XLC see the Compiler Reference manual. For Clang you have to experiment.
75// Clang does not document the compiler options, does not reject options it does
76// not understand, and pretends to be other compilers even though it cannot
77// process the builtins and intrinsics. Clang will waste hours of your time.
78
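// A minimal sketch of the detection idiom described above (illustrative,
// not part of the original header): gate code on the feature macros the
// compiler actually sets for this translation unit, never on a version:
//
// #if defined(__ALTIVEC__)
//     // vec_ld, vec_st and vec_perm are available
// #endif
// #if defined(__VSX__) || defined(_ARCH_PWR8)
//     // 64-bit vector elements (uint64x2_p below) are available
// #endif
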
79// DO NOT USE this pattern in VecLoad and VecStore. We have to use the
80// code paths guarded by preprocessor macros because XLC 12 generates
81// bad code in some places. To verify the bad code generation, test on
82// GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
83//
84// inline uint32x4_p VecLoad(const byte src[16])
85// {
86// #if defined(__VSX__) || defined(_ARCH_PWR8)
87// return (uint32x4_p) *(uint8x16_p*)((byte*)src);
88// #else
89// return VecLoad_ALTIVEC(src);
90// #endif
91// }
92
93// We should be able to perform the load using inline asm on Power7 with
94// VSX or Power8. The inline asm will avoid C undefined behavior due to
95// casting from byte* to word32*. We are safe because our byte* are
96// 16-byte aligned for Altivec. Below is the big endian load. Little
97// endian would need to follow with xxpermdi for the reversal.
98//
99// __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );
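//
// A hedged sketch of the little endian follow-up mentioned above
// (untested here; xxpermdi with DM=2 swaps the two doublewords):
//
// __asm__ ("lxvw4x %x0, %1, %2\n\txxpermdi %x0, %x0, %x0, 2"
//          : "=wa"(v) : "r"(0), "r"(src) : );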
100
101// GCC and XLC use integer math for the address (D-form or byte-offset
102// in the ISA manual). LLVM uses pointer math for the address (DS-form
103// or indexed in the ISA manual). To keep them consistent we calculate
104// the address from the offset and pass it to a load or store function
105// using a 0 offset.
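//
// This is the idiom the functions below use, taken from their VSX
// paths: fold the offset into the address up front, then load or
// store with a 0 offset so all three compilers compute the same
// effective address.
//
// const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
// return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));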
106
107#ifndef CRYPTOPP_PPC_CRYPTO_H
108#define CRYPTOPP_PPC_CRYPTO_H
109
110#include "config.h"
111#include "misc.h"
112
113#if defined(__ALTIVEC__)
114# include <altivec.h>
115# undef vector
116# undef pixel
117# undef bool
118#endif
119
120// XL C++ on AIX does not define VSX and does not
121// provide an option to set it. We have to set it
122// for the code below. This define must stay in
123// sync with the define in test_ppc_power7.cpp.
124#ifndef CRYPTOPP_DISABLE_POWER7
125# if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
126# define __VSX__ 1
127# endif
128#endif
129
130// XL C++ on AIX does not define CRYPTO and does not
131// provide an option to set it. We have to set it
132// for the code below. This define must stay in
133// sync with the define in test_ppc_power8.cpp.
134#ifndef CRYPTOPP_DISABLE_POWER8
135# if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
136# define __CRYPTO__ 1
137# endif
138#endif
139
145#define CONST_V8_CAST(x) ((unsigned char*)(x))
151#define CONST_V32_CAST(x) ((unsigned int*)(x))
157#define CONST_V64_CAST(x) ((unsigned long long*)(x))
163#define NCONST_V8_CAST(x) ((unsigned char*)(x))
169#define NCONST_V32_CAST(x) ((unsigned int*)(x))
175#define NCONST_V64_CAST(x) ((unsigned long long*)(x))
176
177// VecLoad_ALTIVEC and VecStore_ALTIVEC are
178// too noisy on modern compilers
179#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
180# pragma GCC diagnostic push
181# pragma GCC diagnostic ignored "-Wdeprecated"
182#endif
183
184NAMESPACE_BEGIN(CryptoPP)
185
186#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
187
192typedef __vector unsigned char uint8x16_p;
197typedef __vector unsigned short uint16x8_p;
202typedef __vector unsigned int uint32x4_p;
203
204#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
212typedef __vector unsigned long long uint64x2_p;
213#endif // VSX or ARCH_PWR8
214
218inline uint32x4_p VecZero()
219{
220 const uint32x4_p v = {0,0,0,0};
221 return v;
222}
223
227inline uint32x4_p VecOne()
228{
229 const uint32x4_p v = {1,1,1,1};
230 return v;
231}
232
241template <class T>
242inline T VecReverse(const T data)
243{
244#if defined(CRYPTOPP_BIG_ENDIAN)
245 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
246 return (T)vec_perm(data, data, mask);
247#else
248 const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
249 return (T)vec_perm(data, data, mask);
250#endif
251}
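// For example, VecReverse turns the byte vector {0,1,2,...,15} into
// {15,14,13,...,0} on either endian.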
252
262template <class T>
263inline T VecReverseLE(const T data)
264{
265#if defined(CRYPTOPP_LITTLE_ENDIAN)
266 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
267 return (T)vec_perm(data, data, mask);
268#else
269 return data;
270#endif
271}
272
282template <class T>
283inline T VecReverseBE(const T data)
284{
285#if defined(CRYPTOPP_BIG_ENDIAN)
286 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
287 return (T)vec_perm(data, data, mask);
288#else
289 return data;
290#endif
291}
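// A hedged usage sketch (ptr is an illustrative byte pointer): because
// VecReverseLE is a no-op on big endian systems, a byte-order-sensitive
// load can be written once for both endians. This is exactly the
// pattern VecLoadBE uses below on pre-POWER9 hardware:
//
// const uint32x4_p v = VecReverseLE(VecLoad_ALTIVEC(ptr));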
292
294
295
308inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
309{
310 // Avoid IsAlignedOn for convenience.
311 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
312 if (addr % 16 == 0)
313 {
314 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
315 }
316 else
317 {
318 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
319 const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
320 const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
321 const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
322 return (uint32x4_p)vec_perm(low, high, perm);
323 }
324}
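// The unaligned path above is the classic AltiVec technique from the
// NXP/Motorola PEM referenced in the comment: vec_lvsl builds a permute
// control vector from the misaligned address, the two aligned vec_ld
// loads bracket the data, and vec_perm extracts the 16 unaligned bytes
// from the pair.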
325
339inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
340{
341 // Avoid IsAlignedOn for convenience.
342 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
343 if (addr % 16 == 0)
344 {
345 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
346 }
347 else
348 {
349 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
350 const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
351 const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
352 const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
353 return (uint32x4_p)vec_perm(low, high, perm);
354 }
355}
356
369inline uint32x4_p VecLoad(const byte src[16])
370{
371 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
372 // word pointers. The ISA lacks loads for short* and char*.
373 // Power9/ISA 3.0 provides vec_xl for all datatypes.
374
375 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
376 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
377 CRYPTOPP_UNUSED(addr);
378
379#if defined(_ARCH_PWR9)
380 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
381#else
382 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
383#endif
384}
385
399inline uint32x4_p VecLoad(int off, const byte src[16])
400{
401 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
402 // word pointers. The ISA lacks loads for short* and char*.
403 // Power9/ISA 3.0 provides vec_xl for all datatypes.
404
405 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
406 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
407 CRYPTOPP_UNUSED(addr);
408
409#if defined(_ARCH_PWR9)
410 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
411#else
412 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
413#endif
414}
415
428inline uint32x4_p VecLoad(const word32 src[4])
429{
430 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
431 // word pointers. The ISA lacks loads for short* and char*.
432 // Power9/ISA 3.0 provides vec_xl for all datatypes.
433
434 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
435 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
436 CRYPTOPP_UNUSED(addr);
437
438#if defined(_ARCH_PWR9)
439 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
440#elif defined(__VSX__) || defined(_ARCH_PWR8)
441 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
442#else
443 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
444#endif
445}
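// A hedged usage sketch (the buffer name is illustrative). VecLoad only
// requires natural element alignment, not 16-byte alignment:
//
// word32 state[4] = {1,2,3,4};
// const uint32x4_p v = VecLoad(state);   // load 4 words
// VecStore(VecAdd(v, VecOne()), state);  // store {2,3,4,5}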
446
460inline uint32x4_p VecLoad(int off, const word32 src[4])
461{
462 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
463 // word pointers. The ISA lacks loads for short* and char*.
464 // Power9/ISA 3.0 provides vec_xl for all datatypes.
465
466 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
467 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
468 CRYPTOPP_UNUSED(addr);
469
470#if defined(_ARCH_PWR9)
471 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
472#elif defined(__VSX__) || defined(_ARCH_PWR8)
473 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
474#else
475 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
476#endif
477}
478
479#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
480
494inline uint64x2_p VecLoad(const word64 src[2])
495{
496 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
497 // word pointers. The ISA lacks loads for short* and char*.
498 // Power9/ISA 3.0 provides vec_xl for all datatypes.
499
500 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
501 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
502 CRYPTOPP_UNUSED(addr);
503
504#if defined(_ARCH_PWR9)
505 return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
506#elif defined(__VSX__) || defined(_ARCH_PWR8)
507 // The 32-bit cast is not a typo. Compiler workaround.
508 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
509#else
510 return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
511#endif
512}
513
528inline uint64x2_p VecLoad(int off, const word64 src[2])
529{
530 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
531 // word pointers. The ISA lacks loads for short* and char*.
532 // Power9/ISA 3.0 provides vec_xl for all datatypes.
533
534 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
535 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
536 CRYPTOPP_UNUSED(addr);
537
538#if defined(_ARCH_PWR9)
539 return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
540#elif defined(__VSX__) || defined(_ARCH_PWR8)
541 // The 32-bit cast is not a typo. Compiler workaround.
542 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
543#else
544 return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
545#endif
546}
547
548#endif // VSX or ARCH_PWR8
549
560inline uint32x4_p VecLoadAligned(const byte src[16])
561{
562 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
563 // word pointers. The ISA lacks loads for short* and char*.
564 // Power9/ISA 3.0 provides vec_xl for all datatypes.
565
566 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
567 CRYPTOPP_ASSERT(addr % 16 == 0);
568 CRYPTOPP_UNUSED(addr);
569
570#if defined(_ARCH_PWR9)
571 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
572#else
573 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
574#endif
575}
576
588inline uint32x4_p VecLoadAligned(int off, const byte src[16])
589{
590 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
591 // word pointers. The ISA lacks loads for short* and char*.
592 // Power9/ISA 3.0 provides vec_xl for all datatypes.
593
594 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
595 CRYPTOPP_ASSERT(addr % 16 == 0);
596 CRYPTOPP_UNUSED(addr);
597
598#if defined(_ARCH_PWR9)
599 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
600#else
601 return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
602#endif
603}
604
615inline uint32x4_p VecLoadAligned(const word32 src[4])
616{
617 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
618 // word pointers. The ISA lacks loads for short* and char*.
619 // Power9/ISA 3.0 provides vec_xl for all datatypes.
620
621 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
622 CRYPTOPP_ASSERT(addr % 16 == 0);
623 CRYPTOPP_UNUSED(addr);
624
625#if defined(_ARCH_PWR9)
626 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
627#elif defined(__VSX__) || defined(_ARCH_PWR8)
628 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
629#else
630 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
631#endif
632}
633
645inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
646{
647 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
648 // word pointers. The ISA lacks loads for short* and char*.
649 // Power9/ISA 3.0 provides vec_xl for all datatypes.
650
651 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
652 CRYPTOPP_ASSERT(addr % 16 == 0);
653 CRYPTOPP_UNUSED(addr);
654
655#if defined(_ARCH_PWR9)
656 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
657#elif defined(__VSX__) || defined(_ARCH_PWR8)
658 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
659#else
660 return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
661#endif
662}
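// A hedged usage sketch: VecLoadAligned requires a 16-byte aligned
// effective address (asserted above in debug builds). The alignment
// macro is assumed to come from config.h:
//
// CRYPTOPP_ALIGN_DATA(16) byte buf[16];
// const uint32x4_p v = VecLoadAligned(buf);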
663
664#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
665
676inline uint64x2_p VecLoadAligned(const word64 src[2])
677{
678 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
679 // word pointers. The ISA lacks loads for short* and char*.
680 // Power9/ISA 3.0 provides vec_xl for all datatypes.
681
682 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
683 CRYPTOPP_ASSERT(addr % 16 == 0);
684 CRYPTOPP_UNUSED(addr);
685
686#if defined(_ARCH_PWR9)
687 return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
688#elif defined(__VSX__) || defined(_ARCH_PWR8)
689 // The 32-bit cast is not a typo. Compiler workaround.
690 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
691#else
692 return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
693#endif
694}
695
707inline uint64x2_p VecLoadAligned(int off, const word64 src[2])
708{
709 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
710 // word pointers. The ISA lacks loads for short* and char*.
711 // Power9/ISA 3.0 provides vec_xl for all datatypes.
712
713 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
714 CRYPTOPP_ASSERT(addr % 16 == 0);
715 CRYPTOPP_UNUSED(addr);
716
717#if defined(_ARCH_PWR9)
718 return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
719#elif defined(__VSX__) || defined(_ARCH_PWR8)
720 // The 32-bit cast is not a typo. Compiler workaround.
721 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
722#else
723 return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
724#endif
725}
726
727#endif
728
742inline uint32x4_p VecLoadBE(const byte src[16])
743{
744 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
745 // word pointers. The ISA lacks loads for short* and char*.
746 // Power9/ISA 3.0 provides vec_xl for all datatypes.
747
748 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
749 // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
750 CRYPTOPP_UNUSED(addr);
751
752#if defined(_ARCH_PWR9)
753 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
754 return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
755#elif defined(CRYPTOPP_BIG_ENDIAN)
756 return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
757#else
758 return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(src)));
759#endif
760}
761
776inline uint32x4_p VecLoadBE(int off, const byte src[16])
777{
778 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
779 // word pointers. The ISA lacks loads for short* and char*.
780 // Power9/ISA 3.0 provides vec_xl for all datatypes.
781
782 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
783 // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
784 CRYPTOPP_UNUSED(addr);
785
786#if defined(_ARCH_PWR9)
787 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
788 return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
789#elif defined(CRYPTOPP_BIG_ENDIAN)
790 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
791#else
792 return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(addr)));
793#endif
794}
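// A hedged usage sketch: VecLoadBE suits algorithms specified in big
// endian byte order, for example loading 16 bytes of a SHA message
// block:
//
// const byte block[16] = { /* big endian message bytes */ };
// const uint32x4_p w = VecLoadBE(block);  // words in big endian order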
795
797
799
800
816template<class T>
817inline void VecStore_ALTIVEC(const T data, byte dest[16])
818{
819 // Avoid IsAlignedOn for convenience.
820 uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
821 if (addr % 16 == 0)
822 {
823 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
824 }
825 else
826 {
827 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
828 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
829 vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
830 vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
831 vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
832 vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
833 vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
834 vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
835 vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
836 vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
837 }
838}
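// The unaligned path above is the companion store technique from the
// same PEM: vec_lvsr rotates the data into position, then the vec_ste
// sequence writes single bytes, halfwords and words that together
// cover all 16 bytes without touching the adjacent memory an aligned
// vec_st would clobber.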
839
856template<class T>
857inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
858{
859 // Avoid IsAlignedOn for convenience.
860 uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
861 if (addr % 16 == 0)
862 {
863 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
864 }
865 else
866 {
867 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
868 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
869 vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
870 vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
871 vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
872 vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
873 vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
874 vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
875 vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
876 vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
877 }
878}
879
894template<class T>
895inline void VecStore(const T data, byte dest[16])
896{
897 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
898 // word pointers. The ISA lacks loads for short* and char*.
899 // Power9/ISA 3.0 provides vec_xl for all datatypes.
900
901 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
902 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
903 CRYPTOPP_UNUSED(addr);
904
905#if defined(_ARCH_PWR9)
906 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
907#else
908 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(dest));
909#endif
910}
911
927template<class T>
928inline void VecStore(const T data, int off, byte dest[16])
929{
930 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
931 // word pointers. The ISA lacks loads for short* and char*.
932 // Power9/ISA 3.0 provides vec_xl for all datatypes.
933
934 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
935 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
936 CRYPTOPP_UNUSED(addr);
937
938#if defined(_ARCH_PWR9)
939 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
940#else
941 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
942#endif
943}
944
959template<class T>
960inline void VecStore(const T data, word32 dest[4])
961{
962 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
963 // word pointers. The ISA lacks stores for short* and char*.
964 // Power9/ISA 3.0 provides vec_xst for all datatypes.
965
966 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
967 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
968 CRYPTOPP_UNUSED(addr);
969
970#if defined(_ARCH_PWR9)
971 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
972#elif defined(__VSX__) || defined(_ARCH_PWR8)
973 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
974#else
975 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
976#endif
977}
978
994template<class T>
995inline void VecStore(const T data, int off, word32 dest[4])
996{
997 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
998 // word pointers. The ISA lacks stores for short* and char*.
999 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1000
1001 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1002 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1003 CRYPTOPP_UNUSED(addr);
1004
1005#if defined(_ARCH_PWR9)
1006 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1007#elif defined(__VSX__) || defined(_ARCH_PWR8)
1008 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1009#else
1010 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1011#endif
1012}
1013
1029template<class T>
1030inline void VecStore(const T data, word64 dest[2])
1031{
1032 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1033 // word pointers. The ISA lacks stores for short* and char*.
1034 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1035
1036 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1037 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1038 CRYPTOPP_UNUSED(addr);
1039
1040#if defined(_ARCH_PWR9)
1041 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1042#elif defined(__VSX__) || defined(_ARCH_PWR8)
1043 // 32-bit cast is not a typo. Compiler workaround.
1044 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1045#else
1046 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1047#endif
1048}
1049
1066template<class T>
1067inline void VecStore(const T data, int off, word64 dest[2])
1068{
1069 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1070 // word pointers. The ISA lacks stores for short* and char*.
1071 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1072
1073 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1074 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1075 CRYPTOPP_UNUSED(addr);
1076
1077#if defined(_ARCH_PWR9)
1078 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1079#elif defined(__VSX__) || defined(_ARCH_PWR8)
1080 // 32-bit cast is not a typo. Compiler workaround.
1081 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1082#else
1083 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1084#endif
1085}
1086
1099template<class T>
1100inline void VecStoreAligned(const T data, byte dest[16])
1101{
1102 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1103 // word pointers. The ISA lacks loads for short* and char*.
1104 // Power9/ISA 3.0 provides vec_xl for all datatypes.
1105
1106 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1107 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1108 CRYPTOPP_UNUSED(addr);
1109
1110#if defined(_ARCH_PWR9)
1111 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1112#else
1113 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1114#endif
1115}
1116
1130template<class T>
1131inline void VecStoreAligned(const T data, int off, byte dest[16])
1132{
1133 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1134 // word pointers. The ISA lacks loads for short* and char*.
1135 // Power9/ISA 3.0 provides vec_xl for all datatypes.
1136
1137 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1138 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1139 CRYPTOPP_UNUSED(addr);
1140
1141#if defined(_ARCH_PWR9)
1142 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1143#else
1144 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1145#endif
1146}
1147
1161template<class T>
1162inline void VecStoreAligned(const T data, word32 dest[4])
1163{
1164 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1165 // word pointers. The ISA lacks stores for short* and char*.
1166 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1167
1168 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1169 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1170 CRYPTOPP_UNUSED(addr);
1171
1172#if defined(_ARCH_PWR9)
1173 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1174#elif defined(__VSX__) || defined(_ARCH_PWR8)
1175 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1176#else
1177 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1178#endif
1179}
1180
1195template<class T>
1196inline void VecStoreAligned(const T data, int off, word32 dest[4])
1197{
1198 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1199 // word pointers. The ISA lacks stores for short* and char*.
1200 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1201
1202 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1203 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1204 CRYPTOPP_UNUSED(addr);
1205
1206#if defined(_ARCH_PWR9)
1207 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1208#elif defined(__VSX__) || defined(_ARCH_PWR8)
1209 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1210#else
1211 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1212#endif
1213}
1214
1230template <class T>
1231inline void VecStoreBE(const T data, byte dest[16])
1232{
1233 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1234 // word pointers. The ISA lacks stores for short* and char*.
1235 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1236
1237 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1238 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1239 CRYPTOPP_UNUSED(addr);
1240
1241#if defined(_ARCH_PWR9)
1242 vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1243#elif defined(CRYPTOPP_BIG_ENDIAN)
1244 VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1245#else
1246 VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1247#endif
1248}
1249
1266template <class T>
1267inline void VecStoreBE(const T data, int off, byte dest[16])
1268{
1269 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1270 // word pointers. The ISA lacks stores for short* and char*.
1271 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1272
1273 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1274 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1275 CRYPTOPP_UNUSED(addr);
1276
1277#if defined(_ARCH_PWR9)
1278 vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1279#elif defined(CRYPTOPP_BIG_ENDIAN)
1280 VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1281#else
1282 VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1283#endif
1284}
1285
1301template <class T>
1302inline void VecStoreBE(const T data, word32 dest[4])
1303{
1304 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1305 // word pointers. The ISA lacks stores for short* and char*.
1306 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1307
1308 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1309 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1310 CRYPTOPP_UNUSED(addr);
1311
1312#if defined(_ARCH_PWR9)
1313 vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1314#elif defined(CRYPTOPP_BIG_ENDIAN)
1315 VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1316#else
1317 VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1318#endif
1319}
1320
1337template <class T>
1338inline void VecStoreBE(const T data, int off, word32 dest[4])
1339{
1340 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1341 // word pointers. The ISA lacks stores for short* and char*.
1342 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1343
1344 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1345 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1346 CRYPTOPP_UNUSED(addr);
1347
1348#if defined(_ARCH_PWR9)
1349 vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1350#elif defined(CRYPTOPP_BIG_ENDIAN)
1351 VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1352#else
1353 VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1354#endif
1355}
1356
1358
1360
1361
1375template <class T1, class T2>
1376inline T1 VecAnd(const T1 vec1, const T2 vec2)
1377{
1378 return (T1)vec_and(vec1, (T1)vec2);
1379}
1380
1394template <class T1, class T2>
1395inline T1 VecOr(const T1 vec1, const T2 vec2)
1396{
1397 return (T1)vec_or(vec1, (T1)vec2);
1398}
1399
1413template <class T1, class T2>
1414inline T1 VecXor(const T1 vec1, const T2 vec2)
1415{
1416 return (T1)vec_xor(vec1, (T1)vec2);
1417}
1418
1420
1422
1423
1437template <class T1, class T2>
1438inline T1 VecAdd(const T1 vec1, const T2 vec2)
1439{
1440 return (T1)vec_add(vec1, (T1)vec2);
1441}
1442
1455template <class T1, class T2>
1456inline T1 VecSub(const T1 vec1, const T2 vec2)
1457{
1458 return (T1)vec_sub(vec1, (T1)vec2);
1459}
1460
1462
1464
1465
1477template <class T1, class T2>
1478inline T1 VecPermute(const T1 vec, const T2 mask)
1479{
1480 return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1481}
1482
1495template <class T1, class T2>
1496inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1497{
1498 return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1499}
1500
1502
1504
1505
1527template <unsigned int C, class T>
1528inline T VecShiftLeftOctet(const T vec)
1529{
1530 const T zero = {0};
1531 if (C >= 16)
1532 {
1533 // Out of range
1534 return zero;
1535 }
1536 else if (C == 0)
1537 {
1538 // Noop
1539 return vec;
1540 }
1541 else
1542 {
1543#if defined(CRYPTOPP_BIG_ENDIAN)
1544 enum { R=C&0xf };
1545 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1546#else
1547 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1548 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1549#endif
1550 }
1551}
1552
1574template <unsigned int C, class T>
1575inline T VecShiftRightOctet(const T vec)
1576{
1577 const T zero = {0};
1578 if (C >= 16)
1579 {
1580 // Out of range
1581 return zero;
1582 }
1583 else if (C == 0)
1584 {
1585 // Noop
1586 return vec;
1587 }
1588 else
1589 {
1590#if defined(CRYPTOPP_BIG_ENDIAN)
1591 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1592 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1593#else
1594 enum { R=C&0xf };
1595 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1596#endif
1597 }
1598}
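// Worked example for the octet shifts, shown on the byte vector
// v = {0,1,2,...,15}:
//
// VecShiftLeftOctet<3>(v);   // {3,4,...,15, 0,0,0}
// VecShiftRightOctet<3>(v);  // {0,0,0, 0,1,...,12}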
1599
1613template <unsigned int C, class T>
1614inline T VecRotateLeftOctet(const T vec)
1615{
1616#if defined(CRYPTOPP_BIG_ENDIAN)
1617 enum { R = C&0xf };
1618 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1619#else
1620 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1621 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1622#endif
1623}
1624
1638template <unsigned int C, class T>
1639inline T VecRotateRightOctet(const T vec)
1640{
1641#if defined(CRYPTOPP_BIG_ENDIAN)
1642 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1643 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1644#else
1645 enum { R = C&0xf };
1646 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1647#endif
1648}
1649
1659template<unsigned int C>
1660inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
1661{
1662 const uint32x4_p m = {C, C, C, C};
1663 return vec_rl(vec, m);
1664}
1665
1675template<unsigned int C>
1676inline uint32x4_p VecRotateRight(const uint32x4_p vec)
1677{
1678 const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1679 return vec_rl(vec, m);
1680}
1681
1691template<unsigned int C>
1692inline uint32x4_p VecShiftLeft(const uint32x4_p vec)
1693{
1694 const uint32x4_p m = {C, C, C, C};
1695 return vec_sl(vec, m);
1696}
1697
1707template<unsigned int C>
1708inline uint32x4_p VecShiftRight(const uint32x4_p vec)
1709{
1710 const uint32x4_p m = {C, C, C, C};
1711 return vec_sr(vec, m);
1712}
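// Note the difference from the octet functions above: VecRotateLeft
// and VecShiftLeft operate bitwise within each 32-bit element, e.g.
// VecRotateLeft<8>(v) rotates every element of v left by 8 bits.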
1713
1714// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
1715#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1716
1728template<unsigned int C>
1729inline uint64x2_p VecRotateLeft(const uint64x2_p vec)
1730{
1731 const uint64x2_p m = {C, C};
1732 return vec_rl(vec, m);
1733}
1734
1746template<unsigned int C>
1747inline uint64x2_p VecShiftLeft(const uint64x2_p vec)
1748{
1749 const uint64x2_p m = {C, C};
1750 return vec_sl(vec, m);
1751}
1752
1764template<unsigned int C>
1765inline uint64x2_p VecRotateRight(const uint64x2_p vec)
1766{
1767 const uint64x2_p m = {64-C, 64-C};
1768 return vec_rl(vec, m);
1769}
1770
1782template<unsigned int C>
1783inline uint64x2_p VecShiftRight(const uint64x2_p vec)
1784{
1785 const uint64x2_p m = {C, C};
1786 return vec_sr(vec, m);
1787}
1788
1789#endif // ARCH_PWR8
1790
1792
1794
1795
1804template <class T>
1805inline T VecMergeLow(const T vec1, const T vec2)
1806{
1807 return vec_mergel(vec1, vec2);
1808}
1809
1818template <class T>
1819inline T VecMergeHigh(const T vec1, const T vec2)
1820{
1821 return vec_mergeh(vec1, vec2);
1822}
1823
1830inline uint32x4_p VecSplatWord(word32 val)
1831{
1832 // Fix spurious GCC warning???
1833 CRYPTOPP_UNUSED(val);
1834
1835 // Apple Altivec and XL C++ do not offer vec_splats.
1836 // GCC offers vec_splats back to -mcpu=power4.
1837#if defined(_ARCH_PWR4) && defined(__GNUC__)
1838 return vec_splats(val);
1839#else
1840 //const word32 x[4] = {val,val,val,val};
1841 //return VecLoad(x);
1842 const word32 x[4] = {val};
1843 return vec_splat(VecLoad(x),0);
1844#endif
1845}
1846
1854template <unsigned int N>
1855inline uint32x4_p VecSplatElement(const uint32x4_p val)
1856{
1857 return vec_splat(val, N);
1858}
1859
1860#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1867inline uint64x2_p VecSplatWord(word64 val)
1868{
1869 // The PPC64 ABI says so.
1870 return vec_splats((unsigned long long)val);
1871}
1872
1880template <unsigned int N>
1881inline uint64x2_p VecSplatElement(const uint64x2_p val)
1882{
1883#if defined(__VSX__) || defined(_ARCH_PWR8)
1884 return vec_splat(val, N);
1885#else
1886 enum {E=N&1};
1887 if (E == 0)
1888 {
1889 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
1890 return vec_perm(val, val, m);
1891 }
1892 else // (E == 1)
1893 {
1894 const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
1895 return vec_perm(val, val, m);
1896 }
1897#endif
1898}
1899#endif
1900
1912template <class T>
1913inline T VecGetLow(const T val)
1914{
1915#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1916 const T zero = {0};
1917 return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1918#else
1919 return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1920#endif
1921}
1922
1934template <class T>
1935inline T VecGetHigh(const T val)
1936{
1937#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1938 const T zero = {0};
1939 return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1940#else
1941 return VecShiftRightOctet<8>(val);
1942#endif
1943}
1944
1952template <class T>
1953inline T VecSwapWords(const T vec)
1954{
1955 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1956}
1957
1959
1961
1962
1974template <class T1, class T2>
1975inline bool VecEqual(const T1 vec1, const T2 vec2)
1976{
1977 return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1978}
1979
1991template <class T1, class T2>
1992inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1993{
1994 return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1995}
1996
1998
2000
2002
2003
2014inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2015{
2016 // 64-bit elements available at POWER7 with VSX, but vaddudm requires POWER8
2017#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2018 return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
2019#else
2020 // The carry mask selects carries for elements 1 and 3 and sets
2021 // remaining elements to 0. The result is then shifted so the
2022 // carried values are added to elements 0 and 2.
2023#if defined(CRYPTOPP_BIG_ENDIAN)
2024 const uint32x4_p zero = {0, 0, 0, 0};
2025 const uint32x4_p mask = {0, 1, 0, 1};
2026#else
2027 const uint32x4_p zero = {0, 0, 0, 0};
2028 const uint32x4_p mask = {1, 0, 1, 0};
2029#endif
2030
2031 uint32x4_p cy = vec_addc(vec1, vec2);
2032 uint32x4_p res = vec_add(vec1, vec2);
2033 cy = vec_and(mask, cy);
2034 cy = vec_sld (cy, zero, 4);
2035 return vec_add(res, cy);
2036#endif
2037}
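// Worked example of the carry fix above, for one 64-bit element:
// 0x00000000FFFFFFFF + 1 overflows the low 32-bit lane. vec_addc
// captures that carry, the mask keeps only the low-lane carries, and
// the 4-byte vec_sld moves each carry onto its partner lane so the
// final vec_add yields 0x0000000100000000.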
2038
2039#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2050inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2051{
2052 // 64-bit elements available at POWER7 with VSX, but vaddudm requires POWER8
2053 const uint64x2_p res = vec_add(vec1, vec2);
2054
2055#if defined(CRYPTOPP_DEBUG)
2056 // Test 32-bit add in debug builds while we are here.
2057 const uint32x4_p x = (uint32x4_p)vec1;
2058 const uint32x4_p y = (uint32x4_p)vec2;
2059 const uint32x4_p r = VecAdd64(x, y);
2060
2061 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2062#endif
2063
2064 return res;
2065}
2066#endif
2067
2077inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2078{
2079#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2080 // 64-bit elements available at POWER7 with VSX, but vsubudm requires POWER8
2081 return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
2082#else
2083 // The borrow mask selects borrows for elements 1 and 3 and sets
2084 // remaining elements to 0. The result is then shifted so the
2085 // borrowed values are subtracted from elements 0 and 2.
2086#if defined(CRYPTOPP_BIG_ENDIAN)
2087 const uint32x4_p zero = {0, 0, 0, 0};
2088 const uint32x4_p mask = {0, 1, 0, 1};
2089#else
2090 const uint32x4_p zero = {0, 0, 0, 0};
2091 const uint32x4_p mask = {1, 0, 1, 0};
2092#endif
2093
2094 // subc sets the complement of borrow, so we have to
2095 // un-complement it using andc.
2096 uint32x4_p bw = vec_subc(vec1, vec2);
2097 uint32x4_p res = vec_sub(vec1, vec2);
2098 bw = vec_andc(mask, bw);
2099 bw = vec_sld (bw, zero, 4);
2100 return vec_sub(res, bw);
2101#endif
2102}
2103
2104#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2114inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2115{
2116 // 64-bit elements available at POWER7 with VSX, but vsubudm requires POWER8
2117 const uint64x2_p res = vec_sub(vec1, vec2);
2118
2119#if defined(CRYPTOPP_DEBUG)
2120 // Test 32-bit sub in debug builds while we are here.
2121 const uint32x4_p x = (uint32x4_p)vec1;
2122 const uint32x4_p y = (uint32x4_p)vec2;
2123 const uint32x4_p r = VecSub64(x, y);
2124
2125 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2126#endif
2127
2128 return res;
2129}
2130#endif
2131
2141template<unsigned int C>
2142inline uint32x4_p VecRotateLeft64(const uint32x4_p vec)
2143{
2144#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2145 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2146 return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
2147#else
2148 // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2149 enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2150
2151 // Get the low bits, shift them to high bits
2152 uint32x4_p t1 = VecShiftLeft<S32>(vec);
2153 // Get the high bits, shift them to low bits
2154 uint32x4_p t2 = VecShiftRight<32-S32>(vec);
2155
2156 if (S64 == 0)
2157 {
2158 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2159 return VecPermute(vec, m);
2160 }
2161 else if (S64 == 32)
2162 {
2163 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2164 return VecPermute(vec, m);
2165 }
2166 else if (BR) // Big rotate amount?
2167 {
2168 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2169 t1 = VecPermute(t1, m);
2170 }
2171 else
2172 {
2173 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2174 t2 = VecPermute(t2, m);
2175 }
2176
2177 return vec_or(t1, t2);
2178#endif
2179}
2180
2190template<>
2191inline uint32x4_p VecRotateLeft64<8>(const uint32x4_p vec)
2192{
2193#if defined(CRYPTOPP_BIG_ENDIAN)
2194 const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2195 return VecPermute(vec, m);
2196#else
2197 const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2198 return VecPermute(vec, m);
2199#endif
2200}
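// The specialization works because rotating a 64-bit element left by
// 8 bits is the same as rotating each 8-byte half of the vector by one
// byte, which a single vec_perm performs.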
2201
2202#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2212template<unsigned int C>
2213inline uint64x2_p VecRotateLeft64(const uint64x2_p vec)
2214{
2215 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2216 const uint64x2_p res = VecRotateLeft<C>(vec);
2217
2218#if defined(CRYPTOPP_DEBUG)
2219 // Test 32-bit rotate in debug builds while we are here.
2220 const uint32x4_p x = (uint32x4_p)vec;
2221 const uint32x4_p r = VecRotateLeft64<C>(x);
2222
2223 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2224#endif
2225
2226 return res;
2227}
2228#endif
2229
2239template<unsigned int C>
2240inline uint32x4_p VecRotateRight64(const uint32x4_p vec)
2241{
2242#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2243 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2244 return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
2245#else
2246 // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2247 enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2248
2249 // Get the low bits, shift them to high bits
2250 uint32x4_p t1 = VecShiftRight<S32>(vec);
2251 // Get the high bits, shift them to low bits
2252 uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
2253
2254 if (S64 == 0)
2255 {
2256 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2257 return VecPermute(vec, m);
2258 }
2259 else if (S64 == 32)
2260 {
2261 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2262 return VecPermute(vec, m);
2263 }
2264 else if (BR) // Big rotate amount?
2265 {
2266 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2267 t1 = VecPermute(t1, m);
2268 }
2269 else
2270 {
2271 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2272 t2 = VecPermute(t2, m);
2273 }
2274
2275 return vec_or(t1, t2);
2276#endif
2277}
2278
2289template<>
2290inline uint32x4_p VecRotateRight64<8>(const uint32x4_p vec)
2291{
2292#if defined(CRYPTOPP_BIG_ENDIAN)
2293 const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2294 return VecPermute(vec, m);
2295#else
2296 const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2297 return VecPermute(vec, m);
2298#endif
2299}
2300
2301#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2311template<unsigned int C>
2312inline uint64x2_p VecRotateRight64(const uint64x2_p vec)
2313{
2314 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2315 const uint64x2_p res = VecRotateRight<C>(vec);
2316
2317#if defined(CRYPTOPP_DEBUG)
2318 // Test 32-bit rotate in debug builds while we are here.
2319 const uint32x4_p x = (uint32x4_p)vec;
2320 const uint32x4_p r = VecRotateRight64<C>(x);
2321
2322 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2323#endif
2324
2325 return res;
2326}
2327#endif
2328
2342template <class T1, class T2>
2343inline T1 VecAnd64(const T1 vec1, const T2 vec2)
2344{
2345 return (T1)vec_and(vec1, (T1)vec2);
2346}
2347
2361template <class T1, class T2>
2362inline T1 VecOr64(const T1 vec1, const T2 vec2)
2363{
2364 return (T1)vec_or(vec1, (T1)vec2);
2365}
2366
2380template <class T1, class T2>
2381inline T1 VecXor64(const T1 vec1, const T2 vec2)
2382{
2383 return (T1)vec_xor(vec1, (T1)vec2);
2384}
2385
2392inline uint32x4_p VecSplatWord64(word64 val)
2393{
2394#if defined(_ARCH_PWR8)
2395 // The PPC64 ABI says so.
2396 return (uint32x4_p)vec_splats((unsigned long long)val);
2397#else
2398 const word64 x[2] = {val,val};
2399 return (uint32x4_p)VecLoad((const word32*)x);
2400#endif
2401}
2402
2410template <unsigned int N>
2411inline uint32x4_p VecSplatElement64(const uint32x4_p val)
2412{
2413#if defined(__VSX__) || defined(_ARCH_PWR8)
2414 return (uint32x4_p)vec_splat((uint64x2_p)val, N);
2415#else
2416 enum {E=N&1};
2417 if (E == 0)
2418 {
2419 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
2420 return (uint32x4_p)vec_perm(val, val, m);
2421 }
2422 else // (E == 1)
2423 {
2424 const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
2425 return (uint32x4_p)vec_perm(val, val, m);
2426 }
2427#endif
2428}
2429
2430#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2436template <unsigned int N>
2437inline uint64x2_p VecSplatElement64(const uint64x2_p val)
2438{
2439 return vec_splat(val, N);
2440}
2441#endif
2442
2444
2446
2447// __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
2448// when it is not available, like with Power7. Sigh...
2449#if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2450
2452
2453
2468inline uint32x4_p VecPolyMultiply(const uint32x4_p& a, const uint32x4_p& b)
2469{
2470#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2471 return __vpmsumw (a, b);
2472#elif defined(__clang__)
2473 return __builtin_altivec_crypto_vpmsumw (a, b);
2474#else
2475 return __builtin_crypto_vpmsumw (a, b);
2476#endif
2477}
2478
2493inline uint64x2_p VecPolyMultiply(const uint64x2_p& a, const uint64x2_p& b)
2494{
2495#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2496 return __vpmsumd (a, b);
2497#elif defined(__clang__)
2498 return __builtin_altivec_crypto_vpmsumd (a, b);
2499#else
2500 return __builtin_crypto_vpmsumd (a, b);
2501#endif
2502}
2503
2517inline uint64x2_p VecIntelMultiply00(const uint64x2_p& a, const uint64x2_p& b)
2518{
2519#if defined(CRYPTOPP_BIG_ENDIAN)
2520 return VecSwapWords(VecPolyMultiply(VecGetHigh(a), VecGetHigh(b)));
2521#else
2522 return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
2523#endif
2524}
2525
2539inline uint64x2_p VecIntelMultiply01(const uint64x2_p& a, const uint64x2_p& b)
2540{
2541#if defined(CRYPTOPP_BIG_ENDIAN)
2542 return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
2543#else
2544 return VecPolyMultiply(a, VecGetHigh(b));
2545#endif
2546}
2547
2561inline uint64x2_p VecIntelMultiply10(const uint64x2_p& a, const uint64x2_p& b)
2562{
2563#if defined(CRYPTOPP_BIG_ENDIAN)
2564 return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
2565#else
2566 return VecPolyMultiply(VecGetHigh(a), b);
2567#endif
2568}
2569
2583inline uint64x2_p VecIntelMultiply11(const uint64x2_p& a, const uint64x2_p& b)
2584{
2585#if defined(CRYPTOPP_BIG_ENDIAN)
2586 return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
2587#else
2588 return VecPolyMultiply(VecGetLow(a), b);
2589#endif
2590}
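// A hedged porting note: the VecIntelMultiply00/01/10/11 selectors are
// meant to line up with Intel's _mm_clmulepi64_si128 imm8 values 0x00,
// 0x01, 0x10 and 0x11, so GHASH-style code can be translated
// mechanically.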
2591
2593
2595
2596
2608template <class T1, class T2>
2609inline T1 VecEncrypt(const T1 state, const T2 key)
2610{
2611#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2612 return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
2613#elif defined(__clang__)
2614 return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2615#elif defined(__GNUC__)
2616 return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2617#else
2618 CRYPTOPP_ASSERT(0);
2619#endif
2620}
2621
2633template <class T1, class T2>
2634inline T1 VecEncryptLast(const T1 state, const T2 key)
2635{
2636#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2637 return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
2638#elif defined(__clang__)
2639 return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2640#elif defined(__GNUC__)
2641 return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2642#else
2643 CRYPTOPP_ASSERT(0);
2644#endif
2645}
2646
2658template <class T1, class T2>
2659inline T1 VecDecrypt(const T1 state, const T2 key)
2660{
2661#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2662 return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
2663#elif defined(__clang__)
2664 return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2665#elif defined(__GNUC__)
2666 return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2667#else
2668 CRYPTOPP_ASSERT(0);
2669#endif
2670}
2671
2683template <class T1, class T2>
2684inline T1 VecDecryptLast(const T1 state, const T2 key)
2685{
2686#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2687 return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
2688#elif defined(__clang__)
2689 return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2690#elif defined(__GNUC__)
2691 return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2692#else
2693 CRYPTOPP_ASSERT(0);
2694#endif
2695}
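// A hedged sketch of an AES-128 encryption flow built from these
// wrappers (ptxt and subkeys are illustrative; subkeys holds the 11
// round keys):
//
// uint8x16_p block = VecLoad(ptxt);
// block = VecXor(block, subkeys[0]);        // initial whitening
// for (unsigned int i=1; i<10; ++i)
//     block = VecEncrypt(block, subkeys[i]);
// block = VecEncryptLast(block, subkeys[10]);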
2696
2698
2700
2701
2713template <int func, int fmask, class T>
2714inline T VecSHA256(const T data)
2715{
2716#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2717 return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
2718#elif defined(__clang__)
2719 return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2720#elif defined(__GNUC__)
2721 return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2722#else
2723 CRYPTOPP_ASSERT(0);
2724#endif
2725}
2726
2738template <int func, int fmask, class T>
2739inline T VecSHA512(const T data)
2740{
2741#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2742 return (T)__vshasigmad((uint64x2_p)data, func, fmask);
2743#elif defined(__clang__)
2744 return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2745#elif defined(__GNUC__)
2746 return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2747#else
2748 CRYPTOPP_ASSERT(0);
2749#endif
2750}
2751
2753
2754#endif // __CRYPTO__
2755
2756#endif // _ALTIVEC_
2757
2758NAMESPACE_END
2759
2760#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
2761# pragma GCC diagnostic pop
2762#endif
2763
2764#endif // CRYPTOPP_PPC_CRYPTO_H