Security Scol plugin
aria_simd.cpp
1// aria_simd.cpp - written and placed in the public domain by
2// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3//
4// This source file uses intrinsics to gain access to ARMv7a and
5// ARMv8a NEON instructions. A separate source file is needed
6// because additional CXXFLAGS are required to enable the
7// appropriate instruction sets in some build configurations.
8
9#include "pch.h"
10#include "config.h"
11#include "misc.h"
12
13#if (CRYPTOPP_SSSE3_AVAILABLE)
14# include <tmmintrin.h>
15#endif
16
17#if (CRYPTOPP_ARM_NEON_HEADER)
18# include <arm_neon.h>
19#endif
20
21#if (CRYPTOPP_ARM_ACLE_HEADER)
22# include <stdint.h>
23# include <arm_acle.h>
24#endif
25
26// Squash MS LNK4221 and libtool warnings
27extern const char ARIA_SIMD_FNAME[] = __FILE__;
28
29NAMESPACE_BEGIN(CryptoPP)
30NAMESPACE_BEGIN(ARIATab)
31
32extern const word32 S1[256];
33extern const word32 S2[256];
34extern const word32 X1[256];
35extern const word32 X2[256];
36extern const word32 KRK[3][4];
37
38NAMESPACE_END
39NAMESPACE_END
40
41ANONYMOUS_NAMESPACE_BEGIN
42
43using CryptoPP::byte;
44using CryptoPP::word32;
45
46inline byte ARIA_BRF(const word32 x, const int y) {
47 return static_cast<byte>(GETBYTE(x, y));
48}
49
50ANONYMOUS_NAMESPACE_END
51
52NAMESPACE_BEGIN(CryptoPP)
53
54using CryptoPP::ARIATab::S1;
55using CryptoPP::ARIATab::S2;
56using CryptoPP::ARIATab::X1;
57using CryptoPP::ARIATab::X2;
58using CryptoPP::ARIATab::KRK;
59
60#if (CRYPTOPP_ARM_NEON_AVAILABLE)
61
62template <unsigned int N>
63inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16])
64{
65 enum { Q1 = (4-(N/32)) % 4,
66 Q2 = (3-(N/32)) % 4,
67 R = N % 32
68 };
69
70 vst1q_u8(RK, vreinterpretq_u8_u32(
71 veorq_u32(X, veorq_u32(
72 vshrq_n_u32(vextq_u32(Y, Y, Q1), R),
73 vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R)))));
74}
75
76void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen)
77{
78 const uint32x4_t w0 = vld1q_u32(ws+ 0);
79 const uint32x4_t w1 = vld1q_u32(ws+ 8);
80 const uint32x4_t w2 = vld1q_u32(ws+12);
81 const uint32x4_t w3 = vld1q_u32(ws+16);
82
83 ARIA_GSRK_NEON<19>(w0, w1, rk + 0);
84 ARIA_GSRK_NEON<19>(w1, w2, rk + 16);
85 ARIA_GSRK_NEON<19>(w2, w3, rk + 32);
86 ARIA_GSRK_NEON<19>(w3, w0, rk + 48);
87 ARIA_GSRK_NEON<31>(w0, w1, rk + 64);
88 ARIA_GSRK_NEON<31>(w1, w2, rk + 80);
89 ARIA_GSRK_NEON<31>(w2, w3, rk + 96);
90 ARIA_GSRK_NEON<31>(w3, w0, rk + 112);
91 ARIA_GSRK_NEON<67>(w0, w1, rk + 128);
92 ARIA_GSRK_NEON<67>(w1, w2, rk + 144);
93 ARIA_GSRK_NEON<67>(w2, w3, rk + 160);
94 ARIA_GSRK_NEON<67>(w3, w0, rk + 176);
95 ARIA_GSRK_NEON<97>(w0, w1, rk + 192);
96
97 if (keylen > 16)
98 {
99 ARIA_GSRK_NEON<97>(w1, w2, rk + 208);
100 ARIA_GSRK_NEON<97>(w2, w3, rk + 224);
101
102 if (keylen > 24)
103 {
104 ARIA_GSRK_NEON< 97>(w3, w0, rk + 240);
105 ARIA_GSRK_NEON<109>(w0, w1, rk + 256);
106 }
107 }
108}
109
110void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
111{
112 outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
113 outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
114 outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
115 outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
116 outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
117 outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
118 outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
119 outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
120 outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
121 outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
122 outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
123 outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
124 outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
125 outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
126 outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
127 outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
128
129 // 'outBlock' and 'xorBlock' may be unaligned.
130 if (xorBlock != NULLPTR)
131 {
132 vst1q_u8(outBlock,
133 veorq_u8(
134 vld1q_u8(xorBlock),
135 veorq_u8(
136 vld1q_u8(outBlock),
137 vrev32q_u8(vld1q_u8((rk))))));
138 }
139 else
140 {
141 vst1q_u8(outBlock,
142 veorq_u8(
143 vld1q_u8(outBlock),
144 vrev32q_u8(vld1q_u8(rk))));
145 }
146}
147
148#endif // CRYPTOPP_ARM_NEON_AVAILABLE
149
150#if (CRYPTOPP_SSSE3_AVAILABLE)
151
152void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
153{
154 const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
155
156 outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
157 outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
158 outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
159 outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
160 outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
161 outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
162 outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
163 outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
164 outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
165 outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
166 outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
167 outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
168 outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
169 outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
170 outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
171 outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
172
173 // 'outBlock' and 'xorBlock' may be unaligned.
174 if (xorBlock != NULLPTR)
175 {
176 _mm_storeu_si128(M128_CAST(outBlock),
177 _mm_xor_si128(
178 _mm_loadu_si128(CONST_M128_CAST(xorBlock)),
179 _mm_xor_si128(
180 _mm_loadu_si128(CONST_M128_CAST(outBlock)),
181 _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)))
182 );
183 }
184 else
185 {
186 _mm_storeu_si128(M128_CAST(outBlock),
187 _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(outBlock)),
188 _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)));
189 }
190}
191
192#endif // CRYPTOPP_SSSE3_AVAILABLE
193
194NAMESPACE_END
Library configuration file.
unsigned char byte
8-bit unsigned datatype
Definition config_int.h:56
unsigned int word32
32-bit unsigned datatype
Definition config_int.h:62
Utility functions for the Crypto++ library.
Precompiled header file.