#ifdef CRYPTOPP_X64_MASM_AVAILABLE
    Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
#if CRYPTOPP_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
Salsa20_OperateKeystream PROC FRAME
    mov r10, [rsp + 5*8]    ; state
    alloc_stack(10*16 + 32*16 + 8)
    save_xmm128 xmm6, 0200h
    save_xmm128 xmm7, 0210h
    save_xmm128 xmm8, 0220h
    save_xmm128 xmm9, 0230h
    save_xmm128 xmm10, 0240h
    save_xmm128 xmm11, 0250h
    save_xmm128 xmm12, 0260h
    save_xmm128 xmm13, 0270h
    save_xmm128 xmm14, 0280h
    save_xmm128 xmm15, 0290h

    #define REG_output rcx
    #define REG_input rdx
    #define REG_iterationCount r8
    #define REG_state r10
    #define REG_rounds r9d
    #define REG_roundsLeft eax
    #define REG_temp32 r11d
    #define SSE2_WORKSPACE rsp
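    // Win64 calling convention: the first four integer arguments (output,
    // input, iterationCount, rounds) arrive in rcx, rdx, r8 and r9; the
    // fifth argument, the state pointer, is read from the stack into r10
    // by the prologue above.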
    #if CRYPTOPP_BOOL_X64
        #define REG_output %1
        #define REG_input %0
        #define REG_iterationCount %2
        #define REG_state %4
        #define REG_rounds %3
        #define REG_roundsLeft eax
        #define REG_temp32 edx
        #define SSE2_WORKSPACE %5
        CRYPTOPP_ALIGN_DATA(16) byte workspace[16*32];
    #else
        #define REG_output edi
        #define REG_input eax
        #define REG_iterationCount ecx
        #define REG_state esi
        #define REG_rounds edx
        #define REG_roundsLeft ebx
        #define REG_temp32 ebp
        #define SSE2_WORKSPACE esp + WORD_SZ
    #endif
    void *s = m_state.data();

    AS2( mov REG_iterationCount, iterationCount)
    AS2( mov REG_input, input)
    AS2( mov REG_output, output)
    AS2( mov REG_state, s)
    AS2( mov REG_rounds, r)

    AS2( cmp REG_iterationCount, 4)
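    // When at least four iterations remain, the code below generates four
    // keystream blocks per pass; fewer than four are handled by the
    // single-block path further down.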
#define SSE2_EXPAND_S(i, j) \
    ASS( pshufd xmm4, xmm##i, j, j, j, j) \
    AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)

    AS2( movdqa xmm0, [REG_state + 0*16])
    AS2( movdqa xmm1, [REG_state + 1*16])
    AS2( movdqa xmm2, [REG_state + 2*16])
    AS2( movdqa xmm3, [REG_state + 3*16])

#define SSE2_EXPAND_S85(i) \
    AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
    AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
    AS2( add REG_roundsLeft, 1) \
    AS2( adc REG_temp32, 0)

    AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
    AS2( mov REG_temp32, dword ptr [REG_state + 5*4])

    AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
    AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
#ifdef __XOP__
#define SSE2_QUARTER_ROUND(a, b, d, i) \
    AS2( movdqa xmm4, xmm##d) \
    AS2( paddd xmm4, xmm##a) \
    AS3( vprotd xmm4, xmm4, i) \
    AS2( pxor xmm##b, xmm4)
#else
#define SSE2_QUARTER_ROUND(a, b, d, i) \
    AS2( movdqa xmm4, xmm##d) \
    AS2( paddd xmm4, xmm##a) \
    AS2( movdqa xmm5, xmm4) \
    AS2( pslld xmm4, i) \
    AS2( psrld xmm5, 32-i) \
    AS2( pxor xmm##b, xmm4) \
    AS2( pxor xmm##b, xmm5)
#endif
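// The macro above computes one Salsa20 quarter-round step, b ^= rotl32(a + d, i).
// SSE2 has no packed 32-bit rotate, so the rotation is synthesized from a left
// shift, a right shift by 32-i, and two XORs; with XOP, vprotd does the rotate
// in one instruction. Below is a minimal standalone intrinsics sketch of the
// same step. It is illustrative only and excluded from the build; the helper
// names RotL32_SSE2 and Salsa20QuarterStep_SSE2 are hypothetical, not part of
// this file.
#if 0
#include <emmintrin.h>

// Rotate each 32-bit lane of x left by R bits using shift + OR (SSE2 only).
template <unsigned int R>
inline __m128i RotL32_SSE2(__m128i x)
{
    return _mm_or_si128(_mm_slli_epi32(x, R), _mm_srli_epi32(x, 32 - R));
}

// One quarter-round step on four packed lanes: b ^= rotl32(a + d, R).
// The assembly above XORs the two shifted halves into b separately, which is
// equivalent because their set bits do not overlap.
template <unsigned int R>
inline __m128i Salsa20QuarterStep_SSE2(__m128i a, __m128i b, __m128i d)
{
    return _mm_xor_si128(b, RotL32_SSE2<R>(_mm_add_epi32(a, d)));
}
#endif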
#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256])
#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C)
#ifdef __XOP__
#define L04(A,B,C,D,a,b,c,d,i)
#define L05(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 7)
#define L06(A,B,C,D,a,b,c,d,i)
#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
#define L08(A,B,C,D,a,b,c,d,i)
#else
#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B)
#endif
#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C)
#ifdef __XOP__
#define L12(A,B,C,D,a,b,c,d,i)
#define L13(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 9)
#define L14(A,B,C,D,a,b,c,d,i)
#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
#define L16(A,B,C,D,a,b,c,d,i)
#else
#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D)
#endif
#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B)
#ifdef __XOP__
#define L20(A,B,C,D,a,b,c,d,i)
#define L21(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 13)
#define L22(A,B,C,D,a,b,c,d,i)
#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L24(A,B,C,D,a,b,c,d,i)
#else
#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B)
#endif
#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D)
#ifdef __XOP__
#define L27(A,B,C,D,a,b,c,d,i)
#define L28(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 18)
#define L29(A,B,C,D,a,b,c,d,i)
#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C)
#define L31(A,B,C,D,a,b,c,d,i)
#else
#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C)
#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D)
#endif
#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)

#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
    L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
    L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
    L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
    L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
    L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
    L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
    L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
    L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
    L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
    L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
    L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
    L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
    L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
    L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
    L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
    L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
    L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
    L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
    L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
    L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
    L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
    L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
    L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
    L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
    L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
    L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
    L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
    L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
    L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
    L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
    L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
    L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
    L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
    L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
    L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
    L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
    L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
    L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
    L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
    L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
    L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
    L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
    L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
    L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
    L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
    L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
    L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
    L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
    L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
    L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
    L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
    L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
    L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
    L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
    L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
    L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
    L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
    L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
    L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
    L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
    L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
    L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
    L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
    L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
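// The X8/X16 macros above drive four keystream blocks at once: each 16-byte
// workspace slot (and each xmm register inside a step) holds the same state
// word taken from four consecutive blocks, a layout built by the pshufd
// broadcast in SSE2_EXPAND_S, and the interleaved L01..L32 steps of two or
// four independent columns help hide instruction latency. A standalone sketch
// of that broadcast follows; it is illustrative only and excluded from the
// build, and the helper name BroadcastWord_SSE2 is hypothetical.
#if 0
#include <emmintrin.h>

// Replicate 32-bit lane J of a state row into all four lanes, so that lane k
// of every vector used by the round code belongs to keystream block k.
template <int J>
inline __m128i BroadcastWord_SSE2(__m128i row)
{
    return _mm_shuffle_epi32(row, _MM_SHUFFLE(J, J, J, J));
}
#endif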
#if CRYPTOPP_BOOL_X64
    SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
#else
    SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
    SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
#endif
    AS2( mov REG_roundsLeft, REG_rounds)

    ASL(SSE2_Salsa_Output)
    AS2( movdqa xmm0, xmm4)
    AS2( punpckldq xmm4, xmm5)
    AS2( movdqa xmm1, xmm6)
    AS2( punpckldq xmm6, xmm7)
    AS2( movdqa xmm2, xmm4)
    AS2( punpcklqdq xmm4, xmm6)
    AS2( punpckhqdq xmm2, xmm6)
    AS2( punpckhdq xmm0, xmm5)
    AS2( punpckhdq xmm1, xmm7)
    AS2( movdqa xmm6, xmm0)
    AS2( punpcklqdq xmm0, xmm1)
    AS2( punpckhqdq xmm6, xmm1)
    AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
#if CRYPTOPP_BOOL_X64
    SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
    SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
#else
    SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
    SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
    SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
    SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
#endif
    AS2( sub REG_roundsLeft, 2)
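    // Each pass through the round code above is one Salsa20 double round: a
    // column round followed by a row round (one X16 call each on x64, two X8
    // calls each when only xmm0-xmm7 are available), so the counter drops by 2.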
#define SSE2_OUTPUT_4(a, b, c, d) \
    AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
    AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
    AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
    AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
    AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
    AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
    AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
    AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
    ASC( call, SSE2_Salsa_Output)
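// SSE2_OUTPUT_4 performs the Salsa20 feed-forward: each permuted result word
// at [SSE2_WORKSPACE + n*16] is added to the corresponding expanded input word
// saved at [SSE2_WORKSPACE + n*16 + 256]; SSE2_Salsa_Output then transposes
// the four lanes back into four consecutive output blocks and writes them via
// AS_XMM_OUTPUT4 (combining with REG_input when a message stream is supplied).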
    SSE2_OUTPUT_4(0, 13, 10, 7)
    SSE2_OUTPUT_4(4, 1, 14, 11)
    SSE2_OUTPUT_4(8, 5, 2, 15)
    SSE2_OUTPUT_4(12, 9, 6, 3)
    AS2( test REG_input, REG_input)

    AS2( add REG_input, 12*16)

    AS2( add REG_output, 12*16)
    AS2( sub REG_iterationCount, 4)
    AS2( cmp REG_iterationCount, 4)

    AS2( sub REG_iterationCount, 1)

    AS2( movdqa xmm0, [REG_state + 0*16])
    AS2( movdqa xmm1, [REG_state + 1*16])
    AS2( movdqa xmm2, [REG_state + 2*16])
    AS2( movdqa xmm3, [REG_state + 3*16])
    AS2( mov REG_roundsLeft, REG_rounds)

    SSE2_QUARTER_ROUND(0, 1, 3, 7)
    SSE2_QUARTER_ROUND(1, 2, 0, 9)
    SSE2_QUARTER_ROUND(2, 3, 1, 13)
    SSE2_QUARTER_ROUND(3, 0, 2, 18)
    ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
    ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
    ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
    SSE2_QUARTER_ROUND(0, 3, 1, 7)
    SSE2_QUARTER_ROUND(3, 2, 0, 9)
    SSE2_QUARTER_ROUND(2, 1, 3, 13)
    SSE2_QUARTER_ROUND(1, 0, 2, 18)
    ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
    ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
    ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
    AS2( sub REG_roundsLeft, 2)
    AS2( paddd xmm0, [REG_state + 0*16])
    AS2( paddd xmm1, [REG_state + 1*16])
    AS2( paddd xmm2, [REG_state + 2*16])
    AS2( paddd xmm3, [REG_state + 3*16])

    AS2( add dword ptr [REG_state + 8*4], 1)
    AS2( adc dword ptr [REG_state + 5*4], 0)
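    // Advance the 64-bit block counter for this single block; in this
    // implementation's state layout the low counter word is state index 8 and
    // the high word is state index 5, the same pair that SSE2_EXPAND_S85
    // seeds per block in the four-block path above.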
    AS2( pcmpeqb xmm6, xmm6)

    ASS( pshufd xmm7, xmm6, 0, 1, 2, 3)
    AS2( movdqa xmm4, xmm0)
    AS2( movdqa xmm5, xmm3)
    AS2( pand xmm0, xmm7)
    AS2( pand xmm4, xmm6)
    AS2( pand xmm3, xmm6)
    AS2( pand xmm5, xmm7)

    AS2( movdqa xmm5, xmm1)
    AS2( pand xmm1, xmm7)
    AS2( pand xmm5, xmm6)

    AS2( pand xmm6, xmm2)
    AS2( pand xmm2, xmm7)

    AS2( movdqa xmm5, xmm4)
    AS2( movdqa xmm6, xmm0)
    AS3( shufpd xmm4, xmm1, 2)
    AS3( shufpd xmm0, xmm2, 2)
    AS3( shufpd xmm1, xmm5, 2)
    AS3( shufpd xmm2, xmm6, 2)

    AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)

    #if CRYPTOPP_BOOL_X64
        : "+r" (input), "+r" (output), "+r" (iterationCount)
        : "r" (m_rounds), "r" (m_state.begin()), "r" (workspace)
        : "%eax", "%rdx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
    #else
        : "+a" (input), "+D" (output), "+c" (iterationCount)
        : "d" (m_rounds), "S" (m_state.begin())
#ifdef CRYPTOPP_GENERATE_X64_MASM
    movdqa xmm6, [rsp + 0200h]
    movdqa xmm7, [rsp + 0210h]
    movdqa xmm8, [rsp + 0220h]
    movdqa xmm9, [rsp + 0230h]
    movdqa xmm10, [rsp + 0240h]
    movdqa xmm11, [rsp + 0250h]
    movdqa xmm12, [rsp + 0260h]
    movdqa xmm13, [rsp + 0270h]
    movdqa xmm14, [rsp + 0280h]
    movdqa xmm15, [rsp + 0290h]
    add rsp, 10*16 + 32*16 + 8

Salsa20_OperateKeystream ENDP

#ifndef CRYPTOPP_GENERATE_X64_MASM

    word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;

    while (iterationCount--)

        x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
        x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
        x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
        x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
        for (int i=m_rounds; i>0; i-=2)

        #define QUARTER_ROUND(a, b, c, d) \
            b = b ^ rotlConstant<7>(a + d); \
            c = c ^ rotlConstant<9>(b + a); \
            d = d ^ rotlConstant<13>(c + b); \
            a = a ^ rotlConstant<18>(d + c);

            QUARTER_ROUND(x0, x4, x8, x12)
            QUARTER_ROUND(x1, x5, x9, x13)
            QUARTER_ROUND(x2, x6, x10, x14)
            QUARTER_ROUND(x3, x7, x11, x15)

            QUARTER_ROUND(x0, x13, x10, x7)
            QUARTER_ROUND(x1, x14, x11, x4)
            QUARTER_ROUND(x2, x15, x8, x5)
            QUARTER_ROUND(x3, x12, x9, x6)
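            // The first four QUARTER_ROUND calls above are the column round
            // and the last four the row round; together they make one Salsa20
            // double round, matching the loop's step of i -= 2.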
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
    #define SALSA_OUTPUT(x) {\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
    CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
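// The word order here (0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3)
// matches the SSE2_OUTPUT_4 groups in the SIMD path above, so both paths emit
// the keystream words in the same sequence.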
        if (++m_state[8] == 0)