aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 2011-12-20 05:58:06 -0500
committer: Herbert Xu <herbert@gondor.apana.org.au> 2012-01-13 00:38:40 -0500
commit: 847cb7ef565d31484f426677e0bea081bfd2acd9 (patch)
tree: 7325f4ce5961e0d51ea4707119aeba80622991c3 /arch/x86/crypto
parent: 4c58464b8034cef4317593bf4ccbfc19d5bb3a77 (diff)
crypto: serpent-sse2 - change transpose_4x4 to only use integer instructions
The matrix transpose macro in serpent-sse2 uses a mix of SSE2 integer and SSE floating point instructions, which might cause a performance penalty on some CPUs. This patch replaces the transpose_4x4 macro with a version that uses only SSE2 integer instructions.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- arch/x86/crypto/serpent-sse2-i586-asm_32.S | 29
-rw-r--r-- arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 29
2 files changed, 26 insertions, 32 deletions
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index 4e37677ca851..c00053d42f99 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -463,23 +463,20 @@
463 pand x0, x4; \ 463 pand x0, x4; \
464 pxor x2, x4; 464 pxor x2, x4;
465 465
466#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ 466#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
467 movdqa x2, t3; \
468 movdqa x0, t1; \
469 unpcklps x3, t3; \
470 movdqa x0, t2; \ 467 movdqa x0, t2; \
471 unpcklps x1, t1; \ 468 punpckldq x1, x0; \
472 unpckhps x1, t2; \ 469 punpckhdq x1, t2; \
473 movdqa t3, x1; \ 470 movdqa x2, t1; \
474 unpckhps x3, x2; \ 471 punpckhdq x3, x2; \
475 movdqa t1, x0; \ 472 punpckldq x3, t1; \
476 movhlps t1, x1; \ 473 movdqa x0, x1; \
477 movdqa t2, t1; \ 474 punpcklqdq t1, x0; \
478 movlhps t3, x0; \ 475 punpckhqdq t1, x1; \
479 movlhps x2, t1; \ 476 movdqa t2, x3; \
480 movhlps t2, x2; \ 477 punpcklqdq x2, t2; \
481 movdqa x2, x3; \ 478 punpckhqdq x2, x3; \
482 movdqa t1, x2; 479 movdqa t2, x2;
483 480
484#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ 481#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
485 movdqu (0*4*4)(in), x0; \ 482 movdqu (0*4*4)(in), x0; \
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
index 7f24a1540821..3ee1ff04d3e9 100644
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -585,23 +585,20 @@
585 get_key(i, 1, RK1); \ 585 get_key(i, 1, RK1); \
586 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ 586 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
587 587
588#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ 588#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
589 movdqa x2, t3; \
590 movdqa x0, t1; \
591 unpcklps x3, t3; \
592 movdqa x0, t2; \ 589 movdqa x0, t2; \
593 unpcklps x1, t1; \ 590 punpckldq x1, x0; \
594 unpckhps x1, t2; \ 591 punpckhdq x1, t2; \
595 movdqa t3, x1; \ 592 movdqa x2, t1; \
596 unpckhps x3, x2; \ 593 punpckhdq x3, x2; \
597 movdqa t1, x0; \ 594 punpckldq x3, t1; \
598 movhlps t1, x1; \ 595 movdqa x0, x1; \
599 movdqa t2, t1; \ 596 punpcklqdq t1, x0; \
600 movlhps t3, x0; \ 597 punpckhqdq t1, x1; \
601 movlhps x2, t1; \ 598 movdqa t2, x3; \
602 movhlps t2, x2; \ 599 punpcklqdq x2, t2; \
603 movdqa x2, x3; \ 600 punpckhqdq x2, x3; \
604 movdqa t1, x2; 601 movdqa t2, x2;
605 602
606#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ 603#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
607 movdqu (0*4*4)(in), x0; \ 604 movdqu (0*4*4)(in), x0; \