author	Jussi Kivilinna <jussi.kivilinna@mbnet.fi>	2012-10-20 08:06:51 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2012-10-24 09:10:55 -0400
commit	facd416fbc1cdee357730909a414898934f16ae1 (patch)
tree	e590e4bdd151c06c820bdcc1635b0660e525f84d
parent	8435a3c3003c00c43f1b267368bbe1d8dada35d1 (diff)
crypto: serpent/avx - avoid using temporary stack buffers
Introduce new assembler functions to avoid using temporary stack buffers in the glue code. This also allows the use of vector instructions for xoring the output in CTR and CBC modes and for constructing IVs in CTR mode.

ECB mode sees an ~0.5% decrease in speed because of the one added function call. CBC mode decryption and CTR mode benefit from the vector operations and gain ~3%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--  arch/x86/crypto/serpent-avx-x86_64-asm_64.S  | 166
-rw-r--r--  arch/x86/crypto/serpent_avx_glue.c           |  43
-rw-r--r--  arch/x86/include/asm/crypto/serpent-avx.h    |  27
3 files changed, 121 insertions(+), 115 deletions(-)
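To make the removed pattern concrete, here is a small user-space sketch in plain C (all names are made up for illustration; this is not the kernel code). It models how the old CTR glue, serpent_crypt_ctr_xway in the diff below, expanded all eight counter values into a temporary array in memory before invoking the 8-way primitive; the new assembler builds the counters directly in XMM registers instead.

#include <stdint.h>

#define PARALLEL_BLOCKS 8

/* 128-bit counter held as two 64-bit halves (hypothetical type). */
struct ctr128 {
	uint64_t lo, hi;
};

static void ctr128_inc(struct ctr128 *c)
{
	if (++c->lo == 0)	/* carry into the high half on wrap-around */
		c->hi++;
}

/* Old-style approach: materialize all eight counter blocks in a stack buffer. */
static void build_ctr_blocks_on_stack(struct ctr128 *iv,
				      struct ctr128 out[PARALLEL_BLOCKS])
{
	int i;

	for (i = 0; i < PARALLEL_BLOCKS; i++) {
		out[i] = *iv;	/* temporary copy lives on the stack */
		ctr128_inc(iv);
	}
}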
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 504106bf04a2..02b0e9fe997c 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -24,7 +24,16 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "serpent-avx-x86_64-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
 .text
 
 #define CTX %rdi
@@ -550,51 +559,27 @@
 	vpunpcklqdq	x3, t2, x2; \
 	vpunpckhqdq	x3, t2, x3;
 
-#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
-	vmovdqu (0*4*4)(in),	x0; \
-	vmovdqu (1*4*4)(in),	x1; \
-	vmovdqu (2*4*4)(in),	x2; \
-	vmovdqu (3*4*4)(in),	x3; \
-	\
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vmovdqu x0,	(0*4*4)(out); \
-	vmovdqu x1,	(1*4*4)(out); \
-	vmovdqu x2,	(2*4*4)(out); \
-	vmovdqu x3,	(3*4*4)(out);
-
-#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpxor (0*4*4)(out),	x0, x0; \
-	vmovdqu x0,	(0*4*4)(out); \
-	vpxor (1*4*4)(out),	x1, x1; \
-	vmovdqu x1,	(1*4*4)(out); \
-	vpxor (2*4*4)(out),	x2, x2; \
-	vmovdqu x2,	(2*4*4)(out); \
-	vpxor (3*4*4)(out),	x3, x3; \
-	vmovdqu x3,	(3*4*4)(out);
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
 .align 8
-.global __serpent_enc_blk_8way_avx
-.type   __serpent_enc_blk_8way_avx,@function;
+.type   __serpent_enc_blk8_avx,@function;
 
-__serpent_enc_blk_8way_avx:
+__serpent_enc_blk8_avx:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
 	vpcmpeqd RNOT, RNOT, RNOT;
 
-	leaq (4*4*4)(%rdx), %rax;
-	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 
 	K2(RA, RB, RC, RD, RE, 0);
 	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
 	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
 	S(S7, RD, RE, RB, RC, RA);		K2(RA, RB, RC, RD, RE, 32);
 
-	leaq (4*4*4)(%rsi), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
-
-	ret;
-
-__enc_xor8:
-	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 
 	ret;
 
 .align 8
-.global serpent_dec_blk_8way_avx
-.type   serpent_dec_blk_8way_avx,@function;
+.type   __serpent_dec_blk8_avx,@function;
 
-serpent_dec_blk_8way_avx:
+__serpent_dec_blk8_avx:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
 	 */
 
 	vpcmpeqd RNOT, RNOT, RNOT;
 
-	leaq (4*4*4)(%rdx), %rax;
-	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 
 	K2(RA, RB, RC, RD, RE, 32);
 	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
 	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
 	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);
 
-	leaq (4*4*4)(%rsi), %rax;
-	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
-	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
+
+.align 8
+.global serpent_ecb_enc_8way_avx
+.type   serpent_ecb_enc_8way_avx,@function;
+
+serpent_ecb_enc_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_enc_blk8_avx;
+
+	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global serpent_ecb_dec_8way_avx
+.type   serpent_ecb_dec_8way_avx,@function;
+
+serpent_ecb_dec_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk8_avx;
+
+	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	ret;
+
+.align 8
+.global serpent_cbc_dec_8way_avx
+.type   serpent_cbc_dec_8way_avx,@function;
+
+serpent_cbc_dec_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk8_avx;
+
+	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	ret;
+
+.align 8
+.global serpent_ctr_8way_avx
+.type   serpent_ctr_8way_avx,@function;
+
+serpent_ctr_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RK0, RK1, RK2);
+
+	call __serpent_enc_blk8_avx;
+
+	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
 	ret;
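The .Lbswap128_mask table added above lists the byte indices 15 down to 0, i.e. a full byte reversal. The load_ctr_8way macro from glue_helper-asm-avx.S applies it with vpshufb so that the little-endian counter tracked by the glue code becomes the big-endian counter block the cipher actually encrypts. A scalar model of that shuffle, written as a hypothetical C helper rather than the kernel's implementation:

#include <stdint.h>

/* Reverse the byte order of a 128-bit value, as vpshufb with the mask does. */
static void bswap128(uint8_t out[16], const uint8_t in[16])
{
	int i;

	for (i = 0; i < 16; i++)
		out[i] = in[15 - i];
}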
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 2aa31ade1e68..52abaaf28e7f 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -42,20 +42,6 @@
 #include <asm/crypto/ablk_helper.h>
 #include <asm/crypto/glue_helper.h>
 
-static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
-	unsigned int j;
-
-	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
-		ivs[j] = src[j];
-
-	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
-		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
-
 static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
@@ -67,30 +53,13 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
-static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				   le128 *iv)
-{
-	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
-	unsigned int i;
-
-	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
-		if (dst != src)
-			dst[i] = src[i];
-
-		le128_to_be128(&ctrblks[i], iv);
-		le128_inc(iv);
-	}
-
-	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-
 static const struct common_glue_ctx serpent_enc = {
 	.num_funcs = 2,
 	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
 
 	.funcs = { {
 		.num_blocks = SERPENT_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {
 
 	.funcs = { {
 		.num_blocks = SERPENT_PARALLEL_BLOCKS,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {
 
 	.funcs = { {
 		.num_blocks = SERPENT_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {
 
 	.funcs = { {
 		.num_blocks = SERPENT_PARALLEL_BLOCKS,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
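The serpent_cbc_dec_8way_avx entry point wired up here lets store_cbc_8way perform the CBC chaining xor with vpxor while the ciphertext is still in registers; that is the same work the removed serpent_decrypt_cbc_xway helper did through the stack-resident ivs[] copies shown earlier in this file. A scalar sketch of the chaining step (hypothetical helper; it assumes dst and src do not overlap, whereas the kernel covers in-place decryption by keeping the ciphertext in temporaries or registers):

#include <stddef.h>
#include <stdint.h>

/* Xor plaintext block i with ciphertext block i - 1; block 0 (the IV xor)
 * is left to the caller, exactly as in the 8-way assembler version. */
static void cbc_dec_chain_xor(uint8_t *dst, const uint8_t *src,
			      size_t nblocks, size_t blksize)
{
	size_t i, j;

	for (i = nblocks; i-- > 1;)
		for (j = 0; j < blksize; j++)
			dst[i * blksize + j] ^= src[(i - 1) * blksize + j];
}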
@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
-		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
-		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
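For reference, the common_glue_ctx tables updated above are walked largest-entry-first by the glue helper. The simplified sketch below (not the actual glue_helper implementation; all names are invented) shows the idea: the 8-way AVX function handles every full group of eight blocks, and the one-block C functions only mop up the tail.

#include <stddef.h>

#define BLOCK_SIZE 16

typedef void (*ecb_fn_t)(void *ctx, unsigned char *dst,
			 const unsigned char *src);

/* One table entry: how many blocks the function consumes per call. */
struct glue_entry {
	unsigned int num_blocks;
	ecb_fn_t fn;
};

/* Entries must be sorted by descending num_blocks, ending with num_blocks == 1. */
static void glue_ecb_walk(const struct glue_entry *funcs,
			  unsigned int num_funcs, void *ctx,
			  unsigned char *dst, const unsigned char *src,
			  size_t nbytes)
{
	unsigned int i;

	while (nbytes >= BLOCK_SIZE) {
		for (i = 0; i < num_funcs; i++) {
			size_t chunk = funcs[i].num_blocks * BLOCK_SIZE;

			if (nbytes < chunk)
				continue;	/* try the next, smaller entry */

			funcs[i].fn(ctx, dst, src);
			dst += chunk;
			src += chunk;
			nbytes -= chunk;
			break;
		}
	}
}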
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index 432deedd2945..0da1d3e2a55c 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -6,27 +6,14 @@
 
 #define SERPENT_PARALLEL_BLOCKS 8
 
-asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
-					   const u8 *src, bool xor);
-asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 					 const u8 *src);
 
-static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__serpent_enc_blk_8way_avx(ctx, dst, src, false);
-}
-
-static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
-					    const u8 *src)
-{
-	__serpent_enc_blk_8way_avx(ctx, dst, src, true);
-}
-
-static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	serpent_dec_blk_8way_avx(ctx, dst, src);
-}
+asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
 
 #endif