diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 166 | ||||
-rw-r--r-- | arch/x86/crypto/serpent_avx_glue.c | 43 | ||||
-rw-r--r-- | arch/x86/include/asm/crypto/serpent-avx.h | 27 |
3 files changed, 121 insertions, 115 deletions
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 504106bf04a2..02b0e9fe997c 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S | |||
@@ -24,7 +24,16 @@ | |||
24 | * | 24 | * |
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include "glue_helper-asm-avx.S" | ||
28 | |||
27 | .file "serpent-avx-x86_64-asm_64.S" | 29 | .file "serpent-avx-x86_64-asm_64.S" |
30 | |||
31 | .data | ||
32 | .align 16 | ||
33 | |||
34 | .Lbswap128_mask: | ||
35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
36 | |||
28 | .text | 37 | .text |
29 | 38 | ||
30 | #define CTX %rdi | 39 | #define CTX %rdi |
@@ -550,51 +559,27 @@ | |||
550 | vpunpcklqdq x3, t2, x2; \ | 559 | vpunpcklqdq x3, t2, x2; \ |
551 | vpunpckhqdq x3, t2, x3; | 560 | vpunpckhqdq x3, t2, x3; |
552 | 561 | ||
553 | #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ | 562 | #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ |
554 | vmovdqu (0*4*4)(in), x0; \ | ||
555 | vmovdqu (1*4*4)(in), x1; \ | ||
556 | vmovdqu (2*4*4)(in), x2; \ | ||
557 | vmovdqu (3*4*4)(in), x3; \ | ||
558 | \ | ||
559 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | 563 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
560 | 564 | ||
561 | #define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ | 565 | #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ |
562 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | 566 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
563 | \ | ||
564 | vmovdqu x0, (0*4*4)(out); \ | ||
565 | vmovdqu x1, (1*4*4)(out); \ | ||
566 | vmovdqu x2, (2*4*4)(out); \ | ||
567 | vmovdqu x3, (3*4*4)(out); | ||
568 | |||
569 | #define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ | ||
570 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | ||
571 | \ | ||
572 | vpxor (0*4*4)(out), x0, x0; \ | ||
573 | vmovdqu x0, (0*4*4)(out); \ | ||
574 | vpxor (1*4*4)(out), x1, x1; \ | ||
575 | vmovdqu x1, (1*4*4)(out); \ | ||
576 | vpxor (2*4*4)(out), x2, x2; \ | ||
577 | vmovdqu x2, (2*4*4)(out); \ | ||
578 | vpxor (3*4*4)(out), x3, x3; \ | ||
579 | vmovdqu x3, (3*4*4)(out); | ||
580 | 567 | ||
581 | .align 8 | 568 | .align 8 |
582 | .global __serpent_enc_blk_8way_avx | 569 | .type __serpent_enc_blk8_avx,@function; |
583 | .type __serpent_enc_blk_8way_avx,@function; | ||
584 | 570 | ||
585 | __serpent_enc_blk_8way_avx: | 571 | __serpent_enc_blk8_avx: |
586 | /* input: | 572 | /* input: |
587 | * %rdi: ctx, CTX | 573 | * %rdi: ctx, CTX |
588 | * %rsi: dst | 574 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks |
589 | * %rdx: src | 575 | * output: |
590 | * %rcx: bool, if true: xor output | 576 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
591 | */ | 577 | */ |
592 | 578 | ||
593 | vpcmpeqd RNOT, RNOT, RNOT; | 579 | vpcmpeqd RNOT, RNOT, RNOT; |
594 | 580 | ||
595 | leaq (4*4*4)(%rdx), %rax; | 581 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
596 | read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | 582 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); |
597 | read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
598 | 583 | ||
599 | K2(RA, RB, RC, RD, RE, 0); | 584 | K2(RA, RB, RC, RD, RE, 0); |
600 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); | 585 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); |
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx: | |||
630 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); | 615 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); |
631 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); | 616 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); |
632 | 617 | ||
633 | leaq (4*4*4)(%rsi), %rax; | 618 | write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
634 | 619 | write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | |
635 | testb %cl, %cl; | ||
636 | jnz __enc_xor8; | ||
637 | |||
638 | write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
639 | write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
640 | |||
641 | ret; | ||
642 | |||
643 | __enc_xor8: | ||
644 | xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
645 | xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
646 | 620 | ||
647 | ret; | 621 | ret; |
648 | 622 | ||
649 | .align 8 | 623 | .align 8 |
650 | .global serpent_dec_blk_8way_avx | 624 | .type __serpent_dec_blk8_avx,@function; |
651 | .type serpent_dec_blk_8way_avx,@function; | ||
652 | 625 | ||
653 | serpent_dec_blk_8way_avx: | 626 | __serpent_dec_blk8_avx: |
654 | /* input: | 627 | /* input: |
655 | * %rdi: ctx, CTX | 628 | * %rdi: ctx, CTX |
656 | * %rsi: dst | 629 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
657 | * %rdx: src | 630 | * output: |
631 | * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks | ||
658 | */ | 632 | */ |
659 | 633 | ||
660 | vpcmpeqd RNOT, RNOT, RNOT; | 634 | vpcmpeqd RNOT, RNOT, RNOT; |
661 | 635 | ||
662 | leaq (4*4*4)(%rdx), %rax; | 636 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
663 | read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | 637 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); |
664 | read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
665 | 638 | ||
666 | K2(RA, RB, RC, RD, RE, 32); | 639 | K2(RA, RB, RC, RD, RE, 32); |
667 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); | 640 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); |
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx: | |||
697 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); | 670 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); |
698 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); | 671 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); |
699 | 672 | ||
700 | leaq (4*4*4)(%rsi), %rax; | 673 | write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); |
701 | write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); | 674 | write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); |
702 | write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); | 675 | |
676 | ret; | ||
677 | |||
678 | .align 8 | ||
679 | .global serpent_ecb_enc_8way_avx | ||
680 | .type serpent_ecb_enc_8way_avx,@function; | ||
681 | |||
682 | serpent_ecb_enc_8way_avx: | ||
683 | /* input: | ||
684 | * %rdi: ctx, CTX | ||
685 | * %rsi: dst | ||
686 | * %rdx: src | ||
687 | */ | ||
688 | |||
689 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
690 | |||
691 | call __serpent_enc_blk8_avx; | ||
692 | |||
693 | store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
694 | |||
695 | ret; | ||
696 | |||
697 | .align 8 | ||
698 | .global serpent_ecb_dec_8way_avx | ||
699 | .type serpent_ecb_dec_8way_avx,@function; | ||
700 | |||
701 | serpent_ecb_dec_8way_avx: | ||
702 | /* input: | ||
703 | * %rdi: ctx, CTX | ||
704 | * %rsi: dst | ||
705 | * %rdx: src | ||
706 | */ | ||
707 | |||
708 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
709 | |||
710 | call __serpent_dec_blk8_avx; | ||
711 | |||
712 | store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
713 | |||
714 | ret; | ||
715 | |||
716 | .align 8 | ||
717 | .global serpent_cbc_dec_8way_avx | ||
718 | .type serpent_cbc_dec_8way_avx,@function; | ||
719 | |||
720 | serpent_cbc_dec_8way_avx: | ||
721 | /* input: | ||
722 | * %rdi: ctx, CTX | ||
723 | * %rsi: dst | ||
724 | * %rdx: src | ||
725 | */ | ||
726 | |||
727 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
728 | |||
729 | call __serpent_dec_blk8_avx; | ||
730 | |||
731 | store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
732 | |||
733 | ret; | ||
734 | |||
735 | .align 8 | ||
736 | .global serpent_ctr_8way_avx | ||
737 | .type serpent_ctr_8way_avx,@function; | ||
738 | |||
739 | serpent_ctr_8way_avx: | ||
740 | /* input: | ||
741 | * %rdi: ctx, CTX | ||
742 | * %rsi: dst | ||
743 | * %rdx: src | ||
744 | * %rcx: iv (little endian, 128bit) | ||
745 | */ | ||
746 | |||
747 | load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
748 | RD2, RK0, RK1, RK2); | ||
749 | |||
750 | call __serpent_enc_blk8_avx; | ||
751 | |||
752 | store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
703 | 753 | ||
704 | ret; | 754 | ret; |
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 2aa31ade1e68..52abaaf28e7f 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c | |||
@@ -42,20 +42,6 @@ | |||
42 | #include <asm/crypto/ablk_helper.h> | 42 | #include <asm/crypto/ablk_helper.h> |
43 | #include <asm/crypto/glue_helper.h> | 43 | #include <asm/crypto/glue_helper.h> |
44 | 44 | ||
45 | static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) | ||
46 | { | ||
47 | u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; | ||
48 | unsigned int j; | ||
49 | |||
50 | for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) | ||
51 | ivs[j] = src[j]; | ||
52 | |||
53 | serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); | ||
54 | |||
55 | for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) | ||
56 | u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); | ||
57 | } | ||
58 | |||
59 | static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 45 | static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
60 | { | 46 | { |
61 | be128 ctrblk; | 47 | be128 ctrblk; |
@@ -67,30 +53,13 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | |||
67 | u128_xor(dst, src, (u128 *)&ctrblk); | 53 | u128_xor(dst, src, (u128 *)&ctrblk); |
68 | } | 54 | } |
69 | 55 | ||
70 | static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, | ||
71 | le128 *iv) | ||
72 | { | ||
73 | be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; | ||
74 | unsigned int i; | ||
75 | |||
76 | for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { | ||
77 | if (dst != src) | ||
78 | dst[i] = src[i]; | ||
79 | |||
80 | le128_to_be128(&ctrblks[i], iv); | ||
81 | le128_inc(iv); | ||
82 | } | ||
83 | |||
84 | serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); | ||
85 | } | ||
86 | |||
87 | static const struct common_glue_ctx serpent_enc = { | 56 | static const struct common_glue_ctx serpent_enc = { |
88 | .num_funcs = 2, | 57 | .num_funcs = 2, |
89 | .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, | 58 | .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, |
90 | 59 | ||
91 | .funcs = { { | 60 | .funcs = { { |
92 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | 61 | .num_blocks = SERPENT_PARALLEL_BLOCKS, |
93 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } | 62 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } |
94 | }, { | 63 | }, { |
95 | .num_blocks = 1, | 64 | .num_blocks = 1, |
96 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } | 65 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } |
@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = { | |||
103 | 72 | ||
104 | .funcs = { { | 73 | .funcs = { { |
105 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | 74 | .num_blocks = SERPENT_PARALLEL_BLOCKS, |
106 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } | 75 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } |
107 | }, { | 76 | }, { |
108 | .num_blocks = 1, | 77 | .num_blocks = 1, |
109 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } | 78 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } |
@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = { | |||
116 | 85 | ||
117 | .funcs = { { | 86 | .funcs = { { |
118 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | 87 | .num_blocks = SERPENT_PARALLEL_BLOCKS, |
119 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } | 88 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } |
120 | }, { | 89 | }, { |
121 | .num_blocks = 1, | 90 | .num_blocks = 1, |
122 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } | 91 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } |
@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = { | |||
129 | 98 | ||
130 | .funcs = { { | 99 | .funcs = { { |
131 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | 100 | .num_blocks = SERPENT_PARALLEL_BLOCKS, |
132 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } | 101 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } |
133 | }, { | 102 | }, { |
134 | .num_blocks = 1, | 103 | .num_blocks = 1, |
135 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } | 104 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } |
@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
193 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | 162 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
194 | 163 | ||
195 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { | 164 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { |
196 | serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); | 165 | serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); |
197 | return; | 166 | return; |
198 | } | 167 | } |
199 | 168 | ||
@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
210 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | 179 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
211 | 180 | ||
212 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { | 181 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { |
213 | serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); | 182 | serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); |
214 | return; | 183 | return; |
215 | } | 184 | } |
216 | 185 | ||
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 432deedd2945..0da1d3e2a55c 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h | |||
@@ -6,27 +6,14 @@ | |||
6 | 6 | ||
7 | #define SERPENT_PARALLEL_BLOCKS 8 | 7 | #define SERPENT_PARALLEL_BLOCKS 8 |
8 | 8 | ||
9 | asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 9 | asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
10 | const u8 *src, bool xor); | 10 | const u8 *src); |
11 | asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 11 | asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
12 | const u8 *src); | 12 | const u8 *src); |
13 | 13 | ||
14 | static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, | 14 | asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
15 | const u8 *src) | 15 | const u8 *src); |
16 | { | 16 | asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
17 | __serpent_enc_blk_8way_avx(ctx, dst, src, false); | 17 | const u8 *src, le128 *iv); |
18 | } | ||
19 | |||
20 | static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, | ||
21 | const u8 *src) | ||
22 | { | ||
23 | __serpent_enc_blk_8way_avx(ctx, dst, src, true); | ||
24 | } | ||
25 | |||
26 | static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, | ||
27 | const u8 *src) | ||
28 | { | ||
29 | serpent_dec_blk_8way_avx(ctx, dst, src); | ||
30 | } | ||
31 | 18 | ||
32 | #endif | 19 | #endif |