author     Linus Torvalds <torvalds@linux-foundation.org>  2013-07-05 15:12:33 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-07-05 15:12:33 -0400
commit     b2c311075db578f1433d9b303698491bfa21279a (patch)
tree       41d5f1b5ad6f45be7211f524328de81f7e9754be /arch/x86/crypto
parent     45175476ae2dbebc860d5cf486f2916044343513 (diff)
parent     02c0241b600e4ab8a732c89749e252165145d60c (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
- Do not idle omap device between crypto operations in one session.
- Added sha224/sha384 shims for SSSE3.
- More optimisations for camellia-aesni-avx2.
- Removed defunct blowfish/twofish AVX2 implementations.
- Added unaligned buffer self-tests.
- Added PCLMULQDQ optimisation for CRCT10DIF (see the usage sketch after this list).
- Added support for Freescale's DCP co-processor.
- Misc fixes.
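
CRC-T10DIF is the 16-bit CRC that guards the SCSI Data Integrity Field.
In-kernel users reach it through lib/crc-t10dif, which this series reroutes
over the crypto API so the PCLMULQDQ driver added here can be picked up
transparently. A minimal caller sketch, assuming the two-argument
crc_t10dif() prototype from include/linux/crc-t10dif.h of this period (the
512-byte sector size is only an example):

#include <linux/crc-t10dif.h>

/* Compute the T10 DIF guard tag over one 512-byte sector.  The CRC is
 * seeded with zero internally; with CONFIG_CRYPTO_CRCT10DIF_PCLMUL set,
 * the work lands in the PCLMULQDQ-accelerated "crct10dif" transform.
 */
static __u16 example_guard_tag(const unsigned char *sector)
{
	return crc_t10dif(sector, 512);
}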
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (44 commits)
crypto: testmgr - test hash implementations with unaligned buffers
crypto: testmgr - test AEADs with unaligned buffers
crypto: testmgr - test skciphers with unaligned buffers
crypto: testmgr - check that entries in alg_test_descs are in correct order
Revert "crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher"
Revert "crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher"
crypto: camellia-aesni-avx2 - tune assembly code for more performance
hwrng: bcm2835 - fix MODULE_LICENSE tag
hwrng: nomadik - use clk_prepare_enable()
crypto: picoxcell - replace strict_strtoul() with kstrtoul()
crypto: dcp - Staticize local symbols
crypto: dcp - Use NULL instead of 0
crypto: dcp - Use devm_* APIs
crypto: dcp - Remove redundant platform_set_drvdata()
hwrng: use platform_{get,set}_drvdata()
crypto: omap-aes - Don't idle/start AES device between Encrypt operations
crypto: crct10dif - Use PTR_RET
crypto: ux500 - Cocci spatch "resource_size.spatch"
crypto: sha256_ssse3 - add sha224 support
crypto: sha512_ssse3 - add sha384 support
...
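
The testmgr commits at the head of the list feed each hash, AEAD and
skcipher implementation deliberately misaligned buffers, since SIMD code is
a common source of hidden alignment assumptions. Roughly, the idea is the
following (this helper is illustrative only, not the actual testmgr code):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

/* Copy a test vector to an odd offset so that an implementation which
 * silently assumes aligned input fails the self-test.  The caller must
 * free the buffer with kfree(ptr - 1).
 */
static void *copy_to_unaligned(const void *src, size_t len)
{
	u8 *buf = kmalloc(len + 1, GFP_KERNEL);

	if (!buf)
		return NULL;
	memcpy(buf + 1, src, len);	/* data now starts at buf + 1 */
	return buf + 1;
}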
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--  arch/x86/crypto/Makefile                      |    8
-rw-r--r--  arch/x86/crypto/blowfish-avx2-asm_64.S        |  449
-rw-r--r--  arch/x86/crypto/blowfish_avx2_glue.c          |  585
-rw-r--r--  arch/x86/crypto/blowfish_glue.c               |   32
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx2-asm_64.S  |  160
-rw-r--r--  arch/x86/crypto/crct10dif-pcl-asm_64.S        |  643
-rw-r--r--  arch/x86/crypto/crct10dif-pclmul_glue.c       |  151
-rw-r--r--  arch/x86/crypto/sha256_ssse3_glue.c           |   57
-rw-r--r--  arch/x86/crypto/sha512_ssse3_glue.c           |   58
-rw-r--r--  arch/x86/crypto/twofish-avx2-asm_64.S         |  600
-rw-r--r--  arch/x86/crypto/twofish_avx2_glue.c           |  584
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c            |   14
12 files changed, 1016 insertions(+), 2325 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a3a0ed80f17c..7d6ba9db1be9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -3,8 +3,6 @@
 #
 
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
-			$(comma)4)$(comma)%ymm2,yes,no)
 
 obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -29,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -42,10 +41,8 @@ endif
 
 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
-	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -73,10 +70,8 @@ ifeq ($(avx_supported),yes)
 endif
 
 ifeq ($(avx2_supported),yes)
-	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
-	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
@@ -87,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S
deleted file mode 100644
index 784452e0d05d..000000000000
--- a/arch/x86/crypto/blowfish-avx2-asm_64.S
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * x86_64/AVX2 assembler optimized version of Blowfish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/linkage.h>
-
-.file "blowfish-avx2-asm_64.S"
-
-.data
-.align 32
-
-.Lprefetch_mask:
-.long 0*64
-.long 1*64
-.long 2*64
-.long 3*64
-.long 4*64
-.long 5*64
-.long 6*64
-.long 7*64
-
-.Lbswap32_mask:
-.long 0x00010203
-.long 0x04050607
-.long 0x08090a0b
-.long 0x0c0d0e0f
-
-.Lbswap128_mask:
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lbswap_iv_mask:
-	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
-
-.text
-/* structure of crypto context */
-#define p	0
-#define s0	((16 + 2) * 4)
-#define s1	((16 + 2 + (1 * 256)) * 4)
-#define s2	((16 + 2 + (2 * 256)) * 4)
-#define s3	((16 + 2 + (3 * 256)) * 4)
-
-/* register macros */
-#define CTX	%rdi
-#define RIO	%rdx
-
-#define RS0	%rax
-#define RS1	%r8
-#define RS2	%r9
-#define RS3	%r10
-
-#define RLOOP	%r11
-#define RLOOPd	%r11d
-
-#define RXr0	%ymm8
-#define RXr1	%ymm9
-#define RXr2	%ymm10
-#define RXr3	%ymm11
-#define RXl0	%ymm12
-#define RXl1	%ymm13
-#define RXl2	%ymm14
-#define RXl3	%ymm15
-
-/* temp regs */
-#define RT0	%ymm0
-#define RT0x	%xmm0
-#define RT1	%ymm1
-#define RT1x	%xmm1
-#define RIDX0	%ymm2
-#define RIDX1	%ymm3
-#define RIDX1x	%xmm3
-#define RIDX2	%ymm4
-#define RIDX3	%ymm5
-
-/* vpgatherdd mask and '-1' */
-#define RNOT	%ymm6
-
-/* byte mask, (-1 >> 24) */
-#define RBYTE	%ymm7
-
-/***********************************************************************
- * 32-way AVX2 blowfish
- ***********************************************************************/
-#define F(xl, xr) \
-	vpsrld $24, xl, RIDX0; \
-	vpsrld $16, xl, RIDX1; \
-	vpsrld $8, xl, RIDX2; \
-	vpand RBYTE, RIDX1, RIDX1; \
-	vpand RBYTE, RIDX2, RIDX2; \
-	vpand RBYTE, xl, RIDX3; \
-	\
-	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpcmpeqd RIDX0, RIDX0, RIDX0; \
-	\
-	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
-	vpcmpeqd RIDX1, RIDX1, RIDX1; \
-	vpaddd RT0, RT1, RT0; \
-	\
-	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
-	vpxor RT0, RT1, RT0; \
-	\
-	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpaddd RT0, RT1, RT0; \
-	\
-	vpxor RT0, xr, xr;
-
-#define add_roundkey(xl, nmem) \
-	vpbroadcastd nmem, RT0; \
-	vpxor RT0, xl ## 0, xl ## 0; \
-	vpxor RT0, xl ## 1, xl ## 1; \
-	vpxor RT0, xl ## 2, xl ## 2; \
-	vpxor RT0, xl ## 3, xl ## 3;
-
-#define round_enc() \
-	add_roundkey(RXr, p(CTX,RLOOP,4)); \
-	F(RXl0, RXr0); \
-	F(RXl1, RXr1); \
-	F(RXl2, RXr2); \
-	F(RXl3, RXr3); \
-	\
-	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
-	F(RXr0, RXl0); \
-	F(RXr1, RXl1); \
-	F(RXr2, RXl2); \
-	F(RXr3, RXl3);
-
-#define round_dec() \
-	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
-	F(RXl0, RXr0); \
-	F(RXl1, RXr1); \
-	F(RXl2, RXr2); \
-	F(RXl3, RXr3); \
-	\
-	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
-	F(RXr0, RXl0); \
-	F(RXr1, RXl1); \
-	F(RXr2, RXl2); \
-	F(RXr3, RXl3);
-
-#define init_round_constants() \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	leaq s0(CTX), RS0; \
-	leaq s1(CTX), RS1; \
-	leaq s2(CTX), RS2; \
-	leaq s3(CTX), RS3; \
-	vpsrld $24, RNOT, RBYTE;
-
-#define transpose_2x2(x0, x1, t0) \
-	vpunpckldq x0, x1, t0; \
-	vpunpckhdq x0, x1, x1; \
-	\
-	vpunpcklqdq t0, x1, x0; \
-	vpunpckhqdq t0, x1, x1;
-
-#define read_block(xl, xr) \
-	vbroadcasti128 .Lbswap32_mask, RT1; \
-	\
-	vpshufb RT1, xl ## 0, xl ## 0; \
-	vpshufb RT1, xr ## 0, xr ## 0; \
-	vpshufb RT1, xl ## 1, xl ## 1; \
-	vpshufb RT1, xr ## 1, xr ## 1; \
-	vpshufb RT1, xl ## 2, xl ## 2; \
-	vpshufb RT1, xr ## 2, xr ## 2; \
-	vpshufb RT1, xl ## 3, xl ## 3; \
-	vpshufb RT1, xr ## 3, xr ## 3; \
-	\
-	transpose_2x2(xl ## 0, xr ## 0, RT0); \
-	transpose_2x2(xl ## 1, xr ## 1, RT0); \
-	transpose_2x2(xl ## 2, xr ## 2, RT0); \
-	transpose_2x2(xl ## 3, xr ## 3, RT0);
-
-#define write_block(xl, xr) \
-	vbroadcasti128 .Lbswap32_mask, RT1; \
-	\
-	transpose_2x2(xl ## 0, xr ## 0, RT0); \
-	transpose_2x2(xl ## 1, xr ## 1, RT0); \
-	transpose_2x2(xl ## 2, xr ## 2, RT0); \
-	transpose_2x2(xl ## 3, xr ## 3, RT0); \
-	\
-	vpshufb RT1, xl ## 0, xl ## 0; \
-	vpshufb RT1, xr ## 0, xr ## 0; \
-	vpshufb RT1, xl ## 1, xl ## 1; \
-	vpshufb RT1, xr ## 1, xr ## 1; \
-	vpshufb RT1, xl ## 2, xl ## 2; \
-	vpshufb RT1, xr ## 2, xr ## 2; \
-	vpshufb RT1, xl ## 3, xl ## 3; \
-	vpshufb RT1, xr ## 3, xr ## 3;
-
-.align 8
-__blowfish_enc_blk32:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RXl0..4, RXr0..4: plaintext
-	 * output:
-	 *	RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
-	 */
-	init_round_constants();
-
-	read_block(RXl, RXr);
-
-	movl $1, RLOOPd;
-	add_roundkey(RXl, p+4*(0)(CTX));
-
-.align 4
-.L__enc_loop:
-	round_enc();
-
-	leal 2(RLOOPd), RLOOPd;
-	cmpl $17, RLOOPd;
-	jne .L__enc_loop;
-
-	add_roundkey(RXr, p+4*(17)(CTX));
-
-	write_block(RXl, RXr);
-
-	ret;
-ENDPROC(__blowfish_enc_blk32)
-
-.align 8
-__blowfish_dec_blk32:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RXl0..4, RXr0..4: ciphertext
-	 * output:
-	 *	RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
-	 */
-	init_round_constants();
-
-	read_block(RXl, RXr);
-
-	movl $14, RLOOPd;
-	add_roundkey(RXl, p+4*(17)(CTX));
-
-.align 4
-.L__dec_loop:
-	round_dec();
-
-	addl $-2, RLOOPd;
-	jns .L__dec_loop;
-
-	add_roundkey(RXr, p+4*(0)(CTX));
-
-	write_block(RXl, RXr);
-
-	ret;
-ENDPROC(__blowfish_dec_blk32)
-
-ENTRY(blowfish_ecb_enc_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_enc_blk32;
-
-	vmovdqu RXr0, 0*32(%rsi);
-	vmovdqu RXl0, 1*32(%rsi);
-	vmovdqu RXr1, 2*32(%rsi);
-	vmovdqu RXl1, 3*32(%rsi);
-	vmovdqu RXr2, 4*32(%rsi);
-	vmovdqu RXl2, 5*32(%rsi);
-	vmovdqu RXr3, 6*32(%rsi);
-	vmovdqu RXl3, 7*32(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ecb_enc_32way)
-
-ENTRY(blowfish_ecb_dec_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_dec_blk32;
-
-	vmovdqu RXr0, 0*32(%rsi);
-	vmovdqu RXl0, 1*32(%rsi);
-	vmovdqu RXr1, 2*32(%rsi);
-	vmovdqu RXl1, 3*32(%rsi);
-	vmovdqu RXr2, 4*32(%rsi);
-	vmovdqu RXl2, 5*32(%rsi);
-	vmovdqu RXr3, 6*32(%rsi);
-	vmovdqu RXl3, 7*32(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ecb_dec_32way)
-
-ENTRY(blowfish_cbc_dec_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_dec_blk32;
-
-	/* xor with src */
-	vmovq (%rdx), RT0x;
-	vpshufd $0x4f, RT0x, RT0x;
-	vinserti128 $1, 8(%rdx), RT0, RT0;
-	vpxor RT0, RXr0, RXr0;
-	vpxor 0*32+24(%rdx), RXl0, RXl0;
-	vpxor 1*32+24(%rdx), RXr1, RXr1;
-	vpxor 2*32+24(%rdx), RXl1, RXl1;
-	vpxor 3*32+24(%rdx), RXr2, RXr2;
-	vpxor 4*32+24(%rdx), RXl2, RXl2;
-	vpxor 5*32+24(%rdx), RXr3, RXr3;
-	vpxor 6*32+24(%rdx), RXl3, RXl3;
-
-	vmovdqu RXr0, (0*32)(%rsi);
-	vmovdqu RXl0, (1*32)(%rsi);
-	vmovdqu RXr1, (2*32)(%rsi);
-	vmovdqu RXl1, (3*32)(%rsi);
-	vmovdqu RXr2, (4*32)(%rsi);
-	vmovdqu RXl2, (5*32)(%rsi);
-	vmovdqu RXr3, (6*32)(%rsi);
-	vmovdqu RXl3, (7*32)(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_cbc_dec_32way)
-
-ENTRY(blowfish_ctr_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: iv (big endian, 64bit)
-	 */
-
-	vzeroupper;
-
-	vpcmpeqd RT0, RT0, RT0;
-	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
-
-	vpcmpeqd RT1x, RT1x, RT1x;
-	vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
-	vpxor RIDX0, RIDX0, RIDX0;
-	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
-
-	vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
-
-	vpcmpeqd RT1, RT1, RT1;
-	vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
-	vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
-
-	vbroadcasti128 .Lbswap_iv_mask, RIDX0;
-	vbroadcasti128 .Lbswap128_mask, RIDX1;
-
-	/* load IV and byteswap */
-	vmovq (%rcx), RT1x;
-	vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
-	vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
-
-	/* construct IVs */
-	vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */
-	vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */
-	vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */
-	vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl1;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr1;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl2;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr2;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl3;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr3;
-
-	/* store last IV */
-	vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
-	vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
-	vmovq RT1x, (%rcx);
-
-	call __blowfish_enc_blk32;
-
-	/* dst = src ^ iv */
-	vpxor 0*32(%rdx), RXr0, RXr0;
-	vpxor 1*32(%rdx), RXl0, RXl0;
-	vpxor 2*32(%rdx), RXr1, RXr1;
-	vpxor 3*32(%rdx), RXl1, RXl1;
-	vpxor 4*32(%rdx), RXr2, RXr2;
-	vpxor 5*32(%rdx), RXl2, RXl2;
-	vpxor 6*32(%rdx), RXr3, RXr3;
-	vpxor 7*32(%rdx), RXl3, RXl3;
-	vmovdqu RXr0, (0*32)(%rsi);
-	vmovdqu RXl0, (1*32)(%rsi);
-	vmovdqu RXr1, (2*32)(%rsi);
-	vmovdqu RXl1, (3*32)(%rsi);
-	vmovdqu RXr2, (4*32)(%rsi);
-	vmovdqu RXl2, (5*32)(%rsi);
-	vmovdqu RXr3, (6*32)(%rsi);
-	vmovdqu RXl3, (7*32)(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ctr_32way)
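
For reference, the F() macro in the file deleted above vectorises the scalar
Blowfish round function across 32 blocks using vpgatherdd table lookups. In
plain C the same F is the classic four S-box combination; the s[4][256]
layout here is illustrative (the kernel keeps the boxes inside struct
bf_ctx):

#include <linux/types.h>

/* Scalar reference for Blowfish F: ((S0[a] + S1[b]) ^ S2[c]) + S3[d],
 * where a..d are the four bytes of x, most significant first - exactly
 * what the gather/add/xor/add sequence in the macro computed 32-wide.
 */
static u32 blowfish_f(const u32 s[4][256], u32 x)
{
	return ((s[0][x >> 24] + s[1][(x >> 16) & 0xff]) ^
		s[2][(x >> 8) & 0xff]) + s[3][x & 0xff];
}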
diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c
deleted file mode 100644
index 4417e9aea78d..000000000000
--- a/arch/x86/crypto/blowfish_avx2_glue.c
+++ /dev/null
@@ -1,585 +0,0 @@
-/*
- * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/algapi.h>
-#include <crypto/blowfish.h>
-#include <crypto/cryptd.h>
-#include <crypto/ctr.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
-#include <asm/crypto/blowfish.h>
-#include <asm/crypto/ablk_helper.h>
-#include <crypto/scatterwalk.h>
-
-#define BF_AVX2_PARALLEL_BLOCKS 32
-
-/* 32-way AVX2 parallel cipher functions */
-asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
-				   __be64 *iv);
-
-static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	if (fpu_enabled)
-		return true;
-
-	/* FPU is only used when chunk to be processed is large enough, so
-	 * do not enable FPU until it is necessary.
-	 */
-	if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
-		return false;
-
-	kernel_fpu_begin();
-	return true;
-}
-
-static inline void bf_fpu_end(bool fpu_enabled)
-{
-	if (fpu_enabled)
-		kernel_fpu_end();
-}
-
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     bool enc)
-{
-	bool fpu_enabled = false;
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-
-		/* Process multi-block AVX2 batch */
-		if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					blowfish_ecb_enc_32way(ctx, wdst, wsrc);
-				else
-					blowfish_ecb_dec_32way(ctx, wdst, wsrc);
-
-				wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
-				wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
-				nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Process multi-block batch */
-		if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					blowfish_enc_blk_4way(ctx, wdst, wsrc);
-				else
-					blowfish_dec_blk_4way(ctx, wdst, wsrc);
-
-				wsrc += bsize * BF_PARALLEL_BLOCKS;
-				wdst += bsize * BF_PARALLEL_BLOCKS;
-				nbytes -= bsize * BF_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			if (enc)
-				blowfish_enc_blk(ctx, wdst, wsrc);
-			else
-				blowfish_dec_blk(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-	return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, true);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, false);
-}
-
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 *iv = (u64 *)walk->iv;
-
-	do {
-		*dst = *src ^ *iv;
-		blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	*(u64 *)walk->iv = *iv;
-	return nbytes;
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	return err;
-}
-
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 last_iv;
-	int i;
-
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process multi-block AVX2 batch */
-	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-		do {
-			nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
-			src -= BF_AVX2_PARALLEL_BLOCKS - 1;
-			dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
-
-			blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			*dst ^= *(src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Process multi-block batch */
-	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-		u64 ivs[BF_PARALLEL_BLOCKS - 1];
-
-		do {
-			nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
-			src -= BF_PARALLEL_BLOCKS - 1;
-			dst -= BF_PARALLEL_BLOCKS - 1;
-
-			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
-				dst[i + 1] ^= ivs[i];
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			*dst ^= *(src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	for (;;) {
-		blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
-
-		*dst ^= *(src - 1);
-		src -= 1;
-		dst -= 1;
-	}
-
-done:
-	*dst ^= *(u64 *)walk->iv;
-	*(u64 *)walk->iv = last_iv;
-
-	return nbytes;
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes)) {
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-	return err;
-}
-
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 *ctrblk = walk->iv;
-	u8 keystream[BF_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-
-	blowfish_enc_blk(ctx, keystream, ctrblk);
-	crypto_xor(keystream, src, nbytes);
-	memcpy(dst, keystream, nbytes);
-
-	crypto_inc(ctrblk, BF_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	int i;
-
-	/* Process multi-block AVX2 batch */
-	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-		do {
-			blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
-					   (__be64 *)walk->iv);
-
-			src += BF_AVX2_PARALLEL_BLOCKS;
-			dst += BF_AVX2_PARALLEL_BLOCKS;
-			nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Process four block batch */
-	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-		__be64 ctrblocks[BF_PARALLEL_BLOCKS];
-		u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
-
-		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				ctrblocks[i] = cpu_to_be64(ctrblk++);
-			}
-
-			blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
-						  (u8 *)ctrblocks);
-
-			src += BF_PARALLEL_BLOCKS;
-			dst += BF_PARALLEL_BLOCKS;
-			nbytes -= bsize * BF_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-		*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		u64 ctrblk;
-
-		if (dst != src)
-			*dst = *src;
-
-		ctrblk = *(u64 *)walk->iv;
-		be64_add_cpu((__be64 *)walk->iv, 1);
-
-		blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
-
-		src += 1;
-		dst += 1;
-	} while ((nbytes -= bsize) >= bsize);
-
-done:
-	return nbytes;
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
-}
-
-static struct crypto_alg bf_algs[6] = { {
-	.cra_name = "__ecb-blowfish-avx2",
-	.cra_driver_name = "__driver-ecb-blowfish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = BF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct bf_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.setkey = blowfish_setkey,
-			.encrypt = ecb_encrypt,
-			.decrypt = ecb_decrypt,
-		},
-	},
-}, {
-	.cra_name = "__cbc-blowfish-avx2",
-	.cra_driver_name = "__driver-cbc-blowfish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = BF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct bf_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.setkey = blowfish_setkey,
-			.encrypt = cbc_encrypt,
-			.decrypt = cbc_decrypt,
-		},
-	},
-}, {
-	.cra_name = "__ctr-blowfish-avx2",
-	.cra_driver_name = "__driver-ctr-blowfish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = 1,
-	.cra_ctxsize = sizeof(struct bf_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.ivsize = BF_BLOCK_SIZE,
-			.setkey = blowfish_setkey,
-			.encrypt = ctr_crypt,
-			.decrypt = ctr_crypt,
-		},
-	},
-}, {
-	.cra_name = "ecb(blowfish)",
-	.cra_driver_name = "ecb-blowfish-avx2",
-	.cra_priority = 400,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = BF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = ablk_encrypt,
-			.decrypt = ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name = "cbc(blowfish)",
-	.cra_driver_name = "cbc-blowfish-avx2",
-	.cra_priority = 400,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = BF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.ivsize = BF_BLOCK_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = __ablk_encrypt,
-			.decrypt = ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name = "ctr(blowfish)",
-	.cra_driver_name = "ctr-blowfish-avx2",
-	.cra_priority = 400,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = 1,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.ivsize = BF_BLOCK_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = ablk_encrypt,
-			.decrypt = ablk_encrypt,
-			.geniv = "chainiv",
-		},
-	},
-} };
-
-
-static int __init init(void)
-{
-	u64 xcr0;
-
-	if (!cpu_has_avx2 || !cpu_has_osxsave) {
-		pr_info("AVX2 instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-		return -ENODEV;
-	}
-
-	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("blowfish");
-MODULE_ALIAS("blowfish-asm");
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 3548d76dbaa9..50ec333b70e6 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -1,7 +1,7 @@
 /*
  * Glue Code for assembler optimized version of Blowfish
  *
- * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  *
  * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,24 +32,40 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
-#include <asm/crypto/blowfish.h>
 
 /* regular block cipher functions */
 asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
 				   bool xor);
-EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
-
 asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
-EXPORT_SYMBOL_GPL(blowfish_dec_blk);
 
 /* 4-way parallel cipher functions */
 asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
-EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
-
 asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
 				      const u8 *src);
-EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, true);
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+					     const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, true);
+}
 
 static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 91a1878fcc3e..0e0b8863a34b 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S | |||
@@ -51,16 +51,6 @@ | |||
51 | #define ymm14_x xmm14 | 51 | #define ymm14_x xmm14 |
52 | #define ymm15_x xmm15 | 52 | #define ymm15_x xmm15 |
53 | 53 | ||
54 | /* | ||
55 | * AES-NI instructions do not support ymmX registers, so we need splitting and | ||
56 | * merging. | ||
57 | */ | ||
58 | #define vaesenclast256(zero, yreg, tmp) \ | ||
59 | vextracti128 $1, yreg, tmp##_x; \ | ||
60 | vaesenclast zero##_x, yreg##_x, yreg##_x; \ | ||
61 | vaesenclast zero##_x, tmp##_x, tmp##_x; \ | ||
62 | vinserti128 $1, tmp##_x, yreg, yreg; | ||
63 | |||
64 | /********************************************************************** | 54 | /********************************************************************** |
65 | 32-way camellia | 55 | 32-way camellia |
66 | **********************************************************************/ | 56 | **********************************************************************/ |
@@ -79,46 +69,70 @@ | |||
79 | * S-function with AES subbytes \ | 69 | * S-function with AES subbytes \ |
80 | */ \ | 70 | */ \ |
81 | vbroadcasti128 .Linv_shift_row, t4; \ | 71 | vbroadcasti128 .Linv_shift_row, t4; \ |
82 | vpbroadcastb .L0f0f0f0f, t7; \ | 72 | vpbroadcastd .L0f0f0f0f, t7; \ |
83 | vbroadcasti128 .Lpre_tf_lo_s1, t0; \ | 73 | vbroadcasti128 .Lpre_tf_lo_s1, t5; \ |
84 | vbroadcasti128 .Lpre_tf_hi_s1, t1; \ | 74 | vbroadcasti128 .Lpre_tf_hi_s1, t6; \ |
75 | vbroadcasti128 .Lpre_tf_lo_s4, t2; \ | ||
76 | vbroadcasti128 .Lpre_tf_hi_s4, t3; \ | ||
85 | \ | 77 | \ |
86 | /* AES inverse shift rows */ \ | 78 | /* AES inverse shift rows */ \ |
87 | vpshufb t4, x0, x0; \ | 79 | vpshufb t4, x0, x0; \ |
88 | vpshufb t4, x7, x7; \ | 80 | vpshufb t4, x7, x7; \ |
89 | vpshufb t4, x1, x1; \ | ||
90 | vpshufb t4, x4, x4; \ | ||
91 | vpshufb t4, x2, x2; \ | ||
92 | vpshufb t4, x5, x5; \ | ||
93 | vpshufb t4, x3, x3; \ | 81 | vpshufb t4, x3, x3; \ |
94 | vpshufb t4, x6, x6; \ | 82 | vpshufb t4, x6, x6; \ |
83 | vpshufb t4, x2, x2; \ | ||
84 | vpshufb t4, x5, x5; \ | ||
85 | vpshufb t4, x1, x1; \ | ||
86 | vpshufb t4, x4, x4; \ | ||
95 | \ | 87 | \ |
96 | /* prefilter sboxes 1, 2 and 3 */ \ | 88 | /* prefilter sboxes 1, 2 and 3 */ \ |
97 | vbroadcasti128 .Lpre_tf_lo_s4, t2; \ | ||
98 | vbroadcasti128 .Lpre_tf_hi_s4, t3; \ | ||
99 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
100 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
101 | filter_8bit(x1, t0, t1, t7, t6); \ | ||
102 | filter_8bit(x4, t0, t1, t7, t6); \ | ||
103 | filter_8bit(x2, t0, t1, t7, t6); \ | ||
104 | filter_8bit(x5, t0, t1, t7, t6); \ | ||
105 | \ | ||
106 | /* prefilter sbox 4 */ \ | 89 | /* prefilter sbox 4 */ \ |
90 | filter_8bit(x0, t5, t6, t7, t4); \ | ||
91 | filter_8bit(x7, t5, t6, t7, t4); \ | ||
92 | vextracti128 $1, x0, t0##_x; \ | ||
93 | vextracti128 $1, x7, t1##_x; \ | ||
94 | filter_8bit(x3, t2, t3, t7, t4); \ | ||
95 | filter_8bit(x6, t2, t3, t7, t4); \ | ||
96 | vextracti128 $1, x3, t3##_x; \ | ||
97 | vextracti128 $1, x6, t2##_x; \ | ||
98 | filter_8bit(x2, t5, t6, t7, t4); \ | ||
99 | filter_8bit(x5, t5, t6, t7, t4); \ | ||
100 | filter_8bit(x1, t5, t6, t7, t4); \ | ||
101 | filter_8bit(x4, t5, t6, t7, t4); \ | ||
102 | \ | ||
107 | vpxor t4##_x, t4##_x, t4##_x; \ | 103 | vpxor t4##_x, t4##_x, t4##_x; \ |
108 | filter_8bit(x3, t2, t3, t7, t6); \ | ||
109 | filter_8bit(x6, t2, t3, t7, t6); \ | ||
110 | \ | 104 | \ |
111 | /* AES subbytes + AES shift rows */ \ | 105 | /* AES subbytes + AES shift rows */ \ |
106 | vextracti128 $1, x2, t6##_x; \ | ||
107 | vextracti128 $1, x5, t5##_x; \ | ||
108 | vaesenclast t4##_x, x0##_x, x0##_x; \ | ||
109 | vaesenclast t4##_x, t0##_x, t0##_x; \ | ||
110 | vinserti128 $1, t0##_x, x0, x0; \ | ||
111 | vaesenclast t4##_x, x7##_x, x7##_x; \ | ||
112 | vaesenclast t4##_x, t1##_x, t1##_x; \ | ||
113 | vinserti128 $1, t1##_x, x7, x7; \ | ||
114 | vaesenclast t4##_x, x3##_x, x3##_x; \ | ||
115 | vaesenclast t4##_x, t3##_x, t3##_x; \ | ||
116 | vinserti128 $1, t3##_x, x3, x3; \ | ||
117 | vaesenclast t4##_x, x6##_x, x6##_x; \ | ||
118 | vaesenclast t4##_x, t2##_x, t2##_x; \ | ||
119 | vinserti128 $1, t2##_x, x6, x6; \ | ||
120 | vextracti128 $1, x1, t3##_x; \ | ||
121 | vextracti128 $1, x4, t2##_x; \ | ||
112 | vbroadcasti128 .Lpost_tf_lo_s1, t0; \ | 122 | vbroadcasti128 .Lpost_tf_lo_s1, t0; \ |
113 | vbroadcasti128 .Lpost_tf_hi_s1, t1; \ | 123 | vbroadcasti128 .Lpost_tf_hi_s1, t1; \ |
114 | vaesenclast256(t4, x0, t5); \ | 124 | vaesenclast t4##_x, x2##_x, x2##_x; \ |
115 | vaesenclast256(t4, x7, t5); \ | 125 | vaesenclast t4##_x, t6##_x, t6##_x; \ |
116 | vaesenclast256(t4, x1, t5); \ | 126 | vinserti128 $1, t6##_x, x2, x2; \ |
117 | vaesenclast256(t4, x4, t5); \ | 127 | vaesenclast t4##_x, x5##_x, x5##_x; \ |
118 | vaesenclast256(t4, x2, t5); \ | 128 | vaesenclast t4##_x, t5##_x, t5##_x; \ |
119 | vaesenclast256(t4, x5, t5); \ | 129 | vinserti128 $1, t5##_x, x5, x5; \ |
120 | vaesenclast256(t4, x3, t5); \ | 130 | vaesenclast t4##_x, x1##_x, x1##_x; \ |
121 | vaesenclast256(t4, x6, t5); \ | 131 | vaesenclast t4##_x, t3##_x, t3##_x; \ |
132 | vinserti128 $1, t3##_x, x1, x1; \ | ||
133 | vaesenclast t4##_x, x4##_x, x4##_x; \ | ||
134 | vaesenclast t4##_x, t2##_x, t2##_x; \ | ||
135 | vinserti128 $1, t2##_x, x4, x4; \ | ||
122 | \ | 136 | \ |
123 | /* postfilter sboxes 1 and 4 */ \ | 137 | /* postfilter sboxes 1 and 4 */ \ |
124 | vbroadcasti128 .Lpost_tf_lo_s3, t2; \ | 138 | vbroadcasti128 .Lpost_tf_lo_s3, t2; \ |
@@ -139,22 +153,12 @@ | |||
139 | /* postfilter sbox 2 */ \ | 153 | /* postfilter sbox 2 */ \ |
140 | filter_8bit(x1, t4, t5, t7, t2); \ | 154 | filter_8bit(x1, t4, t5, t7, t2); \ |
141 | filter_8bit(x4, t4, t5, t7, t2); \ | 155 | filter_8bit(x4, t4, t5, t7, t2); \ |
156 | vpxor t7, t7, t7; \ | ||
142 | \ | 157 | \ |
143 | vpsrldq $1, t0, t1; \ | 158 | vpsrldq $1, t0, t1; \ |
144 | vpsrldq $2, t0, t2; \ | 159 | vpsrldq $2, t0, t2; \ |
160 | vpshufb t7, t1, t1; \ | ||
145 | vpsrldq $3, t0, t3; \ | 161 | vpsrldq $3, t0, t3; \ |
146 | vpsrldq $4, t0, t4; \ | ||
147 | vpsrldq $5, t0, t5; \ | ||
148 | vpsrldq $6, t0, t6; \ | ||
149 | vpsrldq $7, t0, t7; \ | ||
150 | vpbroadcastb t0##_x, t0; \ | ||
151 | vpbroadcastb t1##_x, t1; \ | ||
152 | vpbroadcastb t2##_x, t2; \ | ||
153 | vpbroadcastb t3##_x, t3; \ | ||
154 | vpbroadcastb t4##_x, t4; \ | ||
155 | vpbroadcastb t6##_x, t6; \ | ||
156 | vpbroadcastb t5##_x, t5; \ | ||
157 | vpbroadcastb t7##_x, t7; \ | ||
158 | \ | 162 | \ |
159 | /* P-function */ \ | 163 | /* P-function */ \ |
160 | vpxor x5, x0, x0; \ | 164 | vpxor x5, x0, x0; \ |
@@ -162,11 +166,21 @@ | |||
162 | vpxor x7, x2, x2; \ | 166 | vpxor x7, x2, x2; \ |
163 | vpxor x4, x3, x3; \ | 167 | vpxor x4, x3, x3; \ |
164 | \ | 168 | \ |
169 | vpshufb t7, t2, t2; \ | ||
170 | vpsrldq $4, t0, t4; \ | ||
171 | vpshufb t7, t3, t3; \ | ||
172 | vpsrldq $5, t0, t5; \ | ||
173 | vpshufb t7, t4, t4; \ | ||
174 | \ | ||
165 | vpxor x2, x4, x4; \ | 175 | vpxor x2, x4, x4; \ |
166 | vpxor x3, x5, x5; \ | 176 | vpxor x3, x5, x5; \ |
167 | vpxor x0, x6, x6; \ | 177 | vpxor x0, x6, x6; \ |
168 | vpxor x1, x7, x7; \ | 178 | vpxor x1, x7, x7; \ |
169 | \ | 179 | \ |
180 | vpsrldq $6, t0, t6; \ | ||
181 | vpshufb t7, t5, t5; \ | ||
182 | vpshufb t7, t6, t6; \ | ||
183 | \ | ||
170 | vpxor x7, x0, x0; \ | 184 | vpxor x7, x0, x0; \ |
171 | vpxor x4, x1, x1; \ | 185 | vpxor x4, x1, x1; \ |
172 | vpxor x5, x2, x2; \ | 186 | vpxor x5, x2, x2; \ |
@@ -179,12 +193,16 @@ | |||
179 | \ | 193 | \ |
180 | /* Add key material and result to CD (x becomes new CD) */ \ | 194 | /* Add key material and result to CD (x becomes new CD) */ \ |
181 | \ | 195 | \ |
182 | vpxor t7, x0, x0; \ | ||
183 | vpxor 4 * 32(mem_cd), x0, x0; \ | ||
184 | \ | ||
185 | vpxor t6, x1, x1; \ | 196 | vpxor t6, x1, x1; \ |
186 | vpxor 5 * 32(mem_cd), x1, x1; \ | 197 | vpxor 5 * 32(mem_cd), x1, x1; \ |
187 | \ | 198 | \ |
199 | vpsrldq $7, t0, t6; \ | ||
200 | vpshufb t7, t0, t0; \ | ||
201 | vpshufb t7, t6, t7; \ | ||
202 | \ | ||
203 | vpxor t7, x0, x0; \ | ||
204 | vpxor 4 * 32(mem_cd), x0, x0; \ | ||
205 | \ | ||
188 | vpxor t5, x2, x2; \ | 206 | vpxor t5, x2, x2; \ |
189 | vpxor 6 * 32(mem_cd), x2, x2; \ | 207 | vpxor 6 * 32(mem_cd), x2, x2; \ |
190 | \ | 208 | \ |
@@ -204,7 +222,7 @@ | |||
204 | vpxor 3 * 32(mem_cd), x7, x7; | 222 | vpxor 3 * 32(mem_cd), x7, x7; |
205 | 223 | ||
206 | /* | 224 | /* |
207 | * Size optimization... with inlined roundsm16 binary would be over 5 times | 225 | * Size optimization... with inlined roundsm32 binary would be over 5 times |
208 | * larger and would only marginally faster. | 226 | * larger and would only marginally faster. |
209 | */ | 227 | */ |
210 | .align 8 | 228 | .align 8 |
@@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
324 | */ \ | 342 | */ \ |
325 | vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ | 343 | vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ |
326 | vpxor tt0, tt0, tt0; \ | 344 | vpxor tt0, tt0, tt0; \ |
327 | vpbroadcastb t0##_x, t3; \ | 345 | vpshufb tt0, t0, t3; \ |
328 | vpsrldq $1, t0, t0; \ | 346 | vpsrldq $1, t0, t0; \ |
329 | vpbroadcastb t0##_x, t2; \ | 347 | vpshufb tt0, t0, t2; \ |
330 | vpsrldq $1, t0, t0; \ | 348 | vpsrldq $1, t0, t0; \ |
331 | vpbroadcastb t0##_x, t1; \ | 349 | vpshufb tt0, t0, t1; \ |
332 | vpsrldq $1, t0, t0; \ | 350 | vpsrldq $1, t0, t0; \ |
333 | vpbroadcastb t0##_x, t0; \ | 351 | vpshufb tt0, t0, t0; \ |
334 | \ | 352 | \ |
335 | vpand l0, t0, t0; \ | 353 | vpand l0, t0, t0; \ |
336 | vpand l1, t1, t1; \ | 354 | vpand l1, t1, t1; \ |
@@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
340 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | 358 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ |
341 | \ | 359 | \ |
342 | vpxor l4, t0, l4; \ | 360 | vpxor l4, t0, l4; \ |
361 | vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ | ||
343 | vmovdqu l4, 4 * 32(l); \ | 362 | vmovdqu l4, 4 * 32(l); \ |
344 | vpxor l5, t1, l5; \ | 363 | vpxor l5, t1, l5; \ |
345 | vmovdqu l5, 5 * 32(l); \ | 364 | vmovdqu l5, 5 * 32(l); \ |
@@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
354 | * rl ^= t2; \ | 373 | * rl ^= t2; \ |
355 | */ \ | 374 | */ \ |
356 | \ | 375 | \ |
357 | vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ | 376 | vpshufb tt0, t0, t3; \ |
358 | vpbroadcastb t0##_x, t3; \ | ||
359 | vpsrldq $1, t0, t0; \ | 377 | vpsrldq $1, t0, t0; \ |
360 | vpbroadcastb t0##_x, t2; \ | 378 | vpshufb tt0, t0, t2; \ |
361 | vpsrldq $1, t0, t0; \ | 379 | vpsrldq $1, t0, t0; \ |
362 | vpbroadcastb t0##_x, t1; \ | 380 | vpshufb tt0, t0, t1; \ |
363 | vpsrldq $1, t0, t0; \ | 381 | vpsrldq $1, t0, t0; \ |
364 | vpbroadcastb t0##_x, t0; \ | 382 | vpshufb tt0, t0, t0; \ |
365 | \ | 383 | \ |
366 | vpor 4 * 32(r), t0, t0; \ | 384 | vpor 4 * 32(r), t0, t0; \ |
367 | vpor 5 * 32(r), t1, t1; \ | 385 | vpor 5 * 32(r), t1, t1; \ |
@@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
373 | vpxor 2 * 32(r), t2, t2; \ | 391 | vpxor 2 * 32(r), t2, t2; \ |
374 | vpxor 3 * 32(r), t3, t3; \ | 392 | vpxor 3 * 32(r), t3, t3; \ |
375 | vmovdqu t0, 0 * 32(r); \ | 393 | vmovdqu t0, 0 * 32(r); \ |
394 | vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ | ||
376 | vmovdqu t1, 1 * 32(r); \ | 395 | vmovdqu t1, 1 * 32(r); \ |
377 | vmovdqu t2, 2 * 32(r); \ | 396 | vmovdqu t2, 2 * 32(r); \ |
378 | vmovdqu t3, 3 * 32(r); \ | 397 | vmovdqu t3, 3 * 32(r); \ |
@@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
382 | * t2 &= rl; \ | 401 | * t2 &= rl; \ |
383 | * rr ^= rol32(t2, 1); \ | 402 | * rr ^= rol32(t2, 1); \ |
384 | */ \ | 403 | */ \ |
385 | vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ | 404 | vpshufb tt0, t0, t3; \ |
386 | vpbroadcastb t0##_x, t3; \ | ||
387 | vpsrldq $1, t0, t0; \ | 405 | vpsrldq $1, t0, t0; \ |
388 | vpbroadcastb t0##_x, t2; \ | 406 | vpshufb tt0, t0, t2; \ |
389 | vpsrldq $1, t0, t0; \ | 407 | vpsrldq $1, t0, t0; \ |
390 | vpbroadcastb t0##_x, t1; \ | 408 | vpshufb tt0, t0, t1; \ |
391 | vpsrldq $1, t0, t0; \ | 409 | vpsrldq $1, t0, t0; \ |
392 | vpbroadcastb t0##_x, t0; \ | 410 | vpshufb tt0, t0, t0; \ |
393 | \ | 411 | \ |
394 | vpand 0 * 32(r), t0, t0; \ | 412 | vpand 0 * 32(r), t0, t0; \ |
395 | vpand 1 * 32(r), t1, t1; \ | 413 | vpand 1 * 32(r), t1, t1; \ |
@@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
403 | vpxor 6 * 32(r), t2, t2; \ | 421 | vpxor 6 * 32(r), t2, t2; \ |
404 | vpxor 7 * 32(r), t3, t3; \ | 422 | vpxor 7 * 32(r), t3, t3; \ |
405 | vmovdqu t0, 4 * 32(r); \ | 423 | vmovdqu t0, 4 * 32(r); \ |
424 | vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ | ||
406 | vmovdqu t1, 5 * 32(r); \ | 425 | vmovdqu t1, 5 * 32(r); \ |
407 | vmovdqu t2, 6 * 32(r); \ | 426 | vmovdqu t2, 6 * 32(r); \ |
408 | vmovdqu t3, 7 * 32(r); \ | 427 | vmovdqu t3, 7 * 32(r); \ |
@@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
413 | * ll ^= t0; \ | 432 | * ll ^= t0; \ |
414 | */ \ | 433 | */ \ |
415 | \ | 434 | \ |
416 | vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ | 435 | vpshufb tt0, t0, t3; \ |
417 | vpbroadcastb t0##_x, t3; \ | ||
418 | vpsrldq $1, t0, t0; \ | 436 | vpsrldq $1, t0, t0; \ |
419 | vpbroadcastb t0##_x, t2; \ | 437 | vpshufb tt0, t0, t2; \ |
420 | vpsrldq $1, t0, t0; \ | 438 | vpsrldq $1, t0, t0; \ |
421 | vpbroadcastb t0##_x, t1; \ | 439 | vpshufb tt0, t0, t1; \ |
422 | vpsrldq $1, t0, t0; \ | 440 | vpsrldq $1, t0, t0; \ |
423 | vpbroadcastb t0##_x, t0; \ | 441 | vpshufb tt0, t0, t0; \ |
424 | \ | 442 | \ |
425 | vpor l4, t0, t0; \ | 443 | vpor l4, t0, t0; \ |
426 | vpor l5, t1, t1; \ | 444 | vpor l5, t1, t1; \ |
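The camellia hunk above replaces each per-byte vpbroadcastb with a vpshufb whose index register (tt0) is all zeroes, which copies byte 0 of every 128-bit lane to all bytes of that lane; since t0 is filled by vpbroadcastd, both lanes hold the same key word, so the result is unchanged. The hunk also hoists each vpbroadcastd up between the preceding stores. A minimal C intrinsics model of the byte-broadcast idiom, built with -mavx2 (an editor's sketch, not code from the patch):

	#include <immintrin.h>
	#include <stdio.h>

	/*
	 * Model of the byte-broadcast idiom in the hunk above: vpshufb with
	 * an all-zero index register (tt0) copies byte 0 of each 128-bit
	 * lane to every byte of that lane. t0 was filled by vpbroadcastd,
	 * so both lanes hold the same key word and the per-lane broadcast
	 * matches what vpbroadcastb t0##_x produced.
	 */
	int main(void)
	{
		__m256i zero = _mm256_setzero_si256();		/* tt0 */
		__m256i t0 = _mm256_set1_epi32(0x44332211);	/* vpbroadcastd krr */
		unsigned char out[32];

		__m256i t3 = _mm256_shuffle_epi8(t0, zero);	/* broadcast byte 0 */
		t0 = _mm256_srli_si256(t0, 1);			/* vpsrldq $1 */
		__m256i t2 = _mm256_shuffle_epi8(t0, zero);	/* broadcast byte 1 */

		_mm256_storeu_si256((__m256i *)out, t3);
		printf("t3: %02x .. %02x\n", out[0], out[31]);	/* 11 .. 11 */
		_mm256_storeu_si256((__m256i *)out, t2);
		printf("t2: %02x .. %02x\n", out[0], out[31]);	/* 22 .. 22 */
		return 0;
	}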
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S new file mode 100644 index 000000000000..35e97569d05f --- /dev/null +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S | |||
@@ -0,0 +1,643 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions | ||
3 | # | ||
4 | # Copyright (c) 2013, Intel Corporation | ||
5 | # | ||
6 | # Authors: | ||
7 | # Erdinc Ozturk <erdinc.ozturk@intel.com> | ||
8 | # Vinodh Gopal <vinodh.gopal@intel.com> | ||
9 | # James Guilford <james.guilford@intel.com> | ||
10 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | # | ||
12 | # This software is available to you under a choice of one of two | ||
13 | # licenses. You may choose to be licensed under the terms of the GNU | ||
14 | # General Public License (GPL) Version 2, available from the file | ||
15 | # COPYING in the main directory of this source tree, or the | ||
16 | # OpenIB.org BSD license below: | ||
17 | # | ||
18 | # Redistribution and use in source and binary forms, with or without | ||
19 | # modification, are permitted provided that the following conditions are | ||
20 | # met: | ||
21 | # | ||
22 | # * Redistributions of source code must retain the above copyright | ||
23 | # notice, this list of conditions and the following disclaimer. | ||
24 | # | ||
25 | # * Redistributions in binary form must reproduce the above copyright | ||
26 | # notice, this list of conditions and the following disclaimer in the | ||
27 | # documentation and/or other materials provided with the | ||
28 | # distribution. | ||
29 | # | ||
30 | # * Neither the name of the Intel Corporation nor the names of its | ||
31 | # contributors may be used to endorse or promote products derived from | ||
32 | # this software without specific prior written permission. | ||
33 | # | ||
34 | # | ||
35 | # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ||
36 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
37 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
38 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR | ||
39 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
40 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
41 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
42 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
43 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
44 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
45 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
46 | ######################################################################## | ||
47 | # Function API: | ||
48 | # UINT16 crc_t10dif_pcl( | ||
49 | # UINT16 init_crc, //initial CRC value, 16 bits | ||
50 | # const unsigned char *buf, //buffer pointer to calculate CRC on | ||
51 | # UINT64 len //buffer length in bytes (64-bit data) | ||
52 | # ); | ||
53 | # | ||
54 | # Reference paper titled "Fast CRC Computation for Generic | ||
55 | # Polynomials Using PCLMULQDQ Instruction" | ||
56 | # URL: http://www.intel.com/content/dam/www/public/us/en/documents | ||
57 | # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf | ||
58 | # | ||
59 | # | ||
60 | |||
61 | #include <linux/linkage.h> | ||
62 | |||
63 | .text | ||
64 | |||
65 | #define arg1 %rdi | ||
66 | #define arg2 %rsi | ||
67 | #define arg3 %rdx | ||
68 | |||
69 | #define arg1_low32 %edi | ||
70 | |||
71 | ENTRY(crc_t10dif_pcl) | ||
72 | .align 16 | ||
73 | |||
74 | # adjust the 16-bit initial_crc value, scale it to 32 bits | ||
75 | shl $16, arg1_low32 | ||
76 | |||
77 | # Allocate Stack Space | ||
78 | mov %rsp, %rcx | ||
79 | sub $16*2, %rsp | ||
80 | # align stack to 16 byte boundary | ||
81 | and $~(0x10 - 1), %rsp | ||
82 | |||
83 | # check if smaller than 256 | ||
84 | cmp $256, arg3 | ||
85 | |||
86 | # for sizes less than 256, we can't fold 128B at a time... ||
87 | jl _less_than_128 | ||
88 | |||
89 | |||
90 | # load the initial crc value | ||
91 | movd arg1_low32, %xmm10 # initial crc | ||
92 | |||
93 | # the crc value does not need to be byte-reflected, but it does ||
94 | # need to be moved to the high part of the register, because the ||
95 | # data will be byte-reflected and will then align with the ||
96 | # initial crc in the correct place. ||
97 | pslldq $12, %xmm10 | ||
98 | |||
99 | movdqa SHUF_MASK(%rip), %xmm11 | ||
100 | # receive the initial 128B of data, xor the initial crc value ||
101 | movdqu 16*0(arg2), %xmm0 | ||
102 | movdqu 16*1(arg2), %xmm1 | ||
103 | movdqu 16*2(arg2), %xmm2 | ||
104 | movdqu 16*3(arg2), %xmm3 | ||
105 | movdqu 16*4(arg2), %xmm4 | ||
106 | movdqu 16*5(arg2), %xmm5 | ||
107 | movdqu 16*6(arg2), %xmm6 | ||
108 | movdqu 16*7(arg2), %xmm7 | ||
109 | |||
110 | pshufb %xmm11, %xmm0 | ||
111 | # XOR the initial_crc value | ||
112 | pxor %xmm10, %xmm0 | ||
113 | pshufb %xmm11, %xmm1 | ||
114 | pshufb %xmm11, %xmm2 | ||
115 | pshufb %xmm11, %xmm3 | ||
116 | pshufb %xmm11, %xmm4 | ||
117 | pshufb %xmm11, %xmm5 | ||
118 | pshufb %xmm11, %xmm6 | ||
119 | pshufb %xmm11, %xmm7 | ||
120 | |||
121 | movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 | ||
122 | #imm value of pclmulqdq instruction | ||
123 | #will determine which constant to use | ||
124 | |||
125 | ################################################################# | ||
126 | # we subtract 256 instead of 128 to save one instruction from the loop | ||
127 | sub $256, arg3 | ||
128 | |||
129 | # at this point there are 128*x+y (0<=y<128) bytes of buffer. The ||
130 | # _fold_64_B_loop will fold 128B at a time ||
131 | # until we have 128+y bytes of buffer ||
132 | |||
133 | |||
134 | # fold 128B at a time. This section of the code folds 8 xmm ||
135 | # registers in parallel | ||
136 | _fold_64_B_loop: | ||
137 | |||
138 | # update the buffer pointer | ||
139 | add $128, arg2 # buf += 128 ||
140 | |||
141 | movdqu 16*0(arg2), %xmm9 | ||
142 | movdqu 16*1(arg2), %xmm12 | ||
143 | pshufb %xmm11, %xmm9 | ||
144 | pshufb %xmm11, %xmm12 | ||
145 | movdqa %xmm0, %xmm8 | ||
146 | movdqa %xmm1, %xmm13 | ||
147 | pclmulqdq $0x0 , %xmm10, %xmm0 | ||
148 | pclmulqdq $0x11, %xmm10, %xmm8 | ||
149 | pclmulqdq $0x0 , %xmm10, %xmm1 | ||
150 | pclmulqdq $0x11, %xmm10, %xmm13 | ||
151 | pxor %xmm9 , %xmm0 | ||
152 | xorps %xmm8 , %xmm0 | ||
153 | pxor %xmm12, %xmm1 | ||
154 | xorps %xmm13, %xmm1 | ||
155 | |||
156 | movdqu 16*2(arg2), %xmm9 | ||
157 | movdqu 16*3(arg2), %xmm12 | ||
158 | pshufb %xmm11, %xmm9 | ||
159 | pshufb %xmm11, %xmm12 | ||
160 | movdqa %xmm2, %xmm8 | ||
161 | movdqa %xmm3, %xmm13 | ||
162 | pclmulqdq $0x0, %xmm10, %xmm2 | ||
163 | pclmulqdq $0x11, %xmm10, %xmm8 | ||
164 | pclmulqdq $0x0, %xmm10, %xmm3 | ||
165 | pclmulqdq $0x11, %xmm10, %xmm13 | ||
166 | pxor %xmm9 , %xmm2 | ||
167 | xorps %xmm8 , %xmm2 | ||
168 | pxor %xmm12, %xmm3 | ||
169 | xorps %xmm13, %xmm3 | ||
170 | |||
171 | movdqu 16*4(arg2), %xmm9 | ||
172 | movdqu 16*5(arg2), %xmm12 | ||
173 | pshufb %xmm11, %xmm9 | ||
174 | pshufb %xmm11, %xmm12 | ||
175 | movdqa %xmm4, %xmm8 | ||
176 | movdqa %xmm5, %xmm13 | ||
177 | pclmulqdq $0x0, %xmm10, %xmm4 | ||
178 | pclmulqdq $0x11, %xmm10, %xmm8 | ||
179 | pclmulqdq $0x0, %xmm10, %xmm5 | ||
180 | pclmulqdq $0x11, %xmm10, %xmm13 | ||
181 | pxor %xmm9 , %xmm4 | ||
182 | xorps %xmm8 , %xmm4 | ||
183 | pxor %xmm12, %xmm5 | ||
184 | xorps %xmm13, %xmm5 | ||
185 | |||
186 | movdqu 16*6(arg2), %xmm9 | ||
187 | movdqu 16*7(arg2), %xmm12 | ||
188 | pshufb %xmm11, %xmm9 | ||
189 | pshufb %xmm11, %xmm12 | ||
190 | movdqa %xmm6 , %xmm8 | ||
191 | movdqa %xmm7 , %xmm13 | ||
192 | pclmulqdq $0x0 , %xmm10, %xmm6 | ||
193 | pclmulqdq $0x11, %xmm10, %xmm8 | ||
194 | pclmulqdq $0x0 , %xmm10, %xmm7 | ||
195 | pclmulqdq $0x11, %xmm10, %xmm13 | ||
196 | pxor %xmm9 , %xmm6 | ||
197 | xorps %xmm8 , %xmm6 | ||
198 | pxor %xmm12, %xmm7 | ||
199 | xorps %xmm13, %xmm7 | ||
200 | |||
201 | sub $128, arg3 | ||
202 | |||
203 | # check if there is another 64B in the buffer to be able to fold | ||
204 | jge _fold_64_B_loop | ||
205 | ################################################################## | ||
206 | |||
207 | |||
208 | add $128, arg2 | ||
209 | # at this point, the buffer pointer is pointing at the last y bytes ||
210 | # of the buffer; the 128B of folded data is in 8 of the xmm ||
211 | # registers: xmm0 through xmm7 ||
212 | |||
213 | |||
214 | # fold the 8 xmm registers to 1 xmm register with different constants | ||
215 | |||
216 | movdqa rk9(%rip), %xmm10 | ||
217 | movdqa %xmm0, %xmm8 | ||
218 | pclmulqdq $0x11, %xmm10, %xmm0 | ||
219 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
220 | pxor %xmm8, %xmm7 | ||
221 | xorps %xmm0, %xmm7 | ||
222 | |||
223 | movdqa rk11(%rip), %xmm10 | ||
224 | movdqa %xmm1, %xmm8 | ||
225 | pclmulqdq $0x11, %xmm10, %xmm1 | ||
226 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
227 | pxor %xmm8, %xmm7 | ||
228 | xorps %xmm1, %xmm7 | ||
229 | |||
230 | movdqa rk13(%rip), %xmm10 | ||
231 | movdqa %xmm2, %xmm8 | ||
232 | pclmulqdq $0x11, %xmm10, %xmm2 | ||
233 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
234 | pxor %xmm8, %xmm7 | ||
235 | pxor %xmm2, %xmm7 | ||
236 | |||
237 | movdqa rk15(%rip), %xmm10 | ||
238 | movdqa %xmm3, %xmm8 | ||
239 | pclmulqdq $0x11, %xmm10, %xmm3 | ||
240 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
241 | pxor %xmm8, %xmm7 | ||
242 | xorps %xmm3, %xmm7 | ||
243 | |||
244 | movdqa rk17(%rip), %xmm10 | ||
245 | movdqa %xmm4, %xmm8 | ||
246 | pclmulqdq $0x11, %xmm10, %xmm4 | ||
247 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
248 | pxor %xmm8, %xmm7 | ||
249 | pxor %xmm4, %xmm7 | ||
250 | |||
251 | movdqa rk19(%rip), %xmm10 | ||
252 | movdqa %xmm5, %xmm8 | ||
253 | pclmulqdq $0x11, %xmm10, %xmm5 | ||
254 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
255 | pxor %xmm8, %xmm7 | ||
256 | xorps %xmm5, %xmm7 | ||
257 | |||
258 | movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 | ||
259 | #imm value of pclmulqdq instruction | ||
260 | #will determine which constant to use | ||
261 | movdqa %xmm6, %xmm8 | ||
262 | pclmulqdq $0x11, %xmm10, %xmm6 | ||
263 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
264 | pxor %xmm8, %xmm7 | ||
265 | pxor %xmm6, %xmm7 | ||
266 | |||
267 | |||
268 | # instead of adding 128, we add 128-16 to the loop counter to save ||
269 | # one instruction from the loop; instead of a cmp instruction, we ||
270 | # use the sign flag with the jl instruction ||
271 | add $128-16, arg3 | ||
272 | jl _final_reduction_for_128 | ||
273 | |||
274 | # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 | ||
275 | # and the rest is in memory. We can fold 16 bytes at a time if y>=16 | ||
276 | # continue folding 16B at a time | ||
277 | |||
278 | _16B_reduction_loop: | ||
279 | movdqa %xmm7, %xmm8 | ||
280 | pclmulqdq $0x11, %xmm10, %xmm7 | ||
281 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
282 | pxor %xmm8, %xmm7 | ||
283 | movdqu (arg2), %xmm0 | ||
284 | pshufb %xmm11, %xmm0 | ||
285 | pxor %xmm0 , %xmm7 | ||
286 | add $16, arg2 | ||
287 | sub $16, arg3 | ||
288 | # instead of a cmp instruction, we utilize the flags from the sub ||
289 | # above with the jge instruction (equivalent of: cmp $16-16, arg3) ||
290 | # check if there are any more 16B in the buffer to be able to fold ||
291 | jge _16B_reduction_loop | ||
292 | |||
293 | # now we have 16+z bytes left to reduce, where 0 <= z < 16. ||
294 | # first, we reduce the data in the xmm7 register ||
295 | |||
296 | |||
297 | _final_reduction_for_128: | ||
298 | # check if any more data to fold. If not, compute the CRC of | ||
299 | # the final 128 bits | ||
300 | add $16, arg3 | ||
301 | je _128_done | ||
302 | |||
303 | # here we have fewer than 16 bytes of data left. ||
304 | # since we know that there was data before the pointer, we can ||
305 | # offset the input pointer back before the current point, to load ||
306 | # exactly 16 bytes. after that the registers need to be adjusted. ||
307 | _get_last_two_xmms: | ||
308 | movdqa %xmm7, %xmm2 | ||
309 | |||
310 | movdqu -16(arg2, arg3), %xmm1 | ||
311 | pshufb %xmm11, %xmm1 | ||
312 | |||
313 | # get rid of the extra data that was loaded before | ||
314 | # load the shift constant | ||
315 | lea pshufb_shf_table+16(%rip), %rax | ||
316 | sub arg3, %rax | ||
317 | movdqu (%rax), %xmm0 | ||
318 | |||
319 | # shift xmm2 to the left by arg3 bytes | ||
320 | pshufb %xmm0, %xmm2 | ||
321 | |||
322 | # shift xmm7 to the right by 16-arg3 bytes | ||
323 | pxor mask1(%rip), %xmm0 | ||
324 | pshufb %xmm0, %xmm7 | ||
325 | pblendvb %xmm2, %xmm1 #xmm0 is implicit | ||
326 | |||
327 | # fold 16 Bytes | ||
328 | movdqa %xmm1, %xmm2 | ||
329 | movdqa %xmm7, %xmm8 | ||
330 | pclmulqdq $0x11, %xmm10, %xmm7 | ||
331 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
332 | pxor %xmm8, %xmm7 | ||
333 | pxor %xmm2, %xmm7 | ||
334 | |||
335 | _128_done: | ||
336 | # compute crc of a 128-bit value | ||
337 | movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 | ||
338 | movdqa %xmm7, %xmm0 | ||
339 | |||
340 | #64b fold | ||
341 | pclmulqdq $0x1, %xmm10, %xmm7 | ||
342 | pslldq $8 , %xmm0 | ||
343 | pxor %xmm0, %xmm7 | ||
344 | |||
345 | #32b fold | ||
346 | movdqa %xmm7, %xmm0 | ||
347 | |||
348 | pand mask2(%rip), %xmm0 | ||
349 | |||
350 | psrldq $12, %xmm7 | ||
351 | pclmulqdq $0x10, %xmm10, %xmm7 | ||
352 | pxor %xmm0, %xmm7 | ||
353 | |||
354 | #barrett reduction | ||
355 | _barrett: | ||
356 | movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 | ||
357 | movdqa %xmm7, %xmm0 | ||
358 | pclmulqdq $0x01, %xmm10, %xmm7 | ||
359 | pslldq $4, %xmm7 | ||
360 | pclmulqdq $0x11, %xmm10, %xmm7 | ||
361 | |||
362 | pslldq $4, %xmm7 | ||
363 | pxor %xmm0, %xmm7 | ||
364 | pextrd $1, %xmm7, %eax | ||
365 | |||
366 | _cleanup: | ||
367 | # scale the result back to 16 bits | ||
368 | shr $16, %eax | ||
369 | mov %rcx, %rsp | ||
370 | ret | ||
371 | |||
372 | ######################################################################## | ||
373 | |||
374 | .align 16 | ||
375 | _less_than_128: | ||
376 | |||
377 | # check if there is enough buffer to be able to fold 16B at a time | ||
378 | cmp $32, arg3 | ||
379 | jl _less_than_32 | ||
380 | movdqa SHUF_MASK(%rip), %xmm11 | ||
381 | |||
382 | # there is enough; load the fold constants ||
383 | movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 | ||
384 | |||
385 | movd arg1_low32, %xmm0 # get the initial crc value | ||
386 | pslldq $12, %xmm0 # align it to its correct place | ||
387 | movdqu (arg2), %xmm7 # load the plaintext | ||
388 | pshufb %xmm11, %xmm7 # byte-reflect the plaintext | ||
389 | pxor %xmm0, %xmm7 | ||
390 | |||
391 | |||
392 | # update the buffer pointer | ||
393 | add $16, arg2 | ||
394 | |||
395 | # update the counter. subtract 32 instead of 16 to save one | ||
396 | # instruction from the loop | ||
397 | sub $32, arg3 | ||
398 | |||
399 | jmp _16B_reduction_loop | ||
400 | |||
401 | |||
402 | .align 16 | ||
403 | _less_than_32: | ||
404 | # mov initial crc to the return value. this is necessary for | ||
405 | # zero-length buffers. | ||
406 | mov arg1_low32, %eax | ||
407 | test arg3, arg3 | ||
408 | je _cleanup | ||
409 | |||
410 | movdqa SHUF_MASK(%rip), %xmm11 | ||
411 | |||
412 | movd arg1_low32, %xmm0 # get the initial crc value | ||
413 | pslldq $12, %xmm0 # align it to its correct place | ||
414 | |||
415 | cmp $16, arg3 | ||
416 | je _exact_16_left | ||
417 | jl _less_than_16_left | ||
418 | |||
419 | movdqu (arg2), %xmm7 # load the plaintext | ||
420 | pshufb %xmm11, %xmm7 # byte-reflect the plaintext | ||
421 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
422 | add $16, arg2 | ||
423 | sub $16, arg3 | ||
424 | movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 | ||
425 | jmp _get_last_two_xmms | ||
426 | |||
427 | |||
428 | .align 16 | ||
429 | _less_than_16_left: | ||
430 | # use stack space to load data less than 16 bytes, zero-out | ||
431 | # the 16B in memory first. | ||
432 | |||
433 | pxor %xmm1, %xmm1 | ||
434 | mov %rsp, %r11 | ||
435 | movdqa %xmm1, (%r11) | ||
436 | |||
437 | cmp $4, arg3 | ||
438 | jl _only_less_than_4 | ||
439 | |||
440 | # backup the counter value | ||
441 | mov arg3, %r9 | ||
442 | cmp $8, arg3 | ||
443 | jl _less_than_8_left | ||
444 | |||
445 | # load 8 Bytes | ||
446 | mov (arg2), %rax | ||
447 | mov %rax, (%r11) | ||
448 | add $8, %r11 | ||
449 | sub $8, arg3 | ||
450 | add $8, arg2 | ||
451 | _less_than_8_left: | ||
452 | |||
453 | cmp $4, arg3 | ||
454 | jl _less_than_4_left | ||
455 | |||
456 | # load 4 Bytes | ||
457 | mov (arg2), %eax | ||
458 | mov %eax, (%r11) | ||
459 | add $4, %r11 | ||
460 | sub $4, arg3 | ||
461 | add $4, arg2 | ||
462 | _less_than_4_left: | ||
463 | |||
464 | cmp $2, arg3 | ||
465 | jl _less_than_2_left | ||
466 | |||
467 | # load 2 Bytes | ||
468 | mov (arg2), %ax | ||
469 | mov %ax, (%r11) | ||
470 | add $2, %r11 | ||
471 | sub $2, arg3 | ||
472 | add $2, arg2 | ||
473 | _less_than_2_left: | ||
474 | cmp $1, arg3 | ||
475 | jl _zero_left | ||
476 | |||
477 | # load 1 Byte | ||
478 | mov (arg2), %al | ||
479 | mov %al, (%r11) | ||
480 | _zero_left: | ||
481 | movdqa (%rsp), %xmm7 | ||
482 | pshufb %xmm11, %xmm7 | ||
483 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
484 | |||
485 | # shl r9, 4 | ||
486 | lea pshufb_shf_table+16(%rip), %rax | ||
487 | sub %r9, %rax | ||
488 | movdqu (%rax), %xmm0 | ||
489 | pxor mask1(%rip), %xmm0 | ||
490 | |||
491 | pshufb %xmm0, %xmm7 | ||
492 | jmp _128_done | ||
493 | |||
494 | .align 16 | ||
495 | _exact_16_left: | ||
496 | movdqu (arg2), %xmm7 | ||
497 | pshufb %xmm11, %xmm7 | ||
498 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
499 | |||
500 | jmp _128_done | ||
501 | |||
502 | _only_less_than_4: | ||
503 | cmp $3, arg3 | ||
504 | jl _only_less_than_3 | ||
505 | |||
506 | # load 3 Bytes | ||
507 | mov (arg2), %al | ||
508 | mov %al, (%r11) | ||
509 | |||
510 | mov 1(arg2), %al | ||
511 | mov %al, 1(%r11) | ||
512 | |||
513 | mov 2(arg2), %al | ||
514 | mov %al, 2(%r11) | ||
515 | |||
516 | movdqa (%rsp), %xmm7 | ||
517 | pshufb %xmm11, %xmm7 | ||
518 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
519 | |||
520 | psrldq $5, %xmm7 | ||
521 | |||
522 | jmp _barrett | ||
523 | _only_less_than_3: | ||
524 | cmp $2, arg3 | ||
525 | jl _only_less_than_2 | ||
526 | |||
527 | # load 2 Bytes | ||
528 | mov (arg2), %al | ||
529 | mov %al, (%r11) | ||
530 | |||
531 | mov 1(arg2), %al | ||
532 | mov %al, 1(%r11) | ||
533 | |||
534 | movdqa (%rsp), %xmm7 | ||
535 | pshufb %xmm11, %xmm7 | ||
536 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
537 | |||
538 | psrldq $6, %xmm7 | ||
539 | |||
540 | jmp _barrett | ||
541 | _only_less_than_2: | ||
542 | |||
543 | # load 1 Byte | ||
544 | mov (arg2), %al | ||
545 | mov %al, (%r11) | ||
546 | |||
547 | movdqa (%rsp), %xmm7 | ||
548 | pshufb %xmm11, %xmm7 | ||
549 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
550 | |||
551 | psrldq $7, %xmm7 | ||
552 | |||
553 | jmp _barrett | ||
554 | |||
555 | ENDPROC(crc_t10dif_pcl) | ||
556 | |||
557 | .data | ||
558 | |||
559 | # precomputed constants | ||
560 | # these constants are precomputed from the poly: | ||
561 | # 0x8bb70000 (0x8bb7 scaled to 32 bits) | ||
562 | .align 16 | ||
563 | # Q = 0x18BB70000 | ||
564 | # rk1 = 2^(32*3) mod Q << 32 | ||
565 | # rk2 = 2^(32*5) mod Q << 32 | ||
566 | # rk3 = 2^(32*15) mod Q << 32 | ||
567 | # rk4 = 2^(32*17) mod Q << 32 | ||
568 | # rk5 = 2^(32*3) mod Q << 32 | ||
569 | # rk6 = 2^(32*2) mod Q << 32 | ||
570 | # rk7 = floor(2^64/Q) | ||
571 | # rk8 = Q | ||
572 | rk1: | ||
573 | .quad 0x2d56000000000000 | ||
574 | rk2: | ||
575 | .quad 0x06df000000000000 | ||
576 | rk3: | ||
577 | .quad 0x9d9d000000000000 | ||
578 | rk4: | ||
579 | .quad 0x7cf5000000000000 | ||
580 | rk5: | ||
581 | .quad 0x2d56000000000000 | ||
582 | rk6: | ||
583 | .quad 0x1368000000000000 | ||
584 | rk7: | ||
585 | .quad 0x00000001f65a57f8 | ||
586 | rk8: | ||
587 | .quad 0x000000018bb70000 | ||
588 | |||
589 | rk9: | ||
590 | .quad 0xceae000000000000 | ||
591 | rk10: | ||
592 | .quad 0xbfd6000000000000 | ||
593 | rk11: | ||
594 | .quad 0x1e16000000000000 | ||
595 | rk12: | ||
596 | .quad 0x713c000000000000 | ||
597 | rk13: | ||
598 | .quad 0xf7f9000000000000 | ||
599 | rk14: | ||
600 | .quad 0x80a6000000000000 | ||
601 | rk15: | ||
602 | .quad 0x044c000000000000 | ||
603 | rk16: | ||
604 | .quad 0xe658000000000000 | ||
605 | rk17: | ||
606 | .quad 0xad18000000000000 | ||
607 | rk18: | ||
608 | .quad 0xa497000000000000 | ||
609 | rk19: | ||
610 | .quad 0x6ee3000000000000 | ||
611 | rk20: | ||
612 | .quad 0xe7b5000000000000 | ||
613 | |||
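The fold constants rk1-rk6 and rk9-rk20 all follow the recipe documented in the comment block above. A small generator (an editor's sketch, which assumes the 16-bit remainder is scaled into the upper half of its dword the same way the polynomial is scaled to 0x8bb70000, hence the combined << 48) can reproduce them for review; rk7, floor(2^64/Q), needs a carry-less division and is not covered:

	#include <stdint.h>
	#include <stdio.h>

	/* x^n mod Q over GF(2), with Q = 0x18BB7 (degree 16) */
	static uint32_t xpow_mod_q(unsigned int n)
	{
		uint32_t r = 1;			/* the polynomial x^0 */

		while (n--) {
			r <<= 1;		/* multiply by x */
			if (r & 0x10000)	/* degree hit 16: reduce by Q */
				r ^= 0x18BB7;
		}
		return r;
	}

	int main(void)
	{
		/* compare against rk1 and rk2 in the table above */
		printf("rk1 = %#018llx\n",
		       (unsigned long long)xpow_mod_q(32 * 3) << 48);
		printf("rk2 = %#018llx\n",
		       (unsigned long long)xpow_mod_q(32 * 5) << 48);
		return 0;
	}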
614 | |||
615 | |||
616 | mask1: | ||
617 | .octa 0x80808080808080808080808080808080 | ||
618 | mask2: | ||
619 | .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF | ||
620 | |||
621 | SHUF_MASK: | ||
622 | .octa 0x000102030405060708090A0B0C0D0E0F | ||
623 | |||
624 | pshufb_shf_table: | ||
625 | # use these values for shift constants for the pshufb instruction | ||
626 | # different alignments result in values as shown: | ||
627 | # DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 | ||
628 | # DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2 ||
629 | # DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3 ||
630 | # DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 | ||
631 | # DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 | ||
632 | # DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 | ||
633 | # DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 | ||
634 | # DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 | ||
635 | # DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 | ||
636 | # DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 | ||
637 | # DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 | ||
638 | # DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 | ||
639 | # DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 | ||
640 | # DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 | ||
641 | # DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 | ||
642 | .octa 0x8f8e8d8c8b8a89888786858483828100 | ||
643 | .octa 0x000e0d0c0b0a09080706050403020100 | ||
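For cross-checking the file above end to end, a bit-serial reference of the same CRC (polynomial 0x8bb7, MSB-first, zero initial value, no reflection and no final xor) makes a convenient oracle; this sketch is the editor's, not part of the patch:

	#include <stdint.h>
	#include <stddef.h>

	/* bit-serial CRC-T10DIF: one byte in, eight shift/reduce steps */
	static uint16_t crc_t10dif_ref(uint16_t crc, const unsigned char *buf,
				       size_t len)
	{
		while (len--) {
			int i;

			crc ^= (uint16_t)(*buf++) << 8;
			for (i = 0; i < 8; i++)
				crc = (crc & 0x8000) ?
					(uint16_t)(crc << 1) ^ 0x8bb7 :
					(uint16_t)(crc << 1);
		}
		return crc;
	}

Comparing crc_t10dif_ref() against crc_t10dif_pcl() over buffers of assorted lengths exercises every size-dependent tail path above, from _less_than_128 down to _only_less_than_2.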
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c new file mode 100644 index 000000000000..7845d7fd54c0 --- /dev/null +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * Cryptographic API. | ||
3 | * | ||
4 | * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions | ||
5 | * | ||
6 | * Copyright (C) 2013 Intel Corporation | ||
7 | * Author: Tim Chen <tim.c.chen@linux.intel.com> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify it | ||
10 | * under the terms of the GNU General Public License as published by the Free | ||
11 | * Software Foundation; either version 2 of the License, or (at your option) | ||
12 | * any later version. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
15 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
16 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
17 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
18 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
19 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
20 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
21 | * SOFTWARE. | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #include <linux/types.h> | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/crc-t10dif.h> | ||
28 | #include <crypto/internal/hash.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/string.h> | ||
31 | #include <linux/kernel.h> | ||
32 | #include <asm/i387.h> | ||
33 | #include <asm/cpufeature.h> | ||
34 | #include <asm/cpu_device_id.h> | ||
35 | |||
36 | asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, | ||
37 | size_t len); | ||
38 | |||
39 | struct chksum_desc_ctx { | ||
40 | __u16 crc; | ||
41 | }; | ||
42 | |||
43 | /* | ||
44 | * Steps through buffer one byte at at time, calculates reflected | ||
45 | * crc using table. | ||
46 | */ | ||
47 | |||
48 | static int chksum_init(struct shash_desc *desc) | ||
49 | { | ||
50 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
51 | |||
52 | ctx->crc = 0; | ||
53 | |||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static int chksum_update(struct shash_desc *desc, const u8 *data, | ||
58 | unsigned int length) | ||
59 | { | ||
60 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
61 | |||
62 | if (irq_fpu_usable()) { | ||
63 | kernel_fpu_begin(); | ||
64 | ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); | ||
65 | kernel_fpu_end(); | ||
66 | } else | ||
67 | ctx->crc = crc_t10dif_generic(ctx->crc, data, length); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static int chksum_final(struct shash_desc *desc, u8 *out) | ||
72 | { | ||
73 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
74 | |||
75 | *(__u16 *)out = ctx->crc; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, | ||
80 | u8 *out) | ||
81 | { | ||
82 | if (irq_fpu_usable()) { | ||
83 | kernel_fpu_begin(); | ||
84 | *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); | ||
85 | kernel_fpu_end(); | ||
86 | } else | ||
87 | *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | static int chksum_finup(struct shash_desc *desc, const u8 *data, | ||
92 | unsigned int len, u8 *out) | ||
93 | { | ||
94 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
95 | |||
96 | return __chksum_finup(&ctx->crc, data, len, out); | ||
97 | } | ||
98 | |||
99 | static int chksum_digest(struct shash_desc *desc, const u8 *data, | ||
100 | unsigned int length, u8 *out) | ||
101 | { | ||
102 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
103 | |||
104 | return __chksum_finup(&ctx->crc, data, length, out); | ||
105 | } | ||
106 | |||
107 | static struct shash_alg alg = { | ||
108 | .digestsize = CRC_T10DIF_DIGEST_SIZE, | ||
109 | .init = chksum_init, | ||
110 | .update = chksum_update, | ||
111 | .final = chksum_final, | ||
112 | .finup = chksum_finup, | ||
113 | .digest = chksum_digest, | ||
114 | .descsize = sizeof(struct chksum_desc_ctx), | ||
115 | .base = { | ||
116 | .cra_name = "crct10dif", | ||
117 | .cra_driver_name = "crct10dif-pclmul", | ||
118 | .cra_priority = 200, | ||
119 | .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, | ||
120 | .cra_module = THIS_MODULE, | ||
121 | } | ||
122 | }; | ||
123 | |||
124 | static const struct x86_cpu_id crct10dif_cpu_id[] = { | ||
125 | X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), | ||
126 | {} | ||
127 | }; | ||
128 | MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); | ||
129 | |||
130 | static int __init crct10dif_intel_mod_init(void) | ||
131 | { | ||
132 | if (!x86_match_cpu(crct10dif_cpu_id)) | ||
133 | return -ENODEV; | ||
134 | |||
135 | return crypto_register_shash(&alg); | ||
136 | } | ||
137 | |||
138 | static void __exit crct10dif_intel_mod_fini(void) | ||
139 | { | ||
140 | crypto_unregister_shash(&alg); | ||
141 | } | ||
142 | |||
143 | module_init(crct10dif_intel_mod_init); | ||
144 | module_exit(crct10dif_intel_mod_fini); | ||
145 | |||
146 | MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); | ||
147 | MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); | ||
148 | MODULE_LICENSE("GPL"); | ||
149 | |||
150 | MODULE_ALIAS("crct10dif"); | ||
151 | MODULE_ALIAS("crct10dif-pclmul"); | ||
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 597d4da69656..50226c4b86ed 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c | |||
@@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in) | |||
187 | return 0; | 187 | return 0; |
188 | } | 188 | } |
189 | 189 | ||
190 | static struct shash_alg alg = { | 190 | static int sha224_ssse3_init(struct shash_desc *desc) |
191 | { | ||
192 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
193 | |||
194 | sctx->state[0] = SHA224_H0; | ||
195 | sctx->state[1] = SHA224_H1; | ||
196 | sctx->state[2] = SHA224_H2; | ||
197 | sctx->state[3] = SHA224_H3; | ||
198 | sctx->state[4] = SHA224_H4; | ||
199 | sctx->state[5] = SHA224_H5; | ||
200 | sctx->state[6] = SHA224_H6; | ||
201 | sctx->state[7] = SHA224_H7; | ||
202 | sctx->count = 0; | ||
203 | |||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) | ||
208 | { | ||
209 | u8 D[SHA256_DIGEST_SIZE]; | ||
210 | |||
211 | sha256_ssse3_final(desc, D); | ||
212 | |||
213 | memcpy(hash, D, SHA224_DIGEST_SIZE); | ||
214 | memset(D, 0, SHA256_DIGEST_SIZE); | ||
215 | |||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | static struct shash_alg algs[] = { { | ||
191 | .digestsize = SHA256_DIGEST_SIZE, | 220 | .digestsize = SHA256_DIGEST_SIZE, |
192 | .init = sha256_ssse3_init, | 221 | .init = sha256_ssse3_init, |
193 | .update = sha256_ssse3_update, | 222 | .update = sha256_ssse3_update, |
@@ -204,7 +233,24 @@ static struct shash_alg alg = { | |||
204 | .cra_blocksize = SHA256_BLOCK_SIZE, | 233 | .cra_blocksize = SHA256_BLOCK_SIZE, |
205 | .cra_module = THIS_MODULE, | 234 | .cra_module = THIS_MODULE, |
206 | } | 235 | } |
207 | }; | 236 | }, { |
237 | .digestsize = SHA224_DIGEST_SIZE, | ||
238 | .init = sha224_ssse3_init, | ||
239 | .update = sha256_ssse3_update, | ||
240 | .final = sha224_ssse3_final, | ||
241 | .export = sha256_ssse3_export, | ||
242 | .import = sha256_ssse3_import, | ||
243 | .descsize = sizeof(struct sha256_state), | ||
244 | .statesize = sizeof(struct sha256_state), | ||
245 | .base = { | ||
246 | .cra_name = "sha224", | ||
247 | .cra_driver_name = "sha224-ssse3", | ||
248 | .cra_priority = 150, | ||
249 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
250 | .cra_blocksize = SHA224_BLOCK_SIZE, | ||
251 | .cra_module = THIS_MODULE, | ||
252 | } | ||
253 | } }; | ||
208 | 254 | ||
209 | #ifdef CONFIG_AS_AVX | 255 | #ifdef CONFIG_AS_AVX |
210 | static bool __init avx_usable(void) | 256 | static bool __init avx_usable(void) |
@@ -227,7 +273,7 @@ static bool __init avx_usable(void) | |||
227 | 273 | ||
228 | static int __init sha256_ssse3_mod_init(void) | 274 | static int __init sha256_ssse3_mod_init(void) |
229 | { | 275 | { |
230 | /* test for SSE3 first */ | 276 | /* test for SSSE3 first */ |
231 | if (cpu_has_ssse3) | 277 | if (cpu_has_ssse3) |
232 | sha256_transform_asm = sha256_transform_ssse3; | 278 | sha256_transform_asm = sha256_transform_ssse3; |
233 | 279 | ||
@@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void) | |||
254 | else | 300 | else |
255 | #endif | 301 | #endif |
256 | pr_info("Using SSSE3 optimized SHA-256 implementation\n"); | 302 | pr_info("Using SSSE3 optimized SHA-256 implementation\n"); |
257 | return crypto_register_shash(&alg); | 303 | return crypto_register_shashes(algs, ARRAY_SIZE(algs)); |
258 | } | 304 | } |
259 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | 305 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); |
260 | 306 | ||
@@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void) | |||
263 | 309 | ||
264 | static void __exit sha256_ssse3_mod_fini(void) | 310 | static void __exit sha256_ssse3_mod_fini(void) |
265 | { | 311 | { |
266 | crypto_unregister_shash(&alg); | 312 | crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); |
267 | } | 313 | } |
268 | 314 | ||
269 | module_init(sha256_ssse3_mod_init); | 315 | module_init(sha256_ssse3_mod_init); |
@@ -273,3 +319,4 @@ MODULE_LICENSE("GPL"); | |||
273 | MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | 319 | MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); |
274 | 320 | ||
275 | MODULE_ALIAS("sha256"); | 321 | MODULE_ALIAS("sha256"); |
322 | MODULE_ALIAS("sha384"); | ||
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 6cbd8df348d2..f30cd10293f0 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c | |||
@@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in) | |||
194 | return 0; | 194 | return 0; |
195 | } | 195 | } |
196 | 196 | ||
197 | static struct shash_alg alg = { | 197 | static int sha384_ssse3_init(struct shash_desc *desc) |
198 | { | ||
199 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
200 | |||
201 | sctx->state[0] = SHA384_H0; | ||
202 | sctx->state[1] = SHA384_H1; | ||
203 | sctx->state[2] = SHA384_H2; | ||
204 | sctx->state[3] = SHA384_H3; | ||
205 | sctx->state[4] = SHA384_H4; | ||
206 | sctx->state[5] = SHA384_H5; | ||
207 | sctx->state[6] = SHA384_H6; | ||
208 | sctx->state[7] = SHA384_H7; | ||
209 | |||
210 | sctx->count[0] = sctx->count[1] = 0; | ||
211 | |||
212 | return 0; | ||
213 | } | ||
214 | |||
215 | static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) | ||
216 | { | ||
217 | u8 D[SHA512_DIGEST_SIZE]; | ||
218 | |||
219 | sha512_ssse3_final(desc, D); | ||
220 | |||
221 | memcpy(hash, D, SHA384_DIGEST_SIZE); | ||
222 | memset(D, 0, SHA512_DIGEST_SIZE); | ||
223 | |||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | static struct shash_alg algs[] = { { | ||
198 | .digestsize = SHA512_DIGEST_SIZE, | 228 | .digestsize = SHA512_DIGEST_SIZE, |
199 | .init = sha512_ssse3_init, | 229 | .init = sha512_ssse3_init, |
200 | .update = sha512_ssse3_update, | 230 | .update = sha512_ssse3_update, |
@@ -211,7 +241,24 @@ static struct shash_alg alg = { | |||
211 | .cra_blocksize = SHA512_BLOCK_SIZE, | 241 | .cra_blocksize = SHA512_BLOCK_SIZE, |
212 | .cra_module = THIS_MODULE, | 242 | .cra_module = THIS_MODULE, |
213 | } | 243 | } |
214 | }; | 244 | }, { |
245 | .digestsize = SHA384_DIGEST_SIZE, | ||
246 | .init = sha384_ssse3_init, | ||
247 | .update = sha512_ssse3_update, | ||
248 | .final = sha384_ssse3_final, | ||
249 | .export = sha512_ssse3_export, | ||
250 | .import = sha512_ssse3_import, | ||
251 | .descsize = sizeof(struct sha512_state), | ||
252 | .statesize = sizeof(struct sha512_state), | ||
253 | .base = { | ||
254 | .cra_name = "sha384", | ||
255 | .cra_driver_name = "sha384-ssse3", | ||
256 | .cra_priority = 150, | ||
257 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
258 | .cra_blocksize = SHA384_BLOCK_SIZE, | ||
259 | .cra_module = THIS_MODULE, | ||
260 | } | ||
261 | } }; | ||
215 | 262 | ||
216 | #ifdef CONFIG_AS_AVX | 263 | #ifdef CONFIG_AS_AVX |
217 | static bool __init avx_usable(void) | 264 | static bool __init avx_usable(void) |
@@ -234,7 +281,7 @@ static bool __init avx_usable(void) | |||
234 | 281 | ||
235 | static int __init sha512_ssse3_mod_init(void) | 282 | static int __init sha512_ssse3_mod_init(void) |
236 | { | 283 | { |
237 | /* test for SSE3 first */ | 284 | /* test for SSSE3 first */ |
238 | if (cpu_has_ssse3) | 285 | if (cpu_has_ssse3) |
239 | sha512_transform_asm = sha512_transform_ssse3; | 286 | sha512_transform_asm = sha512_transform_ssse3; |
240 | 287 | ||
@@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void) | |||
261 | else | 308 | else |
262 | #endif | 309 | #endif |
263 | pr_info("Using SSSE3 optimized SHA-512 implementation\n"); | 310 | pr_info("Using SSSE3 optimized SHA-512 implementation\n"); |
264 | return crypto_register_shash(&alg); | 311 | return crypto_register_shashes(algs, ARRAY_SIZE(algs)); |
265 | } | 312 | } |
266 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | 313 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); |
267 | 314 | ||
@@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void) | |||
270 | 317 | ||
271 | static void __exit sha512_ssse3_mod_fini(void) | 318 | static void __exit sha512_ssse3_mod_fini(void) |
272 | { | 319 | { |
273 | crypto_unregister_shash(&alg); | 320 | crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); |
274 | } | 321 | } |
275 | 322 | ||
276 | module_init(sha512_ssse3_mod_init); | 323 | module_init(sha512_ssse3_mod_init); |
@@ -280,3 +327,4 @@ MODULE_LICENSE("GPL"); | |||
280 | MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | 327 | MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); |
281 | 328 | ||
282 | MODULE_ALIAS("sha512"); | 329 | MODULE_ALIAS("sha512"); |
330 | MODULE_ALIAS("sha384"); | ||
diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S deleted file mode 100644 index e1a83b9cd389..000000000000 --- a/arch/x86/crypto/twofish-avx2-asm_64.S +++ /dev/null | |||
@@ -1,600 +0,0 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | #include "glue_helper-asm-avx2.S" | ||
15 | |||
16 | .file "twofish-avx2-asm_64.S" | ||
17 | |||
18 | .data | ||
19 | .align 16 | ||
20 | |||
21 | .Lvpshufb_mask0: | ||
22 | .long 0x80808000 | ||
23 | .long 0x80808004 | ||
24 | .long 0x80808008 | ||
25 | .long 0x8080800c | ||
26 | |||
27 | .Lbswap128_mask: | ||
28 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
29 | .Lxts_gf128mul_and_shl1_mask_0: | ||
30 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
31 | .Lxts_gf128mul_and_shl1_mask_1: | ||
32 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
33 | |||
34 | .text | ||
35 | |||
36 | /* structure of crypto context */ | ||
37 | #define s0 0 | ||
38 | #define s1 1024 | ||
39 | #define s2 2048 | ||
40 | #define s3 3072 | ||
41 | #define w 4096 | ||
42 | #define k 4128 | ||
43 | |||
44 | /* register macros */ | ||
45 | #define CTX %rdi | ||
46 | |||
47 | #define RS0 CTX | ||
48 | #define RS1 %r8 | ||
49 | #define RS2 %r9 | ||
50 | #define RS3 %r10 | ||
51 | #define RK %r11 | ||
52 | #define RW %rax | ||
53 | #define RROUND %r12 | ||
54 | #define RROUNDd %r12d | ||
55 | |||
56 | #define RA0 %ymm8 | ||
57 | #define RB0 %ymm9 | ||
58 | #define RC0 %ymm10 | ||
59 | #define RD0 %ymm11 | ||
60 | #define RA1 %ymm12 | ||
61 | #define RB1 %ymm13 | ||
62 | #define RC1 %ymm14 | ||
63 | #define RD1 %ymm15 | ||
64 | |||
65 | /* temp regs */ | ||
66 | #define RX0 %ymm0 | ||
67 | #define RY0 %ymm1 | ||
68 | #define RX1 %ymm2 | ||
69 | #define RY1 %ymm3 | ||
70 | #define RT0 %ymm4 | ||
71 | #define RIDX %ymm5 | ||
72 | |||
73 | #define RX0x %xmm0 | ||
74 | #define RY0x %xmm1 | ||
75 | #define RX1x %xmm2 | ||
76 | #define RY1x %xmm3 | ||
77 | #define RT0x %xmm4 | ||
78 | |||
79 | /* vpgatherdd mask and '-1' */ | ||
80 | #define RNOT %ymm6 | ||
81 | |||
82 | /* byte mask, (-1 >> 24) */ | ||
83 | #define RBYTE %ymm7 | ||
84 | |||
85 | /********************************************************************** | ||
86 | 16-way AVX2 twofish | ||
87 | **********************************************************************/ | ||
88 | #define init_round_constants() \ | ||
89 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
90 | vpsrld $24, RNOT, RBYTE; \ | ||
91 | leaq k(CTX), RK; \ | ||
92 | leaq w(CTX), RW; \ | ||
93 | leaq s1(CTX), RS1; \ | ||
94 | leaq s2(CTX), RS2; \ | ||
95 | leaq s3(CTX), RS3; \ | ||
96 | |||
97 | #define g16(ab, rs0, rs1, rs2, rs3, xy) \ | ||
98 | vpand RBYTE, ab ## 0, RIDX; \ | ||
99 | vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ | ||
100 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
101 | \ | ||
102 | vpand RBYTE, ab ## 1, RIDX; \ | ||
103 | vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ | ||
104 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
105 | \ | ||
106 | vpsrld $8, ab ## 0, RIDX; \ | ||
107 | vpand RBYTE, RIDX, RIDX; \ | ||
108 | vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ | ||
109 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
110 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
111 | \ | ||
112 | vpsrld $8, ab ## 1, RIDX; \ | ||
113 | vpand RBYTE, RIDX, RIDX; \ | ||
114 | vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ | ||
115 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
116 | vpxor RT0, xy ## 1, xy ## 1; \ | ||
117 | \ | ||
118 | vpsrld $16, ab ## 0, RIDX; \ | ||
119 | vpand RBYTE, RIDX, RIDX; \ | ||
120 | vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ | ||
121 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
122 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
123 | \ | ||
124 | vpsrld $16, ab ## 1, RIDX; \ | ||
125 | vpand RBYTE, RIDX, RIDX; \ | ||
126 | vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ | ||
127 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
128 | vpxor RT0, xy ## 1, xy ## 1; \ | ||
129 | \ | ||
130 | vpsrld $24, ab ## 0, RIDX; \ | ||
131 | vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ | ||
132 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
133 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
134 | \ | ||
135 | vpsrld $24, ab ## 1, RIDX; \ | ||
136 | vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ | ||
137 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
138 | vpxor RT0, xy ## 1, xy ## 1; | ||
139 | |||
140 | #define g1_16(a, x) \ | ||
141 | g16(a, RS0, RS1, RS2, RS3, x); | ||
142 | |||
143 | #define g2_16(b, y) \ | ||
144 | g16(b, RS1, RS2, RS3, RS0, y); | ||
145 | |||
146 | #define encrypt_round_end16(a, b, c, d, nk) \ | ||
147 | vpaddd RY0, RX0, RX0; \ | ||
148 | vpaddd RX0, RY0, RY0; \ | ||
149 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
150 | vpaddd RT0, RX0, RX0; \ | ||
151 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
152 | vpaddd RT0, RY0, RY0; \ | ||
153 | \ | ||
154 | vpxor RY0, d ## 0, d ## 0; \ | ||
155 | \ | ||
156 | vpxor RX0, c ## 0, c ## 0; \ | ||
157 | vpsrld $1, c ## 0, RT0; \ | ||
158 | vpslld $31, c ## 0, c ## 0; \ | ||
159 | vpor RT0, c ## 0, c ## 0; \ | ||
160 | \ | ||
161 | vpaddd RY1, RX1, RX1; \ | ||
162 | vpaddd RX1, RY1, RY1; \ | ||
163 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
164 | vpaddd RT0, RX1, RX1; \ | ||
165 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
166 | vpaddd RT0, RY1, RY1; \ | ||
167 | \ | ||
168 | vpxor RY1, d ## 1, d ## 1; \ | ||
169 | \ | ||
170 | vpxor RX1, c ## 1, c ## 1; \ | ||
171 | vpsrld $1, c ## 1, RT0; \ | ||
172 | vpslld $31, c ## 1, c ## 1; \ | ||
173 | vpor RT0, c ## 1, c ## 1; \ | ||
174 | |||
175 | #define encrypt_round16(a, b, c, d, nk) \ | ||
176 | g2_16(b, RY); \ | ||
177 | \ | ||
178 | vpslld $1, b ## 0, RT0; \ | ||
179 | vpsrld $31, b ## 0, b ## 0; \ | ||
180 | vpor RT0, b ## 0, b ## 0; \ | ||
181 | \ | ||
182 | vpslld $1, b ## 1, RT0; \ | ||
183 | vpsrld $31, b ## 1, b ## 1; \ | ||
184 | vpor RT0, b ## 1, b ## 1; \ | ||
185 | \ | ||
186 | g1_16(a, RX); \ | ||
187 | \ | ||
188 | encrypt_round_end16(a, b, c, d, nk); | ||
189 | |||
190 | #define encrypt_round_first16(a, b, c, d, nk) \ | ||
191 | vpslld $1, d ## 0, RT0; \ | ||
192 | vpsrld $31, d ## 0, d ## 0; \ | ||
193 | vpor RT0, d ## 0, d ## 0; \ | ||
194 | \ | ||
195 | vpslld $1, d ## 1, RT0; \ | ||
196 | vpsrld $31, d ## 1, d ## 1; \ | ||
197 | vpor RT0, d ## 1, d ## 1; \ | ||
198 | \ | ||
199 | encrypt_round16(a, b, c, d, nk); | ||
200 | |||
201 | #define encrypt_round_last16(a, b, c, d, nk) \ | ||
202 | g2_16(b, RY); \ | ||
203 | \ | ||
204 | g1_16(a, RX); \ | ||
205 | \ | ||
206 | encrypt_round_end16(a, b, c, d, nk); | ||
207 | |||
208 | #define decrypt_round_end16(a, b, c, d, nk) \ | ||
209 | vpaddd RY0, RX0, RX0; \ | ||
210 | vpaddd RX0, RY0, RY0; \ | ||
211 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
212 | vpaddd RT0, RX0, RX0; \ | ||
213 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
214 | vpaddd RT0, RY0, RY0; \ | ||
215 | \ | ||
216 | vpxor RX0, c ## 0, c ## 0; \ | ||
217 | \ | ||
218 | vpxor RY0, d ## 0, d ## 0; \ | ||
219 | vpsrld $1, d ## 0, RT0; \ | ||
220 | vpslld $31, d ## 0, d ## 0; \ | ||
221 | vpor RT0, d ## 0, d ## 0; \ | ||
222 | \ | ||
223 | vpaddd RY1, RX1, RX1; \ | ||
224 | vpaddd RX1, RY1, RY1; \ | ||
225 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
226 | vpaddd RT0, RX1, RX1; \ | ||
227 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
228 | vpaddd RT0, RY1, RY1; \ | ||
229 | \ | ||
230 | vpxor RX1, c ## 1, c ## 1; \ | ||
231 | \ | ||
232 | vpxor RY1, d ## 1, d ## 1; \ | ||
233 | vpsrld $1, d ## 1, RT0; \ | ||
234 | vpslld $31, d ## 1, d ## 1; \ | ||
235 | vpor RT0, d ## 1, d ## 1; | ||
236 | |||
237 | #define decrypt_round16(a, b, c, d, nk) \ | ||
238 | g1_16(a, RX); \ | ||
239 | \ | ||
240 | vpslld $1, a ## 0, RT0; \ | ||
241 | vpsrld $31, a ## 0, a ## 0; \ | ||
242 | vpor RT0, a ## 0, a ## 0; \ | ||
243 | \ | ||
244 | vpslld $1, a ## 1, RT0; \ | ||
245 | vpsrld $31, a ## 1, a ## 1; \ | ||
246 | vpor RT0, a ## 1, a ## 1; \ | ||
247 | \ | ||
248 | g2_16(b, RY); \ | ||
249 | \ | ||
250 | decrypt_round_end16(a, b, c, d, nk); | ||
251 | |||
252 | #define decrypt_round_first16(a, b, c, d, nk) \ | ||
253 | vpslld $1, c ## 0, RT0; \ | ||
254 | vpsrld $31, c ## 0, c ## 0; \ | ||
255 | vpor RT0, c ## 0, c ## 0; \ | ||
256 | \ | ||
257 | vpslld $1, c ## 1, RT0; \ | ||
258 | vpsrld $31, c ## 1, c ## 1; \ | ||
259 | vpor RT0, c ## 1, c ## 1; \ | ||
260 | \ | ||
261 | decrypt_round16(a, b, c, d, nk) | ||
262 | |||
263 | #define decrypt_round_last16(a, b, c, d, nk) \ | ||
264 | g1_16(a, RX); \ | ||
265 | \ | ||
266 | g2_16(b, RY); \ | ||
267 | \ | ||
268 | decrypt_round_end16(a, b, c, d, nk); | ||
269 | |||
270 | #define encrypt_cycle16() \ | ||
271 | encrypt_round16(RA, RB, RC, RD, 0); \ | ||
272 | encrypt_round16(RC, RD, RA, RB, 8); | ||
273 | |||
274 | #define encrypt_cycle_first16() \ | ||
275 | encrypt_round_first16(RA, RB, RC, RD, 0); \ | ||
276 | encrypt_round16(RC, RD, RA, RB, 8); | ||
277 | |||
278 | #define encrypt_cycle_last16() \ | ||
279 | encrypt_round16(RA, RB, RC, RD, 0); \ | ||
280 | encrypt_round_last16(RC, RD, RA, RB, 8); | ||
281 | |||
282 | #define decrypt_cycle16(n) \ | ||
283 | decrypt_round16(RC, RD, RA, RB, 8); \ | ||
284 | decrypt_round16(RA, RB, RC, RD, 0); | ||
285 | |||
286 | #define decrypt_cycle_first16(n) \ | ||
287 | decrypt_round_first16(RC, RD, RA, RB, 8); \ | ||
288 | decrypt_round16(RA, RB, RC, RD, 0); | ||
289 | |||
290 | #define decrypt_cycle_last16(n) \ | ||
291 | decrypt_round16(RC, RD, RA, RB, 8); \ | ||
292 | decrypt_round_last16(RA, RB, RC, RD, 0); | ||
293 | |||
294 | #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ | ||
295 | vpunpckhdq x1, x0, t2; \ | ||
296 | vpunpckldq x1, x0, x0; \ | ||
297 | \ | ||
298 | vpunpckldq x3, x2, t1; \ | ||
299 | vpunpckhdq x3, x2, x2; \ | ||
300 | \ | ||
301 | vpunpckhqdq t1, x0, x1; \ | ||
302 | vpunpcklqdq t1, x0, x0; \ | ||
303 | \ | ||
304 | vpunpckhqdq x2, t2, x3; \ | ||
305 | vpunpcklqdq x2, t2, x2; | ||
306 | |||
307 | #define read_blocks8(offs,a,b,c,d) \ | ||
308 | transpose_4x4(a, b, c, d, RX0, RY0); | ||
309 | |||
310 | #define write_blocks8(offs,a,b,c,d) \ | ||
311 | transpose_4x4(a, b, c, d, RX0, RY0); | ||
312 | |||
313 | #define inpack_enc8(a,b,c,d) \ | ||
314 | vpbroadcastd 4*0(RW), RT0; \ | ||
315 | vpxor RT0, a, a; \ | ||
316 | \ | ||
317 | vpbroadcastd 4*1(RW), RT0; \ | ||
318 | vpxor RT0, b, b; \ | ||
319 | \ | ||
320 | vpbroadcastd 4*2(RW), RT0; \ | ||
321 | vpxor RT0, c, c; \ | ||
322 | \ | ||
323 | vpbroadcastd 4*3(RW), RT0; \ | ||
324 | vpxor RT0, d, d; | ||
325 | |||
326 | #define outunpack_enc8(a,b,c,d) \ | ||
327 | vpbroadcastd 4*4(RW), RX0; \ | ||
328 | vpbroadcastd 4*5(RW), RY0; \ | ||
329 | vpxor RX0, c, RX0; \ | ||
330 | vpxor RY0, d, RY0; \ | ||
331 | \ | ||
332 | vpbroadcastd 4*6(RW), RT0; \ | ||
333 | vpxor RT0, a, c; \ | ||
334 | vpbroadcastd 4*7(RW), RT0; \ | ||
335 | vpxor RT0, b, d; \ | ||
336 | \ | ||
337 | vmovdqa RX0, a; \ | ||
338 | vmovdqa RY0, b; | ||
339 | |||
340 | #define inpack_dec8(a,b,c,d) \ | ||
341 | vpbroadcastd 4*4(RW), RX0; \ | ||
342 | vpbroadcastd 4*5(RW), RY0; \ | ||
343 | vpxor RX0, a, RX0; \ | ||
344 | vpxor RY0, b, RY0; \ | ||
345 | \ | ||
346 | vpbroadcastd 4*6(RW), RT0; \ | ||
347 | vpxor RT0, c, a; \ | ||
348 | vpbroadcastd 4*7(RW), RT0; \ | ||
349 | vpxor RT0, d, b; \ | ||
350 | \ | ||
351 | vmovdqa RX0, c; \ | ||
352 | vmovdqa RY0, d; | ||
353 | |||
354 | #define outunpack_dec8(a,b,c,d) \ | ||
355 | vpbroadcastd 4*0(RW), RT0; \ | ||
356 | vpxor RT0, a, a; \ | ||
357 | \ | ||
358 | vpbroadcastd 4*1(RW), RT0; \ | ||
359 | vpxor RT0, b, b; \ | ||
360 | \ | ||
361 | vpbroadcastd 4*2(RW), RT0; \ | ||
362 | vpxor RT0, c, c; \ | ||
363 | \ | ||
364 | vpbroadcastd 4*3(RW), RT0; \ | ||
365 | vpxor RT0, d, d; | ||
366 | |||
367 | #define read_blocks16(a,b,c,d) \ | ||
368 | read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
369 | read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
370 | |||
371 | #define write_blocks16(a,b,c,d) \ | ||
372 | write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
373 | write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
374 | |||
375 | #define xor_blocks16(a,b,c,d) \ | ||
376 | xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
377 | xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
378 | |||
379 | #define inpack_enc16(a,b,c,d) \ | ||
380 | inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
381 | inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
382 | |||
383 | #define outunpack_enc16(a,b,c,d) \ | ||
384 | outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
385 | outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
386 | |||
387 | #define inpack_dec16(a,b,c,d) \ | ||
388 | inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
389 | inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
390 | |||
391 | #define outunpack_dec16(a,b,c,d) \ | ||
392 | outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
393 | outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
394 | |||
395 | .align 8 | ||
396 | __twofish_enc_blk16: | ||
397 | /* input: | ||
398 | * %rdi: ctx, CTX | ||
399 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext | ||
400 | * output: | ||
401 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext | ||
402 | */ | ||
403 | init_round_constants(); | ||
404 | |||
405 | read_blocks16(RA, RB, RC, RD); | ||
406 | inpack_enc16(RA, RB, RC, RD); | ||
407 | |||
408 | xorl RROUNDd, RROUNDd; | ||
409 | encrypt_cycle_first16(); | ||
410 | movl $2, RROUNDd; | ||
411 | |||
412 | .align 4 | ||
413 | .L__enc_loop: | ||
414 | encrypt_cycle16(); | ||
415 | |||
416 | addl $2, RROUNDd; | ||
417 | cmpl $14, RROUNDd; | ||
418 | jne .L__enc_loop; | ||
419 | |||
420 | encrypt_cycle_last16(); | ||
421 | |||
422 | outunpack_enc16(RA, RB, RC, RD); | ||
423 | write_blocks16(RA, RB, RC, RD); | ||
424 | |||
425 | ret; | ||
426 | ENDPROC(__twofish_enc_blk16) | ||
427 | |||
428 | .align 8 | ||
429 | __twofish_dec_blk16: | ||
430 | /* input: | ||
431 | * %rdi: ctx, CTX | ||
432 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext | ||
433 | * output: | ||
434 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext | ||
435 | */ | ||
436 | init_round_constants(); | ||
437 | |||
438 | read_blocks16(RA, RB, RC, RD); | ||
439 | inpack_dec16(RA, RB, RC, RD); | ||
440 | |||
441 | movl $14, RROUNDd; | ||
442 | decrypt_cycle_first16(); | ||
443 | movl $12, RROUNDd; | ||
444 | |||
445 | .align 4 | ||
446 | .L__dec_loop: | ||
447 | decrypt_cycle16(); | ||
448 | |||
449 | addl $-2, RROUNDd; | ||
450 | jnz .L__dec_loop; | ||
451 | |||
452 | decrypt_cycle_last16(); | ||
453 | |||
454 | outunpack_dec16(RA, RB, RC, RD); | ||
455 | write_blocks16(RA, RB, RC, RD); | ||
456 | |||
457 | ret; | ||
458 | ENDPROC(__twofish_dec_blk16) | ||
459 | |||
460 | ENTRY(twofish_ecb_enc_16way) | ||
461 | /* input: | ||
462 | * %rdi: ctx, CTX | ||
463 | * %rsi: dst | ||
464 | * %rdx: src | ||
465 | */ | ||
466 | |||
467 | vzeroupper; | ||
468 | pushq %r12; | ||
469 | |||
470 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
471 | |||
472 | call __twofish_enc_blk16; | ||
473 | |||
474 | store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
475 | |||
476 | popq %r12; | ||
477 | vzeroupper; | ||
478 | |||
479 | ret; | ||
480 | ENDPROC(twofish_ecb_enc_16way) | ||
481 | |||
482 | ENTRY(twofish_ecb_dec_16way) | ||
483 | /* input: | ||
484 | * %rdi: ctx, CTX | ||
485 | * %rsi: dst | ||
486 | * %rdx: src | ||
487 | */ | ||
488 | |||
489 | vzeroupper; | ||
490 | pushq %r12; | ||
491 | |||
492 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
493 | |||
494 | call __twofish_dec_blk16; | ||
495 | |||
496 | store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
497 | |||
498 | popq %r12; | ||
499 | vzeroupper; | ||
500 | |||
501 | ret; | ||
502 | ENDPROC(twofish_ecb_dec_16way) | ||
503 | |||
504 | ENTRY(twofish_cbc_dec_16way) | ||
505 | /* input: | ||
506 | * %rdi: ctx, CTX | ||
507 | * %rsi: dst | ||
508 | * %rdx: src | ||
509 | */ | ||
510 | |||
511 | vzeroupper; | ||
512 | pushq %r12; | ||
513 | |||
514 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
515 | |||
516 | call __twofish_dec_blk16; | ||
517 | |||
518 | store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1, | ||
519 | RX0); | ||
520 | |||
521 | popq %r12; | ||
522 | vzeroupper; | ||
523 | |||
524 | ret; | ||
525 | ENDPROC(twofish_cbc_dec_16way) | ||
526 | |||
527 | ENTRY(twofish_ctr_16way) | ||
528 | /* input: | ||
529 | * %rdi: ctx, CTX | ||
530 | * %rsi: dst (16 blocks) | ||
531 | * %rdx: src (16 blocks) | ||
532 | * %rcx: iv (little endian, 128bit) | ||
533 | */ | ||
534 | |||
535 | vzeroupper; | ||
536 | pushq %r12; | ||
537 | |||
538 | load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1, | ||
539 | RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, | ||
540 | RBYTE); | ||
541 | |||
542 | call __twofish_enc_blk16; | ||
543 | |||
544 | store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
545 | |||
546 | popq %r12; | ||
547 | vzeroupper; | ||
548 | |||
549 | ret; | ||
550 | ENDPROC(twofish_ctr_16way) | ||
551 | |||
552 | .align 8 | ||
553 | twofish_xts_crypt_16way: | ||
554 | /* input: | ||
555 | * %rdi: ctx, CTX | ||
556 | * %rsi: dst (16 blocks) | ||
557 | * %rdx: src (16 blocks) | ||
558 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
559 | * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16 | ||
560 | */ | ||
561 | |||
562 | vzeroupper; | ||
563 | pushq %r12; | ||
564 | |||
565 | load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, | ||
566 | RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, | ||
567 | .Lxts_gf128mul_and_shl1_mask_0, | ||
568 | .Lxts_gf128mul_and_shl1_mask_1); | ||
569 | |||
570 | call *%r8; | ||
571 | |||
572 | store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
573 | |||
574 | popq %r12; | ||
575 | vzeroupper; | ||
576 | |||
577 | ret; | ||
578 | ENDPROC(twofish_xts_crypt_16way) | ||
579 | |||
580 | ENTRY(twofish_xts_enc_16way) | ||
581 | /* input: | ||
582 | * %rdi: ctx, CTX | ||
583 | * %rsi: dst (16 blocks) | ||
584 | * %rdx: src (16 blocks) | ||
585 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
586 | */ | ||
587 | leaq __twofish_enc_blk16, %r8; | ||
588 | jmp twofish_xts_crypt_16way; | ||
589 | ENDPROC(twofish_xts_enc_16way) | ||
590 | |||
591 | ENTRY(twofish_xts_dec_16way) | ||
592 | /* input: | ||
593 | * %rdi: ctx, CTX | ||
594 | * %rsi: dst (16 blocks) | ||
595 | * %rdx: src (16 blocks) | ||
596 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
597 | */ | ||
598 | leaq __twofish_dec_blk16, %r8; | ||
599 | jmp twofish_xts_crypt_16way; | ||
600 | ENDPROC(twofish_xts_dec_16way) | ||
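[Editorial note: the three XTS entry points share one body. The ENTRY wrappers load the direction-specific 16-block routine into %r8 and jump, and twofish_xts_crypt_16way dispatches through "call *%r8". The tweak comment (t ⊕ αⁿ) says block n is whitened with the initial tweak multiplied n times by α in GF(2¹²⁸); load_xts_16way derives 16 consecutive tweaks with the same shift-and-conditional-xor the generic kernel code performs in gf128mul_x_ble(). A scalar sketch of one such multiply (struct and function names hypothetical):

	struct tweak128 { u64 lo, hi; };	/* little-endian 128-bit value */

	static void tweak_mul_alpha(struct tweak128 *t)
	{
		u64 carry = t->hi >> 63;	/* bit shifted out of x^127 */

		t->hi = (t->hi << 1) | (t->lo >> 63);
		t->lo = (t->lo << 1) ^ (carry * 0x87);	/* 0x87 = x^7+x^2+x+1 */
	}
]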
diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c deleted file mode 100644 index ce33b5be64ee..000000000000 --- a/arch/x86/crypto/twofish_avx2_glue.c +++ /dev/null | |||
@@ -1,584 +0,0 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/twofish.h> | ||
20 | #include <crypto/lrw.h> | ||
21 | #include <crypto/xts.h> | ||
22 | #include <asm/xcr.h> | ||
23 | #include <asm/xsave.h> | ||
24 | #include <asm/crypto/twofish.h> | ||
25 | #include <asm/crypto/ablk_helper.h> | ||
26 | #include <asm/crypto/glue_helper.h> | ||
27 | #include <crypto/scatterwalk.h> | ||
28 | |||
29 | #define TF_AVX2_PARALLEL_BLOCKS 16 | ||
30 | |||
31 | /* 16-way AVX2 parallel cipher functions */ | ||
32 | asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst, | ||
33 | const u8 *src); | ||
34 | asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst, | ||
35 | const u8 *src); | ||
36 | asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); | ||
37 | |||
38 | asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src, | ||
39 | le128 *iv); | ||
40 | |||
41 | asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst, | ||
44 | const u8 *src, le128 *iv); | ||
45 | |||
46 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | ||
47 | const u8 *src) | ||
48 | { | ||
49 | __twofish_enc_blk_3way(ctx, dst, src, false); | ||
50 | } | ||
51 | |||
52 | static const struct common_glue_ctx twofish_enc = { | ||
53 | .num_funcs = 4, | ||
54 | .fpu_blocks_limit = 8, | ||
55 | |||
56 | .funcs = { { | ||
57 | .num_blocks = 16, | ||
58 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) } | ||
59 | }, { | ||
60 | .num_blocks = 8, | ||
61 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } | ||
62 | }, { | ||
63 | .num_blocks = 3, | ||
64 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } | ||
65 | }, { | ||
66 | .num_blocks = 1, | ||
67 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } | ||
68 | } } | ||
69 | }; | ||
70 | |||
71 | static const struct common_glue_ctx twofish_ctr = { | ||
72 | .num_funcs = 4, | ||
73 | .fpu_blocks_limit = 8, | ||
74 | |||
75 | .funcs = { { | ||
76 | .num_blocks = 16, | ||
77 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) } | ||
78 | }, { | ||
79 | .num_blocks = 8, | ||
80 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } | ||
81 | }, { | ||
82 | .num_blocks = 3, | ||
83 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } | ||
84 | }, { | ||
85 | .num_blocks = 1, | ||
86 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } | ||
87 | } } | ||
88 | }; | ||
89 | |||
90 | static const struct common_glue_ctx twofish_enc_xts = { | ||
91 | .num_funcs = 3, | ||
92 | .fpu_blocks_limit = 8, | ||
93 | |||
94 | .funcs = { { | ||
95 | .num_blocks = 16, | ||
96 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) } | ||
97 | }, { | ||
98 | .num_blocks = 8, | ||
99 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } | ||
100 | }, { | ||
101 | .num_blocks = 1, | ||
102 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } | ||
103 | } } | ||
104 | }; | ||
105 | |||
106 | static const struct common_glue_ctx twofish_dec = { | ||
107 | .num_funcs = 4, | ||
108 | .fpu_blocks_limit = 8, | ||
109 | |||
110 | .funcs = { { | ||
111 | .num_blocks = 16, | ||
112 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) } | ||
113 | }, { | ||
114 | .num_blocks = 8, | ||
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } | ||
116 | }, { | ||
117 | .num_blocks = 3, | ||
118 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } | ||
119 | }, { | ||
120 | .num_blocks = 1, | ||
121 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } | ||
122 | } } | ||
123 | }; | ||
124 | |||
125 | static const struct common_glue_ctx twofish_dec_cbc = { | ||
126 | .num_funcs = 4, | ||
127 | .fpu_blocks_limit = 8, | ||
128 | |||
129 | .funcs = { { | ||
130 | .num_blocks = 16, | ||
131 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) } | ||
132 | }, { | ||
133 | .num_blocks = 8, | ||
134 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } | ||
135 | }, { | ||
136 | .num_blocks = 3, | ||
137 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } | ||
138 | }, { | ||
139 | .num_blocks = 1, | ||
140 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } | ||
141 | } } | ||
142 | }; | ||
143 | |||
144 | static const struct common_glue_ctx twofish_dec_xts = { | ||
145 | .num_funcs = 3, | ||
146 | .fpu_blocks_limit = 8, | ||
147 | |||
148 | .funcs = { { | ||
149 | .num_blocks = 16, | ||
150 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) } | ||
151 | }, { | ||
152 | .num_blocks = 8, | ||
153 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } | ||
154 | }, { | ||
155 | .num_blocks = 1, | ||
156 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } | ||
157 | } } | ||
158 | }; | ||
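[Editorial note: all six tables above are consumed the same way by the glue helpers: walk funcs[] from the widest batch downwards, handing each routine as many blocks as its width allows, so bulk data takes the 16-way path while the tail falls back through the 8-, 3- and finally 1-block routines. A simplified sketch of that ladder for the ECB case (illustrative only; the real glue_ecb_crypt_128bit() also walks scatterlists and manages the FPU region):

	static void ecb_ladder(const struct common_glue_ctx *gctx, void *ctx,
			       u8 *dst, const u8 *src, unsigned int nblocks)
	{
		unsigned int i, w;

		while (nblocks) {
			for (i = 0; i < gctx->num_funcs; i++) {
				w = gctx->funcs[i].num_blocks;
				if (nblocks < w)
					continue;	/* too wide, try narrower */

				gctx->funcs[i].fn_u.ecb(ctx, dst, src);
				dst += w * TF_BLOCK_SIZE;
				src += w * TF_BLOCK_SIZE;
				nblocks -= w;
				break;	/* restart from the widest fit */
			}
		}
	}
]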
159 | |||
160 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); | ||
164 | } | ||
165 | |||
166 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); | ||
170 | } | ||
171 | |||
172 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
173 | struct scatterlist *src, unsigned int nbytes) | ||
174 | { | ||
175 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, | ||
176 | dst, src, nbytes); | ||
177 | } | ||
178 | |||
179 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
180 | struct scatterlist *src, unsigned int nbytes) | ||
181 | { | ||
182 | return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, | ||
183 | nbytes); | ||
184 | } | ||
185 | |||
186 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
187 | struct scatterlist *src, unsigned int nbytes) | ||
188 | { | ||
189 | return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); | ||
190 | } | ||
191 | |||
192 | static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
193 | { | ||
194 | /* the AVX (8-way) routines are reused, so the FPU path starts at 8 parallel blocks */ | ||
195 | return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); | ||
196 | } | ||
197 | |||
198 | static inline void twofish_fpu_end(bool fpu_enabled) | ||
199 | { | ||
200 | glue_fpu_end(fpu_enabled); | ||
201 | } | ||
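[Editorial note: kernel_fpu_begin()/end() is not free; the kernel does not save SIMD state on entry, so each region costs a full state save and restore. Gating it on 8 parallel blocks keeps short requests on the scalar C implementation. A sketch of the policy wrapped by glue_fpu_begin() (simplified; the real helper also honors a per-walk byte limit and the fpu_blocks_limit set above):

	static bool fpu_begin_sketch(bool fpu_enabled, unsigned int nbytes)
	{
		if (fpu_enabled)
			return true;	/* already inside an FPU region */
		if (nbytes < 8 * TF_BLOCK_SIZE)
			return false;	/* short tail: scalar path, no XSAVE */

		kernel_fpu_begin();	/* saves user FPU/SIMD state */
		return true;
	}
]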
202 | |||
203 | struct crypt_priv { | ||
204 | struct twofish_ctx *ctx; | ||
205 | bool fpu_enabled; | ||
206 | }; | ||
207 | |||
208 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
209 | { | ||
210 | const unsigned int bsize = TF_BLOCK_SIZE; | ||
211 | struct crypt_priv *ctx = priv; | ||
212 | int i; | ||
213 | |||
214 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | ||
215 | |||
216 | while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { | ||
217 | twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
218 | srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
219 | nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
220 | } | ||
221 | |||
222 | while (nbytes >= 8 * bsize) { | ||
223 | twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); | ||
224 | srcdst += bsize * 8; | ||
225 | nbytes -= bsize * 8; | ||
226 | } | ||
227 | |||
228 | while (nbytes >= 3 * bsize) { | ||
229 | twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); | ||
230 | srcdst += bsize * 3; | ||
231 | nbytes -= bsize * 3; | ||
232 | } | ||
233 | |||
234 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
235 | twofish_enc_blk(ctx->ctx, srcdst, srcdst); | ||
236 | } | ||
237 | |||
238 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
239 | { | ||
240 | const unsigned int bsize = TF_BLOCK_SIZE; | ||
241 | struct crypt_priv *ctx = priv; | ||
242 | int i; | ||
243 | |||
244 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | ||
245 | |||
246 | while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { | ||
247 | twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
248 | srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
249 | nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
250 | } | ||
251 | |||
252 | while (nbytes >= 8 * bsize) { | ||
253 | twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); | ||
254 | srcdst += bsize * 8; | ||
255 | nbytes -= bsize * 8; | ||
256 | } | ||
257 | |||
258 | while (nbytes >= 3 * bsize) { | ||
259 | twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); | ||
260 | srcdst += bsize * 3; | ||
261 | nbytes -= bsize * 3; | ||
262 | } | ||
263 | |||
264 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
265 | twofish_dec_blk(ctx->ctx, srcdst, srcdst); | ||
266 | } | ||
267 | |||
268 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
269 | struct scatterlist *src, unsigned int nbytes) | ||
270 | { | ||
271 | struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
272 | be128 buf[TF_AVX2_PARALLEL_BLOCKS]; | ||
273 | struct crypt_priv crypt_ctx = { | ||
274 | .ctx = &ctx->twofish_ctx, | ||
275 | .fpu_enabled = false, | ||
276 | }; | ||
277 | struct lrw_crypt_req req = { | ||
278 | .tbuf = buf, | ||
279 | .tbuflen = sizeof(buf), | ||
280 | |||
281 | .table_ctx = &ctx->lrw_table, | ||
282 | .crypt_ctx = &crypt_ctx, | ||
283 | .crypt_fn = encrypt_callback, | ||
284 | }; | ||
285 | int ret; | ||
286 | |||
287 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
288 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
289 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
290 | |||
291 | return ret; | ||
292 | } | ||
293 | |||
294 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
295 | struct scatterlist *src, unsigned int nbytes) | ||
296 | { | ||
297 | struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
298 | be128 buf[TF_AVX2_PARALLEL_BLOCKS]; | ||
299 | struct crypt_priv crypt_ctx = { | ||
300 | .ctx = &ctx->twofish_ctx, | ||
301 | .fpu_enabled = false, | ||
302 | }; | ||
303 | struct lrw_crypt_req req = { | ||
304 | .tbuf = buf, | ||
305 | .tbuflen = sizeof(buf), | ||
306 | |||
307 | .table_ctx = &ctx->lrw_table, | ||
308 | .crypt_ctx = &crypt_ctx, | ||
309 | .crypt_fn = decrypt_callback, | ||
310 | }; | ||
311 | int ret; | ||
312 | |||
313 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
314 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
315 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
316 | |||
317 | return ret; | ||
318 | } | ||
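[Editorial note: the on-stack buf of TF_AVX2_PARALLEL_BLOCKS tweaks is what lets the callbacks above see full 16-block batches, since lrw_crypt() hands crypt_fn at most tbuflen bytes at a time; CRYPTO_TFM_REQ_MAY_SLEEP is cleared because sleeping is not allowed while the FPU region is held. A simplified view of the chunking (illustrative only; the real lrw_crypt() also xors the per-block tweaks in and out around crypt_fn):

	static void lrw_drive_sketch(struct lrw_crypt_req *req, u8 *data,
				     unsigned int nbytes)
	{
		unsigned int chunk;

		while (nbytes) {
			chunk = min_t(unsigned int, nbytes, req->tbuflen);
			req->crypt_fn(req->crypt_ctx, data, chunk); /* <= 16 blocks */
			data += chunk;
			nbytes -= chunk;
		}
	}
]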
319 | |||
320 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
321 | struct scatterlist *src, unsigned int nbytes) | ||
322 | { | ||
323 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
324 | |||
325 | return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, | ||
326 | XTS_TWEAK_CAST(twofish_enc_blk), | ||
327 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
328 | } | ||
329 | |||
330 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
331 | struct scatterlist *src, unsigned int nbytes) | ||
332 | { | ||
333 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
334 | |||
335 | return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, | ||
336 | XTS_TWEAK_CAST(twofish_enc_blk), | ||
337 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
338 | } | ||
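[Editorial note: xts_decrypt deliberately passes XTS_TWEAK_CAST(twofish_enc_blk), not the decryption primitive. In XTS the tweak T₀ = E_K2(IV) is always produced with the encryption direction; only the data blocks switch between encrypt and decrypt. In sketch form (function name hypothetical):

	static void xts_first_tweak_sketch(struct twofish_ctx *tweak_ctx,
					   u8 t0[TF_BLOCK_SIZE], const u8 *iv)
	{
		twofish_enc_blk(tweak_ctx, t0, iv);	/* T0 = E_K2(IV), never D_K2 */
	}
]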
339 | |||
340 | static struct crypto_alg tf_algs[10] = { { | ||
341 | .cra_name = "__ecb-twofish-avx2", | ||
342 | .cra_driver_name = "__driver-ecb-twofish-avx2", | ||
343 | .cra_priority = 0, | ||
344 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
345 | .cra_blocksize = TF_BLOCK_SIZE, | ||
346 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
347 | .cra_alignmask = 0, | ||
348 | .cra_type = &crypto_blkcipher_type, | ||
349 | .cra_module = THIS_MODULE, | ||
350 | .cra_u = { | ||
351 | .blkcipher = { | ||
352 | .min_keysize = TF_MIN_KEY_SIZE, | ||
353 | .max_keysize = TF_MAX_KEY_SIZE, | ||
354 | .setkey = twofish_setkey, | ||
355 | .encrypt = ecb_encrypt, | ||
356 | .decrypt = ecb_decrypt, | ||
357 | }, | ||
358 | }, | ||
359 | }, { | ||
360 | .cra_name = "__cbc-twofish-avx2", | ||
361 | .cra_driver_name = "__driver-cbc-twofish-avx2", | ||
362 | .cra_priority = 0, | ||
363 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
364 | .cra_blocksize = TF_BLOCK_SIZE, | ||
365 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
366 | .cra_alignmask = 0, | ||
367 | .cra_type = &crypto_blkcipher_type, | ||
368 | .cra_module = THIS_MODULE, | ||
369 | .cra_u = { | ||
370 | .blkcipher = { | ||
371 | .min_keysize = TF_MIN_KEY_SIZE, | ||
372 | .max_keysize = TF_MAX_KEY_SIZE, | ||
373 | .setkey = twofish_setkey, | ||
374 | .encrypt = cbc_encrypt, | ||
375 | .decrypt = cbc_decrypt, | ||
376 | }, | ||
377 | }, | ||
378 | }, { | ||
379 | .cra_name = "__ctr-twofish-avx2", | ||
380 | .cra_driver_name = "__driver-ctr-twofish-avx2", | ||
381 | .cra_priority = 0, | ||
382 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
383 | .cra_blocksize = 1, | ||
384 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
385 | .cra_alignmask = 0, | ||
386 | .cra_type = &crypto_blkcipher_type, | ||
387 | .cra_module = THIS_MODULE, | ||
388 | .cra_u = { | ||
389 | .blkcipher = { | ||
390 | .min_keysize = TF_MIN_KEY_SIZE, | ||
391 | .max_keysize = TF_MAX_KEY_SIZE, | ||
392 | .ivsize = TF_BLOCK_SIZE, | ||
393 | .setkey = twofish_setkey, | ||
394 | .encrypt = ctr_crypt, | ||
395 | .decrypt = ctr_crypt, | ||
396 | }, | ||
397 | }, | ||
398 | }, { | ||
399 | .cra_name = "__lrw-twofish-avx2", | ||
400 | .cra_driver_name = "__driver-lrw-twofish-avx2", | ||
401 | .cra_priority = 0, | ||
402 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
403 | .cra_blocksize = TF_BLOCK_SIZE, | ||
404 | .cra_ctxsize = sizeof(struct twofish_lrw_ctx), | ||
405 | .cra_alignmask = 0, | ||
406 | .cra_type = &crypto_blkcipher_type, | ||
407 | .cra_module = THIS_MODULE, | ||
408 | .cra_exit = lrw_twofish_exit_tfm, | ||
409 | .cra_u = { | ||
410 | .blkcipher = { | ||
411 | .min_keysize = TF_MIN_KEY_SIZE + | ||
412 | TF_BLOCK_SIZE, | ||
413 | .max_keysize = TF_MAX_KEY_SIZE + | ||
414 | TF_BLOCK_SIZE, | ||
415 | .ivsize = TF_BLOCK_SIZE, | ||
416 | .setkey = lrw_twofish_setkey, | ||
417 | .encrypt = lrw_encrypt, | ||
418 | .decrypt = lrw_decrypt, | ||
419 | }, | ||
420 | }, | ||
421 | }, { | ||
422 | .cra_name = "__xts-twofish-avx2", | ||
423 | .cra_driver_name = "__driver-xts-twofish-avx2", | ||
424 | .cra_priority = 0, | ||
425 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
426 | .cra_blocksize = TF_BLOCK_SIZE, | ||
427 | .cra_ctxsize = sizeof(struct twofish_xts_ctx), | ||
428 | .cra_alignmask = 0, | ||
429 | .cra_type = &crypto_blkcipher_type, | ||
430 | .cra_module = THIS_MODULE, | ||
431 | .cra_u = { | ||
432 | .blkcipher = { | ||
433 | .min_keysize = TF_MIN_KEY_SIZE * 2, | ||
434 | .max_keysize = TF_MAX_KEY_SIZE * 2, | ||
435 | .ivsize = TF_BLOCK_SIZE, | ||
436 | .setkey = xts_twofish_setkey, | ||
437 | .encrypt = xts_encrypt, | ||
438 | .decrypt = xts_decrypt, | ||
439 | }, | ||
440 | }, | ||
441 | }, { | ||
442 | .cra_name = "ecb(twofish)", | ||
443 | .cra_driver_name = "ecb-twofish-avx2", | ||
444 | .cra_priority = 500, | ||
445 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
446 | .cra_blocksize = TF_BLOCK_SIZE, | ||
447 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
448 | .cra_alignmask = 0, | ||
449 | .cra_type = &crypto_ablkcipher_type, | ||
450 | .cra_module = THIS_MODULE, | ||
451 | .cra_init = ablk_init, | ||
452 | .cra_exit = ablk_exit, | ||
453 | .cra_u = { | ||
454 | .ablkcipher = { | ||
455 | .min_keysize = TF_MIN_KEY_SIZE, | ||
456 | .max_keysize = TF_MAX_KEY_SIZE, | ||
457 | .setkey = ablk_set_key, | ||
458 | .encrypt = ablk_encrypt, | ||
459 | .decrypt = ablk_decrypt, | ||
460 | }, | ||
461 | }, | ||
462 | }, { | ||
463 | .cra_name = "cbc(twofish)", | ||
464 | .cra_driver_name = "cbc-twofish-avx2", | ||
465 | .cra_priority = 500, | ||
466 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
467 | .cra_blocksize = TF_BLOCK_SIZE, | ||
468 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
469 | .cra_alignmask = 0, | ||
470 | .cra_type = &crypto_ablkcipher_type, | ||
471 | .cra_module = THIS_MODULE, | ||
472 | .cra_init = ablk_init, | ||
473 | .cra_exit = ablk_exit, | ||
474 | .cra_u = { | ||
475 | .ablkcipher = { | ||
476 | .min_keysize = TF_MIN_KEY_SIZE, | ||
477 | .max_keysize = TF_MAX_KEY_SIZE, | ||
478 | .ivsize = TF_BLOCK_SIZE, | ||
479 | .setkey = ablk_set_key, | ||
480 | .encrypt = __ablk_encrypt, | ||
481 | .decrypt = ablk_decrypt, | ||
482 | }, | ||
483 | }, | ||
484 | }, { | ||
485 | .cra_name = "ctr(twofish)", | ||
486 | .cra_driver_name = "ctr-twofish-avx2", | ||
487 | .cra_priority = 500, | ||
488 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
489 | .cra_blocksize = 1, | ||
490 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
491 | .cra_alignmask = 0, | ||
492 | .cra_type = &crypto_ablkcipher_type, | ||
493 | .cra_module = THIS_MODULE, | ||
494 | .cra_init = ablk_init, | ||
495 | .cra_exit = ablk_exit, | ||
496 | .cra_u = { | ||
497 | .ablkcipher = { | ||
498 | .min_keysize = TF_MIN_KEY_SIZE, | ||
499 | .max_keysize = TF_MAX_KEY_SIZE, | ||
500 | .ivsize = TF_BLOCK_SIZE, | ||
501 | .setkey = ablk_set_key, | ||
502 | .encrypt = ablk_encrypt, | ||
503 | .decrypt = ablk_encrypt, | ||
504 | .geniv = "chainiv", | ||
505 | }, | ||
506 | }, | ||
507 | }, { | ||
508 | .cra_name = "lrw(twofish)", | ||
509 | .cra_driver_name = "lrw-twofish-avx2", | ||
510 | .cra_priority = 500, | ||
511 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
512 | .cra_blocksize = TF_BLOCK_SIZE, | ||
513 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
514 | .cra_alignmask = 0, | ||
515 | .cra_type = &crypto_ablkcipher_type, | ||
516 | .cra_module = THIS_MODULE, | ||
517 | .cra_init = ablk_init, | ||
518 | .cra_exit = ablk_exit, | ||
519 | .cra_u = { | ||
520 | .ablkcipher = { | ||
521 | .min_keysize = TF_MIN_KEY_SIZE + | ||
522 | TF_BLOCK_SIZE, | ||
523 | .max_keysize = TF_MAX_KEY_SIZE + | ||
524 | TF_BLOCK_SIZE, | ||
525 | .ivsize = TF_BLOCK_SIZE, | ||
526 | .setkey = ablk_set_key, | ||
527 | .encrypt = ablk_encrypt, | ||
528 | .decrypt = ablk_decrypt, | ||
529 | }, | ||
530 | }, | ||
531 | }, { | ||
532 | .cra_name = "xts(twofish)", | ||
533 | .cra_driver_name = "xts-twofish-avx2", | ||
534 | .cra_priority = 500, | ||
535 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
536 | .cra_blocksize = TF_BLOCK_SIZE, | ||
537 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
538 | .cra_alignmask = 0, | ||
539 | .cra_type = &crypto_ablkcipher_type, | ||
540 | .cra_module = THIS_MODULE, | ||
541 | .cra_init = ablk_init, | ||
542 | .cra_exit = ablk_exit, | ||
543 | .cra_u = { | ||
544 | .ablkcipher = { | ||
545 | .min_keysize = TF_MIN_KEY_SIZE * 2, | ||
546 | .max_keysize = TF_MAX_KEY_SIZE * 2, | ||
547 | .ivsize = TF_BLOCK_SIZE, | ||
548 | .setkey = ablk_set_key, | ||
549 | .encrypt = ablk_encrypt, | ||
550 | .decrypt = ablk_decrypt, | ||
551 | }, | ||
552 | }, | ||
553 | } }; | ||
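[Editorial note: the array pairs five internal "__"-prefixed synchronous blkciphers (priority 0, so never chosen by a plain name lookup) with five public async wrappers at priority 500, high enough to beat both the generic C twofish and the AVX driver. A hypothetical caller never names the driver; name-based allocation resolves to the highest-priority registration:

	static int alloc_twofish_xts(void)
	{
		struct crypto_ablkcipher *tfm;

		/* resolves to "xts-twofish-avx2" while this module is loaded */
		tfm = crypto_alloc_ablkcipher("xts(twofish)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		/* ... key length is doubled for XTS (two independent keys) ... */

		crypto_free_ablkcipher(tfm);
		return 0;
	}
]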
554 | |||
555 | static int __init init(void) | ||
556 | { | ||
557 | u64 xcr0; | ||
558 | |||
559 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
560 | pr_info("AVX2 instructions are not detected.\n"); | ||
561 | return -ENODEV; | ||
562 | } | ||
563 | |||
564 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
565 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
566 | pr_info("AVX2 detected but unusable.\n"); | ||
567 | return -ENODEV; | ||
568 | } | ||
569 | |||
570 | return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); | ||
571 | } | ||
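[Editorial note: cpu_has_avx2 only proves the silicon supports AVX2; the OS must additionally have enabled XMM and YMM state saving in XCR0 (which requires OSXSAVE, checked above) before YMM registers may be touched, and the xgetbv() read verifies exactly that. A user-space analogue of the same test (illustrative only; XCR0 bit 1 is SSE/XMM state, bit 2 is AVX/YMM state):

	#include <stdint.h>

	static uint64_t xgetbv0(void)
	{
		uint32_t eax, edx;

		/* read XCR0 (extended control register 0) */
		__asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
		return ((uint64_t)edx << 32) | eax;
	}

	static int ymm_usable(void)
	{
		const uint64_t mask = (1u << 1) | (1u << 2); /* XSTATE_SSE|XSTATE_YMM */

		return (xgetbv0() & mask) == mask;
	}
]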
572 | |||
573 | static void __exit fini(void) | ||
574 | { | ||
575 | crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); | ||
576 | } | ||
577 | |||
578 | module_init(init); | ||
579 | module_exit(fini); | ||
580 | |||
581 | MODULE_LICENSE("GPL"); | ||
582 | MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized"); | ||
583 | MODULE_ALIAS("twofish"); | ||
584 | MODULE_ALIAS("twofish-asm"); | ||
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 2047a562f6b3..a62ba541884e 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c | |||
@@ -50,26 +50,18 @@ | |||
50 | /* 8-way parallel cipher functions */ | 50 | /* 8-way parallel cipher functions */ |
51 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, | 51 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, |
52 | const u8 *src); | 52 | const u8 *src); |
53 | EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way); | ||
54 | |||
55 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 53 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
56 | const u8 *src); | 54 | const u8 *src); |
57 | EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way); | ||
58 | 55 | ||
59 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 56 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
60 | const u8 *src); | 57 | const u8 *src); |
61 | EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way); | ||
62 | |||
63 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, | 58 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, |
64 | const u8 *src, le128 *iv); | 59 | const u8 *src, le128 *iv); |
65 | EXPORT_SYMBOL_GPL(twofish_ctr_8way); | ||
66 | 60 | ||
67 | asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, | 61 | asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, |
68 | const u8 *src, le128 *iv); | 62 | const u8 *src, le128 *iv); |
69 | EXPORT_SYMBOL_GPL(twofish_xts_enc_8way); | ||
70 | asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 63 | asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
71 | const u8 *src, le128 *iv); | 64 | const u8 *src, le128 *iv); |
72 | EXPORT_SYMBOL_GPL(twofish_xts_dec_8way); | ||
73 | 65 | ||
74 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | 66 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, |
75 | const u8 *src) | 67 | const u8 *src) |
@@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | |||
77 | __twofish_enc_blk_3way(ctx, dst, src, false); | 69 | __twofish_enc_blk_3way(ctx, dst, src, false); |
78 | } | 70 | } |
79 | 71 | ||
80 | void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 72 | static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
81 | { | 73 | { |
82 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | 74 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, |
83 | GLUE_FUNC_CAST(twofish_enc_blk)); | 75 | GLUE_FUNC_CAST(twofish_enc_blk)); |
84 | } | 76 | } |
85 | EXPORT_SYMBOL_GPL(twofish_xts_enc); | ||
86 | 77 | ||
87 | void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 78 | static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
88 | { | 79 | { |
89 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | 80 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, |
90 | GLUE_FUNC_CAST(twofish_dec_blk)); | 81 | GLUE_FUNC_CAST(twofish_dec_blk)); |
91 | } | 82 | } |
92 | EXPORT_SYMBOL_GPL(twofish_xts_dec); | ||
93 | 83 | ||
94 | 84 | ||
95 | static const struct common_glue_ctx twofish_enc = { | 85 | static const struct common_glue_ctx twofish_enc = { |