diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-04-13 06:46:45 -0400 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2013-04-25 09:09:04 -0400 |
commit | 604880107010a1e5794552d184cd5471ea31b973 (patch) | |
tree | ed37e3b3e8454f758daab88a2fb9cb5f043ca8ad /arch/x86/crypto | |
parent | ad8b7c3e92868dd86c54d9d5321000bbb4096f0d (diff) |
crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
Patch adds AVX2/x86-64 implementation of Blowfish cipher, requiring 32 parallel
blocks for input (256 bytes). Table look-ups are performed using vpgatherdd
instruction directly from vector registers and thus should be faster than
earlier implementations.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- | arch/x86/crypto/Makefile | 11 | ||||
-rw-r--r-- | arch/x86/crypto/blowfish-avx2-asm_64.S | 449 | ||||
-rw-r--r-- | arch/x86/crypto/blowfish_avx2_glue.c | 585 | ||||
-rw-r--r-- | arch/x86/crypto/blowfish_glue.c | 32 |
4 files changed, 1053 insertions(+), 24 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 03cd7313ad4b..28464ef6fa52 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile | |||
@@ -3,6 +3,8 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) | 5 | avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) |
6 | avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ | ||
7 | $(comma)4)$(comma)%ymm2,yes,no) | ||
6 | 8 | ||
7 | obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o | 9 | obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o |
8 | obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o | 10 | obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o |
@@ -38,6 +40,11 @@ ifeq ($(avx_supported),yes) | |||
38 | obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o | 40 | obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o |
39 | endif | 41 | endif |
40 | 42 | ||
43 | # These modules require assembler to support AVX2. | ||
44 | ifeq ($(avx2_supported),yes) | ||
45 | obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o | ||
46 | endif | ||
47 | |||
41 | aes-i586-y := aes-i586-asm_32.o aes_glue.o | 48 | aes-i586-y := aes-i586-asm_32.o aes_glue.o |
42 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o | 49 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o |
43 | salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o | 50 | salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o |
@@ -62,6 +69,10 @@ ifeq ($(avx_supported),yes) | |||
62 | serpent_avx_glue.o | 69 | serpent_avx_glue.o |
63 | endif | 70 | endif |
64 | 71 | ||
72 | ifeq ($(avx2_supported),yes) | ||
73 | blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o | ||
74 | endif | ||
75 | |||
65 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o | 76 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o |
66 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o | 77 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o |
67 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o | 78 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o |
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S new file mode 100644 index 000000000000..784452e0d05d --- /dev/null +++ b/arch/x86/crypto/blowfish-avx2-asm_64.S | |||
@@ -0,0 +1,449 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Blowfish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | |||
15 | .file "blowfish-avx2-asm_64.S" | ||
16 | |||
17 | .data | ||
18 | .align 32 | ||
19 | |||
20 | .Lprefetch_mask: | ||
21 | .long 0*64 | ||
22 | .long 1*64 | ||
23 | .long 2*64 | ||
24 | .long 3*64 | ||
25 | .long 4*64 | ||
26 | .long 5*64 | ||
27 | .long 6*64 | ||
28 | .long 7*64 | ||
29 | |||
30 | .Lbswap32_mask: | ||
31 | .long 0x00010203 | ||
32 | .long 0x04050607 | ||
33 | .long 0x08090a0b | ||
34 | .long 0x0c0d0e0f | ||
35 | |||
36 | .Lbswap128_mask: | ||
37 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
38 | .Lbswap_iv_mask: | ||
39 | .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 | ||
40 | |||
41 | .text | ||
42 | /* structure of crypto context */ | ||
43 | #define p 0 | ||
44 | #define s0 ((16 + 2) * 4) | ||
45 | #define s1 ((16 + 2 + (1 * 256)) * 4) | ||
46 | #define s2 ((16 + 2 + (2 * 256)) * 4) | ||
47 | #define s3 ((16 + 2 + (3 * 256)) * 4) | ||
48 | |||
49 | /* register macros */ | ||
50 | #define CTX %rdi | ||
51 | #define RIO %rdx | ||
52 | |||
53 | #define RS0 %rax | ||
54 | #define RS1 %r8 | ||
55 | #define RS2 %r9 | ||
56 | #define RS3 %r10 | ||
57 | |||
58 | #define RLOOP %r11 | ||
59 | #define RLOOPd %r11d | ||
60 | |||
61 | #define RXr0 %ymm8 | ||
62 | #define RXr1 %ymm9 | ||
63 | #define RXr2 %ymm10 | ||
64 | #define RXr3 %ymm11 | ||
65 | #define RXl0 %ymm12 | ||
66 | #define RXl1 %ymm13 | ||
67 | #define RXl2 %ymm14 | ||
68 | #define RXl3 %ymm15 | ||
69 | |||
70 | /* temp regs */ | ||
71 | #define RT0 %ymm0 | ||
72 | #define RT0x %xmm0 | ||
73 | #define RT1 %ymm1 | ||
74 | #define RT1x %xmm1 | ||
75 | #define RIDX0 %ymm2 | ||
76 | #define RIDX1 %ymm3 | ||
77 | #define RIDX1x %xmm3 | ||
78 | #define RIDX2 %ymm4 | ||
79 | #define RIDX3 %ymm5 | ||
80 | |||
81 | /* vpgatherdd mask and '-1' */ | ||
82 | #define RNOT %ymm6 | ||
83 | |||
84 | /* byte mask, (-1 >> 24) */ | ||
85 | #define RBYTE %ymm7 | ||
86 | |||
87 | /*********************************************************************** | ||
88 | * 32-way AVX2 blowfish | ||
89 | ***********************************************************************/ | ||
90 | #define F(xl, xr) \ | ||
91 | vpsrld $24, xl, RIDX0; \ | ||
92 | vpsrld $16, xl, RIDX1; \ | ||
93 | vpsrld $8, xl, RIDX2; \ | ||
94 | vpand RBYTE, RIDX1, RIDX1; \ | ||
95 | vpand RBYTE, RIDX2, RIDX2; \ | ||
96 | vpand RBYTE, xl, RIDX3; \ | ||
97 | \ | ||
98 | vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \ | ||
99 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
100 | vpcmpeqd RIDX0, RIDX0, RIDX0; \ | ||
101 | \ | ||
102 | vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \ | ||
103 | vpcmpeqd RIDX1, RIDX1, RIDX1; \ | ||
104 | vpaddd RT0, RT1, RT0; \ | ||
105 | \ | ||
106 | vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \ | ||
107 | vpxor RT0, RT1, RT0; \ | ||
108 | \ | ||
109 | vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \ | ||
110 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
111 | vpaddd RT0, RT1, RT0; \ | ||
112 | \ | ||
113 | vpxor RT0, xr, xr; | ||
114 | |||
115 | #define add_roundkey(xl, nmem) \ | ||
116 | vpbroadcastd nmem, RT0; \ | ||
117 | vpxor RT0, xl ## 0, xl ## 0; \ | ||
118 | vpxor RT0, xl ## 1, xl ## 1; \ | ||
119 | vpxor RT0, xl ## 2, xl ## 2; \ | ||
120 | vpxor RT0, xl ## 3, xl ## 3; | ||
121 | |||
122 | #define round_enc() \ | ||
123 | add_roundkey(RXr, p(CTX,RLOOP,4)); \ | ||
124 | F(RXl0, RXr0); \ | ||
125 | F(RXl1, RXr1); \ | ||
126 | F(RXl2, RXr2); \ | ||
127 | F(RXl3, RXr3); \ | ||
128 | \ | ||
129 | add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ | ||
130 | F(RXr0, RXl0); \ | ||
131 | F(RXr1, RXl1); \ | ||
132 | F(RXr2, RXl2); \ | ||
133 | F(RXr3, RXl3); | ||
134 | |||
135 | #define round_dec() \ | ||
136 | add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \ | ||
137 | F(RXl0, RXr0); \ | ||
138 | F(RXl1, RXr1); \ | ||
139 | F(RXl2, RXr2); \ | ||
140 | F(RXl3, RXr3); \ | ||
141 | \ | ||
142 | add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ | ||
143 | F(RXr0, RXl0); \ | ||
144 | F(RXr1, RXl1); \ | ||
145 | F(RXr2, RXl2); \ | ||
146 | F(RXr3, RXl3); | ||
147 | |||
148 | #define init_round_constants() \ | ||
149 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
150 | leaq s0(CTX), RS0; \ | ||
151 | leaq s1(CTX), RS1; \ | ||
152 | leaq s2(CTX), RS2; \ | ||
153 | leaq s3(CTX), RS3; \ | ||
154 | vpsrld $24, RNOT, RBYTE; | ||
155 | |||
156 | #define transpose_2x2(x0, x1, t0) \ | ||
157 | vpunpckldq x0, x1, t0; \ | ||
158 | vpunpckhdq x0, x1, x1; \ | ||
159 | \ | ||
160 | vpunpcklqdq t0, x1, x0; \ | ||
161 | vpunpckhqdq t0, x1, x1; | ||
162 | |||
163 | #define read_block(xl, xr) \ | ||
164 | vbroadcasti128 .Lbswap32_mask, RT1; \ | ||
165 | \ | ||
166 | vpshufb RT1, xl ## 0, xl ## 0; \ | ||
167 | vpshufb RT1, xr ## 0, xr ## 0; \ | ||
168 | vpshufb RT1, xl ## 1, xl ## 1; \ | ||
169 | vpshufb RT1, xr ## 1, xr ## 1; \ | ||
170 | vpshufb RT1, xl ## 2, xl ## 2; \ | ||
171 | vpshufb RT1, xr ## 2, xr ## 2; \ | ||
172 | vpshufb RT1, xl ## 3, xl ## 3; \ | ||
173 | vpshufb RT1, xr ## 3, xr ## 3; \ | ||
174 | \ | ||
175 | transpose_2x2(xl ## 0, xr ## 0, RT0); \ | ||
176 | transpose_2x2(xl ## 1, xr ## 1, RT0); \ | ||
177 | transpose_2x2(xl ## 2, xr ## 2, RT0); \ | ||
178 | transpose_2x2(xl ## 3, xr ## 3, RT0); | ||
179 | |||
180 | #define write_block(xl, xr) \ | ||
181 | vbroadcasti128 .Lbswap32_mask, RT1; \ | ||
182 | \ | ||
183 | transpose_2x2(xl ## 0, xr ## 0, RT0); \ | ||
184 | transpose_2x2(xl ## 1, xr ## 1, RT0); \ | ||
185 | transpose_2x2(xl ## 2, xr ## 2, RT0); \ | ||
186 | transpose_2x2(xl ## 3, xr ## 3, RT0); \ | ||
187 | \ | ||
188 | vpshufb RT1, xl ## 0, xl ## 0; \ | ||
189 | vpshufb RT1, xr ## 0, xr ## 0; \ | ||
190 | vpshufb RT1, xl ## 1, xl ## 1; \ | ||
191 | vpshufb RT1, xr ## 1, xr ## 1; \ | ||
192 | vpshufb RT1, xl ## 2, xl ## 2; \ | ||
193 | vpshufb RT1, xr ## 2, xr ## 2; \ | ||
194 | vpshufb RT1, xl ## 3, xl ## 3; \ | ||
195 | vpshufb RT1, xr ## 3, xr ## 3; | ||
196 | |||
197 | .align 8 | ||
198 | __blowfish_enc_blk32: | ||
199 | /* input: | ||
200 | * %rdi: ctx, CTX | ||
201 | * RXl0..3, RXr0..3: plaintext | ||
202 | * output: | ||
203 | * RXl0..3, RXr0..3: ciphertext (RXl <=> RXr swapped) | ||
204 | */ | ||
205 | init_round_constants(); | ||
206 | |||
207 | read_block(RXl, RXr); | ||
208 | |||
209 | movl $1, RLOOPd; | ||
210 | add_roundkey(RXl, p+4*(0)(CTX)); | ||
211 | |||
212 | .align 4 | ||
213 | .L__enc_loop: | ||
214 | round_enc(); | ||
215 | |||
216 | leal 2(RLOOPd), RLOOPd; | ||
217 | cmpl $17, RLOOPd; | ||
218 | jne .L__enc_loop; | ||
219 | |||
220 | add_roundkey(RXr, p+4*(17)(CTX)); | ||
221 | |||
222 | write_block(RXl, RXr); | ||
223 | |||
224 | ret; | ||
225 | ENDPROC(__blowfish_enc_blk32) | ||
226 | |||
227 | .align 8 | ||
228 | __blowfish_dec_blk32: | ||
229 | /* input: | ||
230 | * %rdi: ctx, CTX | ||
231 | * RXl0..3, RXr0..3: ciphertext | ||
232 | * output: | ||
233 | * RXl0..3, RXr0..3: plaintext (RXl <=> RXr swapped) | ||
234 | */ | ||
235 | init_round_constants(); | ||
236 | |||
237 | read_block(RXl, RXr); | ||
238 | |||
239 | movl $14, RLOOPd; | ||
240 | add_roundkey(RXl, p+4*(17)(CTX)); | ||
241 | |||
242 | .align 4 | ||
243 | .L__dec_loop: | ||
244 | round_dec(); | ||
245 | |||
246 | addl $-2, RLOOPd; | ||
247 | jns .L__dec_loop; | ||
248 | |||
249 | add_roundkey(RXr, p+4*(0)(CTX)); | ||
250 | |||
251 | write_block(RXl, RXr); | ||
252 | |||
253 | ret; | ||
254 | ENDPROC(__blowfish_dec_blk32) | ||
255 | |||
256 | ENTRY(blowfish_ecb_enc_32way) | ||
257 | /* input: | ||
258 | * %rdi: ctx, CTX | ||
259 | * %rsi: dst | ||
260 | * %rdx: src | ||
261 | */ | ||
262 | |||
263 | vzeroupper; | ||
264 | |||
265 | vmovdqu 0*32(%rdx), RXl0; | ||
266 | vmovdqu 1*32(%rdx), RXr0; | ||
267 | vmovdqu 2*32(%rdx), RXl1; | ||
268 | vmovdqu 3*32(%rdx), RXr1; | ||
269 | vmovdqu 4*32(%rdx), RXl2; | ||
270 | vmovdqu 5*32(%rdx), RXr2; | ||
271 | vmovdqu 6*32(%rdx), RXl3; | ||
272 | vmovdqu 7*32(%rdx), RXr3; | ||
273 | |||
274 | call __blowfish_enc_blk32; | ||
275 | |||
276 | vmovdqu RXr0, 0*32(%rsi); | ||
277 | vmovdqu RXl0, 1*32(%rsi); | ||
278 | vmovdqu RXr1, 2*32(%rsi); | ||
279 | vmovdqu RXl1, 3*32(%rsi); | ||
280 | vmovdqu RXr2, 4*32(%rsi); | ||
281 | vmovdqu RXl2, 5*32(%rsi); | ||
282 | vmovdqu RXr3, 6*32(%rsi); | ||
283 | vmovdqu RXl3, 7*32(%rsi); | ||
284 | |||
285 | vzeroupper; | ||
286 | |||
287 | ret; | ||
288 | ENDPROC(blowfish_ecb_enc_32way) | ||
289 | |||
290 | ENTRY(blowfish_ecb_dec_32way) | ||
291 | /* input: | ||
292 | * %rdi: ctx, CTX | ||
293 | * %rsi: dst | ||
294 | * %rdx: src | ||
295 | */ | ||
296 | |||
297 | vzeroupper; | ||
298 | |||
299 | vmovdqu 0*32(%rdx), RXl0; | ||
300 | vmovdqu 1*32(%rdx), RXr0; | ||
301 | vmovdqu 2*32(%rdx), RXl1; | ||
302 | vmovdqu 3*32(%rdx), RXr1; | ||
303 | vmovdqu 4*32(%rdx), RXl2; | ||
304 | vmovdqu 5*32(%rdx), RXr2; | ||
305 | vmovdqu 6*32(%rdx), RXl3; | ||
306 | vmovdqu 7*32(%rdx), RXr3; | ||
307 | |||
308 | call __blowfish_dec_blk32; | ||
309 | |||
310 | vmovdqu RXr0, 0*32(%rsi); | ||
311 | vmovdqu RXl0, 1*32(%rsi); | ||
312 | vmovdqu RXr1, 2*32(%rsi); | ||
313 | vmovdqu RXl1, 3*32(%rsi); | ||
314 | vmovdqu RXr2, 4*32(%rsi); | ||
315 | vmovdqu RXl2, 5*32(%rsi); | ||
316 | vmovdqu RXr3, 6*32(%rsi); | ||
317 | vmovdqu RXl3, 7*32(%rsi); | ||
318 | |||
319 | vzeroupper; | ||
320 | |||
321 | ret; | ||
322 | ENDPROC(blowfish_ecb_dec_32way) | ||
323 | |||
324 | ENTRY(blowfish_cbc_dec_32way) | ||
325 | /* input: | ||
326 | * %rdi: ctx, CTX | ||
327 | * %rsi: dst | ||
328 | * %rdx: src | ||
329 | */ | ||
330 | |||
331 | vzeroupper; | ||
332 | |||
333 | vmovdqu 0*32(%rdx), RXl0; | ||
334 | vmovdqu 1*32(%rdx), RXr0; | ||
335 | vmovdqu 2*32(%rdx), RXl1; | ||
336 | vmovdqu 3*32(%rdx), RXr1; | ||
337 | vmovdqu 4*32(%rdx), RXl2; | ||
338 | vmovdqu 5*32(%rdx), RXr2; | ||
339 | vmovdqu 6*32(%rdx), RXl3; | ||
340 | vmovdqu 7*32(%rdx), RXr3; | ||
341 | |||
342 | call __blowfish_dec_blk32; | ||
343 | |||
344 | /* xor with src */ | ||
345 | vmovq (%rdx), RT0x; | ||
346 | vpshufd $0x4f, RT0x, RT0x; | ||
347 | vinserti128 $1, 8(%rdx), RT0, RT0; | ||
348 | vpxor RT0, RXr0, RXr0; | ||
349 | vpxor 0*32+24(%rdx), RXl0, RXl0; | ||
350 | vpxor 1*32+24(%rdx), RXr1, RXr1; | ||
351 | vpxor 2*32+24(%rdx), RXl1, RXl1; | ||
352 | vpxor 3*32+24(%rdx), RXr2, RXr2; | ||
353 | vpxor 4*32+24(%rdx), RXl2, RXl2; | ||
354 | vpxor 5*32+24(%rdx), RXr3, RXr3; | ||
355 | vpxor 6*32+24(%rdx), RXl3, RXl3; | ||
356 | |||
357 | vmovdqu RXr0, (0*32)(%rsi); | ||
358 | vmovdqu RXl0, (1*32)(%rsi); | ||
359 | vmovdqu RXr1, (2*32)(%rsi); | ||
360 | vmovdqu RXl1, (3*32)(%rsi); | ||
361 | vmovdqu RXr2, (4*32)(%rsi); | ||
362 | vmovdqu RXl2, (5*32)(%rsi); | ||
363 | vmovdqu RXr3, (6*32)(%rsi); | ||
364 | vmovdqu RXl3, (7*32)(%rsi); | ||
365 | |||
366 | vzeroupper; | ||
367 | |||
368 | ret; | ||
369 | ENDPROC(blowfish_cbc_dec_32way) | ||
370 | |||
371 | ENTRY(blowfish_ctr_32way) | ||
372 | /* input: | ||
373 | * %rdi: ctx, CTX | ||
374 | * %rsi: dst | ||
375 | * %rdx: src | ||
376 | * %rcx: iv (big endian, 64bit) | ||
377 | */ | ||
378 | |||
379 | vzeroupper; | ||
380 | |||
381 | vpcmpeqd RT0, RT0, RT0; | ||
382 | vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */ | ||
383 | |||
384 | vpcmpeqd RT1x, RT1x, RT1x; | ||
385 | vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */ | ||
386 | vpxor RIDX0, RIDX0, RIDX0; | ||
387 | vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */ | ||
388 | |||
389 | vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */ | ||
390 | |||
391 | vpcmpeqd RT1, RT1, RT1; | ||
392 | vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */ | ||
393 | vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */ | ||
394 | |||
395 | vbroadcasti128 .Lbswap_iv_mask, RIDX0; | ||
396 | vbroadcasti128 .Lbswap128_mask, RIDX1; | ||
397 | |||
398 | /* load IV and byteswap */ | ||
399 | vmovq (%rcx), RT1x; | ||
400 | vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */ | ||
401 | vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */ | ||
402 | |||
403 | /* construct IVs */ | ||
404 | vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */ | ||
405 | vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */ | ||
406 | vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */ | ||
407 | vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */ | ||
408 | vpsubq RIDX2, RT1, RT1; | ||
409 | vpshufb RIDX1, RT1, RXl1; | ||
410 | vpsubq RIDX2, RT1, RT1; | ||
411 | vpshufb RIDX1, RT1, RXr1; | ||
412 | vpsubq RIDX2, RT1, RT1; | ||
413 | vpshufb RIDX1, RT1, RXl2; | ||
414 | vpsubq RIDX2, RT1, RT1; | ||
415 | vpshufb RIDX1, RT1, RXr2; | ||
416 | vpsubq RIDX2, RT1, RT1; | ||
417 | vpshufb RIDX1, RT1, RXl3; | ||
418 | vpsubq RIDX2, RT1, RT1; | ||
419 | vpshufb RIDX1, RT1, RXr3; | ||
420 | |||
421 | /* store last IV */ | ||
422 | vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */ | ||
423 | vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */ | ||
424 | vmovq RT1x, (%rcx); | ||
425 | |||
426 | call __blowfish_enc_blk32; | ||
427 | |||
428 | /* dst = src ^ iv */ | ||
429 | vpxor 0*32(%rdx), RXr0, RXr0; | ||
430 | vpxor 1*32(%rdx), RXl0, RXl0; | ||
431 | vpxor 2*32(%rdx), RXr1, RXr1; | ||
432 | vpxor 3*32(%rdx), RXl1, RXl1; | ||
433 | vpxor 4*32(%rdx), RXr2, RXr2; | ||
434 | vpxor 5*32(%rdx), RXl2, RXl2; | ||
435 | vpxor 6*32(%rdx), RXr3, RXr3; | ||
436 | vpxor 7*32(%rdx), RXl3, RXl3; | ||
437 | vmovdqu RXr0, (0*32)(%rsi); | ||
438 | vmovdqu RXl0, (1*32)(%rsi); | ||
439 | vmovdqu RXr1, (2*32)(%rsi); | ||
440 | vmovdqu RXl1, (3*32)(%rsi); | ||
441 | vmovdqu RXr2, (4*32)(%rsi); | ||
442 | vmovdqu RXl2, (5*32)(%rsi); | ||
443 | vmovdqu RXr3, (6*32)(%rsi); | ||
444 | vmovdqu RXl3, (7*32)(%rsi); | ||
445 | |||
446 | vzeroupper; | ||
447 | |||
448 | ret; | ||
449 | ENDPROC(blowfish_ctr_32way) | ||
diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c new file mode 100644 index 000000000000..4417e9aea78d --- /dev/null +++ b/arch/x86/crypto/blowfish_avx2_glue.c | |||
@@ -0,0 +1,585 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | ||
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
8 | * CTR part based on code (crypto/ctr.c) by: | ||
9 | * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/crypto.h> | ||
26 | #include <linux/err.h> | ||
27 | #include <crypto/algapi.h> | ||
28 | #include <crypto/blowfish.h> | ||
29 | #include <crypto/cryptd.h> | ||
30 | #include <crypto/ctr.h> | ||
31 | #include <asm/i387.h> | ||
32 | #include <asm/xcr.h> | ||
33 | #include <asm/xsave.h> | ||
34 | #include <asm/crypto/blowfish.h> | ||
35 | #include <asm/crypto/ablk_helper.h> | ||
36 | #include <crypto/scatterwalk.h> | ||
37 | |||
38 | #define BF_AVX2_PARALLEL_BLOCKS 32 | ||
39 | |||
40 | /* 32-way AVX2 parallel cipher functions */ | ||
41 | asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst, | ||
42 | const u8 *src); | ||
43 | asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst, | ||
44 | const u8 *src); | ||
45 | asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst, | ||
46 | const u8 *src); | ||
47 | asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src, | ||
48 | __be64 *iv); | ||
49 | |||
50 | static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
51 | { | ||
52 | if (fpu_enabled) | ||
53 | return true; | ||
54 | |||
55 | /* FPU is only used when chunk to be processed is large enough, so | ||
56 | * do not enable FPU until it is necessary. | ||
57 | */ | ||
58 | if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS) | ||
59 | return false; | ||
60 | |||
61 | kernel_fpu_begin(); | ||
62 | return true; | ||
63 | } | ||
64 | |||
65 | static inline void bf_fpu_end(bool fpu_enabled) | ||
66 | { | ||
67 | if (fpu_enabled) | ||
68 | kernel_fpu_end(); | ||
69 | } | ||
70 | |||
71 | static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | ||
72 | bool enc) | ||
73 | { | ||
74 | bool fpu_enabled = false; | ||
75 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
76 | const unsigned int bsize = BF_BLOCK_SIZE; | ||
77 | unsigned int nbytes; | ||
78 | int err; | ||
79 | |||
80 | err = blkcipher_walk_virt(desc, walk); | ||
81 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
82 | |||
83 | while ((nbytes = walk->nbytes)) { | ||
84 | u8 *wsrc = walk->src.virt.addr; | ||
85 | u8 *wdst = walk->dst.virt.addr; | ||
86 | |||
87 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
88 | |||
89 | /* Process multi-block AVX2 batch */ | ||
90 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
91 | do { | ||
92 | if (enc) | ||
93 | blowfish_ecb_enc_32way(ctx, wdst, wsrc); | ||
94 | else | ||
95 | blowfish_ecb_dec_32way(ctx, wdst, wsrc); | ||
96 | |||
97 | wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
98 | wdst += bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
99 | nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
100 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
101 | |||
102 | if (nbytes < bsize) | ||
103 | goto done; | ||
104 | } | ||
105 | |||
106 | /* Process multi-block batch */ | ||
107 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
108 | do { | ||
109 | if (enc) | ||
110 | blowfish_enc_blk_4way(ctx, wdst, wsrc); | ||
111 | else | ||
112 | blowfish_dec_blk_4way(ctx, wdst, wsrc); | ||
113 | |||
114 | wsrc += bsize * BF_PARALLEL_BLOCKS; | ||
115 | wdst += bsize * BF_PARALLEL_BLOCKS; | ||
116 | nbytes -= bsize * BF_PARALLEL_BLOCKS; | ||
117 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
118 | |||
119 | if (nbytes < bsize) | ||
120 | goto done; | ||
121 | } | ||
122 | |||
123 | /* Handle leftovers */ | ||
124 | do { | ||
125 | if (enc) | ||
126 | blowfish_enc_blk(ctx, wdst, wsrc); | ||
127 | else | ||
128 | blowfish_dec_blk(ctx, wdst, wsrc); | ||
129 | |||
130 | wsrc += bsize; | ||
131 | wdst += bsize; | ||
132 | nbytes -= bsize; | ||
133 | } while (nbytes >= bsize); | ||
134 | |||
135 | done: | ||
136 | err = blkcipher_walk_done(desc, walk, nbytes); | ||
137 | } | ||
138 | |||
139 | bf_fpu_end(fpu_enabled); | ||
140 | return err; | ||
141 | } | ||
142 | |||
143 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
144 | struct scatterlist *src, unsigned int nbytes) | ||
145 | { | ||
146 | struct blkcipher_walk walk; | ||
147 | |||
148 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
149 | return ecb_crypt(desc, &walk, true); | ||
150 | } | ||
151 | |||
152 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
153 | struct scatterlist *src, unsigned int nbytes) | ||
154 | { | ||
155 | struct blkcipher_walk walk; | ||
156 | |||
157 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
158 | return ecb_crypt(desc, &walk, false); | ||
159 | } | ||
160 | |||
161 | static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, | ||
162 | struct blkcipher_walk *walk) | ||
163 | { | ||
164 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
165 | unsigned int bsize = BF_BLOCK_SIZE; | ||
166 | unsigned int nbytes = walk->nbytes; | ||
167 | u64 *src = (u64 *)walk->src.virt.addr; | ||
168 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
169 | u64 *iv = (u64 *)walk->iv; | ||
170 | |||
171 | do { | ||
172 | *dst = *src ^ *iv; | ||
173 | blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); | ||
174 | iv = dst; | ||
175 | |||
176 | src += 1; | ||
177 | dst += 1; | ||
178 | nbytes -= bsize; | ||
179 | } while (nbytes >= bsize); | ||
180 | |||
181 | *(u64 *)walk->iv = *iv; | ||
182 | return nbytes; | ||
183 | } | ||
184 | |||
185 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
186 | struct scatterlist *src, unsigned int nbytes) | ||
187 | { | ||
188 | struct blkcipher_walk walk; | ||
189 | int err; | ||
190 | |||
191 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
192 | err = blkcipher_walk_virt(desc, &walk); | ||
193 | |||
194 | while ((nbytes = walk.nbytes)) { | ||
195 | nbytes = __cbc_encrypt(desc, &walk); | ||
196 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
197 | } | ||
198 | |||
199 | return err; | ||
200 | } | ||
201 | |||
202 | static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, | ||
203 | struct blkcipher_walk *walk) | ||
204 | { | ||
205 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
206 | const unsigned int bsize = BF_BLOCK_SIZE; | ||
207 | unsigned int nbytes = walk->nbytes; | ||
208 | u64 *src = (u64 *)walk->src.virt.addr; | ||
209 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
210 | u64 last_iv; | ||
211 | int i; | ||
212 | |||
213 | /* Start of the last block. */ | ||
214 | src += nbytes / bsize - 1; | ||
215 | dst += nbytes / bsize - 1; | ||
216 | |||
217 | last_iv = *src; | ||
218 | |||
219 | /* Process multi-block AVX2 batch */ | ||
220 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
221 | do { | ||
222 | nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1); | ||
223 | src -= BF_AVX2_PARALLEL_BLOCKS - 1; | ||
224 | dst -= BF_AVX2_PARALLEL_BLOCKS - 1; | ||
225 | |||
226 | blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src); | ||
227 | |||
228 | nbytes -= bsize; | ||
229 | if (nbytes < bsize) | ||
230 | goto done; | ||
231 | |||
232 | *dst ^= *(src - 1); | ||
233 | src -= 1; | ||
234 | dst -= 1; | ||
235 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
236 | |||
237 | if (nbytes < bsize) | ||
238 | goto done; | ||
239 | } | ||
240 | |||
241 | /* Process multi-block batch */ | ||
242 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
243 | u64 ivs[BF_PARALLEL_BLOCKS - 1]; | ||
244 | |||
245 | do { | ||
246 | nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1); | ||
247 | src -= BF_PARALLEL_BLOCKS - 1; | ||
248 | dst -= BF_PARALLEL_BLOCKS - 1; | ||
249 | |||
250 | for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) | ||
251 | ivs[i] = src[i]; | ||
252 | |||
253 | blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); | ||
254 | |||
255 | for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) | ||
256 | dst[i + 1] ^= ivs[i]; | ||
257 | |||
258 | nbytes -= bsize; | ||
259 | if (nbytes < bsize) | ||
260 | goto done; | ||
261 | |||
262 | *dst ^= *(src - 1); | ||
263 | src -= 1; | ||
264 | dst -= 1; | ||
265 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
266 | |||
267 | if (nbytes < bsize) | ||
268 | goto done; | ||
269 | } | ||
270 | |||
271 | /* Handle leftovers */ | ||
272 | for (;;) { | ||
273 | blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); | ||
274 | |||
275 | nbytes -= bsize; | ||
276 | if (nbytes < bsize) | ||
277 | break; | ||
278 | |||
279 | *dst ^= *(src - 1); | ||
280 | src -= 1; | ||
281 | dst -= 1; | ||
282 | } | ||
283 | |||
284 | done: | ||
285 | *dst ^= *(u64 *)walk->iv; | ||
286 | *(u64 *)walk->iv = last_iv; | ||
287 | |||
288 | return nbytes; | ||
289 | } | ||
290 | |||
291 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
292 | struct scatterlist *src, unsigned int nbytes) | ||
293 | { | ||
294 | bool fpu_enabled = false; | ||
295 | struct blkcipher_walk walk; | ||
296 | int err; | ||
297 | |||
298 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
299 | err = blkcipher_walk_virt(desc, &walk); | ||
300 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
301 | |||
302 | while ((nbytes = walk.nbytes)) { | ||
303 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
304 | nbytes = __cbc_decrypt(desc, &walk); | ||
305 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
306 | } | ||
307 | |||
308 | bf_fpu_end(fpu_enabled); | ||
309 | return err; | ||
310 | } | ||
311 | |||
312 | static void ctr_crypt_final(struct blkcipher_desc *desc, | ||
313 | struct blkcipher_walk *walk) | ||
314 | { | ||
315 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
316 | u8 *ctrblk = walk->iv; | ||
317 | u8 keystream[BF_BLOCK_SIZE]; | ||
318 | u8 *src = walk->src.virt.addr; | ||
319 | u8 *dst = walk->dst.virt.addr; | ||
320 | unsigned int nbytes = walk->nbytes; | ||
321 | |||
322 | blowfish_enc_blk(ctx, keystream, ctrblk); | ||
323 | crypto_xor(keystream, src, nbytes); | ||
324 | memcpy(dst, keystream, nbytes); | ||
325 | |||
326 | crypto_inc(ctrblk, BF_BLOCK_SIZE); | ||
327 | } | ||
328 | |||
329 | static unsigned int __ctr_crypt(struct blkcipher_desc *desc, | ||
330 | struct blkcipher_walk *walk) | ||
331 | { | ||
332 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
333 | unsigned int bsize = BF_BLOCK_SIZE; | ||
334 | unsigned int nbytes = walk->nbytes; | ||
335 | u64 *src = (u64 *)walk->src.virt.addr; | ||
336 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
337 | int i; | ||
338 | |||
339 | /* Process multi-block AVX2 batch */ | ||
340 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
341 | do { | ||
342 | blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src, | ||
343 | (__be64 *)walk->iv); | ||
344 | |||
345 | src += BF_AVX2_PARALLEL_BLOCKS; | ||
346 | dst += BF_AVX2_PARALLEL_BLOCKS; | ||
347 | nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
348 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
349 | |||
350 | if (nbytes < bsize) | ||
351 | goto done; | ||
352 | } | ||
353 | |||
354 | /* Process four block batch */ | ||
355 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
356 | __be64 ctrblocks[BF_PARALLEL_BLOCKS]; | ||
357 | u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); | ||
358 | |||
359 | do { | ||
360 | /* create ctrblks for parallel encrypt */ | ||
361 | for (i = 0; i < BF_PARALLEL_BLOCKS; i++) { | ||
362 | if (dst != src) | ||
363 | dst[i] = src[i]; | ||
364 | |||
365 | ctrblocks[i] = cpu_to_be64(ctrblk++); | ||
366 | } | ||
367 | |||
368 | blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, | ||
369 | (u8 *)ctrblocks); | ||
370 | |||
371 | src += BF_PARALLEL_BLOCKS; | ||
372 | dst += BF_PARALLEL_BLOCKS; | ||
373 | nbytes -= bsize * BF_PARALLEL_BLOCKS; | ||
374 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
375 | |||
376 | *(__be64 *)walk->iv = cpu_to_be64(ctrblk); | ||
377 | |||
378 | if (nbytes < bsize) | ||
379 | goto done; | ||
380 | } | ||
381 | |||
382 | /* Handle leftovers */ | ||
383 | do { | ||
384 | u64 ctrblk; | ||
385 | |||
386 | if (dst != src) | ||
387 | *dst = *src; | ||
388 | |||
389 | ctrblk = *(u64 *)walk->iv; | ||
390 | be64_add_cpu((__be64 *)walk->iv, 1); | ||
391 | |||
392 | blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); | ||
393 | |||
394 | src += 1; | ||
395 | dst += 1; | ||
396 | } while ((nbytes -= bsize) >= bsize); | ||
397 | |||
398 | done: | ||
399 | return nbytes; | ||
400 | } | ||
401 | |||
402 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
403 | struct scatterlist *src, unsigned int nbytes) | ||
404 | { | ||
405 | bool fpu_enabled = false; | ||
406 | struct blkcipher_walk walk; | ||
407 | int err; | ||
408 | |||
409 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
410 | err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); | ||
411 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
412 | |||
413 | while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { | ||
414 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
415 | nbytes = __ctr_crypt(desc, &walk); | ||
416 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
417 | } | ||
418 | |||
419 | bf_fpu_end(fpu_enabled); | ||
420 | |||
421 | if (walk.nbytes) { | ||
422 | ctr_crypt_final(desc, &walk); | ||
423 | err = blkcipher_walk_done(desc, &walk, 0); | ||
424 | } | ||
425 | |||
426 | return err; | ||
427 | } | ||
428 | |||
429 | static struct crypto_alg bf_algs[6] = { { | ||
430 | .cra_name = "__ecb-blowfish-avx2", | ||
431 | .cra_driver_name = "__driver-ecb-blowfish-avx2", | ||
432 | .cra_priority = 0, | ||
433 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
434 | .cra_blocksize = BF_BLOCK_SIZE, | ||
435 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
436 | .cra_alignmask = 0, | ||
437 | .cra_type = &crypto_blkcipher_type, | ||
438 | .cra_module = THIS_MODULE, | ||
439 | .cra_u = { | ||
440 | .blkcipher = { | ||
441 | .min_keysize = BF_MIN_KEY_SIZE, | ||
442 | .max_keysize = BF_MAX_KEY_SIZE, | ||
443 | .setkey = blowfish_setkey, | ||
444 | .encrypt = ecb_encrypt, | ||
445 | .decrypt = ecb_decrypt, | ||
446 | }, | ||
447 | }, | ||
448 | }, { | ||
449 | .cra_name = "__cbc-blowfish-avx2", | ||
450 | .cra_driver_name = "__driver-cbc-blowfish-avx2", | ||
451 | .cra_priority = 0, | ||
452 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
453 | .cra_blocksize = BF_BLOCK_SIZE, | ||
454 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
455 | .cra_alignmask = 0, | ||
456 | .cra_type = &crypto_blkcipher_type, | ||
457 | .cra_module = THIS_MODULE, | ||
458 | .cra_u = { | ||
459 | .blkcipher = { | ||
460 | .min_keysize = BF_MIN_KEY_SIZE, | ||
461 | .max_keysize = BF_MAX_KEY_SIZE, | ||
462 | .setkey = blowfish_setkey, | ||
463 | .encrypt = cbc_encrypt, | ||
464 | .decrypt = cbc_decrypt, | ||
465 | }, | ||
466 | }, | ||
467 | }, { | ||
468 | .cra_name = "__ctr-blowfish-avx2", | ||
469 | .cra_driver_name = "__driver-ctr-blowfish-avx2", | ||
470 | .cra_priority = 0, | ||
471 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
472 | .cra_blocksize = 1, | ||
473 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
474 | .cra_alignmask = 0, | ||
475 | .cra_type = &crypto_blkcipher_type, | ||
476 | .cra_module = THIS_MODULE, | ||
477 | .cra_u = { | ||
478 | .blkcipher = { | ||
479 | .min_keysize = BF_MIN_KEY_SIZE, | ||
480 | .max_keysize = BF_MAX_KEY_SIZE, | ||
481 | .ivsize = BF_BLOCK_SIZE, | ||
482 | .setkey = blowfish_setkey, | ||
483 | .encrypt = ctr_crypt, | ||
484 | .decrypt = ctr_crypt, | ||
485 | }, | ||
486 | }, | ||
487 | }, { | ||
488 | .cra_name = "ecb(blowfish)", | ||
489 | .cra_driver_name = "ecb-blowfish-avx2", | ||
490 | .cra_priority = 400, | ||
491 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
492 | .cra_blocksize = BF_BLOCK_SIZE, | ||
493 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
494 | .cra_alignmask = 0, | ||
495 | .cra_type = &crypto_ablkcipher_type, | ||
496 | .cra_module = THIS_MODULE, | ||
497 | .cra_init = ablk_init, | ||
498 | .cra_exit = ablk_exit, | ||
499 | .cra_u = { | ||
500 | .ablkcipher = { | ||
501 | .min_keysize = BF_MIN_KEY_SIZE, | ||
502 | .max_keysize = BF_MAX_KEY_SIZE, | ||
503 | .setkey = ablk_set_key, | ||
504 | .encrypt = ablk_encrypt, | ||
505 | .decrypt = ablk_decrypt, | ||
506 | }, | ||
507 | }, | ||
508 | }, { | ||
509 | .cra_name = "cbc(blowfish)", | ||
510 | .cra_driver_name = "cbc-blowfish-avx2", | ||
511 | .cra_priority = 400, | ||
512 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
513 | .cra_blocksize = BF_BLOCK_SIZE, | ||
514 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
515 | .cra_alignmask = 0, | ||
516 | .cra_type = &crypto_ablkcipher_type, | ||
517 | .cra_module = THIS_MODULE, | ||
518 | .cra_init = ablk_init, | ||
519 | .cra_exit = ablk_exit, | ||
520 | .cra_u = { | ||
521 | .ablkcipher = { | ||
522 | .min_keysize = BF_MIN_KEY_SIZE, | ||
523 | .max_keysize = BF_MAX_KEY_SIZE, | ||
524 | .ivsize = BF_BLOCK_SIZE, | ||
525 | .setkey = ablk_set_key, | ||
526 | .encrypt = __ablk_encrypt, | ||
527 | .decrypt = ablk_decrypt, | ||
528 | }, | ||
529 | }, | ||
530 | }, { | ||
531 | .cra_name = "ctr(blowfish)", | ||
532 | .cra_driver_name = "ctr-blowfish-avx2", | ||
533 | .cra_priority = 400, | ||
534 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
535 | .cra_blocksize = 1, | ||
536 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
537 | .cra_alignmask = 0, | ||
538 | .cra_type = &crypto_ablkcipher_type, | ||
539 | .cra_module = THIS_MODULE, | ||
540 | .cra_init = ablk_init, | ||
541 | .cra_exit = ablk_exit, | ||
542 | .cra_u = { | ||
543 | .ablkcipher = { | ||
544 | .min_keysize = BF_MIN_KEY_SIZE, | ||
545 | .max_keysize = BF_MAX_KEY_SIZE, | ||
546 | .ivsize = BF_BLOCK_SIZE, | ||
547 | .setkey = ablk_set_key, | ||
548 | .encrypt = ablk_encrypt, | ||
549 | .decrypt = ablk_encrypt, | ||
550 | .geniv = "chainiv", | ||
551 | }, | ||
552 | }, | ||
553 | } }; | ||
554 | |||
555 | |||
556 | static int __init init(void) | ||
557 | { | ||
558 | u64 xcr0; | ||
559 | |||
560 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
561 | pr_info("AVX2 instructions are not detected.\n"); | ||
562 | return -ENODEV; | ||
563 | } | ||
564 | |||
565 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
566 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
567 | pr_info("AVX detected but unusable.\n"); | ||
568 | return -ENODEV; | ||
569 | } | ||
570 | |||
571 | return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); | ||
572 | } | ||
573 | |||
574 | static void __exit fini(void) | ||
575 | { | ||
576 | crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); | ||
577 | } | ||
578 | |||
579 | module_init(init); | ||
580 | module_exit(fini); | ||
581 | |||
582 | MODULE_LICENSE("GPL"); | ||
583 | MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized"); | ||
584 | MODULE_ALIAS("blowfish"); | ||
585 | MODULE_ALIAS("blowfish-asm"); | ||
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 50ec333b70e6..3548d76dbaa9 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Glue Code for assembler optimized version of Blowfish | 2 | * Glue Code for assembler optimized version of Blowfish |
3 | * | 3 | * |
4 | * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | 6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: |
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | 7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> |
@@ -32,40 +32,24 @@ | |||
32 | #include <linux/module.h> | 32 | #include <linux/module.h> |
33 | #include <linux/types.h> | 33 | #include <linux/types.h> |
34 | #include <crypto/algapi.h> | 34 | #include <crypto/algapi.h> |
35 | #include <asm/crypto/blowfish.h> | ||
35 | 36 | ||
36 | /* regular block cipher functions */ | 37 | /* regular block cipher functions */ |
37 | asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, | 38 | asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, |
38 | bool xor); | 39 | bool xor); |
40 | EXPORT_SYMBOL_GPL(__blowfish_enc_blk); | ||
41 | |||
39 | asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); | 42 | asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); |
43 | EXPORT_SYMBOL_GPL(blowfish_dec_blk); | ||
40 | 44 | ||
41 | /* 4-way parallel cipher functions */ | 45 | /* 4-way parallel cipher functions */ |
42 | asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | 46 | asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, |
43 | const u8 *src, bool xor); | 47 | const u8 *src, bool xor); |
48 | EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way); | ||
49 | |||
44 | asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, | 50 | asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, |
45 | const u8 *src); | 51 | const u8 *src); |
46 | 52 | EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way); | |
47 | static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) | ||
48 | { | ||
49 | __blowfish_enc_blk(ctx, dst, src, false); | ||
50 | } | ||
51 | |||
52 | static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, | ||
53 | const u8 *src) | ||
54 | { | ||
55 | __blowfish_enc_blk(ctx, dst, src, true); | ||
56 | } | ||
57 | |||
58 | static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
59 | const u8 *src) | ||
60 | { | ||
61 | __blowfish_enc_blk_4way(ctx, dst, src, false); | ||
62 | } | ||
63 | |||
64 | static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, | ||
65 | const u8 *src) | ||
66 | { | ||
67 | __blowfish_enc_blk_4way(ctx, dst, src, true); | ||
68 | } | ||
69 | 53 | ||
70 | static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | 54 | static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) |
71 | { | 55 | { |