about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2013-04-13 06:46:45 -0400
committerHerbert Xu <herbert@gondor.apana.org.au>2013-04-25 09:09:04 -0400
commit604880107010a1e5794552d184cd5471ea31b973 (patch)
treeed37e3b3e8454f758daab88a2fb9cb5f043ca8ad
parentad8b7c3e92868dd86c54d9d5321000bbb4096f0d (diff)
crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
Patch adds AVX2/x86-64 implementation of Blowfish cipher, requiring 32 parallel blocks for input (256 bytes). Table look-ups are performed using vpgatherdd instruction directly from vector registers and thus should be faster than earlier implementations.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--arch/x86/crypto/Makefile11
-rw-r--r--arch/x86/crypto/blowfish-avx2-asm_64.S449
-rw-r--r--arch/x86/crypto/blowfish_avx2_glue.c585
-rw-r--r--arch/x86/crypto/blowfish_glue.c32
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/crypto/blowfish.h43
-rw-r--r--crypto/Kconfig18
-rw-r--r--crypto/testmgr.c12
8 files changed, 1127 insertions, 24 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 03cd7313ad4b..28464ef6fa52 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -3,6 +3,8 @@
3# 3#
4 4
5avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) 5avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
6avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
7 $(comma)4)$(comma)%ymm2,yes,no)
6 8
7obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o 9obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
8obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o 10obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -38,6 +40,11 @@ ifeq ($(avx_supported),yes)
38 obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o 40 obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
39endif 41endif
40 42
43# These modules require assembler to support AVX2.
44ifeq ($(avx2_supported),yes)
45 obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
46endif
47
41aes-i586-y := aes-i586-asm_32.o aes_glue.o 48aes-i586-y := aes-i586-asm_32.o aes_glue.o
42twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 49twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
43salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o 50salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
@@ -62,6 +69,10 @@ ifeq ($(avx_supported),yes)
62 serpent_avx_glue.o 69 serpent_avx_glue.o
63endif 70endif
64 71
72ifeq ($(avx2_supported),yes)
73 blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
74endif
75
65aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 76aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
66ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 77ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
67sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o 78sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S
new file mode 100644
index 000000000000..784452e0d05d
--- /dev/null
+++ b/arch/x86/crypto/blowfish-avx2-asm_64.S
@@ -0,0 +1,449 @@
1/*
2 * x86_64/AVX2 assembler optimized version of Blowfish
3 *
4 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13#include <linux/linkage.h>
14
15.file "blowfish-avx2-asm_64.S"
16
17.data
18.align 32
19
20.Lprefetch_mask:
21.long 0*64
22.long 1*64
23.long 2*64
24.long 3*64
25.long 4*64
26.long 5*64
27.long 6*64
28.long 7*64
29
30.Lbswap32_mask:
31.long 0x00010203
32.long 0x04050607
33.long 0x08090a0b
34.long 0x0c0d0e0f
35
36.Lbswap128_mask:
37 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
38.Lbswap_iv_mask:
39 .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
40
41.text
42/* structure of crypto context */
43#define p 0
44#define s0 ((16 + 2) * 4)
45#define s1 ((16 + 2 + (1 * 256)) * 4)
46#define s2 ((16 + 2 + (2 * 256)) * 4)
47#define s3 ((16 + 2 + (3 * 256)) * 4)
48
49/* register macros */
50#define CTX %rdi
51#define RIO %rdx
52
53#define RS0 %rax
54#define RS1 %r8
55#define RS2 %r9
56#define RS3 %r10
57
58#define RLOOP %r11
59#define RLOOPd %r11d
60
61#define RXr0 %ymm8
62#define RXr1 %ymm9
63#define RXr2 %ymm10
64#define RXr3 %ymm11
65#define RXl0 %ymm12
66#define RXl1 %ymm13
67#define RXl2 %ymm14
68#define RXl3 %ymm15
69
70/* temp regs */
71#define RT0 %ymm0
72#define RT0x %xmm0
73#define RT1 %ymm1
74#define RT1x %xmm1
75#define RIDX0 %ymm2
76#define RIDX1 %ymm3
77#define RIDX1x %xmm3
78#define RIDX2 %ymm4
79#define RIDX3 %ymm5
80
81/* vpgatherdd mask and '-1' */
82#define RNOT %ymm6
83
84/* byte mask, (-1 >> 24) */
85#define RBYTE %ymm7
86
87/***********************************************************************
88 * 32-way AVX2 blowfish
89 ***********************************************************************/
90#define F(xl, xr) \
91 vpsrld $24, xl, RIDX0; \
92 vpsrld $16, xl, RIDX1; \
93 vpsrld $8, xl, RIDX2; \
94 vpand RBYTE, RIDX1, RIDX1; \
95 vpand RBYTE, RIDX2, RIDX2; \
96 vpand RBYTE, xl, RIDX3; \
97 \
98 vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
99 vpcmpeqd RNOT, RNOT, RNOT; \
100 vpcmpeqd RIDX0, RIDX0, RIDX0; \
101 \
102 vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
103 vpcmpeqd RIDX1, RIDX1, RIDX1; \
104 vpaddd RT0, RT1, RT0; \
105 \
106 vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
107 vpxor RT0, RT1, RT0; \
108 \
109 vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
110 vpcmpeqd RNOT, RNOT, RNOT; \
111 vpaddd RT0, RT1, RT0; \
112 \
113 vpxor RT0, xr, xr;
114
115#define add_roundkey(xl, nmem) \
116 vpbroadcastd nmem, RT0; \
117 vpxor RT0, xl ## 0, xl ## 0; \
118 vpxor RT0, xl ## 1, xl ## 1; \
119 vpxor RT0, xl ## 2, xl ## 2; \
120 vpxor RT0, xl ## 3, xl ## 3;
121
122#define round_enc() \
123 add_roundkey(RXr, p(CTX,RLOOP,4)); \
124 F(RXl0, RXr0); \
125 F(RXl1, RXr1); \
126 F(RXl2, RXr2); \
127 F(RXl3, RXr3); \
128 \
129 add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
130 F(RXr0, RXl0); \
131 F(RXr1, RXl1); \
132 F(RXr2, RXl2); \
133 F(RXr3, RXl3);
134
135#define round_dec() \
136 add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
137 F(RXl0, RXr0); \
138 F(RXl1, RXr1); \
139 F(RXl2, RXr2); \
140 F(RXl3, RXr3); \
141 \
142 add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
143 F(RXr0, RXl0); \
144 F(RXr1, RXl1); \
145 F(RXr2, RXl2); \
146 F(RXr3, RXl3);
147
148#define init_round_constants() \
149 vpcmpeqd RNOT, RNOT, RNOT; \
150 leaq s0(CTX), RS0; \
151 leaq s1(CTX), RS1; \
152 leaq s2(CTX), RS2; \
153 leaq s3(CTX), RS3; \
154 vpsrld $24, RNOT, RBYTE;
155
156#define transpose_2x2(x0, x1, t0) \
157 vpunpckldq x0, x1, t0; \
158 vpunpckhdq x0, x1, x1; \
159 \
160 vpunpcklqdq t0, x1, x0; \
161 vpunpckhqdq t0, x1, x1;
162
163#define read_block(xl, xr) \
164 vbroadcasti128 .Lbswap32_mask, RT1; \
165 \
166 vpshufb RT1, xl ## 0, xl ## 0; \
167 vpshufb RT1, xr ## 0, xr ## 0; \
168 vpshufb RT1, xl ## 1, xl ## 1; \
169 vpshufb RT1, xr ## 1, xr ## 1; \
170 vpshufb RT1, xl ## 2, xl ## 2; \
171 vpshufb RT1, xr ## 2, xr ## 2; \
172 vpshufb RT1, xl ## 3, xl ## 3; \
173 vpshufb RT1, xr ## 3, xr ## 3; \
174 \
175 transpose_2x2(xl ## 0, xr ## 0, RT0); \
176 transpose_2x2(xl ## 1, xr ## 1, RT0); \
177 transpose_2x2(xl ## 2, xr ## 2, RT0); \
178 transpose_2x2(xl ## 3, xr ## 3, RT0);
179
180#define write_block(xl, xr) \
181 vbroadcasti128 .Lbswap32_mask, RT1; \
182 \
183 transpose_2x2(xl ## 0, xr ## 0, RT0); \
184 transpose_2x2(xl ## 1, xr ## 1, RT0); \
185 transpose_2x2(xl ## 2, xr ## 2, RT0); \
186 transpose_2x2(xl ## 3, xr ## 3, RT0); \
187 \
188 vpshufb RT1, xl ## 0, xl ## 0; \
189 vpshufb RT1, xr ## 0, xr ## 0; \
190 vpshufb RT1, xl ## 1, xl ## 1; \
191 vpshufb RT1, xr ## 1, xr ## 1; \
192 vpshufb RT1, xl ## 2, xl ## 2; \
193 vpshufb RT1, xr ## 2, xr ## 2; \
194 vpshufb RT1, xl ## 3, xl ## 3; \
195 vpshufb RT1, xr ## 3, xr ## 3;
196
197.align 8
198__blowfish_enc_blk32:
199 /* input:
200 * %rdi: ctx, CTX
201 * RXl0..4, RXr0..4: plaintext
202 * output:
203 * RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
204 */
205 init_round_constants();
206
207 read_block(RXl, RXr);
208
209 movl $1, RLOOPd;
210 add_roundkey(RXl, p+4*(0)(CTX));
211
212.align 4
213.L__enc_loop:
214 round_enc();
215
216 leal 2(RLOOPd), RLOOPd;
217 cmpl $17, RLOOPd;
218 jne .L__enc_loop;
219
220 add_roundkey(RXr, p+4*(17)(CTX));
221
222 write_block(RXl, RXr);
223
224 ret;
225ENDPROC(__blowfish_enc_blk32)
226
227.align 8
228__blowfish_dec_blk32:
229 /* input:
230 * %rdi: ctx, CTX
231 * RXl0..4, RXr0..4: ciphertext
232 * output:
233 * RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
234 */
235 init_round_constants();
236
237 read_block(RXl, RXr);
238
239 movl $14, RLOOPd;
240 add_roundkey(RXl, p+4*(17)(CTX));
241
242.align 4
243.L__dec_loop:
244 round_dec();
245
246 addl $-2, RLOOPd;
247 jns .L__dec_loop;
248
249 add_roundkey(RXr, p+4*(0)(CTX));
250
251 write_block(RXl, RXr);
252
253 ret;
254ENDPROC(__blowfish_dec_blk32)
255
256ENTRY(blowfish_ecb_enc_32way)
257 /* input:
258 * %rdi: ctx, CTX
259 * %rsi: dst
260 * %rdx: src
261 */
262
263 vzeroupper;
264
265 vmovdqu 0*32(%rdx), RXl0;
266 vmovdqu 1*32(%rdx), RXr0;
267 vmovdqu 2*32(%rdx), RXl1;
268 vmovdqu 3*32(%rdx), RXr1;
269 vmovdqu 4*32(%rdx), RXl2;
270 vmovdqu 5*32(%rdx), RXr2;
271 vmovdqu 6*32(%rdx), RXl3;
272 vmovdqu 7*32(%rdx), RXr3;
273
274 call __blowfish_enc_blk32;
275
276 vmovdqu RXr0, 0*32(%rsi);
277 vmovdqu RXl0, 1*32(%rsi);
278 vmovdqu RXr1, 2*32(%rsi);
279 vmovdqu RXl1, 3*32(%rsi);
280 vmovdqu RXr2, 4*32(%rsi);
281 vmovdqu RXl2, 5*32(%rsi);
282 vmovdqu RXr3, 6*32(%rsi);
283 vmovdqu RXl3, 7*32(%rsi);
284
285 vzeroupper;
286
287 ret;
288ENDPROC(blowfish_ecb_enc_32way)
289
290ENTRY(blowfish_ecb_dec_32way)
291 /* input:
292 * %rdi: ctx, CTX
293 * %rsi: dst
294 * %rdx: src
295 */
296
297 vzeroupper;
298
299 vmovdqu 0*32(%rdx), RXl0;
300 vmovdqu 1*32(%rdx), RXr0;
301 vmovdqu 2*32(%rdx), RXl1;
302 vmovdqu 3*32(%rdx), RXr1;
303 vmovdqu 4*32(%rdx), RXl2;
304 vmovdqu 5*32(%rdx), RXr2;
305 vmovdqu 6*32(%rdx), RXl3;
306 vmovdqu 7*32(%rdx), RXr3;
307
308 call __blowfish_dec_blk32;
309
310 vmovdqu RXr0, 0*32(%rsi);
311 vmovdqu RXl0, 1*32(%rsi);
312 vmovdqu RXr1, 2*32(%rsi);
313 vmovdqu RXl1, 3*32(%rsi);
314 vmovdqu RXr2, 4*32(%rsi);
315 vmovdqu RXl2, 5*32(%rsi);
316 vmovdqu RXr3, 6*32(%rsi);
317 vmovdqu RXl3, 7*32(%rsi);
318
319 vzeroupper;
320
321 ret;
322ENDPROC(blowfish_ecb_dec_32way)
323
324ENTRY(blowfish_cbc_dec_32way)
325 /* input:
326 * %rdi: ctx, CTX
327 * %rsi: dst
328 * %rdx: src
329 */
330
331 vzeroupper;
332
333 vmovdqu 0*32(%rdx), RXl0;
334 vmovdqu 1*32(%rdx), RXr0;
335 vmovdqu 2*32(%rdx), RXl1;
336 vmovdqu 3*32(%rdx), RXr1;
337 vmovdqu 4*32(%rdx), RXl2;
338 vmovdqu 5*32(%rdx), RXr2;
339 vmovdqu 6*32(%rdx), RXl3;
340 vmovdqu 7*32(%rdx), RXr3;
341
342 call __blowfish_dec_blk32;
343
344 /* xor with src */
345 vmovq (%rdx), RT0x;
346 vpshufd $0x4f, RT0x, RT0x;
347 vinserti128 $1, 8(%rdx), RT0, RT0;
348 vpxor RT0, RXr0, RXr0;
349 vpxor 0*32+24(%rdx), RXl0, RXl0;
350 vpxor 1*32+24(%rdx), RXr1, RXr1;
351 vpxor 2*32+24(%rdx), RXl1, RXl1;
352 vpxor 3*32+24(%rdx), RXr2, RXr2;
353 vpxor 4*32+24(%rdx), RXl2, RXl2;
354 vpxor 5*32+24(%rdx), RXr3, RXr3;
355 vpxor 6*32+24(%rdx), RXl3, RXl3;
356
357 vmovdqu RXr0, (0*32)(%rsi);
358 vmovdqu RXl0, (1*32)(%rsi);
359 vmovdqu RXr1, (2*32)(%rsi);
360 vmovdqu RXl1, (3*32)(%rsi);
361 vmovdqu RXr2, (4*32)(%rsi);
362 vmovdqu RXl2, (5*32)(%rsi);
363 vmovdqu RXr3, (6*32)(%rsi);
364 vmovdqu RXl3, (7*32)(%rsi);
365
366 vzeroupper;
367
368 ret;
369ENDPROC(blowfish_cbc_dec_32way)
370
371ENTRY(blowfish_ctr_32way)
372 /* input:
373 * %rdi: ctx, CTX
374 * %rsi: dst
375 * %rdx: src
376 * %rcx: iv (big endian, 64bit)
377 */
378
379 vzeroupper;
380
381 vpcmpeqd RT0, RT0, RT0;
382 vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
383
384 vpcmpeqd RT1x, RT1x, RT1x;
385 vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
386 vpxor RIDX0, RIDX0, RIDX0;
387 vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
388
389 vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
390
391 vpcmpeqd RT1, RT1, RT1;
392 vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
393 vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
394
395 vbroadcasti128 .Lbswap_iv_mask, RIDX0;
396 vbroadcasti128 .Lbswap128_mask, RIDX1;
397
398 /* load IV and byteswap */
399 vmovq (%rcx), RT1x;
400 vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
401 vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
402
403 /* construct IVs */
404 vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */
405 vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */
406 vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */
407 vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */
408 vpsubq RIDX2, RT1, RT1;
409 vpshufb RIDX1, RT1, RXl1;
410 vpsubq RIDX2, RT1, RT1;
411 vpshufb RIDX1, RT1, RXr1;
412 vpsubq RIDX2, RT1, RT1;
413 vpshufb RIDX1, RT1, RXl2;
414 vpsubq RIDX2, RT1, RT1;
415 vpshufb RIDX1, RT1, RXr2;
416 vpsubq RIDX2, RT1, RT1;
417 vpshufb RIDX1, RT1, RXl3;
418 vpsubq RIDX2, RT1, RT1;
419 vpshufb RIDX1, RT1, RXr3;
420
421 /* store last IV */
422 vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
423 vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
424 vmovq RT1x, (%rcx);
425
426 call __blowfish_enc_blk32;
427
428 /* dst = src ^ iv */
429 vpxor 0*32(%rdx), RXr0, RXr0;
430 vpxor 1*32(%rdx), RXl0, RXl0;
431 vpxor 2*32(%rdx), RXr1, RXr1;
432 vpxor 3*32(%rdx), RXl1, RXl1;
433 vpxor 4*32(%rdx), RXr2, RXr2;
434 vpxor 5*32(%rdx), RXl2, RXl2;
435 vpxor 6*32(%rdx), RXr3, RXr3;
436 vpxor 7*32(%rdx), RXl3, RXl3;
437 vmovdqu RXr0, (0*32)(%rsi);
438 vmovdqu RXl0, (1*32)(%rsi);
439 vmovdqu RXr1, (2*32)(%rsi);
440 vmovdqu RXl1, (3*32)(%rsi);
441 vmovdqu RXr2, (4*32)(%rsi);
442 vmovdqu RXl2, (5*32)(%rsi);
443 vmovdqu RXr3, (6*32)(%rsi);
444 vmovdqu RXl3, (7*32)(%rsi);
445
446 vzeroupper;
447
448 ret;
449ENDPROC(blowfish_ctr_32way)
diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c
new file mode 100644
index 000000000000..4417e9aea78d
--- /dev/null
+++ b/arch/x86/crypto/blowfish_avx2_glue.c
@@ -0,0 +1,585 @@
1/*
2 * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
3 *
4 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
8 * CTR part based on code (crypto/ctr.c) by:
9 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/crypto.h>
26#include <linux/err.h>
27#include <crypto/algapi.h>
28#include <crypto/blowfish.h>
29#include <crypto/cryptd.h>
30#include <crypto/ctr.h>
31#include <asm/i387.h>
32#include <asm/xcr.h>
33#include <asm/xsave.h>
34#include <asm/crypto/blowfish.h>
35#include <asm/crypto/ablk_helper.h>
36#include <crypto/scatterwalk.h>
37
38#define BF_AVX2_PARALLEL_BLOCKS 32
39
40/* 32-way AVX2 parallel cipher functions */
41asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
42 const u8 *src);
43asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
44 const u8 *src);
45asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
46 const u8 *src);
47asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
48 __be64 *iv);
49
50static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
51{
52 if (fpu_enabled)
53 return true;
54
55 /* FPU is only used when chunk to be processed is large enough, so
56 * do not enable FPU until it is necessary.
57 */
58 if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
59 return false;
60
61 kernel_fpu_begin();
62 return true;
63}
64
65static inline void bf_fpu_end(bool fpu_enabled)
66{
67 if (fpu_enabled)
68 kernel_fpu_end();
69}
70
71static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
72 bool enc)
73{
74 bool fpu_enabled = false;
75 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
76 const unsigned int bsize = BF_BLOCK_SIZE;
77 unsigned int nbytes;
78 int err;
79
80 err = blkcipher_walk_virt(desc, walk);
81 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
82
83 while ((nbytes = walk->nbytes)) {
84 u8 *wsrc = walk->src.virt.addr;
85 u8 *wdst = walk->dst.virt.addr;
86
87 fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
88
89 /* Process multi-block AVX2 batch */
90 if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
91 do {
92 if (enc)
93 blowfish_ecb_enc_32way(ctx, wdst, wsrc);
94 else
95 blowfish_ecb_dec_32way(ctx, wdst, wsrc);
96
97 wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
98 wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
99 nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
100 } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
101
102 if (nbytes < bsize)
103 goto done;
104 }
105
106 /* Process multi-block batch */
107 if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
108 do {
109 if (enc)
110 blowfish_enc_blk_4way(ctx, wdst, wsrc);
111 else
112 blowfish_dec_blk_4way(ctx, wdst, wsrc);
113
114 wsrc += bsize * BF_PARALLEL_BLOCKS;
115 wdst += bsize * BF_PARALLEL_BLOCKS;
116 nbytes -= bsize * BF_PARALLEL_BLOCKS;
117 } while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
118
119 if (nbytes < bsize)
120 goto done;
121 }
122
123 /* Handle leftovers */
124 do {
125 if (enc)
126 blowfish_enc_blk(ctx, wdst, wsrc);
127 else
128 blowfish_dec_blk(ctx, wdst, wsrc);
129
130 wsrc += bsize;
131 wdst += bsize;
132 nbytes -= bsize;
133 } while (nbytes >= bsize);
134
135done:
136 err = blkcipher_walk_done(desc, walk, nbytes);
137 }
138
139 bf_fpu_end(fpu_enabled);
140 return err;
141}
142
143static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
144 struct scatterlist *src, unsigned int nbytes)
145{
146 struct blkcipher_walk walk;
147
148 blkcipher_walk_init(&walk, dst, src, nbytes);
149 return ecb_crypt(desc, &walk, true);
150}
151
152static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
153 struct scatterlist *src, unsigned int nbytes)
154{
155 struct blkcipher_walk walk;
156
157 blkcipher_walk_init(&walk, dst, src, nbytes);
158 return ecb_crypt(desc, &walk, false);
159}
160
161static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
162 struct blkcipher_walk *walk)
163{
164 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
165 unsigned int bsize = BF_BLOCK_SIZE;
166 unsigned int nbytes = walk->nbytes;
167 u64 *src = (u64 *)walk->src.virt.addr;
168 u64 *dst = (u64 *)walk->dst.virt.addr;
169 u64 *iv = (u64 *)walk->iv;
170
171 do {
172 *dst = *src ^ *iv;
173 blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
174 iv = dst;
175
176 src += 1;
177 dst += 1;
178 nbytes -= bsize;
179 } while (nbytes >= bsize);
180
181 *(u64 *)walk->iv = *iv;
182 return nbytes;
183}
184
185static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
186 struct scatterlist *src, unsigned int nbytes)
187{
188 struct blkcipher_walk walk;
189 int err;
190
191 blkcipher_walk_init(&walk, dst, src, nbytes);
192 err = blkcipher_walk_virt(desc, &walk);
193
194 while ((nbytes = walk.nbytes)) {
195 nbytes = __cbc_encrypt(desc, &walk);
196 err = blkcipher_walk_done(desc, &walk, nbytes);
197 }
198
199 return err;
200}
201
202static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
203 struct blkcipher_walk *walk)
204{
205 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
206 const unsigned int bsize = BF_BLOCK_SIZE;
207 unsigned int nbytes = walk->nbytes;
208 u64 *src = (u64 *)walk->src.virt.addr;
209 u64 *dst = (u64 *)walk->dst.virt.addr;
210 u64 last_iv;
211 int i;
212
213 /* Start of the last block. */
214 src += nbytes / bsize - 1;
215 dst += nbytes / bsize - 1;
216
217 last_iv = *src;
218
219 /* Process multi-block AVX2 batch */
220 if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
221 do {
222 nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
223 src -= BF_AVX2_PARALLEL_BLOCKS - 1;
224 dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
225
226 blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
227
228 nbytes -= bsize;
229 if (nbytes < bsize)
230 goto done;
231
232 *dst ^= *(src - 1);
233 src -= 1;
234 dst -= 1;
235 } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
236
237 if (nbytes < bsize)
238 goto done;
239 }
240
241 /* Process multi-block batch */
242 if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
243 u64 ivs[BF_PARALLEL_BLOCKS - 1];
244
245 do {
246 nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
247 src -= BF_PARALLEL_BLOCKS - 1;
248 dst -= BF_PARALLEL_BLOCKS - 1;
249
250 for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
251 ivs[i] = src[i];
252
253 blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
254
255 for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
256 dst[i + 1] ^= ivs[i];
257
258 nbytes -= bsize;
259 if (nbytes < bsize)
260 goto done;
261
262 *dst ^= *(src - 1);
263 src -= 1;
264 dst -= 1;
265 } while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
266
267 if (nbytes < bsize)
268 goto done;
269 }
270
271 /* Handle leftovers */
272 for (;;) {
273 blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
274
275 nbytes -= bsize;
276 if (nbytes < bsize)
277 break;
278
279 *dst ^= *(src - 1);
280 src -= 1;
281 dst -= 1;
282 }
283
284done:
285 *dst ^= *(u64 *)walk->iv;
286 *(u64 *)walk->iv = last_iv;
287
288 return nbytes;
289}
290
291static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
292 struct scatterlist *src, unsigned int nbytes)
293{
294 bool fpu_enabled = false;
295 struct blkcipher_walk walk;
296 int err;
297
298 blkcipher_walk_init(&walk, dst, src, nbytes);
299 err = blkcipher_walk_virt(desc, &walk);
300 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
301
302 while ((nbytes = walk.nbytes)) {
303 fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
304 nbytes = __cbc_decrypt(desc, &walk);
305 err = blkcipher_walk_done(desc, &walk, nbytes);
306 }
307
308 bf_fpu_end(fpu_enabled);
309 return err;
310}
311
312static void ctr_crypt_final(struct blkcipher_desc *desc,
313 struct blkcipher_walk *walk)
314{
315 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
316 u8 *ctrblk = walk->iv;
317 u8 keystream[BF_BLOCK_SIZE];
318 u8 *src = walk->src.virt.addr;
319 u8 *dst = walk->dst.virt.addr;
320 unsigned int nbytes = walk->nbytes;
321
322 blowfish_enc_blk(ctx, keystream, ctrblk);
323 crypto_xor(keystream, src, nbytes);
324 memcpy(dst, keystream, nbytes);
325
326 crypto_inc(ctrblk, BF_BLOCK_SIZE);
327}
328
329static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
330 struct blkcipher_walk *walk)
331{
332 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
333 unsigned int bsize = BF_BLOCK_SIZE;
334 unsigned int nbytes = walk->nbytes;
335 u64 *src = (u64 *)walk->src.virt.addr;
336 u64 *dst = (u64 *)walk->dst.virt.addr;
337 int i;
338
339 /* Process multi-block AVX2 batch */
340 if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
341 do {
342 blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
343 (__be64 *)walk->iv);
344
345 src += BF_AVX2_PARALLEL_BLOCKS;
346 dst += BF_AVX2_PARALLEL_BLOCKS;
347 nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
348 } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
349
350 if (nbytes < bsize)
351 goto done;
352 }
353
354 /* Process four block batch */
355 if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
356 __be64 ctrblocks[BF_PARALLEL_BLOCKS];
357 u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
358
359 do {
360 /* create ctrblks for parallel encrypt */
361 for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
362 if (dst != src)
363 dst[i] = src[i];
364
365 ctrblocks[i] = cpu_to_be64(ctrblk++);
366 }
367
368 blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
369 (u8 *)ctrblocks);
370
371 src += BF_PARALLEL_BLOCKS;
372 dst += BF_PARALLEL_BLOCKS;
373 nbytes -= bsize * BF_PARALLEL_BLOCKS;
374 } while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
375
376 *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
377
378 if (nbytes < bsize)
379 goto done;
380 }
381
382 /* Handle leftovers */
383 do {
384 u64 ctrblk;
385
386 if (dst != src)
387 *dst = *src;
388
389 ctrblk = *(u64 *)walk->iv;
390 be64_add_cpu((__be64 *)walk->iv, 1);
391
392 blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
393
394 src += 1;
395 dst += 1;
396 } while ((nbytes -= bsize) >= bsize);
397
398done:
399 return nbytes;
400}
401
402static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
403 struct scatterlist *src, unsigned int nbytes)
404{
405 bool fpu_enabled = false;
406 struct blkcipher_walk walk;
407 int err;
408
409 blkcipher_walk_init(&walk, dst, src, nbytes);
410 err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
411 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
412
413 while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
414 fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
415 nbytes = __ctr_crypt(desc, &walk);
416 err = blkcipher_walk_done(desc, &walk, nbytes);
417 }
418
419 bf_fpu_end(fpu_enabled);
420
421 if (walk.nbytes) {
422 ctr_crypt_final(desc, &walk);
423 err = blkcipher_walk_done(desc, &walk, 0);
424 }
425
426 return err;
427}
428
429static struct crypto_alg bf_algs[6] = { {
430 .cra_name = "__ecb-blowfish-avx2",
431 .cra_driver_name = "__driver-ecb-blowfish-avx2",
432 .cra_priority = 0,
433 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
434 .cra_blocksize = BF_BLOCK_SIZE,
435 .cra_ctxsize = sizeof(struct bf_ctx),
436 .cra_alignmask = 0,
437 .cra_type = &crypto_blkcipher_type,
438 .cra_module = THIS_MODULE,
439 .cra_u = {
440 .blkcipher = {
441 .min_keysize = BF_MIN_KEY_SIZE,
442 .max_keysize = BF_MAX_KEY_SIZE,
443 .setkey = blowfish_setkey,
444 .encrypt = ecb_encrypt,
445 .decrypt = ecb_decrypt,
446 },
447 },
448}, {
449 .cra_name = "__cbc-blowfish-avx2",
450 .cra_driver_name = "__driver-cbc-blowfish-avx2",
451 .cra_priority = 0,
452 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
453 .cra_blocksize = BF_BLOCK_SIZE,
454 .cra_ctxsize = sizeof(struct bf_ctx),
455 .cra_alignmask = 0,
456 .cra_type = &crypto_blkcipher_type,
457 .cra_module = THIS_MODULE,
458 .cra_u = {
459 .blkcipher = {
460 .min_keysize = BF_MIN_KEY_SIZE,
461 .max_keysize = BF_MAX_KEY_SIZE,
462 .setkey = blowfish_setkey,
463 .encrypt = cbc_encrypt,
464 .decrypt = cbc_decrypt,
465 },
466 },
467}, {
468 .cra_name = "__ctr-blowfish-avx2",
469 .cra_driver_name = "__driver-ctr-blowfish-avx2",
470 .cra_priority = 0,
471 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
472 .cra_blocksize = 1,
473 .cra_ctxsize = sizeof(struct bf_ctx),
474 .cra_alignmask = 0,
475 .cra_type = &crypto_blkcipher_type,
476 .cra_module = THIS_MODULE,
477 .cra_u = {
478 .blkcipher = {
479 .min_keysize = BF_MIN_KEY_SIZE,
480 .max_keysize = BF_MAX_KEY_SIZE,
481 .ivsize = BF_BLOCK_SIZE,
482 .setkey = blowfish_setkey,
483 .encrypt = ctr_crypt,
484 .decrypt = ctr_crypt,
485 },
486 },
487}, {
488 .cra_name = "ecb(blowfish)",
489 .cra_driver_name = "ecb-blowfish-avx2",
490 .cra_priority = 400,
491 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
492 .cra_blocksize = BF_BLOCK_SIZE,
493 .cra_ctxsize = sizeof(struct async_helper_ctx),
494 .cra_alignmask = 0,
495 .cra_type = &crypto_ablkcipher_type,
496 .cra_module = THIS_MODULE,
497 .cra_init = ablk_init,
498 .cra_exit = ablk_exit,
499 .cra_u = {
500 .ablkcipher = {
501 .min_keysize = BF_MIN_KEY_SIZE,
502 .max_keysize = BF_MAX_KEY_SIZE,
503 .setkey = ablk_set_key,
504 .encrypt = ablk_encrypt,
505 .decrypt = ablk_decrypt,
506 },
507 },
508}, {
509 .cra_name = "cbc(blowfish)",
510 .cra_driver_name = "cbc-blowfish-avx2",
511 .cra_priority = 400,
512 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
513 .cra_blocksize = BF_BLOCK_SIZE,
514 .cra_ctxsize = sizeof(struct async_helper_ctx),
515 .cra_alignmask = 0,
516 .cra_type = &crypto_ablkcipher_type,
517 .cra_module = THIS_MODULE,
518 .cra_init = ablk_init,
519 .cra_exit = ablk_exit,
520 .cra_u = {
521 .ablkcipher = {
522 .min_keysize = BF_MIN_KEY_SIZE,
523 .max_keysize = BF_MAX_KEY_SIZE,
524 .ivsize = BF_BLOCK_SIZE,
525 .setkey = ablk_set_key,
526 .encrypt = __ablk_encrypt,
527 .decrypt = ablk_decrypt,
528 },
529 },
530}, {
531 .cra_name = "ctr(blowfish)",
532 .cra_driver_name = "ctr-blowfish-avx2",
533 .cra_priority = 400,
534 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
535 .cra_blocksize = 1,
536 .cra_ctxsize = sizeof(struct async_helper_ctx),
537 .cra_alignmask = 0,
538 .cra_type = &crypto_ablkcipher_type,
539 .cra_module = THIS_MODULE,
540 .cra_init = ablk_init,
541 .cra_exit = ablk_exit,
542 .cra_u = {
543 .ablkcipher = {
544 .min_keysize = BF_MIN_KEY_SIZE,
545 .max_keysize = BF_MAX_KEY_SIZE,
546 .ivsize = BF_BLOCK_SIZE,
547 .setkey = ablk_set_key,
548 .encrypt = ablk_encrypt,
549 .decrypt = ablk_encrypt,
550 .geniv = "chainiv",
551 },
552 },
553} };
554
555
556static int __init init(void)
557{
558 u64 xcr0;
559
560 if (!cpu_has_avx2 || !cpu_has_osxsave) {
561 pr_info("AVX2 instructions are not detected.\n");
562 return -ENODEV;
563 }
564
565 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
566 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
567 pr_info("AVX detected but unusable.\n");
568 return -ENODEV;
569 }
570
571 return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
572}
573
574static void __exit fini(void)
575{
576 crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
577}
578
579module_init(init);
580module_exit(fini);
581
582MODULE_LICENSE("GPL");
583MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
584MODULE_ALIAS("blowfish");
585MODULE_ALIAS("blowfish-asm");
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 50ec333b70e6..3548d76dbaa9 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Glue Code for assembler optimized version of Blowfish 2 * Glue Code for assembler optimized version of Blowfish
3 * 3 *
4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 4 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
5 * 5 *
6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: 6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> 7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,40 +32,24 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/types.h> 33#include <linux/types.h>
34#include <crypto/algapi.h> 34#include <crypto/algapi.h>
35#include <asm/crypto/blowfish.h>
35 36
36/* regular block cipher functions */ 37/* regular block cipher functions */
37asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, 38asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
38 bool xor); 39 bool xor);
40EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
41
39asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); 42asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
43EXPORT_SYMBOL_GPL(blowfish_dec_blk);
40 44
41/* 4-way parallel cipher functions */ 45/* 4-way parallel cipher functions */
42asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, 46asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
43 const u8 *src, bool xor); 47 const u8 *src, bool xor);
48EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
49
44asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, 50asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
45 const u8 *src); 51 const u8 *src);
46 52EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
47static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
48{
49 __blowfish_enc_blk(ctx, dst, src, false);
50}
51
52static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
53 const u8 *src)
54{
55 __blowfish_enc_blk(ctx, dst, src, true);
56}
57
58static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
59 const u8 *src)
60{
61 __blowfish_enc_blk_4way(ctx, dst, src, false);
62}
63
64static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
65 const u8 *src)
66{
67 __blowfish_enc_blk_4way(ctx, dst, src, true);
68}
69 53
70static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) 54static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
71{ 55{
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 93fe929d1cee..1243272605aa 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -278,6 +278,7 @@ extern const char * const x86_power_flags[32];
278#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) 278#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
279#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) 279#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES)
280#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) 280#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX)
281#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
281#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) 282#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
282#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) 283#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
283#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) 284#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h
new file mode 100644
index 000000000000..f097b2face10
--- /dev/null
+++ b/arch/x86/include/asm/crypto/blowfish.h
@@ -0,0 +1,43 @@
1#ifndef ASM_X86_BLOWFISH_H
2#define ASM_X86_BLOWFISH_H
3
4#include <linux/crypto.h>
5#include <crypto/blowfish.h>
6
7#define BF_PARALLEL_BLOCKS 4
8
9/* regular block cipher functions */
10asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
11 bool xor);
12asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
13
14/* 4-way parallel cipher functions */
15asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
16 const u8 *src, bool xor);
17asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
18 const u8 *src);
19
20static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
21{
22 __blowfish_enc_blk(ctx, dst, src, false);
23}
24
25static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
26 const u8 *src)
27{
28 __blowfish_enc_blk(ctx, dst, src, true);
29}
30
31static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
32 const u8 *src)
33{
34 __blowfish_enc_blk_4way(ctx, dst, src, false);
35}
36
37static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
38 const u8 *src)
39{
40 __blowfish_enc_blk_4way(ctx, dst, src, true);
41}
42
43#endif
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 0e7a23723b45..6b9564f91168 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -820,6 +820,24 @@ config CRYPTO_BLOWFISH_X86_64
820 See also: 820 See also:
821 <http://www.schneier.com/blowfish.html> 821 <http://www.schneier.com/blowfish.html>
822 822
823config CRYPTO_BLOWFISH_AVX2_X86_64
824 tristate "Blowfish cipher algorithm (x86_64/AVX2)"
825 depends on X86 && 64BIT
826 select CRYPTO_ALGAPI
827 select CRYPTO_CRYPTD
828 select CRYPTO_ABLK_HELPER_X86
829 select CRYPTO_BLOWFISH_COMMON
830 select CRYPTO_BLOWFISH_X86_64
831 help
832 Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier.
833
834 This is a variable key length cipher which can use keys from 32
835 bits to 448 bits in length. It's fast, simple and specifically
836 designed for use on "large microprocessors".
837
838 See also:
839 <http://www.schneier.com/blowfish.html>
840
823config CRYPTO_CAMELLIA 841config CRYPTO_CAMELLIA
824 tristate "Camellia cipher algorithms" 842 tristate "Camellia cipher algorithms"
825 depends on CRYPTO 843 depends on CRYPTO
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 380708477b35..f3effb42531e 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1655,6 +1655,9 @@ static const struct alg_test_desc alg_test_descs[] = {
1655 .test = alg_test_null, 1655 .test = alg_test_null,
1656 .fips_allowed = 1, 1656 .fips_allowed = 1,
1657 }, { 1657 }, {
1658 .alg = "__driver-cbc-blowfish-avx2",
1659 .test = alg_test_null,
1660 }, {
1658 .alg = "__driver-cbc-camellia-aesni", 1661 .alg = "__driver-cbc-camellia-aesni",
1659 .test = alg_test_null, 1662 .test = alg_test_null,
1660 }, { 1663 }, {
@@ -1677,6 +1680,9 @@ static const struct alg_test_desc alg_test_descs[] = {
1677 .test = alg_test_null, 1680 .test = alg_test_null,
1678 .fips_allowed = 1, 1681 .fips_allowed = 1,
1679 }, { 1682 }, {
1683 .alg = "__driver-ecb-blowfish-avx2",
1684 .test = alg_test_null,
1685 }, {
1680 .alg = "__driver-ecb-camellia-aesni", 1686 .alg = "__driver-ecb-camellia-aesni",
1681 .test = alg_test_null, 1687 .test = alg_test_null,
1682 }, { 1688 }, {
@@ -1948,6 +1954,9 @@ static const struct alg_test_desc alg_test_descs[] = {
1948 .test = alg_test_null, 1954 .test = alg_test_null,
1949 .fips_allowed = 1, 1955 .fips_allowed = 1,
1950 }, { 1956 }, {
1957 .alg = "cryptd(__driver-cbc-blowfish-avx2)",
1958 .test = alg_test_null,
1959 }, {
1951 .alg = "cryptd(__driver-cbc-camellia-aesni)", 1960 .alg = "cryptd(__driver-cbc-camellia-aesni)",
1952 .test = alg_test_null, 1961 .test = alg_test_null,
1953 }, { 1962 }, {
@@ -1955,6 +1964,9 @@ static const struct alg_test_desc alg_test_descs[] = {
1955 .test = alg_test_null, 1964 .test = alg_test_null,
1956 .fips_allowed = 1, 1965 .fips_allowed = 1,
1957 }, { 1966 }, {
1967 .alg = "cryptd(__driver-ecb-blowfish-avx2)",
1968 .test = alg_test_null,
1969 }, {
1958 .alg = "cryptd(__driver-ecb-camellia-aesni)", 1970 .alg = "cryptd(__driver-ecb-camellia-aesni)",
1959 .test = alg_test_null, 1971 .test = alg_test_null,
1960 }, { 1972 }, {