author	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2013-06-08 05:17:47 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2013-06-21 02:44:29 -0400
commit	99f42f937a080995b34e1ed75ed6934b5f96f9ca (patch)
tree	1a9c3482104dd4d99dfc1b839c02678b6a550a53 /arch/x86/crypto
parent	3d387ef08c40382315b8e9baa4bc9a07f7c49fce (diff)
Revert "crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher"
This reverts commit cf1521a1a5e21fd1e79a458605c4282fbfbbeee2.

The instruction (vpgatherdd) that this implementation relied on turned out
to be a slow performer on real hardware (i5-4570). The previous 8-way
twofish/AVX implementation is therefore faster, and this implementation
should be removed.

Converting this implementation to use the same method as in twofish/AVX
for table look-ups would give an additional ~3% speed-up over twofish/AVX,
but would hardly be worth the added code and binary size.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
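For illustration only (this sketch is not part of the patch): the reverted g16() macro in the diff below implements Twofish's key-dependent g() function as four vpgatherdd table look-ups XOR-combined per 32-bit word. In C intrinsics the access pattern is roughly the following, where the name g16_sketch and the s[4][256] table layout are hypothetical stand-ins for the context's s-box tables:

	#include <immintrin.h>
	#include <stdint.h>

	/* Hypothetical sketch of the gather-based s-box look-up pattern.
	 * Each _mm256_i32gather_epi32() maps to one vpgatherdd -- the
	 * instruction the log above identifies as slow on the i5-4570. */
	static inline __m256i g16_sketch(const uint32_t s[4][256], __m256i x)
	{
		const __m256i mask = _mm256_set1_epi32(0xff);
		__m256i r;

		/* byte 0 indexes s[0], byte 1 indexes s[1], and so on;
		 * the four gathered words are XORed together as in g() */
		r = _mm256_i32gather_epi32((const int *)s[0],
					   _mm256_and_si256(x, mask), 4);
		r = _mm256_xor_si256(r, _mm256_i32gather_epi32((const int *)s[1],
				_mm256_and_si256(_mm256_srli_epi32(x, 8), mask), 4));
		r = _mm256_xor_si256(r, _mm256_i32gather_epi32((const int *)s[2],
				_mm256_and_si256(_mm256_srli_epi32(x, 16), mask), 4));
		r = _mm256_xor_si256(r, _mm256_i32gather_epi32((const int *)s[3],
				_mm256_srli_epi32(x, 24), 4));
		return r;
	}

On Haswell parts such as the i5-4570, each gather is microcoded into per-lane loads, so the eight-lane gathers give no advantage over the twofish/AVX code that extracts bytes and loads table entries one at a time, which is consistent with the measurement cited above.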
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--	arch/x86/crypto/Makefile              |    2 -
-rw-r--r--	arch/x86/crypto/twofish-avx2-asm_64.S |  600 ----------
-rw-r--r--	arch/x86/crypto/twofish_avx2_glue.c   |  584 ----------
-rw-r--r--	arch/x86/crypto/twofish_avx_glue.c    |   14 +-
4 files changed, 2 insertions(+), 1198 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9ce341839f4a..7d6ba9db1be9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -43,7 +43,6 @@ endif
 ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -73,7 +72,6 @@ endif
 ifeq ($(avx2_supported),yes)
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
-	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S
deleted file mode 100644
index e1a83b9cd389..000000000000
--- a/arch/x86/crypto/twofish-avx2-asm_64.S
+++ /dev/null
@@ -1,600 +0,0 @@
-/*
- * x86_64/AVX2 assembler optimized version of Twofish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/linkage.h>
-#include "glue_helper-asm-avx2.S"
-
-.file "twofish-avx2-asm_64.S"
-
-.data
-.align 16
-
-.Lvpshufb_mask0:
-.long 0x80808000
-.long 0x80808004
-.long 0x80808008
-.long 0x8080800c
-
-.Lbswap128_mask:
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lxts_gf128mul_and_shl1_mask_0:
-	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-.Lxts_gf128mul_and_shl1_mask_1:
-	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-
-.text
-
-/* structure of crypto context */
-#define s0 0
-#define s1 1024
-#define s2 2048
-#define s3 3072
-#define w 4096
-#define k 4128
-
-/* register macros */
-#define CTX %rdi
-
-#define RS0 CTX
-#define RS1 %r8
-#define RS2 %r9
-#define RS3 %r10
-#define RK %r11
-#define RW %rax
-#define RROUND %r12
-#define RROUNDd %r12d
-
-#define RA0 %ymm8
-#define RB0 %ymm9
-#define RC0 %ymm10
-#define RD0 %ymm11
-#define RA1 %ymm12
-#define RB1 %ymm13
-#define RC1 %ymm14
-#define RD1 %ymm15
-
-/* temp regs */
-#define RX0 %ymm0
-#define RY0 %ymm1
-#define RX1 %ymm2
-#define RY1 %ymm3
-#define RT0 %ymm4
-#define RIDX %ymm5
-
-#define RX0x %xmm0
-#define RY0x %xmm1
-#define RX1x %xmm2
-#define RY1x %xmm3
-#define RT0x %xmm4
-
-/* vpgatherdd mask and '-1' */
-#define RNOT %ymm6
-
-/* byte mask, (-1 >> 24) */
-#define RBYTE %ymm7
-
-/**********************************************************************
-  16-way AVX2 twofish
- **********************************************************************/
-#define init_round_constants() \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpsrld $24, RNOT, RBYTE; \
-	leaq k(CTX), RK; \
-	leaq w(CTX), RW; \
-	leaq s1(CTX), RS1; \
-	leaq s2(CTX), RS2; \
-	leaq s3(CTX), RS3; \
-
-#define g16(ab, rs0, rs1, rs2, rs3, xy) \
-	vpand RBYTE, ab ## 0, RIDX; \
-	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	\
-	vpand RBYTE, ab ## 1, RIDX; \
-	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	\
-	vpsrld $8, ab ## 0, RIDX; \
-	vpand RBYTE, RIDX, RIDX; \
-	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-	\
-	vpsrld $8, ab ## 1, RIDX; \
-	vpand RBYTE, RIDX, RIDX; \
-	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 1, xy ## 1; \
-	\
-	vpsrld $16, ab ## 0, RIDX; \
-	vpand RBYTE, RIDX, RIDX; \
-	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-	\
-	vpsrld $16, ab ## 1, RIDX; \
-	vpand RBYTE, RIDX, RIDX; \
-	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 1, xy ## 1; \
-	\
-	vpsrld $24, ab ## 0, RIDX; \
-	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-	\
-	vpsrld $24, ab ## 1, RIDX; \
-	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 1, xy ## 1;
-
-#define g1_16(a, x) \
-	g16(a, RS0, RS1, RS2, RS3, x);
-
-#define g2_16(b, y) \
-	g16(b, RS1, RS2, RS3, RS0, y);
-
-#define encrypt_round_end16(a, b, c, d, nk) \
-	vpaddd RY0, RX0, RX0; \
-	vpaddd RX0, RY0, RY0; \
-	vpbroadcastd nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RX0, RX0; \
-	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RY0, RY0; \
-	\
-	vpxor RY0, d ## 0, d ## 0; \
-	\
-	vpxor RX0, c ## 0, c ## 0; \
-	vpsrld $1, c ## 0, RT0; \
-	vpslld $31, c ## 0, c ## 0; \
-	vpor RT0, c ## 0, c ## 0; \
-	\
-	vpaddd RY1, RX1, RX1; \
-	vpaddd RX1, RY1, RY1; \
-	vpbroadcastd nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RX1, RX1; \
-	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RY1, RY1; \
-	\
-	vpxor RY1, d ## 1, d ## 1; \
-	\
-	vpxor RX1, c ## 1, c ## 1; \
-	vpsrld $1, c ## 1, RT0; \
-	vpslld $31, c ## 1, c ## 1; \
-	vpor RT0, c ## 1, c ## 1; \
-
-#define encrypt_round16(a, b, c, d, nk) \
-	g2_16(b, RY); \
-	\
-	vpslld $1, b ## 0, RT0; \
-	vpsrld $31, b ## 0, b ## 0; \
-	vpor RT0, b ## 0, b ## 0; \
-	\
-	vpslld $1, b ## 1, RT0; \
-	vpsrld $31, b ## 1, b ## 1; \
-	vpor RT0, b ## 1, b ## 1; \
-	\
-	g1_16(a, RX); \
-	\
-	encrypt_round_end16(a, b, c, d, nk);
-
-#define encrypt_round_first16(a, b, c, d, nk) \
-	vpslld $1, d ## 0, RT0; \
-	vpsrld $31, d ## 0, d ## 0; \
-	vpor RT0, d ## 0, d ## 0; \
-	\
-	vpslld $1, d ## 1, RT0; \
-	vpsrld $31, d ## 1, d ## 1; \
-	vpor RT0, d ## 1, d ## 1; \
-	\
-	encrypt_round16(a, b, c, d, nk);
-
-#define encrypt_round_last16(a, b, c, d, nk) \
-	g2_16(b, RY); \
-	\
-	g1_16(a, RX); \
-	\
-	encrypt_round_end16(a, b, c, d, nk);
-
-#define decrypt_round_end16(a, b, c, d, nk) \
-	vpaddd RY0, RX0, RX0; \
-	vpaddd RX0, RY0, RY0; \
-	vpbroadcastd nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RX0, RX0; \
-	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RY0, RY0; \
-	\
-	vpxor RX0, c ## 0, c ## 0; \
-	\
-	vpxor RY0, d ## 0, d ## 0; \
-	vpsrld $1, d ## 0, RT0; \
-	vpslld $31, d ## 0, d ## 0; \
-	vpor RT0, d ## 0, d ## 0; \
-	\
-	vpaddd RY1, RX1, RX1; \
-	vpaddd RX1, RY1, RY1; \
-	vpbroadcastd nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RX1, RX1; \
-	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RY1, RY1; \
-	\
-	vpxor RX1, c ## 1, c ## 1; \
-	\
-	vpxor RY1, d ## 1, d ## 1; \
-	vpsrld $1, d ## 1, RT0; \
-	vpslld $31, d ## 1, d ## 1; \
-	vpor RT0, d ## 1, d ## 1;
-
-#define decrypt_round16(a, b, c, d, nk) \
-	g1_16(a, RX); \
-	\
-	vpslld $1, a ## 0, RT0; \
-	vpsrld $31, a ## 0, a ## 0; \
-	vpor RT0, a ## 0, a ## 0; \
-	\
-	vpslld $1, a ## 1, RT0; \
-	vpsrld $31, a ## 1, a ## 1; \
-	vpor RT0, a ## 1, a ## 1; \
-	\
-	g2_16(b, RY); \
-	\
-	decrypt_round_end16(a, b, c, d, nk);
-
-#define decrypt_round_first16(a, b, c, d, nk) \
-	vpslld $1, c ## 0, RT0; \
-	vpsrld $31, c ## 0, c ## 0; \
-	vpor RT0, c ## 0, c ## 0; \
-	\
-	vpslld $1, c ## 1, RT0; \
-	vpsrld $31, c ## 1, c ## 1; \
-	vpor RT0, c ## 1, c ## 1; \
-	\
-	decrypt_round16(a, b, c, d, nk)
-
-#define decrypt_round_last16(a, b, c, d, nk) \
-	g1_16(a, RX); \
-	\
-	g2_16(b, RY); \
-	\
-	decrypt_round_end16(a, b, c, d, nk);
-
-#define encrypt_cycle16() \
-	encrypt_round16(RA, RB, RC, RD, 0); \
-	encrypt_round16(RC, RD, RA, RB, 8);
-
-#define encrypt_cycle_first16() \
-	encrypt_round_first16(RA, RB, RC, RD, 0); \
-	encrypt_round16(RC, RD, RA, RB, 8);
-
-#define encrypt_cycle_last16() \
-	encrypt_round16(RA, RB, RC, RD, 0); \
-	encrypt_round_last16(RC, RD, RA, RB, 8);
-
-#define decrypt_cycle16(n) \
-	decrypt_round16(RC, RD, RA, RB, 8); \
-	decrypt_round16(RA, RB, RC, RD, 0);
-
-#define decrypt_cycle_first16(n) \
-	decrypt_round_first16(RC, RD, RA, RB, 8); \
-	decrypt_round16(RA, RB, RC, RD, 0);
-
-#define decrypt_cycle_last16(n) \
-	decrypt_round16(RC, RD, RA, RB, 8); \
-	decrypt_round_last16(RA, RB, RC, RD, 0);
-
-#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
-	vpunpckhdq x1, x0, t2; \
-	vpunpckldq x1, x0, x0; \
-	\
-	vpunpckldq x3, x2, t1; \
-	vpunpckhdq x3, x2, x2; \
-	\
-	vpunpckhqdq t1, x0, x1; \
-	vpunpcklqdq t1, x0, x0; \
-	\
-	vpunpckhqdq x2, t2, x3; \
-	vpunpcklqdq x2, t2, x2;
-
-#define read_blocks8(offs,a,b,c,d) \
-	transpose_4x4(a, b, c, d, RX0, RY0);
-
-#define write_blocks8(offs,a,b,c,d) \
-	transpose_4x4(a, b, c, d, RX0, RY0);
-
-#define inpack_enc8(a,b,c,d) \
-	vpbroadcastd 4*0(RW), RT0; \
-	vpxor RT0, a, a; \
-	\
-	vpbroadcastd 4*1(RW), RT0; \
-	vpxor RT0, b, b; \
-	\
-	vpbroadcastd 4*2(RW), RT0; \
-	vpxor RT0, c, c; \
-	\
-	vpbroadcastd 4*3(RW), RT0; \
-	vpxor RT0, d, d;
-
-#define outunpack_enc8(a,b,c,d) \
-	vpbroadcastd 4*4(RW), RX0; \
-	vpbroadcastd 4*5(RW), RY0; \
-	vpxor RX0, c, RX0; \
-	vpxor RY0, d, RY0; \
-	\
-	vpbroadcastd 4*6(RW), RT0; \
-	vpxor RT0, a, c; \
-	vpbroadcastd 4*7(RW), RT0; \
-	vpxor RT0, b, d; \
-	\
-	vmovdqa RX0, a; \
-	vmovdqa RY0, b;
-
-#define inpack_dec8(a,b,c,d) \
-	vpbroadcastd 4*4(RW), RX0; \
-	vpbroadcastd 4*5(RW), RY0; \
-	vpxor RX0, a, RX0; \
-	vpxor RY0, b, RY0; \
-	\
-	vpbroadcastd 4*6(RW), RT0; \
-	vpxor RT0, c, a; \
-	vpbroadcastd 4*7(RW), RT0; \
-	vpxor RT0, d, b; \
-	\
-	vmovdqa RX0, c; \
-	vmovdqa RY0, d;
-
-#define outunpack_dec8(a,b,c,d) \
-	vpbroadcastd 4*0(RW), RT0; \
-	vpxor RT0, a, a; \
-	\
-	vpbroadcastd 4*1(RW), RT0; \
-	vpxor RT0, b, b; \
-	\
-	vpbroadcastd 4*2(RW), RT0; \
-	vpxor RT0, c, c; \
-	\
-	vpbroadcastd 4*3(RW), RT0; \
-	vpxor RT0, d, d;
-
-#define read_blocks16(a,b,c,d) \
-	read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define write_blocks16(a,b,c,d) \
-	write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define xor_blocks16(a,b,c,d) \
-	xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define inpack_enc16(a,b,c,d) \
-	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define outunpack_enc16(a,b,c,d) \
-	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define inpack_dec16(a,b,c,d) \
-	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define outunpack_dec16(a,b,c,d) \
-	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-.align 8
-__twofish_enc_blk16:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
-	 * output:
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
-	 */
-	init_round_constants();
-
-	read_blocks16(RA, RB, RC, RD);
-	inpack_enc16(RA, RB, RC, RD);
-
-	xorl RROUNDd, RROUNDd;
-	encrypt_cycle_first16();
-	movl $2, RROUNDd;
-
-.align 4
-.L__enc_loop:
-	encrypt_cycle16();
-
-	addl $2, RROUNDd;
-	cmpl $14, RROUNDd;
-	jne .L__enc_loop;
-
-	encrypt_cycle_last16();
-
-	outunpack_enc16(RA, RB, RC, RD);
-	write_blocks16(RA, RB, RC, RD);
-
-	ret;
-ENDPROC(__twofish_enc_blk16)
-
-.align 8
-__twofish_dec_blk16:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
-	 * output:
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
-	 */
-	init_round_constants();
-
-	read_blocks16(RA, RB, RC, RD);
-	inpack_dec16(RA, RB, RC, RD);
-
-	movl $14, RROUNDd;
-	decrypt_cycle_first16();
-	movl $12, RROUNDd;
-
-.align 4
-.L__dec_loop:
-	decrypt_cycle16();
-
-	addl $-2, RROUNDd;
-	jnz .L__dec_loop;
-
-	decrypt_cycle_last16();
-
-	outunpack_dec16(RA, RB, RC, RD);
-	write_blocks16(RA, RB, RC, RD);
-
-	ret;
-ENDPROC(__twofish_dec_blk16)
-
-ENTRY(twofish_ecb_enc_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_enc_blk16;
-
-	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ecb_enc_16way)
-
-ENTRY(twofish_ecb_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_dec_blk16;
-
-	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ecb_dec_16way)
-
-ENTRY(twofish_cbc_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_dec_blk16;
-
-	store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
-			RX0);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_cbc_dec_16way)
-
-ENTRY(twofish_ctr_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (little endian, 128bit)
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
-		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
-		       RBYTE);
-
-	call __twofish_enc_blk16;
-
-	store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ctr_16way)
-
-.align 8
-twofish_xts_crypt_16way:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 *	%r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
-		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
-		       .Lxts_gf128mul_and_shl1_mask_0,
-		       .Lxts_gf128mul_and_shl1_mask_1);
-
-	call *%r8;
-
-	store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_xts_crypt_16way)
-
-ENTRY(twofish_xts_enc_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 */
-	leaq __twofish_enc_blk16, %r8;
-	jmp twofish_xts_crypt_16way;
-ENDPROC(twofish_xts_enc_16way)
-
-ENTRY(twofish_xts_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 */
-	leaq __twofish_dec_blk16, %r8;
-	jmp twofish_xts_crypt_16way;
-ENDPROC(twofish_xts_dec_16way)
diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c
deleted file mode 100644
index ce33b5be64ee..000000000000
--- a/arch/x86/crypto/twofish_avx2_glue.c
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- * Glue Code for x86_64/AVX2 assembler optimized version of Twofish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/algapi.h>
-#include <crypto/ctr.h>
-#include <crypto/twofish.h>
-#include <crypto/lrw.h>
-#include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
-#include <asm/crypto/twofish.h>
-#include <asm/crypto/ablk_helper.h>
-#include <asm/crypto/glue_helper.h>
-#include <crypto/scatterwalk.h>
-
-#define TF_AVX2_PARALLEL_BLOCKS 16
-
-/* 16-way AVX2 parallel cipher functions */
-asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src);
-asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src);
-asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
-
-asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src,
-				  le128 *iv);
-
-asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src, le128 *iv);
-asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src, le128 *iv);
-
-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__twofish_enc_blk_3way(ctx, dst, src, false);
-}
-
-static const struct common_glue_ctx twofish_enc = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_ctr = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_enc_xts = {
-	.num_funcs = 3,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec_cbc = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec_xts = {
-	.num_funcs = 3,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
-	} }
-};
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
-				       dst, src, nbytes);
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
-				       nbytes);
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
-}
-
-static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	/* since reusing AVX functions, starts using FPU at 8 parallel blocks */
-	return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
-}
-
-static inline void twofish_fpu_end(bool fpu_enabled)
-{
-	glue_fpu_end(fpu_enabled);
-}
-
-struct crypt_priv {
-	struct twofish_ctx *ctx;
-	bool fpu_enabled;
-};
-
-static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
-{
-	const unsigned int bsize = TF_BLOCK_SIZE;
-	struct crypt_priv *ctx = priv;
-	int i;
-
-	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
-
-	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
-		twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
-		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
-	}
-
-	while (nbytes >= 8 * bsize) {
-		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 8;
-		nbytes -= bsize * 8;
-	}
-
-	while (nbytes >= 3 * bsize) {
-		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 3;
-		nbytes -= bsize * 3;
-	}
-
-	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-		twofish_enc_blk(ctx->ctx, srcdst, srcdst);
-}
-
-static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
-{
-	const unsigned int bsize = TF_BLOCK_SIZE;
-	struct crypt_priv *ctx = priv;
-	int i;
-
-	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
-
-	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
-		twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
-		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
-	}
-
-	while (nbytes >= 8 * bsize) {
-		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 8;
-		nbytes -= bsize * 8;
-	}
-
-	while (nbytes >= 3 * bsize) {
-		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 3;
-		nbytes -= bsize * 3;
-	}
-
-	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-		twofish_dec_blk(ctx->ctx, srcdst, srcdst);
-}
-
-static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->twofish_ctx,
-		.fpu_enabled = false,
-	};
-	struct lrw_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.table_ctx = &ctx->lrw_table,
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = lrw_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
-}
-
-static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->twofish_ctx,
-		.fpu_enabled = false,
-	};
-	struct lrw_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.table_ctx = &ctx->lrw_table,
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = lrw_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
-}
-
-static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-
-	return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
-				     XTS_TWEAK_CAST(twofish_enc_blk),
-				     &ctx->tweak_ctx, &ctx->crypt_ctx);
-}
-
-static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-
-	return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
-				     XTS_TWEAK_CAST(twofish_enc_blk),
-				     &ctx->tweak_ctx, &ctx->crypt_ctx);
-}
-
-static struct crypto_alg tf_algs[10] = { {
-	.cra_name = "__ecb-twofish-avx2",
-	.cra_driver_name = "__driver-ecb-twofish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = TF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct twofish_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE,
-			.max_keysize = TF_MAX_KEY_SIZE,
-			.setkey = twofish_setkey,
-			.encrypt = ecb_encrypt,
-			.decrypt = ecb_decrypt,
-		},
-	},
-}, {
-	.cra_name = "__cbc-twofish-avx2",
-	.cra_driver_name = "__driver-cbc-twofish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = TF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct twofish_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE,
-			.max_keysize = TF_MAX_KEY_SIZE,
-			.setkey = twofish_setkey,
-			.encrypt = cbc_encrypt,
-			.decrypt = cbc_decrypt,
-		},
-	},
-}, {
-	.cra_name = "__ctr-twofish-avx2",
-	.cra_driver_name = "__driver-ctr-twofish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = 1,
-	.cra_ctxsize = sizeof(struct twofish_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE,
-			.max_keysize = TF_MAX_KEY_SIZE,
-			.ivsize = TF_BLOCK_SIZE,
-			.setkey = twofish_setkey,
-			.encrypt = ctr_crypt,
-			.decrypt = ctr_crypt,
-		},
-	},
-}, {
-	.cra_name = "__lrw-twofish-avx2",
-	.cra_driver_name = "__driver-lrw-twofish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = TF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct twofish_lrw_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_exit = lrw_twofish_exit_tfm,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE +
-				       TF_BLOCK_SIZE,
-			.max_keysize = TF_MAX_KEY_SIZE +
-				       TF_BLOCK_SIZE,
-			.ivsize = TF_BLOCK_SIZE,
-			.setkey = lrw_twofish_setkey,
-			.encrypt = lrw_encrypt,
-			.decrypt = lrw_decrypt,
-		},
-	},
-}, {
-	.cra_name = "__xts-twofish-avx2",
-	.cra_driver_name = "__driver-xts-twofish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = TF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct twofish_xts_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE * 2,
-			.max_keysize = TF_MAX_KEY_SIZE * 2,
-			.ivsize = TF_BLOCK_SIZE,
-			.setkey = xts_twofish_setkey,
-			.encrypt = xts_encrypt,
-			.decrypt = xts_decrypt,
-		},
-	},
-}, {
-	.cra_name = "ecb(twofish)",
-	.cra_driver_name = "ecb-twofish-avx2",
-	.cra_priority = 500,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = TF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE,
-			.max_keysize = TF_MAX_KEY_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = ablk_encrypt,
-			.decrypt = ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name = "cbc(twofish)",
-	.cra_driver_name = "cbc-twofish-avx2",
-	.cra_priority = 500,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = TF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE,
-			.max_keysize = TF_MAX_KEY_SIZE,
-			.ivsize = TF_BLOCK_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = __ablk_encrypt,
-			.decrypt = ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name = "ctr(twofish)",
-	.cra_driver_name = "ctr-twofish-avx2",
-	.cra_priority = 500,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = 1,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE,
-			.max_keysize = TF_MAX_KEY_SIZE,
-			.ivsize = TF_BLOCK_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = ablk_encrypt,
-			.decrypt = ablk_encrypt,
-			.geniv = "chainiv",
-		},
-	},
-}, {
-	.cra_name = "lrw(twofish)",
-	.cra_driver_name = "lrw-twofish-avx2",
-	.cra_priority = 500,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = TF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE +
-				       TF_BLOCK_SIZE,
-			.max_keysize = TF_MAX_KEY_SIZE +
-				       TF_BLOCK_SIZE,
-			.ivsize = TF_BLOCK_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = ablk_encrypt,
-			.decrypt = ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name = "xts(twofish)",
-	.cra_driver_name = "xts-twofish-avx2",
-	.cra_priority = 500,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = TF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = TF_MIN_KEY_SIZE * 2,
-			.max_keysize = TF_MAX_KEY_SIZE * 2,
-			.ivsize = TF_BLOCK_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = ablk_encrypt,
-			.decrypt = ablk_decrypt,
-		},
-	},
-} };
-
-static int __init init(void)
-{
-	u64 xcr0;
-
-	if (!cpu_has_avx2 || !cpu_has_osxsave) {
-		pr_info("AVX2 instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX2 detected but unusable.\n");
-		return -ENODEV;
-	}
-
-	return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("twofish");
-MODULE_ALIAS("twofish-asm");
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 2047a562f6b3..a62ba541884e 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -50,26 +50,18 @@
 /* 8-way parallel cipher functions */
 asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way);
-
 asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way);
 
 asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way);
-
 asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
 				 const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_ctr_8way);
 
 asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_xts_enc_8way);
 asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_xts_dec_8way);
 
 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
@@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 	__twofish_enc_blk_3way(ctx, dst, src, false);
 }
 
-void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(twofish_enc_blk));
 }
-EXPORT_SYMBOL_GPL(twofish_xts_enc);
 
-void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(twofish_dec_blk));
 }
-EXPORT_SYMBOL_GPL(twofish_xts_dec);
 
 
 static const struct common_glue_ctx twofish_enc = {