 arch/x86/crypto/Makefile                  |   2 +
 arch/x86/crypto/serpent-avx2-asm_64.S     | 800 +++++++++++++++++++++++++++++
 arch/x86/crypto/serpent_avx2_glue.c       | 562 ++++++++++++++++++++++
 arch/x86/crypto/serpent_avx_glue.c        |  62 ++-
 arch/x86/include/asm/crypto/serpent-avx.h |  24 +
 crypto/Kconfig                            |  23 +
 crypto/testmgr.c                          |  15 +
 7 files changed, 1468 insertions(+), 20 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1f6e0c2e9140..a21af593ab8d 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -43,6 +43,7 @@ endif
 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
 	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
 
@@ -72,6 +73,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
 	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
 
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
new file mode 100644
index 000000000000..b222085cccac
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -0,0 +1,800 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on AVX assembler implementation of Serpent by:
+ *  Copyright © 2012 Johannes Goetzfried
+ *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "serpent-avx2-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+#define CTX %rdi
+
+#define RNOT %ymm0
+#define tp %ymm1
+
+#define RA1 %ymm2
+#define RA2 %ymm3
+#define RB1 %ymm4
+#define RB2 %ymm5
+#define RC1 %ymm6
+#define RC2 %ymm7
+#define RD1 %ymm8
+#define RD2 %ymm9
+#define RE1 %ymm10
+#define RE2 %ymm11
+
+#define RK0 %ymm12
+#define RK1 %ymm13
+#define RK2 %ymm14
+#define RK3 %ymm15
+
+#define RK0x %xmm12
+#define RK1x %xmm13
+#define RK2x %xmm14
+#define RK3x %xmm15
+
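+/*
+ * Each ymm register below holds the same 32-bit Serpent state word of
+ * eight different blocks, so every instruction advances eight blocks at
+ * once; the *1/*2 register sets together cover 16 blocks per call.  The
+ * S-boxes are computed with plain AND/OR/XOR/NOT sequences, and each is
+ * split into _1/_2 halves so the SP() macro further down can interleave
+ * round-key loads with the S-box computation.
+ */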
+#define S0_1(x0, x1, x2, x3, x4) \
+	vpor x0, x3, tp; \
+	vpxor x3, x0, x0; \
+	vpxor x2, x3, x4; \
+	vpxor RNOT, x4, x4; \
+	vpxor x1, tp, x3; \
+	vpand x0, x1, x1; \
+	vpxor x4, x1, x1; \
+	vpxor x0, x2, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+	vpxor x3, x0, x0; \
+	vpor x0, x4, x4; \
+	vpxor x2, x0, x0; \
+	vpand x1, x2, x2; \
+	vpxor x2, x3, x3; \
+	vpxor RNOT, x1, x1; \
+	vpxor x4, x2, x2; \
+	vpxor x2, x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+	vpxor x0, x1, tp; \
+	vpxor x3, x0, x0; \
+	vpxor RNOT, x3, x3; \
+	vpand tp, x1, x4; \
+	vpor tp, x0, x0; \
+	vpxor x2, x3, x3; \
+	vpxor x3, x0, x0; \
+	vpxor x3, tp, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+	vpxor x4, x3, x3; \
+	vpor x4, x1, x1; \
+	vpxor x2, x4, x4; \
+	vpand x0, x2, x2; \
+	vpxor x1, x2, x2; \
+	vpor x0, x1, x1; \
+	vpxor RNOT, x0, x0; \
+	vpxor x2, x0, x0; \
+	vpxor x1, x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+	vpxor RNOT, x3, x3; \
+	vpxor x0, x1, x1; \
+	vpand x2, x0, tp; \
+	vpxor x3, tp, tp; \
+	vpor x0, x3, x3; \
+	vpxor x1, x2, x2; \
+	vpxor x1, x3, x3; \
+	vpand tp, x1, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+	vpxor x2, tp, tp; \
+	vpand x3, x2, x2; \
+	vpor x1, x3, x3; \
+	vpxor RNOT, tp, tp; \
+	vpxor tp, x3, x3; \
+	vpxor tp, x0, x4; \
+	vpxor x2, tp, x0; \
+	vpor x2, x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+	vpxor x3, x1, tp; \
+	vpor x0, x3, x3; \
+	vpand x0, x1, x4; \
+	vpxor x2, x0, x0; \
+	vpxor tp, x2, x2; \
+	vpand x3, tp, x1; \
+	vpxor x3, x2, x2; \
+	vpor x4, x0, x0; \
+	vpxor x3, x4, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+	vpxor x0, x1, x1; \
+	vpand x3, x0, x0; \
+	vpand x4, x3, x3; \
+	vpxor x2, x3, x3; \
+	vpor x1, x4, x4; \
+	vpand x1, x2, x2; \
+	vpxor x3, x4, x4; \
+	vpxor x3, x0, x0; \
+	vpxor x2, x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+	vpand x0, x3, tp; \
+	vpxor x3, x0, x0; \
+	vpxor x2, tp, tp; \
+	vpor x3, x2, x2; \
+	vpxor x1, x0, x0; \
+	vpxor tp, x3, x4; \
+	vpor x0, x2, x2; \
+	vpxor x1, x2, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+	vpand x0, x1, x1; \
+	vpxor x4, x1, x1; \
+	vpand x2, x4, x4; \
+	vpxor tp, x2, x2; \
+	vpxor x0, x4, x4; \
+	vpor x1, tp, x3; \
+	vpxor RNOT, x1, x1; \
+	vpxor x0, x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+	vpor x0, x1, tp; \
+	vpxor tp, x2, x2; \
+	vpxor RNOT, x3, x3; \
+	vpxor x0, x1, x4; \
+	vpxor x2, x0, x0; \
+	vpand x4, tp, x1; \
+	vpor x3, x4, x4; \
+	vpxor x0, x4, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+	vpand x3, x0, x0; \
+	vpxor x3, x1, x1; \
+	vpxor x2, x3, x3; \
+	vpxor x1, x0, x0; \
+	vpand x4, x2, x2; \
+	vpxor x2, x1, x1; \
+	vpand x0, x2, x2; \
+	vpxor x2, x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+	vpxor x0, x3, x3; \
+	vpxor x2, x1, tp; \
+	vpxor x0, x2, x2; \
+	vpand x3, x0, x0; \
+	vpor x3, tp, tp; \
+	vpxor RNOT, x1, x4; \
+	vpxor tp, x0, x0; \
+	vpxor x2, tp, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+	vpxor x4, x3, x3; \
+	vpxor x0, x4, x4; \
+	vpand x0, x2, x2; \
+	vpxor x1, x4, x4; \
+	vpxor x3, x2, x2; \
+	vpand x1, x3, x3; \
+	vpxor x0, x3, x3; \
+	vpxor x2, x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+	vpxor RNOT, x1, tp; \
+	vpxor RNOT, x0, x0; \
+	vpand x2, tp, x1; \
+	vpxor x3, x1, x1; \
+	vpor tp, x3, x3; \
+	vpxor x2, tp, x4; \
+	vpxor x3, x2, x2; \
+	vpxor x0, x3, x3; \
+	vpor x1, x0, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+	vpand x0, x2, x2; \
+	vpxor x4, x0, x0; \
+	vpxor x3, x4, x4; \
+	vpand x0, x3, x3; \
+	vpxor x1, x4, x4; \
+	vpxor x4, x2, x2; \
+	vpxor x1, x3, x3; \
+	vpor x0, x4, x4; \
+	vpxor x1, x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+	vpxor x0, x1, x1; \
+	vpor x1, x3, tp; \
+	vpxor x1, x3, x4; \
+	vpxor RNOT, x0, x0; \
+	vpxor tp, x2, x2; \
+	vpxor x0, tp, x3; \
+	vpand x1, x0, x0; \
+	vpxor x2, x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+	vpand x3, x2, x2; \
+	vpxor x4, x3, x3; \
+	vpxor x3, x2, x2; \
+	vpxor x3, x1, x1; \
+	vpand x0, x3, x3; \
+	vpxor x0, x1, x1; \
+	vpxor x2, x0, x0; \
+	vpxor x3, x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+	vpxor x3, x1, x1; \
+	vpxor x2, x0, tp; \
+	vpxor RNOT, x2, x2; \
+	vpor x1, x0, x4; \
+	vpxor x3, x4, x4; \
+	vpand x1, x3, x3; \
+	vpxor x2, x1, x1; \
+	vpand x4, x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+	vpxor x1, x4, x4; \
+	vpor x3, x1, x1; \
+	vpxor tp, x3, x3; \
+	vpxor tp, x2, x2; \
+	vpor x4, tp, x0; \
+	vpxor x4, x2, x2; \
+	vpxor x0, x1, x1; \
+	vpxor x1, x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+	vpxor x1, x2, x2; \
+	vpxor RNOT, x3, tp; \
+	vpor x2, tp, tp; \
+	vpxor x3, x2, x2; \
+	vpxor x0, x3, x4; \
+	vpxor x1, tp, x3; \
+	vpor x2, x1, x1; \
+	vpxor x0, x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+	vpxor x4, x1, x1; \
+	vpor x3, x4, x4; \
+	vpxor x3, x2, x2; \
+	vpxor x2, x4, x4; \
+	vpand x1, x2, x2; \
+	vpxor x3, x2, x2; \
+	vpxor x4, x3, x3; \
+	vpxor x0, x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+	vpxor x1, x2, x2; \
+	vpand x2, x1, tp; \
+	vpxor x0, tp, tp; \
+	vpor x1, x0, x0; \
+	vpxor x3, x1, x4; \
+	vpxor x3, x0, x0; \
+	vpor tp, x3, x3; \
+	vpxor x2, tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+	vpxor x3, x1, x1; \
+	vpxor x2, x0, x0; \
+	vpxor x3, x2, x2; \
+	vpand x1, x3, x3; \
+	vpxor x0, x1, x1; \
+	vpand x2, x0, x0; \
+	vpxor x3, x4, x4; \
+	vpxor x0, x3, x3; \
+	vpxor x1, x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+	vpxor x3, x2, x2; \
+	vpand x1, x0, tp; \
+	vpxor x2, tp, tp; \
+	vpor x3, x2, x2; \
+	vpxor RNOT, x0, x4; \
+	vpxor tp, x1, x1; \
+	vpxor x2, tp, x0; \
+	vpand x4, x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+	vpxor x0, x2, x2; \
+	vpor x4, x0, x0; \
+	vpxor x3, x0, x0; \
+	vpand x2, x3, x3; \
+	vpxor x3, x4, x4; \
+	vpxor x1, x3, x3; \
+	vpand x0, x1, x1; \
+	vpxor x1, x4, x4; \
+	vpxor x3, x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+	vpor x2, x1, tp; \
+	vpxor x1, x2, x2; \
+	vpxor x3, tp, tp; \
+	vpand x1, x3, x3; \
+	vpxor x3, x2, x2; \
+	vpor x0, x3, x3; \
+	vpxor RNOT, x0, x0; \
+	vpxor x2, x3, x3; \
+	vpor x0, x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+	vpxor tp, x1, x4; \
+	vpxor x4, x2, x2; \
+	vpand x0, x4, x4; \
+	vpxor tp, x0, x0; \
+	vpxor x3, tp, x1; \
+	vpand x2, x0, x0; \
+	vpxor x3, x2, x2; \
+	vpxor x2, x0, x0; \
+	vpxor x4, x2, x2; \
+	vpxor x3, x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+	vpxor x2, x0, x0; \
+	vpand x3, x0, tp; \
+	vpxor x3, x2, x2; \
+	vpxor x2, tp, tp; \
+	vpxor x1, x3, x3; \
+	vpor x0, x2, x2; \
+	vpxor x3, x2, x2; \
+	vpand tp, x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+	vpxor RNOT, tp, tp; \
+	vpxor x1, x3, x3; \
+	vpand x2, x1, x1; \
+	vpxor tp, x0, x4; \
+	vpxor x4, x3, x3; \
+	vpxor x2, x4, x4; \
+	vpxor x1, tp, x0; \
+	vpxor x0, x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+	vpand x0, x3, tp; \
+	vpxor x2, x0, x0; \
+	vpor x3, x2, x2; \
+	vpxor x1, x3, x4; \
+	vpxor RNOT, x0, x0; \
+	vpor tp, x1, x1; \
+	vpxor x0, x4, x4; \
+	vpand x2, x0, x0; \
+	vpxor x1, x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+	vpand x2, x1, x1; \
+	vpxor x2, tp, x3; \
+	vpxor x3, x4, x4; \
+	vpand x3, x2, x2; \
+	vpor x0, x3, x3; \
+	vpxor x4, x1, x1; \
+	vpxor x4, x3, x3; \
+	vpand x0, x4, x4; \
+	vpxor x2, x4, x4;
+
+#define get_key(i,j,t) \
+	vpbroadcastd (4*(i)+(j))*4(CTX), t;
+
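+/*
+ * The expanded key is laid out as an array of 32-bit words, four per
+ * round: get_key() broadcasts subkey word j of round i into every dword
+ * lane of t with vpbroadcastd, so every block is XORed with the same
+ * round key.
+ */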
+#define K2(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	get_key(i, 1, RK1); \
+	get_key(i, 2, RK2); \
+	get_key(i, 3, RK3); \
+	vpxor RK0, x0 ## 1, x0 ## 1; \
+	vpxor RK1, x1 ## 1, x1 ## 1; \
+	vpxor RK2, x2 ## 1, x2 ## 1; \
+	vpxor RK3, x3 ## 1, x3 ## 1; \
+	vpxor RK0, x0 ## 2, x0 ## 2; \
+	vpxor RK1, x1 ## 2, x1 ## 2; \
+	vpxor RK2, x2 ## 2, x2 ## 2; \
+	vpxor RK3, x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+	vpslld $13, x0 ## 1, x4 ## 1; \
+	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
+	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3, x2 ## 1, x4 ## 1; \
+	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
+	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $13, x0 ## 2, x4 ## 2; \
+	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
+	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+	vpslld $3, x2 ## 2, x4 ## 2; \
+	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
+	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+	vpslld $1, x1 ## 1, x4 ## 1; \
+	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
+	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3, x0 ## 1, x4 ## 1; \
+	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+	get_key(i, 1, RK1); \
+	vpslld $1, x1 ## 2, x4 ## 2; \
+	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
+	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+	vpslld $3, x0 ## 2, x4 ## 2; \
+	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+	get_key(i, 3, RK3); \
+	vpslld $7, x3 ## 1, x4 ## 1; \
+	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
+	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpslld $7, x1 ## 1, x4 ## 1; \
+	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+	get_key(i, 0, RK0); \
+	vpslld $7, x3 ## 2, x4 ## 2; \
+	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
+	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+	vpslld $7, x1 ## 2, x4 ## 2; \
+	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+	get_key(i, 2, RK2); \
+	vpxor RK1, x1 ## 1, x1 ## 1; \
+	vpxor RK3, x3 ## 1, x3 ## 1; \
+	vpslld $5, x0 ## 1, x4 ## 1; \
+	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
+	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $22, x2 ## 1, x4 ## 1; \
+	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
+	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor RK0, x0 ## 1, x0 ## 1; \
+	vpxor RK2, x2 ## 1, x2 ## 1; \
+	vpxor RK1, x1 ## 2, x1 ## 2; \
+	vpxor RK3, x3 ## 2, x3 ## 2; \
+	vpslld $5, x0 ## 2, x4 ## 2; \
+	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
+	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+	vpslld $22, x2 ## 2, x4 ## 2; \
+	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
+	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor RK0, x0 ## 2, x0 ## 2; \
+	vpxor RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+	vpxor RK0, x0 ## 1, x0 ## 1; \
+	vpxor RK2, x2 ## 1, x2 ## 1; \
+	vpsrld $5, x0 ## 1, x4 ## 1; \
+	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
+	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor RK3, x3 ## 1, x3 ## 1; \
+	vpxor RK1, x1 ## 1, x1 ## 1; \
+	vpsrld $22, x2 ## 1, x4 ## 1; \
+	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
+	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor RK0, x0 ## 2, x0 ## 2; \
+	vpxor RK2, x2 ## 2, x2 ## 2; \
+	vpsrld $5, x0 ## 2, x4 ## 2; \
+	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
+	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+	vpxor RK3, x3 ## 2, x3 ## 2; \
+	vpxor RK1, x1 ## 2, x1 ## 2; \
+	vpsrld $22, x2 ## 2, x4 ## 2; \
+	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
+	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $7, x1 ## 1, x4 ## 1; \
+	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpsrld $1, x1 ## 1, x4 ## 1; \
+	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
+	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+	vpslld $7, x1 ## 2, x4 ## 2; \
+	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+	vpsrld $1, x1 ## 2, x4 ## 2; \
+	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
+	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+	vpsrld $7, x3 ## 1, x4 ## 1; \
+	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
+	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3, x0 ## 1, x4 ## 1; \
+	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpsrld $7, x3 ## 2, x4 ## 2; \
+	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
+	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+	vpslld $3, x0 ## 2, x4 ## 2; \
+	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+	vpsrld $13, x0 ## 1, x4 ## 1; \
+	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
+	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpsrld $3, x2 ## 1, x4 ## 1; \
+	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
+	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpsrld $13, x0 ## 2, x4 ## 2; \
+	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
+	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+	vpsrld $3, x2 ## 2, x4 ## 2; \
+	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
+	vpor x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 2, RK2); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 3, RK3); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	get_key(i, 1, RK1); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	vpunpckldq x1, x0, t0; \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x3; \
+	\
+	vpunpcklqdq t1, t0, x0; \
+	vpunpckhqdq t1, t0, x1; \
+	vpunpcklqdq x3, t2, x2; \
+	vpunpckhqdq x3, t2, x3;
+
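+/*
+ * read_blocks()/write_blocks() below use transpose_4x4() to convert
+ * between block order in memory and the word-sliced register layout:
+ * the dword/qword unpacks operate independently in each 128-bit lane,
+ * leaving each register with one state word from eight blocks.
+ */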
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+.align 8
+__serpent_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	K2(RA, RB, RC, RD, RE, 0);
+	S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+	S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+	S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+	S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+	S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+	S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+	S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+	S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+	S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+	S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+	S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+	S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+	S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+	S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+	S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+	S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+	S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+	S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+	S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+	S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+	S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+	S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+	S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+	S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+	S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+	S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+	S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+	S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+	S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+	S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+	S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+	S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+ENDPROC(__serpent_enc_blk16)
+
+.align 8
+__serpent_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+	 * output:
+	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	K2(RA, RB, RC, RD, RE, 32);
+	SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+	SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+	SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+	SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+	SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+	SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+	SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+	SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+	SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+	SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+	SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+	SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+	SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+	SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+	SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+	SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+	SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+	SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+	SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+	SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+	SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+	SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+	SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+	SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+	SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+	SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+	SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+	SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+	SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+	SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+	SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+	S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
+ENDPROC(__serpent_dec_blk16)
+
+ENTRY(serpent_ecb_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_enc_blk16;
+
+	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ecb_enc_16way)
+
+ENTRY(serpent_ecb_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk16;
+
+	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ecb_dec_16way)
+
+ENTRY(serpent_cbc_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk16;
+
+	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
+			RK0);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_cbc_dec_16way)
+
+ENTRY(serpent_ctr_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+
+	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       tp);
+
+	call __serpent_enc_blk16;
+
+	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ctr_16way)
+
+ENTRY(serpent_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	vzeroupper;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call __serpent_enc_blk16;
+
+	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_xts_enc_16way)
+
+ENTRY(serpent_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	vzeroupper;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call __serpent_dec_blk16;
+
+	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_xts_dec_16way)
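Two idioms recur throughout serpent-avx2-asm_64.S and are easier to follow in scalar C. First, AVX2 has no 32-bit vector rotate, so every rotation in LK2/KL2 is built from vpslld, vpsrld and vpor. Second, the .Lxts_gf128mul_and_shl1_mask constants implement the XTS tweak update, multiplication by α in GF(2¹²⁸): a one-bit left shift with a conditional reduction by 0x87 (mask_1 evidently encodes the same reduction for a two-position shift, 0x10e being 0x87 doubled, so consecutive 128-bit lanes can carry consecutive tweaks). A plain C sketch of both, for reference only and not part of the patch:

    #include <stdint.h>

    /* rotate built from shifts plus OR, as vpslld/vpsrld/vpor do above */
    static inline uint32_t rol32(uint32_t v, int n)
    {
            return (v << n) | (v >> (32 - n));
    }

    /* Serpent's linear transform on one block's four state words (cf. LK2) */
    static void serpent_lt(uint32_t x[4])
    {
            x[0] = rol32(x[0], 13);
            x[2] = rol32(x[2], 3);
            x[1] = rol32(x[1] ^ x[0] ^ x[2], 1);
            x[3] = rol32(x[3] ^ x[2] ^ (x[0] << 3), 7);
            x[0] = rol32(x[0] ^ x[1] ^ x[3], 5);
            x[2] = rol32(x[2] ^ x[3] ^ (x[1] << 7), 22);
    }

    /* XTS tweak step: t = t * x in GF(2^128) mod x^128 + x^7 + x^2 + x + 1 */
    static void gf128mul_x_ble(uint64_t t[2])   /* t[0] is the low quadword */
    {
            uint64_t carry = t[1] >> 63;        /* bit shifted out at the top */

            t[1] = (t[1] << 1) | (t[0] >> 63);
            t[0] = (t[0] << 1) ^ (carry * 0x87);  /* fold the carry back in */
    }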
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
new file mode 100644
index 000000000000..23aabc6c20a5
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -0,0 +1,562 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <crypto/serpent.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/serpent-avx.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define SERPENT_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+
+asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+				  le128 *iv);
+asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+
+static const struct common_glue_ctx serpent_enc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	/* since reusing AVX functions, starts using FPU at 8 parallel blocks */
+	return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct serpent_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__serpent_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__serpent_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg srp_algs[10] = { {
+	.cra_name = "__ecb-serpent-avx2",
+	.cra_driver_name = "__driver-ecb-serpent-avx2",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = SERPENT_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct serpent_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE,
+			.max_keysize = SERPENT_MAX_KEY_SIZE,
+			.setkey = serpent_setkey,
+			.encrypt = ecb_encrypt,
+			.decrypt = ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name = "__cbc-serpent-avx2",
+	.cra_driver_name = "__driver-cbc-serpent-avx2",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = SERPENT_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct serpent_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE,
+			.max_keysize = SERPENT_MAX_KEY_SIZE,
+			.setkey = serpent_setkey,
+			.encrypt = cbc_encrypt,
+			.decrypt = cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name = "__ctr-serpent-avx2",
+	.cra_driver_name = "__driver-ctr-serpent-avx2",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct serpent_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE,
+			.max_keysize = SERPENT_MAX_KEY_SIZE,
+			.ivsize = SERPENT_BLOCK_SIZE,
+			.setkey = serpent_setkey,
+			.encrypt = ctr_crypt,
+			.decrypt = ctr_crypt,
+		},
+	},
+}, {
+	.cra_name = "__lrw-serpent-avx2",
+	.cra_driver_name = "__driver-lrw-serpent-avx2",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = SERPENT_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct serpent_lrw_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[3].cra_list),
+	.cra_exit = lrw_serpent_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE +
+				       SERPENT_BLOCK_SIZE,
+			.max_keysize = SERPENT_MAX_KEY_SIZE +
+				       SERPENT_BLOCK_SIZE,
+			.ivsize = SERPENT_BLOCK_SIZE,
+			.setkey = lrw_serpent_setkey,
+			.encrypt = lrw_encrypt,
+			.decrypt = lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name = "__xts-serpent-avx2",
+	.cra_driver_name = "__driver-xts-serpent-avx2",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = SERPENT_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct serpent_xts_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[4].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize = SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize = SERPENT_BLOCK_SIZE,
+			.setkey = xts_serpent_setkey,
+			.encrypt = xts_encrypt,
+			.decrypt = xts_decrypt,
+		},
+	},
+}, {
+	.cra_name = "ecb(serpent)",
+	.cra_driver_name = "ecb-serpent-avx2",
+	.cra_priority = 600,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = SERPENT_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[5].cra_list),
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE,
+			.max_keysize = SERPENT_MAX_KEY_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name = "cbc(serpent)",
+	.cra_driver_name = "cbc-serpent-avx2",
+	.cra_priority = 600,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = SERPENT_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[6].cra_list),
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE,
+			.max_keysize = SERPENT_MAX_KEY_SIZE,
+			.ivsize = SERPENT_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = __ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name = "ctr(serpent)",
+	.cra_driver_name = "ctr-serpent-avx2",
+	.cra_priority = 600,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[7].cra_list),
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE,
+			.max_keysize = SERPENT_MAX_KEY_SIZE,
+			.ivsize = SERPENT_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_encrypt,
+			.geniv = "chainiv",
+		},
+	},
+}, {
+	.cra_name = "lrw(serpent)",
+	.cra_driver_name = "lrw-serpent-avx2",
+	.cra_priority = 600,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = SERPENT_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[8].cra_list),
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE +
+				       SERPENT_BLOCK_SIZE,
+			.max_keysize = SERPENT_MAX_KEY_SIZE +
+				       SERPENT_BLOCK_SIZE,
+			.ivsize = SERPENT_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name = "xts(serpent)",
+	.cra_driver_name = "xts-serpent-avx2",
+	.cra_priority = 600,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = SERPENT_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(srp_algs[9].cra_list),
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize = SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize = SERPENT_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+} };
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) {
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs));
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("serpent");
+MODULE_ALIAS("serpent-asm");
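The common_glue_ctx tables above drive a widest-first dispatch: as long as 16 or more blocks remain the 16-way AVX2 routines run, then the 8-way AVX routines, then one block at a time, and fpu_blocks_limit = 8 postpones the FPU section until at least eight blocks are queued (the same policy noted in serpent_fpu_begin()). A rough sketch of that selection loop, with simplified types and no scatterlist or FPU handling; the real loop lives in arch/x86/crypto/glue_helper.c:

    static void ecb_dispatch(const struct common_glue_ctx *gctx, void *ctx,
                             u8 *dst, const u8 *src, unsigned int nblocks)
    {
            unsigned int i;

            while (nblocks) {
                    /* funcs[] is ordered widest first: 16, 8, then 1 */
                    for (i = 0; i < gctx->num_funcs; i++) {
                            unsigned int n = gctx->funcs[i].num_blocks;

                            if (nblocks < n)
                                    continue;

                            gctx->funcs[i].fn_u.ecb(ctx, dst, src);
                            src += n * SERPENT_BLOCK_SIZE;
                            dst += n * SERPENT_BLOCK_SIZE;
                            nblocks -= n;
                            break;
                    }
            }
    }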
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 0f8519cf4ac2..9ae83cf8d21e 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -41,7 +41,32 @@
 #include <asm/crypto/ablk_helper.h>
 #include <asm/crypto/glue_helper.h>
 
-static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+/* 8-way parallel cipher functions */
+asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
+
+asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
+
+asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
+
+asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
+
+asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
+
+asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
+
+void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
@@ -51,18 +76,22 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
+EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
 
-static void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(__serpent_encrypt));
 }
+EXPORT_SYMBOL_GPL(serpent_xts_enc);
 
-static void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(__serpent_decrypt));
 }
+EXPORT_SYMBOL_GPL(serpent_xts_dec);
+
 
 static const struct common_glue_ctx serpent_enc = {
 	.num_funcs = 2,
@@ -86,7 +115,7 @@ static const struct common_glue_ctx serpent_ctr = {
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
 	}, {
 		.num_blocks = 1,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
 	} }
 };
 
@@ -224,13 +253,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
 }
 
-struct serpent_lrw_ctx {
-	struct lrw_table_ctx lrw_table;
-	struct serpent_ctx serpent_ctx;
-};
-
-static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
 {
 	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 	int err;
@@ -243,6 +267,7 @@ static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 	return lrw_init_table(&ctx->lrw_table, key + keylen -
 			      SERPENT_BLOCK_SIZE);
 }
+EXPORT_SYMBOL_GPL(lrw_serpent_setkey);
 
 static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -296,20 +321,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
-static void lrw_exit_tfm(struct crypto_tfm *tfm)
+void lrw_serpent_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	lrw_free_table(&ctx->lrw_table);
 }
+EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm);
 
-struct serpent_xts_ctx {
-	struct serpent_ctx tweak_ctx;
-	struct serpent_ctx crypt_ctx;
-};
-
-static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
 {
 	struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *flags = &tfm->crt_flags;
@@ -331,6 +352,7 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 	/* second half of xts-key is for tweak */
 	return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
 }
+EXPORT_SYMBOL_GPL(xts_serpent_setkey);
 
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -420,7 +442,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_alignmask = 0,
 	.cra_type = &crypto_blkcipher_type,
 	.cra_module = THIS_MODULE,
-	.cra_exit = lrw_exit_tfm,
+	.cra_exit = lrw_serpent_exit_tfm,
 	.cra_u = {
 		.blkcipher = {
 			.min_keysize = SERPENT_MIN_KEY_SIZE +
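The newly exported __serpent_crypt_ctr() above treats the IV as a 128-bit little-endian counter: it converts it to big-endian for the block-cipher call, XORs the keystream into the data, and increments the counter with carry. A free-standing sketch of that increment (the in-tree le128_inc() helper serves this role; the version below uses plain integers instead of the kernel's le128 type):

    #include <stdint.h>

    /* bump a 128-bit little-endian counter, low quadword first */
    static void ctr128_inc(uint64_t ctr[2])
    {
            if (++ctr[0] == 0)      /* low quadword wrapped around */
                    ++ctr[1];       /* carry into the high quadword */
    }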
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index 56e79cc57eaf..33c2b8a435da 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -6,6 +6,16 @@
 
 #define SERPENT_PARALLEL_BLOCKS 8
 
+struct serpent_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct serpent_ctx serpent_ctx;
+};
+
+struct serpent_xts_ctx {
+	struct serpent_ctx tweak_ctx;
+	struct serpent_ctx crypt_ctx;
+};
+
 asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 					 const u8 *src);
 asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
@@ -21,4 +31,18 @@ asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 					 const u8 *src, le128 *iv);
 
+extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+				le128 *iv);
+
+extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
+extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
+extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm);
+
+extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
 #endif
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 1ba48ddd4da1..9ad3d78c1075 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1131,6 +1131,29 @@ config CRYPTO_SERPENT_AVX_X86_64
 	  See also:
 	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
 
+config CRYPTO_SERPENT_AVX2_X86_64
+	tristate "Serpent cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_SERPENT
+	select CRYPTO_SERPENT_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+	  Keys are allowed to be from 0 to 256 bits in length, in steps
+	  of 8 bits.
+
+	  This module provides the Serpent cipher algorithm that processes
+	  16 blocks in parallel using the AVX2 instruction set.
+
+	  See also:
+	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
 config CRYPTO_TEA
 	tristate "TEA, XTEA and XETA cipher algorithms"
 	select CRYPTO_ALGAPI
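With this option built and the module loaded, requests for the generic algorithm names resolve to the AVX2 driver through its cra_priority of 600. A minimal kernel-side consumer as an illustration only; the function name and the all-zero 256-bit key are made up, while the calls are the standard ablkcipher interface of this kernel generation:

    #include <linux/crypto.h>
    #include <linux/err.h>

    static int serpent_avx2_smoke_test(void)
    {
            struct crypto_ablkcipher *tfm;
            static const u8 key[32];  /* any size from 0 to 256 bits, in 8-bit steps */
            int err;

            /* resolves to ctr-serpent-avx2 when it is the highest-priority provider */
            tfm = crypto_alloc_ablkcipher("ctr(serpent)", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            err = crypto_ablkcipher_setkey(tfm, key, sizeof(key));
            crypto_free_ablkcipher(tfm);
            return err;
    }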
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index fea7841dd6f3..f5e13dea8cc9 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1645,6 +1645,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "__cbc-serpent-avx",
 		.test = alg_test_null,
 	}, {
+		.alg = "__cbc-serpent-avx2",
+		.test = alg_test_null,
+	}, {
 		.alg = "__cbc-serpent-sse2",
 		.test = alg_test_null,
 	}, {
@@ -1673,6 +1676,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "__driver-cbc-serpent-avx",
 		.test = alg_test_null,
 	}, {
+		.alg = "__driver-cbc-serpent-avx2",
+		.test = alg_test_null,
+	}, {
 		.alg = "__driver-cbc-serpent-sse2",
 		.test = alg_test_null,
 	}, {
@@ -1701,6 +1707,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "__driver-ecb-serpent-avx",
 		.test = alg_test_null,
 	}, {
+		.alg = "__driver-ecb-serpent-avx2",
+		.test = alg_test_null,
+	}, {
 		.alg = "__driver-ecb-serpent-sse2",
 		.test = alg_test_null,
 	}, {
@@ -1969,6 +1978,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "cryptd(__driver-cbc-camellia-aesni)",
 		.test = alg_test_null,
 	}, {
+		.alg = "cryptd(__driver-cbc-serpent-avx2)",
+		.test = alg_test_null,
+	}, {
 		.alg = "cryptd(__driver-ecb-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
@@ -1988,6 +2000,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "cryptd(__driver-ecb-serpent-avx)",
 		.test = alg_test_null,
 	}, {
+		.alg = "cryptd(__driver-ecb-serpent-avx2)",
+		.test = alg_test_null,
+	}, {
 		.alg = "cryptd(__driver-ecb-serpent-sse2)",
 		.test = alg_test_null,
 	}, {