aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S761
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c719
-rw-r--r--arch/x86/include/asm/serpent.h32
-rw-r--r--crypto/Kconfig17
-rw-r--r--crypto/testmgr.c60
6 files changed, 1591 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3537d4b91f74..12ebdbd80ccb 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
11obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 11obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o 12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
14obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 15obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
15obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o 16obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
16 17
@@ -26,6 +27,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
26twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 27twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
27twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o 28twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
28salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 29salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
30serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
29 31
30aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 32aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
31 33
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 000000000000..7f24a1540821
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,761 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on crypto/serpent.c by
7 * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
8 * 2003 Herbert Valerio Riedel <hvr@gnu.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-sse2-x86_64-asm_64.S"
28.text
29
30#define CTX %rdi
31
32/**********************************************************************
33 8-way SSE2 serpent
34 **********************************************************************/
35#define RA1 %xmm0
36#define RB1 %xmm1
37#define RC1 %xmm2
38#define RD1 %xmm3
39#define RE1 %xmm4
40
41#define RA2 %xmm5
42#define RB2 %xmm6
43#define RC2 %xmm7
44#define RD2 %xmm8
45#define RE2 %xmm9
46
47#define RNOT %xmm10
48
49#define RK0 %xmm11
50#define RK1 %xmm12
51#define RK2 %xmm13
52#define RK3 %xmm14
53
54#define S0_1(x0, x1, x2, x3, x4) \
55 movdqa x3, x4; \
56 por x0, x3; \
57 pxor x4, x0; \
58 pxor x2, x4; \
59 pxor RNOT, x4; \
60 pxor x1, x3; \
61 pand x0, x1; \
62 pxor x4, x1; \
63 pxor x0, x2;
64#define S0_2(x0, x1, x2, x3, x4) \
65 pxor x3, x0; \
66 por x0, x4; \
67 pxor x2, x0; \
68 pand x1, x2; \
69 pxor x2, x3; \
70 pxor RNOT, x1; \
71 pxor x4, x2; \
72 pxor x2, x1;
73
74#define S1_1(x0, x1, x2, x3, x4) \
75 movdqa x1, x4; \
76 pxor x0, x1; \
77 pxor x3, x0; \
78 pxor RNOT, x3; \
79 pand x1, x4; \
80 por x1, x0; \
81 pxor x2, x3; \
82 pxor x3, x0; \
83 pxor x3, x1;
84#define S1_2(x0, x1, x2, x3, x4) \
85 pxor x4, x3; \
86 por x4, x1; \
87 pxor x2, x4; \
88 pand x0, x2; \
89 pxor x1, x2; \
90 por x0, x1; \
91 pxor RNOT, x0; \
92 pxor x2, x0; \
93 pxor x1, x4;
94
95#define S2_1(x0, x1, x2, x3, x4) \
96 pxor RNOT, x3; \
97 pxor x0, x1; \
98 movdqa x0, x4; \
99 pand x2, x0; \
100 pxor x3, x0; \
101 por x4, x3; \
102 pxor x1, x2; \
103 pxor x1, x3; \
104 pand x0, x1;
105#define S2_2(x0, x1, x2, x3, x4) \
106 pxor x2, x0; \
107 pand x3, x2; \
108 por x1, x3; \
109 pxor RNOT, x0; \
110 pxor x0, x3; \
111 pxor x0, x4; \
112 pxor x2, x0; \
113 por x2, x1;
114
115#define S3_1(x0, x1, x2, x3, x4) \
116 movdqa x1, x4; \
117 pxor x3, x1; \
118 por x0, x3; \
119 pand x0, x4; \
120 pxor x2, x0; \
121 pxor x1, x2; \
122 pand x3, x1; \
123 pxor x3, x2; \
124 por x4, x0; \
125 pxor x3, x4;
126#define S3_2(x0, x1, x2, x3, x4) \
127 pxor x0, x1; \
128 pand x3, x0; \
129 pand x4, x3; \
130 pxor x2, x3; \
131 por x1, x4; \
132 pand x1, x2; \
133 pxor x3, x4; \
134 pxor x3, x0; \
135 pxor x2, x3;
136
137#define S4_1(x0, x1, x2, x3, x4) \
138 movdqa x3, x4; \
139 pand x0, x3; \
140 pxor x4, x0; \
141 pxor x2, x3; \
142 por x4, x2; \
143 pxor x1, x0; \
144 pxor x3, x4; \
145 por x0, x2; \
146 pxor x1, x2;
147#define S4_2(x0, x1, x2, x3, x4) \
148 pand x0, x1; \
149 pxor x4, x1; \
150 pand x2, x4; \
151 pxor x3, x2; \
152 pxor x0, x4; \
153 por x1, x3; \
154 pxor RNOT, x1; \
155 pxor x0, x3;
156
157#define S5_1(x0, x1, x2, x3, x4) \
158 movdqa x1, x4; \
159 por x0, x1; \
160 pxor x1, x2; \
161 pxor RNOT, x3; \
162 pxor x0, x4; \
163 pxor x2, x0; \
164 pand x4, x1; \
165 por x3, x4; \
166 pxor x0, x4;
167#define S5_2(x0, x1, x2, x3, x4) \
168 pand x3, x0; \
169 pxor x3, x1; \
170 pxor x2, x3; \
171 pxor x1, x0; \
172 pand x4, x2; \
173 pxor x2, x1; \
174 pand x0, x2; \
175 pxor x2, x3;
176
177#define S6_1(x0, x1, x2, x3, x4) \
178 movdqa x1, x4; \
179 pxor x0, x3; \
180 pxor x2, x1; \
181 pxor x0, x2; \
182 pand x3, x0; \
183 por x3, x1; \
184 pxor RNOT, x4; \
185 pxor x1, x0; \
186 pxor x2, x1;
187#define S6_2(x0, x1, x2, x3, x4) \
188 pxor x4, x3; \
189 pxor x0, x4; \
190 pand x0, x2; \
191 pxor x1, x4; \
192 pxor x3, x2; \
193 pand x1, x3; \
194 pxor x0, x3; \
195 pxor x2, x1;
196
197#define S7_1(x0, x1, x2, x3, x4) \
198 pxor RNOT, x1; \
199 movdqa x1, x4; \
200 pxor RNOT, x0; \
201 pand x2, x1; \
202 pxor x3, x1; \
203 por x4, x3; \
204 pxor x2, x4; \
205 pxor x3, x2; \
206 pxor x0, x3; \
207 por x1, x0;
208#define S7_2(x0, x1, x2, x3, x4) \
209 pand x0, x2; \
210 pxor x4, x0; \
211 pxor x3, x4; \
212 pand x0, x3; \
213 pxor x1, x4; \
214 pxor x4, x2; \
215 pxor x1, x3; \
216 por x0, x4; \
217 pxor x1, x4;
218
219#define SI0_1(x0, x1, x2, x3, x4) \
220 movdqa x3, x4; \
221 pxor x0, x1; \
222 por x1, x3; \
223 pxor x1, x4; \
224 pxor RNOT, x0; \
225 pxor x3, x2; \
226 pxor x0, x3; \
227 pand x1, x0; \
228 pxor x2, x0;
229#define SI0_2(x0, x1, x2, x3, x4) \
230 pand x3, x2; \
231 pxor x4, x3; \
232 pxor x3, x2; \
233 pxor x3, x1; \
234 pand x0, x3; \
235 pxor x0, x1; \
236 pxor x2, x0; \
237 pxor x3, x4;
238
239#define SI1_1(x0, x1, x2, x3, x4) \
240 pxor x3, x1; \
241 movdqa x0, x4; \
242 pxor x2, x0; \
243 pxor RNOT, x2; \
244 por x1, x4; \
245 pxor x3, x4; \
246 pand x1, x3; \
247 pxor x2, x1; \
248 pand x4, x2;
249#define SI1_2(x0, x1, x2, x3, x4) \
250 pxor x1, x4; \
251 por x3, x1; \
252 pxor x0, x3; \
253 pxor x0, x2; \
254 por x4, x0; \
255 pxor x4, x2; \
256 pxor x0, x1; \
257 pxor x1, x4;
258
259#define SI2_1(x0, x1, x2, x3, x4) \
260 pxor x1, x2; \
261 movdqa x3, x4; \
262 pxor RNOT, x3; \
263 por x2, x3; \
264 pxor x4, x2; \
265 pxor x0, x4; \
266 pxor x1, x3; \
267 por x2, x1; \
268 pxor x0, x2;
269#define SI2_2(x0, x1, x2, x3, x4) \
270 pxor x4, x1; \
271 por x3, x4; \
272 pxor x3, x2; \
273 pxor x2, x4; \
274 pand x1, x2; \
275 pxor x3, x2; \
276 pxor x4, x3; \
277 pxor x0, x4;
278
279#define SI3_1(x0, x1, x2, x3, x4) \
280 pxor x1, x2; \
281 movdqa x1, x4; \
282 pand x2, x1; \
283 pxor x0, x1; \
284 por x4, x0; \
285 pxor x3, x4; \
286 pxor x3, x0; \
287 por x1, x3; \
288 pxor x2, x1;
289#define SI3_2(x0, x1, x2, x3, x4) \
290 pxor x3, x1; \
291 pxor x2, x0; \
292 pxor x3, x2; \
293 pand x1, x3; \
294 pxor x0, x1; \
295 pand x2, x0; \
296 pxor x3, x4; \
297 pxor x0, x3; \
298 pxor x1, x0;
299
300#define SI4_1(x0, x1, x2, x3, x4) \
301 pxor x3, x2; \
302 movdqa x0, x4; \
303 pand x1, x0; \
304 pxor x2, x0; \
305 por x3, x2; \
306 pxor RNOT, x4; \
307 pxor x0, x1; \
308 pxor x2, x0; \
309 pand x4, x2;
310#define SI4_2(x0, x1, x2, x3, x4) \
311 pxor x0, x2; \
312 por x4, x0; \
313 pxor x3, x0; \
314 pand x2, x3; \
315 pxor x3, x4; \
316 pxor x1, x3; \
317 pand x0, x1; \
318 pxor x1, x4; \
319 pxor x3, x0;
320
321#define SI5_1(x0, x1, x2, x3, x4) \
322 movdqa x1, x4; \
323 por x2, x1; \
324 pxor x4, x2; \
325 pxor x3, x1; \
326 pand x4, x3; \
327 pxor x3, x2; \
328 por x0, x3; \
329 pxor RNOT, x0; \
330 pxor x2, x3; \
331 por x0, x2;
332#define SI5_2(x0, x1, x2, x3, x4) \
333 pxor x1, x4; \
334 pxor x4, x2; \
335 pand x0, x4; \
336 pxor x1, x0; \
337 pxor x3, x1; \
338 pand x2, x0; \
339 pxor x3, x2; \
340 pxor x2, x0; \
341 pxor x4, x2; \
342 pxor x3, x4;
343
344#define SI6_1(x0, x1, x2, x3, x4) \
345 pxor x2, x0; \
346 movdqa x0, x4; \
347 pand x3, x0; \
348 pxor x3, x2; \
349 pxor x2, x0; \
350 pxor x1, x3; \
351 por x4, x2; \
352 pxor x3, x2; \
353 pand x0, x3;
354#define SI6_2(x0, x1, x2, x3, x4) \
355 pxor RNOT, x0; \
356 pxor x1, x3; \
357 pand x2, x1; \
358 pxor x0, x4; \
359 pxor x4, x3; \
360 pxor x2, x4; \
361 pxor x1, x0; \
362 pxor x0, x2;
363
364#define SI7_1(x0, x1, x2, x3, x4) \
365 movdqa x3, x4; \
366 pand x0, x3; \
367 pxor x2, x0; \
368 por x4, x2; \
369 pxor x1, x4; \
370 pxor RNOT, x0; \
371 por x3, x1; \
372 pxor x0, x4; \
373 pand x2, x0; \
374 pxor x1, x0;
375#define SI7_2(x0, x1, x2, x3, x4) \
376 pand x2, x1; \
377 pxor x2, x3; \
378 pxor x3, x4; \
379 pand x3, x2; \
380 por x0, x3; \
381 pxor x4, x1; \
382 pxor x4, x3; \
383 pand x0, x4; \
384 pxor x2, x4;
385
386#define get_key(i, j, t) \
387 movd (4*(i)+(j))*4(CTX), t; \
388 pshufd $0, t, t;
389
390#define K2(x0, x1, x2, x3, x4, i) \
391 get_key(i, 0, RK0); \
392 get_key(i, 1, RK1); \
393 get_key(i, 2, RK2); \
394 get_key(i, 3, RK3); \
395 pxor RK0, x0 ## 1; \
396 pxor RK1, x1 ## 1; \
397 pxor RK2, x2 ## 1; \
398 pxor RK3, x3 ## 1; \
399 pxor RK0, x0 ## 2; \
400 pxor RK1, x1 ## 2; \
401 pxor RK2, x2 ## 2; \
402 pxor RK3, x3 ## 2;
403
404#define LK2(x0, x1, x2, x3, x4, i) \
405 movdqa x0 ## 1, x4 ## 1; \
406 pslld $13, x0 ## 1; \
407 psrld $(32 - 13), x4 ## 1; \
408 por x4 ## 1, x0 ## 1; \
409 pxor x0 ## 1, x1 ## 1; \
410 movdqa x2 ## 1, x4 ## 1; \
411 pslld $3, x2 ## 1; \
412 psrld $(32 - 3), x4 ## 1; \
413 por x4 ## 1, x2 ## 1; \
414 pxor x2 ## 1, x1 ## 1; \
415 movdqa x0 ## 2, x4 ## 2; \
416 pslld $13, x0 ## 2; \
417 psrld $(32 - 13), x4 ## 2; \
418 por x4 ## 2, x0 ## 2; \
419 pxor x0 ## 2, x1 ## 2; \
420 movdqa x2 ## 2, x4 ## 2; \
421 pslld $3, x2 ## 2; \
422 psrld $(32 - 3), x4 ## 2; \
423 por x4 ## 2, x2 ## 2; \
424 pxor x2 ## 2, x1 ## 2; \
425 movdqa x1 ## 1, x4 ## 1; \
426 pslld $1, x1 ## 1; \
427 psrld $(32 - 1), x4 ## 1; \
428 por x4 ## 1, x1 ## 1; \
429 movdqa x0 ## 1, x4 ## 1; \
430 pslld $3, x4 ## 1; \
431 pxor x2 ## 1, x3 ## 1; \
432 pxor x4 ## 1, x3 ## 1; \
433 movdqa x3 ## 1, x4 ## 1; \
434 get_key(i, 1, RK1); \
435 movdqa x1 ## 2, x4 ## 2; \
436 pslld $1, x1 ## 2; \
437 psrld $(32 - 1), x4 ## 2; \
438 por x4 ## 2, x1 ## 2; \
439 movdqa x0 ## 2, x4 ## 2; \
440 pslld $3, x4 ## 2; \
441 pxor x2 ## 2, x3 ## 2; \
442 pxor x4 ## 2, x3 ## 2; \
443 movdqa x3 ## 2, x4 ## 2; \
444 get_key(i, 3, RK3); \
445 pslld $7, x3 ## 1; \
446 psrld $(32 - 7), x4 ## 1; \
447 por x4 ## 1, x3 ## 1; \
448 movdqa x1 ## 1, x4 ## 1; \
449 pslld $7, x4 ## 1; \
450 pxor x1 ## 1, x0 ## 1; \
451 pxor x3 ## 1, x0 ## 1; \
452 pxor x3 ## 1, x2 ## 1; \
453 pxor x4 ## 1, x2 ## 1; \
454 get_key(i, 0, RK0); \
455 pslld $7, x3 ## 2; \
456 psrld $(32 - 7), x4 ## 2; \
457 por x4 ## 2, x3 ## 2; \
458 movdqa x1 ## 2, x4 ## 2; \
459 pslld $7, x4 ## 2; \
460 pxor x1 ## 2, x0 ## 2; \
461 pxor x3 ## 2, x0 ## 2; \
462 pxor x3 ## 2, x2 ## 2; \
463 pxor x4 ## 2, x2 ## 2; \
464 get_key(i, 2, RK2); \
465 pxor RK1, x1 ## 1; \
466 pxor RK3, x3 ## 1; \
467 movdqa x0 ## 1, x4 ## 1; \
468 pslld $5, x0 ## 1; \
469 psrld $(32 - 5), x4 ## 1; \
470 por x4 ## 1, x0 ## 1; \
471 movdqa x2 ## 1, x4 ## 1; \
472 pslld $22, x2 ## 1; \
473 psrld $(32 - 22), x4 ## 1; \
474 por x4 ## 1, x2 ## 1; \
475 pxor RK0, x0 ## 1; \
476 pxor RK2, x2 ## 1; \
477 pxor RK1, x1 ## 2; \
478 pxor RK3, x3 ## 2; \
479 movdqa x0 ## 2, x4 ## 2; \
480 pslld $5, x0 ## 2; \
481 psrld $(32 - 5), x4 ## 2; \
482 por x4 ## 2, x0 ## 2; \
483 movdqa x2 ## 2, x4 ## 2; \
484 pslld $22, x2 ## 2; \
485 psrld $(32 - 22), x4 ## 2; \
486 por x4 ## 2, x2 ## 2; \
487 pxor RK0, x0 ## 2; \
488 pxor RK2, x2 ## 2;
489
490#define KL2(x0, x1, x2, x3, x4, i) \
491 pxor RK0, x0 ## 1; \
492 pxor RK2, x2 ## 1; \
493 movdqa x0 ## 1, x4 ## 1; \
494 psrld $5, x0 ## 1; \
495 pslld $(32 - 5), x4 ## 1; \
496 por x4 ## 1, x0 ## 1; \
497 pxor RK3, x3 ## 1; \
498 pxor RK1, x1 ## 1; \
499 movdqa x2 ## 1, x4 ## 1; \
500 psrld $22, x2 ## 1; \
501 pslld $(32 - 22), x4 ## 1; \
502 por x4 ## 1, x2 ## 1; \
503 pxor x3 ## 1, x2 ## 1; \
504 pxor RK0, x0 ## 2; \
505 pxor RK2, x2 ## 2; \
506 movdqa x0 ## 2, x4 ## 2; \
507 psrld $5, x0 ## 2; \
508 pslld $(32 - 5), x4 ## 2; \
509 por x4 ## 2, x0 ## 2; \
510 pxor RK3, x3 ## 2; \
511 pxor RK1, x1 ## 2; \
512 movdqa x2 ## 2, x4 ## 2; \
513 psrld $22, x2 ## 2; \
514 pslld $(32 - 22), x4 ## 2; \
515 por x4 ## 2, x2 ## 2; \
516 pxor x3 ## 2, x2 ## 2; \
517 pxor x3 ## 1, x0 ## 1; \
518 movdqa x1 ## 1, x4 ## 1; \
519 pslld $7, x4 ## 1; \
520 pxor x1 ## 1, x0 ## 1; \
521 pxor x4 ## 1, x2 ## 1; \
522 movdqa x1 ## 1, x4 ## 1; \
523 psrld $1, x1 ## 1; \
524 pslld $(32 - 1), x4 ## 1; \
525 por x4 ## 1, x1 ## 1; \
526 pxor x3 ## 2, x0 ## 2; \
527 movdqa x1 ## 2, x4 ## 2; \
528 pslld $7, x4 ## 2; \
529 pxor x1 ## 2, x0 ## 2; \
530 pxor x4 ## 2, x2 ## 2; \
531 movdqa x1 ## 2, x4 ## 2; \
532 psrld $1, x1 ## 2; \
533 pslld $(32 - 1), x4 ## 2; \
534 por x4 ## 2, x1 ## 2; \
535 movdqa x3 ## 1, x4 ## 1; \
536 psrld $7, x3 ## 1; \
537 pslld $(32 - 7), x4 ## 1; \
538 por x4 ## 1, x3 ## 1; \
539 pxor x0 ## 1, x1 ## 1; \
540 movdqa x0 ## 1, x4 ## 1; \
541 pslld $3, x4 ## 1; \
542 pxor x4 ## 1, x3 ## 1; \
543 movdqa x0 ## 1, x4 ## 1; \
544 movdqa x3 ## 2, x4 ## 2; \
545 psrld $7, x3 ## 2; \
546 pslld $(32 - 7), x4 ## 2; \
547 por x4 ## 2, x3 ## 2; \
548 pxor x0 ## 2, x1 ## 2; \
549 movdqa x0 ## 2, x4 ## 2; \
550 pslld $3, x4 ## 2; \
551 pxor x4 ## 2, x3 ## 2; \
552 movdqa x0 ## 2, x4 ## 2; \
553 psrld $13, x0 ## 1; \
554 pslld $(32 - 13), x4 ## 1; \
555 por x4 ## 1, x0 ## 1; \
556 pxor x2 ## 1, x1 ## 1; \
557 pxor x2 ## 1, x3 ## 1; \
558 movdqa x2 ## 1, x4 ## 1; \
559 psrld $3, x2 ## 1; \
560 pslld $(32 - 3), x4 ## 1; \
561 por x4 ## 1, x2 ## 1; \
562 psrld $13, x0 ## 2; \
563 pslld $(32 - 13), x4 ## 2; \
564 por x4 ## 2, x0 ## 2; \
565 pxor x2 ## 2, x1 ## 2; \
566 pxor x2 ## 2, x3 ## 2; \
567 movdqa x2 ## 2, x4 ## 2; \
568 psrld $3, x2 ## 2; \
569 pslld $(32 - 3), x4 ## 2; \
570 por x4 ## 2, x2 ## 2;
571
572#define S(SBOX, x0, x1, x2, x3, x4) \
573 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
574 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
575 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
576 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
577
578#define SP(SBOX, x0, x1, x2, x3, x4, i) \
579 get_key(i, 0, RK0); \
580 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
581 get_key(i, 2, RK2); \
582 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
583 get_key(i, 3, RK3); \
584 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
585 get_key(i, 1, RK1); \
586 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
587
588#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
589 movdqa x2, t3; \
590 movdqa x0, t1; \
591 unpcklps x3, t3; \
592 movdqa x0, t2; \
593 unpcklps x1, t1; \
594 unpckhps x1, t2; \
595 movdqa t3, x1; \
596 unpckhps x3, x2; \
597 movdqa t1, x0; \
598 movhlps t1, x1; \
599 movdqa t2, t1; \
600 movlhps t3, x0; \
601 movlhps x2, t1; \
602 movhlps t2, x2; \
603 movdqa x2, x3; \
604 movdqa t1, x2;
605
606#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
607 movdqu (0*4*4)(in), x0; \
608 movdqu (1*4*4)(in), x1; \
609 movdqu (2*4*4)(in), x2; \
610 movdqu (3*4*4)(in), x3; \
611 \
612 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
613
614#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
615 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
616 \
617 movdqu x0, (0*4*4)(out); \
618 movdqu x1, (1*4*4)(out); \
619 movdqu x2, (2*4*4)(out); \
620 movdqu x3, (3*4*4)(out);
621
622#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
623 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
624 \
625 movdqu (0*4*4)(out), t0; \
626 pxor t0, x0; \
627 movdqu x0, (0*4*4)(out); \
628 movdqu (1*4*4)(out), t0; \
629 pxor t0, x1; \
630 movdqu x1, (1*4*4)(out); \
631 movdqu (2*4*4)(out), t0; \
632 pxor t0, x2; \
633 movdqu x2, (2*4*4)(out); \
634 movdqu (3*4*4)(out), t0; \
635 pxor t0, x3; \
636 movdqu x3, (3*4*4)(out);
637
638.align 8
639.global __serpent_enc_blk_8way
640.type __serpent_enc_blk_8way,@function;
641
642__serpent_enc_blk_8way:
643 /* input:
644 * %rdi: ctx, CTX
645 * %rsi: dst
646 * %rdx: src
647 * %rcx: bool, if true: xor output
648 */
649
650 pcmpeqd RNOT, RNOT;
651
652 leaq (4*4*4)(%rdx), %rax;
653 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
654 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
655
656 K2(RA, RB, RC, RD, RE, 0);
657 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
658 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
659 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
660 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
661 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
662 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
663 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
664 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
665 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
666 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
667 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
668 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
669 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
670 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
671 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
672 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
673 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
674 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
675 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
676 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
677 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
678 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
679 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
680 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
681 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
682 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
683 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
684 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
685 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
686 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
687 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
688 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
689
690 leaq (4*4*4)(%rsi), %rax;
691
692 testb %cl, %cl;
693 jnz __enc_xor8;
694
695 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
696 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
697
698 ret;
699
700__enc_xor8:
701 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
702 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
703
704 ret;
705
706.align 8
707.global serpent_dec_blk_8way
708.type serpent_dec_blk_8way,@function;
709
710serpent_dec_blk_8way:
711 /* input:
712 * %rdi: ctx, CTX
713 * %rsi: dst
714 * %rdx: src
715 */
716
717 pcmpeqd RNOT, RNOT;
718
719 leaq (4*4*4)(%rdx), %rax;
720 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
721 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
722
723 K2(RA, RB, RC, RD, RE, 32);
724 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
725 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
726 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
727 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
728 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
729 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
730 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
731 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
732 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
733 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
734 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
735 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
736 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
737 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
738 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
739 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
740 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
741 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
742 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
743 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
744 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
745 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
746 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
747 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
748 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
749 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
750 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
751 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
752 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
753 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
754 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
755 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
756
757 leaq (4*4*4)(%rsi), %rax;
758 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
759 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
760
761 ret;
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 000000000000..947cf570f6a7
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,719 @@
1/*
2 * Glue Code for SSE2 assembler versions of Serpent Cipher
3 *
4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Glue code based on aesni-intel_glue.c by:
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 *
10 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
11 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
12 * CTR part based on code (crypto/ctr.c) by:
13 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 */
31
32#include <linux/module.h>
33#include <linux/hardirq.h>
34#include <linux/types.h>
35#include <linux/crypto.h>
36#include <linux/err.h>
37#include <crypto/algapi.h>
38#include <crypto/serpent.h>
39#include <crypto/cryptd.h>
40#include <crypto/b128ops.h>
41#include <crypto/ctr.h>
42#include <asm/i387.h>
43#include <asm/serpent.h>
44#include <crypto/scatterwalk.h>
45#include <linux/workqueue.h>
46#include <linux/spinlock.h>
47
48struct async_serpent_ctx {
49 struct cryptd_ablkcipher *cryptd_tfm;
50};
51
52static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
53{
54 if (fpu_enabled)
55 return true;
56
57 /* SSE2 is only used when chunk to be processed is large enough, so
58 * do not enable FPU until it is necessary.
59 */
60 if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
61 return false;
62
63 kernel_fpu_begin();
64 return true;
65}
66
67static inline void serpent_fpu_end(bool fpu_enabled)
68{
69 if (fpu_enabled)
70 kernel_fpu_end();
71}
72
73static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
74 bool enc)
75{
76 bool fpu_enabled = false;
77 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
78 const unsigned int bsize = SERPENT_BLOCK_SIZE;
79 unsigned int nbytes;
80 int err;
81
82 err = blkcipher_walk_virt(desc, walk);
83 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
84
85 while ((nbytes = walk->nbytes)) {
86 u8 *wsrc = walk->src.virt.addr;
87 u8 *wdst = walk->dst.virt.addr;
88
89 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
90
91 /* Process multi-block batch */
92 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
93 do {
94 if (enc)
95 serpent_enc_blk_xway(ctx, wdst, wsrc);
96 else
97 serpent_dec_blk_xway(ctx, wdst, wsrc);
98
99 wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
100 wdst += bsize * SERPENT_PARALLEL_BLOCKS;
101 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
102 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
103
104 if (nbytes < bsize)
105 goto done;
106 }
107
108 /* Handle leftovers */
109 do {
110 if (enc)
111 __serpent_encrypt(ctx, wdst, wsrc);
112 else
113 __serpent_decrypt(ctx, wdst, wsrc);
114
115 wsrc += bsize;
116 wdst += bsize;
117 nbytes -= bsize;
118 } while (nbytes >= bsize);
119
120done:
121 err = blkcipher_walk_done(desc, walk, nbytes);
122 }
123
124 serpent_fpu_end(fpu_enabled);
125 return err;
126}
127
128static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
129 struct scatterlist *src, unsigned int nbytes)
130{
131 struct blkcipher_walk walk;
132
133 blkcipher_walk_init(&walk, dst, src, nbytes);
134 return ecb_crypt(desc, &walk, true);
135}
136
137static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
138 struct scatterlist *src, unsigned int nbytes)
139{
140 struct blkcipher_walk walk;
141
142 blkcipher_walk_init(&walk, dst, src, nbytes);
143 return ecb_crypt(desc, &walk, false);
144}
145
146static struct crypto_alg blk_ecb_alg = {
147 .cra_name = "__ecb-serpent-sse2",
148 .cra_driver_name = "__driver-ecb-serpent-sse2",
149 .cra_priority = 0,
150 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
151 .cra_blocksize = SERPENT_BLOCK_SIZE,
152 .cra_ctxsize = sizeof(struct serpent_ctx),
153 .cra_alignmask = 0,
154 .cra_type = &crypto_blkcipher_type,
155 .cra_module = THIS_MODULE,
156 .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
157 .cra_u = {
158 .blkcipher = {
159 .min_keysize = SERPENT_MIN_KEY_SIZE,
160 .max_keysize = SERPENT_MAX_KEY_SIZE,
161 .setkey = serpent_setkey,
162 .encrypt = ecb_encrypt,
163 .decrypt = ecb_decrypt,
164 },
165 },
166};
167
168static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
169 struct blkcipher_walk *walk)
170{
171 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
172 const unsigned int bsize = SERPENT_BLOCK_SIZE;
173 unsigned int nbytes = walk->nbytes;
174 u128 *src = (u128 *)walk->src.virt.addr;
175 u128 *dst = (u128 *)walk->dst.virt.addr;
176 u128 *iv = (u128 *)walk->iv;
177
178 do {
179 u128_xor(dst, src, iv);
180 __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
181 iv = dst;
182
183 src += 1;
184 dst += 1;
185 nbytes -= bsize;
186 } while (nbytes >= bsize);
187
188 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
189 return nbytes;
190}
191
192static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
193 struct scatterlist *src, unsigned int nbytes)
194{
195 struct blkcipher_walk walk;
196 int err;
197
198 blkcipher_walk_init(&walk, dst, src, nbytes);
199 err = blkcipher_walk_virt(desc, &walk);
200
201 while ((nbytes = walk.nbytes)) {
202 nbytes = __cbc_encrypt(desc, &walk);
203 err = blkcipher_walk_done(desc, &walk, nbytes);
204 }
205
206 return err;
207}
208
209static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
210 struct blkcipher_walk *walk)
211{
212 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
213 const unsigned int bsize = SERPENT_BLOCK_SIZE;
214 unsigned int nbytes = walk->nbytes;
215 u128 *src = (u128 *)walk->src.virt.addr;
216 u128 *dst = (u128 *)walk->dst.virt.addr;
217 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
218 u128 last_iv;
219 int i;
220
221 /* Start of the last block. */
222 src += nbytes / bsize - 1;
223 dst += nbytes / bsize - 1;
224
225 last_iv = *src;
226
227 /* Process multi-block batch */
228 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
229 do {
230 nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
231 src -= SERPENT_PARALLEL_BLOCKS - 1;
232 dst -= SERPENT_PARALLEL_BLOCKS - 1;
233
234 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
235 ivs[i] = src[i];
236
237 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
238
239 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
240 u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
241
242 nbytes -= bsize;
243 if (nbytes < bsize)
244 goto done;
245
246 u128_xor(dst, dst, src - 1);
247 src -= 1;
248 dst -= 1;
249 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
250
251 if (nbytes < bsize)
252 goto done;
253 }
254
255 /* Handle leftovers */
256 for (;;) {
257 __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
258
259 nbytes -= bsize;
260 if (nbytes < bsize)
261 break;
262
263 u128_xor(dst, dst, src - 1);
264 src -= 1;
265 dst -= 1;
266 }
267
268done:
269 u128_xor(dst, dst, (u128 *)walk->iv);
270 *(u128 *)walk->iv = last_iv;
271
272 return nbytes;
273}
274
275static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
276 struct scatterlist *src, unsigned int nbytes)
277{
278 bool fpu_enabled = false;
279 struct blkcipher_walk walk;
280 int err;
281
282 blkcipher_walk_init(&walk, dst, src, nbytes);
283 err = blkcipher_walk_virt(desc, &walk);
284 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
285
286 while ((nbytes = walk.nbytes)) {
287 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
288 nbytes = __cbc_decrypt(desc, &walk);
289 err = blkcipher_walk_done(desc, &walk, nbytes);
290 }
291
292 serpent_fpu_end(fpu_enabled);
293 return err;
294}
295
/*
 * Internal synchronous CBC implementation.  The "__" name prefix and
 * priority 0 keep it from being picked directly; it is intended as the
 * backend that cryptd wraps for the async "cbc(serpent)" below.
 */
static struct crypto_alg blk_cbc_alg = {
	.cra_name		= "__cbc-serpent-sse2",
	.cra_driver_name	= "__driver-cbc-serpent-sse2",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= SERPENT_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct serpent_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
	.cra_u = {
		.blkcipher = {
			.min_keysize	= SERPENT_MIN_KEY_SIZE,
			.max_keysize	= SERPENT_MAX_KEY_SIZE,
			.setkey		= serpent_setkey,
			.encrypt	= cbc_encrypt,
			.decrypt	= cbc_decrypt,
		},
	},
};
317
318static inline void u128_to_be128(be128 *dst, const u128 *src)
319{
320 dst->a = cpu_to_be64(src->a);
321 dst->b = cpu_to_be64(src->b);
322}
323
324static inline void be128_to_u128(u128 *dst, const be128 *src)
325{
326 dst->a = be64_to_cpu(src->a);
327 dst->b = be64_to_cpu(src->b);
328}
329
330static inline void u128_inc(u128 *i)
331{
332 i->b++;
333 if (!i->b)
334 i->a++;
335}
336
337static void ctr_crypt_final(struct blkcipher_desc *desc,
338 struct blkcipher_walk *walk)
339{
340 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
341 u8 *ctrblk = walk->iv;
342 u8 keystream[SERPENT_BLOCK_SIZE];
343 u8 *src = walk->src.virt.addr;
344 u8 *dst = walk->dst.virt.addr;
345 unsigned int nbytes = walk->nbytes;
346
347 __serpent_encrypt(ctx, keystream, ctrblk);
348 crypto_xor(keystream, src, nbytes);
349 memcpy(dst, keystream, nbytes);
350
351 crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
352}
353
/*
 * CTR-mode bulk work over one blkcipher_walk chunk: first full batches
 * of SERPENT_PARALLEL_BLOCKS through the 8-way SSE2 path, then single
 * blocks.  A trailing partial block (< bsize) is left for
 * ctr_crypt_final().  Returns the number of bytes still unprocessed.
 */
static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
				struct blkcipher_walk *walk)
{
	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	const unsigned int bsize = SERPENT_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u128 *src = (u128 *)walk->src.virt.addr;
	u128 *dst = (u128 *)walk->dst.virt.addr;
	u128 ctrblk;
	be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
	int i;

	/* keep the counter native-endian while incrementing it */
	be128_to_u128(&ctrblk, (be128 *)walk->iv);

	/* Process multi-block batch */
	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
		do {
			/* create ctrblks for parallel encrypt */
			for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
				/* copy plaintext into dst so the xor variant
				 * can combine it with the keystream in place */
				if (dst != src)
					dst[i] = src[i];

				u128_to_be128(&ctrblocks[i], &ctrblk);
				u128_inc(&ctrblk);
			}

			/* encrypts ctrblocks and XORs the result into dst */
			serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
						 (u8 *)ctrblocks);

			src += SERPENT_PARALLEL_BLOCKS;
			dst += SERPENT_PARALLEL_BLOCKS;
			nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);

		if (nbytes < bsize)
			goto done;
	}

	/* Handle leftovers */
	do {
		if (dst != src)
			*dst = *src;

		u128_to_be128(&ctrblocks[0], &ctrblk);
		u128_inc(&ctrblk);

		/* encrypt the counter in place, then XOR it into the data */
		__serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
		u128_xor(dst, dst, (u128 *)ctrblocks);

		src += 1;
		dst += 1;
		nbytes -= bsize;
	} while (nbytes >= bsize);

done:
	/* write the advanced counter back as the IV for the next chunk */
	u128_to_be128((be128 *)walk->iv, &ctrblk);
	return nbytes;
}
412
413static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
414 struct scatterlist *src, unsigned int nbytes)
415{
416 bool fpu_enabled = false;
417 struct blkcipher_walk walk;
418 int err;
419
420 blkcipher_walk_init(&walk, dst, src, nbytes);
421 err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
422 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
423
424 while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
425 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
426 nbytes = __ctr_crypt(desc, &walk);
427 err = blkcipher_walk_done(desc, &walk, nbytes);
428 }
429
430 serpent_fpu_end(fpu_enabled);
431
432 if (walk.nbytes) {
433 ctr_crypt_final(desc, &walk);
434 err = blkcipher_walk_done(desc, &walk, 0);
435 }
436
437 return err;
438}
439
/*
 * Internal synchronous CTR implementation ("__" prefix, priority 0):
 * only meant to be wrapped by cryptd for the async "ctr(serpent)".
 */
static struct crypto_alg blk_ctr_alg = {
	.cra_name		= "__ctr-serpent-sse2",
	.cra_driver_name	= "__driver-ctr-serpent-sse2",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= 1,	/* stream mode: any byte length */
	.cra_ctxsize		= sizeof(struct serpent_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
	.cra_u = {
		.blkcipher = {
			.min_keysize	= SERPENT_MIN_KEY_SIZE,
			.max_keysize	= SERPENT_MAX_KEY_SIZE,
			.ivsize		= SERPENT_BLOCK_SIZE,
			.setkey		= serpent_setkey,
			/* CTR encryption and decryption are the same operation */
			.encrypt	= ctr_crypt,
			.decrypt	= ctr_crypt,
		},
	},
};
462
463static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
464 unsigned int key_len)
465{
466 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
467 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
468 int err;
469
470 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
471 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
472 & CRYPTO_TFM_REQ_MASK);
473 err = crypto_ablkcipher_setkey(child, key, key_len);
474 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
475 & CRYPTO_TFM_RES_MASK);
476 return err;
477}
478
479static int __ablk_encrypt(struct ablkcipher_request *req)
480{
481 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
482 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
483 struct blkcipher_desc desc;
484
485 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
486 desc.info = req->info;
487 desc.flags = 0;
488
489 return crypto_blkcipher_crt(desc.tfm)->encrypt(
490 &desc, req->dst, req->src, req->nbytes);
491}
492
493static int ablk_encrypt(struct ablkcipher_request *req)
494{
495 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
496 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
497
498 if (!irq_fpu_usable()) {
499 struct ablkcipher_request *cryptd_req =
500 ablkcipher_request_ctx(req);
501
502 memcpy(cryptd_req, req, sizeof(*req));
503 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
504
505 return crypto_ablkcipher_encrypt(cryptd_req);
506 } else {
507 return __ablk_encrypt(req);
508 }
509}
510
511static int ablk_decrypt(struct ablkcipher_request *req)
512{
513 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
514 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
515
516 if (!irq_fpu_usable()) {
517 struct ablkcipher_request *cryptd_req =
518 ablkcipher_request_ctx(req);
519
520 memcpy(cryptd_req, req, sizeof(*req));
521 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
522
523 return crypto_ablkcipher_decrypt(cryptd_req);
524 } else {
525 struct blkcipher_desc desc;
526
527 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
528 desc.info = req->info;
529 desc.flags = 0;
530
531 return crypto_blkcipher_crt(desc.tfm)->decrypt(
532 &desc, req->dst, req->src, req->nbytes);
533 }
534}
535
/* Release the cryptd handle allocated in ablk_*_init(). */
static void ablk_exit(struct crypto_tfm *tfm)
{
	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);

	cryptd_free_ablkcipher(ctx->cryptd_tfm);
}
542
/*
 * Shared init: stash the cryptd tfm and reserve request-context space
 * for the nested ablkcipher_request that ablk_encrypt()/ablk_decrypt()
 * copy the caller's request into when deferring to cryptd.
 */
static void ablk_init_common(struct crypto_tfm *tfm,
			     struct cryptd_ablkcipher *cryptd_tfm)
{
	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);

	ctx->cryptd_tfm = cryptd_tfm;
	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
}
552
/* Allocate the cryptd wrapper around the internal ECB implementation. */
static int ablk_ecb_init(struct crypto_tfm *tfm)
{
	struct cryptd_ablkcipher *async_tfm;

	async_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
	if (IS_ERR(async_tfm))
		return PTR_ERR(async_tfm);

	ablk_init_common(tfm, async_tfm);
	return 0;
}
563
/* Public async "ecb(serpent)" algorithm backed by cryptd + the SSE2 path. */
static struct crypto_alg ablk_ecb_alg = {
	.cra_name		= "ecb(serpent)",
	.cra_driver_name	= "ecb-serpent-sse2",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= SERPENT_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
	.cra_init		= ablk_ecb_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= SERPENT_MIN_KEY_SIZE,
			.max_keysize	= SERPENT_MAX_KEY_SIZE,
			.setkey		= ablk_set_key,
			.encrypt	= ablk_encrypt,
			.decrypt	= ablk_decrypt,
		},
	},
};
587
/* Allocate the cryptd wrapper around the internal CBC implementation. */
static int ablk_cbc_init(struct crypto_tfm *tfm)
{
	struct cryptd_ablkcipher *async_tfm;

	async_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
	if (IS_ERR(async_tfm))
		return PTR_ERR(async_tfm);

	ablk_init_common(tfm, async_tfm);
	return 0;
}
598
/* Public async "cbc(serpent)" algorithm backed by cryptd + the SSE2 path. */
static struct crypto_alg ablk_cbc_alg = {
	.cra_name		= "cbc(serpent)",
	.cra_driver_name	= "cbc-serpent-sse2",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= SERPENT_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
	.cra_init		= ablk_cbc_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= SERPENT_MIN_KEY_SIZE,
			.max_keysize	= SERPENT_MAX_KEY_SIZE,
			.ivsize		= SERPENT_BLOCK_SIZE,
			.setkey		= ablk_set_key,
			/* NOTE(review): encrypt bypasses the irq_fpu_usable()
			 * check — presumably safe because CBC encryption is
			 * serial and does not use the SSE2 path; confirm
			 * against cbc_encrypt() earlier in this file. */
			.encrypt	= __ablk_encrypt,
			.decrypt	= ablk_decrypt,
		},
	},
};
623
/* Allocate the cryptd wrapper around the internal CTR implementation. */
static int ablk_ctr_init(struct crypto_tfm *tfm)
{
	struct cryptd_ablkcipher *async_tfm;

	async_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
	if (IS_ERR(async_tfm))
		return PTR_ERR(async_tfm);

	ablk_init_common(tfm, async_tfm);
	return 0;
}
634
/* Public async "ctr(serpent)" algorithm backed by cryptd + the SSE2 path. */
static struct crypto_alg ablk_ctr_alg = {
	.cra_name		= "ctr(serpent)",
	.cra_driver_name	= "ctr-serpent-sse2",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= 1,	/* stream mode: any byte length */
	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_list		= LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
	.cra_init		= ablk_ctr_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= SERPENT_MIN_KEY_SIZE,
			.max_keysize	= SERPENT_MAX_KEY_SIZE,
			.ivsize		= SERPENT_BLOCK_SIZE,
			.setkey		= ablk_set_key,
			.encrypt	= ablk_encrypt,
			/* not a typo: CTR decryption is the same operation as
			 * encryption (the inner blk_ctr_alg wires both to
			 * ctr_crypt), so routing through ablk_encrypt is
			 * correct */
			.decrypt	= ablk_encrypt,
			.geniv		= "chainiv",
		},
	},
};
660
/*
 * Register the three internal blkciphers and the three public async
 * wrappers.  On any failure the label ladder below falls through,
 * unregistering everything registered so far in reverse order.
 */
static int __init serpent_sse2_init(void)
{
	int err;

	/* the 8-way assembly implementation requires SSE2 */
	if (!cpu_has_xmm2) {
		printk(KERN_INFO "SSE2 instructions are not detected.\n");
		return -ENODEV;
	}

	err = crypto_register_alg(&blk_ecb_alg);
	if (err)
		goto blk_ecb_err;
	err = crypto_register_alg(&blk_cbc_alg);
	if (err)
		goto blk_cbc_err;
	err = crypto_register_alg(&blk_ctr_alg);
	if (err)
		goto blk_ctr_err;
	err = crypto_register_alg(&ablk_ecb_alg);
	if (err)
		goto ablk_ecb_err;
	err = crypto_register_alg(&ablk_cbc_alg);
	if (err)
		goto ablk_cbc_err;
	err = crypto_register_alg(&ablk_ctr_alg);
	if (err)
		goto ablk_ctr_err;
	return err;

	/* unwind: each label undoes the registration before the failed one */
ablk_ctr_err:
	crypto_unregister_alg(&ablk_cbc_alg);
ablk_cbc_err:
	crypto_unregister_alg(&ablk_ecb_alg);
ablk_ecb_err:
	crypto_unregister_alg(&blk_ctr_alg);
blk_ctr_err:
	crypto_unregister_alg(&blk_cbc_alg);
blk_cbc_err:
	crypto_unregister_alg(&blk_ecb_alg);
blk_ecb_err:
	return err;
}
703
/* Unregister everything in reverse order of serpent_sse2_init(). */
static void __exit serpent_sse2_exit(void)
{
	crypto_unregister_alg(&ablk_ctr_alg);
	crypto_unregister_alg(&ablk_cbc_alg);
	crypto_unregister_alg(&ablk_ecb_alg);
	crypto_unregister_alg(&blk_ctr_alg);
	crypto_unregister_alg(&blk_cbc_alg);
	crypto_unregister_alg(&blk_ecb_alg);
}
713
module_init(serpent_sse2_init);
module_exit(serpent_sse2_exit);

MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
MODULE_LICENSE("GPL");
/* allow auto-loading when the generic "serpent" cipher is requested */
MODULE_ALIAS("serpent");
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
new file mode 100644
index 000000000000..b7fd3b595b27
--- /dev/null
+++ b/arch/x86/include/asm/serpent.h
@@ -0,0 +1,32 @@
#ifndef ASM_X86_SERPENT_H
#define ASM_X86_SERPENT_H

#include <linux/crypto.h>
#include <crypto/serpent.h>

/* number of blocks one call to the 8-way SSE2 assembly processes */
#define SERPENT_PARALLEL_BLOCKS 8

/*
 * Assembly entry points.  For __serpent_enc_blk_8way(), the 'xor'
 * flag selects between writing the ciphertext to dst and XOR-ing it
 * into dst (the latter is used by the CTR glue code).
 */
asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
				       const u8 *src, bool xor);
asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
				     const u8 *src);

/* encrypt SERPENT_PARALLEL_BLOCKS blocks: plain store into dst */
static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
					const u8 *src)
{
	__serpent_enc_blk_8way(ctx, dst, src, false);
}

/* encrypt SERPENT_PARALLEL_BLOCKS blocks and XOR the result into dst */
static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
					    const u8 *src)
{
	__serpent_enc_blk_8way(ctx, dst, src, true);
}

/* decrypt SERPENT_PARALLEL_BLOCKS blocks */
static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
					const u8 *src)
{
	serpent_dec_blk_8way(ctx, dst, src);
}

#endif
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 733208fe0a2d..2df61e458f06 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -766,6 +766,23 @@ config CRYPTO_SERPENT
766 See also: 766 See also:
767 <http://www.cl.cam.ac.uk/~rja14/serpent.html> 767 <http://www.cl.cam.ac.uk/~rja14/serpent.html>
768 768
769config CRYPTO_SERPENT_SSE2_X86_64
770 tristate "Serpent cipher algorithm (x86_64/SSE2)"
771 depends on X86 && 64BIT
772	select CRYPTO_ALGAPI
	select CRYPTO_CRYPTD
773 select CRYPTO_SERPENT
774 help
775 Serpent cipher algorithm, by Anderson, Biham & Knudsen.
776
777 Keys are allowed to be from 0 to 256 bits in length, in steps
778 of 8 bits.
779
780	  This module provides the Serpent cipher algorithm that processes eight
781	  blocks in parallel using the SSE2 instruction set.
782
783 See also:
784 <http://www.cl.cam.ac.uk/~rja14/serpent.html>
785
769config CRYPTO_TEA 786config CRYPTO_TEA
770 tristate "TEA, XTEA and XETA cipher algorithms" 787 tristate "TEA, XTEA and XETA cipher algorithms"
771 select CRYPTO_ALGAPI 788 select CRYPTO_ALGAPI
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 01553a6754b7..bb54b882d738 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1534,6 +1534,21 @@ static int alg_test_null(const struct alg_test_desc *desc,
1534/* Please keep this list sorted by algorithm name. */ 1534/* Please keep this list sorted by algorithm name. */
1535static const struct alg_test_desc alg_test_descs[] = { 1535static const struct alg_test_desc alg_test_descs[] = {
1536 { 1536 {
1537 .alg = "__cbc-serpent-sse2",
1538 .test = alg_test_null,
1539 .suite = {
1540 .cipher = {
1541 .enc = {
1542 .vecs = NULL,
1543 .count = 0
1544 },
1545 .dec = {
1546 .vecs = NULL,
1547 .count = 0
1548 }
1549 }
1550 }
1551 }, {
1537 .alg = "__driver-cbc-aes-aesni", 1552 .alg = "__driver-cbc-aes-aesni",
1538 .test = alg_test_null, 1553 .test = alg_test_null,
1539 .suite = { 1554 .suite = {
@@ -1549,6 +1564,21 @@ static const struct alg_test_desc alg_test_descs[] = {
1549 } 1564 }
1550 } 1565 }
1551 }, { 1566 }, {
1567 .alg = "__driver-cbc-serpent-sse2",
1568 .test = alg_test_null,
1569 .suite = {
1570 .cipher = {
1571 .enc = {
1572 .vecs = NULL,
1573 .count = 0
1574 },
1575 .dec = {
1576 .vecs = NULL,
1577 .count = 0
1578 }
1579 }
1580 }
1581 }, {
1552 .alg = "__driver-ecb-aes-aesni", 1582 .alg = "__driver-ecb-aes-aesni",
1553 .test = alg_test_null, 1583 .test = alg_test_null,
1554 .suite = { 1584 .suite = {
@@ -1564,6 +1594,21 @@ static const struct alg_test_desc alg_test_descs[] = {
1564 } 1594 }
1565 } 1595 }
1566 }, { 1596 }, {
1597 .alg = "__driver-ecb-serpent-sse2",
1598 .test = alg_test_null,
1599 .suite = {
1600 .cipher = {
1601 .enc = {
1602 .vecs = NULL,
1603 .count = 0
1604 },
1605 .dec = {
1606 .vecs = NULL,
1607 .count = 0
1608 }
1609 }
1610 }
1611 }, {
1567 .alg = "__ghash-pclmulqdqni", 1612 .alg = "__ghash-pclmulqdqni",
1568 .test = alg_test_null, 1613 .test = alg_test_null,
1569 .suite = { 1614 .suite = {
@@ -1746,6 +1791,21 @@ static const struct alg_test_desc alg_test_descs[] = {
1746 } 1791 }
1747 } 1792 }
1748 }, { 1793 }, {
1794 .alg = "cryptd(__driver-ecb-serpent-sse2)",
1795 .test = alg_test_null,
1796 .suite = {
1797 .cipher = {
1798 .enc = {
1799 .vecs = NULL,
1800 .count = 0
1801 },
1802 .dec = {
1803 .vecs = NULL,
1804 .count = 0
1805 }
1806 }
1807 }
1808 }, {
1749 .alg = "cryptd(__ghash-pclmulqdqni)", 1809 .alg = "cryptd(__ghash-pclmulqdqni)",
1750 .test = alg_test_null, 1810 .test = alg_test_null,
1751 .suite = { 1811 .suite = {