author		Jussi Kivilinna <jussi.kivilinna@mbnet.fi>	2011-11-09 09:26:25 -0500
committer	Herbert Xu <herbert@gondor.apana.org.au>	2011-11-21 03:13:23 -0500
commit		937c30d7f560210b0163035edd42b2aef78fed9e (patch)
tree		c47348474ca6cdda0a87c95f3a6831c732f27b4d /arch
parent		d19978f58745e586d9385d306d557e7c785abe23 (diff)
crypto: serpent - add 8-way parallel x86_64/SSE2 assembler implementation
Patch adds x86_64/SSE2 assembler implementation of the serpent cipher. The assembler functions crypt data in eight-block chunks (two 4-block chunk SSE2 operations run in parallel to improve performance on out-of-order CPUs). Glue code is based on the one from the AES-NI implementation, so requests from irq context are redirected to cryptd.

v2:
 - add missing include of linux/module.h (apparently crypto.h used to
   include module.h, which changed for 3.2 by commit
   7c926402a7e8c9b279968fd94efec8700ba3859e)

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results (serpent-sse2/serpent_generic speed ratios):

AMD Phenom II 1055T (fam:16, model:10):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     1.03x   1.01x   1.03x   1.05x   1.00x   0.99x
64B     1.00x   1.01x   1.02x   1.04x   1.02x   1.01x
256B    2.34x   2.41x   0.99x   2.43x   2.39x   2.40x
1024B   2.51x   2.57x   1.00x   2.59x   2.56x   2.56x
8192B   2.50x   2.54x   1.00x   2.55x   2.57x   2.57x

Intel Celeron T1600 (fam:6, model:15, step:13):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.97x   0.97x   1.01x   1.01x   1.01x   1.02x
64B     1.00x   1.00x   1.00x   1.02x   1.01x   1.01x
256B    3.41x   3.35x   1.00x   3.39x   3.42x   3.44x
1024B   3.75x   3.72x   0.99x   3.74x   3.75x   3.75x
8192B   3.70x   3.68x   0.99x   3.68x   3.69x   3.69x

Full output:
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-sse2.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-sse2.txt

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
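For illustration only (not part of the patch): once this module is loaded, the async "ecb(serpent)" algorithm registered below outranks serpent-generic by priority, so existing users of the kernel crypto API pick up the SSE2 path transparently. A minimal sketch of such a caller follows; demo_alloc_serpent() is a hypothetical name:

	#include <linux/err.h>
	#include <linux/crypto.h>

	static int demo_alloc_serpent(void)
	{
		struct crypto_ablkcipher *tfm;

		/* resolves to "ecb-serpent-sse2" below (cra_priority 400) */
		tfm = crypto_alloc_ablkcipher("ecb(serpent)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		/* ... crypto_ablkcipher_setkey(), issue requests, ... */
		crypto_free_ablkcipher(tfm);
		return 0;
	}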
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/crypto/Makefile                        2
-rw-r--r--  arch/x86/crypto/serpent-sse2-x86_64-asm_64.S  761
-rw-r--r--  arch/x86/crypto/serpent_sse2_glue.c           719
-rw-r--r--  arch/x86/include/asm/serpent.h                 32
4 files changed, 1514 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3537d4b91f74..12ebdbd80ccb 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
@@ -26,6 +27,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 000000000000..7f24a1540821
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,761 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ *                2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+  8-way SSE2 serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define RA2 %xmm5
+#define RB2 %xmm6
+#define RC2 %xmm7
+#define RD2 %xmm8
+#define RE2 %xmm9
+
+#define RNOT %xmm10
+
+#define RK0 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
+#define RK3 %xmm14
+
+#define S0_1(x0, x1, x2, x3, x4) \
+	movdqa x3, x4; \
+	por x0, x3; \
+	pxor x4, x0; \
+	pxor x2, x4; \
+	pxor RNOT, x4; \
+	pxor x1, x3; \
+	pand x0, x1; \
+	pxor x4, x1; \
+	pxor x0, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+	pxor x3, x0; \
+	por x0, x4; \
+	pxor x2, x0; \
+	pand x1, x2; \
+	pxor x2, x3; \
+	pxor RNOT, x1; \
+	pxor x4, x2; \
+	pxor x2, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+	movdqa x1, x4; \
+	pxor x0, x1; \
+	pxor x3, x0; \
+	pxor RNOT, x3; \
+	pand x1, x4; \
+	por x1, x0; \
+	pxor x2, x3; \
+	pxor x3, x0; \
+	pxor x3, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+	pxor x4, x3; \
+	por x4, x1; \
+	pxor x2, x4; \
+	pand x0, x2; \
+	pxor x1, x2; \
+	por x0, x1; \
+	pxor RNOT, x0; \
+	pxor x2, x0; \
+	pxor x1, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+	pxor RNOT, x3; \
+	pxor x0, x1; \
+	movdqa x0, x4; \
+	pand x2, x0; \
+	pxor x3, x0; \
+	por x4, x3; \
+	pxor x1, x2; \
+	pxor x1, x3; \
+	pand x0, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+	pxor x2, x0; \
+	pand x3, x2; \
+	por x1, x3; \
+	pxor RNOT, x0; \
+	pxor x0, x3; \
+	pxor x0, x4; \
+	pxor x2, x0; \
+	por x2, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+	movdqa x1, x4; \
+	pxor x3, x1; \
+	por x0, x3; \
+	pand x0, x4; \
+	pxor x2, x0; \
+	pxor x1, x2; \
+	pand x3, x1; \
+	pxor x3, x2; \
+	por x4, x0; \
+	pxor x3, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+	pxor x0, x1; \
+	pand x3, x0; \
+	pand x4, x3; \
+	pxor x2, x3; \
+	por x1, x4; \
+	pand x1, x2; \
+	pxor x3, x4; \
+	pxor x3, x0; \
+	pxor x2, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+	movdqa x3, x4; \
+	pand x0, x3; \
+	pxor x4, x0; \
+	pxor x2, x3; \
+	por x4, x2; \
+	pxor x1, x0; \
+	pxor x3, x4; \
+	por x0, x2; \
+	pxor x1, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+	pand x0, x1; \
+	pxor x4, x1; \
+	pand x2, x4; \
+	pxor x3, x2; \
+	pxor x0, x4; \
+	por x1, x3; \
+	pxor RNOT, x1; \
+	pxor x0, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+	movdqa x1, x4; \
+	por x0, x1; \
+	pxor x1, x2; \
+	pxor RNOT, x3; \
+	pxor x0, x4; \
+	pxor x2, x0; \
+	pand x4, x1; \
+	por x3, x4; \
+	pxor x0, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+	pand x3, x0; \
+	pxor x3, x1; \
+	pxor x2, x3; \
+	pxor x1, x0; \
+	pand x4, x2; \
+	pxor x2, x1; \
+	pand x0, x2; \
+	pxor x2, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+	movdqa x1, x4; \
+	pxor x0, x3; \
+	pxor x2, x1; \
+	pxor x0, x2; \
+	pand x3, x0; \
+	por x3, x1; \
+	pxor RNOT, x4; \
+	pxor x1, x0; \
+	pxor x2, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+	pxor x4, x3; \
+	pxor x0, x4; \
+	pand x0, x2; \
+	pxor x1, x4; \
+	pxor x3, x2; \
+	pand x1, x3; \
+	pxor x0, x3; \
+	pxor x2, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+	pxor RNOT, x1; \
+	movdqa x1, x4; \
+	pxor RNOT, x0; \
+	pand x2, x1; \
+	pxor x3, x1; \
+	por x4, x3; \
+	pxor x2, x4; \
+	pxor x3, x2; \
+	pxor x0, x3; \
+	por x1, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+	pand x0, x2; \
+	pxor x4, x0; \
+	pxor x3, x4; \
+	pand x0, x3; \
+	pxor x1, x4; \
+	pxor x4, x2; \
+	pxor x1, x3; \
+	por x0, x4; \
+	pxor x1, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+	movdqa x3, x4; \
+	pxor x0, x1; \
+	por x1, x3; \
+	pxor x1, x4; \
+	pxor RNOT, x0; \
+	pxor x3, x2; \
+	pxor x0, x3; \
+	pand x1, x0; \
+	pxor x2, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+	pand x3, x2; \
+	pxor x4, x3; \
+	pxor x3, x2; \
+	pxor x3, x1; \
+	pand x0, x3; \
+	pxor x0, x1; \
+	pxor x2, x0; \
+	pxor x3, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+	pxor x3, x1; \
+	movdqa x0, x4; \
+	pxor x2, x0; \
+	pxor RNOT, x2; \
+	por x1, x4; \
+	pxor x3, x4; \
+	pand x1, x3; \
+	pxor x2, x1; \
+	pand x4, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+	pxor x1, x4; \
+	por x3, x1; \
+	pxor x0, x3; \
+	pxor x0, x2; \
+	por x4, x0; \
+	pxor x4, x2; \
+	pxor x0, x1; \
+	pxor x1, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+	pxor x1, x2; \
+	movdqa x3, x4; \
+	pxor RNOT, x3; \
+	por x2, x3; \
+	pxor x4, x2; \
+	pxor x0, x4; \
+	pxor x1, x3; \
+	por x2, x1; \
+	pxor x0, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+	pxor x4, x1; \
+	por x3, x4; \
+	pxor x3, x2; \
+	pxor x2, x4; \
+	pand x1, x2; \
+	pxor x3, x2; \
+	pxor x4, x3; \
+	pxor x0, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+	pxor x1, x2; \
+	movdqa x1, x4; \
+	pand x2, x1; \
+	pxor x0, x1; \
+	por x4, x0; \
+	pxor x3, x4; \
+	pxor x3, x0; \
+	por x1, x3; \
+	pxor x2, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+	pxor x3, x1; \
+	pxor x2, x0; \
+	pxor x3, x2; \
+	pand x1, x3; \
+	pxor x0, x1; \
+	pand x2, x0; \
+	pxor x3, x4; \
+	pxor x0, x3; \
+	pxor x1, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+	pxor x3, x2; \
+	movdqa x0, x4; \
+	pand x1, x0; \
+	pxor x2, x0; \
+	por x3, x2; \
+	pxor RNOT, x4; \
+	pxor x0, x1; \
+	pxor x2, x0; \
+	pand x4, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+	pxor x0, x2; \
+	por x4, x0; \
+	pxor x3, x0; \
+	pand x2, x3; \
+	pxor x3, x4; \
+	pxor x1, x3; \
+	pand x0, x1; \
+	pxor x1, x4; \
+	pxor x3, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+	movdqa x1, x4; \
+	por x2, x1; \
+	pxor x4, x2; \
+	pxor x3, x1; \
+	pand x4, x3; \
+	pxor x3, x2; \
+	por x0, x3; \
+	pxor RNOT, x0; \
+	pxor x2, x3; \
+	por x0, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+	pxor x1, x4; \
+	pxor x4, x2; \
+	pand x0, x4; \
+	pxor x1, x0; \
+	pxor x3, x1; \
+	pand x2, x0; \
+	pxor x3, x2; \
+	pxor x2, x0; \
+	pxor x4, x2; \
+	pxor x3, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+	pxor x2, x0; \
+	movdqa x0, x4; \
+	pand x3, x0; \
+	pxor x3, x2; \
+	pxor x2, x0; \
+	pxor x1, x3; \
+	por x4, x2; \
+	pxor x3, x2; \
+	pand x0, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+	pxor RNOT, x0; \
+	pxor x1, x3; \
+	pand x2, x1; \
+	pxor x0, x4; \
+	pxor x4, x3; \
+	pxor x2, x4; \
+	pxor x1, x0; \
+	pxor x0, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+	movdqa x3, x4; \
+	pand x0, x3; \
+	pxor x2, x0; \
+	por x4, x2; \
+	pxor x1, x4; \
+	pxor RNOT, x0; \
+	por x3, x1; \
+	pxor x0, x4; \
+	pand x2, x0; \
+	pxor x1, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+	pand x2, x1; \
+	pxor x2, x3; \
+	pxor x3, x4; \
+	pand x3, x2; \
+	por x0, x3; \
+	pxor x4, x1; \
+	pxor x4, x3; \
+	pand x0, x4; \
+	pxor x2, x4;
+
+#define get_key(i, j, t) \
+	movd (4*(i)+(j))*4(CTX), t; \
+	pshufd $0, t, t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	get_key(i, 1, RK1); \
+	get_key(i, 2, RK2); \
+	get_key(i, 3, RK3); \
+	pxor RK0, x0 ## 1; \
+	pxor RK1, x1 ## 1; \
+	pxor RK2, x2 ## 1; \
+	pxor RK3, x3 ## 1; \
+	pxor RK0, x0 ## 2; \
+	pxor RK1, x1 ## 2; \
+	pxor RK2, x2 ## 2; \
+	pxor RK3, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+	movdqa x0 ## 1, x4 ## 1; \
+	pslld $13, x0 ## 1; \
+	psrld $(32 - 13), x4 ## 1; \
+	por x4 ## 1, x0 ## 1; \
+	pxor x0 ## 1, x1 ## 1; \
+	movdqa x2 ## 1, x4 ## 1; \
+	pslld $3, x2 ## 1; \
+	psrld $(32 - 3), x4 ## 1; \
+	por x4 ## 1, x2 ## 1; \
+	pxor x2 ## 1, x1 ## 1; \
+	movdqa x0 ## 2, x4 ## 2; \
+	pslld $13, x0 ## 2; \
+	psrld $(32 - 13), x4 ## 2; \
+	por x4 ## 2, x0 ## 2; \
+	pxor x0 ## 2, x1 ## 2; \
+	movdqa x2 ## 2, x4 ## 2; \
+	pslld $3, x2 ## 2; \
+	psrld $(32 - 3), x4 ## 2; \
+	por x4 ## 2, x2 ## 2; \
+	pxor x2 ## 2, x1 ## 2; \
+	movdqa x1 ## 1, x4 ## 1; \
+	pslld $1, x1 ## 1; \
+	psrld $(32 - 1), x4 ## 1; \
+	por x4 ## 1, x1 ## 1; \
+	movdqa x0 ## 1, x4 ## 1; \
+	pslld $3, x4 ## 1; \
+	pxor x2 ## 1, x3 ## 1; \
+	pxor x4 ## 1, x3 ## 1; \
+	movdqa x3 ## 1, x4 ## 1; \
+	get_key(i, 1, RK1); \
+	movdqa x1 ## 2, x4 ## 2; \
+	pslld $1, x1 ## 2; \
+	psrld $(32 - 1), x4 ## 2; \
+	por x4 ## 2, x1 ## 2; \
+	movdqa x0 ## 2, x4 ## 2; \
+	pslld $3, x4 ## 2; \
+	pxor x2 ## 2, x3 ## 2; \
+	pxor x4 ## 2, x3 ## 2; \
+	movdqa x3 ## 2, x4 ## 2; \
+	get_key(i, 3, RK3); \
+	pslld $7, x3 ## 1; \
+	psrld $(32 - 7), x4 ## 1; \
+	por x4 ## 1, x3 ## 1; \
+	movdqa x1 ## 1, x4 ## 1; \
+	pslld $7, x4 ## 1; \
+	pxor x1 ## 1, x0 ## 1; \
+	pxor x3 ## 1, x0 ## 1; \
+	pxor x3 ## 1, x2 ## 1; \
+	pxor x4 ## 1, x2 ## 1; \
+	get_key(i, 0, RK0); \
+	pslld $7, x3 ## 2; \
+	psrld $(32 - 7), x4 ## 2; \
+	por x4 ## 2, x3 ## 2; \
+	movdqa x1 ## 2, x4 ## 2; \
+	pslld $7, x4 ## 2; \
+	pxor x1 ## 2, x0 ## 2; \
+	pxor x3 ## 2, x0 ## 2; \
+	pxor x3 ## 2, x2 ## 2; \
+	pxor x4 ## 2, x2 ## 2; \
+	get_key(i, 2, RK2); \
+	pxor RK1, x1 ## 1; \
+	pxor RK3, x3 ## 1; \
+	movdqa x0 ## 1, x4 ## 1; \
+	pslld $5, x0 ## 1; \
+	psrld $(32 - 5), x4 ## 1; \
+	por x4 ## 1, x0 ## 1; \
+	movdqa x2 ## 1, x4 ## 1; \
+	pslld $22, x2 ## 1; \
+	psrld $(32 - 22), x4 ## 1; \
+	por x4 ## 1, x2 ## 1; \
+	pxor RK0, x0 ## 1; \
+	pxor RK2, x2 ## 1; \
+	pxor RK1, x1 ## 2; \
+	pxor RK3, x3 ## 2; \
+	movdqa x0 ## 2, x4 ## 2; \
+	pslld $5, x0 ## 2; \
+	psrld $(32 - 5), x4 ## 2; \
+	por x4 ## 2, x0 ## 2; \
+	movdqa x2 ## 2, x4 ## 2; \
+	pslld $22, x2 ## 2; \
+	psrld $(32 - 22), x4 ## 2; \
+	por x4 ## 2, x2 ## 2; \
+	pxor RK0, x0 ## 2; \
+	pxor RK2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+	pxor RK0, x0 ## 1; \
+	pxor RK2, x2 ## 1; \
+	movdqa x0 ## 1, x4 ## 1; \
+	psrld $5, x0 ## 1; \
+	pslld $(32 - 5), x4 ## 1; \
+	por x4 ## 1, x0 ## 1; \
+	pxor RK3, x3 ## 1; \
+	pxor RK1, x1 ## 1; \
+	movdqa x2 ## 1, x4 ## 1; \
+	psrld $22, x2 ## 1; \
+	pslld $(32 - 22), x4 ## 1; \
+	por x4 ## 1, x2 ## 1; \
+	pxor x3 ## 1, x2 ## 1; \
+	pxor RK0, x0 ## 2; \
+	pxor RK2, x2 ## 2; \
+	movdqa x0 ## 2, x4 ## 2; \
+	psrld $5, x0 ## 2; \
+	pslld $(32 - 5), x4 ## 2; \
+	por x4 ## 2, x0 ## 2; \
+	pxor RK3, x3 ## 2; \
+	pxor RK1, x1 ## 2; \
+	movdqa x2 ## 2, x4 ## 2; \
+	psrld $22, x2 ## 2; \
+	pslld $(32 - 22), x4 ## 2; \
+	por x4 ## 2, x2 ## 2; \
+	pxor x3 ## 2, x2 ## 2; \
+	pxor x3 ## 1, x0 ## 1; \
+	movdqa x1 ## 1, x4 ## 1; \
+	pslld $7, x4 ## 1; \
+	pxor x1 ## 1, x0 ## 1; \
+	pxor x4 ## 1, x2 ## 1; \
+	movdqa x1 ## 1, x4 ## 1; \
+	psrld $1, x1 ## 1; \
+	pslld $(32 - 1), x4 ## 1; \
+	por x4 ## 1, x1 ## 1; \
+	pxor x3 ## 2, x0 ## 2; \
+	movdqa x1 ## 2, x4 ## 2; \
+	pslld $7, x4 ## 2; \
+	pxor x1 ## 2, x0 ## 2; \
+	pxor x4 ## 2, x2 ## 2; \
+	movdqa x1 ## 2, x4 ## 2; \
+	psrld $1, x1 ## 2; \
+	pslld $(32 - 1), x4 ## 2; \
+	por x4 ## 2, x1 ## 2; \
+	movdqa x3 ## 1, x4 ## 1; \
+	psrld $7, x3 ## 1; \
+	pslld $(32 - 7), x4 ## 1; \
+	por x4 ## 1, x3 ## 1; \
+	pxor x0 ## 1, x1 ## 1; \
+	movdqa x0 ## 1, x4 ## 1; \
+	pslld $3, x4 ## 1; \
+	pxor x4 ## 1, x3 ## 1; \
+	movdqa x0 ## 1, x4 ## 1; \
+	movdqa x3 ## 2, x4 ## 2; \
+	psrld $7, x3 ## 2; \
+	pslld $(32 - 7), x4 ## 2; \
+	por x4 ## 2, x3 ## 2; \
+	pxor x0 ## 2, x1 ## 2; \
+	movdqa x0 ## 2, x4 ## 2; \
+	pslld $3, x4 ## 2; \
+	pxor x4 ## 2, x3 ## 2; \
+	movdqa x0 ## 2, x4 ## 2; \
+	psrld $13, x0 ## 1; \
+	pslld $(32 - 13), x4 ## 1; \
+	por x4 ## 1, x0 ## 1; \
+	pxor x2 ## 1, x1 ## 1; \
+	pxor x2 ## 1, x3 ## 1; \
+	movdqa x2 ## 1, x4 ## 1; \
+	psrld $3, x2 ## 1; \
+	pslld $(32 - 3), x4 ## 1; \
+	por x4 ## 1, x2 ## 1; \
+	psrld $13, x0 ## 2; \
+	pslld $(32 - 13), x4 ## 2; \
+	por x4 ## 2, x0 ## 2; \
+	pxor x2 ## 2, x1 ## 2; \
+	pxor x2 ## 2, x3 ## 2; \
+	movdqa x2 ## 2, x4 ## 2; \
+	psrld $3, x2 ## 2; \
+	pslld $(32 - 3), x4 ## 2; \
+	por x4 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 2, RK2); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	get_key(i, 3, RK3); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 1, RK1); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+	movdqa x2, t3; \
+	movdqa x0, t1; \
+	unpcklps x3, t3; \
+	movdqa x0, t2; \
+	unpcklps x1, t1; \
+	unpckhps x1, t2; \
+	movdqa t3, x1; \
+	unpckhps x3, x2; \
+	movdqa t1, x0; \
+	movhlps t1, x1; \
+	movdqa t2, t1; \
+	movlhps t3, x0; \
+	movlhps x2, t1; \
+	movhlps t2, x2; \
+	movdqa x2, x3; \
+	movdqa t1, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+	movdqu (0*4*4)(in), x0; \
+	movdqu (1*4*4)(in), x1; \
+	movdqu (2*4*4)(in), x2; \
+	movdqu (3*4*4)(in), x3; \
+	\
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu x0, (0*4*4)(out); \
+	movdqu x1, (1*4*4)(out); \
+	movdqu x2, (2*4*4)(out); \
+	movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu (0*4*4)(out), t0; \
+	pxor t0, x0; \
+	movdqu x0, (0*4*4)(out); \
+	movdqu (1*4*4)(out), t0; \
+	pxor t0, x1; \
+	movdqu x1, (1*4*4)(out); \
+	movdqu (2*4*4)(out), t0; \
+	pxor t0, x2; \
+	movdqu x2, (2*4*4)(out); \
+	movdqu (3*4*4)(out), t0; \
+	pxor t0, x3; \
+	movdqu x3, (3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_8way
+.type __serpent_enc_blk_8way,@function;
+
+__serpent_enc_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool, if true: xor output
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	leaq (4*4*4)(%rdx), %rax;
+	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	K2(RA, RB, RC, RD, RE, 0);
+	S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+	S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+	S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+	S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+	S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+	S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+	S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+	S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+	S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+	S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+	S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+	S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+	S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+	S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+	S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+	S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+	S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+	S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+	S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+	S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+	S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+	S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+	S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+	S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+	S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+	S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+	S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+	S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+	S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+	S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+	S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+	S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+	leaq (4*4*4)(%rsi), %rax;
+
+	testb %cl, %cl;
+	jnz __enc_xor8;
+
+	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+
+__enc_xor8:
+	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+
+.align 8
+.global serpent_dec_blk_8way
+.type serpent_dec_blk_8way,@function;
+
+serpent_dec_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	leaq (4*4*4)(%rdx), %rax;
+	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	K2(RA, RB, RC, RD, RE, 32);
+	SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+	SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+	SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+	SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+	SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+	SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+	SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+	SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+	SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+	SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+	SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+	SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+	SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+	SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+	SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+	SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+	SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+	SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+	SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+	SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+	SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+	SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+	SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+	SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+	SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+	SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+	SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+	SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+	SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+	SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+	SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+	S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+	leaq (4*4*4)(%rsi), %rax;
+	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 000000000000..947cf570f6a7
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,719 @@
+/*
+ * Glue Code for SSE2 assembler versions of Serpent Cipher
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Glue code based on aesni-intel_glue.c by:
+ *  Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <asm/i387.h>
+#include <asm/serpent.h>
+#include <crypto/scatterwalk.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+
+struct async_serpent_ctx {
+	struct cryptd_ablkcipher *cryptd_tfm;
+};
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	if (fpu_enabled)
+		return true;
+
+	/* SSE2 is only used when the chunk to be processed is large enough,
+	 * so do not enable FPU until it is necessary.
+	 */
+	if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
+		return false;
+
+	kernel_fpu_begin();
+	return true;
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	if (fpu_enabled)
+		kernel_fpu_end();
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+		     bool enc)
+{
+	bool fpu_enabled = false;
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+
+		/* Process multi-block batch */
+		if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					serpent_enc_blk_xway(ctx, wdst, wsrc);
+				else
+					serpent_dec_blk_xway(ctx, wdst, wsrc);
+
+				wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
+				wdst += bsize * SERPENT_PARALLEL_BLOCKS;
+				nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Handle leftovers */
+		do {
+			if (enc)
+				__serpent_encrypt(ctx, wdst, wsrc);
+			else
+				__serpent_decrypt(ctx, wdst, wsrc);
+
+			wsrc += bsize;
+			wdst += bsize;
+			nbytes -= bsize;
+		} while (nbytes >= bsize);
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, false);
+}
+
+static struct crypto_alg blk_ecb_alg = {
+	.cra_name		= "__ecb-serpent-sse2",
+	.cra_driver_name	= "__driver-ecb-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ecb_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+};
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 *iv = (u128 *)walk->iv;
+
+	do {
+		u128_xor(dst, src, iv);
+		__serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
+		iv = dst;
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
+	return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		nbytes = __cbc_encrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+	u128 last_iv;
+	int i;
+
+	/* Start of the last block. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src;
+
+	/* Process multi-block batch */
+	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+		do {
+			nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
+			src -= SERPENT_PARALLEL_BLOCKS - 1;
+			dst -= SERPENT_PARALLEL_BLOCKS - 1;
+
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+				ivs[i] = src[i];
+
+			serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+				u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			u128_xor(dst, dst, src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	for (;;) {
+		__serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
+
+		nbytes -= bsize;
+		if (nbytes < bsize)
+			break;
+
+		u128_xor(dst, dst, src - 1);
+		src -= 1;
+		dst -= 1;
+	}
+
+done:
+	u128_xor(dst, dst, (u128 *)walk->iv);
+	*(u128 *)walk->iv = last_iv;
+
+	return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes)) {
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __cbc_decrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+	return err;
+}
+
+static struct crypto_alg blk_cbc_alg = {
+	.cra_name		= "__cbc-serpent-sse2",
+	.cra_driver_name	= "__driver-cbc-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+};
+
+static inline void u128_to_be128(be128 *dst, const u128 *src)
+{
+	dst->a = cpu_to_be64(src->a);
+	dst->b = cpu_to_be64(src->b);
+}
+
+static inline void be128_to_u128(u128 *dst, const be128 *src)
+{
+	dst->a = be64_to_cpu(src->a);
+	dst->b = be64_to_cpu(src->b);
+}
+
+static inline void u128_inc(u128 *i)
+{
+	i->b++;
+	if (!i->b)
+		i->a++;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+			    struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 *ctrblk = walk->iv;
+	u8 keystream[SERPENT_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	__serpent_encrypt(ctx, keystream, ctrblk);
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes);
+
+	crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+				struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ctrblk;
+	be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
+	int i;
+
+	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+	/* Process multi-block batch */
+	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+		do {
+			/* create ctrblks for parallel encrypt */
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+				if (dst != src)
+					dst[i] = src[i];
+
+				u128_to_be128(&ctrblocks[i], &ctrblk);
+				u128_inc(&ctrblk);
+			}
+
+			serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
+						 (u8 *)ctrblocks);
+
+			src += SERPENT_PARALLEL_BLOCKS;
+			dst += SERPENT_PARALLEL_BLOCKS;
+			nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	do {
+		if (dst != src)
+			*dst = *src;
+
+		u128_to_be128(&ctrblocks[0], &ctrblk);
+		u128_inc(&ctrblk);
+
+		__serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+		u128_xor(dst, dst, (u128 *)ctrblocks);
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+done:
+	u128_to_be128((be128 *)walk->iv, &ctrblk);
+	return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __ctr_crypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+
+	if (walk.nbytes) {
+		ctr_crypt_final(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+
+static struct crypto_alg blk_ctr_alg = {
+	.cra_name		= "__ctr-serpent-sse2",
+	.cra_driver_name	= "__driver-ctr-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+};
+
+static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+			unsigned int key_len)
+{
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+	int err;
+
+	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+				    & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(child, key, key_len);
+	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+				    & CRYPTO_TFM_RES_MASK);
+	return err;
+}
+
+static int __ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct blkcipher_desc desc;
+
+	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+	desc.info = req->info;
+	desc.flags = 0;
+
+	return crypto_blkcipher_crt(desc.tfm)->encrypt(
+		&desc, req->dst, req->src, req->nbytes);
+}
+
+static int ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_encrypt(cryptd_req);
+	} else {
+		return __ablk_encrypt(req);
+	}
+}
+
+static int ablk_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_decrypt(cryptd_req);
+	} else {
+		struct blkcipher_desc desc;
+
+		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+		desc.info = req->info;
+		desc.flags = 0;
+
+		return crypto_blkcipher_crt(desc.tfm)->decrypt(
+			&desc, req->dst, req->src, req->nbytes);
+	}
+}
+
+static void ablk_exit(struct crypto_tfm *tfm)
+{
+	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+
+static void ablk_init_common(struct crypto_tfm *tfm,
+			     struct cryptd_ablkcipher *cryptd_tfm)
+{
+	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+}
+
+static int ablk_ecb_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ecb_alg = {
+	.cra_name		= "ecb(serpent)",
+	.cra_driver_name	= "ecb-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
+	.cra_init		= ablk_ecb_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+static int ablk_cbc_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_cbc_alg = {
+	.cra_name		= "cbc(serpent)",
+	.cra_driver_name	= "cbc-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
+	.cra_init		= ablk_cbc_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+	.cra_name		= "ctr(serpent)",
+	.cra_driver_name	= "ctr-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+	.cra_init		= ablk_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+};
+
+static int __init serpent_sse2_init(void)
+{
+	int err;
+
+	if (!cpu_has_xmm2) {
+		printk(KERN_INFO "SSE2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	err = crypto_register_alg(&blk_ecb_alg);
+	if (err)
+		goto blk_ecb_err;
+	err = crypto_register_alg(&blk_cbc_alg);
+	if (err)
+		goto blk_cbc_err;
+	err = crypto_register_alg(&blk_ctr_alg);
+	if (err)
+		goto blk_ctr_err;
+	err = crypto_register_alg(&ablk_ecb_alg);
+	if (err)
+		goto ablk_ecb_err;
+	err = crypto_register_alg(&ablk_cbc_alg);
+	if (err)
+		goto ablk_cbc_err;
+	err = crypto_register_alg(&ablk_ctr_alg);
+	if (err)
+		goto ablk_ctr_err;
+	return err;
+
+ablk_ctr_err:
+	crypto_unregister_alg(&ablk_cbc_alg);
+ablk_cbc_err:
+	crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
+	crypto_unregister_alg(&blk_cbc_alg);
+blk_cbc_err:
+	crypto_unregister_alg(&blk_ecb_alg);
+blk_ecb_err:
+	return err;
+}
+
+static void __exit serpent_sse2_exit(void)
+{
+	crypto_unregister_alg(&ablk_ctr_alg);
+	crypto_unregister_alg(&ablk_cbc_alg);
+	crypto_unregister_alg(&ablk_ecb_alg);
+	crypto_unregister_alg(&blk_ctr_alg);
+	crypto_unregister_alg(&blk_cbc_alg);
+	crypto_unregister_alg(&blk_ecb_alg);
+}
+
+module_init(serpent_sse2_init);
+module_exit(serpent_sse2_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("serpent");
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
new file mode 100644
index 000000000000..b7fd3b595b27
--- /dev/null
+++ b/arch/x86/include/asm/serpent.h
@@ -0,0 +1,32 @@
+#ifndef ASM_X86_SERPENT_H
+#define ASM_X86_SERPENT_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+				       const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__serpent_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+					    const u8 *src)
+{
+	__serpent_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	serpent_dec_blk_8way(ctx, dst, src);
+}
+
+#endif
32#endif