aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 2012-06-12 04:47:43 -0400
committer Herbert Xu <herbert@gondor.apana.org.au> 2012-06-12 04:47:43 -0400
commit 7efe4076725aeb01722445b56613681aa492c8d6 (patch)
tree 138a24608d523e9794012586a904be0360cfebce
parent 4da7de4d8be7d18559c56bca446b1161a3b63acc (diff)
crypto: serpent - add x86_64/avx assembler implementation
This patch adds an x86_64/avx assembler implementation of the Serpent block
cipher. The implementation is very similar to the sse2 implementation and
processes eight blocks in parallel. Because of the new non-destructive three
operand syntax all move-instructions can be removed and therefore a little
performance increase is provided.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

serpent-avx-x86_64 vs. serpent-sse2-x86_64
128bit key: (lrw:256bit) (xts:256bit)
size   ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B    1.03x   1.01x   1.01x   1.01x   1.00x   1.00x   1.00x   1.00x   1.00x   1.01x
64B    1.00x   1.00x   1.00x   1.00x   1.00x   0.99x   1.00x   1.01x   1.00x   1.00x
256B   1.05x   1.03x   1.00x   1.02x   1.05x   1.06x   1.05x   1.02x   1.05x   1.02x
1024B  1.05x   1.02x   1.00x   1.02x   1.05x   1.06x   1.05x   1.03x   1.05x   1.02x
8192B  1.05x   1.02x   1.00x   1.02x   1.06x   1.06x   1.04x   1.03x   1.04x   1.02x

256bit key: (lrw:384bit) (xts:512bit)
size   ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B    1.01x   1.00x   1.01x   1.01x   1.00x   1.00x   0.99x   1.03x   1.01x   1.01x
64B    1.00x   1.00x   1.00x   1.00x   1.00x   1.00x   1.00x   1.01x   1.00x   1.02x
256B   1.05x   1.02x   1.00x   1.02x   1.05x   1.02x   1.04x   1.05x   1.05x   1.02x
1024B  1.06x   1.02x   1.00x   1.02x   1.07x   1.06x   1.05x   1.04x   1.05x   1.02x
8192B  1.05x   1.02x   1.00x   1.02x   1.06x   1.06x   1.04x   1.05x   1.05x   1.02x

serpent-avx-x86_64 vs aes-asm (8kB block):
         128bit  256bit
ecb-enc  1.26x   1.73x
ecb-dec  1.20x   1.64x
cbc-enc  0.33x   0.45x
cbc-dec  1.24x   1.67x
ctr-enc  1.32x   1.76x
ctr-dec  1.32x   1.76x
lrw-enc  1.20x   1.60x
lrw-dec  1.15x   1.54x
xts-enc  1.22x   1.64x
xts-dec  1.17x   1.57x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r-- arch/x86/crypto/Makefile | 2
-rw-r--r-- arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 704
-rw-r--r-- arch/x86/crypto/serpent_avx_glue.c | 949
-rw-r--r-- crypto/Kconfig | 20
-rw-r--r-- crypto/testmgr.c | 60
5 files changed, 1735 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3420feef0c7..83caa4b948c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
15obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o 15obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
16obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 16obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
17obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o 17obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
18obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
18obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 19obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
19obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o 20obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
20 21
@@ -34,6 +35,7 @@ twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
34twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o 35twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
35salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 36salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
36serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o 37serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
38serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
37 39
38aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 40aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
39ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 41ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
new file mode 100644
index 00000000000..0ed47a124ba
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,704 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-avx-x86_64-asm_64.S"
28.text
29
/* CTX: first argument register; both entry points below document it as the
 * cipher context pointer that get_key() reads subkey words from. */
30#define CTX %rdi
31
32/**********************************************************************
33 8-way AVX serpent
34 **********************************************************************/
/* Register group 1: state words of the first four blocks (loaded from src).
 * RE1 is the fifth register that the round macros receive as x4. */
35#define RA1 %xmm0
36#define RB1 %xmm1
37#define RC1 %xmm2
38#define RD1 %xmm3
39#define RE1 %xmm4
40
/* tp: scratch register shared by all S-box macros (used for both groups). */
41#define tp %xmm5
42
/* Register group 2: state words of the second four blocks (src + 64 bytes). */
43#define RA2 %xmm6
44#define RB2 %xmm7
45#define RC2 %xmm8
46#define RD2 %xmm9
47#define RE2 %xmm10
48
/* RNOT: set to all ones (vpcmpeqd RNOT, RNOT, RNOT) at function entry, so
 * XORing with it yields a bitwise NOT. */
49#define RNOT %xmm11
50
/* RK0..RK3: the four broadcast words of the current subkey (see get_key).
 * RK0..RK2 double as transpose temporaries in read/write/xor_blocks. */
51#define RK0 %xmm12
52#define RK1 %xmm13
53#define RK2 %xmm14
54#define RK3 %xmm15
55
56
/* S0, in two halves (S0_1 then S0_2), expressed purely as vpand/vpor/vpxor on
 * whole registers. x0..x3 are the state words; x4 is written before it is
 * read, so its incoming value is ignored. tp is scratch and is only used
 * inside S0_1. RNOT (all ones) is XORed in where a complement is needed.
 * Used by the encryption rounds via S(S0, ...). */
57#define S0_1(x0, x1, x2, x3, x4) \
58 vpor x0, x3, tp; \
59 vpxor x3, x0, x0; \
60 vpxor x2, x3, x4; \
61 vpxor RNOT, x4, x4; \
62 vpxor x1, tp, x3; \
63 vpand x0, x1, x1; \
64 vpxor x4, x1, x1; \
65 vpxor x0, x2, x2;
66#define S0_2(x0, x1, x2, x3, x4) \
67 vpxor x3, x0, x0; \
68 vpor x0, x4, x4; \
69 vpxor x2, x0, x0; \
70 vpand x1, x2, x2; \
71 vpxor x2, x3, x3; \
72 vpxor RNOT, x1, x1; \
73 vpxor x4, x2, x2; \
74 vpxor x2, x1, x1;
75
/* S1, in two halves. x0..x3 are the state words, x4 is written before it is
 * read, tp is scratch used only inside S1_1, and RNOT supplies the complement.
 * Used by the encryption rounds via S(S1, ...). */
76#define S1_1(x0, x1, x2, x3, x4) \
77 vpxor x0, x1, tp; \
78 vpxor x3, x0, x0; \
79 vpxor RNOT, x3, x3; \
80 vpand tp, x1, x4; \
81 vpor tp, x0, x0; \
82 vpxor x2, x3, x3; \
83 vpxor x3, x0, x0; \
84 vpxor x3, tp, x1;
85#define S1_2(x0, x1, x2, x3, x4) \
86 vpxor x4, x3, x3; \
87 vpor x4, x1, x1; \
88 vpxor x2, x4, x4; \
89 vpand x0, x2, x2; \
90 vpxor x1, x2, x2; \
91 vpor x0, x1, x1; \
92 vpxor RNOT, x0, x0; \
93 vpxor x2, x0, x0; \
94 vpxor x1, x4, x4;
95
/* S2, in two halves. x0..x3 are the state words; x4 is first written in S2_2.
 * S2_2 consumes the tp value left behind by S2_1, so nothing that runs between
 * the two halves may clobber tp. Used by the encryption rounds via S(S2, ...). */
96#define S2_1(x0, x1, x2, x3, x4) \
97 vpxor RNOT, x3, x3; \
98 vpxor x0, x1, x1; \
99 vpand x2, x0, tp; \
100 vpxor x3, tp, tp; \
101 vpor x0, x3, x3; \
102 vpxor x1, x2, x2; \
103 vpxor x1, x3, x3; \
104 vpand tp, x1, x1;
105#define S2_2(x0, x1, x2, x3, x4) \
106 vpxor x2, tp, tp; \
107 vpand x3, x2, x2; \
108 vpor x1, x3, x3; \
109 vpxor RNOT, tp, tp; \
110 vpxor tp, x3, x3; \
111 vpxor tp, x0, x4; \
112 vpxor x2, tp, x0; \
113 vpor x2, x1, x1;
114
/* S3, in two halves. x0..x3 are the state words, x4 is written before it is
 * read, and tp is scratch used only inside S3_1. This S-box needs no
 * complement, so RNOT is not referenced. Used via S(S3, ...). */
115#define S3_1(x0, x1, x2, x3, x4) \
116 vpxor x3, x1, tp; \
117 vpor x0, x3, x3; \
118 vpand x0, x1, x4; \
119 vpxor x2, x0, x0; \
120 vpxor tp, x2, x2; \
121 vpand x3, tp, x1; \
122 vpxor x3, x2, x2; \
123 vpor x4, x0, x0; \
124 vpxor x3, x4, x4;
125#define S3_2(x0, x1, x2, x3, x4) \
126 vpxor x0, x1, x1; \
127 vpand x3, x0, x0; \
128 vpand x4, x3, x3; \
129 vpxor x2, x3, x3; \
130 vpor x1, x4, x4; \
131 vpand x1, x2, x2; \
132 vpxor x3, x4, x4; \
133 vpxor x3, x0, x0; \
134 vpxor x2, x3, x3;
135
/* S4, in two halves. x0..x3 are the state words and x4 is written before it
 * is read. S4_2 consumes the tp value left behind by S4_1, so tp must survive
 * between the halves. RNOT supplies the complement. Used via S(S4, ...). */
136#define S4_1(x0, x1, x2, x3, x4) \
137 vpand x0, x3, tp; \
138 vpxor x3, x0, x0; \
139 vpxor x2, tp, tp; \
140 vpor x3, x2, x2; \
141 vpxor x1, x0, x0; \
142 vpxor tp, x3, x4; \
143 vpor x0, x2, x2; \
144 vpxor x1, x2, x2;
145#define S4_2(x0, x1, x2, x3, x4) \
146 vpand x0, x1, x1; \
147 vpxor x4, x1, x1; \
148 vpand x2, x4, x4; \
149 vpxor tp, x2, x2; \
150 vpxor x0, x4, x4; \
151 vpor x1, tp, x3; \
152 vpxor RNOT, x1, x1; \
153 vpxor x0, x3, x3;
154
/* S5, in two halves. x0..x3 are the state words, x4 is written before it is
 * read, tp is scratch used only inside S5_1, and RNOT supplies the complement.
 * Used by the encryption rounds via S(S5, ...). */
155#define S5_1(x0, x1, x2, x3, x4) \
156 vpor x0, x1, tp; \
157 vpxor tp, x2, x2; \
158 vpxor RNOT, x3, x3; \
159 vpxor x0, x1, x4; \
160 vpxor x2, x0, x0; \
161 vpand x4, tp, x1; \
162 vpor x3, x4, x4; \
163 vpxor x0, x4, x4;
164#define S5_2(x0, x1, x2, x3, x4) \
165 vpand x3, x0, x0; \
166 vpxor x3, x1, x1; \
167 vpxor x2, x3, x3; \
168 vpxor x1, x0, x0; \
169 vpand x4, x2, x2; \
170 vpxor x2, x1, x1; \
171 vpand x0, x2, x2; \
172 vpxor x2, x3, x3;
173
/* S6, in two halves. x0..x3 are the state words, x4 is written before it is
 * read (as the complement of x1), and tp is scratch used only inside S6_1.
 * Used by the encryption rounds via S(S6, ...). */
174#define S6_1(x0, x1, x2, x3, x4) \
175 vpxor x0, x3, x3; \
176 vpxor x2, x1, tp; \
177 vpxor x0, x2, x2; \
178 vpand x3, x0, x0; \
179 vpor x3, tp, tp; \
180 vpxor RNOT, x1, x4; \
181 vpxor tp, x0, x0; \
182 vpxor x2, tp, x1;
183#define S6_2(x0, x1, x2, x3, x4) \
184 vpxor x4, x3, x3; \
185 vpxor x0, x4, x4; \
186 vpand x0, x2, x2; \
187 vpxor x1, x4, x4; \
188 vpxor x3, x2, x2; \
189 vpand x1, x3, x3; \
190 vpxor x0, x3, x3; \
191 vpxor x2, x1, x1;
192
/* S7, in two halves. x0..x3 are the state words, x4 is written before it is
 * read, tp is scratch used only inside S7_1, and RNOT supplies the complement.
 * Used by the encryption rounds via S(S7, ...). */
193#define S7_1(x0, x1, x2, x3, x4) \
194 vpxor RNOT, x1, tp; \
195 vpxor RNOT, x0, x0; \
196 vpand x2, tp, x1; \
197 vpxor x3, x1, x1; \
198 vpor tp, x3, x3; \
199 vpxor x2, tp, x4; \
200 vpxor x3, x2, x2; \
201 vpxor x0, x3, x3; \
202 vpor x1, x0, x0;
203#define S7_2(x0, x1, x2, x3, x4) \
204 vpand x0, x2, x2; \
205 vpxor x4, x0, x0; \
206 vpxor x3, x4, x4; \
207 vpand x0, x3, x3; \
208 vpxor x1, x4, x4; \
209 vpxor x4, x2, x2; \
210 vpxor x1, x3, x3; \
211 vpor x0, x4, x4; \
212 vpxor x1, x4, x4;
213
/* SI0 (presumably the inverse of S0), in two halves. x0..x3 are the state
 * words, x4 is written before it is read, tp is scratch used only inside
 * SI0_1, and RNOT supplies the complement. Used by the decryption path as the
 * final S-box via S(SI0, ...) and in earlier rounds via SP(SI0, ...). */
214#define SI0_1(x0, x1, x2, x3, x4) \
215 vpxor x0, x1, x1; \
216 vpor x1, x3, tp; \
217 vpxor x1, x3, x4; \
218 vpxor RNOT, x0, x0; \
219 vpxor tp, x2, x2; \
220 vpxor x0, tp, x3; \
221 vpand x1, x0, x0; \
222 vpxor x2, x0, x0;
223#define SI0_2(x0, x1, x2, x3, x4) \
224 vpand x3, x2, x2; \
225 vpxor x4, x3, x3; \
226 vpxor x3, x2, x2; \
227 vpxor x3, x1, x1; \
228 vpand x0, x3, x3; \
229 vpxor x0, x1, x1; \
230 vpxor x2, x0, x0; \
231 vpxor x3, x4, x4;
232
/* SI1 (presumably the inverse of S1), in two halves. x0..x3 are the state
 * words and x4 is written before it is read. SI1_2 consumes the tp value left
 * behind by SI1_1, so tp must survive between the halves. Used by the
 * decryption rounds via SP(SI1, ...). */
233#define SI1_1(x0, x1, x2, x3, x4) \
234 vpxor x3, x1, x1; \
235 vpxor x2, x0, tp; \
236 vpxor RNOT, x2, x2; \
237 vpor x1, x0, x4; \
238 vpxor x3, x4, x4; \
239 vpand x1, x3, x3; \
240 vpxor x2, x1, x1; \
241 vpand x4, x2, x2;
242#define SI1_2(x0, x1, x2, x3, x4) \
243 vpxor x1, x4, x4; \
244 vpor x3, x1, x1; \
245 vpxor tp, x3, x3; \
246 vpxor tp, x2, x2; \
247 vpor x4, tp, x0; \
248 vpxor x4, x2, x2; \
249 vpxor x0, x1, x1; \
250 vpxor x1, x4, x4;
251
/* SI2 (presumably the inverse of S2), in two halves. x0..x3 are the state
 * words, x4 is written before it is read, tp is scratch used only inside
 * SI2_1, and RNOT supplies the complement. Used via SP(SI2, ...). */
252#define SI2_1(x0, x1, x2, x3, x4) \
253 vpxor x1, x2, x2; \
254 vpxor RNOT, x3, tp; \
255 vpor x2, tp, tp; \
256 vpxor x3, x2, x2; \
257 vpxor x0, x3, x4; \
258 vpxor x1, tp, x3; \
259 vpor x2, x1, x1; \
260 vpxor x0, x2, x2;
261#define SI2_2(x0, x1, x2, x3, x4) \
262 vpxor x4, x1, x1; \
263 vpor x3, x4, x4; \
264 vpxor x3, x2, x2; \
265 vpxor x2, x4, x4; \
266 vpand x1, x2, x2; \
267 vpxor x3, x2, x2; \
268 vpxor x4, x3, x3; \
269 vpxor x0, x4, x4;
270
/* SI3 (presumably the inverse of S3), in two halves. x0..x3 are the state
 * words, x4 is written before it is read, and tp is scratch used only inside
 * SI3_1. RNOT is not referenced. Used via SP(SI3, ...). */
271#define SI3_1(x0, x1, x2, x3, x4) \
272 vpxor x1, x2, x2; \
273 vpand x2, x1, tp; \
274 vpxor x0, tp, tp; \
275 vpor x1, x0, x0; \
276 vpxor x3, x1, x4; \
277 vpxor x3, x0, x0; \
278 vpor tp, x3, x3; \
279 vpxor x2, tp, x1;
280#define SI3_2(x0, x1, x2, x3, x4) \
281 vpxor x3, x1, x1; \
282 vpxor x2, x0, x0; \
283 vpxor x3, x2, x2; \
284 vpand x1, x3, x3; \
285 vpxor x0, x1, x1; \
286 vpand x2, x0, x0; \
287 vpxor x3, x4, x4; \
288 vpxor x0, x3, x3; \
289 vpxor x1, x0, x0;
290
/* SI4 (presumably the inverse of S4), in two halves. x0..x3 are the state
 * words, x4 is written before it is read (as the complement of x0), and tp is
 * scratch used only inside SI4_1. Used via SP(SI4, ...). */
291#define SI4_1(x0, x1, x2, x3, x4) \
292 vpxor x3, x2, x2; \
293 vpand x1, x0, tp; \
294 vpxor x2, tp, tp; \
295 vpor x3, x2, x2; \
296 vpxor RNOT, x0, x4; \
297 vpxor tp, x1, x1; \
298 vpxor x2, tp, x0; \
299 vpand x4, x2, x2;
300#define SI4_2(x0, x1, x2, x3, x4) \
301 vpxor x0, x2, x2; \
302 vpor x4, x0, x0; \
303 vpxor x3, x0, x0; \
304 vpand x2, x3, x3; \
305 vpxor x3, x4, x4; \
306 vpxor x1, x3, x3; \
307 vpand x0, x1, x1; \
308 vpxor x1, x4, x4; \
309 vpxor x3, x0, x0;
310
/* SI5 (presumably the inverse of S5), in two halves. x0..x3 are the state
 * words; x4 is first written in SI5_2. SI5_2 consumes the tp value left behind
 * by SI5_1, so tp must survive between the halves. RNOT supplies the
 * complement. Used via SP(SI5, ...). */
311#define SI5_1(x0, x1, x2, x3, x4) \
312 vpor x2, x1, tp; \
313 vpxor x1, x2, x2; \
314 vpxor x3, tp, tp; \
315 vpand x1, x3, x3; \
316 vpxor x3, x2, x2; \
317 vpor x0, x3, x3; \
318 vpxor RNOT, x0, x0; \
319 vpxor x2, x3, x3; \
320 vpor x0, x2, x2;
321#define SI5_2(x0, x1, x2, x3, x4) \
322 vpxor tp, x1, x4; \
323 vpxor x4, x2, x2; \
324 vpand x0, x4, x4; \
325 vpxor tp, x0, x0; \
326 vpxor x3, tp, x1; \
327 vpand x2, x0, x0; \
328 vpxor x3, x2, x2; \
329 vpxor x2, x0, x0; \
330 vpxor x4, x2, x2; \
331 vpxor x3, x4, x4;
332
/* SI6 (presumably the inverse of S6), in two halves. x0..x3 are the state
 * words; x4 is first written in SI6_2. SI6_2 consumes the tp value left behind
 * by SI6_1, so tp must survive between the halves. RNOT supplies the
 * complement. Used via SP(SI6, ...). */
333#define SI6_1(x0, x1, x2, x3, x4) \
334 vpxor x2, x0, x0; \
335 vpand x3, x0, tp; \
336 vpxor x3, x2, x2; \
337 vpxor x2, tp, tp; \
338 vpxor x1, x3, x3; \
339 vpor x0, x2, x2; \
340 vpxor x3, x2, x2; \
341 vpand tp, x3, x3;
342#define SI6_2(x0, x1, x2, x3, x4) \
343 vpxor RNOT, tp, tp; \
344 vpxor x1, x3, x3; \
345 vpand x2, x1, x1; \
346 vpxor tp, x0, x4; \
347 vpxor x4, x3, x3; \
348 vpxor x2, x4, x4; \
349 vpxor x1, tp, x0; \
350 vpxor x0, x2, x2;
351
/* SI7 (presumably the inverse of S7), in two halves. x0..x3 are the state
 * words and x4 is written before it is read. SI7_2 consumes the tp value left
 * behind by SI7_1, so tp must survive between the halves. RNOT supplies the
 * complement. Used via SP(SI7, ...). */
352#define SI7_1(x0, x1, x2, x3, x4) \
353 vpand x0, x3, tp; \
354 vpxor x2, x0, x0; \
355 vpor x3, x2, x2; \
356 vpxor x1, x3, x4; \
357 vpxor RNOT, x0, x0; \
358 vpor tp, x1, x1; \
359 vpxor x0, x4, x4; \
360 vpand x2, x0, x0; \
361 vpxor x1, x0, x0;
362#define SI7_2(x0, x1, x2, x3, x4) \
363 vpand x2, x1, x1; \
364 vpxor x2, tp, x3; \
365 vpxor x3, x4, x4; \
366 vpand x3, x2, x2; \
367 vpor x0, x3, x3; \
368 vpxor x4, x1, x1; \
369 vpxor x4, x3, x3; \
370 vpand x0, x4, x4; \
371 vpxor x2, x4, x4;
372
/* get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from CTX
 * and broadcast it to every 32-bit lane of t. Callers pass i = 0..32 and
 * j = 0..3, i.e. word j of subkey i; this presumes the expanded key is an
 * array of 32-bit words at the very start of the context - confirm against the
 * context struct. */
373#define get_key(i, j, t) \
374 vbroadcastss (4*(i)+(j))*4(CTX), t;
375
/* K2: key mixing only, for both register groups. Loads the four words of
 * subkey i into RK0..RK3 and XORs them into x0..x3 of group 1 and group 2.
 * The arguments are register base names (e.g. RA); the "1"/"2" suffix is
 * pasted on here. x4 is accepted for signature symmetry but not used. */
376#define K2(x0, x1, x2, x3, x4, i) \
377 get_key(i, 0, RK0); \
378 get_key(i, 1, RK1); \
379 get_key(i, 2, RK2); \
380 get_key(i, 3, RK3); \
381 vpxor RK0, x0 ## 1, x0 ## 1; \
382 vpxor RK1, x1 ## 1, x1 ## 1; \
383 vpxor RK2, x2 ## 1, x2 ## 1; \
384 vpxor RK3, x3 ## 1, x3 ## 1; \
385 vpxor RK0, x0 ## 2, x0 ## 2; \
386 vpxor RK1, x1 ## 2, x1 ## 2; \
387 vpxor RK2, x2 ## 2, x2 ## 2; \
388 vpxor RK3, x3 ## 2, x3 ## 2;
389
/* LK2: linear transformation followed by key mixing with subkey i, applied to
 * both register groups (encryption direction).
 *
 * Per group, in order:
 *   x0 = rol(x0, 13); x1 ^= x0; x2 = rol(x2, 3); x1 ^= x2;
 *   x1 = rol(x1, 1);  x3 ^= x2 ^ (x0 << 3);
 *   x3 = rol(x3, 7);  x0 ^= x1 ^ x3; x2 ^= x3 ^ (x1 << 7);
 *   x1 ^= RK1; x3 ^= RK3; x0 = rol(x0, 5); x2 = rol(x2, 22);
 *   x0 ^= RK0; x2 ^= RK2;
 *
 * Each rotate is built from a left shift into x4, a right shift in place and
 * a vpor, so x4 is clobbered as scratch. The get_key loads for subkey i are
 * spread between the arithmetic steps, each before the first use of its RKn.
 * The arguments are register base names; the "1"/"2" suffix is pasted on here. */
390#define LK2(x0, x1, x2, x3, x4, i) \
391 vpslld $13, x0 ## 1, x4 ## 1; \
392 vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
393 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
394 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
395 vpslld $3, x2 ## 1, x4 ## 1; \
396 vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
397 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
398 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
399 vpslld $13, x0 ## 2, x4 ## 2; \
400 vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
401 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
402 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
403 vpslld $3, x2 ## 2, x4 ## 2; \
404 vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
405 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
406 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
407 vpslld $1, x1 ## 1, x4 ## 1; \
408 vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
409 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
410 vpslld $3, x0 ## 1, x4 ## 1; \
411 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
412 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
413 get_key(i, 1, RK1); \
414 vpslld $1, x1 ## 2, x4 ## 2; \
415 vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
416 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
417 vpslld $3, x0 ## 2, x4 ## 2; \
418 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
419 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
420 get_key(i, 3, RK3); \
421 vpslld $7, x3 ## 1, x4 ## 1; \
422 vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
423 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
424 vpslld $7, x1 ## 1, x4 ## 1; \
425 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
426 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
427 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
428 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
429 get_key(i, 0, RK0); \
430 vpslld $7, x3 ## 2, x4 ## 2; \
431 vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
432 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
433 vpslld $7, x1 ## 2, x4 ## 2; \
434 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
435 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
436 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
437 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
438 get_key(i, 2, RK2); \
439 vpxor RK1, x1 ## 1, x1 ## 1; \
440 vpxor RK3, x3 ## 1, x3 ## 1; \
441 vpslld $5, x0 ## 1, x4 ## 1; \
442 vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
443 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
444 vpslld $22, x2 ## 1, x4 ## 1; \
445 vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
446 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
447 vpxor RK0, x0 ## 1, x0 ## 1; \
448 vpxor RK2, x2 ## 1, x2 ## 1; \
449 vpxor RK1, x1 ## 2, x1 ## 2; \
450 vpxor RK3, x3 ## 2, x3 ## 2; \
451 vpslld $5, x0 ## 2, x4 ## 2; \
452 vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
453 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
454 vpslld $22, x2 ## 2, x4 ## 2; \
455 vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
456 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
457 vpxor RK0, x0 ## 2, x0 ## 2; \
458 vpxor RK2, x2 ## 2, x2 ## 2;
459
/* KL2: key mixing followed by the inverse linear transformation, applied to
 * both register groups (decryption direction). It undoes LK2: the subkey is
 * XORed in first, then the rotations are reversed (right by 5, 22, 1, 7, 13
 * and 3) together with the matching XOR steps.
 *
 * NOTE: the body never calls get_key and never references i. RK0..RK3 must
 * already hold subkey i when this macro runs; in serpent_dec_blk_8way the
 * preceding SP(..., i) performs those loads. x4 is clobbered as shift scratch.
 * The arguments are register base names; the "1"/"2" suffix is pasted on here. */
460#define KL2(x0, x1, x2, x3, x4, i) \
461 vpxor RK0, x0 ## 1, x0 ## 1; \
462 vpxor RK2, x2 ## 1, x2 ## 1; \
463 vpsrld $5, x0 ## 1, x4 ## 1; \
464 vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
465 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
466 vpxor RK3, x3 ## 1, x3 ## 1; \
467 vpxor RK1, x1 ## 1, x1 ## 1; \
468 vpsrld $22, x2 ## 1, x4 ## 1; \
469 vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
470 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
471 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
472 vpxor RK0, x0 ## 2, x0 ## 2; \
473 vpxor RK2, x2 ## 2, x2 ## 2; \
474 vpsrld $5, x0 ## 2, x4 ## 2; \
475 vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
476 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
477 vpxor RK3, x3 ## 2, x3 ## 2; \
478 vpxor RK1, x1 ## 2, x1 ## 2; \
479 vpsrld $22, x2 ## 2, x4 ## 2; \
480 vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
481 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
482 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
483 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
484 vpslld $7, x1 ## 1, x4 ## 1; \
485 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
486 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
487 vpsrld $1, x1 ## 1, x4 ## 1; \
488 vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
489 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
490 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
491 vpslld $7, x1 ## 2, x4 ## 2; \
492 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
493 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
494 vpsrld $1, x1 ## 2, x4 ## 2; \
495 vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
496 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
497 vpsrld $7, x3 ## 1, x4 ## 1; \
498 vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
499 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
500 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
501 vpslld $3, x0 ## 1, x4 ## 1; \
502 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
503 vpsrld $7, x3 ## 2, x4 ## 2; \
504 vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
505 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
506 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
507 vpslld $3, x0 ## 2, x4 ## 2; \
508 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
509 vpsrld $13, x0 ## 1, x4 ## 1; \
510 vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
511 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
512 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
513 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
514 vpsrld $3, x2 ## 1, x4 ## 1; \
515 vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
516 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
517 vpsrld $13, x0 ## 2, x4 ## 2; \
518 vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
519 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
520 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
521 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
522 vpsrld $3, x2 ## 2, x4 ## 2; \
523 vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
524 vpor x4 ## 2, x2 ## 2, x2 ## 2;
525
/* S: apply one S-box to both register groups. SBOX is a macro family name
 * (S0..S7, SI0..SI7); its _1 and _2 halves are run back to back on group 1,
 * then on group 2. The remaining arguments are register base names to which
 * the "1"/"2" suffix is pasted. No subkey is loaded here (contrast with SP). */
526#define S(SBOX, x0, x1, x2, x3, x4) \
527 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
528 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
529 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
530 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
531
/* SP: like S(), but additionally preloads subkey i into RK0..RK3, with one
 * get_key issued before each S-box half (order: RK0, RK2, RK3, RK1). The
 * S-box halves never touch RK0..RK3, so the loads are independent of them.
 * The decryption path follows every SP(..., i) with KL2(..., i), which
 * consumes RK0..RK3 without loading them itself.
 *
 * The last line deliberately has no trailing backslash: a continuation there
 * would make the macro body depend on the following line being blank. */
532#define SP(SBOX, x0, x1, x2, x3, x4, i) \
533 get_key(i, 0, RK0); \
534 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
535 get_key(i, 2, RK2); \
536 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
537 get_key(i, 3, RK3); \
538 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
539 get_key(i, 1, RK1); \
540 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
541
/* transpose_4x4: in-place transpose of the 4x4 matrix of 32-bit words held in
 * x0..x3 (one row per register). First the dwords of row pairs are
 * interleaved (x0/x1 and x2/x3), then the resulting qwords are interleaved, so
 * that on exit register xN holds word N of each of the four original
 * registers. t0, t1 and t2 are clobbered as temporaries. */
542#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
543 vpunpckldq x1, x0, t0; \
544 vpunpckhdq x1, x0, t2; \
545 vpunpckldq x3, x2, t1; \
546 vpunpckhdq x3, x2, x3; \
547 \
548 vpunpcklqdq t1, t0, x0; \
549 vpunpckhqdq t1, t0, x1; \
550 vpunpcklqdq x3, t2, x2; \
551 vpunpckhqdq x3, t2, x3;
552
/* read_blocks: load four consecutive 16-byte blocks from `in` (unaligned loads
 * at byte offsets 0, 16, 32 and 48) into x0..x3, then transpose so that each
 * register holds the same 32-bit word position of all four blocks. t0..t2 are
 * clobbered. The expansion ends without ';' - the call site supplies it. */
553#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
554 vmovdqu (0*4*4)(in), x0; \
555 vmovdqu (1*4*4)(in), x1; \
556 vmovdqu (2*4*4)(in), x2; \
557 vmovdqu (3*4*4)(in), x3; \
558 \
559 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
560
/* write_blocks: inverse of read_blocks. Transposes x0..x3 back to one block
 * per register and stores the four 16-byte blocks to `out` with unaligned
 * stores at byte offsets 0, 16, 32 and 48. x0..x3 and t0..t2 are clobbered. */
561#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
562 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
563 \
564 vmovdqu x0, (0*4*4)(out); \
565 vmovdqu x1, (1*4*4)(out); \
566 vmovdqu x2, (2*4*4)(out); \
567 vmovdqu x3, (3*4*4)(out);
568
/* xor_blocks: like write_blocks, but each transposed block is XORed with the
 * 16 bytes already present at its destination before being stored, i.e.
 * out[n] ^= block[n] for n = 0..3. Selected by the xor flag of
 * __serpent_enc_blk_8way. x0..x3 and t0..t2 are clobbered. */
569#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
570 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
571 \
572 vpxor (0*4*4)(out), x0, x0; \
573 vmovdqu x0, (0*4*4)(out); \
574 vpxor (1*4*4)(out), x1, x1; \
575 vmovdqu x1, (1*4*4)(out); \
576 vpxor (2*4*4)(out), x2, x2; \
577 vmovdqu x2, (2*4*4)(out); \
578 vpxor (3*4*4)(out), x3, x3; \
579 vmovdqu x3, (3*4*4)(out);
580
/*
 * __serpent_enc_blk_8way: encrypt eight 16-byte blocks in parallel.
 *
 * The eight blocks are read as two groups of four (src and src + 64 bytes)
 * into the "1" and "2" register sets. After mixing in subkey 0, 32 S-box
 * applications follow (S0..S7, four times over); each of the first 31 is
 * followed by LK2 with subkeys 1..31, and the last by K2 with subkey 32. The
 * register names passed to each step change from line to line because the
 * S-box macros leave their outputs in permuted registers.
 *
 * The result is stored to dst, or XORed into the existing contents of dst
 * when the low byte of %rcx is non-zero. %rax is used as a scratch pointer
 * (left at dst + 64) and xmm0..xmm15 are clobbered.
 */
581.align 8
582.global __serpent_enc_blk_8way
583.type __serpent_enc_blk_8way,@function;
584
585__serpent_enc_blk_8way:
586 /* input:
587 * %rdi: ctx, CTX
588 * %rsi: dst
589 * %rdx: src
590 * %rcx: bool, if true: xor output
591 */
592
 /* RNOT = all ones; the S-box macros XOR with it to complement. */
593 vpcmpeqd RNOT, RNOT, RNOT;
594
 /* %rax = src + 64: start of the second group of four blocks. */
595 leaq (4*4*4)(%rdx), %rax;
596 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
597 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
598
599 K2(RA, RB, RC, RD, RE, 0);
600 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
601 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
602 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
603 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
604 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
605 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
606 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
607 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
608 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
609 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
610 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
611 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
612 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
613 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
614 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
615 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
616 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
617 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
618 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
619 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
620 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
621 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
622 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
623 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
624 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
625 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
626 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
627 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
628 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
629 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
630 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
631 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
632
 /* %rax = dst + 64: destination of the second group of four blocks. */
633 leaq (4*4*4)(%rsi), %rax;
634
 /* Only the low byte of the xor flag is tested. */
635 testb %cl, %cl;
636 jnz __enc_xor8;
637
638 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
639 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
640
641 ret;
642
 /* xor flag set: dst ^= ciphertext instead of dst = ciphertext. */
643__enc_xor8:
644 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
645 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
646
647 ret;
648
/*
 * serpent_dec_blk_8way: decrypt eight 16-byte blocks in parallel.
 *
 * Mirrors the encryption routine in reverse: subkey 32 is mixed in first,
 * then the SI7..SI0 S-boxes are applied four times over. For subkeys 31..1
 * each step is SP(SIn, ..., i) followed by KL2(..., i); SP preloads subkey i
 * into RK0..RK3, which KL2 then consumes. The final step is a plain S(SI0)
 * followed by K2 with subkey 0.
 *
 * After the last S-box the plaintext words sit in the RC, RD, RB, RE
 * registers (in that order), which is why those are the ones written out.
 * There is no xor-output variant here. %rax is used as a scratch pointer
 * (left at dst + 64) and xmm0..xmm15 are clobbered.
 */
649.align 8
650.global serpent_dec_blk_8way
651.type serpent_dec_blk_8way,@function;
652
653serpent_dec_blk_8way:
654 /* input:
655 * %rdi: ctx, CTX
656 * %rsi: dst
657 * %rdx: src
658 */
659
 /* RNOT = all ones; the S-box macros XOR with it to complement. */
660 vpcmpeqd RNOT, RNOT, RNOT;
661
 /* %rax = src + 64: start of the second group of four blocks. */
662 leaq (4*4*4)(%rdx), %rax;
663 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
664 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
665
666 K2(RA, RB, RC, RD, RE, 32);
667 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
668 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
669 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
670 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
671 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
672 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
673 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
674 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
675 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
676 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
677 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
678 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
679 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
680 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
681 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
682 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
683 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
684 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
685 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
686 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
687 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
688 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
689 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
690 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
691 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
692 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
693 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
694 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
695 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
696 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
697 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
698 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
699
 /* %rax = dst + 64: destination of the second group of four blocks. */
700 leaq (4*4*4)(%rsi), %rax;
701 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
702 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
703
704 ret;
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
new file mode 100644
index 00000000000..0dc7a26535e
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,949 @@
1/*
2 * Glue Code for AVX assembler versions of Serpent Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Glue code based on serpent_sse2_glue.c by:
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/hardirq.h>
29#include <linux/types.h>
30#include <linux/crypto.h>
31#include <linux/err.h>
32#include <crypto/algapi.h>
33#include <crypto/serpent.h>
34#include <crypto/cryptd.h>
35#include <crypto/b128ops.h>
36#include <crypto/ctr.h>
37#include <crypto/lrw.h>
38#include <crypto/xts.h>
39#include <asm/i387.h>
40#include <asm/xcr.h>
41#include <asm/xsave.h>
42#include <asm/serpent.h>
43#include <crypto/scatterwalk.h>
44#include <linux/workqueue.h>
45#include <linux/spinlock.h>
46
/*
 * Per-transform context of the asynchronous (ablkcipher) wrappers.  It only
 * carries the cryptd transform that ablk_init() allocates and ablk_exit()
 * releases.
 */
struct async_serpent_ctx {
	struct cryptd_ablkcipher *cryptd_tfm;
};
50
51static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
52{
53 if (fpu_enabled)
54 return true;
55
56 /* AVX is only used when chunk to be processed is large enough, so
57 * do not enable FPU until it is necessary.
58 */
59 if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
60 return false;
61
62 kernel_fpu_begin();
63 return true;
64}
65
66static inline void serpent_fpu_end(bool fpu_enabled)
67{
68 if (fpu_enabled)
69 kernel_fpu_end();
70}
71
72static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
73 bool enc)
74{
75 bool fpu_enabled = false;
76 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
77 const unsigned int bsize = SERPENT_BLOCK_SIZE;
78 unsigned int nbytes;
79 int err;
80
81 err = blkcipher_walk_virt(desc, walk);
82 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
83
84 while ((nbytes = walk->nbytes)) {
85 u8 *wsrc = walk->src.virt.addr;
86 u8 *wdst = walk->dst.virt.addr;
87
88 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
89
90 /* Process multi-block batch */
91 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
92 do {
93 if (enc)
94 serpent_enc_blk_xway(ctx, wdst, wsrc);
95 else
96 serpent_dec_blk_xway(ctx, wdst, wsrc);
97
98 wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
99 wdst += bsize * SERPENT_PARALLEL_BLOCKS;
100 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
101 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
102
103 if (nbytes < bsize)
104 goto done;
105 }
106
107 /* Handle leftovers */
108 do {
109 if (enc)
110 __serpent_encrypt(ctx, wdst, wsrc);
111 else
112 __serpent_decrypt(ctx, wdst, wsrc);
113
114 wsrc += bsize;
115 wdst += bsize;
116 nbytes -= bsize;
117 } while (nbytes >= bsize);
118
119done:
120 err = blkcipher_walk_done(desc, walk, nbytes);
121 }
122
123 serpent_fpu_end(fpu_enabled);
124 return err;
125}
126
127static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
128 struct scatterlist *src, unsigned int nbytes)
129{
130 struct blkcipher_walk walk;
131
132 blkcipher_walk_init(&walk, dst, src, nbytes);
133 return ecb_crypt(desc, &walk, true);
134}
135
136static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
137 struct scatterlist *src, unsigned int nbytes)
138{
139 struct blkcipher_walk walk;
140
141 blkcipher_walk_init(&walk, dst, src, nbytes);
142 return ecb_crypt(desc, &walk, false);
143}
144
145static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
146 struct blkcipher_walk *walk)
147{
148 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
149 const unsigned int bsize = SERPENT_BLOCK_SIZE;
150 unsigned int nbytes = walk->nbytes;
151 u128 *src = (u128 *)walk->src.virt.addr;
152 u128 *dst = (u128 *)walk->dst.virt.addr;
153 u128 *iv = (u128 *)walk->iv;
154
155 do {
156 u128_xor(dst, src, iv);
157 __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
158 iv = dst;
159
160 src += 1;
161 dst += 1;
162 nbytes -= bsize;
163 } while (nbytes >= bsize);
164
165 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
166 return nbytes;
167}
168
169static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
170 struct scatterlist *src, unsigned int nbytes)
171{
172 struct blkcipher_walk walk;
173 int err;
174
175 blkcipher_walk_init(&walk, dst, src, nbytes);
176 err = blkcipher_walk_virt(desc, &walk);
177
178 while ((nbytes = walk.nbytes)) {
179 nbytes = __cbc_encrypt(desc, &walk);
180 err = blkcipher_walk_done(desc, &walk, nbytes);
181 }
182
183 return err;
184}
185
186static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
187 struct blkcipher_walk *walk)
188{
189 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
190 const unsigned int bsize = SERPENT_BLOCK_SIZE;
191 unsigned int nbytes = walk->nbytes;
192 u128 *src = (u128 *)walk->src.virt.addr;
193 u128 *dst = (u128 *)walk->dst.virt.addr;
194 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
195 u128 last_iv;
196 int i;
197
198 /* Start of the last block. */
199 src += nbytes / bsize - 1;
200 dst += nbytes / bsize - 1;
201
202 last_iv = *src;
203
204 /* Process multi-block batch */
205 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
206 do {
207 nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
208 src -= SERPENT_PARALLEL_BLOCKS - 1;
209 dst -= SERPENT_PARALLEL_BLOCKS - 1;
210
211 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
212 ivs[i] = src[i];
213
214 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
215
216 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
217 u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
218
219 nbytes -= bsize;
220 if (nbytes < bsize)
221 goto done;
222
223 u128_xor(dst, dst, src - 1);
224 src -= 1;
225 dst -= 1;
226 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
227
228 if (nbytes < bsize)
229 goto done;
230 }
231
232 /* Handle leftovers */
233 for (;;) {
234 __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
235
236 nbytes -= bsize;
237 if (nbytes < bsize)
238 break;
239
240 u128_xor(dst, dst, src - 1);
241 src -= 1;
242 dst -= 1;
243 }
244
245done:
246 u128_xor(dst, dst, (u128 *)walk->iv);
247 *(u128 *)walk->iv = last_iv;
248
249 return nbytes;
250}
251
252static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
253 struct scatterlist *src, unsigned int nbytes)
254{
255 bool fpu_enabled = false;
256 struct blkcipher_walk walk;
257 int err;
258
259 blkcipher_walk_init(&walk, dst, src, nbytes);
260 err = blkcipher_walk_virt(desc, &walk);
261 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
262
263 while ((nbytes = walk.nbytes)) {
264 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
265 nbytes = __cbc_decrypt(desc, &walk);
266 err = blkcipher_walk_done(desc, &walk, nbytes);
267 }
268
269 serpent_fpu_end(fpu_enabled);
270 return err;
271}
272
273static inline void u128_to_be128(be128 *dst, const u128 *src)
274{
275 dst->a = cpu_to_be64(src->a);
276 dst->b = cpu_to_be64(src->b);
277}
278
279static inline void be128_to_u128(u128 *dst, const be128 *src)
280{
281 dst->a = be64_to_cpu(src->a);
282 dst->b = be64_to_cpu(src->b);
283}
284
285static inline void u128_inc(u128 *i)
286{
287 i->b++;
288 if (!i->b)
289 i->a++;
290}
291
292static void ctr_crypt_final(struct blkcipher_desc *desc,
293 struct blkcipher_walk *walk)
294{
295 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
296 u8 *ctrblk = walk->iv;
297 u8 keystream[SERPENT_BLOCK_SIZE];
298 u8 *src = walk->src.virt.addr;
299 u8 *dst = walk->dst.virt.addr;
300 unsigned int nbytes = walk->nbytes;
301
302 __serpent_encrypt(ctx, keystream, ctrblk);
303 crypto_xor(keystream, src, nbytes);
304 memcpy(dst, keystream, nbytes);
305
306 crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
307}
308
309static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
310 struct blkcipher_walk *walk)
311{
312 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
313 const unsigned int bsize = SERPENT_BLOCK_SIZE;
314 unsigned int nbytes = walk->nbytes;
315 u128 *src = (u128 *)walk->src.virt.addr;
316 u128 *dst = (u128 *)walk->dst.virt.addr;
317 u128 ctrblk;
318 be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
319 int i;
320
321 be128_to_u128(&ctrblk, (be128 *)walk->iv);
322
323 /* Process multi-block batch */
324 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
325 do {
326 /* create ctrblks for parallel encrypt */
327 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
328 if (dst != src)
329 dst[i] = src[i];
330
331 u128_to_be128(&ctrblocks[i], &ctrblk);
332 u128_inc(&ctrblk);
333 }
334
335 serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
336 (u8 *)ctrblocks);
337
338 src += SERPENT_PARALLEL_BLOCKS;
339 dst += SERPENT_PARALLEL_BLOCKS;
340 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
341 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
342
343 if (nbytes < bsize)
344 goto done;
345 }
346
347 /* Handle leftovers */
348 do {
349 if (dst != src)
350 *dst = *src;
351
352 u128_to_be128(&ctrblocks[0], &ctrblk);
353 u128_inc(&ctrblk);
354
355 __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
356 u128_xor(dst, dst, (u128 *)ctrblocks);
357
358 src += 1;
359 dst += 1;
360 nbytes -= bsize;
361 } while (nbytes >= bsize);
362
363done:
364 u128_to_be128((be128 *)walk->iv, &ctrblk);
365 return nbytes;
366}
367
368static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
369 struct scatterlist *src, unsigned int nbytes)
370{
371 bool fpu_enabled = false;
372 struct blkcipher_walk walk;
373 int err;
374
375 blkcipher_walk_init(&walk, dst, src, nbytes);
376 err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
377 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
378
379 while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
380 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
381 nbytes = __ctr_crypt(desc, &walk);
382 err = blkcipher_walk_done(desc, &walk, nbytes);
383 }
384
385 serpent_fpu_end(fpu_enabled);
386
387 if (walk.nbytes) {
388 ctr_crypt_final(desc, &walk);
389 err = blkcipher_walk_done(desc, &walk, 0);
390 }
391
392 return err;
393}
394
395struct crypt_priv {
396 struct serpent_ctx *ctx;
397 bool fpu_enabled;
398};
399
400static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
401{
402 const unsigned int bsize = SERPENT_BLOCK_SIZE;
403 struct crypt_priv *ctx = priv;
404 int i;
405
406 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
407
408 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
409 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
410 return;
411 }
412
413 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
414 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
415}
416
417static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
418{
419 const unsigned int bsize = SERPENT_BLOCK_SIZE;
420 struct crypt_priv *ctx = priv;
421 int i;
422
423 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
424
425 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
426 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
427 return;
428 }
429
430 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
431 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
432}
433
434struct serpent_lrw_ctx {
435 struct lrw_table_ctx lrw_table;
436 struct serpent_ctx serpent_ctx;
437};
438
439static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
440 unsigned int keylen)
441{
442 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
443 int err;
444
445 err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
446 SERPENT_BLOCK_SIZE);
447 if (err)
448 return err;
449
450 return lrw_init_table(&ctx->lrw_table, key + keylen -
451 SERPENT_BLOCK_SIZE);
452}
453
454static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
455 struct scatterlist *src, unsigned int nbytes)
456{
457 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
458 be128 buf[SERPENT_PARALLEL_BLOCKS];
459 struct crypt_priv crypt_ctx = {
460 .ctx = &ctx->serpent_ctx,
461 .fpu_enabled = false,
462 };
463 struct lrw_crypt_req req = {
464 .tbuf = buf,
465 .tbuflen = sizeof(buf),
466
467 .table_ctx = &ctx->lrw_table,
468 .crypt_ctx = &crypt_ctx,
469 .crypt_fn = encrypt_callback,
470 };
471 int ret;
472
473 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
474 ret = lrw_crypt(desc, dst, src, nbytes, &req);
475 serpent_fpu_end(crypt_ctx.fpu_enabled);
476
477 return ret;
478}
479
480static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
481 struct scatterlist *src, unsigned int nbytes)
482{
483 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
484 be128 buf[SERPENT_PARALLEL_BLOCKS];
485 struct crypt_priv crypt_ctx = {
486 .ctx = &ctx->serpent_ctx,
487 .fpu_enabled = false,
488 };
489 struct lrw_crypt_req req = {
490 .tbuf = buf,
491 .tbuflen = sizeof(buf),
492
493 .table_ctx = &ctx->lrw_table,
494 .crypt_ctx = &crypt_ctx,
495 .crypt_fn = decrypt_callback,
496 };
497 int ret;
498
499 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
500 ret = lrw_crypt(desc, dst, src, nbytes, &req);
501 serpent_fpu_end(crypt_ctx.fpu_enabled);
502
503 return ret;
504}
505
506static void lrw_exit_tfm(struct crypto_tfm *tfm)
507{
508 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
509
510 lrw_free_table(&ctx->lrw_table);
511}
512
513struct serpent_xts_ctx {
514 struct serpent_ctx tweak_ctx;
515 struct serpent_ctx crypt_ctx;
516};
517
518static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
519 unsigned int keylen)
520{
521 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
522 u32 *flags = &tfm->crt_flags;
523 int err;
524
525 /* key consists of keys of equal size concatenated, therefore
526 * the length must be even
527 */
528 if (keylen % 2) {
529 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
530 return -EINVAL;
531 }
532
533 /* first half of xts-key is for crypt */
534 err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
535 if (err)
536 return err;
537
538 /* second half of xts-key is for tweak */
539 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
540}
541
542static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
543 struct scatterlist *src, unsigned int nbytes)
544{
545 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
546 be128 buf[SERPENT_PARALLEL_BLOCKS];
547 struct crypt_priv crypt_ctx = {
548 .ctx = &ctx->crypt_ctx,
549 .fpu_enabled = false,
550 };
551 struct xts_crypt_req req = {
552 .tbuf = buf,
553 .tbuflen = sizeof(buf),
554
555 .tweak_ctx = &ctx->tweak_ctx,
556 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
557 .crypt_ctx = &crypt_ctx,
558 .crypt_fn = encrypt_callback,
559 };
560 int ret;
561
562 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
563 ret = xts_crypt(desc, dst, src, nbytes, &req);
564 serpent_fpu_end(crypt_ctx.fpu_enabled);
565
566 return ret;
567}
568
569static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
570 struct scatterlist *src, unsigned int nbytes)
571{
572 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
573 be128 buf[SERPENT_PARALLEL_BLOCKS];
574 struct crypt_priv crypt_ctx = {
575 .ctx = &ctx->crypt_ctx,
576 .fpu_enabled = false,
577 };
578 struct xts_crypt_req req = {
579 .tbuf = buf,
580 .tbuflen = sizeof(buf),
581
582 .tweak_ctx = &ctx->tweak_ctx,
583 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
584 .crypt_ctx = &crypt_ctx,
585 .crypt_fn = decrypt_callback,
586 };
587 int ret;
588
589 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
590 ret = xts_crypt(desc, dst, src, nbytes, &req);
591 serpent_fpu_end(crypt_ctx.fpu_enabled);
592
593 return ret;
594}
595
596static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
597 unsigned int key_len)
598{
599 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
600 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
601 int err;
602
603 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
604 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
605 & CRYPTO_TFM_REQ_MASK);
606 err = crypto_ablkcipher_setkey(child, key, key_len);
607 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
608 & CRYPTO_TFM_RES_MASK);
609 return err;
610}
611
612static int __ablk_encrypt(struct ablkcipher_request *req)
613{
614 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
615 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
616 struct blkcipher_desc desc;
617
618 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
619 desc.info = req->info;
620 desc.flags = 0;
621
622 return crypto_blkcipher_crt(desc.tfm)->encrypt(
623 &desc, req->dst, req->src, req->nbytes);
624}
625
626static int ablk_encrypt(struct ablkcipher_request *req)
627{
628 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
629 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
630
631 if (!irq_fpu_usable()) {
632 struct ablkcipher_request *cryptd_req =
633 ablkcipher_request_ctx(req);
634
635 memcpy(cryptd_req, req, sizeof(*req));
636 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
637
638 return crypto_ablkcipher_encrypt(cryptd_req);
639 } else {
640 return __ablk_encrypt(req);
641 }
642}
643
644static int ablk_decrypt(struct ablkcipher_request *req)
645{
646 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
647 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
648
649 if (!irq_fpu_usable()) {
650 struct ablkcipher_request *cryptd_req =
651 ablkcipher_request_ctx(req);
652
653 memcpy(cryptd_req, req, sizeof(*req));
654 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
655
656 return crypto_ablkcipher_decrypt(cryptd_req);
657 } else {
658 struct blkcipher_desc desc;
659
660 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
661 desc.info = req->info;
662 desc.flags = 0;
663
664 return crypto_blkcipher_crt(desc.tfm)->decrypt(
665 &desc, req->dst, req->src, req->nbytes);
666 }
667}
668
669static void ablk_exit(struct crypto_tfm *tfm)
670{
671 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
672
673 cryptd_free_ablkcipher(ctx->cryptd_tfm);
674}
675
676static int ablk_init(struct crypto_tfm *tfm)
677{
678 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
679 struct cryptd_ablkcipher *cryptd_tfm;
680 char drv_name[CRYPTO_MAX_ALG_NAME];
681
682 snprintf(drv_name, sizeof(drv_name), "__driver-%s",
683 crypto_tfm_alg_driver_name(tfm));
684
685 cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
686 if (IS_ERR(cryptd_tfm))
687 return PTR_ERR(cryptd_tfm);
688
689 ctx->cryptd_tfm = cryptd_tfm;
690 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
691 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
692
693 return 0;
694}
695
696static struct crypto_alg serpent_algs[10] = { {
697 .cra_name = "__ecb-serpent-avx",
698 .cra_driver_name = "__driver-ecb-serpent-avx",
699 .cra_priority = 0,
700 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
701 .cra_blocksize = SERPENT_BLOCK_SIZE,
702 .cra_ctxsize = sizeof(struct serpent_ctx),
703 .cra_alignmask = 0,
704 .cra_type = &crypto_blkcipher_type,
705 .cra_module = THIS_MODULE,
706 .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list),
707 .cra_u = {
708 .blkcipher = {
709 .min_keysize = SERPENT_MIN_KEY_SIZE,
710 .max_keysize = SERPENT_MAX_KEY_SIZE,
711 .setkey = serpent_setkey,
712 .encrypt = ecb_encrypt,
713 .decrypt = ecb_decrypt,
714 },
715 },
716}, {
717 .cra_name = "__cbc-serpent-avx",
718 .cra_driver_name = "__driver-cbc-serpent-avx",
719 .cra_priority = 0,
720 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
721 .cra_blocksize = SERPENT_BLOCK_SIZE,
722 .cra_ctxsize = sizeof(struct serpent_ctx),
723 .cra_alignmask = 0,
724 .cra_type = &crypto_blkcipher_type,
725 .cra_module = THIS_MODULE,
726 .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list),
727 .cra_u = {
728 .blkcipher = {
729 .min_keysize = SERPENT_MIN_KEY_SIZE,
730 .max_keysize = SERPENT_MAX_KEY_SIZE,
731 .setkey = serpent_setkey,
732 .encrypt = cbc_encrypt,
733 .decrypt = cbc_decrypt,
734 },
735 },
736}, {
737 .cra_name = "__ctr-serpent-avx",
738 .cra_driver_name = "__driver-ctr-serpent-avx",
739 .cra_priority = 0,
740 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
741 .cra_blocksize = 1,
742 .cra_ctxsize = sizeof(struct serpent_ctx),
743 .cra_alignmask = 0,
744 .cra_type = &crypto_blkcipher_type,
745 .cra_module = THIS_MODULE,
746 .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list),
747 .cra_u = {
748 .blkcipher = {
749 .min_keysize = SERPENT_MIN_KEY_SIZE,
750 .max_keysize = SERPENT_MAX_KEY_SIZE,
751 .ivsize = SERPENT_BLOCK_SIZE,
752 .setkey = serpent_setkey,
753 .encrypt = ctr_crypt,
754 .decrypt = ctr_crypt,
755 },
756 },
757}, {
758 .cra_name = "__lrw-serpent-avx",
759 .cra_driver_name = "__driver-lrw-serpent-avx",
760 .cra_priority = 0,
761 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
762 .cra_blocksize = SERPENT_BLOCK_SIZE,
763 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
764 .cra_alignmask = 0,
765 .cra_type = &crypto_blkcipher_type,
766 .cra_module = THIS_MODULE,
767 .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list),
768 .cra_exit = lrw_exit_tfm,
769 .cra_u = {
770 .blkcipher = {
771 .min_keysize = SERPENT_MIN_KEY_SIZE +
772 SERPENT_BLOCK_SIZE,
773 .max_keysize = SERPENT_MAX_KEY_SIZE +
774 SERPENT_BLOCK_SIZE,
775 .ivsize = SERPENT_BLOCK_SIZE,
776 .setkey = lrw_serpent_setkey,
777 .encrypt = lrw_encrypt,
778 .decrypt = lrw_decrypt,
779 },
780 },
781}, {
782 .cra_name = "__xts-serpent-avx",
783 .cra_driver_name = "__driver-xts-serpent-avx",
784 .cra_priority = 0,
785 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
786 .cra_blocksize = SERPENT_BLOCK_SIZE,
787 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
788 .cra_alignmask = 0,
789 .cra_type = &crypto_blkcipher_type,
790 .cra_module = THIS_MODULE,
791 .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list),
792 .cra_u = {
793 .blkcipher = {
794 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
795 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
796 .ivsize = SERPENT_BLOCK_SIZE,
797 .setkey = xts_serpent_setkey,
798 .encrypt = xts_encrypt,
799 .decrypt = xts_decrypt,
800 },
801 },
802}, {
803 .cra_name = "ecb(serpent)",
804 .cra_driver_name = "ecb-serpent-avx",
805 .cra_priority = 500,
806 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
807 .cra_blocksize = SERPENT_BLOCK_SIZE,
808 .cra_ctxsize = sizeof(struct async_serpent_ctx),
809 .cra_alignmask = 0,
810 .cra_type = &crypto_ablkcipher_type,
811 .cra_module = THIS_MODULE,
812 .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list),
813 .cra_init = ablk_init,
814 .cra_exit = ablk_exit,
815 .cra_u = {
816 .ablkcipher = {
817 .min_keysize = SERPENT_MIN_KEY_SIZE,
818 .max_keysize = SERPENT_MAX_KEY_SIZE,
819 .setkey = ablk_set_key,
820 .encrypt = ablk_encrypt,
821 .decrypt = ablk_decrypt,
822 },
823 },
824}, {
825 .cra_name = "cbc(serpent)",
826 .cra_driver_name = "cbc-serpent-avx",
827 .cra_priority = 500,
828 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
829 .cra_blocksize = SERPENT_BLOCK_SIZE,
830 .cra_ctxsize = sizeof(struct async_serpent_ctx),
831 .cra_alignmask = 0,
832 .cra_type = &crypto_ablkcipher_type,
833 .cra_module = THIS_MODULE,
834 .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list),
835 .cra_init = ablk_init,
836 .cra_exit = ablk_exit,
837 .cra_u = {
838 .ablkcipher = {
839 .min_keysize = SERPENT_MIN_KEY_SIZE,
840 .max_keysize = SERPENT_MAX_KEY_SIZE,
841 .ivsize = SERPENT_BLOCK_SIZE,
842 .setkey = ablk_set_key,
843 .encrypt = __ablk_encrypt,
844 .decrypt = ablk_decrypt,
845 },
846 },
847}, {
848 .cra_name = "ctr(serpent)",
849 .cra_driver_name = "ctr-serpent-avx",
850 .cra_priority = 500,
851 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
852 .cra_blocksize = 1,
853 .cra_ctxsize = sizeof(struct async_serpent_ctx),
854 .cra_alignmask = 0,
855 .cra_type = &crypto_ablkcipher_type,
856 .cra_module = THIS_MODULE,
857 .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list),
858 .cra_init = ablk_init,
859 .cra_exit = ablk_exit,
860 .cra_u = {
861 .ablkcipher = {
862 .min_keysize = SERPENT_MIN_KEY_SIZE,
863 .max_keysize = SERPENT_MAX_KEY_SIZE,
864 .ivsize = SERPENT_BLOCK_SIZE,
865 .setkey = ablk_set_key,
866 .encrypt = ablk_encrypt,
867 .decrypt = ablk_encrypt,
868 .geniv = "chainiv",
869 },
870 },
871}, {
872 .cra_name = "lrw(serpent)",
873 .cra_driver_name = "lrw-serpent-avx",
874 .cra_priority = 500,
875 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
876 .cra_blocksize = SERPENT_BLOCK_SIZE,
877 .cra_ctxsize = sizeof(struct async_serpent_ctx),
878 .cra_alignmask = 0,
879 .cra_type = &crypto_ablkcipher_type,
880 .cra_module = THIS_MODULE,
881 .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list),
882 .cra_init = ablk_init,
883 .cra_exit = ablk_exit,
884 .cra_u = {
885 .ablkcipher = {
886 .min_keysize = SERPENT_MIN_KEY_SIZE +
887 SERPENT_BLOCK_SIZE,
888 .max_keysize = SERPENT_MAX_KEY_SIZE +
889 SERPENT_BLOCK_SIZE,
890 .ivsize = SERPENT_BLOCK_SIZE,
891 .setkey = ablk_set_key,
892 .encrypt = ablk_encrypt,
893 .decrypt = ablk_decrypt,
894 },
895 },
896}, {
897 .cra_name = "xts(serpent)",
898 .cra_driver_name = "xts-serpent-avx",
899 .cra_priority = 500,
900 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
901 .cra_blocksize = SERPENT_BLOCK_SIZE,
902 .cra_ctxsize = sizeof(struct async_serpent_ctx),
903 .cra_alignmask = 0,
904 .cra_type = &crypto_ablkcipher_type,
905 .cra_module = THIS_MODULE,
906 .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list),
907 .cra_init = ablk_init,
908 .cra_exit = ablk_exit,
909 .cra_u = {
910 .ablkcipher = {
911 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
912 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
913 .ivsize = SERPENT_BLOCK_SIZE,
914 .setkey = ablk_set_key,
915 .encrypt = ablk_encrypt,
916 .decrypt = ablk_decrypt,
917 },
918 },
919} };
920
921static int __init serpent_init(void)
922{
923 u64 xcr0;
924
925 if (!cpu_has_avx || !cpu_has_osxsave) {
926 printk(KERN_INFO "AVX instructions are not detected.\n");
927 return -ENODEV;
928 }
929
930 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
931 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
932 printk(KERN_INFO "AVX detected but unusable.\n");
933 return -ENODEV;
934 }
935
936 return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
937}
938
939static void __exit serpent_exit(void)
940{
941 crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
942}
943
944module_init(serpent_init);
945module_exit(serpent_exit);
946
947MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
948MODULE_LICENSE("GPL");
949MODULE_ALIAS("serpent");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e00a4e49e01..2c1c2dfcc02 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -821,6 +821,26 @@ config CRYPTO_SERPENT_SSE2_586
821 See also: 821 See also:
822 <http://www.cl.cam.ac.uk/~rja14/serpent.html> 822 <http://www.cl.cam.ac.uk/~rja14/serpent.html>
823 823
824config CRYPTO_SERPENT_AVX_X86_64
825 tristate "Serpent cipher algorithm (x86_64/AVX)"
826 depends on X86 && 64BIT
827 select CRYPTO_ALGAPI
828 select CRYPTO_CRYPTD
829 select CRYPTO_SERPENT
830 select CRYPTO_LRW
831 select CRYPTO_XTS
832 help
833 Serpent cipher algorithm, by Anderson, Biham & Knudsen.
834
835 Keys are allowed to be from 0 to 256 bits in length, in steps
836 of 8 bits.
837
838 This module provides the Serpent cipher algorithm that processes
839 eight blocks in parallel using the AVX instruction set.
840
841 See also:
842 <http://www.cl.cam.ac.uk/~rja14/serpent.html>
843
824config CRYPTO_TEA 844config CRYPTO_TEA
825 tristate "TEA, XTEA and XETA cipher algorithms" 845 tristate "TEA, XTEA and XETA cipher algorithms"
826 select CRYPTO_ALGAPI 846 select CRYPTO_ALGAPI
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 73b3ec6fe1a..36748a5996e 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1534,6 +1534,21 @@ static int alg_test_null(const struct alg_test_desc *desc,
1534/* Please keep this list sorted by algorithm name. */ 1534/* Please keep this list sorted by algorithm name. */
1535static const struct alg_test_desc alg_test_descs[] = { 1535static const struct alg_test_desc alg_test_descs[] = {
1536 { 1536 {
1537 .alg = "__cbc-serpent-avx",
1538 .test = alg_test_null,
1539 .suite = {
1540 .cipher = {
1541 .enc = {
1542 .vecs = NULL,
1543 .count = 0
1544 },
1545 .dec = {
1546 .vecs = NULL,
1547 .count = 0
1548 }
1549 }
1550 }
1551 }, {
1537 .alg = "__cbc-serpent-sse2", 1552 .alg = "__cbc-serpent-sse2",
1538 .test = alg_test_null, 1553 .test = alg_test_null,
1539 .suite = { 1554 .suite = {
@@ -1579,6 +1594,21 @@ static const struct alg_test_desc alg_test_descs[] = {
1579 } 1594 }
1580 } 1595 }
1581 }, { 1596 }, {
1597 .alg = "__driver-cbc-serpent-avx",
1598 .test = alg_test_null,
1599 .suite = {
1600 .cipher = {
1601 .enc = {
1602 .vecs = NULL,
1603 .count = 0
1604 },
1605 .dec = {
1606 .vecs = NULL,
1607 .count = 0
1608 }
1609 }
1610 }
1611 }, {
1582 .alg = "__driver-cbc-serpent-sse2", 1612 .alg = "__driver-cbc-serpent-sse2",
1583 .test = alg_test_null, 1613 .test = alg_test_null,
1584 .suite = { 1614 .suite = {
@@ -1624,6 +1654,21 @@ static const struct alg_test_desc alg_test_descs[] = {
1624 } 1654 }
1625 } 1655 }
1626 }, { 1656 }, {
1657 .alg = "__driver-ecb-serpent-avx",
1658 .test = alg_test_null,
1659 .suite = {
1660 .cipher = {
1661 .enc = {
1662 .vecs = NULL,
1663 .count = 0
1664 },
1665 .dec = {
1666 .vecs = NULL,
1667 .count = 0
1668 }
1669 }
1670 }
1671 }, {
1627 .alg = "__driver-ecb-serpent-sse2", 1672 .alg = "__driver-ecb-serpent-sse2",
1628 .test = alg_test_null, 1673 .test = alg_test_null,
1629 .suite = { 1674 .suite = {
@@ -1836,6 +1881,21 @@ static const struct alg_test_desc alg_test_descs[] = {
1836 } 1881 }
1837 } 1882 }
1838 }, { 1883 }, {
1884 .alg = "cryptd(__driver-ecb-serpent-avx)",
1885 .test = alg_test_null,
1886 .suite = {
1887 .cipher = {
1888 .enc = {
1889 .vecs = NULL,
1890 .count = 0
1891 },
1892 .dec = {
1893 .vecs = NULL,
1894 .count = 0
1895 }
1896 }
1897 }
1898 }, {
1839 .alg = "cryptd(__driver-ecb-serpent-sse2)", 1899 .alg = "cryptd(__driver-ecb-serpent-sse2)",
1840 .test = alg_test_null, 1900 .test = alg_test_null,
1841 .suite = { 1901 .suite = {