author     Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>   2012-07-11 13:37:37 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>                                    2012-08-01 05:47:30 -0400
commit     4d6d6a2c850f89bc9283d02519cb536baba72032 (patch)
tree       8433747260d88000d79849bcd4db0e56b86aa6e4 /arch/x86/crypto/cast5_avx_glue.c
parent     a2c5826095562983bf316e3a7eb137ef04a71a24 (diff)
crypto: cast5 - add x86_64/avx assembler implementation
This patch adds an x86_64/avx assembler implementation of the Cast5 block
cipher. The implementation processes sixteen blocks in parallel (four 4-block
chunk AVX operations). The table-lookups are done in general-purpose registers.
For small blocksizes the functions from the generic module are called. A good
performance increase is provided for blocksizes greater than or equal to 128B.

The patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

cast5-avx-x86_64 vs. cast5-generic

64bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.02x   1.01x
64B     1.00x   1.00x   0.98x   1.00x   1.01x   1.02x
256B    2.03x   2.01x   0.95x   2.11x   2.12x   2.13x
1024B   2.30x   2.24x   0.95x   2.29x   2.35x   2.35x
8192B   2.31x   2.27x   0.95x   2.31x   2.39x   2.39x

128bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.01x   1.01x
64B     1.00x   1.00x   0.98x   1.01x   1.02x   1.01x
256B    2.17x   2.13x   0.96x   2.19x   2.19x   2.19x
1024B   2.29x   2.32x   0.95x   2.34x   2.37x   2.38x
8192B   2.35x   2.32x   0.95x   2.35x   2.39x   2.39x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
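The dispatch pattern the message describes (drain sixteen-block batches with the AVX routine, then fall back to the single-block generic routine for whatever is left) is the shape of every mode handler in the glue code below. The following is a rough, self-contained illustration of that control flow only; encrypt_16way() and encrypt_1way() are hypothetical placeholders, not functions added by this patch:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE       8   /* CAST5 block size in bytes */
#define PARALLEL_BLOCKS 16   /* blocks handled per AVX call in the patch */

/* Hypothetical stand-ins for the AVX 16-way routine and the generic
 * single-block routine; here they just copy the data through. */
static void encrypt_16way(uint8_t *dst, const uint8_t *src)
{
	memmove(dst, src, (size_t)BLOCK_SIZE * PARALLEL_BLOCKS);
}

static void encrypt_1way(uint8_t *dst, const uint8_t *src)
{
	memmove(dst, src, BLOCK_SIZE);
}

/* Same shape as the ecb_crypt() loop in the patch: wide batches first,
 * single-block tail last; any partial block is returned to the caller. */
static size_t crypt_chunk(uint8_t *dst, const uint8_t *src, size_t nbytes)
{
	while (nbytes >= (size_t)BLOCK_SIZE * PARALLEL_BLOCKS) {
		encrypt_16way(dst, src);
		src += BLOCK_SIZE * PARALLEL_BLOCKS;
		dst += BLOCK_SIZE * PARALLEL_BLOCKS;
		nbytes -= BLOCK_SIZE * PARALLEL_BLOCKS;
	}
	while (nbytes >= BLOCK_SIZE) {
		encrypt_1way(dst, src);
		src += BLOCK_SIZE;
		dst += BLOCK_SIZE;
		nbytes -= BLOCK_SIZE;
	}
	return nbytes;
}

int main(void)
{
	uint8_t buf[200] = { 0 };	/* 25 blocks: one 16-way batch plus 9 single blocks */

	printf("leftover: %zu bytes\n", crypt_chunk(buf, buf, sizeof(buf)));
	return 0;
}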
Diffstat (limited to 'arch/x86/crypto/cast5_avx_glue.c')
-rw-r--r--   arch/x86/crypto/cast5_avx_glue.c   530
1 file changed, 530 insertions, 0 deletions
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
new file mode 100644
index 000000000000..445aab06387b
--- /dev/null
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -0,0 +1,530 @@
1/*
2 * Glue Code for the AVX assembler implementation of the Cast5 Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/hardirq.h>
26#include <linux/types.h>
27#include <linux/crypto.h>
28#include <linux/err.h>
29#include <crypto/algapi.h>
30#include <crypto/cast5.h>
31#include <crypto/cryptd.h>
32#include <crypto/ctr.h>
33#include <asm/xcr.h>
34#include <asm/xsave.h>
35#include <asm/crypto/ablk_helper.h>
36#include <asm/crypto/glue_helper.h>
37
38#define CAST5_PARALLEL_BLOCKS 16
39
40asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
41 const u8 *src, bool xor);
42asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
43 const u8 *src);
44
45static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
46 const u8 *src)
47{
48 __cast5_enc_blk_16way(ctx, dst, src, false);
49}
50
51static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
52 const u8 *src)
53{
54 __cast5_enc_blk_16way(ctx, dst, src, true);
55}
56
57static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
58 const u8 *src)
59{
60 cast5_dec_blk_16way(ctx, dst, src);
61}
62
63
64static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
65{
66 return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS,
67 NULL, fpu_enabled, nbytes);
68}
69
70static inline void cast5_fpu_end(bool fpu_enabled)
71{
72 return glue_fpu_end(fpu_enabled);
73}
74
75static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
76 bool enc)
77{
78 bool fpu_enabled = false;
79 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
80 const unsigned int bsize = CAST5_BLOCK_SIZE;
81 unsigned int nbytes;
82 int err;
83
84 err = blkcipher_walk_virt(desc, walk);
85 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
86
87 while ((nbytes = walk->nbytes)) {
88 u8 *wsrc = walk->src.virt.addr;
89 u8 *wdst = walk->dst.virt.addr;
90
91 fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
92
93 /* Process multi-block batch */
94 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
95 do {
96 if (enc)
97 cast5_enc_blk_xway(ctx, wdst, wsrc);
98 else
99 cast5_dec_blk_xway(ctx, wdst, wsrc);
100
101 wsrc += bsize * CAST5_PARALLEL_BLOCKS;
102 wdst += bsize * CAST5_PARALLEL_BLOCKS;
103 nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
104 } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
105
106 if (nbytes < bsize)
107 goto done;
108 }
109
110 /* Handle leftovers */
111 do {
112 if (enc)
113 __cast5_encrypt(ctx, wdst, wsrc);
114 else
115 __cast5_decrypt(ctx, wdst, wsrc);
116
117 wsrc += bsize;
118 wdst += bsize;
119 nbytes -= bsize;
120 } while (nbytes >= bsize);
121
122done:
123 err = blkcipher_walk_done(desc, walk, nbytes);
124 }
125
126 cast5_fpu_end(fpu_enabled);
127 return err;
128}
129
130static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
131 struct scatterlist *src, unsigned int nbytes)
132{
133 struct blkcipher_walk walk;
134
135 blkcipher_walk_init(&walk, dst, src, nbytes);
136 return ecb_crypt(desc, &walk, true);
137}
138
139static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
140 struct scatterlist *src, unsigned int nbytes)
141{
142 struct blkcipher_walk walk;
143
144 blkcipher_walk_init(&walk, dst, src, nbytes);
145 return ecb_crypt(desc, &walk, false);
146}
147
148static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
149 struct blkcipher_walk *walk)
150{
151 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
152 const unsigned int bsize = CAST5_BLOCK_SIZE;
153 unsigned int nbytes = walk->nbytes;
154 u64 *src = (u64 *)walk->src.virt.addr;
155 u64 *dst = (u64 *)walk->dst.virt.addr;
156 u64 *iv = (u64 *)walk->iv;
157
158 do {
159 *dst = *src ^ *iv;
160 __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst);
161 iv = dst;
162
163 src += 1;
164 dst += 1;
165 nbytes -= bsize;
166 } while (nbytes >= bsize);
167
168	*(u64 *)walk->iv = *iv;
169 return nbytes;
170}
171
172static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
173 struct scatterlist *src, unsigned int nbytes)
174{
175 struct blkcipher_walk walk;
176 int err;
177
178 blkcipher_walk_init(&walk, dst, src, nbytes);
179 err = blkcipher_walk_virt(desc, &walk);
180
181 while ((nbytes = walk.nbytes)) {
182 nbytes = __cbc_encrypt(desc, &walk);
183 err = blkcipher_walk_done(desc, &walk, nbytes);
184 }
185
186 return err;
187}
188
189static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
190 struct blkcipher_walk *walk)
191{
192 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
193 const unsigned int bsize = CAST5_BLOCK_SIZE;
194 unsigned int nbytes = walk->nbytes;
195 u64 *src = (u64 *)walk->src.virt.addr;
196 u64 *dst = (u64 *)walk->dst.virt.addr;
197 u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
198 u64 last_iv;
199 int i;
200
201 /* Start of the last block. */
202 src += nbytes / bsize - 1;
203 dst += nbytes / bsize - 1;
204
205 last_iv = *src;
206
207 /* Process multi-block batch */
208 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
209 do {
210 nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1);
211 src -= CAST5_PARALLEL_BLOCKS - 1;
212 dst -= CAST5_PARALLEL_BLOCKS - 1;
213
214 for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
215 ivs[i] = src[i];
216
217 cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
218
219 for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
220 *(dst + (i + 1)) ^= *(ivs + i);
221
222 nbytes -= bsize;
223 if (nbytes < bsize)
224 goto done;
225
226 *dst ^= *(src - 1);
227 src -= 1;
228 dst -= 1;
229 } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
230
231 if (nbytes < bsize)
232 goto done;
233 }
234
235 /* Handle leftovers */
236 for (;;) {
237 __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src);
238
239 nbytes -= bsize;
240 if (nbytes < bsize)
241 break;
242
243 *dst ^= *(src - 1);
244 src -= 1;
245 dst -= 1;
246 }
247
248done:
249 *dst ^= *(u64 *)walk->iv;
250 *(u64 *)walk->iv = last_iv;
251
252 return nbytes;
253}
254
255static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
256 struct scatterlist *src, unsigned int nbytes)
257{
258 bool fpu_enabled = false;
259 struct blkcipher_walk walk;
260 int err;
261
262 blkcipher_walk_init(&walk, dst, src, nbytes);
263 err = blkcipher_walk_virt(desc, &walk);
264 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
265
266 while ((nbytes = walk.nbytes)) {
267 fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
268 nbytes = __cbc_decrypt(desc, &walk);
269 err = blkcipher_walk_done(desc, &walk, nbytes);
270 }
271
272 cast5_fpu_end(fpu_enabled);
273 return err;
274}
275
276static void ctr_crypt_final(struct blkcipher_desc *desc,
277 struct blkcipher_walk *walk)
278{
279 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
280 u8 *ctrblk = walk->iv;
281 u8 keystream[CAST5_BLOCK_SIZE];
282 u8 *src = walk->src.virt.addr;
283 u8 *dst = walk->dst.virt.addr;
284 unsigned int nbytes = walk->nbytes;
285
286 __cast5_encrypt(ctx, keystream, ctrblk);
287 crypto_xor(keystream, src, nbytes);
288 memcpy(dst, keystream, nbytes);
289
290 crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
291}
292
293static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
294 struct blkcipher_walk *walk)
295{
296 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
297 const unsigned int bsize = CAST5_BLOCK_SIZE;
298 unsigned int nbytes = walk->nbytes;
299 u64 *src = (u64 *)walk->src.virt.addr;
300 u64 *dst = (u64 *)walk->dst.virt.addr;
301 u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
302 __be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
303 int i;
304
305 /* Process multi-block batch */
306 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
307 do {
308 /* create ctrblks for parallel encrypt */
309 for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
310 if (dst != src)
311 dst[i] = src[i];
312
313 ctrblocks[i] = cpu_to_be64(ctrblk++);
314 }
315
316 cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
317 (u8 *)ctrblocks);
318
319 src += CAST5_PARALLEL_BLOCKS;
320 dst += CAST5_PARALLEL_BLOCKS;
321 nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
322 } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
323
324 if (nbytes < bsize)
325 goto done;
326 }
327
328 /* Handle leftovers */
329 do {
330 if (dst != src)
331 *dst = *src;
332
333 ctrblocks[0] = cpu_to_be64(ctrblk++);
334
335 __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
336 *dst ^= ctrblocks[0];
337
338 src += 1;
339 dst += 1;
340 nbytes -= bsize;
341 } while (nbytes >= bsize);
342
343done:
344 *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
345 return nbytes;
346}
347
348static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
349 struct scatterlist *src, unsigned int nbytes)
350{
351 bool fpu_enabled = false;
352 struct blkcipher_walk walk;
353 int err;
354
355 blkcipher_walk_init(&walk, dst, src, nbytes);
356 err = blkcipher_walk_virt_block(desc, &walk, CAST5_BLOCK_SIZE);
357 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
358
359 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
360 fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
361 nbytes = __ctr_crypt(desc, &walk);
362 err = blkcipher_walk_done(desc, &walk, nbytes);
363 }
364
365 cast5_fpu_end(fpu_enabled);
366
367 if (walk.nbytes) {
368 ctr_crypt_final(desc, &walk);
369 err = blkcipher_walk_done(desc, &walk, 0);
370 }
371
372 return err;
373}
374
375
376static struct crypto_alg cast5_algs[6] = { {
377 .cra_name = "__ecb-cast5-avx",
378 .cra_driver_name = "__driver-ecb-cast5-avx",
379 .cra_priority = 0,
380 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
381 .cra_blocksize = CAST5_BLOCK_SIZE,
382 .cra_ctxsize = sizeof(struct cast5_ctx),
383 .cra_alignmask = 0,
384 .cra_type = &crypto_blkcipher_type,
385 .cra_module = THIS_MODULE,
386 .cra_u = {
387 .blkcipher = {
388 .min_keysize = CAST5_MIN_KEY_SIZE,
389 .max_keysize = CAST5_MAX_KEY_SIZE,
390 .setkey = cast5_setkey,
391 .encrypt = ecb_encrypt,
392 .decrypt = ecb_decrypt,
393 },
394 },
395}, {
396 .cra_name = "__cbc-cast5-avx",
397 .cra_driver_name = "__driver-cbc-cast5-avx",
398 .cra_priority = 0,
399 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
400 .cra_blocksize = CAST5_BLOCK_SIZE,
401 .cra_ctxsize = sizeof(struct cast5_ctx),
402 .cra_alignmask = 0,
403 .cra_type = &crypto_blkcipher_type,
404 .cra_module = THIS_MODULE,
405 .cra_u = {
406 .blkcipher = {
407 .min_keysize = CAST5_MIN_KEY_SIZE,
408 .max_keysize = CAST5_MAX_KEY_SIZE,
409 .setkey = cast5_setkey,
410 .encrypt = cbc_encrypt,
411 .decrypt = cbc_decrypt,
412 },
413 },
414}, {
415 .cra_name = "__ctr-cast5-avx",
416 .cra_driver_name = "__driver-ctr-cast5-avx",
417 .cra_priority = 0,
418 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
419 .cra_blocksize = 1,
420 .cra_ctxsize = sizeof(struct cast5_ctx),
421 .cra_alignmask = 0,
422 .cra_type = &crypto_blkcipher_type,
423 .cra_module = THIS_MODULE,
424 .cra_u = {
425 .blkcipher = {
426 .min_keysize = CAST5_MIN_KEY_SIZE,
427 .max_keysize = CAST5_MAX_KEY_SIZE,
428 .ivsize = CAST5_BLOCK_SIZE,
429 .setkey = cast5_setkey,
430 .encrypt = ctr_crypt,
431 .decrypt = ctr_crypt,
432 },
433 },
434}, {
435 .cra_name = "ecb(cast5)",
436 .cra_driver_name = "ecb-cast5-avx",
437 .cra_priority = 200,
438 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
439 .cra_blocksize = CAST5_BLOCK_SIZE,
440 .cra_ctxsize = sizeof(struct async_helper_ctx),
441 .cra_alignmask = 0,
442 .cra_type = &crypto_ablkcipher_type,
443 .cra_module = THIS_MODULE,
444 .cra_init = ablk_init,
445 .cra_exit = ablk_exit,
446 .cra_u = {
447 .ablkcipher = {
448 .min_keysize = CAST5_MIN_KEY_SIZE,
449 .max_keysize = CAST5_MAX_KEY_SIZE,
450 .setkey = ablk_set_key,
451 .encrypt = ablk_encrypt,
452 .decrypt = ablk_decrypt,
453 },
454 },
455}, {
456 .cra_name = "cbc(cast5)",
457 .cra_driver_name = "cbc-cast5-avx",
458 .cra_priority = 200,
459 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
460 .cra_blocksize = CAST5_BLOCK_SIZE,
461 .cra_ctxsize = sizeof(struct async_helper_ctx),
462 .cra_alignmask = 0,
463 .cra_type = &crypto_ablkcipher_type,
464 .cra_module = THIS_MODULE,
465 .cra_init = ablk_init,
466 .cra_exit = ablk_exit,
467 .cra_u = {
468 .ablkcipher = {
469 .min_keysize = CAST5_MIN_KEY_SIZE,
470 .max_keysize = CAST5_MAX_KEY_SIZE,
471 .ivsize = CAST5_BLOCK_SIZE,
472 .setkey = ablk_set_key,
473 .encrypt = __ablk_encrypt,
474 .decrypt = ablk_decrypt,
475 },
476 },
477}, {
478 .cra_name = "ctr(cast5)",
479 .cra_driver_name = "ctr-cast5-avx",
480 .cra_priority = 200,
481 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
482 .cra_blocksize = 1,
483 .cra_ctxsize = sizeof(struct async_helper_ctx),
484 .cra_alignmask = 0,
485 .cra_type = &crypto_ablkcipher_type,
486 .cra_module = THIS_MODULE,
487 .cra_init = ablk_init,
488 .cra_exit = ablk_exit,
489 .cra_u = {
490 .ablkcipher = {
491 .min_keysize = CAST5_MIN_KEY_SIZE,
492 .max_keysize = CAST5_MAX_KEY_SIZE,
493 .ivsize = CAST5_BLOCK_SIZE,
494 .setkey = ablk_set_key,
495 .encrypt = ablk_encrypt,
496 .decrypt = ablk_encrypt,
497 .geniv = "chainiv",
498 },
499 },
500} };
501
502static int __init cast5_init(void)
503{
504 u64 xcr0;
505
506 if (!cpu_has_avx || !cpu_has_osxsave) {
507 pr_info("AVX instructions are not detected.\n");
508 return -ENODEV;
509 }
510
511 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
512 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
513 pr_info("AVX detected but unusable.\n");
514 return -ENODEV;
515 }
516
517 return crypto_register_algs(cast5_algs, ARRAY_SIZE(cast5_algs));
518}
519
520static void __exit cast5_exit(void)
521{
522 crypto_unregister_algs(cast5_algs, ARRAY_SIZE(cast5_algs));
523}
524
525module_init(cast5_init);
526module_exit(cast5_exit);
527
528MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
529MODULE_LICENSE("GPL");
530MODULE_ALIAS("cast5");
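Once the module is loaded, the registered ciphers (for example "cbc(cast5)") can be exercised from user space through the kernel's AF_ALG socket interface, assuming a kernel built with CONFIG_CRYPTO_USER_API_SKCIPHER. A minimal sketch, not part of this patch, with error handling omitted and an all-zero key and IV used purely for illustration:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "skcipher",
		.salg_name   = "cbc(cast5)",
	};
	unsigned char key[16] = { 0 };		/* 128-bit key, demo only */
	unsigned char iv_data[8] = { 0 };	/* CAST5 block size is 8 bytes */
	unsigned char buf[32] = { 0 };		/* four 8-byte plaintext blocks */
	unsigned char out[32];
	char cbuf[CMSG_SPACE(4) + CMSG_SPACE(sizeof(struct af_alg_iv) + sizeof(iv_data))] = { 0 };
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	struct af_alg_iv *iv;
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	unsigned int i;
	int tfmfd, opfd;

	/* Bind a transform socket to the cbc(cast5) algorithm and set the key. */
	tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
	bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa));
	setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key));
	opfd = accept(tfmfd, NULL, 0);

	/* Pass the operation (encrypt) and the IV as ancillary data. */
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_ALG;
	cmsg->cmsg_type = ALG_SET_OP;
	cmsg->cmsg_len = CMSG_LEN(4);
	*(__u32 *)CMSG_DATA(cmsg) = ALG_OP_ENCRYPT;

	cmsg = CMSG_NXTHDR(&msg, cmsg);
	cmsg->cmsg_level = SOL_ALG;
	cmsg->cmsg_type = ALG_SET_IV;
	cmsg->cmsg_len = CMSG_LEN(sizeof(*iv) + sizeof(iv_data));
	iv = (void *)CMSG_DATA(cmsg);
	iv->ivlen = sizeof(iv_data);
	memcpy(iv->iv, iv_data, sizeof(iv_data));

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	/* Send the plaintext, read back the ciphertext. */
	sendmsg(opfd, &msg, 0);
	read(opfd, out, sizeof(out));

	for (i = 0; i < sizeof(out); i++)
		printf("%02x", out[i]);
	printf("\n");

	close(opfd);
	close(tfmfd);
	return 0;
}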