author    Jussi Kivilinna <jussi.kivilinna@mbnet.fi>  2012-10-20 08:06:56 -0400
committer Herbert Xu <herbert@gondor.apana.org.au>  2012-10-24 09:10:55 -0400
commit    c12ab20b162c9414acadc18c6da6cfd3eea54b7b (patch)
tree      01efc5cd0712cbab4cdd0b091cbe173c9dd9500f /arch/x86/crypto/cast5_avx_glue.c
parent    facd416fbc1cdee357730909a414898934f16ae1 (diff)
crypto: cast5/avx - avoid using temporary stack buffers
Introduce new assembler functions to avoid using temporary stack buffers in the glue code. This also allows the use of vector instructions for XORing the output in CTR and CBC modes and for constructing the IVs in CTR mode.

ECB mode sees a ~0.5% decrease in speed because one extra function call is added. CBC mode decryption and CTR mode benefit from the vector operations and gain ~5%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
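[Editorial note: the core change in ecb_crypt() below is replacing the per-block "if (enc)" branches with a function pointer that is aimed first at the 16-way assembler routine and then at the single-block fallback for the tail. The following is a minimal userspace sketch of that dispatch pattern only; the toy_* names and the XOR "cipher" are hypothetical stand-ins, not the kernel's cast5 routines.]

  #include <stddef.h>
  #include <stdint.h>

  #define BLOCK_SIZE       8
  #define PARALLEL_BLOCKS  16

  struct toy_ctx { uint64_t key; };

  /* stand-in for a 16-way batch routine such as cast5_ecb_enc_16way() */
  static void toy_enc_16way(struct toy_ctx *ctx, uint8_t *dst, const uint8_t *src)
  {
  	for (int i = 0; i < PARALLEL_BLOCKS * BLOCK_SIZE; i++)
  		dst[i] = src[i] ^ (uint8_t)ctx->key;
  }

  /* stand-in for a single-block routine such as __cast5_encrypt() */
  static void toy_enc_1way(struct toy_ctx *ctx, uint8_t *dst, const uint8_t *src)
  {
  	for (int i = 0; i < BLOCK_SIZE; i++)
  		dst[i] = src[i] ^ (uint8_t)ctx->key;
  }

  static void toy_ecb_encrypt(struct toy_ctx *ctx, uint8_t *dst,
  			    const uint8_t *src, size_t nbytes)
  {
  	void (*fn)(struct toy_ctx *, uint8_t *, const uint8_t *) = toy_enc_16way;

  	/* Process multi-block batches with the wide routine. */
  	while (nbytes >= BLOCK_SIZE * PARALLEL_BLOCKS) {
  		fn(ctx, dst, src);
  		src += BLOCK_SIZE * PARALLEL_BLOCKS;
  		dst += BLOCK_SIZE * PARALLEL_BLOCKS;
  		nbytes -= BLOCK_SIZE * PARALLEL_BLOCKS;
  	}

  	/* Handle leftovers one block at a time with the same pointer. */
  	fn = toy_enc_1way;
  	while (nbytes >= BLOCK_SIZE) {
  		fn(ctx, dst, src);
  		src += BLOCK_SIZE;
  		dst += BLOCK_SIZE;
  		nbytes -= BLOCK_SIZE;
  	}
  }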
Diffstat (limited to 'arch/x86/crypto/cast5_avx_glue.c')
-rw-r--r--  arch/x86/crypto/cast5_avx_glue.c | 79
1 file changed, 23 insertions(+), 56 deletions(-)
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e0ea14f9547f..c6631813dc11 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -37,29 +37,14 @@
 
 #define CAST5_PARALLEL_BLOCKS 16
 
-asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src, bool xor);
-asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
-				    const u8 *src);
-
-static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__cast5_enc_blk_16way(ctx, dst, src, false);
-}
-
-static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__cast5_enc_blk_16way(ctx, dst, src, true);
-}
-
-static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	cast5_dec_blk_16way(ctx, dst, src);
-}
-
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
+				__be64 *iv);
 
 static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
 {
@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 	struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 	const unsigned int bsize = CAST5_BLOCK_SIZE;
 	unsigned int nbytes;
+	void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
 	int err;
 
+	fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
+
 	err = blkcipher_walk_virt(desc, walk);
 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 
@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 		/* Process multi-block batch */
 		if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
 			do {
-				if (enc)
-					cast5_enc_blk_xway(ctx, wdst, wsrc);
-				else
-					cast5_dec_blk_xway(ctx, wdst, wsrc);
+				fn(ctx, wdst, wsrc);
 
 				wsrc += bsize * CAST5_PARALLEL_BLOCKS;
 				wdst += bsize * CAST5_PARALLEL_BLOCKS;
@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 				goto done;
 		}
 
+		fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
+
 		/* Handle leftovers */
 		do {
-			if (enc)
-				__cast5_encrypt(ctx, wdst, wsrc);
-			else
-				__cast5_decrypt(ctx, wdst, wsrc);
+			fn(ctx, wdst, wsrc);
 
 			wsrc += bsize;
 			wdst += bsize;
@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
 	unsigned int nbytes = walk->nbytes;
 	u64 *src = (u64 *)walk->src.virt.addr;
 	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
 	u64 last_iv;
-	int i;
 
 	/* Start of the last block. */
 	src += nbytes / bsize - 1;
@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
 			src -= CAST5_PARALLEL_BLOCKS - 1;
 			dst -= CAST5_PARALLEL_BLOCKS - 1;
 
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
-				*(dst + (i + 1)) ^= *(ivs + i);
+			cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
 
 			nbytes -= bsize;
 			if (nbytes < bsize)
@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	unsigned int nbytes = walk->nbytes;
 	u64 *src = (u64 *)walk->src.virt.addr;
 	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
-	__be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
-	int i;
 
 	/* Process multi-block batch */
 	if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
 		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				ctrblocks[i] = cpu_to_be64(ctrblk++);
-			}
-
-			cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
-					       (u8 *)ctrblocks);
+			cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
+					(__be64 *)walk->iv);
 
 			src += CAST5_PARALLEL_BLOCKS;
 			dst += CAST5_PARALLEL_BLOCKS;
@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 
 	/* Handle leftovers */
 	do {
+		u64 ctrblk;
+
 		if (dst != src)
 			*dst = *src;
 
-		ctrblocks[0] = cpu_to_be64(ctrblk++);
+		ctrblk = *(u64 *)walk->iv;
+		be64_add_cpu((__be64 *)walk->iv, 1);
 
-		__cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-		*dst ^= ctrblocks[0];
+		__cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+		*dst ^= ctrblk;
 
 		src += 1;
 		dst += 1;
@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	} while (nbytes >= bsize);
 
 done:
-	*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
 	return nbytes;
 }
 
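[Editorial note: the subtler part of the CTR hunks above is the leftover path. Instead of keeping the counter in a local u64 and writing it back to walk->iv at "done:", the new code reads the big-endian counter straight from the IV, increments it in place with be64_add_cpu(), and lets cast5_ctr_16way() advance it for whole batches. The following is a standalone sketch of that per-block counter handling only, using GCC/Clang's __builtin_bswap64 in place of the kernel byte-order helpers (assuming a little-endian host) and a hypothetical toy block cipher.]

  #include <stdint.h>
  #include <string.h>

  /* hypothetical stand-in for __cast5_encrypt() on one 8-byte block */
  static void toy_encrypt_block(uint64_t key, uint8_t *dst, const uint8_t *src)
  {
  	uint64_t b;

  	memcpy(&b, src, sizeof(b));
  	b ^= key;
  	memcpy(dst, &b, sizeof(b));
  }

  /* Encrypt one leftover CTR block; *iv holds the big-endian counter. */
  static void ctr_leftover_block(uint64_t key, uint8_t *dst, const uint8_t *src,
  			       uint64_t *iv)
  {
  	uint64_t ctrblk = *iv;                 /* keystream input = current IV   */
  	uint64_t ctr = __builtin_bswap64(*iv); /* interpret IV as big-endian     */

  	*iv = __builtin_bswap64(ctr + 1);      /* be64_add_cpu(iv, 1) equivalent */

  	toy_encrypt_block(key, (uint8_t *)&ctrblk, (uint8_t *)&ctrblk);

  	for (int i = 0; i < 8; i++)
  		dst[i] = src[i] ^ ((uint8_t *)&ctrblk)[i];
  }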