author		Jussi Kivilinna <jussi.kivilinna@mbnet.fi>	2012-10-20 08:06:56 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2012-10-24 09:10:55 -0400
commit		c12ab20b162c9414acadc18c6da6cfd3eea54b7b (patch)
tree		01efc5cd0712cbab4cdd0b091cbe173c9dd9500f
parent		facd416fbc1cdee357730909a414898934f16ae1 (diff)
crypto: cast5/avx - avoid using temporary stack buffers
Introduce new assembler functions to avoid the use of temporary stack
buffers in the glue code. This also allows the use of vector instructions
for xoring the output in CTR and CBC modes and for constructing IVs in
CTR mode.

ECB mode sees a ~0.5% decrease in speed because one extra function call
is added. CBC mode decryption and CTR mode benefit from the vector
operations and gain ~5%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--	arch/x86/crypto/cast5-avx-x86_64-asm_64.S	| 332
-rw-r--r--	arch/x86/crypto/cast5_avx_glue.c		|  79
2 files changed, 280 insertions, 131 deletions
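
The CTR path is where the new assembler pays off most: cast5_ctr_16way builds all 16 counter blocks inside vector registers (vpsubq on a little-endian pair of counters, then vpshufb through .Lbswap128_mask back to big endian) and xors the encrypted counters directly against the source with vpxor, instead of having the glue code stage counter blocks in a temporary stack array. As a rough, standalone illustration only, the scalar equivalent of that counter-block construction looks like the sketch below; bswap64() and build_ctr_blocks() are helper names invented for this sketch, not kernel APIs.

/*
 * Standalone sketch (not kernel code): scalar equivalent of the
 * counter-block construction that cast5_ctr_16way performs with
 * vpsubq/vpshufb.  bswap64() and build_ctr_blocks() are hypothetical
 * helpers for this illustration only.
 */
#include <stdint.h>

#define PARALLEL_BLOCKS 16

static uint64_t bswap64(uint64_t x)
{
	return __builtin_bswap64(x);	/* GCC/Clang builtin */
}

/*
 * Expand the 64-bit big-endian IV at *iv_be into 16 big-endian counter
 * blocks and advance *iv_be past them, mirroring what the assembly does
 * two counters at a time in one XMM register.
 */
static void build_ctr_blocks(uint64_t *iv_be,
			     uint64_t ctrblocks[PARALLEL_BLOCKS])
{
	uint64_t ctr = bswap64(*iv_be);		/* to native little endian */
	int i;

	for (i = 0; i < PARALLEL_BLOCKS; i++)
		ctrblocks[i] = bswap64(ctr++);	/* back to big endian */

	*iv_be = bswap64(ctr);			/* IV for the next call */
}

In the patch itself this per-block loop disappears from __ctr_crypt(); only the leftover (non-parallel) blocks are still handled one at a time, using be64_add_cpu() on walk->iv.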
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index a41a3aaba220..12478e472368 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -180,31 +180,17 @@
 	vpunpcklqdq t1, t0, x0; \
 	vpunpckhqdq t1, t0, x1;
 
-#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
-	vmovdqu (0*4*4)(in), x0; \
-	vmovdqu (1*4*4)(in), x1; \
+#define inpack_blocks(x0, x1, t0, t1, rmask) \
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	\
 	transpose_2x4(x0, x1, t0, t1)
 
-#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
+#define outunpack_blocks(x0, x1, t0, t1, rmask) \
 	transpose_2x4(x0, x1, t0, t1) \
 	\
 	vpshufb rmask, x0, x0; \
-	vpshufb rmask, x1, x1; \
-	vmovdqu x0, (0*4*4)(out); \
-	vmovdqu x1, (1*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
-	transpose_2x4(x0, x1, t0, t1) \
-	\
-	vpshufb rmask, x0, x0; \
-	vpshufb rmask, x1, x1; \
-	vpxor (0*4*4)(out), x0, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor (1*4*4)(out), x1, x1; \
-	vmovdqu x1, (1*4*4)(out);
+	vpshufb rmask, x1, x1;
 
 .data
 
@@ -213,6 +199,8 @@
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask:
+	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
 .L16_mask:
 	.byte 16, 16, 16, 16
 .L32_mask:
@@ -223,35 +211,42 @@
 .text
 
 .align 16
-.global __cast5_enc_blk_16way
-.type __cast5_enc_blk_16way,@function;
+.type __cast5_enc_blk16,@function;
 
-__cast5_enc_blk_16way:
+__cast5_enc_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RL1: blocks 1 and 2
+	 *	RR1: blocks 3 and 4
+	 *	RL2: blocks 5 and 6
+	 *	RR2: blocks 7 and 8
+	 *	RL3: blocks 9 and 10
+	 *	RR3: blocks 11 and 12
+	 *	RL4: blocks 13 and 14
+	 *	RR4: blocks 15 and 16
+	 * output:
+	 *	RL1: encrypted blocks 1 and 2
+	 *	RR1: encrypted blocks 3 and 4
+	 *	RL2: encrypted blocks 5 and 6
+	 *	RR2: encrypted blocks 7 and 8
+	 *	RL3: encrypted blocks 9 and 10
+	 *	RR3: encrypted blocks 11 and 12
+	 *	RL4: encrypted blocks 13 and 14
+	 *	RR4: encrypted blocks 15 and 16
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 	enc_preload_rkr();
 
-	leaq 1*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
 
 	round(RL, RR, 0, 1);
 	round(RR, RL, 1, 2);
@@ -276,44 +271,41 @@ __cast5_enc_blk_16way:
 	round(RR, RL, 15, 1);
 
 __skip_enc:
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq 1*(2*4*4)(%r11), %rax;
 
-	testb %cl, %cl;
-	jnz __enc_xor16;
-
-	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
-
-	ret;
-
-__enc_xor16:
-	outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
 
 	ret;
 
 .align 16
-.global cast5_dec_blk_16way
-.type cast5_dec_blk_16way,@function;
+.type __cast5_dec_blk16,@function;
 
-cast5_dec_blk_16way:
+__cast5_dec_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RL1: encrypted blocks 1 and 2
+	 *	RR1: encrypted blocks 3 and 4
+	 *	RL2: encrypted blocks 5 and 6
+	 *	RR2: encrypted blocks 7 and 8
+	 *	RL3: encrypted blocks 9 and 10
+	 *	RR3: encrypted blocks 11 and 12
+	 *	RL4: encrypted blocks 13 and 14
+	 *	RR4: encrypted blocks 15 and 16
+	 * output:
+	 *	RL1: decrypted blocks 1 and 2
+	 *	RR1: decrypted blocks 3 and 4
+	 *	RL2: decrypted blocks 5 and 6
+	 *	RR2: decrypted blocks 7 and 8
+	 *	RL3: decrypted blocks 9 and 10
+	 *	RR3: decrypted blocks 11 and 12
+	 *	RL4: decrypted blocks 13 and 14
+	 *	RR4: decrypted blocks 15 and 16
 	 */
 
 	pushq %rbp;
@@ -324,15 +316,10 @@ cast5_dec_blk_16way:
 	vmovd .L32_mask, R32;
 	dec_preload_rkr();
 
-	leaq 1*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
 
 	movzbl rr(CTX), %eax;
 	testl %eax, %eax;
@@ -361,16 +348,211 @@ __dec_tail:
 	popq %rbx;
 	popq %rbp;
 
-	leaq 1*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
 
 	ret;
 
 __skip_dec:
 	vpsrldq $4, RKR, RKR;
 	jmp __dec_tail;
+
+.align 16
+.global cast5_ecb_enc_16way
+.type cast5_ecb_enc_16way,@function;
+
+cast5_ecb_enc_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	vmovdqu (0*4*4)(%rdx), RL1;
+	vmovdqu (1*4*4)(%rdx), RR1;
+	vmovdqu (2*4*4)(%rdx), RL2;
+	vmovdqu (3*4*4)(%rdx), RR2;
+	vmovdqu (4*4*4)(%rdx), RL3;
+	vmovdqu (5*4*4)(%rdx), RR3;
+	vmovdqu (6*4*4)(%rdx), RL4;
+	vmovdqu (7*4*4)(%rdx), RR4;
+
+	call __cast5_enc_blk16;
+
+	vmovdqu RR1, (0*4*4)(%r11);
+	vmovdqu RL1, (1*4*4)(%r11);
+	vmovdqu RR2, (2*4*4)(%r11);
+	vmovdqu RL2, (3*4*4)(%r11);
+	vmovdqu RR3, (4*4*4)(%r11);
+	vmovdqu RL3, (5*4*4)(%r11);
+	vmovdqu RR4, (6*4*4)(%r11);
+	vmovdqu RL4, (7*4*4)(%r11);
+
+	ret;
+
+.align 16
+.global cast5_ecb_dec_16way
+.type cast5_ecb_dec_16way,@function;
+
+cast5_ecb_dec_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	vmovdqu (0*4*4)(%rdx), RL1;
+	vmovdqu (1*4*4)(%rdx), RR1;
+	vmovdqu (2*4*4)(%rdx), RL2;
+	vmovdqu (3*4*4)(%rdx), RR2;
+	vmovdqu (4*4*4)(%rdx), RL3;
+	vmovdqu (5*4*4)(%rdx), RR3;
+	vmovdqu (6*4*4)(%rdx), RL4;
+	vmovdqu (7*4*4)(%rdx), RR4;
+
+	call __cast5_dec_blk16;
+
+	vmovdqu RR1, (0*4*4)(%r11);
+	vmovdqu RL1, (1*4*4)(%r11);
+	vmovdqu RR2, (2*4*4)(%r11);
+	vmovdqu RL2, (3*4*4)(%r11);
+	vmovdqu RR3, (4*4*4)(%r11);
+	vmovdqu RL3, (5*4*4)(%r11);
+	vmovdqu RR4, (6*4*4)(%r11);
+	vmovdqu RL4, (7*4*4)(%r11);
+
+	ret;
+
+.align 16
+.global cast5_cbc_dec_16way
+.type cast5_cbc_dec_16way,@function;
+
+cast5_cbc_dec_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	vmovdqu (0*16)(%rdx), RL1;
+	vmovdqu (1*16)(%rdx), RR1;
+	vmovdqu (2*16)(%rdx), RL2;
+	vmovdqu (3*16)(%rdx), RR2;
+	vmovdqu (4*16)(%rdx), RL3;
+	vmovdqu (5*16)(%rdx), RR3;
+	vmovdqu (6*16)(%rdx), RL4;
+	vmovdqu (7*16)(%rdx), RR4;
+
+	call __cast5_dec_blk16;
+
+	/* xor with src */
+	vmovq (%r12), RX;
+	vpshufd $0x4f, RX, RX;
+	vpxor RX, RR1, RR1;
+	vpxor 0*16+8(%r12), RL1, RL1;
+	vpxor 1*16+8(%r12), RR2, RR2;
+	vpxor 2*16+8(%r12), RL2, RL2;
+	vpxor 3*16+8(%r12), RR3, RR3;
+	vpxor 4*16+8(%r12), RL3, RL3;
+	vpxor 5*16+8(%r12), RR4, RR4;
+	vpxor 6*16+8(%r12), RL4, RL4;
+
+	vmovdqu RR1, (0*16)(%r11);
+	vmovdqu RL1, (1*16)(%r11);
+	vmovdqu RR2, (2*16)(%r11);
+	vmovdqu RL2, (3*16)(%r11);
+	vmovdqu RR3, (4*16)(%r11);
+	vmovdqu RL3, (5*16)(%r11);
+	vmovdqu RR4, (6*16)(%r11);
+	vmovdqu RL4, (7*16)(%r11);
+
+	popq %r12;
+
+	ret;
+
+.align 16
+.global cast5_ctr_16way
+.type cast5_ctr_16way,@function;
+
+cast5_ctr_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	vpcmpeqd RTMP, RTMP, RTMP;
+	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
+
+	vpcmpeqd RKR, RKR, RKR;
+	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
+	vmovdqa .Lbswap_iv_mask, R1ST;
+	vmovdqa .Lbswap128_mask, RKM;
+
+	/* load IV and byteswap */
+	vmovq (%rcx), RX;
+	vpshufb R1ST, RX, RX;
+
+	/* construct IVs */
+	vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
+	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
+
+	/* store last IV */
+	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
+	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
+	vmovq RX, (%rcx);
+
+	call __cast5_enc_blk16;
+
+	/* dst = src ^ iv */
+	vpxor (0*16)(%r12), RR1, RR1;
+	vpxor (1*16)(%r12), RL1, RL1;
+	vpxor (2*16)(%r12), RR2, RR2;
+	vpxor (3*16)(%r12), RL2, RL2;
+	vpxor (4*16)(%r12), RR3, RR3;
+	vpxor (5*16)(%r12), RL3, RL3;
+	vpxor (6*16)(%r12), RR4, RR4;
+	vpxor (7*16)(%r12), RL4, RL4;
+	vmovdqu RR1, (0*16)(%r11);
+	vmovdqu RL1, (1*16)(%r11);
+	vmovdqu RR2, (2*16)(%r11);
+	vmovdqu RL2, (3*16)(%r11);
+	vmovdqu RR3, (4*16)(%r11);
+	vmovdqu RL3, (5*16)(%r11);
+	vmovdqu RR4, (6*16)(%r11);
+	vmovdqu RL4, (7*16)(%r11);
+
+	popq %r12;
+
+	ret;
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e0ea14f9547f..c6631813dc11 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -37,29 +37,14 @@
 
 #define CAST5_PARALLEL_BLOCKS 16
 
-asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src, bool xor);
-asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
 				    const u8 *src);
-
-static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__cast5_enc_blk_16way(ctx, dst, src, false);
-}
-
-static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__cast5_enc_blk_16way(ctx, dst, src, true);
-}
-
-static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	cast5_dec_blk_16way(ctx, dst, src);
-}
-
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
+				__be64 *iv);
 
 static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
 {
@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 	struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 	const unsigned int bsize = CAST5_BLOCK_SIZE;
 	unsigned int nbytes;
+	void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
 	int err;
 
+	fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
+
 	err = blkcipher_walk_virt(desc, walk);
 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 
@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 	/* Process multi-block batch */
 	if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
 		do {
-			if (enc)
-				cast5_enc_blk_xway(ctx, wdst, wsrc);
-			else
-				cast5_dec_blk_xway(ctx, wdst, wsrc);
+			fn(ctx, wdst, wsrc);
 
 			wsrc += bsize * CAST5_PARALLEL_BLOCKS;
 			wdst += bsize * CAST5_PARALLEL_BLOCKS;
@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 			goto done;
 	}
 
+	fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
+
 	/* Handle leftovers */
 	do {
-		if (enc)
-			__cast5_encrypt(ctx, wdst, wsrc);
-		else
-			__cast5_decrypt(ctx, wdst, wsrc);
+		fn(ctx, wdst, wsrc);
 
 		wsrc += bsize;
 		wdst += bsize;
@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
 	unsigned int nbytes = walk->nbytes;
 	u64 *src = (u64 *)walk->src.virt.addr;
 	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
 	u64 last_iv;
-	int i;
 
 	/* Start of the last block. */
 	src += nbytes / bsize - 1;
@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
 		src -= CAST5_PARALLEL_BLOCKS - 1;
 		dst -= CAST5_PARALLEL_BLOCKS - 1;
 
-		for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
-			ivs[i] = src[i];
-
-		cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-		for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
-			*(dst + (i + 1)) ^= *(ivs + i);
+		cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
 
 		nbytes -= bsize;
 		if (nbytes < bsize)
@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	unsigned int nbytes = walk->nbytes;
 	u64 *src = (u64 *)walk->src.virt.addr;
 	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
-	__be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
-	int i;
 
 	/* Process multi-block batch */
 	if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
 		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				ctrblocks[i] = cpu_to_be64(ctrblk++);
-			}
-
-			cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
-					       (u8 *)ctrblocks);
+			cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
+					(__be64 *)walk->iv);
 
 			src += CAST5_PARALLEL_BLOCKS;
 			dst += CAST5_PARALLEL_BLOCKS;
@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 
 	/* Handle leftovers */
 	do {
+		u64 ctrblk;
+
 		if (dst != src)
 			*dst = *src;
 
-		ctrblocks[0] = cpu_to_be64(ctrblk++);
+		ctrblk = *(u64 *)walk->iv;
+		be64_add_cpu((__be64 *)walk->iv, 1);
 
-		__cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-		*dst ^= ctrblocks[0];
+		__cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+		*dst ^= ctrblk;
 
 		src += 1;
 		dst += 1;
@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	} while (nbytes >= bsize);
 
 done:
-	*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
 	return nbytes;
 }
 