author      Ard Biesheuvel <ard.biesheuvel@linaro.org>    2018-07-30 17:06:42 -0400
committer   Herbert Xu <herbert@gondor.apana.org.au>      2018-08-07 05:38:04 -0400
commit      30f1a9f53e77e4c9ddf55ebfda8a9d7666e46964
tree        6696057d0c4cb95671a65cad2df8f9b573a0ed6d
parent      e0bd888dc487e0c444ee5f3bf55020862d16a225
crypto: arm64/aes-ce-gcm - don't reload key schedule if avoidable
Squeeze out another 5% of performance by minimizing the number
of invocations of kernel_neon_begin()/kernel_neon_end() on the
common path, which also allows some reloads of the key schedule
to be optimized away.
The resulting code runs at 2.3 cycles per byte on a Cortex-A53.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
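The change is easiest to see in the gcm_encrypt()/gcm_decrypt() hunks below: the skcipher walk is now started up front, and a NULL round-key pointer tells the asm core that the key schedule is still live in the NEON registers, so the first pass over the data stays inside the kernel_neon_begin()/kernel_neon_end() section that was already opened for the IV/tag setup. Only later passes, which open a fresh NEON section, pass the real key pointer and pay for a reload. Below is a minimal standalone C sketch of that pattern, assuming a fixed per-pass chunk size; neon_begin, neon_end and gcm_chunk are hypothetical stand-ins, not the kernel API.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define AES_BLOCK_SIZE	16
#define WALK_CHUNK	4096	/* assumed per-pass limit of the walk */

static void neon_begin(void) { puts("neon_begin: claim the FP/SIMD unit"); }
static void neon_end(void)   { puts("neon_end:   release the FP/SIMD unit"); }

/* rk == NULL means "the round keys are still live in the NEON registers". */
static void gcm_chunk(size_t nbytes, const uint32_t *rk)
{
	printf("process %zu bytes, %s the key schedule\n", nbytes,
	       rk ? "reload" : "reuse");
}

static void gcm_process(size_t total, const uint32_t *key_enc)
{
	const uint32_t *rk = NULL;	/* pass one reuses the loaded keys */
	size_t nbytes = total < WALK_CHUNK ? total : WALK_CHUNK;

	neon_begin();			/* the IV/tag setup loads the keys */

	do {
		if (rk)			/* re-claim NEON only after pass one */
			neon_begin();

		gcm_chunk(nbytes, rk);
		neon_end();

		total -= nbytes;
		nbytes = total < WALK_CHUNK ? total : WALK_CHUNK;
		rk = key_enc;		/* later passes must reload the keys */
	} while (nbytes >= 2 * AES_BLOCK_SIZE);
}

int main(void)
{
	static const uint32_t key_enc[60];	/* placeholder expanded key */

	gcm_process(3 * WALK_CHUNK + 64, key_enc);	/* multi-pass request */
	return 0;
}

In the single-pass case, which is the common one, the sketch performs exactly one begin/end pair and never reloads the schedule; only multi-pass requests fall back to one reload per extra pass.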
-rw-r--r--   arch/arm64/crypto/ghash-ce-core.S |  9
-rw-r--r--   arch/arm64/crypto/ghash-ce-glue.c | 81
2 files changed, 49 insertions(+), 41 deletions(-)
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index f7281e7a592f..913e49932ae6 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -1,7 +1,7 @@
 /*
  * Accelerated GHASH implementation with ARMv8 PMULL instructions.
  *
- * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
@@ -332,8 +332,6 @@ ENDPROC(pmull_ghash_update_p8)
 	ld1	{XL.2d}, [x1]
 	ldr	x8, [x5, #8]			// load lower counter
 
-	load_round_keys	w7, x6
-
 	movi	MASK.16b, #0xe1
 	trn1	SHASH2.2d, SHASH.2d, HH.2d
 	trn2	T1.2d, SHASH.2d, HH.2d
@@ -346,6 +344,8 @@ CPU_LE(	rev	x8, x8	)
 	ld1	{KS0.16b-KS1.16b}, [x10]
 	.endif
 
+	cbnz	x6, 4f
+
 0:	ld1	{INP0.16b-INP1.16b}, [x3], #32
 
 	rev	x9, x8
@@ -471,6 +471,9 @@ CPU_LE(	rev	x8, x8	)
 	enc_round	KS0, v20
 	enc_round	KS1, v20
 	b	1b
+
+4:	load_round_keys	w7, x6
+	b	0b
 	.endm
 
 	/*
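On the assembly side, the hunk above drops the unconditional load_round_keys at entry: the macro now checks the key-schedule pointer in x6 (cbnz x6, 4f) and only takes the out-of-line path that loads the round keys before branching back to the main loop at 0:. A rough, hypothetical C analogue of that entry logic follows; it is not the real code, and load_round_keys here is just a stub.

#include <stdint.h>
#include <stdio.h>

/* Stub for the load_round_keys asm macro; the real one fills the
 * round-key SIMD registers from the expanded key at *rk. */
static void load_round_keys(const uint32_t *rk, int rounds)
{
	(void)rk;
	printf("4: load the %d-round key schedule, then b 0b\n", rounds);
}

/* Rough C analogue of the reworked macro entry (hypothetical names). */
static void gcm_core_entry(const uint32_t *rk /* x6 */, int rounds /* w7 */)
{
	if (rk)					/* cbnz	x6, 4f */
		load_round_keys(rk, rounds);

	puts("0: two-blocks-per-iteration main loop");
}

int main(void)
{
	static const uint32_t key_enc[60];	/* placeholder expanded key */

	gcm_core_entry(NULL, 14);	/* keys already resident: skip the load */
	gcm_core_entry(key_enc, 14);	/* fresh NEON section: reload them */
	return 0;
}

Callers that already have the schedule resident simply pass NULL and fall straight into the loop, which is how the glue code below keeps the first pass cheap.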
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index cd91b146c87d..42a0e84e276c 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -1,7 +1,7 @@
 /*
  * Accelerated GHASH implementation with ARMv8 PMULL instructions.
  *
- * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
@@ -373,37 +373,39 @@ static int gcm_encrypt(struct aead_request *req)
 	memcpy(iv, req->iv, GCM_IV_SIZE);
 	put_unaligned_be32(1, iv + GCM_IV_SIZE);
 
-	if (likely(may_use_simd())) {
-		kernel_neon_begin();
+	err = skcipher_walk_aead_encrypt(&walk, req, false);
 
+	if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) {
+		u32 const *rk = NULL;
+
+		kernel_neon_begin();
 		pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds);
 		put_unaligned_be32(2, iv + GCM_IV_SIZE);
 		pmull_gcm_encrypt_block(ks, iv, NULL, nrounds);
 		put_unaligned_be32(3, iv + GCM_IV_SIZE);
 		pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds);
 		put_unaligned_be32(4, iv + GCM_IV_SIZE);
-		kernel_neon_end();
-
-		err = skcipher_walk_aead_encrypt(&walk, req, false);
 
-		while (walk.nbytes >= 2 * AES_BLOCK_SIZE) {
+		do {
 			int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
 
-			kernel_neon_begin();
+			if (rk)
+				kernel_neon_begin();
+
 			pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
 					  walk.src.virt.addr, ctx->h2, iv,
-					  ctx->aes_key.key_enc, nrounds, ks);
+					  rk, nrounds, ks);
 			kernel_neon_end();
 
 			err = skcipher_walk_done(&walk,
 					walk.nbytes % (2 * AES_BLOCK_SIZE));
-		}
+
+			rk = ctx->aes_key.key_enc;
+		} while (walk.nbytes >= 2 * AES_BLOCK_SIZE);
 	} else {
 		__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds);
 		put_unaligned_be32(2, iv + GCM_IV_SIZE);
 
-		err = skcipher_walk_aead_encrypt(&walk, req, false);
-
 		while (walk.nbytes >= AES_BLOCK_SIZE) {
 			int blocks = walk.nbytes / AES_BLOCK_SIZE;
 			u8 *dst = walk.dst.virt.addr;
@@ -485,50 +487,53 @@ static int gcm_decrypt(struct aead_request *req)
 	memcpy(iv, req->iv, GCM_IV_SIZE);
 	put_unaligned_be32(1, iv + GCM_IV_SIZE);
 
-	if (likely(may_use_simd())) {
+	err = skcipher_walk_aead_decrypt(&walk, req, false);
+
+	if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) {
+		u32 const *rk = NULL;
+
 		kernel_neon_begin();
 		pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds);
 		put_unaligned_be32(2, iv + GCM_IV_SIZE);
-		kernel_neon_end();
 
-		err = skcipher_walk_aead_decrypt(&walk, req, false);
-
-		while (walk.nbytes >= 2 * AES_BLOCK_SIZE) {
+		do {
 			int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
+			int rem = walk.total - blocks * AES_BLOCK_SIZE;
+
+			if (rk)
+				kernel_neon_begin();
 
-			kernel_neon_begin();
 			pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
 					  walk.src.virt.addr, ctx->h2, iv,
-					  ctx->aes_key.key_enc, nrounds);
-			kernel_neon_end();
+					  rk, nrounds);
 
-			err = skcipher_walk_done(&walk,
-					walk.nbytes % (2 * AES_BLOCK_SIZE));
-		}
+			/* check if this is the final iteration of the loop */
+			if (rem < (2 * AES_BLOCK_SIZE)) {
+				u8 *iv2 = iv + AES_BLOCK_SIZE;
 
-		if (walk.nbytes) {
-			u8 *iv2 = iv + AES_BLOCK_SIZE;
+				if (rem > AES_BLOCK_SIZE) {
+					memcpy(iv2, iv, AES_BLOCK_SIZE);
+					crypto_inc(iv2, AES_BLOCK_SIZE);
+				}
 
-			if (walk.nbytes > AES_BLOCK_SIZE) {
-				memcpy(iv2, iv, AES_BLOCK_SIZE);
-				crypto_inc(iv2, AES_BLOCK_SIZE);
-			}
+				pmull_gcm_encrypt_block(iv, iv, NULL, nrounds);
 
-			kernel_neon_begin();
-			pmull_gcm_encrypt_block(iv, iv, ctx->aes_key.key_enc,
-						nrounds);
+				if (rem > AES_BLOCK_SIZE)
+					pmull_gcm_encrypt_block(iv2, iv2, NULL,
+								nrounds);
+			}
 
-			if (walk.nbytes > AES_BLOCK_SIZE)
-				pmull_gcm_encrypt_block(iv2, iv2, NULL,
-							nrounds);
 			kernel_neon_end();
-		}
+
+			err = skcipher_walk_done(&walk,
+					walk.nbytes % (2 * AES_BLOCK_SIZE));
+
+			rk = ctx->aes_key.key_enc;
+		} while (walk.nbytes >= 2 * AES_BLOCK_SIZE);
 	} else {
 		__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds);
 		put_unaligned_be32(2, iv + GCM_IV_SIZE);
 
-		err = skcipher_walk_aead_decrypt(&walk, req, false);
-
 		while (walk.nbytes >= AES_BLOCK_SIZE) {
 			int blocks = walk.nbytes / AES_BLOCK_SIZE;
 			u8 *dst = walk.dst.virt.addr;