author	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2013-04-08 14:51:16 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2013-04-25 09:01:53 -0400
commit	c456a9cd1ac4eae9147ffd7ac4fb77ca0fa980c6 (patch)
tree	f8e88f1e30b24e08104367fc6244dab88cd90d0e /arch
parent	b5c5b072dc2f35d45d3404b957e264a3e8e71069 (diff)
crypto: aesni_intel - add more optimized XTS mode for x86-64
Add more optimized XTS code for aesni_intel in 64-bit mode, for smaller
stack usage and a speed boost.

tcrypt results, with Intel i5-2450M:

256-bit key
        enc     dec
16B     0.98x   0.99x
64B     0.64x   0.63x
256B    1.29x   1.32x
1024B   1.54x   1.58x
8192B   1.57x   1.60x

512-bit key
        enc     dec
16B     0.98x   0.99x
64B     0.60x   0.59x
256B    1.24x   1.25x
1024B   1.39x   1.42x
8192B   1.38x   1.42x

I chose not to optimize for blocks smaller than 256 bytes, since XTS is
practically always used with 512-byte data blocks. This is why tcrypt shows
reduced performance for 64-byte blocks.

Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
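
For reference, the tweak update at the heart of the new assembly is a
multiplication by x in GF(2^128) using the XTS reduction constant 0x87. A
minimal scalar C sketch of that step is shown below; it is illustrative only,
and the function name and two-u64 little-endian layout are assumptions, not
the kernel's gf128mul helpers.

#include <stdint.h>

/* Multiply a 16-byte XTS tweak by x in GF(2^128), treating it as a 128-bit
 * little-endian integer: shift left by one and, if a bit was shifted out of
 * the top, reduce with the XTS polynomial constant 0x87.
 * Illustrative sketch only; t[0] is the low 64 bits, t[1] the high 64 bits. */
void xts_tweak_mul_x(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of bit 127 */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* 128-bit left shift by one */
	t[0] = (t[0] << 1) ^ (carry * 0x87);	/* conditional reduction */
}

The _aesni_gf128mul_x_ble macro added in this patch computes the same update
branchlessly in SSE registers: paddq doubles both 64-bit halves at once, and
the pshufd/psrad/pand/pxor sequence folds the carried-out bits back in through
the 0x87/0x01 mask loaded from .Lgf128mul_x_ble_mask.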
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/crypto/aesni-intel_asm.S	117
-rw-r--r--	arch/x86/crypto/aesni-intel_glue.c	80
2 files changed, 197 insertions(+), 0 deletions(-)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 04b797767b9e..62fe22cd4cba 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -34,6 +34,10 @@
 
 #ifdef __x86_64__
 .data
+.align 16
+.Lgf128mul_x_ble_mask:
+	.octa 0x00000000000000010000000000000087
+
 POLY: .octa 0xC2000000000000000000000000000001
 TWOONE: .octa 0x00000001000000000000000000000001
 
@@ -105,6 +109,8 @@ enc: .octa 0x2
 #define CTR %xmm11
 #define INC %xmm12
 
+#define GF128MUL_MASK %xmm10
+
 #ifdef __x86_64__
 #define AREG %rax
 #define KEYP %rdi
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
 .Lctr_enc_just_ret:
 	ret
 ENDPROC(aesni_ctr_enc)
+
+/*
+ * _aesni_gf128mul_x_ble:	internal ABI
+ *	Multiply in GF(2^128) for XTS IVs
+ * input:
+ *	IV:	current IV
+ *	GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ *	IV:	next IV
+ * changed:
+ *	CTR:	== temporary value
+ */
+#define _aesni_gf128mul_x_ble() \
+	pshufd $0x13, IV, CTR; \
+	paddq IV, IV; \
+	psrad $31, CTR; \
+	pand GF128MUL_MASK, CTR; \
+	pxor CTR, IV;
+
+/*
+ * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *			 bool enc, u8 *iv)
+ */
+ENTRY(aesni_xts_crypt8)
+	cmpb $0, %cl
+	movl $0, %ecx
+	movl $240, %r10d
+	leaq _aesni_enc4, %r11
+	leaq _aesni_dec4, %rax
+	cmovel %r10d, %ecx
+	cmoveq %rax, %r11
+
+	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+	movups (IVP), IV
+
+	mov 480(KEYP), KLEN
+	addq %rcx, KEYP
+
+	movdqa IV, STATE1
+	pxor 0x00(INP), STATE1
+	movdqu IV, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x10(INP), STATE2
+	movdqu IV, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x20(INP), STATE3
+	movdqu IV, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x30(INP), STATE4
+	movdqu IV, 0x30(OUTP)
+
+	call *%r11
+
+	pxor 0x00(OUTP), STATE1
+	movdqu STATE1, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE1
+	pxor 0x40(INP), STATE1
+	movdqu IV, 0x40(OUTP)
+
+	pxor 0x10(OUTP), STATE2
+	movdqu STATE2, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x50(INP), STATE2
+	movdqu IV, 0x50(OUTP)
+
+	pxor 0x20(OUTP), STATE3
+	movdqu STATE3, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x60(INP), STATE3
+	movdqu IV, 0x60(OUTP)
+
+	pxor 0x30(OUTP), STATE4
+	movdqu STATE4, 0x30(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x70(INP), STATE4
+	movdqu IV, 0x70(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movups IV, (IVP)
+
+	call *%r11
+
+	pxor 0x40(OUTP), STATE1
+	movdqu STATE1, 0x40(OUTP)
+
+	pxor 0x50(OUTP), STATE2
+	movdqu STATE2, 0x50(OUTP)
+
+	pxor 0x60(OUTP), STATE3
+	movdqu STATE3, 0x60(OUTP)
+
+	pxor 0x70(OUTP), STATE4
+	movdqu STATE4, 0x70(OUTP)
+
+	ret
+ENDPROC(aesni_xts_crypt8)
+
 #endif
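
For readers not fluent in the AES-NI assembly above, the per-block pattern it
implements is the standard XTS xor-encrypt-xor construction, with the tweak
advanced between blocks. Below is a hedged C-level sketch of a single block;
aes_encrypt_block and the other names are placeholders, not the kernel API,
and xts_tweak_mul_x is the tweak-doubling sketch shown after the commit
message.

#include <stdint.h>

/* Placeholder one-block AES primitive; in the patch this role is played by
 * the _aesni_enc4/_aesni_dec4 helpers, which process four blocks at a time. */
void aes_encrypt_block(const void *key, uint8_t out[16], const uint8_t in[16]);
/* Tweak doubling in GF(2^128), as sketched earlier. */
void xts_tweak_mul_x(uint64_t t[2]);

/* One XTS block: C = E_K(P xor T) xor T, then advance the tweak.
 * aesni_xts_crypt8 does this for eight blocks per call, stashing each tweak
 * in the output buffer so the post-encryption xor can reuse it. */
static void xts_one_block(const void *key, uint8_t dst[16],
			  const uint8_t src[16], uint64_t tweak[2])
{
	const uint8_t *t = (const uint8_t *)tweak; /* little-endian, as on x86 */
	uint8_t buf[16];
	int i;

	for (i = 0; i < 16; i++)
		buf[i] = src[i] ^ t[i];		/* pre-whitening with the tweak */

	aes_encrypt_block(key, dst, buf);

	for (i = 0; i < 16; i++)
		dst[i] ^= t[i];			/* post-whitening with the same tweak */

	xts_tweak_mul_x(tweak);			/* T <- T * x for the next block */
}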
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index a0795da22c02..f80e668785c0 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -39,6 +39,9 @@
 #include <crypto/internal/aead.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
+#ifdef CONFIG_X86_64
+#include <asm/crypto/glue_helper.h>
+#endif
 
 #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
 #define HAS_PCBC
@@ -102,6 +105,9 @@ void crypto_fpu_exit(void);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
+				 const u8 *in, bool enc, u8 *iv);
+
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
  * u8 *out, Ciphertext output. Encrypt in-place is allowed.
@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
 	aesni_enc(ctx, out, in);
 }
 
+#ifdef CONFIG_X86_64
+
+static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
+}
+
+static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
+}
+
+static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
+}
+
+static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
+}
+
+static const struct common_glue_ctx aesni_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx aesni_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
+	} }
+};
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+#else
+
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
+#endif
+
 #ifdef CONFIG_X86_64
 static int rfc4106_init(struct crypto_tfm *tfm)
 {
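
In the glue code, the two common_glue_ctx tables encode a simple dispatch
rule: use the eight-block assembly routine while at least eight blocks remain,
then finish the tail one block at a time. The glue_helper internals are not
part of this patch, so the following is only a rough, hypothetical C sketch of
that walk; every name in it is an assumption.

#include <stddef.h>
#include <stdint.h>

typedef void (*xts_fn_t)(void *ctx, uint8_t *dst, const uint8_t *src,
			 uint64_t tweak[2]);

/* Hypothetical sketch of the dispatch performed by glue_xts_crypt_128bit():
 * prefer the wide routine, fall back to single blocks for the tail. */
static void xts_walk(void *ctx, uint8_t *dst, const uint8_t *src,
		     size_t nblocks, uint64_t tweak[2],
		     xts_fn_t crypt8, xts_fn_t crypt1)
{
	while (nblocks >= 8) {
		crypt8(ctx, dst, src, tweak);	/* e.g. aesni_xts_enc8 */
		src += 8 * 16;
		dst += 8 * 16;
		nblocks -= 8;
	}
	while (nblocks--) {
		crypt1(ctx, dst, src, tweak);	/* e.g. aesni_xts_enc */
		src += 16;
		dst += 16;
	}
}

The fpu_blocks_limit of 1 appears to tell the glue layer to enable the
FPU/SSE context for any non-empty request, which these routines need since
they run entirely in XMM registers.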