author     Linus Torvalds <torvalds@linux-foundation.org>    2013-05-02 17:53:12 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-05-02 17:53:12 -0400
commit     797994f81a8b2bdca2eecffa415c1e7a89a4f961
tree       1383dc469c26ad37fdf960f682d9a48c782935c5 /arch
parent     c8d8566952fda026966784a62f324c8352f77430
parent     3862de1f6c442d53bd828d39f86d07d933a70605
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
- XTS mode optimisation for twofish/cast6/camellia/aes on x86
- AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia
- SSSE3/AVX/AVX2 optimisations for sha256/sha512
- Added driver for SAHARA2 crypto accelerator
- Fix for GMAC when used in non-IPsec scenarios
- Added generic CMAC implementation (including IPsec glue)
- IP update for crypto/atmel
- Support for more than one device in hwrng/timeriomem
- Added Broadcom BCM2835 RNG driver
- Misc fixes
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits)
crypto: caam - fix job ring cleanup code
crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher
crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher
crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher
crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
crypto: tcrypt - add async cipher speed tests for blowfish
crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2
crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86
crypto: aesni_intel - add more optimized XTS mode for x86-64
crypto: x86/camellia-aesni-avx - add more optimized XTS code
crypto: cast6-avx: use new optimized XTS code
crypto: x86/twofish-avx - use optimized XTS code
crypto: x86 - add more optimized XTS-mode for serpent-avx
xfrm: add rfc4494 AES-CMAC-96 support
crypto: add CMAC support to CryptoAPI
crypto: testmgr - add empty test vectors for null ciphers
crypto: testmgr - add AES GMAC test vectors
crypto: gcm - fix rfc4543 to handle async crypto correctly
crypto: gcm - make GMAC work when dst and src are different
hwrng: timeriomem - added devicetree hooks
...
Diffstat (limited to 'arch')
40 files changed, 10762 insertions, 240 deletions
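The generic CMAC template mentioned in the summary above registers as "cmac(<cipher>)" in the crypto API. As a hedged illustration (not code from this pull), a kernel user could compute an AES-CMAC through the synchronous shash interface roughly as below; the wrapper name cmac_aes_digest is made up for the example.

```c
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Sketch only: compute a 16-byte AES-CMAC over a contiguous buffer. */
static int cmac_aes_digest(const u8 *key, unsigned int keylen,
			   const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		err = -ENOMEM;
		goto out_free_tfm;
	}
	desc->tfm = tfm;

	err = crypto_shash_digest(desc, data, len, out);

	kfree(desc);
out_free_tfm:
	crypto_free_shash(tfm);
	return err;
}
```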
diff --git a/arch/arm/mach-at91/at91sam9g45_devices.c b/arch/arm/mach-at91/at91sam9g45_devices.c
index 827c9f2a70fb..f0bf68268ca2 100644
--- a/arch/arm/mach-at91/at91sam9g45_devices.c
+++ b/arch/arm/mach-at91/at91sam9g45_devices.c
@@ -18,7 +18,7 @@ | |||
18 | #include <linux/platform_device.h> | 18 | #include <linux/platform_device.h> |
19 | #include <linux/i2c-gpio.h> | 19 | #include <linux/i2c-gpio.h> |
20 | #include <linux/atmel-mci.h> | 20 | #include <linux/atmel-mci.h> |
21 | #include <linux/platform_data/atmel-aes.h> | 21 | #include <linux/platform_data/crypto-atmel.h> |
22 | 22 | ||
23 | #include <linux/platform_data/at91_adc.h> | 23 | #include <linux/platform_data/at91_adc.h> |
24 | 24 | ||
@@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {} | |||
1900 | * -------------------------------------------------------------------- */ | 1900 | * -------------------------------------------------------------------- */ |
1901 | 1901 | ||
1902 | #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE) | 1902 | #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE) |
1903 | static struct aes_platform_data aes_data; | 1903 | static struct crypto_platform_data aes_data; |
1904 | static struct crypto_dma_data alt_atslave; | ||
1904 | static u64 aes_dmamask = DMA_BIT_MASK(32); | 1905 | static u64 aes_dmamask = DMA_BIT_MASK(32); |
1905 | 1906 | ||
1906 | static struct resource aes_resources[] = { | 1907 | static struct resource aes_resources[] = { |
@@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = { | |||
1931 | static void __init at91_add_device_aes(void) | 1932 | static void __init at91_add_device_aes(void) |
1932 | { | 1933 | { |
1933 | struct at_dma_slave *atslave; | 1934 | struct at_dma_slave *atslave; |
1934 | struct aes_dma_data *alt_atslave; | ||
1935 | |||
1936 | alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL); | ||
1937 | 1935 | ||
1938 | /* DMA TX slave channel configuration */ | 1936 | /* DMA TX slave channel configuration */ |
1939 | atslave = &alt_atslave->txdata; | 1937 | atslave = &alt_atslave.txdata; |
1940 | atslave->dma_dev = &at_hdmac_device.dev; | 1938 | atslave->dma_dev = &at_hdmac_device.dev; |
1941 | atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW | | 1939 | atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW | |
1942 | ATC_SRC_PER(AT_DMA_ID_AES_RX); | 1940 | ATC_SRC_PER(AT_DMA_ID_AES_RX); |
1943 | 1941 | ||
1944 | /* DMA RX slave channel configuration */ | 1942 | /* DMA RX slave channel configuration */ |
1945 | atslave = &alt_atslave->rxdata; | 1943 | atslave = &alt_atslave.rxdata; |
1946 | atslave->dma_dev = &at_hdmac_device.dev; | 1944 | atslave->dma_dev = &at_hdmac_device.dev; |
1947 | atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW | | 1945 | atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW | |
1948 | ATC_DST_PER(AT_DMA_ID_AES_TX); | 1946 | ATC_DST_PER(AT_DMA_ID_AES_TX); |
1949 | 1947 | ||
1950 | aes_data.dma_slave = alt_atslave; | 1948 | aes_data.dma_slave = &alt_atslave; |
1951 | platform_device_register(&at91sam9g45_aes_device); | 1949 | platform_device_register(&at91sam9g45_aes_device); |
1952 | } | 1950 | } |
1953 | #else | 1951 | #else |
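The board-file hunk above switches from <linux/platform_data/atmel-aes.h> to the new <linux/platform_data/crypto-atmel.h> and replaces the kzalloc'd aes_dma_data with a static crypto_dma_data. A hedged sketch of the structure shape at91_add_device_aes() appears to assume follows; the field layout is inferred from this diff, not copied from the new header.

```c
#include <linux/platform_data/dma-atmel.h>	/* struct at_dma_slave (assumed location) */

/* Inferred shape of the types used by at91_add_device_aes() above. */
struct crypto_dma_data {
	struct at_dma_slave	txdata;		/* DMA TX slave channel configuration */
	struct at_dma_slave	rxdata;		/* DMA RX slave channel configuration */
};

struct crypto_platform_data {
	struct crypto_dma_data	*dma_slave;	/* set to &alt_atslave in the code above */
};
```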
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 63947a8f9f0f..a3a0ed80f17c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,10 @@ | |||
2 | # Arch-specific CryptoAPI modules. | 2 | # Arch-specific CryptoAPI modules. |
3 | # | 3 | # |
4 | 4 | ||
5 | avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) | ||
6 | avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ | ||
7 | $(comma)4)$(comma)%ymm2,yes,no) | ||
8 | |||
5 | obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o | 9 | obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o |
6 | obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o | 10 | obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o |
7 | 11 | ||
@@ -12,22 +16,37 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o | |||
12 | 16 | ||
13 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o | 17 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o |
14 | obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o | 18 | obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o |
15 | obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o | ||
16 | obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o | ||
17 | obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o | ||
18 | obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o | 19 | obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o |
19 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o | 20 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o |
20 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o | 21 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o |
21 | obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o | ||
22 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o | 22 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o |
23 | obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o | 23 | obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o |
24 | obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o | ||
25 | obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o | 24 | obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o |
26 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o | 25 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o |
27 | 26 | ||
28 | obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o | 27 | obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o |
29 | obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o | 28 | obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o |
30 | obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o | 29 | obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o |
30 | obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o | ||
31 | obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o | ||
32 | |||
33 | # These modules require assembler to support AVX. | ||
34 | ifeq ($(avx_supported),yes) | ||
35 | obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ | ||
36 | camellia-aesni-avx-x86_64.o | ||
37 | obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o | ||
38 | obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o | ||
39 | obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o | ||
40 | obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o | ||
41 | endif | ||
42 | |||
43 | # These modules require assembler to support AVX2. | ||
44 | ifeq ($(avx2_supported),yes) | ||
45 | obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o | ||
46 | obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o | ||
47 | obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o | ||
48 | obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o | ||
49 | endif | ||
31 | 50 | ||
32 | aes-i586-y := aes-i586-asm_32.o aes_glue.o | 51 | aes-i586-y := aes-i586-asm_32.o aes_glue.o |
33 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o | 52 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o |
@@ -36,21 +55,35 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o | |||
36 | 55 | ||
37 | aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o | 56 | aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o |
38 | camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o | 57 | camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o |
39 | camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ | ||
40 | camellia_aesni_avx_glue.o | ||
41 | cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o | ||
42 | cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o | ||
43 | blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o | 58 | blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o |
44 | twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o | 59 | twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o |
45 | twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o | 60 | twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o |
46 | twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o | ||
47 | salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o | 61 | salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o |
48 | serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o | 62 | serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o |
49 | serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o | 63 | |
64 | ifeq ($(avx_supported),yes) | ||
65 | camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ | ||
66 | camellia_aesni_avx_glue.o | ||
67 | cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o | ||
68 | cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o | ||
69 | twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \ | ||
70 | twofish_avx_glue.o | ||
71 | serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \ | ||
72 | serpent_avx_glue.o | ||
73 | endif | ||
74 | |||
75 | ifeq ($(avx2_supported),yes) | ||
76 | blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o | ||
77 | camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o | ||
78 | serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o | ||
79 | twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o | ||
80 | endif | ||
50 | 81 | ||
51 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o | 82 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o |
52 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o | 83 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o |
53 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o | 84 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o |
54 | crc32c-intel-y := crc32c-intel_glue.o | 85 | crc32c-intel-y := crc32c-intel_glue.o |
55 | crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o | 86 | crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o |
56 | crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o | 87 | crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o |
88 | sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o | ||
89 | sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o | ||
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 04b797767b9e..62fe22cd4cba 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -34,6 +34,10 @@ | |||
34 | 34 | ||
35 | #ifdef __x86_64__ | 35 | #ifdef __x86_64__ |
36 | .data | 36 | .data |
37 | .align 16 | ||
38 | .Lgf128mul_x_ble_mask: | ||
39 | .octa 0x00000000000000010000000000000087 | ||
40 | |||
37 | POLY: .octa 0xC2000000000000000000000000000001 | 41 | POLY: .octa 0xC2000000000000000000000000000001 |
38 | TWOONE: .octa 0x00000001000000000000000000000001 | 42 | TWOONE: .octa 0x00000001000000000000000000000001 |
39 | 43 | ||
@@ -105,6 +109,8 @@ enc: .octa 0x2 | |||
105 | #define CTR %xmm11 | 109 | #define CTR %xmm11 |
106 | #define INC %xmm12 | 110 | #define INC %xmm12 |
107 | 111 | ||
112 | #define GF128MUL_MASK %xmm10 | ||
113 | |||
108 | #ifdef __x86_64__ | 114 | #ifdef __x86_64__ |
109 | #define AREG %rax | 115 | #define AREG %rax |
110 | #define KEYP %rdi | 116 | #define KEYP %rdi |
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc) | |||
2636 | .Lctr_enc_just_ret: | 2642 | .Lctr_enc_just_ret: |
2637 | ret | 2643 | ret |
2638 | ENDPROC(aesni_ctr_enc) | 2644 | ENDPROC(aesni_ctr_enc) |
2645 | |||
2646 | /* | ||
2647 | * _aesni_gf128mul_x_ble: internal ABI | ||
2648 | * Multiply in GF(2^128) for XTS IVs | ||
2649 | * input: | ||
2650 | * IV: current IV | ||
2651 | * GF128MUL_MASK == mask with 0x87 and 0x01 | ||
2652 | * output: | ||
2653 | * IV: next IV | ||
2654 | * changed: | ||
2655 | * CTR: == temporary value | ||
2656 | */ | ||
2657 | #define _aesni_gf128mul_x_ble() \ | ||
2658 | pshufd $0x13, IV, CTR; \ | ||
2659 | paddq IV, IV; \ | ||
2660 | psrad $31, CTR; \ | ||
2661 | pand GF128MUL_MASK, CTR; \ | ||
2662 | pxor CTR, IV; | ||
2663 | |||
2664 | /* | ||
2665 | * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, | ||
2666 | * bool enc, u8 *iv) | ||
2667 | */ | ||
2668 | ENTRY(aesni_xts_crypt8) | ||
2669 | cmpb $0, %cl | ||
2670 | movl $0, %ecx | ||
2671 | movl $240, %r10d | ||
2672 | leaq _aesni_enc4, %r11 | ||
2673 | leaq _aesni_dec4, %rax | ||
2674 | cmovel %r10d, %ecx | ||
2675 | cmoveq %rax, %r11 | ||
2676 | |||
2677 | movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK | ||
2678 | movups (IVP), IV | ||
2679 | |||
2680 | mov 480(KEYP), KLEN | ||
2681 | addq %rcx, KEYP | ||
2682 | |||
2683 | movdqa IV, STATE1 | ||
2684 | pxor 0x00(INP), STATE1 | ||
2685 | movdqu IV, 0x00(OUTP) | ||
2686 | |||
2687 | _aesni_gf128mul_x_ble() | ||
2688 | movdqa IV, STATE2 | ||
2689 | pxor 0x10(INP), STATE2 | ||
2690 | movdqu IV, 0x10(OUTP) | ||
2691 | |||
2692 | _aesni_gf128mul_x_ble() | ||
2693 | movdqa IV, STATE3 | ||
2694 | pxor 0x20(INP), STATE3 | ||
2695 | movdqu IV, 0x20(OUTP) | ||
2696 | |||
2697 | _aesni_gf128mul_x_ble() | ||
2698 | movdqa IV, STATE4 | ||
2699 | pxor 0x30(INP), STATE4 | ||
2700 | movdqu IV, 0x30(OUTP) | ||
2701 | |||
2702 | call *%r11 | ||
2703 | |||
2704 | pxor 0x00(OUTP), STATE1 | ||
2705 | movdqu STATE1, 0x00(OUTP) | ||
2706 | |||
2707 | _aesni_gf128mul_x_ble() | ||
2708 | movdqa IV, STATE1 | ||
2709 | pxor 0x40(INP), STATE1 | ||
2710 | movdqu IV, 0x40(OUTP) | ||
2711 | |||
2712 | pxor 0x10(OUTP), STATE2 | ||
2713 | movdqu STATE2, 0x10(OUTP) | ||
2714 | |||
2715 | _aesni_gf128mul_x_ble() | ||
2716 | movdqa IV, STATE2 | ||
2717 | pxor 0x50(INP), STATE2 | ||
2718 | movdqu IV, 0x50(OUTP) | ||
2719 | |||
2720 | pxor 0x20(OUTP), STATE3 | ||
2721 | movdqu STATE3, 0x20(OUTP) | ||
2722 | |||
2723 | _aesni_gf128mul_x_ble() | ||
2724 | movdqa IV, STATE3 | ||
2725 | pxor 0x60(INP), STATE3 | ||
2726 | movdqu IV, 0x60(OUTP) | ||
2727 | |||
2728 | pxor 0x30(OUTP), STATE4 | ||
2729 | movdqu STATE4, 0x30(OUTP) | ||
2730 | |||
2731 | _aesni_gf128mul_x_ble() | ||
2732 | movdqa IV, STATE4 | ||
2733 | pxor 0x70(INP), STATE4 | ||
2734 | movdqu IV, 0x70(OUTP) | ||
2735 | |||
2736 | _aesni_gf128mul_x_ble() | ||
2737 | movups IV, (IVP) | ||
2738 | |||
2739 | call *%r11 | ||
2740 | |||
2741 | pxor 0x40(OUTP), STATE1 | ||
2742 | movdqu STATE1, 0x40(OUTP) | ||
2743 | |||
2744 | pxor 0x50(OUTP), STATE2 | ||
2745 | movdqu STATE2, 0x50(OUTP) | ||
2746 | |||
2747 | pxor 0x60(OUTP), STATE3 | ||
2748 | movdqu STATE3, 0x60(OUTP) | ||
2749 | |||
2750 | pxor 0x70(OUTP), STATE4 | ||
2751 | movdqu STATE4, 0x70(OUTP) | ||
2752 | |||
2753 | ret | ||
2754 | ENDPROC(aesni_xts_crypt8) | ||
2755 | |||
2639 | #endif | 2756 | #endif |
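The _aesni_gf128mul_x_ble() macro added above advances the XTS tweak from one block to the next. A hedged, standalone C sketch of the same operation (not taken from this patch): the 128-bit tweak is treated as two 64-bit halves with the low half first, shifted left by one bit, and reduced with the XTS polynomial constant 0x87 when a bit falls off the top.

```c
#include <stdint.h>

/* Multiply the XTS tweak by x in GF(2^128); t[0] holds the low 64 bits. */
static void gf128mul_x_ble(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of the top half */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* shift the 128-bit value left by one */
	t[0] = (t[0] << 1) ^ (carry * 0x87);	/* fold the carry back: x^128 = x^7 + x^2 + x + 1 */
}
```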
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index a0795da22c02..f80e668785c0 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -39,6 +39,9 @@ | |||
39 | #include <crypto/internal/aead.h> | 39 | #include <crypto/internal/aead.h> |
40 | #include <linux/workqueue.h> | 40 | #include <linux/workqueue.h> |
41 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #ifdef CONFIG_X86_64 | ||
43 | #include <asm/crypto/glue_helper.h> | ||
44 | #endif | ||
42 | 45 | ||
43 | #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) | 46 | #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) |
44 | #define HAS_PCBC | 47 | #define HAS_PCBC |
@@ -102,6 +105,9 @@ void crypto_fpu_exit(void); | |||
102 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, | 105 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, |
103 | const u8 *in, unsigned int len, u8 *iv); | 106 | const u8 *in, unsigned int len, u8 *iv); |
104 | 107 | ||
108 | asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, | ||
109 | const u8 *in, bool enc, u8 *iv); | ||
110 | |||
105 | /* asmlinkage void aesni_gcm_enc() | 111 | /* asmlinkage void aesni_gcm_enc() |
106 | * void *ctx, AES Key schedule. Starts on a 16 byte boundary. | 112 | * void *ctx, AES Key schedule. Starts on a 16 byte boundary. |
107 | * u8 *out, Ciphertext output. Encrypt in-place is allowed. | 113 | * u8 *out, Ciphertext output. Encrypt in-place is allowed. |
@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) | |||
510 | aesni_enc(ctx, out, in); | 516 | aesni_enc(ctx, out, in); |
511 | } | 517 | } |
512 | 518 | ||
519 | #ifdef CONFIG_X86_64 | ||
520 | |||
521 | static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
522 | { | ||
523 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); | ||
524 | } | ||
525 | |||
526 | static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
527 | { | ||
528 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec)); | ||
529 | } | ||
530 | |||
531 | static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
532 | { | ||
533 | aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv); | ||
534 | } | ||
535 | |||
536 | static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
537 | { | ||
538 | aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv); | ||
539 | } | ||
540 | |||
541 | static const struct common_glue_ctx aesni_enc_xts = { | ||
542 | .num_funcs = 2, | ||
543 | .fpu_blocks_limit = 1, | ||
544 | |||
545 | .funcs = { { | ||
546 | .num_blocks = 8, | ||
547 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) } | ||
548 | }, { | ||
549 | .num_blocks = 1, | ||
550 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) } | ||
551 | } } | ||
552 | }; | ||
553 | |||
554 | static const struct common_glue_ctx aesni_dec_xts = { | ||
555 | .num_funcs = 2, | ||
556 | .fpu_blocks_limit = 1, | ||
557 | |||
558 | .funcs = { { | ||
559 | .num_blocks = 8, | ||
560 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) } | ||
561 | }, { | ||
562 | .num_blocks = 1, | ||
563 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) } | ||
564 | } } | ||
565 | }; | ||
566 | |||
567 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
568 | struct scatterlist *src, unsigned int nbytes) | ||
569 | { | ||
570 | struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
571 | |||
572 | return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, | ||
573 | XTS_TWEAK_CAST(aesni_xts_tweak), | ||
574 | aes_ctx(ctx->raw_tweak_ctx), | ||
575 | aes_ctx(ctx->raw_crypt_ctx)); | ||
576 | } | ||
577 | |||
578 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
579 | struct scatterlist *src, unsigned int nbytes) | ||
580 | { | ||
581 | struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
582 | |||
583 | return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, | ||
584 | XTS_TWEAK_CAST(aesni_xts_tweak), | ||
585 | aes_ctx(ctx->raw_tweak_ctx), | ||
586 | aes_ctx(ctx->raw_crypt_ctx)); | ||
587 | } | ||
588 | |||
589 | #else | ||
590 | |||
513 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 591 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
514 | struct scatterlist *src, unsigned int nbytes) | 592 | struct scatterlist *src, unsigned int nbytes) |
515 | { | 593 | { |
@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
560 | return ret; | 638 | return ret; |
561 | } | 639 | } |
562 | 640 | ||
641 | #endif | ||
642 | |||
563 | #ifdef CONFIG_X86_64 | 643 | #ifdef CONFIG_X86_64 |
564 | static int rfc4106_init(struct crypto_tfm *tfm) | 644 | static int rfc4106_init(struct crypto_tfm *tfm) |
565 | { | 645 | { |
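The glue code above splits XTS work between the 8-block assembler path (aesni_xts_crypt8) and a single-block fallback selected through common_glue_ctx. For reference, a hedged C sketch of what XTS encryption does per block (not kernel code; aes_encrypt_block() and gf128mul_x_ble() are assumed helpers, the latter as sketched after the assembler file above).

```c
#include <stdint.h>

/* Assumed helpers for this sketch, not real kernel API in this form. */
void aes_encrypt_block(const void *key, uint8_t *out, const uint8_t *in);
void gf128mul_x_ble(uint64_t tweak[2]);

static void xts_encrypt_blocks(const void *key, uint8_t *dst, const uint8_t *src,
			       unsigned int nblocks, uint64_t tweak[2])
{
	const uint8_t *t = (const uint8_t *)tweak;
	unsigned int i, j;

	for (i = 0; i < nblocks; i++, src += 16, dst += 16) {
		uint8_t buf[16];

		for (j = 0; j < 16; j++)	/* pre-whiten with the current tweak */
			buf[j] = src[j] ^ t[j];

		aes_encrypt_block(key, buf, buf);

		for (j = 0; j < 16; j++)	/* post-whiten with the same tweak */
			dst[j] = buf[j] ^ t[j];

		gf128mul_x_ble(tweak);		/* next block uses tweak * x */
	}
}
```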
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S
new file mode 100644
index 000000000000..784452e0d05d
--- /dev/null
+++ b/arch/x86/crypto/blowfish-avx2-asm_64.S
@@ -0,0 +1,449 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Blowfish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | |||
15 | .file "blowfish-avx2-asm_64.S" | ||
16 | |||
17 | .data | ||
18 | .align 32 | ||
19 | |||
20 | .Lprefetch_mask: | ||
21 | .long 0*64 | ||
22 | .long 1*64 | ||
23 | .long 2*64 | ||
24 | .long 3*64 | ||
25 | .long 4*64 | ||
26 | .long 5*64 | ||
27 | .long 6*64 | ||
28 | .long 7*64 | ||
29 | |||
30 | .Lbswap32_mask: | ||
31 | .long 0x00010203 | ||
32 | .long 0x04050607 | ||
33 | .long 0x08090a0b | ||
34 | .long 0x0c0d0e0f | ||
35 | |||
36 | .Lbswap128_mask: | ||
37 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
38 | .Lbswap_iv_mask: | ||
39 | .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 | ||
40 | |||
41 | .text | ||
42 | /* structure of crypto context */ | ||
43 | #define p 0 | ||
44 | #define s0 ((16 + 2) * 4) | ||
45 | #define s1 ((16 + 2 + (1 * 256)) * 4) | ||
46 | #define s2 ((16 + 2 + (2 * 256)) * 4) | ||
47 | #define s3 ((16 + 2 + (3 * 256)) * 4) | ||
48 | |||
49 | /* register macros */ | ||
50 | #define CTX %rdi | ||
51 | #define RIO %rdx | ||
52 | |||
53 | #define RS0 %rax | ||
54 | #define RS1 %r8 | ||
55 | #define RS2 %r9 | ||
56 | #define RS3 %r10 | ||
57 | |||
58 | #define RLOOP %r11 | ||
59 | #define RLOOPd %r11d | ||
60 | |||
61 | #define RXr0 %ymm8 | ||
62 | #define RXr1 %ymm9 | ||
63 | #define RXr2 %ymm10 | ||
64 | #define RXr3 %ymm11 | ||
65 | #define RXl0 %ymm12 | ||
66 | #define RXl1 %ymm13 | ||
67 | #define RXl2 %ymm14 | ||
68 | #define RXl3 %ymm15 | ||
69 | |||
70 | /* temp regs */ | ||
71 | #define RT0 %ymm0 | ||
72 | #define RT0x %xmm0 | ||
73 | #define RT1 %ymm1 | ||
74 | #define RT1x %xmm1 | ||
75 | #define RIDX0 %ymm2 | ||
76 | #define RIDX1 %ymm3 | ||
77 | #define RIDX1x %xmm3 | ||
78 | #define RIDX2 %ymm4 | ||
79 | #define RIDX3 %ymm5 | ||
80 | |||
81 | /* vpgatherdd mask and '-1' */ | ||
82 | #define RNOT %ymm6 | ||
83 | |||
84 | /* byte mask, (-1 >> 24) */ | ||
85 | #define RBYTE %ymm7 | ||
86 | |||
87 | /*********************************************************************** | ||
88 | * 32-way AVX2 blowfish | ||
89 | ***********************************************************************/ | ||
90 | #define F(xl, xr) \ | ||
91 | vpsrld $24, xl, RIDX0; \ | ||
92 | vpsrld $16, xl, RIDX1; \ | ||
93 | vpsrld $8, xl, RIDX2; \ | ||
94 | vpand RBYTE, RIDX1, RIDX1; \ | ||
95 | vpand RBYTE, RIDX2, RIDX2; \ | ||
96 | vpand RBYTE, xl, RIDX3; \ | ||
97 | \ | ||
98 | vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \ | ||
99 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
100 | vpcmpeqd RIDX0, RIDX0, RIDX0; \ | ||
101 | \ | ||
102 | vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \ | ||
103 | vpcmpeqd RIDX1, RIDX1, RIDX1; \ | ||
104 | vpaddd RT0, RT1, RT0; \ | ||
105 | \ | ||
106 | vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \ | ||
107 | vpxor RT0, RT1, RT0; \ | ||
108 | \ | ||
109 | vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \ | ||
110 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
111 | vpaddd RT0, RT1, RT0; \ | ||
112 | \ | ||
113 | vpxor RT0, xr, xr; | ||
114 | |||
115 | #define add_roundkey(xl, nmem) \ | ||
116 | vpbroadcastd nmem, RT0; \ | ||
117 | vpxor RT0, xl ## 0, xl ## 0; \ | ||
118 | vpxor RT0, xl ## 1, xl ## 1; \ | ||
119 | vpxor RT0, xl ## 2, xl ## 2; \ | ||
120 | vpxor RT0, xl ## 3, xl ## 3; | ||
121 | |||
122 | #define round_enc() \ | ||
123 | add_roundkey(RXr, p(CTX,RLOOP,4)); \ | ||
124 | F(RXl0, RXr0); \ | ||
125 | F(RXl1, RXr1); \ | ||
126 | F(RXl2, RXr2); \ | ||
127 | F(RXl3, RXr3); \ | ||
128 | \ | ||
129 | add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ | ||
130 | F(RXr0, RXl0); \ | ||
131 | F(RXr1, RXl1); \ | ||
132 | F(RXr2, RXl2); \ | ||
133 | F(RXr3, RXl3); | ||
134 | |||
135 | #define round_dec() \ | ||
136 | add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \ | ||
137 | F(RXl0, RXr0); \ | ||
138 | F(RXl1, RXr1); \ | ||
139 | F(RXl2, RXr2); \ | ||
140 | F(RXl3, RXr3); \ | ||
141 | \ | ||
142 | add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ | ||
143 | F(RXr0, RXl0); \ | ||
144 | F(RXr1, RXl1); \ | ||
145 | F(RXr2, RXl2); \ | ||
146 | F(RXr3, RXl3); | ||
147 | |||
148 | #define init_round_constants() \ | ||
149 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
150 | leaq s0(CTX), RS0; \ | ||
151 | leaq s1(CTX), RS1; \ | ||
152 | leaq s2(CTX), RS2; \ | ||
153 | leaq s3(CTX), RS3; \ | ||
154 | vpsrld $24, RNOT, RBYTE; | ||
155 | |||
156 | #define transpose_2x2(x0, x1, t0) \ | ||
157 | vpunpckldq x0, x1, t0; \ | ||
158 | vpunpckhdq x0, x1, x1; \ | ||
159 | \ | ||
160 | vpunpcklqdq t0, x1, x0; \ | ||
161 | vpunpckhqdq t0, x1, x1; | ||
162 | |||
163 | #define read_block(xl, xr) \ | ||
164 | vbroadcasti128 .Lbswap32_mask, RT1; \ | ||
165 | \ | ||
166 | vpshufb RT1, xl ## 0, xl ## 0; \ | ||
167 | vpshufb RT1, xr ## 0, xr ## 0; \ | ||
168 | vpshufb RT1, xl ## 1, xl ## 1; \ | ||
169 | vpshufb RT1, xr ## 1, xr ## 1; \ | ||
170 | vpshufb RT1, xl ## 2, xl ## 2; \ | ||
171 | vpshufb RT1, xr ## 2, xr ## 2; \ | ||
172 | vpshufb RT1, xl ## 3, xl ## 3; \ | ||
173 | vpshufb RT1, xr ## 3, xr ## 3; \ | ||
174 | \ | ||
175 | transpose_2x2(xl ## 0, xr ## 0, RT0); \ | ||
176 | transpose_2x2(xl ## 1, xr ## 1, RT0); \ | ||
177 | transpose_2x2(xl ## 2, xr ## 2, RT0); \ | ||
178 | transpose_2x2(xl ## 3, xr ## 3, RT0); | ||
179 | |||
180 | #define write_block(xl, xr) \ | ||
181 | vbroadcasti128 .Lbswap32_mask, RT1; \ | ||
182 | \ | ||
183 | transpose_2x2(xl ## 0, xr ## 0, RT0); \ | ||
184 | transpose_2x2(xl ## 1, xr ## 1, RT0); \ | ||
185 | transpose_2x2(xl ## 2, xr ## 2, RT0); \ | ||
186 | transpose_2x2(xl ## 3, xr ## 3, RT0); \ | ||
187 | \ | ||
188 | vpshufb RT1, xl ## 0, xl ## 0; \ | ||
189 | vpshufb RT1, xr ## 0, xr ## 0; \ | ||
190 | vpshufb RT1, xl ## 1, xl ## 1; \ | ||
191 | vpshufb RT1, xr ## 1, xr ## 1; \ | ||
192 | vpshufb RT1, xl ## 2, xl ## 2; \ | ||
193 | vpshufb RT1, xr ## 2, xr ## 2; \ | ||
194 | vpshufb RT1, xl ## 3, xl ## 3; \ | ||
195 | vpshufb RT1, xr ## 3, xr ## 3; | ||
196 | |||
197 | .align 8 | ||
198 | __blowfish_enc_blk32: | ||
199 | /* input: | ||
200 | * %rdi: ctx, CTX | ||
201 | * RXl0..4, RXr0..4: plaintext | ||
202 | * output: | ||
203 | * RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped) | ||
204 | */ | ||
205 | init_round_constants(); | ||
206 | |||
207 | read_block(RXl, RXr); | ||
208 | |||
209 | movl $1, RLOOPd; | ||
210 | add_roundkey(RXl, p+4*(0)(CTX)); | ||
211 | |||
212 | .align 4 | ||
213 | .L__enc_loop: | ||
214 | round_enc(); | ||
215 | |||
216 | leal 2(RLOOPd), RLOOPd; | ||
217 | cmpl $17, RLOOPd; | ||
218 | jne .L__enc_loop; | ||
219 | |||
220 | add_roundkey(RXr, p+4*(17)(CTX)); | ||
221 | |||
222 | write_block(RXl, RXr); | ||
223 | |||
224 | ret; | ||
225 | ENDPROC(__blowfish_enc_blk32) | ||
226 | |||
227 | .align 8 | ||
228 | __blowfish_dec_blk32: | ||
229 | /* input: | ||
230 | * %rdi: ctx, CTX | ||
231 | * RXl0..4, RXr0..4: ciphertext | ||
232 | * output: | ||
233 | * RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped) | ||
234 | */ | ||
235 | init_round_constants(); | ||
236 | |||
237 | read_block(RXl, RXr); | ||
238 | |||
239 | movl $14, RLOOPd; | ||
240 | add_roundkey(RXl, p+4*(17)(CTX)); | ||
241 | |||
242 | .align 4 | ||
243 | .L__dec_loop: | ||
244 | round_dec(); | ||
245 | |||
246 | addl $-2, RLOOPd; | ||
247 | jns .L__dec_loop; | ||
248 | |||
249 | add_roundkey(RXr, p+4*(0)(CTX)); | ||
250 | |||
251 | write_block(RXl, RXr); | ||
252 | |||
253 | ret; | ||
254 | ENDPROC(__blowfish_dec_blk32) | ||
255 | |||
256 | ENTRY(blowfish_ecb_enc_32way) | ||
257 | /* input: | ||
258 | * %rdi: ctx, CTX | ||
259 | * %rsi: dst | ||
260 | * %rdx: src | ||
261 | */ | ||
262 | |||
263 | vzeroupper; | ||
264 | |||
265 | vmovdqu 0*32(%rdx), RXl0; | ||
266 | vmovdqu 1*32(%rdx), RXr0; | ||
267 | vmovdqu 2*32(%rdx), RXl1; | ||
268 | vmovdqu 3*32(%rdx), RXr1; | ||
269 | vmovdqu 4*32(%rdx), RXl2; | ||
270 | vmovdqu 5*32(%rdx), RXr2; | ||
271 | vmovdqu 6*32(%rdx), RXl3; | ||
272 | vmovdqu 7*32(%rdx), RXr3; | ||
273 | |||
274 | call __blowfish_enc_blk32; | ||
275 | |||
276 | vmovdqu RXr0, 0*32(%rsi); | ||
277 | vmovdqu RXl0, 1*32(%rsi); | ||
278 | vmovdqu RXr1, 2*32(%rsi); | ||
279 | vmovdqu RXl1, 3*32(%rsi); | ||
280 | vmovdqu RXr2, 4*32(%rsi); | ||
281 | vmovdqu RXl2, 5*32(%rsi); | ||
282 | vmovdqu RXr3, 6*32(%rsi); | ||
283 | vmovdqu RXl3, 7*32(%rsi); | ||
284 | |||
285 | vzeroupper; | ||
286 | |||
287 | ret; | ||
288 | ENDPROC(blowfish_ecb_enc_32way) | ||
289 | |||
290 | ENTRY(blowfish_ecb_dec_32way) | ||
291 | /* input: | ||
292 | * %rdi: ctx, CTX | ||
293 | * %rsi: dst | ||
294 | * %rdx: src | ||
295 | */ | ||
296 | |||
297 | vzeroupper; | ||
298 | |||
299 | vmovdqu 0*32(%rdx), RXl0; | ||
300 | vmovdqu 1*32(%rdx), RXr0; | ||
301 | vmovdqu 2*32(%rdx), RXl1; | ||
302 | vmovdqu 3*32(%rdx), RXr1; | ||
303 | vmovdqu 4*32(%rdx), RXl2; | ||
304 | vmovdqu 5*32(%rdx), RXr2; | ||
305 | vmovdqu 6*32(%rdx), RXl3; | ||
306 | vmovdqu 7*32(%rdx), RXr3; | ||
307 | |||
308 | call __blowfish_dec_blk32; | ||
309 | |||
310 | vmovdqu RXr0, 0*32(%rsi); | ||
311 | vmovdqu RXl0, 1*32(%rsi); | ||
312 | vmovdqu RXr1, 2*32(%rsi); | ||
313 | vmovdqu RXl1, 3*32(%rsi); | ||
314 | vmovdqu RXr2, 4*32(%rsi); | ||
315 | vmovdqu RXl2, 5*32(%rsi); | ||
316 | vmovdqu RXr3, 6*32(%rsi); | ||
317 | vmovdqu RXl3, 7*32(%rsi); | ||
318 | |||
319 | vzeroupper; | ||
320 | |||
321 | ret; | ||
322 | ENDPROC(blowfish_ecb_dec_32way) | ||
323 | |||
324 | ENTRY(blowfish_cbc_dec_32way) | ||
325 | /* input: | ||
326 | * %rdi: ctx, CTX | ||
327 | * %rsi: dst | ||
328 | * %rdx: src | ||
329 | */ | ||
330 | |||
331 | vzeroupper; | ||
332 | |||
333 | vmovdqu 0*32(%rdx), RXl0; | ||
334 | vmovdqu 1*32(%rdx), RXr0; | ||
335 | vmovdqu 2*32(%rdx), RXl1; | ||
336 | vmovdqu 3*32(%rdx), RXr1; | ||
337 | vmovdqu 4*32(%rdx), RXl2; | ||
338 | vmovdqu 5*32(%rdx), RXr2; | ||
339 | vmovdqu 6*32(%rdx), RXl3; | ||
340 | vmovdqu 7*32(%rdx), RXr3; | ||
341 | |||
342 | call __blowfish_dec_blk32; | ||
343 | |||
344 | /* xor with src */ | ||
345 | vmovq (%rdx), RT0x; | ||
346 | vpshufd $0x4f, RT0x, RT0x; | ||
347 | vinserti128 $1, 8(%rdx), RT0, RT0; | ||
348 | vpxor RT0, RXr0, RXr0; | ||
349 | vpxor 0*32+24(%rdx), RXl0, RXl0; | ||
350 | vpxor 1*32+24(%rdx), RXr1, RXr1; | ||
351 | vpxor 2*32+24(%rdx), RXl1, RXl1; | ||
352 | vpxor 3*32+24(%rdx), RXr2, RXr2; | ||
353 | vpxor 4*32+24(%rdx), RXl2, RXl2; | ||
354 | vpxor 5*32+24(%rdx), RXr3, RXr3; | ||
355 | vpxor 6*32+24(%rdx), RXl3, RXl3; | ||
356 | |||
357 | vmovdqu RXr0, (0*32)(%rsi); | ||
358 | vmovdqu RXl0, (1*32)(%rsi); | ||
359 | vmovdqu RXr1, (2*32)(%rsi); | ||
360 | vmovdqu RXl1, (3*32)(%rsi); | ||
361 | vmovdqu RXr2, (4*32)(%rsi); | ||
362 | vmovdqu RXl2, (5*32)(%rsi); | ||
363 | vmovdqu RXr3, (6*32)(%rsi); | ||
364 | vmovdqu RXl3, (7*32)(%rsi); | ||
365 | |||
366 | vzeroupper; | ||
367 | |||
368 | ret; | ||
369 | ENDPROC(blowfish_cbc_dec_32way) | ||
370 | |||
371 | ENTRY(blowfish_ctr_32way) | ||
372 | /* input: | ||
373 | * %rdi: ctx, CTX | ||
374 | * %rsi: dst | ||
375 | * %rdx: src | ||
376 | * %rcx: iv (big endian, 64bit) | ||
377 | */ | ||
378 | |||
379 | vzeroupper; | ||
380 | |||
381 | vpcmpeqd RT0, RT0, RT0; | ||
382 | vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */ | ||
383 | |||
384 | vpcmpeqd RT1x, RT1x, RT1x; | ||
385 | vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */ | ||
386 | vpxor RIDX0, RIDX0, RIDX0; | ||
387 | vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */ | ||
388 | |||
389 | vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */ | ||
390 | |||
391 | vpcmpeqd RT1, RT1, RT1; | ||
392 | vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */ | ||
393 | vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */ | ||
394 | |||
395 | vbroadcasti128 .Lbswap_iv_mask, RIDX0; | ||
396 | vbroadcasti128 .Lbswap128_mask, RIDX1; | ||
397 | |||
398 | /* load IV and byteswap */ | ||
399 | vmovq (%rcx), RT1x; | ||
400 | vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */ | ||
401 | vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */ | ||
402 | |||
403 | /* construct IVs */ | ||
404 | vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */ | ||
405 | vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */ | ||
406 | vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */ | ||
407 | vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */ | ||
408 | vpsubq RIDX2, RT1, RT1; | ||
409 | vpshufb RIDX1, RT1, RXl1; | ||
410 | vpsubq RIDX2, RT1, RT1; | ||
411 | vpshufb RIDX1, RT1, RXr1; | ||
412 | vpsubq RIDX2, RT1, RT1; | ||
413 | vpshufb RIDX1, RT1, RXl2; | ||
414 | vpsubq RIDX2, RT1, RT1; | ||
415 | vpshufb RIDX1, RT1, RXr2; | ||
416 | vpsubq RIDX2, RT1, RT1; | ||
417 | vpshufb RIDX1, RT1, RXl3; | ||
418 | vpsubq RIDX2, RT1, RT1; | ||
419 | vpshufb RIDX1, RT1, RXr3; | ||
420 | |||
421 | /* store last IV */ | ||
422 | vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */ | ||
423 | vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */ | ||
424 | vmovq RT1x, (%rcx); | ||
425 | |||
426 | call __blowfish_enc_blk32; | ||
427 | |||
428 | /* dst = src ^ iv */ | ||
429 | vpxor 0*32(%rdx), RXr0, RXr0; | ||
430 | vpxor 1*32(%rdx), RXl0, RXl0; | ||
431 | vpxor 2*32(%rdx), RXr1, RXr1; | ||
432 | vpxor 3*32(%rdx), RXl1, RXl1; | ||
433 | vpxor 4*32(%rdx), RXr2, RXr2; | ||
434 | vpxor 5*32(%rdx), RXl2, RXl2; | ||
435 | vpxor 6*32(%rdx), RXr3, RXr3; | ||
436 | vpxor 7*32(%rdx), RXl3, RXl3; | ||
437 | vmovdqu RXr0, (0*32)(%rsi); | ||
438 | vmovdqu RXl0, (1*32)(%rsi); | ||
439 | vmovdqu RXr1, (2*32)(%rsi); | ||
440 | vmovdqu RXl1, (3*32)(%rsi); | ||
441 | vmovdqu RXr2, (4*32)(%rsi); | ||
442 | vmovdqu RXl2, (5*32)(%rsi); | ||
443 | vmovdqu RXr3, (6*32)(%rsi); | ||
444 | vmovdqu RXl3, (7*32)(%rsi); | ||
445 | |||
446 | vzeroupper; | ||
447 | |||
448 | ret; | ||
449 | ENDPROC(blowfish_ctr_32way) | ||
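This new file vectorises the classic Blowfish round function: the F() macro near the top performs the four S-box lookups with vpgatherdd for eight 32-bit words per ymm register, then XORs the result into the other half of the block. A hedged scalar reference for the same round function (not part of this patch; the context layout is simplified to just the S-boxes).

```c
#include <stdint.h>

/* Simplified context: four 256-entry S-boxes, matching the s0..s3 offsets above. */
struct bf_sboxes {
	uint32_t s[4][256];
};

static uint32_t blowfish_f(const struct bf_sboxes *ctx, uint32_t x)
{
	uint32_t a = x >> 24;		/* matches vpsrld $24 */
	uint32_t b = (x >> 16) & 0xff;	/* matches vpsrld $16 + byte mask */
	uint32_t c = (x >> 8) & 0xff;	/* matches vpsrld $8 + byte mask */
	uint32_t d = x & 0xff;		/* matches vpand RBYTE */

	return ((ctx->s[0][a] + ctx->s[1][b]) ^ ctx->s[2][c]) + ctx->s[3][d];
}
```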
diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c
new file mode 100644
index 000000000000..4417e9aea78d
--- /dev/null
+++ b/arch/x86/crypto/blowfish_avx2_glue.c
@@ -0,0 +1,585 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | ||
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
8 | * CTR part based on code (crypto/ctr.c) by: | ||
9 | * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/crypto.h> | ||
26 | #include <linux/err.h> | ||
27 | #include <crypto/algapi.h> | ||
28 | #include <crypto/blowfish.h> | ||
29 | #include <crypto/cryptd.h> | ||
30 | #include <crypto/ctr.h> | ||
31 | #include <asm/i387.h> | ||
32 | #include <asm/xcr.h> | ||
33 | #include <asm/xsave.h> | ||
34 | #include <asm/crypto/blowfish.h> | ||
35 | #include <asm/crypto/ablk_helper.h> | ||
36 | #include <crypto/scatterwalk.h> | ||
37 | |||
38 | #define BF_AVX2_PARALLEL_BLOCKS 32 | ||
39 | |||
40 | /* 32-way AVX2 parallel cipher functions */ | ||
41 | asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst, | ||
42 | const u8 *src); | ||
43 | asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst, | ||
44 | const u8 *src); | ||
45 | asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst, | ||
46 | const u8 *src); | ||
47 | asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src, | ||
48 | __be64 *iv); | ||
49 | |||
50 | static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
51 | { | ||
52 | if (fpu_enabled) | ||
53 | return true; | ||
54 | |||
55 | /* FPU is only used when chunk to be processed is large enough, so | ||
56 | * do not enable FPU until it is necessary. | ||
57 | */ | ||
58 | if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS) | ||
59 | return false; | ||
60 | |||
61 | kernel_fpu_begin(); | ||
62 | return true; | ||
63 | } | ||
64 | |||
65 | static inline void bf_fpu_end(bool fpu_enabled) | ||
66 | { | ||
67 | if (fpu_enabled) | ||
68 | kernel_fpu_end(); | ||
69 | } | ||
70 | |||
71 | static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | ||
72 | bool enc) | ||
73 | { | ||
74 | bool fpu_enabled = false; | ||
75 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
76 | const unsigned int bsize = BF_BLOCK_SIZE; | ||
77 | unsigned int nbytes; | ||
78 | int err; | ||
79 | |||
80 | err = blkcipher_walk_virt(desc, walk); | ||
81 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
82 | |||
83 | while ((nbytes = walk->nbytes)) { | ||
84 | u8 *wsrc = walk->src.virt.addr; | ||
85 | u8 *wdst = walk->dst.virt.addr; | ||
86 | |||
87 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
88 | |||
89 | /* Process multi-block AVX2 batch */ | ||
90 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
91 | do { | ||
92 | if (enc) | ||
93 | blowfish_ecb_enc_32way(ctx, wdst, wsrc); | ||
94 | else | ||
95 | blowfish_ecb_dec_32way(ctx, wdst, wsrc); | ||
96 | |||
97 | wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
98 | wdst += bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
99 | nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
100 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
101 | |||
102 | if (nbytes < bsize) | ||
103 | goto done; | ||
104 | } | ||
105 | |||
106 | /* Process multi-block batch */ | ||
107 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
108 | do { | ||
109 | if (enc) | ||
110 | blowfish_enc_blk_4way(ctx, wdst, wsrc); | ||
111 | else | ||
112 | blowfish_dec_blk_4way(ctx, wdst, wsrc); | ||
113 | |||
114 | wsrc += bsize * BF_PARALLEL_BLOCKS; | ||
115 | wdst += bsize * BF_PARALLEL_BLOCKS; | ||
116 | nbytes -= bsize * BF_PARALLEL_BLOCKS; | ||
117 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
118 | |||
119 | if (nbytes < bsize) | ||
120 | goto done; | ||
121 | } | ||
122 | |||
123 | /* Handle leftovers */ | ||
124 | do { | ||
125 | if (enc) | ||
126 | blowfish_enc_blk(ctx, wdst, wsrc); | ||
127 | else | ||
128 | blowfish_dec_blk(ctx, wdst, wsrc); | ||
129 | |||
130 | wsrc += bsize; | ||
131 | wdst += bsize; | ||
132 | nbytes -= bsize; | ||
133 | } while (nbytes >= bsize); | ||
134 | |||
135 | done: | ||
136 | err = blkcipher_walk_done(desc, walk, nbytes); | ||
137 | } | ||
138 | |||
139 | bf_fpu_end(fpu_enabled); | ||
140 | return err; | ||
141 | } | ||
142 | |||
143 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
144 | struct scatterlist *src, unsigned int nbytes) | ||
145 | { | ||
146 | struct blkcipher_walk walk; | ||
147 | |||
148 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
149 | return ecb_crypt(desc, &walk, true); | ||
150 | } | ||
151 | |||
152 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
153 | struct scatterlist *src, unsigned int nbytes) | ||
154 | { | ||
155 | struct blkcipher_walk walk; | ||
156 | |||
157 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
158 | return ecb_crypt(desc, &walk, false); | ||
159 | } | ||
160 | |||
161 | static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, | ||
162 | struct blkcipher_walk *walk) | ||
163 | { | ||
164 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
165 | unsigned int bsize = BF_BLOCK_SIZE; | ||
166 | unsigned int nbytes = walk->nbytes; | ||
167 | u64 *src = (u64 *)walk->src.virt.addr; | ||
168 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
169 | u64 *iv = (u64 *)walk->iv; | ||
170 | |||
171 | do { | ||
172 | *dst = *src ^ *iv; | ||
173 | blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); | ||
174 | iv = dst; | ||
175 | |||
176 | src += 1; | ||
177 | dst += 1; | ||
178 | nbytes -= bsize; | ||
179 | } while (nbytes >= bsize); | ||
180 | |||
181 | *(u64 *)walk->iv = *iv; | ||
182 | return nbytes; | ||
183 | } | ||
184 | |||
185 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
186 | struct scatterlist *src, unsigned int nbytes) | ||
187 | { | ||
188 | struct blkcipher_walk walk; | ||
189 | int err; | ||
190 | |||
191 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
192 | err = blkcipher_walk_virt(desc, &walk); | ||
193 | |||
194 | while ((nbytes = walk.nbytes)) { | ||
195 | nbytes = __cbc_encrypt(desc, &walk); | ||
196 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
197 | } | ||
198 | |||
199 | return err; | ||
200 | } | ||
201 | |||
202 | static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, | ||
203 | struct blkcipher_walk *walk) | ||
204 | { | ||
205 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
206 | const unsigned int bsize = BF_BLOCK_SIZE; | ||
207 | unsigned int nbytes = walk->nbytes; | ||
208 | u64 *src = (u64 *)walk->src.virt.addr; | ||
209 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
210 | u64 last_iv; | ||
211 | int i; | ||
212 | |||
213 | /* Start of the last block. */ | ||
214 | src += nbytes / bsize - 1; | ||
215 | dst += nbytes / bsize - 1; | ||
216 | |||
217 | last_iv = *src; | ||
218 | |||
219 | /* Process multi-block AVX2 batch */ | ||
220 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
221 | do { | ||
222 | nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1); | ||
223 | src -= BF_AVX2_PARALLEL_BLOCKS - 1; | ||
224 | dst -= BF_AVX2_PARALLEL_BLOCKS - 1; | ||
225 | |||
226 | blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src); | ||
227 | |||
228 | nbytes -= bsize; | ||
229 | if (nbytes < bsize) | ||
230 | goto done; | ||
231 | |||
232 | *dst ^= *(src - 1); | ||
233 | src -= 1; | ||
234 | dst -= 1; | ||
235 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
236 | |||
237 | if (nbytes < bsize) | ||
238 | goto done; | ||
239 | } | ||
240 | |||
241 | /* Process multi-block batch */ | ||
242 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
243 | u64 ivs[BF_PARALLEL_BLOCKS - 1]; | ||
244 | |||
245 | do { | ||
246 | nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1); | ||
247 | src -= BF_PARALLEL_BLOCKS - 1; | ||
248 | dst -= BF_PARALLEL_BLOCKS - 1; | ||
249 | |||
250 | for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) | ||
251 | ivs[i] = src[i]; | ||
252 | |||
253 | blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); | ||
254 | |||
255 | for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) | ||
256 | dst[i + 1] ^= ivs[i]; | ||
257 | |||
258 | nbytes -= bsize; | ||
259 | if (nbytes < bsize) | ||
260 | goto done; | ||
261 | |||
262 | *dst ^= *(src - 1); | ||
263 | src -= 1; | ||
264 | dst -= 1; | ||
265 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
266 | |||
267 | if (nbytes < bsize) | ||
268 | goto done; | ||
269 | } | ||
270 | |||
271 | /* Handle leftovers */ | ||
272 | for (;;) { | ||
273 | blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); | ||
274 | |||
275 | nbytes -= bsize; | ||
276 | if (nbytes < bsize) | ||
277 | break; | ||
278 | |||
279 | *dst ^= *(src - 1); | ||
280 | src -= 1; | ||
281 | dst -= 1; | ||
282 | } | ||
283 | |||
284 | done: | ||
285 | *dst ^= *(u64 *)walk->iv; | ||
286 | *(u64 *)walk->iv = last_iv; | ||
287 | |||
288 | return nbytes; | ||
289 | } | ||
290 | |||
291 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
292 | struct scatterlist *src, unsigned int nbytes) | ||
293 | { | ||
294 | bool fpu_enabled = false; | ||
295 | struct blkcipher_walk walk; | ||
296 | int err; | ||
297 | |||
298 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
299 | err = blkcipher_walk_virt(desc, &walk); | ||
300 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
301 | |||
302 | while ((nbytes = walk.nbytes)) { | ||
303 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
304 | nbytes = __cbc_decrypt(desc, &walk); | ||
305 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
306 | } | ||
307 | |||
308 | bf_fpu_end(fpu_enabled); | ||
309 | return err; | ||
310 | } | ||
311 | |||
312 | static void ctr_crypt_final(struct blkcipher_desc *desc, | ||
313 | struct blkcipher_walk *walk) | ||
314 | { | ||
315 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
316 | u8 *ctrblk = walk->iv; | ||
317 | u8 keystream[BF_BLOCK_SIZE]; | ||
318 | u8 *src = walk->src.virt.addr; | ||
319 | u8 *dst = walk->dst.virt.addr; | ||
320 | unsigned int nbytes = walk->nbytes; | ||
321 | |||
322 | blowfish_enc_blk(ctx, keystream, ctrblk); | ||
323 | crypto_xor(keystream, src, nbytes); | ||
324 | memcpy(dst, keystream, nbytes); | ||
325 | |||
326 | crypto_inc(ctrblk, BF_BLOCK_SIZE); | ||
327 | } | ||
328 | |||
329 | static unsigned int __ctr_crypt(struct blkcipher_desc *desc, | ||
330 | struct blkcipher_walk *walk) | ||
331 | { | ||
332 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
333 | unsigned int bsize = BF_BLOCK_SIZE; | ||
334 | unsigned int nbytes = walk->nbytes; | ||
335 | u64 *src = (u64 *)walk->src.virt.addr; | ||
336 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
337 | int i; | ||
338 | |||
339 | /* Process multi-block AVX2 batch */ | ||
340 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
341 | do { | ||
342 | blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src, | ||
343 | (__be64 *)walk->iv); | ||
344 | |||
345 | src += BF_AVX2_PARALLEL_BLOCKS; | ||
346 | dst += BF_AVX2_PARALLEL_BLOCKS; | ||
347 | nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
348 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
349 | |||
350 | if (nbytes < bsize) | ||
351 | goto done; | ||
352 | } | ||
353 | |||
354 | /* Process four block batch */ | ||
355 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
356 | __be64 ctrblocks[BF_PARALLEL_BLOCKS]; | ||
357 | u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); | ||
358 | |||
359 | do { | ||
360 | /* create ctrblks for parallel encrypt */ | ||
361 | for (i = 0; i < BF_PARALLEL_BLOCKS; i++) { | ||
362 | if (dst != src) | ||
363 | dst[i] = src[i]; | ||
364 | |||
365 | ctrblocks[i] = cpu_to_be64(ctrblk++); | ||
366 | } | ||
367 | |||
368 | blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, | ||
369 | (u8 *)ctrblocks); | ||
370 | |||
371 | src += BF_PARALLEL_BLOCKS; | ||
372 | dst += BF_PARALLEL_BLOCKS; | ||
373 | nbytes -= bsize * BF_PARALLEL_BLOCKS; | ||
374 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
375 | |||
376 | *(__be64 *)walk->iv = cpu_to_be64(ctrblk); | ||
377 | |||
378 | if (nbytes < bsize) | ||
379 | goto done; | ||
380 | } | ||
381 | |||
382 | /* Handle leftovers */ | ||
383 | do { | ||
384 | u64 ctrblk; | ||
385 | |||
386 | if (dst != src) | ||
387 | *dst = *src; | ||
388 | |||
389 | ctrblk = *(u64 *)walk->iv; | ||
390 | be64_add_cpu((__be64 *)walk->iv, 1); | ||
391 | |||
392 | blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); | ||
393 | |||
394 | src += 1; | ||
395 | dst += 1; | ||
396 | } while ((nbytes -= bsize) >= bsize); | ||
397 | |||
398 | done: | ||
399 | return nbytes; | ||
400 | } | ||
401 | |||
402 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
403 | struct scatterlist *src, unsigned int nbytes) | ||
404 | { | ||
405 | bool fpu_enabled = false; | ||
406 | struct blkcipher_walk walk; | ||
407 | int err; | ||
408 | |||
409 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
410 | err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); | ||
411 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
412 | |||
413 | while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { | ||
414 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
415 | nbytes = __ctr_crypt(desc, &walk); | ||
416 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
417 | } | ||
418 | |||
419 | bf_fpu_end(fpu_enabled); | ||
420 | |||
421 | if (walk.nbytes) { | ||
422 | ctr_crypt_final(desc, &walk); | ||
423 | err = blkcipher_walk_done(desc, &walk, 0); | ||
424 | } | ||
425 | |||
426 | return err; | ||
427 | } | ||
428 | |||
429 | static struct crypto_alg bf_algs[6] = { { | ||
430 | .cra_name = "__ecb-blowfish-avx2", | ||
431 | .cra_driver_name = "__driver-ecb-blowfish-avx2", | ||
432 | .cra_priority = 0, | ||
433 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
434 | .cra_blocksize = BF_BLOCK_SIZE, | ||
435 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
436 | .cra_alignmask = 0, | ||
437 | .cra_type = &crypto_blkcipher_type, | ||
438 | .cra_module = THIS_MODULE, | ||
439 | .cra_u = { | ||
440 | .blkcipher = { | ||
441 | .min_keysize = BF_MIN_KEY_SIZE, | ||
442 | .max_keysize = BF_MAX_KEY_SIZE, | ||
443 | .setkey = blowfish_setkey, | ||
444 | .encrypt = ecb_encrypt, | ||
445 | .decrypt = ecb_decrypt, | ||
446 | }, | ||
447 | }, | ||
448 | }, { | ||
449 | .cra_name = "__cbc-blowfish-avx2", | ||
450 | .cra_driver_name = "__driver-cbc-blowfish-avx2", | ||
451 | .cra_priority = 0, | ||
452 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
453 | .cra_blocksize = BF_BLOCK_SIZE, | ||
454 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
455 | .cra_alignmask = 0, | ||
456 | .cra_type = &crypto_blkcipher_type, | ||
457 | .cra_module = THIS_MODULE, | ||
458 | .cra_u = { | ||
459 | .blkcipher = { | ||
460 | .min_keysize = BF_MIN_KEY_SIZE, | ||
461 | .max_keysize = BF_MAX_KEY_SIZE, | ||
462 | .setkey = blowfish_setkey, | ||
463 | .encrypt = cbc_encrypt, | ||
464 | .decrypt = cbc_decrypt, | ||
465 | }, | ||
466 | }, | ||
467 | }, { | ||
468 | .cra_name = "__ctr-blowfish-avx2", | ||
469 | .cra_driver_name = "__driver-ctr-blowfish-avx2", | ||
470 | .cra_priority = 0, | ||
471 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
472 | .cra_blocksize = 1, | ||
473 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
474 | .cra_alignmask = 0, | ||
475 | .cra_type = &crypto_blkcipher_type, | ||
476 | .cra_module = THIS_MODULE, | ||
477 | .cra_u = { | ||
478 | .blkcipher = { | ||
479 | .min_keysize = BF_MIN_KEY_SIZE, | ||
480 | .max_keysize = BF_MAX_KEY_SIZE, | ||
481 | .ivsize = BF_BLOCK_SIZE, | ||
482 | .setkey = blowfish_setkey, | ||
483 | .encrypt = ctr_crypt, | ||
484 | .decrypt = ctr_crypt, | ||
485 | }, | ||
486 | }, | ||
487 | }, { | ||
488 | .cra_name = "ecb(blowfish)", | ||
489 | .cra_driver_name = "ecb-blowfish-avx2", | ||
490 | .cra_priority = 400, | ||
491 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
492 | .cra_blocksize = BF_BLOCK_SIZE, | ||
493 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
494 | .cra_alignmask = 0, | ||
495 | .cra_type = &crypto_ablkcipher_type, | ||
496 | .cra_module = THIS_MODULE, | ||
497 | .cra_init = ablk_init, | ||
498 | .cra_exit = ablk_exit, | ||
499 | .cra_u = { | ||
500 | .ablkcipher = { | ||
501 | .min_keysize = BF_MIN_KEY_SIZE, | ||
502 | .max_keysize = BF_MAX_KEY_SIZE, | ||
503 | .setkey = ablk_set_key, | ||
504 | .encrypt = ablk_encrypt, | ||
505 | .decrypt = ablk_decrypt, | ||
506 | }, | ||
507 | }, | ||
508 | }, { | ||
509 | .cra_name = "cbc(blowfish)", | ||
510 | .cra_driver_name = "cbc-blowfish-avx2", | ||
511 | .cra_priority = 400, | ||
512 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
513 | .cra_blocksize = BF_BLOCK_SIZE, | ||
514 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
515 | .cra_alignmask = 0, | ||
516 | .cra_type = &crypto_ablkcipher_type, | ||
517 | .cra_module = THIS_MODULE, | ||
518 | .cra_init = ablk_init, | ||
519 | .cra_exit = ablk_exit, | ||
520 | .cra_u = { | ||
521 | .ablkcipher = { | ||
522 | .min_keysize = BF_MIN_KEY_SIZE, | ||
523 | .max_keysize = BF_MAX_KEY_SIZE, | ||
524 | .ivsize = BF_BLOCK_SIZE, | ||
525 | .setkey = ablk_set_key, | ||
526 | .encrypt = __ablk_encrypt, | ||
527 | .decrypt = ablk_decrypt, | ||
528 | }, | ||
529 | }, | ||
530 | }, { | ||
531 | .cra_name = "ctr(blowfish)", | ||
532 | .cra_driver_name = "ctr-blowfish-avx2", | ||
533 | .cra_priority = 400, | ||
534 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
535 | .cra_blocksize = 1, | ||
536 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
537 | .cra_alignmask = 0, | ||
538 | .cra_type = &crypto_ablkcipher_type, | ||
539 | .cra_module = THIS_MODULE, | ||
540 | .cra_init = ablk_init, | ||
541 | .cra_exit = ablk_exit, | ||
542 | .cra_u = { | ||
543 | .ablkcipher = { | ||
544 | .min_keysize = BF_MIN_KEY_SIZE, | ||
545 | .max_keysize = BF_MAX_KEY_SIZE, | ||
546 | .ivsize = BF_BLOCK_SIZE, | ||
547 | .setkey = ablk_set_key, | ||
548 | .encrypt = ablk_encrypt, | ||
549 | .decrypt = ablk_encrypt, | ||
550 | .geniv = "chainiv", | ||
551 | }, | ||
552 | }, | ||
553 | } }; | ||
554 | |||
555 | |||
556 | static int __init init(void) | ||
557 | { | ||
558 | u64 xcr0; | ||
559 | |||
560 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
561 | pr_info("AVX2 instructions are not detected.\n"); | ||
562 | return -ENODEV; | ||
563 | } | ||
564 | |||
565 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
566 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
567 | pr_info("AVX detected but unusable.\n"); | ||
568 | return -ENODEV; | ||
569 | } | ||
570 | |||
571 | return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); | ||
572 | } | ||
573 | |||
574 | static void __exit fini(void) | ||
575 | { | ||
576 | crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); | ||
577 | } | ||
578 | |||
579 | module_init(init); | ||
580 | module_exit(fini); | ||
581 | |||
582 | MODULE_LICENSE("GPL"); | ||
583 | MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized"); | ||
584 | MODULE_ALIAS("blowfish"); | ||
585 | MODULE_ALIAS("blowfish-asm"); | ||
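The registration above follows the usual pattern for FPU-using x86 cipher glue: the "__ecb/__cbc/__ctr-blowfish-avx2" entries are internal synchronous implementations at priority 0, and the "ecb(blowfish)"/"cbc(blowfish)"/"ctr(blowfish)" entries wrap them asynchronously through cryptd via the ablk_* helpers at priority 400. A hedged usage sketch (not part of this patch): a caller simply asks for the generic name and the highest-priority provider is selected.

```c
#include <linux/crypto.h>
#include <linux/err.h>

/* Hypothetical caller; on AVX2-capable CPUs this resolves to ctr-blowfish-avx2. */
static struct crypto_ablkcipher *get_blowfish_ctr(void)
{
	return crypto_alloc_ablkcipher("ctr(blowfish)", 0, 0);
}
```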
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 50ec333b70e6..3548d76dbaa9 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Glue Code for assembler optimized version of Blowfish | 2 | * Glue Code for assembler optimized version of Blowfish |
3 | * | 3 | * |
4 | * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | 6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: |
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | 7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> |
@@ -32,40 +32,24 @@ | |||
32 | #include <linux/module.h> | 32 | #include <linux/module.h> |
33 | #include <linux/types.h> | 33 | #include <linux/types.h> |
34 | #include <crypto/algapi.h> | 34 | #include <crypto/algapi.h> |
35 | #include <asm/crypto/blowfish.h> | ||
35 | 36 | ||
36 | /* regular block cipher functions */ | 37 | /* regular block cipher functions */ |
37 | asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, | 38 | asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, |
38 | bool xor); | 39 | bool xor); |
40 | EXPORT_SYMBOL_GPL(__blowfish_enc_blk); | ||
41 | |||
39 | asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); | 42 | asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); |
43 | EXPORT_SYMBOL_GPL(blowfish_dec_blk); | ||
40 | 44 | ||
41 | /* 4-way parallel cipher functions */ | 45 | /* 4-way parallel cipher functions */ |
42 | asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | 46 | asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, |
43 | const u8 *src, bool xor); | 47 | const u8 *src, bool xor); |
48 | EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way); | ||
49 | |||
44 | asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, | 50 | asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, |
45 | const u8 *src); | 51 | const u8 *src); |
46 | 52 | EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way); | |
47 | static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) | ||
48 | { | ||
49 | __blowfish_enc_blk(ctx, dst, src, false); | ||
50 | } | ||
51 | |||
52 | static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, | ||
53 | const u8 *src) | ||
54 | { | ||
55 | __blowfish_enc_blk(ctx, dst, src, true); | ||
56 | } | ||
57 | |||
58 | static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
59 | const u8 *src) | ||
60 | { | ||
61 | __blowfish_enc_blk_4way(ctx, dst, src, false); | ||
62 | } | ||
63 | |||
64 | static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, | ||
65 | const u8 *src) | ||
66 | { | ||
67 | __blowfish_enc_blk_4way(ctx, dst, src, true); | ||
68 | } | ||
69 | 53 | ||
70 | static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | 54 | static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) |
71 | { | 55 | { |
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index cfc163469c71..ce71f9212409 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * x86_64/AVX/AES-NI assembler implementation of Camellia | 2 | * x86_64/AVX/AES-NI assembler implementation of Camellia |
3 | * | 3 | * |
4 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
589 | .Lbswap128_mask: | 589 | .Lbswap128_mask: |
590 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 590 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
591 | 591 | ||
592 | /* For XTS mode IV generation */ | ||
593 | .Lxts_gf128mul_and_shl1_mask: | ||
594 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
595 | |||
592 | /* | 596 | /* |
593 | * pre-SubByte transform | 597 | * pre-SubByte transform |
594 | * | 598 | * |
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way) | |||
1090 | 1094 | ||
1091 | ret; | 1095 | ret; |
1092 | ENDPROC(camellia_ctr_16way) | 1096 | ENDPROC(camellia_ctr_16way) |
1097 | |||
1098 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
1099 | vpsrad $31, iv, tmp; \ | ||
1100 | vpaddq iv, iv, iv; \ | ||
1101 | vpshufd $0x13, tmp, tmp; \ | ||
1102 | vpand mask, tmp, tmp; \ | ||
1103 | vpxor tmp, iv, iv; | ||
1104 | |||
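The gf128mul_x_ble macro above advances the XTS tweak by one block position: it multiplies the 128-bit little-endian tweak by α in GF(2¹²⁸). The vpsrad/vpshufd/vpand sequence builds, from the top bits of both qwords, a correction word that covers both the carry from the low qword into the high qword and the reduction constant 0x87 for the bit shifted out of bit 127. A scalar C sketch of the same doubling (for readability only; it mirrors the kernel's generic le128 helper rather than code from this patch):

#include <stdint.h>

/* Double a 128-bit XTS tweak (little-endian limb order) in GF(2^128),
 * reducing with x^128 + x^7 + x^2 + x + 1, i.e. the constant 0x87. */
static void gf128mul_x_ble_ref(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;              /* bit shifted out of the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);        /* 128-bit shift left by one */
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);  /* fold the carry back in */
}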
1105 | .align 8 | ||
1106 | camellia_xts_crypt_16way: | ||
1107 | /* input: | ||
1108 | * %rdi: ctx, CTX | ||
1109 | * %rsi: dst (16 blocks) | ||
1110 | * %rdx: src (16 blocks) | ||
1111 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1112 | * %r8: index for input whitening key | ||
1113 | * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 | ||
1114 | */ | ||
1115 | |||
1116 | subq $(16 * 16), %rsp; | ||
1117 | movq %rsp, %rax; | ||
1118 | |||
1119 | vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14; | ||
1120 | |||
1121 | /* load IV */ | ||
1122 | vmovdqu (%rcx), %xmm0; | ||
1123 | vpxor 0 * 16(%rdx), %xmm0, %xmm15; | ||
1124 | vmovdqu %xmm15, 15 * 16(%rax); | ||
1125 | vmovdqu %xmm0, 0 * 16(%rsi); | ||
1126 | |||
1127 | /* construct IVs */ | ||
1128 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1129 | vpxor 1 * 16(%rdx), %xmm0, %xmm15; | ||
1130 | vmovdqu %xmm15, 14 * 16(%rax); | ||
1131 | vmovdqu %xmm0, 1 * 16(%rsi); | ||
1132 | |||
1133 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1134 | vpxor 2 * 16(%rdx), %xmm0, %xmm13; | ||
1135 | vmovdqu %xmm0, 2 * 16(%rsi); | ||
1136 | |||
1137 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1138 | vpxor 3 * 16(%rdx), %xmm0, %xmm12; | ||
1139 | vmovdqu %xmm0, 3 * 16(%rsi); | ||
1140 | |||
1141 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1142 | vpxor 4 * 16(%rdx), %xmm0, %xmm11; | ||
1143 | vmovdqu %xmm0, 4 * 16(%rsi); | ||
1144 | |||
1145 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1146 | vpxor 5 * 16(%rdx), %xmm0, %xmm10; | ||
1147 | vmovdqu %xmm0, 5 * 16(%rsi); | ||
1148 | |||
1149 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1150 | vpxor 6 * 16(%rdx), %xmm0, %xmm9; | ||
1151 | vmovdqu %xmm0, 6 * 16(%rsi); | ||
1152 | |||
1153 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1154 | vpxor 7 * 16(%rdx), %xmm0, %xmm8; | ||
1155 | vmovdqu %xmm0, 7 * 16(%rsi); | ||
1156 | |||
1157 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1158 | vpxor 8 * 16(%rdx), %xmm0, %xmm7; | ||
1159 | vmovdqu %xmm0, 8 * 16(%rsi); | ||
1160 | |||
1161 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1162 | vpxor 9 * 16(%rdx), %xmm0, %xmm6; | ||
1163 | vmovdqu %xmm0, 9 * 16(%rsi); | ||
1164 | |||
1165 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1166 | vpxor 10 * 16(%rdx), %xmm0, %xmm5; | ||
1167 | vmovdqu %xmm0, 10 * 16(%rsi); | ||
1168 | |||
1169 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1170 | vpxor 11 * 16(%rdx), %xmm0, %xmm4; | ||
1171 | vmovdqu %xmm0, 11 * 16(%rsi); | ||
1172 | |||
1173 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1174 | vpxor 12 * 16(%rdx), %xmm0, %xmm3; | ||
1175 | vmovdqu %xmm0, 12 * 16(%rsi); | ||
1176 | |||
1177 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1178 | vpxor 13 * 16(%rdx), %xmm0, %xmm2; | ||
1179 | vmovdqu %xmm0, 13 * 16(%rsi); | ||
1180 | |||
1181 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1182 | vpxor 14 * 16(%rdx), %xmm0, %xmm1; | ||
1183 | vmovdqu %xmm0, 14 * 16(%rsi); | ||
1184 | |||
1185 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1186 | vpxor 15 * 16(%rdx), %xmm0, %xmm15; | ||
1187 | vmovdqu %xmm15, 0 * 16(%rax); | ||
1188 | vmovdqu %xmm0, 15 * 16(%rsi); | ||
1189 | |||
1190 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1191 | vmovdqu %xmm0, (%rcx); | ||
1192 | |||
1193 | /* inpack16_pre: */ | ||
1194 | vmovq (key_table)(CTX, %r8, 8), %xmm15; | ||
1195 | vpshufb .Lpack_bswap, %xmm15, %xmm15; | ||
1196 | vpxor 0 * 16(%rax), %xmm15, %xmm0; | ||
1197 | vpxor %xmm1, %xmm15, %xmm1; | ||
1198 | vpxor %xmm2, %xmm15, %xmm2; | ||
1199 | vpxor %xmm3, %xmm15, %xmm3; | ||
1200 | vpxor %xmm4, %xmm15, %xmm4; | ||
1201 | vpxor %xmm5, %xmm15, %xmm5; | ||
1202 | vpxor %xmm6, %xmm15, %xmm6; | ||
1203 | vpxor %xmm7, %xmm15, %xmm7; | ||
1204 | vpxor %xmm8, %xmm15, %xmm8; | ||
1205 | vpxor %xmm9, %xmm15, %xmm9; | ||
1206 | vpxor %xmm10, %xmm15, %xmm10; | ||
1207 | vpxor %xmm11, %xmm15, %xmm11; | ||
1208 | vpxor %xmm12, %xmm15, %xmm12; | ||
1209 | vpxor %xmm13, %xmm15, %xmm13; | ||
1210 | vpxor 14 * 16(%rax), %xmm15, %xmm14; | ||
1211 | vpxor 15 * 16(%rax), %xmm15, %xmm15; | ||
1212 | |||
1213 | call *%r9; | ||
1214 | |||
1215 | addq $(16 * 16), %rsp; | ||
1216 | |||
1217 | vpxor 0 * 16(%rsi), %xmm7, %xmm7; | ||
1218 | vpxor 1 * 16(%rsi), %xmm6, %xmm6; | ||
1219 | vpxor 2 * 16(%rsi), %xmm5, %xmm5; | ||
1220 | vpxor 3 * 16(%rsi), %xmm4, %xmm4; | ||
1221 | vpxor 4 * 16(%rsi), %xmm3, %xmm3; | ||
1222 | vpxor 5 * 16(%rsi), %xmm2, %xmm2; | ||
1223 | vpxor 6 * 16(%rsi), %xmm1, %xmm1; | ||
1224 | vpxor 7 * 16(%rsi), %xmm0, %xmm0; | ||
1225 | vpxor 8 * 16(%rsi), %xmm15, %xmm15; | ||
1226 | vpxor 9 * 16(%rsi), %xmm14, %xmm14; | ||
1227 | vpxor 10 * 16(%rsi), %xmm13, %xmm13; | ||
1228 | vpxor 11 * 16(%rsi), %xmm12, %xmm12; | ||
1229 | vpxor 12 * 16(%rsi), %xmm11, %xmm11; | ||
1230 | vpxor 13 * 16(%rsi), %xmm10, %xmm10; | ||
1231 | vpxor 14 * 16(%rsi), %xmm9, %xmm9; | ||
1232 | vpxor 15 * 16(%rsi), %xmm8, %xmm8; | ||
1233 | write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, | ||
1234 | %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, | ||
1235 | %xmm8, %rsi); | ||
1236 | |||
1237 | ret; | ||
1238 | ENDPROC(camellia_xts_crypt_16way) | ||
1239 | |||
1240 | ENTRY(camellia_xts_enc_16way) | ||
1241 | /* input: | ||
1242 | * %rdi: ctx, CTX | ||
1243 | * %rsi: dst (16 blocks) | ||
1244 | * %rdx: src (16 blocks) | ||
1245 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1246 | */ | ||
1247 | xorl %r8d, %r8d; /* input whitening key, 0 for enc */ | ||
1248 | |||
1249 | leaq __camellia_enc_blk16, %r9; | ||
1250 | |||
1251 | jmp camellia_xts_crypt_16way; | ||
1252 | ENDPROC(camellia_xts_enc_16way) | ||
1253 | |||
1254 | ENTRY(camellia_xts_dec_16way) | ||
1255 | /* input: | ||
1256 | * %rdi: ctx, CTX | ||
1257 | * %rsi: dst (16 blocks) | ||
1258 | * %rdx: src (16 blocks) | ||
1259 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1260 | */ | ||
1261 | |||
1262 | cmpl $16, key_length(CTX); | ||
1263 | movl $32, %r8d; | ||
1264 | movl $24, %eax; | ||
1265 | cmovel %eax, %r8d; /* input whitening key, last for dec */ | ||
1266 | |||
1267 | leaq __camellia_dec_blk16, %r9; | ||
1268 | |||
1269 | jmp camellia_xts_crypt_16way; | ||
1270 | ENDPROC(camellia_xts_dec_16way) | ||
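Both entry points funnel into camellia_xts_crypt_16way, which is plain XTS: every block is masked with its tweak, run through the raw 16-way Camellia primitive selected via %r9, and masked with the same tweak again, while the tweak itself advances by one GF(2¹²⁸) doubling per block and the final value is written back through %rcx for the next call. A per-block C reference of that data flow (a sketch; encrypt_block stands in for a hypothetical single-block cipher call, not a helper that exists in this file):

#include <stdint.h>
#include <stddef.h>

typedef void (*block_fn)(uint8_t out[16], const uint8_t in[16]);

/* One XTS step: C = E_K1(P ^ T) ^ T; the caller then doubles T in GF(2^128). */
static void xts_one_block(uint8_t out[16], const uint8_t in[16],
			  const uint8_t tweak[16], block_fn encrypt_block)
{
	uint8_t buf[16];
	size_t i;

	for (i = 0; i < 16; i++)            /* pre-whiten with the tweak */
		buf[i] = in[i] ^ tweak[i];

	encrypt_block(buf, buf);            /* raw single-block cipher */

	for (i = 0; i < 16; i++)            /* post-whiten with the same tweak */
		out[i] = buf[i] ^ tweak[i];
}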
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S new file mode 100644 index 000000000000..91a1878fcc3e --- /dev/null +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S | |||
@@ -0,0 +1,1368 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2/AES-NI assembler implementation of Camellia | ||
3 | * | ||
4 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | |||
15 | #define CAMELLIA_TABLE_BYTE_LEN 272 | ||
16 | |||
17 | /* struct camellia_ctx: */ | ||
18 | #define key_table 0 | ||
19 | #define key_length CAMELLIA_TABLE_BYTE_LEN | ||
20 | |||
21 | /* register macros */ | ||
22 | #define CTX %rdi | ||
23 | #define RIO %r8 | ||
24 | |||
25 | /********************************************************************** | ||
26 | helper macros | ||
27 | **********************************************************************/ | ||
28 | #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ | ||
29 | vpand x, mask4bit, tmp0; \ | ||
30 | vpandn x, mask4bit, x; \ | ||
31 | vpsrld $4, x, x; \ | ||
32 | \ | ||
33 | vpshufb tmp0, lo_t, tmp0; \ | ||
34 | vpshufb x, hi_t, x; \ | ||
35 | vpxor tmp0, x, x; | ||
36 | |||
37 | #define ymm0_x xmm0 | ||
38 | #define ymm1_x xmm1 | ||
39 | #define ymm2_x xmm2 | ||
40 | #define ymm3_x xmm3 | ||
41 | #define ymm4_x xmm4 | ||
42 | #define ymm5_x xmm5 | ||
43 | #define ymm6_x xmm6 | ||
44 | #define ymm7_x xmm7 | ||
45 | #define ymm8_x xmm8 | ||
46 | #define ymm9_x xmm9 | ||
47 | #define ymm10_x xmm10 | ||
48 | #define ymm11_x xmm11 | ||
49 | #define ymm12_x xmm12 | ||
50 | #define ymm13_x xmm13 | ||
51 | #define ymm14_x xmm14 | ||
52 | #define ymm15_x xmm15 | ||
53 | |||
54 | /* | ||
55 | * AES-NI instructions do not support ymmX registers, so we need splitting and | ||
56 | * merging. | ||
57 | */ | ||
58 | #define vaesenclast256(zero, yreg, tmp) \ | ||
59 | vextracti128 $1, yreg, tmp##_x; \ | ||
60 | vaesenclast zero##_x, yreg##_x, yreg##_x; \ | ||
61 | vaesenclast zero##_x, tmp##_x, tmp##_x; \ | ||
62 | vinserti128 $1, tmp##_x, yreg, yreg; | ||
63 | |||
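The vaesenclast256 macro works around AES-NI only accepting 128-bit xmm operands: the 256-bit state is split into its two lanes, AESENCLAST is run on each with an all-zero round key (which leaves only ShiftRows and SubBytes), and the lanes are merged back. The same idea written with compiler intrinsics, as a readability aid (a sketch, not code from this patch):

#include <immintrin.h>

/* Apply AESENCLAST with a zero round key to both 128-bit lanes of a ymm value. */
static __m256i aesenclast256(__m256i y)
{
	const __m128i zero = _mm_setzero_si128();
	__m128i lo = _mm256_castsi256_si128(y);        /* lower lane */
	__m128i hi = _mm256_extracti128_si256(y, 1);   /* upper lane */

	lo = _mm_aesenclast_si128(lo, zero);
	hi = _mm_aesenclast_si128(hi, zero);

	return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}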
64 | /********************************************************************** | ||
65 | 32-way camellia | ||
66 | **********************************************************************/ | ||
67 | |||
68 | /* | ||
69 | * IN: | ||
70 | * x0..x7: byte-sliced AB state | ||
71 | * mem_cd: register pointer storing CD state | ||
72 | * key: index for key material | ||
73 | * OUT: | ||
74 | * x0..x7: new byte-sliced CD state | ||
75 | */ | ||
76 | #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ | ||
77 | t7, mem_cd, key) \ | ||
78 | /* \ | ||
79 | * S-function with AES subbytes \ | ||
80 | */ \ | ||
81 | vbroadcasti128 .Linv_shift_row, t4; \ | ||
82 | vpbroadcastb .L0f0f0f0f, t7; \ | ||
83 | vbroadcasti128 .Lpre_tf_lo_s1, t0; \ | ||
84 | vbroadcasti128 .Lpre_tf_hi_s1, t1; \ | ||
85 | \ | ||
86 | /* AES inverse shift rows */ \ | ||
87 | vpshufb t4, x0, x0; \ | ||
88 | vpshufb t4, x7, x7; \ | ||
89 | vpshufb t4, x1, x1; \ | ||
90 | vpshufb t4, x4, x4; \ | ||
91 | vpshufb t4, x2, x2; \ | ||
92 | vpshufb t4, x5, x5; \ | ||
93 | vpshufb t4, x3, x3; \ | ||
94 | vpshufb t4, x6, x6; \ | ||
95 | \ | ||
96 | /* prefilter sboxes 1, 2 and 3 */ \ | ||
97 | vbroadcasti128 .Lpre_tf_lo_s4, t2; \ | ||
98 | vbroadcasti128 .Lpre_tf_hi_s4, t3; \ | ||
99 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
100 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
101 | filter_8bit(x1, t0, t1, t7, t6); \ | ||
102 | filter_8bit(x4, t0, t1, t7, t6); \ | ||
103 | filter_8bit(x2, t0, t1, t7, t6); \ | ||
104 | filter_8bit(x5, t0, t1, t7, t6); \ | ||
105 | \ | ||
106 | /* prefilter sbox 4 */ \ | ||
107 | vpxor t4##_x, t4##_x, t4##_x; \ | ||
108 | filter_8bit(x3, t2, t3, t7, t6); \ | ||
109 | filter_8bit(x6, t2, t3, t7, t6); \ | ||
110 | \ | ||
111 | /* AES subbytes + AES shift rows */ \ | ||
112 | vbroadcasti128 .Lpost_tf_lo_s1, t0; \ | ||
113 | vbroadcasti128 .Lpost_tf_hi_s1, t1; \ | ||
114 | vaesenclast256(t4, x0, t5); \ | ||
115 | vaesenclast256(t4, x7, t5); \ | ||
116 | vaesenclast256(t4, x1, t5); \ | ||
117 | vaesenclast256(t4, x4, t5); \ | ||
118 | vaesenclast256(t4, x2, t5); \ | ||
119 | vaesenclast256(t4, x5, t5); \ | ||
120 | vaesenclast256(t4, x3, t5); \ | ||
121 | vaesenclast256(t4, x6, t5); \ | ||
122 | \ | ||
123 | /* postfilter sboxes 1 and 4 */ \ | ||
124 | vbroadcasti128 .Lpost_tf_lo_s3, t2; \ | ||
125 | vbroadcasti128 .Lpost_tf_hi_s3, t3; \ | ||
126 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
127 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
128 | filter_8bit(x3, t0, t1, t7, t6); \ | ||
129 | filter_8bit(x6, t0, t1, t7, t6); \ | ||
130 | \ | ||
131 | /* postfilter sbox 3 */ \ | ||
132 | vbroadcasti128 .Lpost_tf_lo_s2, t4; \ | ||
133 | vbroadcasti128 .Lpost_tf_hi_s2, t5; \ | ||
134 | filter_8bit(x2, t2, t3, t7, t6); \ | ||
135 | filter_8bit(x5, t2, t3, t7, t6); \ | ||
136 | \ | ||
137 | vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ | ||
138 | \ | ||
139 | /* postfilter sbox 2 */ \ | ||
140 | filter_8bit(x1, t4, t5, t7, t2); \ | ||
141 | filter_8bit(x4, t4, t5, t7, t2); \ | ||
142 | \ | ||
143 | vpsrldq $1, t0, t1; \ | ||
144 | vpsrldq $2, t0, t2; \ | ||
145 | vpsrldq $3, t0, t3; \ | ||
146 | vpsrldq $4, t0, t4; \ | ||
147 | vpsrldq $5, t0, t5; \ | ||
148 | vpsrldq $6, t0, t6; \ | ||
149 | vpsrldq $7, t0, t7; \ | ||
150 | vpbroadcastb t0##_x, t0; \ | ||
151 | vpbroadcastb t1##_x, t1; \ | ||
152 | vpbroadcastb t2##_x, t2; \ | ||
153 | vpbroadcastb t3##_x, t3; \ | ||
154 | vpbroadcastb t4##_x, t4; \ | ||
155 | vpbroadcastb t6##_x, t6; \ | ||
156 | vpbroadcastb t5##_x, t5; \ | ||
157 | vpbroadcastb t7##_x, t7; \ | ||
158 | \ | ||
159 | /* P-function */ \ | ||
160 | vpxor x5, x0, x0; \ | ||
161 | vpxor x6, x1, x1; \ | ||
162 | vpxor x7, x2, x2; \ | ||
163 | vpxor x4, x3, x3; \ | ||
164 | \ | ||
165 | vpxor x2, x4, x4; \ | ||
166 | vpxor x3, x5, x5; \ | ||
167 | vpxor x0, x6, x6; \ | ||
168 | vpxor x1, x7, x7; \ | ||
169 | \ | ||
170 | vpxor x7, x0, x0; \ | ||
171 | vpxor x4, x1, x1; \ | ||
172 | vpxor x5, x2, x2; \ | ||
173 | vpxor x6, x3, x3; \ | ||
174 | \ | ||
175 | vpxor x3, x4, x4; \ | ||
176 | vpxor x0, x5, x5; \ | ||
177 | vpxor x1, x6, x6; \ | ||
178 | vpxor x2, x7, x7; /* note: high and low parts swapped */ \ | ||
179 | \ | ||
180 | /* Add key material and result to CD (x becomes new CD) */ \ | ||
181 | \ | ||
182 | vpxor t7, x0, x0; \ | ||
183 | vpxor 4 * 32(mem_cd), x0, x0; \ | ||
184 | \ | ||
185 | vpxor t6, x1, x1; \ | ||
186 | vpxor 5 * 32(mem_cd), x1, x1; \ | ||
187 | \ | ||
188 | vpxor t5, x2, x2; \ | ||
189 | vpxor 6 * 32(mem_cd), x2, x2; \ | ||
190 | \ | ||
191 | vpxor t4, x3, x3; \ | ||
192 | vpxor 7 * 32(mem_cd), x3, x3; \ | ||
193 | \ | ||
194 | vpxor t3, x4, x4; \ | ||
195 | vpxor 0 * 32(mem_cd), x4, x4; \ | ||
196 | \ | ||
197 | vpxor t2, x5, x5; \ | ||
198 | vpxor 1 * 32(mem_cd), x5, x5; \ | ||
199 | \ | ||
200 | vpxor t1, x6, x6; \ | ||
201 | vpxor 2 * 32(mem_cd), x6, x6; \ | ||
202 | \ | ||
203 | vpxor t0, x7, x7; \ | ||
204 | vpxor 3 * 32(mem_cd), x7, x7; | ||
205 | |||
206 | /* | ||
207 | * Size optimization... with inlined roundsm32 the binary would be over 5 times | ||
208 | * larger and only marginally faster. | ||
209 | */ | ||
210 | .align 8 | ||
211 | roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: | ||
212 | roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
213 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, | ||
214 | %rcx, (%r9)); | ||
215 | ret; | ||
216 | ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) | ||
217 | |||
218 | .align 8 | ||
219 | roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: | ||
220 | roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, | ||
221 | %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, | ||
222 | %rax, (%r9)); | ||
223 | ret; | ||
224 | ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | ||
225 | |||
226 | /* | ||
227 | * IN/OUT: | ||
228 | * x0..x7: byte-sliced AB state preloaded | ||
229 | * mem_ab: byte-sliced AB state in memory | ||
230 | * mem_cb: byte-sliced CD state in memory | ||
231 | */ | ||
232 | #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
233 | y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ | ||
234 | leaq (key_table + (i) * 8)(CTX), %r9; \ | ||
235 | call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ | ||
236 | \ | ||
237 | vmovdqu x0, 4 * 32(mem_cd); \ | ||
238 | vmovdqu x1, 5 * 32(mem_cd); \ | ||
239 | vmovdqu x2, 6 * 32(mem_cd); \ | ||
240 | vmovdqu x3, 7 * 32(mem_cd); \ | ||
241 | vmovdqu x4, 0 * 32(mem_cd); \ | ||
242 | vmovdqu x5, 1 * 32(mem_cd); \ | ||
243 | vmovdqu x6, 2 * 32(mem_cd); \ | ||
244 | vmovdqu x7, 3 * 32(mem_cd); \ | ||
245 | \ | ||
246 | leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ | ||
247 | call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ | ||
248 | \ | ||
249 | store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); | ||
250 | |||
251 | #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ | ||
252 | |||
253 | #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ | ||
254 | /* Store new AB state */ \ | ||
255 | vmovdqu x4, 4 * 32(mem_ab); \ | ||
256 | vmovdqu x5, 5 * 32(mem_ab); \ | ||
257 | vmovdqu x6, 6 * 32(mem_ab); \ | ||
258 | vmovdqu x7, 7 * 32(mem_ab); \ | ||
259 | vmovdqu x0, 0 * 32(mem_ab); \ | ||
260 | vmovdqu x1, 1 * 32(mem_ab); \ | ||
261 | vmovdqu x2, 2 * 32(mem_ab); \ | ||
262 | vmovdqu x3, 3 * 32(mem_ab); | ||
263 | |||
264 | #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
265 | y6, y7, mem_ab, mem_cd, i) \ | ||
266 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
267 | y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ | ||
268 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
269 | y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ | ||
270 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
271 | y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); | ||
272 | |||
273 | #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
274 | y6, y7, mem_ab, mem_cd, i) \ | ||
275 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
276 | y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ | ||
277 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
278 | y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ | ||
279 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
280 | y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); | ||
281 | |||
282 | /* | ||
283 | * IN: | ||
284 | * v0..3: byte-sliced 32-bit integers | ||
285 | * OUT: | ||
286 | * v0..3: (IN <<< 1) | ||
287 | */ | ||
288 | #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ | ||
289 | vpcmpgtb v0, zero, t0; \ | ||
290 | vpaddb v0, v0, v0; \ | ||
291 | vpabsb t0, t0; \ | ||
292 | \ | ||
293 | vpcmpgtb v1, zero, t1; \ | ||
294 | vpaddb v1, v1, v1; \ | ||
295 | vpabsb t1, t1; \ | ||
296 | \ | ||
297 | vpcmpgtb v2, zero, t2; \ | ||
298 | vpaddb v2, v2, v2; \ | ||
299 | vpabsb t2, t2; \ | ||
300 | \ | ||
301 | vpor t0, v1, v1; \ | ||
302 | \ | ||
303 | vpcmpgtb v3, zero, t0; \ | ||
304 | vpaddb v3, v3, v3; \ | ||
305 | vpabsb t0, t0; \ | ||
306 | \ | ||
307 | vpor t1, v2, v2; \ | ||
308 | vpor t2, v3, v3; \ | ||
309 | vpor t0, v0, v0; | ||
310 | |||
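rol32_1_32 performs the rotate on byte-sliced data: vpaddb doubles every byte slice, the bit that falls out of each byte is recovered with the vpcmpgtb-against-zero/vpabsb trick, and that carry is OR'd into the neighbouring slice, with the last slice's carry wrapping around to the first. Taken across the four slices of each 32-bit word this is the ordinary rotate-left-by-one (reference sketch):

#include <stdint.h>

/* Rotate a 32-bit word left by one bit. */
static inline uint32_t rol32_ref(uint32_t x)
{
	return (x << 1) | (x >> 31);
}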
311 | /* | ||
312 | * IN: | ||
313 | * r: byte-sliced AB state in memory | ||
314 | * l: byte-sliced CD state in memory | ||
315 | * OUT: | ||
316 | * x0..x7: new byte-sliced CD state | ||
317 | */ | ||
318 | #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ | ||
319 | tt1, tt2, tt3, kll, klr, krl, krr) \ | ||
320 | /* \ | ||
321 | * t0 = kll; \ | ||
322 | * t0 &= ll; \ | ||
323 | * lr ^= rol32(t0, 1); \ | ||
324 | */ \ | ||
325 | vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ | ||
326 | vpxor tt0, tt0, tt0; \ | ||
327 | vpbroadcastb t0##_x, t3; \ | ||
328 | vpsrldq $1, t0, t0; \ | ||
329 | vpbroadcastb t0##_x, t2; \ | ||
330 | vpsrldq $1, t0, t0; \ | ||
331 | vpbroadcastb t0##_x, t1; \ | ||
332 | vpsrldq $1, t0, t0; \ | ||
333 | vpbroadcastb t0##_x, t0; \ | ||
334 | \ | ||
335 | vpand l0, t0, t0; \ | ||
336 | vpand l1, t1, t1; \ | ||
337 | vpand l2, t2, t2; \ | ||
338 | vpand l3, t3, t3; \ | ||
339 | \ | ||
340 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | ||
341 | \ | ||
342 | vpxor l4, t0, l4; \ | ||
343 | vmovdqu l4, 4 * 32(l); \ | ||
344 | vpxor l5, t1, l5; \ | ||
345 | vmovdqu l5, 5 * 32(l); \ | ||
346 | vpxor l6, t2, l6; \ | ||
347 | vmovdqu l6, 6 * 32(l); \ | ||
348 | vpxor l7, t3, l7; \ | ||
349 | vmovdqu l7, 7 * 32(l); \ | ||
350 | \ | ||
351 | /* \ | ||
352 | * t2 = krr; \ | ||
353 | * t2 |= rr; \ | ||
354 | * rl ^= t2; \ | ||
355 | */ \ | ||
356 | \ | ||
357 | vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ | ||
358 | vpbroadcastb t0##_x, t3; \ | ||
359 | vpsrldq $1, t0, t0; \ | ||
360 | vpbroadcastb t0##_x, t2; \ | ||
361 | vpsrldq $1, t0, t0; \ | ||
362 | vpbroadcastb t0##_x, t1; \ | ||
363 | vpsrldq $1, t0, t0; \ | ||
364 | vpbroadcastb t0##_x, t0; \ | ||
365 | \ | ||
366 | vpor 4 * 32(r), t0, t0; \ | ||
367 | vpor 5 * 32(r), t1, t1; \ | ||
368 | vpor 6 * 32(r), t2, t2; \ | ||
369 | vpor 7 * 32(r), t3, t3; \ | ||
370 | \ | ||
371 | vpxor 0 * 32(r), t0, t0; \ | ||
372 | vpxor 1 * 32(r), t1, t1; \ | ||
373 | vpxor 2 * 32(r), t2, t2; \ | ||
374 | vpxor 3 * 32(r), t3, t3; \ | ||
375 | vmovdqu t0, 0 * 32(r); \ | ||
376 | vmovdqu t1, 1 * 32(r); \ | ||
377 | vmovdqu t2, 2 * 32(r); \ | ||
378 | vmovdqu t3, 3 * 32(r); \ | ||
379 | \ | ||
380 | /* \ | ||
381 | * t2 = krl; \ | ||
382 | * t2 &= rl; \ | ||
383 | * rr ^= rol32(t2, 1); \ | ||
384 | */ \ | ||
385 | vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ | ||
386 | vpbroadcastb t0##_x, t3; \ | ||
387 | vpsrldq $1, t0, t0; \ | ||
388 | vpbroadcastb t0##_x, t2; \ | ||
389 | vpsrldq $1, t0, t0; \ | ||
390 | vpbroadcastb t0##_x, t1; \ | ||
391 | vpsrldq $1, t0, t0; \ | ||
392 | vpbroadcastb t0##_x, t0; \ | ||
393 | \ | ||
394 | vpand 0 * 32(r), t0, t0; \ | ||
395 | vpand 1 * 32(r), t1, t1; \ | ||
396 | vpand 2 * 32(r), t2, t2; \ | ||
397 | vpand 3 * 32(r), t3, t3; \ | ||
398 | \ | ||
399 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | ||
400 | \ | ||
401 | vpxor 4 * 32(r), t0, t0; \ | ||
402 | vpxor 5 * 32(r), t1, t1; \ | ||
403 | vpxor 6 * 32(r), t2, t2; \ | ||
404 | vpxor 7 * 32(r), t3, t3; \ | ||
405 | vmovdqu t0, 4 * 32(r); \ | ||
406 | vmovdqu t1, 5 * 32(r); \ | ||
407 | vmovdqu t2, 6 * 32(r); \ | ||
408 | vmovdqu t3, 7 * 32(r); \ | ||
409 | \ | ||
410 | /* \ | ||
411 | * t0 = klr; \ | ||
412 | * t0 |= lr; \ | ||
413 | * ll ^= t0; \ | ||
414 | */ \ | ||
415 | \ | ||
416 | vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ | ||
417 | vpbroadcastb t0##_x, t3; \ | ||
418 | vpsrldq $1, t0, t0; \ | ||
419 | vpbroadcastb t0##_x, t2; \ | ||
420 | vpsrldq $1, t0, t0; \ | ||
421 | vpbroadcastb t0##_x, t1; \ | ||
422 | vpsrldq $1, t0, t0; \ | ||
423 | vpbroadcastb t0##_x, t0; \ | ||
424 | \ | ||
425 | vpor l4, t0, t0; \ | ||
426 | vpor l5, t1, t1; \ | ||
427 | vpor l6, t2, t2; \ | ||
428 | vpor l7, t3, t3; \ | ||
429 | \ | ||
430 | vpxor l0, t0, l0; \ | ||
431 | vmovdqu l0, 0 * 32(l); \ | ||
432 | vpxor l1, t1, l1; \ | ||
433 | vmovdqu l1, 1 * 32(l); \ | ||
434 | vpxor l2, t2, l2; \ | ||
435 | vmovdqu l2, 2 * 32(l); \ | ||
436 | vpxor l3, t3, l3; \ | ||
437 | vmovdqu l3, 3 * 32(l); | ||
438 | |||
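fls32 applies Camellia's FL function to one half of the state and FL⁻¹ to the other, exactly as the interleaved pseudocode comments spell out (lr ^= rol32(ll & kll, 1) then ll ^= (lr | klr); rl ^= (rr | krr) then rr ^= rol32(rl & krl, 1)), only with every 32-bit quantity byte-sliced across four ymm registers and the subkey bytes broadcast with vpbroadcastb. A compact non-sliced reference of the two functions (a sketch following the standard Camellia definition):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Camellia FL on a 64-bit half (xl:xr) with 64-bit subkey (kl:kr). */
static uint64_t camellia_fl(uint64_t x, uint32_t kl, uint32_t kr)
{
	uint32_t xl = (uint32_t)(x >> 32), xr = (uint32_t)x;

	xr ^= rol32(xl & kl, 1);
	xl ^= (xr | kr);
	return ((uint64_t)xl << 32) | xr;
}

/* Camellia FL^-1, applied to the other half of the state. */
static uint64_t camellia_flinv(uint64_t y, uint32_t kl, uint32_t kr)
{
	uint32_t yl = (uint32_t)(y >> 32), yr = (uint32_t)y;

	yl ^= (yr | kr);
	yr ^= rol32(yl & kl, 1);
	return ((uint64_t)yl << 32) | yr;
}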
439 | #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ | ||
440 | vpunpckhdq x1, x0, t2; \ | ||
441 | vpunpckldq x1, x0, x0; \ | ||
442 | \ | ||
443 | vpunpckldq x3, x2, t1; \ | ||
444 | vpunpckhdq x3, x2, x2; \ | ||
445 | \ | ||
446 | vpunpckhqdq t1, x0, x1; \ | ||
447 | vpunpcklqdq t1, x0, x0; \ | ||
448 | \ | ||
449 | vpunpckhqdq x2, t2, x3; \ | ||
450 | vpunpcklqdq x2, t2, x2; | ||
451 | |||
452 | #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ | ||
453 | a3, b3, c3, d3, st0, st1) \ | ||
454 | vmovdqu d2, st0; \ | ||
455 | vmovdqu d3, st1; \ | ||
456 | transpose_4x4(a0, a1, a2, a3, d2, d3); \ | ||
457 | transpose_4x4(b0, b1, b2, b3, d2, d3); \ | ||
458 | vmovdqu st0, d2; \ | ||
459 | vmovdqu st1, d3; \ | ||
460 | \ | ||
461 | vmovdqu a0, st0; \ | ||
462 | vmovdqu a1, st1; \ | ||
463 | transpose_4x4(c0, c1, c2, c3, a0, a1); \ | ||
464 | transpose_4x4(d0, d1, d2, d3, a0, a1); \ | ||
465 | \ | ||
466 | vbroadcasti128 .Lshufb_16x16b, a0; \ | ||
467 | vmovdqu st1, a1; \ | ||
468 | vpshufb a0, a2, a2; \ | ||
469 | vpshufb a0, a3, a3; \ | ||
470 | vpshufb a0, b0, b0; \ | ||
471 | vpshufb a0, b1, b1; \ | ||
472 | vpshufb a0, b2, b2; \ | ||
473 | vpshufb a0, b3, b3; \ | ||
474 | vpshufb a0, a1, a1; \ | ||
475 | vpshufb a0, c0, c0; \ | ||
476 | vpshufb a0, c1, c1; \ | ||
477 | vpshufb a0, c2, c2; \ | ||
478 | vpshufb a0, c3, c3; \ | ||
479 | vpshufb a0, d0, d0; \ | ||
480 | vpshufb a0, d1, d1; \ | ||
481 | vpshufb a0, d2, d2; \ | ||
482 | vpshufb a0, d3, d3; \ | ||
483 | vmovdqu d3, st1; \ | ||
484 | vmovdqu st0, d3; \ | ||
485 | vpshufb a0, d3, a0; \ | ||
486 | vmovdqu d2, st0; \ | ||
487 | \ | ||
488 | transpose_4x4(a0, b0, c0, d0, d2, d3); \ | ||
489 | transpose_4x4(a1, b1, c1, d1, d2, d3); \ | ||
490 | vmovdqu st0, d2; \ | ||
491 | vmovdqu st1, d3; \ | ||
492 | \ | ||
493 | vmovdqu b0, st0; \ | ||
494 | vmovdqu b1, st1; \ | ||
495 | transpose_4x4(a2, b2, c2, d2, b0, b1); \ | ||
496 | transpose_4x4(a3, b3, c3, d3, b0, b1); \ | ||
497 | vmovdqu st0, b0; \ | ||
498 | vmovdqu st1, b1; \ | ||
499 | /* does not adjust output bytes inside vectors */ | ||
500 | |||
501 | /* load blocks to registers and apply pre-whitening */ | ||
502 | #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
503 | y6, y7, rio, key) \ | ||
504 | vpbroadcastq key, x0; \ | ||
505 | vpshufb .Lpack_bswap, x0, x0; \ | ||
506 | \ | ||
507 | vpxor 0 * 32(rio), x0, y7; \ | ||
508 | vpxor 1 * 32(rio), x0, y6; \ | ||
509 | vpxor 2 * 32(rio), x0, y5; \ | ||
510 | vpxor 3 * 32(rio), x0, y4; \ | ||
511 | vpxor 4 * 32(rio), x0, y3; \ | ||
512 | vpxor 5 * 32(rio), x0, y2; \ | ||
513 | vpxor 6 * 32(rio), x0, y1; \ | ||
514 | vpxor 7 * 32(rio), x0, y0; \ | ||
515 | vpxor 8 * 32(rio), x0, x7; \ | ||
516 | vpxor 9 * 32(rio), x0, x6; \ | ||
517 | vpxor 10 * 32(rio), x0, x5; \ | ||
518 | vpxor 11 * 32(rio), x0, x4; \ | ||
519 | vpxor 12 * 32(rio), x0, x3; \ | ||
520 | vpxor 13 * 32(rio), x0, x2; \ | ||
521 | vpxor 14 * 32(rio), x0, x1; \ | ||
522 | vpxor 15 * 32(rio), x0, x0; | ||
523 | |||
524 | /* byteslice pre-whitened blocks and store to temporary memory */ | ||
525 | #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
526 | y6, y7, mem_ab, mem_cd) \ | ||
527 | byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ | ||
528 | y4, y5, y6, y7, (mem_ab), (mem_cd)); \ | ||
529 | \ | ||
530 | vmovdqu x0, 0 * 32(mem_ab); \ | ||
531 | vmovdqu x1, 1 * 32(mem_ab); \ | ||
532 | vmovdqu x2, 2 * 32(mem_ab); \ | ||
533 | vmovdqu x3, 3 * 32(mem_ab); \ | ||
534 | vmovdqu x4, 4 * 32(mem_ab); \ | ||
535 | vmovdqu x5, 5 * 32(mem_ab); \ | ||
536 | vmovdqu x6, 6 * 32(mem_ab); \ | ||
537 | vmovdqu x7, 7 * 32(mem_ab); \ | ||
538 | vmovdqu y0, 0 * 32(mem_cd); \ | ||
539 | vmovdqu y1, 1 * 32(mem_cd); \ | ||
540 | vmovdqu y2, 2 * 32(mem_cd); \ | ||
541 | vmovdqu y3, 3 * 32(mem_cd); \ | ||
542 | vmovdqu y4, 4 * 32(mem_cd); \ | ||
543 | vmovdqu y5, 5 * 32(mem_cd); \ | ||
544 | vmovdqu y6, 6 * 32(mem_cd); \ | ||
545 | vmovdqu y7, 7 * 32(mem_cd); | ||
546 | |||
547 | /* de-byteslice, apply post-whitening and store blocks */ | ||
548 | #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ | ||
549 | y5, y6, y7, key, stack_tmp0, stack_tmp1) \ | ||
550 | byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ | ||
551 | y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ | ||
552 | \ | ||
553 | vmovdqu x0, stack_tmp0; \ | ||
554 | \ | ||
555 | vpbroadcastq key, x0; \ | ||
556 | vpshufb .Lpack_bswap, x0, x0; \ | ||
557 | \ | ||
558 | vpxor x0, y7, y7; \ | ||
559 | vpxor x0, y6, y6; \ | ||
560 | vpxor x0, y5, y5; \ | ||
561 | vpxor x0, y4, y4; \ | ||
562 | vpxor x0, y3, y3; \ | ||
563 | vpxor x0, y2, y2; \ | ||
564 | vpxor x0, y1, y1; \ | ||
565 | vpxor x0, y0, y0; \ | ||
566 | vpxor x0, x7, x7; \ | ||
567 | vpxor x0, x6, x6; \ | ||
568 | vpxor x0, x5, x5; \ | ||
569 | vpxor x0, x4, x4; \ | ||
570 | vpxor x0, x3, x3; \ | ||
571 | vpxor x0, x2, x2; \ | ||
572 | vpxor x0, x1, x1; \ | ||
573 | vpxor stack_tmp0, x0, x0; | ||
574 | |||
575 | #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
576 | y6, y7, rio) \ | ||
577 | vmovdqu x0, 0 * 32(rio); \ | ||
578 | vmovdqu x1, 1 * 32(rio); \ | ||
579 | vmovdqu x2, 2 * 32(rio); \ | ||
580 | vmovdqu x3, 3 * 32(rio); \ | ||
581 | vmovdqu x4, 4 * 32(rio); \ | ||
582 | vmovdqu x5, 5 * 32(rio); \ | ||
583 | vmovdqu x6, 6 * 32(rio); \ | ||
584 | vmovdqu x7, 7 * 32(rio); \ | ||
585 | vmovdqu y0, 8 * 32(rio); \ | ||
586 | vmovdqu y1, 9 * 32(rio); \ | ||
587 | vmovdqu y2, 10 * 32(rio); \ | ||
588 | vmovdqu y3, 11 * 32(rio); \ | ||
589 | vmovdqu y4, 12 * 32(rio); \ | ||
590 | vmovdqu y5, 13 * 32(rio); \ | ||
591 | vmovdqu y6, 14 * 32(rio); \ | ||
592 | vmovdqu y7, 15 * 32(rio); | ||
593 | |||
594 | .data | ||
595 | .align 32 | ||
596 | |||
597 | #define SHUFB_BYTES(idx) \ | ||
598 | 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) | ||
599 | |||
600 | .Lshufb_16x16b: | ||
601 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) | ||
602 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) | ||
603 | |||
604 | .Lpack_bswap: | ||
605 | .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 | ||
606 | .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 | ||
607 | |||
608 | /* For CTR-mode IV byteswap */ | ||
609 | .Lbswap128_mask: | ||
610 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
611 | |||
612 | /* For XTS mode */ | ||
613 | .Lxts_gf128mul_and_shl1_mask_0: | ||
614 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
615 | .Lxts_gf128mul_and_shl1_mask_1: | ||
616 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
617 | |||
618 | /* | ||
619 | * pre-SubByte transform | ||
620 | * | ||
621 | * pre-lookup for sbox1, sbox2, sbox3: | ||
622 | * swap_bitendianness( | ||
623 | * isom_map_camellia_to_aes( | ||
624 | * camellia_f( | ||
625 | * swap_bitendianness(in) | ||
626 | * ) | ||
627 | * ) | ||
628 | * ) | ||
629 | * | ||
630 | * (note: '⊕ 0xc5' inside camellia_f()) | ||
631 | */ | ||
632 | .Lpre_tf_lo_s1: | ||
633 | .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 | ||
634 | .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 | ||
635 | .Lpre_tf_hi_s1: | ||
636 | .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a | ||
637 | .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 | ||
638 | |||
639 | /* | ||
640 | * pre-SubByte transform | ||
641 | * | ||
642 | * pre-lookup for sbox4: | ||
643 | * swap_bitendianness( | ||
644 | * isom_map_camellia_to_aes( | ||
645 | * camellia_f( | ||
646 | * swap_bitendianness(in <<< 1) | ||
647 | * ) | ||
648 | * ) | ||
649 | * ) | ||
650 | * | ||
651 | * (note: '⊕ 0xc5' inside camellia_f()) | ||
652 | */ | ||
653 | .Lpre_tf_lo_s4: | ||
654 | .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 | ||
655 | .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 | ||
656 | .Lpre_tf_hi_s4: | ||
657 | .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 | ||
658 | .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf | ||
659 | |||
660 | /* | ||
661 | * post-SubByte transform | ||
662 | * | ||
663 | * post-lookup for sbox1, sbox4: | ||
664 | * swap_bitendianness( | ||
665 | * camellia_h( | ||
666 | * isom_map_aes_to_camellia( | ||
667 | * swap_bitendianness( | ||
668 | * aes_inverse_affine_transform(in) | ||
669 | * ) | ||
670 | * ) | ||
671 | * ) | ||
672 | * ) | ||
673 | * | ||
674 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
675 | */ | ||
676 | .Lpost_tf_lo_s1: | ||
677 | .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 | ||
678 | .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 | ||
679 | .Lpost_tf_hi_s1: | ||
680 | .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 | ||
681 | .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c | ||
682 | |||
683 | /* | ||
684 | * post-SubByte transform | ||
685 | * | ||
686 | * post-lookup for sbox2: | ||
687 | * swap_bitendianness( | ||
688 | * camellia_h( | ||
689 | * isom_map_aes_to_camellia( | ||
690 | * swap_bitendianness( | ||
691 | * aes_inverse_affine_transform(in) | ||
692 | * ) | ||
693 | * ) | ||
694 | * ) | ||
695 | * ) <<< 1 | ||
696 | * | ||
697 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
698 | */ | ||
699 | .Lpost_tf_lo_s2: | ||
700 | .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 | ||
701 | .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 | ||
702 | .Lpost_tf_hi_s2: | ||
703 | .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 | ||
704 | .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 | ||
705 | |||
706 | /* | ||
707 | * post-SubByte transform | ||
708 | * | ||
709 | * post-lookup for sbox3: | ||
710 | * swap_bitendianness( | ||
711 | * camellia_h( | ||
712 | * isom_map_aes_to_camellia( | ||
713 | * swap_bitendianness( | ||
714 | * aes_inverse_affine_transform(in) | ||
715 | * ) | ||
716 | * ) | ||
717 | * ) | ||
718 | * ) >>> 1 | ||
719 | * | ||
720 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
721 | */ | ||
722 | .Lpost_tf_lo_s3: | ||
723 | .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 | ||
724 | .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 | ||
725 | .Lpost_tf_hi_s3: | ||
726 | .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 | ||
727 | .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 | ||
728 | |||
729 | /* For isolating SubBytes from AESENCLAST, inverse shift row */ | ||
730 | .Linv_shift_row: | ||
731 | .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b | ||
732 | .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 | ||
733 | |||
734 | .align 4 | ||
735 | /* 4-bit mask */ | ||
736 | .L0f0f0f0f: | ||
737 | .long 0x0f0f0f0f | ||
738 | |||
739 | .text | ||
740 | |||
741 | .align 8 | ||
742 | __camellia_enc_blk32: | ||
743 | /* input: | ||
744 | * %rdi: ctx, CTX | ||
745 | * %rax: temporary storage, 512 bytes | ||
746 | * %ymm0..%ymm15: 32 plaintext blocks | ||
747 | * output: | ||
748 | * %ymm0..%ymm15: 32 encrypted blocks, order swapped: | ||
749 | * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | ||
750 | */ | ||
751 | |||
752 | leaq 8 * 32(%rax), %rcx; | ||
753 | |||
754 | inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
755 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
756 | %ymm15, %rax, %rcx); | ||
757 | |||
758 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
759 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
760 | %ymm15, %rax, %rcx, 0); | ||
761 | |||
762 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
763 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
764 | %ymm15, | ||
765 | ((key_table + (8) * 8) + 0)(CTX), | ||
766 | ((key_table + (8) * 8) + 4)(CTX), | ||
767 | ((key_table + (8) * 8) + 8)(CTX), | ||
768 | ((key_table + (8) * 8) + 12)(CTX)); | ||
769 | |||
770 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
771 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
772 | %ymm15, %rax, %rcx, 8); | ||
773 | |||
774 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
775 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
776 | %ymm15, | ||
777 | ((key_table + (16) * 8) + 0)(CTX), | ||
778 | ((key_table + (16) * 8) + 4)(CTX), | ||
779 | ((key_table + (16) * 8) + 8)(CTX), | ||
780 | ((key_table + (16) * 8) + 12)(CTX)); | ||
781 | |||
782 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
783 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
784 | %ymm15, %rax, %rcx, 16); | ||
785 | |||
786 | movl $24, %r8d; | ||
787 | cmpl $16, key_length(CTX); | ||
788 | jne .Lenc_max32; | ||
789 | |||
790 | .Lenc_done: | ||
791 | /* load CD for output */ | ||
792 | vmovdqu 0 * 32(%rcx), %ymm8; | ||
793 | vmovdqu 1 * 32(%rcx), %ymm9; | ||
794 | vmovdqu 2 * 32(%rcx), %ymm10; | ||
795 | vmovdqu 3 * 32(%rcx), %ymm11; | ||
796 | vmovdqu 4 * 32(%rcx), %ymm12; | ||
797 | vmovdqu 5 * 32(%rcx), %ymm13; | ||
798 | vmovdqu 6 * 32(%rcx), %ymm14; | ||
799 | vmovdqu 7 * 32(%rcx), %ymm15; | ||
800 | |||
801 | outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
802 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
803 | %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); | ||
804 | |||
805 | ret; | ||
806 | |||
807 | .align 8 | ||
808 | .Lenc_max32: | ||
809 | movl $32, %r8d; | ||
810 | |||
811 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
812 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
813 | %ymm15, | ||
814 | ((key_table + (24) * 8) + 0)(CTX), | ||
815 | ((key_table + (24) * 8) + 4)(CTX), | ||
816 | ((key_table + (24) * 8) + 8)(CTX), | ||
817 | ((key_table + (24) * 8) + 12)(CTX)); | ||
818 | |||
819 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
820 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
821 | %ymm15, %rax, %rcx, 24); | ||
822 | |||
823 | jmp .Lenc_done; | ||
824 | ENDPROC(__camellia_enc_blk32) | ||
825 | |||
826 | .align 8 | ||
827 | __camellia_dec_blk32: | ||
828 | /* input: | ||
829 | * %rdi: ctx, CTX | ||
830 | * %rax: temporary storage, 512 bytes | ||
831 | * %r8d: 24 for 16 byte key, 32 for larger | ||
832 | * %ymm0..%ymm15: 32 encrypted blocks | ||
833 | * output: | ||
834 | * %ymm0..%ymm15: 32 plaintext blocks, order swapped: | ||
835 | * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | ||
836 | */ | ||
837 | |||
838 | leaq 8 * 32(%rax), %rcx; | ||
839 | |||
840 | inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
841 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
842 | %ymm15, %rax, %rcx); | ||
843 | |||
844 | cmpl $32, %r8d; | ||
845 | je .Ldec_max32; | ||
846 | |||
847 | .Ldec_max24: | ||
848 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
849 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
850 | %ymm15, %rax, %rcx, 16); | ||
851 | |||
852 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
853 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
854 | %ymm15, | ||
855 | ((key_table + (16) * 8) + 8)(CTX), | ||
856 | ((key_table + (16) * 8) + 12)(CTX), | ||
857 | ((key_table + (16) * 8) + 0)(CTX), | ||
858 | ((key_table + (16) * 8) + 4)(CTX)); | ||
859 | |||
860 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
861 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
862 | %ymm15, %rax, %rcx, 8); | ||
863 | |||
864 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
865 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
866 | %ymm15, | ||
867 | ((key_table + (8) * 8) + 8)(CTX), | ||
868 | ((key_table + (8) * 8) + 12)(CTX), | ||
869 | ((key_table + (8) * 8) + 0)(CTX), | ||
870 | ((key_table + (8) * 8) + 4)(CTX)); | ||
871 | |||
872 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
873 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
874 | %ymm15, %rax, %rcx, 0); | ||
875 | |||
876 | /* load CD for output */ | ||
877 | vmovdqu 0 * 32(%rcx), %ymm8; | ||
878 | vmovdqu 1 * 32(%rcx), %ymm9; | ||
879 | vmovdqu 2 * 32(%rcx), %ymm10; | ||
880 | vmovdqu 3 * 32(%rcx), %ymm11; | ||
881 | vmovdqu 4 * 32(%rcx), %ymm12; | ||
882 | vmovdqu 5 * 32(%rcx), %ymm13; | ||
883 | vmovdqu 6 * 32(%rcx), %ymm14; | ||
884 | vmovdqu 7 * 32(%rcx), %ymm15; | ||
885 | |||
886 | outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
887 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
888 | %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); | ||
889 | |||
890 | ret; | ||
891 | |||
892 | .align 8 | ||
893 | .Ldec_max32: | ||
894 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
895 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
896 | %ymm15, %rax, %rcx, 24); | ||
897 | |||
898 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
899 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
900 | %ymm15, | ||
901 | ((key_table + (24) * 8) + 8)(CTX), | ||
902 | ((key_table + (24) * 8) + 12)(CTX), | ||
903 | ((key_table + (24) * 8) + 0)(CTX), | ||
904 | ((key_table + (24) * 8) + 4)(CTX)); | ||
905 | |||
906 | jmp .Ldec_max24; | ||
907 | ENDPROC(__camellia_dec_blk32) | ||
908 | |||
909 | ENTRY(camellia_ecb_enc_32way) | ||
910 | /* input: | ||
911 | * %rdi: ctx, CTX | ||
912 | * %rsi: dst (32 blocks) | ||
913 | * %rdx: src (32 blocks) | ||
914 | */ | ||
915 | |||
916 | vzeroupper; | ||
917 | |||
918 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
919 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
920 | %ymm15, %rdx, (key_table)(CTX)); | ||
921 | |||
922 | /* now dst can be used as temporary buffer (even in src == dst case) */ | ||
923 | movq %rsi, %rax; | ||
924 | |||
925 | call __camellia_enc_blk32; | ||
926 | |||
927 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
928 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
929 | %ymm8, %rsi); | ||
930 | |||
931 | vzeroupper; | ||
932 | |||
933 | ret; | ||
934 | ENDPROC(camellia_ecb_enc_32way) | ||
935 | |||
936 | ENTRY(camellia_ecb_dec_32way) | ||
937 | /* input: | ||
938 | * %rdi: ctx, CTX | ||
939 | * %rsi: dst (32 blocks) | ||
940 | * %rdx: src (32 blocks) | ||
941 | */ | ||
942 | |||
943 | vzeroupper; | ||
944 | |||
945 | cmpl $16, key_length(CTX); | ||
946 | movl $32, %r8d; | ||
947 | movl $24, %eax; | ||
948 | cmovel %eax, %r8d; /* max */ | ||
949 | |||
950 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
951 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
952 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); | ||
953 | |||
954 | /* now dst can be used as temporary buffer (even in src == dst case) */ | ||
955 | movq %rsi, %rax; | ||
956 | |||
957 | call __camellia_dec_blk32; | ||
958 | |||
959 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
960 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
961 | %ymm8, %rsi); | ||
962 | |||
963 | vzeroupper; | ||
964 | |||
965 | ret; | ||
966 | ENDPROC(camellia_ecb_dec_32way) | ||
967 | |||
968 | ENTRY(camellia_cbc_dec_32way) | ||
969 | /* input: | ||
970 | * %rdi: ctx, CTX | ||
971 | * %rsi: dst (32 blocks) | ||
972 | * %rdx: src (32 blocks) | ||
973 | */ | ||
974 | |||
975 | vzeroupper; | ||
976 | |||
977 | cmpl $16, key_length(CTX); | ||
978 | movl $32, %r8d; | ||
979 | movl $24, %eax; | ||
980 | cmovel %eax, %r8d; /* max */ | ||
981 | |||
982 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
983 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
984 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); | ||
985 | |||
986 | movq %rsp, %r10; | ||
987 | cmpq %rsi, %rdx; | ||
988 | je .Lcbc_dec_use_stack; | ||
989 | |||
990 | /* dst can be used as temporary storage, src is not overwritten. */ | ||
991 | movq %rsi, %rax; | ||
992 | jmp .Lcbc_dec_continue; | ||
993 | |||
994 | .Lcbc_dec_use_stack: | ||
995 | /* | ||
996 | * dst still in-use (because dst == src), so use stack for temporary | ||
997 | * storage. | ||
998 | */ | ||
999 | subq $(16 * 32), %rsp; | ||
1000 | movq %rsp, %rax; | ||
1001 | |||
1002 | .Lcbc_dec_continue: | ||
1003 | call __camellia_dec_blk32; | ||
1004 | |||
1005 | vmovdqu %ymm7, (%rax); | ||
1006 | vpxor %ymm7, %ymm7, %ymm7; | ||
1007 | vinserti128 $1, (%rdx), %ymm7, %ymm7; | ||
1008 | vpxor (%rax), %ymm7, %ymm7; | ||
1009 | movq %r10, %rsp; | ||
1010 | vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; | ||
1011 | vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; | ||
1012 | vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; | ||
1013 | vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; | ||
1014 | vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; | ||
1015 | vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; | ||
1016 | vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; | ||
1017 | vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; | ||
1018 | vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; | ||
1019 | vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; | ||
1020 | vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; | ||
1021 | vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; | ||
1022 | vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; | ||
1023 | vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; | ||
1024 | vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; | ||
1025 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1026 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1027 | %ymm8, %rsi); | ||
1028 | |||
1029 | vzeroupper; | ||
1030 | |||
1031 | ret; | ||
1032 | ENDPROC(camellia_cbc_dec_32way) | ||
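camellia_cbc_dec_32way follows the usual CBC decryption rule P_i = D_K(C_i) ⊕ C_{i−1}: after the 32 blocks are decrypted, every block except the first is xored with the ciphertext block sitting 16 bytes earlier in the source buffer (that is what the n * 32 + 16 offsets and the vinserti128 of the first source block implement); xoring the very first block with the IV is left to the C glue code. A plain C reference of the per-block rule (a sketch; decrypt_block stands in for a hypothetical single-block Camellia decryption):

#include <stdint.h>
#include <stddef.h>

typedef void (*block_fn)(uint8_t out[16], const uint8_t in[16]);

/* CBC decryption: P[i] = D_K(C[i]) ^ C[i-1], with P[0] = D_K(C[0]) ^ IV.
 * Walking back to front keeps the needed previous ciphertext intact even
 * when dst == src. */
static void cbc_decrypt_ref(uint8_t *dst, const uint8_t *src, size_t nblocks,
			    const uint8_t iv[16], block_fn decrypt_block)
{
	size_t i, j;

	for (i = nblocks; i-- > 0; ) {
		const uint8_t *prev = i ? src + (i - 1) * 16 : iv;

		decrypt_block(dst + i * 16, src + i * 16);
		for (j = 0; j < 16; j++)
			dst[i * 16 + j] ^= prev[j];
	}
}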
1033 | |||
1034 | #define inc_le128(x, minus_one, tmp) \ | ||
1035 | vpcmpeqq minus_one, x, tmp; \ | ||
1036 | vpsubq minus_one, x, x; \ | ||
1037 | vpslldq $8, tmp, tmp; \ | ||
1038 | vpsubq tmp, x, x; | ||
1039 | |||
1040 | #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ | ||
1041 | vpcmpeqq minus_one, x, tmp1; \ | ||
1042 | vpcmpeqq minus_two, x, tmp2; \ | ||
1043 | vpsubq minus_two, x, x; \ | ||
1044 | vpor tmp2, tmp1, tmp1; \ | ||
1045 | vpslldq $8, tmp1, tmp1; \ | ||
1046 | vpsubq tmp1, x, x; | ||
1047 | |||
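inc_le128 and add2_le128 advance the 128-bit CTR counter, which the caller hands over in little-endian form (per the comment on camellia_ctr_32way below) so the arithmetic stays cheap; each counter value is byte-swapped with .Lbswap128_mask before it is fed to the cipher. vpcmpeqq detects, before the addition, whether the low qword is about to wrap (equal to -1, or to -1/-2 for the add-by-two variant); the resulting all-ones mask is moved into the high-qword position with vpslldq and subtracted, which propagates the carry. In scalar C the update is simply (sketch):

#include <stdint.h>

/* Add 'step' (1 or 2) to a 128-bit little-endian counter with carry. */
static void inc_le128_ref(uint64_t ctr[2], uint64_t step)
{
	uint64_t old = ctr[0];

	ctr[0] += step;
	if (ctr[0] < old)        /* unsigned wrap of the low qword */
		ctr[1]++;
}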
1048 | ENTRY(camellia_ctr_32way) | ||
1049 | /* input: | ||
1050 | * %rdi: ctx, CTX | ||
1051 | * %rsi: dst (32 blocks) | ||
1052 | * %rdx: src (32 blocks) | ||
1053 | * %rcx: iv (little endian, 128bit) | ||
1054 | */ | ||
1055 | |||
1056 | vzeroupper; | ||
1057 | |||
1058 | movq %rsp, %r10; | ||
1059 | cmpq %rsi, %rdx; | ||
1060 | je .Lctr_use_stack; | ||
1061 | |||
1062 | /* dst can be used as temporary storage, src is not overwritten. */ | ||
1063 | movq %rsi, %rax; | ||
1064 | jmp .Lctr_continue; | ||
1065 | |||
1066 | .Lctr_use_stack: | ||
1067 | subq $(16 * 32), %rsp; | ||
1068 | movq %rsp, %rax; | ||
1069 | |||
1070 | .Lctr_continue: | ||
1071 | vpcmpeqd %ymm15, %ymm15, %ymm15; | ||
1072 | vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ | ||
1073 | vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ | ||
1074 | |||
1075 | /* load IV and byteswap */ | ||
1076 | vmovdqu (%rcx), %xmm0; | ||
1077 | vmovdqa %xmm0, %xmm1; | ||
1078 | inc_le128(%xmm0, %xmm15, %xmm14); | ||
1079 | vbroadcasti128 .Lbswap128_mask, %ymm14; | ||
1080 | vinserti128 $1, %xmm0, %ymm1, %ymm0; | ||
1081 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1082 | vmovdqu %ymm13, 15 * 32(%rax); | ||
1083 | |||
1084 | /* construct IVs */ | ||
1085 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ | ||
1086 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1087 | vmovdqu %ymm13, 14 * 32(%rax); | ||
1088 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1089 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1090 | vmovdqu %ymm13, 13 * 32(%rax); | ||
1091 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1092 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1093 | vmovdqu %ymm13, 12 * 32(%rax); | ||
1094 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1095 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1096 | vmovdqu %ymm13, 11 * 32(%rax); | ||
1097 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1098 | vpshufb %ymm14, %ymm0, %ymm10; | ||
1099 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1100 | vpshufb %ymm14, %ymm0, %ymm9; | ||
1101 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1102 | vpshufb %ymm14, %ymm0, %ymm8; | ||
1103 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1104 | vpshufb %ymm14, %ymm0, %ymm7; | ||
1105 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1106 | vpshufb %ymm14, %ymm0, %ymm6; | ||
1107 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1108 | vpshufb %ymm14, %ymm0, %ymm5; | ||
1109 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1110 | vpshufb %ymm14, %ymm0, %ymm4; | ||
1111 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1112 | vpshufb %ymm14, %ymm0, %ymm3; | ||
1113 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1114 | vpshufb %ymm14, %ymm0, %ymm2; | ||
1115 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1116 | vpshufb %ymm14, %ymm0, %ymm1; | ||
1117 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1118 | vextracti128 $1, %ymm0, %xmm13; | ||
1119 | vpshufb %ymm14, %ymm0, %ymm0; | ||
1120 | inc_le128(%xmm13, %xmm15, %xmm14); | ||
1121 | vmovdqu %xmm13, (%rcx); | ||
1122 | |||
1123 | /* inpack32_pre: */ | ||
1124 | vpbroadcastq (key_table)(CTX), %ymm15; | ||
1125 | vpshufb .Lpack_bswap, %ymm15, %ymm15; | ||
1126 | vpxor %ymm0, %ymm15, %ymm0; | ||
1127 | vpxor %ymm1, %ymm15, %ymm1; | ||
1128 | vpxor %ymm2, %ymm15, %ymm2; | ||
1129 | vpxor %ymm3, %ymm15, %ymm3; | ||
1130 | vpxor %ymm4, %ymm15, %ymm4; | ||
1131 | vpxor %ymm5, %ymm15, %ymm5; | ||
1132 | vpxor %ymm6, %ymm15, %ymm6; | ||
1133 | vpxor %ymm7, %ymm15, %ymm7; | ||
1134 | vpxor %ymm8, %ymm15, %ymm8; | ||
1135 | vpxor %ymm9, %ymm15, %ymm9; | ||
1136 | vpxor %ymm10, %ymm15, %ymm10; | ||
1137 | vpxor 11 * 32(%rax), %ymm15, %ymm11; | ||
1138 | vpxor 12 * 32(%rax), %ymm15, %ymm12; | ||
1139 | vpxor 13 * 32(%rax), %ymm15, %ymm13; | ||
1140 | vpxor 14 * 32(%rax), %ymm15, %ymm14; | ||
1141 | vpxor 15 * 32(%rax), %ymm15, %ymm15; | ||
1142 | |||
1143 | call __camellia_enc_blk32; | ||
1144 | |||
1145 | movq %r10, %rsp; | ||
1146 | |||
1147 | vpxor 0 * 32(%rdx), %ymm7, %ymm7; | ||
1148 | vpxor 1 * 32(%rdx), %ymm6, %ymm6; | ||
1149 | vpxor 2 * 32(%rdx), %ymm5, %ymm5; | ||
1150 | vpxor 3 * 32(%rdx), %ymm4, %ymm4; | ||
1151 | vpxor 4 * 32(%rdx), %ymm3, %ymm3; | ||
1152 | vpxor 5 * 32(%rdx), %ymm2, %ymm2; | ||
1153 | vpxor 6 * 32(%rdx), %ymm1, %ymm1; | ||
1154 | vpxor 7 * 32(%rdx), %ymm0, %ymm0; | ||
1155 | vpxor 8 * 32(%rdx), %ymm15, %ymm15; | ||
1156 | vpxor 9 * 32(%rdx), %ymm14, %ymm14; | ||
1157 | vpxor 10 * 32(%rdx), %ymm13, %ymm13; | ||
1158 | vpxor 11 * 32(%rdx), %ymm12, %ymm12; | ||
1159 | vpxor 12 * 32(%rdx), %ymm11, %ymm11; | ||
1160 | vpxor 13 * 32(%rdx), %ymm10, %ymm10; | ||
1161 | vpxor 14 * 32(%rdx), %ymm9, %ymm9; | ||
1162 | vpxor 15 * 32(%rdx), %ymm8, %ymm8; | ||
1163 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1164 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1165 | %ymm8, %rsi); | ||
1166 | |||
1167 | vzeroupper; | ||
1168 | |||
1169 | ret; | ||
1170 | ENDPROC(camellia_ctr_32way) | ||
1171 | |||
1172 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
1173 | vpsrad $31, iv, tmp; \ | ||
1174 | vpaddq iv, iv, iv; \ | ||
1175 | vpshufd $0x13, tmp, tmp; \ | ||
1176 | vpand mask, tmp, tmp; \ | ||
1177 | vpxor tmp, iv, iv; | ||
1178 | |||
1179 | #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ | ||
1180 | vpsrad $31, iv, tmp0; \ | ||
1181 | vpaddq iv, iv, tmp1; \ | ||
1182 | vpsllq $2, iv, iv; \ | ||
1183 | vpshufd $0x13, tmp0, tmp0; \ | ||
1184 | vpsrad $31, tmp1, tmp1; \ | ||
1185 | vpand mask2, tmp0, tmp0; \ | ||
1186 | vpshufd $0x13, tmp1, tmp1; \ | ||
1187 | vpxor tmp0, iv, iv; \ | ||
1188 | vpand mask1, tmp1, tmp1; \ | ||
1189 | vpxor tmp1, iv, iv; | ||
1190 | |||
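Since every ymm register holds two consecutive XTS blocks (the low lane gets tweak T, the high lane T·α), the 32-way code advances the tweak pair by α² per step. gf128mul_x2_ble is simply two of the single doublings folded together; the second mask, .Lxts_gf128mul_and_shl1_mask_1, carries the pre-shifted reduction constants for the extra bit position. A scalar sketch showing that applying the single doubling twice gives the same result:

#include <stdint.h>

/* Multiply a 128-bit little-endian XTS tweak by alpha^2 in GF(2^128):
 * two single-bit doublings, each reduced with 0x87. */
static void gf128mul_x2_ble_ref(uint64_t t[2])
{
	int i;

	for (i = 0; i < 2; i++) {
		uint64_t carry = t[1] >> 63;

		t[1] = (t[1] << 1) | (t[0] >> 63);
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	}
}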
1191 | .align 8 | ||
1192 | camellia_xts_crypt_32way: | ||
1193 | /* input: | ||
1194 | * %rdi: ctx, CTX | ||
1195 | * %rsi: dst (32 blocks) | ||
1196 | * %rdx: src (32 blocks) | ||
1197 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1198 | * %r8: index for input whitening key | ||
1199 | * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 | ||
1200 | */ | ||
1201 | |||
1202 | vzeroupper; | ||
1203 | |||
1204 | subq $(16 * 32), %rsp; | ||
1205 | movq %rsp, %rax; | ||
1206 | |||
1207 | vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; | ||
1208 | |||
1209 | /* load IV and construct second IV */ | ||
1210 | vmovdqu (%rcx), %xmm0; | ||
1211 | vmovdqa %xmm0, %xmm15; | ||
1212 | gf128mul_x_ble(%xmm0, %xmm12, %xmm13); | ||
1213 | vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; | ||
1214 | vinserti128 $1, %xmm0, %ymm15, %ymm0; | ||
1215 | vpxor 0 * 32(%rdx), %ymm0, %ymm15; | ||
1216 | vmovdqu %ymm15, 15 * 32(%rax); | ||
1217 | vmovdqu %ymm0, 0 * 32(%rsi); | ||
1218 | |||
1219 | /* construct IVs */ | ||
1220 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1221 | vpxor 1 * 32(%rdx), %ymm0, %ymm15; | ||
1222 | vmovdqu %ymm15, 14 * 32(%rax); | ||
1223 | vmovdqu %ymm0, 1 * 32(%rsi); | ||
1224 | |||
1225 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1226 | vpxor 2 * 32(%rdx), %ymm0, %ymm15; | ||
1227 | vmovdqu %ymm15, 13 * 32(%rax); | ||
1228 | vmovdqu %ymm0, 2 * 32(%rsi); | ||
1229 | |||
1230 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1231 | vpxor 3 * 32(%rdx), %ymm0, %ymm15; | ||
1232 | vmovdqu %ymm15, 12 * 32(%rax); | ||
1233 | vmovdqu %ymm0, 3 * 32(%rsi); | ||
1234 | |||
1235 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1236 | vpxor 4 * 32(%rdx), %ymm0, %ymm11; | ||
1237 | vmovdqu %ymm0, 4 * 32(%rsi); | ||
1238 | |||
1239 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1240 | vpxor 5 * 32(%rdx), %ymm0, %ymm10; | ||
1241 | vmovdqu %ymm0, 5 * 32(%rsi); | ||
1242 | |||
1243 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1244 | vpxor 6 * 32(%rdx), %ymm0, %ymm9; | ||
1245 | vmovdqu %ymm0, 6 * 32(%rsi); | ||
1246 | |||
1247 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1248 | vpxor 7 * 32(%rdx), %ymm0, %ymm8; | ||
1249 | vmovdqu %ymm0, 7 * 32(%rsi); | ||
1250 | |||
1251 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1252 | vpxor 8 * 32(%rdx), %ymm0, %ymm7; | ||
1253 | vmovdqu %ymm0, 8 * 32(%rsi); | ||
1254 | |||
1255 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1256 | vpxor 9 * 32(%rdx), %ymm0, %ymm6; | ||
1257 | vmovdqu %ymm0, 9 * 32(%rsi); | ||
1258 | |||
1259 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1260 | vpxor 10 * 32(%rdx), %ymm0, %ymm5; | ||
1261 | vmovdqu %ymm0, 10 * 32(%rsi); | ||
1262 | |||
1263 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1264 | vpxor 11 * 32(%rdx), %ymm0, %ymm4; | ||
1265 | vmovdqu %ymm0, 11 * 32(%rsi); | ||
1266 | |||
1267 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1268 | vpxor 12 * 32(%rdx), %ymm0, %ymm3; | ||
1269 | vmovdqu %ymm0, 12 * 32(%rsi); | ||
1270 | |||
1271 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1272 | vpxor 13 * 32(%rdx), %ymm0, %ymm2; | ||
1273 | vmovdqu %ymm0, 13 * 32(%rsi); | ||
1274 | |||
1275 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1276 | vpxor 14 * 32(%rdx), %ymm0, %ymm1; | ||
1277 | vmovdqu %ymm0, 14 * 32(%rsi); | ||
1278 | |||
1279 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1280 | vpxor 15 * 32(%rdx), %ymm0, %ymm15; | ||
1281 | vmovdqu %ymm15, 0 * 32(%rax); | ||
1282 | vmovdqu %ymm0, 15 * 32(%rsi); | ||
1283 | |||
1284 | vextracti128 $1, %ymm0, %xmm0; | ||
1285 | gf128mul_x_ble(%xmm0, %xmm12, %xmm15); | ||
1286 | vmovdqu %xmm0, (%rcx); | ||
1287 | |||
1288 | /* inpack32_pre: */ | ||
1289 | vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; | ||
1290 | vpshufb .Lpack_bswap, %ymm15, %ymm15; | ||
1291 | vpxor 0 * 32(%rax), %ymm15, %ymm0; | ||
1292 | vpxor %ymm1, %ymm15, %ymm1; | ||
1293 | vpxor %ymm2, %ymm15, %ymm2; | ||
1294 | vpxor %ymm3, %ymm15, %ymm3; | ||
1295 | vpxor %ymm4, %ymm15, %ymm4; | ||
1296 | vpxor %ymm5, %ymm15, %ymm5; | ||
1297 | vpxor %ymm6, %ymm15, %ymm6; | ||
1298 | vpxor %ymm7, %ymm15, %ymm7; | ||
1299 | vpxor %ymm8, %ymm15, %ymm8; | ||
1300 | vpxor %ymm9, %ymm15, %ymm9; | ||
1301 | vpxor %ymm10, %ymm15, %ymm10; | ||
1302 | vpxor %ymm11, %ymm15, %ymm11; | ||
1303 | vpxor 12 * 32(%rax), %ymm15, %ymm12; | ||
1304 | vpxor 13 * 32(%rax), %ymm15, %ymm13; | ||
1305 | vpxor 14 * 32(%rax), %ymm15, %ymm14; | ||
1306 | vpxor 15 * 32(%rax), %ymm15, %ymm15; | ||
1307 | |||
1308 | call *%r9; | ||
1309 | |||
1310 | addq $(16 * 32), %rsp; | ||
1311 | |||
1312 | vpxor 0 * 32(%rsi), %ymm7, %ymm7; | ||
1313 | vpxor 1 * 32(%rsi), %ymm6, %ymm6; | ||
1314 | vpxor 2 * 32(%rsi), %ymm5, %ymm5; | ||
1315 | vpxor 3 * 32(%rsi), %ymm4, %ymm4; | ||
1316 | vpxor 4 * 32(%rsi), %ymm3, %ymm3; | ||
1317 | vpxor 5 * 32(%rsi), %ymm2, %ymm2; | ||
1318 | vpxor 6 * 32(%rsi), %ymm1, %ymm1; | ||
1319 | vpxor 7 * 32(%rsi), %ymm0, %ymm0; | ||
1320 | vpxor 8 * 32(%rsi), %ymm15, %ymm15; | ||
1321 | vpxor 9 * 32(%rsi), %ymm14, %ymm14; | ||
1322 | vpxor 10 * 32(%rsi), %ymm13, %ymm13; | ||
1323 | vpxor 11 * 32(%rsi), %ymm12, %ymm12; | ||
1324 | vpxor 12 * 32(%rsi), %ymm11, %ymm11; | ||
1325 | vpxor 13 * 32(%rsi), %ymm10, %ymm10; | ||
1326 | vpxor 14 * 32(%rsi), %ymm9, %ymm9; | ||
1327 | vpxor 15 * 32(%rsi), %ymm8, %ymm8; | ||
1328 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1329 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1330 | %ymm8, %rsi); | ||
1331 | |||
1332 | vzeroupper; | ||
1333 | |||
1334 | ret; | ||
1335 | ENDPROC(camellia_xts_crypt_32way) | ||
1336 | |||
1337 | ENTRY(camellia_xts_enc_32way) | ||
1338 | /* input: | ||
1339 | * %rdi: ctx, CTX | ||
1340 | * %rsi: dst (32 blocks) | ||
1341 | * %rdx: src (32 blocks) | ||
1342 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1343 | */ | ||
1344 | |||
1345 | xorl %r8d, %r8d; /* input whitening key, 0 for enc */ | ||
1346 | |||
1347 | leaq __camellia_enc_blk32, %r9; | ||
1348 | |||
1349 | jmp camellia_xts_crypt_32way; | ||
1350 | ENDPROC(camellia_xts_enc_32way) | ||
1351 | |||
1352 | ENTRY(camellia_xts_dec_32way) | ||
1353 | /* input: | ||
1354 | * %rdi: ctx, CTX | ||
1355 | * %rsi: dst (32 blocks) | ||
1356 | * %rdx: src (32 blocks) | ||
1357 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1358 | */ | ||
1359 | |||
1360 | cmpl $16, key_length(CTX); | ||
1361 | movl $32, %r8d; | ||
1362 | movl $24, %eax; | ||
1363 | cmovel %eax, %r8d; /* input whitening key, last for dec */ | ||
1364 | |||
1365 | leaq __camellia_dec_blk32, %r9; | ||
1366 | |||
1367 | jmp camellia_xts_crypt_32way; | ||
1368 | ENDPROC(camellia_xts_dec_32way) | ||
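Both entry points only choose the whitening-key index and the block function before tail-jumping into the shared camellia_xts_crypt_32way body above. Encryption whitens with key index 0; decryption walks the key schedule from the other end, so the index depends on the key size (key_length is in bytes, so 16 means a 128-bit key). A sketch of the selection the decryption entry makes, for illustration only:

    /* Sketch of the index picked by camellia_xts_dec_32way above. */
    static inline int dec_whitening_index(unsigned int key_length)
    {
            /* 128-bit keys use the shorter (24-entry) schedule */
            return key_length == 16 ? 24 : 32;
    }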
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c new file mode 100644 index 000000000000..414fe5d7946b --- /dev/null +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c | |||
@@ -0,0 +1,586 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia | ||
3 | * | ||
4 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/lrw.h> | ||
20 | #include <crypto/xts.h> | ||
21 | #include <asm/xcr.h> | ||
22 | #include <asm/xsave.h> | ||
23 | #include <asm/crypto/camellia.h> | ||
24 | #include <asm/crypto/ablk_helper.h> | ||
25 | #include <asm/crypto/glue_helper.h> | ||
26 | |||
27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 | ||
28 | #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 | ||
29 | |||
30 | /* 32-way AVX2/AES-NI parallel cipher functions */ | ||
31 | asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst, | ||
32 | const u8 *src); | ||
33 | asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
34 | const u8 *src); | ||
35 | |||
36 | asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
37 | const u8 *src); | ||
38 | asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst, | ||
39 | const u8 *src, le128 *iv); | ||
40 | |||
41 | asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
44 | const u8 *src, le128 *iv); | ||
45 | |||
46 | static const struct common_glue_ctx camellia_enc = { | ||
47 | .num_funcs = 4, | ||
48 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
49 | |||
50 | .funcs = { { | ||
51 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
52 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) } | ||
53 | }, { | ||
54 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
55 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } | ||
56 | }, { | ||
57 | .num_blocks = 2, | ||
58 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } | ||
59 | }, { | ||
60 | .num_blocks = 1, | ||
61 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } | ||
62 | } } | ||
63 | }; | ||
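The common_glue_ctx tables list implementations from widest to narrowest; the shared glue code consumes them by always using the widest entry whose block count still fits the remaining data, so a long request is handled mostly by the 32-way AVX2 code and only the tail drops down to the 16-way, 2-way and single-block routines. A simplified sketch of that selection loop (an assumption about how glue_ecb_crypt_128bit behaves, not its exact body):

    static void ecb_walk_sketch(const struct common_glue_ctx *gctx, void *ctx,
                                u8 *dst, const u8 *src, unsigned int nblocks)
    {
            unsigned int i;

            while (nblocks) {
                    for (i = 0; i < gctx->num_funcs; i++) {
                            unsigned int n = gctx->funcs[i].num_blocks;

                            if (nblocks < n)
                                    continue;       /* too few blocks left, try a narrower entry */

                            gctx->funcs[i].fn_u.ecb(ctx, dst, src);
                            src += n * CAMELLIA_BLOCK_SIZE;
                            dst += n * CAMELLIA_BLOCK_SIZE;
                            nblocks -= n;
                            break;
                    }
            }
    }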
64 | |||
65 | static const struct common_glue_ctx camellia_ctr = { | ||
66 | .num_funcs = 4, | ||
67 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
68 | |||
69 | .funcs = { { | ||
70 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
71 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) } | ||
72 | }, { | ||
73 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
74 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } | ||
75 | }, { | ||
76 | .num_blocks = 2, | ||
77 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } | ||
78 | }, { | ||
79 | .num_blocks = 1, | ||
80 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } | ||
81 | } } | ||
82 | }; | ||
83 | |||
84 | static const struct common_glue_ctx camellia_enc_xts = { | ||
85 | .num_funcs = 3, | ||
86 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
87 | |||
88 | .funcs = { { | ||
89 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
90 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) } | ||
91 | }, { | ||
92 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
93 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } | ||
94 | }, { | ||
95 | .num_blocks = 1, | ||
96 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } | ||
97 | } } | ||
98 | }; | ||
99 | |||
100 | static const struct common_glue_ctx camellia_dec = { | ||
101 | .num_funcs = 4, | ||
102 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
103 | |||
104 | .funcs = { { | ||
105 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
106 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) } | ||
107 | }, { | ||
108 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
109 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } | ||
110 | }, { | ||
111 | .num_blocks = 2, | ||
112 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } | ||
113 | }, { | ||
114 | .num_blocks = 1, | ||
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } | ||
116 | } } | ||
117 | }; | ||
118 | |||
119 | static const struct common_glue_ctx camellia_dec_cbc = { | ||
120 | .num_funcs = 4, | ||
121 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
122 | |||
123 | .funcs = { { | ||
124 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
125 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) } | ||
126 | }, { | ||
127 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
128 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } | ||
129 | }, { | ||
130 | .num_blocks = 2, | ||
131 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } | ||
132 | }, { | ||
133 | .num_blocks = 1, | ||
134 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } | ||
135 | } } | ||
136 | }; | ||
137 | |||
138 | static const struct common_glue_ctx camellia_dec_xts = { | ||
139 | .num_funcs = 3, | ||
140 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
141 | |||
142 | .funcs = { { | ||
143 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
144 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) } | ||
145 | }, { | ||
146 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
147 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } | ||
148 | }, { | ||
149 | .num_blocks = 1, | ||
150 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } | ||
151 | } } | ||
152 | }; | ||
153 | |||
154 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
155 | struct scatterlist *src, unsigned int nbytes) | ||
156 | { | ||
157 | return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); | ||
158 | } | ||
159 | |||
160 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); | ||
164 | } | ||
165 | |||
166 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, | ||
170 | dst, src, nbytes); | ||
171 | } | ||
172 | |||
173 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
174 | struct scatterlist *src, unsigned int nbytes) | ||
175 | { | ||
176 | return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, | ||
177 | nbytes); | ||
178 | } | ||
179 | |||
180 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
181 | struct scatterlist *src, unsigned int nbytes) | ||
182 | { | ||
183 | return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); | ||
184 | } | ||
185 | |||
186 | static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
187 | { | ||
188 | return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, | ||
189 | CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, | ||
190 | nbytes); | ||
191 | } | ||
192 | |||
193 | static inline void camellia_fpu_end(bool fpu_enabled) | ||
194 | { | ||
195 | glue_fpu_end(fpu_enabled); | ||
196 | } | ||
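camellia_fpu_begin()/camellia_fpu_end() wrap the kernel_fpu_begin()/kernel_fpu_end() bookkeeping that any kernel code touching the SSE/AVX register file must do. The glue helper only turns the FPU on once at least one full 16-block batch is pending, since saving and restoring the extended state costs more than encrypting a few blocks with the scalar code. A conceptual sketch of that policy (not the helper's actual body):

    static bool fpu_begin_sketch(bool fpu_enabled, unsigned int nbytes)
    {
            if (fpu_enabled)
                    return true;            /* already on for this request */

            if (nbytes < CAMELLIA_AESNI_PARALLEL_BLOCKS * CAMELLIA_BLOCK_SIZE)
                    return false;           /* short tail: stay on the scalar path */

            kernel_fpu_begin();
            return true;
    }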
197 | |||
198 | static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, | ||
199 | unsigned int key_len) | ||
200 | { | ||
201 | return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, | ||
202 | &tfm->crt_flags); | ||
203 | } | ||
204 | |||
205 | struct crypt_priv { | ||
206 | struct camellia_ctx *ctx; | ||
207 | bool fpu_enabled; | ||
208 | }; | ||
209 | |||
210 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
211 | { | ||
212 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | ||
213 | struct crypt_priv *ctx = priv; | ||
214 | int i; | ||
215 | |||
216 | ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); | ||
217 | |||
218 | if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { | ||
219 | camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst); | ||
220 | srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
221 | nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
222 | } | ||
223 | |||
224 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | ||
225 | camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
226 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
227 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
228 | } | ||
229 | |||
230 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | ||
231 | camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); | ||
232 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
233 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
234 | } | ||
235 | |||
236 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
237 | camellia_enc_blk(ctx->ctx, srcdst, srcdst); | ||
238 | } | ||
239 | |||
240 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
241 | { | ||
242 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | ||
243 | struct crypt_priv *ctx = priv; | ||
244 | int i; | ||
245 | |||
246 | ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); | ||
247 | |||
248 | if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { | ||
249 | camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst); | ||
250 | srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
251 | nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
252 | } | ||
253 | |||
254 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | ||
255 | camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
256 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
257 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
258 | } | ||
259 | |||
260 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | ||
261 | camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); | ||
262 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
263 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
264 | } | ||
265 | |||
266 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
267 | camellia_dec_blk(ctx->ctx, srcdst, srcdst); | ||
268 | } | ||
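Both LRW callbacks above process one bounce-buffer batch with the widest code that still fits: at most one 32-way call, then at most one 16-way call, then 2-way pairs, then single blocks. Because lrw_crypt() hands the callback at most CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS blocks at a time (the tbuf size below), plain if tests suffice for the two widest steps; for example a 27-block batch becomes one 16-way call, five 2-way calls and one single-block call. A compact mirror of that carving, for illustration only:

    /* Illustration: how one batch of n <= 32 blocks is carved up. */
    static void carve_batch_sketch(unsigned int n)
    {
            if (n >= 32)            /* one camellia_ecb_*_32way call */
                    n -= 32;
            if (n >= 16)            /* one camellia_ecb_*_16way call */
                    n -= 16;
            while (n >= 2)          /* camellia_*_blk_2way calls */
                    n -= 2;
            /* any remaining block is handled by camellia_*_blk() */
    }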
269 | |||
270 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
271 | struct scatterlist *src, unsigned int nbytes) | ||
272 | { | ||
273 | struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
274 | be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; | ||
275 | struct crypt_priv crypt_ctx = { | ||
276 | .ctx = &ctx->camellia_ctx, | ||
277 | .fpu_enabled = false, | ||
278 | }; | ||
279 | struct lrw_crypt_req req = { | ||
280 | .tbuf = buf, | ||
281 | .tbuflen = sizeof(buf), | ||
282 | |||
283 | .table_ctx = &ctx->lrw_table, | ||
284 | .crypt_ctx = &crypt_ctx, | ||
285 | .crypt_fn = encrypt_callback, | ||
286 | }; | ||
287 | int ret; | ||
288 | |||
289 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
290 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
291 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
292 | |||
293 | return ret; | ||
294 | } | ||
295 | |||
296 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
297 | struct scatterlist *src, unsigned int nbytes) | ||
298 | { | ||
299 | struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
300 | be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; | ||
301 | struct crypt_priv crypt_ctx = { | ||
302 | .ctx = &ctx->camellia_ctx, | ||
303 | .fpu_enabled = false, | ||
304 | }; | ||
305 | struct lrw_crypt_req req = { | ||
306 | .tbuf = buf, | ||
307 | .tbuflen = sizeof(buf), | ||
308 | |||
309 | .table_ctx = &ctx->lrw_table, | ||
310 | .crypt_ctx = &crypt_ctx, | ||
311 | .crypt_fn = decrypt_callback, | ||
312 | }; | ||
313 | int ret; | ||
314 | |||
315 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
316 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
317 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
318 | |||
319 | return ret; | ||
320 | } | ||
321 | |||
322 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
323 | struct scatterlist *src, unsigned int nbytes) | ||
324 | { | ||
325 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
326 | |||
327 | return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, | ||
328 | XTS_TWEAK_CAST(camellia_enc_blk), | ||
329 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
330 | } | ||
331 | |||
332 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
333 | struct scatterlist *src, unsigned int nbytes) | ||
334 | { | ||
335 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
336 | |||
337 | return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, | ||
338 | XTS_TWEAK_CAST(camellia_enc_blk), | ||
339 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
340 | } | ||
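Note that both xts_encrypt() and xts_decrypt() pass camellia_enc_blk as the tweak function: in XTS the tweak is always produced by encrypting the sector IV under the second key, and only the per-block data cipher changes direction. The per-block relation the assembly implements is C = E_K1(P ⊕ T) ⊕ T with T = E_K2(IV) · αʲ for block j; a one-block C sketch (illustration only, reusing the existing single-block helper):

    static void xts_one_block_sketch(struct camellia_ctx *data_key, u8 *dst,
                                     const u8 *src, const u8 tweak[16])
    {
            u8 buf[CAMELLIA_BLOCK_SIZE];
            int i;

            for (i = 0; i < CAMELLIA_BLOCK_SIZE; i++)
                    buf[i] = src[i] ^ tweak[i];             /* PP = P xor T */

            camellia_enc_blk(data_key, buf, buf);           /* CC = E_K1(PP) */

            for (i = 0; i < CAMELLIA_BLOCK_SIZE; i++)
                    dst[i] = buf[i] ^ tweak[i];             /* C = CC xor T */
    }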
341 | |||
342 | static struct crypto_alg cmll_algs[10] = { { | ||
343 | .cra_name = "__ecb-camellia-aesni-avx2", | ||
344 | .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", | ||
345 | .cra_priority = 0, | ||
346 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
347 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
348 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
349 | .cra_alignmask = 0, | ||
350 | .cra_type = &crypto_blkcipher_type, | ||
351 | .cra_module = THIS_MODULE, | ||
352 | .cra_u = { | ||
353 | .blkcipher = { | ||
354 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
355 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
356 | .setkey = camellia_setkey, | ||
357 | .encrypt = ecb_encrypt, | ||
358 | .decrypt = ecb_decrypt, | ||
359 | }, | ||
360 | }, | ||
361 | }, { | ||
362 | .cra_name = "__cbc-camellia-aesni-avx2", | ||
363 | .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", | ||
364 | .cra_priority = 0, | ||
365 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
366 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
367 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
368 | .cra_alignmask = 0, | ||
369 | .cra_type = &crypto_blkcipher_type, | ||
370 | .cra_module = THIS_MODULE, | ||
371 | .cra_u = { | ||
372 | .blkcipher = { | ||
373 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
374 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
375 | .setkey = camellia_setkey, | ||
376 | .encrypt = cbc_encrypt, | ||
377 | .decrypt = cbc_decrypt, | ||
378 | }, | ||
379 | }, | ||
380 | }, { | ||
381 | .cra_name = "__ctr-camellia-aesni-avx2", | ||
382 | .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", | ||
383 | .cra_priority = 0, | ||
384 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
385 | .cra_blocksize = 1, | ||
386 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
387 | .cra_alignmask = 0, | ||
388 | .cra_type = &crypto_blkcipher_type, | ||
389 | .cra_module = THIS_MODULE, | ||
390 | .cra_u = { | ||
391 | .blkcipher = { | ||
392 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
393 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
394 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
395 | .setkey = camellia_setkey, | ||
396 | .encrypt = ctr_crypt, | ||
397 | .decrypt = ctr_crypt, | ||
398 | }, | ||
399 | }, | ||
400 | }, { | ||
401 | .cra_name = "__lrw-camellia-aesni-avx2", | ||
402 | .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", | ||
403 | .cra_priority = 0, | ||
404 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
405 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
406 | .cra_ctxsize = sizeof(struct camellia_lrw_ctx), | ||
407 | .cra_alignmask = 0, | ||
408 | .cra_type = &crypto_blkcipher_type, | ||
409 | .cra_module = THIS_MODULE, | ||
410 | .cra_exit = lrw_camellia_exit_tfm, | ||
411 | .cra_u = { | ||
412 | .blkcipher = { | ||
413 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + | ||
414 | CAMELLIA_BLOCK_SIZE, | ||
415 | .max_keysize = CAMELLIA_MAX_KEY_SIZE + | ||
416 | CAMELLIA_BLOCK_SIZE, | ||
417 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
418 | .setkey = lrw_camellia_setkey, | ||
419 | .encrypt = lrw_encrypt, | ||
420 | .decrypt = lrw_decrypt, | ||
421 | }, | ||
422 | }, | ||
423 | }, { | ||
424 | .cra_name = "__xts-camellia-aesni-avx2", | ||
425 | .cra_driver_name = "__driver-xts-camellia-aesni-avx2", | ||
426 | .cra_priority = 0, | ||
427 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
428 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
429 | .cra_ctxsize = sizeof(struct camellia_xts_ctx), | ||
430 | .cra_alignmask = 0, | ||
431 | .cra_type = &crypto_blkcipher_type, | ||
432 | .cra_module = THIS_MODULE, | ||
433 | .cra_u = { | ||
434 | .blkcipher = { | ||
435 | .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, | ||
436 | .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, | ||
437 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
438 | .setkey = xts_camellia_setkey, | ||
439 | .encrypt = xts_encrypt, | ||
440 | .decrypt = xts_decrypt, | ||
441 | }, | ||
442 | }, | ||
443 | }, { | ||
444 | .cra_name = "ecb(camellia)", | ||
445 | .cra_driver_name = "ecb-camellia-aesni-avx2", | ||
446 | .cra_priority = 500, | ||
447 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
448 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
449 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
450 | .cra_alignmask = 0, | ||
451 | .cra_type = &crypto_ablkcipher_type, | ||
452 | .cra_module = THIS_MODULE, | ||
453 | .cra_init = ablk_init, | ||
454 | .cra_exit = ablk_exit, | ||
455 | .cra_u = { | ||
456 | .ablkcipher = { | ||
457 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
458 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
459 | .setkey = ablk_set_key, | ||
460 | .encrypt = ablk_encrypt, | ||
461 | .decrypt = ablk_decrypt, | ||
462 | }, | ||
463 | }, | ||
464 | }, { | ||
465 | .cra_name = "cbc(camellia)", | ||
466 | .cra_driver_name = "cbc-camellia-aesni-avx2", | ||
467 | .cra_priority = 500, | ||
468 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
469 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
470 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
471 | .cra_alignmask = 0, | ||
472 | .cra_type = &crypto_ablkcipher_type, | ||
473 | .cra_module = THIS_MODULE, | ||
474 | .cra_init = ablk_init, | ||
475 | .cra_exit = ablk_exit, | ||
476 | .cra_u = { | ||
477 | .ablkcipher = { | ||
478 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
479 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
480 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
481 | .setkey = ablk_set_key, | ||
482 | .encrypt = __ablk_encrypt, | ||
483 | .decrypt = ablk_decrypt, | ||
484 | }, | ||
485 | }, | ||
486 | }, { | ||
487 | .cra_name = "ctr(camellia)", | ||
488 | .cra_driver_name = "ctr-camellia-aesni-avx2", | ||
489 | .cra_priority = 500, | ||
490 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
491 | .cra_blocksize = 1, | ||
492 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
493 | .cra_alignmask = 0, | ||
494 | .cra_type = &crypto_ablkcipher_type, | ||
495 | .cra_module = THIS_MODULE, | ||
496 | .cra_init = ablk_init, | ||
497 | .cra_exit = ablk_exit, | ||
498 | .cra_u = { | ||
499 | .ablkcipher = { | ||
500 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
501 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
502 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
503 | .setkey = ablk_set_key, | ||
504 | .encrypt = ablk_encrypt, | ||
505 | .decrypt = ablk_encrypt, | ||
506 | .geniv = "chainiv", | ||
507 | }, | ||
508 | }, | ||
509 | }, { | ||
510 | .cra_name = "lrw(camellia)", | ||
511 | .cra_driver_name = "lrw-camellia-aesni-avx2", | ||
512 | .cra_priority = 500, | ||
513 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
514 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
515 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
516 | .cra_alignmask = 0, | ||
517 | .cra_type = &crypto_ablkcipher_type, | ||
518 | .cra_module = THIS_MODULE, | ||
519 | .cra_init = ablk_init, | ||
520 | .cra_exit = ablk_exit, | ||
521 | .cra_u = { | ||
522 | .ablkcipher = { | ||
523 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + | ||
524 | CAMELLIA_BLOCK_SIZE, | ||
525 | .max_keysize = CAMELLIA_MAX_KEY_SIZE + | ||
526 | CAMELLIA_BLOCK_SIZE, | ||
527 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
528 | .setkey = ablk_set_key, | ||
529 | .encrypt = ablk_encrypt, | ||
530 | .decrypt = ablk_decrypt, | ||
531 | }, | ||
532 | }, | ||
533 | }, { | ||
534 | .cra_name = "xts(camellia)", | ||
535 | .cra_driver_name = "xts-camellia-aesni-avx2", | ||
536 | .cra_priority = 500, | ||
537 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
538 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
539 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
540 | .cra_alignmask = 0, | ||
541 | .cra_type = &crypto_ablkcipher_type, | ||
542 | .cra_module = THIS_MODULE, | ||
543 | .cra_init = ablk_init, | ||
544 | .cra_exit = ablk_exit, | ||
545 | .cra_u = { | ||
546 | .ablkcipher = { | ||
547 | .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, | ||
548 | .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, | ||
549 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
550 | .setkey = ablk_set_key, | ||
551 | .encrypt = ablk_encrypt, | ||
552 | .decrypt = ablk_decrypt, | ||
553 | }, | ||
554 | }, | ||
555 | } }; | ||
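The array registers the ciphers in two tiers: the "__driver-*"/"__ecb-*" entries (priority 0) are internal synchronous blkciphers that do the real work, while the plain "ecb(camellia)", "cbc(camellia)", etc. entries (priority 500) are ablk_helper wrappers that defer to cryptd when a request arrives in a context where the FPU cannot be used. The 500 priority outranks the plain-AVX and generic implementations, so callers pick this code up automatically; a usage sketch (illustration only, not part of this patch):

    static int example_get_xts_camellia(void)
    {
            struct crypto_ablkcipher *tfm;

            /* resolves to the highest-priority "xts(camellia)" provider */
            tfm = crypto_alloc_ablkcipher("xts(camellia)", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            /* ... set key, issue requests ... */

            crypto_free_ablkcipher(tfm);
            return 0;
    }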
556 | |||
557 | static int __init camellia_aesni_init(void) | ||
558 | { | ||
559 | u64 xcr0; | ||
560 | |||
561 | if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { | ||
562 | pr_info("AVX2 or AES-NI instructions are not detected.\n"); | ||
563 | return -ENODEV; | ||
564 | } | ||
565 | |||
566 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
567 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
568 | pr_info("AVX2 detected but unusable.\n"); | ||
569 | return -ENODEV; | ||
570 | } | ||
571 | |||
572 | return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); | ||
573 | } | ||
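Checking the CPUID feature flags alone is not enough for AVX2: the OS must also have enabled the SSE and YMM state components in XCR0, otherwise VEX-encoded 256-bit instructions fault even on capable hardware, which is what the xgetbv() test above guards against. The bits in question, shown as a small illustrative check:

    /* Illustration of the mask tested above: both state components must be
     * OS-enabled before YMM registers may be touched. */
    static bool ymm_state_enabled(u64 xcr0)
    {
            const u64 need = XSTATE_SSE | XSTATE_YMM;       /* bit 1 | bit 2 = 0x6 */

            return (xcr0 & need) == need;
    }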
574 | |||
575 | static void __exit camellia_aesni_fini(void) | ||
576 | { | ||
577 | crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); | ||
578 | } | ||
579 | |||
580 | module_init(camellia_aesni_init); | ||
581 | module_exit(camellia_aesni_fini); | ||
582 | |||
583 | MODULE_LICENSE("GPL"); | ||
584 | MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); | ||
585 | MODULE_ALIAS("camellia"); | ||
586 | MODULE_ALIAS("camellia-asm"); | ||
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 96cbb6068fce..37fd0c0a81ea 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia | 2 | * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia |
3 | * | 3 | * |
4 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -26,16 +26,44 @@ | |||
26 | 26 | ||
27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 | 27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 |
28 | 28 | ||
29 | /* 16-way AES-NI parallel cipher functions */ | 29 | /* 16-way parallel cipher functions (avx/aes-ni) */ |
30 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, | 30 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, |
31 | const u8 *src); | 31 | const u8 *src); |
32 | EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way); | ||
33 | |||
32 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, | 34 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, |
33 | const u8 *src); | 35 | const u8 *src); |
36 | EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); | ||
34 | 37 | ||
35 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, | 38 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, |
36 | const u8 *src); | 39 | const u8 *src); |
40 | EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); | ||
41 | |||
37 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, | 42 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, |
38 | const u8 *src, le128 *iv); | 43 | const u8 *src, le128 *iv); |
44 | EXPORT_SYMBOL_GPL(camellia_ctr_16way); | ||
45 | |||
46 | asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
47 | const u8 *src, le128 *iv); | ||
48 | EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); | ||
49 | |||
50 | asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
51 | const u8 *src, le128 *iv); | ||
52 | EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); | ||
53 | |||
54 | void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
55 | { | ||
56 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
57 | GLUE_FUNC_CAST(camellia_enc_blk)); | ||
58 | } | ||
59 | EXPORT_SYMBOL_GPL(camellia_xts_enc); | ||
60 | |||
61 | void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
62 | { | ||
63 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
64 | GLUE_FUNC_CAST(camellia_dec_blk)); | ||
65 | } | ||
66 | EXPORT_SYMBOL_GPL(camellia_xts_dec); | ||
39 | 67 | ||
40 | static const struct common_glue_ctx camellia_enc = { | 68 | static const struct common_glue_ctx camellia_enc = { |
41 | .num_funcs = 3, | 69 | .num_funcs = 3, |
@@ -69,6 +97,19 @@ static const struct common_glue_ctx camellia_ctr = { | |||
69 | } } | 97 | } } |
70 | }; | 98 | }; |
71 | 99 | ||
100 | static const struct common_glue_ctx camellia_enc_xts = { | ||
101 | .num_funcs = 2, | ||
102 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
103 | |||
104 | .funcs = { { | ||
105 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
106 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } | ||
107 | }, { | ||
108 | .num_blocks = 1, | ||
109 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } | ||
110 | } } | ||
111 | }; | ||
112 | |||
72 | static const struct common_glue_ctx camellia_dec = { | 113 | static const struct common_glue_ctx camellia_dec = { |
73 | .num_funcs = 3, | 114 | .num_funcs = 3, |
74 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | 115 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, |
@@ -101,6 +142,19 @@ static const struct common_glue_ctx camellia_dec_cbc = { | |||
101 | } } | 142 | } } |
102 | }; | 143 | }; |
103 | 144 | ||
145 | static const struct common_glue_ctx camellia_dec_xts = { | ||
146 | .num_funcs = 2, | ||
147 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
148 | |||
149 | .funcs = { { | ||
150 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
151 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } | ||
152 | }, { | ||
153 | .num_blocks = 1, | ||
154 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } | ||
155 | } } | ||
156 | }; | ||
157 | |||
104 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 158 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
105 | struct scatterlist *src, unsigned int nbytes) | 159 | struct scatterlist *src, unsigned int nbytes) |
106 | { | 160 | { |
@@ -261,54 +315,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
261 | struct scatterlist *src, unsigned int nbytes) | 315 | struct scatterlist *src, unsigned int nbytes) |
262 | { | 316 | { |
263 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 317 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
264 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
265 | struct crypt_priv crypt_ctx = { | ||
266 | .ctx = &ctx->crypt_ctx, | ||
267 | .fpu_enabled = false, | ||
268 | }; | ||
269 | struct xts_crypt_req req = { | ||
270 | .tbuf = buf, | ||
271 | .tbuflen = sizeof(buf), | ||
272 | 318 | ||
273 | .tweak_ctx = &ctx->tweak_ctx, | 319 | return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, |
274 | .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), | 320 | XTS_TWEAK_CAST(camellia_enc_blk), |
275 | .crypt_ctx = &crypt_ctx, | 321 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
276 | .crypt_fn = encrypt_callback, | ||
277 | }; | ||
278 | int ret; | ||
279 | |||
280 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
281 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
282 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
283 | |||
284 | return ret; | ||
285 | } | 322 | } |
286 | 323 | ||
287 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 324 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
288 | struct scatterlist *src, unsigned int nbytes) | 325 | struct scatterlist *src, unsigned int nbytes) |
289 | { | 326 | { |
290 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 327 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
291 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
292 | struct crypt_priv crypt_ctx = { | ||
293 | .ctx = &ctx->crypt_ctx, | ||
294 | .fpu_enabled = false, | ||
295 | }; | ||
296 | struct xts_crypt_req req = { | ||
297 | .tbuf = buf, | ||
298 | .tbuflen = sizeof(buf), | ||
299 | |||
300 | .tweak_ctx = &ctx->tweak_ctx, | ||
301 | .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), | ||
302 | .crypt_ctx = &crypt_ctx, | ||
303 | .crypt_fn = decrypt_callback, | ||
304 | }; | ||
305 | int ret; | ||
306 | 328 | ||
307 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 329 | return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, |
308 | ret = xts_crypt(desc, dst, src, nbytes, &req); | 330 | XTS_TWEAK_CAST(camellia_enc_blk), |
309 | camellia_fpu_end(crypt_ctx.fpu_enabled); | 331 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
310 | |||
311 | return ret; | ||
312 | } | 332 | } |
313 | 333 | ||
314 | static struct crypto_alg cmll_algs[10] = { { | 334 | static struct crypto_alg cmll_algs[10] = { { |
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index f93b6105a0ce..e3531f833951 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 7 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
@@ -227,6 +227,8 @@ | |||
227 | .data | 227 | .data |
228 | 228 | ||
229 | .align 16 | 229 | .align 16 |
230 | .Lxts_gf128mul_and_shl1_mask: | ||
231 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
230 | .Lbswap_mask: | 232 | .Lbswap_mask: |
231 | .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 | 233 | .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 |
232 | .Lbswap128_mask: | 234 | .Lbswap128_mask: |
@@ -424,3 +426,47 @@ ENTRY(cast6_ctr_8way) | |||
424 | 426 | ||
425 | ret; | 427 | ret; |
426 | ENDPROC(cast6_ctr_8way) | 428 | ENDPROC(cast6_ctr_8way) |
429 | |||
430 | ENTRY(cast6_xts_enc_8way) | ||
431 | /* input: | ||
432 | * %rdi: ctx, CTX | ||
433 | * %rsi: dst | ||
434 | * %rdx: src | ||
435 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
436 | */ | ||
437 | |||
438 | movq %rsi, %r11; | ||
439 | |||
440 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
441 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
442 | RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); | ||
443 | |||
444 | call __cast6_enc_blk8; | ||
445 | |||
446 | /* dst <= regs xor IVs(in dst) */ | ||
447 | store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
448 | |||
449 | ret; | ||
450 | ENDPROC(cast6_xts_enc_8way) | ||
451 | |||
452 | ENTRY(cast6_xts_dec_8way) | ||
453 | /* input: | ||
454 | * %rdi: ctx, CTX | ||
455 | * %rsi: dst | ||
456 | * %rdx: src | ||
457 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
458 | */ | ||
459 | |||
460 | movq %rsi, %r11; | ||
461 | |||
462 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
463 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
464 | RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); | ||
465 | |||
466 | call __cast6_dec_blk8; | ||
467 | |||
468 | /* dst <= regs xor IVs(in dst) */ | ||
469 | store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
470 | |||
471 | ret; | ||
472 | ENDPROC(cast6_xts_dec_8way) | ||
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 92f7ca24790a..8d0dfb86a559 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c | |||
@@ -4,6 +4,8 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
8 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
9 | * the Free Software Foundation; either version 2 of the License, or | 11 | * the Free Software Foundation; either version 2 of the License, or |
@@ -50,6 +52,23 @@ asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, | |||
50 | asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, | 52 | asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, |
51 | le128 *iv); | 53 | le128 *iv); |
52 | 54 | ||
55 | asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst, | ||
56 | const u8 *src, le128 *iv); | ||
57 | asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst, | ||
58 | const u8 *src, le128 *iv); | ||
59 | |||
60 | static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
61 | { | ||
62 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
63 | GLUE_FUNC_CAST(__cast6_encrypt)); | ||
64 | } | ||
65 | |||
66 | static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
67 | { | ||
68 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
69 | GLUE_FUNC_CAST(__cast6_decrypt)); | ||
70 | } | ||
71 | |||
53 | static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 72 | static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
54 | { | 73 | { |
55 | be128 ctrblk; | 74 | be128 ctrblk; |
@@ -87,6 +106,19 @@ static const struct common_glue_ctx cast6_ctr = { | |||
87 | } } | 106 | } } |
88 | }; | 107 | }; |
89 | 108 | ||
109 | static const struct common_glue_ctx cast6_enc_xts = { | ||
110 | .num_funcs = 2, | ||
111 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, | ||
112 | |||
113 | .funcs = { { | ||
114 | .num_blocks = CAST6_PARALLEL_BLOCKS, | ||
115 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) } | ||
116 | }, { | ||
117 | .num_blocks = 1, | ||
118 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) } | ||
119 | } } | ||
120 | }; | ||
121 | |||
90 | static const struct common_glue_ctx cast6_dec = { | 122 | static const struct common_glue_ctx cast6_dec = { |
91 | .num_funcs = 2, | 123 | .num_funcs = 2, |
92 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, | 124 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, |
@@ -113,6 +145,19 @@ static const struct common_glue_ctx cast6_dec_cbc = { | |||
113 | } } | 145 | } } |
114 | }; | 146 | }; |
115 | 147 | ||
148 | static const struct common_glue_ctx cast6_dec_xts = { | ||
149 | .num_funcs = 2, | ||
150 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, | ||
151 | |||
152 | .funcs = { { | ||
153 | .num_blocks = CAST6_PARALLEL_BLOCKS, | ||
154 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) } | ||
155 | }, { | ||
156 | .num_blocks = 1, | ||
157 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) } | ||
158 | } } | ||
159 | }; | ||
160 | |||
116 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 161 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
117 | struct scatterlist *src, unsigned int nbytes) | 162 | struct scatterlist *src, unsigned int nbytes) |
118 | { | 163 | { |
@@ -307,54 +352,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
307 | struct scatterlist *src, unsigned int nbytes) | 352 | struct scatterlist *src, unsigned int nbytes) |
308 | { | 353 | { |
309 | struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 354 | struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
310 | be128 buf[CAST6_PARALLEL_BLOCKS]; | ||
311 | struct crypt_priv crypt_ctx = { | ||
312 | .ctx = &ctx->crypt_ctx, | ||
313 | .fpu_enabled = false, | ||
314 | }; | ||
315 | struct xts_crypt_req req = { | ||
316 | .tbuf = buf, | ||
317 | .tbuflen = sizeof(buf), | ||
318 | 355 | ||
319 | .tweak_ctx = &ctx->tweak_ctx, | 356 | return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes, |
320 | .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), | 357 | XTS_TWEAK_CAST(__cast6_encrypt), |
321 | .crypt_ctx = &crypt_ctx, | 358 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
322 | .crypt_fn = encrypt_callback, | ||
323 | }; | ||
324 | int ret; | ||
325 | |||
326 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
327 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
328 | cast6_fpu_end(crypt_ctx.fpu_enabled); | ||
329 | |||
330 | return ret; | ||
331 | } | 359 | } |
332 | 360 | ||
333 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 361 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
334 | struct scatterlist *src, unsigned int nbytes) | 362 | struct scatterlist *src, unsigned int nbytes) |
335 | { | 363 | { |
336 | struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 364 | struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
337 | be128 buf[CAST6_PARALLEL_BLOCKS]; | ||
338 | struct crypt_priv crypt_ctx = { | ||
339 | .ctx = &ctx->crypt_ctx, | ||
340 | .fpu_enabled = false, | ||
341 | }; | ||
342 | struct xts_crypt_req req = { | ||
343 | .tbuf = buf, | ||
344 | .tbuflen = sizeof(buf), | ||
345 | |||
346 | .tweak_ctx = &ctx->tweak_ctx, | ||
347 | .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), | ||
348 | .crypt_ctx = &crypt_ctx, | ||
349 | .crypt_fn = decrypt_callback, | ||
350 | }; | ||
351 | int ret; | ||
352 | 365 | ||
353 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 366 | return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes, |
354 | ret = xts_crypt(desc, dst, src, nbytes, &req); | 367 | XTS_TWEAK_CAST(__cast6_encrypt), |
355 | cast6_fpu_end(crypt_ctx.fpu_enabled); | 368 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
356 | |||
357 | return ret; | ||
358 | } | 369 | } |
359 | 370 | ||
360 | static struct crypto_alg cast6_algs[10] = { { | 371 | static struct crypto_alg cast6_algs[10] = { { |
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index c8335014a044..94c27df8a549 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S | |||
@@ -101,9 +101,8 @@ | |||
101 | * uint crc32_pclmul_le_16(unsigned char const *buffer, | 101 | * uint crc32_pclmul_le_16(unsigned char const *buffer, |
102 | * size_t len, uint crc32) | 102 | * size_t len, uint crc32) |
103 | */ | 103 | */ |
104 | .globl crc32_pclmul_le_16 | 104 | |
105 | .align 4, 0x90 | 105 | ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ |
106 | crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */ | ||
107 | movdqa (BUF), %xmm1 | 106 | movdqa (BUF), %xmm1 |
108 | movdqa 0x10(BUF), %xmm2 | 107 | movdqa 0x10(BUF), %xmm2 |
109 | movdqa 0x20(BUF), %xmm3 | 108 | movdqa 0x20(BUF), %xmm3 |
@@ -244,3 +243,4 @@ fold_64: | |||
244 | pextrd $0x01, %xmm1, %eax | 243 | pextrd $0x01, %xmm1, %eax |
245 | 244 | ||
246 | ret | 245 | ret |
246 | ENDPROC(crc32_pclmul_le_16) | ||
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index cf1a7ec4cc3a..dbc4339b5417 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S | |||
@@ -1,9 +1,10 @@ | |||
1 | /* | 1 | /* |
2 | * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) | 2 | * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) |
3 | * | 3 | * |
4 | * The white paper on CRC32C calculations with PCLMULQDQ instruction can be | 4 | * The white papers on CRC32C calculations with PCLMULQDQ instruction can be |
5 | * downloaded from: | 5 | * downloaded from: |
6 | * http://download.intel.com/design/intarch/papers/323405.pdf | 6 | * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf |
7 | * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf | ||
7 | * | 8 | * |
8 | * Copyright (C) 2012 Intel Corporation. | 9 | * Copyright (C) 2012 Intel Corporation. |
9 | * | 10 | * |
@@ -42,6 +43,7 @@ | |||
42 | * SOFTWARE. | 43 | * SOFTWARE. |
43 | */ | 44 | */ |
44 | 45 | ||
46 | #include <asm/inst.h> | ||
45 | #include <linux/linkage.h> | 47 | #include <linux/linkage.h> |
46 | 48 | ||
47 | ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction | 49 | ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction |
@@ -225,10 +227,10 @@ LABEL crc_ %i | |||
225 | movdqa (bufp), %xmm0 # 2 consts: K1:K2 | 227 | movdqa (bufp), %xmm0 # 2 consts: K1:K2 |
226 | 228 | ||
227 | movq crc_init, %xmm1 # CRC for block 1 | 229 | movq crc_init, %xmm1 # CRC for block 1 |
228 | pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 | 230 | PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 |
229 | 231 | ||
230 | movq crc1, %xmm2 # CRC for block 2 | 232 | movq crc1, %xmm2 # CRC for block 2 |
231 | pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 | 233 | PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 |
232 | 234 | ||
233 | pxor %xmm2,%xmm1 | 235 | pxor %xmm2,%xmm1 |
234 | movq %xmm1, %rax | 236 | movq %xmm1, %rax |
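The recombination step above folds the partial CRCs of the separately processed blocks together by multiplying them with the precomputed constants K1/K2 in GF(2); switching to the PCLMULQDQ macro from asm/inst.h lets the file assemble with binutils versions that do not yet know the mnemonic. A scalar C sketch of the carry-less 64x64-bit multiply that the instruction performs (illustration only):

    #include <stdint.h>

    static void clmul64_sketch(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
    {
            uint64_t rl = 0, rh = 0;
            int i;

            for (i = 0; i < 64; i++) {
                    if (b & ((uint64_t)1 << i)) {
                            rl ^= a << i;                   /* low 64 bits of a * x^i */
                            rh ^= i ? a >> (64 - i) : 0;    /* high 64 bits of a * x^i */
                    }
            }
            *lo = rl;
            *hi = rh;
    }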
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S index f7b6ea2ddfdb..02ee2308fb38 100644 --- a/arch/x86/crypto/glue_helper-asm-avx.S +++ b/arch/x86/crypto/glue_helper-asm-avx.S | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Shared glue code for 128bit block ciphers, AVX assembler macros | 2 | * Shared glue code for 128bit block ciphers, AVX assembler macros |
3 | * | 3 | * |
4 | * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -89,3 +89,62 @@ | |||
89 | vpxor (6*16)(src), x6, x6; \ | 89 | vpxor (6*16)(src), x6, x6; \ |
90 | vpxor (7*16)(src), x7, x7; \ | 90 | vpxor (7*16)(src), x7, x7; \ |
91 | store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | 91 | store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); |
92 | |||
93 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
94 | vpsrad $31, iv, tmp; \ | ||
95 | vpaddq iv, iv, iv; \ | ||
96 | vpshufd $0x13, tmp, tmp; \ | ||
97 | vpand mask, tmp, tmp; \ | ||
98 | vpxor tmp, iv, iv; | ||
99 | |||
100 | #define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \ | ||
101 | t1, xts_gf128mul_and_shl1_mask) \ | ||
102 | vmovdqa xts_gf128mul_and_shl1_mask, t0; \ | ||
103 | \ | ||
104 | /* load IV */ \ | ||
105 | vmovdqu (iv), tiv; \ | ||
106 | vpxor (0*16)(src), tiv, x0; \ | ||
107 | vmovdqu tiv, (0*16)(dst); \ | ||
108 | \ | ||
109 | /* construct and store IVs, also xor with source */ \ | ||
110 | gf128mul_x_ble(tiv, t0, t1); \ | ||
111 | vpxor (1*16)(src), tiv, x1; \ | ||
112 | vmovdqu tiv, (1*16)(dst); \ | ||
113 | \ | ||
114 | gf128mul_x_ble(tiv, t0, t1); \ | ||
115 | vpxor (2*16)(src), tiv, x2; \ | ||
116 | vmovdqu tiv, (2*16)(dst); \ | ||
117 | \ | ||
118 | gf128mul_x_ble(tiv, t0, t1); \ | ||
119 | vpxor (3*16)(src), tiv, x3; \ | ||
120 | vmovdqu tiv, (3*16)(dst); \ | ||
121 | \ | ||
122 | gf128mul_x_ble(tiv, t0, t1); \ | ||
123 | vpxor (4*16)(src), tiv, x4; \ | ||
124 | vmovdqu tiv, (4*16)(dst); \ | ||
125 | \ | ||
126 | gf128mul_x_ble(tiv, t0, t1); \ | ||
127 | vpxor (5*16)(src), tiv, x5; \ | ||
128 | vmovdqu tiv, (5*16)(dst); \ | ||
129 | \ | ||
130 | gf128mul_x_ble(tiv, t0, t1); \ | ||
131 | vpxor (6*16)(src), tiv, x6; \ | ||
132 | vmovdqu tiv, (6*16)(dst); \ | ||
133 | \ | ||
134 | gf128mul_x_ble(tiv, t0, t1); \ | ||
135 | vpxor (7*16)(src), tiv, x7; \ | ||
136 | vmovdqu tiv, (7*16)(dst); \ | ||
137 | \ | ||
138 | gf128mul_x_ble(tiv, t0, t1); \ | ||
139 | vmovdqu tiv, (iv); | ||
140 | |||
141 | #define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
142 | vpxor (0*16)(dst), x0, x0; \ | ||
143 | vpxor (1*16)(dst), x1, x1; \ | ||
144 | vpxor (2*16)(dst), x2, x2; \ | ||
145 | vpxor (3*16)(dst), x3, x3; \ | ||
146 | vpxor (4*16)(dst), x4, x4; \ | ||
147 | vpxor (5*16)(dst), x5, x5; \ | ||
148 | vpxor (6*16)(dst), x6, x6; \ | ||
149 | vpxor (7*16)(dst), x7, x7; \ | ||
150 | store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
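load_xts_8way computes the eight consecutive tweaks on the fly, xors them into the source blocks, and parks each tweak in the destination buffer; store_xts_8way then xors the cipher output against those parked tweaks. That avoids dedicating eight extra registers or stack slots to the tweaks while the 8-block cipher runs. The same pattern in scalar C (a sketch only; kernel u8/memcpy assumed, tweaks taken as precomputed input):

    static void xts_8way_pattern(void (*crypt8)(void *ctx, u8 *dst, const u8 *src),
                                 void *ctx, u8 dst[8][16], const u8 src[8][16],
                                 const u8 tweak[8][16])
    {
            u8 buf[8][16];
            int i, j;

            for (i = 0; i < 8; i++) {
                    for (j = 0; j < 16; j++)
                            buf[i][j] = src[i][j] ^ tweak[i][j];    /* P xor T */
                    memcpy(dst[i], tweak[i], 16);                   /* park T_i in dst */
            }

            crypt8(ctx, (u8 *)buf, (const u8 *)buf);                /* 8 blocks in parallel */

            for (i = 0; i < 8; i++)
                    for (j = 0; j < 16; j++)
                            dst[i][j] ^= buf[i][j];                 /* out = E(P^T) ^ T */
    }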
diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S new file mode 100644 index 000000000000..a53ac11dd385 --- /dev/null +++ b/arch/x86/crypto/glue_helper-asm-avx2.S | |||
@@ -0,0 +1,180 @@ | |||
1 | /* | ||
2 | * Shared glue code for 128bit block ciphers, AVX2 assembler macros | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
14 | vmovdqu (0*32)(src), x0; \ | ||
15 | vmovdqu (1*32)(src), x1; \ | ||
16 | vmovdqu (2*32)(src), x2; \ | ||
17 | vmovdqu (3*32)(src), x3; \ | ||
18 | vmovdqu (4*32)(src), x4; \ | ||
19 | vmovdqu (5*32)(src), x5; \ | ||
20 | vmovdqu (6*32)(src), x6; \ | ||
21 | vmovdqu (7*32)(src), x7; | ||
22 | |||
23 | #define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
24 | vmovdqu x0, (0*32)(dst); \ | ||
25 | vmovdqu x1, (1*32)(dst); \ | ||
26 | vmovdqu x2, (2*32)(dst); \ | ||
27 | vmovdqu x3, (3*32)(dst); \ | ||
28 | vmovdqu x4, (4*32)(dst); \ | ||
29 | vmovdqu x5, (5*32)(dst); \ | ||
30 | vmovdqu x6, (6*32)(dst); \ | ||
31 | vmovdqu x7, (7*32)(dst); | ||
32 | |||
33 | #define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \ | ||
34 | vpxor t0, t0, t0; \ | ||
35 | vinserti128 $1, (src), t0, t0; \ | ||
36 | vpxor t0, x0, x0; \ | ||
37 | vpxor (0*32+16)(src), x1, x1; \ | ||
38 | vpxor (1*32+16)(src), x2, x2; \ | ||
39 | vpxor (2*32+16)(src), x3, x3; \ | ||
40 | vpxor (3*32+16)(src), x4, x4; \ | ||
41 | vpxor (4*32+16)(src), x5, x5; \ | ||
42 | vpxor (5*32+16)(src), x6, x6; \ | ||
43 | vpxor (6*32+16)(src), x7, x7; \ | ||
44 | store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
45 | |||
46 | #define inc_le128(x, minus_one, tmp) \ | ||
47 | vpcmpeqq minus_one, x, tmp; \ | ||
48 | vpsubq minus_one, x, x; \ | ||
49 | vpslldq $8, tmp, tmp; \ | ||
50 | vpsubq tmp, x, x; | ||
51 | |||
52 | #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ | ||
53 | vpcmpeqq minus_one, x, tmp1; \ | ||
54 | vpcmpeqq minus_two, x, tmp2; \ | ||
55 | vpsubq minus_two, x, x; \ | ||
56 | vpor tmp2, tmp1, tmp1; \ | ||
57 | vpslldq $8, tmp1, tmp1; \ | ||
58 | vpsubq tmp1, x, x; | ||
59 | |||
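inc_le128 and add2_le128 build a 128-bit counter increment out of 64-bit lane operations: AVX2 has no 128-bit add, so the carry out of the low qword is detected by comparing it against -1 (or against -1/-2 for the +2 case) before the addition and then folded into the high qword. A scalar equivalent, assuming the lo/hi layout matches the lanes (illustration only):

    #include <stdint.h>

    struct ctr128 { uint64_t lo, hi; };

    static void inc_le128_sketch(struct ctr128 *x)
    {
            uint64_t carry = (x->lo == UINT64_MAX);         /* low qword will wrap on +1 */

            x->lo += 1;
            x->hi += carry;
    }

    static void add2_le128_sketch(struct ctr128 *x)
    {
            uint64_t carry = (x->lo >= UINT64_MAX - 1);     /* low qword will wrap on +2 */

            x->lo += 2;
            x->hi += carry;
    }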
60 | #define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \ | ||
61 | t1x, t2, t2x, t3, t3x, t4, t5) \ | ||
62 | vpcmpeqd t0, t0, t0; \ | ||
63 | vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \ | ||
64 | vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\ | ||
65 | \ | ||
66 | /* load IV and byteswap */ \ | ||
67 | vmovdqu (iv), t2x; \ | ||
68 | vmovdqa t2x, t3x; \ | ||
69 | inc_le128(t2x, t0x, t1x); \ | ||
70 | vbroadcasti128 bswap, t1; \ | ||
71 | vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \ | ||
72 | vpshufb t1, t2, x0; \ | ||
73 | \ | ||
74 | /* construct IVs */ \ | ||
75 | add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \ | ||
76 | vpshufb t1, t2, x1; \ | ||
77 | add2_le128(t2, t0, t4, t3, t5); \ | ||
78 | vpshufb t1, t2, x2; \ | ||
79 | add2_le128(t2, t0, t4, t3, t5); \ | ||
80 | vpshufb t1, t2, x3; \ | ||
81 | add2_le128(t2, t0, t4, t3, t5); \ | ||
82 | vpshufb t1, t2, x4; \ | ||
83 | add2_le128(t2, t0, t4, t3, t5); \ | ||
84 | vpshufb t1, t2, x5; \ | ||
85 | add2_le128(t2, t0, t4, t3, t5); \ | ||
86 | vpshufb t1, t2, x6; \ | ||
87 | add2_le128(t2, t0, t4, t3, t5); \ | ||
88 | vpshufb t1, t2, x7; \ | ||
89 | vextracti128 $1, t2, t2x; \ | ||
90 | inc_le128(t2x, t0x, t3x); \ | ||
91 | vmovdqu t2x, (iv); | ||
92 | |||
93 | #define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
94 | vpxor (0*32)(src), x0, x0; \ | ||
95 | vpxor (1*32)(src), x1, x1; \ | ||
96 | vpxor (2*32)(src), x2, x2; \ | ||
97 | vpxor (3*32)(src), x3, x3; \ | ||
98 | vpxor (4*32)(src), x4, x4; \ | ||
99 | vpxor (5*32)(src), x5, x5; \ | ||
100 | vpxor (6*32)(src), x6, x6; \ | ||
101 | vpxor (7*32)(src), x7, x7; \ | ||
102 | store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
103 | |||
104 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
105 | vpsrad $31, iv, tmp; \ | ||
106 | vpaddq iv, iv, iv; \ | ||
107 | vpshufd $0x13, tmp, tmp; \ | ||
108 | vpand mask, tmp, tmp; \ | ||
109 | vpxor tmp, iv, iv; | ||
110 | |||
111 | #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ | ||
112 | vpsrad $31, iv, tmp0; \ | ||
113 | vpaddq iv, iv, tmp1; \ | ||
114 | vpsllq $2, iv, iv; \ | ||
115 | vpshufd $0x13, tmp0, tmp0; \ | ||
116 | vpsrad $31, tmp1, tmp1; \ | ||
117 | vpand mask2, tmp0, tmp0; \ | ||
118 | vpshufd $0x13, tmp1, tmp1; \ | ||
119 | vpxor tmp0, iv, iv; \ | ||
120 | vpand mask1, tmp1, tmp1; \ | ||
121 | vpxor tmp1, iv, iv; | ||
122 | |||
123 | #define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \ | ||
124 | tivx, t0, t0x, t1, t1x, t2, t2x, t3, \ | ||
125 | xts_gf128mul_and_shl1_mask_0, \ | ||
126 | xts_gf128mul_and_shl1_mask_1) \ | ||
127 | vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \ | ||
128 | \ | ||
129 | /* load IV and construct second IV */ \ | ||
130 | vmovdqu (iv), tivx; \ | ||
131 | vmovdqa tivx, t0x; \ | ||
132 | gf128mul_x_ble(tivx, t1x, t2x); \ | ||
133 | vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \ | ||
134 | vinserti128 $1, tivx, t0, tiv; \ | ||
135 | vpxor (0*32)(src), tiv, x0; \ | ||
136 | vmovdqu tiv, (0*32)(dst); \ | ||
137 | \ | ||
138 | /* construct and store IVs, also xor with source */ \ | ||
139 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
140 | vpxor (1*32)(src), tiv, x1; \ | ||
141 | vmovdqu tiv, (1*32)(dst); \ | ||
142 | \ | ||
143 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
144 | vpxor (2*32)(src), tiv, x2; \ | ||
145 | vmovdqu tiv, (2*32)(dst); \ | ||
146 | \ | ||
147 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
148 | vpxor (3*32)(src), tiv, x3; \ | ||
149 | vmovdqu tiv, (3*32)(dst); \ | ||
150 | \ | ||
151 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
152 | vpxor (4*32)(src), tiv, x4; \ | ||
153 | vmovdqu tiv, (4*32)(dst); \ | ||
154 | \ | ||
155 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
156 | vpxor (5*32)(src), tiv, x5; \ | ||
157 | vmovdqu tiv, (5*32)(dst); \ | ||
158 | \ | ||
159 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
160 | vpxor (6*32)(src), tiv, x6; \ | ||
161 | vmovdqu tiv, (6*32)(dst); \ | ||
162 | \ | ||
163 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
164 | vpxor (7*32)(src), tiv, x7; \ | ||
165 | vmovdqu tiv, (7*32)(dst); \ | ||
166 | \ | ||
167 | vextracti128 $1, tiv, tivx; \ | ||
168 | gf128mul_x_ble(tivx, t1x, t2x); \ | ||
169 | vmovdqu tivx, (iv); | ||
170 | |||
171 | #define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
172 | vpxor (0*32)(dst), x0, x0; \ | ||
173 | vpxor (1*32)(dst), x1, x1; \ | ||
174 | vpxor (2*32)(dst), x2, x2; \ | ||
175 | vpxor (3*32)(dst), x3, x3; \ | ||
176 | vpxor (4*32)(dst), x4, x4; \ | ||
177 | vpxor (5*32)(dst), x5, x5; \ | ||
178 | vpxor (6*32)(dst), x6, x6; \ | ||
179 | vpxor (7*32)(dst), x7, x7; \ | ||
180 | store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 22ce4f683e55..432f1d76ceb8 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Shared glue code for 128bit block ciphers | 2 | * Shared glue code for 128bit block ciphers |
3 | * | 3 | * |
4 | * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | 6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: |
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | 7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> |
@@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | |||
304 | } | 304 | } |
305 | EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); | 305 | EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); |
306 | 306 | ||
307 | static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, | ||
308 | void *ctx, | ||
309 | struct blkcipher_desc *desc, | ||
310 | struct blkcipher_walk *walk) | ||
311 | { | ||
312 | const unsigned int bsize = 128 / 8; | ||
313 | unsigned int nbytes = walk->nbytes; | ||
314 | u128 *src = (u128 *)walk->src.virt.addr; | ||
315 | u128 *dst = (u128 *)walk->dst.virt.addr; | ||
316 | unsigned int num_blocks, func_bytes; | ||
317 | unsigned int i; | ||
318 | |||
319 | /* Process multi-block batch */ | ||
320 | for (i = 0; i < gctx->num_funcs; i++) { | ||
321 | num_blocks = gctx->funcs[i].num_blocks; | ||
322 | func_bytes = bsize * num_blocks; | ||
323 | |||
324 | if (nbytes >= func_bytes) { | ||
325 | do { | ||
326 | gctx->funcs[i].fn_u.xts(ctx, dst, src, | ||
327 | (le128 *)walk->iv); | ||
328 | |||
329 | src += num_blocks; | ||
330 | dst += num_blocks; | ||
331 | nbytes -= func_bytes; | ||
332 | } while (nbytes >= func_bytes); | ||
333 | |||
334 | if (nbytes < bsize) | ||
335 | goto done; | ||
336 | } | ||
337 | } | ||
338 | |||
339 | done: | ||
340 | return nbytes; | ||
341 | } | ||
342 | |||
343 | /* for implementations providing a faster XTS IV generator */ | ||
344 | int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, | ||
345 | struct blkcipher_desc *desc, struct scatterlist *dst, | ||
346 | struct scatterlist *src, unsigned int nbytes, | ||
347 | void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src), | ||
348 | void *tweak_ctx, void *crypt_ctx) | ||
349 | { | ||
350 | const unsigned int bsize = 128 / 8; | ||
351 | bool fpu_enabled = false; | ||
352 | struct blkcipher_walk walk; | ||
353 | int err; | ||
354 | |||
355 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
356 | |||
357 | err = blkcipher_walk_virt(desc, &walk); | ||
358 | nbytes = walk.nbytes; | ||
359 | if (!nbytes) | ||
360 | return err; | ||
361 | |||
362 | /* set minimum length to bsize, for tweak_fn */ | ||
363 | fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, | ||
364 | desc, fpu_enabled, | ||
365 | nbytes < bsize ? bsize : nbytes); | ||
366 | |||
367 | /* calculate first value of T */ | ||
368 | tweak_fn(tweak_ctx, walk.iv, walk.iv); | ||
369 | |||
370 | while (nbytes) { | ||
371 | nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); | ||
372 | |||
373 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
374 | nbytes = walk.nbytes; | ||
375 | } | ||
376 | |||
377 | glue_fpu_end(fpu_enabled); | ||
378 | |||
379 | return err; | ||
380 | } | ||
381 | EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); | ||
382 | |||
383 | void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, | ||
384 | common_glue_func_t fn) | ||
385 | { | ||
386 | le128 ivblk = *iv; | ||
387 | |||
388 | /* generate next IV */ | ||
389 | le128_gf128mul_x_ble(iv, &ivblk); | ||
390 | |||
391 | /* CC <- T xor C */ | ||
392 | u128_xor(dst, src, (u128 *)&ivblk); | ||
393 | |||
394 | /* PP <- D(Key2,CC) */ | ||
395 | fn(ctx, (u8 *)dst, (u8 *)dst); | ||
396 | |||
397 | /* P <- T xor PP */ | ||
398 | u128_xor(dst, dst, (u128 *)&ivblk); | ||
399 | } | ||
400 | EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); | ||
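Note on glue_xts_crypt_128bit_one: this is the single-block fallback used once fewer blocks remain than the smallest assembler batch. It performs the textbook XTS step with the tweak left behind by the batched routines and then advances the tweak. A scalar C sketch of the same step (illustrative only; "cipher" stands for the per-block encrypt or decrypt routine):

    /* One XTS block: CC = C xor T; PP = cipher(CC); P = PP xor T; T = T*alpha. */
    static void xts_one_block_sketch(void *ctx, unsigned long long dst[2],
                                     const unsigned long long src[2],
                                     unsigned long long t[2],
                                     void (*cipher)(void *ctx, unsigned char *blk))
    {
            unsigned long long carry = t[1] >> 63;

            dst[0] = src[0] ^ t[0];                 /* CC <- T xor C  */
            dst[1] = src[1] ^ t[1];
            cipher(ctx, (unsigned char *)dst);      /* PP <- cipher(CC) */
            dst[0] ^= t[0];                         /* P  <- T xor PP */
            dst[1] ^= t[1];

            /* advance the tweak for the next block: T <- T * alpha */
            t[1] = (t[1] << 1) | (t[0] >> 63);
            t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
    }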
401 | |||
307 | MODULE_LICENSE("GPL"); | 402 | MODULE_LICENSE("GPL"); |
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 43c938612b74..2f202f49872b 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S | |||
@@ -4,8 +4,7 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by | 7 | * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
8 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
9 | * | 8 | * |
10 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
@@ -34,6 +33,8 @@ | |||
34 | 33 | ||
35 | .Lbswap128_mask: | 34 | .Lbswap128_mask: |
36 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
36 | .Lxts_gf128mul_and_shl1_mask: | ||
37 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
37 | 38 | ||
38 | .text | 39 | .text |
39 | 40 | ||
@@ -739,3 +740,43 @@ ENTRY(serpent_ctr_8way_avx) | |||
739 | 740 | ||
740 | ret; | 741 | ret; |
741 | ENDPROC(serpent_ctr_8way_avx) | 742 | ENDPROC(serpent_ctr_8way_avx) |
743 | |||
744 | ENTRY(serpent_xts_enc_8way_avx) | ||
745 | /* input: | ||
746 | * %rdi: ctx, CTX | ||
747 | * %rsi: dst | ||
748 | * %rdx: src | ||
749 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
750 | */ | ||
751 | |||
752 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
753 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
754 | RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); | ||
755 | |||
756 | call __serpent_enc_blk8_avx; | ||
757 | |||
758 | /* dst <= regs xor IVs(in dst) */ | ||
759 | store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
760 | |||
761 | ret; | ||
762 | ENDPROC(serpent_xts_enc_8way_avx) | ||
763 | |||
764 | ENTRY(serpent_xts_dec_8way_avx) | ||
765 | /* input: | ||
766 | * %rdi: ctx, CTX | ||
767 | * %rsi: dst | ||
768 | * %rdx: src | ||
769 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
770 | */ | ||
771 | |||
772 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
773 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
774 | RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); | ||
775 | |||
776 | call __serpent_dec_blk8_avx; | ||
777 | |||
778 | /* dst <= regs xor IVs(in dst) */ | ||
779 | store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
780 | |||
781 | ret; | ||
782 | ENDPROC(serpent_xts_dec_8way_avx) | ||
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S new file mode 100644 index 000000000000..b222085cccac --- /dev/null +++ b/arch/x86/crypto/serpent-avx2-asm_64.S | |||
@@ -0,0 +1,800 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Serpent | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * Based on AVX assembler implementation of Serpent by: | ||
7 | * Copyright © 2012 Johannes Goetzfried | ||
8 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or | ||
13 | * (at your option) any later version. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <linux/linkage.h> | ||
18 | #include "glue_helper-asm-avx2.S" | ||
19 | |||
20 | .file "serpent-avx2-asm_64.S" | ||
21 | |||
22 | .data | ||
23 | .align 16 | ||
24 | |||
25 | .Lbswap128_mask: | ||
26 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
27 | .Lxts_gf128mul_and_shl1_mask_0: | ||
28 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
29 | .Lxts_gf128mul_and_shl1_mask_1: | ||
30 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
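Note on the two masks: .Lxts_gf128mul_and_shl1_mask_0 is the usual single-doubling mask (0x87 folded into the low qword, carry 1 into the high qword), used by gf128mul_x2_ble for the carries produced at the second of its two doublings (bits 126 and 62 of the original tweak). .Lxts_gf128mul_and_shl1_mask_1 handles the carries from the first doubling (bits 127 and 63), which get shifted once more by the second doubling, hence its low qword is 0x10e = 0x87 << 1 and its high-qword carry is 2.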
31 | |||
32 | .text | ||
33 | |||
34 | #define CTX %rdi | ||
35 | |||
36 | #define RNOT %ymm0 | ||
37 | #define tp %ymm1 | ||
38 | |||
39 | #define RA1 %ymm2 | ||
40 | #define RA2 %ymm3 | ||
41 | #define RB1 %ymm4 | ||
42 | #define RB2 %ymm5 | ||
43 | #define RC1 %ymm6 | ||
44 | #define RC2 %ymm7 | ||
45 | #define RD1 %ymm8 | ||
46 | #define RD2 %ymm9 | ||
47 | #define RE1 %ymm10 | ||
48 | #define RE2 %ymm11 | ||
49 | |||
50 | #define RK0 %ymm12 | ||
51 | #define RK1 %ymm13 | ||
52 | #define RK2 %ymm14 | ||
53 | #define RK3 %ymm15 | ||
54 | |||
55 | #define RK0x %xmm12 | ||
56 | #define RK1x %xmm13 | ||
57 | #define RK2x %xmm14 | ||
58 | #define RK3x %xmm15 | ||
59 | |||
60 | #define S0_1(x0, x1, x2, x3, x4) \ | ||
61 | vpor x0, x3, tp; \ | ||
62 | vpxor x3, x0, x0; \ | ||
63 | vpxor x2, x3, x4; \ | ||
64 | vpxor RNOT, x4, x4; \ | ||
65 | vpxor x1, tp, x3; \ | ||
66 | vpand x0, x1, x1; \ | ||
67 | vpxor x4, x1, x1; \ | ||
68 | vpxor x0, x2, x2; | ||
69 | #define S0_2(x0, x1, x2, x3, x4) \ | ||
70 | vpxor x3, x0, x0; \ | ||
71 | vpor x0, x4, x4; \ | ||
72 | vpxor x2, x0, x0; \ | ||
73 | vpand x1, x2, x2; \ | ||
74 | vpxor x2, x3, x3; \ | ||
75 | vpxor RNOT, x1, x1; \ | ||
76 | vpxor x4, x2, x2; \ | ||
77 | vpxor x2, x1, x1; | ||
78 | |||
79 | #define S1_1(x0, x1, x2, x3, x4) \ | ||
80 | vpxor x0, x1, tp; \ | ||
81 | vpxor x3, x0, x0; \ | ||
82 | vpxor RNOT, x3, x3; \ | ||
83 | vpand tp, x1, x4; \ | ||
84 | vpor tp, x0, x0; \ | ||
85 | vpxor x2, x3, x3; \ | ||
86 | vpxor x3, x0, x0; \ | ||
87 | vpxor x3, tp, x1; | ||
88 | #define S1_2(x0, x1, x2, x3, x4) \ | ||
89 | vpxor x4, x3, x3; \ | ||
90 | vpor x4, x1, x1; \ | ||
91 | vpxor x2, x4, x4; \ | ||
92 | vpand x0, x2, x2; \ | ||
93 | vpxor x1, x2, x2; \ | ||
94 | vpor x0, x1, x1; \ | ||
95 | vpxor RNOT, x0, x0; \ | ||
96 | vpxor x2, x0, x0; \ | ||
97 | vpxor x1, x4, x4; | ||
98 | |||
99 | #define S2_1(x0, x1, x2, x3, x4) \ | ||
100 | vpxor RNOT, x3, x3; \ | ||
101 | vpxor x0, x1, x1; \ | ||
102 | vpand x2, x0, tp; \ | ||
103 | vpxor x3, tp, tp; \ | ||
104 | vpor x0, x3, x3; \ | ||
105 | vpxor x1, x2, x2; \ | ||
106 | vpxor x1, x3, x3; \ | ||
107 | vpand tp, x1, x1; | ||
108 | #define S2_2(x0, x1, x2, x3, x4) \ | ||
109 | vpxor x2, tp, tp; \ | ||
110 | vpand x3, x2, x2; \ | ||
111 | vpor x1, x3, x3; \ | ||
112 | vpxor RNOT, tp, tp; \ | ||
113 | vpxor tp, x3, x3; \ | ||
114 | vpxor tp, x0, x4; \ | ||
115 | vpxor x2, tp, x0; \ | ||
116 | vpor x2, x1, x1; | ||
117 | |||
118 | #define S3_1(x0, x1, x2, x3, x4) \ | ||
119 | vpxor x3, x1, tp; \ | ||
120 | vpor x0, x3, x3; \ | ||
121 | vpand x0, x1, x4; \ | ||
122 | vpxor x2, x0, x0; \ | ||
123 | vpxor tp, x2, x2; \ | ||
124 | vpand x3, tp, x1; \ | ||
125 | vpxor x3, x2, x2; \ | ||
126 | vpor x4, x0, x0; \ | ||
127 | vpxor x3, x4, x4; | ||
128 | #define S3_2(x0, x1, x2, x3, x4) \ | ||
129 | vpxor x0, x1, x1; \ | ||
130 | vpand x3, x0, x0; \ | ||
131 | vpand x4, x3, x3; \ | ||
132 | vpxor x2, x3, x3; \ | ||
133 | vpor x1, x4, x4; \ | ||
134 | vpand x1, x2, x2; \ | ||
135 | vpxor x3, x4, x4; \ | ||
136 | vpxor x3, x0, x0; \ | ||
137 | vpxor x2, x3, x3; | ||
138 | |||
139 | #define S4_1(x0, x1, x2, x3, x4) \ | ||
140 | vpand x0, x3, tp; \ | ||
141 | vpxor x3, x0, x0; \ | ||
142 | vpxor x2, tp, tp; \ | ||
143 | vpor x3, x2, x2; \ | ||
144 | vpxor x1, x0, x0; \ | ||
145 | vpxor tp, x3, x4; \ | ||
146 | vpor x0, x2, x2; \ | ||
147 | vpxor x1, x2, x2; | ||
148 | #define S4_2(x0, x1, x2, x3, x4) \ | ||
149 | vpand x0, x1, x1; \ | ||
150 | vpxor x4, x1, x1; \ | ||
151 | vpand x2, x4, x4; \ | ||
152 | vpxor tp, x2, x2; \ | ||
153 | vpxor x0, x4, x4; \ | ||
154 | vpor x1, tp, x3; \ | ||
155 | vpxor RNOT, x1, x1; \ | ||
156 | vpxor x0, x3, x3; | ||
157 | |||
158 | #define S5_1(x0, x1, x2, x3, x4) \ | ||
159 | vpor x0, x1, tp; \ | ||
160 | vpxor tp, x2, x2; \ | ||
161 | vpxor RNOT, x3, x3; \ | ||
162 | vpxor x0, x1, x4; \ | ||
163 | vpxor x2, x0, x0; \ | ||
164 | vpand x4, tp, x1; \ | ||
165 | vpor x3, x4, x4; \ | ||
166 | vpxor x0, x4, x4; | ||
167 | #define S5_2(x0, x1, x2, x3, x4) \ | ||
168 | vpand x3, x0, x0; \ | ||
169 | vpxor x3, x1, x1; \ | ||
170 | vpxor x2, x3, x3; \ | ||
171 | vpxor x1, x0, x0; \ | ||
172 | vpand x4, x2, x2; \ | ||
173 | vpxor x2, x1, x1; \ | ||
174 | vpand x0, x2, x2; \ | ||
175 | vpxor x2, x3, x3; | ||
176 | |||
177 | #define S6_1(x0, x1, x2, x3, x4) \ | ||
178 | vpxor x0, x3, x3; \ | ||
179 | vpxor x2, x1, tp; \ | ||
180 | vpxor x0, x2, x2; \ | ||
181 | vpand x3, x0, x0; \ | ||
182 | vpor x3, tp, tp; \ | ||
183 | vpxor RNOT, x1, x4; \ | ||
184 | vpxor tp, x0, x0; \ | ||
185 | vpxor x2, tp, x1; | ||
186 | #define S6_2(x0, x1, x2, x3, x4) \ | ||
187 | vpxor x4, x3, x3; \ | ||
188 | vpxor x0, x4, x4; \ | ||
189 | vpand x0, x2, x2; \ | ||
190 | vpxor x1, x4, x4; \ | ||
191 | vpxor x3, x2, x2; \ | ||
192 | vpand x1, x3, x3; \ | ||
193 | vpxor x0, x3, x3; \ | ||
194 | vpxor x2, x1, x1; | ||
195 | |||
196 | #define S7_1(x0, x1, x2, x3, x4) \ | ||
197 | vpxor RNOT, x1, tp; \ | ||
198 | vpxor RNOT, x0, x0; \ | ||
199 | vpand x2, tp, x1; \ | ||
200 | vpxor x3, x1, x1; \ | ||
201 | vpor tp, x3, x3; \ | ||
202 | vpxor x2, tp, x4; \ | ||
203 | vpxor x3, x2, x2; \ | ||
204 | vpxor x0, x3, x3; \ | ||
205 | vpor x1, x0, x0; | ||
206 | #define S7_2(x0, x1, x2, x3, x4) \ | ||
207 | vpand x0, x2, x2; \ | ||
208 | vpxor x4, x0, x0; \ | ||
209 | vpxor x3, x4, x4; \ | ||
210 | vpand x0, x3, x3; \ | ||
211 | vpxor x1, x4, x4; \ | ||
212 | vpxor x4, x2, x2; \ | ||
213 | vpxor x1, x3, x3; \ | ||
214 | vpor x0, x4, x4; \ | ||
215 | vpxor x1, x4, x4; | ||
216 | |||
217 | #define SI0_1(x0, x1, x2, x3, x4) \ | ||
218 | vpxor x0, x1, x1; \ | ||
219 | vpor x1, x3, tp; \ | ||
220 | vpxor x1, x3, x4; \ | ||
221 | vpxor RNOT, x0, x0; \ | ||
222 | vpxor tp, x2, x2; \ | ||
223 | vpxor x0, tp, x3; \ | ||
224 | vpand x1, x0, x0; \ | ||
225 | vpxor x2, x0, x0; | ||
226 | #define SI0_2(x0, x1, x2, x3, x4) \ | ||
227 | vpand x3, x2, x2; \ | ||
228 | vpxor x4, x3, x3; \ | ||
229 | vpxor x3, x2, x2; \ | ||
230 | vpxor x3, x1, x1; \ | ||
231 | vpand x0, x3, x3; \ | ||
232 | vpxor x0, x1, x1; \ | ||
233 | vpxor x2, x0, x0; \ | ||
234 | vpxor x3, x4, x4; | ||
235 | |||
236 | #define SI1_1(x0, x1, x2, x3, x4) \ | ||
237 | vpxor x3, x1, x1; \ | ||
238 | vpxor x2, x0, tp; \ | ||
239 | vpxor RNOT, x2, x2; \ | ||
240 | vpor x1, x0, x4; \ | ||
241 | vpxor x3, x4, x4; \ | ||
242 | vpand x1, x3, x3; \ | ||
243 | vpxor x2, x1, x1; \ | ||
244 | vpand x4, x2, x2; | ||
245 | #define SI1_2(x0, x1, x2, x3, x4) \ | ||
246 | vpxor x1, x4, x4; \ | ||
247 | vpor x3, x1, x1; \ | ||
248 | vpxor tp, x3, x3; \ | ||
249 | vpxor tp, x2, x2; \ | ||
250 | vpor x4, tp, x0; \ | ||
251 | vpxor x4, x2, x2; \ | ||
252 | vpxor x0, x1, x1; \ | ||
253 | vpxor x1, x4, x4; | ||
254 | |||
255 | #define SI2_1(x0, x1, x2, x3, x4) \ | ||
256 | vpxor x1, x2, x2; \ | ||
257 | vpxor RNOT, x3, tp; \ | ||
258 | vpor x2, tp, tp; \ | ||
259 | vpxor x3, x2, x2; \ | ||
260 | vpxor x0, x3, x4; \ | ||
261 | vpxor x1, tp, x3; \ | ||
262 | vpor x2, x1, x1; \ | ||
263 | vpxor x0, x2, x2; | ||
264 | #define SI2_2(x0, x1, x2, x3, x4) \ | ||
265 | vpxor x4, x1, x1; \ | ||
266 | vpor x3, x4, x4; \ | ||
267 | vpxor x3, x2, x2; \ | ||
268 | vpxor x2, x4, x4; \ | ||
269 | vpand x1, x2, x2; \ | ||
270 | vpxor x3, x2, x2; \ | ||
271 | vpxor x4, x3, x3; \ | ||
272 | vpxor x0, x4, x4; | ||
273 | |||
274 | #define SI3_1(x0, x1, x2, x3, x4) \ | ||
275 | vpxor x1, x2, x2; \ | ||
276 | vpand x2, x1, tp; \ | ||
277 | vpxor x0, tp, tp; \ | ||
278 | vpor x1, x0, x0; \ | ||
279 | vpxor x3, x1, x4; \ | ||
280 | vpxor x3, x0, x0; \ | ||
281 | vpor tp, x3, x3; \ | ||
282 | vpxor x2, tp, x1; | ||
283 | #define SI3_2(x0, x1, x2, x3, x4) \ | ||
284 | vpxor x3, x1, x1; \ | ||
285 | vpxor x2, x0, x0; \ | ||
286 | vpxor x3, x2, x2; \ | ||
287 | vpand x1, x3, x3; \ | ||
288 | vpxor x0, x1, x1; \ | ||
289 | vpand x2, x0, x0; \ | ||
290 | vpxor x3, x4, x4; \ | ||
291 | vpxor x0, x3, x3; \ | ||
292 | vpxor x1, x0, x0; | ||
293 | |||
294 | #define SI4_1(x0, x1, x2, x3, x4) \ | ||
295 | vpxor x3, x2, x2; \ | ||
296 | vpand x1, x0, tp; \ | ||
297 | vpxor x2, tp, tp; \ | ||
298 | vpor x3, x2, x2; \ | ||
299 | vpxor RNOT, x0, x4; \ | ||
300 | vpxor tp, x1, x1; \ | ||
301 | vpxor x2, tp, x0; \ | ||
302 | vpand x4, x2, x2; | ||
303 | #define SI4_2(x0, x1, x2, x3, x4) \ | ||
304 | vpxor x0, x2, x2; \ | ||
305 | vpor x4, x0, x0; \ | ||
306 | vpxor x3, x0, x0; \ | ||
307 | vpand x2, x3, x3; \ | ||
308 | vpxor x3, x4, x4; \ | ||
309 | vpxor x1, x3, x3; \ | ||
310 | vpand x0, x1, x1; \ | ||
311 | vpxor x1, x4, x4; \ | ||
312 | vpxor x3, x0, x0; | ||
313 | |||
314 | #define SI5_1(x0, x1, x2, x3, x4) \ | ||
315 | vpor x2, x1, tp; \ | ||
316 | vpxor x1, x2, x2; \ | ||
317 | vpxor x3, tp, tp; \ | ||
318 | vpand x1, x3, x3; \ | ||
319 | vpxor x3, x2, x2; \ | ||
320 | vpor x0, x3, x3; \ | ||
321 | vpxor RNOT, x0, x0; \ | ||
322 | vpxor x2, x3, x3; \ | ||
323 | vpor x0, x2, x2; | ||
324 | #define SI5_2(x0, x1, x2, x3, x4) \ | ||
325 | vpxor tp, x1, x4; \ | ||
326 | vpxor x4, x2, x2; \ | ||
327 | vpand x0, x4, x4; \ | ||
328 | vpxor tp, x0, x0; \ | ||
329 | vpxor x3, tp, x1; \ | ||
330 | vpand x2, x0, x0; \ | ||
331 | vpxor x3, x2, x2; \ | ||
332 | vpxor x2, x0, x0; \ | ||
333 | vpxor x4, x2, x2; \ | ||
334 | vpxor x3, x4, x4; | ||
335 | |||
336 | #define SI6_1(x0, x1, x2, x3, x4) \ | ||
337 | vpxor x2, x0, x0; \ | ||
338 | vpand x3, x0, tp; \ | ||
339 | vpxor x3, x2, x2; \ | ||
340 | vpxor x2, tp, tp; \ | ||
341 | vpxor x1, x3, x3; \ | ||
342 | vpor x0, x2, x2; \ | ||
343 | vpxor x3, x2, x2; \ | ||
344 | vpand tp, x3, x3; | ||
345 | #define SI6_2(x0, x1, x2, x3, x4) \ | ||
346 | vpxor RNOT, tp, tp; \ | ||
347 | vpxor x1, x3, x3; \ | ||
348 | vpand x2, x1, x1; \ | ||
349 | vpxor tp, x0, x4; \ | ||
350 | vpxor x4, x3, x3; \ | ||
351 | vpxor x2, x4, x4; \ | ||
352 | vpxor x1, tp, x0; \ | ||
353 | vpxor x0, x2, x2; | ||
354 | |||
355 | #define SI7_1(x0, x1, x2, x3, x4) \ | ||
356 | vpand x0, x3, tp; \ | ||
357 | vpxor x2, x0, x0; \ | ||
358 | vpor x3, x2, x2; \ | ||
359 | vpxor x1, x3, x4; \ | ||
360 | vpxor RNOT, x0, x0; \ | ||
361 | vpor tp, x1, x1; \ | ||
362 | vpxor x0, x4, x4; \ | ||
363 | vpand x2, x0, x0; \ | ||
364 | vpxor x1, x0, x0; | ||
365 | #define SI7_2(x0, x1, x2, x3, x4) \ | ||
366 | vpand x2, x1, x1; \ | ||
367 | vpxor x2, tp, x3; \ | ||
368 | vpxor x3, x4, x4; \ | ||
369 | vpand x3, x2, x2; \ | ||
370 | vpor x0, x3, x3; \ | ||
371 | vpxor x4, x1, x1; \ | ||
372 | vpxor x4, x3, x3; \ | ||
373 | vpand x0, x4, x4; \ | ||
374 | vpxor x2, x4, x4; | ||
375 | |||
376 | #define get_key(i,j,t) \ | ||
377 | vpbroadcastd (4*(i)+(j))*4(CTX), t; | ||
378 | |||
379 | #define K2(x0, x1, x2, x3, x4, i) \ | ||
380 | get_key(i, 0, RK0); \ | ||
381 | get_key(i, 1, RK1); \ | ||
382 | get_key(i, 2, RK2); \ | ||
383 | get_key(i, 3, RK3); \ | ||
384 | vpxor RK0, x0 ## 1, x0 ## 1; \ | ||
385 | vpxor RK1, x1 ## 1, x1 ## 1; \ | ||
386 | vpxor RK2, x2 ## 1, x2 ## 1; \ | ||
387 | vpxor RK3, x3 ## 1, x3 ## 1; \ | ||
388 | vpxor RK0, x0 ## 2, x0 ## 2; \ | ||
389 | vpxor RK1, x1 ## 2, x1 ## 2; \ | ||
390 | vpxor RK2, x2 ## 2, x2 ## 2; \ | ||
391 | vpxor RK3, x3 ## 2, x3 ## 2; | ||
392 | |||
393 | #define LK2(x0, x1, x2, x3, x4, i) \ | ||
394 | vpslld $13, x0 ## 1, x4 ## 1; \ | ||
395 | vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \ | ||
396 | vpor x4 ## 1, x0 ## 1, x0 ## 1; \ | ||
397 | vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ | ||
398 | vpslld $3, x2 ## 1, x4 ## 1; \ | ||
399 | vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \ | ||
400 | vpor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
401 | vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ | ||
402 | vpslld $13, x0 ## 2, x4 ## 2; \ | ||
403 | vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \ | ||
404 | vpor x4 ## 2, x0 ## 2, x0 ## 2; \ | ||
405 | vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ | ||
406 | vpslld $3, x2 ## 2, x4 ## 2; \ | ||
407 | vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \ | ||
408 | vpor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
409 | vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ | ||
410 | vpslld $1, x1 ## 1, x4 ## 1; \ | ||
411 | vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \ | ||
412 | vpor x4 ## 1, x1 ## 1, x1 ## 1; \ | ||
413 | vpslld $3, x0 ## 1, x4 ## 1; \ | ||
414 | vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ | ||
415 | vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ | ||
416 | get_key(i, 1, RK1); \ | ||
417 | vpslld $1, x1 ## 2, x4 ## 2; \ | ||
418 | vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \ | ||
419 | vpor x4 ## 2, x1 ## 2, x1 ## 2; \ | ||
420 | vpslld $3, x0 ## 2, x4 ## 2; \ | ||
421 | vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ | ||
422 | vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ | ||
423 | get_key(i, 3, RK3); \ | ||
424 | vpslld $7, x3 ## 1, x4 ## 1; \ | ||
425 | vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \ | ||
426 | vpor x4 ## 1, x3 ## 1, x3 ## 1; \ | ||
427 | vpslld $7, x1 ## 1, x4 ## 1; \ | ||
428 | vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ | ||
429 | vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ | ||
430 | vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ | ||
431 | vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
432 | get_key(i, 0, RK0); \ | ||
433 | vpslld $7, x3 ## 2, x4 ## 2; \ | ||
434 | vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \ | ||
435 | vpor x4 ## 2, x3 ## 2, x3 ## 2; \ | ||
436 | vpslld $7, x1 ## 2, x4 ## 2; \ | ||
437 | vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ | ||
438 | vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ | ||
439 | vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ | ||
440 | vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
441 | get_key(i, 2, RK2); \ | ||
442 | vpxor RK1, x1 ## 1, x1 ## 1; \ | ||
443 | vpxor RK3, x3 ## 1, x3 ## 1; \ | ||
444 | vpslld $5, x0 ## 1, x4 ## 1; \ | ||
445 | vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \ | ||
446 | vpor x4 ## 1, x0 ## 1, x0 ## 1; \ | ||
447 | vpslld $22, x2 ## 1, x4 ## 1; \ | ||
448 | vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \ | ||
449 | vpor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
450 | vpxor RK0, x0 ## 1, x0 ## 1; \ | ||
451 | vpxor RK2, x2 ## 1, x2 ## 1; \ | ||
452 | vpxor RK1, x1 ## 2, x1 ## 2; \ | ||
453 | vpxor RK3, x3 ## 2, x3 ## 2; \ | ||
454 | vpslld $5, x0 ## 2, x4 ## 2; \ | ||
455 | vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \ | ||
456 | vpor x4 ## 2, x0 ## 2, x0 ## 2; \ | ||
457 | vpslld $22, x2 ## 2, x4 ## 2; \ | ||
458 | vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \ | ||
459 | vpor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
460 | vpxor RK0, x0 ## 2, x0 ## 2; \ | ||
461 | vpxor RK2, x2 ## 2, x2 ## 2; | ||
462 | |||
463 | #define KL2(x0, x1, x2, x3, x4, i) \ | ||
464 | vpxor RK0, x0 ## 1, x0 ## 1; \ | ||
465 | vpxor RK2, x2 ## 1, x2 ## 1; \ | ||
466 | vpsrld $5, x0 ## 1, x4 ## 1; \ | ||
467 | vpslld $(32 - 5), x0 ## 1, x0 ## 1; \ | ||
468 | vpor x4 ## 1, x0 ## 1, x0 ## 1; \ | ||
469 | vpxor RK3, x3 ## 1, x3 ## 1; \ | ||
470 | vpxor RK1, x1 ## 1, x1 ## 1; \ | ||
471 | vpsrld $22, x2 ## 1, x4 ## 1; \ | ||
472 | vpslld $(32 - 22), x2 ## 1, x2 ## 1; \ | ||
473 | vpor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
474 | vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ | ||
475 | vpxor RK0, x0 ## 2, x0 ## 2; \ | ||
476 | vpxor RK2, x2 ## 2, x2 ## 2; \ | ||
477 | vpsrld $5, x0 ## 2, x4 ## 2; \ | ||
478 | vpslld $(32 - 5), x0 ## 2, x0 ## 2; \ | ||
479 | vpor x4 ## 2, x0 ## 2, x0 ## 2; \ | ||
480 | vpxor RK3, x3 ## 2, x3 ## 2; \ | ||
481 | vpxor RK1, x1 ## 2, x1 ## 2; \ | ||
482 | vpsrld $22, x2 ## 2, x4 ## 2; \ | ||
483 | vpslld $(32 - 22), x2 ## 2, x2 ## 2; \ | ||
484 | vpor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
485 | vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ | ||
486 | vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ | ||
487 | vpslld $7, x1 ## 1, x4 ## 1; \ | ||
488 | vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ | ||
489 | vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
490 | vpsrld $1, x1 ## 1, x4 ## 1; \ | ||
491 | vpslld $(32 - 1), x1 ## 1, x1 ## 1; \ | ||
492 | vpor x4 ## 1, x1 ## 1, x1 ## 1; \ | ||
493 | vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ | ||
494 | vpslld $7, x1 ## 2, x4 ## 2; \ | ||
495 | vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ | ||
496 | vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
497 | vpsrld $1, x1 ## 2, x4 ## 2; \ | ||
498 | vpslld $(32 - 1), x1 ## 2, x1 ## 2; \ | ||
499 | vpor x4 ## 2, x1 ## 2, x1 ## 2; \ | ||
500 | vpsrld $7, x3 ## 1, x4 ## 1; \ | ||
501 | vpslld $(32 - 7), x3 ## 1, x3 ## 1; \ | ||
502 | vpor x4 ## 1, x3 ## 1, x3 ## 1; \ | ||
503 | vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ | ||
504 | vpslld $3, x0 ## 1, x4 ## 1; \ | ||
505 | vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ | ||
506 | vpsrld $7, x3 ## 2, x4 ## 2; \ | ||
507 | vpslld $(32 - 7), x3 ## 2, x3 ## 2; \ | ||
508 | vpor x4 ## 2, x3 ## 2, x3 ## 2; \ | ||
509 | vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ | ||
510 | vpslld $3, x0 ## 2, x4 ## 2; \ | ||
511 | vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ | ||
512 | vpsrld $13, x0 ## 1, x4 ## 1; \ | ||
513 | vpslld $(32 - 13), x0 ## 1, x0 ## 1; \ | ||
514 | vpor x4 ## 1, x0 ## 1, x0 ## 1; \ | ||
515 | vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ | ||
516 | vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ | ||
517 | vpsrld $3, x2 ## 1, x4 ## 1; \ | ||
518 | vpslld $(32 - 3), x2 ## 1, x2 ## 1; \ | ||
519 | vpor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
520 | vpsrld $13, x0 ## 2, x4 ## 2; \ | ||
521 | vpslld $(32 - 13), x0 ## 2, x0 ## 2; \ | ||
522 | vpor x4 ## 2, x0 ## 2, x0 ## 2; \ | ||
523 | vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ | ||
524 | vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ | ||
525 | vpsrld $3, x2 ## 2, x4 ## 2; \ | ||
526 | vpslld $(32 - 3), x2 ## 2, x2 ## 2; \ | ||
527 | vpor x4 ## 2, x2 ## 2, x2 ## 2; | ||
528 | |||
529 | #define S(SBOX, x0, x1, x2, x3, x4) \ | ||
530 | SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ | ||
531 | SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ | ||
532 | SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ | ||
533 | SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); | ||
534 | |||
535 | #define SP(SBOX, x0, x1, x2, x3, x4, i) \ | ||
536 | get_key(i, 0, RK0); \ | ||
537 | SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ | ||
538 | get_key(i, 2, RK2); \ | ||
539 | SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ | ||
540 | get_key(i, 3, RK3); \ | ||
541 | SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ | ||
542 | get_key(i, 1, RK1); \ | ||
543 | SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ | ||
544 | |||
545 | #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | ||
546 | vpunpckldq x1, x0, t0; \ | ||
547 | vpunpckhdq x1, x0, t2; \ | ||
548 | vpunpckldq x3, x2, t1; \ | ||
549 | vpunpckhdq x3, x2, x3; \ | ||
550 | \ | ||
551 | vpunpcklqdq t1, t0, x0; \ | ||
552 | vpunpckhqdq t1, t0, x1; \ | ||
553 | vpunpcklqdq x3, t2, x2; \ | ||
554 | vpunpckhqdq x3, t2, x3; | ||
555 | |||
556 | #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ | ||
557 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | ||
558 | |||
559 | #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ | ||
560 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | ||
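Note on read_blocks/write_blocks: the transpose converts between the memory layout, where each 128-bit block is four consecutive 32-bit words, and the computation layout, where every ymm register holds the same word index from eight different blocks (one per 32-bit SIMD lane), so the round macros can process the whole batch with ordinary word-wide operations. A scalar C sketch of the 4x4 word transpose done per 128-bit lane (illustrative only):

    /* in[b][w] = word w of block b; out[w][b] gathers word w of all blocks. */
    static void transpose_4x4_sketch(unsigned int out[4][4],
                                     const unsigned int in[4][4])
    {
            int b, w;

            for (b = 0; b < 4; b++)
                    for (w = 0; w < 4; w++)
                            out[w][b] = in[b][w];
    }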
561 | |||
562 | .align 8 | ||
563 | __serpent_enc_blk16: | ||
564 | /* input: | ||
565 | * %rdi: ctx, CTX | ||
566 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext | ||
567 | * output: | ||
568 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext | ||
569 | */ | ||
570 | |||
571 | vpcmpeqd RNOT, RNOT, RNOT; | ||
572 | |||
573 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
574 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
575 | |||
576 | K2(RA, RB, RC, RD, RE, 0); | ||
577 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); | ||
578 | S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2); | ||
579 | S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3); | ||
580 | S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4); | ||
581 | S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5); | ||
582 | S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6); | ||
583 | S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7); | ||
584 | S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8); | ||
585 | S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9); | ||
586 | S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10); | ||
587 | S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11); | ||
588 | S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12); | ||
589 | S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13); | ||
590 | S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14); | ||
591 | S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15); | ||
592 | S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16); | ||
593 | S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17); | ||
594 | S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18); | ||
595 | S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19); | ||
596 | S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20); | ||
597 | S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21); | ||
598 | S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22); | ||
599 | S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23); | ||
600 | S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24); | ||
601 | S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25); | ||
602 | S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26); | ||
603 | S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27); | ||
604 | S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28); | ||
605 | S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29); | ||
606 | S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30); | ||
607 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); | ||
608 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); | ||
609 | |||
610 | write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
611 | write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
612 | |||
613 | ret; | ||
614 | ENDPROC(__serpent_enc_blk16) | ||
615 | |||
616 | .align 8 | ||
617 | __serpent_dec_blk16: | ||
618 | /* input: | ||
619 | * %rdi: ctx, CTX | ||
620 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext | ||
621 | * output: | ||
622 | * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext | ||
623 | */ | ||
624 | |||
625 | vpcmpeqd RNOT, RNOT, RNOT; | ||
626 | |||
627 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
628 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
629 | |||
630 | K2(RA, RB, RC, RD, RE, 32); | ||
631 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); | ||
632 | SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30); | ||
633 | SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29); | ||
634 | SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28); | ||
635 | SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27); | ||
636 | SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26); | ||
637 | SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25); | ||
638 | SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24); | ||
639 | SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23); | ||
640 | SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22); | ||
641 | SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21); | ||
642 | SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20); | ||
643 | SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19); | ||
644 | SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18); | ||
645 | SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17); | ||
646 | SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16); | ||
647 | SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15); | ||
648 | SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14); | ||
649 | SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13); | ||
650 | SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12); | ||
651 | SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11); | ||
652 | SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10); | ||
653 | SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9); | ||
654 | SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8); | ||
655 | SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7); | ||
656 | SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6); | ||
657 | SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5); | ||
658 | SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4); | ||
659 | SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3); | ||
660 | SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2); | ||
661 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); | ||
662 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); | ||
663 | |||
664 | write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); | ||
665 | write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); | ||
666 | |||
667 | ret; | ||
668 | ENDPROC(__serpent_dec_blk16) | ||
669 | |||
670 | ENTRY(serpent_ecb_enc_16way) | ||
671 | /* input: | ||
672 | * %rdi: ctx, CTX | ||
673 | * %rsi: dst | ||
674 | * %rdx: src | ||
675 | */ | ||
676 | |||
677 | vzeroupper; | ||
678 | |||
679 | load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
680 | |||
681 | call __serpent_enc_blk16; | ||
682 | |||
683 | store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
684 | |||
685 | vzeroupper; | ||
686 | |||
687 | ret; | ||
688 | ENDPROC(serpent_ecb_enc_16way) | ||
689 | |||
690 | ENTRY(serpent_ecb_dec_16way) | ||
691 | /* input: | ||
692 | * %rdi: ctx, CTX | ||
693 | * %rsi: dst | ||
694 | * %rdx: src | ||
695 | */ | ||
696 | |||
697 | vzeroupper; | ||
698 | |||
699 | load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
700 | |||
701 | call __serpent_dec_blk16; | ||
702 | |||
703 | store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
704 | |||
705 | vzeroupper; | ||
706 | |||
707 | ret; | ||
708 | ENDPROC(serpent_ecb_dec_16way) | ||
709 | |||
710 | ENTRY(serpent_cbc_dec_16way) | ||
711 | /* input: | ||
712 | * %rdi: ctx, CTX | ||
713 | * %rsi: dst | ||
714 | * %rdx: src | ||
715 | */ | ||
716 | |||
717 | vzeroupper; | ||
718 | |||
719 | load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
720 | |||
721 | call __serpent_dec_blk16; | ||
722 | |||
723 | store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2, | ||
724 | RK0); | ||
725 | |||
726 | vzeroupper; | ||
727 | |||
728 | ret; | ||
729 | ENDPROC(serpent_cbc_dec_16way) | ||
730 | |||
731 | ENTRY(serpent_ctr_16way) | ||
732 | /* input: | ||
733 | * %rdi: ctx, CTX | ||
734 | * %rsi: dst (16 blocks) | ||
735 | * %rdx: src (16 blocks) | ||
736 | * %rcx: iv (little endian, 128bit) | ||
737 | */ | ||
738 | |||
739 | vzeroupper; | ||
740 | |||
741 | load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
742 | RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, | ||
743 | tp); | ||
744 | |||
745 | call __serpent_enc_blk16; | ||
746 | |||
747 | store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
748 | |||
749 | vzeroupper; | ||
750 | |||
751 | ret; | ||
752 | ENDPROC(serpent_ctr_16way) | ||
753 | |||
754 | ENTRY(serpent_xts_enc_16way) | ||
755 | /* input: | ||
756 | * %rdi: ctx, CTX | ||
757 | * %rsi: dst (16 blocks) | ||
758 | * %rdx: src (16 blocks) | ||
759 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
760 | */ | ||
761 | |||
762 | vzeroupper; | ||
763 | |||
764 | load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
765 | RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, | ||
766 | .Lxts_gf128mul_and_shl1_mask_0, | ||
767 | .Lxts_gf128mul_and_shl1_mask_1); | ||
768 | |||
769 | call __serpent_enc_blk16; | ||
770 | |||
771 | store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
772 | |||
773 | vzeroupper; | ||
774 | |||
775 | ret; | ||
776 | ENDPROC(serpent_xts_enc_16way) | ||
777 | |||
778 | ENTRY(serpent_xts_dec_16way) | ||
779 | /* input: | ||
780 | * %rdi: ctx, CTX | ||
781 | * %rsi: dst (16 blocks) | ||
782 | * %rdx: src (16 blocks) | ||
783 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
784 | */ | ||
785 | |||
786 | vzeroupper; | ||
787 | |||
788 | load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
789 | RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, | ||
790 | .Lxts_gf128mul_and_shl1_mask_0, | ||
791 | .Lxts_gf128mul_and_shl1_mask_1); | ||
792 | |||
793 | call __serpent_dec_blk16; | ||
794 | |||
795 | store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
796 | |||
797 | vzeroupper; | ||
798 | |||
799 | ret; | ||
800 | ENDPROC(serpent_xts_dec_16way) | ||
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c new file mode 100644 index 000000000000..23aabc6c20a5 --- /dev/null +++ b/arch/x86/crypto/serpent_avx2_glue.c | |||
@@ -0,0 +1,562 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Serpent | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/lrw.h> | ||
20 | #include <crypto/xts.h> | ||
21 | #include <crypto/serpent.h> | ||
22 | #include <asm/xcr.h> | ||
23 | #include <asm/xsave.h> | ||
24 | #include <asm/crypto/serpent-avx.h> | ||
25 | #include <asm/crypto/ablk_helper.h> | ||
26 | #include <asm/crypto/glue_helper.h> | ||
27 | |||
28 | #define SERPENT_AVX2_PARALLEL_BLOCKS 16 | ||
29 | |||
30 | /* 16-way AVX2 parallel cipher functions */ | ||
31 | asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst, | ||
32 | const u8 *src); | ||
33 | asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst, | ||
34 | const u8 *src); | ||
35 | asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); | ||
36 | |||
37 | asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src, | ||
38 | le128 *iv); | ||
39 | asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst, | ||
40 | const u8 *src, le128 *iv); | ||
41 | asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | |||
44 | static const struct common_glue_ctx serpent_enc = { | ||
45 | .num_funcs = 3, | ||
46 | .fpu_blocks_limit = 8, | ||
47 | |||
48 | .funcs = { { | ||
49 | .num_blocks = 16, | ||
50 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) } | ||
51 | }, { | ||
52 | .num_blocks = 8, | ||
53 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } | ||
54 | }, { | ||
55 | .num_blocks = 1, | ||
56 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } | ||
57 | } } | ||
58 | }; | ||
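Note on the common_glue_ctx tables: implementations are listed from widest to narrowest, and the glue helpers walk funcs[] in order, using the largest batch that still fits the remaining data before falling back to the plain __serpent_encrypt/__serpent_decrypt for the tail. fpu_blocks_limit = 8 keeps requests shorter than eight blocks on the scalar path, so the FPU/AVX state is not saved for tiny pieces of data. A simplified sketch of the batching idea, assuming the glue_helper.h types are in scope (it mirrors __glue_xts_crypt_128bit shown earlier, here in ECB form):

    static unsigned int dispatch_sketch(const struct common_glue_ctx *gctx,
                                        void *ctx, u8 *dst, const u8 *src,
                                        unsigned int nbytes)
    {
            const unsigned int bsize = 16;
            unsigned int i;

            for (i = 0; i < gctx->num_funcs; i++) {
                    unsigned int func_bytes = bsize * gctx->funcs[i].num_blocks;

                    while (nbytes >= func_bytes) {
                            gctx->funcs[i].fn_u.ecb(ctx, dst, src);
                            src += func_bytes;
                            dst += func_bytes;
                            nbytes -= func_bytes;
                    }
            }
            return nbytes;  /* leftover bytes of a partial block, if any */
    }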
59 | |||
60 | static const struct common_glue_ctx serpent_ctr = { | ||
61 | .num_funcs = 3, | ||
62 | .fpu_blocks_limit = 8, | ||
63 | |||
64 | .funcs = { { | ||
65 | .num_blocks = 16, | ||
66 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) } | ||
67 | }, { | ||
68 | .num_blocks = 8, | ||
69 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } | ||
70 | }, { | ||
71 | .num_blocks = 1, | ||
72 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } | ||
73 | } } | ||
74 | }; | ||
75 | |||
76 | static const struct common_glue_ctx serpent_enc_xts = { | ||
77 | .num_funcs = 3, | ||
78 | .fpu_blocks_limit = 8, | ||
79 | |||
80 | .funcs = { { | ||
81 | .num_blocks = 16, | ||
82 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) } | ||
83 | }, { | ||
84 | .num_blocks = 8, | ||
85 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } | ||
86 | }, { | ||
87 | .num_blocks = 1, | ||
88 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } | ||
89 | } } | ||
90 | }; | ||
91 | |||
92 | static const struct common_glue_ctx serpent_dec = { | ||
93 | .num_funcs = 3, | ||
94 | .fpu_blocks_limit = 8, | ||
95 | |||
96 | .funcs = { { | ||
97 | .num_blocks = 16, | ||
98 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) } | ||
99 | }, { | ||
100 | .num_blocks = 8, | ||
101 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } | ||
102 | }, { | ||
103 | .num_blocks = 1, | ||
104 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } | ||
105 | } } | ||
106 | }; | ||
107 | |||
108 | static const struct common_glue_ctx serpent_dec_cbc = { | ||
109 | .num_funcs = 3, | ||
110 | .fpu_blocks_limit = 8, | ||
111 | |||
112 | .funcs = { { | ||
113 | .num_blocks = 16, | ||
114 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) } | ||
115 | }, { | ||
116 | .num_blocks = 8, | ||
117 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } | ||
118 | }, { | ||
119 | .num_blocks = 1, | ||
120 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } | ||
121 | } } | ||
122 | }; | ||
123 | |||
124 | static const struct common_glue_ctx serpent_dec_xts = { | ||
125 | .num_funcs = 3, | ||
126 | .fpu_blocks_limit = 8, | ||
127 | |||
128 | .funcs = { { | ||
129 | .num_blocks = 16, | ||
130 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) } | ||
131 | }, { | ||
132 | .num_blocks = 8, | ||
133 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } | ||
134 | }, { | ||
135 | .num_blocks = 1, | ||
136 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } | ||
137 | } } | ||
138 | }; | ||
139 | |||
140 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
141 | struct scatterlist *src, unsigned int nbytes) | ||
142 | { | ||
143 | return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); | ||
144 | } | ||
145 | |||
146 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
147 | struct scatterlist *src, unsigned int nbytes) | ||
148 | { | ||
149 | return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); | ||
150 | } | ||
151 | |||
152 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
153 | struct scatterlist *src, unsigned int nbytes) | ||
154 | { | ||
155 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, | ||
156 | dst, src, nbytes); | ||
157 | } | ||
158 | |||
159 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
160 | struct scatterlist *src, unsigned int nbytes) | ||
161 | { | ||
162 | return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, | ||
163 | nbytes); | ||
164 | } | ||
165 | |||
166 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); | ||
170 | } | ||
171 | |||
172 | static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
173 | { | ||
174 | /* the AVX helpers are reused, so start using the FPU at 8 parallel blocks */ | ||
175 | return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); | ||
176 | } | ||
177 | |||
178 | static inline void serpent_fpu_end(bool fpu_enabled) | ||
179 | { | ||
180 | glue_fpu_end(fpu_enabled); | ||
181 | } | ||
182 | |||
183 | struct crypt_priv { | ||
184 | struct serpent_ctx *ctx; | ||
185 | bool fpu_enabled; | ||
186 | }; | ||
187 | |||
188 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
189 | { | ||
190 | const unsigned int bsize = SERPENT_BLOCK_SIZE; | ||
191 | struct crypt_priv *ctx = priv; | ||
192 | int i; | ||
193 | |||
194 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | ||
195 | |||
196 | if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { | ||
197 | serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
198 | srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; | ||
199 | nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; | ||
200 | } | ||
201 | |||
202 | while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { | ||
203 | serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); | ||
204 | srcdst += bsize * SERPENT_PARALLEL_BLOCKS; | ||
205 | nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; | ||
206 | } | ||
207 | |||
208 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
209 | __serpent_encrypt(ctx->ctx, srcdst, srcdst); | ||
210 | } | ||
211 | |||
212 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
213 | { | ||
214 | const unsigned int bsize = SERPENT_BLOCK_SIZE; | ||
215 | struct crypt_priv *ctx = priv; | ||
216 | int i; | ||
217 | |||
218 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | ||
219 | |||
220 | if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { | ||
221 | serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
222 | srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; | ||
223 | nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; | ||
224 | } | ||
225 | |||
226 | while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { | ||
227 | serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); | ||
228 | srcdst += bsize * SERPENT_PARALLEL_BLOCKS; | ||
229 | nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; | ||
230 | } | ||
231 | |||
232 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
233 | __serpent_decrypt(ctx->ctx, srcdst, srcdst); | ||
234 | } | ||
235 | |||
236 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
237 | struct scatterlist *src, unsigned int nbytes) | ||
238 | { | ||
239 | struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
240 | be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; | ||
241 | struct crypt_priv crypt_ctx = { | ||
242 | .ctx = &ctx->serpent_ctx, | ||
243 | .fpu_enabled = false, | ||
244 | }; | ||
245 | struct lrw_crypt_req req = { | ||
246 | .tbuf = buf, | ||
247 | .tbuflen = sizeof(buf), | ||
248 | |||
249 | .table_ctx = &ctx->lrw_table, | ||
250 | .crypt_ctx = &crypt_ctx, | ||
251 | .crypt_fn = encrypt_callback, | ||
252 | }; | ||
253 | int ret; | ||
254 | |||
255 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
256 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
257 | serpent_fpu_end(crypt_ctx.fpu_enabled); | ||
258 | |||
259 | return ret; | ||
260 | } | ||
261 | |||
262 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
263 | struct scatterlist *src, unsigned int nbytes) | ||
264 | { | ||
265 | struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
266 | be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; | ||
267 | struct crypt_priv crypt_ctx = { | ||
268 | .ctx = &ctx->serpent_ctx, | ||
269 | .fpu_enabled = false, | ||
270 | }; | ||
271 | struct lrw_crypt_req req = { | ||
272 | .tbuf = buf, | ||
273 | .tbuflen = sizeof(buf), | ||
274 | |||
275 | .table_ctx = &ctx->lrw_table, | ||
276 | .crypt_ctx = &crypt_ctx, | ||
277 | .crypt_fn = decrypt_callback, | ||
278 | }; | ||
279 | int ret; | ||
280 | |||
281 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
282 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
283 | serpent_fpu_end(crypt_ctx.fpu_enabled); | ||
284 | |||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
289 | struct scatterlist *src, unsigned int nbytes) | ||
290 | { | ||
291 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
292 | |||
293 | return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, | ||
294 | XTS_TWEAK_CAST(__serpent_encrypt), | ||
295 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
296 | } | ||
297 | |||
298 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
299 | struct scatterlist *src, unsigned int nbytes) | ||
300 | { | ||
301 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
302 | |||
303 | return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, | ||
304 | XTS_TWEAK_CAST(__serpent_encrypt), | ||
305 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
306 | } | ||
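Note: xts_encrypt and xts_decrypt both pass __serpent_encrypt as the tweak function, because the XTS tweak T = E(tweak key, IV) is always computed with the cipher in encryption direction; only the common_glue_ctx handed to glue_xts_crypt_128bit (serpent_enc_xts vs serpent_dec_xts) decides whether the data itself is encrypted or decrypted.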
307 | |||
308 | static struct crypto_alg srp_algs[10] = { { | ||
309 | .cra_name = "__ecb-serpent-avx2", | ||
310 | .cra_driver_name = "__driver-ecb-serpent-avx2", | ||
311 | .cra_priority = 0, | ||
312 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
313 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
314 | .cra_ctxsize = sizeof(struct serpent_ctx), | ||
315 | .cra_alignmask = 0, | ||
316 | .cra_type = &crypto_blkcipher_type, | ||
317 | .cra_module = THIS_MODULE, | ||
318 | .cra_list = LIST_HEAD_INIT(srp_algs[0].cra_list), | ||
319 | .cra_u = { | ||
320 | .blkcipher = { | ||
321 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
322 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
323 | .setkey = serpent_setkey, | ||
324 | .encrypt = ecb_encrypt, | ||
325 | .decrypt = ecb_decrypt, | ||
326 | }, | ||
327 | }, | ||
328 | }, { | ||
329 | .cra_name = "__cbc-serpent-avx2", | ||
330 | .cra_driver_name = "__driver-cbc-serpent-avx2", | ||
331 | .cra_priority = 0, | ||
332 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
333 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
334 | .cra_ctxsize = sizeof(struct serpent_ctx), | ||
335 | .cra_alignmask = 0, | ||
336 | .cra_type = &crypto_blkcipher_type, | ||
337 | .cra_module = THIS_MODULE, | ||
338 | .cra_list = LIST_HEAD_INIT(srp_algs[1].cra_list), | ||
339 | .cra_u = { | ||
340 | .blkcipher = { | ||
341 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
342 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
343 | .setkey = serpent_setkey, | ||
344 | .encrypt = cbc_encrypt, | ||
345 | .decrypt = cbc_decrypt, | ||
346 | }, | ||
347 | }, | ||
348 | }, { | ||
349 | .cra_name = "__ctr-serpent-avx2", | ||
350 | .cra_driver_name = "__driver-ctr-serpent-avx2", | ||
351 | .cra_priority = 0, | ||
352 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
353 | .cra_blocksize = 1, | ||
354 | .cra_ctxsize = sizeof(struct serpent_ctx), | ||
355 | .cra_alignmask = 0, | ||
356 | .cra_type = &crypto_blkcipher_type, | ||
357 | .cra_module = THIS_MODULE, | ||
358 | .cra_list = LIST_HEAD_INIT(srp_algs[2].cra_list), | ||
359 | .cra_u = { | ||
360 | .blkcipher = { | ||
361 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
362 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
363 | .ivsize = SERPENT_BLOCK_SIZE, | ||
364 | .setkey = serpent_setkey, | ||
365 | .encrypt = ctr_crypt, | ||
366 | .decrypt = ctr_crypt, | ||
367 | }, | ||
368 | }, | ||
369 | }, { | ||
370 | .cra_name = "__lrw-serpent-avx2", | ||
371 | .cra_driver_name = "__driver-lrw-serpent-avx2", | ||
372 | .cra_priority = 0, | ||
373 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
374 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
375 | .cra_ctxsize = sizeof(struct serpent_lrw_ctx), | ||
376 | .cra_alignmask = 0, | ||
377 | .cra_type = &crypto_blkcipher_type, | ||
378 | .cra_module = THIS_MODULE, | ||
379 | .cra_list = LIST_HEAD_INIT(srp_algs[3].cra_list), | ||
380 | .cra_exit = lrw_serpent_exit_tfm, | ||
381 | .cra_u = { | ||
382 | .blkcipher = { | ||
383 | .min_keysize = SERPENT_MIN_KEY_SIZE + | ||
384 | SERPENT_BLOCK_SIZE, | ||
385 | .max_keysize = SERPENT_MAX_KEY_SIZE + | ||
386 | SERPENT_BLOCK_SIZE, | ||
387 | .ivsize = SERPENT_BLOCK_SIZE, | ||
388 | .setkey = lrw_serpent_setkey, | ||
389 | .encrypt = lrw_encrypt, | ||
390 | .decrypt = lrw_decrypt, | ||
391 | }, | ||
392 | }, | ||
393 | }, { | ||
394 | .cra_name = "__xts-serpent-avx2", | ||
395 | .cra_driver_name = "__driver-xts-serpent-avx2", | ||
396 | .cra_priority = 0, | ||
397 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
398 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
399 | .cra_ctxsize = sizeof(struct serpent_xts_ctx), | ||
400 | .cra_alignmask = 0, | ||
401 | .cra_type = &crypto_blkcipher_type, | ||
402 | .cra_module = THIS_MODULE, | ||
403 | .cra_list = LIST_HEAD_INIT(srp_algs[4].cra_list), | ||
404 | .cra_u = { | ||
405 | .blkcipher = { | ||
406 | .min_keysize = SERPENT_MIN_KEY_SIZE * 2, | ||
407 | .max_keysize = SERPENT_MAX_KEY_SIZE * 2, | ||
408 | .ivsize = SERPENT_BLOCK_SIZE, | ||
409 | .setkey = xts_serpent_setkey, | ||
410 | .encrypt = xts_encrypt, | ||
411 | .decrypt = xts_decrypt, | ||
412 | }, | ||
413 | }, | ||
414 | }, { | ||
415 | .cra_name = "ecb(serpent)", | ||
416 | .cra_driver_name = "ecb-serpent-avx2", | ||
417 | .cra_priority = 600, | ||
418 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
419 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
420 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
421 | .cra_alignmask = 0, | ||
422 | .cra_type = &crypto_ablkcipher_type, | ||
423 | .cra_module = THIS_MODULE, | ||
424 | .cra_list = LIST_HEAD_INIT(srp_algs[5].cra_list), | ||
425 | .cra_init = ablk_init, | ||
426 | .cra_exit = ablk_exit, | ||
427 | .cra_u = { | ||
428 | .ablkcipher = { | ||
429 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
430 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
431 | .setkey = ablk_set_key, | ||
432 | .encrypt = ablk_encrypt, | ||
433 | .decrypt = ablk_decrypt, | ||
434 | }, | ||
435 | }, | ||
436 | }, { | ||
437 | .cra_name = "cbc(serpent)", | ||
438 | .cra_driver_name = "cbc-serpent-avx2", | ||
439 | .cra_priority = 600, | ||
440 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
441 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
442 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
443 | .cra_alignmask = 0, | ||
444 | .cra_type = &crypto_ablkcipher_type, | ||
445 | .cra_module = THIS_MODULE, | ||
446 | .cra_list = LIST_HEAD_INIT(srp_algs[6].cra_list), | ||
447 | .cra_init = ablk_init, | ||
448 | .cra_exit = ablk_exit, | ||
449 | .cra_u = { | ||
450 | .ablkcipher = { | ||
451 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
452 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
453 | .ivsize = SERPENT_BLOCK_SIZE, | ||
454 | .setkey = ablk_set_key, | ||
455 | .encrypt = __ablk_encrypt, | ||
456 | .decrypt = ablk_decrypt, | ||
457 | }, | ||
458 | }, | ||
459 | }, { | ||
460 | .cra_name = "ctr(serpent)", | ||
461 | .cra_driver_name = "ctr-serpent-avx2", | ||
462 | .cra_priority = 600, | ||
463 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
464 | .cra_blocksize = 1, | ||
465 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
466 | .cra_alignmask = 0, | ||
467 | .cra_type = &crypto_ablkcipher_type, | ||
468 | .cra_module = THIS_MODULE, | ||
469 | .cra_list = LIST_HEAD_INIT(srp_algs[7].cra_list), | ||
470 | .cra_init = ablk_init, | ||
471 | .cra_exit = ablk_exit, | ||
472 | .cra_u = { | ||
473 | .ablkcipher = { | ||
474 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
475 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
476 | .ivsize = SERPENT_BLOCK_SIZE, | ||
477 | .setkey = ablk_set_key, | ||
478 | .encrypt = ablk_encrypt, | ||
479 | .decrypt = ablk_encrypt, | ||
480 | .geniv = "chainiv", | ||
481 | }, | ||
482 | }, | ||
483 | }, { | ||
484 | .cra_name = "lrw(serpent)", | ||
485 | .cra_driver_name = "lrw-serpent-avx2", | ||
486 | .cra_priority = 600, | ||
487 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
488 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
489 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
490 | .cra_alignmask = 0, | ||
491 | .cra_type = &crypto_ablkcipher_type, | ||
492 | .cra_module = THIS_MODULE, | ||
493 | .cra_list = LIST_HEAD_INIT(srp_algs[8].cra_list), | ||
494 | .cra_init = ablk_init, | ||
495 | .cra_exit = ablk_exit, | ||
496 | .cra_u = { | ||
497 | .ablkcipher = { | ||
498 | .min_keysize = SERPENT_MIN_KEY_SIZE + | ||
499 | SERPENT_BLOCK_SIZE, | ||
500 | .max_keysize = SERPENT_MAX_KEY_SIZE + | ||
501 | SERPENT_BLOCK_SIZE, | ||
502 | .ivsize = SERPENT_BLOCK_SIZE, | ||
503 | .setkey = ablk_set_key, | ||
504 | .encrypt = ablk_encrypt, | ||
505 | .decrypt = ablk_decrypt, | ||
506 | }, | ||
507 | }, | ||
508 | }, { | ||
509 | .cra_name = "xts(serpent)", | ||
510 | .cra_driver_name = "xts-serpent-avx2", | ||
511 | .cra_priority = 600, | ||
512 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
513 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
514 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
515 | .cra_alignmask = 0, | ||
516 | .cra_type = &crypto_ablkcipher_type, | ||
517 | .cra_module = THIS_MODULE, | ||
518 | .cra_list = LIST_HEAD_INIT(srp_algs[9].cra_list), | ||
519 | .cra_init = ablk_init, | ||
520 | .cra_exit = ablk_exit, | ||
521 | .cra_u = { | ||
522 | .ablkcipher = { | ||
523 | .min_keysize = SERPENT_MIN_KEY_SIZE * 2, | ||
524 | .max_keysize = SERPENT_MAX_KEY_SIZE * 2, | ||
525 | .ivsize = SERPENT_BLOCK_SIZE, | ||
526 | .setkey = ablk_set_key, | ||
527 | .encrypt = ablk_encrypt, | ||
528 | .decrypt = ablk_decrypt, | ||
529 | }, | ||
530 | }, | ||
531 | } }; | ||
532 | |||
533 | static int __init init(void) | ||
534 | { | ||
535 | u64 xcr0; | ||
536 | |||
537 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
538 | pr_info("AVX2 instructions are not detected.\n"); | ||
539 | return -ENODEV; | ||
540 | } | ||
541 | |||
542 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
543 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
544 | pr_info("AVX detected but unusable.\n"); | ||
545 | return -ENODEV; | ||
546 | } | ||
547 | |||
548 | return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs)); | ||
549 | } | ||
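The module only registers its algorithms when both conditions hold: the CPU advertises AVX2 and OSXSAVE, and XCR0 shows that the OS actually saves the SSE and YMM register state across context switches. A minimal sketch of that second test, using the raw XGETBV encoding instead of the kernel's xgetbv()/XSTATE_* helpers (names below are illustrative, not part of the patch):

    /* Sketch: read XCR0 and check that the SSE (bit 1) and YMM (bit 2)
     * state components are enabled by the OS. */
    static inline int ymm_state_usable(void)
    {
            unsigned int eax, edx;

            /* xgetbv; %ecx = 0 selects XCR0 */
            asm volatile(".byte 0x0f, 0x01, 0xd0"
                         : "=a" (eax), "=d" (edx) : "c" (0));
            return (eax & 0x6) == 0x6;      /* XSTATE_SSE | XSTATE_YMM */
    }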
550 | |||
551 | static void __exit fini(void) | ||
552 | { | ||
553 | crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs)); | ||
554 | } | ||
555 | |||
556 | module_init(init); | ||
557 | module_exit(fini); | ||
558 | |||
559 | MODULE_LICENSE("GPL"); | ||
560 | MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); | ||
561 | MODULE_ALIAS("serpent"); | ||
562 | MODULE_ALIAS("serpent-asm"); | ||
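With the module loaded, the ecb/cbc/ctr/lrw/xts "serpent" instances registered above are reachable from user space through the AF_ALG socket interface, provided CONFIG_CRYPTO_USER_API_SKCIPHER is enabled. A rough sketch for obtaining an operation descriptor on xts(serpent) with a 512-bit key (error handling trimmed, helper name illustrative):

    #include <linux/if_alg.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int xts_serpent_opfd(const unsigned char key[64])
    {
            struct sockaddr_alg sa = {
                    .salg_family = AF_ALG,
                    .salg_type   = "skcipher",
                    .salg_name   = "xts(serpent)",
            };
            int tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);

            if (tfm < 0)
                    return -1;
            if (bind(tfm, (struct sockaddr *)&sa, sizeof(sa)) ||
                setsockopt(tfm, SOL_ALG, ALG_SET_KEY, key, 64)) {
                    close(tfm);
                    return -1;
            }
            /* the returned fd takes data via sendmsg()/read(), with
             * ALG_SET_OP and ALG_SET_IV passed as control messages */
            return accept(tfm, NULL, 0);
    }

Since the kernel resolves "xts(serpent)" to the highest-priority registered implementation, a request like this ends up on the priority-600 xts-serpent-avx2 driver above whenever AVX2 is usable.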
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 52abaaf28e7f..9ae83cf8d21e 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c | |||
@@ -4,8 +4,7 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Glue code based on serpent_sse2_glue.c by: | 7 | * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
8 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
9 | * | 8 | * |
10 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
@@ -42,7 +41,32 @@ | |||
42 | #include <asm/crypto/ablk_helper.h> | 41 | #include <asm/crypto/ablk_helper.h> |
43 | #include <asm/crypto/glue_helper.h> | 42 | #include <asm/crypto/glue_helper.h> |
44 | 43 | ||
45 | static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 44 | /* 8-way parallel cipher functions */ |
45 | asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
46 | const u8 *src); | ||
47 | EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx); | ||
48 | |||
49 | asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
50 | const u8 *src); | ||
51 | EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx); | ||
52 | |||
53 | asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
54 | const u8 *src); | ||
55 | EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); | ||
56 | |||
57 | asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
58 | const u8 *src, le128 *iv); | ||
59 | EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); | ||
60 | |||
61 | asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
62 | const u8 *src, le128 *iv); | ||
63 | EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); | ||
64 | |||
65 | asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
66 | const u8 *src, le128 *iv); | ||
67 | EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); | ||
68 | |||
69 | void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
46 | { | 70 | { |
47 | be128 ctrblk; | 71 | be128 ctrblk; |
48 | 72 | ||
@@ -52,6 +76,22 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | |||
52 | __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); | 76 | __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); |
53 | u128_xor(dst, src, (u128 *)&ctrblk); | 77 | u128_xor(dst, src, (u128 *)&ctrblk); |
54 | } | 78 | } |
79 | EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); | ||
80 | |||
81 | void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
82 | { | ||
83 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
84 | GLUE_FUNC_CAST(__serpent_encrypt)); | ||
85 | } | ||
86 | EXPORT_SYMBOL_GPL(serpent_xts_enc); | ||
87 | |||
88 | void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
89 | { | ||
90 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
91 | GLUE_FUNC_CAST(__serpent_decrypt)); | ||
92 | } | ||
93 | EXPORT_SYMBOL_GPL(serpent_xts_dec); | ||
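serpent_xts_enc() and serpent_xts_dec() are the single-block fallbacks used by the XTS glue once fewer than eight blocks remain. Per block, XTS computes C = E_K1(P xor T) xor T and then advances the tweak T by a multiplication with x in GF(2^128). A scalar sketch of that step, with illustrative types rather than the glue_helper API:

    struct u128_pair { unsigned long long lo, hi; };

    /* T <- T * x modulo x^128 + x^7 + x^2 + x + 1 (tweak held little endian) */
    static void xts_gf128_mul_x(struct u128_pair *t)
    {
            unsigned long long carry = t->hi >> 63;

            t->hi = (t->hi << 1) | (t->lo >> 63);
            t->lo = (t->lo << 1) ^ (carry * 0x87);
    }

    static void xts_one_block(struct u128_pair *dst, const struct u128_pair *src,
                              struct u128_pair *tweak,
                              void (*crypt)(struct u128_pair *blk))
    {
            dst->lo = src->lo ^ tweak->lo;
            dst->hi = src->hi ^ tweak->hi;
            crypt(dst);                     /* E_K1 when encrypting, D_K1 when decrypting */
            dst->lo ^= tweak->lo;
            dst->hi ^= tweak->hi;
            xts_gf128_mul_x(tweak);         /* tweak for the next block */
    }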
94 | |||
55 | 95 | ||
56 | static const struct common_glue_ctx serpent_enc = { | 96 | static const struct common_glue_ctx serpent_enc = { |
57 | .num_funcs = 2, | 97 | .num_funcs = 2, |
@@ -75,7 +115,20 @@ static const struct common_glue_ctx serpent_ctr = { | |||
75 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } | 115 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } |
76 | }, { | 116 | }, { |
77 | .num_blocks = 1, | 117 | .num_blocks = 1, |
78 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } | 118 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } |
119 | } } | ||
120 | }; | ||
121 | |||
122 | static const struct common_glue_ctx serpent_enc_xts = { | ||
123 | .num_funcs = 2, | ||
124 | .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, | ||
125 | |||
126 | .funcs = { { | ||
127 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | ||
128 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } | ||
129 | }, { | ||
130 | .num_blocks = 1, | ||
131 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } | ||
79 | } } | 132 | } } |
80 | }; | 133 | }; |
81 | 134 | ||
@@ -105,6 +158,19 @@ static const struct common_glue_ctx serpent_dec_cbc = { | |||
105 | } } | 158 | } } |
106 | }; | 159 | }; |
107 | 160 | ||
161 | static const struct common_glue_ctx serpent_dec_xts = { | ||
162 | .num_funcs = 2, | ||
163 | .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, | ||
164 | |||
165 | .funcs = { { | ||
166 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | ||
167 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } | ||
168 | }, { | ||
169 | .num_blocks = 1, | ||
170 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } | ||
171 | } } | ||
172 | }; | ||
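Both XTS tables pair the eight-block AVX routine with the one-block fallback above. glue_xts_crypt_128bit() first encrypts the IV with the tweak context and then walks such a table, always using the widest entry that still fits the remaining data before dropping down to the single-block function. Roughly, and much simplified compared to the real glue_helper loop (which also manages the FPU sections):

    /* nblocks 16-byte blocks at src/dst; funcs[] is ordered widest first */
    while (nblocks) {
            const struct xts_func_sketch *f = funcs;

            while (f->num_blocks > nblocks)
                    f++;                            /* fall back to a narrower routine */

            f->fn(ctx, dst, src, tweak);            /* also advances the tweak */
            src     += f->num_blocks;
            dst     += f->num_blocks;
            nblocks -= f->num_blocks;
    }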
173 | |||
108 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 174 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
109 | struct scatterlist *src, unsigned int nbytes) | 175 | struct scatterlist *src, unsigned int nbytes) |
110 | { | 176 | { |
@@ -187,13 +253,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
187 | __serpent_decrypt(ctx->ctx, srcdst, srcdst); | 253 | __serpent_decrypt(ctx->ctx, srcdst, srcdst); |
188 | } | 254 | } |
189 | 255 | ||
190 | struct serpent_lrw_ctx { | 256 | int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, |
191 | struct lrw_table_ctx lrw_table; | 257 | unsigned int keylen) |
192 | struct serpent_ctx serpent_ctx; | ||
193 | }; | ||
194 | |||
195 | static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
196 | unsigned int keylen) | ||
197 | { | 258 | { |
198 | struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); | 259 | struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); |
199 | int err; | 260 | int err; |
@@ -206,6 +267,7 @@ static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | |||
206 | return lrw_init_table(&ctx->lrw_table, key + keylen - | 267 | return lrw_init_table(&ctx->lrw_table, key + keylen - |
207 | SERPENT_BLOCK_SIZE); | 268 | SERPENT_BLOCK_SIZE); |
208 | } | 269 | } |
270 | EXPORT_SYMBOL_GPL(lrw_serpent_setkey); | ||
209 | 271 | ||
210 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 272 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
211 | struct scatterlist *src, unsigned int nbytes) | 273 | struct scatterlist *src, unsigned int nbytes) |
@@ -259,20 +321,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
259 | return ret; | 321 | return ret; |
260 | } | 322 | } |
261 | 323 | ||
262 | static void lrw_exit_tfm(struct crypto_tfm *tfm) | 324 | void lrw_serpent_exit_tfm(struct crypto_tfm *tfm) |
263 | { | 325 | { |
264 | struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); | 326 | struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); |
265 | 327 | ||
266 | lrw_free_table(&ctx->lrw_table); | 328 | lrw_free_table(&ctx->lrw_table); |
267 | } | 329 | } |
330 | EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm); | ||
268 | 331 | ||
269 | struct serpent_xts_ctx { | 332 | int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, |
270 | struct serpent_ctx tweak_ctx; | 333 | unsigned int keylen) |
271 | struct serpent_ctx crypt_ctx; | ||
272 | }; | ||
273 | |||
274 | static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
275 | unsigned int keylen) | ||
276 | { | 334 | { |
277 | struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); | 335 | struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); |
278 | u32 *flags = &tfm->crt_flags; | 336 | u32 *flags = &tfm->crt_flags; |
@@ -294,59 +352,26 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | |||
294 | /* second half of xts-key is for tweak */ | 352 | /* second half of xts-key is for tweak */ |
295 | return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); | 353 | return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); |
296 | } | 354 | } |
355 | EXPORT_SYMBOL_GPL(xts_serpent_setkey); | ||
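As the surviving comment notes, an XTS key is two concatenated Serpent keys: the first half keys the data cipher (crypt_ctx), the second half keys the tweak cipher (tweak_ctx). Stripped of the key-length validation and CRYPTO_TFM_RES flag handling, the split amounts to the following (helper name illustrative):

    static int xts_split_key(struct serpent_xts_ctx *ctx, const u8 *key,
                             unsigned int keylen)
    {
            int err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);

            if (err)
                    return err;
            /* second half of the xts key is for the tweak */
            return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
    }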
297 | 356 | ||
298 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 357 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
299 | struct scatterlist *src, unsigned int nbytes) | 358 | struct scatterlist *src, unsigned int nbytes) |
300 | { | 359 | { |
301 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 360 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
302 | be128 buf[SERPENT_PARALLEL_BLOCKS]; | ||
303 | struct crypt_priv crypt_ctx = { | ||
304 | .ctx = &ctx->crypt_ctx, | ||
305 | .fpu_enabled = false, | ||
306 | }; | ||
307 | struct xts_crypt_req req = { | ||
308 | .tbuf = buf, | ||
309 | .tbuflen = sizeof(buf), | ||
310 | |||
311 | .tweak_ctx = &ctx->tweak_ctx, | ||
312 | .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), | ||
313 | .crypt_ctx = &crypt_ctx, | ||
314 | .crypt_fn = encrypt_callback, | ||
315 | }; | ||
316 | int ret; | ||
317 | |||
318 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
319 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
320 | serpent_fpu_end(crypt_ctx.fpu_enabled); | ||
321 | 361 | ||
322 | return ret; | 362 | return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, |
363 | XTS_TWEAK_CAST(__serpent_encrypt), | ||
364 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
323 | } | 365 | } |
324 | 366 | ||
325 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 367 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
326 | struct scatterlist *src, unsigned int nbytes) | 368 | struct scatterlist *src, unsigned int nbytes) |
327 | { | 369 | { |
328 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 370 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
329 | be128 buf[SERPENT_PARALLEL_BLOCKS]; | ||
330 | struct crypt_priv crypt_ctx = { | ||
331 | .ctx = &ctx->crypt_ctx, | ||
332 | .fpu_enabled = false, | ||
333 | }; | ||
334 | struct xts_crypt_req req = { | ||
335 | .tbuf = buf, | ||
336 | .tbuflen = sizeof(buf), | ||
337 | |||
338 | .tweak_ctx = &ctx->tweak_ctx, | ||
339 | .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), | ||
340 | .crypt_ctx = &crypt_ctx, | ||
341 | .crypt_fn = decrypt_callback, | ||
342 | }; | ||
343 | int ret; | ||
344 | 371 | ||
345 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 372 | return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, |
346 | ret = xts_crypt(desc, dst, src, nbytes, &req); | 373 | XTS_TWEAK_CAST(__serpent_encrypt), |
347 | serpent_fpu_end(crypt_ctx.fpu_enabled); | 374 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
348 | |||
349 | return ret; | ||
350 | } | 375 | } |
351 | 376 | ||
352 | static struct crypto_alg serpent_algs[10] = { { | 377 | static struct crypto_alg serpent_algs[10] = { { |
@@ -417,7 +442,7 @@ static struct crypto_alg serpent_algs[10] = { { | |||
417 | .cra_alignmask = 0, | 442 | .cra_alignmask = 0, |
418 | .cra_type = &crypto_blkcipher_type, | 443 | .cra_type = &crypto_blkcipher_type, |
419 | .cra_module = THIS_MODULE, | 444 | .cra_module = THIS_MODULE, |
420 | .cra_exit = lrw_exit_tfm, | 445 | .cra_exit = lrw_serpent_exit_tfm, |
421 | .cra_u = { | 446 | .cra_u = { |
422 | .blkcipher = { | 447 | .blkcipher = { |
423 | .min_keysize = SERPENT_MIN_KEY_SIZE + | 448 | .min_keysize = SERPENT_MIN_KEY_SIZE + |
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S new file mode 100644 index 000000000000..56610c4bf31b --- /dev/null +++ b/arch/x86/crypto/sha256-avx-asm.S | |||
@@ -0,0 +1,496 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-256 with AVX1 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
10 | # | ||
11 | # This software is available to you under a choice of one of two | ||
12 | # licenses. You may choose to be licensed under the terms of the GNU | ||
13 | # General Public License (GPL) Version 2, available from the file | ||
14 | # COPYING in the main directory of this source tree, or the | ||
15 | # OpenIB.org BSD license below: | ||
16 | # | ||
17 | # Redistribution and use in source and binary forms, with or | ||
18 | # without modification, are permitted provided that the following | ||
19 | # conditions are met: | ||
20 | # | ||
21 | # - Redistributions of source code must retain the above | ||
22 | # copyright notice, this list of conditions and the following | ||
23 | # disclaimer. | ||
24 | # | ||
25 | # - Redistributions in binary form must reproduce the above | ||
26 | # copyright notice, this list of conditions and the following | ||
27 | # disclaimer in the documentation and/or other materials | ||
28 | # provided with the distribution. | ||
29 | # | ||
30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
31 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
32 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
33 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
34 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
35 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
36 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
37 | # SOFTWARE. | ||
38 | ######################################################################## | ||
39 | # | ||
40 | # This code is described in an Intel White-Paper: | ||
41 | # "Fast SHA-256 Implementations on Intel Architecture Processors" | ||
42 | # | ||
43 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
44 | # and search for that title. | ||
45 | # | ||
46 | ######################################################################## | ||
47 | # This code schedules 1 block at a time, with 4 lanes per block | ||
48 | ######################################################################## | ||
49 | |||
50 | #ifdef CONFIG_AS_AVX | ||
51 | #include <linux/linkage.h> | ||
52 | |||
53 | ## assume buffers not aligned | ||
54 | #define VMOVDQ vmovdqu | ||
55 | |||
56 | ################################ Define Macros | ||
57 | |||
58 | # addm [mem], reg | ||
59 | # Add reg to mem using reg-mem add and store | ||
60 | .macro addm p1 p2 | ||
61 | add \p1, \p2 | ||
62 | mov \p2, \p1 | ||
63 | .endm | ||
64 | |||
65 | |||
66 | .macro MY_ROR p1 p2 | ||
67 | shld $(32-(\p1)), \p2, \p2 | ||
68 | .endm | ||
69 | |||
70 | ################################ | ||
71 | |||
72 | # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask | ||
73 | # Load xmm with mem and byte swap each dword | ||
74 | .macro COPY_XMM_AND_BSWAP p1 p2 p3 | ||
75 | VMOVDQ \p2, \p1 | ||
76 | vpshufb \p3, \p1, \p1 | ||
77 | .endm | ||
78 | |||
79 | ################################ | ||
80 | |||
81 | X0 = %xmm4 | ||
82 | X1 = %xmm5 | ||
83 | X2 = %xmm6 | ||
84 | X3 = %xmm7 | ||
85 | |||
86 | XTMP0 = %xmm0 | ||
87 | XTMP1 = %xmm1 | ||
88 | XTMP2 = %xmm2 | ||
89 | XTMP3 = %xmm3 | ||
90 | XTMP4 = %xmm8 | ||
91 | XFER = %xmm9 | ||
92 | XTMP5 = %xmm11 | ||
93 | |||
94 | SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA | ||
95 | SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 | ||
96 | BYTE_FLIP_MASK = %xmm13 | ||
97 | |||
98 | NUM_BLKS = %rdx # 3rd arg | ||
99 | CTX = %rsi # 2nd arg | ||
100 | INP = %rdi # 1st arg | ||
101 | |||
102 | SRND = %rdi # clobbers INP | ||
103 | c = %ecx | ||
104 | d = %r8d | ||
105 | e = %edx | ||
106 | TBL = %rbp | ||
107 | a = %eax | ||
108 | b = %ebx | ||
109 | |||
110 | f = %r9d | ||
111 | g = %r10d | ||
112 | h = %r11d | ||
113 | |||
114 | y0 = %r13d | ||
115 | y1 = %r14d | ||
116 | y2 = %r15d | ||
117 | |||
118 | |||
119 | _INP_END_SIZE = 8 | ||
120 | _INP_SIZE = 8 | ||
121 | _XFER_SIZE = 8 | ||
122 | _XMM_SAVE_SIZE = 0 | ||
123 | |||
124 | _INP_END = 0 | ||
125 | _INP = _INP_END + _INP_END_SIZE | ||
126 | _XFER = _INP + _INP_SIZE | ||
127 | _XMM_SAVE = _XFER + _XFER_SIZE | ||
128 | STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE | ||
129 | |||
130 | # rotate_Xs | ||
131 | # Rotate values of symbols X0...X3 | ||
132 | .macro rotate_Xs | ||
133 | X_ = X0 | ||
134 | X0 = X1 | ||
135 | X1 = X2 | ||
136 | X2 = X3 | ||
137 | X3 = X_ | ||
138 | .endm | ||
139 | |||
140 | # ROTATE_ARGS | ||
141 | # Rotate values of symbols a...h | ||
142 | .macro ROTATE_ARGS | ||
143 | TMP_ = h | ||
144 | h = g | ||
145 | g = f | ||
146 | f = e | ||
147 | e = d | ||
148 | d = c | ||
149 | c = b | ||
150 | b = a | ||
151 | a = TMP_ | ||
152 | .endm | ||
153 | |||
154 | .macro FOUR_ROUNDS_AND_SCHED | ||
155 | ## compute s0 four at a time and s1 two at a time | ||
156 | ## compute W[-16] + W[-7] 4 at a time | ||
157 | |||
158 | mov e, y0 # y0 = e | ||
159 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
160 | mov a, y1 # y1 = a | ||
161 | vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] | ||
162 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
163 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
164 | mov f, y2 # y2 = f | ||
165 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
166 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
167 | xor g, y2 # y2 = f^g | ||
168 | vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] | ||
169 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
170 | and e, y2 # y2 = (f^g)&e | ||
171 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
172 | ## compute s0 | ||
173 | vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] | ||
174 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
175 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
176 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
177 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
178 | add y0, y2 # y2 = S1 + CH | ||
179 | add _XFER(%rsp), y2 # y2 = k + w + S1 + CH | ||
180 | mov a, y0 # y0 = a | ||
181 | add y2, h # h = h + S1 + CH + k + w | ||
182 | mov a, y2 # y2 = a | ||
183 | vpsrld $7, XTMP1, XTMP2 | ||
184 | or c, y0 # y0 = a|c | ||
185 | add h, d # d = d + h + S1 + CH + k + w | ||
186 | and c, y2 # y2 = a&c | ||
187 | vpslld $(32-7), XTMP1, XTMP3 | ||
188 | and b, y0 # y0 = (a|c)&b | ||
189 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
190 | vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 | ||
191 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
192 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
193 | ROTATE_ARGS | ||
194 | mov e, y0 # y0 = e | ||
195 | mov a, y1 # y1 = a | ||
196 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
197 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
198 | mov f, y2 # y2 = f | ||
199 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
200 | vpsrld $18, XTMP1, XTMP2 # | ||
201 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
202 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
203 | xor g, y2 # y2 = f^g | ||
204 | vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 | ||
205 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
206 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
207 | and e, y2 # y2 = (f^g)&e | ||
208 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
209 | vpslld $(32-18), XTMP1, XTMP1 | ||
210 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
211 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
212 | vpxor XTMP1, XTMP3, XTMP3 # | ||
213 | add y0, y2 # y2 = S1 + CH | ||
214 | add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
215 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
216 | vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR | ||
217 | mov a, y0 # y0 = a | ||
218 | add y2, h # h = h + S1 + CH + k + w | ||
219 | mov a, y2 # y2 = a | ||
220 | vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 | ||
221 | or c, y0 # y0 = a|c | ||
222 | add h, d # d = d + h + S1 + CH + k + w | ||
223 | and c, y2 # y2 = a&c | ||
224 | ## compute low s1 | ||
225 | vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} | ||
226 | and b, y0 # y0 = (a|c)&b | ||
227 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
228 | vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 | ||
229 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
230 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
231 | ROTATE_ARGS | ||
232 | mov e, y0 # y0 = e | ||
233 | mov a, y1 # y1 = a | ||
234 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
235 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
236 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
237 | mov f, y2 # y2 = f | ||
238 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
239 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
240 | vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} | ||
241 | xor g, y2 # y2 = f^g | ||
242 | vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} | ||
243 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
244 | and e, y2 # y2 = (f^g)&e | ||
245 | vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} | ||
246 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
247 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
248 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
249 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
250 | vpxor XTMP3, XTMP2, XTMP2 # | ||
251 | add y0, y2 # y2 = S1 + CH | ||
252 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
253 | add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
254 | vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} | ||
255 | mov a, y0 # y0 = a | ||
256 | add y2, h # h = h + S1 + CH + k + w | ||
257 | mov a, y2 # y2 = a | ||
258 | vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} | ||
259 | or c, y0 # y0 = a|c | ||
260 | add h, d # d = d + h + S1 + CH + k + w | ||
261 | and c, y2 # y2 = a&c | ||
262 | vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} | ||
263 | and b, y0 # y0 = (a|c)&b | ||
264 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
265 | ## compute high s1 | ||
266 | vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} | ||
267 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
268 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
269 | ROTATE_ARGS | ||
270 | mov e, y0 # y0 = e | ||
271 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
272 | mov a, y1 # y1 = a | ||
273 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
274 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
275 | mov f, y2 # y2 = f | ||
276 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
277 | vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} | ||
278 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
279 | xor g, y2 # y2 = f^g | ||
280 | vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} | ||
281 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
282 | and e, y2 # y2 = (f^g)&e | ||
283 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
284 | vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} | ||
285 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
286 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
287 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
288 | vpxor XTMP3, XTMP2, XTMP2 | ||
289 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
290 | add y0, y2 # y2 = S1 + CH | ||
291 | add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
292 | vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} | ||
293 | mov a, y0 # y0 = a | ||
294 | add y2, h # h = h + S1 + CH + k + w | ||
295 | mov a, y2 # y2 = a | ||
296 | vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} | ||
297 | or c, y0 # y0 = a|c | ||
298 | add h, d # d = d + h + S1 + CH + k + w | ||
299 | and c, y2 # y2 = a&c | ||
300 | vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} | ||
301 | and b, y0 # y0 = (a|c)&b | ||
302 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
303 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
304 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
305 | ROTATE_ARGS | ||
306 | rotate_Xs | ||
307 | .endm | ||
308 | |||
309 | ## input is [rsp + _XFER + %1 * 4] | ||
310 | .macro DO_ROUND round | ||
311 | mov e, y0 # y0 = e | ||
312 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
313 | mov a, y1 # y1 = a | ||
314 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
315 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
316 | mov f, y2 # y2 = f | ||
317 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
318 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
319 | xor g, y2 # y2 = f^g | ||
320 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
321 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
322 | and e, y2 # y2 = (f^g)&e | ||
323 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
324 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
325 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
326 | add y0, y2 # y2 = S1 + CH | ||
327 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
328 | offset = \round * 4 + _XFER # | ||
329 | add offset(%rsp), y2 # y2 = k + w + S1 + CH | ||
330 | mov a, y0 # y0 = a | ||
331 | add y2, h # h = h + S1 + CH + k + w | ||
332 | mov a, y2 # y2 = a | ||
333 | or c, y0 # y0 = a|c | ||
334 | add h, d # d = d + h + S1 + CH + k + w | ||
335 | and c, y2 # y2 = a&c | ||
336 | and b, y0 # y0 = (a|c)&b | ||
337 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
338 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
339 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
340 | ROTATE_ARGS | ||
341 | .endm | ||
342 | |||
343 | ######################################################################## | ||
344 | ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) | ||
345 | ## arg 1 : pointer to input data | ||
346 | ## arg 2 : pointer to digest | ||
347 | ## arg 3 : Num blocks | ||
348 | ######################################################################## | ||
349 | .text | ||
350 | ENTRY(sha256_transform_avx) | ||
351 | .align 32 | ||
352 | pushq %rbx | ||
353 | pushq %rbp | ||
354 | pushq %r13 | ||
355 | pushq %r14 | ||
356 | pushq %r15 | ||
357 | pushq %r12 | ||
358 | |||
359 | mov %rsp, %r12 | ||
360 | subq $STACK_SIZE, %rsp # allocate stack space | ||
361 | and $~15, %rsp # align stack pointer | ||
362 | |||
363 | shl $6, NUM_BLKS # convert to bytes | ||
364 | jz done_hash | ||
365 | add INP, NUM_BLKS # pointer to end of data | ||
366 | mov NUM_BLKS, _INP_END(%rsp) | ||
367 | |||
368 | ## load initial digest | ||
369 | mov 4*0(CTX), a | ||
370 | mov 4*1(CTX), b | ||
371 | mov 4*2(CTX), c | ||
372 | mov 4*3(CTX), d | ||
373 | mov 4*4(CTX), e | ||
374 | mov 4*5(CTX), f | ||
375 | mov 4*6(CTX), g | ||
376 | mov 4*7(CTX), h | ||
377 | |||
378 | vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
379 | vmovdqa _SHUF_00BA(%rip), SHUF_00BA | ||
380 | vmovdqa _SHUF_DC00(%rip), SHUF_DC00 | ||
381 | loop0: | ||
382 | lea K256(%rip), TBL | ||
383 | |||
384 | ## byte swap first 16 dwords | ||
385 | COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK | ||
386 | COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK | ||
387 | COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK | ||
388 | COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK | ||
389 | |||
390 | mov INP, _INP(%rsp) | ||
391 | |||
392 | ## schedule 48 input dwords, by doing 3 rounds of 16 each | ||
393 | mov $3, SRND | ||
394 | .align 16 | ||
395 | loop1: | ||
396 | vpaddd (TBL), X0, XFER | ||
397 | vmovdqa XFER, _XFER(%rsp) | ||
398 | FOUR_ROUNDS_AND_SCHED | ||
399 | |||
400 | vpaddd 1*16(TBL), X0, XFER | ||
401 | vmovdqa XFER, _XFER(%rsp) | ||
402 | FOUR_ROUNDS_AND_SCHED | ||
403 | |||
404 | vpaddd 2*16(TBL), X0, XFER | ||
405 | vmovdqa XFER, _XFER(%rsp) | ||
406 | FOUR_ROUNDS_AND_SCHED | ||
407 | |||
408 | vpaddd 3*16(TBL), X0, XFER | ||
409 | vmovdqa XFER, _XFER(%rsp) | ||
410 | add $4*16, TBL | ||
411 | FOUR_ROUNDS_AND_SCHED | ||
412 | |||
413 | sub $1, SRND | ||
414 | jne loop1 | ||
415 | |||
416 | mov $2, SRND | ||
417 | loop2: | ||
418 | vpaddd (TBL), X0, XFER | ||
419 | vmovdqa XFER, _XFER(%rsp) | ||
420 | DO_ROUND 0 | ||
421 | DO_ROUND 1 | ||
422 | DO_ROUND 2 | ||
423 | DO_ROUND 3 | ||
424 | |||
425 | vpaddd 1*16(TBL), X1, XFER | ||
426 | vmovdqa XFER, _XFER(%rsp) | ||
427 | add $2*16, TBL | ||
428 | DO_ROUND 0 | ||
429 | DO_ROUND 1 | ||
430 | DO_ROUND 2 | ||
431 | DO_ROUND 3 | ||
432 | |||
433 | vmovdqa X2, X0 | ||
434 | vmovdqa X3, X1 | ||
435 | |||
436 | sub $1, SRND | ||
437 | jne loop2 | ||
438 | |||
439 | addm (4*0)(CTX),a | ||
440 | addm (4*1)(CTX),b | ||
441 | addm (4*2)(CTX),c | ||
442 | addm (4*3)(CTX),d | ||
443 | addm (4*4)(CTX),e | ||
444 | addm (4*5)(CTX),f | ||
445 | addm (4*6)(CTX),g | ||
446 | addm (4*7)(CTX),h | ||
447 | |||
448 | mov _INP(%rsp), INP | ||
449 | add $64, INP | ||
450 | cmp _INP_END(%rsp), INP | ||
451 | jne loop0 | ||
452 | |||
453 | done_hash: | ||
454 | |||
455 | mov %r12, %rsp | ||
456 | |||
457 | popq %r12 | ||
458 | popq %r15 | ||
459 | popq %r14 | ||
460 | popq %r13 | ||
461 | popq %rbp | ||
462 | popq %rbx | ||
463 | ret | ||
464 | ENDPROC(sha256_transform_avx) | ||
465 | |||
466 | .data | ||
467 | .align 64 | ||
468 | K256: | ||
469 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
470 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
471 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
472 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
473 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
474 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
475 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
476 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
477 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
478 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
479 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
480 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
481 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
482 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
483 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
484 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
485 | |||
486 | PSHUFFLE_BYTE_FLIP_MASK: | ||
487 | .octa 0x0c0d0e0f08090a0b0405060700010203 | ||
488 | |||
489 | # shuffle xBxA -> 00BA | ||
490 | _SHUF_00BA: | ||
491 | .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 | ||
492 | |||
493 | # shuffle xDxC -> DC00 | ||
494 | _SHUF_DC00: | ||
495 | .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF | ||
496 | #endif | ||
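The DO_ROUND and FOUR_ROUNDS_AND_SCHED macros above carry the SHA-256 working variables in a..h and the temporaries in y0..y2. For reference, one compression round in plain C, where k_plus_w is the precomputed K[t] + W[t] value the assembly reads back from its _XFER stack slot (illustrative helper, not part of the patch):

    static inline unsigned int ror32(unsigned int x, int n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* s[0..7] = a,b,c,d,e,f,g,h */
    static void sha256_round(unsigned int s[8], unsigned int k_plus_w)
    {
            unsigned int a = s[0], b = s[1], c = s[2], d = s[3];
            unsigned int e = s[4], f = s[5], g = s[6], h = s[7];
            unsigned int S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
            unsigned int ch  = ((f ^ g) & e) ^ g;          /* == (e & f) ^ (~e & g) */
            unsigned int t1  = h + S1 + ch + k_plus_w;
            unsigned int S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
            unsigned int maj = ((a | c) & b) | (a & c);    /* == (a&b) ^ (a&c) ^ (b&c) */

            s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
            s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
    }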
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S new file mode 100644 index 000000000000..9e86944c539d --- /dev/null +++ b/arch/x86/crypto/sha256-avx2-asm.S | |||
@@ -0,0 +1,772 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-256 with AVX2 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
10 | # | ||
11 | # This software is available to you under a choice of one of two | ||
12 | # licenses. You may choose to be licensed under the terms of the GNU | ||
13 | # General Public License (GPL) Version 2, available from the file | ||
14 | # COPYING in the main directory of this source tree, or the | ||
15 | # OpenIB.org BSD license below: | ||
16 | # | ||
17 | # Redistribution and use in source and binary forms, with or | ||
18 | # without modification, are permitted provided that the following | ||
19 | # conditions are met: | ||
20 | # | ||
21 | # - Redistributions of source code must retain the above | ||
22 | # copyright notice, this list of conditions and the following | ||
23 | # disclaimer. | ||
24 | # | ||
25 | # - Redistributions in binary form must reproduce the above | ||
26 | # copyright notice, this list of conditions and the following | ||
27 | # disclaimer in the documentation and/or other materials | ||
28 | # provided with the distribution. | ||
29 | # | ||
30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
31 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
32 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
33 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
34 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
35 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
36 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
37 | # SOFTWARE. | ||
38 | # | ||
39 | ######################################################################## | ||
40 | # | ||
41 | # This code is described in an Intel White-Paper: | ||
42 | # "Fast SHA-256 Implementations on Intel Architecture Processors" | ||
43 | # | ||
44 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
45 | # and search for that title. | ||
46 | # | ||
47 | ######################################################################## | ||
48 | # This code schedules 2 blocks at a time, with 4 lanes per block | ||
49 | ######################################################################## | ||
50 | |||
51 | #ifdef CONFIG_AS_AVX2 | ||
52 | #include <linux/linkage.h> | ||
53 | |||
54 | ## assume buffers not aligned | ||
55 | #define VMOVDQ vmovdqu | ||
56 | |||
57 | ################################ Define Macros | ||
58 | |||
59 | # addm [mem], reg | ||
60 | # Add reg to mem using reg-mem add and store | ||
61 | .macro addm p1 p2 | ||
62 | add \p1, \p2 | ||
63 | mov \p2, \p1 | ||
64 | .endm | ||
65 | |||
66 | ################################ | ||
67 | |||
68 | X0 = %ymm4 | ||
69 | X1 = %ymm5 | ||
70 | X2 = %ymm6 | ||
71 | X3 = %ymm7 | ||
72 | |||
73 | # XMM versions of above | ||
74 | XWORD0 = %xmm4 | ||
75 | XWORD1 = %xmm5 | ||
76 | XWORD2 = %xmm6 | ||
77 | XWORD3 = %xmm7 | ||
78 | |||
79 | XTMP0 = %ymm0 | ||
80 | XTMP1 = %ymm1 | ||
81 | XTMP2 = %ymm2 | ||
82 | XTMP3 = %ymm3 | ||
83 | XTMP4 = %ymm8 | ||
84 | XFER = %ymm9 | ||
85 | XTMP5 = %ymm11 | ||
86 | |||
87 | SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA | ||
88 | SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 | ||
89 | BYTE_FLIP_MASK = %ymm13 | ||
90 | |||
91 | X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK | ||
92 | |||
93 | NUM_BLKS = %rdx # 3rd arg | ||
94 | CTX = %rsi # 2nd arg | ||
95 | INP = %rdi # 1st arg | ||
96 | c = %ecx | ||
97 | d = %r8d | ||
98 | e = %edx # clobbers NUM_BLKS | ||
99 | y3 = %edi # clobbers INP | ||
100 | |||
101 | |||
102 | TBL = %rbp | ||
103 | SRND = CTX # SRND is same register as CTX | ||
104 | |||
105 | a = %eax | ||
106 | b = %ebx | ||
107 | f = %r9d | ||
108 | g = %r10d | ||
109 | h = %r11d | ||
110 | old_h = %r11d | ||
111 | |||
112 | T1 = %r12d | ||
113 | y0 = %r13d | ||
114 | y1 = %r14d | ||
115 | y2 = %r15d | ||
116 | |||
117 | |||
118 | _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round | ||
119 | _XMM_SAVE_SIZE = 0 | ||
120 | _INP_END_SIZE = 8 | ||
121 | _INP_SIZE = 8 | ||
122 | _CTX_SIZE = 8 | ||
123 | _RSP_SIZE = 8 | ||
124 | |||
125 | _XFER = 0 | ||
126 | _XMM_SAVE = _XFER + _XFER_SIZE | ||
127 | _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE | ||
128 | _INP = _INP_END + _INP_END_SIZE | ||
129 | _CTX = _INP + _INP_SIZE | ||
130 | _RSP = _CTX + _CTX_SIZE | ||
131 | STACK_SIZE = _RSP + _RSP_SIZE | ||
132 | |||
133 | # rotate_Xs | ||
134 | # Rotate values of symbols X0...X3 | ||
135 | .macro rotate_Xs | ||
136 | X_ = X0 | ||
137 | X0 = X1 | ||
138 | X1 = X2 | ||
139 | X2 = X3 | ||
140 | X3 = X_ | ||
141 | .endm | ||
142 | |||
143 | # ROTATE_ARGS | ||
144 | # Rotate values of symbols a...h | ||
145 | .macro ROTATE_ARGS | ||
146 | old_h = h | ||
147 | TMP_ = h | ||
148 | h = g | ||
149 | g = f | ||
150 | f = e | ||
151 | e = d | ||
152 | d = c | ||
153 | c = b | ||
154 | b = a | ||
155 | a = TMP_ | ||
156 | .endm | ||
157 | |||
158 | .macro FOUR_ROUNDS_AND_SCHED disp | ||
159 | ################################### RND N + 0 ############################ | ||
160 | |||
161 | mov a, y3 # y3 = a # MAJA | ||
162 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
163 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
164 | |||
165 | addl \disp(%rsp, SRND), h # h = k + w + h # -- | ||
166 | or c, y3 # y3 = a|c # MAJA | ||
167 | vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] | ||
168 | mov f, y2 # y2 = f # CH | ||
169 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
170 | |||
171 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
172 | xor g, y2 # y2 = f^g # CH | ||
173 | vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 | ||
174 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
175 | |||
176 | and e, y2 # y2 = (f^g)&e # CH | ||
177 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
178 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
179 | add h, d # d = k + w + h + d # -- | ||
180 | |||
181 | and b, y3 # y3 = (a|c)&b # MAJA | ||
182 | vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] | ||
183 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
184 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
185 | |||
186 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
187 | vpsrld $7, XTMP1, XTMP2 | ||
188 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
189 | mov a, T1 # T1 = a # MAJB | ||
190 | and c, T1 # T1 = a&c # MAJB | ||
191 | |||
192 | add y0, y2 # y2 = S1 + CH # -- | ||
193 | vpslld $(32-7), XTMP1, XTMP3 | ||
194 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
195 | add y1, h # h = k + w + h + S0 # -- | ||
196 | |||
197 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
198 | vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 | ||
199 | |||
200 | vpsrld $18, XTMP1, XTMP2 | ||
201 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
202 | add y3, h # h = t1 + S0 + MAJ # -- | ||
203 | |||
204 | |||
205 | ROTATE_ARGS | ||
206 | |||
207 | ################################### RND N + 1 ############################ | ||
208 | |||
209 | mov a, y3 # y3 = a # MAJA | ||
210 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
211 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
212 | offset = \disp + 1*4 | ||
213 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
214 | or c, y3 # y3 = a|c # MAJA | ||
215 | |||
216 | |||
217 | vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 | ||
218 | mov f, y2 # y2 = f # CH | ||
219 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
220 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
221 | xor g, y2 # y2 = f^g # CH | ||
222 | |||
223 | |||
224 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
225 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
226 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
227 | and e, y2 # y2 = (f^g)&e # CH | ||
228 | add h, d # d = k + w + h + d # -- | ||
229 | |||
230 | vpslld $(32-18), XTMP1, XTMP1 | ||
231 | and b, y3 # y3 = (a|c)&b # MAJA | ||
232 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
233 | |||
234 | vpxor XTMP1, XTMP3, XTMP3 | ||
235 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
236 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
237 | |||
238 | vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 | ||
239 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
240 | mov a, T1 # T1 = a # MAJB | ||
241 | and c, T1 # T1 = a&c # MAJB | ||
242 | add y0, y2 # y2 = S1 + CH # -- | ||
243 | |||
244 | vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 | ||
245 | vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} | ||
246 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
247 | add y1, h # h = k + w + h + S0 # -- | ||
248 | |||
249 | vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 | ||
250 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
251 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
252 | add y3, h # h = t1 + S0 + MAJ # -- | ||
253 | |||
254 | vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} | ||
255 | |||
256 | |||
257 | ROTATE_ARGS | ||
258 | |||
259 | ################################### RND N + 2 ############################ | ||
260 | |||
261 | mov a, y3 # y3 = a # MAJA | ||
262 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
263 | offset = \disp + 2*4 | ||
264 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
265 | |||
266 | vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} | ||
267 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
268 | or c, y3 # y3 = a|c # MAJA | ||
269 | mov f, y2 # y2 = f # CH | ||
270 | xor g, y2 # y2 = f^g # CH | ||
271 | |||
272 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
273 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
274 | vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} | ||
275 | and e, y2 # y2 = (f^g)&e # CH | ||
276 | |||
277 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
278 | vpxor XTMP3, XTMP2, XTMP2 | ||
279 | add h, d # d = k + w + h + d # -- | ||
280 | and b, y3 # y3 = (a|c)&b # MAJA | ||
281 | |||
282 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
283 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
284 | vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} | ||
285 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
286 | |||
287 | vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} | ||
288 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
289 | rorx $2, a ,T1 # T1 = (a >> 2) # S0 | ||
290 | vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} | ||
291 | |||
292 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
293 | mov a, T1 # T1 = a # MAJB | ||
294 | and c, T1 # T1 = a&c # MAJB | ||
295 | add y0, y2 # y2 = S1 + CH # -- | ||
296 | vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} | ||
297 | |||
298 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
299 | add y1,h # h = k + w + h + S0 # -- | ||
300 | add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
301 | add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
302 | |||
303 | add y3,h # h = t1 + S0 + MAJ # -- | ||
304 | |||
305 | |||
306 | ROTATE_ARGS | ||
307 | |||
308 | ################################### RND N + 3 ############################ | ||
309 | |||
310 | mov a, y3 # y3 = a # MAJA | ||
311 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
312 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
313 | offset = \disp + 3*4 | ||
314 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
315 | or c, y3 # y3 = a|c # MAJA | ||
316 | |||
317 | |||
318 | vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} | ||
319 | mov f, y2 # y2 = f # CH | ||
320 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
321 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
322 | xor g, y2 # y2 = f^g # CH | ||
323 | |||
324 | |||
325 | vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} | ||
326 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
327 | and e, y2 # y2 = (f^g)&e # CH | ||
328 | add h, d # d = k + w + h + d # -- | ||
329 | and b, y3 # y3 = (a|c)&b # MAJA | ||
330 | |||
331 | vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} | ||
332 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
333 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
334 | |||
335 | vpxor XTMP3, XTMP2, XTMP2 | ||
336 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
337 | add y0, y2 # y2 = S1 + CH # -- | ||
338 | |||
339 | vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} | ||
340 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
341 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
342 | |||
343 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
344 | vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} | ||
345 | |||
346 | vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} | ||
347 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
348 | mov a, T1 # T1 = a # MAJB | ||
349 | and c, T1 # T1 = a&c # MAJB | ||
350 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
351 | |||
352 | add y1, h # h = k + w + h + S0 # -- | ||
353 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
354 | add y3, h # h = t1 + S0 + MAJ # -- | ||
355 | |||
356 | ROTATE_ARGS | ||
357 | rotate_Xs | ||
358 | .endm | ||
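The vector instructions interleaved into the macro above compute the SHA-256 message schedule four words at a time, and with ymm registers for two input blocks in parallel. The scalar recurrence being vectorized is, using the same ror32() helper as in the earlier sketch:

    /* W[0..15] holds one block's message words; fill in W[16..63] */
    static void sha256_schedule(unsigned int W[64])
    {
            int t;

            for (t = 16; t < 64; t++) {
                    unsigned int w15 = W[t - 15], w2 = W[t - 2];
                    unsigned int s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);
                    unsigned int s1 = ror32(w2, 17) ^ ror32(w2, 19)  ^ (w2 >> 10);

                    W[t] = W[t - 16] + s0 + W[t - 7] + s1;
            }
    }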
359 | |||
360 | .macro DO_4ROUNDS disp | ||
361 | ################################### RND N + 0 ########################### | ||
362 | |||
363 | mov f, y2 # y2 = f # CH | ||
364 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
365 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
366 | xor g, y2 # y2 = f^g # CH | ||
367 | |||
368 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
369 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
370 | and e, y2 # y2 = (f^g)&e # CH | ||
371 | |||
372 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
373 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
374 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
375 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
376 | mov a, y3 # y3 = a # MAJA | ||
377 | |||
378 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
379 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
380 | addl \disp(%rsp, SRND), h # h = k + w + h # -- | ||
381 | or c, y3 # y3 = a|c # MAJA | ||
382 | |||
383 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
384 | mov a, T1 # T1 = a # MAJB | ||
385 | and b, y3 # y3 = (a|c)&b # MAJA | ||
386 | and c, T1 # T1 = a&c # MAJB | ||
387 | add y0, y2 # y2 = S1 + CH # -- | ||
388 | |||
389 | |||
390 | add h, d # d = k + w + h + d # -- | ||
391 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
392 | add y1, h # h = k + w + h + S0 # -- | ||
393 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
394 | |||
395 | ROTATE_ARGS | ||
396 | |||
397 | ################################### RND N + 1 ########################### | ||
398 | |||
399 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
400 | mov f, y2 # y2 = f # CH | ||
401 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
402 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
403 | xor g, y2 # y2 = f^g # CH | ||
404 | |||
405 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
406 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
407 | and e, y2 # y2 = (f^g)&e # CH | ||
408 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
409 | |||
410 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
411 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
412 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
413 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
414 | mov a, y3 # y3 = a # MAJA | ||
415 | |||
416 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
417 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
418 | offset = 4*1 + \disp | ||
419 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
420 | or c, y3 # y3 = a|c # MAJA | ||
421 | |||
422 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
423 | mov a, T1 # T1 = a # MAJB | ||
424 | and b, y3 # y3 = (a|c)&b # MAJA | ||
425 | and c, T1 # T1 = a&c # MAJB | ||
426 | add y0, y2 # y2 = S1 + CH # -- | ||
427 | |||
428 | |||
429 | add h, d # d = k + w + h + d # -- | ||
430 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
431 | add y1, h # h = k + w + h + S0 # -- | ||
432 | |||
433 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
434 | |||
435 | ROTATE_ARGS | ||
436 | |||
437 | ################################### RND N + 2 ############################## | ||
438 | |||
439 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
440 | mov f, y2 # y2 = f # CH | ||
441 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
442 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
443 | xor g, y2 # y2 = f^g # CH | ||
444 | |||
445 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
446 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
447 | and e, y2 # y2 = (f^g)&e # CH | ||
448 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
449 | |||
450 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
451 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
452 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
453 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
454 | mov a, y3 # y3 = a # MAJA | ||
455 | |||
456 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
457 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
458 | offset = 4*2 + \disp | ||
459 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
460 | or c, y3 # y3 = a|c # MAJA | ||
461 | |||
462 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
463 | mov a, T1 # T1 = a # MAJB | ||
464 | and b, y3 # y3 = (a|c)&b # MAJA | ||
465 | and c, T1 # T1 = a&c # MAJB | ||
466 | add y0, y2 # y2 = S1 + CH # -- | ||
467 | |||
468 | |||
469 | add h, d # d = k + w + h + d # -- | ||
470 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
471 | add y1, h # h = k + w + h + S0 # -- | ||
472 | |||
473 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
474 | |||
475 | ROTATE_ARGS | ||
476 | |||
477 | ################################### RND N + 3 ########################### | ||
478 | |||
479 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
480 | mov f, y2 # y2 = f # CH | ||
481 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
482 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
483 | xor g, y2 # y2 = f^g # CH | ||
484 | |||
485 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
486 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
487 | and e, y2 # y2 = (f^g)&e # CH | ||
488 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
489 | |||
490 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
491 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
492 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
493 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
494 | mov a, y3 # y3 = a # MAJA | ||
495 | |||
496 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
497 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
498 | offset = 4*3 + \disp | ||
499 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
500 | or c, y3 # y3 = a|c # MAJA | ||
501 | |||
502 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
503 | mov a, T1 # T1 = a # MAJB | ||
504 | and b, y3 # y3 = (a|c)&b # MAJA | ||
505 | and c, T1 # T1 = a&c # MAJB | ||
506 | add y0, y2 # y2 = S1 + CH # -- | ||
507 | |||
508 | |||
509 | add h, d # d = k + w + h + d # -- | ||
510 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
511 | add y1, h # h = k + w + h + S0 # -- | ||
512 | |||
513 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
514 | |||
515 | |||
516 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
517 | |||
518 | add y3, h # h = t1 + S0 + MAJ # -- | ||
519 | |||
520 | ROTATE_ARGS | ||
521 | |||
522 | .endm | ||
523 | |||
524 | ######################################################################## | ||
525 | ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) | ||
526 | ## arg 1 : pointer to input data | ||
527 | ## arg 2 : pointer to digest | ||
528 | ## arg 3 : Num blocks | ||
529 | ######################################################################## | ||
530 | .text | ||
531 | ENTRY(sha256_transform_rorx) | ||
532 | .align 32 | ||
533 | pushq %rbx | ||
534 | pushq %rbp | ||
535 | pushq %r12 | ||
536 | pushq %r13 | ||
537 | pushq %r14 | ||
538 | pushq %r15 | ||
539 | |||
540 | mov %rsp, %rax | ||
541 | subq $STACK_SIZE, %rsp | ||
542 | and $-32, %rsp # align rsp to 32 byte boundary | ||
543 | mov %rax, _RSP(%rsp) | ||
544 | |||
545 | |||
546 | shl $6, NUM_BLKS # convert to bytes | ||
547 | jz done_hash | ||
548 | lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block | ||
549 | mov NUM_BLKS, _INP_END(%rsp) | ||
550 | |||
551 | cmp NUM_BLKS, INP | ||
552 | je only_one_block | ||
553 | |||
554 | ## load initial digest | ||
555 | mov (CTX), a | ||
556 | mov 4*1(CTX), b | ||
557 | mov 4*2(CTX), c | ||
558 | mov 4*3(CTX), d | ||
559 | mov 4*4(CTX), e | ||
560 | mov 4*5(CTX), f | ||
561 | mov 4*6(CTX), g | ||
562 | mov 4*7(CTX), h | ||
563 | |||
564 | vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
565 | vmovdqa _SHUF_00BA(%rip), SHUF_00BA | ||
566 | vmovdqa _SHUF_DC00(%rip), SHUF_DC00 | ||
567 | |||
568 | mov CTX, _CTX(%rsp) | ||
569 | |||
570 | loop0: | ||
571 | lea K256(%rip), TBL | ||
572 | |||
573 | ## Load first 16 dwords from two blocks | ||
574 | VMOVDQ 0*32(INP),XTMP0 | ||
575 | VMOVDQ 1*32(INP),XTMP1 | ||
576 | VMOVDQ 2*32(INP),XTMP2 | ||
577 | VMOVDQ 3*32(INP),XTMP3 | ||
578 | |||
579 | ## byte swap data | ||
580 | vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 | ||
581 | vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 | ||
582 | vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 | ||
583 | vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 | ||
584 | |||
585 | ## transpose data into high/low halves | ||
586 | vperm2i128 $0x20, XTMP2, XTMP0, X0 | ||
587 | vperm2i128 $0x31, XTMP2, XTMP0, X1 | ||
588 | vperm2i128 $0x20, XTMP3, XTMP1, X2 | ||
589 | vperm2i128 $0x31, XTMP3, XTMP1, X3 | ||
590 | |||
591 | last_block_enter: | ||
592 | add $64, INP | ||
593 | mov INP, _INP(%rsp) | ||
594 | |||
595 | ## schedule 48 input dwords, by doing 3 rounds of 12 each | ||
596 | xor SRND, SRND | ||
597 | |||
598 | .align 16 | ||
599 | loop1: | ||
600 | vpaddd 0*32(TBL, SRND), X0, XFER | ||
601 | vmovdqa XFER, 0*32+_XFER(%rsp, SRND) | ||
602 | FOUR_ROUNDS_AND_SCHED _XFER + 0*32 | ||
603 | |||
604 | vpaddd 1*32(TBL, SRND), X0, XFER | ||
605 | vmovdqa XFER, 1*32+_XFER(%rsp, SRND) | ||
606 | FOUR_ROUNDS_AND_SCHED _XFER + 1*32 | ||
607 | |||
608 | vpaddd 2*32(TBL, SRND), X0, XFER | ||
609 | vmovdqa XFER, 2*32+_XFER(%rsp, SRND) | ||
610 | FOUR_ROUNDS_AND_SCHED _XFER + 2*32 | ||
611 | |||
612 | vpaddd 3*32(TBL, SRND), X0, XFER | ||
613 | vmovdqa XFER, 3*32+_XFER(%rsp, SRND) | ||
614 | FOUR_ROUNDS_AND_SCHED _XFER + 3*32 | ||
615 | |||
616 | add $4*32, SRND | ||
617 | cmp $3*4*32, SRND | ||
618 | jb loop1 | ||
619 | |||
620 | loop2: | ||
621 | ## Do last 16 rounds with no scheduling | ||
622 | vpaddd 0*32(TBL, SRND), X0, XFER | ||
623 | vmovdqa XFER, 0*32+_XFER(%rsp, SRND) | ||
624 | DO_4ROUNDS _XFER + 0*32 | ||
625 | vpaddd 1*32(TBL, SRND), X1, XFER | ||
626 | vmovdqa XFER, 1*32+_XFER(%rsp, SRND) | ||
627 | DO_4ROUNDS _XFER + 1*32 | ||
628 | add $2*32, SRND | ||
629 | |||
630 | vmovdqa X2, X0 | ||
631 | vmovdqa X3, X1 | ||
632 | |||
633 | cmp $4*4*32, SRND | ||
634 | jb loop2 | ||
635 | |||
636 | mov _CTX(%rsp), CTX | ||
637 | mov _INP(%rsp), INP | ||
638 | |||
639 | addm (4*0)(CTX),a | ||
640 | addm (4*1)(CTX),b | ||
641 | addm (4*2)(CTX),c | ||
642 | addm (4*3)(CTX),d | ||
643 | addm (4*4)(CTX),e | ||
644 | addm (4*5)(CTX),f | ||
645 | addm (4*6)(CTX),g | ||
646 | addm (4*7)(CTX),h | ||
647 | |||
648 | cmp _INP_END(%rsp), INP | ||
649 | ja done_hash | ||
650 | |||
651 | #### Do second block using previously scheduled results | ||
652 | xor SRND, SRND | ||
653 | .align 16 | ||
654 | loop3: | ||
655 | DO_4ROUNDS _XFER + 0*32 + 16 | ||
656 | DO_4ROUNDS _XFER + 1*32 + 16 | ||
657 | add $2*32, SRND | ||
658 | cmp $4*4*32, SRND | ||
659 | jb loop3 | ||
660 | |||
661 | mov _CTX(%rsp), CTX | ||
662 | mov _INP(%rsp), INP | ||
663 | add $64, INP | ||
664 | |||
665 | addm (4*0)(CTX),a | ||
666 | addm (4*1)(CTX),b | ||
667 | addm (4*2)(CTX),c | ||
668 | addm (4*3)(CTX),d | ||
669 | addm (4*4)(CTX),e | ||
670 | addm (4*5)(CTX),f | ||
671 | addm (4*6)(CTX),g | ||
672 | addm (4*7)(CTX),h | ||
673 | |||
674 | cmp _INP_END(%rsp), INP | ||
675 | jb loop0 | ||
676 | ja done_hash | ||
677 | |||
678 | do_last_block: | ||
679 | #### do last block | ||
680 | lea K256(%rip), TBL | ||
681 | |||
682 | VMOVDQ 0*16(INP),XWORD0 | ||
683 | VMOVDQ 1*16(INP),XWORD1 | ||
684 | VMOVDQ 2*16(INP),XWORD2 | ||
685 | VMOVDQ 3*16(INP),XWORD3 | ||
686 | |||
687 | vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 | ||
688 | vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 | ||
689 | vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 | ||
690 | vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 | ||
691 | |||
692 | jmp last_block_enter | ||
693 | |||
694 | only_one_block: | ||
695 | |||
696 | ## load initial digest | ||
697 | mov (4*0)(CTX),a | ||
698 | mov (4*1)(CTX),b | ||
699 | mov (4*2)(CTX),c | ||
700 | mov (4*3)(CTX),d | ||
701 | mov (4*4)(CTX),e | ||
702 | mov (4*5)(CTX),f | ||
703 | mov (4*6)(CTX),g | ||
704 | mov (4*7)(CTX),h | ||
705 | |||
706 | vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
707 | vmovdqa _SHUF_00BA(%rip), SHUF_00BA | ||
708 | vmovdqa _SHUF_DC00(%rip), SHUF_DC00 | ||
709 | |||
710 | mov CTX, _CTX(%rsp) | ||
711 | jmp do_last_block | ||
712 | |||
713 | done_hash: | ||
714 | |||
715 | mov _RSP(%rsp), %rsp | ||
716 | |||
717 | popq %r15 | ||
718 | popq %r14 | ||
719 | popq %r13 | ||
720 | popq %r12 | ||
721 | popq %rbp | ||
722 | popq %rbx | ||
723 | ret | ||
724 | ENDPROC(sha256_transform_rorx) | ||
725 | |||
726 | .data | ||
727 | .align 64 | ||
728 | K256: | ||
729 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
730 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
731 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
732 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
733 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
734 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
735 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
736 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
737 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
738 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
739 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
740 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
741 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
742 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
743 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
744 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
745 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
746 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
747 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
748 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
749 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
750 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
751 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
752 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
753 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
754 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
755 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
756 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
757 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
758 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
759 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
760 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
761 | |||
762 | PSHUFFLE_BYTE_FLIP_MASK: | ||
763 | .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 | ||
764 | |||
765 | # shuffle xBxA -> 00BA | ||
766 | _SHUF_00BA: | ||
767 | .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 | ||
768 | |||
769 | # shuffle xDxC -> DC00 | ||
770 | _SHUF_DC00: | ||
771 | .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF | ||
772 | #endif | ||
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S new file mode 100644 index 000000000000..98d3c391da81 --- /dev/null +++ b/arch/x86/crypto/sha256-ssse3-asm.S | |||
@@ -0,0 +1,506 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-256 with SSSE3 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
10 | # | ||
11 | # This software is available to you under a choice of one of two | ||
12 | # licenses. You may choose to be licensed under the terms of the GNU | ||
13 | # General Public License (GPL) Version 2, available from the file | ||
14 | # COPYING in the main directory of this source tree, or the | ||
15 | # OpenIB.org BSD license below: | ||
16 | # | ||
17 | # Redistribution and use in source and binary forms, with or | ||
18 | # without modification, are permitted provided that the following | ||
19 | # conditions are met: | ||
20 | # | ||
21 | # - Redistributions of source code must retain the above | ||
22 | # copyright notice, this list of conditions and the following | ||
23 | # disclaimer. | ||
24 | # | ||
25 | # - Redistributions in binary form must reproduce the above | ||
26 | # copyright notice, this list of conditions and the following | ||
27 | # disclaimer in the documentation and/or other materials | ||
28 | # provided with the distribution. | ||
29 | # | ||
30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
31 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
32 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
33 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
34 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
35 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
36 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
37 | # SOFTWARE. | ||
38 | # | ||
39 | ######################################################################## | ||
40 | # | ||
41 | # This code is described in an Intel White-Paper: | ||
42 | # "Fast SHA-256 Implementations on Intel Architecture Processors" | ||
43 | # | ||
44 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
45 | # and search for that title. | ||
46 | # | ||
47 | ######################################################################## | ||
48 | |||
49 | #include <linux/linkage.h> | ||
50 | |||
51 | ## assume buffers not aligned | ||
52 | #define MOVDQ movdqu | ||
53 | |||
54 | ################################ Define Macros | ||
55 | |||
56 | # addm [mem], reg | ||
57 | # Add reg to mem using reg-mem add and store | ||
58 | .macro addm p1 p2 | ||
59 | add \p1, \p2 | ||
60 | mov \p2, \p1 | ||
61 | .endm | ||
62 | |||
63 | ################################ | ||
64 | |||
65 | # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask | ||
66 | # Load xmm with mem and byte swap each dword | ||
67 | .macro COPY_XMM_AND_BSWAP p1 p2 p3 | ||
68 | MOVDQ \p2, \p1 | ||
69 | pshufb \p3, \p1 | ||
70 | .endm | ||
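The PSHUFFLE_BYTE_FLIP_MASK used with pshufb here reverses the byte order within each 32-bit lane, so the big-endian SHA-256 message words come out in host order as part of the load. A minimal scalar sketch of the same effect, assuming only standard C plus the GCC/Clang bswap builtin (load_be32_words and src are illustrative names, not part of this patch):

	#include <stdint.h>
	#include <string.h>

	/* Scalar equivalent of COPY_XMM_AND_BSWAP for one 16-byte chunk. */
	static void load_be32_words(uint32_t w[4], const uint8_t *src)
	{
		int i;

		for (i = 0; i < 4; i++) {
			uint32_t v;

			memcpy(&v, src + 4 * i, sizeof(v)); /* MOVDQ: unaligned load */
			w[i] = __builtin_bswap32(v);        /* pshufb with the flip mask */
		}
	}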
71 | |||
72 | ################################ | ||
73 | |||
74 | X0 = %xmm4 | ||
75 | X1 = %xmm5 | ||
76 | X2 = %xmm6 | ||
77 | X3 = %xmm7 | ||
78 | |||
79 | XTMP0 = %xmm0 | ||
80 | XTMP1 = %xmm1 | ||
81 | XTMP2 = %xmm2 | ||
82 | XTMP3 = %xmm3 | ||
83 | XTMP4 = %xmm8 | ||
84 | XFER = %xmm9 | ||
85 | |||
86 | SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA | ||
87 | SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 | ||
88 | BYTE_FLIP_MASK = %xmm12 | ||
89 | |||
90 | NUM_BLKS = %rdx # 3rd arg | ||
91 | CTX = %rsi # 2nd arg | ||
92 | INP = %rdi # 1st arg | ||
93 | |||
94 | SRND = %rdi # clobbers INP | ||
95 | c = %ecx | ||
96 | d = %r8d | ||
97 | e = %edx | ||
98 | TBL = %rbp | ||
99 | a = %eax | ||
100 | b = %ebx | ||
101 | |||
102 | f = %r9d | ||
103 | g = %r10d | ||
104 | h = %r11d | ||
105 | |||
106 | y0 = %r13d | ||
107 | y1 = %r14d | ||
108 | y2 = %r15d | ||
109 | |||
110 | |||
111 | |||
112 | _INP_END_SIZE = 8 | ||
113 | _INP_SIZE = 8 | ||
114 | _XFER_SIZE = 8 | ||
115 | _XMM_SAVE_SIZE = 0 | ||
116 | |||
117 | _INP_END = 0 | ||
118 | _INP = _INP_END + _INP_END_SIZE | ||
119 | _XFER = _INP + _INP_SIZE | ||
120 | _XMM_SAVE = _XFER + _XFER_SIZE | ||
121 | STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE | ||
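With the sizes defined above (8 + 8 + 8 + 0 bytes) the offsets evaluate to _INP_END = 0, _INP = 8, _XFER = 16 and _XMM_SAVE = 24, so STACK_SIZE = 24: the frame holds the end-of-input pointer, the saved input pointer, and the XFER scratch slot for the current W[t]+K[t] values.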
122 | |||
123 | # rotate_Xs | ||
124 | # Rotate values of symbols X0...X3 | ||
125 | .macro rotate_Xs | ||
126 | X_ = X0 | ||
127 | X0 = X1 | ||
128 | X1 = X2 | ||
129 | X2 = X3 | ||
130 | X3 = X_ | ||
131 | .endm | ||
132 | |||
133 | # ROTATE_ARGS | ||
134 | # Rotate values of symbols a...h | ||
135 | .macro ROTATE_ARGS | ||
136 | TMP_ = h | ||
137 | h = g | ||
138 | g = f | ||
139 | f = e | ||
140 | e = d | ||
141 | d = c | ||
142 | c = b | ||
143 | b = a | ||
144 | a = TMP_ | ||
145 | .endm | ||
146 | |||
147 | .macro FOUR_ROUNDS_AND_SCHED | ||
148 | ## compute s0 four at a time and s1 two at a time | ||
149 | ## compute W[-16] + W[-7] 4 at a time | ||
150 | movdqa X3, XTMP0 | ||
151 | mov e, y0 # y0 = e | ||
152 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
153 | mov a, y1 # y1 = a | ||
154 | palignr $4, X2, XTMP0 # XTMP0 = W[-7] | ||
155 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
156 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
157 | mov f, y2 # y2 = f | ||
158 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
159 | movdqa X1, XTMP1 | ||
160 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
161 | xor g, y2 # y2 = f^g | ||
162 | paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] | ||
163 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
164 | and e, y2 # y2 = (f^g)&e | ||
165 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
166 | ## compute s0 | ||
167 | palignr $4, X0, XTMP1 # XTMP1 = W[-15] | ||
168 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
169 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
170 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
171 | movdqa XTMP1, XTMP2 # XTMP2 = W[-15] | ||
172 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
173 | add y0, y2 # y2 = S1 + CH | ||
174 | add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH | ||
175 | movdqa XTMP1, XTMP3 # XTMP3 = W[-15] | ||
176 | mov a, y0 # y0 = a | ||
177 | add y2, h # h = h + S1 + CH + k + w | ||
178 | mov a, y2 # y2 = a | ||
179 | pslld $(32-7), XTMP1 # | ||
180 | or c, y0 # y0 = a|c | ||
181 | add h, d # d = d + h + S1 + CH + k + w | ||
182 | and c, y2 # y2 = a&c | ||
183 | psrld $7, XTMP2 # | ||
184 | and b, y0 # y0 = (a|c)&b | ||
185 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
186 | por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 | ||
187 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
188 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
189 | # | ||
190 | ROTATE_ARGS # | ||
191 | movdqa XTMP3, XTMP2 # XTMP2 = W[-15] | ||
192 | mov e, y0 # y0 = e | ||
193 | mov a, y1 # y1 = a | ||
194 | movdqa XTMP3, XTMP4 # XTMP4 = W[-15] | ||
195 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
196 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
197 | mov f, y2 # y2 = f | ||
198 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
199 | pslld $(32-18), XTMP3 # | ||
200 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
201 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
202 | xor g, y2 # y2 = f^g | ||
203 | psrld $18, XTMP2 # | ||
204 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
205 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
206 | and e, y2 # y2 = (f^g)&e | ||
207 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
208 | pxor XTMP3, XTMP1 | ||
209 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
210 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
211 | psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 | ||
212 | add y0, y2 # y2 = S1 + CH | ||
213 | add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
214 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
215 | pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 | ||
216 | mov a, y0 # y0 = a | ||
217 | add y2, h # h = h + S1 + CH + k + w | ||
218 | mov a, y2 # y2 = a | ||
219 | pxor XTMP4, XTMP1 # XTMP1 = s0 | ||
220 | or c, y0 # y0 = a|c | ||
221 | add h, d # d = d + h + S1 + CH + k + w | ||
222 | and c, y2 # y2 = a&c | ||
223 | ## compute low s1 | ||
224 | pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} | ||
225 | and b, y0 # y0 = (a|c)&b | ||
226 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
227 | paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 | ||
228 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
229 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
230 | |||
231 | ROTATE_ARGS | ||
232 | movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} | ||
233 | mov e, y0 # y0 = e | ||
234 | mov a, y1 # y1 = a | ||
235 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
236 | movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} | ||
237 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
238 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
239 | mov f, y2 # y2 = f | ||
240 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
241 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
242 | psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} | ||
243 | xor g, y2 # y2 = f^g | ||
244 | psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} | ||
245 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
246 | and e, y2 # y2 = (f^g)&e | ||
247 | psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} | ||
248 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
249 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
250 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
251 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
252 | pxor XTMP3, XTMP2 | ||
253 | add y0, y2 # y2 = S1 + CH | ||
254 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
255 | add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
256 | pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} | ||
257 | mov a, y0 # y0 = a | ||
258 | add y2, h # h = h + S1 + CH + k + w | ||
259 | mov a, y2 # y2 = a | ||
260 | pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} | ||
261 | or c, y0 # y0 = a|c | ||
262 | add h, d # d = d + h + S1 + CH + k + w | ||
263 | and c, y2 # y2 = a&c | ||
264 | paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} | ||
265 | and b, y0 # y0 = (a|c)&b | ||
266 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
267 | ## compute high s1 | ||
268 | pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} | ||
269 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
270 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
271 | # | ||
272 | ROTATE_ARGS # | ||
273 | movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} | ||
274 | mov e, y0 # y0 = e | ||
275 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
276 | mov a, y1 # y1 = a | ||
277 | movdqa XTMP2, X0 # X0 = W[-2] {DDCC} | ||
278 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
279 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
280 | mov f, y2 # y2 = f | ||
281 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
282 | psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} | ||
283 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
284 | xor g, y2 # y2 = f^g | ||
285 | psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} | ||
286 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
287 | and e, y2 # y2 = (f^g)&e | ||
288 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
289 | psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} | ||
290 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
291 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
292 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
293 | pxor XTMP3, XTMP2 # | ||
294 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
295 | add y0, y2 # y2 = S1 + CH | ||
296 | add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
297 | pxor XTMP2, X0 # X0 = s1 {xDxC} | ||
298 | mov a, y0 # y0 = a | ||
299 | add y2, h # h = h + S1 + CH + k + w | ||
300 | mov a, y2 # y2 = a | ||
301 | pshufb SHUF_DC00, X0 # X0 = s1 {DC00} | ||
302 | or c, y0 # y0 = a|c | ||
303 | add h, d # d = d + h + S1 + CH + k + w | ||
304 | and c, y2 # y2 = a&c | ||
305 | paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} | ||
306 | and b, y0 # y0 = (a|c)&b | ||
307 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
308 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
309 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
310 | |||
311 | ROTATE_ARGS | ||
312 | rotate_Xs | ||
313 | .endm | ||
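For reference, the interleaved instructions above implement the standard FIPS 180-4 SHA-256 message schedule and round function. A plain-C restatement under that assumption (ror32, sched_w and sha256_round are illustrative names, not part of this patch):

	#include <stdint.h>

	static inline uint32_t ror32(uint32_t x, unsigned int n)
	{
		return (x >> n) | (x << (32 - n));
	}

	/* Message schedule: one new word from four earlier ones. */
	static uint32_t sched_w(uint32_t w2, uint32_t w7, uint32_t w15, uint32_t w16)
	{
		uint32_t s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);
		uint32_t s1 = ror32(w2, 17) ^ ror32(w2, 19) ^ (w2 >> 10);

		return s1 + w7 + s0 + w16;
	}

	/* One round; kw is the precomputed K[t] + W[t] kept in the XFER slot. */
	static void sha256_round(uint32_t st[8], uint32_t kw)
	{
		uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
		uint32_t e = st[4], f = st[5], g = st[6], h = st[7];
		uint32_t s1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
		uint32_t ch  = ((f ^ g) & e) ^ g;         /* same form as the asm */
		uint32_t s0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
		uint32_t maj = ((a | c) & b) | (a & c);   /* same form as the asm */
		uint32_t t1  = h + s1 + ch + kw;

		st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
		st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + s0 + maj;
	}

The three-operation forms ((f^g)&e)^g and ((a|c)&b)|(a&c) are bitwise identical to the textbook CH and MAJ and are what the register comments above describe.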
314 | |||
315 | ## input is [rsp + _XFER + %1 * 4] | ||
316 | .macro DO_ROUND round | ||
317 | mov e, y0 # y0 = e | ||
318 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
319 | mov a, y1 # y1 = a | ||
320 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
321 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
322 | mov f, y2 # y2 = f | ||
323 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
324 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
325 | xor g, y2 # y2 = f^g | ||
326 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
327 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
328 | and e, y2 # y2 = (f^g)&e | ||
329 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
330 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
331 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
332 | add y0, y2 # y2 = S1 + CH | ||
333 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
334 | offset = \round * 4 + _XFER | ||
335 | add offset(%rsp), y2 # y2 = k + w + S1 + CH | ||
336 | mov a, y0 # y0 = a | ||
337 | add y2, h # h = h + S1 + CH + k + w | ||
338 | mov a, y2 # y2 = a | ||
339 | or c, y0 # y0 = a|c | ||
340 | add h, d # d = d + h + S1 + CH + k + w | ||
341 | and c, y2 # y2 = a&c | ||
342 | and b, y0 # y0 = (a|c)&b | ||
343 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
344 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
345 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
346 | ROTATE_ARGS | ||
347 | .endm | ||
348 | |||
349 | ######################################################################## | ||
350 | ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) | ||
351 | ## arg 1 : pointer to input data | ||
352 | ## arg 2 : pointer to digest | ||
353 | ## arg 3 : Num blocks | ||
354 | ######################################################################## | ||
355 | .text | ||
356 | ENTRY(sha256_transform_ssse3) | ||
357 | .align 32 | ||
358 | pushq %rbx | ||
359 | pushq %rbp | ||
360 | pushq %r13 | ||
361 | pushq %r14 | ||
362 | pushq %r15 | ||
363 | pushq %r12 | ||
364 | |||
365 | mov %rsp, %r12 | ||
366 | subq $STACK_SIZE, %rsp | ||
367 | and $~15, %rsp | ||
368 | |||
369 | shl $6, NUM_BLKS # convert to bytes | ||
370 | jz done_hash | ||
371 | add INP, NUM_BLKS | ||
372 | mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data | ||
373 | |||
374 | ## load initial digest | ||
375 | mov 4*0(CTX), a | ||
376 | mov 4*1(CTX), b | ||
377 | mov 4*2(CTX), c | ||
378 | mov 4*3(CTX), d | ||
379 | mov 4*4(CTX), e | ||
380 | mov 4*5(CTX), f | ||
381 | mov 4*6(CTX), g | ||
382 | mov 4*7(CTX), h | ||
383 | |||
384 | movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
385 | movdqa _SHUF_00BA(%rip), SHUF_00BA | ||
386 | movdqa _SHUF_DC00(%rip), SHUF_DC00 | ||
387 | |||
388 | loop0: | ||
389 | lea K256(%rip), TBL | ||
390 | |||
391 | ## byte swap first 16 dwords | ||
392 | COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK | ||
393 | COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK | ||
394 | COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK | ||
395 | COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK | ||
396 | |||
397 | mov INP, _INP(%rsp) | ||
398 | |||
399 | ## schedule 48 input dwords, by doing 3 rounds of 16 each | ||
400 | mov $3, SRND | ||
401 | .align 16 | ||
402 | loop1: | ||
403 | movdqa (TBL), XFER | ||
404 | paddd X0, XFER | ||
405 | movdqa XFER, _XFER(%rsp) | ||
406 | FOUR_ROUNDS_AND_SCHED | ||
407 | |||
408 | movdqa 1*16(TBL), XFER | ||
409 | paddd X0, XFER | ||
410 | movdqa XFER, _XFER(%rsp) | ||
411 | FOUR_ROUNDS_AND_SCHED | ||
412 | |||
413 | movdqa 2*16(TBL), XFER | ||
414 | paddd X0, XFER | ||
415 | movdqa XFER, _XFER(%rsp) | ||
416 | FOUR_ROUNDS_AND_SCHED | ||
417 | |||
418 | movdqa 3*16(TBL), XFER | ||
419 | paddd X0, XFER | ||
420 | movdqa XFER, _XFER(%rsp) | ||
421 | add $4*16, TBL | ||
422 | FOUR_ROUNDS_AND_SCHED | ||
423 | |||
424 | sub $1, SRND | ||
425 | jne loop1 | ||
426 | |||
427 | mov $2, SRND | ||
428 | loop2: | ||
429 | paddd (TBL), X0 | ||
430 | movdqa X0, _XFER(%rsp) | ||
431 | DO_ROUND 0 | ||
432 | DO_ROUND 1 | ||
433 | DO_ROUND 2 | ||
434 | DO_ROUND 3 | ||
435 | paddd 1*16(TBL), X1 | ||
436 | movdqa X1, _XFER(%rsp) | ||
437 | add $2*16, TBL | ||
438 | DO_ROUND 0 | ||
439 | DO_ROUND 1 | ||
440 | DO_ROUND 2 | ||
441 | DO_ROUND 3 | ||
442 | |||
443 | movdqa X2, X0 | ||
444 | movdqa X3, X1 | ||
445 | |||
446 | sub $1, SRND | ||
447 | jne loop2 | ||
448 | |||
449 | addm (4*0)(CTX),a | ||
450 | addm (4*1)(CTX),b | ||
451 | addm (4*2)(CTX),c | ||
452 | addm (4*3)(CTX),d | ||
453 | addm (4*4)(CTX),e | ||
454 | addm (4*5)(CTX),f | ||
455 | addm (4*6)(CTX),g | ||
456 | addm (4*7)(CTX),h | ||
457 | |||
458 | mov _INP(%rsp), INP | ||
459 | add $64, INP | ||
460 | cmp _INP_END(%rsp), INP | ||
461 | jne loop0 | ||
462 | |||
463 | done_hash: | ||
464 | |||
465 | mov %r12, %rsp | ||
466 | |||
467 | popq %r12 | ||
468 | popq %r15 | ||
469 | popq %r14 | ||
470 | popq %r13 | ||
471 | popq %rbp | ||
472 | popq %rbx | ||
473 | |||
474 | ret | ||
475 | ENDPROC(sha256_transform_ssse3) | ||
476 | |||
477 | .data | ||
478 | .align 64 | ||
479 | K256: | ||
480 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
481 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
482 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
483 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
484 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
485 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
486 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
487 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
488 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
489 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
490 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
491 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
492 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
493 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
494 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
495 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
496 | |||
497 | PSHUFFLE_BYTE_FLIP_MASK: | ||
498 | .octa 0x0c0d0e0f08090a0b0405060700010203 | ||
499 | |||
500 | # shuffle xBxA -> 00BA | ||
501 | _SHUF_00BA: | ||
502 | .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 | ||
503 | |||
504 | # shuffle xDxC -> DC00 | ||
505 | _SHUF_DC00: | ||
506 | .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF | ||
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c new file mode 100644 index 000000000000..597d4da69656 --- /dev/null +++ b/arch/x86/crypto/sha256_ssse3_glue.c | |||
@@ -0,0 +1,275 @@ | |||
1 | /* | ||
2 | * Cryptographic API. | ||
3 | * | ||
4 | * Glue code for the SHA256 Secure Hash Algorithm assembler | ||
5 | * implementation using supplemental SSE3 / AVX / AVX2 instructions. | ||
6 | * | ||
7 | * This file is based on sha256_generic.c | ||
8 | * | ||
9 | * Copyright (C) 2013 Intel Corporation. | ||
10 | * | ||
11 | * Author: | ||
12 | * Tim Chen <tim.c.chen@linux.intel.com> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify it | ||
15 | * under the terms of the GNU General Public License as published by the Free | ||
16 | * Software Foundation; either version 2 of the License, or (at your option) | ||
17 | * any later version. | ||
18 | * | ||
19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
20 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
22 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
23 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
24 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
25 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
26 | * SOFTWARE. | ||
27 | */ | ||
28 | |||
29 | |||
30 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
31 | |||
32 | #include <crypto/internal/hash.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/module.h> | ||
35 | #include <linux/mm.h> | ||
36 | #include <linux/cryptohash.h> | ||
37 | #include <linux/types.h> | ||
38 | #include <crypto/sha.h> | ||
39 | #include <asm/byteorder.h> | ||
40 | #include <asm/i387.h> | ||
41 | #include <asm/xcr.h> | ||
42 | #include <asm/xsave.h> | ||
43 | #include <linux/string.h> | ||
44 | |||
45 | asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, | ||
46 | u64 rounds); | ||
47 | #ifdef CONFIG_AS_AVX | ||
48 | asmlinkage void sha256_transform_avx(const char *data, u32 *digest, | ||
49 | u64 rounds); | ||
50 | #endif | ||
51 | #ifdef CONFIG_AS_AVX2 | ||
52 | asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, | ||
53 | u64 rounds); | ||
54 | #endif | ||
55 | |||
56 | static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); | ||
57 | |||
58 | |||
59 | static int sha256_ssse3_init(struct shash_desc *desc) | ||
60 | { | ||
61 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
62 | |||
63 | sctx->state[0] = SHA256_H0; | ||
64 | sctx->state[1] = SHA256_H1; | ||
65 | sctx->state[2] = SHA256_H2; | ||
66 | sctx->state[3] = SHA256_H3; | ||
67 | sctx->state[4] = SHA256_H4; | ||
68 | sctx->state[5] = SHA256_H5; | ||
69 | sctx->state[6] = SHA256_H6; | ||
70 | sctx->state[7] = SHA256_H7; | ||
71 | sctx->count = 0; | ||
72 | |||
73 | return 0; | ||
74 | } | ||
75 | |||
76 | static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
77 | unsigned int len, unsigned int partial) | ||
78 | { | ||
79 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
80 | unsigned int done = 0; | ||
81 | |||
82 | sctx->count += len; | ||
83 | |||
84 | if (partial) { | ||
85 | done = SHA256_BLOCK_SIZE - partial; | ||
86 | memcpy(sctx->buf + partial, data, done); | ||
87 | sha256_transform_asm(sctx->buf, sctx->state, 1); | ||
88 | } | ||
89 | |||
90 | if (len - done >= SHA256_BLOCK_SIZE) { | ||
91 | const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; | ||
92 | |||
93 | sha256_transform_asm(data + done, sctx->state, (u64) rounds); | ||
94 | |||
95 | done += rounds * SHA256_BLOCK_SIZE; | ||
96 | } | ||
97 | |||
98 | memcpy(sctx->buf, data + done, len - done); | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
104 | unsigned int len) | ||
105 | { | ||
106 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
107 | unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; | ||
108 | int res; | ||
109 | |||
110 | /* Handle the fast case right here */ | ||
111 | if (partial + len < SHA256_BLOCK_SIZE) { | ||
112 | sctx->count += len; | ||
113 | memcpy(sctx->buf + partial, data, len); | ||
114 | |||
115 | return 0; | ||
116 | } | ||
117 | |||
118 | if (!irq_fpu_usable()) { | ||
119 | res = crypto_sha256_update(desc, data, len); | ||
120 | } else { | ||
121 | kernel_fpu_begin(); | ||
122 | res = __sha256_ssse3_update(desc, data, len, partial); | ||
123 | kernel_fpu_end(); | ||
124 | } | ||
125 | |||
126 | return res; | ||
127 | } | ||
128 | |||
129 | |||
130 | /* Add padding and return the message digest. */ | ||
131 | static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) | ||
132 | { | ||
133 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
134 | unsigned int i, index, padlen; | ||
135 | __be32 *dst = (__be32 *)out; | ||
136 | __be64 bits; | ||
137 | static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; | ||
138 | |||
139 | bits = cpu_to_be64(sctx->count << 3); | ||
140 | |||
141 | /* Pad out to 56 mod 64 and append length */ | ||
142 | index = sctx->count % SHA256_BLOCK_SIZE; | ||
143 | padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index); | ||
144 | |||
145 | if (!irq_fpu_usable()) { | ||
146 | crypto_sha256_update(desc, padding, padlen); | ||
147 | crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
148 | } else { | ||
149 | kernel_fpu_begin(); | ||
150 | /* We need to fill a whole block for __sha256_ssse3_update() */ | ||
151 | if (padlen <= 56) { | ||
152 | sctx->count += padlen; | ||
153 | memcpy(sctx->buf + index, padding, padlen); | ||
154 | } else { | ||
155 | __sha256_ssse3_update(desc, padding, padlen, index); | ||
156 | } | ||
157 | __sha256_ssse3_update(desc, (const u8 *)&bits, | ||
158 | sizeof(bits), 56); | ||
159 | kernel_fpu_end(); | ||
160 | } | ||
161 | |||
162 | /* Store state in digest */ | ||
163 | for (i = 0; i < 8; i++) | ||
164 | dst[i] = cpu_to_be32(sctx->state[i]); | ||
165 | |||
166 | /* Wipe context */ | ||
167 | memset(sctx, 0, sizeof(*sctx)); | ||
168 | |||
169 | return 0; | ||
170 | } | ||
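The padlen computation above always leaves exactly eight bytes for the big-endian bit count, so the padded message ends on a 64-byte boundary. A small illustrative helper for checking the arithmetic by hand (pad_len is not part of this patch):

	/* index = count % 64 */
	static unsigned int pad_len(unsigned int index)
	{
		return (index < 56) ? (56 - index) : ((64 + 56) - index);
	}

	/*
	 * pad_len(0) == 56, pad_len(55) == 1, pad_len(56) == 64, pad_len(63) == 57;
	 * in every case (index + pad_len(index) + 8) % 64 == 0.
	 */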
171 | |||
172 | static int sha256_ssse3_export(struct shash_desc *desc, void *out) | ||
173 | { | ||
174 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
175 | |||
176 | memcpy(out, sctx, sizeof(*sctx)); | ||
177 | |||
178 | return 0; | ||
179 | } | ||
180 | |||
181 | static int sha256_ssse3_import(struct shash_desc *desc, const void *in) | ||
182 | { | ||
183 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
184 | |||
185 | memcpy(sctx, in, sizeof(*sctx)); | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static struct shash_alg alg = { | ||
191 | .digestsize = SHA256_DIGEST_SIZE, | ||
192 | .init = sha256_ssse3_init, | ||
193 | .update = sha256_ssse3_update, | ||
194 | .final = sha256_ssse3_final, | ||
195 | .export = sha256_ssse3_export, | ||
196 | .import = sha256_ssse3_import, | ||
197 | .descsize = sizeof(struct sha256_state), | ||
198 | .statesize = sizeof(struct sha256_state), | ||
199 | .base = { | ||
200 | .cra_name = "sha256", | ||
201 | .cra_driver_name = "sha256-ssse3", | ||
202 | .cra_priority = 150, | ||
203 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
204 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
205 | .cra_module = THIS_MODULE, | ||
206 | } | ||
207 | }; | ||
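Because cra_priority is higher than that of the generic C sha256 implementation, ordinary users of the shash API pick this driver up transparently once it is registered. A minimal sketch of such a caller, assuming the usual shash helpers and with error handling trimmed (demo_sha256 and its parameters are illustrative, not part of this patch):

	#include <crypto/hash.h>
	#include <crypto/sha.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	static int demo_sha256(const u8 *buf, unsigned int len,
			       u8 out[SHA256_DIGEST_SIZE])
	{
		struct crypto_shash *tfm;
		struct shash_desc *desc;
		int err;

		tfm = crypto_alloc_shash("sha256", 0, 0); /* highest-priority impl */
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(tfm);
			return -ENOMEM;
		}
		desc->tfm = tfm;

		err = crypto_shash_digest(desc, buf, len, out);

		kfree(desc);
		crypto_free_shash(tfm);
		return err;
	}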
208 | |||
209 | #ifdef CONFIG_AS_AVX | ||
210 | static bool __init avx_usable(void) | ||
211 | { | ||
212 | u64 xcr0; | ||
213 | |||
214 | if (!cpu_has_avx || !cpu_has_osxsave) | ||
215 | return false; | ||
216 | |||
217 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
218 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
219 | pr_info("AVX detected but unusable.\n"); | ||
220 | |||
221 | return false; | ||
222 | } | ||
223 | |||
224 | return true; | ||
225 | } | ||
226 | #endif | ||
227 | |||
228 | static int __init sha256_ssse3_mod_init(void) | ||
229 | { | ||
230 | /* test for SSSE3 first */ | ||
231 | if (cpu_has_ssse3) | ||
232 | sha256_transform_asm = sha256_transform_ssse3; | ||
233 | |||
234 | #ifdef CONFIG_AS_AVX | ||
235 | /* allow AVX to override SSSE3, it's a little faster */ | ||
236 | if (avx_usable()) { | ||
237 | #ifdef CONFIG_AS_AVX2 | ||
238 | if (boot_cpu_has(X86_FEATURE_AVX2)) | ||
239 | sha256_transform_asm = sha256_transform_rorx; | ||
240 | else | ||
241 | #endif | ||
242 | sha256_transform_asm = sha256_transform_avx; | ||
243 | } | ||
244 | #endif | ||
245 | |||
246 | if (sha256_transform_asm) { | ||
247 | #ifdef CONFIG_AS_AVX | ||
248 | if (sha256_transform_asm == sha256_transform_avx) | ||
249 | pr_info("Using AVX optimized SHA-256 implementation\n"); | ||
250 | #ifdef CONFIG_AS_AVX2 | ||
251 | else if (sha256_transform_asm == sha256_transform_rorx) | ||
252 | pr_info("Using AVX2 optimized SHA-256 implementation\n"); | ||
253 | #endif | ||
254 | else | ||
255 | #endif | ||
256 | pr_info("Using SSSE3 optimized SHA-256 implementation\n"); | ||
257 | return crypto_register_shash(&alg); | ||
258 | } | ||
259 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | ||
260 | |||
261 | return -ENODEV; | ||
262 | } | ||
263 | |||
264 | static void __exit sha256_ssse3_mod_fini(void) | ||
265 | { | ||
266 | crypto_unregister_shash(&alg); | ||
267 | } | ||
268 | |||
269 | module_init(sha256_ssse3_mod_init); | ||
270 | module_exit(sha256_ssse3_mod_fini); | ||
271 | |||
272 | MODULE_LICENSE("GPL"); | ||
273 | MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | ||
274 | |||
275 | MODULE_ALIAS("sha256"); | ||
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S new file mode 100644 index 000000000000..974dde9bc6cd --- /dev/null +++ b/arch/x86/crypto/sha512-avx-asm.S | |||
@@ -0,0 +1,423 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-512 with AVX instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # David Cote <david.m.cote@intel.com> | ||
10 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | # | ||
12 | # This software is available to you under a choice of one of two | ||
13 | # licenses. You may choose to be licensed under the terms of the GNU | ||
14 | # General Public License (GPL) Version 2, available from the file | ||
15 | # COPYING in the main directory of this source tree, or the | ||
16 | # OpenIB.org BSD license below: | ||
17 | # | ||
18 | # Redistribution and use in source and binary forms, with or | ||
19 | # without modification, are permitted provided that the following | ||
20 | # conditions are met: | ||
21 | # | ||
22 | # - Redistributions of source code must retain the above | ||
23 | # copyright notice, this list of conditions and the following | ||
24 | # disclaimer. | ||
25 | # | ||
26 | # - Redistributions in binary form must reproduce the above | ||
27 | # copyright notice, this list of conditions and the following | ||
28 | # disclaimer in the documentation and/or other materials | ||
29 | # provided with the distribution. | ||
30 | # | ||
31 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
32 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
33 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
34 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
35 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
36 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
37 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
38 | # SOFTWARE. | ||
39 | # | ||
40 | ######################################################################## | ||
41 | # | ||
42 | # This code is described in an Intel White-Paper: | ||
43 | # "Fast SHA-512 Implementations on Intel Architecture Processors" | ||
44 | # | ||
45 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
46 | # and search for that title. | ||
47 | # | ||
48 | ######################################################################## | ||
49 | |||
50 | #ifdef CONFIG_AS_AVX | ||
51 | #include <linux/linkage.h> | ||
52 | |||
53 | .text | ||
54 | |||
55 | # Virtual Registers | ||
56 | # ARG1 | ||
57 | msg = %rdi | ||
58 | # ARG2 | ||
59 | digest = %rsi | ||
60 | # ARG3 | ||
61 | msglen = %rdx | ||
62 | T1 = %rcx | ||
63 | T2 = %r8 | ||
64 | a_64 = %r9 | ||
65 | b_64 = %r10 | ||
66 | c_64 = %r11 | ||
67 | d_64 = %r12 | ||
68 | e_64 = %r13 | ||
69 | f_64 = %r14 | ||
70 | g_64 = %r15 | ||
71 | h_64 = %rbx | ||
72 | tmp0 = %rax | ||
73 | |||
74 | # Local variables (stack frame) | ||
75 | |||
76 | # Message Schedule | ||
77 | W_SIZE = 80*8 | ||
78 | # W[t] + K[t] | W[t+1] + K[t+1] | ||
79 | WK_SIZE = 2*8 | ||
80 | RSPSAVE_SIZE = 1*8 | ||
81 | GPRSAVE_SIZE = 5*8 | ||
82 | |||
83 | frame_W = 0 | ||
84 | frame_WK = frame_W + W_SIZE | ||
85 | frame_RSPSAVE = frame_WK + WK_SIZE | ||
86 | frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE | ||
87 | frame_size = frame_GPRSAVE + GPRSAVE_SIZE | ||
88 | |||
89 | # Useful QWORD "arrays" for simpler memory references | ||
90 | # MSG, DIGEST, K_t, W_t are arrays | ||
91 | # WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even | ||
92 | |||
93 | # Input message (arg1) | ||
94 | #define MSG(i) 8*i(msg) | ||
95 | |||
96 | # Output Digest (arg2) | ||
97 | #define DIGEST(i) 8*i(digest) | ||
98 | |||
99 | # SHA Constants (static mem) | ||
100 | #define K_t(i) 8*i+K512(%rip) | ||
101 | |||
102 | # Message Schedule (stack frame) | ||
103 | #define W_t(i) 8*i+frame_W(%rsp) | ||
104 | |||
105 | # W[t]+K[t] (stack frame) | ||
106 | #define WK_2(i) 8*((i%2))+frame_WK(%rsp) | ||
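For example, WK_2(10) resolves to frame_WK + 0(%rsp) and WK_2(11) to frame_WK + 8(%rsp): the two precomputed W[t] + K[t] qwords for an even/odd round pair simply alternate between these two slots.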
107 | |||
108 | .macro RotateState | ||
109 | # Rotate symbols a..h right | ||
110 | TMP = h_64 | ||
111 | h_64 = g_64 | ||
112 | g_64 = f_64 | ||
113 | f_64 = e_64 | ||
114 | e_64 = d_64 | ||
115 | d_64 = c_64 | ||
116 | c_64 = b_64 | ||
117 | b_64 = a_64 | ||
118 | a_64 = TMP | ||
119 | .endm | ||
120 | |||
121 | .macro RORQ p1 p2 | ||
122 | # shld is faster than ror on Sandybridge | ||
123 | shld $(64-\p2), \p1, \p1 | ||
124 | .endm | ||
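The trick works because a right-rotate by n leaves the same bit pattern as a left-rotate by 64 - n, and shld with both operands set to the same register performs exactly that left-rotate. In C terms, purely for illustration:

	#include <stdint.h>

	/* ror64(x, n) equals the result of "shld $(64-n), x, x" for 0 < n < 64. */
	static inline uint64_t ror64(uint64_t x, unsigned int n)
	{
		return (x >> n) | (x << (64 - n));
	}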
125 | |||
126 | .macro SHA512_Round rnd | ||
127 | # Compute Round %%t | ||
128 | mov f_64, T1 # T1 = f | ||
129 | mov e_64, tmp0 # tmp = e | ||
130 | xor g_64, T1 # T1 = f ^ g | ||
131 | RORQ tmp0, 23 # 41 # tmp = e ror 23 | ||
132 | and e_64, T1 # T1 = (f ^ g) & e | ||
133 | xor e_64, tmp0 # tmp = (e ror 23) ^ e | ||
134 | xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) | ||
135 | idx = \rnd | ||
136 | add WK_2(idx), T1 # W[t] + K[t] from message scheduler | ||
137 | RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 | ||
138 | xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e | ||
139 | mov a_64, T2 # T2 = a | ||
140 | add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h | ||
141 | RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) | ||
142 | add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) | ||
143 | mov a_64, tmp0 # tmp = a | ||
144 | xor c_64, T2 # T2 = a ^ c | ||
145 | and c_64, tmp0 # tmp = a & c | ||
146 | and b_64, T2 # T2 = (a ^ c) & b | ||
147 | xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) | ||
148 | mov a_64, tmp0 # tmp = a | ||
149 | RORQ tmp0, 5 # 39 # tmp = a ror 5 | ||
150 | xor a_64, tmp0 # tmp = (a ror 5) ^ a | ||
151 | add T1, d_64 # e(next_state) = d + T1 | ||
152 | RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 | ||
153 | xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a | ||
154 | lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) | ||
155 | RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) | ||
156 | add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) + S0(a) | ||
157 | RotateState | ||
158 | .endm | ||
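The same round written out as scalar C, following FIPS 180-4; wk stands for the W[t] + K[t] value fetched from WK_2(t), and all identifiers are illustrative rather than part of this patch:

	#include <stdint.h>

	static inline uint64_t ror64(uint64_t x, unsigned int n)
	{
		return (x >> n) | (x << (64 - n));
	}

	static void sha512_round(uint64_t st[8], uint64_t wk)
	{
		uint64_t a = st[0], b = st[1], c = st[2], d = st[3];
		uint64_t e = st[4], f = st[5], g = st[6], h = st[7];
		uint64_t s1  = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
		uint64_t ch  = ((f ^ g) & e) ^ g;        /* same form as the asm */
		uint64_t s0  = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
		uint64_t maj = ((a ^ c) & b) ^ (a & c);  /* same form as the asm */
		uint64_t t1  = h + s1 + ch + wk;

		st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
		st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + s0 + maj;
	}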
159 | |||
160 | .macro SHA512_2Sched_2Round_avx rnd | ||
161 | # Compute rounds t-2 and t-1 | ||
162 | # Compute message schedule QWORDS t and t+1 | ||
163 | |||
164 | # Two rounds are computed based on the values for K[t-2]+W[t-2] and | ||
165 | # K[t-1]+W[t-1] which were previously stored at WK_2 by the message | ||
166 | # scheduler. | ||
167 | # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. | ||
168 | # They are then added to their respective SHA512 constants at | ||
169 | # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] | ||
170 | # For brevity, the comments following vectored instructions only refer to | ||
171 | # the first of a pair of QWORDS. | ||
172 | # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} | ||
173 | # The computation of the message schedule and the rounds are tightly | ||
174 | # stitched to take advantage of instruction-level parallelism. | ||
175 | |||
176 | idx = \rnd - 2 | ||
177 | vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] | ||
178 | idx = \rnd - 15 | ||
179 | vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] | ||
180 | mov f_64, T1 | ||
181 | vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 | ||
182 | mov e_64, tmp0 | ||
183 | vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 | ||
184 | xor g_64, T1 | ||
185 | RORQ tmp0, 23 # 41 | ||
186 | vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 | ||
187 | and e_64, T1 | ||
188 | xor e_64, tmp0 | ||
189 | vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 | ||
190 | xor g_64, T1 | ||
191 | idx = \rnd | ||
192 | add WK_2(idx), T1# | ||
193 | vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 | ||
194 | RORQ tmp0, 4 # 18 | ||
195 | vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 | ||
196 | xor e_64, tmp0 | ||
197 | mov a_64, T2 | ||
198 | add h_64, T1 | ||
199 | vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 | ||
200 | RORQ tmp0, 14 # 14 | ||
201 | add tmp0, T1 | ||
202 | vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 | ||
203 | mov a_64, tmp0 | ||
204 | xor c_64, T2 | ||
205 | vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 | ||
206 | and c_64, tmp0 | ||
207 | and b_64, T2 | ||
208 | vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 | ||
209 | xor tmp0, T2 | ||
210 | mov a_64, tmp0 | ||
211 | vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 | ||
212 | RORQ tmp0, 5 # 39 | ||
213 | vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 | ||
214 | xor a_64, tmp0 | ||
215 | add T1, d_64 | ||
216 | RORQ tmp0, 6 # 34 | ||
217 | xor a_64, tmp0 | ||
218 | vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ | ||
219 | # W[t-15]>>7 ^ W[t-15]<<63 | ||
220 | lea (T1, T2), h_64 | ||
221 | RORQ tmp0, 28 # 28 | ||
222 | vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<45 | ||
223 | add tmp0, h_64 | ||
224 | RotateState | ||
225 | vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ | ||
226 | # W[t-2]<<45 | ||
227 | mov f_64, T1 | ||
228 | vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) | ||
229 | mov e_64, tmp0 | ||
230 | xor g_64, T1 | ||
231 | idx = \rnd - 16 | ||
232 | vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] | ||
233 | idx = \rnd - 7 | ||
234 | vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] | ||
235 | RORQ tmp0, 23 # 41 | ||
236 | and e_64, T1 | ||
237 | xor e_64, tmp0 | ||
238 | xor g_64, T1 | ||
239 | vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 | ||
240 | idx = \rnd + 1 | ||
241 | add WK_2(idx), T1 | ||
242 | vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) | ||
243 | RORQ tmp0, 4 # 18 | ||
244 | vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) | ||
245 | xor e_64, tmp0 | ||
246 | vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + | ||
247 | # s0(W[t-15]) + W[t-16] | ||
248 | mov a_64, T2 | ||
249 | add h_64, T1 | ||
250 | RORQ tmp0, 14 # 14 | ||
251 | add tmp0, T1 | ||
252 | idx = \rnd | ||
253 | vmovdqa %xmm0, W_t(idx) # Store W[t] | ||
254 | vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] | ||
255 | vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds | ||
256 | mov a_64, tmp0 | ||
257 | xor c_64, T2 | ||
258 | and c_64, tmp0 | ||
259 | and b_64, T2 | ||
260 | xor tmp0, T2 | ||
261 | mov a_64, tmp0 | ||
262 | RORQ tmp0, 5 # 39 | ||
263 | xor a_64, tmp0 | ||
264 | add T1, d_64 | ||
265 | RORQ tmp0, 6 # 34 | ||
266 | xor a_64, tmp0 | ||
267 | lea (T1, T2), h_64 | ||
268 | RORQ tmp0, 28 # 28 | ||
269 | add tmp0, h_64 | ||
270 | RotateState | ||
271 | .endm | ||
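For reference, the scalar form of the schedule being interleaved here is W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], with s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7) and s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6); each vector rotate is realised as a right shift plus the compensating left shift (for example the $19 / $(64-19) pair above).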
272 | |||
273 | ######################################################################## | ||
274 | # void sha512_transform_avx(const void* M, void* D, u64 L) | ||
275 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | ||
276 | # The size of the message pointed to by M must be an integer multiple of SHA512 | ||
277 | # message blocks. | ||
278 | # L is the message length in SHA512 blocks | ||
279 | ######################################################################## | ||
280 | ENTRY(sha512_transform_avx) | ||
281 | cmp $0, msglen | ||
282 | je nowork | ||
283 | |||
284 | # Allocate Stack Space | ||
285 | mov %rsp, %rax | ||
286 | sub $frame_size, %rsp | ||
287 | and $~(0x20 - 1), %rsp | ||
288 | mov %rax, frame_RSPSAVE(%rsp) | ||
289 | |||
290 | # Save GPRs | ||
291 | mov %rbx, frame_GPRSAVE(%rsp) | ||
292 | mov %r12, frame_GPRSAVE +8*1(%rsp) | ||
293 | mov %r13, frame_GPRSAVE +8*2(%rsp) | ||
294 | mov %r14, frame_GPRSAVE +8*3(%rsp) | ||
295 | mov %r15, frame_GPRSAVE +8*4(%rsp) | ||
296 | |||
297 | updateblock: | ||
298 | |||
299 | # Load state variables | ||
300 | mov DIGEST(0), a_64 | ||
301 | mov DIGEST(1), b_64 | ||
302 | mov DIGEST(2), c_64 | ||
303 | mov DIGEST(3), d_64 | ||
304 | mov DIGEST(4), e_64 | ||
305 | mov DIGEST(5), f_64 | ||
306 | mov DIGEST(6), g_64 | ||
307 | mov DIGEST(7), h_64 | ||
308 | |||
309 | t = 0 | ||
310 | .rept 80/2 + 1 | ||
311 | # (80 rounds) / (2 rounds/iteration) + (1 iteration) | ||
312 | # +1 iteration because the scheduler leads hashing by 1 iteration | ||
313 | .if t < 2 | ||
314 | # BSWAP 2 QWORDS | ||
315 | vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 | ||
316 | vmovdqu MSG(t), %xmm0 | ||
317 | vpshufb %xmm1, %xmm0, %xmm0 # BSWAP | ||
318 | vmovdqa %xmm0, W_t(t) # Store Scheduled Pair | ||
319 | vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] | ||
320 | vmovdqa %xmm0, WK_2(t) # Store into WK for rounds | ||
321 | .elseif t < 16 | ||
322 | # BSWAP 2 QWORDS# Compute 2 Rounds | ||
323 | vmovdqu MSG(t), %xmm0 | ||
324 | vpshufb %xmm1, %xmm0, %xmm0 # BSWAP | ||
325 | SHA512_Round t-2 # Round t-2 | ||
326 | vmovdqa %xmm0, W_t(t) # Store Scheduled Pair | ||
327 | vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] | ||
328 | SHA512_Round t-1 # Round t-1 | ||
329 | vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK | ||
330 | .elseif t < 79 | ||
331 | # Schedule 2 QWORDS# Compute 2 Rounds | ||
332 | SHA512_2Sched_2Round_avx t | ||
333 | .else | ||
334 | # Compute 2 Rounds | ||
335 | SHA512_Round t-2 | ||
336 | SHA512_Round t-1 | ||
337 | .endif | ||
338 | t = t+2 | ||
339 | .endr | ||
340 | |||
341 | # Update digest | ||
342 | add a_64, DIGEST(0) | ||
343 | add b_64, DIGEST(1) | ||
344 | add c_64, DIGEST(2) | ||
345 | add d_64, DIGEST(3) | ||
346 | add e_64, DIGEST(4) | ||
347 | add f_64, DIGEST(5) | ||
348 | add g_64, DIGEST(6) | ||
349 | add h_64, DIGEST(7) | ||
350 | |||
351 | # Advance to next message block | ||
352 | add $16*8, msg | ||
353 | dec msglen | ||
354 | jnz updateblock | ||
355 | |||
356 | # Restore GPRs | ||
357 | mov frame_GPRSAVE(%rsp), %rbx | ||
358 | mov frame_GPRSAVE +8*1(%rsp), %r12 | ||
359 | mov frame_GPRSAVE +8*2(%rsp), %r13 | ||
360 | mov frame_GPRSAVE +8*3(%rsp), %r14 | ||
361 | mov frame_GPRSAVE +8*4(%rsp), %r15 | ||
362 | |||
363 | # Restore Stack Pointer | ||
364 | mov frame_RSPSAVE(%rsp), %rsp | ||
365 | |||
366 | nowork: | ||
367 | ret | ||
368 | ENDPROC(sha512_transform_avx) | ||
369 | |||
370 | ######################################################################## | ||
371 | ### Binary Data | ||
372 | |||
373 | .data | ||
374 | |||
375 | .align 16 | ||
376 | |||
377 | # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. | ||
378 | XMM_QWORD_BSWAP: | ||
379 | .octa 0x08090a0b0c0d0e0f0001020304050607 | ||
380 | |||
381 | # K[t] used in SHA512 hashing | ||
382 | K512: | ||
383 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd | ||
384 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc | ||
385 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 | ||
386 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 | ||
387 | .quad 0xd807aa98a3030242,0x12835b0145706fbe | ||
388 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 | ||
389 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 | ||
390 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 | ||
391 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 | ||
392 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 | ||
393 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 | ||
394 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 | ||
395 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 | ||
396 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 | ||
397 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 | ||
398 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 | ||
399 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 | ||
400 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df | ||
401 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 | ||
402 | .quad 0x81c2c92e47edaee6,0x92722c851482353b | ||
403 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 | ||
404 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 | ||
405 | .quad 0xd192e819d6ef5218,0xd69906245565a910 | ||
406 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 | ||
407 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 | ||
408 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 | ||
409 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb | ||
410 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 | ||
411 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 | ||
412 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec | ||
413 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 | ||
414 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b | ||
415 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 | ||
416 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | ||
417 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 | ||
418 | .quad 0x113f9804bef90dae,0x1b710b35131c471b | ||
419 | .quad 0x28db77f523047d84,0x32caab7b40c72493 | ||
420 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | ||
421 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | ||
422 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | ||
423 | #endif | ||
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S new file mode 100644 index 000000000000..568b96105f5c --- /dev/null +++ b/arch/x86/crypto/sha512-avx2-asm.S | |||
@@ -0,0 +1,743 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-512 with AVX2 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # David Cote <david.m.cote@intel.com> | ||
10 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | # | ||
12 | # This software is available to you under a choice of one of two | ||
13 | # licenses. You may choose to be licensed under the terms of the GNU | ||
14 | # General Public License (GPL) Version 2, available from the file | ||
15 | # COPYING in the main directory of this source tree, or the | ||
16 | # OpenIB.org BSD license below: | ||
17 | # | ||
18 | # Redistribution and use in source and binary forms, with or | ||
19 | # without modification, are permitted provided that the following | ||
20 | # conditions are met: | ||
21 | # | ||
22 | # - Redistributions of source code must retain the above | ||
23 | # copyright notice, this list of conditions and the following | ||
24 | # disclaimer. | ||
25 | # | ||
26 | # - Redistributions in binary form must reproduce the above | ||
27 | # copyright notice, this list of conditions and the following | ||
28 | # disclaimer in the documentation and/or other materials | ||
29 | # provided with the distribution. | ||
30 | # | ||
31 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
32 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
33 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
34 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
35 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
36 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
37 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
38 | # SOFTWARE. | ||
39 | # | ||
40 | ######################################################################## | ||
41 | # | ||
42 | # This code is described in an Intel White-Paper: | ||
43 | # "Fast SHA-512 Implementations on Intel Architecture Processors" | ||
44 | # | ||
45 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
46 | # and search for that title. | ||
47 | # | ||
48 | ######################################################################## | ||
49 | # This code schedules 1 block at a time, with 4 lanes per block | ||
50 | ######################################################################## | ||
51 | |||
52 | #ifdef CONFIG_AS_AVX2 | ||
53 | #include <linux/linkage.h> | ||
54 | |||
55 | .text | ||
56 | |||
57 | # Virtual Registers | ||
58 | Y_0 = %ymm4 | ||
59 | Y_1 = %ymm5 | ||
60 | Y_2 = %ymm6 | ||
61 | Y_3 = %ymm7 | ||
62 | |||
63 | YTMP0 = %ymm0 | ||
64 | YTMP1 = %ymm1 | ||
65 | YTMP2 = %ymm2 | ||
66 | YTMP3 = %ymm3 | ||
67 | YTMP4 = %ymm8 | ||
68 | XFER = YTMP0 | ||
69 | |||
70 | BYTE_FLIP_MASK = %ymm9 | ||
71 | |||
72 | # 1st arg | ||
73 | INP = %rdi | ||
74 | # 2nd arg | ||
75 | CTX = %rsi | ||
76 | # 3rd arg | ||
77 | NUM_BLKS = %rdx | ||
78 | |||
79 | c = %rcx | ||
80 | d = %r8 | ||
81 | e = %rdx | ||
82 | y3 = %rdi | ||
83 | |||
84 | TBL = %rbp | ||
85 | |||
86 | a = %rax | ||
87 | b = %rbx | ||
88 | |||
89 | f = %r9 | ||
90 | g = %r10 | ||
91 | h = %r11 | ||
92 | old_h = %r11 | ||
93 | |||
94 | T1 = %r12 | ||
95 | y0 = %r13 | ||
96 | y1 = %r14 | ||
97 | y2 = %r15 | ||
98 | |||
99 | y4 = %r12 | ||
100 | |||
101 | # Local variables (stack frame) | ||
102 | XFER_SIZE = 4*8 | ||
103 | SRND_SIZE = 1*8 | ||
104 | INP_SIZE = 1*8 | ||
105 | INPEND_SIZE = 1*8 | ||
106 | RSPSAVE_SIZE = 1*8 | ||
107 | GPRSAVE_SIZE = 6*8 | ||
108 | |||
109 | frame_XFER = 0 | ||
110 | frame_SRND = frame_XFER + XFER_SIZE | ||
111 | frame_INP = frame_SRND + SRND_SIZE | ||
112 | frame_INPEND = frame_INP + INP_SIZE | ||
113 | frame_RSPSAVE = frame_INPEND + INPEND_SIZE | ||
114 | frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE | ||
115 | frame_size = frame_GPRSAVE + GPRSAVE_SIZE | ||
116 | |||
117 | ## assume buffers not aligned | ||
118 | #define VMOVDQ vmovdqu | ||
119 | |||
120 | # addm [mem], reg | ||
121 | # Add reg to mem using reg-mem add and store | ||
122 | .macro addm p1 p2 | ||
123 | add \p1, \p2 | ||
124 | mov \p2, \p1 | ||
125 | .endm | ||
126 | |||
127 | |||
128 | # COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask | ||
129 | # Load ymm with mem and byte swap each dword | ||
130 | .macro COPY_YMM_AND_BSWAP p1 p2 p3 | ||
131 | VMOVDQ \p2, \p1 | ||
132 | vpshufb \p3, \p1, \p1 | ||
133 | .endm | ||
134 | # rotate_Ys | ||
135 | # Rotate values of symbols Y0...Y3 | ||
136 | .macro rotate_Ys | ||
137 | Y_ = Y_0 | ||
138 | Y_0 = Y_1 | ||
139 | Y_1 = Y_2 | ||
140 | Y_2 = Y_3 | ||
141 | Y_3 = Y_ | ||
142 | .endm | ||
143 | |||
144 | # RotateState | ||
145 | .macro RotateState | ||
146 | # Rotate symbols a..h right | ||
147 | old_h = h | ||
148 | TMP_ = h | ||
149 | h = g | ||
150 | g = f | ||
151 | f = e | ||
152 | e = d | ||
153 | d = c | ||
154 | c = b | ||
155 | b = a | ||
156 | a = TMP_ | ||
157 | .endm | ||
158 | |||
159 | # macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL | ||
160 | # YDST = {YSRC1, YSRC2} >> RVAL*8 | ||
161 | .macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL | ||
162 | vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} | ||
163 | vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 | ||
164 | .endm | ||
165 | |||
166 | .macro FOUR_ROUNDS_AND_SCHED | ||
167 | ################################### RND N + 0 ######################################### | ||
168 | |||
169 | # Extract w[t-7] | ||
170 | MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] | ||
171 | # Calculate w[t-16] + w[t-7] | ||
172 | vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] | ||
173 | # Extract w[t-15] | ||
174 | MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] | ||
175 | |||
176 | # Calculate sigma0 | ||
177 | |||
178 | # Calculate w[t-15] ror 1 | ||
179 | vpsrlq $1, YTMP1, YTMP2 | ||
180 | vpsllq $(64-1), YTMP1, YTMP3 | ||
181 | vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 | ||
182 | # Calculate w[t-15] shr 7 | ||
183 | vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 | ||
184 | |||
185 | mov a, y3 # y3 = a # MAJA | ||
186 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
187 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
188 | add frame_XFER(%rsp),h # h = k + w + h # -- | ||
189 | or c, y3 # y3 = a|c # MAJA | ||
190 | mov f, y2 # y2 = f # CH | ||
191 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
192 | |||
193 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
194 | xor g, y2 # y2 = f^g # CH | ||
195 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
196 | |||
197 | and e, y2 # y2 = (f^g)&e # CH | ||
198 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
199 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
200 | add h, d # d = k + w + h + d # -- | ||
201 | |||
202 | and b, y3 # y3 = (a|c)&b # MAJA | ||
203 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
204 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
205 | |||
206 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
207 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
208 | mov a, T1 # T1 = a # MAJB | ||
209 | and c, T1 # T1 = a&c # MAJB | ||
210 | |||
211 | add y0, y2 # y2 = S1 + CH # -- | ||
212 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
213 | add y1, h # h = k + w + h + S0 # -- | ||
214 | |||
215 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
216 | |||
217 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
218 | add y3, h # h = t1 + S0 + MAJ # -- | ||
219 | |||
220 | RotateState | ||
221 | |||
222 | ################################### RND N + 1 ######################################### | ||
223 | |||
224 | # Calculate w[t-15] ror 8 | ||
225 | vpsrlq $8, YTMP1, YTMP2 | ||
226 | vpsllq $(64-8), YTMP1, YTMP1 | ||
227 | vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 | ||
228 | # XOR the three components | ||
229 | vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 | ||
230 | vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 | ||
231 | |||
232 | |||
233 | # Add three components, w[t-16], w[t-7] and sigma0 | ||
234 | vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 | ||
235 | # Move to appropriate lanes for calculating w[16] and w[17] | ||
236 | vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} | ||
237 | # Move to appropriate lanes for calculating w[18] and w[19] | ||
238 | vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} | ||
239 | |||
240 | # Calculate w[16] and w[17] in both 128 bit lanes | ||
241 | |||
242 | # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes | ||
243 | vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} | ||
244 | vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} | ||
245 | |||
246 | |||
247 | mov a, y3 # y3 = a # MAJA | ||
248 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
249 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
250 | add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- | ||
251 | or c, y3 # y3 = a|c # MAJA | ||
252 | |||
253 | |||
254 | mov f, y2 # y2 = f # CH | ||
255 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
256 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
257 | xor g, y2 # y2 = f^g # CH | ||
258 | |||
259 | |||
260 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
261 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
262 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
263 | and e, y2 # y2 = (f^g)&e # CH | ||
264 | add h, d # d = k + w + h + d # -- | ||
265 | |||
266 | and b, y3 # y3 = (a|c)&b # MAJA | ||
267 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
268 | |||
269 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
270 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
271 | |||
272 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
273 | mov a, T1 # T1 = a # MAJB | ||
274 | and c, T1 # T1 = a&c # MAJB | ||
275 | add y0, y2 # y2 = S1 + CH # -- | ||
276 | |||
277 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
278 | add y1, h # h = k + w + h + S0 # -- | ||
279 | |||
280 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
281 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
282 | add y3, h # h = t1 + S0 + MAJ # -- | ||
283 | |||
284 | RotateState | ||
285 | |||
286 | |||
287 | ################################### RND N + 2 ######################################### | ||
288 | |||
289 | vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} | ||
290 | vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} | ||
291 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} | ||
292 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} | ||
293 | vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} | ||
294 | vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} | ||
295 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} | ||
296 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ | ||
297 | # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} | ||
298 | |||
299 | # Add sigma1 to the other components to get w[16] and w[17] | ||
300 | vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} | ||
301 | |||
302 | # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane | ||
303 | vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} | ||
304 | |||
305 | mov a, y3 # y3 = a # MAJA | ||
306 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
307 | add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- | ||
308 | |||
309 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
310 | or c, y3 # y3 = a|c # MAJA | ||
311 | mov f, y2 # y2 = f # CH | ||
312 | xor g, y2 # y2 = f^g # CH | ||
313 | |||
314 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
315 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
316 | and e, y2 # y2 = (f^g)&e # CH | ||
317 | |||
318 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
319 | add h, d # d = k + w + h + d # -- | ||
320 | and b, y3 # y3 = (a|c)&b # MAJA | ||
321 | |||
322 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
323 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
324 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
325 | |||
326 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
327 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
328 | |||
329 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
330 | mov a, T1 # T1 = a # MAJB | ||
331 | and c, T1 # T1 = a&c # MAJB | ||
332 | add y0, y2 # y2 = S1 + CH # -- | ||
333 | |||
334 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
335 | add y1, h # h = k + w + h + S0 # -- | ||
336 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
337 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
338 | |||
339 | add y3, h # h = t1 + S0 + MAJ # -- | ||
340 | |||
341 | RotateState | ||
342 | |||
343 | ################################### RND N + 3 ######################################### | ||
344 | |||
345 | vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} | ||
346 | vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} | ||
347 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} | ||
348 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} | ||
349 | vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} | ||
350 | vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} | ||
351 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} | ||
352 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ | ||
353 | # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} | ||
354 | |||
355 | # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] | ||
356 | # to newly calculated sigma1 to get w[18] and w[19] | ||
357 | vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} | ||
358 | |||
359 | # Form w[19], w[18], w[17], w[16] | ||
360 | vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} | ||
361 | |||
362 | mov a, y3 # y3 = a # MAJA | ||
363 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
364 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
365 | add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- | ||
366 | or c, y3 # y3 = a|c # MAJA | ||
367 | |||
368 | |||
369 | mov f, y2 # y2 = f # CH | ||
370 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
371 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
372 | xor g, y2 # y2 = f^g # CH | ||
373 | |||
374 | |||
375 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
376 | and e, y2 # y2 = (f^g)&e # CH | ||
377 | add h, d # d = k + w + h + d # -- | ||
378 | and b, y3 # y3 = (a|c)&b # MAJA | ||
379 | |||
380 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
381 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
382 | |||
383 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
384 | add y0, y2 # y2 = S1 + CH # -- | ||
385 | |||
386 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
387 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
388 | |||
389 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
390 | |||
391 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
392 | mov a, T1 # T1 = a # MAJB | ||
393 | and c, T1 # T1 = a&c # MAJB | ||
394 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
395 | |||
396 | add y1, h # h = k + w + h + S0 # -- | ||
397 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
398 | add y3, h # h = t1 + S0 + MAJ # -- | ||
399 | |||
400 | RotateState | ||
401 | |||
402 | rotate_Ys | ||
403 | .endm | ||
404 | |||
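# DO_4ROUNDS: four rounds of the compression function without message
# scheduling.  To shorten the dependency chain, the additions that complete
# h for one round are deferred into the following round via old_h (after
# RotateState, old_h names the same register as the new a); only the last
# round of the group finishes h within the round itself.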
405 | .macro DO_4ROUNDS | ||
406 | |||
407 | ################################### RND N + 0 ######################################### | ||
408 | |||
409 | mov f, y2 # y2 = f # CH | ||
410 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
411 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
412 | xor g, y2 # y2 = f^g # CH | ||
413 | |||
414 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
415 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
416 | and e, y2 # y2 = (f^g)&e # CH | ||
417 | |||
418 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
419 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
420 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
421 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
422 | mov a, y3 # y3 = a # MAJA | ||
423 | |||
424 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
425 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
426 | add frame_XFER(%rsp), h # h = k + w + h # -- | ||
427 | or c, y3 # y3 = a|c # MAJA | ||
428 | |||
429 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
430 | mov a, T1 # T1 = a # MAJB | ||
431 | and b, y3 # y3 = (a|c)&b # MAJA | ||
432 | and c, T1 # T1 = a&c # MAJB | ||
433 | add y0, y2 # y2 = S1 + CH # -- | ||
434 | |||
435 | add h, d # d = k + w + h + d # -- | ||
436 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
437 | add y1, h # h = k + w + h + S0 # -- | ||
438 | |||
439 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
440 | |||
441 | RotateState | ||
442 | |||
443 | ################################### RND N + 1 ######################################### | ||
444 | |||
445 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
446 | mov f, y2 # y2 = f # CH | ||
447 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
448 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
449 | xor g, y2 # y2 = f^g # CH | ||
450 | |||
451 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
452 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
453 | and e, y2 # y2 = (f^g)&e # CH | ||
454 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
455 | |||
456 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
457 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
458 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
459 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
460 | mov a, y3 # y3 = a # MAJA | ||
461 | |||
462 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
463 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
464 | add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- | ||
465 | or c, y3 # y3 = a|c # MAJA | ||
466 | |||
467 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
468 | mov a, T1 # T1 = a # MAJB | ||
469 | and b, y3 # y3 = (a|c)&b # MAJA | ||
470 | and c, T1 # T1 = a&c # MAJB | ||
471 | add y0, y2 # y2 = S1 + CH # -- | ||
472 | |||
473 | add h, d # d = k + w + h + d # -- | ||
474 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
475 | add y1, h # h = k + w + h + S0 # -- | ||
476 | |||
477 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
478 | |||
479 | RotateState | ||
480 | |||
481 | ################################### RND N + 2 ######################################### | ||
482 | |||
483 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
484 | mov f, y2 # y2 = f # CH | ||
485 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
486 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
487 | xor g, y2 # y2 = f^g # CH | ||
488 | |||
489 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
490 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
491 | and e, y2 # y2 = (f^g)&e # CH | ||
492 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
493 | |||
494 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
495 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
496 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
497 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
498 | mov a, y3 # y3 = a # MAJA | ||
499 | |||
500 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
501 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
502 | add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- | ||
503 | or c, y3 # y3 = a|c # MAJA | ||
504 | |||
505 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
506 | mov a, T1 # T1 = a # MAJB | ||
507 | and b, y3 # y3 = (a|c)&b # MAJA | ||
508 | and c, T1 # T1 = a&c # MAJB | ||
509 | add y0, y2 # y2 = S1 + CH # -- | ||
510 | |||
511 | add h, d # d = k + w + h + d # -- | ||
512 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
513 | add y1, h # h = k + w + h + S0 # -- | ||
514 | |||
515 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
516 | |||
517 | RotateState | ||
518 | |||
519 | ################################### RND N + 3 ######################################### | ||
520 | |||
521 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
522 | mov f, y2 # y2 = f # CH | ||
523 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
524 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
525 | xor g, y2 # y2 = f^g # CH | ||
526 | |||
527 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
528 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
529 | and e, y2 # y2 = (f^g)&e # CH | ||
530 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
531 | |||
532 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
533 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
534 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
535 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
536 | mov a, y3 # y3 = a # MAJA | ||
537 | |||
538 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
539 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
540 | add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- | ||
541 | or c, y3 # y3 = a|c # MAJA | ||
542 | |||
543 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
544 | mov a, T1 # T1 = a # MAJB | ||
545 | and b, y3 # y3 = (a|c)&b # MAJA | ||
546 | and c, T1 # T1 = a&c # MAJB | ||
547 | add y0, y2 # y2 = S1 + CH # -- | ||
548 | |||
549 | |||
550 | add h, d # d = k + w + h + d # -- | ||
551 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
552 | add y1, h # h = k + w + h + S0 # -- | ||
553 | |||
554 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
555 | |||
556 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
557 | |||
558 | add y3, h # h = t1 + S0 + MAJ # -- | ||
559 | |||
560 | RotateState | ||
561 | |||
562 | .endm | ||
563 | |||
564 | ######################################################################## | ||
565 | # void sha512_transform_rorx(const void* M, void* D, uint64_t L) | ||
566 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | ||
567 | # The size of the message pointed to by M must be an integer multiple of SHA512 | ||
568 | # message blocks. | ||
569 | # L is the message length in SHA512 blocks | ||
570 | ######################################################################## | ||
571 | ENTRY(sha512_transform_rorx) | ||
572 | # Allocate Stack Space | ||
573 | mov %rsp, %rax | ||
574 | sub $frame_size, %rsp | ||
575 | and $~(0x20 - 1), %rsp | ||
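# %rsp is aligned down to 32 bytes so the vmovdqa stores of XFER into
# frame_XFER below land on an aligned address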
576 | mov %rax, frame_RSPSAVE(%rsp) | ||
577 | |||
578 | # Save GPRs | ||
579 | mov %rbp, frame_GPRSAVE(%rsp) | ||
580 | mov %rbx, 8*1+frame_GPRSAVE(%rsp) | ||
581 | mov %r12, 8*2+frame_GPRSAVE(%rsp) | ||
582 | mov %r13, 8*3+frame_GPRSAVE(%rsp) | ||
583 | mov %r14, 8*4+frame_GPRSAVE(%rsp) | ||
584 | mov %r15, 8*5+frame_GPRSAVE(%rsp) | ||
585 | |||
586 | shl $7, NUM_BLKS # convert to bytes | ||
587 | jz done_hash | ||
588 | add INP, NUM_BLKS # pointer to end of data | ||
589 | mov NUM_BLKS, frame_INPEND(%rsp) | ||
590 | |||
591 | ## load initial digest | ||
592 | mov 8*0(CTX),a | ||
593 | mov 8*1(CTX),b | ||
594 | mov 8*2(CTX),c | ||
595 | mov 8*3(CTX),d | ||
596 | mov 8*4(CTX),e | ||
597 | mov 8*5(CTX),f | ||
598 | mov 8*6(CTX),g | ||
599 | mov 8*7(CTX),h | ||
600 | |||
601 | vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
602 | |||
603 | loop0: | ||
604 | lea K512(%rip), TBL | ||
605 | |||
606 | ## byte swap first 16 qwords | ||
607 | COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK | ||
608 | COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK | ||
609 | COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK | ||
610 | COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK | ||
611 | |||
612 | mov INP, frame_INP(%rsp) | ||
613 | |||
614 | ## schedule 64 message qwords, by doing 16 groups of 4 rounds each | ||
615 | movq $4, frame_SRND(%rsp) | ||
616 | |||
617 | .align 16 | ||
618 | loop1: | ||
619 | vpaddq (TBL), Y_0, XFER | ||
620 | vmovdqa XFER, frame_XFER(%rsp) | ||
621 | FOUR_ROUNDS_AND_SCHED | ||
622 | |||
623 | vpaddq 1*32(TBL), Y_0, XFER | ||
624 | vmovdqa XFER, frame_XFER(%rsp) | ||
625 | FOUR_ROUNDS_AND_SCHED | ||
626 | |||
627 | vpaddq 2*32(TBL), Y_0, XFER | ||
628 | vmovdqa XFER, frame_XFER(%rsp) | ||
629 | FOUR_ROUNDS_AND_SCHED | ||
630 | |||
631 | vpaddq 3*32(TBL), Y_0, XFER | ||
632 | vmovdqa XFER, frame_XFER(%rsp) | ||
633 | add $(4*32), TBL | ||
634 | FOUR_ROUNDS_AND_SCHED | ||
635 | |||
636 | subq $1, frame_SRND(%rsp) | ||
637 | jne loop1 | ||
638 | |||
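# Final 16 rounds: Y_0..Y_3 already hold the last scheduled qwords
# (W[64]..W[79]), so only DO_4ROUNDS is needed, with no further scheduling.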
639 | movq $2, frame_SRND(%rsp) | ||
640 | loop2: | ||
641 | vpaddq (TBL), Y_0, XFER | ||
642 | vmovdqa XFER, frame_XFER(%rsp) | ||
643 | DO_4ROUNDS | ||
644 | vpaddq 1*32(TBL), Y_1, XFER | ||
645 | vmovdqa XFER, frame_XFER(%rsp) | ||
646 | add $(2*32), TBL | ||
647 | DO_4ROUNDS | ||
648 | |||
649 | vmovdqa Y_2, Y_0 | ||
650 | vmovdqa Y_3, Y_1 | ||
651 | |||
652 | subq $1, frame_SRND(%rsp) | ||
653 | jne loop2 | ||
654 | |||
655 | addm 8*0(CTX),a | ||
656 | addm 8*1(CTX),b | ||
657 | addm 8*2(CTX),c | ||
658 | addm 8*3(CTX),d | ||
659 | addm 8*4(CTX),e | ||
660 | addm 8*5(CTX),f | ||
661 | addm 8*6(CTX),g | ||
662 | addm 8*7(CTX),h | ||
663 | |||
664 | mov frame_INP(%rsp), INP | ||
665 | add $128, INP | ||
666 | cmp frame_INPEND(%rsp), INP | ||
667 | jne loop0 | ||
668 | |||
669 | done_hash: | ||
670 | |||
671 | # Restore GPRs | ||
672 | mov frame_GPRSAVE(%rsp) ,%rbp | ||
673 | mov 8*1+frame_GPRSAVE(%rsp) ,%rbx | ||
674 | mov 8*2+frame_GPRSAVE(%rsp) ,%r12 | ||
675 | mov 8*3+frame_GPRSAVE(%rsp) ,%r13 | ||
676 | mov 8*4+frame_GPRSAVE(%rsp) ,%r14 | ||
677 | mov 8*5+frame_GPRSAVE(%rsp) ,%r15 | ||
678 | |||
679 | # Restore Stack Pointer | ||
680 | mov frame_RSPSAVE(%rsp), %rsp | ||
681 | ret | ||
682 | ENDPROC(sha512_transform_rorx) | ||
683 | |||
684 | ######################################################################## | ||
685 | ### Binary Data | ||
686 | |||
687 | .data | ||
688 | |||
689 | .align 64 | ||
690 | # K[t] used in SHA512 hashing | ||
691 | K512: | ||
692 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd | ||
693 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc | ||
694 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 | ||
695 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 | ||
696 | .quad 0xd807aa98a3030242,0x12835b0145706fbe | ||
697 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 | ||
698 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 | ||
699 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 | ||
700 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 | ||
701 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 | ||
702 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 | ||
703 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 | ||
704 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 | ||
705 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 | ||
706 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 | ||
707 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 | ||
708 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 | ||
709 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df | ||
710 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 | ||
711 | .quad 0x81c2c92e47edaee6,0x92722c851482353b | ||
712 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 | ||
713 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 | ||
714 | .quad 0xd192e819d6ef5218,0xd69906245565a910 | ||
715 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 | ||
716 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 | ||
717 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 | ||
718 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb | ||
719 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 | ||
720 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 | ||
721 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec | ||
722 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 | ||
723 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b | ||
724 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 | ||
725 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | ||
726 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 | ||
727 | .quad 0x113f9804bef90dae,0x1b710b35131c471b | ||
728 | .quad 0x28db77f523047d84,0x32caab7b40c72493 | ||
729 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | ||
730 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | ||
731 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | ||
732 | |||
733 | .align 32 | ||
734 | |||
735 | # Mask for byte-swapping the qwords in a YMM register using vpshufb. | ||
736 | PSHUFFLE_BYTE_FLIP_MASK: | ||
737 | .octa 0x08090a0b0c0d0e0f0001020304050607 | ||
738 | .octa 0x18191a1b1c1d1e1f1011121314151617 | ||
739 | |||
740 | MASK_YMM_LO: | ||
741 | .octa 0x00000000000000000000000000000000 | ||
742 | .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF | ||
743 | #endif | ||
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S new file mode 100644 index 000000000000..fb56855d51f5 --- /dev/null +++ b/arch/x86/crypto/sha512-ssse3-asm.S | |||
@@ -0,0 +1,421 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-512 with SSSE3 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # David Cote <david.m.cote@intel.com> | ||
10 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | # | ||
12 | # This software is available to you under a choice of one of two | ||
13 | # licenses. You may choose to be licensed under the terms of the GNU | ||
14 | # General Public License (GPL) Version 2, available from the file | ||
15 | # COPYING in the main directory of this source tree, or the | ||
16 | # OpenIB.org BSD license below: | ||
17 | # | ||
18 | # Redistribution and use in source and binary forms, with or | ||
19 | # without modification, are permitted provided that the following | ||
20 | # conditions are met: | ||
21 | # | ||
22 | # - Redistributions of source code must retain the above | ||
23 | # copyright notice, this list of conditions and the following | ||
24 | # disclaimer. | ||
25 | # | ||
26 | # - Redistributions in binary form must reproduce the above | ||
27 | # copyright notice, this list of conditions and the following | ||
28 | # disclaimer in the documentation and/or other materials | ||
29 | # provided with the distribution. | ||
30 | # | ||
31 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
32 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
33 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
34 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
35 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
36 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
37 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
38 | # SOFTWARE. | ||
39 | # | ||
40 | ######################################################################## | ||
41 | # | ||
42 | # This code is described in an Intel White-Paper: | ||
43 | # "Fast SHA-512 Implementations on Intel Architecture Processors" | ||
44 | # | ||
45 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
46 | # and search for that title. | ||
47 | # | ||
48 | ######################################################################## | ||
49 | |||
50 | #include <linux/linkage.h> | ||
51 | |||
52 | .text | ||
53 | |||
54 | # Virtual Registers | ||
55 | # ARG1 | ||
56 | msg = %rdi | ||
57 | # ARG2 | ||
58 | digest = %rsi | ||
59 | # ARG3 | ||
60 | msglen = %rdx | ||
61 | T1 = %rcx | ||
62 | T2 = %r8 | ||
63 | a_64 = %r9 | ||
64 | b_64 = %r10 | ||
65 | c_64 = %r11 | ||
66 | d_64 = %r12 | ||
67 | e_64 = %r13 | ||
68 | f_64 = %r14 | ||
69 | g_64 = %r15 | ||
70 | h_64 = %rbx | ||
71 | tmp0 = %rax | ||
72 | |||
73 | # Local variables (stack frame) | ||
74 | |||
75 | W_SIZE = 80*8 | ||
76 | WK_SIZE = 2*8 | ||
77 | RSPSAVE_SIZE = 1*8 | ||
78 | GPRSAVE_SIZE = 5*8 | ||
79 | |||
80 | frame_W = 0 | ||
81 | frame_WK = frame_W + W_SIZE | ||
82 | frame_RSPSAVE = frame_WK + WK_SIZE | ||
83 | frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE | ||
84 | frame_size = frame_GPRSAVE + GPRSAVE_SIZE | ||
85 | |||
86 | # Useful QWORD "arrays" for simpler memory references | ||
87 | # MSG, DIGEST, K_t, W_t are arrays | ||
88 | # WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even | ||
89 | |||
90 | # Input message (arg1) | ||
91 | #define MSG(i) 8*i(msg) | ||
92 | |||
93 | # Output Digest (arg2) | ||
94 | #define DIGEST(i) 8*i(digest) | ||
95 | |||
96 | # SHA Constants (static mem) | ||
97 | #define K_t(i) 8*i+K512(%rip) | ||
98 | |||
99 | # Message Schedule (stack frame) | ||
100 | #define W_t(i) 8*i+frame_W(%rsp) | ||
101 | |||
102 | # W[t]+K[t] (stack frame) | ||
103 | #define WK_2(i) 8*((i%2))+frame_WK(%rsp) | ||
104 | |||
105 | .macro RotateState | ||
106 | # Rotate symbols a..h right | ||
107 | TMP = h_64 | ||
108 | h_64 = g_64 | ||
109 | g_64 = f_64 | ||
110 | f_64 = e_64 | ||
111 | e_64 = d_64 | ||
112 | d_64 = c_64 | ||
113 | c_64 = b_64 | ||
114 | b_64 = a_64 | ||
115 | a_64 = TMP | ||
116 | .endm | ||
117 | |||
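# The rotate chains in the round macros below compute the SHA-512 big sigma
# functions on a single temporary register:
#   S1(e) = (e ror 14) ^ (e ror 18) ^ (e ror 41)
#         = ((((e ror 23) ^ e) ror 4) ^ e) ror 14    (23+4+14 = 41, 4+14 = 18)
#   S0(a) = (a ror 28) ^ (a ror 34) ^ (a ror 39)
#         = ((((a ror 5) ^ a) ror 6) ^ a) ror 28     (5+6+28 = 39, 6+28 = 34)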
118 | .macro SHA512_Round rnd | ||
119 | |||
120 | # Compute Round %%t | ||
121 | mov f_64, T1 # T1 = f | ||
122 | mov e_64, tmp0 # tmp = e | ||
123 | xor g_64, T1 # T1 = f ^ g | ||
124 | ror $23, tmp0 # 41 # tmp = e ror 23 | ||
125 | and e_64, T1 # T1 = (f ^ g) & e | ||
126 | xor e_64, tmp0 # tmp = (e ror 23) ^ e | ||
127 | xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) | ||
128 | idx = \rnd | ||
129 | add WK_2(idx), T1 # W[t] + K[t] from message scheduler | ||
130 | ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 | ||
131 | xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e | ||
132 | mov a_64, T2 # T2 = a | ||
133 | add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h | ||
134 | ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) | ||
135 | add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) | ||
136 | mov a_64, tmp0 # tmp = a | ||
137 | xor c_64, T2 # T2 = a ^ c | ||
138 | and c_64, tmp0 # tmp = a & c | ||
139 | and b_64, T2 # T2 = (a ^ c) & b | ||
140 | xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) | ||
141 | mov a_64, tmp0 # tmp = a | ||
142 | ror $5, tmp0 # 39 # tmp = a ror 5 | ||
143 | xor a_64, tmp0 # tmp = (a ror 5) ^ a | ||
144 | add T1, d_64 # e(next_state) = d + T1 | ||
145 | ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 | ||
146 | xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a | ||
147 | lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) | ||
148 | ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) | ||
149 | add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) | ||
150 | RotateState | ||
151 | .endm | ||
152 | |||
153 | .macro SHA512_2Sched_2Round_sse rnd | ||
154 | |||
155 | # Compute rounds t-2 and t-1 | ||
156 | # Compute message schedule QWORDS t and t+1 | ||
157 | |||
158 | # Two rounds are computed based on the values for K[t-2]+W[t-2] and | ||
159 | # K[t-1]+W[t-1] which were previously stored at WK_2 by the message | ||
160 | # scheduler. | ||
161 | # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. | ||
162 | # They are then added to their respective SHA512 constants at | ||
163 | # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] | ||
164 | # For brevity, the comments following vectored instructions only refer to | ||
165 | # the first of a pair of QWORDS. | ||
166 | # E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} | ||
167 | # The computation of the message schedule and the rounds are tightly | ||
168 | # stitched to take advantage of instruction-level parallelism. | ||
169 | # For clarity, integer instructions (for the rounds calculation) are indented | ||
170 | # by one tab. Vectored instructions (for the message scheduler) are indented | ||
171 | # by two tabs. | ||
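# The vector code also factors the small sigma functions into shared shifts:
#   s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
# has its right-shift terms built as ((((x >> 42) ^ x) >> 13) ^ x) >> 6
# (42+13+6 = 61, 13+6 = 19) and its left-shift terms as (x << 45) ^ (x << 3);
# s0(x) is produced the same way from shifts 1, 8 and 7.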
172 | |||
173 | mov f_64, T1 | ||
174 | idx = \rnd -2 | ||
175 | movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] | ||
176 | xor g_64, T1 | ||
177 | and e_64, T1 | ||
178 | movdqa %xmm2, %xmm0 # XMM0 = W[t-2] | ||
179 | xor g_64, T1 | ||
180 | idx = \rnd | ||
181 | add WK_2(idx), T1 | ||
182 | idx = \rnd - 15 | ||
183 | movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] | ||
184 | mov e_64, tmp0 | ||
185 | ror $23, tmp0 # 41 | ||
186 | movdqa %xmm5, %xmm3 # XMM3 = W[t-15] | ||
187 | xor e_64, tmp0 | ||
188 | ror $4, tmp0 # 18 | ||
189 | psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 | ||
190 | xor e_64, tmp0 | ||
191 | ror $14, tmp0 # 14 | ||
192 | psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 | ||
193 | add tmp0, T1 | ||
194 | add h_64, T1 | ||
195 | pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] | ||
196 | mov a_64, T2 | ||
197 | xor c_64, T2 | ||
198 | pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] | ||
199 | and b_64, T2 | ||
200 | mov a_64, tmp0 | ||
201 | psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 | ||
202 | and c_64, tmp0 | ||
203 | xor tmp0, T2 | ||
204 | psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 | ||
205 | mov a_64, tmp0 | ||
206 | ror $5, tmp0 # 39 | ||
207 | pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] | ||
208 | xor a_64, tmp0 | ||
209 | ror $6, tmp0 # 34 | ||
210 | pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] | ||
211 | xor a_64, tmp0 | ||
212 | ror $28, tmp0 # 28 | ||
213 | psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 | ||
214 | add tmp0, T2 | ||
215 | add T1, d_64 | ||
216 | psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 | ||
217 | lea (T1, T2), h_64 | ||
218 | RotateState | ||
219 | movdqa %xmm2, %xmm1 # XMM1 = W[t-2] | ||
220 | mov f_64, T1 | ||
221 | xor g_64, T1 | ||
222 | movdqa %xmm5, %xmm4 # XMM4 = W[t-15] | ||
223 | and e_64, T1 | ||
224 | xor g_64, T1 | ||
225 | psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 | ||
226 | idx = \rnd + 1 | ||
227 | add WK_2(idx), T1 | ||
228 | mov e_64, tmp0 | ||
229 | psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 | ||
230 | ror $23, tmp0 # 41 | ||
231 | xor e_64, tmp0 | ||
232 | pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] | ||
233 | ror $4, tmp0 # 18 | ||
234 | xor e_64, tmp0 | ||
235 | pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] | ||
236 | ror $14, tmp0 # 14 | ||
237 | add tmp0, T1 | ||
238 | psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 | ||
239 | add h_64, T1 | ||
240 | mov a_64, T2 | ||
241 | psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 | ||
242 | xor c_64, T2 | ||
243 | and b_64, T2 | ||
244 | pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) | ||
245 | mov a_64, tmp0 | ||
246 | and c_64, tmp0 | ||
247 | idx = \rnd - 7 | ||
248 | movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] | ||
249 | xor tmp0, T2 | ||
250 | pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) | ||
251 | mov a_64, tmp0 | ||
252 | paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) | ||
253 | ror $5, tmp0 # 39 | ||
254 | idx =\rnd-16 | ||
255 | paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] | ||
256 | xor a_64, tmp0 | ||
257 | paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] | ||
258 | ror $6, tmp0 # 34 | ||
259 | movdqa %xmm0, W_t(\rnd) # Store scheduled qwords | ||
260 | xor a_64, tmp0 | ||
261 | paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] | ||
262 | ror $28, tmp0 # 28 | ||
263 | idx = \rnd | ||
264 | movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds | ||
265 | add tmp0, T2 | ||
266 | add T1, d_64 | ||
267 | lea (T1, T2), h_64 | ||
268 | RotateState | ||
269 | .endm | ||
270 | |||
271 | ######################################################################## | ||
272 | # void sha512_transform_ssse3(const void* M, void* D, u64 L) | ||
273 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | ||
274 | # The size of the message pointed to by M must be an integer multiple of SHA512 | ||
275 | # message blocks. | ||
276 | # L is the message length in SHA512 blocks. | ||
277 | ######################################################################## | ||
278 | ENTRY(sha512_transform_ssse3) | ||
279 | |||
280 | cmp $0, msglen | ||
281 | je nowork | ||
282 | |||
283 | # Allocate Stack Space | ||
284 | mov %rsp, %rax | ||
285 | sub $frame_size, %rsp | ||
286 | and $~(0x20 - 1), %rsp | ||
287 | mov %rax, frame_RSPSAVE(%rsp) | ||
288 | |||
289 | # Save GPRs | ||
290 | mov %rbx, frame_GPRSAVE(%rsp) | ||
291 | mov %r12, frame_GPRSAVE +8*1(%rsp) | ||
292 | mov %r13, frame_GPRSAVE +8*2(%rsp) | ||
293 | mov %r14, frame_GPRSAVE +8*3(%rsp) | ||
294 | mov %r15, frame_GPRSAVE +8*4(%rsp) | ||
295 | |||
296 | updateblock: | ||
297 | |||
298 | # Load state variables | ||
299 | mov DIGEST(0), a_64 | ||
300 | mov DIGEST(1), b_64 | ||
301 | mov DIGEST(2), c_64 | ||
302 | mov DIGEST(3), d_64 | ||
303 | mov DIGEST(4), e_64 | ||
304 | mov DIGEST(5), f_64 | ||
305 | mov DIGEST(6), g_64 | ||
306 | mov DIGEST(7), h_64 | ||
307 | |||
308 | t = 0 | ||
309 | .rept 80/2 + 1 | ||
310 | # (80 rounds) / (2 rounds/iteration) + (1 iteration) | ||
311 | # +1 iteration because the scheduler leads hashing by 1 iteration | ||
312 | .if t < 2 | ||
313 | # BSWAP 2 QWORDS | ||
314 | movdqa XMM_QWORD_BSWAP(%rip), %xmm1 | ||
315 | movdqu MSG(t), %xmm0 | ||
316 | pshufb %xmm1, %xmm0 # BSWAP | ||
317 | movdqa %xmm0, W_t(t) # Store Scheduled Pair | ||
318 | paddq K_t(t), %xmm0 # Compute W[t]+K[t] | ||
319 | movdqa %xmm0, WK_2(t) # Store into WK for rounds | ||
320 | .elseif t < 16 | ||
321 | # BSWAP 2 QWORDS, compute 2 rounds | ||
322 | movdqu MSG(t), %xmm0 | ||
323 | pshufb %xmm1, %xmm0 # BSWAP | ||
324 | SHA512_Round t-2 # Round t-2 | ||
325 | movdqa %xmm0, W_t(t) # Store Scheduled Pair | ||
326 | paddq K_t(t), %xmm0 # Compute W[t]+K[t] | ||
327 | SHA512_Round t-1 # Round t-1 | ||
328 | movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK | ||
329 | .elseif t < 79 | ||
330 | # Schedule 2 QWORDS, compute 2 rounds | ||
331 | SHA512_2Sched_2Round_sse t | ||
332 | .else | ||
333 | # Compute 2 Rounds | ||
334 | SHA512_Round t-2 | ||
335 | SHA512_Round t-1 | ||
336 | .endif | ||
337 | t = t+2 | ||
338 | .endr | ||
339 | |||
340 | # Update digest | ||
341 | add a_64, DIGEST(0) | ||
342 | add b_64, DIGEST(1) | ||
343 | add c_64, DIGEST(2) | ||
344 | add d_64, DIGEST(3) | ||
345 | add e_64, DIGEST(4) | ||
346 | add f_64, DIGEST(5) | ||
347 | add g_64, DIGEST(6) | ||
348 | add h_64, DIGEST(7) | ||
349 | |||
350 | # Advance to next message block | ||
351 | add $16*8, msg | ||
352 | dec msglen | ||
353 | jnz updateblock | ||
354 | |||
355 | # Restore GPRs | ||
356 | mov frame_GPRSAVE(%rsp), %rbx | ||
357 | mov frame_GPRSAVE +8*1(%rsp), %r12 | ||
358 | mov frame_GPRSAVE +8*2(%rsp), %r13 | ||
359 | mov frame_GPRSAVE +8*3(%rsp), %r14 | ||
360 | mov frame_GPRSAVE +8*4(%rsp), %r15 | ||
361 | |||
362 | # Restore Stack Pointer | ||
363 | mov frame_RSPSAVE(%rsp), %rsp | ||
364 | |||
365 | nowork: | ||
366 | ret | ||
367 | ENDPROC(sha512_transform_ssse3) | ||
368 | |||
369 | ######################################################################## | ||
370 | ### Binary Data | ||
371 | |||
372 | .data | ||
373 | |||
374 | .align 16 | ||
375 | |||
376 | # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. | ||
377 | XMM_QWORD_BSWAP: | ||
378 | .octa 0x08090a0b0c0d0e0f0001020304050607 | ||
379 | |||
380 | # K[t] used in SHA512 hashing | ||
381 | K512: | ||
382 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd | ||
383 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc | ||
384 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 | ||
385 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 | ||
386 | .quad 0xd807aa98a3030242,0x12835b0145706fbe | ||
387 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 | ||
388 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 | ||
389 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 | ||
390 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 | ||
391 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 | ||
392 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 | ||
393 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 | ||
394 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 | ||
395 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 | ||
396 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 | ||
397 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 | ||
398 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 | ||
399 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df | ||
400 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 | ||
401 | .quad 0x81c2c92e47edaee6,0x92722c851482353b | ||
402 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 | ||
403 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 | ||
404 | .quad 0xd192e819d6ef5218,0xd69906245565a910 | ||
405 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 | ||
406 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 | ||
407 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 | ||
408 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb | ||
409 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 | ||
410 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 | ||
411 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec | ||
412 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 | ||
413 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b | ||
414 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 | ||
415 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | ||
416 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 | ||
417 | .quad 0x113f9804bef90dae,0x1b710b35131c471b | ||
418 | .quad 0x28db77f523047d84,0x32caab7b40c72493 | ||
419 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | ||
420 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | ||
421 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | ||
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c new file mode 100644 index 000000000000..6cbd8df348d2 --- /dev/null +++ b/arch/x86/crypto/sha512_ssse3_glue.c | |||
@@ -0,0 +1,282 @@ | |||
1 | /* | ||
2 | * Cryptographic API. | ||
3 | * | ||
4 | * Glue code for the SHA512 Secure Hash Algorithm assembler | ||
5 | * implementation using supplemental SSE3 / AVX / AVX2 instructions. | ||
6 | * | ||
7 | * This file is based on sha512_generic.c | ||
8 | * | ||
9 | * Copyright (C) 2013 Intel Corporation | ||
10 | * Author: Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify it | ||
13 | * under the terms of the GNU General Public License as published by the Free | ||
14 | * Software Foundation; either version 2 of the License, or (at your option) | ||
15 | * any later version. | ||
16 | * | ||
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
19 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
21 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
22 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
23 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
24 | * SOFTWARE. | ||
25 | * | ||
26 | */ | ||
27 | |||
28 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
29 | |||
30 | #include <crypto/internal/hash.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/mm.h> | ||
34 | #include <linux/cryptohash.h> | ||
35 | #include <linux/types.h> | ||
36 | #include <crypto/sha.h> | ||
37 | #include <asm/byteorder.h> | ||
38 | #include <asm/i387.h> | ||
39 | #include <asm/xcr.h> | ||
40 | #include <asm/xsave.h> | ||
41 | |||
42 | #include <linux/string.h> | ||
43 | |||
44 | asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, | ||
45 | u64 rounds); | ||
46 | #ifdef CONFIG_AS_AVX | ||
47 | asmlinkage void sha512_transform_avx(const char *data, u64 *digest, | ||
48 | u64 rounds); | ||
49 | #endif | ||
50 | #ifdef CONFIG_AS_AVX2 | ||
51 | asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, | ||
52 | u64 rounds); | ||
53 | #endif | ||
54 | |||
55 | static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); | ||
56 | |||
57 | |||
58 | static int sha512_ssse3_init(struct shash_desc *desc) | ||
59 | { | ||
60 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
61 | |||
62 | sctx->state[0] = SHA512_H0; | ||
63 | sctx->state[1] = SHA512_H1; | ||
64 | sctx->state[2] = SHA512_H2; | ||
65 | sctx->state[3] = SHA512_H3; | ||
66 | sctx->state[4] = SHA512_H4; | ||
67 | sctx->state[5] = SHA512_H5; | ||
68 | sctx->state[6] = SHA512_H6; | ||
69 | sctx->state[7] = SHA512_H7; | ||
70 | sctx->count[0] = sctx->count[1] = 0; | ||
71 | |||
72 | return 0; | ||
73 | } | ||
74 | |||
75 | static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
76 | unsigned int len, unsigned int partial) | ||
77 | { | ||
78 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
79 | unsigned int done = 0; | ||
80 | |||
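/*
 * count[1]:count[0] form a 128-bit byte counter; the comparison below
 * detects wrap-around of the low 64 bits and carries into the high half.
 */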
81 | sctx->count[0] += len; | ||
82 | if (sctx->count[0] < len) | ||
83 | sctx->count[1]++; | ||
84 | |||
85 | if (partial) { | ||
86 | done = SHA512_BLOCK_SIZE - partial; | ||
87 | memcpy(sctx->buf + partial, data, done); | ||
88 | sha512_transform_asm(sctx->buf, sctx->state, 1); | ||
89 | } | ||
90 | |||
91 | if (len - done >= SHA512_BLOCK_SIZE) { | ||
92 | const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE; | ||
93 | |||
94 | sha512_transform_asm(data + done, sctx->state, (u64) rounds); | ||
95 | |||
96 | done += rounds * SHA512_BLOCK_SIZE; | ||
97 | } | ||
98 | |||
99 | memcpy(sctx->buf, data + done, len - done); | ||
100 | |||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
105 | unsigned int len) | ||
106 | { | ||
107 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
108 | unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; | ||
109 | int res; | ||
110 | |||
111 | /* Handle the fast case right here */ | ||
112 | if (partial + len < SHA512_BLOCK_SIZE) { | ||
113 | sctx->count[0] += len; | ||
114 | if (sctx->count[0] < len) | ||
115 | sctx->count[1]++; | ||
116 | memcpy(sctx->buf + partial, data, len); | ||
117 | |||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | if (!irq_fpu_usable()) { | ||
122 | res = crypto_sha512_update(desc, data, len); | ||
123 | } else { | ||
124 | kernel_fpu_begin(); | ||
125 | res = __sha512_ssse3_update(desc, data, len, partial); | ||
126 | kernel_fpu_end(); | ||
127 | } | ||
128 | |||
129 | return res; | ||
130 | } | ||
131 | |||
132 | |||
133 | /* Add padding and return the message digest. */ | ||
134 | static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) | ||
135 | { | ||
136 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
137 | unsigned int i, index, padlen; | ||
138 | __be64 *dst = (__be64 *)out; | ||
139 | __be64 bits[2]; | ||
140 | static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, }; | ||
141 | |||
142 | /* save number of bits */ | ||
143 | bits[1] = cpu_to_be64(sctx->count[0] << 3); | ||
144 | bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61; | ||
145 | |||
146 | /* Pad out to 112 mod 128 and append length */ | ||
147 | index = sctx->count[0] & 0x7f; | ||
148 | padlen = (index < 112) ? (112 - index) : ((128+112) - index); | ||
149 | |||
150 | if (!irq_fpu_usable()) { | ||
151 | crypto_sha512_update(desc, padding, padlen); | ||
152 | crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
153 | } else { | ||
154 | kernel_fpu_begin(); | ||
155 | /* We need to fill a whole block for __sha512_ssse3_update() */ | ||
156 | if (padlen <= 112) { | ||
157 | sctx->count[0] += padlen; | ||
158 | if (sctx->count[0] < padlen) | ||
159 | sctx->count[1]++; | ||
160 | memcpy(sctx->buf + index, padding, padlen); | ||
161 | } else { | ||
162 | __sha512_ssse3_update(desc, padding, padlen, index); | ||
163 | } | ||
164 | __sha512_ssse3_update(desc, (const u8 *)&bits, | ||
165 | sizeof(bits), 112); | ||
166 | kernel_fpu_end(); | ||
167 | } | ||
168 | |||
169 | /* Store state in digest */ | ||
170 | for (i = 0; i < 8; i++) | ||
171 | dst[i] = cpu_to_be64(sctx->state[i]); | ||
172 | |||
173 | /* Wipe context */ | ||
174 | memset(sctx, 0, sizeof(*sctx)); | ||
175 | |||
176 | return 0; | ||
177 | } | ||
178 | |||
179 | static int sha512_ssse3_export(struct shash_desc *desc, void *out) | ||
180 | { | ||
181 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
182 | |||
183 | memcpy(out, sctx, sizeof(*sctx)); | ||
184 | |||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static int sha512_ssse3_import(struct shash_desc *desc, const void *in) | ||
189 | { | ||
190 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
191 | |||
192 | memcpy(sctx, in, sizeof(*sctx)); | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static struct shash_alg alg = { | ||
198 | .digestsize = SHA512_DIGEST_SIZE, | ||
199 | .init = sha512_ssse3_init, | ||
200 | .update = sha512_ssse3_update, | ||
201 | .final = sha512_ssse3_final, | ||
202 | .export = sha512_ssse3_export, | ||
203 | .import = sha512_ssse3_import, | ||
204 | .descsize = sizeof(struct sha512_state), | ||
205 | .statesize = sizeof(struct sha512_state), | ||
206 | .base = { | ||
207 | .cra_name = "sha512", | ||
208 | .cra_driver_name = "sha512-ssse3", | ||
209 | .cra_priority = 150, | ||
210 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
211 | .cra_blocksize = SHA512_BLOCK_SIZE, | ||
212 | .cra_module = THIS_MODULE, | ||
213 | } | ||
214 | }; | ||
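/*
 * Usage sketch (not part of this patch): once registered, the driver is
 * selected through the regular crypto API by algorithm name, e.g. from a
 * hypothetical caller (error handling omitted):
 *
 *	struct crypto_shash *tfm = crypto_alloc_shash("sha512", 0, 0);
 *	struct shash_desc *desc = kmalloc(sizeof(*desc) +
 *					  crypto_shash_descsize(tfm), GFP_KERNEL);
 *	u8 out[SHA512_DIGEST_SIZE];
 *
 *	desc->tfm = tfm;
 *	desc->flags = 0;
 *	crypto_shash_digest(desc, data, len, out);
 *	kfree(desc);
 *	crypto_free_shash(tfm);
 */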
215 | |||
216 | #ifdef CONFIG_AS_AVX | ||
217 | static bool __init avx_usable(void) | ||
218 | { | ||
219 | u64 xcr0; | ||
220 | |||
221 | if (!cpu_has_avx || !cpu_has_osxsave) | ||
222 | return false; | ||
223 | |||
224 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
225 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
226 | pr_info("AVX detected but unusable.\n"); | ||
227 | |||
228 | return false; | ||
229 | } | ||
230 | |||
231 | return true; | ||
232 | } | ||
233 | #endif | ||
234 | |||
235 | static int __init sha512_ssse3_mod_init(void) | ||
236 | { | ||
237 | /* test for SSSE3 first */ | ||
238 | if (cpu_has_ssse3) | ||
239 | sha512_transform_asm = sha512_transform_ssse3; | ||
240 | |||
241 | #ifdef CONFIG_AS_AVX | ||
242 | /* allow AVX to override SSSE3, as it's a little faster */ | ||
243 | if (avx_usable()) { | ||
244 | #ifdef CONFIG_AS_AVX2 | ||
245 | if (boot_cpu_has(X86_FEATURE_AVX2)) | ||
246 | sha512_transform_asm = sha512_transform_rorx; | ||
247 | else | ||
248 | #endif | ||
249 | sha512_transform_asm = sha512_transform_avx; | ||
250 | } | ||
251 | #endif | ||
252 | |||
253 | if (sha512_transform_asm) { | ||
254 | #ifdef CONFIG_AS_AVX | ||
255 | if (sha512_transform_asm == sha512_transform_avx) | ||
256 | pr_info("Using AVX optimized SHA-512 implementation\n"); | ||
257 | #ifdef CONFIG_AS_AVX2 | ||
258 | else if (sha512_transform_asm == sha512_transform_rorx) | ||
259 | pr_info("Using AVX2 optimized SHA-512 implementation\n"); | ||
260 | #endif | ||
261 | else | ||
262 | #endif | ||
263 | pr_info("Using SSSE3 optimized SHA-512 implementation\n"); | ||
264 | return crypto_register_shash(&alg); | ||
265 | } | ||
266 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | ||
267 | |||
268 | return -ENODEV; | ||
269 | } | ||
270 | |||
271 | static void __exit sha512_ssse3_mod_fini(void) | ||
272 | { | ||
273 | crypto_unregister_shash(&alg); | ||
274 | } | ||
275 | |||
276 | module_init(sha512_ssse3_mod_init); | ||
277 | module_exit(sha512_ssse3_mod_fini); | ||
278 | |||
279 | MODULE_LICENSE("GPL"); | ||
280 | MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | ||
281 | |||
282 | MODULE_ALIAS("sha512"); | ||
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 8d3e113b2c95..05058134c443 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 7 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
@@ -33,6 +33,8 @@ | |||
33 | 33 | ||
34 | .Lbswap128_mask: | 34 | .Lbswap128_mask: |
35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
36 | .Lxts_gf128mul_and_shl1_mask: | ||
37 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
36 | 38 | ||
37 | .text | 39 | .text |
38 | 40 | ||
@@ -408,3 +410,47 @@ ENTRY(twofish_ctr_8way) | |||
408 | 410 | ||
409 | ret; | 411 | ret; |
410 | ENDPROC(twofish_ctr_8way) | 412 | ENDPROC(twofish_ctr_8way) |
413 | |||
414 | ENTRY(twofish_xts_enc_8way) | ||
415 | /* input: | ||
416 | * %rdi: ctx, CTX | ||
417 | * %rsi: dst | ||
418 | * %rdx: src | ||
419 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
420 | */ | ||
421 | |||
422 | movq %rsi, %r11; | ||
423 | |||
424 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
425 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
426 | RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); | ||
427 | |||
428 | call __twofish_enc_blk8; | ||
429 | |||
430 | /* dst <= regs xor IVs(in dst) */ | ||
431 | store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); | ||
432 | |||
433 | ret; | ||
434 | ENDPROC(twofish_xts_enc_8way) | ||
435 | |||
436 | ENTRY(twofish_xts_dec_8way) | ||
437 | /* input: | ||
438 | * %rdi: ctx, CTX | ||
439 | * %rsi: dst | ||
440 | * %rdx: src | ||
441 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
442 | */ | ||
443 | |||
444 | movq %rsi, %r11; | ||
445 | |||
446 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
447 | load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2, | ||
448 | RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); | ||
449 | |||
450 | call __twofish_dec_blk8; | ||
451 | |||
452 | /* dst <= regs xor IVs(in dst) */ | ||
453 | store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
454 | |||
455 | ret; | ||
456 | ENDPROC(twofish_xts_dec_8way) | ||
diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S new file mode 100644 index 000000000000..e1a83b9cd389 --- /dev/null +++ b/arch/x86/crypto/twofish-avx2-asm_64.S | |||
@@ -0,0 +1,600 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | #include "glue_helper-asm-avx2.S" | ||
15 | |||
16 | .file "twofish-avx2-asm_64.S" | ||
17 | |||
18 | .data | ||
19 | .align 16 | ||
20 | |||
21 | .Lvpshufb_mask0: | ||
22 | .long 0x80808000 | ||
23 | .long 0x80808004 | ||
24 | .long 0x80808008 | ||
25 | .long 0x8080800c | ||
26 | |||
27 | .Lbswap128_mask: | ||
28 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
29 | .Lxts_gf128mul_and_shl1_mask_0: | ||
30 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
31 | .Lxts_gf128mul_and_shl1_mask_1: | ||
32 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
33 | |||
34 | .text | ||
35 | |||
36 | /* structure of crypto context */ | ||
37 | #define s0 0 | ||
38 | #define s1 1024 | ||
39 | #define s2 2048 | ||
40 | #define s3 3072 | ||
41 | #define w 4096 | ||
42 | #define k 4128 | ||
43 | |||
44 | /* register macros */ | ||
45 | #define CTX %rdi | ||
46 | |||
47 | #define RS0 CTX | ||
48 | #define RS1 %r8 | ||
49 | #define RS2 %r9 | ||
50 | #define RS3 %r10 | ||
51 | #define RK %r11 | ||
52 | #define RW %rax | ||
53 | #define RROUND %r12 | ||
54 | #define RROUNDd %r12d | ||
55 | |||
56 | #define RA0 %ymm8 | ||
57 | #define RB0 %ymm9 | ||
58 | #define RC0 %ymm10 | ||
59 | #define RD0 %ymm11 | ||
60 | #define RA1 %ymm12 | ||
61 | #define RB1 %ymm13 | ||
62 | #define RC1 %ymm14 | ||
63 | #define RD1 %ymm15 | ||
64 | |||
65 | /* temp regs */ | ||
66 | #define RX0 %ymm0 | ||
67 | #define RY0 %ymm1 | ||
68 | #define RX1 %ymm2 | ||
69 | #define RY1 %ymm3 | ||
70 | #define RT0 %ymm4 | ||
71 | #define RIDX %ymm5 | ||
72 | |||
73 | #define RX0x %xmm0 | ||
74 | #define RY0x %xmm1 | ||
75 | #define RX1x %xmm2 | ||
76 | #define RY1x %xmm3 | ||
77 | #define RT0x %xmm4 | ||
78 | |||
79 | /* vpgatherdd mask and '-1' */ | ||
80 | #define RNOT %ymm6 | ||
81 | |||
82 | /* byte mask, (-1 >> 24) */ | ||
83 | #define RBYTE %ymm7 | ||
84 | |||
85 | /********************************************************************** | ||
86 | 16-way AVX2 twofish | ||
87 | **********************************************************************/ | ||
88 | #define init_round_constants() \ | ||
89 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
90 | vpsrld $24, RNOT, RBYTE; \ | ||
91 | leaq k(CTX), RK; \ | ||
92 | leaq w(CTX), RW; \ | ||
93 | leaq s1(CTX), RS1; \ | ||
94 | leaq s2(CTX), RS2; \ | ||
95 | leaq s3(CTX), RS3; \ | ||
96 | |||
97 | #define g16(ab, rs0, rs1, rs2, rs3, xy) \ | ||
98 | vpand RBYTE, ab ## 0, RIDX; \ | ||
99 | vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ | ||
100 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
101 | \ | ||
102 | vpand RBYTE, ab ## 1, RIDX; \ | ||
103 | vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ | ||
104 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
105 | \ | ||
106 | vpsrld $8, ab ## 0, RIDX; \ | ||
107 | vpand RBYTE, RIDX, RIDX; \ | ||
108 | vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ | ||
109 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
110 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
111 | \ | ||
112 | vpsrld $8, ab ## 1, RIDX; \ | ||
113 | vpand RBYTE, RIDX, RIDX; \ | ||
114 | vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ | ||
115 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
116 | vpxor RT0, xy ## 1, xy ## 1; \ | ||
117 | \ | ||
118 | vpsrld $16, ab ## 0, RIDX; \ | ||
119 | vpand RBYTE, RIDX, RIDX; \ | ||
120 | vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ | ||
121 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
122 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
123 | \ | ||
124 | vpsrld $16, ab ## 1, RIDX; \ | ||
125 | vpand RBYTE, RIDX, RIDX; \ | ||
126 | vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ | ||
127 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
128 | vpxor RT0, xy ## 1, xy ## 1; \ | ||
129 | \ | ||
130 | vpsrld $24, ab ## 0, RIDX; \ | ||
131 | vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ | ||
132 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
133 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
134 | \ | ||
135 | vpsrld $24, ab ## 1, RIDX; \ | ||
136 | vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ | ||
137 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
138 | vpxor RT0, xy ## 1, xy ## 1; | ||
139 | |||
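/*
 * Note on g16() above: vpgatherdd clears its mask register as it retires,
 * so RNOT must be refilled with all ones (vpcmpeqd RNOT, RNOT, RNOT) before
 * every gather.  Each step gathers one s-box byte lane for all 16 blocks.
 */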
140 | #define g1_16(a, x) \ | ||
141 | g16(a, RS0, RS1, RS2, RS3, x); | ||
142 | |||
143 | #define g2_16(b, y) \ | ||
144 | g16(b, RS1, RS2, RS3, RS0, y); | ||
145 | |||
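/*
 * encrypt_round_end16() finishes a round for all 16 blocks: the two vpaddd
 * implement Twofish's pseudo-Hadamard transform (x += y; y += x), the
 * vpbroadcastd pair adds the round subkeys, then d ^= y and c = (c ^ x) ror 1.
 */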
146 | #define encrypt_round_end16(a, b, c, d, nk) \ | ||
147 | vpaddd RY0, RX0, RX0; \ | ||
148 | vpaddd RX0, RY0, RY0; \ | ||
149 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
150 | vpaddd RT0, RX0, RX0; \ | ||
151 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
152 | vpaddd RT0, RY0, RY0; \ | ||
153 | \ | ||
154 | vpxor RY0, d ## 0, d ## 0; \ | ||
155 | \ | ||
156 | vpxor RX0, c ## 0, c ## 0; \ | ||
157 | vpsrld $1, c ## 0, RT0; \ | ||
158 | vpslld $31, c ## 0, c ## 0; \ | ||
159 | vpor RT0, c ## 0, c ## 0; \ | ||
160 | \ | ||
161 | vpaddd RY1, RX1, RX1; \ | ||
162 | vpaddd RX1, RY1, RY1; \ | ||
163 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
164 | vpaddd RT0, RX1, RX1; \ | ||
165 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
166 | vpaddd RT0, RY1, RY1; \ | ||
167 | \ | ||
168 | vpxor RY1, d ## 1, d ## 1; \ | ||
169 | \ | ||
170 | vpxor RX1, c ## 1, c ## 1; \ | ||
171 | vpsrld $1, c ## 1, RT0; \ | ||
172 | vpslld $31, c ## 1, c ## 1; \ | ||
173 | vpor RT0, c ## 1, c ## 1; \ | ||
174 | |||
175 | #define encrypt_round16(a, b, c, d, nk) \ | ||
176 | g2_16(b, RY); \ | ||
177 | \ | ||
178 | vpslld $1, b ## 0, RT0; \ | ||
179 | vpsrld $31, b ## 0, b ## 0; \ | ||
180 | vpor RT0, b ## 0, b ## 0; \ | ||
181 | \ | ||
182 | vpslld $1, b ## 1, RT0; \ | ||
183 | vpsrld $31, b ## 1, b ## 1; \ | ||
184 | vpor RT0, b ## 1, b ## 1; \ | ||
185 | \ | ||
186 | g1_16(a, RX); \ | ||
187 | \ | ||
188 | encrypt_round_end16(a, b, c, d, nk); | ||
189 | |||
190 | #define encrypt_round_first16(a, b, c, d, nk) \ | ||
191 | vpslld $1, d ## 0, RT0; \ | ||
192 | vpsrld $31, d ## 0, d ## 0; \ | ||
193 | vpor RT0, d ## 0, d ## 0; \ | ||
194 | \ | ||
195 | vpslld $1, d ## 1, RT0; \ | ||
196 | vpsrld $31, d ## 1, d ## 1; \ | ||
197 | vpor RT0, d ## 1, d ## 1; \ | ||
198 | \ | ||
199 | encrypt_round16(a, b, c, d, nk); | ||
200 | |||
201 | #define encrypt_round_last16(a, b, c, d, nk) \ | ||
202 | g2_16(b, RY); \ | ||
203 | \ | ||
204 | g1_16(a, RX); \ | ||
205 | \ | ||
206 | encrypt_round_end16(a, b, c, d, nk); | ||
207 | |||
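[Editor's note] The *_round_end16 macros implement the tail of the Twofish F function. The first two vpaddd instructions are the pseudo-Hadamard transform, X' = X + Y and Y' = X + 2Y (mod 2^32); the vpbroadcastd/vpaddd pairs then add the round's two 32-bit subkeys taken from k[] at nk(RK,RROUND,8) and 4+nk(RK,RROUND,8); finally one target word is XORed and rotated right by one bit (vpsrld $1 / vpslld $31 / vpor). The one-bit left rotation that the specification applies to the other target word before its XOR is hoisted out of this macro: encrypt_round_first16 performs it for the first round, and the b-word rotation inside encrypt_round16 performs it one round early for every later round, which is why the last-round variant omits it.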
208 | #define decrypt_round_end16(a, b, c, d, nk) \ | ||
209 | vpaddd RY0, RX0, RX0; \ | ||
210 | vpaddd RX0, RY0, RY0; \ | ||
211 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
212 | vpaddd RT0, RX0, RX0; \ | ||
213 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
214 | vpaddd RT0, RY0, RY0; \ | ||
215 | \ | ||
216 | vpxor RX0, c ## 0, c ## 0; \ | ||
217 | \ | ||
218 | vpxor RY0, d ## 0, d ## 0; \ | ||
219 | vpsrld $1, d ## 0, RT0; \ | ||
220 | vpslld $31, d ## 0, d ## 0; \ | ||
221 | vpor RT0, d ## 0, d ## 0; \ | ||
222 | \ | ||
223 | vpaddd RY1, RX1, RX1; \ | ||
224 | vpaddd RX1, RY1, RY1; \ | ||
225 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
226 | vpaddd RT0, RX1, RX1; \ | ||
227 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
228 | vpaddd RT0, RY1, RY1; \ | ||
229 | \ | ||
230 | vpxor RX1, c ## 1, c ## 1; \ | ||
231 | \ | ||
232 | vpxor RY1, d ## 1, d ## 1; \ | ||
233 | vpsrld $1, d ## 1, RT0; \ | ||
234 | vpslld $31, d ## 1, d ## 1; \ | ||
235 | vpor RT0, d ## 1, d ## 1; | ||
236 | |||
237 | #define decrypt_round16(a, b, c, d, nk) \ | ||
238 | g1_16(a, RX); \ | ||
239 | \ | ||
240 | vpslld $1, a ## 0, RT0; \ | ||
241 | vpsrld $31, a ## 0, a ## 0; \ | ||
242 | vpor RT0, a ## 0, a ## 0; \ | ||
243 | \ | ||
244 | vpslld $1, a ## 1, RT0; \ | ||
245 | vpsrld $31, a ## 1, a ## 1; \ | ||
246 | vpor RT0, a ## 1, a ## 1; \ | ||
247 | \ | ||
248 | g2_16(b, RY); \ | ||
249 | \ | ||
250 | decrypt_round_end16(a, b, c, d, nk); | ||
251 | |||
252 | #define decrypt_round_first16(a, b, c, d, nk) \ | ||
253 | vpslld $1, c ## 0, RT0; \ | ||
254 | vpsrld $31, c ## 0, c ## 0; \ | ||
255 | vpor RT0, c ## 0, c ## 0; \ | ||
256 | \ | ||
257 | vpslld $1, c ## 1, RT0; \ | ||
258 | vpsrld $31, c ## 1, c ## 1; \ | ||
259 | vpor RT0, c ## 1, c ## 1; \ | ||
260 | \ | ||
261 | decrypt_round16(a, b, c, d, nk); | ||
262 | |||
263 | #define decrypt_round_last16(a, b, c, d, nk) \ | ||
264 | g1_16(a, RX); \ | ||
265 | \ | ||
266 | g2_16(b, RY); \ | ||
267 | \ | ||
268 | decrypt_round_end16(a, b, c, d, nk); | ||
269 | |||
270 | #define encrypt_cycle16() \ | ||
271 | encrypt_round16(RA, RB, RC, RD, 0); \ | ||
272 | encrypt_round16(RC, RD, RA, RB, 8); | ||
273 | |||
274 | #define encrypt_cycle_first16() \ | ||
275 | encrypt_round_first16(RA, RB, RC, RD, 0); \ | ||
276 | encrypt_round16(RC, RD, RA, RB, 8); | ||
277 | |||
278 | #define encrypt_cycle_last16() \ | ||
279 | encrypt_round16(RA, RB, RC, RD, 0); \ | ||
280 | encrypt_round_last16(RC, RD, RA, RB, 8); | ||
281 | |||
282 | #define decrypt_cycle16() \ | ||
283 | decrypt_round16(RC, RD, RA, RB, 8); \ | ||
284 | decrypt_round16(RA, RB, RC, RD, 0); | ||
285 | |||
286 | #define decrypt_cycle_first16() \ | ||
287 | decrypt_round_first16(RC, RD, RA, RB, 8); \ | ||
288 | decrypt_round16(RA, RB, RC, RD, 0); | ||
289 | |||
290 | #define decrypt_cycle_last16() \ | ||
291 | decrypt_round16(RC, RD, RA, RB, 8); \ | ||
292 | decrypt_round_last16(RA, RB, RC, RD, 0); | ||
293 | |||
294 | #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ | ||
295 | vpunpckhdq x1, x0, t2; \ | ||
296 | vpunpckldq x1, x0, x0; \ | ||
297 | \ | ||
298 | vpunpckldq x3, x2, t1; \ | ||
299 | vpunpckhdq x3, x2, x2; \ | ||
300 | \ | ||
301 | vpunpckhqdq t1, x0, x1; \ | ||
302 | vpunpcklqdq t1, x0, x0; \ | ||
303 | \ | ||
304 | vpunpckhqdq x2, t2, x3; \ | ||
305 | vpunpcklqdq x2, t2, x2; | ||
306 | |||
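[Editor's note] transpose_4x4 is a 4x4 transpose of 32-bit words carried out independently in each 128-bit lane of the ymm registers; since a transpose is its own inverse, read_blocks8 and write_blocks8 can share it. It converts between the loaded block layout and the word-sliced form the rounds operate on, where each register holds the same state word (a, b, c or d) of eight different blocks, so a single gather or arithmetic instruction advances eight blocks, and the paired 0/1 registers sixteen, per step.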
307 | #define read_blocks8(offs,a,b,c,d) \ | ||
308 | transpose_4x4(a, b, c, d, RX0, RY0); | ||
309 | |||
310 | #define write_blocks8(offs,a,b,c,d) \ | ||
311 | transpose_4x4(a, b, c, d, RX0, RY0); | ||
312 | |||
313 | #define inpack_enc8(a,b,c,d) \ | ||
314 | vpbroadcastd 4*0(RW), RT0; \ | ||
315 | vpxor RT0, a, a; \ | ||
316 | \ | ||
317 | vpbroadcastd 4*1(RW), RT0; \ | ||
318 | vpxor RT0, b, b; \ | ||
319 | \ | ||
320 | vpbroadcastd 4*2(RW), RT0; \ | ||
321 | vpxor RT0, c, c; \ | ||
322 | \ | ||
323 | vpbroadcastd 4*3(RW), RT0; \ | ||
324 | vpxor RT0, d, d; | ||
325 | |||
326 | #define outunpack_enc8(a,b,c,d) \ | ||
327 | vpbroadcastd 4*4(RW), RX0; \ | ||
328 | vpbroadcastd 4*5(RW), RY0; \ | ||
329 | vpxor RX0, c, RX0; \ | ||
330 | vpxor RY0, d, RY0; \ | ||
331 | \ | ||
332 | vpbroadcastd 4*6(RW), RT0; \ | ||
333 | vpxor RT0, a, c; \ | ||
334 | vpbroadcastd 4*7(RW), RT0; \ | ||
335 | vpxor RT0, b, d; \ | ||
336 | \ | ||
337 | vmovdqa RX0, a; \ | ||
338 | vmovdqa RY0, b; | ||
339 | |||
340 | #define inpack_dec8(a,b,c,d) \ | ||
341 | vpbroadcastd 4*4(RW), RX0; \ | ||
342 | vpbroadcastd 4*5(RW), RY0; \ | ||
343 | vpxor RX0, a, RX0; \ | ||
344 | vpxor RY0, b, RY0; \ | ||
345 | \ | ||
346 | vpbroadcastd 4*6(RW), RT0; \ | ||
347 | vpxor RT0, c, a; \ | ||
348 | vpbroadcastd 4*7(RW), RT0; \ | ||
349 | vpxor RT0, d, b; \ | ||
350 | \ | ||
351 | vmovdqa RX0, c; \ | ||
352 | vmovdqa RY0, d; | ||
353 | |||
354 | #define outunpack_dec8(a,b,c,d) \ | ||
355 | vpbroadcastd 4*0(RW), RT0; \ | ||
356 | vpxor RT0, a, a; \ | ||
357 | \ | ||
358 | vpbroadcastd 4*1(RW), RT0; \ | ||
359 | vpxor RT0, b, b; \ | ||
360 | \ | ||
361 | vpbroadcastd 4*2(RW), RT0; \ | ||
362 | vpxor RT0, c, c; \ | ||
363 | \ | ||
364 | vpbroadcastd 4*3(RW), RT0; \ | ||
365 | vpxor RT0, d, d; | ||
366 | |||
367 | #define read_blocks16(a,b,c,d) \ | ||
368 | read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
369 | read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
370 | |||
371 | #define write_blocks16(a,b,c,d) \ | ||
372 | write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
373 | write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
374 | |||
375 | #define xor_blocks16(a,b,c,d) \ | ||
376 | xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
377 | xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
378 | |||
379 | #define inpack_enc16(a,b,c,d) \ | ||
380 | inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
381 | inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
382 | |||
383 | #define outunpack_enc16(a,b,c,d) \ | ||
384 | outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
385 | outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
386 | |||
387 | #define inpack_dec16(a,b,c,d) \ | ||
388 | inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
389 | inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
390 | |||
391 | #define outunpack_dec16(a,b,c,d) \ | ||
392 | outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
393 | outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
394 | |||
395 | .align 8 | ||
396 | __twofish_enc_blk16: | ||
397 | /* input: | ||
398 | * %rdi: ctx, CTX | ||
399 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext | ||
400 | * output: | ||
401 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext | ||
402 | */ | ||
403 | init_round_constants(); | ||
404 | |||
405 | read_blocks16(RA, RB, RC, RD); | ||
406 | inpack_enc16(RA, RB, RC, RD); | ||
407 | |||
408 | xorl RROUNDd, RROUNDd; | ||
409 | encrypt_cycle_first16(); | ||
410 | movl $2, RROUNDd; | ||
411 | |||
412 | .align 4 | ||
413 | .L__enc_loop: | ||
414 | encrypt_cycle16(); | ||
415 | |||
416 | addl $2, RROUNDd; | ||
417 | cmpl $14, RROUNDd; | ||
418 | jne .L__enc_loop; | ||
419 | |||
420 | encrypt_cycle_last16(); | ||
421 | |||
422 | outunpack_enc16(RA, RB, RC, RD); | ||
423 | write_blocks16(RA, RB, RC, RD); | ||
424 | |||
425 | ret; | ||
426 | ENDPROC(__twofish_enc_blk16) | ||
427 | |||
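[Editor's note] Round bookkeeping: each encrypt_cycle16 performs two of Twofish's sixteen rounds, and each round consumes a pair of 32-bit subkeys, so one cycle advances 16 bytes into k[]; RROUND therefore steps by 2 per cycle, with scale 8 in the addressing above. One explicit first cycle (RROUND = 0), six loop iterations (RROUND = 2, 4, ..., 12) and one explicit last cycle (RROUND = 14) give (1 + 6 + 1) x 2 = 16 rounds. The first and last cycles are split out because the boundary one-bit rotations differ there. The decryption routine below walks the same schedule in reverse, starting RROUND at 14 and counting down to 0.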
428 | .align 8 | ||
429 | __twofish_dec_blk16: | ||
430 | /* input: | ||
431 | * %rdi: ctx, CTX | ||
432 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext | ||
433 | * output: | ||
434 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext | ||
435 | */ | ||
436 | init_round_constants(); | ||
437 | |||
438 | read_blocks16(RA, RB, RC, RD); | ||
439 | inpack_dec16(RA, RB, RC, RD); | ||
440 | |||
441 | movl $14, RROUNDd; | ||
442 | decrypt_cycle_first16(); | ||
443 | movl $12, RROUNDd; | ||
444 | |||
445 | .align 4 | ||
446 | .L__dec_loop: | ||
447 | decrypt_cycle16(); | ||
448 | |||
449 | addl $-2, RROUNDd; | ||
450 | jnz .L__dec_loop; | ||
451 | |||
452 | decrypt_cycle_last16(); | ||
453 | |||
454 | outunpack_dec16(RA, RB, RC, RD); | ||
455 | write_blocks16(RA, RB, RC, RD); | ||
456 | |||
457 | ret; | ||
458 | ENDPROC(__twofish_dec_blk16) | ||
459 | |||
460 | ENTRY(twofish_ecb_enc_16way) | ||
461 | /* input: | ||
462 | * %rdi: ctx, CTX | ||
463 | * %rsi: dst | ||
464 | * %rdx: src | ||
465 | */ | ||
466 | |||
467 | vzeroupper; | ||
468 | pushq %r12; | ||
469 | |||
470 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
471 | |||
472 | call __twofish_enc_blk16; | ||
473 | |||
474 | store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
475 | |||
476 | popq %r12; | ||
477 | vzeroupper; | ||
478 | |||
479 | ret; | ||
480 | ENDPROC(twofish_ecb_enc_16way) | ||
481 | |||
482 | ENTRY(twofish_ecb_dec_16way) | ||
483 | /* input: | ||
484 | * %rdi: ctx, CTX | ||
485 | * %rsi: dst | ||
486 | * %rdx: src | ||
487 | */ | ||
488 | |||
489 | vzeroupper; | ||
490 | pushq %r12; | ||
491 | |||
492 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
493 | |||
494 | call __twofish_dec_blk16; | ||
495 | |||
496 | store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
497 | |||
498 | popq %r12; | ||
499 | vzeroupper; | ||
500 | |||
501 | ret; | ||
502 | ENDPROC(twofish_ecb_dec_16way) | ||
503 | |||
504 | ENTRY(twofish_cbc_dec_16way) | ||
505 | /* input: | ||
506 | * %rdi: ctx, CTX | ||
507 | * %rsi: dst | ||
508 | * %rdx: src | ||
509 | */ | ||
510 | |||
511 | vzeroupper; | ||
512 | pushq %r12; | ||
513 | |||
514 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
515 | |||
516 | call __twofish_dec_blk16; | ||
517 | |||
518 | store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1, | ||
519 | RX0); | ||
520 | |||
521 | popq %r12; | ||
522 | vzeroupper; | ||
523 | |||
524 | ret; | ||
525 | ENDPROC(twofish_cbc_dec_16way) | ||
526 | |||
527 | ENTRY(twofish_ctr_16way) | ||
528 | /* input: | ||
529 | * %rdi: ctx, CTX | ||
530 | * %rsi: dst (16 blocks) | ||
531 | * %rdx: src (16 blocks) | ||
532 | * %rcx: iv (little endian, 128bit) | ||
533 | */ | ||
534 | |||
535 | vzeroupper; | ||
536 | pushq %r12; | ||
537 | |||
538 | load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1, | ||
539 | RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, | ||
540 | RBYTE); | ||
541 | |||
542 | call __twofish_enc_blk16; | ||
543 | |||
544 | store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
545 | |||
546 | popq %r12; | ||
547 | vzeroupper; | ||
548 | |||
549 | ret; | ||
550 | ENDPROC(twofish_ctr_16way) | ||
551 | |||
552 | .align 8 | ||
553 | twofish_xts_crypt_16way: | ||
554 | /* input: | ||
555 | * %rdi: ctx, CTX | ||
556 | * %rsi: dst (16 blocks) | ||
557 | * %rdx: src (16 blocks) | ||
558 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
559 | * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16 | ||
560 | */ | ||
561 | |||
562 | vzeroupper; | ||
563 | pushq %r12; | ||
564 | |||
565 | load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, | ||
566 | RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, | ||
567 | .Lxts_gf128mul_and_shl1_mask_0, | ||
568 | .Lxts_gf128mul_and_shl1_mask_1); | ||
569 | |||
570 | call *%r8; | ||
571 | |||
572 | store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
573 | |||
574 | popq %r12; | ||
575 | vzeroupper; | ||
576 | |||
577 | ret; | ||
578 | ENDPROC(twofish_xts_crypt_16way) | ||
579 | |||
580 | ENTRY(twofish_xts_enc_16way) | ||
581 | /* input: | ||
582 | * %rdi: ctx, CTX | ||
583 | * %rsi: dst (16 blocks) | ||
584 | * %rdx: src (16 blocks) | ||
585 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
586 | */ | ||
587 | leaq __twofish_enc_blk16, %r8; | ||
588 | jmp twofish_xts_crypt_16way; | ||
589 | ENDPROC(twofish_xts_enc_16way) | ||
590 | |||
591 | ENTRY(twofish_xts_dec_16way) | ||
592 | /* input: | ||
593 | * %rdi: ctx, CTX | ||
594 | * %rsi: dst (16 blocks) | ||
595 | * %rdx: src (16 blocks) | ||
596 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
597 | */ | ||
598 | leaq __twofish_dec_blk16, %r8; | ||
599 | jmp twofish_xts_crypt_16way; | ||
600 | ENDPROC(twofish_xts_dec_16way) | ||
diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c new file mode 100644 index 000000000000..ce33b5be64ee --- /dev/null +++ b/arch/x86/crypto/twofish_avx2_glue.c | |||
@@ -0,0 +1,584 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/twofish.h> | ||
20 | #include <crypto/lrw.h> | ||
21 | #include <crypto/xts.h> | ||
22 | #include <asm/xcr.h> | ||
23 | #include <asm/xsave.h> | ||
24 | #include <asm/crypto/twofish.h> | ||
25 | #include <asm/crypto/ablk_helper.h> | ||
26 | #include <asm/crypto/glue_helper.h> | ||
27 | #include <crypto/scatterwalk.h> | ||
28 | |||
29 | #define TF_AVX2_PARALLEL_BLOCKS 16 | ||
30 | |||
31 | /* 16-way AVX2 parallel cipher functions */ | ||
32 | asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst, | ||
33 | const u8 *src); | ||
34 | asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst, | ||
35 | const u8 *src); | ||
36 | asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); | ||
37 | |||
38 | asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src, | ||
39 | le128 *iv); | ||
40 | |||
41 | asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst, | ||
44 | const u8 *src, le128 *iv); | ||
45 | |||
46 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | ||
47 | const u8 *src) | ||
48 | { | ||
49 | __twofish_enc_blk_3way(ctx, dst, src, false); | ||
50 | } | ||
51 | |||
52 | static const struct common_glue_ctx twofish_enc = { | ||
53 | .num_funcs = 4, | ||
54 | .fpu_blocks_limit = 8, | ||
55 | |||
56 | .funcs = { { | ||
57 | .num_blocks = 16, | ||
58 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) } | ||
59 | }, { | ||
60 | .num_blocks = 8, | ||
61 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } | ||
62 | }, { | ||
63 | .num_blocks = 3, | ||
64 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } | ||
65 | }, { | ||
66 | .num_blocks = 1, | ||
67 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } | ||
68 | } } | ||
69 | }; | ||
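[Editor's note] The common_glue_ctx tables list the available widths in decreasing order (16, 8, 3, then 1 block); the generic glue code keeps using the widest function that still fits the bytes left in the current scatterlist segment before falling through to the narrower ones, and fpu_blocks_limit = 8 means SIMD registers are only claimed once at least eight blocks are pending. A rough sketch of the per-segment dispatch, with the surrounding walk omitted and all names other than the structures above illustrative:

	/* Hedged sketch of the glue-helper dispatch; not copied from glue_helper.c. */
	for (i = 0; i < gctx->num_funcs; i++) {
		unsigned int func_bytes = bsize * gctx->funcs[i].num_blocks;

		while (nbytes >= func_bytes) {
			/* widest first: 16-way, then 8-way, 3-way, single block */
			gctx->funcs[i].fn_u.ecb(ctx, dst, src);
			src += func_bytes;
			dst += func_bytes;
			nbytes -= func_bytes;
		}
	}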
70 | |||
71 | static const struct common_glue_ctx twofish_ctr = { | ||
72 | .num_funcs = 4, | ||
73 | .fpu_blocks_limit = 8, | ||
74 | |||
75 | .funcs = { { | ||
76 | .num_blocks = 16, | ||
77 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) } | ||
78 | }, { | ||
79 | .num_blocks = 8, | ||
80 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } | ||
81 | }, { | ||
82 | .num_blocks = 3, | ||
83 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } | ||
84 | }, { | ||
85 | .num_blocks = 1, | ||
86 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } | ||
87 | } } | ||
88 | }; | ||
89 | |||
90 | static const struct common_glue_ctx twofish_enc_xts = { | ||
91 | .num_funcs = 3, | ||
92 | .fpu_blocks_limit = 8, | ||
93 | |||
94 | .funcs = { { | ||
95 | .num_blocks = 16, | ||
96 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) } | ||
97 | }, { | ||
98 | .num_blocks = 8, | ||
99 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } | ||
100 | }, { | ||
101 | .num_blocks = 1, | ||
102 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } | ||
103 | } } | ||
104 | }; | ||
105 | |||
106 | static const struct common_glue_ctx twofish_dec = { | ||
107 | .num_funcs = 4, | ||
108 | .fpu_blocks_limit = 8, | ||
109 | |||
110 | .funcs = { { | ||
111 | .num_blocks = 16, | ||
112 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) } | ||
113 | }, { | ||
114 | .num_blocks = 8, | ||
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } | ||
116 | }, { | ||
117 | .num_blocks = 3, | ||
118 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } | ||
119 | }, { | ||
120 | .num_blocks = 1, | ||
121 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } | ||
122 | } } | ||
123 | }; | ||
124 | |||
125 | static const struct common_glue_ctx twofish_dec_cbc = { | ||
126 | .num_funcs = 4, | ||
127 | .fpu_blocks_limit = 8, | ||
128 | |||
129 | .funcs = { { | ||
130 | .num_blocks = 16, | ||
131 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) } | ||
132 | }, { | ||
133 | .num_blocks = 8, | ||
134 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } | ||
135 | }, { | ||
136 | .num_blocks = 3, | ||
137 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } | ||
138 | }, { | ||
139 | .num_blocks = 1, | ||
140 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } | ||
141 | } } | ||
142 | }; | ||
143 | |||
144 | static const struct common_glue_ctx twofish_dec_xts = { | ||
145 | .num_funcs = 3, | ||
146 | .fpu_blocks_limit = 8, | ||
147 | |||
148 | .funcs = { { | ||
149 | .num_blocks = 16, | ||
150 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) } | ||
151 | }, { | ||
152 | .num_blocks = 8, | ||
153 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } | ||
154 | }, { | ||
155 | .num_blocks = 1, | ||
156 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } | ||
157 | } } | ||
158 | }; | ||
159 | |||
160 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); | ||
164 | } | ||
165 | |||
166 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); | ||
170 | } | ||
171 | |||
172 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
173 | struct scatterlist *src, unsigned int nbytes) | ||
174 | { | ||
175 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, | ||
176 | dst, src, nbytes); | ||
177 | } | ||
178 | |||
179 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
180 | struct scatterlist *src, unsigned int nbytes) | ||
181 | { | ||
182 | return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, | ||
183 | nbytes); | ||
184 | } | ||
185 | |||
186 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
187 | struct scatterlist *src, unsigned int nbytes) | ||
188 | { | ||
189 | return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); | ||
190 | } | ||
191 | |||
192 | static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
193 | { | ||
194 | /* Since the 8-way AVX routines are reused, start using the FPU at 8 parallel blocks. */ | ||
195 | return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); | ||
196 | } | ||
197 | |||
198 | static inline void twofish_fpu_end(bool fpu_enabled) | ||
199 | { | ||
200 | glue_fpu_end(fpu_enabled); | ||
201 | } | ||
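[Editor's note] In-kernel SIMD use has to be bracketed by kernel_fpu_begin()/kernel_fpu_end(); glue_fpu_begin defers the relatively costly state save until at least the given number of blocks, here eight, is queued, so short requests stay on the scalar code. For the LRW path below, the fpu_enabled flag is threaded through struct crypt_priv so that the begin/end pair wraps the whole lrw_crypt walk rather than every callback invocation.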
202 | |||
203 | struct crypt_priv { | ||
204 | struct twofish_ctx *ctx; | ||
205 | bool fpu_enabled; | ||
206 | }; | ||
207 | |||
208 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
209 | { | ||
210 | const unsigned int bsize = TF_BLOCK_SIZE; | ||
211 | struct crypt_priv *ctx = priv; | ||
212 | int i; | ||
213 | |||
214 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | ||
215 | |||
216 | while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { | ||
217 | twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
218 | srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
219 | nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
220 | } | ||
221 | |||
222 | while (nbytes >= 8 * bsize) { | ||
223 | twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); | ||
224 | srcdst += bsize * 8; | ||
225 | nbytes -= bsize * 8; | ||
226 | } | ||
227 | |||
228 | while (nbytes >= 3 * bsize) { | ||
229 | twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); | ||
230 | srcdst += bsize * 3; | ||
231 | nbytes -= bsize * 3; | ||
232 | } | ||
233 | |||
234 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
235 | twofish_enc_blk(ctx->ctx, srcdst, srcdst); | ||
236 | } | ||
237 | |||
238 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
239 | { | ||
240 | const unsigned int bsize = TF_BLOCK_SIZE; | ||
241 | struct crypt_priv *ctx = priv; | ||
242 | int i; | ||
243 | |||
244 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | ||
245 | |||
246 | while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { | ||
247 | twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
248 | srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
249 | nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
250 | } | ||
251 | |||
252 | while (nbytes >= 8 * bsize) { | ||
253 | twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); | ||
254 | srcdst += bsize * 8; | ||
255 | nbytes -= bsize * 8; | ||
256 | } | ||
257 | |||
258 | while (nbytes >= 3 * bsize) { | ||
259 | twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); | ||
260 | srcdst += bsize * 3; | ||
261 | nbytes -= bsize * 3; | ||
262 | } | ||
263 | |||
264 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
265 | twofish_dec_blk(ctx->ctx, srcdst, srcdst); | ||
266 | } | ||
267 | |||
268 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
269 | struct scatterlist *src, unsigned int nbytes) | ||
270 | { | ||
271 | struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
272 | be128 buf[TF_AVX2_PARALLEL_BLOCKS]; | ||
273 | struct crypt_priv crypt_ctx = { | ||
274 | .ctx = &ctx->twofish_ctx, | ||
275 | .fpu_enabled = false, | ||
276 | }; | ||
277 | struct lrw_crypt_req req = { | ||
278 | .tbuf = buf, | ||
279 | .tbuflen = sizeof(buf), | ||
280 | |||
281 | .table_ctx = &ctx->lrw_table, | ||
282 | .crypt_ctx = &crypt_ctx, | ||
283 | .crypt_fn = encrypt_callback, | ||
284 | }; | ||
285 | int ret; | ||
286 | |||
287 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
288 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
289 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
290 | |||
291 | return ret; | ||
292 | } | ||
293 | |||
294 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
295 | struct scatterlist *src, unsigned int nbytes) | ||
296 | { | ||
297 | struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
298 | be128 buf[TF_AVX2_PARALLEL_BLOCKS]; | ||
299 | struct crypt_priv crypt_ctx = { | ||
300 | .ctx = &ctx->twofish_ctx, | ||
301 | .fpu_enabled = false, | ||
302 | }; | ||
303 | struct lrw_crypt_req req = { | ||
304 | .tbuf = buf, | ||
305 | .tbuflen = sizeof(buf), | ||
306 | |||
307 | .table_ctx = &ctx->lrw_table, | ||
308 | .crypt_ctx = &crypt_ctx, | ||
309 | .crypt_fn = decrypt_callback, | ||
310 | }; | ||
311 | int ret; | ||
312 | |||
313 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
314 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
315 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
316 | |||
317 | return ret; | ||
318 | } | ||
319 | |||
320 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
321 | struct scatterlist *src, unsigned int nbytes) | ||
322 | { | ||
323 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
324 | |||
325 | return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, | ||
326 | XTS_TWEAK_CAST(twofish_enc_blk), | ||
327 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
328 | } | ||
329 | |||
330 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
331 | struct scatterlist *src, unsigned int nbytes) | ||
332 | { | ||
333 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
334 | |||
335 | return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, | ||
336 | XTS_TWEAK_CAST(twofish_enc_blk), | ||
337 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
338 | } | ||
339 | |||
340 | static struct crypto_alg tf_algs[10] = { { | ||
341 | .cra_name = "__ecb-twofish-avx2", | ||
342 | .cra_driver_name = "__driver-ecb-twofish-avx2", | ||
343 | .cra_priority = 0, | ||
344 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
345 | .cra_blocksize = TF_BLOCK_SIZE, | ||
346 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
347 | .cra_alignmask = 0, | ||
348 | .cra_type = &crypto_blkcipher_type, | ||
349 | .cra_module = THIS_MODULE, | ||
350 | .cra_u = { | ||
351 | .blkcipher = { | ||
352 | .min_keysize = TF_MIN_KEY_SIZE, | ||
353 | .max_keysize = TF_MAX_KEY_SIZE, | ||
354 | .setkey = twofish_setkey, | ||
355 | .encrypt = ecb_encrypt, | ||
356 | .decrypt = ecb_decrypt, | ||
357 | }, | ||
358 | }, | ||
359 | }, { | ||
360 | .cra_name = "__cbc-twofish-avx2", | ||
361 | .cra_driver_name = "__driver-cbc-twofish-avx2", | ||
362 | .cra_priority = 0, | ||
363 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
364 | .cra_blocksize = TF_BLOCK_SIZE, | ||
365 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
366 | .cra_alignmask = 0, | ||
367 | .cra_type = &crypto_blkcipher_type, | ||
368 | .cra_module = THIS_MODULE, | ||
369 | .cra_u = { | ||
370 | .blkcipher = { | ||
371 | .min_keysize = TF_MIN_KEY_SIZE, | ||
372 | .max_keysize = TF_MAX_KEY_SIZE, | ||
373 | .setkey = twofish_setkey, | ||
374 | .encrypt = cbc_encrypt, | ||
375 | .decrypt = cbc_decrypt, | ||
376 | }, | ||
377 | }, | ||
378 | }, { | ||
379 | .cra_name = "__ctr-twofish-avx2", | ||
380 | .cra_driver_name = "__driver-ctr-twofish-avx2", | ||
381 | .cra_priority = 0, | ||
382 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
383 | .cra_blocksize = 1, | ||
384 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
385 | .cra_alignmask = 0, | ||
386 | .cra_type = &crypto_blkcipher_type, | ||
387 | .cra_module = THIS_MODULE, | ||
388 | .cra_u = { | ||
389 | .blkcipher = { | ||
390 | .min_keysize = TF_MIN_KEY_SIZE, | ||
391 | .max_keysize = TF_MAX_KEY_SIZE, | ||
392 | .ivsize = TF_BLOCK_SIZE, | ||
393 | .setkey = twofish_setkey, | ||
394 | .encrypt = ctr_crypt, | ||
395 | .decrypt = ctr_crypt, | ||
396 | }, | ||
397 | }, | ||
398 | }, { | ||
399 | .cra_name = "__lrw-twofish-avx2", | ||
400 | .cra_driver_name = "__driver-lrw-twofish-avx2", | ||
401 | .cra_priority = 0, | ||
402 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
403 | .cra_blocksize = TF_BLOCK_SIZE, | ||
404 | .cra_ctxsize = sizeof(struct twofish_lrw_ctx), | ||
405 | .cra_alignmask = 0, | ||
406 | .cra_type = &crypto_blkcipher_type, | ||
407 | .cra_module = THIS_MODULE, | ||
408 | .cra_exit = lrw_twofish_exit_tfm, | ||
409 | .cra_u = { | ||
410 | .blkcipher = { | ||
411 | .min_keysize = TF_MIN_KEY_SIZE + | ||
412 | TF_BLOCK_SIZE, | ||
413 | .max_keysize = TF_MAX_KEY_SIZE + | ||
414 | TF_BLOCK_SIZE, | ||
415 | .ivsize = TF_BLOCK_SIZE, | ||
416 | .setkey = lrw_twofish_setkey, | ||
417 | .encrypt = lrw_encrypt, | ||
418 | .decrypt = lrw_decrypt, | ||
419 | }, | ||
420 | }, | ||
421 | }, { | ||
422 | .cra_name = "__xts-twofish-avx2", | ||
423 | .cra_driver_name = "__driver-xts-twofish-avx2", | ||
424 | .cra_priority = 0, | ||
425 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
426 | .cra_blocksize = TF_BLOCK_SIZE, | ||
427 | .cra_ctxsize = sizeof(struct twofish_xts_ctx), | ||
428 | .cra_alignmask = 0, | ||
429 | .cra_type = &crypto_blkcipher_type, | ||
430 | .cra_module = THIS_MODULE, | ||
431 | .cra_u = { | ||
432 | .blkcipher = { | ||
433 | .min_keysize = TF_MIN_KEY_SIZE * 2, | ||
434 | .max_keysize = TF_MAX_KEY_SIZE * 2, | ||
435 | .ivsize = TF_BLOCK_SIZE, | ||
436 | .setkey = xts_twofish_setkey, | ||
437 | .encrypt = xts_encrypt, | ||
438 | .decrypt = xts_decrypt, | ||
439 | }, | ||
440 | }, | ||
441 | }, { | ||
442 | .cra_name = "ecb(twofish)", | ||
443 | .cra_driver_name = "ecb-twofish-avx2", | ||
444 | .cra_priority = 500, | ||
445 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
446 | .cra_blocksize = TF_BLOCK_SIZE, | ||
447 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
448 | .cra_alignmask = 0, | ||
449 | .cra_type = &crypto_ablkcipher_type, | ||
450 | .cra_module = THIS_MODULE, | ||
451 | .cra_init = ablk_init, | ||
452 | .cra_exit = ablk_exit, | ||
453 | .cra_u = { | ||
454 | .ablkcipher = { | ||
455 | .min_keysize = TF_MIN_KEY_SIZE, | ||
456 | .max_keysize = TF_MAX_KEY_SIZE, | ||
457 | .setkey = ablk_set_key, | ||
458 | .encrypt = ablk_encrypt, | ||
459 | .decrypt = ablk_decrypt, | ||
460 | }, | ||
461 | }, | ||
462 | }, { | ||
463 | .cra_name = "cbc(twofish)", | ||
464 | .cra_driver_name = "cbc-twofish-avx2", | ||
465 | .cra_priority = 500, | ||
466 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
467 | .cra_blocksize = TF_BLOCK_SIZE, | ||
468 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
469 | .cra_alignmask = 0, | ||
470 | .cra_type = &crypto_ablkcipher_type, | ||
471 | .cra_module = THIS_MODULE, | ||
472 | .cra_init = ablk_init, | ||
473 | .cra_exit = ablk_exit, | ||
474 | .cra_u = { | ||
475 | .ablkcipher = { | ||
476 | .min_keysize = TF_MIN_KEY_SIZE, | ||
477 | .max_keysize = TF_MAX_KEY_SIZE, | ||
478 | .ivsize = TF_BLOCK_SIZE, | ||
479 | .setkey = ablk_set_key, | ||
480 | .encrypt = __ablk_encrypt, | ||
481 | .decrypt = ablk_decrypt, | ||
482 | }, | ||
483 | }, | ||
484 | }, { | ||
485 | .cra_name = "ctr(twofish)", | ||
486 | .cra_driver_name = "ctr-twofish-avx2", | ||
487 | .cra_priority = 500, | ||
488 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
489 | .cra_blocksize = 1, | ||
490 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
491 | .cra_alignmask = 0, | ||
492 | .cra_type = &crypto_ablkcipher_type, | ||
493 | .cra_module = THIS_MODULE, | ||
494 | .cra_init = ablk_init, | ||
495 | .cra_exit = ablk_exit, | ||
496 | .cra_u = { | ||
497 | .ablkcipher = { | ||
498 | .min_keysize = TF_MIN_KEY_SIZE, | ||
499 | .max_keysize = TF_MAX_KEY_SIZE, | ||
500 | .ivsize = TF_BLOCK_SIZE, | ||
501 | .setkey = ablk_set_key, | ||
502 | .encrypt = ablk_encrypt, | ||
503 | .decrypt = ablk_encrypt, | ||
504 | .geniv = "chainiv", | ||
505 | }, | ||
506 | }, | ||
507 | }, { | ||
508 | .cra_name = "lrw(twofish)", | ||
509 | .cra_driver_name = "lrw-twofish-avx2", | ||
510 | .cra_priority = 500, | ||
511 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
512 | .cra_blocksize = TF_BLOCK_SIZE, | ||
513 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
514 | .cra_alignmask = 0, | ||
515 | .cra_type = &crypto_ablkcipher_type, | ||
516 | .cra_module = THIS_MODULE, | ||
517 | .cra_init = ablk_init, | ||
518 | .cra_exit = ablk_exit, | ||
519 | .cra_u = { | ||
520 | .ablkcipher = { | ||
521 | .min_keysize = TF_MIN_KEY_SIZE + | ||
522 | TF_BLOCK_SIZE, | ||
523 | .max_keysize = TF_MAX_KEY_SIZE + | ||
524 | TF_BLOCK_SIZE, | ||
525 | .ivsize = TF_BLOCK_SIZE, | ||
526 | .setkey = ablk_set_key, | ||
527 | .encrypt = ablk_encrypt, | ||
528 | .decrypt = ablk_decrypt, | ||
529 | }, | ||
530 | }, | ||
531 | }, { | ||
532 | .cra_name = "xts(twofish)", | ||
533 | .cra_driver_name = "xts-twofish-avx2", | ||
534 | .cra_priority = 500, | ||
535 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
536 | .cra_blocksize = TF_BLOCK_SIZE, | ||
537 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
538 | .cra_alignmask = 0, | ||
539 | .cra_type = &crypto_ablkcipher_type, | ||
540 | .cra_module = THIS_MODULE, | ||
541 | .cra_init = ablk_init, | ||
542 | .cra_exit = ablk_exit, | ||
543 | .cra_u = { | ||
544 | .ablkcipher = { | ||
545 | .min_keysize = TF_MIN_KEY_SIZE * 2, | ||
546 | .max_keysize = TF_MAX_KEY_SIZE * 2, | ||
547 | .ivsize = TF_BLOCK_SIZE, | ||
548 | .setkey = ablk_set_key, | ||
549 | .encrypt = ablk_encrypt, | ||
550 | .decrypt = ablk_decrypt, | ||
551 | }, | ||
552 | }, | ||
553 | } }; | ||
554 | |||
555 | static int __init init(void) | ||
556 | { | ||
557 | u64 xcr0; | ||
558 | |||
559 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
560 | pr_info("AVX2 instructions are not detected.\n"); | ||
561 | return -ENODEV; | ||
562 | } | ||
563 | |||
564 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
565 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
566 | pr_info("AVX2 detected but unusable.\n"); | ||
567 | return -ENODEV; | ||
568 | } | ||
569 | |||
570 | return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); | ||
571 | } | ||
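[Editor's note] Checking cpu_has_avx2 alone is not sufficient: the xgetbv(XCR_XFEATURE_ENABLED_MASK) read, guarded by cpu_has_osxsave, verifies that the OS has actually enabled saving of both SSE and YMM state in XCR0. Without that, executing the AVX2 instructions used by the 16-way code would fault, so the module refuses to load and the lower-priority Twofish implementations are used instead.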
572 | |||
573 | static void __exit fini(void) | ||
574 | { | ||
575 | crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); | ||
576 | } | ||
577 | |||
578 | module_init(init); | ||
579 | module_exit(fini); | ||
580 | |||
581 | MODULE_LICENSE("GPL"); | ||
582 | MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized"); | ||
583 | MODULE_ALIAS("twofish"); | ||
584 | MODULE_ALIAS("twofish-asm"); | ||
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 94ac91d26e47..2047a562f6b3 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c | |||
@@ -4,6 +4,8 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
8 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
9 | * the Free Software Foundation; either version 2 of the License, or | 11 | * the Free Software Foundation; either version 2 of the License, or |
@@ -48,13 +50,26 @@ | |||
48 | /* 8-way parallel cipher functions */ | 50 | /* 8-way parallel cipher functions */ |
49 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, | 51 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, |
50 | const u8 *src); | 52 | const u8 *src); |
53 | EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way); | ||
54 | |||
51 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 55 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
52 | const u8 *src); | 56 | const u8 *src); |
57 | EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way); | ||
53 | 58 | ||
54 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 59 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
55 | const u8 *src); | 60 | const u8 *src); |
61 | EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way); | ||
62 | |||
56 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, | 63 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, |
57 | const u8 *src, le128 *iv); | 64 | const u8 *src, le128 *iv); |
65 | EXPORT_SYMBOL_GPL(twofish_ctr_8way); | ||
66 | |||
67 | asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, | ||
68 | const u8 *src, le128 *iv); | ||
69 | EXPORT_SYMBOL_GPL(twofish_xts_enc_8way); | ||
70 | asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, | ||
71 | const u8 *src, le128 *iv); | ||
72 | EXPORT_SYMBOL_GPL(twofish_xts_dec_8way); | ||
58 | 73 | ||
59 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | 74 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, |
60 | const u8 *src) | 75 | const u8 *src) |
@@ -62,6 +77,20 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | |||
62 | __twofish_enc_blk_3way(ctx, dst, src, false); | 77 | __twofish_enc_blk_3way(ctx, dst, src, false); |
63 | } | 78 | } |
64 | 79 | ||
80 | void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
81 | { | ||
82 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
83 | GLUE_FUNC_CAST(twofish_enc_blk)); | ||
84 | } | ||
85 | EXPORT_SYMBOL_GPL(twofish_xts_enc); | ||
86 | |||
87 | void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
88 | { | ||
89 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
90 | GLUE_FUNC_CAST(twofish_dec_blk)); | ||
91 | } | ||
92 | EXPORT_SYMBOL_GPL(twofish_xts_dec); | ||
93 | |||
65 | 94 | ||
66 | static const struct common_glue_ctx twofish_enc = { | 95 | static const struct common_glue_ctx twofish_enc = { |
67 | .num_funcs = 3, | 96 | .num_funcs = 3, |
@@ -95,6 +124,19 @@ static const struct common_glue_ctx twofish_ctr = { | |||
95 | } } | 124 | } } |
96 | }; | 125 | }; |
97 | 126 | ||
127 | static const struct common_glue_ctx twofish_enc_xts = { | ||
128 | .num_funcs = 2, | ||
129 | .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, | ||
130 | |||
131 | .funcs = { { | ||
132 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, | ||
133 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } | ||
134 | }, { | ||
135 | .num_blocks = 1, | ||
136 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } | ||
137 | } } | ||
138 | }; | ||
139 | |||
98 | static const struct common_glue_ctx twofish_dec = { | 140 | static const struct common_glue_ctx twofish_dec = { |
99 | .num_funcs = 3, | 141 | .num_funcs = 3, |
100 | .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, | 142 | .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, |
@@ -127,6 +169,19 @@ static const struct common_glue_ctx twofish_dec_cbc = { | |||
127 | } } | 169 | } } |
128 | }; | 170 | }; |
129 | 171 | ||
172 | static const struct common_glue_ctx twofish_dec_xts = { | ||
173 | .num_funcs = 2, | ||
174 | .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, | ||
175 | |||
176 | .funcs = { { | ||
177 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, | ||
178 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } | ||
179 | }, { | ||
180 | .num_blocks = 1, | ||
181 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } | ||
182 | } } | ||
183 | }; | ||
184 | |||
130 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 185 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
131 | struct scatterlist *src, unsigned int nbytes) | 186 | struct scatterlist *src, unsigned int nbytes) |
132 | { | 187 | { |
@@ -275,54 +330,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
275 | struct scatterlist *src, unsigned int nbytes) | 330 | struct scatterlist *src, unsigned int nbytes) |
276 | { | 331 | { |
277 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 332 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
278 | be128 buf[TWOFISH_PARALLEL_BLOCKS]; | ||
279 | struct crypt_priv crypt_ctx = { | ||
280 | .ctx = &ctx->crypt_ctx, | ||
281 | .fpu_enabled = false, | ||
282 | }; | ||
283 | struct xts_crypt_req req = { | ||
284 | .tbuf = buf, | ||
285 | .tbuflen = sizeof(buf), | ||
286 | 333 | ||
287 | .tweak_ctx = &ctx->tweak_ctx, | 334 | return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, |
288 | .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), | 335 | XTS_TWEAK_CAST(twofish_enc_blk), |
289 | .crypt_ctx = &crypt_ctx, | 336 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
290 | .crypt_fn = encrypt_callback, | ||
291 | }; | ||
292 | int ret; | ||
293 | |||
294 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
295 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
296 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
297 | |||
298 | return ret; | ||
299 | } | 337 | } |
300 | 338 | ||
301 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 339 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
302 | struct scatterlist *src, unsigned int nbytes) | 340 | struct scatterlist *src, unsigned int nbytes) |
303 | { | 341 | { |
304 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 342 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
305 | be128 buf[TWOFISH_PARALLEL_BLOCKS]; | ||
306 | struct crypt_priv crypt_ctx = { | ||
307 | .ctx = &ctx->crypt_ctx, | ||
308 | .fpu_enabled = false, | ||
309 | }; | ||
310 | struct xts_crypt_req req = { | ||
311 | .tbuf = buf, | ||
312 | .tbuflen = sizeof(buf), | ||
313 | |||
314 | .tweak_ctx = &ctx->tweak_ctx, | ||
315 | .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), | ||
316 | .crypt_ctx = &crypt_ctx, | ||
317 | .crypt_fn = decrypt_callback, | ||
318 | }; | ||
319 | int ret; | ||
320 | 343 | ||
321 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 344 | return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, |
322 | ret = xts_crypt(desc, dst, src, nbytes, &req); | 345 | XTS_TWEAK_CAST(twofish_enc_blk), |
323 | twofish_fpu_end(crypt_ctx.fpu_enabled); | 346 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
324 | |||
325 | return ret; | ||
326 | } | 347 | } |
327 | 348 | ||
328 | static struct crypto_alg twofish_algs[10] = { { | 349 | static struct crypto_alg twofish_algs[10] = { { |
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8010ebc5705f..e99ac27f95b2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
@@ -293,6 +293,7 @@ extern const char * const x86_power_flags[32]; | |||
293 | #define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) | 293 | #define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) |
294 | #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) | 294 | #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) |
295 | #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) | 295 | #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) |
296 | #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) | ||
296 | #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) | 297 | #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) |
297 | #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) | 298 | #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) |
298 | #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) | 299 | #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) |
diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h new file mode 100644 index 000000000000..f097b2face10 --- /dev/null +++ b/arch/x86/include/asm/crypto/blowfish.h | |||
@@ -0,0 +1,43 @@ | |||
1 | #ifndef ASM_X86_BLOWFISH_H | ||
2 | #define ASM_X86_BLOWFISH_H | ||
3 | |||
4 | #include <linux/crypto.h> | ||
5 | #include <crypto/blowfish.h> | ||
6 | |||
7 | #define BF_PARALLEL_BLOCKS 4 | ||
8 | |||
9 | /* regular block cipher functions */ | ||
10 | asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, | ||
11 | bool xor); | ||
12 | asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); | ||
13 | |||
14 | /* 4-way parallel cipher functions */ | ||
15 | asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
16 | const u8 *src, bool xor); | ||
17 | asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
18 | const u8 *src); | ||
19 | |||
20 | static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) | ||
21 | { | ||
22 | __blowfish_enc_blk(ctx, dst, src, false); | ||
23 | } | ||
24 | |||
25 | static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, | ||
26 | const u8 *src) | ||
27 | { | ||
28 | __blowfish_enc_blk(ctx, dst, src, true); | ||
29 | } | ||
30 | |||
31 | static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
32 | const u8 *src) | ||
33 | { | ||
34 | __blowfish_enc_blk_4way(ctx, dst, src, false); | ||
35 | } | ||
36 | |||
37 | static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, | ||
38 | const u8 *src) | ||
39 | { | ||
40 | __blowfish_enc_blk_4way(ctx, dst, src, true); | ||
41 | } | ||
42 | |||
43 | #endif | ||
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index 98038add801e..bb93333d9200 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h | |||
@@ -48,6 +48,22 @@ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, | |||
48 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, | 48 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, |
49 | const u8 *src); | 49 | const u8 *src); |
50 | 50 | ||
51 | /* 16-way parallel cipher functions (avx/aes-ni) */ | ||
52 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
53 | const u8 *src); | ||
54 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
55 | const u8 *src); | ||
56 | |||
57 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
58 | const u8 *src); | ||
59 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, | ||
60 | const u8 *src, le128 *iv); | ||
61 | |||
62 | asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
63 | const u8 *src, le128 *iv); | ||
64 | asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
65 | const u8 *src, le128 *iv); | ||
66 | |||
51 | static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, | 67 | static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, |
52 | const u8 *src) | 68 | const u8 *src) |
53 | { | 69 | { |
@@ -79,4 +95,7 @@ extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, | |||
79 | extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, | 95 | extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, |
80 | le128 *iv); | 96 | le128 *iv); |
81 | 97 | ||
98 | extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
99 | extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
100 | |||
82 | #endif /* ASM_X86_CAMELLIA_H */ | 101 | #endif /* ASM_X86_CAMELLIA_H */ |
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index e2d65b061d27..1eef55596e82 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h | |||
@@ -14,10 +14,13 @@ typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); | |||
14 | typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); | 14 | typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); |
15 | typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, | 15 | typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, |
16 | le128 *iv); | 16 | le128 *iv); |
17 | typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src, | ||
18 | le128 *iv); | ||
17 | 19 | ||
18 | #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) | 20 | #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) |
19 | #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) | 21 | #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) |
20 | #define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) | 22 | #define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) |
23 | #define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn)) | ||
21 | 24 | ||
22 | struct common_glue_func_entry { | 25 | struct common_glue_func_entry { |
23 | unsigned int num_blocks; /* number of blocks that @fn will process */ | 26 | unsigned int num_blocks; /* number of blocks that @fn will process */ |
@@ -25,6 +28,7 @@ struct common_glue_func_entry { | |||
25 | common_glue_func_t ecb; | 28 | common_glue_func_t ecb; |
26 | common_glue_cbc_func_t cbc; | 29 | common_glue_cbc_func_t cbc; |
27 | common_glue_ctr_func_t ctr; | 30 | common_glue_ctr_func_t ctr; |
31 | common_glue_xts_func_t xts; | ||
28 | } fn_u; | 32 | } fn_u; |
29 | }; | 33 | }; |
30 | 34 | ||
@@ -96,6 +100,16 @@ static inline void le128_inc(le128 *i) | |||
96 | i->b = cpu_to_le64(b); | 100 | i->b = cpu_to_le64(b); |
97 | } | 101 | } |
98 | 102 | ||
103 | static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src) | ||
104 | { | ||
105 | u64 a = le64_to_cpu(src->a); | ||
106 | u64 b = le64_to_cpu(src->b); | ||
107 | u64 _tt = ((s64)a >> 63) & 0x87; | ||
108 | |||
109 | dst->a = cpu_to_le64((a << 1) ^ (b >> 63)); | ||
110 | dst->b = cpu_to_le64((b << 1) ^ _tt); | ||
111 | } | ||
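[Editor's note] le128_gf128mul_x_ble multiplies the 128-bit XTS tweak by x (often written as the primitive element alpha) in GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1: the value is shifted left one bit across both 64-bit halves and, when the bit shifted out of the top is set, the constant 0x87 (= x^7 + x^2 + x + 1) is folded back into the low half. The new XTS glue code uses this to step the tweak from one 16-byte block to the next. A hedged usage sketch, with everything except the helper itself illustrative:

	/* Hedged sketch: advancing the XTS tweak between consecutive blocks.
	 * t is assumed to start as the sector IV encrypted under the tweak
	 * key (the ->tweak_ctx half of the XTS context). */
	le128 t = initial_tweak;
	for (i = 0; i < nblocks; i++) {
		/* C[i] = E_K1(P[i] XOR t) XOR t, then t <- t * x */
		le128_gf128mul_x_ble(&t, &t);
	}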
112 | |||
99 | extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, | 113 | extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, |
100 | struct blkcipher_desc *desc, | 114 | struct blkcipher_desc *desc, |
101 | struct scatterlist *dst, | 115 | struct scatterlist *dst, |
@@ -118,4 +132,14 @@ extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | |||
118 | struct scatterlist *dst, | 132 | struct scatterlist *dst, |
119 | struct scatterlist *src, unsigned int nbytes); | 133 | struct scatterlist *src, unsigned int nbytes); |
120 | 134 | ||
135 | extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, | ||
136 | struct blkcipher_desc *desc, | ||
137 | struct scatterlist *dst, | ||
138 | struct scatterlist *src, unsigned int nbytes, | ||
139 | common_glue_func_t tweak_fn, void *tweak_ctx, | ||
140 | void *crypt_ctx); | ||
141 | |||
142 | extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, | ||
143 | le128 *iv, common_glue_func_t fn); | ||
144 | |||
121 | #endif /* _CRYPTO_GLUE_HELPER_H */ | 145 | #endif /* _CRYPTO_GLUE_HELPER_H */ |
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 0da1d3e2a55c..33c2b8a435da 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h | |||
@@ -6,6 +6,16 @@ | |||
6 | 6 | ||
7 | #define SERPENT_PARALLEL_BLOCKS 8 | 7 | #define SERPENT_PARALLEL_BLOCKS 8 |
8 | 8 | ||
9 | struct serpent_lrw_ctx { | ||
10 | struct lrw_table_ctx lrw_table; | ||
11 | struct serpent_ctx serpent_ctx; | ||
12 | }; | ||
13 | |||
14 | struct serpent_xts_ctx { | ||
15 | struct serpent_ctx tweak_ctx; | ||
16 | struct serpent_ctx crypt_ctx; | ||
17 | }; | ||
18 | |||
9 | asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 19 | asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
10 | const u8 *src); | 20 | const u8 *src); |
11 | asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 21 | asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
@@ -16,4 +26,23 @@ asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | |||
16 | asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 26 | asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
17 | const u8 *src, le128 *iv); | 27 | const u8 *src, le128 *iv); |
18 | 28 | ||
29 | asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
30 | const u8 *src, le128 *iv); | ||
31 | asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
32 | const u8 *src, le128 *iv); | ||
33 | |||
34 | extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, | ||
35 | le128 *iv); | ||
36 | |||
37 | extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
38 | extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
39 | |||
40 | extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
41 | unsigned int keylen); | ||
42 | |||
43 | extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm); | ||
44 | |||
45 | extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
46 | unsigned int keylen); | ||
47 | |||
19 | #endif | 48 | #endif |
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index 878c51ceebb5..e655c6029b45 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h | |||
@@ -28,6 +28,20 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | |||
28 | asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, | 28 | asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, |
29 | const u8 *src); | 29 | const u8 *src); |
30 | 30 | ||
31 | /* 8-way parallel cipher functions */ | ||
32 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, | ||
33 | const u8 *src); | ||
34 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, | ||
35 | const u8 *src); | ||
36 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, | ||
37 | const u8 *src); | ||
38 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, | ||
39 | const u8 *src, le128 *iv); | ||
40 | asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, | ||
41 | const u8 *src, le128 *iv); | ||
42 | asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, | ||
43 | const u8 *src, le128 *iv); | ||
44 | |||
31 | /* helpers from twofish_x86_64-3way module */ | 45 | /* helpers from twofish_x86_64-3way module */ |
32 | extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); | 46 | extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); |
33 | extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, | 47 | extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, |
@@ -43,4 +57,8 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); | |||
43 | extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, | 57 | extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, |
44 | unsigned int keylen); | 58 | unsigned int keylen); |
45 | 59 | ||
60 | /* helpers from twofish-avx module */ | ||
61 | extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
62 | extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
63 | |||
46 | #endif /* ASM_X86_TWOFISH_H */ | 64 | #endif /* ASM_X86_TWOFISH_H */ |