author     Linus Torvalds <torvalds@linux-foundation.org>  2013-07-05 15:12:33 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-07-05 15:12:33 -0400
commit     b2c311075db578f1433d9b303698491bfa21279a (patch)
tree       41d5f1b5ad6f45be7211f524328de81f7e9754be /arch/x86/crypto
parent     45175476ae2dbebc860d5cf486f2916044343513 (diff)
parent     02c0241b600e4ab8a732c89749e252165145d60c (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
- Do not idle omap device between crypto operations in one session.
- Added sha224/sha384 shims for SSSE3.
- More optimisations for camellia-aesni-avx2.
- Removed defunct blowfish/twofish AVX2 implementations.
- Added unaligned buffer self-tests.
- Added PCLMULQDQ optimisation for CRCT10DIF (see the usage sketch after this list).
- Added support for Freescale's DCP co-processor.
- Misc fixes.
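
CRC-T10DIF is the 16-bit CRC that guards the SCSI Data Integrity Field.
In-kernel users reach it through lib/crc-t10dif, which this series reroutes
over the crypto API so the PCLMULQDQ driver added here can be picked up
transparently. A minimal caller sketch, assuming the two-argument
crc_t10dif() prototype from include/linux/crc-t10dif.h of this period (the
512-byte sector size is only an example):

#include <linux/crc-t10dif.h>

/* Compute the T10 DIF guard tag over one 512-byte sector.  The CRC is
 * seeded with zero internally; with CONFIG_CRYPTO_CRCT10DIF_PCLMUL set,
 * the work lands in the PCLMULQDQ-accelerated "crct10dif" transform.
 */
static __u16 example_guard_tag(const unsigned char *sector)
{
	return crc_t10dif(sector, 512);
}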
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (44 commits)
crypto: testmgr - test hash implementations with unaligned buffers
crypto: testmgr - test AEADs with unaligned buffers
crypto: testmgr - test skciphers with unaligned buffers
crypto: testmgr - check that entries in alg_test_descs are in correct order
Revert "crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher"
Revert "crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher"
crypto: camellia-aesni-avx2 - tune assembly code for more performance
hwrng: bcm2835 - fix MODULE_LICENSE tag
hwrng: nomadik - use clk_prepare_enable()
crypto: picoxcell - replace strict_strtoul() with kstrtoul()
crypto: dcp - Staticize local symbols
crypto: dcp - Use NULL instead of 0
crypto: dcp - Use devm_* APIs
crypto: dcp - Remove redundant platform_set_drvdata()
hwrng: use platform_{get,set}_drvdata()
crypto: omap-aes - Don't idle/start AES device between Encrypt operations
crypto: crct10dif - Use PTR_RET
crypto: ux500 - Cocci spatch "resource_size.spatch"
crypto: sha256_ssse3 - add sha224 support
crypto: sha512_ssse3 - add sha384 support
...
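
The testmgr commits at the head of the list feed each hash, AEAD and
skcipher implementation deliberately misaligned buffers, since SIMD code is
a common source of hidden alignment assumptions. Roughly, the idea is the
following (this helper is illustrative only, not the actual testmgr code):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

/* Copy a test vector to an odd offset so that an implementation which
 * silently assumes aligned input fails the self-test.  The caller must
 * free the buffer with kfree(ptr - 1).
 */
static void *copy_to_unaligned(const void *src, size_t len)
{
	u8 *buf = kmalloc(len + 1, GFP_KERNEL);

	if (!buf)
		return NULL;
	memcpy(buf + 1, src, len);	/* data now starts at buf + 1 */
	return buf + 1;
}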
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--  arch/x86/crypto/Makefile                      |    8
-rw-r--r--  arch/x86/crypto/blowfish-avx2-asm_64.S        |  449
-rw-r--r--  arch/x86/crypto/blowfish_avx2_glue.c          |  585
-rw-r--r--  arch/x86/crypto/blowfish_glue.c               |   32
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx2-asm_64.S  |  160
-rw-r--r--  arch/x86/crypto/crct10dif-pcl-asm_64.S        |  643
-rw-r--r--  arch/x86/crypto/crct10dif-pclmul_glue.c       |  151
-rw-r--r--  arch/x86/crypto/sha256_ssse3_glue.c           |   57
-rw-r--r--  arch/x86/crypto/sha512_ssse3_glue.c           |   58
-rw-r--r--  arch/x86/crypto/twofish-avx2-asm_64.S         |  600
-rw-r--r--  arch/x86/crypto/twofish_avx2_glue.c           |  584
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c            |   14
12 files changed, 1016 insertions(+), 2325 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a3a0ed80f17c..7d6ba9db1be9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -3,8 +3,6 @@
 #
 
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
-			$(comma)4)$(comma)%ymm2,yes,no)
 
 obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -29,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -42,10 +41,8 @@ endif
 
 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
-	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -73,10 +70,8 @@ ifeq ($(avx_supported),yes)
 endif
 
 ifeq ($(avx2_supported),yes)
-	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
-	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
@@ -87,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S
deleted file mode 100644
index 784452e0d05d..000000000000
--- a/arch/x86/crypto/blowfish-avx2-asm_64.S
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * x86_64/AVX2 assembler optimized version of Blowfish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/linkage.h>
-
-.file "blowfish-avx2-asm_64.S"
-
-.data
-.align 32
-
-.Lprefetch_mask:
-.long 0*64
-.long 1*64
-.long 2*64
-.long 3*64
-.long 4*64
-.long 5*64
-.long 6*64
-.long 7*64
-
-.Lbswap32_mask:
-.long 0x00010203
-.long 0x04050607
-.long 0x08090a0b
-.long 0x0c0d0e0f
-
-.Lbswap128_mask:
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lbswap_iv_mask:
-	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
-
-.text
-/* structure of crypto context */
-#define p	0
-#define s0	((16 + 2) * 4)
-#define s1	((16 + 2 + (1 * 256)) * 4)
-#define s2	((16 + 2 + (2 * 256)) * 4)
-#define s3	((16 + 2 + (3 * 256)) * 4)
-
-/* register macros */
-#define CTX	%rdi
-#define RIO	%rdx
-
-#define RS0	%rax
-#define RS1	%r8
-#define RS2	%r9
-#define RS3	%r10
-
-#define RLOOP	%r11
-#define RLOOPd	%r11d
-
-#define RXr0	%ymm8
-#define RXr1	%ymm9
-#define RXr2	%ymm10
-#define RXr3	%ymm11
-#define RXl0	%ymm12
-#define RXl1	%ymm13
-#define RXl2	%ymm14
-#define RXl3	%ymm15
-
-/* temp regs */
-#define RT0	%ymm0
-#define RT0x	%xmm0
-#define RT1	%ymm1
-#define RT1x	%xmm1
-#define RIDX0	%ymm2
-#define RIDX1	%ymm3
-#define RIDX1x	%xmm3
-#define RIDX2	%ymm4
-#define RIDX3	%ymm5
-
-/* vpgatherdd mask and '-1' */
-#define RNOT	%ymm6
-
-/* byte mask, (-1 >> 24) */
-#define RBYTE	%ymm7
-
-/***********************************************************************
- * 32-way AVX2 blowfish
- ***********************************************************************/
-#define F(xl, xr) \
-	vpsrld $24, xl, RIDX0; \
-	vpsrld $16, xl, RIDX1; \
-	vpsrld $8, xl, RIDX2; \
-	vpand RBYTE, RIDX1, RIDX1; \
-	vpand RBYTE, RIDX2, RIDX2; \
-	vpand RBYTE, xl, RIDX3; \
-	\
-	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpcmpeqd RIDX0, RIDX0, RIDX0; \
-	\
-	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
-	vpcmpeqd RIDX1, RIDX1, RIDX1; \
-	vpaddd RT0, RT1, RT0; \
-	\
-	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
-	vpxor RT0, RT1, RT0; \
-	\
-	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpaddd RT0, RT1, RT0; \
-	\
-	vpxor RT0, xr, xr;
-
-#define add_roundkey(xl, nmem) \
-	vpbroadcastd nmem, RT0; \
-	vpxor RT0, xl ## 0, xl ## 0; \
-	vpxor RT0, xl ## 1, xl ## 1; \
-	vpxor RT0, xl ## 2, xl ## 2; \
-	vpxor RT0, xl ## 3, xl ## 3;
-
-#define round_enc() \
-	add_roundkey(RXr, p(CTX,RLOOP,4)); \
-	F(RXl0, RXr0); \
-	F(RXl1, RXr1); \
-	F(RXl2, RXr2); \
-	F(RXl3, RXr3); \
-	\
-	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
-	F(RXr0, RXl0); \
-	F(RXr1, RXl1); \
-	F(RXr2, RXl2); \
-	F(RXr3, RXl3);
-
-#define round_dec() \
-	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
-	F(RXl0, RXr0); \
-	F(RXl1, RXr1); \
-	F(RXl2, RXr2); \
-	F(RXl3, RXr3); \
-	\
-	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
-	F(RXr0, RXl0); \
-	F(RXr1, RXl1); \
-	F(RXr2, RXl2); \
-	F(RXr3, RXl3);
-
-#define init_round_constants() \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	leaq s0(CTX), RS0; \
-	leaq s1(CTX), RS1; \
-	leaq s2(CTX), RS2; \
-	leaq s3(CTX), RS3; \
-	vpsrld $24, RNOT, RBYTE;
-
-#define transpose_2x2(x0, x1, t0) \
-	vpunpckldq x0, x1, t0; \
-	vpunpckhdq x0, x1, x1; \
-	\
-	vpunpcklqdq t0, x1, x0; \
-	vpunpckhqdq t0, x1, x1;
-
-#define read_block(xl, xr) \
-	vbroadcasti128 .Lbswap32_mask, RT1; \
-	\
-	vpshufb RT1, xl ## 0, xl ## 0; \
-	vpshufb RT1, xr ## 0, xr ## 0; \
-	vpshufb RT1, xl ## 1, xl ## 1; \
-	vpshufb RT1, xr ## 1, xr ## 1; \
-	vpshufb RT1, xl ## 2, xl ## 2; \
-	vpshufb RT1, xr ## 2, xr ## 2; \
-	vpshufb RT1, xl ## 3, xl ## 3; \
-	vpshufb RT1, xr ## 3, xr ## 3; \
-	\
-	transpose_2x2(xl ## 0, xr ## 0, RT0); \
-	transpose_2x2(xl ## 1, xr ## 1, RT0); \
-	transpose_2x2(xl ## 2, xr ## 2, RT0); \
-	transpose_2x2(xl ## 3, xr ## 3, RT0);
-
-#define write_block(xl, xr) \
-	vbroadcasti128 .Lbswap32_mask, RT1; \
-	\
-	transpose_2x2(xl ## 0, xr ## 0, RT0); \
-	transpose_2x2(xl ## 1, xr ## 1, RT0); \
-	transpose_2x2(xl ## 2, xr ## 2, RT0); \
-	transpose_2x2(xl ## 3, xr ## 3, RT0); \
-	\
-	vpshufb RT1, xl ## 0, xl ## 0; \
-	vpshufb RT1, xr ## 0, xr ## 0; \
-	vpshufb RT1, xl ## 1, xl ## 1; \
-	vpshufb RT1, xr ## 1, xr ## 1; \
-	vpshufb RT1, xl ## 2, xl ## 2; \
-	vpshufb RT1, xr ## 2, xr ## 2; \
-	vpshufb RT1, xl ## 3, xl ## 3; \
-	vpshufb RT1, xr ## 3, xr ## 3;
-
-.align 8
-__blowfish_enc_blk32:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RXl0..4, RXr0..4: plaintext
-	 * output:
-	 *	RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
-	 */
-	init_round_constants();
-
-	read_block(RXl, RXr);
-
-	movl $1, RLOOPd;
-	add_roundkey(RXl, p+4*(0)(CTX));
-
-.align 4
-.L__enc_loop:
-	round_enc();
-
-	leal 2(RLOOPd), RLOOPd;
-	cmpl $17, RLOOPd;
-	jne .L__enc_loop;
-
-	add_roundkey(RXr, p+4*(17)(CTX));
-
-	write_block(RXl, RXr);
-
-	ret;
-ENDPROC(__blowfish_enc_blk32)
-
-.align 8
-__blowfish_dec_blk32:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RXl0..4, RXr0..4: ciphertext
-	 * output:
-	 *	RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
-	 */
-	init_round_constants();
-
-	read_block(RXl, RXr);
-
-	movl $14, RLOOPd;
-	add_roundkey(RXl, p+4*(17)(CTX));
-
-.align 4
-.L__dec_loop:
-	round_dec();
-
-	addl $-2, RLOOPd;
-	jns .L__dec_loop;
-
-	add_roundkey(RXr, p+4*(0)(CTX));
-
-	write_block(RXl, RXr);
-
-	ret;
-ENDPROC(__blowfish_dec_blk32)
-
-ENTRY(blowfish_ecb_enc_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_enc_blk32;
-
-	vmovdqu RXr0, 0*32(%rsi);
-	vmovdqu RXl0, 1*32(%rsi);
-	vmovdqu RXr1, 2*32(%rsi);
-	vmovdqu RXl1, 3*32(%rsi);
-	vmovdqu RXr2, 4*32(%rsi);
-	vmovdqu RXl2, 5*32(%rsi);
-	vmovdqu RXr3, 6*32(%rsi);
-	vmovdqu RXl3, 7*32(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ecb_enc_32way)
-
-ENTRY(blowfish_ecb_dec_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_dec_blk32;
-
-	vmovdqu RXr0, 0*32(%rsi);
-	vmovdqu RXl0, 1*32(%rsi);
-	vmovdqu RXr1, 2*32(%rsi);
-	vmovdqu RXl1, 3*32(%rsi);
-	vmovdqu RXr2, 4*32(%rsi);
-	vmovdqu RXl2, 5*32(%rsi);
-	vmovdqu RXr3, 6*32(%rsi);
-	vmovdqu RXl3, 7*32(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ecb_dec_32way)
-
-ENTRY(blowfish_cbc_dec_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_dec_blk32;
-
-	/* xor with src */
-	vmovq (%rdx), RT0x;
-	vpshufd $0x4f, RT0x, RT0x;
-	vinserti128 $1, 8(%rdx), RT0, RT0;
-	vpxor RT0, RXr0, RXr0;
-	vpxor 0*32+24(%rdx), RXl0, RXl0;
-	vpxor 1*32+24(%rdx), RXr1, RXr1;
-	vpxor 2*32+24(%rdx), RXl1, RXl1;
-	vpxor 3*32+24(%rdx), RXr2, RXr2;
-	vpxor 4*32+24(%rdx), RXl2, RXl2;
-	vpxor 5*32+24(%rdx), RXr3, RXr3;
-	vpxor 6*32+24(%rdx), RXl3, RXl3;
-
-	vmovdqu RXr0, (0*32)(%rsi);
-	vmovdqu RXl0, (1*32)(%rsi);
-	vmovdqu RXr1, (2*32)(%rsi);
-	vmovdqu RXl1, (3*32)(%rsi);
-	vmovdqu RXr2, (4*32)(%rsi);
-	vmovdqu RXl2, (5*32)(%rsi);
-	vmovdqu RXr3, (6*32)(%rsi);
-	vmovdqu RXl3, (7*32)(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_cbc_dec_32way)
-
-ENTRY(blowfish_ctr_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: iv (big endian, 64bit)
-	 */
-
-	vzeroupper;
-
-	vpcmpeqd RT0, RT0, RT0;
-	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
-
-	vpcmpeqd RT1x, RT1x, RT1x;
-	vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
-	vpxor RIDX0, RIDX0, RIDX0;
-	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
-
-	vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
-
-	vpcmpeqd RT1, RT1, RT1;
-	vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
-	vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
-
-	vbroadcasti128 .Lbswap_iv_mask, RIDX0;
-	vbroadcasti128 .Lbswap128_mask, RIDX1;
-
-	/* load IV and byteswap */
-	vmovq (%rcx), RT1x;
-	vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
-	vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
-
-	/* construct IVs */
-	vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */
-	vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */
-	vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */
-	vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl1;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr1;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl2;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr2;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl3;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr3;
-
-	/* store last IV */
-	vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
-	vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
-	vmovq RT1x, (%rcx);
-
-	call __blowfish_enc_blk32;
-
-	/* dst = src ^ iv */
-	vpxor 0*32(%rdx), RXr0, RXr0;
-	vpxor 1*32(%rdx), RXl0, RXl0;
-	vpxor 2*32(%rdx), RXr1, RXr1;
-	vpxor 3*32(%rdx), RXl1, RXl1;
-	vpxor 4*32(%rdx), RXr2, RXr2;
-	vpxor 5*32(%rdx), RXl2, RXl2;
-	vpxor 6*32(%rdx), RXr3, RXr3;
-	vpxor 7*32(%rdx), RXl3, RXl3;
-	vmovdqu RXr0, (0*32)(%rsi);
-	vmovdqu RXl0, (1*32)(%rsi);
-	vmovdqu RXr1, (2*32)(%rsi);
-	vmovdqu RXl1, (3*32)(%rsi);
-	vmovdqu RXr2, (4*32)(%rsi);
-	vmovdqu RXl2, (5*32)(%rsi);
-	vmovdqu RXr3, (6*32)(%rsi);
-	vmovdqu RXl3, (7*32)(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ctr_32way)
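
For reference, the F() macro in the file deleted above vectorises the scalar
Blowfish round function across 32 blocks using vpgatherdd table lookups. In
plain C the same F is the classic four S-box combination; the s[4][256]
layout here is illustrative (the kernel keeps the boxes inside struct
bf_ctx):

#include <linux/types.h>

/* Scalar reference for Blowfish F: ((S0[a] + S1[b]) ^ S2[c]) + S3[d],
 * where a..d are the four bytes of x, most significant first - exactly
 * what the gather/add/xor/add sequence in the macro computed 32-wide.
 */
static u32 blowfish_f(const u32 s[4][256], u32 x)
{
	return ((s[0][x >> 24] + s[1][(x >> 16) & 0xff]) ^
		s[2][(x >> 8) & 0xff]) + s[3][x & 0xff];
}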
diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c
deleted file mode 100644
index 4417e9aea78d..000000000000
--- a/arch/x86/crypto/blowfish_avx2_glue.c
+++ /dev/null
@@ -1,585 +0,0 @@
-/*
- * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/algapi.h>
-#include <crypto/blowfish.h>
-#include <crypto/cryptd.h>
-#include <crypto/ctr.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
-#include <asm/crypto/blowfish.h>
-#include <asm/crypto/ablk_helper.h>
-#include <crypto/scatterwalk.h>
-
-#define BF_AVX2_PARALLEL_BLOCKS 32
-
-/* 32-way AVX2 parallel cipher functions */
-asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
-				   __be64 *iv);
-
-static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	if (fpu_enabled)
-		return true;
-
-	/* FPU is only used when chunk to be processed is large enough, so
-	 * do not enable FPU until it is necessary.
-	 */
-	if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
-		return false;
-
-	kernel_fpu_begin();
-	return true;
-}
-
-static inline void bf_fpu_end(bool fpu_enabled)
-{
-	if (fpu_enabled)
-		kernel_fpu_end();
-}
-
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     bool enc)
-{
-	bool fpu_enabled = false;
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-
-		/* Process multi-block AVX2 batch */
-		if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					blowfish_ecb_enc_32way(ctx, wdst, wsrc);
-				else
-					blowfish_ecb_dec_32way(ctx, wdst, wsrc);
-
-				wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
-				wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
-				nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Process multi-block batch */
-		if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					blowfish_enc_blk_4way(ctx, wdst, wsrc);
-				else
-					blowfish_dec_blk_4way(ctx, wdst, wsrc);
-
-				wsrc += bsize * BF_PARALLEL_BLOCKS;
-				wdst += bsize * BF_PARALLEL_BLOCKS;
-				nbytes -= bsize * BF_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			if (enc)
-				blowfish_enc_blk(ctx, wdst, wsrc);
-			else
-				blowfish_dec_blk(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-	return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, true);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, false);
-}
-
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 *iv = (u64 *)walk->iv;
-
-	do {
-		*dst = *src ^ *iv;
-		blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	*(u64 *)walk->iv = *iv;
-	return nbytes;
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	return err;
-}
-
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 last_iv;
-	int i;
-
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process multi-block AVX2 batch */
-	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-		do {
-			nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
-			src -= BF_AVX2_PARALLEL_BLOCKS - 1;
-			dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
-
-			blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			*dst ^= *(src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Process multi-block batch */
-	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-		u64 ivs[BF_PARALLEL_BLOCKS - 1];
-
-		do {
-			nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
-			src -= BF_PARALLEL_BLOCKS - 1;
-			dst -= BF_PARALLEL_BLOCKS - 1;
-
-			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
-				dst[i + 1] ^= ivs[i];
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			*dst ^= *(src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	for (;;) {
-		blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
-
-		*dst ^= *(src - 1);
-		src -= 1;
-		dst -= 1;
-	}
-
-done:
-	*dst ^= *(u64 *)walk->iv;
-	*(u64 *)walk->iv = last_iv;
-
-	return nbytes;
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes)) {
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-	return err;
-}
-
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 *ctrblk = walk->iv;
-	u8 keystream[BF_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-
-	blowfish_enc_blk(ctx, keystream, ctrblk);
-	crypto_xor(keystream, src, nbytes);
-	memcpy(dst, keystream, nbytes);
-
-	crypto_inc(ctrblk, BF_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	int i;
-
-	/* Process multi-block AVX2 batch */
-	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-		do {
-			blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
-					   (__be64 *)walk->iv);
-
-			src += BF_AVX2_PARALLEL_BLOCKS;
-			dst += BF_AVX2_PARALLEL_BLOCKS;
-			nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Process four block batch */
-	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-		__be64 ctrblocks[BF_PARALLEL_BLOCKS];
-		u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
-
-		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				ctrblocks[i] = cpu_to_be64(ctrblk++);
-			}
-
-			blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
-						  (u8 *)ctrblocks);
-
-			src += BF_PARALLEL_BLOCKS;
-			dst += BF_PARALLEL_BLOCKS;
-			nbytes -= bsize * BF_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-		*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		u64 ctrblk;
-
-		if (dst != src)
-			*dst = *src;
-
-		ctrblk = *(u64 *)walk->iv;
-		be64_add_cpu((__be64 *)walk->iv, 1);
-
-		blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
-
-		src += 1;
-		dst += 1;
-	} while ((nbytes -= bsize) >= bsize);
-
-done:
-	return nbytes;
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
-}
-
-static struct crypto_alg bf_algs[6] = { {
-	.cra_name = "__ecb-blowfish-avx2",
-	.cra_driver_name = "__driver-ecb-blowfish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = BF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct bf_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.setkey = blowfish_setkey,
-			.encrypt = ecb_encrypt,
-			.decrypt = ecb_decrypt,
-		},
-	},
-}, {
-	.cra_name = "__cbc-blowfish-avx2",
-	.cra_driver_name = "__driver-cbc-blowfish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = BF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct bf_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.setkey = blowfish_setkey,
-			.encrypt = cbc_encrypt,
-			.decrypt = cbc_decrypt,
-		},
-	},
-}, {
-	.cra_name = "__ctr-blowfish-avx2",
-	.cra_driver_name = "__driver-ctr-blowfish-avx2",
-	.cra_priority = 0,
-	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize = 1,
-	.cra_ctxsize = sizeof(struct bf_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_blkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.ivsize = BF_BLOCK_SIZE,
-			.setkey = blowfish_setkey,
-			.encrypt = ctr_crypt,
-			.decrypt = ctr_crypt,
-		},
-	},
-}, {
-	.cra_name = "ecb(blowfish)",
-	.cra_driver_name = "ecb-blowfish-avx2",
-	.cra_priority = 400,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = BF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = ablk_encrypt,
-			.decrypt = ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name = "cbc(blowfish)",
-	.cra_driver_name = "cbc-blowfish-avx2",
-	.cra_priority = 400,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = BF_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.ivsize = BF_BLOCK_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = __ablk_encrypt,
-			.decrypt = ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name = "ctr(blowfish)",
-	.cra_driver_name = "ctr-blowfish-avx2",
-	.cra_priority = 400,
-	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = 1,
-	.cra_ctxsize = sizeof(struct async_helper_ctx),
-	.cra_alignmask = 0,
-	.cra_type = &crypto_ablkcipher_type,
-	.cra_module = THIS_MODULE,
-	.cra_init = ablk_init,
-	.cra_exit = ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = BF_MIN_KEY_SIZE,
-			.max_keysize = BF_MAX_KEY_SIZE,
-			.ivsize = BF_BLOCK_SIZE,
-			.setkey = ablk_set_key,
-			.encrypt = ablk_encrypt,
-			.decrypt = ablk_encrypt,
-			.geniv = "chainiv",
-		},
-	},
-} };
-
-
-static int __init init(void)
-{
-	u64 xcr0;
-
-	if (!cpu_has_avx2 || !cpu_has_osxsave) {
-		pr_info("AVX2 instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-		return -ENODEV;
-	}
-
-	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("blowfish");
-MODULE_ALIAS("blowfish-asm");
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 3548d76dbaa9..50ec333b70e6 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -1,7 +1,7 @@
 /*
  * Glue Code for assembler optimized version of Blowfish
  *
- * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  *
  * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,24 +32,40 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
-#include <asm/crypto/blowfish.h>
 
 /* regular block cipher functions */
 asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
 				   bool xor);
-EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
-
 asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
-EXPORT_SYMBOL_GPL(blowfish_dec_blk);
 
 /* 4-way parallel cipher functions */
 asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
-EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
-
 asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
 				      const u8 *src);
-EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, true);
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+					     const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, true);
+}
 
 static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 91a1878fcc3e..0e0b8863a34b 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S | |||
@@ -51,16 +51,6 @@ | |||
51 | #define ymm14_x xmm14 | 51 | #define ymm14_x xmm14 |
52 | #define ymm15_x xmm15 | 52 | #define ymm15_x xmm15 |
53 | 53 | ||
54 | /* | ||
55 | * AES-NI instructions do not support ymmX registers, so we need splitting and | ||
56 | * merging. | ||
57 | */ | ||
58 | #define vaesenclast256(zero, yreg, tmp) \ | ||
59 | vextracti128 $1, yreg, tmp##_x; \ | ||
60 | vaesenclast zero##_x, yreg##_x, yreg##_x; \ | ||
61 | vaesenclast zero##_x, tmp##_x, tmp##_x; \ | ||
62 | vinserti128 $1, tmp##_x, yreg, yreg; | ||
63 | |||
64 | /********************************************************************** | 54 | /********************************************************************** |
65 | 32-way camellia | 55 | 32-way camellia |
66 | **********************************************************************/ | 56 | **********************************************************************/ |
@@ -79,46 +69,70 @@ | |||
79 | * S-function with AES subbytes \ | 69 | * S-function with AES subbytes \ |
80 | */ \ | 70 | */ \ |
81 | vbroadcasti128 .Linv_shift_row, t4; \ | 71 | vbroadcasti128 .Linv_shift_row, t4; \ |
82 | vpbroadcastb .L0f0f0f0f, t7; \ | 72 | vpbroadcastd .L0f0f0f0f, t7; \ |
83 | vbroadcasti128 .Lpre_tf_lo_s1, t0; \ | 73 | vbroadcasti128 .Lpre_tf_lo_s1, t5; \ |
84 | vbroadcasti128 .Lpre_tf_hi_s1, t1; \ | 74 | vbroadcasti128 .Lpre_tf_hi_s1, t6; \ |
75 | vbroadcasti128 .Lpre_tf_lo_s4, t2; \ | ||
76 | vbroadcasti128 .Lpre_tf_hi_s4, t3; \ | ||
85 | \ | 77 | \ |
86 | /* AES inverse shift rows */ \ | 78 | /* AES inverse shift rows */ \ |
87 | vpshufb t4, x0, x0; \ | 79 | vpshufb t4, x0, x0; \ |
88 | vpshufb t4, x7, x7; \ | 80 | vpshufb t4, x7, x7; \ |
89 | vpshufb t4, x1, x1; \ | ||
90 | vpshufb t4, x4, x4; \ | ||
91 | vpshufb t4, x2, x2; \ | ||
92 | vpshufb t4, x5, x5; \ | ||
93 | vpshufb t4, x3, x3; \ | 81 | vpshufb t4, x3, x3; \ |
94 | vpshufb t4, x6, x6; \ | 82 | vpshufb t4, x6, x6; \ |
83 | vpshufb t4, x2, x2; \ | ||
84 | vpshufb t4, x5, x5; \ | ||
85 | vpshufb t4, x1, x1; \ | ||
86 | vpshufb t4, x4, x4; \ | ||
95 | \ | 87 | \ |
96 | /* prefilter sboxes 1, 2 and 3 */ \ | 88 | /* prefilter sboxes 1, 2 and 3 */ \ |
97 | vbroadcasti128 .Lpre_tf_lo_s4, t2; \ | ||
98 | vbroadcasti128 .Lpre_tf_hi_s4, t3; \ | ||
99 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
100 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
101 | filter_8bit(x1, t0, t1, t7, t6); \ | ||
102 | filter_8bit(x4, t0, t1, t7, t6); \ | ||
103 | filter_8bit(x2, t0, t1, t7, t6); \ | ||
104 | filter_8bit(x5, t0, t1, t7, t6); \ | ||
105 | \ | ||
106 | /* prefilter sbox 4 */ \ | 89 | /* prefilter sbox 4 */ \ |
90 | filter_8bit(x0, t5, t6, t7, t4); \ | ||
91 | filter_8bit(x7, t5, t6, t7, t4); \ | ||
92 | vextracti128 $1, x0, t0##_x; \ | ||
93 | vextracti128 $1, x7, t1##_x; \ | ||
94 | filter_8bit(x3, t2, t3, t7, t4); \ | ||
95 | filter_8bit(x6, t2, t3, t7, t4); \ | ||
96 | vextracti128 $1, x3, t3##_x; \ | ||
97 | vextracti128 $1, x6, t2##_x; \ | ||
98 | filter_8bit(x2, t5, t6, t7, t4); \ | ||
99 | filter_8bit(x5, t5, t6, t7, t4); \ | ||
100 | filter_8bit(x1, t5, t6, t7, t4); \ | ||
101 | filter_8bit(x4, t5, t6, t7, t4); \ | ||
102 | \ | ||
107 | vpxor t4##_x, t4##_x, t4##_x; \ | 103 | vpxor t4##_x, t4##_x, t4##_x; \ |
108 | filter_8bit(x3, t2, t3, t7, t6); \ | ||
109 | filter_8bit(x6, t2, t3, t7, t6); \ | ||
110 | \ | 104 | \ |
111 | /* AES subbytes + AES shift rows */ \ | 105 | /* AES subbytes + AES shift rows */ \ |
106 | vextracti128 $1, x2, t6##_x; \ | ||
107 | vextracti128 $1, x5, t5##_x; \ | ||
108 | vaesenclast t4##_x, x0##_x, x0##_x; \ | ||
109 | vaesenclast t4##_x, t0##_x, t0##_x; \ | ||
110 | vinserti128 $1, t0##_x, x0, x0; \ | ||
111 | vaesenclast t4##_x, x7##_x, x7##_x; \ | ||
112 | vaesenclast t4##_x, t1##_x, t1##_x; \ | ||
113 | vinserti128 $1, t1##_x, x7, x7; \ | ||
114 | vaesenclast t4##_x, x3##_x, x3##_x; \ | ||
115 | vaesenclast t4##_x, t3##_x, t3##_x; \ | ||
116 | vinserti128 $1, t3##_x, x3, x3; \ | ||
117 | vaesenclast t4##_x, x6##_x, x6##_x; \ | ||
118 | vaesenclast t4##_x, t2##_x, t2##_x; \ | ||
119 | vinserti128 $1, t2##_x, x6, x6; \ | ||
120 | vextracti128 $1, x1, t3##_x; \ | ||
121 | vextracti128 $1, x4, t2##_x; \ | ||
112 | vbroadcasti128 .Lpost_tf_lo_s1, t0; \ | 122 | vbroadcasti128 .Lpost_tf_lo_s1, t0; \ |
113 | vbroadcasti128 .Lpost_tf_hi_s1, t1; \ | 123 | vbroadcasti128 .Lpost_tf_hi_s1, t1; \ |
114 | vaesenclast256(t4, x0, t5); \ | 124 | vaesenclast t4##_x, x2##_x, x2##_x; \ |
115 | vaesenclast256(t4, x7, t5); \ | 125 | vaesenclast t4##_x, t6##_x, t6##_x; \ |
116 | vaesenclast256(t4, x1, t5); \ | 126 | vinserti128 $1, t6##_x, x2, x2; \ |
117 | vaesenclast256(t4, x4, t5); \ | 127 | vaesenclast t4##_x, x5##_x, x5##_x; \ |
118 | vaesenclast256(t4, x2, t5); \ | 128 | vaesenclast t4##_x, t5##_x, t5##_x; \ |
119 | vaesenclast256(t4, x5, t5); \ | 129 | vinserti128 $1, t5##_x, x5, x5; \ |
120 | vaesenclast256(t4, x3, t5); \ | 130 | vaesenclast t4##_x, x1##_x, x1##_x; \ |
121 | vaesenclast256(t4, x6, t5); \ | 131 | vaesenclast t4##_x, t3##_x, t3##_x; \ |
132 | vinserti128 $1, t3##_x, x1, x1; \ | ||
133 | vaesenclast t4##_x, x4##_x, x4##_x; \ | ||
134 | vaesenclast t4##_x, t2##_x, t2##_x; \ | ||
135 | vinserti128 $1, t2##_x, x4, x4; \ | ||
122 | \ | 136 | \ |
123 | /* postfilter sboxes 1 and 4 */ \ | 137 | /* postfilter sboxes 1 and 4 */ \ |
124 | vbroadcasti128 .Lpost_tf_lo_s3, t2; \ | 138 | vbroadcasti128 .Lpost_tf_lo_s3, t2; \ |
@@ -139,22 +153,12 @@ | |||
139 | /* postfilter sbox 2 */ \ | 153 | /* postfilter sbox 2 */ \ |
140 | filter_8bit(x1, t4, t5, t7, t2); \ | 154 | filter_8bit(x1, t4, t5, t7, t2); \ |
141 | filter_8bit(x4, t4, t5, t7, t2); \ | 155 | filter_8bit(x4, t4, t5, t7, t2); \ |
156 | vpxor t7, t7, t7; \ | ||
142 | \ | 157 | \ |
143 | vpsrldq $1, t0, t1; \ | 158 | vpsrldq $1, t0, t1; \ |
144 | vpsrldq $2, t0, t2; \ | 159 | vpsrldq $2, t0, t2; \ |
160 | vpshufb t7, t1, t1; \ | ||
145 | vpsrldq $3, t0, t3; \ | 161 | vpsrldq $3, t0, t3; \ |
146 | vpsrldq $4, t0, t4; \ | ||
147 | vpsrldq $5, t0, t5; \ | ||
148 | vpsrldq $6, t0, t6; \ | ||
149 | vpsrldq $7, t0, t7; \ | ||
150 | vpbroadcastb t0##_x, t0; \ | ||
151 | vpbroadcastb t1##_x, t1; \ | ||
152 | vpbroadcastb t2##_x, t2; \ | ||
153 | vpbroadcastb t3##_x, t3; \ | ||
154 | vpbroadcastb t4##_x, t4; \ | ||
155 | vpbroadcastb t6##_x, t6; \ | ||
156 | vpbroadcastb t5##_x, t5; \ | ||
157 | vpbroadcastb t7##_x, t7; \ | ||
158 | \ | 162 | \ |
159 | /* P-function */ \ | 163 | /* P-function */ \ |
160 | vpxor x5, x0, x0; \ | 164 | vpxor x5, x0, x0; \ |
@@ -162,11 +166,21 @@ | |||
162 | vpxor x7, x2, x2; \ | 166 | vpxor x7, x2, x2; \ |
163 | vpxor x4, x3, x3; \ | 167 | vpxor x4, x3, x3; \ |
164 | \ | 168 | \ |
169 | vpshufb t7, t2, t2; \ | ||
170 | vpsrldq $4, t0, t4; \ | ||
171 | vpshufb t7, t3, t3; \ | ||
172 | vpsrldq $5, t0, t5; \ | ||
173 | vpshufb t7, t4, t4; \ | ||
174 | \ | ||
165 | vpxor x2, x4, x4; \ | 175 | vpxor x2, x4, x4; \ |
166 | vpxor x3, x5, x5; \ | 176 | vpxor x3, x5, x5; \ |
167 | vpxor x0, x6, x6; \ | 177 | vpxor x0, x6, x6; \ |
168 | vpxor x1, x7, x7; \ | 178 | vpxor x1, x7, x7; \ |
169 | \ | 179 | \ |
180 | vpsrldq $6, t0, t6; \ | ||
181 | vpshufb t7, t5, t5; \ | ||
182 | vpshufb t7, t6, t6; \ | ||
183 | \ | ||
170 | vpxor x7, x0, x0; \ | 184 | vpxor x7, x0, x0; \ |
171 | vpxor x4, x1, x1; \ | 185 | vpxor x4, x1, x1; \ |
172 | vpxor x5, x2, x2; \ | 186 | vpxor x5, x2, x2; \ |
@@ -179,12 +193,16 @@ | |||
179 | \ | 193 | \ |
180 | /* Add key material and result to CD (x becomes new CD) */ \ | 194 | /* Add key material and result to CD (x becomes new CD) */ \ |
181 | \ | 195 | \ |
182 | vpxor t7, x0, x0; \ | ||
183 | vpxor 4 * 32(mem_cd), x0, x0; \ | ||
184 | \ | ||
185 | vpxor t6, x1, x1; \ | 196 | vpxor t6, x1, x1; \ |
186 | vpxor 5 * 32(mem_cd), x1, x1; \ | 197 | vpxor 5 * 32(mem_cd), x1, x1; \ |
187 | \ | 198 | \ |
199 | vpsrldq $7, t0, t6; \ | ||
200 | vpshufb t7, t0, t0; \ | ||
201 | vpshufb t7, t6, t7; \ | ||
202 | \ | ||
203 | vpxor t7, x0, x0; \ | ||
204 | vpxor 4 * 32(mem_cd), x0, x0; \ | ||
205 | \ | ||
188 | vpxor t5, x2, x2; \ | 206 | vpxor t5, x2, x2; \ |
189 | vpxor 6 * 32(mem_cd), x2, x2; \ | 207 | vpxor 6 * 32(mem_cd), x2, x2; \ |
190 | \ | 208 | \ |
@@ -204,7 +222,7 @@ | |||
204 | vpxor 3 * 32(mem_cd), x7, x7; | 222 | vpxor 3 * 32(mem_cd), x7, x7; |
205 | 223 | ||
206 | /* | 224 | /* |
207 | * Size optimization... with inlined roundsm16 binary would be over 5 times | 225 | * Size optimization... with inlined roundsm32 binary would be over 5 times |
208 | * larger and would only marginally faster. | 226 | * larger and would only marginally faster. |
209 | */ | 227 | */ |
210 | .align 8 | 228 | .align 8 |
@@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
324 | */ \ | 342 | */ \ |
325 | vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ | 343 | vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ |
326 | vpxor tt0, tt0, tt0; \ | 344 | vpxor tt0, tt0, tt0; \ |
327 | vpbroadcastb t0##_x, t3; \ | 345 | vpshufb tt0, t0, t3; \ |
328 | vpsrldq $1, t0, t0; \ | 346 | vpsrldq $1, t0, t0; \ |
329 | vpbroadcastb t0##_x, t2; \ | 347 | vpshufb tt0, t0, t2; \ |
330 | vpsrldq $1, t0, t0; \ | 348 | vpsrldq $1, t0, t0; \ |
331 | vpbroadcastb t0##_x, t1; \ | 349 | vpshufb tt0, t0, t1; \ |
332 | vpsrldq $1, t0, t0; \ | 350 | vpsrldq $1, t0, t0; \ |
333 | vpbroadcastb t0##_x, t0; \ | 351 | vpshufb tt0, t0, t0; \ |
334 | \ | 352 | \ |
335 | vpand l0, t0, t0; \ | 353 | vpand l0, t0, t0; \ |
336 | vpand l1, t1, t1; \ | 354 | vpand l1, t1, t1; \ |
@@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
340 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | 358 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ |
341 | \ | 359 | \ |
342 | vpxor l4, t0, l4; \ | 360 | vpxor l4, t0, l4; \ |
361 | vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ | ||
343 | vmovdqu l4, 4 * 32(l); \ | 362 | vmovdqu l4, 4 * 32(l); \ |
344 | vpxor l5, t1, l5; \ | 363 | vpxor l5, t1, l5; \ |
345 | vmovdqu l5, 5 * 32(l); \ | 364 | vmovdqu l5, 5 * 32(l); \ |
@@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
354 | * rl ^= t2; \ | 373 | * rl ^= t2; \ |
355 | */ \ | 374 | */ \ |
356 | \ | 375 | \ |
357 | vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ | 376 | vpshufb tt0, t0, t3; \ |
358 | vpbroadcastb t0##_x, t3; \ | ||
359 | vpsrldq $1, t0, t0; \ | 377 | vpsrldq $1, t0, t0; \ |
360 | vpbroadcastb t0##_x, t2; \ | 378 | vpshufb tt0, t0, t2; \ |
361 | vpsrldq $1, t0, t0; \ | 379 | vpsrldq $1, t0, t0; \ |
362 | vpbroadcastb t0##_x, t1; \ | 380 | vpshufb tt0, t0, t1; \ |
363 | vpsrldq $1, t0, t0; \ | 381 | vpsrldq $1, t0, t0; \ |
364 | vpbroadcastb t0##_x, t0; \ | 382 | vpshufb tt0, t0, t0; \ |
365 | \ | 383 | \ |
366 | vpor 4 * 32(r), t0, t0; \ | 384 | vpor 4 * 32(r), t0, t0; \ |
367 | vpor 5 * 32(r), t1, t1; \ | 385 | vpor 5 * 32(r), t1, t1; \ |
@@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
373 | vpxor 2 * 32(r), t2, t2; \ | 391 | vpxor 2 * 32(r), t2, t2; \ |
374 | vpxor 3 * 32(r), t3, t3; \ | 392 | vpxor 3 * 32(r), t3, t3; \ |
375 | vmovdqu t0, 0 * 32(r); \ | 393 | vmovdqu t0, 0 * 32(r); \ |
394 | vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ | ||
376 | vmovdqu t1, 1 * 32(r); \ | 395 | vmovdqu t1, 1 * 32(r); \ |
377 | vmovdqu t2, 2 * 32(r); \ | 396 | vmovdqu t2, 2 * 32(r); \ |
378 | vmovdqu t3, 3 * 32(r); \ | 397 | vmovdqu t3, 3 * 32(r); \ |
@@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
382 | * t2 &= rl; \ | 401 | * t2 &= rl; \ |
383 | * rr ^= rol32(t2, 1); \ | 402 | * rr ^= rol32(t2, 1); \ |
384 | */ \ | 403 | */ \ |
385 | vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ | 404 | vpshufb tt0, t0, t3; \ |
386 | vpbroadcastb t0##_x, t3; \ | ||
387 | vpsrldq $1, t0, t0; \ | 405 | vpsrldq $1, t0, t0; \ |
388 | vpbroadcastb t0##_x, t2; \ | 406 | vpshufb tt0, t0, t2; \ |
389 | vpsrldq $1, t0, t0; \ | 407 | vpsrldq $1, t0, t0; \ |
390 | vpbroadcastb t0##_x, t1; \ | 408 | vpshufb tt0, t0, t1; \ |
391 | vpsrldq $1, t0, t0; \ | 409 | vpsrldq $1, t0, t0; \ |
392 | vpbroadcastb t0##_x, t0; \ | 410 | vpshufb tt0, t0, t0; \ |
393 | \ | 411 | \ |
394 | vpand 0 * 32(r), t0, t0; \ | 412 | vpand 0 * 32(r), t0, t0; \ |
395 | vpand 1 * 32(r), t1, t1; \ | 413 | vpand 1 * 32(r), t1, t1; \ |
@@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
403 | vpxor 6 * 32(r), t2, t2; \ | 421 | vpxor 6 * 32(r), t2, t2; \ |
404 | vpxor 7 * 32(r), t3, t3; \ | 422 | vpxor 7 * 32(r), t3, t3; \ |
405 | vmovdqu t0, 4 * 32(r); \ | 423 | vmovdqu t0, 4 * 32(r); \ |
424 | vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ | ||
406 | vmovdqu t1, 5 * 32(r); \ | 425 | vmovdqu t1, 5 * 32(r); \ |
407 | vmovdqu t2, 6 * 32(r); \ | 426 | vmovdqu t2, 6 * 32(r); \ |
408 | vmovdqu t3, 7 * 32(r); \ | 427 | vmovdqu t3, 7 * 32(r); \ |
@@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
413 | * ll ^= t0; \ | 432 | * ll ^= t0; \ |
414 | */ \ | 433 | */ \ |
415 | \ | 434 | \ |
416 | vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ | 435 | vpshufb tt0, t0, t3; \ |
417 | vpbroadcastb t0##_x, t3; \ | ||
418 | vpsrldq $1, t0, t0; \ | 436 | vpsrldq $1, t0, t0; \ |
419 | vpbroadcastb t0##_x, t2; \ | 437 | vpshufb tt0, t0, t2; \ |
420 | vpsrldq $1, t0, t0; \ | 438 | vpsrldq $1, t0, t0; \ |
421 | vpbroadcastb t0##_x, t1; \ | 439 | vpshufb tt0, t0, t1; \ |
422 | vpsrldq $1, t0, t0; \ | 440 | vpsrldq $1, t0, t0; \ |
423 | vpbroadcastb t0##_x, t0; \ | 441 | vpshufb tt0, t0, t0; \ |
424 | \ | 442 | \ |
425 | vpor l4, t0, t0; \ | 443 | vpor l4, t0, t0; \ |
426 | vpor l5, t1, t1; \ | 444 | vpor l5, t1, t1; \ |
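The camellia hunk above replaces each per-byte vpbroadcastb with a vpshufb whose index register (tt0) is all zeroes, which copies byte 0 of every 128-bit lane to all bytes of that lane; since t0 is filled by vpbroadcastd, both lanes hold the same key word, so the result is unchanged. The hunk also hoists each vpbroadcastd up between the preceding stores. A minimal C intrinsics model of the byte-broadcast idiom, built with -mavx2 (an editor's sketch, not code from the patch):

	#include <immintrin.h>
	#include <stdio.h>

	/*
	 * Model of the byte-broadcast idiom in the hunk above: vpshufb with
	 * an all-zero index register (tt0) copies byte 0 of each 128-bit
	 * lane to every byte of that lane. t0 was filled by vpbroadcastd,
	 * so both lanes hold the same key word and the per-lane broadcast
	 * matches what vpbroadcastb t0##_x produced.
	 */
	int main(void)
	{
		__m256i zero = _mm256_setzero_si256();		/* tt0 */
		__m256i t0 = _mm256_set1_epi32(0x44332211);	/* vpbroadcastd krr */
		unsigned char out[32];

		__m256i t3 = _mm256_shuffle_epi8(t0, zero);	/* broadcast byte 0 */
		t0 = _mm256_srli_si256(t0, 1);			/* vpsrldq $1 */
		__m256i t2 = _mm256_shuffle_epi8(t0, zero);	/* broadcast byte 1 */

		_mm256_storeu_si256((__m256i *)out, t3);
		printf("t3: %02x .. %02x\n", out[0], out[31]);	/* 11 .. 11 */
		_mm256_storeu_si256((__m256i *)out, t2);
		printf("t2: %02x .. %02x\n", out[0], out[31]);	/* 22 .. 22 */
		return 0;
	}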
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S new file mode 100644 index 000000000000..35e97569d05f --- /dev/null +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S | |||
@@ -0,0 +1,643 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions | ||
3 | # | ||
4 | # Copyright (c) 2013, Intel Corporation | ||
5 | # | ||
6 | # Authors: | ||
7 | # Erdinc Ozturk <erdinc.ozturk@intel.com> | ||
8 | # Vinodh Gopal <vinodh.gopal@intel.com> | ||
9 | # James Guilford <james.guilford@intel.com> | ||
10 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | # | ||
12 | # This software is available to you under a choice of one of two | ||
13 | # licenses. You may choose to be licensed under the terms of the GNU | ||
14 | # General Public License (GPL) Version 2, available from the file | ||
15 | # COPYING in the main directory of this source tree, or the | ||
16 | # OpenIB.org BSD license below: | ||
17 | # | ||
18 | # Redistribution and use in source and binary forms, with or without | ||
19 | # modification, are permitted provided that the following conditions are | ||
20 | # met: | ||
21 | # | ||
22 | # * Redistributions of source code must retain the above copyright | ||
23 | # notice, this list of conditions and the following disclaimer. | ||
24 | # | ||
25 | # * Redistributions in binary form must reproduce the above copyright | ||
26 | # notice, this list of conditions and the following disclaimer in the | ||
27 | # documentation and/or other materials provided with the | ||
28 | # distribution. | ||
29 | # | ||
30 | # * Neither the name of the Intel Corporation nor the names of its | ||
31 | # contributors may be used to endorse or promote products derived from | ||
32 | # this software without specific prior written permission. | ||
33 | # | ||
34 | # | ||
35 | # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ||
36 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
37 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
38 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR | ||
39 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
40 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
41 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
42 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
43 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
44 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
45 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
46 | ######################################################################## | ||
47 | # Function API: | ||
48 | # UINT16 crc_t10dif_pcl( | ||
49 | # UINT16 init_crc, //initial CRC value, 16 bits | ||
50 | # const unsigned char *buf, //buffer pointer to calculate CRC on | ||
51 | # UINT64 len //buffer length in bytes (64-bit data) | ||
52 | # ); | ||
53 | # | ||
54 | # Reference paper titled "Fast CRC Computation for Generic | ||
55 | # Polynomials Using PCLMULQDQ Instruction" | ||
56 | # URL: http://www.intel.com/content/dam/www/public/us/en/documents | ||
57 | # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf | ||
58 | # | ||
59 | # | ||
60 | |||
61 | #include <linux/linkage.h> | ||
62 | |||
63 | .text | ||
64 | |||
65 | #define arg1 %rdi | ||
66 | #define arg2 %rsi | ||
67 | #define arg3 %rdx | ||
68 | |||
69 | #define arg1_low32 %edi | ||
70 | |||
71 | ENTRY(crc_t10dif_pcl) | ||
72 | .align 16 | ||
73 | |||
74 | # adjust the 16-bit initial_crc value, scale it to 32 bits | ||
75 | shl $16, arg1_low32 | ||
76 | |||
77 | # Allocate Stack Space | ||
78 | mov %rsp, %rcx | ||
79 | sub $16*2, %rsp | ||
80 | # align stack to 16 byte boundary | ||
81 | and $~(0x10 - 1), %rsp | ||
82 | |||
83 | # check if smaller than 256 | ||
84 | cmp $256, arg3 | ||
85 | |||
86 | # for sizes less than 256, we can't fold 128B at a time... ||
87 | jl _less_than_128 | ||
88 | |||
89 | |||
90 | # load the initial crc value | ||
91 | movd arg1_low32, %xmm10 # initial crc | ||
92 | |||
93 | # the crc value does not need to be byte-reflected, but it does ||
94 | # need to be moved to the high part of the register, because the ||
95 | # data will be byte-reflected and will then align with the ||
96 | # initial crc in the correct place. ||
97 | pslldq $12, %xmm10 | ||
98 | |||
99 | movdqa SHUF_MASK(%rip), %xmm11 | ||
100 | # receive the initial 128B of data, xor the initial crc value ||
101 | movdqu 16*0(arg2), %xmm0 | ||
102 | movdqu 16*1(arg2), %xmm1 | ||
103 | movdqu 16*2(arg2), %xmm2 | ||
104 | movdqu 16*3(arg2), %xmm3 | ||
105 | movdqu 16*4(arg2), %xmm4 | ||
106 | movdqu 16*5(arg2), %xmm5 | ||
107 | movdqu 16*6(arg2), %xmm6 | ||
108 | movdqu 16*7(arg2), %xmm7 | ||
109 | |||
110 | pshufb %xmm11, %xmm0 | ||
111 | # XOR the initial_crc value | ||
112 | pxor %xmm10, %xmm0 | ||
113 | pshufb %xmm11, %xmm1 | ||
114 | pshufb %xmm11, %xmm2 | ||
115 | pshufb %xmm11, %xmm3 | ||
116 | pshufb %xmm11, %xmm4 | ||
117 | pshufb %xmm11, %xmm5 | ||
118 | pshufb %xmm11, %xmm6 | ||
119 | pshufb %xmm11, %xmm7 | ||
120 | |||
121 | movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 | ||
122 | #imm value of pclmulqdq instruction | ||
123 | #will determine which constant to use | ||
124 | |||
125 | ################################################################# | ||
126 | # we subtract 256 instead of 128 to save one instruction from the loop | ||
127 | sub $256, arg3 | ||
128 | |||
129 | # at this point there are 128*x+y (0<=y<128) bytes of buffer. The ||
130 | # _fold_64_B_loop will fold 128B at a time ||
131 | # until we have 128+y bytes of buffer ||
132 | |||
133 | |||
134 | # fold 128B at a time. This section of the code folds 8 xmm ||
135 | # registers in parallel | ||
136 | _fold_64_B_loop: | ||
137 | |||
138 | # update the buffer pointer | ||
139 | add $128, arg2 # buf += 128 ||
140 | |||
141 | movdqu 16*0(arg2), %xmm9 | ||
142 | movdqu 16*1(arg2), %xmm12 | ||
143 | pshufb %xmm11, %xmm9 | ||
144 | pshufb %xmm11, %xmm12 | ||
145 | movdqa %xmm0, %xmm8 | ||
146 | movdqa %xmm1, %xmm13 | ||
147 | pclmulqdq $0x0 , %xmm10, %xmm0 | ||
148 | pclmulqdq $0x11, %xmm10, %xmm8 | ||
149 | pclmulqdq $0x0 , %xmm10, %xmm1 | ||
150 | pclmulqdq $0x11, %xmm10, %xmm13 | ||
151 | pxor %xmm9 , %xmm0 | ||
152 | xorps %xmm8 , %xmm0 | ||
153 | pxor %xmm12, %xmm1 | ||
154 | xorps %xmm13, %xmm1 | ||
155 | |||
156 | movdqu 16*2(arg2), %xmm9 | ||
157 | movdqu 16*3(arg2), %xmm12 | ||
158 | pshufb %xmm11, %xmm9 | ||
159 | pshufb %xmm11, %xmm12 | ||
160 | movdqa %xmm2, %xmm8 | ||
161 | movdqa %xmm3, %xmm13 | ||
162 | pclmulqdq $0x0, %xmm10, %xmm2 | ||
163 | pclmulqdq $0x11, %xmm10, %xmm8 | ||
164 | pclmulqdq $0x0, %xmm10, %xmm3 | ||
165 | pclmulqdq $0x11, %xmm10, %xmm13 | ||
166 | pxor %xmm9 , %xmm2 | ||
167 | xorps %xmm8 , %xmm2 | ||
168 | pxor %xmm12, %xmm3 | ||
169 | xorps %xmm13, %xmm3 | ||
170 | |||
171 | movdqu 16*4(arg2), %xmm9 | ||
172 | movdqu 16*5(arg2), %xmm12 | ||
173 | pshufb %xmm11, %xmm9 | ||
174 | pshufb %xmm11, %xmm12 | ||
175 | movdqa %xmm4, %xmm8 | ||
176 | movdqa %xmm5, %xmm13 | ||
177 | pclmulqdq $0x0, %xmm10, %xmm4 | ||
178 | pclmulqdq $0x11, %xmm10, %xmm8 | ||
179 | pclmulqdq $0x0, %xmm10, %xmm5 | ||
180 | pclmulqdq $0x11, %xmm10, %xmm13 | ||
181 | pxor %xmm9 , %xmm4 | ||
182 | xorps %xmm8 , %xmm4 | ||
183 | pxor %xmm12, %xmm5 | ||
184 | xorps %xmm13, %xmm5 | ||
185 | |||
186 | movdqu 16*6(arg2), %xmm9 | ||
187 | movdqu 16*7(arg2), %xmm12 | ||
188 | pshufb %xmm11, %xmm9 | ||
189 | pshufb %xmm11, %xmm12 | ||
190 | movdqa %xmm6 , %xmm8 | ||
191 | movdqa %xmm7 , %xmm13 | ||
192 | pclmulqdq $0x0 , %xmm10, %xmm6 | ||
193 | pclmulqdq $0x11, %xmm10, %xmm8 | ||
194 | pclmulqdq $0x0 , %xmm10, %xmm7 | ||
195 | pclmulqdq $0x11, %xmm10, %xmm13 | ||
196 | pxor %xmm9 , %xmm6 | ||
197 | xorps %xmm8 , %xmm6 | ||
198 | pxor %xmm12, %xmm7 | ||
199 | xorps %xmm13, %xmm7 | ||
200 | |||
201 | sub $128, arg3 | ||
202 | |||
203 | # check if there is another 64B in the buffer to be able to fold | ||
204 | jge _fold_64_B_loop | ||
205 | ################################################################## | ||
206 | |||
207 | |||
208 | add $128, arg2 | ||
209 | # at this point, the buffer pointer is pointing at the last y bytes ||
210 | # of the buffer; the 128B of folded data is in 8 of the xmm ||
211 | # registers: xmm0 through xmm7 ||
212 | |||
213 | |||
214 | # fold the 8 xmm registers to 1 xmm register with different constants | ||
215 | |||
216 | movdqa rk9(%rip), %xmm10 | ||
217 | movdqa %xmm0, %xmm8 | ||
218 | pclmulqdq $0x11, %xmm10, %xmm0 | ||
219 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
220 | pxor %xmm8, %xmm7 | ||
221 | xorps %xmm0, %xmm7 | ||
222 | |||
223 | movdqa rk11(%rip), %xmm10 | ||
224 | movdqa %xmm1, %xmm8 | ||
225 | pclmulqdq $0x11, %xmm10, %xmm1 | ||
226 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
227 | pxor %xmm8, %xmm7 | ||
228 | xorps %xmm1, %xmm7 | ||
229 | |||
230 | movdqa rk13(%rip), %xmm10 | ||
231 | movdqa %xmm2, %xmm8 | ||
232 | pclmulqdq $0x11, %xmm10, %xmm2 | ||
233 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
234 | pxor %xmm8, %xmm7 | ||
235 | pxor %xmm2, %xmm7 | ||
236 | |||
237 | movdqa rk15(%rip), %xmm10 | ||
238 | movdqa %xmm3, %xmm8 | ||
239 | pclmulqdq $0x11, %xmm10, %xmm3 | ||
240 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
241 | pxor %xmm8, %xmm7 | ||
242 | xorps %xmm3, %xmm7 | ||
243 | |||
244 | movdqa rk17(%rip), %xmm10 | ||
245 | movdqa %xmm4, %xmm8 | ||
246 | pclmulqdq $0x11, %xmm10, %xmm4 | ||
247 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
248 | pxor %xmm8, %xmm7 | ||
249 | pxor %xmm4, %xmm7 | ||
250 | |||
251 | movdqa rk19(%rip), %xmm10 | ||
252 | movdqa %xmm5, %xmm8 | ||
253 | pclmulqdq $0x11, %xmm10, %xmm5 | ||
254 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
255 | pxor %xmm8, %xmm7 | ||
256 | xorps %xmm5, %xmm7 | ||
257 | |||
258 | movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 | ||
259 | #imm value of pclmulqdq instruction | ||
260 | #will determine which constant to use | ||
261 | movdqa %xmm6, %xmm8 | ||
262 | pclmulqdq $0x11, %xmm10, %xmm6 | ||
263 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
264 | pxor %xmm8, %xmm7 | ||
265 | pxor %xmm6, %xmm7 | ||
266 | |||
267 | |||
268 | # instead of adding 128, we add 128-16 to the loop counter to save ||
269 | # one instruction from the loop; instead of a cmp instruction, we ||
270 | # use the sign flag with the jl instruction ||
271 | add $128-16, arg3 | ||
272 | jl _final_reduction_for_128 | ||
273 | |||
274 | # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 | ||
275 | # and the rest is in memory. We can fold 16 bytes at a time if y>=16 | ||
276 | # continue folding 16B at a time | ||
277 | |||
278 | _16B_reduction_loop: | ||
279 | movdqa %xmm7, %xmm8 | ||
280 | pclmulqdq $0x11, %xmm10, %xmm7 | ||
281 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
282 | pxor %xmm8, %xmm7 | ||
283 | movdqu (arg2), %xmm0 | ||
284 | pshufb %xmm11, %xmm0 | ||
285 | pxor %xmm0 , %xmm7 | ||
286 | add $16, arg2 | ||
287 | sub $16, arg3 | ||
288 | # instead of a cmp instruction, we utilize the flags from the sub ||
289 | # above with the jge instruction (equivalent of: cmp $16-16, arg3) ||
290 | # check if there are any more 16B in the buffer to be able to fold ||
291 | jge _16B_reduction_loop | ||
292 | |||
293 | # now we have 16+z bytes left to reduce, where 0 <= z < 16. ||
294 | # first, we reduce the data in the xmm7 register ||
295 | |||
296 | |||
297 | _final_reduction_for_128: | ||
298 | # check if any more data to fold. If not, compute the CRC of | ||
299 | # the final 128 bits | ||
300 | add $16, arg3 | ||
301 | je _128_done | ||
302 | |||
303 | # here we have fewer than 16 bytes of data left. ||
304 | # since we know that there was data before the pointer, we can ||
305 | # offset the input pointer back before the current point, to load ||
306 | # exactly 16 bytes. after that the registers need to be adjusted. ||
307 | _get_last_two_xmms: | ||
308 | movdqa %xmm7, %xmm2 | ||
309 | |||
310 | movdqu -16(arg2, arg3), %xmm1 | ||
311 | pshufb %xmm11, %xmm1 | ||
312 | |||
313 | # get rid of the extra data that was loaded before | ||
314 | # load the shift constant | ||
315 | lea pshufb_shf_table+16(%rip), %rax | ||
316 | sub arg3, %rax | ||
317 | movdqu (%rax), %xmm0 | ||
318 | |||
319 | # shift xmm2 to the left by arg3 bytes | ||
320 | pshufb %xmm0, %xmm2 | ||
321 | |||
322 | # shift xmm7 to the right by 16-arg3 bytes | ||
323 | pxor mask1(%rip), %xmm0 | ||
324 | pshufb %xmm0, %xmm7 | ||
325 | pblendvb %xmm2, %xmm1 #xmm0 is implicit | ||
326 | |||
327 | # fold 16 Bytes | ||
328 | movdqa %xmm1, %xmm2 | ||
329 | movdqa %xmm7, %xmm8 | ||
330 | pclmulqdq $0x11, %xmm10, %xmm7 | ||
331 | pclmulqdq $0x0 , %xmm10, %xmm8 | ||
332 | pxor %xmm8, %xmm7 | ||
333 | pxor %xmm2, %xmm7 | ||
334 | |||
335 | _128_done: | ||
336 | # compute crc of a 128-bit value | ||
337 | movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 | ||
338 | movdqa %xmm7, %xmm0 | ||
339 | |||
340 | #64b fold | ||
341 | pclmulqdq $0x1, %xmm10, %xmm7 | ||
342 | pslldq $8 , %xmm0 | ||
343 | pxor %xmm0, %xmm7 | ||
344 | |||
345 | #32b fold | ||
346 | movdqa %xmm7, %xmm0 | ||
347 | |||
348 | pand mask2(%rip), %xmm0 | ||
349 | |||
350 | psrldq $12, %xmm7 | ||
351 | pclmulqdq $0x10, %xmm10, %xmm7 | ||
352 | pxor %xmm0, %xmm7 | ||
353 | |||
354 | #barrett reduction | ||
355 | _barrett: | ||
356 | movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 | ||
357 | movdqa %xmm7, %xmm0 | ||
358 | pclmulqdq $0x01, %xmm10, %xmm7 | ||
359 | pslldq $4, %xmm7 | ||
360 | pclmulqdq $0x11, %xmm10, %xmm7 | ||
361 | |||
362 | pslldq $4, %xmm7 | ||
363 | pxor %xmm0, %xmm7 | ||
364 | pextrd $1, %xmm7, %eax | ||
365 | |||
366 | _cleanup: | ||
367 | # scale the result back to 16 bits | ||
368 | shr $16, %eax | ||
369 | mov %rcx, %rsp | ||
370 | ret | ||
371 | |||
372 | ######################################################################## | ||
373 | |||
374 | .align 16 | ||
375 | _less_than_128: | ||
376 | |||
377 | # check if there is enough buffer to be able to fold 16B at a time | ||
378 | cmp $32, arg3 | ||
379 | jl _less_than_32 | ||
380 | movdqa SHUF_MASK(%rip), %xmm11 | ||
381 | |||
382 | # there is enough; load the fold constants ||
383 | movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 | ||
384 | |||
385 | movd arg1_low32, %xmm0 # get the initial crc value | ||
386 | pslldq $12, %xmm0 # align it to its correct place | ||
387 | movdqu (arg2), %xmm7 # load the plaintext | ||
388 | pshufb %xmm11, %xmm7 # byte-reflect the plaintext | ||
389 | pxor %xmm0, %xmm7 | ||
390 | |||
391 | |||
392 | # update the buffer pointer | ||
393 | add $16, arg2 | ||
394 | |||
395 | # update the counter. subtract 32 instead of 16 to save one | ||
396 | # instruction from the loop | ||
397 | sub $32, arg3 | ||
398 | |||
399 | jmp _16B_reduction_loop | ||
400 | |||
401 | |||
402 | .align 16 | ||
403 | _less_than_32: | ||
404 | # mov initial crc to the return value. this is necessary for | ||
405 | # zero-length buffers. | ||
406 | mov arg1_low32, %eax | ||
407 | test arg3, arg3 | ||
408 | je _cleanup | ||
409 | |||
410 | movdqa SHUF_MASK(%rip), %xmm11 | ||
411 | |||
412 | movd arg1_low32, %xmm0 # get the initial crc value | ||
413 | pslldq $12, %xmm0 # align it to its correct place | ||
414 | |||
415 | cmp $16, arg3 | ||
416 | je _exact_16_left | ||
417 | jl _less_than_16_left | ||
418 | |||
419 | movdqu (arg2), %xmm7 # load the plaintext | ||
420 | pshufb %xmm11, %xmm7 # byte-reflect the plaintext | ||
421 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
422 | add $16, arg2 | ||
423 | sub $16, arg3 | ||
424 | movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 | ||
425 | jmp _get_last_two_xmms | ||
426 | |||
427 | |||
428 | .align 16 | ||
429 | _less_than_16_left: | ||
430 | # use stack space to load data less than 16 bytes, zero-out | ||
431 | # the 16B in memory first. | ||
432 | |||
433 | pxor %xmm1, %xmm1 | ||
434 | mov %rsp, %r11 | ||
435 | movdqa %xmm1, (%r11) | ||
436 | |||
437 | cmp $4, arg3 | ||
438 | jl _only_less_than_4 | ||
439 | |||
440 | # backup the counter value | ||
441 | mov arg3, %r9 | ||
442 | cmp $8, arg3 | ||
443 | jl _less_than_8_left | ||
444 | |||
445 | # load 8 Bytes | ||
446 | mov (arg2), %rax | ||
447 | mov %rax, (%r11) | ||
448 | add $8, %r11 | ||
449 | sub $8, arg3 | ||
450 | add $8, arg2 | ||
451 | _less_than_8_left: | ||
452 | |||
453 | cmp $4, arg3 | ||
454 | jl _less_than_4_left | ||
455 | |||
456 | # load 4 Bytes | ||
457 | mov (arg2), %eax | ||
458 | mov %eax, (%r11) | ||
459 | add $4, %r11 | ||
460 | sub $4, arg3 | ||
461 | add $4, arg2 | ||
462 | _less_than_4_left: | ||
463 | |||
464 | cmp $2, arg3 | ||
465 | jl _less_than_2_left | ||
466 | |||
467 | # load 2 Bytes | ||
468 | mov (arg2), %ax | ||
469 | mov %ax, (%r11) | ||
470 | add $2, %r11 | ||
471 | sub $2, arg3 | ||
472 | add $2, arg2 | ||
473 | _less_than_2_left: | ||
474 | cmp $1, arg3 | ||
475 | jl _zero_left | ||
476 | |||
477 | # load 1 Byte | ||
478 | mov (arg2), %al | ||
479 | mov %al, (%r11) | ||
480 | _zero_left: | ||
481 | movdqa (%rsp), %xmm7 | ||
482 | pshufb %xmm11, %xmm7 | ||
483 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
484 | |||
485 | # shl r9, 4 | ||
486 | lea pshufb_shf_table+16(%rip), %rax | ||
487 | sub %r9, %rax | ||
488 | movdqu (%rax), %xmm0 | ||
489 | pxor mask1(%rip), %xmm0 | ||
490 | |||
491 | pshufb %xmm0, %xmm7 | ||
492 | jmp _128_done | ||
493 | |||
494 | .align 16 | ||
495 | _exact_16_left: | ||
496 | movdqu (arg2), %xmm7 | ||
497 | pshufb %xmm11, %xmm7 | ||
498 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
499 | |||
500 | jmp _128_done | ||
501 | |||
502 | _only_less_than_4: | ||
503 | cmp $3, arg3 | ||
504 | jl _only_less_than_3 | ||
505 | |||
506 | # load 3 Bytes | ||
507 | mov (arg2), %al | ||
508 | mov %al, (%r11) | ||
509 | |||
510 | mov 1(arg2), %al | ||
511 | mov %al, 1(%r11) | ||
512 | |||
513 | mov 2(arg2), %al | ||
514 | mov %al, 2(%r11) | ||
515 | |||
516 | movdqa (%rsp), %xmm7 | ||
517 | pshufb %xmm11, %xmm7 | ||
518 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
519 | |||
520 | psrldq $5, %xmm7 | ||
521 | |||
522 | jmp _barrett | ||
523 | _only_less_than_3: | ||
524 | cmp $2, arg3 | ||
525 | jl _only_less_than_2 | ||
526 | |||
527 | # load 2 Bytes | ||
528 | mov (arg2), %al | ||
529 | mov %al, (%r11) | ||
530 | |||
531 | mov 1(arg2), %al | ||
532 | mov %al, 1(%r11) | ||
533 | |||
534 | movdqa (%rsp), %xmm7 | ||
535 | pshufb %xmm11, %xmm7 | ||
536 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
537 | |||
538 | psrldq $6, %xmm7 | ||
539 | |||
540 | jmp _barrett | ||
541 | _only_less_than_2: | ||
542 | |||
543 | # load 1 Byte | ||
544 | mov (arg2), %al | ||
545 | mov %al, (%r11) | ||
546 | |||
547 | movdqa (%rsp), %xmm7 | ||
548 | pshufb %xmm11, %xmm7 | ||
549 | pxor %xmm0 , %xmm7 # xor the initial crc value | ||
550 | |||
551 | psrldq $7, %xmm7 | ||
552 | |||
553 | jmp _barrett | ||
554 | |||
555 | ENDPROC(crc_t10dif_pcl) | ||
556 | |||
557 | .data | ||
558 | |||
559 | # precomputed constants | ||
560 | # these constants are precomputed from the poly: | ||
561 | # 0x8bb70000 (0x8bb7 scaled to 32 bits) | ||
562 | .align 16 | ||
563 | # Q = 0x18BB70000 | ||
564 | # rk1 = 2^(32*3) mod Q << 32 | ||
565 | # rk2 = 2^(32*5) mod Q << 32 | ||
566 | # rk3 = 2^(32*15) mod Q << 32 | ||
567 | # rk4 = 2^(32*17) mod Q << 32 | ||
568 | # rk5 = 2^(32*3) mod Q << 32 | ||
569 | # rk6 = 2^(32*2) mod Q << 32 | ||
570 | # rk7 = floor(2^64/Q) | ||
571 | # rk8 = Q | ||
572 | rk1: | ||
573 | .quad 0x2d56000000000000 | ||
574 | rk2: | ||
575 | .quad 0x06df000000000000 | ||
576 | rk3: | ||
577 | .quad 0x9d9d000000000000 | ||
578 | rk4: | ||
579 | .quad 0x7cf5000000000000 | ||
580 | rk5: | ||
581 | .quad 0x2d56000000000000 | ||
582 | rk6: | ||
583 | .quad 0x1368000000000000 | ||
584 | rk7: | ||
585 | .quad 0x00000001f65a57f8 | ||
586 | rk8: | ||
587 | .quad 0x000000018bb70000 | ||
588 | |||
589 | rk9: | ||
590 | .quad 0xceae000000000000 | ||
591 | rk10: | ||
592 | .quad 0xbfd6000000000000 | ||
593 | rk11: | ||
594 | .quad 0x1e16000000000000 | ||
595 | rk12: | ||
596 | .quad 0x713c000000000000 | ||
597 | rk13: | ||
598 | .quad 0xf7f9000000000000 | ||
599 | rk14: | ||
600 | .quad 0x80a6000000000000 | ||
601 | rk15: | ||
602 | .quad 0x044c000000000000 | ||
603 | rk16: | ||
604 | .quad 0xe658000000000000 | ||
605 | rk17: | ||
606 | .quad 0xad18000000000000 | ||
607 | rk18: | ||
608 | .quad 0xa497000000000000 | ||
609 | rk19: | ||
610 | .quad 0x6ee3000000000000 | ||
611 | rk20: | ||
612 | .quad 0xe7b5000000000000 | ||
613 | |||
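The fold constants rk1-rk6 and rk9-rk20 all follow the recipe documented in the comment block above. A small generator (an editor's sketch, which assumes the 16-bit remainder is scaled into the upper half of its dword the same way the polynomial is scaled to 0x8bb70000, hence the combined << 48) can reproduce them for review; rk7, floor(2^64/Q), needs a carry-less division and is not covered:

	#include <stdint.h>
	#include <stdio.h>

	/* x^n mod Q over GF(2), with Q = 0x18BB7 (degree 16) */
	static uint32_t xpow_mod_q(unsigned int n)
	{
		uint32_t r = 1;			/* the polynomial x^0 */

		while (n--) {
			r <<= 1;		/* multiply by x */
			if (r & 0x10000)	/* degree hit 16: reduce by Q */
				r ^= 0x18BB7;
		}
		return r;
	}

	int main(void)
	{
		/* compare against rk1 and rk2 in the table above */
		printf("rk1 = %#018llx\n",
		       (unsigned long long)xpow_mod_q(32 * 3) << 48);
		printf("rk2 = %#018llx\n",
		       (unsigned long long)xpow_mod_q(32 * 5) << 48);
		return 0;
	}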
614 | |||
615 | |||
616 | mask1: | ||
617 | .octa 0x80808080808080808080808080808080 | ||
618 | mask2: | ||
619 | .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF | ||
620 | |||
621 | SHUF_MASK: | ||
622 | .octa 0x000102030405060708090A0B0C0D0E0F | ||
623 | |||
624 | pshufb_shf_table: | ||
625 | # use these values for shift constants for the pshufb instruction | ||
626 | # different alignments result in values as shown: | ||
627 | # DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 | ||
628 | # DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2 ||
629 | # DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3 ||
630 | # DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 | ||
631 | # DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 | ||
632 | # DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 | ||
633 | # DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 | ||
634 | # DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 | ||
635 | # DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 | ||
636 | # DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 | ||
637 | # DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 | ||
638 | # DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 | ||
639 | # DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 | ||
640 | # DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 | ||
641 | # DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 | ||
642 | .octa 0x8f8e8d8c8b8a89888786858483828100 | ||
643 | .octa 0x000e0d0c0b0a09080706050403020100 | ||
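For cross-checking the file above end to end, a bit-serial reference of the same CRC (polynomial 0x8bb7, MSB-first, zero initial value, no reflection and no final xor) makes a convenient oracle; this sketch is the editor's, not part of the patch:

	#include <stdint.h>
	#include <stddef.h>

	/* bit-serial CRC-T10DIF: one byte in, eight shift/reduce steps */
	static uint16_t crc_t10dif_ref(uint16_t crc, const unsigned char *buf,
				       size_t len)
	{
		while (len--) {
			int i;

			crc ^= (uint16_t)(*buf++) << 8;
			for (i = 0; i < 8; i++)
				crc = (crc & 0x8000) ?
					(uint16_t)(crc << 1) ^ 0x8bb7 :
					(uint16_t)(crc << 1);
		}
		return crc;
	}

Comparing crc_t10dif_ref() against crc_t10dif_pcl() over buffers of assorted lengths exercises every size-dependent tail path above, from _less_than_128 down to _only_less_than_2.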
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c new file mode 100644 index 000000000000..7845d7fd54c0 --- /dev/null +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * Cryptographic API. | ||
3 | * | ||
4 | * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions | ||
5 | * | ||
6 | * Copyright (C) 2013 Intel Corporation | ||
7 | * Author: Tim Chen <tim.c.chen@linux.intel.com> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify it | ||
10 | * under the terms of the GNU General Public License as published by the Free | ||
11 | * Software Foundation; either version 2 of the License, or (at your option) | ||
12 | * any later version. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
15 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
16 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
17 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
18 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
19 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
20 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
21 | * SOFTWARE. | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #include <linux/types.h> | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/crc-t10dif.h> | ||
28 | #include <crypto/internal/hash.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/string.h> | ||
31 | #include <linux/kernel.h> | ||
32 | #include <asm/i387.h> | ||
33 | #include <asm/cpufeature.h> | ||
34 | #include <asm/cpu_device_id.h> | ||
35 | |||
36 | asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, | ||
37 | size_t len); | ||
38 | |||
39 | struct chksum_desc_ctx { | ||
40 | __u16 crc; | ||
41 | }; | ||
42 | |||
43 | /* | ||
44 | * Steps through buffer one byte at at time, calculates reflected | ||
45 | * crc using table. | ||
46 | */ | ||
47 | |||
48 | static int chksum_init(struct shash_desc *desc) | ||
49 | { | ||
50 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
51 | |||
52 | ctx->crc = 0; | ||
53 | |||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static int chksum_update(struct shash_desc *desc, const u8 *data, | ||
58 | unsigned int length) | ||
59 | { | ||
60 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
61 | |||
62 | if (irq_fpu_usable()) { | ||
63 | kernel_fpu_begin(); | ||
64 | ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); | ||
65 | kernel_fpu_end(); | ||
66 | } else | ||
67 | ctx->crc = crc_t10dif_generic(ctx->crc, data, length); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static int chksum_final(struct shash_desc *desc, u8 *out) | ||
72 | { | ||
73 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
74 | |||
75 | *(__u16 *)out = ctx->crc; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, | ||
80 | u8 *out) | ||
81 | { | ||
82 | if (irq_fpu_usable()) { | ||
83 | kernel_fpu_begin(); | ||
84 | *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); | ||
85 | kernel_fpu_end(); | ||
86 | } else | ||
87 | *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | static int chksum_finup(struct shash_desc *desc, const u8 *data, | ||
92 | unsigned int len, u8 *out) | ||
93 | { | ||
94 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
95 | |||
96 | return __chksum_finup(&ctx->crc, data, len, out); | ||
97 | } | ||
98 | |||
99 | static int chksum_digest(struct shash_desc *desc, const u8 *data, | ||
100 | unsigned int length, u8 *out) | ||
101 | { | ||
102 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
103 | |||
104 | return __chksum_finup(&ctx->crc, data, length, out); | ||
105 | } | ||
106 | |||
107 | static struct shash_alg alg = { | ||
108 | .digestsize = CRC_T10DIF_DIGEST_SIZE, | ||
109 | .init = chksum_init, | ||
110 | .update = chksum_update, | ||
111 | .final = chksum_final, | ||
112 | .finup = chksum_finup, | ||
113 | .digest = chksum_digest, | ||
114 | .descsize = sizeof(struct chksum_desc_ctx), | ||
115 | .base = { | ||
116 | .cra_name = "crct10dif", | ||
117 | .cra_driver_name = "crct10dif-pclmul", | ||
118 | .cra_priority = 200, | ||
119 | .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, | ||
120 | .cra_module = THIS_MODULE, | ||
121 | } | ||
122 | }; | ||
123 | |||
124 | static const struct x86_cpu_id crct10dif_cpu_id[] = { | ||
125 | X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), | ||
126 | {} | ||
127 | }; | ||
128 | MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); | ||
129 | |||
130 | static int __init crct10dif_intel_mod_init(void) | ||
131 | { | ||
132 | if (!x86_match_cpu(crct10dif_cpu_id)) | ||
133 | return -ENODEV; | ||
134 | |||
135 | return crypto_register_shash(&alg); | ||
136 | } | ||
137 | |||
138 | static void __exit crct10dif_intel_mod_fini(void) | ||
139 | { | ||
140 | crypto_unregister_shash(&alg); | ||
141 | } | ||
142 | |||
143 | module_init(crct10dif_intel_mod_init); | ||
144 | module_exit(crct10dif_intel_mod_fini); | ||
145 | |||
146 | MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); | ||
147 | MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); | ||
148 | MODULE_LICENSE("GPL"); | ||
149 | |||
150 | MODULE_ALIAS("crct10dif"); | ||
151 | MODULE_ALIAS("crct10dif-pclmul"); | ||
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 597d4da69656..50226c4b86ed 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c | |||
@@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in) | |||
187 | return 0; | 187 | return 0; |
188 | } | 188 | } |
189 | 189 | ||
190 | static struct shash_alg alg = { | 190 | static int sha224_ssse3_init(struct shash_desc *desc) |
191 | { | ||
192 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
193 | |||
194 | sctx->state[0] = SHA224_H0; | ||
195 | sctx->state[1] = SHA224_H1; | ||
196 | sctx->state[2] = SHA224_H2; | ||
197 | sctx->state[3] = SHA224_H3; | ||
198 | sctx->state[4] = SHA224_H4; | ||
199 | sctx->state[5] = SHA224_H5; | ||
200 | sctx->state[6] = SHA224_H6; | ||
201 | sctx->state[7] = SHA224_H7; | ||
202 | sctx->count = 0; | ||
203 | |||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) | ||
208 | { | ||
209 | u8 D[SHA256_DIGEST_SIZE]; | ||
210 | |||
211 | sha256_ssse3_final(desc, D); | ||
212 | |||
213 | memcpy(hash, D, SHA224_DIGEST_SIZE); | ||
214 | memset(D, 0, SHA256_DIGEST_SIZE); | ||
215 | |||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | static struct shash_alg algs[] = { { | ||
191 | .digestsize = SHA256_DIGEST_SIZE, | 220 | .digestsize = SHA256_DIGEST_SIZE, |
192 | .init = sha256_ssse3_init, | 221 | .init = sha256_ssse3_init, |
193 | .update = sha256_ssse3_update, | 222 | .update = sha256_ssse3_update, |
@@ -204,7 +233,24 @@ static struct shash_alg alg = { | |||
204 | .cra_blocksize = SHA256_BLOCK_SIZE, | 233 | .cra_blocksize = SHA256_BLOCK_SIZE, |
205 | .cra_module = THIS_MODULE, | 234 | .cra_module = THIS_MODULE, |
206 | } | 235 | } |
207 | }; | 236 | }, { |
237 | .digestsize = SHA224_DIGEST_SIZE, | ||
238 | .init = sha224_ssse3_init, | ||
239 | .update = sha256_ssse3_update, | ||
240 | .final = sha224_ssse3_final, | ||
241 | .export = sha256_ssse3_export, | ||
242 | .import = sha256_ssse3_import, | ||
243 | .descsize = sizeof(struct sha256_state), | ||
244 | .statesize = sizeof(struct sha256_state), | ||
245 | .base = { | ||
246 | .cra_name = "sha224", | ||
247 | .cra_driver_name = "sha224-ssse3", | ||
248 | .cra_priority = 150, | ||
249 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
250 | .cra_blocksize = SHA224_BLOCK_SIZE, | ||
251 | .cra_module = THIS_MODULE, | ||
252 | } | ||
253 | } }; | ||
208 | 254 | ||
209 | #ifdef CONFIG_AS_AVX | 255 | #ifdef CONFIG_AS_AVX |
210 | static bool __init avx_usable(void) | 256 | static bool __init avx_usable(void) |
@@ -227,7 +273,7 @@ static bool __init avx_usable(void) | |||
227 | 273 | ||
228 | static int __init sha256_ssse3_mod_init(void) | 274 | static int __init sha256_ssse3_mod_init(void) |
229 | { | 275 | { |
230 | /* test for SSE3 first */ | 276 | /* test for SSSE3 first */ |
231 | if (cpu_has_ssse3) | 277 | if (cpu_has_ssse3) |
232 | sha256_transform_asm = sha256_transform_ssse3; | 278 | sha256_transform_asm = sha256_transform_ssse3; |
233 | 279 | ||
@@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void) | |||
254 | else | 300 | else |
255 | #endif | 301 | #endif |
256 | pr_info("Using SSSE3 optimized SHA-256 implementation\n"); | 302 | pr_info("Using SSSE3 optimized SHA-256 implementation\n"); |
257 | return crypto_register_shash(&alg); | 303 | return crypto_register_shashes(algs, ARRAY_SIZE(algs)); |
258 | } | 304 | } |
259 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | 305 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); |
260 | 306 | ||
@@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void) | |||
263 | 309 | ||
264 | static void __exit sha256_ssse3_mod_fini(void) | 310 | static void __exit sha256_ssse3_mod_fini(void) |
265 | { | 311 | { |
266 | crypto_unregister_shash(&alg); | 312 | crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); |
267 | } | 313 | } |
268 | 314 | ||
269 | module_init(sha256_ssse3_mod_init); | 315 | module_init(sha256_ssse3_mod_init); |
@@ -273,3 +319,4 @@ MODULE_LICENSE("GPL"); | |||
273 | MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | 319 | MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); |
274 | 320 | ||
275 | MODULE_ALIAS("sha256"); | 321 | MODULE_ALIAS("sha256"); |
322 | MODULE_ALIAS("sha384"); | ||
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 6cbd8df348d2..f30cd10293f0 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c | |||
@@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in) | |||
194 | return 0; | 194 | return 0; |
195 | } | 195 | } |
196 | 196 | ||
197 | static struct shash_alg alg = { | 197 | static int sha384_ssse3_init(struct shash_desc *desc) |
198 | { | ||
199 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
200 | |||
201 | sctx->state[0] = SHA384_H0; | ||
202 | sctx->state[1] = SHA384_H1; | ||
203 | sctx->state[2] = SHA384_H2; | ||
204 | sctx->state[3] = SHA384_H3; | ||
205 | sctx->state[4] = SHA384_H4; | ||
206 | sctx->state[5] = SHA384_H5; | ||
207 | sctx->state[6] = SHA384_H6; | ||
208 | sctx->state[7] = SHA384_H7; | ||
209 | |||
210 | sctx->count[0] = sctx->count[1] = 0; | ||
211 | |||
212 | return 0; | ||
213 | } | ||
214 | |||
215 | static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) | ||
216 | { | ||
217 | u8 D[SHA512_DIGEST_SIZE]; | ||
218 | |||
219 | sha512_ssse3_final(desc, D); | ||
220 | |||
221 | memcpy(hash, D, SHA384_DIGEST_SIZE); | ||
222 | memset(D, 0, SHA512_DIGEST_SIZE); | ||
223 | |||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | static struct shash_alg algs[] = { { | ||
198 | .digestsize = SHA512_DIGEST_SIZE, | 228 | .digestsize = SHA512_DIGEST_SIZE, |
199 | .init = sha512_ssse3_init, | 229 | .init = sha512_ssse3_init, |
200 | .update = sha512_ssse3_update, | 230 | .update = sha512_ssse3_update, |
@@ -211,7 +241,24 @@ static struct shash_alg alg = { | |||
211 | .cra_blocksize = SHA512_BLOCK_SIZE, | 241 | .cra_blocksize = SHA512_BLOCK_SIZE, |
212 | .cra_module = THIS_MODULE, | 242 | .cra_module = THIS_MODULE, |
213 | } | 243 | } |
214 | }; | 244 | }, { |
245 | .digestsize = SHA384_DIGEST_SIZE, | ||
246 | .init = sha384_ssse3_init, | ||
247 | .update = sha512_ssse3_update, | ||
248 | .final = sha384_ssse3_final, | ||
249 | .export = sha512_ssse3_export, | ||
250 | .import = sha512_ssse3_import, | ||
251 | .descsize = sizeof(struct sha512_state), | ||
252 | .statesize = sizeof(struct sha512_state), | ||
253 | .base = { | ||
254 | .cra_name = "sha384", | ||
255 | .cra_driver_name = "sha384-ssse3", | ||
256 | .cra_priority = 150, | ||
257 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
258 | .cra_blocksize = SHA384_BLOCK_SIZE, | ||
259 | .cra_module = THIS_MODULE, | ||
260 | } | ||
261 | } }; | ||
215 | 262 | ||
216 | #ifdef CONFIG_AS_AVX | 263 | #ifdef CONFIG_AS_AVX |
217 | static bool __init avx_usable(void) | 264 | static bool __init avx_usable(void) |
@@ -234,7 +281,7 @@ static bool __init avx_usable(void) | |||
234 | 281 | ||
235 | static int __init sha512_ssse3_mod_init(void) | 282 | static int __init sha512_ssse3_mod_init(void) |
236 | { | 283 | { |
237 | /* test for SSE3 first */ | 284 | /* test for SSSE3 first */ |
238 | if (cpu_has_ssse3) | 285 | if (cpu_has_ssse3) |
239 | sha512_transform_asm = sha512_transform_ssse3; | 286 | sha512_transform_asm = sha512_transform_ssse3; |
240 | 287 | ||
@@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void) | |||
261 | else | 308 | else |
262 | #endif | 309 | #endif |
263 | pr_info("Using SSSE3 optimized SHA-512 implementation\n"); | 310 | pr_info("Using SSSE3 optimized SHA-512 implementation\n"); |
264 | return crypto_register_shash(&alg); | 311 | return crypto_register_shashes(algs, ARRAY_SIZE(algs)); |
265 | } | 312 | } |
266 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | 313 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); |
267 | 314 | ||
@@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void) | |||
270 | 317 | ||
271 | static void __exit sha512_ssse3_mod_fini(void) | 318 | static void __exit sha512_ssse3_mod_fini(void) |
272 | { | 319 | { |
273 | crypto_unregister_shash(&alg); | 320 | crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); |
274 | } | 321 | } |
275 | 322 | ||
276 | module_init(sha512_ssse3_mod_init); | 323 | module_init(sha512_ssse3_mod_init); |
@@ -280,3 +327,4 @@ MODULE_LICENSE("GPL"); | |||
280 | MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | 327 | MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); |
281 | 328 | ||
282 | MODULE_ALIAS("sha512"); | 329 | MODULE_ALIAS("sha512"); |
330 | MODULE_ALIAS("sha384"); | ||
diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S deleted file mode 100644 index e1a83b9cd389..000000000000 --- a/arch/x86/crypto/twofish-avx2-asm_64.S +++ /dev/null | |||
@@ -1,600 +0,0 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | #include "glue_helper-asm-avx2.S" | ||
15 | |||
16 | .file "twofish-avx2-asm_64.S" | ||
17 | |||
18 | .data | ||
19 | .align 16 | ||
20 | |||
21 | .Lvpshufb_mask0: | ||
22 | .long 0x80808000 | ||
23 | .long 0x80808004 | ||
24 | .long 0x80808008 | ||
25 | .long 0x8080800c | ||
26 | |||
27 | .Lbswap128_mask: | ||
28 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
29 | .Lxts_gf128mul_and_shl1_mask_0: | ||
30 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
31 | .Lxts_gf128mul_and_shl1_mask_1: | ||
32 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
33 | |||
34 | .text | ||
35 | |||
36 | /* structure of crypto context */ | ||
37 | #define s0 0 | ||
38 | #define s1 1024 | ||
39 | #define s2 2048 | ||
40 | #define s3 3072 | ||
41 | #define w 4096 | ||
42 | #define k 4128 | ||
43 | |||
44 | /* register macros */ | ||
45 | #define CTX %rdi | ||
46 | |||
47 | #define RS0 CTX | ||
48 | #define RS1 %r8 | ||
49 | #define RS2 %r9 | ||
50 | #define RS3 %r10 | ||
51 | #define RK %r11 | ||
52 | #define RW %rax | ||
53 | #define RROUND %r12 | ||
54 | #define RROUNDd %r12d | ||
55 | |||
56 | #define RA0 %ymm8 | ||
57 | #define RB0 %ymm9 | ||
58 | #define RC0 %ymm10 | ||
59 | #define RD0 %ymm11 | ||
60 | #define RA1 %ymm12 | ||
61 | #define RB1 %ymm13 | ||
62 | #define RC1 %ymm14 | ||
63 | #define RD1 %ymm15 | ||
64 | |||
65 | /* temp regs */ | ||
66 | #define RX0 %ymm0 | ||
67 | #define RY0 %ymm1 | ||
68 | #define RX1 %ymm2 | ||
69 | #define RY1 %ymm3 | ||
70 | #define RT0 %ymm4 | ||
71 | #define RIDX %ymm5 | ||
72 | |||
73 | #define RX0x %xmm0 | ||
74 | #define RY0x %xmm1 | ||
75 | #define RX1x %xmm2 | ||
76 | #define RY1x %xmm3 | ||
77 | #define RT0x %xmm4 | ||
78 | |||
79 | /* vpgatherdd mask and '-1' */ | ||
80 | #define RNOT %ymm6 | ||
81 | |||
82 | /* byte mask, (-1 >> 24) */ | ||
83 | #define RBYTE %ymm7 | ||
84 | |||
85 | /********************************************************************** | ||
86 | 16-way AVX2 twofish | ||
87 | **********************************************************************/ | ||
88 | #define init_round_constants() \ | ||
89 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
90 | vpsrld $24, RNOT, RBYTE; \ | ||
91 | leaq k(CTX), RK; \ | ||
92 | leaq w(CTX), RW; \ | ||
93 | leaq s1(CTX), RS1; \ | ||
94 | leaq s2(CTX), RS2; \ | ||
95 | leaq s3(CTX), RS3; \ | ||
96 | |||
97 | #define g16(ab, rs0, rs1, rs2, rs3, xy) \ | ||
98 | vpand RBYTE, ab ## 0, RIDX; \ | ||
99 | vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ | ||
100 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
101 | \ | ||
102 | vpand RBYTE, ab ## 1, RIDX; \ | ||
103 | vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ | ||
104 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
105 | \ | ||
106 | vpsrld $8, ab ## 0, RIDX; \ | ||
107 | vpand RBYTE, RIDX, RIDX; \ | ||
108 | vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ | ||
109 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
110 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
111 | \ | ||
112 | vpsrld $8, ab ## 1, RIDX; \ | ||
113 | vpand RBYTE, RIDX, RIDX; \ | ||
114 | vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ | ||
115 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
116 | vpxor RT0, xy ## 1, xy ## 1; \ | ||
117 | \ | ||
118 | vpsrld $16, ab ## 0, RIDX; \ | ||
119 | vpand RBYTE, RIDX, RIDX; \ | ||
120 | vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ | ||
121 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
122 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
123 | \ | ||
124 | vpsrld $16, ab ## 1, RIDX; \ | ||
125 | vpand RBYTE, RIDX, RIDX; \ | ||
126 | vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ | ||
127 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
128 | vpxor RT0, xy ## 1, xy ## 1; \ | ||
129 | \ | ||
130 | vpsrld $24, ab ## 0, RIDX; \ | ||
131 | vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ | ||
132 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
133 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
134 | \ | ||
135 | vpsrld $24, ab ## 1, RIDX; \ | ||
136 | vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ | ||
137 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
138 | vpxor RT0, xy ## 1, xy ## 1; | ||
139 | |||
140 | #define g1_16(a, x) \ | ||
141 | g16(a, RS0, RS1, RS2, RS3, x); | ||
142 | |||
143 | #define g2_16(b, y) \ | ||
144 | g16(b, RS1, RS2, RS3, RS0, y); | ||
145 | |||
146 | #define encrypt_round_end16(a, b, c, d, nk) \ | ||
147 | vpaddd RY0, RX0, RX0; \ | ||
148 | vpaddd RX0, RY0, RY0; \ | ||
149 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
150 | vpaddd RT0, RX0, RX0; \ | ||
151 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
152 | vpaddd RT0, RY0, RY0; \ | ||
153 | \ | ||
154 | vpxor RY0, d ## 0, d ## 0; \ | ||
155 | \ | ||
156 | vpxor RX0, c ## 0, c ## 0; \ | ||
157 | vpsrld $1, c ## 0, RT0; \ | ||
158 | vpslld $31, c ## 0, c ## 0; \ | ||
159 | vpor RT0, c ## 0, c ## 0; \ | ||
160 | \ | ||
161 | vpaddd RY1, RX1, RX1; \ | ||
162 | vpaddd RX1, RY1, RY1; \ | ||
163 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
164 | vpaddd RT0, RX1, RX1; \ | ||
165 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
166 | vpaddd RT0, RY1, RY1; \ | ||
167 | \ | ||
168 | vpxor RY1, d ## 1, d ## 1; \ | ||
169 | \ | ||
170 | vpxor RX1, c ## 1, c ## 1; \ | ||
171 | vpsrld $1, c ## 1, RT0; \ | ||
172 | vpslld $31, c ## 1, c ## 1; \ | ||
173 | vpor RT0, c ## 1, c ## 1; \ | ||
174 | |||
175 | #define encrypt_round16(a, b, c, d, nk) \ | ||
176 | g2_16(b, RY); \ | ||
177 | \ | ||
178 | vpslld $1, b ## 0, RT0; \ | ||
179 | vpsrld $31, b ## 0, b ## 0; \ | ||
180 | vpor RT0, b ## 0, b ## 0; \ | ||
181 | \ | ||
182 | vpslld $1, b ## 1, RT0; \ | ||
183 | vpsrld $31, b ## 1, b ## 1; \ | ||
184 | vpor RT0, b ## 1, b ## 1; \ | ||
185 | \ | ||
186 | g1_16(a, RX); \ | ||
187 | \ | ||
188 | encrypt_round_end16(a, b, c, d, nk); | ||
189 | |||
190 | #define encrypt_round_first16(a, b, c, d, nk) \ | ||
191 | vpslld $1, d ## 0, RT0; \ | ||
192 | vpsrld $31, d ## 0, d ## 0; \ | ||
193 | vpor RT0, d ## 0, d ## 0; \ | ||
194 | \ | ||
195 | vpslld $1, d ## 1, RT0; \ | ||
196 | vpsrld $31, d ## 1, d ## 1; \ | ||
197 | vpor RT0, d ## 1, d ## 1; \ | ||
198 | \ | ||
199 | encrypt_round16(a, b, c, d, nk); | ||
200 | |||
201 | #define encrypt_round_last16(a, b, c, d, nk) \ | ||
202 | g2_16(b, RY); \ | ||
203 | \ | ||
204 | g1_16(a, RX); \ | ||
205 | \ | ||
206 | encrypt_round_end16(a, b, c, d, nk); | ||
207 | |||
208 | #define decrypt_round_end16(a, b, c, d, nk) \ | ||
209 | vpaddd RY0, RX0, RX0; \ | ||
210 | vpaddd RX0, RY0, RY0; \ | ||
211 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
212 | vpaddd RT0, RX0, RX0; \ | ||
213 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
214 | vpaddd RT0, RY0, RY0; \ | ||
215 | \ | ||
216 | vpxor RX0, c ## 0, c ## 0; \ | ||
217 | \ | ||
218 | vpxor RY0, d ## 0, d ## 0; \ | ||
219 | vpsrld $1, d ## 0, RT0; \ | ||
220 | vpslld $31, d ## 0, d ## 0; \ | ||
221 | vpor RT0, d ## 0, d ## 0; \ | ||
222 | \ | ||
223 | vpaddd RY1, RX1, RX1; \ | ||
224 | vpaddd RX1, RY1, RY1; \ | ||
225 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
226 | vpaddd RT0, RX1, RX1; \ | ||
227 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
228 | vpaddd RT0, RY1, RY1; \ | ||
229 | \ | ||
230 | vpxor RX1, c ## 1, c ## 1; \ | ||
231 | \ | ||
232 | vpxor RY1, d ## 1, d ## 1; \ | ||
233 | vpsrld $1, d ## 1, RT0; \ | ||
234 | vpslld $31, d ## 1, d ## 1; \ | ||
235 | vpor RT0, d ## 1, d ## 1; | ||
236 | |||
237 | #define decrypt_round16(a, b, c, d, nk) \ | ||
238 | g1_16(a, RX); \ | ||
239 | \ | ||
240 | vpslld $1, a ## 0, RT0; \ | ||
241 | vpsrld $31, a ## 0, a ## 0; \ | ||
242 | vpor RT0, a ## 0, a ## 0; \ | ||
243 | \ | ||
244 | vpslld $1, a ## 1, RT0; \ | ||
245 | vpsrld $31, a ## 1, a ## 1; \ | ||
246 | vpor RT0, a ## 1, a ## 1; \ | ||
247 | \ | ||
248 | g2_16(b, RY); \ | ||
249 | \ | ||
250 | decrypt_round_end16(a, b, c, d, nk); | ||
251 | |||
252 | #define decrypt_round_first16(a, b, c, d, nk) \ | ||
253 | vpslld $1, c ## 0, RT0; \ | ||
254 | vpsrld $31, c ## 0, c ## 0; \ | ||
255 | vpor RT0, c ## 0, c ## 0; \ | ||
256 | \ | ||
257 | vpslld $1, c ## 1, RT0; \ | ||
258 | vpsrld $31, c ## 1, c ## 1; \ | ||
259 | vpor RT0, c ## 1, c ## 1; \ | ||
260 | \ | ||
261 | decrypt_round16(a, b, c, d, nk) | ||
262 | |||
263 | #define decrypt_round_last16(a, b, c, d, nk) \ | ||
264 | g1_16(a, RX); \ | ||
265 | \ | ||
266 | g2_16(b, RY); \ | ||
267 | \ | ||
268 | decrypt_round_end16(a, b, c, d, nk); | ||
269 | |||
270 | #define encrypt_cycle16() \ | ||
271 | encrypt_round16(RA, RB, RC, RD, 0); \ | ||
272 | encrypt_round16(RC, RD, RA, RB, 8); | ||
273 | |||
274 | #define encrypt_cycle_first16() \ | ||
275 | encrypt_round_first16(RA, RB, RC, RD, 0); \ | ||
276 | encrypt_round16(RC, RD, RA, RB, 8); | ||
277 | |||
278 | #define encrypt_cycle_last16() \ | ||
279 | encrypt_round16(RA, RB, RC, RD, 0); \ | ||
280 | encrypt_round_last16(RC, RD, RA, RB, 8); | ||
281 | |||
282 | #define decrypt_cycle16(n) \ | ||
283 | decrypt_round16(RC, RD, RA, RB, 8); \ | ||
284 | decrypt_round16(RA, RB, RC, RD, 0); | ||
285 | |||
286 | #define decrypt_cycle_first16(n) \ | ||
287 | decrypt_round_first16(RC, RD, RA, RB, 8); \ | ||
288 | decrypt_round16(RA, RB, RC, RD, 0); | ||
289 | |||
290 | #define decrypt_cycle_last16(n) \ | ||
291 | decrypt_round16(RC, RD, RA, RB, 8); \ | ||
292 | decrypt_round_last16(RA, RB, RC, RD, 0); | ||
293 | |||
294 | #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ | ||
295 | vpunpckhdq x1, x0, t2; \ | ||
296 | vpunpckldq x1, x0, x0; \ | ||
297 | \ | ||
298 | vpunpckldq x3, x2, t1; \ | ||
299 | vpunpckhdq x3, x2, x2; \ | ||
300 | \ | ||
301 | vpunpckhqdq t1, x0, x1; \ | ||
302 | vpunpcklqdq t1, x0, x0; \ | ||
303 | \ | ||
304 | vpunpckhqdq x2, t2, x3; \ | ||
305 | vpunpcklqdq x2, t2, x2; | ||
306 | |||
307 | #define read_blocks8(offs,a,b,c,d) \ | ||
308 | transpose_4x4(a, b, c, d, RX0, RY0); | ||
309 | |||
310 | #define write_blocks8(offs,a,b,c,d) \ | ||
311 | transpose_4x4(a, b, c, d, RX0, RY0); | ||
312 | |||
313 | #define inpack_enc8(a,b,c,d) \ | ||
314 | vpbroadcastd 4*0(RW), RT0; \ | ||
315 | vpxor RT0, a, a; \ | ||
316 | \ | ||
317 | vpbroadcastd 4*1(RW), RT0; \ | ||
318 | vpxor RT0, b, b; \ | ||
319 | \ | ||
320 | vpbroadcastd 4*2(RW), RT0; \ | ||
321 | vpxor RT0, c, c; \ | ||
322 | \ | ||
323 | vpbroadcastd 4*3(RW), RT0; \ | ||
324 | vpxor RT0, d, d; | ||
325 | |||
326 | #define outunpack_enc8(a,b,c,d) \ | ||
327 | vpbroadcastd 4*4(RW), RX0; \ | ||
328 | vpbroadcastd 4*5(RW), RY0; \ | ||
329 | vpxor RX0, c, RX0; \ | ||
330 | vpxor RY0, d, RY0; \ | ||
331 | \ | ||
332 | vpbroadcastd 4*6(RW), RT0; \ | ||
333 | vpxor RT0, a, c; \ | ||
334 | vpbroadcastd 4*7(RW), RT0; \ | ||
335 | vpxor RT0, b, d; \ | ||
336 | \ | ||
337 | vmovdqa RX0, a; \ | ||
338 | vmovdqa RY0, b; | ||
339 | |||
340 | #define inpack_dec8(a,b,c,d) \ | ||
341 | vpbroadcastd 4*4(RW), RX0; \ | ||
342 | vpbroadcastd 4*5(RW), RY0; \ | ||
343 | vpxor RX0, a, RX0; \ | ||
344 | vpxor RY0, b, RY0; \ | ||
345 | \ | ||
346 | vpbroadcastd 4*6(RW), RT0; \ | ||
347 | vpxor RT0, c, a; \ | ||
348 | vpbroadcastd 4*7(RW), RT0; \ | ||
349 | vpxor RT0, d, b; \ | ||
350 | \ | ||
351 | vmovdqa RX0, c; \ | ||
352 | vmovdqa RY0, d; | ||
353 | |||
354 | #define outunpack_dec8(a,b,c,d) \ | ||
355 | vpbroadcastd 4*0(RW), RT0; \ | ||
356 | vpxor RT0, a, a; \ | ||
357 | \ | ||
358 | vpbroadcastd 4*1(RW), RT0; \ | ||
359 | vpxor RT0, b, b; \ | ||
360 | \ | ||
361 | vpbroadcastd 4*2(RW), RT0; \ | ||
362 | vpxor RT0, c, c; \ | ||
363 | \ | ||
364 | vpbroadcastd 4*3(RW), RT0; \ | ||
365 | vpxor RT0, d, d; | ||
366 | |||
367 | #define read_blocks16(a,b,c,d) \ | ||
368 | read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
369 | read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
370 | |||
371 | #define write_blocks16(a,b,c,d) \ | ||
372 | write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
373 | write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
374 | |||
375 | #define xor_blocks16(a,b,c,d) \ | ||
376 | xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
377 | xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
378 | |||
379 | #define inpack_enc16(a,b,c,d) \ | ||
380 | inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
381 | inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
382 | |||
383 | #define outunpack_enc16(a,b,c,d) \ | ||
384 | outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
385 | outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
386 | |||
387 | #define inpack_dec16(a,b,c,d) \ | ||
388 | inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
389 | inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
390 | |||
391 | #define outunpack_dec16(a,b,c,d) \ | ||
392 | outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
393 | outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
394 | |||
395 | .align 8 | ||
396 | __twofish_enc_blk16: | ||
397 | /* input: | ||
398 | * %rdi: ctx, CTX | ||
399 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext | ||
400 | * output: | ||
401 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext | ||
402 | */ | ||
403 | init_round_constants(); | ||
404 | |||
405 | read_blocks16(RA, RB, RC, RD); | ||
406 | inpack_enc16(RA, RB, RC, RD); | ||
407 | |||
408 | xorl RROUNDd, RROUNDd; | ||
409 | encrypt_cycle_first16(); | ||
410 | movl $2, RROUNDd; | ||
411 | |||
412 | .align 4 | ||
413 | .L__enc_loop: | ||
414 | encrypt_cycle16(); | ||
415 | |||
416 | addl $2, RROUNDd; | ||
417 | cmpl $14, RROUNDd; | ||
418 | jne .L__enc_loop; | ||
419 | |||
420 | encrypt_cycle_last16(); | ||
421 | |||
422 | outunpack_enc16(RA, RB, RC, RD); | ||
423 | write_blocks16(RA, RB, RC, RD); | ||
424 | |||
425 | ret; | ||
426 | ENDPROC(__twofish_enc_blk16) | ||
427 | |||
428 | .align 8 | ||
429 | __twofish_dec_blk16: | ||
430 | /* input: | ||
431 | * %rdi: ctx, CTX | ||
432 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext | ||
433 | * output: | ||
434 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext | ||
435 | */ | ||
436 | init_round_constants(); | ||
437 | |||
438 | read_blocks16(RA, RB, RC, RD); | ||
439 | inpack_dec16(RA, RB, RC, RD); | ||
440 | |||
441 | movl $14, RROUNDd; | ||
442 | decrypt_cycle_first16(); | ||
443 | movl $12, RROUNDd; | ||
444 | |||
445 | .align 4 | ||
446 | .L__dec_loop: | ||
447 | decrypt_cycle16(); | ||
448 | |||
449 | addl $-2, RROUNDd; | ||
450 | jnz .L__dec_loop; | ||
451 | |||
452 | decrypt_cycle_last16(); | ||
453 | |||
454 | outunpack_dec16(RA, RB, RC, RD); | ||
455 | write_blocks16(RA, RB, RC, RD); | ||
456 | |||
457 | ret; | ||
458 | ENDPROC(__twofish_dec_blk16) | ||
459 | |||
460 | ENTRY(twofish_ecb_enc_16way) | ||
461 | /* input: | ||
462 | * %rdi: ctx, CTX | ||
463 | * %rsi: dst | ||
464 | * %rdx: src | ||
465 | */ | ||
466 | |||
467 | vzeroupper; | ||
468 | pushq %r12; | ||
469 | |||
470 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
471 | |||
472 | call __twofish_enc_blk16; | ||
473 | |||
474 | store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
475 | |||
476 | popq %r12; | ||
477 | vzeroupper; | ||
478 | |||
479 | ret; | ||
480 | ENDPROC(twofish_ecb_enc_16way) | ||
481 | |||
482 | ENTRY(twofish_ecb_dec_16way) | ||
483 | /* input: | ||
484 | * %rdi: ctx, CTX | ||
485 | * %rsi: dst | ||
486 | * %rdx: src | ||
487 | */ | ||
488 | |||
489 | vzeroupper; | ||
490 | pushq %r12; | ||
491 | |||
492 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
493 | |||
494 | call __twofish_dec_blk16; | ||
495 | |||
496 | store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
497 | |||
498 | popq %r12; | ||
499 | vzeroupper; | ||
500 | |||
501 | ret; | ||
502 | ENDPROC(twofish_ecb_dec_16way) | ||
503 | |||
504 | ENTRY(twofish_cbc_dec_16way) | ||
505 | /* input: | ||
506 | * %rdi: ctx, CTX | ||
507 | * %rsi: dst | ||
508 | * %rdx: src | ||
509 | */ | ||
510 | |||
511 | vzeroupper; | ||
512 | pushq %r12; | ||
513 | |||
514 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
515 | |||
516 | call __twofish_dec_blk16; | ||
517 | |||
518 | store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1, | ||
519 | RX0); | ||
520 | |||
521 | popq %r12; | ||
522 | vzeroupper; | ||
523 | |||
524 | ret; | ||
525 | ENDPROC(twofish_cbc_dec_16way) | ||
526 | |||
527 | ENTRY(twofish_ctr_16way) | ||
528 | /* input: | ||
529 | * %rdi: ctx, CTX | ||
530 | * %rsi: dst (16 blocks) | ||
531 | * %rdx: src (16 blocks) | ||
532 | * %rcx: iv (little endian, 128bit) | ||
533 | */ | ||
534 | |||
535 | vzeroupper; | ||
536 | pushq %r12; | ||
537 | |||
538 | load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1, | ||
539 | RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, | ||
540 | RBYTE); | ||
541 | |||
542 | call __twofish_enc_blk16; | ||
543 | |||
544 | store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
545 | |||
546 | popq %r12; | ||
547 | vzeroupper; | ||
548 | |||
549 | ret; | ||
550 | ENDPROC(twofish_ctr_16way) | ||
551 | |||
552 | .align 8 | ||
553 | twofish_xts_crypt_16way: | ||
554 | /* input: | ||
555 | * %rdi: ctx, CTX | ||
556 | * %rsi: dst (16 blocks) | ||
557 | * %rdx: src (16 blocks) | ||
558 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
559 | * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16 | ||
560 | */ | ||
561 | |||
562 | vzeroupper; | ||
563 | pushq %r12; | ||
564 | |||
565 | load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, | ||
566 | RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, | ||
567 | .Lxts_gf128mul_and_shl1_mask_0, | ||
568 | .Lxts_gf128mul_and_shl1_mask_1); | ||
569 | |||
570 | call *%r8; | ||
571 | |||
572 | store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
573 | |||
574 | popq %r12; | ||
575 | vzeroupper; | ||
576 | |||
577 | ret; | ||
578 | ENDPROC(twofish_xts_crypt_16way) | ||
579 | |||
580 | ENTRY(twofish_xts_enc_16way) | ||
581 | /* input: | ||
582 | * %rdi: ctx, CTX | ||
583 | * %rsi: dst (16 blocks) | ||
584 | * %rdx: src (16 blocks) | ||
585 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
586 | */ | ||
587 | leaq __twofish_enc_blk16, %r8; | ||
588 | jmp twofish_xts_crypt_16way; | ||
589 | ENDPROC(twofish_xts_enc_16way) | ||
590 | |||
591 | ENTRY(twofish_xts_dec_16way) | ||
592 | /* input: | ||
593 | * %rdi: ctx, CTX | ||
594 | * %rsi: dst (16 blocks) | ||
595 | * %rdx: src (16 blocks) | ||
596 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
597 | */ | ||
598 | leaq __twofish_dec_blk16, %r8; | ||
599 | jmp twofish_xts_crypt_16way; | ||
600 | ENDPROC(twofish_xts_dec_16way) | ||
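[Editorial note: the three XTS entry points share one body. The ENTRY wrappers load the direction-specific 16-block routine into %r8 and jump, and twofish_xts_crypt_16way dispatches through "call *%r8". The tweak comment (t ⊕ αⁿ) says block n is whitened with the initial tweak multiplied n times by α in GF(2¹²⁸); load_xts_16way derives 16 consecutive tweaks with the same shift-and-conditional-xor the generic kernel code performs in gf128mul_x_ble(). A scalar sketch of one such multiply (struct and function names hypothetical):

	struct tweak128 { u64 lo, hi; };	/* little-endian 128-bit value */

	static void tweak_mul_alpha(struct tweak128 *t)
	{
		u64 carry = t->hi >> 63;	/* bit shifted out of x^127 */

		t->hi = (t->hi << 1) | (t->lo >> 63);
		t->lo = (t->lo << 1) ^ (carry * 0x87);	/* 0x87 = x^7+x^2+x+1 */
	}
]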
diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c deleted file mode 100644 index ce33b5be64ee..000000000000 --- a/arch/x86/crypto/twofish_avx2_glue.c +++ /dev/null | |||
@@ -1,584 +0,0 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/twofish.h> | ||
20 | #include <crypto/lrw.h> | ||
21 | #include <crypto/xts.h> | ||
22 | #include <asm/xcr.h> | ||
23 | #include <asm/xsave.h> | ||
24 | #include <asm/crypto/twofish.h> | ||
25 | #include <asm/crypto/ablk_helper.h> | ||
26 | #include <asm/crypto/glue_helper.h> | ||
27 | #include <crypto/scatterwalk.h> | ||
28 | |||
29 | #define TF_AVX2_PARALLEL_BLOCKS 16 | ||
30 | |||
31 | /* 16-way AVX2 parallel cipher functions */ | ||
32 | asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst, | ||
33 | const u8 *src); | ||
34 | asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst, | ||
35 | const u8 *src); | ||
36 | asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); | ||
37 | |||
38 | asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src, | ||
39 | le128 *iv); | ||
40 | |||
41 | asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst, | ||
44 | const u8 *src, le128 *iv); | ||
45 | |||
46 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | ||
47 | const u8 *src) | ||
48 | { | ||
49 | __twofish_enc_blk_3way(ctx, dst, src, false); | ||
50 | } | ||
51 | |||
52 | static const struct common_glue_ctx twofish_enc = { | ||
53 | .num_funcs = 4, | ||
54 | .fpu_blocks_limit = 8, | ||
55 | |||
56 | .funcs = { { | ||
57 | .num_blocks = 16, | ||
58 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) } | ||
59 | }, { | ||
60 | .num_blocks = 8, | ||
61 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } | ||
62 | }, { | ||
63 | .num_blocks = 3, | ||
64 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } | ||
65 | }, { | ||
66 | .num_blocks = 1, | ||
67 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } | ||
68 | } } | ||
69 | }; | ||
70 | |||
71 | static const struct common_glue_ctx twofish_ctr = { | ||
72 | .num_funcs = 4, | ||
73 | .fpu_blocks_limit = 8, | ||
74 | |||
75 | .funcs = { { | ||
76 | .num_blocks = 16, | ||
77 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) } | ||
78 | }, { | ||
79 | .num_blocks = 8, | ||
80 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } | ||
81 | }, { | ||
82 | .num_blocks = 3, | ||
83 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } | ||
84 | }, { | ||
85 | .num_blocks = 1, | ||
86 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } | ||
87 | } } | ||
88 | }; | ||
89 | |||
90 | static const struct common_glue_ctx twofish_enc_xts = { | ||
91 | .num_funcs = 3, | ||
92 | .fpu_blocks_limit = 8, | ||
93 | |||
94 | .funcs = { { | ||
95 | .num_blocks = 16, | ||
96 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) } | ||
97 | }, { | ||
98 | .num_blocks = 8, | ||
99 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } | ||
100 | }, { | ||
101 | .num_blocks = 1, | ||
102 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } | ||
103 | } } | ||
104 | }; | ||
105 | |||
106 | static const struct common_glue_ctx twofish_dec = { | ||
107 | .num_funcs = 4, | ||
108 | .fpu_blocks_limit = 8, | ||
109 | |||
110 | .funcs = { { | ||
111 | .num_blocks = 16, | ||
112 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) } | ||
113 | }, { | ||
114 | .num_blocks = 8, | ||
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } | ||
116 | }, { | ||
117 | .num_blocks = 3, | ||
118 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } | ||
119 | }, { | ||
120 | .num_blocks = 1, | ||
121 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } | ||
122 | } } | ||
123 | }; | ||
124 | |||
125 | static const struct common_glue_ctx twofish_dec_cbc = { | ||
126 | .num_funcs = 4, | ||
127 | .fpu_blocks_limit = 8, | ||
128 | |||
129 | .funcs = { { | ||
130 | .num_blocks = 16, | ||
131 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) } | ||
132 | }, { | ||
133 | .num_blocks = 8, | ||
134 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } | ||
135 | }, { | ||
136 | .num_blocks = 3, | ||
137 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } | ||
138 | }, { | ||
139 | .num_blocks = 1, | ||
140 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } | ||
141 | } } | ||
142 | }; | ||
143 | |||
144 | static const struct common_glue_ctx twofish_dec_xts = { | ||
145 | .num_funcs = 3, | ||
146 | .fpu_blocks_limit = 8, | ||
147 | |||
148 | .funcs = { { | ||
149 | .num_blocks = 16, | ||
150 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) } | ||
151 | }, { | ||
152 | .num_blocks = 8, | ||
153 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } | ||
154 | }, { | ||
155 | .num_blocks = 1, | ||
156 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } | ||
157 | } } | ||
158 | }; | ||
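[Editorial note: all six tables above are consumed the same way by the glue helpers: walk funcs[] from the widest batch downwards, handing each routine as many blocks as its width allows, so bulk data takes the 16-way path while the tail falls back through the 8-, 3- and finally 1-block routines. A simplified sketch of that ladder for the ECB case (illustrative only; the real glue_ecb_crypt_128bit() also walks scatterlists and manages the FPU region):

	static void ecb_ladder(const struct common_glue_ctx *gctx, void *ctx,
			       u8 *dst, const u8 *src, unsigned int nblocks)
	{
		unsigned int i, w;

		while (nblocks) {
			for (i = 0; i < gctx->num_funcs; i++) {
				w = gctx->funcs[i].num_blocks;
				if (nblocks < w)
					continue;	/* too wide, try narrower */

				gctx->funcs[i].fn_u.ecb(ctx, dst, src);
				dst += w * TF_BLOCK_SIZE;
				src += w * TF_BLOCK_SIZE;
				nblocks -= w;
				break;	/* restart from the widest fit */
			}
		}
	}
]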
159 | |||
160 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); | ||
164 | } | ||
165 | |||
166 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); | ||
170 | } | ||
171 | |||
172 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
173 | struct scatterlist *src, unsigned int nbytes) | ||
174 | { | ||
175 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, | ||
176 | dst, src, nbytes); | ||
177 | } | ||
178 | |||
179 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
180 | struct scatterlist *src, unsigned int nbytes) | ||
181 | { | ||
182 | return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, | ||
183 | nbytes); | ||
184 | } | ||
185 | |||
186 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
187 | struct scatterlist *src, unsigned int nbytes) | ||
188 | { | ||
189 | return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); | ||
190 | } | ||
191 | |||
192 | static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
193 | { | ||
194 | /* the AVX (8-way) routines are reused, so the FPU path starts at 8 parallel blocks */ | ||
195 | return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); | ||
196 | } | ||
197 | |||
198 | static inline void twofish_fpu_end(bool fpu_enabled) | ||
199 | { | ||
200 | glue_fpu_end(fpu_enabled); | ||
201 | } | ||
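[Editorial note: kernel_fpu_begin()/end() is not free; the kernel does not save SIMD state on entry, so each region costs a full state save and restore. Gating it on 8 parallel blocks keeps short requests on the scalar C implementation. A sketch of the policy wrapped by glue_fpu_begin() (simplified; the real helper also honors a per-walk byte limit and the fpu_blocks_limit set above):

	static bool fpu_begin_sketch(bool fpu_enabled, unsigned int nbytes)
	{
		if (fpu_enabled)
			return true;	/* already inside an FPU region */
		if (nbytes < 8 * TF_BLOCK_SIZE)
			return false;	/* short tail: scalar path, no XSAVE */

		kernel_fpu_begin();	/* saves user FPU/SIMD state */
		return true;
	}
]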
202 | |||
203 | struct crypt_priv { | ||
204 | struct twofish_ctx *ctx; | ||
205 | bool fpu_enabled; | ||
206 | }; | ||
207 | |||
208 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
209 | { | ||
210 | const unsigned int bsize = TF_BLOCK_SIZE; | ||
211 | struct crypt_priv *ctx = priv; | ||
212 | int i; | ||
213 | |||
214 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | ||
215 | |||
216 | while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { | ||
217 | twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
218 | srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
219 | nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
220 | } | ||
221 | |||
222 | while (nbytes >= 8 * bsize) { | ||
223 | twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); | ||
224 | srcdst += bsize * 8; | ||
225 | nbytes -= bsize * 8; | ||
226 | } | ||
227 | |||
228 | while (nbytes >= 3 * bsize) { | ||
229 | twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); | ||
230 | srcdst += bsize * 3; | ||
231 | nbytes -= bsize * 3; | ||
232 | } | ||
233 | |||
234 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
235 | twofish_enc_blk(ctx->ctx, srcdst, srcdst); | ||
236 | } | ||
237 | |||
238 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
239 | { | ||
240 | const unsigned int bsize = TF_BLOCK_SIZE; | ||
241 | struct crypt_priv *ctx = priv; | ||
242 | int i; | ||
243 | |||
244 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | ||
245 | |||
246 | while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { | ||
247 | twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
248 | srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
249 | nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
250 | } | ||
251 | |||
252 | while (nbytes >= 8 * bsize) { | ||
253 | twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); | ||
254 | srcdst += bsize * 8; | ||
255 | nbytes -= bsize * 8; | ||
256 | } | ||
257 | |||
258 | while (nbytes >= 3 * bsize) { | ||
259 | twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); | ||
260 | srcdst += bsize * 3; | ||
261 | nbytes -= bsize * 3; | ||
262 | } | ||
263 | |||
264 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
265 | twofish_dec_blk(ctx->ctx, srcdst, srcdst); | ||
266 | } | ||
267 | |||
268 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
269 | struct scatterlist *src, unsigned int nbytes) | ||
270 | { | ||
271 | struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
272 | be128 buf[TF_AVX2_PARALLEL_BLOCKS]; | ||
273 | struct crypt_priv crypt_ctx = { | ||
274 | .ctx = &ctx->twofish_ctx, | ||
275 | .fpu_enabled = false, | ||
276 | }; | ||
277 | struct lrw_crypt_req req = { | ||
278 | .tbuf = buf, | ||
279 | .tbuflen = sizeof(buf), | ||
280 | |||
281 | .table_ctx = &ctx->lrw_table, | ||
282 | .crypt_ctx = &crypt_ctx, | ||
283 | .crypt_fn = encrypt_callback, | ||
284 | }; | ||
285 | int ret; | ||
286 | |||
287 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
288 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
289 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
290 | |||
291 | return ret; | ||
292 | } | ||
293 | |||
294 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
295 | struct scatterlist *src, unsigned int nbytes) | ||
296 | { | ||
297 | struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
298 | be128 buf[TF_AVX2_PARALLEL_BLOCKS]; | ||
299 | struct crypt_priv crypt_ctx = { | ||
300 | .ctx = &ctx->twofish_ctx, | ||
301 | .fpu_enabled = false, | ||
302 | }; | ||
303 | struct lrw_crypt_req req = { | ||
304 | .tbuf = buf, | ||
305 | .tbuflen = sizeof(buf), | ||
306 | |||
307 | .table_ctx = &ctx->lrw_table, | ||
308 | .crypt_ctx = &crypt_ctx, | ||
309 | .crypt_fn = decrypt_callback, | ||
310 | }; | ||
311 | int ret; | ||
312 | |||
313 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
314 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
315 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
316 | |||
317 | return ret; | ||
318 | } | ||
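[Editorial note: the on-stack buf of TF_AVX2_PARALLEL_BLOCKS tweaks is what lets the callbacks above see full 16-block batches, since lrw_crypt() hands crypt_fn at most tbuflen bytes at a time; CRYPTO_TFM_REQ_MAY_SLEEP is cleared because sleeping is not allowed while the FPU region is held. A simplified view of the chunking (illustrative only; the real lrw_crypt() also xors the per-block tweaks in and out around crypt_fn):

	static void lrw_drive_sketch(struct lrw_crypt_req *req, u8 *data,
				     unsigned int nbytes)
	{
		unsigned int chunk;

		while (nbytes) {
			chunk = min_t(unsigned int, nbytes, req->tbuflen);
			req->crypt_fn(req->crypt_ctx, data, chunk); /* <= 16 blocks */
			data += chunk;
			nbytes -= chunk;
		}
	}
]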
319 | |||
320 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
321 | struct scatterlist *src, unsigned int nbytes) | ||
322 | { | ||
323 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
324 | |||
325 | return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, | ||
326 | XTS_TWEAK_CAST(twofish_enc_blk), | ||
327 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
328 | } | ||
329 | |||
330 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
331 | struct scatterlist *src, unsigned int nbytes) | ||
332 | { | ||
333 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
334 | |||
335 | return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, | ||
336 | XTS_TWEAK_CAST(twofish_enc_blk), | ||
337 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
338 | } | ||
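[Editorial note: xts_decrypt deliberately passes XTS_TWEAK_CAST(twofish_enc_blk), not the decryption primitive. In XTS the tweak T₀ = E_K2(IV) is always produced with the encryption direction; only the data blocks switch between encrypt and decrypt. In sketch form (function name hypothetical):

	static void xts_first_tweak_sketch(struct twofish_ctx *tweak_ctx,
					   u8 t0[TF_BLOCK_SIZE], const u8 *iv)
	{
		twofish_enc_blk(tweak_ctx, t0, iv);	/* T0 = E_K2(IV), never D_K2 */
	}
]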
339 | |||
340 | static struct crypto_alg tf_algs[10] = { { | ||
341 | .cra_name = "__ecb-twofish-avx2", | ||
342 | .cra_driver_name = "__driver-ecb-twofish-avx2", | ||
343 | .cra_priority = 0, | ||
344 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
345 | .cra_blocksize = TF_BLOCK_SIZE, | ||
346 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
347 | .cra_alignmask = 0, | ||
348 | .cra_type = &crypto_blkcipher_type, | ||
349 | .cra_module = THIS_MODULE, | ||
350 | .cra_u = { | ||
351 | .blkcipher = { | ||
352 | .min_keysize = TF_MIN_KEY_SIZE, | ||
353 | .max_keysize = TF_MAX_KEY_SIZE, | ||
354 | .setkey = twofish_setkey, | ||
355 | .encrypt = ecb_encrypt, | ||
356 | .decrypt = ecb_decrypt, | ||
357 | }, | ||
358 | }, | ||
359 | }, { | ||
360 | .cra_name = "__cbc-twofish-avx2", | ||
361 | .cra_driver_name = "__driver-cbc-twofish-avx2", | ||
362 | .cra_priority = 0, | ||
363 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
364 | .cra_blocksize = TF_BLOCK_SIZE, | ||
365 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
366 | .cra_alignmask = 0, | ||
367 | .cra_type = &crypto_blkcipher_type, | ||
368 | .cra_module = THIS_MODULE, | ||
369 | .cra_u = { | ||
370 | .blkcipher = { | ||
371 | .min_keysize = TF_MIN_KEY_SIZE, | ||
372 | .max_keysize = TF_MAX_KEY_SIZE, | ||
373 | .setkey = twofish_setkey, | ||
374 | .encrypt = cbc_encrypt, | ||
375 | .decrypt = cbc_decrypt, | ||
376 | }, | ||
377 | }, | ||
378 | }, { | ||
379 | .cra_name = "__ctr-twofish-avx2", | ||
380 | .cra_driver_name = "__driver-ctr-twofish-avx2", | ||
381 | .cra_priority = 0, | ||
382 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
383 | .cra_blocksize = 1, | ||
384 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
385 | .cra_alignmask = 0, | ||
386 | .cra_type = &crypto_blkcipher_type, | ||
387 | .cra_module = THIS_MODULE, | ||
388 | .cra_u = { | ||
389 | .blkcipher = { | ||
390 | .min_keysize = TF_MIN_KEY_SIZE, | ||
391 | .max_keysize = TF_MAX_KEY_SIZE, | ||
392 | .ivsize = TF_BLOCK_SIZE, | ||
393 | .setkey = twofish_setkey, | ||
394 | .encrypt = ctr_crypt, | ||
395 | .decrypt = ctr_crypt, | ||
396 | }, | ||
397 | }, | ||
398 | }, { | ||
399 | .cra_name = "__lrw-twofish-avx2", | ||
400 | .cra_driver_name = "__driver-lrw-twofish-avx2", | ||
401 | .cra_priority = 0, | ||
402 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
403 | .cra_blocksize = TF_BLOCK_SIZE, | ||
404 | .cra_ctxsize = sizeof(struct twofish_lrw_ctx), | ||
405 | .cra_alignmask = 0, | ||
406 | .cra_type = &crypto_blkcipher_type, | ||
407 | .cra_module = THIS_MODULE, | ||
408 | .cra_exit = lrw_twofish_exit_tfm, | ||
409 | .cra_u = { | ||
410 | .blkcipher = { | ||
411 | .min_keysize = TF_MIN_KEY_SIZE + | ||
412 | TF_BLOCK_SIZE, | ||
413 | .max_keysize = TF_MAX_KEY_SIZE + | ||
414 | TF_BLOCK_SIZE, | ||
415 | .ivsize = TF_BLOCK_SIZE, | ||
416 | .setkey = lrw_twofish_setkey, | ||
417 | .encrypt = lrw_encrypt, | ||
418 | .decrypt = lrw_decrypt, | ||
419 | }, | ||
420 | }, | ||
421 | }, { | ||
422 | .cra_name = "__xts-twofish-avx2", | ||
423 | .cra_driver_name = "__driver-xts-twofish-avx2", | ||
424 | .cra_priority = 0, | ||
425 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
426 | .cra_blocksize = TF_BLOCK_SIZE, | ||
427 | .cra_ctxsize = sizeof(struct twofish_xts_ctx), | ||
428 | .cra_alignmask = 0, | ||
429 | .cra_type = &crypto_blkcipher_type, | ||
430 | .cra_module = THIS_MODULE, | ||
431 | .cra_u = { | ||
432 | .blkcipher = { | ||
433 | .min_keysize = TF_MIN_KEY_SIZE * 2, | ||
434 | .max_keysize = TF_MAX_KEY_SIZE * 2, | ||
435 | .ivsize = TF_BLOCK_SIZE, | ||
436 | .setkey = xts_twofish_setkey, | ||
437 | .encrypt = xts_encrypt, | ||
438 | .decrypt = xts_decrypt, | ||
439 | }, | ||
440 | }, | ||
441 | }, { | ||
442 | .cra_name = "ecb(twofish)", | ||
443 | .cra_driver_name = "ecb-twofish-avx2", | ||
444 | .cra_priority = 500, | ||
445 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
446 | .cra_blocksize = TF_BLOCK_SIZE, | ||
447 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
448 | .cra_alignmask = 0, | ||
449 | .cra_type = &crypto_ablkcipher_type, | ||
450 | .cra_module = THIS_MODULE, | ||
451 | .cra_init = ablk_init, | ||
452 | .cra_exit = ablk_exit, | ||
453 | .cra_u = { | ||
454 | .ablkcipher = { | ||
455 | .min_keysize = TF_MIN_KEY_SIZE, | ||
456 | .max_keysize = TF_MAX_KEY_SIZE, | ||
457 | .setkey = ablk_set_key, | ||
458 | .encrypt = ablk_encrypt, | ||
459 | .decrypt = ablk_decrypt, | ||
460 | }, | ||
461 | }, | ||
462 | }, { | ||
463 | .cra_name = "cbc(twofish)", | ||
464 | .cra_driver_name = "cbc-twofish-avx2", | ||
465 | .cra_priority = 500, | ||
466 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
467 | .cra_blocksize = TF_BLOCK_SIZE, | ||
468 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
469 | .cra_alignmask = 0, | ||
470 | .cra_type = &crypto_ablkcipher_type, | ||
471 | .cra_module = THIS_MODULE, | ||
472 | .cra_init = ablk_init, | ||
473 | .cra_exit = ablk_exit, | ||
474 | .cra_u = { | ||
475 | .ablkcipher = { | ||
476 | .min_keysize = TF_MIN_KEY_SIZE, | ||
477 | .max_keysize = TF_MAX_KEY_SIZE, | ||
478 | .ivsize = TF_BLOCK_SIZE, | ||
479 | .setkey = ablk_set_key, | ||
480 | .encrypt = __ablk_encrypt, | ||
481 | .decrypt = ablk_decrypt, | ||
482 | }, | ||
483 | }, | ||
484 | }, { | ||
485 | .cra_name = "ctr(twofish)", | ||
486 | .cra_driver_name = "ctr-twofish-avx2", | ||
487 | .cra_priority = 500, | ||
488 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
489 | .cra_blocksize = 1, | ||
490 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
491 | .cra_alignmask = 0, | ||
492 | .cra_type = &crypto_ablkcipher_type, | ||
493 | .cra_module = THIS_MODULE, | ||
494 | .cra_init = ablk_init, | ||
495 | .cra_exit = ablk_exit, | ||
496 | .cra_u = { | ||
497 | .ablkcipher = { | ||
498 | .min_keysize = TF_MIN_KEY_SIZE, | ||
499 | .max_keysize = TF_MAX_KEY_SIZE, | ||
500 | .ivsize = TF_BLOCK_SIZE, | ||
501 | .setkey = ablk_set_key, | ||
502 | .encrypt = ablk_encrypt, | ||
503 | .decrypt = ablk_encrypt, | ||
504 | .geniv = "chainiv", | ||
505 | }, | ||
506 | }, | ||
507 | }, { | ||
508 | .cra_name = "lrw(twofish)", | ||
509 | .cra_driver_name = "lrw-twofish-avx2", | ||
510 | .cra_priority = 500, | ||
511 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
512 | .cra_blocksize = TF_BLOCK_SIZE, | ||
513 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
514 | .cra_alignmask = 0, | ||
515 | .cra_type = &crypto_ablkcipher_type, | ||
516 | .cra_module = THIS_MODULE, | ||
517 | .cra_init = ablk_init, | ||
518 | .cra_exit = ablk_exit, | ||
519 | .cra_u = { | ||
520 | .ablkcipher = { | ||
521 | .min_keysize = TF_MIN_KEY_SIZE + | ||
522 | TF_BLOCK_SIZE, | ||
523 | .max_keysize = TF_MAX_KEY_SIZE + | ||
524 | TF_BLOCK_SIZE, | ||
525 | .ivsize = TF_BLOCK_SIZE, | ||
526 | .setkey = ablk_set_key, | ||
527 | .encrypt = ablk_encrypt, | ||
528 | .decrypt = ablk_decrypt, | ||
529 | }, | ||
530 | }, | ||
531 | }, { | ||
532 | .cra_name = "xts(twofish)", | ||
533 | .cra_driver_name = "xts-twofish-avx2", | ||
534 | .cra_priority = 500, | ||
535 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
536 | .cra_blocksize = TF_BLOCK_SIZE, | ||
537 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
538 | .cra_alignmask = 0, | ||
539 | .cra_type = &crypto_ablkcipher_type, | ||
540 | .cra_module = THIS_MODULE, | ||
541 | .cra_init = ablk_init, | ||
542 | .cra_exit = ablk_exit, | ||
543 | .cra_u = { | ||
544 | .ablkcipher = { | ||
545 | .min_keysize = TF_MIN_KEY_SIZE * 2, | ||
546 | .max_keysize = TF_MAX_KEY_SIZE * 2, | ||
547 | .ivsize = TF_BLOCK_SIZE, | ||
548 | .setkey = ablk_set_key, | ||
549 | .encrypt = ablk_encrypt, | ||
550 | .decrypt = ablk_decrypt, | ||
551 | }, | ||
552 | }, | ||
553 | } }; | ||
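[Editorial note: the array pairs five internal "__"-prefixed synchronous blkciphers (priority 0, so never chosen by a plain name lookup) with five public async wrappers at priority 500, high enough to beat both the generic C twofish and the AVX driver. A hypothetical caller never names the driver; name-based allocation resolves to the highest-priority registration:

	static int alloc_twofish_xts(void)
	{
		struct crypto_ablkcipher *tfm;

		/* resolves to "xts-twofish-avx2" while this module is loaded */
		tfm = crypto_alloc_ablkcipher("xts(twofish)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		/* ... key length is doubled for XTS (two independent keys) ... */

		crypto_free_ablkcipher(tfm);
		return 0;
	}
]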
554 | |||
555 | static int __init init(void) | ||
556 | { | ||
557 | u64 xcr0; | ||
558 | |||
559 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
560 | pr_info("AVX2 instructions are not detected.\n"); | ||
561 | return -ENODEV; | ||
562 | } | ||
563 | |||
564 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
565 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
566 | pr_info("AVX2 detected but unusable.\n"); | ||
567 | return -ENODEV; | ||
568 | } | ||
569 | |||
570 | return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); | ||
571 | } | ||
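[Editorial note: cpu_has_avx2 only proves the silicon supports AVX2; the OS must additionally have enabled XMM and YMM state saving in XCR0 (which requires OSXSAVE, checked above) before YMM registers may be touched, and the xgetbv() read verifies exactly that. A user-space analogue of the same test (illustrative only; XCR0 bit 1 is SSE/XMM state, bit 2 is AVX/YMM state):

	#include <stdint.h>

	static uint64_t xgetbv0(void)
	{
		uint32_t eax, edx;

		/* read XCR0 (extended control register 0) */
		__asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
		return ((uint64_t)edx << 32) | eax;
	}

	static int ymm_usable(void)
	{
		const uint64_t mask = (1u << 1) | (1u << 2); /* XSTATE_SSE|XSTATE_YMM */

		return (xgetbv0() & mask) == mask;
	}
]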
572 | |||
573 | static void __exit fini(void) | ||
574 | { | ||
575 | crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); | ||
576 | } | ||
577 | |||
578 | module_init(init); | ||
579 | module_exit(fini); | ||
580 | |||
581 | MODULE_LICENSE("GPL"); | ||
582 | MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized"); | ||
583 | MODULE_ALIAS("twofish"); | ||
584 | MODULE_ALIAS("twofish-asm"); | ||
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 2047a562f6b3..a62ba541884e 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c | |||
@@ -50,26 +50,18 @@ | |||
50 | /* 8-way parallel cipher functions */ | 50 | /* 8-way parallel cipher functions */ |
51 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, | 51 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, |
52 | const u8 *src); | 52 | const u8 *src); |
53 | EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way); | ||
54 | |||
55 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 53 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
56 | const u8 *src); | 54 | const u8 *src); |
57 | EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way); | ||
58 | 55 | ||
59 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 56 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
60 | const u8 *src); | 57 | const u8 *src); |
61 | EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way); | ||
62 | |||
63 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, | 58 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, |
64 | const u8 *src, le128 *iv); | 59 | const u8 *src, le128 *iv); |
65 | EXPORT_SYMBOL_GPL(twofish_ctr_8way); | ||
66 | 60 | ||
67 | asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, | 61 | asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, |
68 | const u8 *src, le128 *iv); | 62 | const u8 *src, le128 *iv); |
69 | EXPORT_SYMBOL_GPL(twofish_xts_enc_8way); | ||
70 | asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 63 | asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
71 | const u8 *src, le128 *iv); | 64 | const u8 *src, le128 *iv); |
72 | EXPORT_SYMBOL_GPL(twofish_xts_dec_8way); | ||
73 | 65 | ||
74 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | 66 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, |
75 | const u8 *src) | 67 | const u8 *src) |
@@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | |||
77 | __twofish_enc_blk_3way(ctx, dst, src, false); | 69 | __twofish_enc_blk_3way(ctx, dst, src, false); |
78 | } | 70 | } |
79 | 71 | ||
80 | void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 72 | static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
81 | { | 73 | { |
82 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | 74 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, |
83 | GLUE_FUNC_CAST(twofish_enc_blk)); | 75 | GLUE_FUNC_CAST(twofish_enc_blk)); |
84 | } | 76 | } |
85 | EXPORT_SYMBOL_GPL(twofish_xts_enc); | ||
86 | 77 | ||
87 | void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 78 | static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
88 | { | 79 | { |
89 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | 80 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, |
90 | GLUE_FUNC_CAST(twofish_dec_blk)); | 81 | GLUE_FUNC_CAST(twofish_dec_blk)); |
91 | } | 82 | } |
92 | EXPORT_SYMBOL_GPL(twofish_xts_dec); | ||
93 | 83 | ||
94 | 84 | ||
95 | static const struct common_glue_ctx twofish_enc = { | 85 | static const struct common_glue_ctx twofish_enc = { |