path: root/arch/x86
author		Linus Torvalds <torvalds@linux-foundation.org>	2012-12-15 15:35:19 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-15 15:35:19 -0500
commit		1ed55eac3b1fc30b29cdb52251e0f13b24fc344c (patch)
tree		b7a4c67f2e29f8aa418708c5da871e64c511f3ff /arch/x86
parent		08242bc2210938761230f79c5288dbcf72e94808 (diff)
parent		a2c0911c09190125f52c9941b9d187f601c2f7be (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:

 - Added aesni/avx/x86_64 implementations for camellia.
 - Optimised AVX code for cast5/serpent/twofish/cast6.
 - Fixed vmac bug with unaligned input.
 - Allow compression algorithms in FIPS mode.
 - Optimised crc32c implementation for Intel.
 - Misc fixes.

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (32 commits)
  crypto: caam - Updated SEC-4.0 device tree binding for ERA information.
  crypto: testmgr - remove superfluous initializers for xts(aes)
  crypto: testmgr - allow compression algs in fips mode
  crypto: testmgr - add larger crc32c test vector to test FPU path in crc32c_intel
  crypto: testmgr - clean alg_test_null entries in alg_test_descs[]
  crypto: testmgr - remove fips_allowed flag from camellia-aesni null-tests
  crypto: cast5/cast6 - move lookup tables to shared module
  padata: use __this_cpu_read per-cpu helper
  crypto: s5p-sss - Fix compilation error
  crypto: picoxcell - Add terminating entry for platform_device_id table
  crypto: omap-aes - select BLKCIPHER2
  crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of camellia cipher
  crypto: camellia-x86_64 - share common functions and move structures and function definitions to header file
  crypto: tcrypt - add async speed test for camellia cipher
  crypto: tegra-aes - fix error-valued pointer dereference
  crypto: tegra - fix missing unlock on error case
  crypto: cast5/avx - avoid using temporary stack buffers
  crypto: serpent/avx - avoid using temporary stack buffers
  crypto: twofish/avx - avoid using temporary stack buffers
  crypto: cast6/avx - avoid using temporary stack buffers
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/crypto/Makefile                      |    5
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx-asm_64.S   | 1102
-rw-r--r--  arch/x86/crypto/camellia_aesni_avx_glue.c     |  558
-rw-r--r--  arch/x86/crypto/camellia_glue.c               |   92
-rw-r--r--  arch/x86/crypto/cast5-avx-x86_64-asm_64.S     |  348
-rw-r--r--  arch/x86/crypto/cast5_avx_glue.c              |   79
-rw-r--r--  arch/x86/crypto/cast6-avx-x86_64-asm_64.S     |  206
-rw-r--r--  arch/x86/crypto/cast6_avx_glue.c              |   77
-rw-r--r--  arch/x86/crypto/crc32c-intel_glue.c (renamed from arch/x86/crypto/crc32c-intel.c) | 81
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S     |  460
-rw-r--r--  arch/x86/crypto/glue_helper-asm-avx.S         |   91
-rw-r--r--  arch/x86/crypto/glue_helper.c                 |   12
-rw-r--r--  arch/x86/crypto/serpent-avx-x86_64-asm_64.S   |  166
-rw-r--r--  arch/x86/crypto/serpent_avx_glue.c            |   49
-rw-r--r--  arch/x86/crypto/serpent_sse2_glue.c           |   12
-rw-r--r--  arch/x86/crypto/twofish-avx-x86_64-asm_64.S   |  208
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c            |   73
-rw-r--r--  arch/x86/crypto/twofish_glue_3way.c           |   20
-rw-r--r--  arch/x86/include/asm/crypto/camellia.h        |   82
-rw-r--r--  arch/x86/include/asm/crypto/glue_helper.h     |   28
-rw-r--r--  arch/x86/include/asm/crypto/serpent-avx.h     |   27
-rw-r--r--  arch/x86/include/asm/crypto/twofish.h         |    4
22 files changed, 3160 insertions(+), 620 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5bacb4a226ac..e0ca7c9ac383 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+				camellia_aesni_avx_glue.o
 cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
 cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
@@ -47,3 +50,5 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..2306d2e4816f
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,1102 @@
1/*
2 * x86_64/AVX/AES-NI assembler implementation of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13/*
14 * Version licensed under 2-clause BSD License is available at:
15 * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
16 */
17
18#define CAMELLIA_TABLE_BYTE_LEN 272
19
20/* struct camellia_ctx: */
21#define key_table 0
22#define key_length CAMELLIA_TABLE_BYTE_LEN
23
24/* register macros */
25#define CTX %rdi
26
27/**********************************************************************
28 16-way camellia
29 **********************************************************************/
30#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
31 vpand x, mask4bit, tmp0; \
32 vpandn x, mask4bit, x; \
33 vpsrld $4, x, x; \
34 \
35 vpshufb tmp0, lo_t, tmp0; \
36 vpshufb x, hi_t, x; \
37 vpxor tmp0, x, x;
38
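For reference, a minimal scalar sketch (not part of the patch) of what filter_8bit computes for a single byte: the byte is split into its low and high nibbles, each nibble indexes a 16-entry table (the lo_t/hi_t vectors looked up with vpshufb), and the two results are XORed. The helper name below is hypothetical.

#include <stdint.h>

/* Illustrative scalar equivalent of filter_8bit for one byte; lo_t/hi_t
 * stand in for the 16-byte nibble lookup tables used by vpshufb. */
static inline uint8_t filter_8bit_scalar(uint8_t x,
					 const uint8_t lo_t[16],
					 const uint8_t hi_t[16])
{
	return lo_t[x & 0x0f] ^ hi_t[x >> 4];
}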
39/*
40 * IN:
41 * x0..x7: byte-sliced AB state
42 * mem_cd: register pointer storing CD state
43 * key: index for key material
44 * OUT:
45 * x0..x7: new byte-sliced CD state
46 */
47#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
48 t7, mem_cd, key) \
49 /* \
50 * S-function with AES subbytes \
51 */ \
52 vmovdqa .Linv_shift_row, t4; \
53 vbroadcastss .L0f0f0f0f, t7; \
54 vmovdqa .Lpre_tf_lo_s1, t0; \
55 vmovdqa .Lpre_tf_hi_s1, t1; \
56 \
57 /* AES inverse shift rows */ \
58 vpshufb t4, x0, x0; \
59 vpshufb t4, x7, x7; \
60 vpshufb t4, x1, x1; \
61 vpshufb t4, x4, x4; \
62 vpshufb t4, x2, x2; \
63 vpshufb t4, x5, x5; \
64 vpshufb t4, x3, x3; \
65 vpshufb t4, x6, x6; \
66 \
67 /* prefilter sboxes 1, 2 and 3 */ \
68 vmovdqa .Lpre_tf_lo_s4, t2; \
69 vmovdqa .Lpre_tf_hi_s4, t3; \
70 filter_8bit(x0, t0, t1, t7, t6); \
71 filter_8bit(x7, t0, t1, t7, t6); \
72 filter_8bit(x1, t0, t1, t7, t6); \
73 filter_8bit(x4, t0, t1, t7, t6); \
74 filter_8bit(x2, t0, t1, t7, t6); \
75 filter_8bit(x5, t0, t1, t7, t6); \
76 \
77 /* prefilter sbox 4 */ \
78 vpxor t4, t4, t4; \
79 filter_8bit(x3, t2, t3, t7, t6); \
80 filter_8bit(x6, t2, t3, t7, t6); \
81 \
82 /* AES subbytes + AES shift rows */ \
83 vmovdqa .Lpost_tf_lo_s1, t0; \
84 vmovdqa .Lpost_tf_hi_s1, t1; \
85 vaesenclast t4, x0, x0; \
86 vaesenclast t4, x7, x7; \
87 vaesenclast t4, x1, x1; \
88 vaesenclast t4, x4, x4; \
89 vaesenclast t4, x2, x2; \
90 vaesenclast t4, x5, x5; \
91 vaesenclast t4, x3, x3; \
92 vaesenclast t4, x6, x6; \
93 \
94 /* postfilter sboxes 1 and 4 */ \
95 vmovdqa .Lpost_tf_lo_s3, t2; \
96 vmovdqa .Lpost_tf_hi_s3, t3; \
97 filter_8bit(x0, t0, t1, t7, t6); \
98 filter_8bit(x7, t0, t1, t7, t6); \
99 filter_8bit(x3, t0, t1, t7, t6); \
100 filter_8bit(x6, t0, t1, t7, t6); \
101 \
102 /* postfilter sbox 3 */ \
103 vmovdqa .Lpost_tf_lo_s2, t4; \
104 vmovdqa .Lpost_tf_hi_s2, t5; \
105 filter_8bit(x2, t2, t3, t7, t6); \
106 filter_8bit(x5, t2, t3, t7, t6); \
107 \
108 vpxor t6, t6, t6; \
109 vmovq key, t0; \
110 \
111 /* postfilter sbox 2 */ \
112 filter_8bit(x1, t4, t5, t7, t2); \
113 filter_8bit(x4, t4, t5, t7, t2); \
114 \
115 vpsrldq $5, t0, t5; \
116 vpsrldq $1, t0, t1; \
117 vpsrldq $2, t0, t2; \
118 vpsrldq $3, t0, t3; \
119 vpsrldq $4, t0, t4; \
120 vpshufb t6, t0, t0; \
121 vpshufb t6, t1, t1; \
122 vpshufb t6, t2, t2; \
123 vpshufb t6, t3, t3; \
124 vpshufb t6, t4, t4; \
125 vpsrldq $2, t5, t7; \
126 vpshufb t6, t7, t7; \
127 \
128 /* \
129 * P-function \
130 */ \
131 vpxor x5, x0, x0; \
132 vpxor x6, x1, x1; \
133 vpxor x7, x2, x2; \
134 vpxor x4, x3, x3; \
135 \
136 vpxor x2, x4, x4; \
137 vpxor x3, x5, x5; \
138 vpxor x0, x6, x6; \
139 vpxor x1, x7, x7; \
140 \
141 vpxor x7, x0, x0; \
142 vpxor x4, x1, x1; \
143 vpxor x5, x2, x2; \
144 vpxor x6, x3, x3; \
145 \
146 vpxor x3, x4, x4; \
147 vpxor x0, x5, x5; \
148 vpxor x1, x6, x6; \
149 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
150 \
151 /* \
152 * Add key material and result to CD (x becomes new CD) \
153 */ \
154 \
155 vpxor t3, x4, x4; \
156 vpxor 0 * 16(mem_cd), x4, x4; \
157 \
158 vpxor t2, x5, x5; \
159 vpxor 1 * 16(mem_cd), x5, x5; \
160 \
161 vpsrldq $1, t5, t3; \
162 vpshufb t6, t5, t5; \
163 vpshufb t6, t3, t6; \
164 \
165 vpxor t1, x6, x6; \
166 vpxor 2 * 16(mem_cd), x6, x6; \
167 \
168 vpxor t0, x7, x7; \
169 vpxor 3 * 16(mem_cd), x7, x7; \
170 \
171 vpxor t7, x0, x0; \
172 vpxor 4 * 16(mem_cd), x0, x0; \
173 \
174 vpxor t6, x1, x1; \
175 vpxor 5 * 16(mem_cd), x1, x1; \
176 \
177 vpxor t5, x2, x2; \
178 vpxor 6 * 16(mem_cd), x2, x2; \
179 \
180 vpxor t4, x3, x3; \
181 vpxor 7 * 16(mem_cd), x3, x3;
182
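As a rough sketch (assumption: illustrative only, not from the patch), the vmovq/vpsrldq/vpshufb sequence in roundsm16 broadcasts each byte of the 64-bit round key across a full 16-byte vector, one broadcast per byte-slice, before XORing the key material into the state. The hypothetical scalar equivalent of the broadcast step:

#include <stdint.h>

/* Broadcast byte i of the 64-bit subkey into a 16-byte lane, one lane per
 * block, mirroring the vpsrldq + vpshufb (zeroed mask) pattern above. */
static void broadcast_key_bytes(uint64_t key, uint8_t out[8][16])
{
	for (int i = 0; i < 8; i++) {
		uint8_t b = (key >> (8 * i)) & 0xff;	/* byte i of the subkey */
		for (int j = 0; j < 16; j++)
			out[i][j] = b;
	}
}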
183/*
184 * Size optimization... with inlined roundsm16, binary would be over 5 times
185 * larger and would only be 0.5% faster (on sandy-bridge).
186 */
187.align 8
188roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
189 roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
190 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
191 %rcx, (%r9));
192 ret;
193
194.align 8
195roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
196 roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
197 %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
198 %rax, (%r9));
199 ret;
200
201/*
202 * IN/OUT:
203 * x0..x7: byte-sliced AB state preloaded
204 * mem_ab: byte-sliced AB state in memory
 205 * mem_cd: byte-sliced CD state in memory
206 */
207#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
208 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
209 leaq (key_table + (i) * 8)(CTX), %r9; \
210 call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
211 \
212 vmovdqu x4, 0 * 16(mem_cd); \
213 vmovdqu x5, 1 * 16(mem_cd); \
214 vmovdqu x6, 2 * 16(mem_cd); \
215 vmovdqu x7, 3 * 16(mem_cd); \
216 vmovdqu x0, 4 * 16(mem_cd); \
217 vmovdqu x1, 5 * 16(mem_cd); \
218 vmovdqu x2, 6 * 16(mem_cd); \
219 vmovdqu x3, 7 * 16(mem_cd); \
220 \
221 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
222 call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
223 \
224 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
225
226#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
227
228#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
229 /* Store new AB state */ \
230 vmovdqu x0, 0 * 16(mem_ab); \
231 vmovdqu x1, 1 * 16(mem_ab); \
232 vmovdqu x2, 2 * 16(mem_ab); \
233 vmovdqu x3, 3 * 16(mem_ab); \
234 vmovdqu x4, 4 * 16(mem_ab); \
235 vmovdqu x5, 5 * 16(mem_ab); \
236 vmovdqu x6, 6 * 16(mem_ab); \
237 vmovdqu x7, 7 * 16(mem_ab);
238
239#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
240 y6, y7, mem_ab, mem_cd, i) \
241 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
242 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
243 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
244 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
245 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
246 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
247
248#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
249 y6, y7, mem_ab, mem_cd, i) \
250 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
251 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
252 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
253 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
254 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
255 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
256
257/*
258 * IN:
259 * v0..3: byte-sliced 32-bit integers
260 * OUT:
261 * v0..3: (IN <<< 1)
262 */
263#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
264 vpcmpgtb v0, zero, t0; \
265 vpaddb v0, v0, v0; \
266 vpabsb t0, t0; \
267 \
268 vpcmpgtb v1, zero, t1; \
269 vpaddb v1, v1, v1; \
270 vpabsb t1, t1; \
271 \
272 vpcmpgtb v2, zero, t2; \
273 vpaddb v2, v2, v2; \
274 vpabsb t2, t2; \
275 \
276 vpor t0, v1, v1; \
277 \
278 vpcmpgtb v3, zero, t0; \
279 vpaddb v3, v3, v3; \
280 vpabsb t0, t0; \
281 \
282 vpor t1, v2, v2; \
283 vpor t2, v3, v3; \
284 vpor t0, v0, v0;
285
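The rol32_1_16 macro above computes, for sixteen byte-sliced 32-bit words at once, the plain rotate-left-by-one shown below; vpcmpgtb/vpabsb extract each byte's top bit so it can be carried into the next byte slice. A minimal scalar sketch (hypothetical helper, not part of the patch):

#include <stdint.h>

/* Rotate a 32-bit word left by one bit. */
static inline uint32_t rol32_by_1(uint32_t v)
{
	return (v << 1) | (v >> 31);
}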
286/*
287 * IN:
288 * r: byte-sliced AB state in memory
289 * l: byte-sliced CD state in memory
290 * OUT:
291 * x0..x7: new byte-sliced CD state
292 */
293#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
294 tt1, tt2, tt3, kll, klr, krl, krr) \
295 /* \
296 * t0 = kll; \
297 * t0 &= ll; \
298 * lr ^= rol32(t0, 1); \
299 */ \
300 vpxor tt0, tt0, tt0; \
301 vmovd kll, t0; \
302 vpshufb tt0, t0, t3; \
303 vpsrldq $1, t0, t0; \
304 vpshufb tt0, t0, t2; \
305 vpsrldq $1, t0, t0; \
306 vpshufb tt0, t0, t1; \
307 vpsrldq $1, t0, t0; \
308 vpshufb tt0, t0, t0; \
309 \
310 vpand l0, t0, t0; \
311 vpand l1, t1, t1; \
312 vpand l2, t2, t2; \
313 vpand l3, t3, t3; \
314 \
315 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
316 \
317 vpxor l4, t0, l4; \
318 vmovdqu l4, 4 * 16(l); \
319 vpxor l5, t1, l5; \
320 vmovdqu l5, 5 * 16(l); \
321 vpxor l6, t2, l6; \
322 vmovdqu l6, 6 * 16(l); \
323 vpxor l7, t3, l7; \
324 vmovdqu l7, 7 * 16(l); \
325 \
326 /* \
327 * t2 = krr; \
328 * t2 |= rr; \
329 * rl ^= t2; \
330 */ \
331 \
332 vmovd krr, t0; \
333 vpshufb tt0, t0, t3; \
334 vpsrldq $1, t0, t0; \
335 vpshufb tt0, t0, t2; \
336 vpsrldq $1, t0, t0; \
337 vpshufb tt0, t0, t1; \
338 vpsrldq $1, t0, t0; \
339 vpshufb tt0, t0, t0; \
340 \
341 vpor 4 * 16(r), t0, t0; \
342 vpor 5 * 16(r), t1, t1; \
343 vpor 6 * 16(r), t2, t2; \
344 vpor 7 * 16(r), t3, t3; \
345 \
346 vpxor 0 * 16(r), t0, t0; \
347 vpxor 1 * 16(r), t1, t1; \
348 vpxor 2 * 16(r), t2, t2; \
349 vpxor 3 * 16(r), t3, t3; \
350 vmovdqu t0, 0 * 16(r); \
351 vmovdqu t1, 1 * 16(r); \
352 vmovdqu t2, 2 * 16(r); \
353 vmovdqu t3, 3 * 16(r); \
354 \
355 /* \
356 * t2 = krl; \
357 * t2 &= rl; \
358 * rr ^= rol32(t2, 1); \
359 */ \
360 vmovd krl, t0; \
361 vpshufb tt0, t0, t3; \
362 vpsrldq $1, t0, t0; \
363 vpshufb tt0, t0, t2; \
364 vpsrldq $1, t0, t0; \
365 vpshufb tt0, t0, t1; \
366 vpsrldq $1, t0, t0; \
367 vpshufb tt0, t0, t0; \
368 \
369 vpand 0 * 16(r), t0, t0; \
370 vpand 1 * 16(r), t1, t1; \
371 vpand 2 * 16(r), t2, t2; \
372 vpand 3 * 16(r), t3, t3; \
373 \
374 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
375 \
376 vpxor 4 * 16(r), t0, t0; \
377 vpxor 5 * 16(r), t1, t1; \
378 vpxor 6 * 16(r), t2, t2; \
379 vpxor 7 * 16(r), t3, t3; \
380 vmovdqu t0, 4 * 16(r); \
381 vmovdqu t1, 5 * 16(r); \
382 vmovdqu t2, 6 * 16(r); \
383 vmovdqu t3, 7 * 16(r); \
384 \
385 /* \
386 * t0 = klr; \
387 * t0 |= lr; \
388 * ll ^= t0; \
389 */ \
390 \
391 vmovd klr, t0; \
392 vpshufb tt0, t0, t3; \
393 vpsrldq $1, t0, t0; \
394 vpshufb tt0, t0, t2; \
395 vpsrldq $1, t0, t0; \
396 vpshufb tt0, t0, t1; \
397 vpsrldq $1, t0, t0; \
398 vpshufb tt0, t0, t0; \
399 \
400 vpor l4, t0, t0; \
401 vpor l5, t1, t1; \
402 vpor l6, t2, t2; \
403 vpor l7, t3, t3; \
404 \
405 vpxor l0, t0, l0; \
406 vmovdqu l0, 0 * 16(l); \
407 vpxor l1, t1, l1; \
408 vmovdqu l1, 1 * 16(l); \
409 vpxor l2, t2, l2; \
410 vmovdqu l2, 2 * 16(l); \
411 vpxor l3, t3, l3; \
412 vmovdqu l3, 3 * 16(l);
413
414#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
415 vpunpckhdq x1, x0, t2; \
416 vpunpckldq x1, x0, x0; \
417 \
418 vpunpckldq x3, x2, t1; \
419 vpunpckhdq x3, x2, x2; \
420 \
421 vpunpckhqdq t1, x0, x1; \
422 vpunpcklqdq t1, x0, x0; \
423 \
424 vpunpckhqdq x2, t2, x3; \
425 vpunpcklqdq x2, t2, x2;
426
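For orientation, transpose_4x4 performs a 4x4 transpose of 32-bit words spread across four XMM registers (vpunpck*dq/vpunpck*qdq). A scalar sketch of the same operation, with a hypothetical helper name:

#include <stdint.h>

/* In-place 4x4 transpose of 32-bit words. */
static void transpose_4x4_scalar(uint32_t m[4][4])
{
	for (int i = 0; i < 4; i++) {
		for (int j = i + 1; j < 4; j++) {
			uint32_t t = m[i][j];
			m[i][j] = m[j][i];
			m[j][i] = t;
		}
	}
}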
427#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
428 b3, c3, d3, st0, st1) \
429 vmovdqu d2, st0; \
430 vmovdqu d3, st1; \
431 transpose_4x4(a0, a1, a2, a3, d2, d3); \
432 transpose_4x4(b0, b1, b2, b3, d2, d3); \
433 vmovdqu st0, d2; \
434 vmovdqu st1, d3; \
435 \
436 vmovdqu a0, st0; \
437 vmovdqu a1, st1; \
438 transpose_4x4(c0, c1, c2, c3, a0, a1); \
439 transpose_4x4(d0, d1, d2, d3, a0, a1); \
440 \
441 vmovdqu .Lshufb_16x16b, a0; \
442 vmovdqu st1, a1; \
443 vpshufb a0, a2, a2; \
444 vpshufb a0, a3, a3; \
445 vpshufb a0, b0, b0; \
446 vpshufb a0, b1, b1; \
447 vpshufb a0, b2, b2; \
448 vpshufb a0, b3, b3; \
449 vpshufb a0, a1, a1; \
450 vpshufb a0, c0, c0; \
451 vpshufb a0, c1, c1; \
452 vpshufb a0, c2, c2; \
453 vpshufb a0, c3, c3; \
454 vpshufb a0, d0, d0; \
455 vpshufb a0, d1, d1; \
456 vpshufb a0, d2, d2; \
457 vpshufb a0, d3, d3; \
458 vmovdqu d3, st1; \
459 vmovdqu st0, d3; \
460 vpshufb a0, d3, a0; \
461 vmovdqu d2, st0; \
462 \
463 transpose_4x4(a0, b0, c0, d0, d2, d3); \
464 transpose_4x4(a1, b1, c1, d1, d2, d3); \
465 vmovdqu st0, d2; \
466 vmovdqu st1, d3; \
467 \
468 vmovdqu b0, st0; \
469 vmovdqu b1, st1; \
470 transpose_4x4(a2, b2, c2, d2, b0, b1); \
471 transpose_4x4(a3, b3, c3, d3, b0, b1); \
472 vmovdqu st0, b0; \
473 vmovdqu st1, b1; \
474 /* does not adjust output bytes inside vectors */
475
476/* load blocks to registers and apply pre-whitening */
477#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
478 y6, y7, rio, key) \
479 vmovq key, x0; \
480 vpshufb .Lpack_bswap, x0, x0; \
481 \
482 vpxor 0 * 16(rio), x0, y7; \
483 vpxor 1 * 16(rio), x0, y6; \
484 vpxor 2 * 16(rio), x0, y5; \
485 vpxor 3 * 16(rio), x0, y4; \
486 vpxor 4 * 16(rio), x0, y3; \
487 vpxor 5 * 16(rio), x0, y2; \
488 vpxor 6 * 16(rio), x0, y1; \
489 vpxor 7 * 16(rio), x0, y0; \
490 vpxor 8 * 16(rio), x0, x7; \
491 vpxor 9 * 16(rio), x0, x6; \
492 vpxor 10 * 16(rio), x0, x5; \
493 vpxor 11 * 16(rio), x0, x4; \
494 vpxor 12 * 16(rio), x0, x3; \
495 vpxor 13 * 16(rio), x0, x2; \
496 vpxor 14 * 16(rio), x0, x1; \
497 vpxor 15 * 16(rio), x0, x0;
498
499/* byteslice pre-whitened blocks and store to temporary memory */
500#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
501 y6, y7, mem_ab, mem_cd) \
502 byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
503 y5, y6, y7, (mem_ab), (mem_cd)); \
504 \
505 vmovdqu x0, 0 * 16(mem_ab); \
506 vmovdqu x1, 1 * 16(mem_ab); \
507 vmovdqu x2, 2 * 16(mem_ab); \
508 vmovdqu x3, 3 * 16(mem_ab); \
509 vmovdqu x4, 4 * 16(mem_ab); \
510 vmovdqu x5, 5 * 16(mem_ab); \
511 vmovdqu x6, 6 * 16(mem_ab); \
512 vmovdqu x7, 7 * 16(mem_ab); \
513 vmovdqu y0, 0 * 16(mem_cd); \
514 vmovdqu y1, 1 * 16(mem_cd); \
515 vmovdqu y2, 2 * 16(mem_cd); \
516 vmovdqu y3, 3 * 16(mem_cd); \
517 vmovdqu y4, 4 * 16(mem_cd); \
518 vmovdqu y5, 5 * 16(mem_cd); \
519 vmovdqu y6, 6 * 16(mem_cd); \
520 vmovdqu y7, 7 * 16(mem_cd);
521
522/* de-byteslice, apply post-whitening and store blocks */
523#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
524 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
525 byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
526 y7, x3, x7, stack_tmp0, stack_tmp1); \
527 \
528 vmovdqu x0, stack_tmp0; \
529 \
530 vmovq key, x0; \
531 vpshufb .Lpack_bswap, x0, x0; \
532 \
533 vpxor x0, y7, y7; \
534 vpxor x0, y6, y6; \
535 vpxor x0, y5, y5; \
536 vpxor x0, y4, y4; \
537 vpxor x0, y3, y3; \
538 vpxor x0, y2, y2; \
539 vpxor x0, y1, y1; \
540 vpxor x0, y0, y0; \
541 vpxor x0, x7, x7; \
542 vpxor x0, x6, x6; \
543 vpxor x0, x5, x5; \
544 vpxor x0, x4, x4; \
545 vpxor x0, x3, x3; \
546 vpxor x0, x2, x2; \
547 vpxor x0, x1, x1; \
548 vpxor stack_tmp0, x0, x0;
549
550#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
551 y6, y7, rio) \
552 vmovdqu x0, 0 * 16(rio); \
553 vmovdqu x1, 1 * 16(rio); \
554 vmovdqu x2, 2 * 16(rio); \
555 vmovdqu x3, 3 * 16(rio); \
556 vmovdqu x4, 4 * 16(rio); \
557 vmovdqu x5, 5 * 16(rio); \
558 vmovdqu x6, 6 * 16(rio); \
559 vmovdqu x7, 7 * 16(rio); \
560 vmovdqu y0, 8 * 16(rio); \
561 vmovdqu y1, 9 * 16(rio); \
562 vmovdqu y2, 10 * 16(rio); \
563 vmovdqu y3, 11 * 16(rio); \
564 vmovdqu y4, 12 * 16(rio); \
565 vmovdqu y5, 13 * 16(rio); \
566 vmovdqu y6, 14 * 16(rio); \
567 vmovdqu y7, 15 * 16(rio);
568
569.data
570.align 16
571
572#define SHUFB_BYTES(idx) \
573 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
574
575.Lshufb_16x16b:
576 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
577
578.Lpack_bswap:
579 .long 0x00010203
580 .long 0x04050607
581 .long 0x80808080
582 .long 0x80808080
583
584/* For CTR-mode IV byteswap */
585.Lbswap128_mask:
586 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
587
588/*
589 * pre-SubByte transform
590 *
591 * pre-lookup for sbox1, sbox2, sbox3:
592 * swap_bitendianness(
593 * isom_map_camellia_to_aes(
594 * camellia_f(
 595 * swap_bitendianness(in)
596 * )
597 * )
598 * )
599 *
600 * (note: '⊕ 0xc5' inside camellia_f())
601 */
602.Lpre_tf_lo_s1:
603 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
604 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
605.Lpre_tf_hi_s1:
606 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
607 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
608
609/*
610 * pre-SubByte transform
611 *
612 * pre-lookup for sbox4:
613 * swap_bitendianness(
614 * isom_map_camellia_to_aes(
615 * camellia_f(
 616 * swap_bitendianness(in <<< 1)
617 * )
618 * )
619 * )
620 *
621 * (note: '⊕ 0xc5' inside camellia_f())
622 */
623.Lpre_tf_lo_s4:
624 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
625 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
626.Lpre_tf_hi_s4:
627 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
628 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
629
630/*
631 * post-SubByte transform
632 *
633 * post-lookup for sbox1, sbox4:
634 * swap_bitendianness(
635 * camellia_h(
636 * isom_map_aes_to_camellia(
637 * swap_bitendianness(
638 * aes_inverse_affine_transform(in)
639 * )
640 * )
641 * )
642 * )
643 *
644 * (note: '⊕ 0x6e' inside camellia_h())
645 */
646.Lpost_tf_lo_s1:
647 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
648 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
649.Lpost_tf_hi_s1:
650 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
651 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
652
653/*
654 * post-SubByte transform
655 *
656 * post-lookup for sbox2:
657 * swap_bitendianness(
658 * camellia_h(
659 * isom_map_aes_to_camellia(
660 * swap_bitendianness(
661 * aes_inverse_affine_transform(in)
662 * )
663 * )
664 * )
665 * ) <<< 1
666 *
667 * (note: '⊕ 0x6e' inside camellia_h())
668 */
669.Lpost_tf_lo_s2:
670 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
671 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
672.Lpost_tf_hi_s2:
673 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
674 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
675
676/*
677 * post-SubByte transform
678 *
679 * post-lookup for sbox3:
680 * swap_bitendianness(
681 * camellia_h(
682 * isom_map_aes_to_camellia(
683 * swap_bitendianness(
684 * aes_inverse_affine_transform(in)
685 * )
686 * )
687 * )
688 * ) >>> 1
689 *
690 * (note: '⊕ 0x6e' inside camellia_h())
691 */
692.Lpost_tf_lo_s3:
693 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
694 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
695.Lpost_tf_hi_s3:
696 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
697 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
698
699/* For isolating SubBytes from AESENCLAST, inverse shift row */
700.Linv_shift_row:
701 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
702 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
703
704/* 4-bit mask */
705.align 4
706.L0f0f0f0f:
707 .long 0x0f0f0f0f
708
709.text
710
711.align 8
712.type __camellia_enc_blk16,@function;
713
714__camellia_enc_blk16:
715 /* input:
716 * %rdi: ctx, CTX
717 * %rax: temporary storage, 256 bytes
718 * %xmm0..%xmm15: 16 plaintext blocks
719 * output:
720 * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
 721 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
722 */
723
724 leaq 8 * 16(%rax), %rcx;
725
726 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
727 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
728 %xmm15, %rax, %rcx);
729
730 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
731 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
732 %xmm15, %rax, %rcx, 0);
733
734 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
735 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
736 %xmm15,
737 ((key_table + (8) * 8) + 0)(CTX),
738 ((key_table + (8) * 8) + 4)(CTX),
739 ((key_table + (8) * 8) + 8)(CTX),
740 ((key_table + (8) * 8) + 12)(CTX));
741
742 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
743 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
744 %xmm15, %rax, %rcx, 8);
745
746 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
747 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
748 %xmm15,
749 ((key_table + (16) * 8) + 0)(CTX),
750 ((key_table + (16) * 8) + 4)(CTX),
751 ((key_table + (16) * 8) + 8)(CTX),
752 ((key_table + (16) * 8) + 12)(CTX));
753
754 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
755 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
756 %xmm15, %rax, %rcx, 16);
757
758 movl $24, %r8d;
759 cmpl $16, key_length(CTX);
760 jne .Lenc_max32;
761
762.Lenc_done:
763 /* load CD for output */
764 vmovdqu 0 * 16(%rcx), %xmm8;
765 vmovdqu 1 * 16(%rcx), %xmm9;
766 vmovdqu 2 * 16(%rcx), %xmm10;
767 vmovdqu 3 * 16(%rcx), %xmm11;
768 vmovdqu 4 * 16(%rcx), %xmm12;
769 vmovdqu 5 * 16(%rcx), %xmm13;
770 vmovdqu 6 * 16(%rcx), %xmm14;
771 vmovdqu 7 * 16(%rcx), %xmm15;
772
773 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
774 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
775 %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
776
777 ret;
778
779.align 8
780.Lenc_max32:
781 movl $32, %r8d;
782
783 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
784 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
785 %xmm15,
786 ((key_table + (24) * 8) + 0)(CTX),
787 ((key_table + (24) * 8) + 4)(CTX),
788 ((key_table + (24) * 8) + 8)(CTX),
789 ((key_table + (24) * 8) + 12)(CTX));
790
791 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
792 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
793 %xmm15, %rax, %rcx, 24);
794
795 jmp .Lenc_done;
796
797.align 8
798.type __camellia_dec_blk16,@function;
799
800__camellia_dec_blk16:
801 /* input:
802 * %rdi: ctx, CTX
803 * %rax: temporary storage, 256 bytes
804 * %r8d: 24 for 16 byte key, 32 for larger
805 * %xmm0..%xmm15: 16 encrypted blocks
806 * output:
807 * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
 808 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
809 */
810
811 leaq 8 * 16(%rax), %rcx;
812
813 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
814 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
815 %xmm15, %rax, %rcx);
816
817 cmpl $32, %r8d;
818 je .Ldec_max32;
819
820.Ldec_max24:
821 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
822 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
823 %xmm15, %rax, %rcx, 16);
824
825 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
826 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
827 %xmm15,
828 ((key_table + (16) * 8) + 8)(CTX),
829 ((key_table + (16) * 8) + 12)(CTX),
830 ((key_table + (16) * 8) + 0)(CTX),
831 ((key_table + (16) * 8) + 4)(CTX));
832
833 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
834 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
835 %xmm15, %rax, %rcx, 8);
836
837 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
838 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
839 %xmm15,
840 ((key_table + (8) * 8) + 8)(CTX),
841 ((key_table + (8) * 8) + 12)(CTX),
842 ((key_table + (8) * 8) + 0)(CTX),
843 ((key_table + (8) * 8) + 4)(CTX));
844
845 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
846 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
847 %xmm15, %rax, %rcx, 0);
848
849 /* load CD for output */
850 vmovdqu 0 * 16(%rcx), %xmm8;
851 vmovdqu 1 * 16(%rcx), %xmm9;
852 vmovdqu 2 * 16(%rcx), %xmm10;
853 vmovdqu 3 * 16(%rcx), %xmm11;
854 vmovdqu 4 * 16(%rcx), %xmm12;
855 vmovdqu 5 * 16(%rcx), %xmm13;
856 vmovdqu 6 * 16(%rcx), %xmm14;
857 vmovdqu 7 * 16(%rcx), %xmm15;
858
859 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
860 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
861 %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
862
863 ret;
864
865.align 8
866.Ldec_max32:
867 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
868 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
869 %xmm15, %rax, %rcx, 24);
870
871 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
872 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
873 %xmm15,
874 ((key_table + (24) * 8) + 8)(CTX),
875 ((key_table + (24) * 8) + 12)(CTX),
876 ((key_table + (24) * 8) + 0)(CTX),
877 ((key_table + (24) * 8) + 4)(CTX));
878
879 jmp .Ldec_max24;
880
881.align 8
882.global camellia_ecb_enc_16way
883.type camellia_ecb_enc_16way,@function;
884
885camellia_ecb_enc_16way:
886 /* input:
887 * %rdi: ctx, CTX
888 * %rsi: dst (16 blocks)
889 * %rdx: src (16 blocks)
890 */
891
892 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
893 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
894 %xmm15, %rdx, (key_table)(CTX));
895
896 /* now dst can be used as temporary buffer (even in src == dst case) */
897 movq %rsi, %rax;
898
899 call __camellia_enc_blk16;
900
901 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
902 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
903 %xmm8, %rsi);
904
905 ret;
906
907.align 8
908.global camellia_ecb_dec_16way
909.type camellia_ecb_dec_16way,@function;
910
911camellia_ecb_dec_16way:
912 /* input:
913 * %rdi: ctx, CTX
914 * %rsi: dst (16 blocks)
915 * %rdx: src (16 blocks)
916 */
917
918 cmpl $16, key_length(CTX);
919 movl $32, %r8d;
920 movl $24, %eax;
921 cmovel %eax, %r8d; /* max */
922
923 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
924 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
925 %xmm15, %rdx, (key_table)(CTX, %r8, 8));
926
927 /* now dst can be used as temporary buffer (even in src == dst case) */
928 movq %rsi, %rax;
929
930 call __camellia_dec_blk16;
931
932 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
933 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
934 %xmm8, %rsi);
935
936 ret;
937
938.align 8
939.global camellia_cbc_dec_16way
940.type camellia_cbc_dec_16way,@function;
941
942camellia_cbc_dec_16way:
943 /* input:
944 * %rdi: ctx, CTX
945 * %rsi: dst (16 blocks)
946 * %rdx: src (16 blocks)
947 */
948
949 cmpl $16, key_length(CTX);
950 movl $32, %r8d;
951 movl $24, %eax;
952 cmovel %eax, %r8d; /* max */
953
954 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
955 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
956 %xmm15, %rdx, (key_table)(CTX, %r8, 8));
957
958 /*
959 * dst might still be in-use (in case dst == src), so use stack for
960 * temporary storage.
961 */
962 subq $(16 * 16), %rsp;
963 movq %rsp, %rax;
964
965 call __camellia_dec_blk16;
966
967 addq $(16 * 16), %rsp;
968
969 vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
970 vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
971 vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
972 vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
973 vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
974 vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
975 vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
976 vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
977 vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
978 vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
979 vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
980 vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
981 vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
982 vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
983 vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
984 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
985 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
986 %xmm8, %rsi);
987
988 ret;
989
990#define inc_le128(x, minus_one, tmp) \
991 vpcmpeqq minus_one, x, tmp; \
992 vpsubq minus_one, x, x; \
993 vpslldq $8, tmp, tmp; \
994 vpsubq tmp, x, x;
995
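The inc_le128 macro above increments a 128-bit little-endian counter held in an XMM register: vpcmpeqq against all-ones detects that the low 64-bit half is about to wrap, and the shifted mask then propagates the carry into the high half. A minimal scalar sketch (hypothetical helper, not part of the patch):

#include <stdint.h>

/* Increment a 128-bit little-endian counter stored as two 64-bit halves. */
static inline void inc_le128_scalar(uint64_t ctr[2])
{
	if (++ctr[0] == 0)	/* low half wrapped around */
		ctr[1]++;	/* propagate carry into the high half */
}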
996.align 8
997.global camellia_ctr_16way
998.type camellia_ctr_16way,@function;
999
1000camellia_ctr_16way:
1001 /* input:
1002 * %rdi: ctx, CTX
1003 * %rsi: dst (16 blocks)
1004 * %rdx: src (16 blocks)
1005 * %rcx: iv (little endian, 128bit)
1006 */
1007
1008 subq $(16 * 16), %rsp;
1009 movq %rsp, %rax;
1010
1011 vmovdqa .Lbswap128_mask, %xmm14;
1012
1013 /* load IV and byteswap */
1014 vmovdqu (%rcx), %xmm0;
1015 vpshufb %xmm14, %xmm0, %xmm15;
1016 vmovdqu %xmm15, 15 * 16(%rax);
1017
1018 vpcmpeqd %xmm15, %xmm15, %xmm15;
1019 vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
1020
1021 /* construct IVs */
1022 inc_le128(%xmm0, %xmm15, %xmm13);
1023 vpshufb %xmm14, %xmm0, %xmm13;
1024 vmovdqu %xmm13, 14 * 16(%rax);
1025 inc_le128(%xmm0, %xmm15, %xmm13);
1026 vpshufb %xmm14, %xmm0, %xmm13;
1027 vmovdqu %xmm13, 13 * 16(%rax);
1028 inc_le128(%xmm0, %xmm15, %xmm13);
1029 vpshufb %xmm14, %xmm0, %xmm12;
1030 inc_le128(%xmm0, %xmm15, %xmm13);
1031 vpshufb %xmm14, %xmm0, %xmm11;
1032 inc_le128(%xmm0, %xmm15, %xmm13);
1033 vpshufb %xmm14, %xmm0, %xmm10;
1034 inc_le128(%xmm0, %xmm15, %xmm13);
1035 vpshufb %xmm14, %xmm0, %xmm9;
1036 inc_le128(%xmm0, %xmm15, %xmm13);
1037 vpshufb %xmm14, %xmm0, %xmm8;
1038 inc_le128(%xmm0, %xmm15, %xmm13);
1039 vpshufb %xmm14, %xmm0, %xmm7;
1040 inc_le128(%xmm0, %xmm15, %xmm13);
1041 vpshufb %xmm14, %xmm0, %xmm6;
1042 inc_le128(%xmm0, %xmm15, %xmm13);
1043 vpshufb %xmm14, %xmm0, %xmm5;
1044 inc_le128(%xmm0, %xmm15, %xmm13);
1045 vpshufb %xmm14, %xmm0, %xmm4;
1046 inc_le128(%xmm0, %xmm15, %xmm13);
1047 vpshufb %xmm14, %xmm0, %xmm3;
1048 inc_le128(%xmm0, %xmm15, %xmm13);
1049 vpshufb %xmm14, %xmm0, %xmm2;
1050 inc_le128(%xmm0, %xmm15, %xmm13);
1051 vpshufb %xmm14, %xmm0, %xmm1;
1052 inc_le128(%xmm0, %xmm15, %xmm13);
1053 vmovdqa %xmm0, %xmm13;
1054 vpshufb %xmm14, %xmm0, %xmm0;
1055 inc_le128(%xmm13, %xmm15, %xmm14);
1056 vmovdqu %xmm13, (%rcx);
1057
1058 /* inpack16_pre: */
1059 vmovq (key_table)(CTX), %xmm15;
1060 vpshufb .Lpack_bswap, %xmm15, %xmm15;
1061 vpxor %xmm0, %xmm15, %xmm0;
1062 vpxor %xmm1, %xmm15, %xmm1;
1063 vpxor %xmm2, %xmm15, %xmm2;
1064 vpxor %xmm3, %xmm15, %xmm3;
1065 vpxor %xmm4, %xmm15, %xmm4;
1066 vpxor %xmm5, %xmm15, %xmm5;
1067 vpxor %xmm6, %xmm15, %xmm6;
1068 vpxor %xmm7, %xmm15, %xmm7;
1069 vpxor %xmm8, %xmm15, %xmm8;
1070 vpxor %xmm9, %xmm15, %xmm9;
1071 vpxor %xmm10, %xmm15, %xmm10;
1072 vpxor %xmm11, %xmm15, %xmm11;
1073 vpxor %xmm12, %xmm15, %xmm12;
1074 vpxor 13 * 16(%rax), %xmm15, %xmm13;
1075 vpxor 14 * 16(%rax), %xmm15, %xmm14;
1076 vpxor 15 * 16(%rax), %xmm15, %xmm15;
1077
1078 call __camellia_enc_blk16;
1079
1080 addq $(16 * 16), %rsp;
1081
1082 vpxor 0 * 16(%rdx), %xmm7, %xmm7;
1083 vpxor 1 * 16(%rdx), %xmm6, %xmm6;
1084 vpxor 2 * 16(%rdx), %xmm5, %xmm5;
1085 vpxor 3 * 16(%rdx), %xmm4, %xmm4;
1086 vpxor 4 * 16(%rdx), %xmm3, %xmm3;
1087 vpxor 5 * 16(%rdx), %xmm2, %xmm2;
1088 vpxor 6 * 16(%rdx), %xmm1, %xmm1;
1089 vpxor 7 * 16(%rdx), %xmm0, %xmm0;
1090 vpxor 8 * 16(%rdx), %xmm15, %xmm15;
1091 vpxor 9 * 16(%rdx), %xmm14, %xmm14;
1092 vpxor 10 * 16(%rdx), %xmm13, %xmm13;
1093 vpxor 11 * 16(%rdx), %xmm12, %xmm12;
1094 vpxor 12 * 16(%rdx), %xmm11, %xmm11;
1095 vpxor 13 * 16(%rdx), %xmm10, %xmm10;
1096 vpxor 14 * 16(%rdx), %xmm9, %xmm9;
1097 vpxor 15 * 16(%rdx), %xmm8, %xmm8;
1098 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
1099 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
1100 %xmm8, %rsi);
1101
1102 ret;
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
new file mode 100644
index 000000000000..96cbb6068fce
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -0,0 +1,558 @@
1/*
2 * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/crypto.h>
16#include <linux/err.h>
17#include <crypto/algapi.h>
18#include <crypto/ctr.h>
19#include <crypto/lrw.h>
20#include <crypto/xts.h>
21#include <asm/xcr.h>
22#include <asm/xsave.h>
23#include <asm/crypto/camellia.h>
24#include <asm/crypto/ablk_helper.h>
25#include <asm/crypto/glue_helper.h>
26
27#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
28
29/* 16-way AES-NI parallel cipher functions */
30asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
31 const u8 *src);
32asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
33 const u8 *src);
34
35asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
36 const u8 *src);
37asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
38 const u8 *src, le128 *iv);
39
40static const struct common_glue_ctx camellia_enc = {
41 .num_funcs = 3,
42 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
43
44 .funcs = { {
45 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
46 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
47 }, {
48 .num_blocks = 2,
49 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
50 }, {
51 .num_blocks = 1,
52 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
53 } }
54};
55
56static const struct common_glue_ctx camellia_ctr = {
57 .num_funcs = 3,
58 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
59
60 .funcs = { {
61 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
62 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
63 }, {
64 .num_blocks = 2,
65 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
66 }, {
67 .num_blocks = 1,
68 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
69 } }
70};
71
72static const struct common_glue_ctx camellia_dec = {
73 .num_funcs = 3,
74 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
75
76 .funcs = { {
77 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
78 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
79 }, {
80 .num_blocks = 2,
81 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
82 }, {
83 .num_blocks = 1,
84 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
85 } }
86};
87
88static const struct common_glue_ctx camellia_dec_cbc = {
89 .num_funcs = 3,
90 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
91
92 .funcs = { {
93 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
94 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
95 }, {
96 .num_blocks = 2,
97 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
98 }, {
99 .num_blocks = 1,
100 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
101 } }
102};
103
104static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
105 struct scatterlist *src, unsigned int nbytes)
106{
107 return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
108}
109
110static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
111 struct scatterlist *src, unsigned int nbytes)
112{
113 return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
114}
115
116static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
117 struct scatterlist *src, unsigned int nbytes)
118{
119 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
120 dst, src, nbytes);
121}
122
123static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
124 struct scatterlist *src, unsigned int nbytes)
125{
126 return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
127 nbytes);
128}
129
130static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
131 struct scatterlist *src, unsigned int nbytes)
132{
133 return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
134}
135
136static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
137{
138 return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
139 CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
140 nbytes);
141}
142
143static inline void camellia_fpu_end(bool fpu_enabled)
144{
145 glue_fpu_end(fpu_enabled);
146}
147
148static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
149 unsigned int key_len)
150{
151 return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
152 &tfm->crt_flags);
153}
154
155struct crypt_priv {
156 struct camellia_ctx *ctx;
157 bool fpu_enabled;
158};
159
160static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
161{
162 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
163 struct crypt_priv *ctx = priv;
164 int i;
165
166 ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
167
168 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
169 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
170 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
171 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
172 }
173
174 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
175 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
176 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
177 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
178 }
179
180 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
181 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
182}
183
184static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
185{
186 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
187 struct crypt_priv *ctx = priv;
188 int i;
189
190 ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
191
192 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
193 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
194 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
195 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
196 }
197
198 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
199 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
200 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
201 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
202 }
203
204 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
205 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
206}
207
208static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
209 struct scatterlist *src, unsigned int nbytes)
210{
211 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
212 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
213 struct crypt_priv crypt_ctx = {
214 .ctx = &ctx->camellia_ctx,
215 .fpu_enabled = false,
216 };
217 struct lrw_crypt_req req = {
218 .tbuf = buf,
219 .tbuflen = sizeof(buf),
220
221 .table_ctx = &ctx->lrw_table,
222 .crypt_ctx = &crypt_ctx,
223 .crypt_fn = encrypt_callback,
224 };
225 int ret;
226
227 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
228 ret = lrw_crypt(desc, dst, src, nbytes, &req);
229 camellia_fpu_end(crypt_ctx.fpu_enabled);
230
231 return ret;
232}
233
234static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
235 struct scatterlist *src, unsigned int nbytes)
236{
237 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
238 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
239 struct crypt_priv crypt_ctx = {
240 .ctx = &ctx->camellia_ctx,
241 .fpu_enabled = false,
242 };
243 struct lrw_crypt_req req = {
244 .tbuf = buf,
245 .tbuflen = sizeof(buf),
246
247 .table_ctx = &ctx->lrw_table,
248 .crypt_ctx = &crypt_ctx,
249 .crypt_fn = decrypt_callback,
250 };
251 int ret;
252
253 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
254 ret = lrw_crypt(desc, dst, src, nbytes, &req);
255 camellia_fpu_end(crypt_ctx.fpu_enabled);
256
257 return ret;
258}
259
260static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
261 struct scatterlist *src, unsigned int nbytes)
262{
263 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
264 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
265 struct crypt_priv crypt_ctx = {
266 .ctx = &ctx->crypt_ctx,
267 .fpu_enabled = false,
268 };
269 struct xts_crypt_req req = {
270 .tbuf = buf,
271 .tbuflen = sizeof(buf),
272
273 .tweak_ctx = &ctx->tweak_ctx,
274 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
275 .crypt_ctx = &crypt_ctx,
276 .crypt_fn = encrypt_callback,
277 };
278 int ret;
279
280 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
281 ret = xts_crypt(desc, dst, src, nbytes, &req);
282 camellia_fpu_end(crypt_ctx.fpu_enabled);
283
284 return ret;
285}
286
287static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
288 struct scatterlist *src, unsigned int nbytes)
289{
290 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
291 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
292 struct crypt_priv crypt_ctx = {
293 .ctx = &ctx->crypt_ctx,
294 .fpu_enabled = false,
295 };
296 struct xts_crypt_req req = {
297 .tbuf = buf,
298 .tbuflen = sizeof(buf),
299
300 .tweak_ctx = &ctx->tweak_ctx,
301 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
302 .crypt_ctx = &crypt_ctx,
303 .crypt_fn = decrypt_callback,
304 };
305 int ret;
306
307 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
308 ret = xts_crypt(desc, dst, src, nbytes, &req);
309 camellia_fpu_end(crypt_ctx.fpu_enabled);
310
311 return ret;
312}
313
314static struct crypto_alg cmll_algs[10] = { {
315 .cra_name = "__ecb-camellia-aesni",
316 .cra_driver_name = "__driver-ecb-camellia-aesni",
317 .cra_priority = 0,
318 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
319 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
320 .cra_ctxsize = sizeof(struct camellia_ctx),
321 .cra_alignmask = 0,
322 .cra_type = &crypto_blkcipher_type,
323 .cra_module = THIS_MODULE,
324 .cra_u = {
325 .blkcipher = {
326 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
327 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
328 .setkey = camellia_setkey,
329 .encrypt = ecb_encrypt,
330 .decrypt = ecb_decrypt,
331 },
332 },
333}, {
334 .cra_name = "__cbc-camellia-aesni",
335 .cra_driver_name = "__driver-cbc-camellia-aesni",
336 .cra_priority = 0,
337 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
338 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
339 .cra_ctxsize = sizeof(struct camellia_ctx),
340 .cra_alignmask = 0,
341 .cra_type = &crypto_blkcipher_type,
342 .cra_module = THIS_MODULE,
343 .cra_u = {
344 .blkcipher = {
345 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
346 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
347 .setkey = camellia_setkey,
348 .encrypt = cbc_encrypt,
349 .decrypt = cbc_decrypt,
350 },
351 },
352}, {
353 .cra_name = "__ctr-camellia-aesni",
354 .cra_driver_name = "__driver-ctr-camellia-aesni",
355 .cra_priority = 0,
356 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
357 .cra_blocksize = 1,
358 .cra_ctxsize = sizeof(struct camellia_ctx),
359 .cra_alignmask = 0,
360 .cra_type = &crypto_blkcipher_type,
361 .cra_module = THIS_MODULE,
362 .cra_u = {
363 .blkcipher = {
364 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
365 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
366 .ivsize = CAMELLIA_BLOCK_SIZE,
367 .setkey = camellia_setkey,
368 .encrypt = ctr_crypt,
369 .decrypt = ctr_crypt,
370 },
371 },
372}, {
373 .cra_name = "__lrw-camellia-aesni",
374 .cra_driver_name = "__driver-lrw-camellia-aesni",
375 .cra_priority = 0,
376 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
377 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
378 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
379 .cra_alignmask = 0,
380 .cra_type = &crypto_blkcipher_type,
381 .cra_module = THIS_MODULE,
382 .cra_exit = lrw_camellia_exit_tfm,
383 .cra_u = {
384 .blkcipher = {
385 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
386 CAMELLIA_BLOCK_SIZE,
387 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
388 CAMELLIA_BLOCK_SIZE,
389 .ivsize = CAMELLIA_BLOCK_SIZE,
390 .setkey = lrw_camellia_setkey,
391 .encrypt = lrw_encrypt,
392 .decrypt = lrw_decrypt,
393 },
394 },
395}, {
396 .cra_name = "__xts-camellia-aesni",
397 .cra_driver_name = "__driver-xts-camellia-aesni",
398 .cra_priority = 0,
399 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
400 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
401 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
402 .cra_alignmask = 0,
403 .cra_type = &crypto_blkcipher_type,
404 .cra_module = THIS_MODULE,
405 .cra_u = {
406 .blkcipher = {
407 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
408 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
409 .ivsize = CAMELLIA_BLOCK_SIZE,
410 .setkey = xts_camellia_setkey,
411 .encrypt = xts_encrypt,
412 .decrypt = xts_decrypt,
413 },
414 },
415}, {
416 .cra_name = "ecb(camellia)",
417 .cra_driver_name = "ecb-camellia-aesni",
418 .cra_priority = 400,
419 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
420 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
421 .cra_ctxsize = sizeof(struct async_helper_ctx),
422 .cra_alignmask = 0,
423 .cra_type = &crypto_ablkcipher_type,
424 .cra_module = THIS_MODULE,
425 .cra_init = ablk_init,
426 .cra_exit = ablk_exit,
427 .cra_u = {
428 .ablkcipher = {
429 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
430 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
431 .setkey = ablk_set_key,
432 .encrypt = ablk_encrypt,
433 .decrypt = ablk_decrypt,
434 },
435 },
436}, {
437 .cra_name = "cbc(camellia)",
438 .cra_driver_name = "cbc-camellia-aesni",
439 .cra_priority = 400,
440 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
441 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
442 .cra_ctxsize = sizeof(struct async_helper_ctx),
443 .cra_alignmask = 0,
444 .cra_type = &crypto_ablkcipher_type,
445 .cra_module = THIS_MODULE,
446 .cra_init = ablk_init,
447 .cra_exit = ablk_exit,
448 .cra_u = {
449 .ablkcipher = {
450 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
451 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
452 .ivsize = CAMELLIA_BLOCK_SIZE,
453 .setkey = ablk_set_key,
454 .encrypt = __ablk_encrypt,
455 .decrypt = ablk_decrypt,
456 },
457 },
458}, {
459 .cra_name = "ctr(camellia)",
460 .cra_driver_name = "ctr-camellia-aesni",
461 .cra_priority = 400,
462 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
463 .cra_blocksize = 1,
464 .cra_ctxsize = sizeof(struct async_helper_ctx),
465 .cra_alignmask = 0,
466 .cra_type = &crypto_ablkcipher_type,
467 .cra_module = THIS_MODULE,
468 .cra_init = ablk_init,
469 .cra_exit = ablk_exit,
470 .cra_u = {
471 .ablkcipher = {
472 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
473 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
474 .ivsize = CAMELLIA_BLOCK_SIZE,
475 .setkey = ablk_set_key,
476 .encrypt = ablk_encrypt,
477 .decrypt = ablk_encrypt,
478 .geniv = "chainiv",
479 },
480 },
481}, {
482 .cra_name = "lrw(camellia)",
483 .cra_driver_name = "lrw-camellia-aesni",
484 .cra_priority = 400,
485 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
486 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
487 .cra_ctxsize = sizeof(struct async_helper_ctx),
488 .cra_alignmask = 0,
489 .cra_type = &crypto_ablkcipher_type,
490 .cra_module = THIS_MODULE,
491 .cra_init = ablk_init,
492 .cra_exit = ablk_exit,
493 .cra_u = {
494 .ablkcipher = {
495 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
496 CAMELLIA_BLOCK_SIZE,
497 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
498 CAMELLIA_BLOCK_SIZE,
499 .ivsize = CAMELLIA_BLOCK_SIZE,
500 .setkey = ablk_set_key,
501 .encrypt = ablk_encrypt,
502 .decrypt = ablk_decrypt,
503 },
504 },
505}, {
506 .cra_name = "xts(camellia)",
507 .cra_driver_name = "xts-camellia-aesni",
508 .cra_priority = 400,
509 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
510 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
511 .cra_ctxsize = sizeof(struct async_helper_ctx),
512 .cra_alignmask = 0,
513 .cra_type = &crypto_ablkcipher_type,
514 .cra_module = THIS_MODULE,
515 .cra_init = ablk_init,
516 .cra_exit = ablk_exit,
517 .cra_u = {
518 .ablkcipher = {
519 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
520 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
521 .ivsize = CAMELLIA_BLOCK_SIZE,
522 .setkey = ablk_set_key,
523 .encrypt = ablk_encrypt,
524 .decrypt = ablk_decrypt,
525 },
526 },
527} };
528
529static int __init camellia_aesni_init(void)
530{
531 u64 xcr0;
532
533 if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
534 pr_info("AVX or AES-NI instructions are not detected.\n");
535 return -ENODEV;
536 }
537
538 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
539 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
540 pr_info("AVX detected but unusable.\n");
541 return -ENODEV;
542 }
543
544 return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
545}
546
547static void __exit camellia_aesni_fini(void)
548{
549 crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
550}
551
552module_init(camellia_aesni_init);
553module_exit(camellia_aesni_fini);
554
555MODULE_LICENSE("GPL");
556MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
557MODULE_ALIAS("camellia");
558MODULE_ALIAS("camellia-asm");
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 42ffd2bbab5b..5cb86ccd4acb 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -32,53 +32,24 @@
 #include <crypto/algapi.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
+#include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
 
-#define CAMELLIA_MIN_KEY_SIZE	16
-#define CAMELLIA_MAX_KEY_SIZE	32
-#define CAMELLIA_BLOCK_SIZE	16
-#define CAMELLIA_TABLE_BYTE_LEN	272
-
-struct camellia_ctx {
-	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
-	u32 key_length;
-};
-
 /* regular block cipher functions */
 asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
 				   const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk);
 asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
 				 const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk);
 
 /* 2-way parallel cipher functions */
 asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
 asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 				      const u8 *src);
-
-static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
-				    const u8 *src)
-{
-	__camellia_enc_blk(ctx, dst, src, false);
-}
-
-static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__camellia_enc_blk(ctx, dst, src, true);
-}
-
-static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__camellia_enc_blk_2way(ctx, dst, src, false);
-}
-
-static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
-					      const u8 *src)
-{
-	__camellia_enc_blk_2way(ctx, dst, src, true);
-}
+EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
 
 static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -1275,9 +1246,8 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey)
1275 camellia_setup256(kk, subkey); 1246 camellia_setup256(kk, subkey);
1276} 1247}
1277 1248
1278static int __camellia_setkey(struct camellia_ctx *cctx, 1249int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
1279 const unsigned char *key, 1250 unsigned int key_len, u32 *flags)
1280 unsigned int key_len, u32 *flags)
1281{ 1251{
1282 if (key_len != 16 && key_len != 24 && key_len != 32) { 1252 if (key_len != 16 && key_len != 24 && key_len != 32) {
1283 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; 1253 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
@@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx,
1300 1270
1301 return 0; 1271 return 0;
1302} 1272}
1273EXPORT_SYMBOL_GPL(__camellia_setkey);
1303 1274
1304static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, 1275static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
1305 unsigned int key_len) 1276 unsigned int key_len)
@@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
1308 &tfm->crt_flags); 1279 &tfm->crt_flags);
1309} 1280}
1310 1281
1311static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) 1282void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
1312{ 1283{
1313 u128 iv = *src; 1284 u128 iv = *src;
1314 1285
@@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
1316 1287
1317 u128_xor(&dst[1], &dst[1], &iv); 1288 u128_xor(&dst[1], &dst[1], &iv);
1318} 1289}
1290EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
1319 1291
1320static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) 1292void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
1321{ 1293{
1322 be128 ctrblk; 1294 be128 ctrblk;
1323 1295
1324 if (dst != src) 1296 if (dst != src)
1325 *dst = *src; 1297 *dst = *src;
1326 1298
1327 u128_to_be128(&ctrblk, iv); 1299 le128_to_be128(&ctrblk, iv);
1328 u128_inc(iv); 1300 le128_inc(iv);
1329 1301
1330 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); 1302 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
1331} 1303}
1304EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
1332 1305
1333static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, 1306void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
1334 u128 *iv)
1335{ 1307{
1336 be128 ctrblks[2]; 1308 be128 ctrblks[2];
1337 1309
@@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
1340 dst[1] = src[1]; 1312 dst[1] = src[1];
1341 } 1313 }
1342 1314
1343 u128_to_be128(&ctrblks[0], iv); 1315 le128_to_be128(&ctrblks[0], iv);
1344 u128_inc(iv); 1316 le128_inc(iv);
1345 u128_to_be128(&ctrblks[1], iv); 1317 le128_to_be128(&ctrblks[1], iv);
1346 u128_inc(iv); 1318 le128_inc(iv);
1347 1319
1348 camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); 1320 camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
1349} 1321}
1322EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
1350 1323
1351static const struct common_glue_ctx camellia_enc = { 1324static const struct common_glue_ctx camellia_enc = {
1352 .num_funcs = 2, 1325 .num_funcs = 2,
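(The hunk above moves the Camellia CTR helpers from the old u128 counter type to le128, matching the glue_helper changes elsewhere in this pull. As a rough standalone sketch of the counter semantics those helpers rely on, increment a 128-bit little-endian counter and then byte-swap it into the big-endian block fed to the cipher; the names and layout below are illustrative only, not the kernel's le128/le128_inc definitions:)

#include <stdint.h>

/* Illustrative 128-bit counter held as two 64-bit halves; "lo" carries
 * into "hi" on overflow, mirroring what le128_inc() is expected to do. */
struct ctr128 { uint64_t lo, hi; };

static void ctr128_inc(struct ctr128 *c)
{
	if (++c->lo == 0)		/* carry out of the low qword */
		c->hi++;
}

/* Byte-swap into the big-endian counter block handed to the cipher,
 * the role le128_to_be128() plays in the glue code above. */
static void ctr128_to_be_block(uint8_t out[16], const struct ctr128 *c)
{
	for (int i = 0; i < 8; i++) {
		out[i]     = (uint8_t)(c->hi >> (56 - 8 * i));
		out[8 + i] = (uint8_t)(c->lo >> (56 - 8 * i));
	}
}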
@@ -1464,13 +1437,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1464 camellia_dec_blk(ctx, srcdst, srcdst); 1437 camellia_dec_blk(ctx, srcdst, srcdst);
1465} 1438}
1466 1439
1467struct camellia_lrw_ctx { 1440int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1468 struct lrw_table_ctx lrw_table; 1441 unsigned int keylen)
1469 struct camellia_ctx camellia_ctx;
1470};
1471
1472static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1473 unsigned int keylen)
1474{ 1442{
1475 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 1443 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
1476 int err; 1444 int err;
@@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1484 return lrw_init_table(&ctx->lrw_table, 1452 return lrw_init_table(&ctx->lrw_table,
1485 key + keylen - CAMELLIA_BLOCK_SIZE); 1453 key + keylen - CAMELLIA_BLOCK_SIZE);
1486} 1454}
1455EXPORT_SYMBOL_GPL(lrw_camellia_setkey);
1487 1456
1488static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1457static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1489 struct scatterlist *src, unsigned int nbytes) 1458 struct scatterlist *src, unsigned int nbytes)
@@ -1519,20 +1488,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1519 return lrw_crypt(desc, dst, src, nbytes, &req); 1488 return lrw_crypt(desc, dst, src, nbytes, &req);
1520} 1489}
1521 1490
1522static void lrw_exit_tfm(struct crypto_tfm *tfm) 1491void lrw_camellia_exit_tfm(struct crypto_tfm *tfm)
1523{ 1492{
1524 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 1493 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
1525 1494
1526 lrw_free_table(&ctx->lrw_table); 1495 lrw_free_table(&ctx->lrw_table);
1527} 1496}
1497EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm);
1528 1498
1529struct camellia_xts_ctx { 1499int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1530 struct camellia_ctx tweak_ctx; 1500 unsigned int keylen)
1531 struct camellia_ctx crypt_ctx;
1532};
1533
1534static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1535 unsigned int keylen)
1536{ 1501{
1537 struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm); 1502 struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
1538 u32 *flags = &tfm->crt_flags; 1503 u32 *flags = &tfm->crt_flags;
@@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1555 return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, 1520 return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
1556 flags); 1521 flags);
1557} 1522}
1523EXPORT_SYMBOL_GPL(xts_camellia_setkey);
1558 1524
1559static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1525static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1560 struct scatterlist *src, unsigned int nbytes) 1526 struct scatterlist *src, unsigned int nbytes)
@@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { {
1679 .cra_alignmask = 0, 1645 .cra_alignmask = 0,
1680 .cra_type = &crypto_blkcipher_type, 1646 .cra_type = &crypto_blkcipher_type,
1681 .cra_module = THIS_MODULE, 1647 .cra_module = THIS_MODULE,
1682 .cra_exit = lrw_exit_tfm, 1648 .cra_exit = lrw_camellia_exit_tfm,
1683 .cra_u = { 1649 .cra_u = {
1684 .blkcipher = { 1650 .blkcipher = {
1685 .min_keysize = CAMELLIA_MIN_KEY_SIZE + 1651 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index a41a3aaba220..15b00ac7cbd3 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -25,10 +25,10 @@
25 25
26.file "cast5-avx-x86_64-asm_64.S" 26.file "cast5-avx-x86_64-asm_64.S"
27 27
28.extern cast5_s1 28.extern cast_s1
29.extern cast5_s2 29.extern cast_s2
30.extern cast5_s3 30.extern cast_s3
31.extern cast5_s4 31.extern cast_s4
32 32
33/* structure of crypto context */ 33/* structure of crypto context */
34#define km 0 34#define km 0
@@ -36,10 +36,10 @@
36#define rr ((16*4)+16) 36#define rr ((16*4)+16)
37 37
38/* s-boxes */ 38/* s-boxes */
39#define s1 cast5_s1 39#define s1 cast_s1
40#define s2 cast5_s2 40#define s2 cast_s2
41#define s3 cast5_s3 41#define s3 cast_s3
42#define s4 cast5_s4 42#define s4 cast_s4
43 43
44/********************************************************************** 44/**********************************************************************
45 16-way AVX cast5 45 16-way AVX cast5
@@ -180,31 +180,17 @@
180 vpunpcklqdq t1, t0, x0; \ 180 vpunpcklqdq t1, t0, x0; \
181 vpunpckhqdq t1, t0, x1; 181 vpunpckhqdq t1, t0, x1;
182 182
183#define inpack_blocks(in, x0, x1, t0, t1, rmask) \ 183#define inpack_blocks(x0, x1, t0, t1, rmask) \
184 vmovdqu (0*4*4)(in), x0; \
185 vmovdqu (1*4*4)(in), x1; \
186 vpshufb rmask, x0, x0; \ 184 vpshufb rmask, x0, x0; \
187 vpshufb rmask, x1, x1; \ 185 vpshufb rmask, x1, x1; \
188 \ 186 \
189 transpose_2x4(x0, x1, t0, t1) 187 transpose_2x4(x0, x1, t0, t1)
190 188
191#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \ 189#define outunpack_blocks(x0, x1, t0, t1, rmask) \
192 transpose_2x4(x0, x1, t0, t1) \ 190 transpose_2x4(x0, x1, t0, t1) \
193 \ 191 \
194 vpshufb rmask, x0, x0; \ 192 vpshufb rmask, x0, x0; \
195 vpshufb rmask, x1, x1; \ 193 vpshufb rmask, x1, x1;
196 vmovdqu x0, (0*4*4)(out); \
197 vmovdqu x1, (1*4*4)(out);
198
199#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
200 transpose_2x4(x0, x1, t0, t1) \
201 \
202 vpshufb rmask, x0, x0; \
203 vpshufb rmask, x1, x1; \
204 vpxor (0*4*4)(out), x0, x0; \
205 vmovdqu x0, (0*4*4)(out); \
206 vpxor (1*4*4)(out), x1, x1; \
207 vmovdqu x1, (1*4*4)(out);
208 194
209.data 195.data
210 196
@@ -213,6 +199,8 @@
213 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 199 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
214.Lbswap128_mask: 200.Lbswap128_mask:
215 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 201 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
202.Lbswap_iv_mask:
203 .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
216.L16_mask: 204.L16_mask:
217 .byte 16, 16, 16, 16 205 .byte 16, 16, 16, 16
218.L32_mask: 206.L32_mask:
@@ -223,35 +211,42 @@
223.text 211.text
224 212
225.align 16 213.align 16
226.global __cast5_enc_blk_16way 214.type __cast5_enc_blk16,@function;
227.type __cast5_enc_blk_16way,@function;
228 215
229__cast5_enc_blk_16way: 216__cast5_enc_blk16:
230 /* input: 217 /* input:
231 * %rdi: ctx, CTX 218 * %rdi: ctx, CTX
232 * %rsi: dst 219 * RL1: blocks 1 and 2
233 * %rdx: src 220 * RR1: blocks 3 and 4
234 * %rcx: bool, if true: xor output 221 * RL2: blocks 5 and 6
222 * RR2: blocks 7 and 8
223 * RL3: blocks 9 and 10
224 * RR3: blocks 11 and 12
225 * RL4: blocks 13 and 14
226 * RR4: blocks 15 and 16
227 * output:
228 * RL1: encrypted blocks 1 and 2
229 * RR1: encrypted blocks 3 and 4
230 * RL2: encrypted blocks 5 and 6
231 * RR2: encrypted blocks 7 and 8
232 * RL3: encrypted blocks 9 and 10
233 * RR3: encrypted blocks 11 and 12
234 * RL4: encrypted blocks 13 and 14
235 * RR4: encrypted blocks 15 and 16
235 */ 236 */
236 237
237 pushq %rbp; 238 pushq %rbp;
238 pushq %rbx; 239 pushq %rbx;
239 pushq %rcx;
240 240
241 vmovdqa .Lbswap_mask, RKM; 241 vmovdqa .Lbswap_mask, RKM;
242 vmovd .Lfirst_mask, R1ST; 242 vmovd .Lfirst_mask, R1ST;
243 vmovd .L32_mask, R32; 243 vmovd .L32_mask, R32;
244 enc_preload_rkr(); 244 enc_preload_rkr();
245 245
246 leaq 1*(2*4*4)(%rdx), %rax; 246 inpack_blocks(RL1, RR1, RTMP, RX, RKM);
247 inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); 247 inpack_blocks(RL2, RR2, RTMP, RX, RKM);
248 inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); 248 inpack_blocks(RL3, RR3, RTMP, RX, RKM);
249 leaq 2*(2*4*4)(%rdx), %rax; 249 inpack_blocks(RL4, RR4, RTMP, RX, RKM);
250 inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
251 leaq 3*(2*4*4)(%rdx), %rax;
252 inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
253
254 movq %rsi, %r11;
255 250
256 round(RL, RR, 0, 1); 251 round(RL, RR, 0, 1);
257 round(RR, RL, 1, 2); 252 round(RR, RL, 1, 2);
@@ -276,44 +271,41 @@ __cast5_enc_blk_16way:
276 round(RR, RL, 15, 1); 271 round(RR, RL, 15, 1);
277 272
278__skip_enc: 273__skip_enc:
279 popq %rcx;
280 popq %rbx; 274 popq %rbx;
281 popq %rbp; 275 popq %rbp;
282 276
283 vmovdqa .Lbswap_mask, RKM; 277 vmovdqa .Lbswap_mask, RKM;
284 leaq 1*(2*4*4)(%r11), %rax;
285 278
286 testb %cl, %cl; 279 outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
287 jnz __enc_xor16; 280 outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
288 281 outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
289 outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); 282 outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
290 outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
291 leaq 2*(2*4*4)(%r11), %rax;
292 outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
293 leaq 3*(2*4*4)(%r11), %rax;
294 outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
295
296 ret;
297
298__enc_xor16:
299 outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
300 outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
301 leaq 2*(2*4*4)(%r11), %rax;
302 outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
303 leaq 3*(2*4*4)(%r11), %rax;
304 outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
305 283
306 ret; 284 ret;
307 285
308.align 16 286.align 16
309.global cast5_dec_blk_16way 287.type __cast5_dec_blk16,@function;
310.type cast5_dec_blk_16way,@function;
311 288
312cast5_dec_blk_16way: 289__cast5_dec_blk16:
313 /* input: 290 /* input:
314 * %rdi: ctx, CTX 291 * %rdi: ctx, CTX
315 * %rsi: dst 292 * RL1: encrypted blocks 1 and 2
316 * %rdx: src 293 * RR1: encrypted blocks 3 and 4
294 * RL2: encrypted blocks 5 and 6
295 * RR2: encrypted blocks 7 and 8
296 * RL3: encrypted blocks 9 and 10
297 * RR3: encrypted blocks 11 and 12
298 * RL4: encrypted blocks 13 and 14
299 * RR4: encrypted blocks 15 and 16
300 * output:
301 * RL1: decrypted blocks 1 and 2
302 * RR1: decrypted blocks 3 and 4
303 * RL2: decrypted blocks 5 and 6
304 * RR2: decrypted blocks 7 and 8
305 * RL3: decrypted blocks 9 and 10
306 * RR3: decrypted blocks 11 and 12
307 * RL4: decrypted blocks 13 and 14
308 * RR4: decrypted blocks 15 and 16
317 */ 309 */
318 310
319 pushq %rbp; 311 pushq %rbp;
@@ -324,15 +316,10 @@ cast5_dec_blk_16way:
324 vmovd .L32_mask, R32; 316 vmovd .L32_mask, R32;
325 dec_preload_rkr(); 317 dec_preload_rkr();
326 318
327 leaq 1*(2*4*4)(%rdx), %rax; 319 inpack_blocks(RL1, RR1, RTMP, RX, RKM);
328 inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); 320 inpack_blocks(RL2, RR2, RTMP, RX, RKM);
329 inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); 321 inpack_blocks(RL3, RR3, RTMP, RX, RKM);
330 leaq 2*(2*4*4)(%rdx), %rax; 322 inpack_blocks(RL4, RR4, RTMP, RX, RKM);
331 inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
332 leaq 3*(2*4*4)(%rdx), %rax;
333 inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
334
335 movq %rsi, %r11;
336 323
337 movzbl rr(CTX), %eax; 324 movzbl rr(CTX), %eax;
338 testl %eax, %eax; 325 testl %eax, %eax;
@@ -361,16 +348,211 @@ __dec_tail:
361 popq %rbx; 348 popq %rbx;
362 popq %rbp; 349 popq %rbp;
363 350
364 leaq 1*(2*4*4)(%r11), %rax; 351 outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
365 outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); 352 outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
366 outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); 353 outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
367 leaq 2*(2*4*4)(%r11), %rax; 354 outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
368 outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
369 leaq 3*(2*4*4)(%r11), %rax;
370 outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
371 355
372 ret; 356 ret;
373 357
374__skip_dec: 358__skip_dec:
375 vpsrldq $4, RKR, RKR; 359 vpsrldq $4, RKR, RKR;
376 jmp __dec_tail; 360 jmp __dec_tail;
361
362.align 16
363.global cast5_ecb_enc_16way
364.type cast5_ecb_enc_16way,@function;
365
366cast5_ecb_enc_16way:
367 /* input:
368 * %rdi: ctx, CTX
369 * %rsi: dst
370 * %rdx: src
371 */
372
373 movq %rsi, %r11;
374
375 vmovdqu (0*4*4)(%rdx), RL1;
376 vmovdqu (1*4*4)(%rdx), RR1;
377 vmovdqu (2*4*4)(%rdx), RL2;
378 vmovdqu (3*4*4)(%rdx), RR2;
379 vmovdqu (4*4*4)(%rdx), RL3;
380 vmovdqu (5*4*4)(%rdx), RR3;
381 vmovdqu (6*4*4)(%rdx), RL4;
382 vmovdqu (7*4*4)(%rdx), RR4;
383
384 call __cast5_enc_blk16;
385
386 vmovdqu RR1, (0*4*4)(%r11);
387 vmovdqu RL1, (1*4*4)(%r11);
388 vmovdqu RR2, (2*4*4)(%r11);
389 vmovdqu RL2, (3*4*4)(%r11);
390 vmovdqu RR3, (4*4*4)(%r11);
391 vmovdqu RL3, (5*4*4)(%r11);
392 vmovdqu RR4, (6*4*4)(%r11);
393 vmovdqu RL4, (7*4*4)(%r11);
394
395 ret;
396
397.align 16
398.global cast5_ecb_dec_16way
399.type cast5_ecb_dec_16way,@function;
400
401cast5_ecb_dec_16way:
402 /* input:
403 * %rdi: ctx, CTX
404 * %rsi: dst
405 * %rdx: src
406 */
407
408 movq %rsi, %r11;
409
410 vmovdqu (0*4*4)(%rdx), RL1;
411 vmovdqu (1*4*4)(%rdx), RR1;
412 vmovdqu (2*4*4)(%rdx), RL2;
413 vmovdqu (3*4*4)(%rdx), RR2;
414 vmovdqu (4*4*4)(%rdx), RL3;
415 vmovdqu (5*4*4)(%rdx), RR3;
416 vmovdqu (6*4*4)(%rdx), RL4;
417 vmovdqu (7*4*4)(%rdx), RR4;
418
419 call __cast5_dec_blk16;
420
421 vmovdqu RR1, (0*4*4)(%r11);
422 vmovdqu RL1, (1*4*4)(%r11);
423 vmovdqu RR2, (2*4*4)(%r11);
424 vmovdqu RL2, (3*4*4)(%r11);
425 vmovdqu RR3, (4*4*4)(%r11);
426 vmovdqu RL3, (5*4*4)(%r11);
427 vmovdqu RR4, (6*4*4)(%r11);
428 vmovdqu RL4, (7*4*4)(%r11);
429
430 ret;
431
432.align 16
433.global cast5_cbc_dec_16way
434.type cast5_cbc_dec_16way,@function;
435
436cast5_cbc_dec_16way:
437 /* input:
438 * %rdi: ctx, CTX
439 * %rsi: dst
440 * %rdx: src
441 */
442
443 pushq %r12;
444
445 movq %rsi, %r11;
446 movq %rdx, %r12;
447
448 vmovdqu (0*16)(%rdx), RL1;
449 vmovdqu (1*16)(%rdx), RR1;
450 vmovdqu (2*16)(%rdx), RL2;
451 vmovdqu (3*16)(%rdx), RR2;
452 vmovdqu (4*16)(%rdx), RL3;
453 vmovdqu (5*16)(%rdx), RR3;
454 vmovdqu (6*16)(%rdx), RL4;
455 vmovdqu (7*16)(%rdx), RR4;
456
457 call __cast5_dec_blk16;
458
459 /* xor with src */
460 vmovq (%r12), RX;
461 vpshufd $0x4f, RX, RX;
462 vpxor RX, RR1, RR1;
463 vpxor 0*16+8(%r12), RL1, RL1;
464 vpxor 1*16+8(%r12), RR2, RR2;
465 vpxor 2*16+8(%r12), RL2, RL2;
466 vpxor 3*16+8(%r12), RR3, RR3;
467 vpxor 4*16+8(%r12), RL3, RL3;
468 vpxor 5*16+8(%r12), RR4, RR4;
469 vpxor 6*16+8(%r12), RL4, RL4;
470
471 vmovdqu RR1, (0*16)(%r11);
472 vmovdqu RL1, (1*16)(%r11);
473 vmovdqu RR2, (2*16)(%r11);
474 vmovdqu RL2, (3*16)(%r11);
475 vmovdqu RR3, (4*16)(%r11);
476 vmovdqu RL3, (5*16)(%r11);
477 vmovdqu RR4, (6*16)(%r11);
478 vmovdqu RL4, (7*16)(%r11);
479
480 popq %r12;
481
482 ret;
483
484.align 16
485.global cast5_ctr_16way
486.type cast5_ctr_16way,@function;
487
488cast5_ctr_16way:
489 /* input:
490 * %rdi: ctx, CTX
491 * %rsi: dst
492 * %rdx: src
493 * %rcx: iv (big endian, 64bit)
494 */
495
496 pushq %r12;
497
498 movq %rsi, %r11;
499 movq %rdx, %r12;
500
501 vpcmpeqd RTMP, RTMP, RTMP;
502 vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
503
504 vpcmpeqd RKR, RKR, RKR;
505 vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
506 vmovdqa .Lbswap_iv_mask, R1ST;
507 vmovdqa .Lbswap128_mask, RKM;
508
509 /* load IV and byteswap */
510 vmovq (%rcx), RX;
511 vpshufb R1ST, RX, RX;
512
513 /* construct IVs */
514 vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
515 vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
516 vpsubq RKR, RX, RX;
517 vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
518 vpsubq RKR, RX, RX;
519 vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
520 vpsubq RKR, RX, RX;
521 vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
522 vpsubq RKR, RX, RX;
523 vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
524 vpsubq RKR, RX, RX;
525 vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
526 vpsubq RKR, RX, RX;
527 vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
528 vpsubq RKR, RX, RX;
529 vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
530
531 /* store last IV */
532 vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
533 vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
534 vmovq RX, (%rcx);
535
536 call __cast5_enc_blk16;
537
538 /* dst = src ^ iv */
539 vpxor (0*16)(%r12), RR1, RR1;
540 vpxor (1*16)(%r12), RL1, RL1;
541 vpxor (2*16)(%r12), RR2, RR2;
542 vpxor (3*16)(%r12), RL2, RL2;
543 vpxor (4*16)(%r12), RR3, RR3;
544 vpxor (5*16)(%r12), RL3, RL3;
545 vpxor (6*16)(%r12), RR4, RR4;
546 vpxor (7*16)(%r12), RL4, RL4;
547 vmovdqu RR1, (0*16)(%r11);
548 vmovdqu RL1, (1*16)(%r11);
549 vmovdqu RR2, (2*16)(%r11);
550 vmovdqu RL2, (3*16)(%r11);
551 vmovdqu RR3, (4*16)(%r11);
552 vmovdqu RL3, (5*16)(%r11);
553 vmovdqu RR4, (6*16)(%r11);
554 vmovdqu RL4, (7*16)(%r11);
555
556 popq %r12;
557
558 ret;
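(cast5_ctr_16way above expands the 64-bit big-endian counter at %rcx into 16 consecutive counter blocks, encrypts them, xors the result into the source data, and writes the advanced counter back. A scalar C sketch of just the counter handling, with illustrative helper names rather than the kernel API:)

#include <stdint.h>

static uint64_t load_be64(const uint8_t *p)
{
	uint64_t v = 0;
	for (int i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return v;
}

static void store_be64(uint8_t *p, uint64_t v)
{
	for (int i = 7; i >= 0; i--) {
		p[i] = (uint8_t)v;
		v >>= 8;
	}
}

/* Scalar model of the counter expansion done by cast5_ctr_16way: emit
 * 16 consecutive 64-bit big-endian counter blocks and store the
 * incremented counter back through the IV pointer. */
static void cast5_ctr_blocks_sketch(uint8_t blocks[16][8], uint8_t iv[8])
{
	uint64_t ctr = load_be64(iv);

	for (int i = 0; i < 16; i++)
		store_be64(blocks[i], ctr + i);

	store_be64(iv, ctr + 16);	/* the asm writes this back via %rcx */
}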
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e0ea14f9547f..c6631813dc11 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -37,29 +37,14 @@
37 37
38#define CAST5_PARALLEL_BLOCKS 16 38#define CAST5_PARALLEL_BLOCKS 16
39 39
40asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, 40asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
41 const u8 *src, bool xor);
42asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
43 const u8 *src); 41 const u8 *src);
44 42asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
45static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst, 43 const u8 *src);
46 const u8 *src) 44asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
47{ 45 const u8 *src);
48 __cast5_enc_blk_16way(ctx, dst, src, false); 46asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
49} 47 __be64 *iv);
50
51static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
52 const u8 *src)
53{
54 __cast5_enc_blk_16way(ctx, dst, src, true);
55}
56
57static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
58 const u8 *src)
59{
60 cast5_dec_blk_16way(ctx, dst, src);
61}
62
63 48
64static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes) 49static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
65{ 50{
@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
79 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 64 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
80 const unsigned int bsize = CAST5_BLOCK_SIZE; 65 const unsigned int bsize = CAST5_BLOCK_SIZE;
81 unsigned int nbytes; 66 unsigned int nbytes;
67 void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
82 int err; 68 int err;
83 69
70 fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
71
84 err = blkcipher_walk_virt(desc, walk); 72 err = blkcipher_walk_virt(desc, walk);
85 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 73 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
86 74
@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
93 /* Process multi-block batch */ 81 /* Process multi-block batch */
94 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { 82 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
95 do { 83 do {
96 if (enc) 84 fn(ctx, wdst, wsrc);
97 cast5_enc_blk_xway(ctx, wdst, wsrc);
98 else
99 cast5_dec_blk_xway(ctx, wdst, wsrc);
100 85
101 wsrc += bsize * CAST5_PARALLEL_BLOCKS; 86 wsrc += bsize * CAST5_PARALLEL_BLOCKS;
102 wdst += bsize * CAST5_PARALLEL_BLOCKS; 87 wdst += bsize * CAST5_PARALLEL_BLOCKS;
@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
107 goto done; 92 goto done;
108 } 93 }
109 94
95 fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
96
110 /* Handle leftovers */ 97 /* Handle leftovers */
111 do { 98 do {
112 if (enc) 99 fn(ctx, wdst, wsrc);
113 __cast5_encrypt(ctx, wdst, wsrc);
114 else
115 __cast5_decrypt(ctx, wdst, wsrc);
116 100
117 wsrc += bsize; 101 wsrc += bsize;
118 wdst += bsize; 102 wdst += bsize;
@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
194 unsigned int nbytes = walk->nbytes; 178 unsigned int nbytes = walk->nbytes;
195 u64 *src = (u64 *)walk->src.virt.addr; 179 u64 *src = (u64 *)walk->src.virt.addr;
196 u64 *dst = (u64 *)walk->dst.virt.addr; 180 u64 *dst = (u64 *)walk->dst.virt.addr;
197 u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
198 u64 last_iv; 181 u64 last_iv;
199 int i;
200 182
201 /* Start of the last block. */ 183 /* Start of the last block. */
202 src += nbytes / bsize - 1; 184 src += nbytes / bsize - 1;
@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
211 src -= CAST5_PARALLEL_BLOCKS - 1; 193 src -= CAST5_PARALLEL_BLOCKS - 1;
212 dst -= CAST5_PARALLEL_BLOCKS - 1; 194 dst -= CAST5_PARALLEL_BLOCKS - 1;
213 195
214 for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) 196 cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
215 ivs[i] = src[i];
216
217 cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
218
219 for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
220 *(dst + (i + 1)) ^= *(ivs + i);
221 197
222 nbytes -= bsize; 198 nbytes -= bsize;
223 if (nbytes < bsize) 199 if (nbytes < bsize)
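(The hunk above drops the temporary ivs[] buffer because cast5_cbc_dec_16way now performs the chained xor with the previous ciphertext block itself. CBC decryption parallelizes at all because each plaintext block depends only on two ciphertext blocks. A minimal standalone sketch of that structure, assuming non-overlapping dst/src and a generic 8-byte block decrypt callback instead of the kernel API:)

#include <stdint.h>
#include <stddef.h>

/* Generic single-block (8-byte) decrypt callback standing in for cast5;
 * an assumption for illustration only. */
typedef void (*blk_dec_fn)(void *ctx, uint8_t *dst, const uint8_t *src);

/* CBC decryption of n blocks: every D(C[i]) is independent, so the block
 * decrypts can run in parallel (16 at a time in the AVX code); only the
 * final xor needs the previous ciphertext block, or the IV for block 0.
 * Assumes dst and src do not overlap. */
static void cbc_decrypt_sketch(void *ctx, blk_dec_fn dec, uint8_t *dst,
			       const uint8_t *src, size_t n,
			       const uint8_t iv[8])
{
	for (size_t i = 0; i < n; i++)		/* parallelizable part */
		dec(ctx, dst + 8 * i, src + 8 * i);

	for (size_t i = n; i-- > 1;)		/* chained xor part */
		for (int j = 0; j < 8; j++)
			dst[8 * i + j] ^= src[8 * (i - 1) + j];

	for (int j = 0; j < 8; j++)
		dst[j] ^= iv[j];
}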
@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
298 unsigned int nbytes = walk->nbytes; 274 unsigned int nbytes = walk->nbytes;
299 u64 *src = (u64 *)walk->src.virt.addr; 275 u64 *src = (u64 *)walk->src.virt.addr;
300 u64 *dst = (u64 *)walk->dst.virt.addr; 276 u64 *dst = (u64 *)walk->dst.virt.addr;
301 u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
302 __be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
303 int i;
304 277
305 /* Process multi-block batch */ 278 /* Process multi-block batch */
306 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { 279 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
307 do { 280 do {
308 /* create ctrblks for parallel encrypt */ 281 cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
309 for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) { 282 (__be64 *)walk->iv);
310 if (dst != src)
311 dst[i] = src[i];
312
313 ctrblocks[i] = cpu_to_be64(ctrblk++);
314 }
315
316 cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
317 (u8 *)ctrblocks);
318 283
319 src += CAST5_PARALLEL_BLOCKS; 284 src += CAST5_PARALLEL_BLOCKS;
320 dst += CAST5_PARALLEL_BLOCKS; 285 dst += CAST5_PARALLEL_BLOCKS;
@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
327 292
328 /* Handle leftovers */ 293 /* Handle leftovers */
329 do { 294 do {
295 u64 ctrblk;
296
330 if (dst != src) 297 if (dst != src)
331 *dst = *src; 298 *dst = *src;
332 299
333 ctrblocks[0] = cpu_to_be64(ctrblk++); 300 ctrblk = *(u64 *)walk->iv;
301 be64_add_cpu((__be64 *)walk->iv, 1);
334 302
335 __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); 303 __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
336 *dst ^= ctrblocks[0]; 304 *dst ^= ctrblk;
337 305
338 src += 1; 306 src += 1;
339 dst += 1; 307 dst += 1;
@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
341 } while (nbytes >= bsize); 309 } while (nbytes >= bsize);
342 310
343done: 311done:
344 *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
345 return nbytes; 312 return nbytes;
346} 313}
347 314
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 218d283772f4..2569d0da841f 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,22 +23,24 @@
23 * 23 *
24 */ 24 */
25 25
26#include "glue_helper-asm-avx.S"
27
26.file "cast6-avx-x86_64-asm_64.S" 28.file "cast6-avx-x86_64-asm_64.S"
27 29
28.extern cast6_s1 30.extern cast_s1
29.extern cast6_s2 31.extern cast_s2
30.extern cast6_s3 32.extern cast_s3
31.extern cast6_s4 33.extern cast_s4
32 34
33/* structure of crypto context */ 35/* structure of crypto context */
34#define km 0 36#define km 0
35#define kr (12*4*4) 37#define kr (12*4*4)
36 38
37/* s-boxes */ 39/* s-boxes */
38#define s1 cast6_s1 40#define s1 cast_s1
39#define s2 cast6_s2 41#define s2 cast_s2
40#define s3 cast6_s3 42#define s3 cast_s3
41#define s4 cast6_s4 43#define s4 cast_s4
42 44
43/********************************************************************** 45/**********************************************************************
44 8-way AVX cast6 46 8-way AVX cast6
@@ -205,11 +207,7 @@
205 vpunpcklqdq x3, t2, x2; \ 207 vpunpcklqdq x3, t2, x2; \
206 vpunpckhqdq x3, t2, x3; 208 vpunpckhqdq x3, t2, x3;
207 209
208#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \ 210#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
209 vmovdqu (0*4*4)(in), x0; \
210 vmovdqu (1*4*4)(in), x1; \
211 vmovdqu (2*4*4)(in), x2; \
212 vmovdqu (3*4*4)(in), x3; \
213 vpshufb rmask, x0, x0; \ 211 vpshufb rmask, x0, x0; \
214 vpshufb rmask, x1, x1; \ 212 vpshufb rmask, x1, x1; \
215 vpshufb rmask, x2, x2; \ 213 vpshufb rmask, x2, x2; \
@@ -217,39 +215,21 @@
217 \ 215 \
218 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 216 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
219 217
220#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ 218#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
221 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 219 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
222 \ 220 \
223 vpshufb rmask, x0, x0; \ 221 vpshufb rmask, x0, x0; \
224 vpshufb rmask, x1, x1; \ 222 vpshufb rmask, x1, x1; \
225 vpshufb rmask, x2, x2; \ 223 vpshufb rmask, x2, x2; \
226 vpshufb rmask, x3, x3; \ 224 vpshufb rmask, x3, x3;
227 vmovdqu x0, (0*4*4)(out); \
228 vmovdqu x1, (1*4*4)(out); \
229 vmovdqu x2, (2*4*4)(out); \
230 vmovdqu x3, (3*4*4)(out);
231
232#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
233 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
234 \
235 vpshufb rmask, x0, x0; \
236 vpshufb rmask, x1, x1; \
237 vpshufb rmask, x2, x2; \
238 vpshufb rmask, x3, x3; \
239 vpxor (0*4*4)(out), x0, x0; \
240 vmovdqu x0, (0*4*4)(out); \
241 vpxor (1*4*4)(out), x1, x1; \
242 vmovdqu x1, (1*4*4)(out); \
243 vpxor (2*4*4)(out), x2, x2; \
244 vmovdqu x2, (2*4*4)(out); \
245 vpxor (3*4*4)(out), x3, x3; \
246 vmovdqu x3, (3*4*4)(out);
247 225
248.data 226.data
249 227
250.align 16 228.align 16
251.Lbswap_mask: 229.Lbswap_mask:
252 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 230 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
231.Lbswap128_mask:
232 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
253.Lrkr_enc_Q_Q_QBAR_QBAR: 233.Lrkr_enc_Q_Q_QBAR_QBAR:
254 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12 234 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
255.Lrkr_enc_QBAR_QBAR_QBAR_QBAR: 235.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@
269 249
270.text 250.text
271 251
272.align 16 252.align 8
273.global __cast6_enc_blk_8way 253.type __cast6_enc_blk8,@function;
274.type __cast6_enc_blk_8way,@function;
275 254
276__cast6_enc_blk_8way: 255__cast6_enc_blk8:
277 /* input: 256 /* input:
278 * %rdi: ctx, CTX 257 * %rdi: ctx, CTX
279 * %rsi: dst 258 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
280 * %rdx: src 259 * output:
281 * %rcx: bool, if true: xor output 260 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
282 */ 261 */
283 262
284 pushq %rbp; 263 pushq %rbp;
285 pushq %rbx; 264 pushq %rbx;
286 pushq %rcx;
287 265
288 vmovdqa .Lbswap_mask, RKM; 266 vmovdqa .Lbswap_mask, RKM;
289 vmovd .Lfirst_mask, R1ST; 267 vmovd .Lfirst_mask, R1ST;
290 vmovd .L32_mask, R32; 268 vmovd .L32_mask, R32;
291 269
292 leaq (4*4*4)(%rdx), %rax; 270 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
293 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); 271 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
294 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
295
296 movq %rsi, %r11;
297 272
298 preload_rkr(0, dummy, none); 273 preload_rkr(0, dummy, none);
299 Q(0); 274 Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
311 QBAR(10); 286 QBAR(10);
312 QBAR(11); 287 QBAR(11);
313 288
314 popq %rcx;
315 popq %rbx; 289 popq %rbx;
316 popq %rbp; 290 popq %rbp;
317 291
318 vmovdqa .Lbswap_mask, RKM; 292 vmovdqa .Lbswap_mask, RKM;
319 leaq (4*4*4)(%r11), %rax;
320
321 testb %cl, %cl;
322 jnz __enc_xor8;
323
324 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
325 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
326
327 ret;
328 293
329__enc_xor8: 294 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
330 outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); 295 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
331 outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
332 296
333 ret; 297 ret;
334 298
335.align 16 299.align 8
336.global cast6_dec_blk_8way 300.type __cast6_dec_blk8,@function;
337.type cast6_dec_blk_8way,@function;
338 301
339cast6_dec_blk_8way: 302__cast6_dec_blk8:
340 /* input: 303 /* input:
341 * %rdi: ctx, CTX 304 * %rdi: ctx, CTX
342 * %rsi: dst 305 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
343 * %rdx: src 306 * output:
307 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
344 */ 308 */
345 309
346 pushq %rbp; 310 pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
350 vmovd .Lfirst_mask, R1ST; 314 vmovd .Lfirst_mask, R1ST;
351 vmovd .L32_mask, R32; 315 vmovd .L32_mask, R32;
352 316
353 leaq (4*4*4)(%rdx), %rax; 317 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
354 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); 318 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
355 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
356
357 movq %rsi, %r11;
358 319
359 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q); 320 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
360 Q(11); 321 Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
376 popq %rbp; 337 popq %rbp;
377 338
378 vmovdqa .Lbswap_mask, RKM; 339 vmovdqa .Lbswap_mask, RKM;
379 leaq (4*4*4)(%r11), %rax; 340 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
380 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); 341 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
381 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); 342
343 ret;
344
345.align 8
346.global cast6_ecb_enc_8way
347.type cast6_ecb_enc_8way,@function;
348
349cast6_ecb_enc_8way:
350 /* input:
351 * %rdi: ctx, CTX
352 * %rsi: dst
353 * %rdx: src
354 */
355
356 movq %rsi, %r11;
357
358 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
359
360 call __cast6_enc_blk8;
361
362 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
363
364 ret;
365
366.align 8
367.global cast6_ecb_dec_8way
368.type cast6_ecb_dec_8way,@function;
369
370cast6_ecb_dec_8way:
371 /* input:
372 * %rdi: ctx, CTX
373 * %rsi: dst
374 * %rdx: src
375 */
376
377 movq %rsi, %r11;
378
379 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
380
381 call __cast6_dec_blk8;
382
383 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
384
385 ret;
386
387.align 8
388.global cast6_cbc_dec_8way
389.type cast6_cbc_dec_8way,@function;
390
391cast6_cbc_dec_8way:
392 /* input:
393 * %rdi: ctx, CTX
394 * %rsi: dst
395 * %rdx: src
396 */
397
398 pushq %r12;
399
400 movq %rsi, %r11;
401 movq %rdx, %r12;
402
403 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
404
405 call __cast6_dec_blk8;
406
407 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
408
409 popq %r12;
410
411 ret;
412
413.align 8
414.global cast6_ctr_8way
415.type cast6_ctr_8way,@function;
416
417cast6_ctr_8way:
418 /* input:
419 * %rdi: ctx, CTX
420 * %rsi: dst
421 * %rdx: src
422 * %rcx: iv (little endian, 128bit)
423 */
424
425 pushq %r12;
426
427 movq %rsi, %r11;
428 movq %rdx, %r12;
429
430 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
431 RD2, RX, RKR, RKM);
432
433 call __cast6_enc_blk8;
434
435 store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
436
437 popq %r12;
382 438
383 ret; 439 ret;
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 15e5f85a5011..92f7ca24790a 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -40,79 +40,34 @@
40 40
41#define CAST6_PARALLEL_BLOCKS 8 41#define CAST6_PARALLEL_BLOCKS 8
42 42
43asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst, 43asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
44 const u8 *src, bool xor); 44 const u8 *src);
45asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst, 45asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
46 const u8 *src); 46 const u8 *src);
47 47
48static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst, 48asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
49 const u8 *src) 49 const u8 *src);
50{ 50asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
51 __cast6_enc_blk_8way(ctx, dst, src, false); 51 le128 *iv);
52}
53
54static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
55 const u8 *src)
56{
57 __cast6_enc_blk_8way(ctx, dst, src, true);
58}
59
60static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
61 const u8 *src)
62{
63 cast6_dec_blk_8way(ctx, dst, src);
64}
65
66
67static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
68{
69 u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
70 unsigned int j;
71
72 for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
73 ivs[j] = src[j];
74
75 cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
76
77 for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
78 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
79}
80 52
81static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) 53static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
82{ 54{
83 be128 ctrblk; 55 be128 ctrblk;
84 56
85 u128_to_be128(&ctrblk, iv); 57 le128_to_be128(&ctrblk, iv);
86 u128_inc(iv); 58 le128_inc(iv);
87 59
88 __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 60 __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
89 u128_xor(dst, src, (u128 *)&ctrblk); 61 u128_xor(dst, src, (u128 *)&ctrblk);
90} 62}
91 63
92static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
93 u128 *iv)
94{
95 be128 ctrblks[CAST6_PARALLEL_BLOCKS];
96 unsigned int i;
97
98 for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
99 if (dst != src)
100 dst[i] = src[i];
101
102 u128_to_be128(&ctrblks[i], iv);
103 u128_inc(iv);
104 }
105
106 cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
107}
108
109static const struct common_glue_ctx cast6_enc = { 64static const struct common_glue_ctx cast6_enc = {
110 .num_funcs = 2, 65 .num_funcs = 2,
111 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, 66 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
112 67
113 .funcs = { { 68 .funcs = { {
114 .num_blocks = CAST6_PARALLEL_BLOCKS, 69 .num_blocks = CAST6_PARALLEL_BLOCKS,
115 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) } 70 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
116 }, { 71 }, {
117 .num_blocks = 1, 72 .num_blocks = 1,
118 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } 73 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {
125 80
126 .funcs = { { 81 .funcs = { {
127 .num_blocks = CAST6_PARALLEL_BLOCKS, 82 .num_blocks = CAST6_PARALLEL_BLOCKS,
128 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) } 83 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
129 }, { 84 }, {
130 .num_blocks = 1, 85 .num_blocks = 1,
131 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } 86 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {
138 93
139 .funcs = { { 94 .funcs = { {
140 .num_blocks = CAST6_PARALLEL_BLOCKS, 95 .num_blocks = CAST6_PARALLEL_BLOCKS,
141 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) } 96 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
142 }, { 97 }, {
143 .num_blocks = 1, 98 .num_blocks = 1,
144 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } 99 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {
151 106
152 .funcs = { { 107 .funcs = { {
153 .num_blocks = CAST6_PARALLEL_BLOCKS, 108 .num_blocks = CAST6_PARALLEL_BLOCKS,
154 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) } 109 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
155 }, { 110 }, {
156 .num_blocks = 1, 111 .num_blocks = 1,
157 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } 112 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
215 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); 170 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
216 171
217 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { 172 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
218 cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst); 173 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
219 return; 174 return;
220 } 175 }
221 176
@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
232 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); 187 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
233 188
234 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { 189 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
235 cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst); 190 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
236 return; 191 return;
237 } 192 }
238 193
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel_glue.c
index 493f959261f7..6812ad98355c 100644
--- a/arch/x86/crypto/crc32c-intel.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -32,6 +32,8 @@
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
34#include <asm/cpu_device_id.h> 34#include <asm/cpu_device_id.h>
35#include <asm/i387.h>
36#include <asm/fpu-internal.h>
35 37
36#define CHKSUM_BLOCK_SIZE 1 38#define CHKSUM_BLOCK_SIZE 1
37#define CHKSUM_DIGEST_SIZE 4 39#define CHKSUM_DIGEST_SIZE 4
@@ -44,6 +46,31 @@
44#define REX_PRE 46#define REX_PRE
45#endif 47#endif
46 48
49#ifdef CONFIG_X86_64
50/*
51 * use carryless multiply version of crc32c when buffer
52 * size is >= 512 (when eager fpu is enabled) or
53 * >= 1024 (when eager fpu is disabled) to account
54 * for fpu state save/restore overhead.
55 */
56#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512
57#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024
58
59asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
60 unsigned int crc_init);
61static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
62#if defined(X86_FEATURE_EAGER_FPU)
63#define set_pcl_breakeven_point() \
64do { \
65 if (!use_eager_fpu()) \
66 crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \
67} while (0)
68#else
69#define set_pcl_breakeven_point() \
70 (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
71#endif
72#endif /* CONFIG_X86_64 */
73
47static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) 74static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
48{ 75{
49 while (length--) { 76 while (length--) {
@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
154 return 0; 181 return 0;
155} 182}
156 183
184#ifdef CONFIG_X86_64
185static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
186 unsigned int len)
187{
188 u32 *crcp = shash_desc_ctx(desc);
189
190 /*
191 * use faster PCL version if datasize is large enough to
192 * overcome kernel fpu state save/restore overhead
193 */
194 if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
195 kernel_fpu_begin();
196 *crcp = crc_pcl(data, len, *crcp);
197 kernel_fpu_end();
198 } else
199 *crcp = crc32c_intel_le_hw(*crcp, data, len);
200 return 0;
201}
202
203static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
204 u8 *out)
205{
206 if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
207 kernel_fpu_begin();
208 *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
209 kernel_fpu_end();
210 } else
211 *(__le32 *)out =
212 ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
213 return 0;
214}
215
216static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
217 unsigned int len, u8 *out)
218{
219 return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
220}
221
222static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
223 unsigned int len, u8 *out)
224{
225 return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
226 out);
227}
228#endif /* CONFIG_X86_64 */
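(For comparison with the finup/digest convention above, where the running CRC conventionally starts at ~0 and the digest is its bitwise inverse stored little endian, here is a minimal bit-at-a-time software CRC32C; it is a reference sketch for checking results, not part of this patch or its fast path:)

#include <stdint.h>
#include <stddef.h>

/* Bit-at-a-time CRC32C (Castagnoli, reflected polynomial 0x82F63B78). */
static uint32_t crc32c_sw(uint32_t crc, const uint8_t *data, size_t len)
{
	while (len--) {
		crc ^= *data++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* Same convention as the finup helpers above: start from ~0, invert at
 * the end.  crc32c_sw(~0u, (const uint8_t *)"123456789", 9) ^ ~0u gives
 * 0xe3069283, the commonly cited CRC32C check value. */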
229
157static struct shash_alg alg = { 230static struct shash_alg alg = {
158 .setkey = crc32c_intel_setkey, 231 .setkey = crc32c_intel_setkey,
159 .init = crc32c_intel_init, 232 .init = crc32c_intel_init,
@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void)
184{ 257{
185 if (!x86_match_cpu(crc32c_cpu_id)) 258 if (!x86_match_cpu(crc32c_cpu_id))
186 return -ENODEV; 259 return -ENODEV;
260#ifdef CONFIG_X86_64
261 if (cpu_has_pclmulqdq) {
262 alg.update = crc32c_pcl_intel_update;
263 alg.finup = crc32c_pcl_intel_finup;
264 alg.digest = crc32c_pcl_intel_digest;
265 set_pcl_breakeven_point();
266 }
267#endif
187 return crypto_register_shash(&alg); 268 return crypto_register_shash(&alg);
188} 269}
189 270
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
new file mode 100644
index 000000000000..93c6d39237ac
--- /dev/null
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -0,0 +1,460 @@
1/*
2 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
3 *
4 * The white paper on CRC32C calculations with PCLMULQDQ instruction can be
5 * downloaded from:
6 * http://download.intel.com/design/intarch/papers/323405.pdf
7 *
8 * Copyright (C) 2012 Intel Corporation.
9 *
10 * Authors:
11 * Wajdi Feghali <wajdi.k.feghali@intel.com>
12 * James Guilford <james.guilford@intel.com>
13 * David Cote <david.m.cote@intel.com>
14 * Tim Chen <tim.c.chen@linux.intel.com>
15 *
16 * This software is available to you under a choice of one of two
17 * licenses. You may choose to be licensed under the terms of the GNU
18 * General Public License (GPL) Version 2, available from the file
19 * COPYING in the main directory of this source tree, or the
20 * OpenIB.org BSD license below:
21 *
22 * Redistribution and use in source and binary forms, with or
23 * without modification, are permitted provided that the following
24 * conditions are met:
25 *
26 * - Redistributions of source code must retain the above
27 * copyright notice, this list of conditions and the following
28 * disclaimer.
29 *
30 * - Redistributions in binary form must reproduce the above
31 * copyright notice, this list of conditions and the following
32 * disclaimer in the documentation and/or other materials
33 * provided with the distribution.
34 *
35 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
39 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
40 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
41 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 * SOFTWARE.
43 */
44
45## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
46
47.macro LABEL prefix n
48\prefix\n\():
49.endm
50
51.macro JMPTBL_ENTRY i
52.word crc_\i - crc_array
53.endm
54
55.macro JNC_LESS_THAN j
56 jnc less_than_\j
57.endm
58
59# Define threshold where buffers are considered "small" and routed to more
60# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
61# SMALL_SIZE can be no larger than 255.
62
63#define SMALL_SIZE 200
64
65.if (SMALL_SIZE > 255)
66.error "SMALL_SIZE must be < 256"
67.endif
68
69# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
70
71.global crc_pcl
72crc_pcl:
73#define bufp %rdi
74#define bufp_dw %edi
75#define bufp_w %di
76#define bufp_b %dil
77#define bufptmp %rcx
78#define block_0 %rcx
79#define block_1 %rdx
80#define block_2 %r11
81#define len %rsi
82#define len_dw %esi
83#define len_w %si
84#define len_b %sil
85#define crc_init_arg %rdx
86#define tmp %rbx
87#define crc_init %r8
88#define crc_init_dw %r8d
89#define crc1 %r9
90#define crc2 %r10
91
92 pushq %rbx
93 pushq %rdi
94 pushq %rsi
95
 96	## Move crc_init for Linux to a different register
97 mov crc_init_arg, crc_init
98
99 ################################################################
100 ## 1) ALIGN:
101 ################################################################
102
103	mov	bufp, bufptmp		# bufptmp = buf
104 neg bufp
105 and $7, bufp # calculate the unalignment amount of
106 # the address
107 je proc_block # Skip if aligned
108
109 ## If len is less than 8 and we're unaligned, we need to jump
110 ## to special code to avoid reading beyond the end of the buffer
111 cmp $8, len
112 jae do_align
113 # less_than_8 expects length in upper 3 bits of len_dw
114 # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
115 shl $32-3+1, len_dw
116 jmp less_than_8_post_shl1
117
118do_align:
119 #### Calculate CRC of unaligned bytes of the buffer (if any)
120	movq	(bufptmp), tmp		# load a quadword from the buffer
121 add bufp, bufptmp # align buffer pointer for quadword
122 # processing
123 sub bufp, len # update buffer length
124align_loop:
125 crc32b %bl, crc_init_dw # compute crc32 of 1-byte
126 shr $8, tmp # get next byte
127 dec bufp
128 jne align_loop
129
130proc_block:
131
132 ################################################################
133 ## 2) PROCESS BLOCKS:
134 ################################################################
135
136 ## compute num of bytes to be processed
137 movq len, tmp # save num bytes in tmp
138
139 cmpq $128*24, len
140 jae full_block
141
142continue_block:
143 cmpq $SMALL_SIZE, len
144 jb small
145
146 ## len < 128*24
147 movq $2731, %rax # 2731 = ceil(2^16 / 24)
148 mul len_dw
149 shrq $16, %rax
150
151 ## eax contains floor(bytes / 24) = num 24-byte chunks to do
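	## (Why this is exact: 2731 * 24 = 65544 = 2^16 + 8, so
	##  len * 2731 / 2^16 = len/24 + len/196608.  Since len < 128*24 here,
	##  that extra term stays below 1/64 and can never carry the value past
	##  the next integer; the 32-bit product also fits in %eax, so the
	##  16-bit shift yields exactly floor(len / 24).)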
152
153 ## process rax 24-byte chunks (128 >= rax >= 0)
154
155 ## compute end address of each block
156 ## block 0 (base addr + RAX * 8)
157 ## block 1 (base addr + RAX * 16)
158 ## block 2 (base addr + RAX * 24)
159 lea (bufptmp, %rax, 8), block_0
160 lea (block_0, %rax, 8), block_1
161 lea (block_1, %rax, 8), block_2
162
163 xor crc1, crc1
164 xor crc2, crc2
165
166 ## branch into array
167 lea jump_table(%rip), bufp
168 movzxw (bufp, %rax, 2), len
169 offset=crc_array-jump_table
170 lea offset(bufp, len, 1), bufp
171 jmp *bufp
172
173 ################################################################
174 ## 2a) PROCESS FULL BLOCKS:
175 ################################################################
176full_block:
177 movq $128,%rax
178 lea 128*8*2(block_0), block_1
179 lea 128*8*3(block_0), block_2
180 add $128*8*1, block_0
181
182 xor crc1,crc1
183 xor crc2,crc2
184
185	# Fall through into top of crc array (crc_128)
186
187 ################################################################
188 ## 3) CRC Array:
189 ################################################################
190
191crc_array:
192 i=128
193.rept 128-1
194.altmacro
195LABEL crc_ %i
196.noaltmacro
197 crc32q -i*8(block_0), crc_init
198 crc32q -i*8(block_1), crc1
199 crc32q -i*8(block_2), crc2
200 i=(i-1)
201.endr
202
203.altmacro
204LABEL crc_ %i
205.noaltmacro
206 crc32q -i*8(block_0), crc_init
207 crc32q -i*8(block_1), crc1
208# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
209
210 mov block_2, block_0
211
212 ################################################################
213 ## 4) Combine three results:
214 ################################################################
215
216 lea (K_table-16)(%rip), bufp # first entry is for idx 1
217 shlq $3, %rax # rax *= 8
218 subq %rax, tmp # tmp -= rax*8
219 shlq $1, %rax
220 subq %rax, tmp # tmp -= rax*16
221 # (total tmp -= rax*24)
222 addq %rax, bufp
223
224 movdqa (bufp), %xmm0 # 2 consts: K1:K2
225
226 movq crc_init, %xmm1 # CRC for block 1
227 pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2
228
229 movq crc1, %xmm2 # CRC for block 2
230 pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
231
232 pxor %xmm2,%xmm1
233 movq %xmm1, %rax
234 xor -i*8(block_2), %rax
235 mov crc2, crc_init
236 crc32 %rax, crc_init
237
238################################################################
239## 5) Check for end:
240################################################################
241
242LABEL crc_ 0
243 mov tmp, len
244 cmp $128*24, tmp
245 jae full_block
246 cmp $24, tmp
247 jae continue_block
248
249less_than_24:
250 shl $32-4, len_dw # less_than_16 expects length
251 # in upper 4 bits of len_dw
252 jnc less_than_16
253 crc32q (bufptmp), crc_init
254 crc32q 8(bufptmp), crc_init
255 jz do_return
256 add $16, bufptmp
257 # len is less than 8 if we got here
258 # less_than_8 expects length in upper 3 bits of len_dw
259 # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
260 shl $2, len_dw
261 jmp less_than_8_post_shl1
262
263 #######################################################################
264 ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
265 #######################################################################
266small:
267 shl $32-8, len_dw # Prepare len_dw for less_than_256
268 j=256
269.rept 5 # j = {256, 128, 64, 32, 16}
270.altmacro
271LABEL less_than_ %j # less_than_j: Length should be in
272 # upper lg(j) bits of len_dw
273 j=(j/2)
274 shl $1, len_dw # Get next MSB
275 JNC_LESS_THAN %j
276.noaltmacro
277 i=0
278.rept (j/8)
279 crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
280 i=i+8
281.endr
282 jz do_return # Return if remaining length is zero
283 add $j, bufptmp # Advance buf
284.endr
285
286less_than_8: # Length should be stored in
287 # upper 3 bits of len_dw
288 shl $1, len_dw
289less_than_8_post_shl1:
290 jnc less_than_4
291 crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
292 jz do_return # return if remaining data is zero
293 add $4, bufptmp
294less_than_4: # Length should be stored in
295 # upper 2 bits of len_dw
296 shl $1, len_dw
297 jnc less_than_2
298 crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
299 jz do_return # return if remaining data is zero
300 add $2, bufptmp
301less_than_2: # Length should be stored in the MSB
302 # of len_dw
303 shl $1, len_dw
304 jnc less_than_1
305 crc32b (bufptmp), crc_init_dw # CRC of 1 byte
306less_than_1: # Length should be zero
307do_return:
308 movq crc_init, %rax
309 popq %rsi
310 popq %rdi
311 popq %rbx
312 ret
313
314 ################################################################
315	## jump table: 129 entries x 2 bytes each
316 ################################################################
317.align 4
318jump_table:
319 i=0
320.rept 129
321.altmacro
322JMPTBL_ENTRY %i
323.noaltmacro
324 i=i+1
325.endr
326 ################################################################
327 ## PCLMULQDQ tables
328 ## Table is 128 entries x 2 quad words each
329 ################################################################
330.data
331.align 64
332K_table:
333 .quad 0x14cd00bd6,0x105ec76f0
334 .quad 0x0ba4fc28e,0x14cd00bd6
335 .quad 0x1d82c63da,0x0f20c0dfe
336 .quad 0x09e4addf8,0x0ba4fc28e
337 .quad 0x039d3b296,0x1384aa63a
338 .quad 0x102f9b8a2,0x1d82c63da
339 .quad 0x14237f5e6,0x01c291d04
340 .quad 0x00d3b6092,0x09e4addf8
341 .quad 0x0c96cfdc0,0x0740eef02
342 .quad 0x18266e456,0x039d3b296
343 .quad 0x0daece73e,0x0083a6eec
344 .quad 0x0ab7aff2a,0x102f9b8a2
345 .quad 0x1248ea574,0x1c1733996
346 .quad 0x083348832,0x14237f5e6
347 .quad 0x12c743124,0x02ad91c30
348 .quad 0x0b9e02b86,0x00d3b6092
349 .quad 0x018b33a4e,0x06992cea2
350 .quad 0x1b331e26a,0x0c96cfdc0
351 .quad 0x17d35ba46,0x07e908048
352 .quad 0x1bf2e8b8a,0x18266e456
353 .quad 0x1a3e0968a,0x11ed1f9d8
354 .quad 0x0ce7f39f4,0x0daece73e
355 .quad 0x061d82e56,0x0f1d0f55e
356 .quad 0x0d270f1a2,0x0ab7aff2a
357 .quad 0x1c3f5f66c,0x0a87ab8a8
358 .quad 0x12ed0daac,0x1248ea574
359 .quad 0x065863b64,0x08462d800
360 .quad 0x11eef4f8e,0x083348832
361 .quad 0x1ee54f54c,0x071d111a8
362 .quad 0x0b3e32c28,0x12c743124
363 .quad 0x0064f7f26,0x0ffd852c6
364 .quad 0x0dd7e3b0c,0x0b9e02b86
365 .quad 0x0f285651c,0x0dcb17aa4
366 .quad 0x010746f3c,0x018b33a4e
367 .quad 0x1c24afea4,0x0f37c5aee
368 .quad 0x0271d9844,0x1b331e26a
369 .quad 0x08e766a0c,0x06051d5a2
370 .quad 0x093a5f730,0x17d35ba46
371 .quad 0x06cb08e5c,0x11d5ca20e
372 .quad 0x06b749fb2,0x1bf2e8b8a
373 .quad 0x1167f94f2,0x021f3d99c
374 .quad 0x0cec3662e,0x1a3e0968a
375 .quad 0x19329634a,0x08f158014
376 .quad 0x0e6fc4e6a,0x0ce7f39f4
377 .quad 0x08227bb8a,0x1a5e82106
378 .quad 0x0b0cd4768,0x061d82e56
379 .quad 0x13c2b89c4,0x188815ab2
380 .quad 0x0d7a4825c,0x0d270f1a2
381 .quad 0x10f5ff2ba,0x105405f3e
382 .quad 0x00167d312,0x1c3f5f66c
383 .quad 0x0f6076544,0x0e9adf796
384 .quad 0x026f6a60a,0x12ed0daac
385 .quad 0x1a2adb74e,0x096638b34
386 .quad 0x19d34af3a,0x065863b64
387 .quad 0x049c3cc9c,0x1e50585a0
388 .quad 0x068bce87a,0x11eef4f8e
389 .quad 0x1524fa6c6,0x19f1c69dc
390 .quad 0x16cba8aca,0x1ee54f54c
391 .quad 0x042d98888,0x12913343e
392 .quad 0x1329d9f7e,0x0b3e32c28
393 .quad 0x1b1c69528,0x088f25a3a
394 .quad 0x02178513a,0x0064f7f26
395 .quad 0x0e0ac139e,0x04e36f0b0
396 .quad 0x0170076fa,0x0dd7e3b0c
397 .quad 0x141a1a2e2,0x0bd6f81f8
398 .quad 0x16ad828b4,0x0f285651c
399 .quad 0x041d17b64,0x19425cbba
400 .quad 0x1fae1cc66,0x010746f3c
401 .quad 0x1a75b4b00,0x18db37e8a
402 .quad 0x0f872e54c,0x1c24afea4
403 .quad 0x01e41e9fc,0x04c144932
404 .quad 0x086d8e4d2,0x0271d9844
405 .quad 0x160f7af7a,0x052148f02
406 .quad 0x05bb8f1bc,0x08e766a0c
407 .quad 0x0a90fd27a,0x0a3c6f37a
408 .quad 0x0b3af077a,0x093a5f730
409 .quad 0x04984d782,0x1d22c238e
410 .quad 0x0ca6ef3ac,0x06cb08e5c
411 .quad 0x0234e0b26,0x063ded06a
412 .quad 0x1d88abd4a,0x06b749fb2
413 .quad 0x04597456a,0x04d56973c
414 .quad 0x0e9e28eb4,0x1167f94f2
415 .quad 0x07b3ff57a,0x19385bf2e
416 .quad 0x0c9c8b782,0x0cec3662e
417 .quad 0x13a9cba9e,0x0e417f38a
418 .quad 0x093e106a4,0x19329634a
419 .quad 0x167001a9c,0x14e727980
420 .quad 0x1ddffc5d4,0x0e6fc4e6a
421 .quad 0x00df04680,0x0d104b8fc
422 .quad 0x02342001e,0x08227bb8a
423 .quad 0x00a2a8d7e,0x05b397730
424 .quad 0x168763fa6,0x0b0cd4768
425 .quad 0x1ed5a407a,0x0e78eb416
426 .quad 0x0d2c3ed1a,0x13c2b89c4
427 .quad 0x0995a5724,0x1641378f0
428 .quad 0x19b1afbc4,0x0d7a4825c
429 .quad 0x109ffedc0,0x08d96551c
430 .quad 0x0f2271e60,0x10f5ff2ba
431 .quad 0x00b0bf8ca,0x00bf80dd2
432 .quad 0x123888b7a,0x00167d312
433 .quad 0x1e888f7dc,0x18dcddd1c
434 .quad 0x002ee03b2,0x0f6076544
435 .quad 0x183e8d8fe,0x06a45d2b2
436 .quad 0x133d7a042,0x026f6a60a
437 .quad 0x116b0f50c,0x1dd3e10e8
438 .quad 0x05fabe670,0x1a2adb74e
439 .quad 0x130004488,0x0de87806c
440 .quad 0x000bcf5f6,0x19d34af3a
441 .quad 0x18f0c7078,0x014338754
442 .quad 0x017f27698,0x049c3cc9c
443 .quad 0x058ca5f00,0x15e3e77ee
444 .quad 0x1af900c24,0x068bce87a
445 .quad 0x0b5cfca28,0x0dd07448e
446 .quad 0x0ded288f8,0x1524fa6c6
447 .quad 0x059f229bc,0x1d8048348
448 .quad 0x06d390dec,0x16cba8aca
449 .quad 0x037170390,0x0a3e3e02c
450 .quad 0x06353c1cc,0x042d98888
451 .quad 0x0c4584f5c,0x0d73c7bea
452 .quad 0x1f16a3418,0x1329d9f7e
453 .quad 0x0531377e2,0x185137662
454 .quad 0x1d8d9ca7c,0x1b1c69528
455 .quad 0x0b25b29f2,0x18a08b5bc
456 .quad 0x19fb2a8b0,0x02178513a
457 .quad 0x1a08fe6ac,0x1da758ae0
458 .quad 0x045cddf4e,0x0e0ac139e
459 .quad 0x1a91647f2,0x169cf9eb0
460 .quad 0x1a0f717c4,0x0170076fa
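
The remainder path above (less_than_8 / less_than_4 / less_than_2) finishes the buffer with the hardware crc32l/crc32w/crc32b instructions, picking the width from the low bits of the length. A minimal user-space sketch of that same tail using the SSE4.2 intrinsics; the function name and the memcpy-based loads are illustrative only, not taken from the kernel tree:

#include <nmmintrin.h>	/* SSE4.2 _mm_crc32_* intrinsics */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_tail(uint32_t crc, const unsigned char *buf, size_t len)
{
	/* Consume the leftover bytes in the same 4/2/1 order as the
	 * less_than_* labels above. */
	if (len & 4) {
		uint32_t v;
		memcpy(&v, buf, 4);
		crc = _mm_crc32_u32(crc, v);
		buf += 4;
	}
	if (len & 2) {
		uint16_t v;
		memcpy(&v, buf, 2);
		crc = _mm_crc32_u16(crc, v);
		buf += 2;
	}
	if (len & 1)
		crc = _mm_crc32_u8(crc, *buf);
	return crc;
}
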
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
new file mode 100644
index 000000000000..f7b6ea2ddfdb
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -0,0 +1,91 @@
1/*
2 * Shared glue code for 128bit block ciphers, AVX assembler macros
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
19 vmovdqu (0*16)(src), x0; \
20 vmovdqu (1*16)(src), x1; \
21 vmovdqu (2*16)(src), x2; \
22 vmovdqu (3*16)(src), x3; \
23 vmovdqu (4*16)(src), x4; \
24 vmovdqu (5*16)(src), x5; \
25 vmovdqu (6*16)(src), x6; \
26 vmovdqu (7*16)(src), x7;
27
28#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
29 vmovdqu x0, (0*16)(dst); \
30 vmovdqu x1, (1*16)(dst); \
31 vmovdqu x2, (2*16)(dst); \
32 vmovdqu x3, (3*16)(dst); \
33 vmovdqu x4, (4*16)(dst); \
34 vmovdqu x5, (5*16)(dst); \
35 vmovdqu x6, (6*16)(dst); \
36 vmovdqu x7, (7*16)(dst);
37
38#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
39 vpxor (0*16)(src), x1, x1; \
40 vpxor (1*16)(src), x2, x2; \
41 vpxor (2*16)(src), x3, x3; \
42 vpxor (3*16)(src), x4, x4; \
43 vpxor (4*16)(src), x5, x5; \
44 vpxor (5*16)(src), x6, x6; \
45 vpxor (6*16)(src), x7, x7; \
46 store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
47
48#define inc_le128(x, minus_one, tmp) \
49 vpcmpeqq minus_one, x, tmp; \
50 vpsubq minus_one, x, x; \
51 vpslldq $8, tmp, tmp; \
52 vpsubq tmp, x, x;
53
54#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
55 vpcmpeqd t0, t0, t0; \
56 vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
57 vmovdqa bswap, t1; \
58 \
59 /* load IV and byteswap */ \
60 vmovdqu (iv), x7; \
61 vpshufb t1, x7, x0; \
62 \
63 /* construct IVs */ \
64 inc_le128(x7, t0, t2); \
65 vpshufb t1, x7, x1; \
66 inc_le128(x7, t0, t2); \
67 vpshufb t1, x7, x2; \
68 inc_le128(x7, t0, t2); \
69 vpshufb t1, x7, x3; \
70 inc_le128(x7, t0, t2); \
71 vpshufb t1, x7, x4; \
72 inc_le128(x7, t0, t2); \
73 vpshufb t1, x7, x5; \
74 inc_le128(x7, t0, t2); \
75 vpshufb t1, x7, x6; \
76 inc_le128(x7, t0, t2); \
77 vmovdqa x7, t2; \
78 vpshufb t1, x7, x7; \
79 inc_le128(t2, t0, t1); \
80 vmovdqu t2, (iv);
81
82#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
83 vpxor (0*16)(src), x0, x0; \
84 vpxor (1*16)(src), x1, x1; \
85 vpxor (2*16)(src), x2, x2; \
86 vpxor (3*16)(src), x3, x3; \
87 vpxor (4*16)(src), x4, x4; \
88 vpxor (5*16)(src), x5, x5; \
89 vpxor (6*16)(src), x6, x6; \
90 vpxor (7*16)(src), x7, x7; \
91 store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
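
The inc_le128 macro above increments a 128-bit little-endian counter held in an XMM register: vpsubq against the {low: -1, high: 0} constant adds one to the low qword, and the vpcmpeqq/vpslldq pair moves the "low lane was all ones" test into the high lane so the carry can be applied there. A scalar sketch of the same operation (the struct and function names are illustrative, not from the kernel tree):

#include <stdint.h>

struct le128_ctr {
	uint64_t lo;	/* least significant 64 bits */
	uint64_t hi;	/* most significant 64 bits */
};

static void inc_le128_scalar(struct le128_ctr *c)
{
	/* Carry into the high half only when the low half wraps, which is
	 * exactly what the vpcmpeqq/vpslldq/vpsubq sequence computes. */
	if (++c->lo == 0)
		c->hi++;
}
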
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 30b3927bd733..22ce4f683e55 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
221 u8 *src = (u8 *)walk->src.virt.addr; 221 u8 *src = (u8 *)walk->src.virt.addr;
222 u8 *dst = (u8 *)walk->dst.virt.addr; 222 u8 *dst = (u8 *)walk->dst.virt.addr;
223 unsigned int nbytes = walk->nbytes; 223 unsigned int nbytes = walk->nbytes;
224 u128 ctrblk; 224 le128 ctrblk;
225 u128 tmp; 225 u128 tmp;
226 226
227 be128_to_u128(&ctrblk, (be128 *)walk->iv); 227 be128_to_le128(&ctrblk, (be128 *)walk->iv);
228 228
229 memcpy(&tmp, src, nbytes); 229 memcpy(&tmp, src, nbytes);
230 fn_ctr(ctx, &tmp, &tmp, &ctrblk); 230 fn_ctr(ctx, &tmp, &tmp, &ctrblk);
231 memcpy(dst, &tmp, nbytes); 231 memcpy(dst, &tmp, nbytes);
232 232
233 u128_to_be128((be128 *)walk->iv, &ctrblk); 233 le128_to_be128((be128 *)walk->iv, &ctrblk);
234} 234}
235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); 235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
236 236
@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
243 unsigned int nbytes = walk->nbytes; 243 unsigned int nbytes = walk->nbytes;
244 u128 *src = (u128 *)walk->src.virt.addr; 244 u128 *src = (u128 *)walk->src.virt.addr;
245 u128 *dst = (u128 *)walk->dst.virt.addr; 245 u128 *dst = (u128 *)walk->dst.virt.addr;
246 u128 ctrblk; 246 le128 ctrblk;
247 unsigned int num_blocks, func_bytes; 247 unsigned int num_blocks, func_bytes;
248 unsigned int i; 248 unsigned int i;
249 249
250 be128_to_u128(&ctrblk, (be128 *)walk->iv); 250 be128_to_le128(&ctrblk, (be128 *)walk->iv);
251 251
252 /* Process multi-block batch */ 252 /* Process multi-block batch */
253 for (i = 0; i < gctx->num_funcs; i++) { 253 for (i = 0; i < gctx->num_funcs; i++) {
@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
269 } 269 }
270 270
271done: 271done:
272 u128_to_be128((be128 *)walk->iv, &ctrblk); 272 le128_to_be128((be128 *)walk->iv, &ctrblk);
273 return nbytes; 273 return nbytes;
274} 274}
275 275
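
glue_ctr_crypt_final_128bit above handles a trailing partial block by widening it into a full 16-byte buffer, running the single-block CTR function on that buffer in place, and copying only the valid bytes back out. A stand-alone sketch of the same pattern; the 3-argument ctr_fn_t shape and all names here are assumptions for illustration (the kernel's callback also carries the IV separately):

#include <stdint.h>
#include <string.h>

/* Assumed to encrypt the current counter block, XOR the keystream into
 * the 16-byte buffer, and advance the counter. */
typedef void (*ctr_fn_t)(void *ctx, uint8_t buf[16], uint8_t ctr[16]);

static void ctr_final_partial_block(void *ctx, ctr_fn_t fn_ctr,
				    uint8_t *dst, const uint8_t *src,
				    size_t nbytes, uint8_t ctr[16])
{
	uint8_t tmp[16];

	/* Widen the partial block; the unused keystream bytes are simply
	 * never copied back. */
	memcpy(tmp, src, nbytes);
	fn_ctr(ctx, tmp, ctr);
	memcpy(dst, tmp, nbytes);
}
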
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 504106bf04a2..02b0e9fe997c 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -24,7 +24,16 @@
24 * 24 *
25 */ 25 */
26 26
27#include "glue_helper-asm-avx.S"
28
27.file "serpent-avx-x86_64-asm_64.S" 29.file "serpent-avx-x86_64-asm_64.S"
30
31.data
32.align 16
33
34.Lbswap128_mask:
35 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
36
28.text 37.text
29 38
30#define CTX %rdi 39#define CTX %rdi
@@ -550,51 +559,27 @@
550 vpunpcklqdq x3, t2, x2; \ 559 vpunpcklqdq x3, t2, x2; \
551 vpunpckhqdq x3, t2, x3; 560 vpunpckhqdq x3, t2, x3;
552 561
553#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ 562#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
554 vmovdqu (0*4*4)(in), x0; \
555 vmovdqu (1*4*4)(in), x1; \
556 vmovdqu (2*4*4)(in), x2; \
557 vmovdqu (3*4*4)(in), x3; \
558 \
559 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 563 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
560 564
561#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ 565#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
562 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 566 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
563 \
564 vmovdqu x0, (0*4*4)(out); \
565 vmovdqu x1, (1*4*4)(out); \
566 vmovdqu x2, (2*4*4)(out); \
567 vmovdqu x3, (3*4*4)(out);
568
569#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
570 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
571 \
572 vpxor (0*4*4)(out), x0, x0; \
573 vmovdqu x0, (0*4*4)(out); \
574 vpxor (1*4*4)(out), x1, x1; \
575 vmovdqu x1, (1*4*4)(out); \
576 vpxor (2*4*4)(out), x2, x2; \
577 vmovdqu x2, (2*4*4)(out); \
578 vpxor (3*4*4)(out), x3, x3; \
579 vmovdqu x3, (3*4*4)(out);
580 567
581.align 8 568.align 8
582.global __serpent_enc_blk_8way_avx 569.type __serpent_enc_blk8_avx,@function;
583.type __serpent_enc_blk_8way_avx,@function;
584 570
585__serpent_enc_blk_8way_avx: 571__serpent_enc_blk8_avx:
586 /* input: 572 /* input:
587 * %rdi: ctx, CTX 573 * %rdi: ctx, CTX
588 * %rsi: dst 574 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
589 * %rdx: src 575 * output:
590 * %rcx: bool, if true: xor output 576 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
591 */ 577 */
592 578
593 vpcmpeqd RNOT, RNOT, RNOT; 579 vpcmpeqd RNOT, RNOT, RNOT;
594 580
595 leaq (4*4*4)(%rdx), %rax; 581 read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
596 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); 582 read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
597 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
598 583
599 K2(RA, RB, RC, RD, RE, 0); 584 K2(RA, RB, RC, RD, RE, 0);
600 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); 585 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
630 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); 615 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
631 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); 616 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
632 617
633 leaq (4*4*4)(%rsi), %rax; 618 write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
634 619 write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
635 testb %cl, %cl;
636 jnz __enc_xor8;
637
638 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
639 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
640
641 ret;
642
643__enc_xor8:
644 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
645 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
646 620
647 ret; 621 ret;
648 622
649.align 8 623.align 8
650.global serpent_dec_blk_8way_avx 624.type __serpent_dec_blk8_avx,@function;
651.type serpent_dec_blk_8way_avx,@function;
652 625
653serpent_dec_blk_8way_avx: 626__serpent_dec_blk8_avx:
654 /* input: 627 /* input:
655 * %rdi: ctx, CTX 628 * %rdi: ctx, CTX
656 * %rsi: dst 629 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
657 * %rdx: src 630 * output:
631 * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
658 */ 632 */
659 633
660 vpcmpeqd RNOT, RNOT, RNOT; 634 vpcmpeqd RNOT, RNOT, RNOT;
661 635
662 leaq (4*4*4)(%rdx), %rax; 636 read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
663 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); 637 read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
664 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
665 638
666 K2(RA, RB, RC, RD, RE, 32); 639 K2(RA, RB, RC, RD, RE, 32);
667 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); 640 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
697 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); 670 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
698 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); 671 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
699 672
700 leaq (4*4*4)(%rsi), %rax; 673 write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
701 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); 674 write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
702 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); 675
676 ret;
677
678.align 8
679.global serpent_ecb_enc_8way_avx
680.type serpent_ecb_enc_8way_avx,@function;
681
682serpent_ecb_enc_8way_avx:
683 /* input:
684 * %rdi: ctx, CTX
685 * %rsi: dst
686 * %rdx: src
687 */
688
689 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
690
691 call __serpent_enc_blk8_avx;
692
693 store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
694
695 ret;
696
697.align 8
698.global serpent_ecb_dec_8way_avx
699.type serpent_ecb_dec_8way_avx,@function;
700
701serpent_ecb_dec_8way_avx:
702 /* input:
703 * %rdi: ctx, CTX
704 * %rsi: dst
705 * %rdx: src
706 */
707
708 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
709
710 call __serpent_dec_blk8_avx;
711
712 store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
713
714 ret;
715
716.align 8
717.global serpent_cbc_dec_8way_avx
718.type serpent_cbc_dec_8way_avx,@function;
719
720serpent_cbc_dec_8way_avx:
721 /* input:
722 * %rdi: ctx, CTX
723 * %rsi: dst
724 * %rdx: src
725 */
726
727 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
728
729 call __serpent_dec_blk8_avx;
730
731 store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
732
733 ret;
734
735.align 8
736.global serpent_ctr_8way_avx
737.type serpent_ctr_8way_avx,@function;
738
739serpent_ctr_8way_avx:
740 /* input:
741 * %rdi: ctx, CTX
742 * %rsi: dst
743 * %rdx: src
744 * %rcx: iv (little endian, 128bit)
745 */
746
747 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
748 RD2, RK0, RK1, RK2);
749
750 call __serpent_enc_blk8_avx;
751
752 store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
703 753
704 ret; 754 ret;
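
serpent_cbc_dec_8way_avx above decrypts eight blocks and then uses store_cbc_8way to XOR ciphertext block i into decrypted block i+1, which is the chaining step of CBC decryption; the IV XOR for the very first block is left to the generic glue code. A plain C sketch of that chaining step (illustrative only, not kernel code), assuming dst holds the raw block decryptions and src still holds the ciphertext:

#include <stddef.h>
#include <stdint.h>

static void cbc_dec_chain(uint8_t *dst, const uint8_t *src, size_t nblocks)
{
	/* Every block except the first gets XORed with the previous
	 * ciphertext block, mirroring store_cbc_8way. */
	for (size_t i = 1; i < nblocks; i++)
		for (size_t j = 0; j < 16; j++)
			dst[i * 16 + j] ^= src[(i - 1) * 16 + j];
}
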
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 3f543a04cf1e..52abaaf28e7f 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -42,55 +42,24 @@
42#include <asm/crypto/ablk_helper.h> 42#include <asm/crypto/ablk_helper.h>
43#include <asm/crypto/glue_helper.h> 43#include <asm/crypto/glue_helper.h>
44 44
45static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) 45static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
46{
47 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
48 unsigned int j;
49
50 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
51 ivs[j] = src[j];
52
53 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
54
55 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
56 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
57}
58
59static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
60{ 46{
61 be128 ctrblk; 47 be128 ctrblk;
62 48
63 u128_to_be128(&ctrblk, iv); 49 le128_to_be128(&ctrblk, iv);
64 u128_inc(iv); 50 le128_inc(iv);
65 51
66 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 52 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
67 u128_xor(dst, src, (u128 *)&ctrblk); 53 u128_xor(dst, src, (u128 *)&ctrblk);
68} 54}
69 55
70static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
71 u128 *iv)
72{
73 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
74 unsigned int i;
75
76 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
77 if (dst != src)
78 dst[i] = src[i];
79
80 u128_to_be128(&ctrblks[i], iv);
81 u128_inc(iv);
82 }
83
84 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
85}
86
87static const struct common_glue_ctx serpent_enc = { 56static const struct common_glue_ctx serpent_enc = {
88 .num_funcs = 2, 57 .num_funcs = 2,
89 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, 58 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
90 59
91 .funcs = { { 60 .funcs = { {
92 .num_blocks = SERPENT_PARALLEL_BLOCKS, 61 .num_blocks = SERPENT_PARALLEL_BLOCKS,
93 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } 62 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
94 }, { 63 }, {
95 .num_blocks = 1, 64 .num_blocks = 1,
96 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } 65 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {
103 72
104 .funcs = { { 73 .funcs = { {
105 .num_blocks = SERPENT_PARALLEL_BLOCKS, 74 .num_blocks = SERPENT_PARALLEL_BLOCKS,
106 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } 75 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
107 }, { 76 }, {
108 .num_blocks = 1, 77 .num_blocks = 1,
109 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } 78 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {
116 85
117 .funcs = { { 86 .funcs = { {
118 .num_blocks = SERPENT_PARALLEL_BLOCKS, 87 .num_blocks = SERPENT_PARALLEL_BLOCKS,
119 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } 88 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
120 }, { 89 }, {
121 .num_blocks = 1, 90 .num_blocks = 1,
122 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } 91 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {
129 98
130 .funcs = { { 99 .funcs = { {
131 .num_blocks = SERPENT_PARALLEL_BLOCKS, 100 .num_blocks = SERPENT_PARALLEL_BLOCKS,
132 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } 101 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
133 }, { 102 }, {
134 .num_blocks = 1, 103 .num_blocks = 1,
135 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } 104 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
193 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); 162 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
194 163
195 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { 164 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
196 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); 165 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
197 return; 166 return;
198 } 167 }
199 168
@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
210 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); 179 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
211 180
212 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { 181 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
213 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); 182 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
214 return; 183 return;
215 } 184 }
216 185
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 9107a9908c41..97a356ece24d 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
59 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); 59 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
60} 60}
61 61
62static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) 62static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
63{ 63{
64 be128 ctrblk; 64 be128 ctrblk;
65 65
66 u128_to_be128(&ctrblk, iv); 66 le128_to_be128(&ctrblk, iv);
67 u128_inc(iv); 67 le128_inc(iv);
68 68
69 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 69 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
70 u128_xor(dst, src, (u128 *)&ctrblk); 70 u128_xor(dst, src, (u128 *)&ctrblk);
71} 71}
72 72
73static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, 73static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
74 u128 *iv) 74 le128 *iv)
75{ 75{
76 be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; 76 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
77 unsigned int i; 77 unsigned int i;
@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
80 if (dst != src) 80 if (dst != src)
81 dst[i] = src[i]; 81 dst[i] = src[i];
82 82
83 u128_to_be128(&ctrblks[i], iv); 83 le128_to_be128(&ctrblks[i], iv);
84 u128_inc(iv); 84 le128_inc(iv);
85 } 85 }
86 86
87 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); 87 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 1585abb13dde..ebac16bfa830 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -23,7 +23,16 @@
23 * 23 *
24 */ 24 */
25 25
26#include "glue_helper-asm-avx.S"
27
26.file "twofish-avx-x86_64-asm_64.S" 28.file "twofish-avx-x86_64-asm_64.S"
29
30.data
31.align 16
32
33.Lbswap128_mask:
34 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
35
27.text 36.text
28 37
29/* structure of crypto context */ 38/* structure of crypto context */
@@ -217,69 +226,45 @@
217 vpunpcklqdq x3, t2, x2; \ 226 vpunpcklqdq x3, t2, x2; \
218 vpunpckhqdq x3, t2, x3; 227 vpunpckhqdq x3, t2, x3;
219 228
220#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ 229#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
221 vpxor (0*4*4)(in), wkey, x0; \ 230 vpxor x0, wkey, x0; \
222 vpxor (1*4*4)(in), wkey, x1; \ 231 vpxor x1, wkey, x1; \
223 vpxor (2*4*4)(in), wkey, x2; \ 232 vpxor x2, wkey, x2; \
224 vpxor (3*4*4)(in), wkey, x3; \ 233 vpxor x3, wkey, x3; \
225 \ 234 \
226 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 235 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
227 236
228#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ 237#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
229 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
230 \
231 vpxor x0, wkey, x0; \
232 vmovdqu x0, (0*4*4)(out); \
233 vpxor x1, wkey, x1; \
234 vmovdqu x1, (1*4*4)(out); \
235 vpxor x2, wkey, x2; \
236 vmovdqu x2, (2*4*4)(out); \
237 vpxor x3, wkey, x3; \
238 vmovdqu x3, (3*4*4)(out);
239
240#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
241 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 238 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
242 \ 239 \
243 vpxor x0, wkey, x0; \ 240 vpxor x0, wkey, x0; \
244 vpxor (0*4*4)(out), x0, x0; \ 241 vpxor x1, wkey, x1; \
245 vmovdqu x0, (0*4*4)(out); \ 242 vpxor x2, wkey, x2; \
246 vpxor x1, wkey, x1; \ 243 vpxor x3, wkey, x3;
247 vpxor (1*4*4)(out), x1, x1; \
248 vmovdqu x1, (1*4*4)(out); \
249 vpxor x2, wkey, x2; \
250 vpxor (2*4*4)(out), x2, x2; \
251 vmovdqu x2, (2*4*4)(out); \
252 vpxor x3, wkey, x3; \
253 vpxor (3*4*4)(out), x3, x3; \
254 vmovdqu x3, (3*4*4)(out);
255 244
256.align 8 245.align 8
257.global __twofish_enc_blk_8way 246.type __twofish_enc_blk8,@function;
258.type __twofish_enc_blk_8way,@function;
259 247
260__twofish_enc_blk_8way: 248__twofish_enc_blk8:
261 /* input: 249 /* input:
262 * %rdi: ctx, CTX 250 * %rdi: ctx, CTX
263 * %rsi: dst 251 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
264 * %rdx: src 252 * output:
265 * %rcx: bool, if true: xor output 253 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
266 */ 254 */
267 255
256 vmovdqu w(CTX), RK1;
257
268 pushq %rbp; 258 pushq %rbp;
269 pushq %rbx; 259 pushq %rbx;
270 pushq %rcx; 260 pushq %rcx;
271 261
272 vmovdqu w(CTX), RK1; 262 inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
273
274 leaq (4*4*4)(%rdx), %rax;
275 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
276 preload_rgi(RA1); 263 preload_rgi(RA1);
277 rotate_1l(RD1); 264 rotate_1l(RD1);
278 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); 265 inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
279 rotate_1l(RD2); 266 rotate_1l(RD2);
280 267
281 movq %rsi, %r11;
282
283 encrypt_cycle(0); 268 encrypt_cycle(0);
284 encrypt_cycle(1); 269 encrypt_cycle(1);
285 encrypt_cycle(2); 270 encrypt_cycle(2);
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
295 popq %rbx; 280 popq %rbx;
296 popq %rbp; 281 popq %rbp;
297 282
298 leaq (4*4*4)(%r11), %rax; 283 outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
299 284 outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
300 testb %cl, %cl;
301 jnz __enc_xor8;
302
303 outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
304 outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
305
306 ret;
307
308__enc_xor8:
309 outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
310 outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
311 285
312 ret; 286 ret;
313 287
314.align 8 288.align 8
315.global twofish_dec_blk_8way 289.type __twofish_dec_blk8,@function;
316.type twofish_dec_blk_8way,@function;
317 290
318twofish_dec_blk_8way: 291__twofish_dec_blk8:
319 /* input: 292 /* input:
320 * %rdi: ctx, CTX 293 * %rdi: ctx, CTX
321 * %rsi: dst 294 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
322 * %rdx: src 295 * output:
296 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
323 */ 297 */
324 298
299 vmovdqu (w+4*4)(CTX), RK1;
300
325 pushq %rbp; 301 pushq %rbp;
326 pushq %rbx; 302 pushq %rbx;
327 303
328 vmovdqu (w+4*4)(CTX), RK1; 304 inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
329
330 leaq (4*4*4)(%rdx), %rax;
331 inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
332 preload_rgi(RC1); 305 preload_rgi(RC1);
333 rotate_1l(RA1); 306 rotate_1l(RA1);
334 inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); 307 inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
335 rotate_1l(RA2); 308 rotate_1l(RA2);
336 309
337 movq %rsi, %r11;
338
339 decrypt_cycle(7); 310 decrypt_cycle(7);
340 decrypt_cycle(6); 311 decrypt_cycle(6);
341 decrypt_cycle(5); 312 decrypt_cycle(5);
@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
350 popq %rbx; 321 popq %rbx;
351 popq %rbp; 322 popq %rbp;
352 323
353 leaq (4*4*4)(%r11), %rax; 324 outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
354 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); 325 outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
355 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); 326
327 ret;
328
329.align 8
330.global twofish_ecb_enc_8way
331.type twofish_ecb_enc_8way,@function;
332
333twofish_ecb_enc_8way:
334 /* input:
335 * %rdi: ctx, CTX
336 * %rsi: dst
337 * %rdx: src
338 */
339
340 movq %rsi, %r11;
341
342 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
343
344 call __twofish_enc_blk8;
345
346 store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
347
348 ret;
349
350.align 8
351.global twofish_ecb_dec_8way
352.type twofish_ecb_dec_8way,@function;
353
354twofish_ecb_dec_8way:
355 /* input:
356 * %rdi: ctx, CTX
357 * %rsi: dst
358 * %rdx: src
359 */
360
361 movq %rsi, %r11;
362
363 load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
364
365 call __twofish_dec_blk8;
366
367 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
368
369 ret;
370
371.align 8
372.global twofish_cbc_dec_8way
373.type twofish_cbc_dec_8way,@function;
374
375twofish_cbc_dec_8way:
376 /* input:
377 * %rdi: ctx, CTX
378 * %rsi: dst
379 * %rdx: src
380 */
381
382 pushq %r12;
383
384 movq %rsi, %r11;
385 movq %rdx, %r12;
386
387 load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
388
389 call __twofish_dec_blk8;
390
391 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
392
393 popq %r12;
394
395 ret;
396
397.align 8
398.global twofish_ctr_8way
399.type twofish_ctr_8way,@function;
400
401twofish_ctr_8way:
402 /* input:
403 * %rdi: ctx, CTX
404 * %rsi: dst
405 * %rdx: src
406 * %rcx: iv (little endian, 128bit)
407 */
408
409 pushq %r12;
410
411 movq %rsi, %r11;
412 movq %rdx, %r12;
413
414 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
415 RD2, RX0, RX1, RY0);
416
417 call __twofish_enc_blk8;
418
419 store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
420
421 popq %r12;
356 422
357 ret; 423 ret;
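
twofish_ctr_8way above (like the serpent version) keeps the running counter in little-endian form and uses vpshufb with .Lbswap128_mask, whose bytes run 15 down to 0, to produce the big-endian counter blocks that the cipher actually encrypts. That mask is a full 16-byte reversal; a byte-level sketch of the permutation (illustrative only):

#include <stdint.h>

static void bswap128(uint8_t out[16], const uint8_t in[16])
{
	/* Same permutation as vpshufb with the .Lbswap128_mask above:
	 * destination byte i takes source byte 15 - i. */
	for (int i = 0; i < 16; i++)
		out[i] = in[15 - i];
}
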
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index e7708b5442e0..94ac91d26e47 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -45,66 +45,23 @@
45 45
46#define TWOFISH_PARALLEL_BLOCKS 8 46#define TWOFISH_PARALLEL_BLOCKS 8
47 47
48static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
49 const u8 *src)
50{
51 __twofish_enc_blk_3way(ctx, dst, src, false);
52}
53
54/* 8-way parallel cipher functions */ 48/* 8-way parallel cipher functions */
55asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, 49asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
56 const u8 *src, bool xor); 50 const u8 *src);
57asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, 51asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
58 const u8 *src); 52 const u8 *src);
59 53
60static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, 54asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
61 const u8 *src) 55 const u8 *src);
62{ 56asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
63 __twofish_enc_blk_8way(ctx, dst, src, false); 57 const u8 *src, le128 *iv);
64}
65
66static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
67 const u8 *src)
68{
69 __twofish_enc_blk_8way(ctx, dst, src, true);
70}
71 58
72static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, 59static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
73 const u8 *src) 60 const u8 *src)
74{ 61{
75 twofish_dec_blk_8way(ctx, dst, src); 62 __twofish_enc_blk_3way(ctx, dst, src, false);
76}
77
78static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
79{
80 u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
81 unsigned int j;
82
83 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
84 ivs[j] = src[j];
85
86 twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
87
88 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
89 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
90} 63}
91 64
92static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
93 u128 *iv)
94{
95 be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
96 unsigned int i;
97
98 for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
99 if (dst != src)
100 dst[i] = src[i];
101
102 u128_to_be128(&ctrblks[i], iv);
103 u128_inc(iv);
104 }
105
106 twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
107}
108 65
109static const struct common_glue_ctx twofish_enc = { 66static const struct common_glue_ctx twofish_enc = {
110 .num_funcs = 3, 67 .num_funcs = 3,
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {
112 69
113 .funcs = { { 70 .funcs = { {
114 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 71 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
115 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) } 72 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
116 }, { 73 }, {
117 .num_blocks = 3, 74 .num_blocks = 3,
118 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } 75 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {
128 85
129 .funcs = { { 86 .funcs = { {
130 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 87 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
131 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } 88 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
132 }, { 89 }, {
133 .num_blocks = 3, 90 .num_blocks = 3,
134 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } 91 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {
144 101
145 .funcs = { { 102 .funcs = { {
146 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 103 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
147 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } 104 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
148 }, { 105 }, {
149 .num_blocks = 3, 106 .num_blocks = 3,
150 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } 107 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {
160 117
161 .funcs = { { 118 .funcs = { {
162 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 119 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
163 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } 120 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
164 }, { 121 }, {
165 .num_blocks = 3, 122 .num_blocks = 3,
166 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } 123 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
227 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); 184 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
228 185
229 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { 186 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
230 twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); 187 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
231 return; 188 return;
232 } 189 }
233 190
@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
249 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); 206 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
250 207
251 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { 208 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
252 twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); 209 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
253 return; 210 return;
254 } 211 }
255 212
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index aa3eb358b7e8..13e63b3e1dfb 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
62} 62}
63EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); 63EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
64 64
65void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) 65void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
66{ 66{
67 be128 ctrblk; 67 be128 ctrblk;
68 68
69 if (dst != src) 69 if (dst != src)
70 *dst = *src; 70 *dst = *src;
71 71
72 u128_to_be128(&ctrblk, iv); 72 le128_to_be128(&ctrblk, iv);
73 u128_inc(iv); 73 le128_inc(iv);
74 74
75 twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 75 twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
76 u128_xor(dst, dst, (u128 *)&ctrblk); 76 u128_xor(dst, dst, (u128 *)&ctrblk);
@@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
78EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); 78EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
79 79
80void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, 80void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
81 u128 *iv) 81 le128 *iv)
82{ 82{
83 be128 ctrblks[3]; 83 be128 ctrblks[3];
84 84
@@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
88 dst[2] = src[2]; 88 dst[2] = src[2];
89 } 89 }
90 90
91 u128_to_be128(&ctrblks[0], iv); 91 le128_to_be128(&ctrblks[0], iv);
92 u128_inc(iv); 92 le128_inc(iv);
93 u128_to_be128(&ctrblks[1], iv); 93 le128_to_be128(&ctrblks[1], iv);
94 u128_inc(iv); 94 le128_inc(iv);
95 u128_to_be128(&ctrblks[2], iv); 95 le128_to_be128(&ctrblks[2], iv);
96 u128_inc(iv); 96 le128_inc(iv);
97 97
98 twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); 98 twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
99} 99}
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h
new file mode 100644
index 000000000000..98038add801e
--- /dev/null
+++ b/arch/x86/include/asm/crypto/camellia.h
@@ -0,0 +1,82 @@
1#ifndef ASM_X86_CAMELLIA_H
2#define ASM_X86_CAMELLIA_H
3
4#include <linux/kernel.h>
5#include <linux/crypto.h>
6
7#define CAMELLIA_MIN_KEY_SIZE 16
8#define CAMELLIA_MAX_KEY_SIZE 32
9#define CAMELLIA_BLOCK_SIZE 16
10#define CAMELLIA_TABLE_BYTE_LEN 272
11#define CAMELLIA_PARALLEL_BLOCKS 2
12
13struct camellia_ctx {
14 u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
15 u32 key_length;
16};
17
18struct camellia_lrw_ctx {
19 struct lrw_table_ctx lrw_table;
20 struct camellia_ctx camellia_ctx;
21};
22
23struct camellia_xts_ctx {
24 struct camellia_ctx tweak_ctx;
25 struct camellia_ctx crypt_ctx;
26};
27
28extern int __camellia_setkey(struct camellia_ctx *cctx,
29 const unsigned char *key,
30 unsigned int key_len, u32 *flags);
31
32extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
33 unsigned int keylen);
34extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm);
35
36extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
37 unsigned int keylen);
38
39/* regular block cipher functions */
40asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
41 const u8 *src, bool xor);
42asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
43 const u8 *src);
44
45/* 2-way parallel cipher functions */
46asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
47 const u8 *src, bool xor);
48asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
49 const u8 *src);
50
51static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
52 const u8 *src)
53{
54 __camellia_enc_blk(ctx, dst, src, false);
55}
56
57static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
58 const u8 *src)
59{
60 __camellia_enc_blk(ctx, dst, src, true);
61}
62
63static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
64 const u8 *src)
65{
66 __camellia_enc_blk_2way(ctx, dst, src, false);
67}
68
69static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
70 const u8 *src)
71{
72 __camellia_enc_blk_2way(ctx, dst, src, true);
73}
74
75/* glue helpers */
76extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src);
77extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
78 le128 *iv);
79extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
80 le128 *iv);
81
82#endif /* ASM_X86_CAMELLIA_H */
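
camellia_enc_blk_xor_2way above selects the assembler path that XORs the encryption result into dst instead of overwriting it, which is what the 2-way CTR helper builds on: dst is pre-filled with the plaintext, two consecutive counter blocks are encrypted, and the xor variant folds them in. A sketch of that usage, mirroring the xway CTR helpers shown elsewhere in this diff; the function name is illustrative (camellia_crypt_ctr_2way declared above is the real one), and it assumes this header plus <asm/crypto/glue_helper.h> for the le128 helpers:

static void camellia_ctr_2way_sketch(void *ctx, u128 *dst, const u128 *src,
				     le128 *iv)
{
	be128 ctrblks[2];

	/* CTR mode: dst must already contain the plaintext when the xor
	 * variant is used to combine in the encrypted counter blocks. */
	if (dst != src) {
		dst[0] = src[0];
		dst[1] = src[1];
	}

	le128_to_be128(&ctrblks[0], iv);
	le128_inc(iv);
	le128_to_be128(&ctrblks[1], iv);
	le128_inc(iv);

	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
}
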
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 3e408bddc96f..e2d65b061d27 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -13,7 +13,7 @@
13typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); 13typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
14typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); 14typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
15typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, 15typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
16 u128 *iv); 16 le128 *iv);
17 17
18#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) 18#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
19#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) 19#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
@@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled)
71 kernel_fpu_end(); 71 kernel_fpu_end();
72} 72}
73 73
74static inline void u128_to_be128(be128 *dst, const u128 *src) 74static inline void le128_to_be128(be128 *dst, const le128 *src)
75{ 75{
76 dst->a = cpu_to_be64(src->a); 76 dst->a = cpu_to_be64(le64_to_cpu(src->a));
77 dst->b = cpu_to_be64(src->b); 77 dst->b = cpu_to_be64(le64_to_cpu(src->b));
78} 78}
79 79
80static inline void be128_to_u128(u128 *dst, const be128 *src) 80static inline void be128_to_le128(le128 *dst, const be128 *src)
81{ 81{
82 dst->a = be64_to_cpu(src->a); 82 dst->a = cpu_to_le64(be64_to_cpu(src->a));
83 dst->b = be64_to_cpu(src->b); 83 dst->b = cpu_to_le64(be64_to_cpu(src->b));
84} 84}
85 85
86static inline void u128_inc(u128 *i) 86static inline void le128_inc(le128 *i)
87{ 87{
88 i->b++; 88 u64 a = le64_to_cpu(i->a);
89 if (!i->b) 89 u64 b = le64_to_cpu(i->b);
90 i->a++; 90
91 b++;
92 if (!b)
93 a++;
94
95 i->a = cpu_to_le64(a);
96 i->b = cpu_to_le64(b);
91} 97}
92 98
93extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, 99extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
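
The rewritten helpers above keep the CTR counter in explicit little-endian memory order, so le128_inc converts to CPU order, increments with carry, and converts back; the behaviour is therefore the same on little- and big-endian builds. A small usage sketch exercising the wrap-around case (not part of the patch; it assumes the le128/be128 definitions from the generic crypto headers and reads member .a as the high half, .b as the low half, as the helpers above do):

static void le128_inc_wrap_sketch(void)
{
	le128 iv;
	be128 ctrblk;

	iv.a = cpu_to_le64(0);		/* high 64 bits */
	iv.b = cpu_to_le64(~0ULL);	/* low 64 bits, about to wrap */

	le128_inc(&iv);			/* .b wraps to 0, carry into .a */
	le128_to_be128(&ctrblk, &iv);	/* big-endian block fed to the cipher */
}
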
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index 432deedd2945..0da1d3e2a55c 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -6,27 +6,14 @@
6 6
7#define SERPENT_PARALLEL_BLOCKS 8 7#define SERPENT_PARALLEL_BLOCKS 8
8 8
9asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, 9asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
10 const u8 *src, bool xor); 10 const u8 *src);
11asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, 11asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
12 const u8 *src); 12 const u8 *src);
13 13
14static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, 14asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
15 const u8 *src) 15 const u8 *src);
16{ 16asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
17 __serpent_enc_blk_8way_avx(ctx, dst, src, false); 17 const u8 *src, le128 *iv);
18}
19
20static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
21 const u8 *src)
22{
23 __serpent_enc_blk_8way_avx(ctx, dst, src, true);
24}
25
26static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
27 const u8 *src)
28{
29 serpent_dec_blk_8way_avx(ctx, dst, src);
30}
31 18
32#endif 19#endif
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
index 9d2c514bd5f9..878c51ceebb5 100644
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
31/* helpers from twofish_x86_64-3way module */ 31/* helpers from twofish_x86_64-3way module */
32extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); 32extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
33extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, 33extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
34 u128 *iv); 34 le128 *iv);
35extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, 35extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
36 u128 *iv); 36 le128 *iv);
37 37
38extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, 38extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
39 unsigned int keylen); 39 unsigned int keylen);