author     Alexander Boyko <alexander_boyko@xyratex.com>   2013-01-10 09:54:59 -0500
committer  Herbert Xu <herbert@gondor.apana.org.au>        2013-01-19 18:16:45 -0500
commit     78c37d191dd6899d8c219fee597a17d6e3c5d288 (patch)
tree       123ed7322996e4e4a6922791d6e3a674ffc05cba /arch
parent     5c22ba6619796da82ea0aa18c72caf4fe003a329 (diff)
crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation
This patch adds crc32 algorithms to the shash crypto API. One is a wrapper
around the generic crc32_le function. The second is a crc32 pclmulqdq
implementation, which uses the hardware-provided PCLMULQDQ instruction to
accelerate the CRC32 computation. The instruction is available starting with
Intel Westmere and AMD Bulldozer CPUs. On an Intel Core i5 I measured
450 MB/s for the table implementation and 2100 MB/s for the pclmulqdq
implementation.

Signed-off-by: Alexander Boyko <alexander_boyko@xyratex.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
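For reference, the table-driven path wraps the kernel's generic crc32_le(): a bit-reflected CRC32 over the polynomial 0xEDB88320 that takes the running CRC as its seed and applies no final inversion (see the "No final XOR 0xFFFFFFFF" note in the glue code below). A minimal user-space sketch of that baseline, for illustration only; crc32_le_ref() and the test vector are not part of this patch:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Reference for the generic/table path: bit-reflected CRC32 over polynomial
 * 0xEDB88320, seeded with the running CRC, no final inversion.
 */
static uint32_t crc32_le_ref(uint32_t crc, const unsigned char *p, size_t len)
{
        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++) {
                        /* shift one bit out; XOR in the reflected polynomial
                         * whenever the bit shifted out was set */
                        crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
                }
        }
        return crc;
}

int main(void)
{
        const unsigned char msg[] = "123456789";

        /*
         * The familiar CRC32 check value 0xCBF43926 comes from seeding with
         * ~0 and inverting the result; crc32_le leaves both steps to the
         * caller.
         */
        printf("0x%08x\n", ~crc32_le_ref(~0u, msg, sizeof(msg) - 1));
        return 0;
}

The pclmulqdq path below computes the same function, only folding 16-byte blocks with carry-less multiplication instead of processing one bit at a time.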
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/crypto/Makefile             |   2
-rw-r--r--  arch/x86/crypto/crc32-pclmul_asm.S   | 247
-rw-r--r--  arch/x86/crypto/crc32-pclmul_glue.c  | 201
3 files changed, 450 insertions(+), 0 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e0ca7c9ac383..63947a8f9f0f 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
@@ -52,3 +53,4 @@ ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
+crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
new file mode 100644
index 000000000000..65ea6a624907
--- /dev/null
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -0,0 +1,247 @@
/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/inst.h>


.align 16
/*
 * [(x^(4*128+32) mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1  0x154442bd4LL
 *
 * [(x^(4*128-32) mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2  0x1c6e41596LL
 */
.Lconstant_R2R1:
        .octa 0x00000001c6e415960000000154442bd4
/*
 * [(x^(128+32) mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3  0x1751997d0LL
 *
 * [(x^(128-32) mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4  0x0ccaa009eLL
 */
.Lconstant_R4R3:
        .octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x^64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5  0x163cd6124LL
 */
.Lconstant_R5:
        .octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
        .octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU  0x1F7011641LL
 */
.Lconstant_RUpoly:
        .octa 0x00000001F701164100000001DB710641

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF     %rdi
#define LEN     %rsi
#define CRC     %edx
#else
#warning Using 32bit code support
#define BUF     %eax
#define LEN     %edx
#define CRC     %ecx
#endif


.text
/**
 *      Calculate crc32
 *      BUF - buffer (16 bytes aligned)
 *      LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 *      CRC - initial crc32
 *      return %eax crc32
 *      uint crc32_pclmul_le_16(unsigned char const *buffer,
 *                              size_t len, uint crc32)
 */
.globl crc32_pclmul_le_16
.align 4, 0x90
crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
        movdqa  (BUF), %xmm1
        movdqa  0x10(BUF), %xmm2
        movdqa  0x20(BUF), %xmm3
        movdqa  0x30(BUF), %xmm4
        movd    CRC, CONSTANT
        pxor    CONSTANT, %xmm1
        sub     $0x40, LEN
        add     $0x40, BUF
#ifndef __x86_64__
        /* This is for position independent code(-fPIC) support for 32bit */
        call    delta
delta:
        pop     %ecx
#endif
        cmp     $0x40, LEN
        jb      less_64

#ifdef __x86_64__
        movdqa  .Lconstant_R2R1(%rip), CONSTANT
#else
        movdqa  .Lconstant_R2R1 - delta(%ecx), CONSTANT
#endif

loop_64:/* 64 bytes Full cache line folding */
        prefetchnta     0x40(BUF)
        movdqa  %xmm1, %xmm5
        movdqa  %xmm2, %xmm6
        movdqa  %xmm3, %xmm7
#ifdef __x86_64__
        movdqa  %xmm4, %xmm8
#endif
        PCLMULQDQ 00, CONSTANT, %xmm1
        PCLMULQDQ 00, CONSTANT, %xmm2
        PCLMULQDQ 00, CONSTANT, %xmm3
#ifdef __x86_64__
        PCLMULQDQ 00, CONSTANT, %xmm4
#endif
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        PCLMULQDQ 0x11, CONSTANT, %xmm6
        PCLMULQDQ 0x11, CONSTANT, %xmm7
#ifdef __x86_64__
        PCLMULQDQ 0x11, CONSTANT, %xmm8
#endif
        pxor    %xmm5, %xmm1
        pxor    %xmm6, %xmm2
        pxor    %xmm7, %xmm3
#ifdef __x86_64__
        pxor    %xmm8, %xmm4
#else
        /* xmm8 unsupported for x32 */
        movdqa  %xmm4, %xmm5
        PCLMULQDQ 00, CONSTANT, %xmm4
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm4
#endif

        pxor    (BUF), %xmm1
        pxor    0x10(BUF), %xmm2
        pxor    0x20(BUF), %xmm3
        pxor    0x30(BUF), %xmm4

        sub     $0x40, LEN
        add     $0x40, BUF
        cmp     $0x40, LEN
        jge     loop_64
less_64:/* Folding cache line into 128bit */
#ifdef __x86_64__
        movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
        movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
#endif
        prefetchnta     (BUF)

        movdqa  %xmm1, %xmm5
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm2, %xmm1

        movdqa  %xmm1, %xmm5
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm3, %xmm1

        movdqa  %xmm1, %xmm5
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm4, %xmm1

        cmp     $0x10, LEN
        jb      fold_64
loop_16:/* Folding rest buffer into 128bit */
        movdqa  %xmm1, %xmm5
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    (BUF), %xmm1
        sub     $0x10, LEN
        add     $0x10, BUF
        cmp     $0x10, LEN
        jge     loop_16

fold_64:
        /* perform the last 64 bit fold, also adds 32 zeroes
         * to the input stream */
        PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
        psrldq  $0x08, %xmm1
        pxor    CONSTANT, %xmm1

        /* final 32-bit fold */
        movdqa  %xmm1, %xmm2
#ifdef __x86_64__
        movdqa  .Lconstant_R5(%rip), CONSTANT
        movdqa  .Lconstant_mask32(%rip), %xmm3
#else
        movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
        movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
#endif
        psrldq  $0x04, %xmm2
        pand    %xmm3, %xmm1
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        pxor    %xmm2, %xmm1

        /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
        movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
        movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
#endif
        movdqa  %xmm1, %xmm2
        pand    %xmm3, %xmm1
        PCLMULQDQ 0x10, CONSTANT, %xmm1
        pand    %xmm3, %xmm1
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        pxor    %xmm2, %xmm1
        pextrd  $0x01, %xmm1, %eax

        ret
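The .Lconstant_* values defined at the top of this file follow the construction spelled out in their own comments: each folding constant is x^n mod P(x) for the stated exponent, bit-reflected and shifted left by one. The short user-space sketch below derives them that way; it is illustrative only (the helper names are invented here, and the printed values should be compared against the .octa lines above rather than taken on trust).

#include <stdint.h>
#include <stdio.h>

#define CRCPOLY 0x04C11DB7u     /* low 32 bits of P(x); the x^32 term is implicit */

/* x^n mod P(x) over GF(2); bit i of the result holds the coefficient of x^i */
static uint32_t xpow_mod_p(unsigned int n)
{
        uint32_t r = 1;                 /* the polynomial "1" */

        while (n--) {
                uint32_t carry = r & 0x80000000u;

                r <<= 1;                /* multiply by x ... */
                if (carry)
                        r ^= CRCPOLY;   /* ... and reduce mod P(x) */
        }
        return r;
}

static uint32_t reflect32(uint32_t v)
{
        uint32_t r = 0;

        for (int i = 0; i < 32; i++)
                r |= ((v >> i) & 1u) << (31 - i);
        return r;
}

/* [(x^n mod P(x)) << 32]' << 1, the form stored in .Lconstant_R2R1 etc. */
static uint64_t fold_constant(unsigned int n)
{
        return (uint64_t)reflect32(xpow_mod_p(n)) << 1;
}

int main(void)
{
        printf("R1 x^(4*128+32): 0x%llx\n", (unsigned long long)fold_constant(4 * 128 + 32));
        printf("R2 x^(4*128-32): 0x%llx\n", (unsigned long long)fold_constant(4 * 128 - 32));
        printf("R3 x^(128+32):   0x%llx\n", (unsigned long long)fold_constant(128 + 32));
        printf("R4 x^(128-32):   0x%llx\n", (unsigned long long)fold_constant(128 - 32));
        printf("R5 x^64:         0x%llx\n", (unsigned long long)fold_constant(64));
        return 0;
}

In the same spirit, the Barrett constant 0x1F7011641 is the reflected quotient x^64 / P(x) (polynomial long division over GF(2)), and 0x1DB710641 is the 33-bit reflection of P(x) itself.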
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
new file mode 100644
index 000000000000..9d014a74ef96
--- /dev/null
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -0,0 +1,201 @@
/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/crc32.h>
#include <crypto/internal/hash.h>

#include <asm/cpufeature.h>
#include <asm/cpu_device_id.h>
#include <asm/i387.h>

#define CHKSUM_BLOCK_SIZE       1
#define CHKSUM_DIGEST_SIZE      4

#define PCLMUL_MIN_LEN          64L     /* minimum size of buffer
                                         * for crc32_pclmul_le_16 */
#define SCALE_F                 16L     /* size of xmm register */
#define SCALE_F_MASK            (SCALE_F - 1)

u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);

static u32 __attribute__((pure))
        crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len)
{
        unsigned int iquotient;
        unsigned int iremainder;
        unsigned int prealign;

        if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable())
                return crc32_le(crc, p, len);

        if ((long)p & SCALE_F_MASK) {
                /* align p to 16 byte */
                prealign = SCALE_F - ((long)p & SCALE_F_MASK);

                crc = crc32_le(crc, p, prealign);
                len -= prealign;
                p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) &
                                      ~SCALE_F_MASK);
        }
        iquotient = len & (~SCALE_F_MASK);
        iremainder = len & SCALE_F_MASK;

        kernel_fpu_begin();
        crc = crc32_pclmul_le_16(p, iquotient, crc);
        kernel_fpu_end();

        if (iremainder)
                crc = crc32_le(crc, p + iquotient, iremainder);

        return crc;
}

static int crc32_pclmul_cra_init(struct crypto_tfm *tfm)
{
        u32 *key = crypto_tfm_ctx(tfm);

        *key = 0;

        return 0;
}

static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key,
                               unsigned int keylen)
{
        u32 *mctx = crypto_shash_ctx(hash);

        if (keylen != sizeof(u32)) {
                crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
                return -EINVAL;
        }
        *mctx = le32_to_cpup((__le32 *)key);
        return 0;
}

static int crc32_pclmul_init(struct shash_desc *desc)
{
        u32 *mctx = crypto_shash_ctx(desc->tfm);
        u32 *crcp = shash_desc_ctx(desc);

        *crcp = *mctx;

        return 0;
}

static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
                               unsigned int len)
{
        u32 *crcp = shash_desc_ctx(desc);

        *crcp = crc32_pclmul_le(*crcp, data, len);
        return 0;
}

/* No final XOR 0xFFFFFFFF, like crc32_le */
static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
                                u8 *out)
{
        *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len));
        return 0;
}

static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data,
                              unsigned int len, u8 *out)
{
        return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out);
}

static int crc32_pclmul_final(struct shash_desc *desc, u8 *out)
{
        u32 *crcp = shash_desc_ctx(desc);

        *(__le32 *)out = cpu_to_le32p(crcp);
        return 0;
}

static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len,
                                    out);
}

static struct shash_alg alg = {
        .setkey         = crc32_pclmul_setkey,
        .init           = crc32_pclmul_init,
        .update         = crc32_pclmul_update,
        .final          = crc32_pclmul_final,
        .finup          = crc32_pclmul_finup,
        .digest         = crc32_pclmul_digest,
        .descsize       = sizeof(u32),
        .digestsize     = CHKSUM_DIGEST_SIZE,
        .base           = {
                        .cra_name               = "crc32",
                        .cra_driver_name        = "crc32-pclmul",
                        .cra_priority           = 200,
                        .cra_blocksize          = CHKSUM_BLOCK_SIZE,
                        .cra_ctxsize            = sizeof(u32),
                        .cra_module             = THIS_MODULE,
                        .cra_init               = crc32_pclmul_cra_init,
        }
};

static const struct x86_cpu_id crc32pclmul_cpu_id[] = {
        X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id);


static int __init crc32_pclmul_mod_init(void)
{

        if (!x86_match_cpu(crc32pclmul_cpu_id)) {
                pr_info("PCLMULQDQ-NI instructions are not detected.\n");
                return -ENODEV;
        }
        return crypto_register_shash(&alg);
}

static void __exit crc32_pclmul_mod_fini(void)
{
        crypto_unregister_shash(&alg);
}

module_init(crc32_pclmul_mod_init);
module_exit(crc32_pclmul_mod_fini);

MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
MODULE_LICENSE("GPL");

MODULE_ALIAS("crc32");
MODULE_ALIAS("crc32-pclmul");
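Once loaded, the module registers cra_name "crc32" with driver name "crc32-pclmul" at priority 200, so the crypto API prefers it over lower-priority "crc32" implementations. One way to exercise it from user space is the AF_ALG socket interface. The sketch below assumes CONFIG_CRYPTO_USER_API_HASH is enabled, which this patch does not itself provide; the 4-byte "key" it sets is the initial CRC seed consumed by crc32_pclmul_setkey().

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

#ifndef SOL_ALG
#define SOL_ALG 279
#endif

int main(void)
{
        struct sockaddr_alg sa = {
                .salg_family = AF_ALG,
                .salg_type   = "hash",
                .salg_name   = "crc32",         /* cra_name registered above */
        };
        unsigned char seed[4] = { 0xff, 0xff, 0xff, 0xff };     /* initial CRC, little endian */
        unsigned char digest[4];
        const char msg[] = "123456789";
        int tfmfd, opfd;

        tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
        if (tfmfd < 0 || bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
                perror("AF_ALG bind");
                return 1;
        }
        /* the shash "key" is the starting CRC value, see crc32_pclmul_setkey() */
        setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, seed, sizeof(seed));

        opfd = accept(tfmfd, NULL, 0);
        if (opfd < 0) {
                perror("accept");
                return 1;
        }
        write(opfd, msg, strlen(msg));
        read(opfd, digest, sizeof(digest));     /* little-endian CRC, no final inversion */

        printf("crc32 = %02x %02x %02x %02x (LE byte order)\n",
               digest[0], digest[1], digest[2], digest[3]);

        close(opfd);
        close(tfmfd);
        return 0;
}

The four bytes read back are the little-endian CRC with no final inversion, matching what crc32_pclmul_final() writes out.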