author     Alexander Boyko <alexander_boyko@xyratex.com>   2013-01-10 09:54:59 -0500
committer  Herbert Xu <herbert@gondor.apana.org.au>        2013-01-19 18:16:45 -0500
commit     78c37d191dd6899d8c219fee597a17d6e3c5d288 (patch)
tree       123ed7322996e4e4a6922791d6e3a674ffc05cba /arch
parent     5c22ba6619796da82ea0aa18c72caf4fe003a329 (diff)
crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation
This patch adds crc32 algorithms to the shash crypto API. One is a wrapper
around the generic crc32_le function. The second is a crc32 pclmulqdq
implementation, which uses the hardware-provided PCLMULQDQ instruction to
accelerate the CRC32 computation. The instruction is available starting with
Intel Westmere and AMD Bulldozer CPUs. On an Intel Core i5 I measured
450 MB/s for the table implementation and 2100 MB/s for the pclmulqdq
implementation.

Signed-off-by: Alexander Boyko <alexander_boyko@xyratex.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
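For reference, the table-driven path wraps the kernel's generic crc32_le(): a bit-reflected CRC32 over the polynomial 0xEDB88320 that takes the running CRC as its seed and applies no final inversion (see the "No final XOR 0xFFFFFFFF" note in the glue code below). A minimal user-space sketch of that baseline, for illustration only; crc32_le_ref() and the test vector are not part of this patch:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Reference for the generic/table path: bit-reflected CRC32 over polynomial
 * 0xEDB88320, seeded with the running CRC, no final inversion.
 */
static uint32_t crc32_le_ref(uint32_t crc, const unsigned char *p, size_t len)
{
        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++) {
                        /* shift one bit out; XOR in the reflected polynomial
                         * whenever the bit shifted out was set */
                        crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
                }
        }
        return crc;
}

int main(void)
{
        const unsigned char msg[] = "123456789";

        /*
         * The familiar CRC32 check value 0xCBF43926 comes from seeding with
         * ~0 and inverting the result; crc32_le leaves both steps to the
         * caller.
         */
        printf("0x%08x\n", ~crc32_le_ref(~0u, msg, sizeof(msg) - 1));
        return 0;
}

The pclmulqdq path below computes the same function, only folding 16-byte blocks with carry-less multiplication instead of processing one bit at a time.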
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/crypto/Makefile             |   2
-rw-r--r--  arch/x86/crypto/crc32-pclmul_asm.S   | 247
-rw-r--r--  arch/x86/crypto/crc32-pclmul_glue.c  | 201
3 files changed, 450 insertions(+), 0 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e0ca7c9ac383..63947a8f9f0f 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
@@ -52,3 +53,4 @@ ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
+crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
new file mode 100644
index 000000000000..65ea6a624907
--- /dev/null
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -0,0 +1,247 @@
/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/inst.h>


.align 16
/*
 * [(x^(4*128+32) mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1  0x154442bd4LL
 *
 * [(x^(4*128-32) mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2  0x1c6e41596LL
 */
.Lconstant_R2R1:
        .octa 0x00000001c6e415960000000154442bd4
/*
 * [(x^(128+32) mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3  0x1751997d0LL
 *
 * [(x^(128-32) mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4  0x0ccaa009eLL
 */
.Lconstant_R4R3:
        .octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x^64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5  0x163cd6124LL
 */
.Lconstant_R5:
        .octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
        .octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU  0x1F7011641LL
 */
.Lconstant_RUpoly:
        .octa 0x00000001F701164100000001DB710641

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF     %rdi
#define LEN     %rsi
#define CRC     %edx
#else
#warning Using 32bit code support
#define BUF     %eax
#define LEN     %edx
#define CRC     %ecx
#endif


.text
/**
 *      Calculate crc32
 *      BUF - buffer (16 bytes aligned)
 *      LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 *      CRC - initial crc32
 *      return %eax crc32
 *      uint crc32_pclmul_le_16(unsigned char const *buffer,
 *                              size_t len, uint crc32)
 */
.globl crc32_pclmul_le_16
.align 4, 0x90
crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
        movdqa  (BUF), %xmm1
        movdqa  0x10(BUF), %xmm2
        movdqa  0x20(BUF), %xmm3
        movdqa  0x30(BUF), %xmm4
        movd    CRC, CONSTANT
        pxor    CONSTANT, %xmm1
        sub     $0x40, LEN
        add     $0x40, BUF
#ifndef __x86_64__
        /* This is for position independent code(-fPIC) support for 32bit */
        call    delta
delta:
        pop     %ecx
#endif
        cmp     $0x40, LEN
        jb      less_64

#ifdef __x86_64__
        movdqa  .Lconstant_R2R1(%rip), CONSTANT
#else
        movdqa  .Lconstant_R2R1 - delta(%ecx), CONSTANT
#endif

loop_64:/* 64 bytes Full cache line folding */
        prefetchnta     0x40(BUF)
        movdqa  %xmm1, %xmm5
        movdqa  %xmm2, %xmm6
        movdqa  %xmm3, %xmm7
#ifdef __x86_64__
        movdqa  %xmm4, %xmm8
#endif
        PCLMULQDQ 00, CONSTANT, %xmm1
        PCLMULQDQ 00, CONSTANT, %xmm2
        PCLMULQDQ 00, CONSTANT, %xmm3
#ifdef __x86_64__
        PCLMULQDQ 00, CONSTANT, %xmm4
#endif
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        PCLMULQDQ 0x11, CONSTANT, %xmm6
        PCLMULQDQ 0x11, CONSTANT, %xmm7
#ifdef __x86_64__
        PCLMULQDQ 0x11, CONSTANT, %xmm8
#endif
        pxor    %xmm5, %xmm1
        pxor    %xmm6, %xmm2
        pxor    %xmm7, %xmm3
#ifdef __x86_64__
        pxor    %xmm8, %xmm4
#else
        /* xmm8 unsupported for x32 */
        movdqa  %xmm4, %xmm5
        PCLMULQDQ 00, CONSTANT, %xmm4
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm4
#endif

        pxor    (BUF), %xmm1
        pxor    0x10(BUF), %xmm2
        pxor    0x20(BUF), %xmm3
        pxor    0x30(BUF), %xmm4

        sub     $0x40, LEN
        add     $0x40, BUF
        cmp     $0x40, LEN
        jge     loop_64
less_64:/* Folding cache line into 128bit */
#ifdef __x86_64__
        movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
        movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
#endif
        prefetchnta     (BUF)

        movdqa  %xmm1, %xmm5
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm2, %xmm1

        movdqa  %xmm1, %xmm5
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm3, %xmm1

        movdqa  %xmm1, %xmm5
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm4, %xmm1

        cmp     $0x10, LEN
        jb      fold_64
loop_16:/* Folding rest buffer into 128bit */
        movdqa  %xmm1, %xmm5
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        PCLMULQDQ 0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    (BUF), %xmm1
        sub     $0x10, LEN
        add     $0x10, BUF
        cmp     $0x10, LEN
        jge     loop_16

fold_64:
        /* perform the last 64 bit fold, also adds 32 zeroes
         * to the input stream */
        PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
        psrldq  $0x08, %xmm1
        pxor    CONSTANT, %xmm1

        /* final 32-bit fold */
        movdqa  %xmm1, %xmm2
#ifdef __x86_64__
        movdqa  .Lconstant_R5(%rip), CONSTANT
        movdqa  .Lconstant_mask32(%rip), %xmm3
#else
        movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
        movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
#endif
        psrldq  $0x04, %xmm2
        pand    %xmm3, %xmm1
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        pxor    %xmm2, %xmm1

        /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
        movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
        movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
#endif
        movdqa  %xmm1, %xmm2
        pand    %xmm3, %xmm1
        PCLMULQDQ 0x10, CONSTANT, %xmm1
        pand    %xmm3, %xmm1
        PCLMULQDQ 0x00, CONSTANT, %xmm1
        pxor    %xmm2, %xmm1
        pextrd  $0x01, %xmm1, %eax

        ret
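The .Lconstant_* values defined at the top of this file follow the construction spelled out in their own comments: each folding constant is x^n mod P(x) for the stated exponent, bit-reflected and shifted left by one. The short user-space sketch below derives them that way; it is illustrative only (the helper names are invented here, and the printed values should be compared against the .octa lines above rather than taken on trust).

#include <stdint.h>
#include <stdio.h>

#define CRCPOLY 0x04C11DB7u     /* low 32 bits of P(x); the x^32 term is implicit */

/* x^n mod P(x) over GF(2); bit i of the result holds the coefficient of x^i */
static uint32_t xpow_mod_p(unsigned int n)
{
        uint32_t r = 1;                 /* the polynomial "1" */

        while (n--) {
                uint32_t carry = r & 0x80000000u;

                r <<= 1;                /* multiply by x ... */
                if (carry)
                        r ^= CRCPOLY;   /* ... and reduce mod P(x) */
        }
        return r;
}

static uint32_t reflect32(uint32_t v)
{
        uint32_t r = 0;

        for (int i = 0; i < 32; i++)
                r |= ((v >> i) & 1u) << (31 - i);
        return r;
}

/* [(x^n mod P(x)) << 32]' << 1, the form stored in .Lconstant_R2R1 etc. */
static uint64_t fold_constant(unsigned int n)
{
        return (uint64_t)reflect32(xpow_mod_p(n)) << 1;
}

int main(void)
{
        printf("R1 x^(4*128+32): 0x%llx\n", (unsigned long long)fold_constant(4 * 128 + 32));
        printf("R2 x^(4*128-32): 0x%llx\n", (unsigned long long)fold_constant(4 * 128 - 32));
        printf("R3 x^(128+32):   0x%llx\n", (unsigned long long)fold_constant(128 + 32));
        printf("R4 x^(128-32):   0x%llx\n", (unsigned long long)fold_constant(128 - 32));
        printf("R5 x^64:         0x%llx\n", (unsigned long long)fold_constant(64));
        return 0;
}

In the same spirit, the Barrett constant 0x1F7011641 is the reflected quotient x^64 / P(x) (polynomial long division over GF(2)), and 0x1DB710641 is the 33-bit reflection of P(x) itself.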
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
new file mode 100644
index 000000000000..9d014a74ef96
--- /dev/null
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -0,0 +1,201 @@
/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/crc32.h>
#include <crypto/internal/hash.h>

#include <asm/cpufeature.h>
#include <asm/cpu_device_id.h>
#include <asm/i387.h>

#define CHKSUM_BLOCK_SIZE       1
#define CHKSUM_DIGEST_SIZE      4

#define PCLMUL_MIN_LEN          64L     /* minimum size of buffer
                                         * for crc32_pclmul_le_16 */
#define SCALE_F                 16L     /* size of xmm register */
#define SCALE_F_MASK            (SCALE_F - 1)

u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);

static u32 __attribute__((pure))
        crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len)
{
        unsigned int iquotient;
        unsigned int iremainder;
        unsigned int prealign;

        if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable())
                return crc32_le(crc, p, len);

        if ((long)p & SCALE_F_MASK) {
                /* align p to 16 byte */
                prealign = SCALE_F - ((long)p & SCALE_F_MASK);

                crc = crc32_le(crc, p, prealign);
                len -= prealign;
                p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) &
                                      ~SCALE_F_MASK);
        }
        iquotient = len & (~SCALE_F_MASK);
        iremainder = len & SCALE_F_MASK;

        kernel_fpu_begin();
        crc = crc32_pclmul_le_16(p, iquotient, crc);
        kernel_fpu_end();

        if (iremainder)
                crc = crc32_le(crc, p + iquotient, iremainder);

        return crc;
}

static int crc32_pclmul_cra_init(struct crypto_tfm *tfm)
{
        u32 *key = crypto_tfm_ctx(tfm);

        *key = 0;

        return 0;
}

static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key,
                               unsigned int keylen)
{
        u32 *mctx = crypto_shash_ctx(hash);

        if (keylen != sizeof(u32)) {
                crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
                return -EINVAL;
        }
        *mctx = le32_to_cpup((__le32 *)key);
        return 0;
}

static int crc32_pclmul_init(struct shash_desc *desc)
{
        u32 *mctx = crypto_shash_ctx(desc->tfm);
        u32 *crcp = shash_desc_ctx(desc);

        *crcp = *mctx;

        return 0;
}

static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
                               unsigned int len)
{
        u32 *crcp = shash_desc_ctx(desc);

        *crcp = crc32_pclmul_le(*crcp, data, len);
        return 0;
}

/* No final XOR 0xFFFFFFFF, like crc32_le */
static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
                                u8 *out)
{
        *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len));
        return 0;
}

static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data,
                              unsigned int len, u8 *out)
{
        return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out);
}

static int crc32_pclmul_final(struct shash_desc *desc, u8 *out)
{
        u32 *crcp = shash_desc_ctx(desc);

        *(__le32 *)out = cpu_to_le32p(crcp);
        return 0;
}

static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len,
                                    out);
}

static struct shash_alg alg = {
        .setkey         = crc32_pclmul_setkey,
        .init           = crc32_pclmul_init,
        .update         = crc32_pclmul_update,
        .final          = crc32_pclmul_final,
        .finup          = crc32_pclmul_finup,
        .digest         = crc32_pclmul_digest,
        .descsize       = sizeof(u32),
        .digestsize     = CHKSUM_DIGEST_SIZE,
        .base           = {
                        .cra_name               = "crc32",
                        .cra_driver_name        = "crc32-pclmul",
                        .cra_priority           = 200,
                        .cra_blocksize          = CHKSUM_BLOCK_SIZE,
                        .cra_ctxsize            = sizeof(u32),
                        .cra_module             = THIS_MODULE,
                        .cra_init               = crc32_pclmul_cra_init,
        }
};

static const struct x86_cpu_id crc32pclmul_cpu_id[] = {
        X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id);


static int __init crc32_pclmul_mod_init(void)
{

        if (!x86_match_cpu(crc32pclmul_cpu_id)) {
                pr_info("PCLMULQDQ-NI instructions are not detected.\n");
                return -ENODEV;
        }
        return crypto_register_shash(&alg);
}

static void __exit crc32_pclmul_mod_fini(void)
{
        crypto_unregister_shash(&alg);
}

module_init(crc32_pclmul_mod_init);
module_exit(crc32_pclmul_mod_fini);

MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
MODULE_LICENSE("GPL");

MODULE_ALIAS("crc32");
MODULE_ALIAS("crc32-pclmul");
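Once loaded, the module registers cra_name "crc32" with driver name "crc32-pclmul" at priority 200, so the crypto API prefers it over lower-priority "crc32" implementations. One way to exercise it from user space is the AF_ALG socket interface. The sketch below assumes CONFIG_CRYPTO_USER_API_HASH is enabled, which this patch does not itself provide; the 4-byte "key" it sets is the initial CRC seed consumed by crc32_pclmul_setkey().

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

#ifndef SOL_ALG
#define SOL_ALG 279
#endif

int main(void)
{
        struct sockaddr_alg sa = {
                .salg_family = AF_ALG,
                .salg_type   = "hash",
                .salg_name   = "crc32",         /* cra_name registered above */
        };
        unsigned char seed[4] = { 0xff, 0xff, 0xff, 0xff };     /* initial CRC, little endian */
        unsigned char digest[4];
        const char msg[] = "123456789";
        int tfmfd, opfd;

        tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
        if (tfmfd < 0 || bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
                perror("AF_ALG bind");
                return 1;
        }
        /* the shash "key" is the starting CRC value, see crc32_pclmul_setkey() */
        setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, seed, sizeof(seed));

        opfd = accept(tfmfd, NULL, 0);
        if (opfd < 0) {
                perror("accept");
                return 1;
        }
        write(opfd, msg, strlen(msg));
        read(opfd, digest, sizeof(digest));     /* little-endian CRC, no final inversion */

        printf("crc32 = %02x %02x %02x %02x (LE byte order)\n",
               digest[0], digest[1], digest[2], digest[3]);

        close(opfd);
        close(tfmfd);
        return 0;
}

The four bytes read back are the little-endian CRC with no final inversion, matching what crc32_pclmul_final() writes out.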