author      Ard Biesheuvel <ard.biesheuvel@linaro.org>    2016-12-05 13:42:27 -0500
committer   Herbert Xu <herbert@gondor.apana.org.au>      2016-12-07 07:01:22 -0500
commit      8fefde90e90c9f5c2770e46ceb127813d3f20c34 (patch)
tree        75fcea660971403632402ae8114ca4d88fe535fe
parent      1d481f1cd8925bd92387983ea1245a0ea0f16d32 (diff)
crypto: arm64/crc32 - accelerated support based on x86 SSE implementation
This is a combination of the Intel algorithm implemented using SSE and
PCLMULQDQ instructions from arch/x86/crypto/crc32-pclmul_asm.S, and the
new CRC32 extensions introduced for both 32-bit and 64-bit ARM in
version 8 of the architecture. Two versions of the above combo are
provided, one for CRC32 and one for CRC32C.

The PMULL/NEON algorithm is faster, but operates on blocks of at least
64 bytes, and on multiples of 16 bytes only. For the remaining input,
or for all input on systems that lack the PMULL 64x64->128 instructions,
the CRC32 instructions will be used.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
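As a rough illustration of how the glue code splits work between the two
paths, here is a minimal user-space C sketch modelled on the
crc32_pmull_update() routine added below. crc32_pmull_le(),
fallback_crc32(), SCALE_F and PMULL_MIN_LEN are names taken from the
patch; crc32_dispatch() is purely illustrative, and the kernel_neon_begin
/ kernel_neon_end bracketing around the NEON call is omitted:

    #include <stdint.h>
    #include <stddef.h>

    #define SCALE_F        16   /* size of a NEON register */
    #define PMULL_MIN_LEN  64   /* smallest block the PMULL path accepts */

    /* stand-ins for the routines provided by the patch */
    uint32_t crc32_pmull_le(const uint8_t *buf, uint64_t len, uint32_t crc);
    uint32_t fallback_crc32(uint32_t crc, const uint8_t *buf, size_t len);

    static uint32_t crc32_dispatch(uint32_t crc, const uint8_t *data,
                                   size_t length)
    {
        /* align the pointer to 16 bytes using the scalar fallback */
        if ((uintptr_t)data % SCALE_F) {
            size_t l = SCALE_F - ((uintptr_t)data % SCALE_F);
            if (l > length)
                l = length;
            crc = fallback_crc32(crc, data, l);
            data += l;
            length -= l;
        }

        /* bulk of the input: PMULL/NEON on 16-byte multiples, >= 64 bytes */
        if (length >= PMULL_MIN_LEN) {
            size_t l = length - (length % SCALE_F);
            crc = crc32_pmull_le(data, l, crc);
            data += l;
            length -= l;
        }

        /* remaining tail goes through the CRC32-instruction/table fallback */
        if (length > 0)
            crc = fallback_crc32(crc, data, length);

        return crc;
    }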
-rw-r--r--  arch/arm64/crypto/Kconfig          |   6
-rw-r--r--  arch/arm64/crypto/Makefile         |   3
-rw-r--r--  arch/arm64/crypto/crc32-ce-core.S  | 266
-rw-r--r--  arch/arm64/crypto/crc32-ce-glue.c  | 212
4 files changed, 487 insertions(+), 0 deletions(-)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index f1e6dd0fc174..450a85df041a 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -36,6 +36,11 @@ config CRYPTO_CRCT10DIF_ARM64_CE
 	depends on KERNEL_MODE_NEON && CRC_T10DIF
 	select CRYPTO_HASH
 
+config CRYPTO_CRC32_ARM64_CE
+	tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC32
+	select CRYPTO_HASH
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
@@ -66,4 +71,5 @@ config CRYPTO_CRC32_ARM64
 	tristate "CRC32 and CRC32C using optional ARMv8 instructions"
 	depends on ARM64
 	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index d3f1ba6d4771..aa8888d7b744 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -20,6 +20,9 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
 crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 
+obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
+crc32-ce-y := crc32-ce-core.o crc32-ce-glue.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
 
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
new file mode 100644
index 000000000000..18f5a8442276
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-core.S
@@ -0,0 +1,266 @@
+/*
+ * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *          Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align	6
+	.cpu	generic+crypto+crc
+
+.Lcrc32_constants:
+	/*
+	 * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
+	 * #define CONSTANT_R1 0x154442bd4LL
+	 *
+	 * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
+	 * #define CONSTANT_R2 0x1c6e41596LL
+	 */
+	.octa	0x00000001c6e415960000000154442bd4
+
+	/*
+	 * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
+	 * #define CONSTANT_R3 0x1751997d0LL
+	 *
+	 * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
+	 * #define CONSTANT_R4 0x0ccaa009eLL
+	 */
+	.octa	0x00000000ccaa009e00000001751997d0
+
+	/*
+	 * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
+	 * #define CONSTANT_R5 0x163cd6124LL
+	 */
+	.quad	0x0000000163cd6124
+	.quad	0x00000000FFFFFFFF
+
+	/*
+	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+	 *
+	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+	 * = 0x1F7011641LL
+	 * #define CONSTANT_RU 0x1F7011641LL
+	 */
+	.octa	0x00000001F701164100000001DB710641
+
+.Lcrc32c_constants:
+	.octa	0x000000009e4addf800000000740eef02
+	.octa	0x000000014cd00bd600000000f20c0dfe
+	.quad	0x00000000dd45aab8
+	.quad	0x00000000FFFFFFFF
+	.octa	0x00000000dea713f10000000105ec76f0
+
+	vCONSTANT	.req	v0
+	dCONSTANT	.req	d0
+	qCONSTANT	.req	q0
+
+	BUF		.req	x0
+	LEN		.req	x1
+	CRC		.req	x2
+
+	vzr		.req	v9
+
+	/**
+	 * Calculate crc32
+	 * BUF - buffer
+	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+	 * CRC - initial crc32
+	 * return %eax crc32
+	 * uint crc32_pmull_le(unsigned char const *buffer,
+	 *                     size_t len, uint crc32)
+	 */
+ENTRY(crc32_pmull_le)
+	adr	x3, .Lcrc32_constants
+	b	0f
+
+ENTRY(crc32c_pmull_le)
+	adr	x3, .Lcrc32c_constants
+
+0:	bic	LEN, LEN, #15
+	ld1	{v1.16b-v4.16b}, [BUF], #0x40
+	movi	vzr.16b, #0
+	fmov	dCONSTANT, CRC
+	eor	v1.16b, v1.16b, vCONSTANT.16b
+	sub	LEN, LEN, #0x40
+	cmp	LEN, #0x40
+	b.lt	less_64
+
+	ldr	qCONSTANT, [x3]
+
+loop_64:		/* 64 bytes Full cache line folding */
+	sub	LEN, LEN, #0x40
+
+	pmull2	v5.1q, v1.2d, vCONSTANT.2d
+	pmull2	v6.1q, v2.2d, vCONSTANT.2d
+	pmull2	v7.1q, v3.2d, vCONSTANT.2d
+	pmull2	v8.1q, v4.2d, vCONSTANT.2d
+
+	pmull	v1.1q, v1.1d, vCONSTANT.1d
+	pmull	v2.1q, v2.1d, vCONSTANT.1d
+	pmull	v3.1q, v3.1d, vCONSTANT.1d
+	pmull	v4.1q, v4.1d, vCONSTANT.1d
+
+	eor	v1.16b, v1.16b, v5.16b
+	ld1	{v5.16b}, [BUF], #0x10
+	eor	v2.16b, v2.16b, v6.16b
+	ld1	{v6.16b}, [BUF], #0x10
+	eor	v3.16b, v3.16b, v7.16b
+	ld1	{v7.16b}, [BUF], #0x10
+	eor	v4.16b, v4.16b, v8.16b
+	ld1	{v8.16b}, [BUF], #0x10
+
+	eor	v1.16b, v1.16b, v5.16b
+	eor	v2.16b, v2.16b, v6.16b
+	eor	v3.16b, v3.16b, v7.16b
+	eor	v4.16b, v4.16b, v8.16b
+
+	cmp	LEN, #0x40
+	b.ge	loop_64
+
+less_64:		/* Folding cache line into 128bit */
+	ldr	qCONSTANT, [x3, #16]
+
+	pmull2	v5.1q, v1.2d, vCONSTANT.2d
+	pmull	v1.1q, v1.1d, vCONSTANT.1d
+	eor	v1.16b, v1.16b, v5.16b
+	eor	v1.16b, v1.16b, v2.16b
+
+	pmull2	v5.1q, v1.2d, vCONSTANT.2d
+	pmull	v1.1q, v1.1d, vCONSTANT.1d
+	eor	v1.16b, v1.16b, v5.16b
+	eor	v1.16b, v1.16b, v3.16b
+
+	pmull2	v5.1q, v1.2d, vCONSTANT.2d
+	pmull	v1.1q, v1.1d, vCONSTANT.1d
+	eor	v1.16b, v1.16b, v5.16b
+	eor	v1.16b, v1.16b, v4.16b
+
+	cbz	LEN, fold_64
+
+loop_16:		/* Folding rest buffer into 128bit */
+	subs	LEN, LEN, #0x10
+
+	ld1	{v2.16b}, [BUF], #0x10
+	pmull2	v5.1q, v1.2d, vCONSTANT.2d
+	pmull	v1.1q, v1.1d, vCONSTANT.1d
+	eor	v1.16b, v1.16b, v5.16b
+	eor	v1.16b, v1.16b, v2.16b
+
+	b.ne	loop_16
+
+fold_64:
+	/* perform the last 64 bit fold, also adds 32 zeroes
+	 * to the input stream */
+	ext	v2.16b, v1.16b, v1.16b, #8
+	pmull2	v2.1q, v2.2d, vCONSTANT.2d
+	ext	v1.16b, v1.16b, vzr.16b, #8
+	eor	v1.16b, v1.16b, v2.16b
+
+	/* final 32-bit fold */
+	ldr	dCONSTANT, [x3, #32]
+	ldr	d3, [x3, #40]
+
+	ext	v2.16b, v1.16b, vzr.16b, #4
+	and	v1.16b, v1.16b, v3.16b
+	pmull	v1.1q, v1.1d, vCONSTANT.1d
+	eor	v1.16b, v1.16b, v2.16b
+
+	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+	ldr	qCONSTANT, [x3, #48]
+
+	and	v2.16b, v1.16b, v3.16b
+	ext	v2.16b, vzr.16b, v2.16b, #8
+	pmull2	v2.1q, v2.2d, vCONSTANT.2d
+	and	v2.16b, v2.16b, v3.16b
+	pmull	v2.1q, v2.1d, vCONSTANT.1d
+	eor	v1.16b, v1.16b, v2.16b
+	mov	w0, v1.s[1]
+
+	ret
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+	.macro	__crc32, c
+0:	subs	x2, x2, #16
+	b.mi	8f
+	ldp	x3, x4, [x1], #16
+CPU_BE(	rev	x3, x3	)
+CPU_BE(	rev	x4, x4	)
+	crc32\c\()x	w0, w0, x3
+	crc32\c\()x	w0, w0, x4
+	b.ne	0b
+	ret
+
+8:	tbz	x2, #3, 4f
+	ldr	x3, [x1], #8
+CPU_BE(	rev	x3, x3	)
+	crc32\c\()x	w0, w0, x3
+4:	tbz	x2, #2, 2f
+	ldr	w3, [x1], #4
+CPU_BE(	rev	w3, w3	)
+	crc32\c\()w	w0, w0, w3
+2:	tbz	x2, #1, 1f
+	ldrh	w3, [x1], #2
+CPU_BE(	rev16	w3, w3	)
+	crc32\c\()h	w0, w0, w3
+1:	tbz	x2, #0, 0f
+	ldrb	w3, [x1]
+	crc32\c\()b	w0, w0, w3
+0:	ret
+	.endm
+
+	.align	5
+ENTRY(crc32_armv8_le)
+	__crc32
+ENDPROC(crc32_armv8_le)
+
+	.align	5
+ENTRY(crc32c_armv8_le)
+	__crc32	c
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
new file mode 100644
index 000000000000..8594127d5e01
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -0,0 +1,212 @@
+/*
+ * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN	64L	/* minimum size of buffer
+				 * for crc32_pmull_le_16 */
+#define SCALE_F		16L	/* size of NEON register */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
+static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
+
+static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = 0;
+	return 0;
+}
+
+static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+	return 0;
+}
+
+static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
+			      unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = le32_to_cpup((__le32 *)key);
+	return 0;
+}
+
+static int crc32_pmull_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = *mctx;
+	return 0;
+}
+
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if ((u64)data % SCALE_F) {
+		l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
+
+		*crc = fallback_crc32(*crc, data, l);
+
+		data += l;
+		length -= l;
+	}
+
+	if (length >= PMULL_MIN_LEN) {
+		l = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*crc = crc32_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0)
+		*crc = fallback_crc32(*crc, data, length);
+
+	return 0;
+}
+
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if ((u64)data % SCALE_F) {
+		l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
+
+		*crc = fallback_crc32c(*crc, data, l);
+
+		data += l;
+		length -= l;
+	}
+
+	if (length >= PMULL_MIN_LEN) {
+		l = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*crc = crc32c_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0) {
+		*crc = fallback_crc32c(*crc, data, length);
+	}
+
+	return 0;
+}
+
+static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(*crc, out);
+	return 0;
+}
+
+static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(~*crc, out);
+	return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32_pmull_update,
+	.final			= crc32_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32_pmull_cra_init,
+	.base.cra_name		= "crc32",
+	.base.cra_driver_name	= "crc32-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32c_pmull_update,
+	.final			= crc32c_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32c_pmull_cra_init,
+	.base.cra_name		= "crc32c",
+	.base.cra_driver_name	= "crc32c-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init crc32_pmull_mod_init(void)
+{
+	if (elf_hwcap & HWCAP_CRC32) {
+		fallback_crc32 = crc32_armv8_le;
+		fallback_crc32c = crc32c_armv8_le;
+	} else {
+		fallback_crc32 = crc32_le;
+		fallback_crc32c = __crc32c_le;
+	}
+
+	return crypto_register_shashes(crc32_pmull_algs,
+				       ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void)
+{
+	crypto_unregister_shashes(crc32_pmull_algs,
+				  ARRAY_SIZE(crc32_pmull_algs));
+}
+
+module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");