author     Yazen Ghannam <yazen.ghannam@linaro.org>    2014-11-19 12:19:37 -0500
committer  Herbert Xu <herbert@gondor.apana.org.au>    2014-11-20 09:39:39 -0500
commit     f6f203faa3ebd8fa229e34424850a0919ded6c10
tree       0099a95e8a8f7dbe059cf626aec5ae0b7d7a0cb9
parent     aa408d6019775c1b4362895df7929a043fa79804
crypto: crc32 - Add ARM64 CRC32 hw accelerated module
This module registers a crc32 algorithm and a crc32c algorithm
that use the optional CRC32 and CRC32C instructions in ARMv8.
Tested on AMD Seattle.
Improvement compared to the crc32c-generic algorithm:
The TCRYPT CRC32C speed test shows a ~450% speedup.
Simple dd write tests to a btrfs filesystem show a ~30% speedup.
Signed-off-by: Yazen Ghannam <yazen.ghannam@linaro.org>
Acked-by: Steve Capper <steve.capper@linaro.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 arch/arm64/crypto/Kconfig       |   4 +
 arch/arm64/crypto/Makefile      |   4 +
 arch/arm64/crypto/crc32-arm64.c | 274 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 282 insertions(+), 0 deletions(-)
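(Editorial note, not part of the patch: kernel users do not call the new crc32_arm64_le_hw()/crc32c_arm64_le_hw() helpers directly; they reach them through the generic crypto shash API once this driver wins priority selection. The sketch below shows how another kernel module might compute a CRC32C that way. The function name example_crc32c and the buffer arguments are illustrative only.)

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/types.h>
    #include <crypto/hash.h>

    /* Illustrative helper (not in this patch): CRC32C of buf via the shash API. */
    static int example_crc32c(const u8 *buf, unsigned int len, u8 digest[4])
    {
            struct crypto_shash *tfm;
            struct shash_desc *desc;
            int err;

            /* Picks the highest-priority "crc32c", e.g. crc32c-arm64-hw when loaded. */
            tfm = crypto_alloc_shash("crc32c", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
            if (!desc) {
                    crypto_free_shash(tfm);
                    return -ENOMEM;
            }
            desc->tfm = tfm;

            /* init + update + final in one call; with this driver selected, this
             * lands in the patch's chksumc_digest() hook. */
            err = crypto_shash_digest(desc, buf, len, digest);

            kfree(desc);
            crypto_free_shash(tfm);
            return err;
    }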
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 5562652c5316..c1a0468f7156 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -50,4 +50,8 @@ config CRYPTO_AES_ARM64_NEON_BLK
         select CRYPTO_AES
         select CRYPTO_ABLK_HELPER
 
+config CRYPTO_CRC32_ARM64
+        tristate "CRC32 and CRC32C using optional ARMv8 instructions"
+        depends on ARM64
+        select CRYPTO_HASH
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index a3f935fde975..5720608c50b1 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -34,5 +34,9 @@ AFLAGS_aes-neon.o := -DINTERLEAVE=4
 
 CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
 
+obj-$(CONFIG_CRYPTO_CRC32_ARM64) += crc32-arm64.o
+
+CFLAGS_crc32-arm64.o := -mcpu=generic+crc
+
 $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
         $(call if_changed_rule,cc_o_c)
diff --git a/arch/arm64/crypto/crc32-arm64.c b/arch/arm64/crypto/crc32-arm64.c
new file mode 100644
index 000000000000..9499199924ae
--- /dev/null
+++ b/arch/arm64/crypto/crc32-arm64.c
@@ -0,0 +1,274 @@
+/*
+ * crc32-arm64.c - CRC32 and CRC32C using optional ARMv8 instructions
+ *
+ * Module based on crypto/crc32c_generic.c
+ *
+ * CRC32 loop taken from Ed Nevill's Hadoop CRC patch
+ * http://mail-archives.apache.org/mod_mbox/hadoop-common-dev/201406.mbox/%3C1403687030.3355.19.camel%40localhost.localdomain%3E
+ *
+ * Using inline assembly instead of intrinsics in order to be backwards
+ * compatible with older compilers.
+ *
+ * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/unaligned/access_ok.h>
+#include <linux/cpufeature.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+MODULE_AUTHOR("Yazen Ghannam <yazen.ghannam@linaro.org>");
+MODULE_DESCRIPTION("CRC32 and CRC32C using optional ARMv8 instructions");
+MODULE_LICENSE("GPL v2");
+
+#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+
+static u32 crc32_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
+{
+        s64 length = len;
+
+        while ((length -= sizeof(u64)) >= 0) {
+                CRC32X(crc, get_unaligned_le64(p));
+                p += sizeof(u64);
+        }
+
+        /* The following is more efficient than the straight loop */
+        if (length & sizeof(u32)) {
+                CRC32W(crc, get_unaligned_le32(p));
+                p += sizeof(u32);
+        }
+        if (length & sizeof(u16)) {
+                CRC32H(crc, get_unaligned_le16(p));
+                p += sizeof(u16);
+        }
+        if (length & sizeof(u8))
+                CRC32B(crc, *p);
+
+        return crc;
+}
+
+static u32 crc32c_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
+{
+        s64 length = len;
+
+        while ((length -= sizeof(u64)) >= 0) {
+                CRC32CX(crc, get_unaligned_le64(p));
+                p += sizeof(u64);
+        }
+
+        /* The following is more efficient than the straight loop */
+        if (length & sizeof(u32)) {
+                CRC32CW(crc, get_unaligned_le32(p));
+                p += sizeof(u32);
+        }
+        if (length & sizeof(u16)) {
+                CRC32CH(crc, get_unaligned_le16(p));
+                p += sizeof(u16);
+        }
+        if (length & sizeof(u8))
+                CRC32CB(crc, *p);
+
+        return crc;
+}
+
+#define CHKSUM_BLOCK_SIZE       1
+#define CHKSUM_DIGEST_SIZE      4
+
+struct chksum_ctx {
+        u32 key;
+};
+
+struct chksum_desc_ctx {
+        u32 crc;
+};
+
+static int chksum_init(struct shash_desc *desc)
+{
+        struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
+        struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+        ctx->crc = mctx->key;
+
+        return 0;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int chksum_setkey(struct crypto_shash *tfm, const u8 *key,
+                         unsigned int keylen)
+{
+        struct chksum_ctx *mctx = crypto_shash_ctx(tfm);
+
+        if (keylen != sizeof(mctx->key)) {
+                crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+                return -EINVAL;
+        }
+        mctx->key = get_unaligned_le32(key);
+        return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+                         unsigned int length)
+{
+        struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+        ctx->crc = crc32_arm64_le_hw(ctx->crc, data, length);
+        return 0;
+}
+
+static int chksumc_update(struct shash_desc *desc, const u8 *data,
+                          unsigned int length)
+{
+        struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+        ctx->crc = crc32c_arm64_le_hw(ctx->crc, data, length);
+        return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+        struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+        put_unaligned_le32(~ctx->crc, out);
+        return 0;
+}
+
+static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
+{
+        put_unaligned_le32(~crc32_arm64_le_hw(crc, data, len), out);
+        return 0;
+}
+
+static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
+{
+        put_unaligned_le32(~crc32c_arm64_le_hw(crc, data, len), out);
+        return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+                        unsigned int len, u8 *out)
+{
+        struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+        return __chksum_finup(ctx->crc, data, len, out);
+}
+
+static int chksumc_finup(struct shash_desc *desc, const u8 *data,
+                         unsigned int len, u8 *out)
+{
+        struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+        return __chksumc_finup(ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+                         unsigned int length, u8 *out)
+{
+        struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
+
+        return __chksum_finup(mctx->key, data, length, out);
+}
+
+static int chksumc_digest(struct shash_desc *desc, const u8 *data,
+                          unsigned int length, u8 *out)
+{
+        struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
+
+        return __chksumc_finup(mctx->key, data, length, out);
+}
+
+static int crc32_cra_init(struct crypto_tfm *tfm)
+{
+        struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
+
+        mctx->key = ~0;
+        return 0;
+}
+
+static struct shash_alg crc32_alg = {
+        .digestsize = CHKSUM_DIGEST_SIZE,
+        .setkey     = chksum_setkey,
+        .init       = chksum_init,
+        .update     = chksum_update,
+        .final      = chksum_final,
+        .finup      = chksum_finup,
+        .digest     = chksum_digest,
+        .descsize   = sizeof(struct chksum_desc_ctx),
+        .base       = {
+                .cra_name        = "crc32",
+                .cra_driver_name = "crc32-arm64-hw",
+                .cra_priority    = 300,
+                .cra_blocksize   = CHKSUM_BLOCK_SIZE,
+                .cra_alignmask   = 0,
+                .cra_ctxsize     = sizeof(struct chksum_ctx),
+                .cra_module      = THIS_MODULE,
+                .cra_init        = crc32_cra_init,
+        }
+};
+
+static struct shash_alg crc32c_alg = {
+        .digestsize = CHKSUM_DIGEST_SIZE,
+        .setkey     = chksum_setkey,
+        .init       = chksum_init,
+        .update     = chksumc_update,
+        .final      = chksum_final,
+        .finup      = chksumc_finup,
+        .digest     = chksumc_digest,
+        .descsize   = sizeof(struct chksum_desc_ctx),
+        .base       = {
+                .cra_name        = "crc32c",
+                .cra_driver_name = "crc32c-arm64-hw",
+                .cra_priority    = 300,
+                .cra_blocksize   = CHKSUM_BLOCK_SIZE,
+                .cra_alignmask   = 0,
+                .cra_ctxsize     = sizeof(struct chksum_ctx),
+                .cra_module      = THIS_MODULE,
+                .cra_init        = crc32_cra_init,
+        }
+};
+
+static int __init crc32_mod_init(void)
+{
+        int err;
+
+        err = crypto_register_shash(&crc32_alg);
+
+        if (err)
+                return err;
+
+        err = crypto_register_shash(&crc32c_alg);
+
+        if (err) {
+                crypto_unregister_shash(&crc32_alg);
+                return err;
+        }
+
+        return 0;
+}
+
+static void __exit crc32_mod_exit(void)
+{
+        crypto_unregister_shash(&crc32_alg);
+        crypto_unregister_shash(&crc32c_alg);
+}
+
+module_cpu_feature_match(CRC32, crc32_mod_init);
+module_exit(crc32_mod_exit);
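(Editorial note, not part of the patch: the header comment in crc32-arm64.c explains that inline assembly is used instead of intrinsics for compatibility with older compilers. For comparison, below is a rough user-space sketch of the same CRC32C loop written with the ACLE intrinsics from <arm_acle.h>, assuming a little-endian arm64 target and a compiler invoked with CRC support, e.g. -march=armv8-a+crc. The function name crc32c_intrinsics is made up for illustration.)

    #include <arm_acle.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: same structure as the patch's crc32c_arm64_le_hw(), using intrinsics.
     * memcpy() stands in for get_unaligned_le64()/le32()/le16(); on a little-endian
     * arm64 target the result is the same. */
    static uint32_t crc32c_intrinsics(uint32_t crc, const uint8_t *p, unsigned int len)
    {
            int64_t length = len;
            uint64_t v64;
            uint32_t v32;
            uint16_t v16;

            while ((length -= sizeof(uint64_t)) >= 0) {
                    memcpy(&v64, p, sizeof(v64));
                    crc = __crc32cd(crc, v64);
                    p += sizeof(uint64_t);
            }

            /*
             * After the loop, length is negative and its low three bits equal the
             * number of trailing bytes (0..7), so these bit tests consume exactly
             * the remainder - the same trick the patch relies on.
             */
            if (length & sizeof(uint32_t)) {
                    memcpy(&v32, p, sizeof(v32));
                    crc = __crc32cw(crc, v32);
                    p += sizeof(uint32_t);
            }
            if (length & sizeof(uint16_t)) {
                    memcpy(&v16, p, sizeof(v16));
                    crc = __crc32ch(crc, v16);
                    p += sizeof(uint16_t);
            }
            if (length & sizeof(uint8_t))
                    crc = __crc32cb(crc, *p);

            return crc;
    }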