author		Ard Biesheuvel <ard.biesheuvel@linaro.org>	2016-12-05 13:42:26 -0500
committer	Herbert Xu <herbert@gondor.apana.org.au>	2016-12-07 07:01:21 -0500
commit		1d481f1cd8925bd92387983ea1245a0ea0f16d32 (patch)
tree		255fbaada604e7a2cce1dc0a33cde04b2ae88472
parent		6ef5737f39314907704d68719b74fcca11f4f342 (diff)
crypto: arm/crct10dif - port x86 SSE implementation to ARM
This is a transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S, but simplified to only
operate on buffers that are 16 byte aligned (but of any size).
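
As a cross-check, the CRC this code must compute can be modeled
bit-serially in a few lines of C (a reference sketch, not part of this
patch; the kernel's generic fallback is crc_t10dif_generic()):

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Reference model of CRC-T10DIF: polynomial 0x8bb7, MSB-first,
     * no input/output reflection, zero initial value.
     */
    static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf,
                                   size_t len)
    {
            while (len--) {
                    int i;

                    crc ^= (uint16_t)*buf++ << 8;  /* next byte into the top */
                    for (i = 0; i < 8; i++)        /* shift/reduce per bit */
                            crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
                                                 : crc << 1;
            }
            return crc;
    }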
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 arch/arm/crypto/Kconfig             |   5 +
 arch/arm/crypto/Makefile            |   2 +
 arch/arm/crypto/crct10dif-ce-core.S | 427 ++++++++++++++++++++++++++++++++
 arch/arm/crypto/crct10dif-ce-glue.c | 101 ++++++++
 4 files changed, 535 insertions(+), 0 deletions(-)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index dd90e389708e..491a6edfeff6 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
 	  that is part of the ARMv8 Crypto Extensions
 
+config CRYPTO_CRCT10DIF_ARM_CE
+	tristate "CRCT10DIF digest algorithm using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	select CRYPTO_HASH
+
 endif
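
For reference, a hypothetical .config fragment that satisfies the
dependencies above and builds the new driver as a module:

    CONFIG_KERNEL_MODE_NEON=y
    CONFIG_CRC_T10DIF=y
    CONFIG_CRYPTO_CRCT10DIF_ARM_CE=m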
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..fc77265014b7 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -36,6 +37,7 @@ sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 
 quiet_cmd_perl = PERL $@
 cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
new file mode 100644
index 000000000000..ce45ba0c0687
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -0,0 +1,427 @@
+//
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Function API (as declared in the glue code for this ARM port):
+// u16 crc_t10dif_pmull(
+//		u16 init_crc,		// initial CRC value, 16 bits
+//		const u8 *buf,		// buffer pointer to calculate CRC on
+//		u32 len			// buffer length in bytes
+// );
+//
+// Reference paper titled "Fast CRC Computation for Generic
+// Polynomials Using PCLMULQDQ Instruction"
+// URL: http://www.intel.com/content/dam/www/public/us/en/documents
+// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...)	code
+#endif
+
+	.text
+	.fpu		crypto-neon-fp-armv8
+
+	arg1_low32	.req	r0
+	arg2		.req	r1
+	arg3		.req	r2
+
+	qzr		.req	q13
+
+	q0l		.req	d0
+	q0h		.req	d1
+	q1l		.req	d2
+	q1h		.req	d3
+	q2l		.req	d4
+	q2h		.req	d5
+	q3l		.req	d6
+	q3h		.req	d7
+	q4l		.req	d8
+	q4h		.req	d9
+	q5l		.req	d10
+	q5h		.req	d11
+	q6l		.req	d12
+	q6h		.req	d13
+	q7l		.req	d14
+	q7h		.req	d15
+
+ENTRY(crc_t10dif_pmull)
+	vmov.i8		qzr, #0			// init zero register
+
+	// adjust the 16-bit initial_crc value, scale it to 32 bits
+	lsl		arg1_low32, arg1_low32, #16
+
+	// check if smaller than 256
+	cmp		arg3, #256
+
+	// for sizes less than 128, we can't fold 64B at a time...
+	blt		_less_than_128
+
+	// load the initial crc value
+	// crc value does not need to be byte-reflected, but it needs
+	// to be moved to the high part of the register.
+	// because data will be byte-reflected and will align with
+	// initial crc at correct place.
+	vmov		s0, arg1_low32		// initial crc
+	vext.8		q10, qzr, q0, #4
+
+	// receive the initial 64B data, xor the initial crc value
+	vld1.64		{q0-q1}, [arg2, :128]!
+	vld1.64		{q2-q3}, [arg2, :128]!
+	vld1.64		{q4-q5}, [arg2, :128]!
+	vld1.64		{q6-q7}, [arg2, :128]!
+CPU_LE(	vrev64.8	q0, q0		)
+CPU_LE(	vrev64.8	q1, q1		)
+CPU_LE(	vrev64.8	q2, q2		)
+CPU_LE(	vrev64.8	q3, q3		)
+CPU_LE(	vrev64.8	q4, q4		)
+CPU_LE(	vrev64.8	q5, q5		)
+CPU_LE(	vrev64.8	q6, q6		)
+CPU_LE(	vrev64.8	q7, q7		)
+
+	vswp		d0, d1
+	vswp		d2, d3
+	vswp		d4, d5
+	vswp		d6, d7
+	vswp		d8, d9
+	vswp		d10, d11
+	vswp		d12, d13
+	vswp		d14, d15
+
+	// XOR the initial_crc value
+	veor.8		q0, q0, q10
+
+	adr		ip, rk3
+	vld1.64		{q10}, [ip, :128]	// xmm10 has rk3 and rk4
+
+	//
+	// we subtract 256 instead of 128 to save one instruction from the loop
+	//
+	sub		arg3, arg3, #256
+
+	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
+	// buffer. The _fold_64_B_loop will fold 64B at a time
+	// until we have 64+y Bytes of buffer
+
+
+	// fold 64B at a time. This section of the code folds 4 vector
+	// registers in parallel
+_fold_64_B_loop:
+
+	.macro		fold64, reg1, reg2
+	vld1.64		{q11-q12}, [arg2, :128]!
+
+	vmull.p64	q8, \reg1\()h, d21
+	vmull.p64	\reg1, \reg1\()l, d20
+	vmull.p64	q9, \reg2\()h, d21
+	vmull.p64	\reg2, \reg2\()l, d20
+
+CPU_LE(	vrev64.8	q11, q11	)
+CPU_LE(	vrev64.8	q12, q12	)
+	vswp		d22, d23
+	vswp		d24, d25
+
+	veor.8		\reg1, \reg1, q8
+	veor.8		\reg2, \reg2, q9
+	veor.8		\reg1, \reg1, q11
+	veor.8		\reg2, \reg2, q12
+	.endm
+
+	fold64		q0, q1
+	fold64		q2, q3
+	fold64		q4, q5
+	fold64		q6, q7
+
+	subs		arg3, arg3, #128
+
+	// check if there is another 64B in the buffer to be able to fold
+	bge		_fold_64_B_loop
+
+	// at this point, the buffer pointer is pointing at the last y Bytes
+	// of the buffer, and the 64B of folded data is in 4 of the vector
+	// registers: v0, v1, v2, v3
+
+	// fold the 8 vector registers to 1 vector register with different
+	// constants
+
+	adr		ip, rk9
+	vld1.64		{q10}, [ip, :128]!
+
+	.macro		fold16, reg, rk
+	vmull.p64	q8, \reg\()l, d20
+	vmull.p64	\reg, \reg\()h, d21
+	.ifnb		\rk
+	vld1.64		{q10}, [ip, :128]!
+	.endif
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, \reg
+	.endm
+
+	fold16		q0, rk11
+	fold16		q1, rk13
+	fold16		q2, rk15
+	fold16		q3, rk17
+	fold16		q4, rk19
+	fold16		q5, rk1
+	fold16		q6
+
+	// instead of 64, we add 48 to the loop counter to save 1 instruction
+	// from the loop. instead of a cmp instruction, we use the negative
+	// flag with the jl instruction
+	adds		arg3, arg3, #(128-16)
+	blt		_final_reduction_for_128
+
+	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
+	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
+	// continue folding 16B at a time
+
+_16B_reduction_loop:
+	vmull.p64	q8, d14, d20
+	vmull.p64	q7, d15, d21
+	veor.8		q7, q7, q8
+
+	vld1.64		{q0}, [arg2, :128]!
+CPU_LE(	vrev64.8	q0, q0		)
+	vswp		d0, d1
+	veor.8		q7, q7, q0
+	subs		arg3, arg3, #16
+
+	// instead of a cmp instruction, we utilize the flags with the
+	// jge instruction equivalent of: cmp arg3, 16-16
+	// check if there is any more 16B in the buffer to be able to fold
+	bge		_16B_reduction_loop
+
+	// now we have 16+z bytes left to reduce, where 0<= z < 16.
+	// first, we reduce the data in the xmm7 register
+
+_final_reduction_for_128:
+	// check if any more data to fold. If not, compute the CRC of
+	// the final 128 bits
+	adds		arg3, arg3, #16
+	beq		_128_done
+
+	// here we are getting data that is less than 16 bytes.
+	// since we know that there was data before the pointer, we can
+	// offset the input pointer before the actual point, to receive
+	// exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_regs:
+	add		arg2, arg2, arg3
+	sub		arg2, arg2, #16
+	vld1.64		{q1}, [arg2]
+CPU_LE(	vrev64.8	q1, q1		)
+	vswp		d2, d3
+
+	// get rid of the extra data that was loaded before
+	// load the shift constant
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, arg3
+	vld1.8		{q0}, [ip]
+
+	// shift v2 to the left by arg3 bytes
+	vtbl.8		d4, {d14-d15}, d0
+	vtbl.8		d5, {d14-d15}, d1
+
+	// shift v7 to the right by 16-arg3 bytes
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vtbl.8		d18, {d14-d15}, d0
+	vtbl.8		d19, {d14-d15}, d1
+
+	// blend
+	vshr.s8		q0, q0, #7		// convert to 8-bit mask
+	vbsl.8		q0, q2, q1
+
+	// fold 16 Bytes
+	vmull.p64	q8, d18, d20
+	vmull.p64	q7, d19, d21
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, q0
+
+_128_done:
+	// compute crc of a 128-bit value
+	vldr		d20, rk5
+	vldr		d21, rk6		// rk5 and rk6 in xmm10
+
+	// 64b fold
+	vext.8		q0, qzr, q7, #8
+	vmull.p64	q7, d15, d20
+	veor.8		q7, q7, q0
+
+	// 32b fold
+	vext.8		q0, q7, qzr, #12
+	vmov		s31, s3
+	vmull.p64	q0, d0, d21
+	veor.8		q7, q0, q7
+
+	// barrett reduction
+_barrett:
+	vldr		d20, rk7
+	vldr		d21, rk8
+
+	vmull.p64	q0, d15, d20
+	vext.8		q0, qzr, q0, #12
+	vmull.p64	q0, d1, d21
+	vext.8		q0, qzr, q0, #12
+	veor.8		q7, q7, q0
+	vmov		r0, s29
+
+_cleanup:
+	// scale the result back to 16 bits
+	lsr		r0, r0, #16
+	bx		lr
+
+_less_than_128:
+	teq		arg3, #0
+	beq		_cleanup
+
+	vmov.i8		q0, #0
+	vmov		s3, arg1_low32		// get the initial crc value
+
+	vld1.64		{q7}, [arg2, :128]!
+CPU_LE(	vrev64.8	q7, q7		)
+	vswp		d14, d15
+	veor.8		q7, q7, q0
+
+	cmp		arg3, #16
+	beq		_128_done		// exactly 16 left
+	blt		_less_than_16_left
+
+	// now if there is, load the constants
+	vldr		d20, rk1
+	vldr		d21, rk2		// rk1 and rk2 in xmm10
+
+	// check if there is enough buffer to be able to fold 16B at a time
+	subs		arg3, arg3, #32
+	addlt		arg3, arg3, #16
+	blt		_get_last_two_regs
+	b		_16B_reduction_loop
+
+_less_than_16_left:
+	// shl r9, 4
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, arg3
+	vld1.8		{q0}, [ip]
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vtbl.8		d18, {d14-d15}, d0
+	vtbl.8		d15, {d14-d15}, d1
+	vmov		d14, d18
+	b		_128_done
+ENDPROC(crc_t10dif_pmull)
+
+// precomputed constants
+// these constants are precomputed from the poly:
+// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.align		4
+// Q = 0x18BB70000
+// rk1 = 2^(32*3) mod Q << 32
+// rk2 = 2^(32*5) mod Q << 32
+// rk3 = 2^(32*15) mod Q << 32
+// rk4 = 2^(32*17) mod Q << 32
+// rk5 = 2^(32*3) mod Q << 32
+// rk6 = 2^(32*2) mod Q << 32
+// rk7 = floor(2^64/Q)
+// rk8 = Q
+
+rk3:	.quad		0x9d9d000000000000
+rk4:	.quad		0x7cf5000000000000
+rk5:	.quad		0x2d56000000000000
+rk6:	.quad		0x1368000000000000
+rk7:	.quad		0x00000001f65a57f8
+rk8:	.quad		0x000000018bb70000
+rk9:	.quad		0xceae000000000000
+rk10:	.quad		0xbfd6000000000000
+rk11:	.quad		0x1e16000000000000
+rk12:	.quad		0x713c000000000000
+rk13:	.quad		0xf7f9000000000000
+rk14:	.quad		0x80a6000000000000
+rk15:	.quad		0x044c000000000000
+rk16:	.quad		0xe658000000000000
+rk17:	.quad		0xad18000000000000
+rk18:	.quad		0xa497000000000000
+rk19:	.quad		0x6ee3000000000000
+rk20:	.quad		0xe7b5000000000000
+rk1:	.quad		0x2d56000000000000
+rk2:	.quad		0x06df000000000000
+
+tbl_shf_table:
+// use these values for shift constants for the tbl/tbx instruction
+// different alignments result in values as shown:
+//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
+//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
+//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
+//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
+//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
+//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
+//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
+//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
+//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
+//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
+//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
+
+	.byte		0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
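
The rk values above are the folding constants x^(32*n) mod Q, stored in
the upper half of each 64-bit word. A small host-side sketch (a
hypothetical helper, not part of this patch) that should reproduce
entries such as rk5 = 2^(32*3) mod Q << 32:

    #include <stdint.h>
    #include <stdio.h>

    /* compute x^n mod P(x) over GF(2); P has its top bit at position deg */
    static uint64_t gf2_pow_mod(unsigned int n, uint64_t poly, int deg)
    {
            uint64_t r = 1;                         /* the polynomial x^0 */

            while (n--) {
                    r <<= 1;                        /* multiply by x */
                    if (r & (1ULL << deg))          /* degree reached: reduce */
                            r ^= poly;
            }
            return r;
    }

    int main(void)
    {
            /* Q = 0x18bb70000 has degree 32; expect 0x2d56000000000000 */
            printf("rk5 = 0x%016llx\n", (unsigned long long)
                   (gf2_pow_mod(32 * 3, 0x18bb70000ULL, 32) << 32));
            return 0;
    }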
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
new file mode 100644
index 000000000000..d428355cf38d
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -0,0 +1,101 @@
+/*
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
+
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u32 len);
+
+static int crct10dif_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if (!may_use_simd()) {
+		*crc = crc_t10dif_generic(*crc, data, length);
+	} else {
+		if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
+			l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
+				  ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
+
+			*crc = crc_t10dif_generic(*crc, data, l);
+
+			length -= l;
+			data += l;
+		}
+		if (length > 0) {
+			kernel_neon_begin();
+			*crc = crc_t10dif_pmull(*crc, data, length);
+			kernel_neon_end();
+		}
+	}
+	return 0;
+}
+
+static int crct10dif_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crc;
+	return 0;
+}
+
+static struct shash_alg crc_t10dif_alg = {
+	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
+	.init			= crct10dif_init,
+	.update			= crct10dif_update,
+	.final			= crct10dif_final,
+	.descsize		= CRC_T10DIF_DIGEST_SIZE,
+
+	.base.cra_name		= "crct10dif",
+	.base.cra_driver_name	= "crct10dif-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+};
+
+static int __init crc_t10dif_mod_init(void)
+{
+	if (!(elf_hwcap2 & HWCAP2_PMULL))
+		return -ENODEV;
+
+	return crypto_register_shash(&crc_t10dif_alg);
+}
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+	crypto_unregister_shash(&crc_t10dif_alg);
+}
+
+module_init(crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
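
For completeness: callers do not invoke crc_t10dif_pmull() directly but
reach this driver through the shash API (crc_t10dif() in lib/crc-t10dif.c
does exactly that behind the scenes). A minimal sketch of kernel code
exercising the registered algorithm, with error handling trimmed and
example_crct10dif() being a hypothetical name:

    #include <crypto/hash.h>
    #include <linux/err.h>

    /* compute CRC-T10DIF over a buffer via the "crct10dif" shash */
    static u16 example_crct10dif(const u8 *data, unsigned int len)
    {
            struct crypto_shash *tfm;
            u16 out = 0;

            tfm = crypto_alloc_shash("crct10dif", 0, 0);
            if (IS_ERR(tfm))
                    return 0;

            {
                    SHASH_DESC_ON_STACK(desc, tfm);

                    desc->tfm = tfm;
                    desc->flags = 0;   /* no CRYPTO_TFM_REQ_MAY_SLEEP */
                    crypto_shash_digest(desc, data, len, (u8 *)&out);
            }
            crypto_free_shash(tfm);
            return out;
    }

The allocation resolves to whichever implementation of "crct10dif" has
the highest cra_priority; at 200, this driver outranks the generic C
implementation when the hardware supports PMULL.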