path: root/arch/arm/crypto
author      Jussi Kivilinna <jussi.kivilinna@iki.fi>      2014-07-29 12:15:24 -0400
committer   Russell King <rmk+kernel@arm.linux.org.uk>    2014-08-02 03:51:50 -0400
commit      c8611d712ad01289a0b6a83cc93bba3a1ef4e990 (patch)
tree        f3988ae6af0e0719fe501cbe3f040c41a917f345 /arch/arm/crypto
parent      604682551aa511e00e57706ad5d9fcf955ee0323 (diff)
ARM: 8120/1: crypto: sha512: add ARM NEON implementation
This patch adds an ARM NEON assembly implementation of the SHA-512 and SHA-384 algorithms.

tcrypt benchmark results on Cortex-A8, sha512-generic vs sha512-neon-asm:

block-size    bytes/update    old-vs-new
16            16              2.99x
64            16              2.67x
64            64              3.00x
256           16              2.64x
256           64              3.06x
256           256             3.33x
1024          16              2.53x
1024          256             3.39x
1024          1024            3.52x
2048          16              2.50x
2048          256             3.41x
2048          1024            3.54x
2048          2048            3.57x
4096          16              2.49x
4096          256             3.42x
4096          1024            3.56x
4096          4096            3.59x
8192          16              2.48x
8192          256             3.42x
8192          1024            3.56x
8192          4096            3.60x
8192          8192            3.60x

Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
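[Editor's note] For orientation while reading the assembly below: each rounds2_* macro interleaves two scalar SHA-512 rounds as defined in FIPS 180-4, with the Ch and Maj selections done via vbsl. A minimal scalar reference of one round, for comparison only (not part of the patch):

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
        return (x >> n) | (x << (64 - n));
}

/* One SHA-512 round; the NEON macros compute two of these per invocation. */
static void sha512_round(uint64_t s[8], uint64_t k, uint64_t w)
{
        uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint64_t sum1 = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
        uint64_t ch   = (e & f) ^ (~e & g);          /* done with vbsl in the assembly */
        uint64_t t1   = h + sum1 + ch + k + w;

        uint64_t sum0 = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
        uint64_t maj  = (a & b) ^ (a & c) ^ (b & c); /* also via vbsl */
        uint64_t t2   = sum0 + maj;

        /* rotate the working variables */
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

The assembly avoids the final variable rotation by permuting which registers play a..h across successive macro calls.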
Diffstat (limited to 'arch/arm/crypto')
-rw-r--r--   arch/arm/crypto/Makefile              |    2
-rw-r--r--   arch/arm/crypto/sha512-armv7-neon.S   |  455
-rw-r--r--   arch/arm/crypto/sha512_neon_glue.c    |  305
3 files changed, 762 insertions(+), 0 deletions(-)
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 374956d2f896..b48fa341648d 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -6,11 +6,13 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 
 aes-arm-y          := aes-armv4.o aes_glue.o
 aes-arm-bs-y       := aesbs-core.o aesbs-glue.o
 sha1-arm-y         := sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y    := sha1-armv7-neon.o sha1_neon_glue.o
+sha512-arm-neon-y  := sha512-armv7-neon.o sha512_neon_glue.o
 
 quiet_cmd_perl = PERL $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/sha512-armv7-neon.S b/arch/arm/crypto/sha512-armv7-neon.S
new file mode 100644
index 000000000000..fe99472e507c
--- /dev/null
+++ b/arch/arm/crypto/sha512-armv7-neon.S
@@ -0,0 +1,455 @@
/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>


.syntax unified
.code 32
.fpu neon

.text

/* structure of SHA512_CONTEXT */
#define hd_a 0
#define hd_b ((hd_a) + 8)
#define hd_c ((hd_b) + 8)
#define hd_d ((hd_c) + 8)
#define hd_e ((hd_d) + 8)
#define hd_f ((hd_e) + 8)
#define hd_g ((hd_f) + 8)

/* register macros */
#define RK %r2

#define RA d0
#define RB d1
#define RC d2
#define RD d3
#define RE d4
#define RF d5
#define RG d6
#define RH d7

#define RT0 d8
#define RT1 d9
#define RT2 d10
#define RT3 d11
#define RT4 d12
#define RT5 d13
#define RT6 d14
#define RT7 d15

#define RT01q q4
#define RT23q q5
#define RT45q q6
#define RT67q q7

#define RW0 d16
#define RW1 d17
#define RW2 d18
#define RW3 d19
#define RW4 d20
#define RW5 d21
#define RW6 d22
#define RW7 d23
#define RW8 d24
#define RW9 d25
#define RW10 d26
#define RW11 d27
#define RW12 d28
#define RW13 d29
#define RW14 d30
#define RW15 d31

#define RW01q q8
#define RW23q q9
#define RW45q q10
#define RW67q q11
#define RW89q q12
#define RW1011q q13
#define RW1213q q14
#define RW1415q q15

/***********************************************************************
 * ARM assembly implementation of sha512 transform
 ***********************************************************************/
#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
                     rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
        /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
        vshr.u64 RT2, re, #14; \
        vshl.u64 RT3, re, #64 - 14; \
        interleave_op(arg1); \
        vshr.u64 RT4, re, #18; \
        vshl.u64 RT5, re, #64 - 18; \
        vld1.64 {RT0}, [RK]!; \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, re, #41; \
        vshl.u64 RT5, re, #64 - 41; \
        vadd.u64 RT0, RT0, rw0; \
        veor.64 RT23q, RT23q, RT45q; \
        vmov.64 RT7, re; \
        veor.64 RT1, RT2, RT3; \
        vbsl.64 RT7, rf, rg; \
        \
        vadd.u64 RT1, RT1, rh; \
        vshr.u64 RT2, ra, #28; \
        vshl.u64 RT3, ra, #64 - 28; \
        vadd.u64 RT1, RT1, RT0; \
        vshr.u64 RT4, ra, #34; \
        vshl.u64 RT5, ra, #64 - 34; \
        vadd.u64 RT1, RT1, RT7; \
        \
        /* h = Sum0 (a) + Maj (a, b, c); */ \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, ra, #39; \
        vshl.u64 RT5, ra, #64 - 39; \
        veor.64 RT0, ra, rb; \
        veor.64 RT23q, RT23q, RT45q; \
        vbsl.64 RT0, rc, rb; \
        vadd.u64 rd, rd, RT1; /* d+=t1; */ \
        veor.64 rh, RT2, RT3; \
        \
        /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
        vshr.u64 RT2, rd, #14; \
        vshl.u64 RT3, rd, #64 - 14; \
        vadd.u64 rh, rh, RT0; \
        vshr.u64 RT4, rd, #18; \
        vshl.u64 RT5, rd, #64 - 18; \
        vadd.u64 rh, rh, RT1; /* h+=t1; */ \
        vld1.64 {RT0}, [RK]!; \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, rd, #41; \
        vshl.u64 RT5, rd, #64 - 41; \
        vadd.u64 RT0, RT0, rw1; \
        veor.64 RT23q, RT23q, RT45q; \
        vmov.64 RT7, rd; \
        veor.64 RT1, RT2, RT3; \
        vbsl.64 RT7, re, rf; \
        \
        vadd.u64 RT1, RT1, rg; \
        vshr.u64 RT2, rh, #28; \
        vshl.u64 RT3, rh, #64 - 28; \
        vadd.u64 RT1, RT1, RT0; \
        vshr.u64 RT4, rh, #34; \
        vshl.u64 RT5, rh, #64 - 34; \
        vadd.u64 RT1, RT1, RT7; \
        \
        /* g = Sum0 (h) + Maj (h, a, b); */ \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, rh, #39; \
        vshl.u64 RT5, rh, #64 - 39; \
        veor.64 RT0, rh, ra; \
        veor.64 RT23q, RT23q, RT45q; \
        vbsl.64 RT0, rb, ra; \
        vadd.u64 rc, rc, RT1; /* c+=t1; */ \
        veor.64 rg, RT2, RT3; \
        \
        /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
        /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
        \
        /**** S0(w[1:2]) */ \
        \
        /* w[0:1] += w[9:10] */ \
        /* RT23q = rw1:rw2 */ \
        vext.u64 RT23q, rw01q, rw23q, #1; \
        vadd.u64 rw0, rw9; \
        vadd.u64 rg, rg, RT0; \
        vadd.u64 rw1, rw10;\
        vadd.u64 rg, rg, RT1; /* g+=t1; */ \
        \
        vshr.u64 RT45q, RT23q, #1; \
        vshl.u64 RT67q, RT23q, #64 - 1; \
        vshr.u64 RT01q, RT23q, #8; \
        veor.u64 RT45q, RT45q, RT67q; \
        vshl.u64 RT67q, RT23q, #64 - 8; \
        veor.u64 RT45q, RT45q, RT01q; \
        vshr.u64 RT01q, RT23q, #7; \
        veor.u64 RT45q, RT45q, RT67q; \
        \
        /**** S1(w[14:15]) */ \
        vshr.u64 RT23q, rw1415q, #6; \
        veor.u64 RT01q, RT01q, RT45q; \
        vshr.u64 RT45q, rw1415q, #19; \
        vshl.u64 RT67q, rw1415q, #64 - 19; \
        veor.u64 RT23q, RT23q, RT45q; \
        vshr.u64 RT45q, rw1415q, #61; \
        veor.u64 RT23q, RT23q, RT67q; \
        vshl.u64 RT67q, rw1415q, #64 - 61; \
        veor.u64 RT23q, RT23q, RT45q; \
        vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
        veor.u64 RT01q, RT23q, RT67q;
#define vadd_RT01q(rw01q) \
        /* w[0:1] += S(w[14:15]) */ \
        vadd.u64 rw01q, RT01q;

#define dummy(_) /*_*/

#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
                      interleave_op1, arg1, interleave_op2, arg2) \
        /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
        vshr.u64 RT2, re, #14; \
        vshl.u64 RT3, re, #64 - 14; \
        interleave_op1(arg1); \
        vshr.u64 RT4, re, #18; \
        vshl.u64 RT5, re, #64 - 18; \
        interleave_op2(arg2); \
        vld1.64 {RT0}, [RK]!; \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, re, #41; \
        vshl.u64 RT5, re, #64 - 41; \
        vadd.u64 RT0, RT0, rw0; \
        veor.64 RT23q, RT23q, RT45q; \
        vmov.64 RT7, re; \
        veor.64 RT1, RT2, RT3; \
        vbsl.64 RT7, rf, rg; \
        \
        vadd.u64 RT1, RT1, rh; \
        vshr.u64 RT2, ra, #28; \
        vshl.u64 RT3, ra, #64 - 28; \
        vadd.u64 RT1, RT1, RT0; \
        vshr.u64 RT4, ra, #34; \
        vshl.u64 RT5, ra, #64 - 34; \
        vadd.u64 RT1, RT1, RT7; \
        \
        /* h = Sum0 (a) + Maj (a, b, c); */ \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, ra, #39; \
        vshl.u64 RT5, ra, #64 - 39; \
        veor.64 RT0, ra, rb; \
        veor.64 RT23q, RT23q, RT45q; \
        vbsl.64 RT0, rc, rb; \
        vadd.u64 rd, rd, RT1; /* d+=t1; */ \
        veor.64 rh, RT2, RT3; \
        \
        /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
        vshr.u64 RT2, rd, #14; \
        vshl.u64 RT3, rd, #64 - 14; \
        vadd.u64 rh, rh, RT0; \
        vshr.u64 RT4, rd, #18; \
        vshl.u64 RT5, rd, #64 - 18; \
        vadd.u64 rh, rh, RT1; /* h+=t1; */ \
        vld1.64 {RT0}, [RK]!; \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, rd, #41; \
        vshl.u64 RT5, rd, #64 - 41; \
        vadd.u64 RT0, RT0, rw1; \
        veor.64 RT23q, RT23q, RT45q; \
        vmov.64 RT7, rd; \
        veor.64 RT1, RT2, RT3; \
        vbsl.64 RT7, re, rf; \
        \
        vadd.u64 RT1, RT1, rg; \
        vshr.u64 RT2, rh, #28; \
        vshl.u64 RT3, rh, #64 - 28; \
        vadd.u64 RT1, RT1, RT0; \
        vshr.u64 RT4, rh, #34; \
        vshl.u64 RT5, rh, #64 - 34; \
        vadd.u64 RT1, RT1, RT7; \
        \
        /* g = Sum0 (h) + Maj (h, a, b); */ \
        veor.64 RT23q, RT23q, RT45q; \
        vshr.u64 RT4, rh, #39; \
        vshl.u64 RT5, rh, #64 - 39; \
        veor.64 RT0, rh, ra; \
        veor.64 RT23q, RT23q, RT45q; \
        vbsl.64 RT0, rb, ra; \
        vadd.u64 rc, rc, RT1; /* c+=t1; */ \
        veor.64 rg, RT2, RT3;
#define vadd_rg_RT0(rg) \
        vadd.u64 rg, rg, RT0;
#define vadd_rg_RT1(rg) \
        vadd.u64 rg, rg, RT1; /* g+=t1; */

.align 3
ENTRY(sha512_transform_neon)
        /* Input:
         *      %r0: SHA512_CONTEXT
         *      %r1: data
         *      %r2: u64 k[] constants
         *      %r3: nblks
         */
        push {%lr};

        mov %lr, #0;

        /* Load context to d0-d7 */
        vld1.64 {RA-RD}, [%r0]!;
        vld1.64 {RE-RH}, [%r0];
        sub %r0, #(4*8);

        /* Load input to w[16], d16-d31 */
        /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
        vld1.64 {RW0-RW3}, [%r1]!;
        vld1.64 {RW4-RW7}, [%r1]!;
        vld1.64 {RW8-RW11}, [%r1]!;
        vld1.64 {RW12-RW15}, [%r1]!;
#ifdef __ARMEL__
        /* byteswap */
        vrev64.8 RW01q, RW01q;
        vrev64.8 RW23q, RW23q;
        vrev64.8 RW45q, RW45q;
        vrev64.8 RW67q, RW67q;
        vrev64.8 RW89q, RW89q;
        vrev64.8 RW1011q, RW1011q;
        vrev64.8 RW1213q, RW1213q;
        vrev64.8 RW1415q, RW1415q;
#endif

        /* EABI says that d8-d15 must be preserved by callee. */
        /*vpush {RT0-RT7};*/

.Loop:
        rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
                     RW23q, RW1415q, RW9, RW10, dummy, _);
        b .Lenter_rounds;

.Loop_rounds:
        rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
                     RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
.Lenter_rounds:
        rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
                     RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
        rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
                     RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
        rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
                     RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
        rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
                     RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
        rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
                     RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
        add %lr, #16;
        rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
                     RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
        cmp %lr, #64;
        rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
                     RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
        bne .Loop_rounds;

        subs %r3, #1;

        rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
                      vadd_RT01q, RW1415q, dummy, _);
        rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
                      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
        beq .Lhandle_tail;
        vld1.64 {RW0-RW3}, [%r1]!;
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
                      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
                      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
        vrev64.8 RW01q, RW01q;
        vrev64.8 RW23q, RW23q;
#endif
        vld1.64 {RW4-RW7}, [%r1]!;
        rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
                      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
        rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
                      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
#ifdef __ARMEL__
        vrev64.8 RW45q, RW45q;
        vrev64.8 RW67q, RW67q;
#endif
        vld1.64 {RW8-RW11}, [%r1]!;
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
                      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
                      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
        vrev64.8 RW89q, RW89q;
        vrev64.8 RW1011q, RW1011q;
#endif
        vld1.64 {RW12-RW15}, [%r1]!;
        vadd_rg_RT0(RA);
        vadd_rg_RT1(RA);

        /* Load context */
        vld1.64 {RT0-RT3}, [%r0]!;
        vld1.64 {RT4-RT7}, [%r0];
        sub %r0, #(4*8);

#ifdef __ARMEL__
        vrev64.8 RW1213q, RW1213q;
        vrev64.8 RW1415q, RW1415q;
#endif

        vadd.u64 RA, RT0;
        vadd.u64 RB, RT1;
        vadd.u64 RC, RT2;
        vadd.u64 RD, RT3;
        vadd.u64 RE, RT4;
        vadd.u64 RF, RT5;
        vadd.u64 RG, RT6;
        vadd.u64 RH, RT7;

        /* Store the first half of context */
        vst1.64 {RA-RD}, [%r0]!;
        sub RK, $(8*80);
        vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
        mov %lr, #0;
        sub %r0, #(4*8);

        b .Loop;

.Lhandle_tail:
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
                      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
                      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
        rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
                      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
        rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
                      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
                      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
                      vadd_rg_RT0, RC, vadd_rg_RT1, RC);

        /* Load context to d16-d23 */
        vld1.64 {RW0-RW3}, [%r0]!;
        vadd_rg_RT0(RA);
        vld1.64 {RW4-RW7}, [%r0];
        vadd_rg_RT1(RA);
        sub %r0, #(4*8);

        vadd.u64 RA, RW0;
        vadd.u64 RB, RW1;
        vadd.u64 RC, RW2;
        vadd.u64 RD, RW3;
        vadd.u64 RE, RW4;
        vadd.u64 RF, RW5;
        vadd.u64 RG, RW6;
        vadd.u64 RH, RW7;

        /* Store the first half of context */
        vst1.64 {RA-RD}, [%r0]!;

        /* Clear used registers */
        /* d16-d31 */
        veor.u64 RW01q, RW01q;
        veor.u64 RW23q, RW23q;
        veor.u64 RW45q, RW45q;
        veor.u64 RW67q, RW67q;
        vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
        veor.u64 RW89q, RW89q;
        veor.u64 RW1011q, RW1011q;
        veor.u64 RW1213q, RW1213q;
        veor.u64 RW1415q, RW1415q;
        /* d8-d15 */
        /*vpop {RT0-RT7};*/
        /* d0-d7 (q0-q3) */
        veor.u64 %q0, %q0;
        veor.u64 %q1, %q1;
        veor.u64 %q2, %q2;
        veor.u64 %q3, %q3;

        pop {%pc};
ENDPROC(sha512_transform_neon)
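[Editor's note] The S0/S1 comments in rounds2_0_63 are the standard SHA-512 message-schedule small sigmas; the macro expands two schedule words per call over the rolling 16-word window held in d16-d31. A scalar sketch of that recurrence, for reference only (based on FIPS 180-4, not part of the patch); each (w >> n) | (w << (64 - n)) pair corresponds to a vshr.u64/vshl.u64/veor sequence above:

#include <stdint.h>

static inline uint64_t s0(uint64_t w)   /* sigma0: the #1/#63, #8/#56, #7 shifts */
{
        return ((w >> 1) | (w << 63)) ^ ((w >> 8) | (w << 56)) ^ (w >> 7);
}

static inline uint64_t s1(uint64_t w)   /* sigma1: the #19/#45, #61/#3, #6 shifts */
{
        return ((w >> 19) | (w << 45)) ^ ((w >> 61) | (w << 3)) ^ (w >> 6);
}

/* w[] is a rolling 16-entry window; w[t & 15] holds W[t - 16] on entry. */
static void expand_w(uint64_t w[16], unsigned int t)
{
        w[t & 15] += s1(w[(t - 2) & 15]) + w[(t - 7) & 15] + s0(w[(t - 15) & 15]);
}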
diff --git a/arch/arm/crypto/sha512_neon_glue.c b/arch/arm/crypto/sha512_neon_glue.c
new file mode 100644
index 000000000000..0d2758ff5e12
--- /dev/null
+++ b/arch/arm/crypto/sha512_neon_glue.c
@@ -0,0 +1,305 @@
/*
 * Glue code for the SHA512 Secure Hash Algorithm assembly implementation
 * using NEON instructions.
 *
 * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is based on sha512_ssse3_glue.c:
 * Copyright (C) 2013 Intel Corporation
 * Author: Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 */

#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
#include <crypto/sha.h>
#include <asm/byteorder.h>
#include <asm/simd.h>
#include <asm/neon.h>


static const u64 sha512_k[] = {
        0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
        0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
        0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
        0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
        0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
        0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
        0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
        0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
        0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
        0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
        0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
        0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
        0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
        0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
        0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
        0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
        0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
        0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
        0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
        0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
        0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
        0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
        0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
        0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
        0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
        0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
        0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
        0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
        0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
        0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
        0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
        0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
        0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
        0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
        0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
        0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
        0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
        0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
        0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
        0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
};


asmlinkage void sha512_transform_neon(u64 *digest, const void *data,
                                      const u64 k[], unsigned int num_blks);


static int sha512_neon_init(struct shash_desc *desc)
{
        struct sha512_state *sctx = shash_desc_ctx(desc);

        sctx->state[0] = SHA512_H0;
        sctx->state[1] = SHA512_H1;
        sctx->state[2] = SHA512_H2;
        sctx->state[3] = SHA512_H3;
        sctx->state[4] = SHA512_H4;
        sctx->state[5] = SHA512_H5;
        sctx->state[6] = SHA512_H6;
        sctx->state[7] = SHA512_H7;
        sctx->count[0] = sctx->count[1] = 0;

        return 0;
}

static int __sha512_neon_update(struct shash_desc *desc, const u8 *data,
                                unsigned int len, unsigned int partial)
{
        struct sha512_state *sctx = shash_desc_ctx(desc);
        unsigned int done = 0;

        sctx->count[0] += len;
        if (sctx->count[0] < len)
                sctx->count[1]++;

        if (partial) {
                done = SHA512_BLOCK_SIZE - partial;
                memcpy(sctx->buf + partial, data, done);
                sha512_transform_neon(sctx->state, sctx->buf, sha512_k, 1);
        }

        if (len - done >= SHA512_BLOCK_SIZE) {
                const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;

                sha512_transform_neon(sctx->state, data + done, sha512_k,
                                      rounds);

                done += rounds * SHA512_BLOCK_SIZE;
        }

        memcpy(sctx->buf, data + done, len - done);

        return 0;
}

static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
                              unsigned int len)
{
        struct sha512_state *sctx = shash_desc_ctx(desc);
        unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
        int res;

        /* Handle the fast case right here */
        if (partial + len < SHA512_BLOCK_SIZE) {
                sctx->count[0] += len;
                if (sctx->count[0] < len)
                        sctx->count[1]++;
                memcpy(sctx->buf + partial, data, len);

                return 0;
        }

        if (!may_use_simd()) {
                res = crypto_sha512_update(desc, data, len);
        } else {
                kernel_neon_begin();
                res = __sha512_neon_update(desc, data, len, partial);
                kernel_neon_end();
        }

        return res;
}


/* Add padding and return the message digest. */
static int sha512_neon_final(struct shash_desc *desc, u8 *out)
{
        struct sha512_state *sctx = shash_desc_ctx(desc);
        unsigned int i, index, padlen;
        __be64 *dst = (__be64 *)out;
        __be64 bits[2];
        static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };

        /* save number of bits */
        bits[1] = cpu_to_be64(sctx->count[0] << 3);
        bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);

        /* Pad out to 112 mod 128 and append length */
        index = sctx->count[0] & 0x7f;
        padlen = (index < 112) ? (112 - index) : ((128+112) - index);

        if (!may_use_simd()) {
                crypto_sha512_update(desc, padding, padlen);
                crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
        } else {
                kernel_neon_begin();
                /* We need to fill a whole block for __sha512_neon_update() */
                if (padlen <= 112) {
                        sctx->count[0] += padlen;
                        if (sctx->count[0] < padlen)
                                sctx->count[1]++;
                        memcpy(sctx->buf + index, padding, padlen);
                } else {
                        __sha512_neon_update(desc, padding, padlen, index);
                }
                __sha512_neon_update(desc, (const u8 *)&bits,
                                     sizeof(bits), 112);
                kernel_neon_end();
        }

        /* Store state in digest */
        for (i = 0; i < 8; i++)
                dst[i] = cpu_to_be64(sctx->state[i]);

        /* Wipe context */
        memset(sctx, 0, sizeof(*sctx));

        return 0;
}

static int sha512_neon_export(struct shash_desc *desc, void *out)
{
        struct sha512_state *sctx = shash_desc_ctx(desc);

        memcpy(out, sctx, sizeof(*sctx));

        return 0;
}

static int sha512_neon_import(struct shash_desc *desc, const void *in)
{
        struct sha512_state *sctx = shash_desc_ctx(desc);

        memcpy(sctx, in, sizeof(*sctx));

        return 0;
}

static int sha384_neon_init(struct shash_desc *desc)
{
        struct sha512_state *sctx = shash_desc_ctx(desc);

        sctx->state[0] = SHA384_H0;
        sctx->state[1] = SHA384_H1;
        sctx->state[2] = SHA384_H2;
        sctx->state[3] = SHA384_H3;
        sctx->state[4] = SHA384_H4;
        sctx->state[5] = SHA384_H5;
        sctx->state[6] = SHA384_H6;
        sctx->state[7] = SHA384_H7;

        sctx->count[0] = sctx->count[1] = 0;

        return 0;
}

static int sha384_neon_final(struct shash_desc *desc, u8 *hash)
{
        u8 D[SHA512_DIGEST_SIZE];

        sha512_neon_final(desc, D);

        memcpy(hash, D, SHA384_DIGEST_SIZE);
        memset(D, 0, SHA512_DIGEST_SIZE);

        return 0;
}

static struct shash_alg algs[] = { {
        .digestsize     = SHA512_DIGEST_SIZE,
        .init           = sha512_neon_init,
        .update         = sha512_neon_update,
        .final          = sha512_neon_final,
        .export         = sha512_neon_export,
        .import         = sha512_neon_import,
        .descsize       = sizeof(struct sha512_state),
        .statesize      = sizeof(struct sha512_state),
        .base           = {
                .cra_name        = "sha512",
                .cra_driver_name = "sha512-neon",
                .cra_priority    = 250,
                .cra_flags       = CRYPTO_ALG_TYPE_SHASH,
                .cra_blocksize   = SHA512_BLOCK_SIZE,
                .cra_module      = THIS_MODULE,
        }
}, {
        .digestsize     = SHA384_DIGEST_SIZE,
        .init           = sha384_neon_init,
        .update         = sha512_neon_update,
        .final          = sha384_neon_final,
        .export         = sha512_neon_export,
        .import         = sha512_neon_import,
        .descsize       = sizeof(struct sha512_state),
        .statesize      = sizeof(struct sha512_state),
        .base           = {
                .cra_name        = "sha384",
                .cra_driver_name = "sha384-neon",
                .cra_priority    = 250,
                .cra_flags       = CRYPTO_ALG_TYPE_SHASH,
                .cra_blocksize   = SHA384_BLOCK_SIZE,
                .cra_module      = THIS_MODULE,
        }
} };

static int __init sha512_neon_mod_init(void)
{
        if (!cpu_has_neon())
                return -ENODEV;

        return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}

static void __exit sha512_neon_mod_fini(void)
{
        crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}

module_init(sha512_neon_mod_init);
module_exit(sha512_neon_mod_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, NEON accelerated");

MODULE_ALIAS("sha512");
MODULE_ALIAS("sha384");
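[Editor's note] Because the two algorithms register under cra_name "sha512"/"sha384" with cra_priority 250, in-kernel users asking for "sha512" through the shash API will normally get this driver once the module is loaded; the update path itself falls back to crypto_sha512_update() when NEON cannot be used. A minimal usage sketch against the generic shash API (the helper name and error handling are illustrative, not part of the patch):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Illustrative helper: one-shot SHA-512 of 'len' bytes at 'data' into 'out' (64 bytes). */
static int sha512_digest_example(const u8 *data, unsigned int len, u8 *out)
{
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        int err;

        /* Picks the highest-priority "sha512" provider available. */
        tfm = crypto_alloc_shash("sha512", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
        if (!desc) {
                crypto_free_shash(tfm);
                return -ENOMEM;
        }
        desc->tfm = tfm;

        err = crypto_shash_digest(desc, data, len, out);

        kfree(desc);
        crypto_free_shash(tfm);
        return err;
}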