author		Jussi Kivilinna <jussi.kivilinna@iki.fi>	2014-07-29 12:14:14 -0400
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2014-08-02 03:51:47 -0400
commit		604682551aa511e00e57706ad5d9fcf955ee0323 (patch)
tree		ed97c0b46b271b45dd02209482b7ea81d594728d /arch/arm/crypto
parent		1f8673d31a999ed7e20d9f66fcdad39e39f6b276 (diff)
ARM: 8119/1: crypto: sha1: add ARM NEON implementation
This patch adds an ARM NEON assembly implementation of the SHA-1 algorithm.

tcrypt benchmark results on Cortex-A8, sha1-arm-asm vs sha1-neon-asm:

block-size  bytes/update  old-vs-new
16          16            1.04x
64          16            1.02x
64          64            1.05x
256         16            1.03x
256         64            1.04x
256         256           1.30x
1024        16            1.03x
1024        256           1.36x
1024        1024          1.52x
2048        16            1.03x
2048        256           1.39x
2048        1024          1.55x
2048        2048          1.59x
4096        16            1.03x
4096        256           1.40x
4096        1024          1.57x
4096        4096          1.62x
8192        16            1.03x
8192        256           1.40x
8192        1024          1.58x
8192        4096          1.63x
8192        8192          1.63x

Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
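The figures above are the kind of numbers produced by the kernel's tcrypt hash speed tests. Callers never select this driver explicitly: the crypto core resolves "sha1" to the highest-priority registered provider, which on NEON-capable CPUs becomes the "sha1-neon" driver added below (cra_priority 250, above the scalar ARM and generic implementations). A minimal in-kernel usage sketch follows; it is illustrative only, not part of the patch, and the helper name sha1_digest_example is made up for this example.

/* Illustrative only: digest a buffer via the shash API; the crypto core
 * transparently picks the highest-priority "sha1" driver.
 */
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int sha1_digest_example(const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("sha1", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* shash_desc is followed by the driver's per-request state. */
	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;
	desc->flags = 0;

	err = crypto_shash_digest(desc, data, len, out);	/* out: 20 bytes */

	kzfree(desc);
	crypto_free_shash(tfm);
	return err;
}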
Diffstat (limited to 'arch/arm/crypto')
-rw-r--r--	arch/arm/crypto/Makefile	2
-rw-r--r--	arch/arm/crypto/sha1-armv7-neon.S	634
-rw-r--r--	arch/arm/crypto/sha1_glue.c	8
-rw-r--r--	arch/arm/crypto/sha1_neon_glue.c	197
4 files changed, 838 insertions(+), 3 deletions(-)
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 81cda39860c5..374956d2f896 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -5,10 +5,12 @@
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
+obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 
 aes-arm-y	:= aes-armv4.o aes_glue.o
 aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
+sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
 
 quiet_cmd_perl = PERL $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
new file mode 100644
index 000000000000..50013c0e2864
--- /dev/null
+++ b/arch/arm/crypto/sha1-armv7-neon.S
@@ -0,0 +1,634 @@
1/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
2 *
3 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 */
10
11#include <linux/linkage.h>
12
13
14.syntax unified
15.code 32
16.fpu neon
17
18.text
19
20
21/* Context structure */
22
23#define state_h0 0
24#define state_h1 4
25#define state_h2 8
26#define state_h3 12
27#define state_h4 16
28
29
30/* Constants */
31
32#define K1 0x5A827999
33#define K2 0x6ED9EBA1
34#define K3 0x8F1BBCDC
35#define K4 0xCA62C1D6
36.align 4
37.LK_VEC:
38.LK1: .long K1, K1, K1, K1
39.LK2: .long K2, K2, K2, K2
40.LK3: .long K3, K3, K3, K3
41.LK4: .long K4, K4, K4, K4
42
43
44/* Register macros */
45
46#define RSTATE r0
47#define RDATA r1
48#define RNBLKS r2
49#define ROLDSTACK r3
50#define RWK lr
51
52#define _a r4
53#define _b r5
54#define _c r6
55#define _d r7
56#define _e r8
57
58#define RT0 r9
59#define RT1 r10
60#define RT2 r11
61#define RT3 r12
62
63#define W0 q0
64#define W1 q1
65#define W2 q2
66#define W3 q3
67#define W4 q4
68#define W5 q5
69#define W6 q6
70#define W7 q7
71
72#define tmp0 q8
73#define tmp1 q9
74#define tmp2 q10
75#define tmp3 q11
76
77#define qK1 q12
78#define qK2 q13
79#define qK3 q14
80#define qK4 q15
81
82
83/* Round function macros. */
84
85#define WK_offs(i) (((i) & 15) * 4)
86
87#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
88 W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
89 ldr RT3, [sp, WK_offs(i)]; \
90 pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
91 bic RT0, d, b; \
92 add e, e, a, ror #(32 - 5); \
93 and RT1, c, b; \
94 pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
95 add RT0, RT0, RT3; \
96 add e, e, RT1; \
97 ror b, #(32 - 30); \
98 pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
99 add e, e, RT0;
100
101#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
102 W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
103 ldr RT3, [sp, WK_offs(i)]; \
104 pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
105 eor RT0, d, b; \
106 add e, e, a, ror #(32 - 5); \
107 eor RT0, RT0, c; \
108 pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
109 add e, e, RT3; \
110 ror b, #(32 - 30); \
111 pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
112 add e, e, RT0; \
113
114#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
115 W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
116 ldr RT3, [sp, WK_offs(i)]; \
117 pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
118 eor RT0, b, c; \
119 and RT1, b, c; \
120 add e, e, a, ror #(32 - 5); \
121 pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
122 and RT0, RT0, d; \
123 add RT1, RT1, RT3; \
124 add e, e, RT0; \
125 ror b, #(32 - 30); \
126 pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
127 add e, e, RT1;
128
129#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
130 W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
131 _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
132 W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
133
134#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
135 W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
136 _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
137 W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
138
139#define R(a,b,c,d,e,f,i) \
140 _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
141 W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
142
143#define dummy(...)
144
145
146/* Input expansion macros. */
147
148/********* Precalc macros for rounds 0-15 *************************************/
149
150#define W_PRECALC_00_15() \
151 add RWK, sp, #(WK_offs(0)); \
152 \
153 vld1.32 {tmp0, tmp1}, [RDATA]!; \
154 vrev32.8 W0, tmp0; /* big => little */ \
155 vld1.32 {tmp2, tmp3}, [RDATA]!; \
156 vadd.u32 tmp0, W0, curK; \
157 vrev32.8 W7, tmp1; /* big => little */ \
158 vrev32.8 W6, tmp2; /* big => little */ \
159 vadd.u32 tmp1, W7, curK; \
160 vrev32.8 W5, tmp3; /* big => little */ \
161 vadd.u32 tmp2, W6, curK; \
162 vst1.32 {tmp0, tmp1}, [RWK]!; \
163 vadd.u32 tmp3, W5, curK; \
164 vst1.32 {tmp2, tmp3}, [RWK]; \
165
166#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
167 vld1.32 {tmp0, tmp1}, [RDATA]!; \
168
169#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
170 add RWK, sp, #(WK_offs(0)); \
171
172#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
173 vrev32.8 W0, tmp0; /* big => little */ \
174
175#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
176 vld1.32 {tmp2, tmp3}, [RDATA]!; \
177
178#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
179 vadd.u32 tmp0, W0, curK; \
180
181#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
182 vrev32.8 W7, tmp1; /* big => little */ \
183
184#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
185 vrev32.8 W6, tmp2; /* big => little */ \
186
187#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
188 vadd.u32 tmp1, W7, curK; \
189
190#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
191 vrev32.8 W5, tmp3; /* big => little */ \
192
193#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
194 vadd.u32 tmp2, W6, curK; \
195
196#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
197 vst1.32 {tmp0, tmp1}, [RWK]!; \
198
199#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
200 vadd.u32 tmp3, W5, curK; \
201
202#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
203 vst1.32 {tmp2, tmp3}, [RWK]; \
204
205
206/********* Precalc macros for rounds 16-31 ************************************/
207
208#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
209 veor tmp0, tmp0; \
210 vext.8 W, W_m16, W_m12, #8; \
211
212#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
213 add RWK, sp, #(WK_offs(i)); \
214 vext.8 tmp0, W_m04, tmp0, #4; \
215
216#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
217 veor tmp0, tmp0, W_m16; \
218 veor.32 W, W, W_m08; \
219
220#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
221 veor tmp1, tmp1; \
222 veor W, W, tmp0; \
223
224#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
225 vshl.u32 tmp0, W, #1; \
226
227#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
228 vext.8 tmp1, tmp1, W, #(16-12); \
229 vshr.u32 W, W, #31; \
230
231#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
232 vorr tmp0, tmp0, W; \
233 vshr.u32 W, tmp1, #30; \
234
235#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
236 vshl.u32 tmp1, tmp1, #2; \
237
238#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
239 veor tmp0, tmp0, W; \
240
241#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
242 veor W, tmp0, tmp1; \
243
244#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
245 vadd.u32 tmp0, W, curK; \
246
247#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
248 vst1.32 {tmp0}, [RWK];
249
250
251/********* Precalc macros for rounds 32-79 ************************************/
252
253#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
254 veor W, W_m28; \
255
256#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
257 vext.8 tmp0, W_m08, W_m04, #8; \
258
259#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
260 veor W, W_m16; \
261
262#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
263 veor W, tmp0; \
264
265#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
266 add RWK, sp, #(WK_offs(i&~3)); \
267
268#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
269 vshl.u32 tmp1, W, #2; \
270
271#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
272 vshr.u32 tmp0, W, #30; \
273
274#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
275 vorr W, tmp0, tmp1; \
276
277#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
278 vadd.u32 tmp0, W, curK; \
279
280#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
281 vst1.32 {tmp0}, [RWK];
282
283
284/*
285 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
286 *
287 * unsigned int
288 * sha1_transform_neon (void *ctx, const unsigned char *data,
289 * unsigned int nblks)
290 */
291.align 3
292ENTRY(sha1_transform_neon)
293 /* input:
294 * r0: ctx, CTX
295 * r1: data (64*nblks bytes)
296 * r2: nblks
297 */
298
299 cmp RNBLKS, #0;
300 beq .Ldo_nothing;
301
302 push {r4-r12, lr};
303 /*vpush {q4-q7};*/
304
305 adr RT3, .LK_VEC;
306
307 mov ROLDSTACK, sp;
308
309 /* Align stack. */
310 sub RT0, sp, #(16*4);
311 and RT0, #(~(16-1));
312 mov sp, RT0;
313
314 vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
315
316 /* Get the values of the chaining variables. */
317 ldm RSTATE, {_a-_e};
318
319 vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
320
321#undef curK
322#define curK qK1
323 /* Precalc 0-15. */
324 W_PRECALC_00_15();
325
326.Loop:
327 /* Transform 0-15 + Precalc 16-31. */
328 _R( _a, _b, _c, _d, _e, F1, 0,
329 WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
330 W4, W5, W6, W7, W0, _, _, _ );
331 _R( _e, _a, _b, _c, _d, F1, 1,
332 WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
333 W4, W5, W6, W7, W0, _, _, _ );
334 _R( _d, _e, _a, _b, _c, F1, 2,
335 WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
336 W4, W5, W6, W7, W0, _, _, _ );
337 _R( _c, _d, _e, _a, _b, F1, 3,
338 WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
339 W4, W5, W6, W7, W0, _, _, _ );
340
341#undef curK
342#define curK qK2
343 _R( _b, _c, _d, _e, _a, F1, 4,
344 WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
345 W3, W4, W5, W6, W7, _, _, _ );
346 _R( _a, _b, _c, _d, _e, F1, 5,
347 WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
348 W3, W4, W5, W6, W7, _, _, _ );
349 _R( _e, _a, _b, _c, _d, F1, 6,
350 WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
351 W3, W4, W5, W6, W7, _, _, _ );
352 _R( _d, _e, _a, _b, _c, F1, 7,
353 WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
354 W3, W4, W5, W6, W7, _, _, _ );
355
356 _R( _c, _d, _e, _a, _b, F1, 8,
357 WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
358 W2, W3, W4, W5, W6, _, _, _ );
359 _R( _b, _c, _d, _e, _a, F1, 9,
360 WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
361 W2, W3, W4, W5, W6, _, _, _ );
362 _R( _a, _b, _c, _d, _e, F1, 10,
363 WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
364 W2, W3, W4, W5, W6, _, _, _ );
365 _R( _e, _a, _b, _c, _d, F1, 11,
366 WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
367 W2, W3, W4, W5, W6, _, _, _ );
368
369 _R( _d, _e, _a, _b, _c, F1, 12,
370 WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
371 W1, W2, W3, W4, W5, _, _, _ );
372 _R( _c, _d, _e, _a, _b, F1, 13,
373 WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
374 W1, W2, W3, W4, W5, _, _, _ );
375 _R( _b, _c, _d, _e, _a, F1, 14,
376 WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
377 W1, W2, W3, W4, W5, _, _, _ );
378 _R( _a, _b, _c, _d, _e, F1, 15,
379 WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
380 W1, W2, W3, W4, W5, _, _, _ );
381
382 /* Transform 16-63 + Precalc 32-79. */
383 _R( _e, _a, _b, _c, _d, F1, 16,
384 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
385 W0, W1, W2, W3, W4, W5, W6, W7);
386 _R( _d, _e, _a, _b, _c, F1, 17,
387 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
388 W0, W1, W2, W3, W4, W5, W6, W7);
389 _R( _c, _d, _e, _a, _b, F1, 18,
390 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
391 W0, W1, W2, W3, W4, W5, W6, W7);
392 _R( _b, _c, _d, _e, _a, F1, 19,
393 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
394 W0, W1, W2, W3, W4, W5, W6, W7);
395
396 _R( _a, _b, _c, _d, _e, F2, 20,
397 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
398 W7, W0, W1, W2, W3, W4, W5, W6);
399 _R( _e, _a, _b, _c, _d, F2, 21,
400 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
401 W7, W0, W1, W2, W3, W4, W5, W6);
402 _R( _d, _e, _a, _b, _c, F2, 22,
403 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
404 W7, W0, W1, W2, W3, W4, W5, W6);
405 _R( _c, _d, _e, _a, _b, F2, 23,
406 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
407 W7, W0, W1, W2, W3, W4, W5, W6);
408
409#undef curK
410#define curK qK3
411 _R( _b, _c, _d, _e, _a, F2, 24,
412 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
413 W6, W7, W0, W1, W2, W3, W4, W5);
414 _R( _a, _b, _c, _d, _e, F2, 25,
415 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
416 W6, W7, W0, W1, W2, W3, W4, W5);
417 _R( _e, _a, _b, _c, _d, F2, 26,
418 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
419 W6, W7, W0, W1, W2, W3, W4, W5);
420 _R( _d, _e, _a, _b, _c, F2, 27,
421 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
422 W6, W7, W0, W1, W2, W3, W4, W5);
423
424 _R( _c, _d, _e, _a, _b, F2, 28,
425 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
426 W5, W6, W7, W0, W1, W2, W3, W4);
427 _R( _b, _c, _d, _e, _a, F2, 29,
428 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
429 W5, W6, W7, W0, W1, W2, W3, W4);
430 _R( _a, _b, _c, _d, _e, F2, 30,
431 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
432 W5, W6, W7, W0, W1, W2, W3, W4);
433 _R( _e, _a, _b, _c, _d, F2, 31,
434 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
435 W5, W6, W7, W0, W1, W2, W3, W4);
436
437 _R( _d, _e, _a, _b, _c, F2, 32,
438 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
439 W4, W5, W6, W7, W0, W1, W2, W3);
440 _R( _c, _d, _e, _a, _b, F2, 33,
441 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
442 W4, W5, W6, W7, W0, W1, W2, W3);
443 _R( _b, _c, _d, _e, _a, F2, 34,
444 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
445 W4, W5, W6, W7, W0, W1, W2, W3);
446 _R( _a, _b, _c, _d, _e, F2, 35,
447 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
448 W4, W5, W6, W7, W0, W1, W2, W3);
449
450 _R( _e, _a, _b, _c, _d, F2, 36,
451 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
452 W3, W4, W5, W6, W7, W0, W1, W2);
453 _R( _d, _e, _a, _b, _c, F2, 37,
454 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
455 W3, W4, W5, W6, W7, W0, W1, W2);
456 _R( _c, _d, _e, _a, _b, F2, 38,
457 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
458 W3, W4, W5, W6, W7, W0, W1, W2);
459 _R( _b, _c, _d, _e, _a, F2, 39,
460 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
461 W3, W4, W5, W6, W7, W0, W1, W2);
462
463 _R( _a, _b, _c, _d, _e, F3, 40,
464 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
465 W2, W3, W4, W5, W6, W7, W0, W1);
466 _R( _e, _a, _b, _c, _d, F3, 41,
467 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
468 W2, W3, W4, W5, W6, W7, W0, W1);
469 _R( _d, _e, _a, _b, _c, F3, 42,
470 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
471 W2, W3, W4, W5, W6, W7, W0, W1);
472 _R( _c, _d, _e, _a, _b, F3, 43,
473 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
474 W2, W3, W4, W5, W6, W7, W0, W1);
475
476#undef curK
477#define curK qK4
478 _R( _b, _c, _d, _e, _a, F3, 44,
479 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
480 W1, W2, W3, W4, W5, W6, W7, W0);
481 _R( _a, _b, _c, _d, _e, F3, 45,
482 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
483 W1, W2, W3, W4, W5, W6, W7, W0);
484 _R( _e, _a, _b, _c, _d, F3, 46,
485 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
486 W1, W2, W3, W4, W5, W6, W7, W0);
487 _R( _d, _e, _a, _b, _c, F3, 47,
488 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
489 W1, W2, W3, W4, W5, W6, W7, W0);
490
491 _R( _c, _d, _e, _a, _b, F3, 48,
492 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
493 W0, W1, W2, W3, W4, W5, W6, W7);
494 _R( _b, _c, _d, _e, _a, F3, 49,
495 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
496 W0, W1, W2, W3, W4, W5, W6, W7);
497 _R( _a, _b, _c, _d, _e, F3, 50,
498 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
499 W0, W1, W2, W3, W4, W5, W6, W7);
500 _R( _e, _a, _b, _c, _d, F3, 51,
501 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
502 W0, W1, W2, W3, W4, W5, W6, W7);
503
504 _R( _d, _e, _a, _b, _c, F3, 52,
505 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
506 W7, W0, W1, W2, W3, W4, W5, W6);
507 _R( _c, _d, _e, _a, _b, F3, 53,
508 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
509 W7, W0, W1, W2, W3, W4, W5, W6);
510 _R( _b, _c, _d, _e, _a, F3, 54,
511 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
512 W7, W0, W1, W2, W3, W4, W5, W6);
513 _R( _a, _b, _c, _d, _e, F3, 55,
514 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
515 W7, W0, W1, W2, W3, W4, W5, W6);
516
517 _R( _e, _a, _b, _c, _d, F3, 56,
518 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
519 W6, W7, W0, W1, W2, W3, W4, W5);
520 _R( _d, _e, _a, _b, _c, F3, 57,
521 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
522 W6, W7, W0, W1, W2, W3, W4, W5);
523 _R( _c, _d, _e, _a, _b, F3, 58,
524 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
525 W6, W7, W0, W1, W2, W3, W4, W5);
526 _R( _b, _c, _d, _e, _a, F3, 59,
527 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
528 W6, W7, W0, W1, W2, W3, W4, W5);
529
530 subs RNBLKS, #1;
531
532 _R( _a, _b, _c, _d, _e, F4, 60,
533 WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
534 W5, W6, W7, W0, W1, W2, W3, W4);
535 _R( _e, _a, _b, _c, _d, F4, 61,
536 WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
537 W5, W6, W7, W0, W1, W2, W3, W4);
538 _R( _d, _e, _a, _b, _c, F4, 62,
539 WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
540 W5, W6, W7, W0, W1, W2, W3, W4);
541 _R( _c, _d, _e, _a, _b, F4, 63,
542 WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
543 W5, W6, W7, W0, W1, W2, W3, W4);
544
545 beq .Lend;
546
547 /* Transform 64-79 + Precalc 0-15 of next block. */
548#undef curK
549#define curK qK1
550 _R( _b, _c, _d, _e, _a, F4, 64,
551 WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
552 _R( _a, _b, _c, _d, _e, F4, 65,
553 WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
554 _R( _e, _a, _b, _c, _d, F4, 66,
555 WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
556 _R( _d, _e, _a, _b, _c, F4, 67,
557 WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
558
559 _R( _c, _d, _e, _a, _b, F4, 68,
560 dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
561 _R( _b, _c, _d, _e, _a, F4, 69,
562 dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
563 _R( _a, _b, _c, _d, _e, F4, 70,
564 WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
565 _R( _e, _a, _b, _c, _d, F4, 71,
566 WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
567
568 _R( _d, _e, _a, _b, _c, F4, 72,
569 dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
570 _R( _c, _d, _e, _a, _b, F4, 73,
571 dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
572 _R( _b, _c, _d, _e, _a, F4, 74,
573 WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
574 _R( _a, _b, _c, _d, _e, F4, 75,
575 WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
576
577 _R( _e, _a, _b, _c, _d, F4, 76,
578 WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
579 _R( _d, _e, _a, _b, _c, F4, 77,
580 WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
581 _R( _c, _d, _e, _a, _b, F4, 78,
582 WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
583 _R( _b, _c, _d, _e, _a, F4, 79,
584 WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
585
586 /* Update the chaining variables. */
587 ldm RSTATE, {RT0-RT3};
588 add _a, RT0;
589 ldr RT0, [RSTATE, #state_h4];
590 add _b, RT1;
591 add _c, RT2;
592 add _d, RT3;
593 add _e, RT0;
594 stm RSTATE, {_a-_e};
595
596 b .Loop;
597
598.Lend:
599 /* Transform 64-79 */
600 R( _b, _c, _d, _e, _a, F4, 64 );
601 R( _a, _b, _c, _d, _e, F4, 65 );
602 R( _e, _a, _b, _c, _d, F4, 66 );
603 R( _d, _e, _a, _b, _c, F4, 67 );
604 R( _c, _d, _e, _a, _b, F4, 68 );
605 R( _b, _c, _d, _e, _a, F4, 69 );
606 R( _a, _b, _c, _d, _e, F4, 70 );
607 R( _e, _a, _b, _c, _d, F4, 71 );
608 R( _d, _e, _a, _b, _c, F4, 72 );
609 R( _c, _d, _e, _a, _b, F4, 73 );
610 R( _b, _c, _d, _e, _a, F4, 74 );
611 R( _a, _b, _c, _d, _e, F4, 75 );
612 R( _e, _a, _b, _c, _d, F4, 76 );
613 R( _d, _e, _a, _b, _c, F4, 77 );
614 R( _c, _d, _e, _a, _b, F4, 78 );
615 R( _b, _c, _d, _e, _a, F4, 79 );
616
617 mov sp, ROLDSTACK;
618
619 /* Update the chaining variables. */
620 ldm RSTATE, {RT0-RT3};
621 add _a, RT0;
622 ldr RT0, [RSTATE, #state_h4];
623 add _b, RT1;
624 add _c, RT2;
625 add _d, RT3;
626 /*vpop {q4-q7};*/
627 add _e, RT0;
628 stm RSTATE, {_a-_e};
629
630 pop {r4-r12, pc};
631
632.Ldo_nothing:
633 bx lr
634ENDPROC(sha1_transform_neon)
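For reference, the assembly above is an interleaved, software-pipelined version of the standard 80-round SHA-1 block transform: the F1-F4 round macros compute the usual round functions with scalar ARM instructions while the WPRECALC_* macros build the message schedule and pre-add the round constants (K1-K4) with NEON. The plain-C sketch below is not part of the patch and is unoptimized; it is only a correctness reference for what the macros compute.

/* Standalone, unoptimized SHA-1 block transform for comparison. */
#include <stdint.h>

static uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

static void sha1_block_ref(uint32_t state[5], const uint8_t data[64])
{
	uint32_t W[80], a, b, c, d, e, f, k, tmp;
	int i;

	for (i = 0; i < 16; i++)	/* big-endian load, cf. vrev32.8 */
		W[i] = ((uint32_t)data[4 * i] << 24) |
		       ((uint32_t)data[4 * i + 1] << 16) |
		       ((uint32_t)data[4 * i + 2] << 8) |
			(uint32_t)data[4 * i + 3];
	for (i = 16; i < 80; i++)	/* message schedule, cf. WPRECALC_16_31/32_79 */
		W[i] = rol32(W[i - 3] ^ W[i - 8] ^ W[i - 14] ^ W[i - 16], 1);

	a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4];

	for (i = 0; i < 80; i++) {
		if (i < 20) {		/* F1/K1: bic + and in the assembly */
			f = (b & c) | (~b & d);           k = 0x5A827999;
		} else if (i < 40) {	/* F2/K2 */
			f = b ^ c ^ d;                    k = 0x6ED9EBA1;
		} else if (i < 60) {	/* F3/K3 (majority) */
			f = (b & c) | (b & d) | (c & d);  k = 0x8F1BBCDC;
		} else {		/* F4/K4: same function as F2 */
			f = b ^ c ^ d;                    k = 0xCA62C1D6;
		}

		tmp = rol32(a, 5) + f + e + k + W[i];	/* K pre-added to W[] in the .S file */
		e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
	}

	state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e;
}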
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
index c494e579ffc3..84f2a756588b 100644
--- a/arch/arm/crypto/sha1_glue.c
+++ b/arch/arm/crypto/sha1_glue.c
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <asm/byteorder.h>
+#include <asm/crypto/sha1.h>
 
 
 asmlinkage void sha1_block_data_order(u32 *digest,
@@ -65,8 +66,8 @@ static int __sha1_update(struct sha1_state *sctx, const u8 *data,
 }
 
 
-static int sha1_update(struct shash_desc *desc, const u8 *data,
-		       unsigned int len)
+int sha1_update_arm(struct shash_desc *desc, const u8 *data,
+		    unsigned int len)
 {
 	struct sha1_state *sctx = shash_desc_ctx(desc);
 	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
@@ -81,6 +82,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
 	res = __sha1_update(sctx, data, len, partial);
 	return res;
 }
+EXPORT_SYMBOL_GPL(sha1_update_arm);
 
 
 /* Add padding and return the message digest. */
@@ -135,7 +137,7 @@ static int sha1_import(struct shash_desc *desc, const void *in)
 static struct shash_alg alg = {
 	.digestsize	= SHA1_DIGEST_SIZE,
 	.init		= sha1_init,
-	.update		= sha1_update,
+	.update		= sha1_update_arm,
 	.final		= sha1_final,
 	.export		= sha1_export,
 	.import		= sha1_import,
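The new #include <asm/crypto/sha1.h> and EXPORT_SYMBOL_GPL(sha1_update_arm) only make sense together with a declaration that this diffstat (limited to arch/arm/crypto) does not show, since the header lives under arch/arm/include. A hedged sketch of what that header is expected to provide, so that sha1_neon_glue.c can fall back to the scalar ARM code:

/* Assumed contents of arch/arm/include/asm/crypto/sha1.h (not in this diff). */
#ifndef ASM_ARM_CRYPTO_SHA1_H
#define ASM_ARM_CRYPTO_SHA1_H

#include <linux/crypto.h>
#include <crypto/sha.h>

extern int sha1_update_arm(struct shash_desc *desc, const u8 *data,
			   unsigned int len);

#endif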
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
new file mode 100644
index 000000000000..6f1b411b1d55
--- /dev/null
+++ b/arch/arm/crypto/sha1_neon_glue.c
@@ -0,0 +1,197 @@
1/*
2 * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
3 * ARM NEON instructions.
4 *
5 * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
6 *
7 * This file is based on sha1_generic.c and sha1_ssse3_glue.c:
8 * Copyright (c) Alan Smithee.
9 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
10 * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
11 * Copyright (c) Mathias Krause <minipli@googlemail.com>
12 * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option)
17 * any later version.
18 *
19 */
20
21#include <crypto/internal/hash.h>
22#include <linux/init.h>
23#include <linux/module.h>
24#include <linux/mm.h>
25#include <linux/cryptohash.h>
26#include <linux/types.h>
27#include <crypto/sha.h>
28#include <asm/byteorder.h>
29#include <asm/neon.h>
30#include <asm/simd.h>
31#include <asm/crypto/sha1.h>
32
33
34asmlinkage void sha1_transform_neon(void *state_h, const char *data,
35 unsigned int rounds);
36
37
38static int sha1_neon_init(struct shash_desc *desc)
39{
40 struct sha1_state *sctx = shash_desc_ctx(desc);
41
42 *sctx = (struct sha1_state){
43 .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
44 };
45
46 return 0;
47}
48
49static int __sha1_neon_update(struct shash_desc *desc, const u8 *data,
50 unsigned int len, unsigned int partial)
51{
52 struct sha1_state *sctx = shash_desc_ctx(desc);
53 unsigned int done = 0;
54
55 sctx->count += len;
56
57 if (partial) {
58 done = SHA1_BLOCK_SIZE - partial;
59 memcpy(sctx->buffer + partial, data, done);
60 sha1_transform_neon(sctx->state, sctx->buffer, 1);
61 }
62
63 if (len - done >= SHA1_BLOCK_SIZE) {
64 const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
65
66 sha1_transform_neon(sctx->state, data + done, rounds);
67 done += rounds * SHA1_BLOCK_SIZE;
68 }
69
70 memcpy(sctx->buffer, data + done, len - done);
71
72 return 0;
73}
74
75static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
76 unsigned int len)
77{
78 struct sha1_state *sctx = shash_desc_ctx(desc);
79 unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
80 int res;
81
82 /* Handle the fast case right here */
83 if (partial + len < SHA1_BLOCK_SIZE) {
84 sctx->count += len;
85 memcpy(sctx->buffer + partial, data, len);
86
87 return 0;
88 }
89
90 if (!may_use_simd()) {
91 res = sha1_update_arm(desc, data, len);
92 } else {
93 kernel_neon_begin();
94 res = __sha1_neon_update(desc, data, len, partial);
95 kernel_neon_end();
96 }
97
98 return res;
99}
100
101
102/* Add padding and return the message digest. */
103static int sha1_neon_final(struct shash_desc *desc, u8 *out)
104{
105 struct sha1_state *sctx = shash_desc_ctx(desc);
106 unsigned int i, index, padlen;
107 __be32 *dst = (__be32 *)out;
108 __be64 bits;
109 static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
110
111 bits = cpu_to_be64(sctx->count << 3);
112
113 /* Pad out to 56 mod 64 and append length */
114 index = sctx->count % SHA1_BLOCK_SIZE;
115 padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
116 if (!may_use_simd()) {
117 sha1_update_arm(desc, padding, padlen);
118 sha1_update_arm(desc, (const u8 *)&bits, sizeof(bits));
119 } else {
120 kernel_neon_begin();
121 /* We need to fill a whole block for __sha1_neon_update() */
122 if (padlen <= 56) {
123 sctx->count += padlen;
124 memcpy(sctx->buffer + index, padding, padlen);
125 } else {
126 __sha1_neon_update(desc, padding, padlen, index);
127 }
128 __sha1_neon_update(desc, (const u8 *)&bits, sizeof(bits), 56);
129 kernel_neon_end();
130 }
131
132 /* Store state in digest */
133 for (i = 0; i < 5; i++)
134 dst[i] = cpu_to_be32(sctx->state[i]);
135
136 /* Wipe context */
137 memset(sctx, 0, sizeof(*sctx));
138
139 return 0;
140}
141
142static int sha1_neon_export(struct shash_desc *desc, void *out)
143{
144 struct sha1_state *sctx = shash_desc_ctx(desc);
145
146 memcpy(out, sctx, sizeof(*sctx));
147
148 return 0;
149}
150
151static int sha1_neon_import(struct shash_desc *desc, const void *in)
152{
153 struct sha1_state *sctx = shash_desc_ctx(desc);
154
155 memcpy(sctx, in, sizeof(*sctx));
156
157 return 0;
158}
159
160static struct shash_alg alg = {
161 .digestsize = SHA1_DIGEST_SIZE,
162 .init = sha1_neon_init,
163 .update = sha1_neon_update,
164 .final = sha1_neon_final,
165 .export = sha1_neon_export,
166 .import = sha1_neon_import,
167 .descsize = sizeof(struct sha1_state),
168 .statesize = sizeof(struct sha1_state),
169 .base = {
170 .cra_name = "sha1",
171 .cra_driver_name = "sha1-neon",
172 .cra_priority = 250,
173 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
174 .cra_blocksize = SHA1_BLOCK_SIZE,
175 .cra_module = THIS_MODULE,
176 }
177};
178
179static int __init sha1_neon_mod_init(void)
180{
181 if (!cpu_has_neon())
182 return -ENODEV;
183
184 return crypto_register_shash(&alg);
185}
186
187static void __exit sha1_neon_mod_fini(void)
188{
189 crypto_unregister_shash(&alg);
190}
191
192module_init(sha1_neon_mod_init);
193module_exit(sha1_neon_mod_fini);
194
195MODULE_LICENSE("GPL");
196MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated");
197MODULE_ALIAS("sha1");
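Because the driver registers under the generic "sha1" name with a higher priority, existing users pick it up automatically. As a quick way to exercise it from userspace, the AF_ALG sketch below hashes "abc" through whatever "sha1" driver the kernel selected; it is illustrative only and not part of the patch.

/* Userspace AF_ALG example: SHA-1 of "abc" (expected a9993e36...). */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "hash",
		.salg_name   = "sha1",
	};
	unsigned char digest[20];
	const char msg[] = "abc";
	int tfmfd, opfd, i;

	tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
	if (tfmfd < 0 || bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	opfd = accept(tfmfd, NULL, 0);
	if (opfd < 0)
		return 1;

	write(opfd, msg, sizeof(msg) - 1);		/* feed the message */
	if (read(opfd, digest, sizeof(digest)) != sizeof(digest))
		return 1;

	for (i = 0; i < 20; i++)
		printf("%02x", digest[i]);
	printf("\n");

	close(opfd);
	close(tfmfd);
	return 0;
}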