diff options
author | Nicolas Pitre <nico@cam.org> | 2005-10-28 10:26:40 -0400 |
---|---|---|
committer | Russell King <rmk+kernel@arm.linux.org.uk> | 2005-10-28 10:26:40 -0400 |
commit | c09f98271f685af349d3f0199360f1c0e85550e0 (patch) | |
tree | 2e8a393d76e386ff64af2a786cf5baf370f3823b | |
parent | 0b7cd62ecdc1f09b7df4608a3fee644b1c27985b (diff) |
[ARM] 2930/1: optimized sha1 implementation for ARM
Patch from Nicolas Pitre
Here's an ARM assembly SHA1 implementation to replace the default C
version. It is approximately 50% faster than the generic C version. On
an XScale processor running at 400MHz:
generic C version: 9.8 MB/s
my version: 14.5 MB/s
This code is useful to quite a few callers in the tree:
crypto/sha1.c: sha_transform(sctx->state, sctx->buffer, temp);
crypto/sha1.c: sha_transform(sctx->state, &data[i], temp);
drivers/char/random.c: sha_transform(buf, (__u8 *)r->pool+i, buf + 5);
drivers/char/random.c: sha_transform(buf, (__u8 *)data, buf + 5);
net/ipv4/syncookies.c: sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
Signed-off-by: Nicolas Pitre <nico@cam.org>
Seems to work fine on big-endian as well.
Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
-rw-r--r-- | arch/arm/lib/Makefile | 2 | ||||
-rw-r--r-- | arch/arm/lib/sha1.S | 206 |
2 files changed, 207 insertions, 1 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 8725d63e4219..71e5b99e519e 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile | |||
@@ -11,7 +11,7 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \ | |||
11 | strnlen_user.o strchr.o strrchr.o testchangebit.o \ | 11 | strnlen_user.o strchr.o strrchr.o testchangebit.o \ |
12 | testclearbit.o testsetbit.o uaccess.o getuser.o \ | 12 | testclearbit.o testsetbit.o uaccess.o getuser.o \ |
13 | putuser.o ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ | 13 | putuser.o ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ |
14 | ucmpdi2.o lib1funcs.o div64.o \ | 14 | ucmpdi2.o lib1funcs.o div64.o sha1.o \ |
15 | io-readsb.o io-writesb.o io-readsl.o io-writesl.o | 15 | io-readsb.o io-writesb.o io-readsl.o io-writesl.o |
16 | 16 | ||
17 | ifeq ($(CONFIG_CPU_32v3),y) | 17 | ifeq ($(CONFIG_CPU_32v3),y) |
diff --git a/arch/arm/lib/sha1.S b/arch/arm/lib/sha1.S new file mode 100644 index 000000000000..ff6ece487ffc --- /dev/null +++ b/arch/arm/lib/sha1.S | |||
@@ -0,0 +1,206 @@ | |||
1 | /* | ||
2 | * linux/arch/arm/lib/sha1.S | ||
3 | * | ||
4 | * SHA transform optimized for ARM | ||
5 | * | ||
6 | * Copyright: (C) 2005 by Nicolas Pitre <nico@cam.org> | ||
7 | * Created: September 17, 2005 | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License version 2 as | ||
11 | * published by the Free Software Foundation. | ||
12 | * | ||
13 | * The reference implementation for this code is linux/lib/sha1.c | ||
14 | */ | ||
15 | |||
16 | #include <linux/linkage.h> | ||
17 | |||
18 | .text | ||
19 | |||
20 | |||
21 | /* | ||
22 | * void sha_transform(__u32 *digest, const char *in, __u32 *W) | ||
23 | * | ||
24 | * Note: the "in" ptr may be unaligned. | ||
 | * | ||
 | * Register arguments: r0 = digest, r1 = in, r2 = W. | ||
 | * | ||
 | * digest: five 32-bit words, read before the rounds and updated in | ||
 | *         place at the end. | ||
 | * in:     64 bytes of input, consumed as sixteen big-endian words. | ||
 | * W:      work area; W[0..79] are written here, so it must hold at | ||
 | *         least 80 words. | ||
 | * | ||
 | * Steps: | ||
 | *  1. W[0..15] = the input words in big-endian order.  When __ARMEB__ | ||
 | *     is not defined they are assembled one byte at a time, which is | ||
 | *     what makes an unaligned "in" safe; when it is defined the 64 | ||
 | *     bytes are copied as-is with memcpy, and W is recovered afterwards | ||
 | *     from the value memcpy leaves in r0. | ||
 | *  2. W[16..79] are derived from the earlier entries. | ||
 | *  3. 80 rounds run as four groups of 20 (4 loop passes x 5 unrolled | ||
 | *     rounds), using sha_f1, sha_f2, sha_f3, sha_f2 and the four | ||
 | *     constants at .L_sha_K, in that order. | ||
 | *  4. The five working values are added to the digest words and the | ||
 | *     sums are stored back through r0. | ||
 | * | ||
 | * r4 - r8 and lr are saved on the stack on entry and restored on exit; | ||
 | * r1 - r3 and ip are used as scratch. | ||
25 | */ | ||
26 | |||
27 | ENTRY(sha_transform) | ||
28 | |||
29 | stmfd sp!, {r4 - r8, lr} | ||
30 | |||
31 | @ for (i = 0; i < 16; i++) | ||
32 | @ W[i] = be32_to_cpu(in[i]); */ | ||
33 | |||
34 | #ifdef __ARMEB__ | ||
35 | mov r4, r0 | ||
36 | mov r0, r2 | ||
37 | mov r2, #64 | ||
38 | bl memcpy | ||
39 | mov r2, r0 | ||
40 | mov r0, r4 | ||
41 | #else | ||
42 | mov r3, r2 | ||
43 | mov lr, #16 | ||
44 | 1: ldrb r4, [r1], #1 | ||
45 | ldrb r5, [r1], #1 | ||
46 | ldrb r6, [r1], #1 | ||
47 | ldrb r7, [r1], #1 | ||
48 | subs lr, lr, #1 | ||
49 | orr r5, r5, r4, lsl #8 | ||
50 | orr r6, r6, r5, lsl #8 | ||
51 | orr r7, r7, r6, lsl #8 | ||
52 | str r7, [r3], #4 | ||
53 | bne 1b | ||
54 | #endif | ||
55 | |||
56 | @ for (i = 0; i < 64; i++) | ||
57 | @ W[i+16] = ror(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 31); | ||
58 | |||
59 | sub r3, r2, #4 | ||
60 | mov lr, #64 | ||
61 | 2: ldr r4, [r3, #4]! | ||
62 | subs lr, lr, #1 | ||
63 | ldr r5, [r3, #8] | ||
64 | ldr r6, [r3, #32] | ||
65 | ldr r7, [r3, #52] | ||
66 | eor r4, r4, r5 | ||
67 | eor r4, r4, r6 | ||
68 | eor r4, r4, r7 | ||
69 | mov r4, r4, ror #31 | ||
70 | str r4, [r3, #64] | ||
71 | bne 2b | ||
72 | |||
73 | /* | ||
74 | * The SHA functions are: | ||
75 | * | ||
76 | * f1(B,C,D) = (D ^ (B & (C ^ D))) | ||
77 | * f2(B,C,D) = (B ^ C ^ D) | ||
78 | * f3(B,C,D) = ((B & C) | (D & (B | C))) | ||
79 | * | ||
80 | * Then the sub-blocks are processed as follows: | ||
81 | * | ||
82 | * A' = ror(A, 27) + f(B,C,D) + E + K + *W++ | ||
83 | * B' = A | ||
84 | * C' = ror(B, 2) | ||
85 | * D' = C | ||
86 | * E' = D | ||
87 | * | ||
88 | * We therefore unroll each loop 5 times to avoid register shuffling. | ||
89 | * Also the ror for C (and also D and E which are successively derived | ||
90 | * from it) is applied in place to cut on an additional mov insn for | ||
91 | * each round. | ||
 | * | ||
 | * Concretely, a register in the C, D or E position holds its value | ||
 | * rotated left by 2, and each use applies "ror #2" to get the real | ||
 | * value back.  That is why r6 - r8 are rotated by 30 before the first | ||
 | * round and why the final additions rotate r6 - r8 right by 2. | ||
 | * | ||
 | * Register use inside the sha_f* macros: | ||
 | *   r1     = K for the current group of rounds | ||
 | *   r2     = pointer to the next W word (post-incremented by the load) | ||
 | *   r3, ip = scratch | ||
 | * Each macro leaves the new A in the register passed as E, so the | ||
 | * invocations rotate the register list by one position per round | ||
 | * instead of moving values between registers. | ||
92 | */ | ||
93 | |||
94 | .macro sha_f1, A, B, C, D, E | ||
95 | ldr r3, [r2], #4 | ||
96 | eor ip, \C, \D | ||
97 | add \E, r1, \E, ror #2 | ||
98 | and ip, \B, ip, ror #2 | ||
99 | add \E, \E, \A, ror #27 | ||
100 | eor ip, ip, \D, ror #2 | ||
101 | add \E, \E, r3 | ||
102 | add \E, \E, ip | ||
103 | .endm | ||
104 | |||
105 | .macro sha_f2, A, B, C, D, E | ||
106 | ldr r3, [r2], #4 | ||
107 | add \E, r1, \E, ror #2 | ||
108 | eor ip, \B, \C, ror #2 | ||
109 | add \E, \E, \A, ror #27 | ||
110 | eor ip, ip, \D, ror #2 | ||
111 | add \E, \E, r3 | ||
112 | add \E, \E, ip | ||
113 | .endm | ||
114 | |||
115 | .macro sha_f3, A, B, C, D, E | ||
116 | ldr r3, [r2], #4 | ||
117 | add \E, r1, \E, ror #2 | ||
118 | orr ip, \B, \C, ror #2 | ||
119 | add \E, \E, \A, ror #27 | ||
120 | and ip, ip, \D, ror #2 | ||
121 | add \E, \E, r3 | ||
122 | and r3, \B, \C, ror #2 | ||
123 | orr ip, ip, r3 | ||
124 | add \E, \E, ip | ||
125 | .endm | ||
126 | |||
127 | ldmia r0, {r4 - r8} | ||
128 | |||
129 | mov lr, #4 | ||
130 | ldr r1, .L_sha_K + 0 | ||
131 | |||
132 | /* adjust initial values: keep C, D and E rotated left by 2 (see above) */ | ||
133 | mov r6, r6, ror #30 | ||
134 | mov r7, r7, ror #30 | ||
135 | mov r8, r8, ror #30 | ||
136 | |||
137 | 3: subs lr, lr, #1 | ||
138 | sha_f1 r4, r5, r6, r7, r8 | ||
139 | sha_f1 r8, r4, r5, r6, r7 | ||
140 | sha_f1 r7, r8, r4, r5, r6 | ||
141 | sha_f1 r6, r7, r8, r4, r5 | ||
142 | sha_f1 r5, r6, r7, r8, r4 | ||
143 | bne 3b | ||
144 | |||
145 | ldr r1, .L_sha_K + 4 | ||
146 | mov lr, #4 | ||
147 | |||
148 | 4: subs lr, lr, #1 | ||
149 | sha_f2 r4, r5, r6, r7, r8 | ||
150 | sha_f2 r8, r4, r5, r6, r7 | ||
151 | sha_f2 r7, r8, r4, r5, r6 | ||
152 | sha_f2 r6, r7, r8, r4, r5 | ||
153 | sha_f2 r5, r6, r7, r8, r4 | ||
154 | bne 4b | ||
155 | |||
156 | ldr r1, .L_sha_K + 8 | ||
157 | mov lr, #4 | ||
158 | |||
159 | 5: subs lr, lr, #1 | ||
160 | sha_f3 r4, r5, r6, r7, r8 | ||
161 | sha_f3 r8, r4, r5, r6, r7 | ||
162 | sha_f3 r7, r8, r4, r5, r6 | ||
163 | sha_f3 r6, r7, r8, r4, r5 | ||
164 | sha_f3 r5, r6, r7, r8, r4 | ||
165 | bne 5b | ||
166 | |||
167 | ldr r1, .L_sha_K + 12 | ||
168 | mov lr, #4 | ||
169 | |||
170 | 6: subs lr, lr, #1 | ||
171 | sha_f2 r4, r5, r6, r7, r8 | ||
172 | sha_f2 r8, r4, r5, r6, r7 | ||
173 | sha_f2 r7, r8, r4, r5, r6 | ||
174 | sha_f2 r6, r7, r8, r4, r5 | ||
175 | sha_f2 r5, r6, r7, r8, r4 | ||
176 | bne 6b | ||
177 | |||
178 | ldmia r0, {r1, r2, r3, ip, lr} | ||
179 | add r4, r1, r4 | ||
180 | add r5, r2, r5 | ||
181 | add r6, r3, r6, ror #2 | ||
182 | add r7, ip, r7, ror #2 | ||
183 | add r8, lr, r8, ror #2 | ||
184 | stmia r0, {r4 - r8} | ||
185 | |||
186 | ldmfd sp!, {r4 - r8, pc} | ||
187 | |||
188 | .L_sha_K: | ||
189 | .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 | ||
190 | |||
191 | |||
192 | /* | ||
193 | * void sha_init(__u32 *buf) | ||
 | * | ||
 | * Copies the five words at .L_sha_initial_digest to buf[0..4] (buf is | ||
 | * passed in r0). | ||
 | * | ||
 | * lr is needed as the fifth transfer register for the ldmia/stmia pair, | ||
 | * so the return address is pushed first and the function returns by | ||
 | * popping it straight into pc.  r1 - r3 and ip are overwritten. | ||
194 | */ | ||
195 | |||
196 | .L_sha_initial_digest: | ||
197 | .word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 | ||
198 | |||
199 | ENTRY(sha_init) | ||
200 | |||
201 | str lr, [sp, #-4]! | ||
202 | adr r1, .L_sha_initial_digest | ||
203 | ldmia r1, {r1, r2, r3, ip, lr} | ||
204 | stmia r0, {r1, r2, r3, ip, lr} | ||
205 | ldr pc, [sp], #4 | ||
206 | |||