diff options
| author | Nicolas Pitre <nico@cam.org> | 2005-10-28 10:26:40 -0400 |
|---|---|---|
| committer | Russell King <rmk+kernel@arm.linux.org.uk> | 2005-10-28 10:26:40 -0400 |
| commit | c09f98271f685af349d3f0199360f1c0e85550e0 (patch) | |
| tree | 2e8a393d76e386ff64af2a786cf5baf370f3823b | |
| parent | 0b7cd62ecdc1f09b7df4608a3fee644b1c27985b (diff) | |
[ARM] 2930/1: optimized sha1 implementation for ARM
Patch from Nicolas Pitre
Here's an ARM assembly SHA1 implementation to replace the default C
version. It is approximately 50% faster than the generic C version. On
an XScale processor running at 400MHz:
generic C version: 9.8 MB/s
my version: 14.5 MB/s
This code is useful to quite a few callers in the tree:
crypto/sha1.c: sha_transform(sctx->state, sctx->buffer, temp);
crypto/sha1.c: sha_transform(sctx->state, &data[i], temp);
drivers/char/random.c: sha_transform(buf, (__u8 *)r->pool+i, buf + 5);
drivers/char/random.c: sha_transform(buf, (__u8 *)data, buf + 5);
net/ipv4/syncookies.c: sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
Signed-off-by: Nicolas Pitre <nico@cam.org>
Seems to work fine on big-endian as well.
Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
| -rw-r--r-- | arch/arm/lib/Makefile | 2 | ||||
| -rw-r--r-- | arch/arm/lib/sha1.S | 206 |
2 files changed, 207 insertions, 1 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 8725d63e4219..71e5b99e519e 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile | |||
| @@ -11,7 +11,7 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \ | |||
| 11 | strnlen_user.o strchr.o strrchr.o testchangebit.o \ | 11 | strnlen_user.o strchr.o strrchr.o testchangebit.o \ |
| 12 | testclearbit.o testsetbit.o uaccess.o getuser.o \ | 12 | testclearbit.o testsetbit.o uaccess.o getuser.o \ |
| 13 | putuser.o ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ | 13 | putuser.o ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ |
| 14 | ucmpdi2.o lib1funcs.o div64.o \ | 14 | ucmpdi2.o lib1funcs.o div64.o sha1.o \ |
| 15 | io-readsb.o io-writesb.o io-readsl.o io-writesl.o | 15 | io-readsb.o io-writesb.o io-readsl.o io-writesl.o |
| 16 | 16 | ||
| 17 | ifeq ($(CONFIG_CPU_32v3),y) | 17 | ifeq ($(CONFIG_CPU_32v3),y) |
diff --git a/arch/arm/lib/sha1.S b/arch/arm/lib/sha1.S new file mode 100644 index 000000000000..ff6ece487ffc --- /dev/null +++ b/arch/arm/lib/sha1.S | |||
| @@ -0,0 +1,206 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/arm/lib/sha1.S | ||
| 3 | * | ||
| 4 | * SHA transform optimized for ARM | ||
| 5 | * | ||
| 6 | * Copyright: (C) 2005 by Nicolas Pitre <nico@cam.org> | ||
| 7 | * Created: September 17, 2005 | ||
| 8 | * | ||
| 9 | * This program is free software; you can redistribute it and/or modify | ||
| 10 | * it under the terms of the GNU General Public License version 2 as | ||
| 11 | * published by the Free Software Foundation. | ||
| 12 | * | ||
| 13 | * The reference implementation for this code is linux/lib/sha1.c | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/linkage.h> | ||
| 17 | |||
| 18 | .text | ||
| 19 | |||
| 20 | |||
| 21 | /* | ||
| 22 | * void sha_transform(__u32 *digest, const char *in, __u32 *W) | ||
| 23 | * digest (r0): five state words, updated in place. in (r1): 64 input bytes. W (r2): scratch, 80 words are written. | ||
| 24 | * Note: the "in" ptr may be unaligned (it is read bytewise on little-endian and handed to memcpy on big-endian). | ||
| 25 | */ | ||
| 26 | |||
| 27 | ENTRY(sha_transform) | ||
| 28 | |||
| 29 | stmfd sp!, {r4 - r8, lr} /* save r4-r8 and lr; the matching ldmfd at the end returns */ | ||
| 30 | |||
| 31 | @ for (i = 0; i < 16; i++) | ||
| 32 | @ W[i] = be32_to_cpu(in[i]); | ||
| 33 | |||
| 34 | #ifdef __ARMEB__ | ||
| 35 | mov r4, r0 /* keep the digest pointer in r4 across the call */ | ||
| 36 | mov r0, r2 /* memcpy destination = W */ | ||
| 37 | mov r2, #64 /* memcpy length = 64 bytes, i.e. 16 words */ | ||
| 38 | bl memcpy /* r1 still holds "in"; no byte swapping is done on this path */ | ||
| 39 | mov r2, r0 /* r2 = W again, taken from memcpy's return value */ | ||
| 40 | mov r0, r4 /* r0 = digest pointer again */ | ||
| 41 | #else | ||
| 42 | mov r3, r2 /* r3 = write cursor into W; r2 keeps pointing at W[0] */ | ||
| 43 | mov lr, #16 /* lr = number of words left to assemble */ | ||
| 44 | 1: ldrb r4, [r1], #1 /* first byte of the word: ends up most significant */ | ||
| 45 | ldrb r5, [r1], #1 | ||
| 46 | ldrb r6, [r1], #1 | ||
| 47 | ldrb r7, [r1], #1 /* fourth byte: ends up least significant */ | ||
| 48 | subs lr, lr, #1 /* flags consumed by the bne below; orr/str leave them alone */ | ||
| 49 | orr r5, r5, r4, lsl #8 | ||
| 50 | orr r6, r6, r5, lsl #8 | ||
| 51 | orr r7, r7, r6, lsl #8 /* r7 = the four bytes combined, first byte on top */ | ||
| 52 | str r7, [r3], #4 /* W[i] = r7, advance the cursor */ | ||
| 53 | bne 1b | ||
| 54 | #endif | ||
| 55 | |||
| 56 | @ for (i = 0; i < 64; i++) | ||
| 57 | @ W[i+16] = ror(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 31); | ||
| 58 | |||
| 59 | sub r3, r2, #4 /* bias by one word so the pre-indexed load below starts at W[0] */ | ||
| 60 | mov lr, #64 /* 64 words to generate: W[16] .. W[79] */ | ||
| 61 | 2: ldr r4, [r3, #4]! /* r3 = &W[i] (writeback), r4 = W[i] */ | ||
| 62 | subs lr, lr, #1 /* flags consumed by the bne below */ | ||
| 63 | ldr r5, [r3, #8] /* W[i+2] */ | ||
| 64 | ldr r6, [r3, #32] /* W[i+8] */ | ||
| 65 | ldr r7, [r3, #52] /* W[i+13] */ | ||
| 66 | eor r4, r4, r5 | ||
| 67 | eor r4, r4, r6 | ||
| 68 | eor r4, r4, r7 | ||
| 69 | mov r4, r4, ror #31 /* rotate right by 31, same as rotate left by 1 */ | ||
| 70 | str r4, [r3, #64] /* W[i+16] */ | ||
| 71 | bne 2b | ||
| 72 | |||
| 73 | /* | ||
| 74 | * The SHA functions are: | ||
| 75 | * | ||
| 76 | * f1(B,C,D) = (D ^ (B & (C ^ D))) | ||
| 77 | * f2(B,C,D) = (B ^ C ^ D) | ||
| 78 | * f3(B,C,D) = ((B & C) | (D & (B | C))) | ||
| 79 | * | ||
| 80 | * Then the sub-blocks are processed as follows: | ||
| 81 | * | ||
| 82 | * A' = ror(A, 27) + f(B,C,D) + E + K + *W++ | ||
| 83 | * B' = A | ||
| 84 | * C' = ror(B, 2) | ||
| 85 | * D' = C | ||
| 86 | * E' = D | ||
| 87 | * | ||
| 88 | * We therefore unroll each loop 5 times to avoid register shuffling. | ||
| 89 | * Also the ror for C (and also D and E which are successively derived | ||
| 90 | * from it) is applied in place to cut on an additional mov insn for | ||
| 91 | * each round. | ||
| 92 | * In the macros below: r1 = K, r2 = W pointer (post-incremented), r3 and ip = scratch; the new A is left in the register passed as E. | ||
| 93 | */ | ||
| 94 | .macro sha_f1, A, B, C, D, E | ||
| 95 | ldr r3, [r2], #4 /* r3 = *W++ */ | ||
| 96 | eor ip, \C, \D /* xor of the stored forms; the shared ror #2 is applied on the next use */ | ||
| 97 | add \E, r1, \E, ror #2 /* E = K + E, E read through ror #2 */ | ||
| 98 | and ip, \B, ip, ror #2 /* ip = B & (C ^ D) */ | ||
| 99 | add \E, \E, \A, ror #27 /* + ror(A, 27) */ | ||
| 100 | eor ip, ip, \D, ror #2 /* ip = f1(B,C,D) */ | ||
| 101 | add \E, \E, r3 /* + W word */ | ||
| 102 | add \E, \E, ip /* + f1; no instruction in this macro sets flags */ | ||
| 103 | .endm | ||
| 104 | |||
| 105 | .macro sha_f2, A, B, C, D, E | ||
| 106 | ldr r3, [r2], #4 /* r3 = *W++ */ | ||
| 107 | add \E, r1, \E, ror #2 /* E = K + E, E read through ror #2 */ | ||
| 108 | eor ip, \B, \C, ror #2 /* ip = B ^ C */ | ||
| 109 | add \E, \E, \A, ror #27 /* + ror(A, 27) */ | ||
| 110 | eor ip, ip, \D, ror #2 /* ip = f2(B,C,D) */ | ||
| 111 | add \E, \E, r3 /* + W word */ | ||
| 112 | add \E, \E, ip /* + f2; no instruction in this macro sets flags */ | ||
| 113 | .endm | ||
| 114 | |||
| 115 | .macro sha_f3, A, B, C, D, E | ||
| 116 | ldr r3, [r2], #4 /* r3 = *W++ */ | ||
| 117 | add \E, r1, \E, ror #2 /* E = K + E, E read through ror #2 */ | ||
| 118 | orr ip, \B, \C, ror #2 /* ip = B or C */ | ||
| 119 | add \E, \E, \A, ror #27 /* + ror(A, 27) */ | ||
| 120 | and ip, ip, \D, ror #2 /* ip = D and (B or C) */ | ||
| 121 | add \E, \E, r3 /* + W word; r3 is free for reuse after this */ | ||
| 122 | and r3, \B, \C, ror #2 /* r3 = B and C */ | ||
| 123 | orr ip, ip, r3 /* ip = f3(B,C,D) */ | ||
| 124 | add \E, \E, ip /* + f3; no instruction in this macro sets flags */ | ||
| 125 | .endm | ||
| 126 | |||
| 127 | ldmia r0, {r4 - r8} /* r4..r8 = A..E loaded from digest[0..4] */ | ||
| 128 | |||
| 129 | mov lr, #4 /* 4 passes of 5 unrolled rounds = 20 rounds per K */ | ||
| 130 | ldr r1, .L_sha_K + 0 /* r1 = K for the first 20 rounds */ | ||
| 131 | |||
| 132 | /* adjust initial values */ | ||
| 133 | mov r6, r6, ror #30 /* store C so that reading it with ror #2 gives the real value */ | ||
| 134 | mov r7, r7, ror #30 /* same for D */ | ||
| 135 | mov r8, r8, ror #30 /* same for E */ | ||
| 136 | |||
| 137 | 3: subs lr, lr, #1 /* flags stay valid until the bne: the macros never set them */ | ||
| 138 | sha_f1 r4, r5, r6, r7, r8 /* each following line shifts the A..E roles by one register */ | ||
| 139 | sha_f1 r8, r4, r5, r6, r7 | ||
| 140 | sha_f1 r7, r8, r4, r5, r6 | ||
| 141 | sha_f1 r6, r7, r8, r4, r5 | ||
| 142 | sha_f1 r5, r6, r7, r8, r4 /* after five rounds r4..r8 are A..E again */ | ||
| 143 | bne 3b | ||
| 144 | |||
| 145 | ldr r1, .L_sha_K + 4 /* r1 = K for rounds 20-39 */ | ||
| 146 | mov lr, #4 | ||
| 147 | |||
| 148 | 4: subs lr, lr, #1 | ||
| 149 | sha_f2 r4, r5, r6, r7, r8 | ||
| 150 | sha_f2 r8, r4, r5, r6, r7 | ||
| 151 | sha_f2 r7, r8, r4, r5, r6 | ||
| 152 | sha_f2 r6, r7, r8, r4, r5 | ||
| 153 | sha_f2 r5, r6, r7, r8, r4 | ||
| 154 | bne 4b | ||
| 155 | |||
| 156 | ldr r1, .L_sha_K + 8 /* r1 = K for rounds 40-59 */ | ||
| 157 | mov lr, #4 | ||
| 158 | |||
| 159 | 5: subs lr, lr, #1 | ||
| 160 | sha_f3 r4, r5, r6, r7, r8 | ||
| 161 | sha_f3 r8, r4, r5, r6, r7 | ||
| 162 | sha_f3 r7, r8, r4, r5, r6 | ||
| 163 | sha_f3 r6, r7, r8, r4, r5 | ||
| 164 | sha_f3 r5, r6, r7, r8, r4 | ||
| 165 | bne 5b | ||
| 166 | |||
| 167 | ldr r1, .L_sha_K + 12 /* r1 = K for rounds 60-79; f2 is reused for these rounds */ | ||
| 168 | mov lr, #4 | ||
| 169 | |||
| 170 | 6: subs lr, lr, #1 | ||
| 171 | sha_f2 r4, r5, r6, r7, r8 | ||
| 172 | sha_f2 r8, r4, r5, r6, r7 | ||
| 173 | sha_f2 r7, r8, r4, r5, r6 | ||
| 174 | sha_f2 r6, r7, r8, r4, r5 | ||
| 175 | sha_f2 r5, r6, r7, r8, r4 | ||
| 176 | bne 6b | ||
| 177 | |||
| 178 | ldmia r0, {r1, r2, r3, ip, lr} /* reload the incoming digest words; K and the W pointer are no longer needed */ | ||
| 179 | add r4, r1, r4 /* digest[0] += A */ | ||
| 180 | add r5, r2, r5 /* digest[1] += B */ | ||
| 181 | add r6, r3, r6, ror #2 /* digest[2] += C, undoing the in-place rotation */ | ||
| 182 | add r7, ip, r7, ror #2 /* digest[3] += D, undoing the in-place rotation */ | ||
| 183 | add r8, lr, r8, ror #2 /* digest[4] += E, undoing the in-place rotation */ | ||
| 184 | stmia r0, {r4 - r8} /* write the five updated words back to digest */ | ||
| 185 | |||
| 186 | ldmfd sp!, {r4 - r8, pc} /* restore r4-r8 and return by loading the saved lr into pc */ | ||
| 187 | |||
| 188 | .L_sha_K: | ||
| 189 | .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 /* one K per group of 20 rounds, in order of use */ | ||
| 190 | |||
| 191 | |||
| 192 | /* | ||
| 193 | * void sha_init(__u32 *buf) | ||
| 194 | * Copies the five words of .L_sha_initial_digest into buf[0..4]. */ | ||
| 195 | |||
| 196 | .L_sha_initial_digest: | ||
| 197 | .word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 /* the five words stored by sha_init, in buf order */ | ||
| 198 | |||
| 199 | ENTRY(sha_init) | ||
| 200 | |||
| 201 | str lr, [sp, #-4]! /* push lr: it is used below as the fifth data register */ | ||
| 202 | adr r1, .L_sha_initial_digest /* r1 = pc-relative address of the table above */ | ||
| 203 | ldmia r1, {r1, r2, r3, ip, lr} /* load all five words; r1 is overwritten by the first one */ | ||
| 204 | stmia r0, {r1, r2, r3, ip, lr} /* buf[0..4] = the five words */ | ||
| 205 | ldr pc, [sp], #4 /* pop the saved return address straight into pc */ | ||
| 206 | |||
