Diffstat (limited to 'arch/powerpc/lib/checksum_64.S')
 -rw-r--r--   arch/powerpc/lib/checksum_64.S   229
 1 file changed, 229 insertions, 0 deletions
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
new file mode 100644
index 000000000000..ef96c6c58efc
--- /dev/null
+++ b/arch/powerpc/lib/checksum_64.S
@@ -0,0 +1,229 @@
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
        lwz     r0,0(r3)
        lwzu    r5,4(r3)
        addic.  r4,r4,-2
        addc    r0,r0,r5
        mtctr   r4
        blelr-
1:      lwzu    r4,4(r3)
        adde    r0,r0,r4
        bdnz    1b
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32-bit halves together */
        add     r0,r0,r4
        srdi    r0,r0,32
        rlwinm  r3,r0,16,0,31           /* fold two halves together */
        add     r3,r0,r3
        not     r3,r3
        srwi    r3,r3,16
        blr

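The folding at the end of ip_fast_csum is the usual 1's complement reduction: the 64-bit accumulator is folded to 32 bits, then to 16 bits, then complemented. For readers following the addc/adde and rldicl/rlwinm sequence, here is a minimal C sketch of the same arithmetic. It is not part of the patch; the name ip_fast_csum_ref is made up for illustration, and it assumes the big-endian word layout this file targets.

#include <stdint.h>

/* Hypothetical reference version, illustration only: sum 'ihl' 32-bit
 * words of an IP header, fold the wide accumulator down to 16 bits,
 * and return the 1's complement. */
static uint16_t ip_fast_csum_ref(const void *iph, unsigned int ihl)
{
        const uint32_t *p = iph;
        uint64_t sum = 0;
        unsigned int i;

        for (i = 0; i < ihl; i++)               /* ihl is in 32-bit words, >= 5 */
                sum += p[i];

        sum = (sum & 0xffffffffULL) + (sum >> 32);      /* fold 64 -> 32 */
        sum = (sum & 0xffffffffULL) + (sum >> 32);
        sum = (sum & 0xffff) + (sum >> 16);             /* fold 32 -> 16 */
        sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;                          /* 1's complement */
}
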
/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
_GLOBAL(csum_tcpudp_magic)
        rlwimi  r5,r6,16,0,15           /* put proto in upper half of len */
        addc    r0,r3,r4                /* add 4 32-bit words together */
        adde    r0,r0,r5
        adde    r0,r0,r7
        rldicl  r4,r0,32,0              /* fold 64 bit value */
        add     r0,r4,r0
        srdi    r0,r0,32
        rlwinm  r3,r0,16,0,31           /* fold two halves together */
        add     r3,r0,r3
        not     r3,r3
        srwi    r3,r3,16
        blr

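csum_tcpudp_magic sums four 32-bit quantities: saddr, daddr, the len word with proto merged into its upper halfword (the rlwimi above), and the incoming sum, then folds and complements the result. A minimal C sketch of that arithmetic, for illustration only (the name csum_tcpudp_magic_ref is hypothetical, and len is assumed to fit in 16 bits, as it does for TCP/UDP):

#include <stdint.h>

/* Hypothetical reference version, illustration only. */
static uint16_t csum_tcpudp_magic_ref(uint32_t saddr, uint32_t daddr,
                                      uint16_t len, uint8_t proto, uint32_t sum)
{
        /* proto goes in the upper halfword, len in the lower, as the
         * rlwimi above arranges. */
        uint64_t s = (uint64_t)saddr + daddr + sum +
                     (((uint32_t)proto << 16) | len);

        s = (s & 0xffffffffULL) + (s >> 32);    /* fold 64 -> 32 */
        s = (s & 0xffffffffULL) + (s >> 32);
        s = (s & 0xffff) + (s >> 16);           /* fold 32 -> 16 */
        s = (s & 0xffff) + (s >> 16);
        return (uint16_t)~s;
}
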
/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * This code assumes at least halfword alignment, though the length
 * can be any number of bytes. The sum is accumulated in r5.
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
        subi    r3,r3,8                 /* we'll offset by 8 for the loads */
        srdi.   r6,r4,3                 /* divide by 8 for doubleword count */
        addic   r5,r5,0                 /* clear carry */
        beq     3f                      /* if we're doing < 8 bytes */
        andi.   r0,r3,2                 /* aligned on a word boundary already? */
        beq+    1f
        lhz     r6,8(r3)                /* do 2 bytes to get aligned */
        addi    r3,r3,2
        subi    r4,r4,2
        addc    r5,r5,r6
        srdi.   r6,r4,3                 /* recompute number of doublewords */
        beq     3f                      /* any left? */
1:      mtctr   r6
2:      ldu     r6,8(r3)                /* main sum loop */
        adde    r5,r5,r6
        bdnz    2b
        andi.   r4,r4,7                 /* compute bytes left to sum after doublewords */
3:      cmpwi   0,r4,4                  /* is at least a full word left? */
        blt     4f
        lwz     r6,8(r3)                /* sum this word */
        addi    r3,r3,4
        subi    r4,r4,4
        adde    r5,r5,r6
4:      cmpwi   0,r4,2                  /* is at least a halfword left? */
        blt+    5f
        lhz     r6,8(r3)                /* sum this halfword */
        addi    r3,r3,2
        subi    r4,r4,2
        adde    r5,r5,r6
5:      cmpwi   0,r4,1                  /* is at least a byte left? */
        bne+    6f
        lbz     r6,8(r3)                /* sum this byte */
        slwi    r6,r6,8                 /* this byte is assumed to be the upper byte of a halfword */
        adde    r5,r5,r6
6:      addze   r5,r5                   /* add in final carry */
        rldicl  r4,r5,32,0              /* fold two 32-bit halves together */
        add     r3,r4,r5
        srdi    r3,r3,32
        blr

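The doubleword loop in csum_partial is just a fast way of accumulating the buffer into a wide 1's complement sum before folding back to 32 bits; the trailing word/halfword/byte cases handle the remainder, with an odd final byte treated as the upper byte of a halfword. A byte-level C sketch of the equivalent arithmetic, for illustration only (hypothetical name, big-endian byte order assumed, no attempt to mirror the aligned 64-bit loads):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical reference version of the accumulation, illustration only. */
static uint32_t csum_partial_ref(const unsigned char *buff, size_t len,
                                 uint32_t sum)
{
        uint64_t s = sum;

        while (len >= 2) {              /* sum big-endian halfwords */
                s += (uint32_t)((buff[0] << 8) | buff[1]);
                buff += 2;
                len -= 2;
        }
        if (len)                        /* odd trailing byte: upper byte of a halfword */
                s += (uint32_t)buff[0] << 8;

        s = (s & 0xffffffffULL) + (s >> 32);    /* fold back to 32 bits */
        s = (s & 0xffffffffULL) + (s >> 32);
        return (uint32_t)s;
}
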
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * This code needs to be reworked to take advantage of 64 bit sum+copy.
 * However, due to tokenring halfword alignment problems this will be very
 * tricky. For now we'll leave it until we instrument it somehow.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
        addic   r0,r6,0
        subi    r3,r3,4
        subi    r4,r4,4
        srwi.   r6,r5,2
        beq     3f                      /* if we're doing < 4 bytes */
        andi.   r9,r4,2                 /* Align dst to longword boundary */
        beq+    1f
81:     lhz     r6,4(r3)                /* do 2 bytes to get aligned */
        addi    r3,r3,2
        subi    r5,r5,2
91:     sth     r6,4(r4)
        addi    r4,r4,2
        addc    r0,r0,r6
        srwi.   r6,r5,2                 /* # words to do */
        beq     3f
1:      mtctr   r6
82:     lwzu    r6,4(r3)                /* the bdnz has zero overhead, so it should */
92:     stwu    r6,4(r4)                /* be unnecessary to unroll this loop */
        adde    r0,r0,r6
        bdnz    82b
        andi.   r5,r5,3
3:      cmpwi   0,r5,2
        blt+    4f
83:     lhz     r6,4(r3)
        addi    r3,r3,2
        subi    r5,r5,2
93:     sth     r6,4(r4)
        addi    r4,r4,2
        adde    r0,r0,r6
4:      cmpwi   0,r5,1
        bne+    5f
84:     lbz     r6,4(r3)
94:     stb     r6,4(r4)
        slwi    r6,r6,8                 /* Upper byte of word */
        adde    r0,r0,r6
5:      addze   r3,r0                   /* add in final carry (unlikely with 64-bit regs) */
        rldicl  r4,r3,32,0              /* fold 64 bit value */
        add     r3,r4,r3
        srdi    r3,r3,32
        blr

/* These shouldn't go in the fixup section, since that would
   cause the ex_table addresses to get out of order. */

        .globl src_error_1
src_error_1:
        li      r6,0
        subi    r5,r5,2
95:     sth     r6,4(r4)
        addi    r4,r4,2
        srwi.   r6,r5,2
        beq     3f
        mtctr   r6
        .globl src_error_2
src_error_2:
        li      r6,0
96:     stwu    r6,4(r4)
        bdnz    96b
3:      andi.   r5,r5,3
        beq     src_error
        .globl src_error_3
src_error_3:
        li      r6,0
        mtctr   r5
        addi    r4,r4,3
97:     stbu    r6,1(r4)
        bdnz    97b
        .globl src_error
src_error:
        cmpdi   0,r7,0
        beq     1f
        li      r6,-EFAULT
        stw     r6,0(r7)
1:      addze   r3,r0
        blr

        .globl dst_error
dst_error:
        cmpdi   0,r8,0
        beq     1f
        li      r6,-EFAULT
        stw     r6,0(r8)
1:      addze   r3,r0
        blr

        .section __ex_table,"a"
        .align  3
        .llong  81b,src_error_1
        .llong  91b,dst_error
        .llong  82b,src_error_2
        .llong  92b,dst_error
        .llong  83b,src_error_3
        .llong  93b,dst_error
        .llong  84b,src_error_3
        .llong  94b,dst_error
        .llong  95b,dst_error
        .llong  96b,dst_error
        .llong  97b,dst_error
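
For the common, non-faulting case, csum_partial_copy_generic above amounts to copying the block while accumulating the same 1's complement sum that csum_partial computes. A C sketch of that non-faulting path, for illustration only (hypothetical name, big-endian byte order assumed; the -EFAULT reporting and the zeroing of dst performed by the src_error paths are deliberately omitted):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical reference for the non-faulting path, illustration only. */
static uint32_t csum_and_copy_ref(const unsigned char *src, unsigned char *dst,
                                  size_t len, uint32_t sum)
{
        uint64_t s = sum;

        while (len >= 2) {              /* copy and sum big-endian halfwords */
                dst[0] = src[0];
                dst[1] = src[1];
                s += (uint32_t)((src[0] << 8) | src[1]);
                src += 2;
                dst += 2;
                len -= 2;
        }
        if (len) {                      /* odd trailing byte: upper byte of a halfword */
                dst[0] = src[0];
                s += (uint32_t)src[0] << 8;
        }

        s = (s & 0xffffffffULL) + (s >> 32);    /* fold back to 32 bits */
        s = (s & 0xffffffffULL) + (s >> 32);
        return (uint32_t)s;
}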