Diffstat (limited to 'arch/sparc/lib/checksum_32.S')
-rw-r--r--	arch/sparc/lib/checksum_32.S	583
1 file changed, 583 insertions, 0 deletions
diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S
new file mode 100644
index 000000000000..77f228533d47
--- /dev/null
+++ b/arch/sparc/lib/checksum_32.S
@@ -0,0 +1,583 @@
/* checksum.S: Sparc optimized checksum code.
 *
 * Copyright(C) 1995 Linus Torvalds
 * Copyright(C) 1995 Miguel de Icaza
 * Copyright(C) 1996 David S. Miller
 * Copyright(C) 1997 Jakub Jelinek
 *
 * derived from:
 *	Linux/Alpha checksum c-code
 *	Linux/ix86 inline checksum assembly
 *	RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *	David Mosberger-Tang for optimized reference c-code
 *	BSD4.4 portable checksum routine
 */
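
/* For orientation, a rough C model (a sketch, not part of this file) of
 * the end-around-carry sum everything below computes; csum_partial()
 * returns this 32-bit partial sum unfolded, and callers fold it to 16
 * bits later:
 *
 *	unsigned int csum_partial_model(const unsigned int *buf,
 *					int nwords, unsigned int sum)
 *	{
 *		int i;
 *		for (i = 0; i < nwords; i++) {
 *			unsigned long long t = (unsigned long long)sum + buf[i];
 *			sum = (unsigned int)t + (unsigned int)(t >> 32);
 *		}
 *		return sum;
 *	}
 */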

#include <asm/errno.h>

#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5)	\
	ldd	[buf + offset + 0x00], t0;			\
	ldd	[buf + offset + 0x08], t2;			\
	addxcc	t0, sum, sum;					\
	addxcc	t1, sum, sum;					\
	ldd	[buf + offset + 0x10], t4;			\
	addxcc	t2, sum, sum;					\
	addxcc	t3, sum, sum;					\
	ldd	[buf + offset + 0x18], t0;			\
	addxcc	t4, sum, sum;					\
	addxcc	t5, sum, sum;					\
	addxcc	t0, sum, sum;					\
	addxcc	t1, sum, sum;
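
/* The chains of addxcc above implement add-with-carry: each addxcc adds
 * the previous instruction's carry bit back in while setting a new one,
 * so a whole 32-byte chunk is summed in one unbroken carry chain.  One
 * step, modeled loosely in C (illustrative sketch only):
 *
 *	t = (unsigned long long)sum + word + carry;
 *	sum   = (unsigned int)t;
 *	carry = (unsigned int)(t >> 32);
 */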

#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3)	\
	ldd	[buf - offset - 0x08], t0;			\
	ldd	[buf - offset - 0x00], t2;			\
	addxcc	t0, sum, sum;					\
	addxcc	t1, sum, sum;					\
	addxcc	t2, sum, sum;					\
	addxcc	t3, sum, sum;

	/* Do end cruft out of band to get better cache patterns. */
csum_partial_end_cruft:
	be	1f				! caller asks %o1 & 0x8
	andcc	%o1, 4, %g0			! nope, check for word remaining
	ldd	[%o0], %g2			! load two
	addcc	%g2, %o2, %o2			! add first word to sum
	addxcc	%g3, %o2, %o2			! add second word as well
	add	%o0, 8, %o0			! advance buf ptr
	addx	%g0, %o2, %o2			! add in final carry
	andcc	%o1, 4, %g0			! check again for word remaining
1:	be	1f				! nope, skip this code
	andcc	%o1, 3, %o1			! check for trailing bytes
	ld	[%o0], %g2			! load it
	addcc	%g2, %o2, %o2			! add to sum
	add	%o0, 4, %o0			! advance buf ptr
	addx	%g0, %o2, %o2			! add in final carry
	andcc	%o1, 3, %g0			! check again for trailing bytes
1:	be	1f				! no trailing bytes, return
	addcc	%o1, -1, %g0			! only one byte remains?
	bne	2f				! at least two bytes more
	subcc	%o1, 2, %o1			! only two bytes more?
	b	4f				! only one byte remains
	or	%g0, %g0, %o4			! clear fake hword value
2:	lduh	[%o0], %o4			! get hword
	be	6f				! jmp if only hword remains
	add	%o0, 2, %o0			! advance buf ptr either way
	sll	%o4, 16, %o4			! create upper hword
4:	ldub	[%o0], %o5			! get final byte
	sll	%o5, 8, %o5			! put into place
	or	%o5, %o4, %o4			! coalesce with hword (if any)
6:	addcc	%o4, %o2, %o2			! add to sum
1:	retl					! get outta here
	addx	%g0, %o2, %o0			! add final carry into retval

	/* Also do alignment out of band to get better cache patterns. */
csum_partial_fix_alignment:
	cmp	%o1, 6
	bl	cpte - 0x4
	andcc	%o0, 0x2, %g0
	be	1f
	andcc	%o0, 0x4, %g0
	lduh	[%o0 + 0x00], %g2
	sub	%o1, 2, %o1
	add	%o0, 2, %o0
	sll	%g2, 16, %g2
	addcc	%g2, %o2, %o2
	srl	%o2, 16, %g3
	addx	%g0, %g3, %g2
	sll	%o2, 16, %o2
	sll	%g2, 16, %g3
	srl	%o2, 16, %o2
	andcc	%o0, 0x4, %g0
	or	%g3, %o2, %o2
1:	be	cpa
	andcc	%o1, 0xffffff80, %o3
	ld	[%o0 + 0x00], %g2
	sub	%o1, 4, %o1
	addcc	%g2, %o2, %o2
	add	%o0, 4, %o0
	addx	%g0, %o2, %o2
	b	cpa
	andcc	%o1, 0xffffff80, %o3

	/* The common case is to get called with a nicely aligned
	 * buffer of size 0x20.  Follow the code path for that case.
	 */
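
/* Rough shape of csum_partial() below, in C-like pseudocode (a sketch,
 * using the labels' meanings from the comments):
 *
 *	while (len & ~0x7f) {
 *		sum += next 128 bytes;	(four CSUM_BIGCHUNKs at 5:)
 *		len -= 128;
 *	}
 *	jump into cptbl;	(one CSUM_LASTCHUNK per 16 bytes of len & 0x70)
 *	handle len & 0xf;	(csum_partial_end_cruft)
 */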
	.globl	csum_partial
csum_partial:			/* %o0=buf, %o1=len, %o2=sum */
	andcc	%o0, 0x7, %g0			! alignment problems?
	bne	csum_partial_fix_alignment	! yep, handle it
	sethi	%hi(cpte - 8), %g7		! prepare table jmp ptr
	andcc	%o1, 0xffffff80, %o3		! num loop iterations
cpa:	be	3f				! none to do
	andcc	%o1, 0x70, %g1			! clears carry flag too
5:	CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
	CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
	CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
	CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
	addx	%g0, %o2, %o2			! sink in final carry
	subcc	%o3, 128, %o3			! detract from loop iters
	bne	5b				! more to do
	add	%o0, 128, %o0			! advance buf ptr
	andcc	%o1, 0x70, %g1			! clears carry flag too
3:	be	cpte				! nope
	andcc	%o1, 0xf, %g0			! anything left at all?
	srl	%g1, 1, %o4			! compute offset
	sub	%g7, %g1, %g7			! adjust jmp ptr
	sub	%g7, %o4, %g7			! final jmp ptr adjust
	jmp	%g7 + %lo(cpte - 8)		! enter the table
	add	%o0, %g1, %o0			! advance buf ptr
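
/* Table-jump arithmetic: each CSUM_LASTCHUNK below is six instructions
 * (24 bytes) per 16 bytes of data, so for %g1 = len & 0x70 the entry
 * point is cpte - 8 - (%g1/16)*24, and (%g1/16)*24 == %g1 + (%g1 >> 1),
 * which is exactly what the two subs above compute.  The 8 accounts for
 * the addx/andcc pair between the table and cpte.
 */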
cptbl:	CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
	CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
	CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
	CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
	CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
	CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
	CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
	addx	%g0, %o2, %o2			! fetch final carry
	andcc	%o1, 0xf, %g0			! anything left at all?
cpte:	bne	csum_partial_end_cruft		! yep, handle it
	andcc	%o1, 8, %g0			! check how much
cpout:	retl					! get outta here
	mov	%o2, %o0			! return computed csum
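
/* Seen from C, this is the classic csum_partial() entry point; its
 * kernel prototype (quoted from memory, treat as an assumption) is:
 *
 *	__wsum csum_partial(const void *buff, int len, __wsum sum);
 *
 * A typical use is sum = csum_partial(buf, len, 0), with csum_fold()
 * applied afterwards to produce the final 16-bit checksum.
 */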

	.globl	__csum_partial_copy_start, __csum_partial_copy_end
__csum_partial_copy_start:

/* Work around cpp -rob */
#define ALLOC #alloc
#define EXECINSTR #execinstr
#define EX(x,y,a,b)				\
98:	x,y;					\
	.section .fixup,ALLOC,EXECINSTR;	\
	.align	4;				\
99:	ba 30f;					\
	a, b, %o3;				\
	.section __ex_table,ALLOC;		\
	.align	4;				\
	.word	98b, 99b;			\
	.text;					\
	.align	4

#define EX2(x,y)				\
98:	x,y;					\
	.section __ex_table,ALLOC;		\
	.align	4;				\
	.word	98b, 30f;			\
	.text;					\
	.align	4

#define EX3(x,y)				\
98:	x,y;					\
	.section __ex_table,ALLOC;		\
	.align	4;				\
	.word	98b, 96f;			\
	.text;					\
	.align	4

#define EXT(start,end,handler)			\
	.section __ex_table,ALLOC;		\
	.align	4;				\
	.word	start, 0, end, handler;		\
	.text;					\
	.align	4
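
/* EX/EX2/EX3 emit two-word __ex_table entries pairing one possibly
 * faulting instruction with a fixup address; EXT emits the four-word
 * {start, 0, end, handler} form that covers a whole unrolled range, the
 * zero marking it as a range entry.  A loose C picture of the layouts
 * (our assumption, shown for orientation only):
 *
 *	struct ex_entry       { unsigned long insn, fixup; };
 *	struct ex_range_entry { unsigned long start, zero, end, handler; };
 */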

/* This aligned version executes typically in 8.5 superscalar cycles, this
 * is the best I can do.  I say 8.5 because the final add will pair with
 * the next ldd in the main unrolled loop.  Thus the pipe is always full.
 * If you change these macros (including order of instructions),
 * please check the fixup code below as well.
 */
#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)	\
	ldd	[src + off + 0x00], t0;					\
	ldd	[src + off + 0x08], t2;					\
	addxcc	t0, sum, sum;						\
	ldd	[src + off + 0x10], t4;					\
	addxcc	t1, sum, sum;						\
	ldd	[src + off + 0x18], t6;					\
	addxcc	t2, sum, sum;						\
	std	t0, [dst + off + 0x00];					\
	addxcc	t3, sum, sum;						\
	std	t2, [dst + off + 0x08];					\
	addxcc	t4, sum, sum;						\
	std	t4, [dst + off + 0x10];					\
	addxcc	t5, sum, sum;						\
	std	t6, [dst + off + 0x18];					\
	addxcc	t6, sum, sum;						\
	addxcc	t7, sum, sum;

/* 12 superscalar cycles seems to be the limit for this case; because of
 * that we do all the ldd's together, to get the Viking MXCC into
 * streaming mode.  Ho hum...
 */
#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)	\
	ldd	[src + off + 0x00], t0;					\
	ldd	[src + off + 0x08], t2;					\
	ldd	[src + off + 0x10], t4;					\
	ldd	[src + off + 0x18], t6;					\
	st	t0, [dst + off + 0x00];					\
	addxcc	t0, sum, sum;						\
	st	t1, [dst + off + 0x04];					\
	addxcc	t1, sum, sum;						\
	st	t2, [dst + off + 0x08];					\
	addxcc	t2, sum, sum;						\
	st	t3, [dst + off + 0x0c];					\
	addxcc	t3, sum, sum;						\
	st	t4, [dst + off + 0x10];					\
	addxcc	t4, sum, sum;						\
	st	t5, [dst + off + 0x14];					\
	addxcc	t5, sum, sum;						\
	st	t6, [dst + off + 0x18];					\
	addxcc	t6, sum, sum;						\
	st	t7, [dst + off + 0x1c];					\
	addxcc	t7, sum, sum;

/* Yuck, 6 superscalar cycles... */
#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3)	\
	ldd	[src - off - 0x08], t0;				\
	ldd	[src - off - 0x00], t2;				\
	addxcc	t0, sum, sum;					\
	st	t0, [dst - off - 0x08];				\
	addxcc	t1, sum, sum;					\
	st	t1, [dst - off - 0x04];				\
	addxcc	t2, sum, sum;					\
	st	t2, [dst - off - 0x00];				\
	addxcc	t3, sum, sum;					\
	st	t3, [dst - off + 0x04];

	/* Handle the end cruft code out of band for better cache patterns. */
cc_end_cruft:
	be	1f
	andcc	%o3, 4, %g0
	EX(ldd	[%o0 + 0x00], %g2, and %o3, 0xf)
	add	%o1, 8, %o1
	addcc	%g2, %g7, %g7
	add	%o0, 8, %o0
	addxcc	%g3, %g7, %g7
	EX2(st	%g2, [%o1 - 0x08])
	addx	%g0, %g7, %g7
	andcc	%o3, 4, %g0
	EX2(st	%g3, [%o1 - 0x04])
1:	be	1f
	andcc	%o3, 3, %o3
	EX(ld	[%o0 + 0x00], %g2, add %o3, 4)
	add	%o1, 4, %o1
	addcc	%g2, %g7, %g7
	EX2(st	%g2, [%o1 - 0x04])
	addx	%g0, %g7, %g7
	andcc	%o3, 3, %g0
	add	%o0, 4, %o0
1:	be	1f
	addcc	%o3, -1, %g0
	bne	2f
	subcc	%o3, 2, %o3
	b	4f
	or	%g0, %g0, %o4
2:	EX(lduh	[%o0 + 0x00], %o4, add %o3, 2)
	add	%o0, 2, %o0
	EX2(sth	%o4, [%o1 + 0x00])
	be	6f
	add	%o1, 2, %o1
	sll	%o4, 16, %o4
4:	EX(ldub	[%o0 + 0x00], %o5, add %g0, 1)
	EX2(stb	%o5, [%o1 + 0x00])
	sll	%o5, 8, %o5
	or	%o5, %o4, %o4
6:	addcc	%o4, %g7, %g7
1:	retl
	addx	%g0, %g7, %o0

	/* Also, handle the alignment code out of band. */
cc_dword_align:
	cmp	%g1, 6
	bl,a	ccte
	andcc	%g1, 0xf, %o3
	andcc	%o0, 0x1, %g0
	bne	ccslow
	andcc	%o0, 0x2, %g0
	be	1f
	andcc	%o0, 0x4, %g0
	EX(lduh	[%o0 + 0x00], %g4, add %g1, 0)
	sub	%g1, 2, %g1
	EX2(sth	%g4, [%o1 + 0x00])
	add	%o0, 2, %o0
	sll	%g4, 16, %g4
	addcc	%g4, %g7, %g7
	add	%o1, 2, %o1
	srl	%g7, 16, %g3
	addx	%g0, %g3, %g4
	sll	%g7, 16, %g7
	sll	%g4, 16, %g3
	srl	%g7, 16, %g7
	andcc	%o0, 0x4, %g0
	or	%g3, %g7, %g7
1:	be	3f
	andcc	%g1, 0xffffff80, %g0
	EX(ld	[%o0 + 0x00], %g4, add %g1, 0)
	sub	%g1, 4, %g1
	EX2(st	%g4, [%o1 + 0x00])
	add	%o0, 4, %o0
	addcc	%g4, %g7, %g7
	add	%o1, 4, %o1
	addx	%g0, %g7, %g7
	b	3f
	andcc	%g1, 0xffffff80, %g0

	/* Sun, you just can't beat me, you just can't.  Stop trying,
	 * give up.  I'm serious, I am going to kick the living shit
	 * out of you, game over, lights out.
	 */
	.align	8
	.globl	__csum_partial_copy_sparc_generic
__csum_partial_copy_sparc_generic:
					/* %o0=src, %o1=dest, %g1=len, %g7=sum */
	xor	%o0, %o1, %o4		! get changing bits
	andcc	%o4, 3, %g0		! check for mismatched alignment
	bne	ccslow			! better this than unaligned/fixups
	andcc	%o0, 7, %g0		! need to align things?
	bne	cc_dword_align		! yes, we check for short lengths there
	andcc	%g1, 0xffffff80, %g0	! can we use unrolled loop?
3:	be	3f			! nope, less than one loop remains
	andcc	%o1, 4, %g0		! dest aligned on 4 or 8 byte boundary?
	be	ccdbl + 4		! 8 byte aligned, kick ass
5:	CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
	CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
	CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
	CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
10:	EXT(5b, 10b, 20f)		! note for exception handling
	sub	%g1, 128, %g1		! detract from length
	addx	%g0, %g7, %g7		! add in last carry bit
	andcc	%g1, 0xffffff80, %g0	! more to csum?
	add	%o0, 128, %o0		! advance src ptr
	bne	5b			! we did not go negative, continue looping
	add	%o1, 128, %o1		! advance dest ptr
3:	andcc	%g1, 0x70, %o2		! can use table?
ccmerge:be	ccte			! nope, go and check for end cruft
	andcc	%g1, 0xf, %o3		! get low bits of length (clears carry btw)
	srl	%o2, 1, %o4		! begin negative offset computation
	sethi	%hi(12f), %o5		! set up table ptr end
	add	%o0, %o2, %o0		! advance src ptr
	sub	%o5, %o4, %o5		! continue table calculation
	sll	%o2, 1, %g2		! constant multiplies are fun...
	sub	%o5, %g2, %o5		! some more adjustments
	jmp	%o5 + %lo(12f)		! jump into it, duff style, wheee...
	add	%o1, %o2, %o1		! advance dest ptr (carry is clear btw)
cctbl:	CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
	CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
	CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
	CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
	CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
	CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
	CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
12:	EXT(cctbl, 12b, 22f)		! note for exception table handling
	addx	%g0, %g7, %g7
	andcc	%o3, 0xf, %g0		! check for low bits set
ccte:	bne	cc_end_cruft		! something left, handle it out of band
	andcc	%o3, 8, %g0		! begin checks for that code
	retl				! return
	mov	%g7, %o0		! give em the computed checksum
ccdbl:	CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
	CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
	CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
	CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
11:	EXT(ccdbl, 11b, 21f)		! note for exception table handling
	sub	%g1, 128, %g1		! detract from length
	addx	%g0, %g7, %g7		! add in last carry bit
	andcc	%g1, 0xffffff80, %g0	! more to csum?
	add	%o0, 128, %o0		! advance src ptr
	bne	ccdbl			! we did not go negative, continue looping
	add	%o1, 128, %o1		! advance dest ptr
	b	ccmerge			! finish it off, above
	andcc	%g1, 0x70, %o2		! can use table? (clears carry btw)

ccslow:	cmp	%g1, 0
	mov	0, %g5
	bleu	4f
	andcc	%o0, 1, %o5
	be,a	1f
	srl	%g1, 1, %g4
	sub	%g1, 1, %g1
	EX(ldub	[%o0], %g5, add %g1, 1)
	add	%o0, 1, %o0
	EX2(stb	%g5, [%o1])
	srl	%g1, 1, %g4
	add	%o1, 1, %o1
1:	cmp	%g4, 0
	be,a	3f
	andcc	%g1, 1, %g0
	andcc	%o0, 2, %g0
	be,a	1f
	srl	%g4, 1, %g4
	EX(lduh	[%o0], %o4, add %g1, 0)
	sub	%g1, 2, %g1
	srl	%o4, 8, %g2
	sub	%g4, 1, %g4
	EX2(stb	%g2, [%o1])
	add	%o4, %g5, %g5
	EX2(stb	%o4, [%o1 + 1])
	add	%o0, 2, %o0
	srl	%g4, 1, %g4
	add	%o1, 2, %o1
1:	cmp	%g4, 0
	be,a	2f
	andcc	%g1, 2, %g0
	EX3(ld	[%o0], %o4)
5:	srl	%o4, 24, %g2
	srl	%o4, 16, %g3
	EX2(stb	%g2, [%o1])
	srl	%o4, 8, %g2
	EX2(stb	%g3, [%o1 + 1])
	add	%o0, 4, %o0
	EX2(stb	%g2, [%o1 + 2])
	addcc	%o4, %g5, %g5
	EX2(stb	%o4, [%o1 + 3])
	addx	%g5, %g0, %g5	! I am now too lazy to optimize this (question
	add	%o1, 4, %o1	! if it is worth it).  Maybe some day - with
	subcc	%g4, 1, %g4	! the sll/srl tricks
	bne,a	5b
	EX3(ld	[%o0], %o4)
	sll	%g5, 16, %g2
	srl	%g5, 16, %g5
	srl	%g2, 16, %g2
	andcc	%g1, 2, %g0
	add	%g2, %g5, %g5
2:	be,a	3f
	andcc	%g1, 1, %g0
	EX(lduh	[%o0], %o4, and %g1, 3)
	andcc	%g1, 1, %g0
	srl	%o4, 8, %g2
	add	%o0, 2, %o0
	EX2(stb	%g2, [%o1])
	add	%g5, %o4, %g5
	EX2(stb	%o4, [%o1 + 1])
	add	%o1, 2, %o1
3:	be,a	1f
	sll	%g5, 16, %o4
	EX(ldub	[%o0], %g2, add %g0, 1)
	sll	%g2, 8, %o4
	EX2(stb	%g2, [%o1])
	add	%g5, %o4, %g5
	sll	%g5, 16, %o4
1:	addcc	%o4, %g5, %g5
	srl	%g5, 16, %o4
	addx	%g0, %o4, %g5
	orcc	%o5, %g0, %g0
	be	4f
	srl	%g5, 8, %o4
	and	%g5, 0xff, %g2
	and	%o4, 0xff, %o4
	sll	%g2, 8, %g2
	or	%g2, %o4, %g5
4:	addcc	%g7, %g5, %g7
	retl
	addx	%g0, %g7, %o0
__csum_partial_copy_end:

/* We do these strange calculations for the csum_*_from_user case only, i.e.
 * we only bother with faults on loads... */

/* o2 = ((g2%20)&3)*8
 * o3 = g1 - (g2/20)*32 - o2 */
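
/* Why 20: CSUMCOPY_BIGCHUNK is 20 instructions (4 ldd, 8 st, 8 addxcc)
 * per 32 bytes, with the ldd's at instruction indices 0-3.  Taking %g2
 * as the instruction offset of the fault within the unrolled body (as
 * we read lookup_fault's convention), g2/20 counts completed chunks and
 * (g2%20)&3 identifies the faulting ldd, each earlier ldd standing for
 * 8 salvageable bytes.
 */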
20:
	cmp	%g2, 20
	blu,a	1f
	and	%g2, 3, %o2
	sub	%g1, 32, %g1
	b	20b
	sub	%g2, 20, %g2
1:
	sll	%o2, 3, %o2
	b	31f
	sub	%g1, %o2, %o3

/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8)
 * o3 = g1 - (g2/16)*32 - o2 */
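
/* The divisor here is 16 because CSUMCOPY_BIGCHUNK_ALIGNED is 16
 * instructions (4 ldd, 4 std, 8 addxcc) per 32 bytes; the remainder
 * locates the faulting ldd within a chunk and the rounding converts
 * that to salvageable bytes.  This is our reading of the layout above;
 * verify against the macro before changing either.
 */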
21:
	andcc	%g2, 15, %o3
	srl	%g2, 4, %g2
	be,a	1f
	clr	%o2
	add	%o3, 1, %o3
	and	%o3, 14, %o3
	sll	%o3, 3, %o2
1:
	sll	%g2, 5, %g2
	sub	%g1, %g2, %o3
	b	31f
	sub	%o3, %o2, %o3

/* o0 += (g2/10)*16 - 0x70
 * o1 += (g2/10)*16 - 0x70
 * o2 = (g2 % 10) ? 8 : 0
 * o3 += 0x70 - (g2/10)*16 - o2 */
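
/* And 10 because CSUMCOPY_LASTCHUNK is 10 instructions (2 ldd, 4 st,
 * 4 addxcc) per 16 bytes.  Src and dst were already advanced by the
 * full len & 0x70 before the table jump, so completed chunks are added
 * back and the unreached remainder is subtracted out again.
 */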
22:
	cmp	%g2, 10
	blu,a	1f
	sub	%o0, 0x70, %o0
	add	%o0, 16, %o0
	add	%o1, 16, %o1
	sub	%o3, 16, %o3
	b	22b
	sub	%g2, 10, %g2
1:
	sub	%o1, 0x70, %o1
	add	%o3, 0x70, %o3
	clr	%o2
	tst	%g2
	bne,a	1f
	mov	8, %o2
1:
	b	31f
	sub	%o3, %o2, %o3
96:
	and	%g1, 3, %g1
	sll	%g4, 2, %g4
	add	%g1, %g4, %o3
30:
/* %o1 is dst
 * %o3 is # bytes to zero out
 * %o4 is faulting address
 * %o5 is %pc where fault occurred */
	clr	%o2
31:
/* %o0 is src
 * %o1 is dst
 * %o2 is # of bytes to copy from src to dst
 * %o3 is # bytes to zero out
 * %o4 is faulting address
 * %o5 is %pc where fault occurred */
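
/* Roughly, in C (a sketch; lookup_fault(pc, ret_pc, addr), __memcpy and
 * __bzero are the helpers actually called below, while n_copy and
 * n_zero are our names for %i2 and %i3):
 *
 *	err = -EFAULT;
 *	if (lookup_fault(pc, ret_pc, addr) == 2) {
 *		if (n_copy && __memcpy(dst, src, n_copy) == 0)
 *			dst += n_copy;
 *		else
 *			n_zero += n_copy;
 *		__bzero(dst, n_zero);
 *	}
 *	*(int *)(sp + 168) = err;
 *
 * i.e. salvage whatever was already readable, zero the rest of the
 * destination, and report -EFAULT through the slot the caller left on
 * its stack.
 */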
	save	%sp, -104, %sp
	mov	%i5, %o0
	mov	%i7, %o1
	mov	%i4, %o2
	call	lookup_fault
	mov	%g7, %i4
	cmp	%o0, 2
	bne	1f
	add	%g0, -EFAULT, %i5
	tst	%i2
	be	2f
	mov	%i0, %o1
	mov	%i1, %o0
5:
	call	__memcpy
	mov	%i2, %o2
	tst	%o0
	bne,a	2f
	add	%i3, %i2, %i3
	add	%i1, %i2, %i1
2:
	mov	%i1, %o0
6:
	call	__bzero
	mov	%i3, %o1
1:
	ld	[%sp + 168], %o2	! struct_ptr of parent
	st	%i5, [%o2]
	ret
	restore

	.section __ex_table,#alloc
	.align	4
	.word	5b, 2
	.word	6b, 2