diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/i386/lib/checksum.S |
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/i386/lib/checksum.S')
-rw-r--r-- | arch/i386/lib/checksum.S | 496 |
1 files changed, 496 insertions, 0 deletions
diff --git a/arch/i386/lib/checksum.S b/arch/i386/lib/checksum.S new file mode 100644 index 00000000000..94c7867ddc3 --- /dev/null +++ b/arch/i386/lib/checksum.S | |||
@@ -0,0 +1,496 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * IP/TCP/UDP checksumming routines | ||
7 | * | ||
8 | * Authors: Jorge Cwik, <jorge@laser.satlink.net> | ||
9 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | ||
10 | * Tom May, <ftom@netcom.com> | ||
11 | * Pentium Pro/II routines: | ||
12 | * Alexander Kjeldaas <astor@guardian.no> | ||
13 | * Finn Arne Gangstad <finnag@guardian.no> | ||
14 | * Lots of code moved from tcp.c and ip.c; see those files | ||
15 | * for more names. | ||
16 | * | ||
17 | * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception | ||
18 | * handling. | ||
19 | * Andi Kleen, add zeroing on error | ||
20 | * converted to pure assembler | ||
21 | * | ||
22 | * This program is free software; you can redistribute it and/or | ||
23 | * modify it under the terms of the GNU General Public License | ||
24 | * as published by the Free Software Foundation; either version | ||
25 | * 2 of the License, or (at your option) any later version. | ||
26 | */ | ||
27 | |||
28 | #include <linux/config.h> | ||
29 | #include <asm/errno.h> | ||
30 | |||
31 | /* | ||
32 | * computes a partial checksum, e.g. for TCP/UDP fragments | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) | ||
37 | */ | ||
38 | |||
/* Code goes in .text; the csum_partial entry point is aligned and exported. */
39 | .text | ||
40 | .align 4 | ||
41 | .globl csum_partial | ||
42 | |||
43 | #ifndef CONFIG_X86_USE_PPRO_CHECKSUM | ||
44 | |||
45 | /* | ||
46 | * Experiments with Ethernet and SLIP connections show that buff | ||
47 | * is aligned on either a 2-byte or 4-byte boundary. We get at | ||
48 | * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. | ||
49 | * Fortunately, it is easy to convert 2-byte alignment to 4-byte | ||
50 | * alignment for the unrolled loop. | ||
51 | */ | ||
/*
 * unsigned int csum_partial(const unsigned char *buff, int len,
 *                           unsigned int sum)
 *
 * 486/Pentium variant (built when CONFIG_X86_USE_PPRO_CHECKSUM is not set).
 *
 * In:   arguments on the stack; after the two pushes below they are at
 *       12(%esp) = buff, 16(%esp) = len, 20(%esp) = sum.
 * Out:  %eax = sum plus the 32-bit end-around-carry sum of the buffer.
 * Regs: %esi = read pointer, %ecx/%edx = byte and loop counters,
 *       %ebx = scratch.  %esi and %ebx are saved and restored here.
 *
 * An odd buff is handled by adding one byte and rotating the sum by 8 so
 * that the remainder can be summed as aligned words; a matching rotate at
 * label 7 puts the bytes back into their proper lanes.
 */
csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: unsigned char *buff
	testl $3, %esi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	testl $1, %esi		# Check alignment.
	jz 10f			# Jump if alignment is boundary of 2bytes.

	# buf is odd
	dec %ecx
	jl 8f			# no bytes at all: return sum untouched
	movzbl (%esi), %ebx
	addl %ebx, %eax
	adcl $0, %eax		# fold the carry back in; roll would discard it
	roll $8, %eax
	inc %esi
	testl $2, %esi
	jz 2f
10:
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2.  Deal with it.
	jmp 4f
1:	movw (%esi), %bx
	addl $2, %esi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, %edx		# keep the byte count for the tail
	shrl $5, %ecx		# ecx = number of 32-byte passes
	jz 2f
	testl %esi, %esi	# clears CF ahead of the adcl chain
	# dec and lea leave CF alone, so the carry chains across passes.
1:	movl (%esi), %ebx
	adcl %ebx, %eax
	movl 4(%esi), %ebx
	adcl %ebx, %eax
	movl 8(%esi), %ebx
	adcl %ebx, %eax
	movl 12(%esi), %ebx
	adcl %ebx, %eax
	movl 16(%esi), %ebx
	adcl %ebx, %eax
	movl 20(%esi), %ebx
	adcl %ebx, %eax
	movl 24(%esi), %ebx
	adcl %ebx, %eax
	movl 28(%esi), %ebx
	adcl %ebx, %eax
	lea 32(%esi), %esi
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl %edx, %ecx
	andl $0x1c, %edx	# bytes left in whole dwords
	je 4f
	shrl $2, %edx		# This clears CF
3:	adcl (%esi), %eax
	lea 4(%esi), %esi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx		# final 1-3 bytes
	jz 7f
	cmpl $2, %ecx
	jb 5f
	movw (%esi),%cx
	leal 2(%esi),%esi	# mov/lea keep the cmpl flags for the je below
	je 6f
	shll $16,%ecx
5:	movb (%esi),%cl
6:	addl %ecx,%eax
	adcl $0, %eax
7:
	testl $1, 12(%esp)	# was the original buff odd?
	jz 8f
	roll $8, %eax
8:
	popl %ebx
	popl %esi
	ret
134 | |||
135 | #else | ||
136 | |||
137 | /* Version for PentiumII/PPro */ | ||
138 | |||
/*
 * csum_partial - PentiumII/PPro variant (this side of the
 * CONFIG_X86_USE_PPRO_CHECKSUM conditional).
 *
 * Stack arguments after the two pushes below: 12(%esp) = buf,
 * 16(%esp) = len, 20(%esp) = sum.  The running sum lives in %eax and is
 * still there at ret.  %esi and %ebx are pushed on entry and popped at
 * label 90.
 */
139 | csum_partial: | ||
140 | pushl %esi | ||
141 | pushl %ebx | ||
142 | movl 20(%esp),%eax # Function arg: unsigned int sum | ||
143 | movl 16(%esp),%ecx # Function arg: int len | ||
144 | movl 12(%esp),%esi # Function arg: const unsigned char *buf | ||
145 | |||
146 | testl $3, %esi | ||
147 | jnz 25f | ||
# Label 10: the pointer is 4-byte aligned.  %edx keeps the byte count for
# the tail, %ebx = count & 0x7c is the size of the first, partial pass and
# %ecx = count >> 7 is the number of full 128-byte passes that follow.
148 | 10: | ||
149 | movl %ecx, %edx | ||
150 | movl %ecx, %ebx | ||
151 | andl $0x7c, %ebx | ||
152 | shrl $7, %ecx | ||
# Step %esi past the partial pass; the block at label 40 addresses its data
# with negative offsets from %esi.
153 | addl %ebx,%esi | ||
154 | shrl $2, %ebx | ||
155 | negl %ebx | ||
# Entry point = 45f - 3 * (number of dwords in the partial pass).
# NOTE(review): this relies on every add/adc in the block at label 40
# assembling to exactly 3 bytes - confirm before editing that block.
156 | lea 45f(%ebx,%ebx,2), %ebx | ||
# testl clears CF, so the first adcl reached by the jump adds no stray carry.
157 | testl %esi, %esi | ||
158 | jmp *%ebx | ||
159 | |||
160 | # Handle 2-byte-aligned regions | ||
# Reached from label 30 with the count already reduced by 2: add one word,
# then restart at label 10 with an aligned pointer.
161 | 20: addw (%esi), %ax | ||
162 | lea 2(%esi), %esi | ||
163 | adcl $0, %eax | ||
164 | jmp 10b | ||
165 | 25: | ||
# Not 4-byte aligned.  Bit 0 set means an odd address; otherwise only bit 1
# is set and the 2-byte handling at label 30 applies.
166 | testl $1, %esi | ||
167 | jz 30f | ||
168 | # buf is odd | ||
# Add the single leading byte, fold the carry, then rotate the sum by 8;
# label 80 rotates again when buf was odd.  jl leaves early when no bytes
# remain.
169 | dec %ecx | ||
170 | jl 90f | ||
171 | movzbl (%esi), %ebx | ||
172 | addl %ebx, %eax | ||
173 | adcl $0, %eax | ||
174 | roll $8, %eax | ||
175 | inc %esi | ||
176 | testl $2, %esi | ||
177 | jz 10b | ||
178 | |||
# Address is 2-byte but not 4-byte aligned.  After the subl: ja = more than
# 2 bytes remain, je = exactly 2 remain, fall through = fewer than 2.
179 | 30: subl $2, %ecx | ||
180 | ja 20b | ||
181 | je 32f | ||
182 | addl $2, %ecx | ||
183 | jz 80f | ||
184 | movzbl (%esi),%ebx # csumming 1 byte, 2-aligned | ||
185 | addl %ebx, %eax | ||
186 | adcl $0, %eax | ||
187 | jmp 80f | ||
188 | 32: | ||
189 | addw (%esi), %ax # csumming 2 bytes, 2-aligned | ||
190 | adcl $0, %eax | ||
191 | jmp 80f | ||
192 | |||
# Unrolled pass: 32 dword additions covering the 128 bytes just below %esi.
# The first pass is entered part-way through by the computed jump above.
193 | 40: | ||
194 | addl -128(%esi), %eax | ||
195 | adcl -124(%esi), %eax | ||
196 | adcl -120(%esi), %eax | ||
197 | adcl -116(%esi), %eax | ||
198 | adcl -112(%esi), %eax | ||
199 | adcl -108(%esi), %eax | ||
200 | adcl -104(%esi), %eax | ||
201 | adcl -100(%esi), %eax | ||
202 | adcl -96(%esi), %eax | ||
203 | adcl -92(%esi), %eax | ||
204 | adcl -88(%esi), %eax | ||
205 | adcl -84(%esi), %eax | ||
206 | adcl -80(%esi), %eax | ||
207 | adcl -76(%esi), %eax | ||
208 | adcl -72(%esi), %eax | ||
209 | adcl -68(%esi), %eax | ||
210 | adcl -64(%esi), %eax | ||
211 | adcl -60(%esi), %eax | ||
212 | adcl -56(%esi), %eax | ||
213 | adcl -52(%esi), %eax | ||
214 | adcl -48(%esi), %eax | ||
215 | adcl -44(%esi), %eax | ||
216 | adcl -40(%esi), %eax | ||
217 | adcl -36(%esi), %eax | ||
218 | adcl -32(%esi), %eax | ||
219 | adcl -28(%esi), %eax | ||
220 | adcl -24(%esi), %eax | ||
221 | adcl -20(%esi), %eax | ||
222 | adcl -16(%esi), %eax | ||
223 | adcl -12(%esi), %eax | ||
224 | adcl -8(%esi), %eax | ||
225 | adcl -4(%esi), %eax | ||
226 | 45: | ||
# lea advances %esi by 128 without touching CF; the adcl then folds the
# carry left by the last addition.  The loop repeats while %ecx stays >= 0
# after the decrement.
227 | lea 128(%esi), %esi | ||
228 | adcl $0, %eax | ||
229 | dec %ecx | ||
230 | jge 40b | ||
# Tail: the low two bits of the saved count give the leftover byte count.
231 | movl %edx, %ecx | ||
232 | 50: andl $3, %ecx | ||
233 | jz 80f | ||
234 | |||
235 | # Handle the last 1-3 bytes without jumping | ||
236 | notl %ecx # 1->2, 2->1, 3->0, higher bits are masked | ||
237 | movl $0xffffff,%ebx # by the shll and shrl instructions | ||
238 | shll $3,%ecx | ||
239 | shrl %cl,%ebx | ||
# The offset is -128 because %esi was advanced by 128 at label 45 after the
# last pass; %ebx holds a mask that keeps only the leftover bytes.
240 | andl -128(%esi),%ebx # esi is 4-aligned so should be ok | ||
241 | addl %ebx,%eax | ||
242 | adcl $0,%eax | ||
243 | 80: | ||
# An odd buf means the sum was rotated by 8 above; rotate once more.
244 | testl $1, 12(%esp) | ||
245 | jz 90f | ||
246 | roll $8, %eax | ||
247 | 90: | ||
248 | popl %ebx | ||
249 | popl %esi | ||
250 | ret | ||
251 | |||
252 | #endif | ||
253 | |||
254 | /* | ||
255 | unsigned int csum_partial_copy_generic (const char *src, char *dst, | ||
256 | int len, int sum, int *src_err_ptr, int *dst_err_ptr) | ||
257 | */ | ||
258 | |||
259 | /* | ||
260 | * Copy from ds while checksumming, otherwise like csum_partial | ||
261 | * | ||
262 | * The macros SRC and DST specify the type of access for the instruction, | ||
263 | * so that we can call a custom exception handler for each access type. | ||
264 | * | ||
265 | * FIXME: could someone double-check whether I haven't mixed up some SRC and | ||
266 | * DST definitions? It's damn hard to trigger all cases. I hope I got | ||
267 | * them all but there's no guarantee. | ||
268 | */ | ||
269 | |||
/*
 * SRC(insn): emit insn under local label 9999 and add an __ex_table entry
 * pairing that address with the next label 6001, which is the fixup that
 * csum_partial_copy_generic uses to report a fault through src_err_ptr.
 */
270 | #define SRC(y...) \ | ||
271 | 9999: y; \ | ||
272 | .section __ex_table, "a"; \ | ||
273 | .long 9999b, 6001f ; \ | ||
274 | .previous | ||
275 | |||
/*
 * DST(insn): emit insn under local label 9999 and add an __ex_table entry
 * pairing that address with the next label 6002, which is the fixup that
 * csum_partial_copy_generic uses to report a fault through dst_err_ptr.
 */
276 | #define DST(y...) \ | ||
277 | 9999: y; \ | ||
278 | .section __ex_table, "a"; \ | ||
279 | .long 9999b, 6002f ; \ | ||
280 | .previous | ||
281 | |||
/* Align and export the csum_partial_copy_generic entry point. */
282 | .align 4 | ||
283 | .globl csum_partial_copy_generic | ||
284 | |||
285 | #ifndef CONFIG_X86_USE_PPRO_CHECKSUM | ||
286 | |||
/*
 * csum_partial_copy_generic - 486/Pentium variant (this side of the
 * CONFIG_X86_USE_PPRO_CHECKSUM conditional).
 *
 * Copies len bytes from src to dst while adding them to sum; the result is
 * left in %eax.  Loads are wrapped in SRC() and stores in DST(), so a fault
 * on either kind of access lands in the matching fixup (6001 or 6002).
 *
 * Stack after the prologue (subl $4 plus three pushes = 16 bytes):
 *   0..8(%esp)        saved %ebx, %esi, %edi
 *   FP(%esp)          scratch dword holding the remaining length
 *   ARGBASE(%esp)     return address
 *   ARGBASE+4 .. +24  src, dst, len, sum, src_err_ptr, dst_err_ptr
 */
287 | #define ARGBASE 16 | ||
288 | #define FP 12 | ||
289 | |||
290 | csum_partial_copy_generic: | ||
291 | subl $4,%esp | ||
292 | pushl %edi | ||
293 | pushl %esi | ||
294 | pushl %ebx | ||
295 | movl ARGBASE+16(%esp),%eax # sum | ||
296 | movl ARGBASE+12(%esp),%ecx # len | ||
297 | movl ARGBASE+4(%esp),%esi # src | ||
298 | movl ARGBASE+8(%esp),%edi # dst | ||
299 | |||
# Only bit 1 of dst is tested: when it is set, one 16-bit word is copied
# first so that the dword stores below start with that bit clear.
300 | testl $2, %edi # Check alignment. | ||
301 | jz 2f # Jump if alignment is ok. | ||
302 | subl $2, %ecx # Alignment uses up two bytes. | ||
303 | jae 1f # Jump if we had at least two bytes. | ||
304 | addl $2, %ecx # ecx was < 2. Deal with it. | ||
305 | jmp 4f | ||
306 | SRC(1: movw (%esi), %bx ) | ||
307 | addl $2, %esi | ||
308 | DST( movw %bx, (%edi) ) | ||
309 | addl $2, %edi | ||
310 | addw %bx, %ax | ||
311 | adcl $0, %eax | ||
312 | 2: | ||
# Save the remaining length in the scratch slot; %ecx becomes the number of
# 32-byte passes.
313 | movl %ecx, FP(%esp) | ||
314 | shrl $5, %ecx | ||
315 | jz 2f | ||
# testl clears CF ahead of the adcl chain.  Each pass moves 32 bytes as four
# load-pair/store-pair groups; mov, lea and dec leave CF alone, so the carry
# chains from one pass to the next.
316 | testl %esi, %esi | ||
317 | SRC(1: movl (%esi), %ebx ) | ||
318 | SRC( movl 4(%esi), %edx ) | ||
319 | adcl %ebx, %eax | ||
320 | DST( movl %ebx, (%edi) ) | ||
321 | adcl %edx, %eax | ||
322 | DST( movl %edx, 4(%edi) ) | ||
323 | |||
324 | SRC( movl 8(%esi), %ebx ) | ||
325 | SRC( movl 12(%esi), %edx ) | ||
326 | adcl %ebx, %eax | ||
327 | DST( movl %ebx, 8(%edi) ) | ||
328 | adcl %edx, %eax | ||
329 | DST( movl %edx, 12(%edi) ) | ||
330 | |||
331 | SRC( movl 16(%esi), %ebx ) | ||
332 | SRC( movl 20(%esi), %edx ) | ||
333 | adcl %ebx, %eax | ||
334 | DST( movl %ebx, 16(%edi) ) | ||
335 | adcl %edx, %eax | ||
336 | DST( movl %edx, 20(%edi) ) | ||
337 | |||
338 | SRC( movl 24(%esi), %ebx ) | ||
339 | SRC( movl 28(%esi), %edx ) | ||
340 | adcl %ebx, %eax | ||
341 | DST( movl %ebx, 24(%edi) ) | ||
342 | adcl %edx, %eax | ||
343 | DST( movl %edx, 28(%edi) ) | ||
344 | |||
345 | lea 32(%esi), %esi | ||
346 | lea 32(%edi), %edi | ||
347 | dec %ecx | ||
348 | jne 1b | ||
349 | adcl $0, %eax | ||
# Leftover whole dwords: (saved length & 0x1c) / 4 of them, one per pass of
# the loop at label 3.
350 | 2: movl FP(%esp), %edx | ||
351 | movl %edx, %ecx | ||
352 | andl $0x1c, %edx | ||
353 | je 4f | ||
354 | shrl $2, %edx # This clears CF | ||
355 | SRC(3: movl (%esi), %ebx ) | ||
356 | adcl %ebx, %eax | ||
357 | DST( movl %ebx, (%edi) ) | ||
358 | lea 4(%esi), %esi | ||
359 | lea 4(%edi), %edi | ||
360 | dec %edx | ||
361 | jne 3b | ||
362 | adcl $0, %eax | ||
# Final 1-3 bytes.  The flags from cmpl survive the mov/lea instructions and
# are what the je below tests.  For 3 bytes the word is shifted into the
# upper half of %ecx and the last byte is then loaded into %cl.
363 | 4: andl $3, %ecx | ||
364 | jz 7f | ||
365 | cmpl $2, %ecx | ||
366 | jb 5f | ||
367 | SRC( movw (%esi), %cx ) | ||
368 | leal 2(%esi), %esi | ||
369 | DST( movw %cx, (%edi) ) | ||
370 | leal 2(%edi), %edi | ||
371 | je 6f | ||
372 | shll $16,%ecx | ||
373 | SRC(5: movb (%esi), %cl ) | ||
374 | DST( movb %cl, (%edi) ) | ||
375 | 6: addl %ecx, %eax | ||
376 | adcl $0, %eax | ||
# Label 5000 is where both fixups resume; in this section execution simply
# continues with the register restores that follow .previous.
377 | 7: | ||
378 | 5000: | ||
379 | |||
380 | # Exception handler: | ||
381 | .section .fixup, "ax" | ||
382 | |||
# Source fault: store -EFAULT through src_err_ptr, zero all len bytes of
# dst, and resume at 5000 with %eax = 0.
383 | 6001: | ||
384 | movl ARGBASE+20(%esp), %ebx # src_err_ptr | ||
385 | movl $-EFAULT, (%ebx) | ||
386 | |||
387 | # zero the complete destination - computing the rest | ||
388 | # is too much work | ||
389 | movl ARGBASE+8(%esp), %edi # dst | ||
390 | movl ARGBASE+12(%esp), %ecx # len | ||
391 | xorl %eax,%eax | ||
392 | rep ; stosb | ||
393 | |||
394 | jmp 5000b | ||
395 | |||
# Destination fault: store -EFAULT through dst_err_ptr and resume at 5000;
# %eax is not modified here.
396 | 6002: | ||
397 | movl ARGBASE+24(%esp), %ebx # dst_err_ptr | ||
398 | movl $-EFAULT,(%ebx) | ||
399 | jmp 5000b | ||
400 | |||
401 | .previous | ||
402 | |||
# Common exit: restore the saved registers and drop the scratch dword.
403 | popl %ebx | ||
404 | popl %esi | ||
405 | popl %edi | ||
406 | popl %ecx # equivalent to addl $4,%esp | ||
407 | ret | ||
408 | |||
409 | #else | ||
410 | |||
411 | /* Version for PentiumII/PPro */ | ||
412 | |||
/*
 * ROUND1(x): load the dword at x(%esi) into %ebx, add it to %eax with addl
 * (no carry in) and store it to x(%edi).  The load goes through SRC() and
 * the store through DST().  Used for the first dword of a 64-byte pass.
 */
413 | #define ROUND1(x) \ | ||
414 | SRC(movl x(%esi), %ebx ) ; \ | ||
415 | addl %ebx, %eax ; \ | ||
416 | DST(movl %ebx, x(%edi) ) ; | ||
417 | |||
/*
 * ROUND(x): load the dword at x(%esi) into %ebx, add it to %eax with adcl
 * (taking the carry left by the previous round) and store it to x(%edi).
 * The load goes through SRC() and the store through DST().
 */
418 | #define ROUND(x) \ | ||
419 | SRC(movl x(%esi), %ebx ) ; \ | ||
420 | adcl %ebx, %eax ; \ | ||
421 | DST(movl %ebx, x(%edi) ) ; | ||
422 | |||
/*
 * csum_partial_copy_generic - PentiumII/PPro variant.
 *
 * Stack after the three pushes: ARGBASE(%esp) = return address, followed by
 * src, dst, len, sum, src_err_ptr, dst_err_ptr at ARGBASE+4 .. ARGBASE+24.
 * The sum is accumulated in %eax.  Loads go through SRC() and stores through
 * DST(), so faults land in the fixups at 6001 / 6002.
 */
423 | #define ARGBASE 12 | ||
424 | |||
425 | csum_partial_copy_generic: | ||
426 | pushl %ebx | ||
427 | pushl %edi | ||
428 | pushl %esi | ||
429 | movl ARGBASE+4(%esp),%esi #src | ||
430 | movl ARGBASE+8(%esp),%edi #dst | ||
431 | movl ARGBASE+12(%esp),%ecx #len | ||
432 | movl ARGBASE+16(%esp),%eax #sum | ||
433 | # movl %ecx, %edx | ||
# %ecx = len >> 6 counts the full 64-byte passes; %ebx = -(len & 0x3c) is
# minus the byte count of the first, partial pass.  Subtracting it moves
# %esi and %edi forward past that partial pass.
434 | movl %ecx, %ebx | ||
435 | movl %esi, %edx | ||
436 | shrl $6, %ecx | ||
437 | andl $0x3c, %ebx | ||
438 | negl %ebx | ||
439 | subl %ebx, %esi | ||
440 | subl %ebx, %edi | ||
# %edx = (%esi - 1) rounded down to a multiple of 32.  It is used only by
# the two byte loads at label 1 and is advanced by 64 per pass.  This
# overwrites the value stored by "movl %esi, %edx" above.
441 | lea -1(%esi),%edx | ||
442 | andl $-32,%edx | ||
# Entry point = 3f - 2 * (len & 0x3c), i.e. 8 bytes of code per dword.
# NOTE(review): this relies on each ROUND() expanding to exactly 8 bytes in
# the text section - confirm before changing the macros or the offsets.
443 | lea 3f(%ebx,%ebx), %ebx | ||
# testl clears CF, so the first adcl reached by the jump adds no stray carry.
444 | testl %esi, %esi | ||
445 | jmp *%ebx | ||
# Full 64-byte pass: advance both pointers, then sum and copy 16 dwords at
# negative offsets from them.
446 | 1: addl $64,%esi | ||
447 | addl $64,%edi | ||
# Two byte loads through %edx whose result in %bl is overwritten by the
# first ROUND; presumably they pre-touch the source data - TODO confirm.
448 | SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) | ||
449 | ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) | ||
450 | ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) | ||
451 | ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) | ||
452 | ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) | ||
453 | 3: adcl $0,%eax | ||
454 | addl $64, %edx | ||
455 | dec %ecx | ||
456 | jge 1b | ||
# Tail: copy and sum the last len & 3 bytes.  The flags from cmpl survive
# the mov/lea instructions and are what the je below tests.
457 | 4: movl ARGBASE+12(%esp),%edx #len | ||
458 | andl $3, %edx | ||
459 | jz 7f | ||
460 | cmpl $2, %edx | ||
461 | jb 5f | ||
462 | SRC( movw (%esi), %dx ) | ||
463 | leal 2(%esi), %esi | ||
464 | DST( movw %dx, (%edi) ) | ||
465 | leal 2(%edi), %edi | ||
466 | je 6f | ||
467 | shll $16,%edx | ||
468 | 5: | ||
469 | SRC( movb (%esi), %dl ) | ||
470 | DST( movb %dl, (%edi) ) | ||
471 | 6: addl %edx, %eax | ||
472 | adcl $0, %eax | ||
473 | 7: | ||
# Fixups.  6001 (source fault) stores -EFAULT through src_err_ptr, zeroes
# all len bytes of dst and resumes at label 7 with %eax = 0.  6002
# (destination fault) stores -EFAULT through dst_err_ptr and resumes at
# label 7 without modifying %eax.
474 | .section .fixup, "ax" | ||
475 | 6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr | ||
476 | movl $-EFAULT, (%ebx) | ||
477 | # zero the complete destination (computing the rest is too much work) | ||
478 | movl ARGBASE+8(%esp),%edi # dst | ||
479 | movl ARGBASE+12(%esp),%ecx # len | ||
480 | xorl %eax,%eax | ||
481 | rep; stosb | ||
482 | jmp 7b | ||
483 | 6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr | ||
484 | movl $-EFAULT, (%ebx) | ||
485 | jmp 7b | ||
486 | .previous | ||
487 | |||
# Common exit: restore the registers pushed on entry.
488 | popl %esi | ||
489 | popl %edi | ||
490 | popl %ebx | ||
491 | ret | ||
492 | |||
493 | #undef ROUND | ||
494 | #undef ROUND1 | ||
495 | |||
496 | #endif | ||