Diffstat (limited to 'arch/um/sys-x86/checksum_32.S')
-rw-r--r--	arch/um/sys-x86/checksum_32.S	458
1 file changed, 458 insertions(+), 0 deletions(-)
diff --git a/arch/um/sys-x86/checksum_32.S b/arch/um/sys-x86/checksum_32.S
new file mode 100644
index 000000000000..f058d2f82e18
--- /dev/null
+++ b/arch/um/sys-x86/checksum_32.S
@@ -0,0 +1,458 @@
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *		Pentium Pro/II routines:
 *		Alexander Kjeldaas <astor@guardian.no>
 *		Finn Arne Gangstad <finnag@guardian.no>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * Changes:	Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *			     handling.
 *		Andi Kleen,  add zeroing on error
 *			     converted to pure assembler
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
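
/*
 * For orientation, a rough C sketch of what csum_partial computes
 * (illustrative only -- the name csum_sketch is ours, not a kernel
 * API; it matches the assembly up to the final 16-bit fold, since the
 * one's-complement sum is insensitive to where carries are folded):
 *
 *	unsigned int csum_sketch(const unsigned char *buff, int len,
 *				 unsigned int sum)
 *	{
 *		unsigned long long acc = sum;
 *
 *		while (len >= 4) {		// whole 32-bit words
 *			acc += *(const unsigned int *)buff;
 *			buff += 4;
 *			len -= 4;
 *		}
 *		if (len >= 2) {			// trailing 16-bit word
 *			acc += *(const unsigned short *)buff;
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)			// trailing byte
 *			acc += *buff;
 *		while (acc >> 32)		// end-around carry fold
 *			acc = (acc & 0xffffffffULL) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */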

.text
.align 4
.globl csum_partial

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

/*
 * Experiments with Ethernet and SLIP connections show that buff
 * is aligned on either a 2-byte or 4-byte boundary.  We get at
 * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
 * alignment for the unrolled loop.
 */
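
/*
 * Note on the carry discipline used below: the 32-bit one's-complement
 * sum is kept in %eax with adcl, so the carry out of each add must
 * survive until the next one.  "testl %esi, %esi" is there only to
 * clear CF before entering the adcl chain, and the loops step with
 * lea/dec (which leave CF untouched) so the chain is never broken;
 * a final "adcl $0" folds the last carry back into the sum.
 */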
csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: unsigned char *buff
	testl $2, %esi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2.  Deal with it.
	jmp 4f
1:	movw (%esi), %bx
	addl $2, %esi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, %edx
	shrl $5, %ecx		# number of 32-byte blocks
	jz 2f
	testl %esi, %esi	# clear CF before the adcl chain
1:	movl (%esi), %ebx
	adcl %ebx, %eax
	movl 4(%esi), %ebx
	adcl %ebx, %eax
	movl 8(%esi), %ebx
	adcl %ebx, %eax
	movl 12(%esi), %ebx
	adcl %ebx, %eax
	movl 16(%esi), %ebx
	adcl %ebx, %eax
	movl 20(%esi), %ebx
	adcl %ebx, %eax
	movl 24(%esi), %ebx
	adcl %ebx, %eax
	movl 28(%esi), %ebx
	adcl %ebx, %eax
	lea 32(%esi), %esi
	dec %ecx		# lea and dec leave CF intact
	jne 1b
	adcl $0, %eax		# fold the final carry
2:	movl %edx, %ecx
	andl $0x1c, %edx
	je 4f
	shrl $2, %edx		# This clears CF
3:	adcl (%esi), %eax
	lea 4(%esi), %esi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx		# 0-3 trailing bytes
	jz 7f
	cmpl $2, %ecx
	jb 5f			# just one byte left
	movw (%esi),%cx		# two (or the first two of three) bytes
	leal 2(%esi),%esi
	je 6f			# exactly two bytes left
	shll $16,%ecx		# three: word in the high half of %ecx,
5:	movb (%esi),%cl		# trailing byte in the low half
6:	addl %ecx,%eax
	adcl $0, %eax
7:
	popl %ebx
	popl %esi
	ret

#else

/* Version for PentiumII/PPro */

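/*
 * Dispatch trick below: %ebx is set to minus the number of dwords in
 * the partial (len mod 128) block.  Each "adcl off(%esi),%eax" in the
 * unrolled loop assembles to exactly 3 bytes, so "lea 45f(%ebx,%ebx,2)"
 * (i.e. 45f + 3*%ebx) points that many instructions before label 45,
 * and the indirect jump executes just the adcls that are needed --
 * a Duff's-device-style entry into the unrolled loop.
 */
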
csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: const unsigned char *buf

	testl $2, %esi
	jnz 30f
10:
	movl %ecx, %edx
	movl %ecx, %ebx
	andl $0x7c, %ebx	# dword-aligned remainder of len mod 128
	shrl $7, %ecx		# number of full 128-byte blocks
	addl %ebx,%esi		# loop below uses negative offsets from %esi
	shrl $2, %ebx		# dwords in the partial block
	negl %ebx
	lea 45f(%ebx,%ebx,2), %ebx	# 3 code bytes per adcl
	testl %esi, %esi	# clear CF before the adcl chain
	jmp *%ebx

# Handle 2-byte-aligned regions
20:	addw (%esi), %ax
	lea 2(%esi), %esi
	adcl $0, %eax
	jmp 10b

30:	subl $2, %ecx
	ja 20b
	je 32f
	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
	addl %ebx, %eax
	adcl $0, %eax
	jmp 80f
32:
	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
	adcl $0, %eax
	jmp 80f

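# The unrolled loop: 32 adds per iteration sum one 128-byte block using
# negative offsets from %esi; label 45 below doubles as the computed-jump
# target that handles the partial block first.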
40:
	addl -128(%esi), %eax
	adcl -124(%esi), %eax
	adcl -120(%esi), %eax
	adcl -116(%esi), %eax
	adcl -112(%esi), %eax
	adcl -108(%esi), %eax
	adcl -104(%esi), %eax
	adcl -100(%esi), %eax
	adcl -96(%esi), %eax
	adcl -92(%esi), %eax
	adcl -88(%esi), %eax
	adcl -84(%esi), %eax
	adcl -80(%esi), %eax
	adcl -76(%esi), %eax
	adcl -72(%esi), %eax
	adcl -68(%esi), %eax
	adcl -64(%esi), %eax
	adcl -60(%esi), %eax
	adcl -56(%esi), %eax
	adcl -52(%esi), %eax
	adcl -48(%esi), %eax
	adcl -44(%esi), %eax
	adcl -40(%esi), %eax
	adcl -36(%esi), %eax
	adcl -32(%esi), %eax
	adcl -28(%esi), %eax
	adcl -24(%esi), %eax
	adcl -20(%esi), %eax
	adcl -16(%esi), %eax
	adcl -12(%esi), %eax
	adcl -8(%esi), %eax
	adcl -4(%esi), %eax
45:
	lea 128(%esi), %esi	# lea preserves CF from the last adcl
	adcl $0, %eax		# fold it into the sum
	dec %ecx
	jge 40b
	movl %edx, %ecx
50:	andl $3, %ecx		# 0-3 trailing bytes
	jz 80f

# Handle the last 1-3 bytes without jumping
	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
	movl $0xffffff,%ebx	# by the shll and shrl instructions
	shll $3,%ecx		# shift count in bits: 16, 8 or 0,
	shrl %cl,%ebx		# so %ebx masks 1, 2 or 3 bytes
	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
	addl %ebx,%eax
	adcl $0,%eax
80:
	popl %ebx
	popl %esi
	ret

#endif

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
				  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
*/
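
/*
 * Error semantics (as implemented by the fixup code below): a fault
 * while reading src stores -EFAULT through src_err_ptr and zeroes the
 * whole destination; a fault while writing dst stores -EFAULT through
 * dst_err_ptr.  Either way the routine returns normally, so the caller
 * must check the error slots.
 */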

/*
 * Copy from ds while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so we can attach a custom exception handler to each access type.
 *
 * FIXME: could someone double-check whether I haven't mixed up some SRC and
 *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
 *	  them all but there's no guarantee.
 */
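
/*
 * SRC()/DST() wrap each memory access in an __ex_table entry: the
 * address of the (possibly faulting) instruction is paired with a
 * fixup address (6001f for source reads, 6002f for destination
 * writes).  On a fault, the kernel's page-fault handler looks up the
 * faulting EIP in __ex_table and resumes execution at the fixup
 * instead of oopsing.
 */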

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous

.align 4

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP	12

csum_partial_copy_generic_i386:
	subl  $4,%esp
	pushl %edi
	pushl %esi
	pushl %ebx
	movl ARGBASE+16(%esp),%eax	# sum
	movl ARGBASE+12(%esp),%ecx	# len
	movl ARGBASE+4(%esp),%esi	# src
	movl ARGBASE+8(%esp),%edi	# dst

	testl $2, %edi			# Check alignment.
	jz 2f				# Jump if alignment is ok.
	subl $2, %ecx			# Alignment uses up two bytes.
	jae 1f				# Jump if we had at least two bytes.
	addl $2, %ecx			# ecx was < 2.  Deal with it.
	jmp 4f
SRC(1:	movw (%esi), %bx	)
	addl $2, %esi
DST(	movw %bx, (%edi)	)
	addl $2, %edi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, FP(%esp)		# stash len in the spare stack slot
	shrl $5, %ecx			# number of 32-byte blocks
	jz 2f
	testl %esi, %esi		# clear CF before the adcl chain
SRC(1:	movl (%esi), %ebx	)
SRC(	movl 4(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, (%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 4(%edi)	)

SRC(	movl 8(%esi), %ebx	)
SRC(	movl 12(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 8(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 12(%edi)	)

SRC(	movl 16(%esi), %ebx	)
SRC(	movl 20(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 16(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 20(%edi)	)

SRC(	movl 24(%esi), %ebx	)
SRC(	movl 28(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 24(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 28(%edi)	)

	lea 32(%esi), %esi
	lea 32(%edi), %edi
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl FP(%esp), %edx
	movl %edx, %ecx
	andl $0x1c, %edx
	je 4f
	shrl $2, %edx			# This clears CF
SRC(3:	movl (%esi), %ebx	)
	adcl %ebx, %eax
DST(	movl %ebx, (%edi)	)
	lea 4(%esi), %esi
	lea 4(%edi), %edi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx
	jz 7f
	cmpl $2, %ecx
	jb 5f
SRC(	movw (%esi), %cx	)
	leal 2(%esi), %esi
DST(	movw %cx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%ecx
SRC(5:	movb (%esi), %cl	)
DST(	movb %cl, (%edi)	)
6:	addl %ecx, %eax
	adcl $0, %eax
7:
5000:

# Exception handler:
.section .fixup, "ax"

6001:					# fault while reading src
	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
	movl $-EFAULT, (%ebx)

	# zero the complete destination - computing the rest
	# is too much work
	movl ARGBASE+8(%esp), %edi	# dst
	movl ARGBASE+12(%esp), %ecx	# len
	xorl %eax,%eax
	rep ; stosb

	jmp 5000b

6002:					# fault while writing dst
	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
	movl $-EFAULT,(%ebx)
	jmp 5000b

.previous

	popl %ebx
	popl %esi
	popl %edi
	popl %ecx			# equivalent to addl $4,%esp
	ret

#else

/* Version for PentiumII/PPro */

# ROUND1 opens a carry chain with addl; ROUND continues it with adcl.
# Each ROUND expands to exactly 8 bytes of code (3-byte load, 2-byte
# adcl, 3-byte store) -- the computed jump below depends on this.
#define ROUND1(x) \
	SRC(movl x(%esi), %ebx	)	;	\
	addl %ebx, %eax			;	\
	DST(movl %ebx, x(%edi)	)	;

#define ROUND(x) \
	SRC(movl x(%esi), %ebx	)	;	\
	adcl %ebx, %eax			;	\
	DST(movl %ebx, x(%edi)	)	;

#define ARGBASE 12

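# Dispatch as in csum_partial above: %ebx is set to minus the byte
# count of the partial (len mod 64) block, so "lea 3f(%ebx,%ebx)"
# points 2*(len & 0x3c) code bytes before label 3 -- one ROUND (8
# bytes) per dword -- and the indirect jump runs only the ROUNDs that
# are needed.  The two movb reads at label 1 appear to probe ahead in
# the source (one byte from each of two cache lines) so that a read
# fault on the upcoming block is taken at the top of the iteration.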
csum_partial_copy_generic_i386:
	pushl %ebx
	pushl %edi
	pushl %esi
	movl ARGBASE+4(%esp),%esi	#src
	movl ARGBASE+8(%esp),%edi	#dst
	movl ARGBASE+12(%esp),%ecx	#len
	movl ARGBASE+16(%esp),%eax	#sum
#	movl %ecx, %edx
	movl %ecx, %ebx
	movl %esi, %edx
	shrl $6, %ecx			# number of full 64-byte blocks
	andl $0x3c, %ebx		# dword-aligned remainder of len mod 64
	negl %ebx
	subl %ebx, %esi			# advance past the partial block
	subl %ebx, %edi			# (ROUNDs use negative offsets)
	lea  -1(%esi),%edx
	andl $-32,%edx			# cache-line-aligned probe pointer
	lea 3f(%ebx,%ebx), %ebx		# 8 code bytes per ROUND
	testl %esi, %esi		# clear CF before the chain
	jmp *%ebx
1:	addl $64,%esi
	addl $64,%edi
SRC(	movb -32(%edx),%bl	) ; SRC(	movb (%edx),%bl	)
	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
3:	adcl $0,%eax			# fold the final carry
	addl $64, %edx			# advance the probe pointer
	dec %ecx
	jge 1b
4:	movl ARGBASE+12(%esp),%edx	#len
	andl $3, %edx
	jz 7f
	cmpl $2, %edx
	jb 5f
SRC(	movw (%esi), %dx	)
	leal 2(%esi), %esi
DST(	movw %dx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%edx
5:
SRC(	movb (%esi), %dl	)
DST(	movb %dl, (%edi)	)
6:	addl %edx, %eax
	adcl $0, %eax
7:
.section .fixup, "ax"
6001:	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
	movl $-EFAULT, (%ebx)
	# zero the complete destination (computing the rest is too much work)
	movl ARGBASE+8(%esp), %edi	# dst
	movl ARGBASE+12(%esp), %ecx	# len
	xorl %eax,%eax
	rep; stosb
	jmp 7b
6002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
	movl $-EFAULT, (%ebx)
	jmp 7b
.previous

	popl %esi
	popl %edi
	popl %ebx
	ret

#undef ROUND
#undef ROUND1

#endif