Diffstat (limited to 'arch/mips/lib/csum_partial.S')
-rw-r--r--	arch/mips/lib/csum_partial.S	275
1 file changed, 158 insertions, 117 deletions
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index c0a77fe038be..8d7784122c14 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -7,6 +7,7 @@
  *
  * Copyright (C) 1998, 1999 Ralf Baechle
  * Copyright (C) 1999 Silicon Graphics, Inc.
+ * Copyright (C) 2007 Maciej W. Rozycki
  */
 #include <linux/errno.h>
 #include <asm/asm.h>
@@ -52,9 +53,12 @@
 #define UNIT(unit)	((unit)*NBYTES)
 
 #define ADDC(sum,reg)			\
+	.set	push;			\
+	.set	noat;			\
 	ADD	sum, reg;		\
 	sltu	v1, sum, reg;		\
-	ADD	sum, v1
+	ADD	sum, v1;		\
+	.set	pop
 
 #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
 	LOAD	_t0, (offset + UNIT(0))(src);	\
@@ -92,13 +96,13 @@ LEAF(csum_partial)
 	move	t7, zero
 
 	sltiu	t8, a1, 0x8
-	bnez	t8, small_csumcpy		/* < 8 bytes to copy */
+	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
 	move	t2, a1
 
 	andi	t7, src, 0x1			/* odd buffer? */
 
-hword_align:
-	beqz	t7, word_align
+.Lhword_align:
+	beqz	t7, .Lword_align
 	andi	t8, src, 0x2
 
 	lbu	t0, (src)
@@ -110,8 +114,8 @@ hword_align:
 	PTR_ADDU	src, src, 0x1
 	andi	t8, src, 0x2
 
-word_align:
-	beqz	t8, dword_align
+.Lword_align:
+	beqz	t8, .Ldword_align
 	sltiu	t8, a1, 56
 
 	lhu	t0, (src)
@@ -120,12 +124,12 @@ word_align:
 	sltiu	t8, a1, 56
 	PTR_ADDU	src, src, 0x2
 
-dword_align:
-	bnez	t8, do_end_words
+.Ldword_align:
+	bnez	t8, .Ldo_end_words
 	move	t8, a1
 
 	andi	t8, src, 0x4
-	beqz	t8, qword_align
+	beqz	t8, .Lqword_align
 	andi	t8, src, 0x8
 
 	lw	t0, 0x00(src)
@@ -134,8 +138,8 @@ dword_align:
 	PTR_ADDU	src, src, 0x4
 	andi	t8, src, 0x8
 
-qword_align:
-	beqz	t8, oword_align
+.Lqword_align:
+	beqz	t8, .Loword_align
 	andi	t8, src, 0x10
 
 #ifdef USE_DOUBLE
@@ -152,8 +156,8 @@ qword_align:
 	PTR_ADDU	src, src, 0x8
 	andi	t8, src, 0x10
 
-oword_align:
-	beqz	t8, begin_movement
+.Loword_align:
+	beqz	t8, .Lbegin_movement
 	LONG_SRL	t8, a1, 0x7
 
 #ifdef USE_DOUBLE
@@ -168,51 +172,55 @@ oword_align:
 	PTR_ADDU	src, src, 0x10
 	LONG_SRL	t8, a1, 0x7
 
-begin_movement:
+.Lbegin_movement:
 	beqz	t8, 1f
 	andi	t2, a1, 0x40
 
-move_128bytes:
+.Lmove_128bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
 	LONG_SUBU	t8, t8, 0x01
-	bnez	t8, move_128bytes
+	.set	reorder				/* DADDI_WAR */
 	PTR_ADDU	src, src, 0x80
+	bnez	t8, .Lmove_128bytes
+	.set	noreorder
 
 1:
 	beqz	t2, 1f
 	andi	t2, a1, 0x20
 
-move_64bytes:
+.Lmove_64bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 	PTR_ADDU	src, src, 0x40
 
 1:
-	beqz	t2, do_end_words
+	beqz	t2, .Ldo_end_words
 	andi	t8, a1, 0x1c
 
-move_32bytes:
+.Lmove_32bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 	andi	t8, a1, 0x1c
 	PTR_ADDU	src, src, 0x20
 
-do_end_words:
-	beqz	t8, small_csumcpy
+.Ldo_end_words:
+	beqz	t8, .Lsmall_csumcpy
 	andi	t2, a1, 0x3
 	LONG_SRL	t8, t8, 0x2
 
-end_words:
+.Lend_words:
 	lw	t0, (src)
 	LONG_SUBU	t8, t8, 0x1
 	ADDC(sum, t0)
-	bnez	t8, end_words
+	.set	reorder				/* DADDI_WAR */
 	PTR_ADDU	src, src, 0x4
+	bnez	t8, .Lend_words
+	.set	noreorder
 
 	/* unknown src alignment and < 8 bytes to go */
-small_csumcpy:
+.Lsmall_csumcpy:
 	move	a1, t2
 
 	andi	t0, a1, 4
@@ -246,6 +254,8 @@ small_csumcpy:
 1:	ADDC(sum, t1)
 
 	/* fold checksum */
+	.set	push
+	.set	noat
 #ifdef USE_DOUBLE
 	dsll32	v1, sum, 0
 	daddu	sum, v1
@@ -266,6 +276,7 @@ small_csumcpy:
 	srl	sum, sum, 8
 	or	sum, v1
 	andi	sum, 0xffff
+	.set	pop
 1:
 	.set	reorder
 	/* Add the passed partial csum. */
@@ -373,7 +384,11 @@ small_csumcpy:
 
 #define ADDRMASK (NBYTES-1)
 
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	.set	noat
+#else
+	.set	at=v1
+#endif
 
 LEAF(__csum_partial_copy_user)
 	PTR_ADDU	AT, src, len	/* See (1) above. */
@@ -398,95 +413,101 @@ FEXPORT(csum_partial_copy_nocheck)
 	 */
 	sltu	t2, len, NBYTES
 	and	t1, dst, ADDRMASK
-	bnez	t2, copy_bytes_checklen
+	bnez	t2, .Lcopy_bytes_checklen
 	and	t0, src, ADDRMASK
 	andi	odd, dst, 0x1			/* odd buffer? */
-	bnez	t1, dst_unaligned
+	bnez	t1, .Ldst_unaligned
 	nop
-	bnez	t0, src_unaligned_dst_aligned
+	bnez	t0, .Lsrc_unaligned_dst_aligned
 	/*
 	 * use delay slot for fall-through
 	 * src and dst are aligned; need to compute rem
 	 */
-both_aligned:
+.Lboth_aligned:
 	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
-	beqz	t0, cleanup_both_aligned	# len < 8*NBYTES
+	beqz	t0, .Lcleanup_both_aligned	# len < 8*NBYTES
 	nop
 	SUB	len, 8*NBYTES			# subtract here for bgez loop
 	.align	4
 1:
-EXC(	LOAD	t0, UNIT(0)(src), l_exc)
-EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
-EXC(	LOAD	t4, UNIT(4)(src), l_exc_copy)
-EXC(	LOAD	t5, UNIT(5)(src), l_exc_copy)
-EXC(	LOAD	t6, UNIT(6)(src), l_exc_copy)
-EXC(	LOAD	t7, UNIT(7)(src), l_exc_copy)
+EXC(	LOAD	t0, UNIT(0)(src), .Ll_exc)
+EXC(	LOAD	t1, UNIT(1)(src), .Ll_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src), .Ll_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src), .Ll_exc_copy)
+EXC(	LOAD	t4, UNIT(4)(src), .Ll_exc_copy)
+EXC(	LOAD	t5, UNIT(5)(src), .Ll_exc_copy)
+EXC(	LOAD	t6, UNIT(6)(src), .Ll_exc_copy)
+EXC(	LOAD	t7, UNIT(7)(src), .Ll_exc_copy)
 	SUB	len, len, 8*NBYTES
 	ADD	src, src, 8*NBYTES
-EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+EXC(	STORE	t0, UNIT(0)(dst), .Ls_exc)
 	ADDC(sum, t0)
-EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+EXC(	STORE	t1, UNIT(1)(dst), .Ls_exc)
 	ADDC(sum, t1)
-EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+EXC(	STORE	t2, UNIT(2)(dst), .Ls_exc)
 	ADDC(sum, t2)
-EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+EXC(	STORE	t3, UNIT(3)(dst), .Ls_exc)
 	ADDC(sum, t3)
-EXC(	STORE	t4, UNIT(4)(dst), s_exc)
+EXC(	STORE	t4, UNIT(4)(dst), .Ls_exc)
 	ADDC(sum, t4)
-EXC(	STORE	t5, UNIT(5)(dst), s_exc)
+EXC(	STORE	t5, UNIT(5)(dst), .Ls_exc)
 	ADDC(sum, t5)
-EXC(	STORE	t6, UNIT(6)(dst), s_exc)
+EXC(	STORE	t6, UNIT(6)(dst), .Ls_exc)
 	ADDC(sum, t6)
-EXC(	STORE	t7, UNIT(7)(dst), s_exc)
+EXC(	STORE	t7, UNIT(7)(dst), .Ls_exc)
 	ADDC(sum, t7)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 8*NBYTES
 	bgez	len, 1b
-	ADD	dst, dst, 8*NBYTES
+	.set	noreorder
 	ADD	len, 8*NBYTES			# revert len (see above)
 
 	/*
 	 * len == the number of bytes left to copy < 8*NBYTES
 	 */
-cleanup_both_aligned:
+.Lcleanup_both_aligned:
 #define rem t7
-	beqz	len, done
+	beqz	len, .Ldone
 	sltu	t0, len, 4*NBYTES
-	bnez	t0, less_than_4units
+	bnez	t0, .Lless_than_4units
 	and	rem, len, (NBYTES-1)		# rem = len % NBYTES
 	/*
 	 * len >= 4*NBYTES
 	 */
-EXC(	LOAD	t0, UNIT(0)(src), l_exc)
-EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
+EXC(	LOAD	t0, UNIT(0)(src), .Ll_exc)
+EXC(	LOAD	t1, UNIT(1)(src), .Ll_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src), .Ll_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src), .Ll_exc_copy)
 	SUB	len, len, 4*NBYTES
 	ADD	src, src, 4*NBYTES
-EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+EXC(	STORE	t0, UNIT(0)(dst), .Ls_exc)
 	ADDC(sum, t0)
-EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+EXC(	STORE	t1, UNIT(1)(dst), .Ls_exc)
 	ADDC(sum, t1)
-EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+EXC(	STORE	t2, UNIT(2)(dst), .Ls_exc)
 	ADDC(sum, t2)
-EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+EXC(	STORE	t3, UNIT(3)(dst), .Ls_exc)
 	ADDC(sum, t3)
-	beqz	len, done
+	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
-less_than_4units:
+	beqz	len, .Ldone
+	.set	noreorder
+.Lless_than_4units:
 	/*
 	 * rem = len % NBYTES
 	 */
-	beq	rem, len, copy_bytes
+	beq	rem, len, .Lcopy_bytes
 	nop
 1:
-EXC(	LOAD	t0, 0(src), l_exc)
+EXC(	LOAD	t0, 0(src), .Ll_exc)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
-EXC(	STORE	t0, 0(dst), s_exc)
+EXC(	STORE	t0, 0(dst), .Ls_exc)
 	ADDC(sum, t0)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	rem, len, 1b
-	ADD	dst, dst, NBYTES
+	.set	noreorder
 
 	/*
 	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -500,20 +521,20 @@ EXC( STORE t0, 0(dst), s_exc)
 	 * more instruction-level parallelism.
 	 */
 #define bits t2
-	beqz	len, done
+	beqz	len, .Ldone
 	ADD	t1, dst, len			# t1 is just past last byte of dst
 	li	bits, 8*NBYTES
 	SLL	rem, len, 3			# rem = number of bits to keep
-EXC(	LOAD	t0, 0(src), l_exc)
+EXC(	LOAD	t0, 0(src), .Ll_exc)
 	SUB	bits, bits, rem			# bits = number of bits to discard
 	SHIFT_DISCARD t0, t0, bits
-EXC(	STREST	t0, -1(t1), s_exc)
+EXC(	STREST	t0, -1(t1), .Ls_exc)
 	SHIFT_DISCARD_REVERT t0, t0, bits
 	.set	reorder
 	ADDC(sum, t0)
-	b	done
+	b	.Ldone
 	.set	noreorder
-dst_unaligned:
+.Ldst_unaligned:
 	/*
 	 * dst is unaligned
 	 * t0 = src & ADDRMASK
@@ -524,25 +545,25 @@ dst_unaligned:
 	 * Set match = (src and dst have same alignment)
 	 */
 #define match rem
-EXC(	LDFIRST	t3, FIRST(0)(src), l_exc)
+EXC(	LDFIRST	t3, FIRST(0)(src), .Ll_exc)
 	ADD	t2, zero, NBYTES
-EXC(	LDREST	t3, REST(0)(src), l_exc_copy)
+EXC(	LDREST	t3, REST(0)(src), .Ll_exc_copy)
 	SUB	t2, t2, t1			# t2 = number of bytes copied
 	xor	match, t0, t1
-EXC(	STFIRST t3, FIRST(0)(dst), s_exc)
+EXC(	STFIRST t3, FIRST(0)(dst), .Ls_exc)
 	SLL	t4, t1, 3			# t4 = number of bits to discard
 	SHIFT_DISCARD t3, t3, t4
 	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
 	ADDC(sum, t3)
-	beq	len, t2, done
+	beq	len, t2, .Ldone
 	SUB	len, len, t2
 	ADD	dst, dst, t2
-	beqz	match, both_aligned
+	beqz	match, .Lboth_aligned
 	ADD	src, src, t2
 
-src_unaligned_dst_aligned:
+.Lsrc_unaligned_dst_aligned:
 	SRL	t0, len, LOG_NBYTES+2		# +2 for 4 units/iter
-	beqz	t0, cleanup_src_unaligned
+	beqz	t0, .Lcleanup_src_unaligned
 	and	rem, len, (4*NBYTES-1)		# rem = len % 4*NBYTES
 1:
 	/*
@@ -551,49 +572,53 @@ src_unaligned_dst_aligned:
 	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
 	 */
-EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
-EXC(	LDFIRST	t1, FIRST(1)(src), l_exc_copy)
+EXC(	LDFIRST	t0, FIRST(0)(src), .Ll_exc)
+EXC(	LDFIRST	t1, FIRST(1)(src), .Ll_exc_copy)
 	SUB	len, len, 4*NBYTES
-EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
-EXC(	LDREST	t1, REST(1)(src), l_exc_copy)
-EXC(	LDFIRST	t2, FIRST(2)(src), l_exc_copy)
-EXC(	LDFIRST	t3, FIRST(3)(src), l_exc_copy)
-EXC(	LDREST	t2, REST(2)(src), l_exc_copy)
-EXC(	LDREST	t3, REST(3)(src), l_exc_copy)
+EXC(	LDREST	t0, REST(0)(src), .Ll_exc_copy)
+EXC(	LDREST	t1, REST(1)(src), .Ll_exc_copy)
+EXC(	LDFIRST	t2, FIRST(2)(src), .Ll_exc_copy)
+EXC(	LDFIRST	t3, FIRST(3)(src), .Ll_exc_copy)
+EXC(	LDREST	t2, REST(2)(src), .Ll_exc_copy)
+EXC(	LDREST	t3, REST(3)(src), .Ll_exc_copy)
 	ADD	src, src, 4*NBYTES
 #ifdef CONFIG_CPU_SB1
 	nop				# improves slotting
 #endif
-EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+EXC(	STORE	t0, UNIT(0)(dst), .Ls_exc)
 	ADDC(sum, t0)
-EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+EXC(	STORE	t1, UNIT(1)(dst), .Ls_exc)
 	ADDC(sum, t1)
-EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+EXC(	STORE	t2, UNIT(2)(dst), .Ls_exc)
 	ADDC(sum, t2)
-EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+EXC(	STORE	t3, UNIT(3)(dst), .Ls_exc)
 	ADDC(sum, t3)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-	ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 
-cleanup_src_unaligned:
-	beqz	len, done
+.Lcleanup_src_unaligned:
+	beqz	len, .Ldone
 	and	rem, len, NBYTES-1		# rem = len % NBYTES
-	beq	rem, len, copy_bytes
+	beq	rem, len, .Lcopy_bytes
 	nop
 1:
-EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
-EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
+EXC(	LDFIRST	t0, FIRST(0)(src), .Ll_exc)
+EXC(	LDREST	t0, REST(0)(src), .Ll_exc_copy)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
-EXC(	STORE	t0, 0(dst), s_exc)
+EXC(	STORE	t0, 0(dst), .Ls_exc)
 	ADDC(sum, t0)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	len, rem, 1b
-	ADD	dst, dst, NBYTES
+	.set	noreorder
 
-copy_bytes_checklen:
-	beqz	len, done
+.Lcopy_bytes_checklen:
+	beqz	len, .Ldone
 	nop
-copy_bytes:
+.Lcopy_bytes:
 	/* 0 < len < NBYTES */
 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 #define SHIFT_START 0
@@ -604,14 +629,14 @@ copy_bytes:
 #endif
 	move	t2, zero	# partial word
 	li	t3, SHIFT_START	# shift
-/* use l_exc_copy here to return correct sum on fault */
+/* use .Ll_exc_copy here to return correct sum on fault */
 #define COPY_BYTE(N)			\
-EXC(	lbu	t0, N(src), l_exc_copy);	\
+EXC(	lbu	t0, N(src), .Ll_exc_copy);	\
 	SUB	len, len, 1;		\
-EXC(	sb	t0, N(dst), s_exc);	\
+EXC(	sb	t0, N(dst), .Ls_exc);	\
 	SLLV	t0, t0, t3;		\
 	addu	t3, SHIFT_INC;		\
-	beqz	len, copy_bytes_done;	\
+	beqz	len, .Lcopy_bytes_done;	\
 	or	t2, t0
 
 	COPY_BYTE(0)
@@ -622,15 +647,17 @@ EXC( sb t0, N(dst), s_exc); \
 	COPY_BYTE(4)
 	COPY_BYTE(5)
 #endif
-EXC(	lbu	t0, NBYTES-2(src), l_exc_copy)
+EXC(	lbu	t0, NBYTES-2(src), .Ll_exc_copy)
 	SUB	len, len, 1
-EXC(	sb	t0, NBYTES-2(dst), s_exc)
+EXC(	sb	t0, NBYTES-2(dst), .Ls_exc)
 	SLLV	t0, t0, t3
 	or	t2, t0
-copy_bytes_done:
+.Lcopy_bytes_done:
 	ADDC(sum, t2)
-done:
+.Ldone:
 	/* fold checksum */
+	.set	push
+	.set	noat
 #ifdef USE_DOUBLE
 	dsll32	v1, sum, 0
 	daddu	sum, v1
@@ -651,13 +678,14 @@ done:
 	srl	sum, sum, 8
 	or	sum, v1
 	andi	sum, 0xffff
+	.set	pop
 1:
 	.set	reorder
 	ADDC(sum, psum)
 	jr	ra
 	.set	noreorder
 
-l_exc_copy:
+.Ll_exc_copy:
 	/*
 	 * Copy bytes from src until faulting load address (or until a
 	 * lb faults)
@@ -672,15 +700,17 @@ l_exc_copy:
 	li	t2, SHIFT_START
 	LOAD	t0, THREAD_BUADDR(t0)
 1:
-EXC(	lbu	t1, 0(src), l_exc)
+EXC(	lbu	t1, 0(src), .Ll_exc)
 	ADD	src, src, 1
 	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
 	SLLV	t1, t1, t2
 	addu	t2, SHIFT_INC
 	ADDC(sum, t1)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 1
 	bne	src, t0, 1b
-	ADD	dst, dst, 1
-l_exc:
+	.set	noreorder
+.Ll_exc:
 	LOAD	t0, TI_TASK($28)
 	nop
 	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
@@ -697,19 +727,30 @@ l_exc:
 	 * Clear len bytes starting at dst.  Can't call __bzero because it
 	 * might modify len.  An inefficient loop for these rare times...
 	 */
-	beqz	len, done
+	.set	reorder				/* DADDI_WAR */
 	SUB	src, len, 1
+	beqz	len, .Ldone
+	.set	noreorder
 1:	sb	zero, 0(dst)
 	ADD	dst, dst, 1
+	.set	push
+	.set	noat
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	bnez	src, 1b
 	SUB	src, src, 1
+#else
+	li	v1, 1
+	bnez	src, 1b
+	SUB	src, src, v1
+#endif
 	li	v1, -EFAULT
-	b	done
+	b	.Ldone
 	sw	v1, (errptr)
 
-s_exc:
+.Ls_exc:
 	li	v0, -1	/* invalid checksum */
 	li	v1, -EFAULT
 	jr	ra
 	sw	v1, (errptr)
+	.set	pop
 END(__csum_partial_copy_user)
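
As a reading aid for the change above: the ADDC() macro performs an end-around-carry add (the sltu/ADD pair wraps the carry back into the sum), and the "fold checksum" epilogues reduce the accumulated sum to a 16-bit value. A minimal C sketch of that arithmetic follows; the helper names are illustrative only, and it assumes a 32-bit accumulator (the USE_DOUBLE path first folds the upper 32 bits into the lower 32 the same way).

#include <stdint.h>

/* Roughly what ADDC(sum, reg) does: add and fold the carry back in. */
static uint32_t addc(uint32_t sum, uint32_t word)
{
	sum += word;
	if (sum < word)		/* unsigned overflow means a carry out... */
		sum += 1;	/* ...so add it back (the sltu/ADD pair) */
	return sum;
}

/* The fold-to-16-bits step done at the checksum epilogues. */
static uint16_t fold16(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold high half into low half */
	sum = (sum & 0xffff) + (sum >> 16);	/* absorb any new carry */
	return (uint16_t)sum;
}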