Diffstat (limited to 'arch/mips/lib/csum_partial.S')
-rw-r--r--  arch/mips/lib/csum_partial.S | 275
1 file changed, 158 insertions(+), 117 deletions(-)
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index c0a77fe038be..8d7784122c14 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -7,6 +7,7 @@
  *
  * Copyright (C) 1998, 1999 Ralf Baechle
  * Copyright (C) 1999 Silicon Graphics, Inc.
+ * Copyright (C) 2007 Maciej W. Rozycki
  */
 #include <linux/errno.h>
 #include <asm/asm.h>
@@ -52,9 +53,12 @@
 #define UNIT(unit) ((unit)*NBYTES)
 
 #define ADDC(sum,reg)					\
+	.set	push;					\
+	.set	noat;					\
 	ADD	sum, reg;				\
 	sltu	v1, sum, reg;				\
-	ADD	sum, v1
+	ADD	sum, v1;				\
+	.set	pop
 
 #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
 	LOAD	_t0, (offset + UNIT(0))(src);		\
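
For readers unfamiliar with ones'-complement arithmetic: ADDC performs an end-around-carry add, where sltu sets v1 to 1 when the addition wrapped and the carry is folded back into the sum. The new .set push/noat/pop bracket keeps the assembler temporary out of the macro, since with the DADDI workarounds the assembler temporary is redirected to v1 (see the .set at=v1 added later in this patch) and ADDC uses v1 itself. A minimal C sketch of the same per-word step (illustrative name, not kernel code):

	#include <stdint.h>

	/* End-around-carry add, as ADDC does for each accumulated word. */
	static uint32_t addc32(uint32_t sum, uint32_t word)
	{
		sum += word;		/* ADD	sum, reg	*/
		if (sum < word)		/* sltu	v1, sum, reg	*/
			sum += 1;	/* ADD	sum, v1		*/
		return sum;
	}
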
@@ -92,13 +96,13 @@ LEAF(csum_partial)
 	move	t7, zero
 
 	sltiu	t8, a1, 0x8
-	bnez	t8, small_csumcpy		/* < 8 bytes to copy */
+	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
 	 move	t2, a1
 
 	andi	t7, src, 0x1			/* odd buffer? */
 
-hword_align:
-	beqz	t7, word_align
+.Lhword_align:
+	beqz	t7, .Lword_align
 	 andi	t8, src, 0x2
 
 	lbu	t0, (src)
@@ -110,8 +114,8 @@ hword_align:
 	PTR_ADDU	src, src, 0x1
 	andi	t8, src, 0x2
 
-word_align:
-	beqz	t8, dword_align
+.Lword_align:
+	beqz	t8, .Ldword_align
 	 sltiu	t8, a1, 56
 
 	lhu	t0, (src)
@@ -120,12 +124,12 @@ word_align:
 	sltiu	t8, a1, 56
 	PTR_ADDU	src, src, 0x2
 
-dword_align:
-	bnez	t8, do_end_words
+.Ldword_align:
+	bnez	t8, .Ldo_end_words
 	 move	t8, a1
 
 	andi	t8, src, 0x4
-	beqz	t8, qword_align
+	beqz	t8, .Lqword_align
 	 andi	t8, src, 0x8
 
 	lw	t0, 0x00(src)
@@ -134,8 +138,8 @@ dword_align:
 	PTR_ADDU	src, src, 0x4
 	andi	t8, src, 0x8
 
-qword_align:
-	beqz	t8, oword_align
+.Lqword_align:
+	beqz	t8, .Loword_align
 	 andi	t8, src, 0x10
 
 #ifdef USE_DOUBLE
@@ -152,8 +156,8 @@ qword_align:
 	PTR_ADDU	src, src, 0x8
 	andi	t8, src, 0x10
 
-oword_align:
-	beqz	t8, begin_movement
+.Loword_align:
+	beqz	t8, .Lbegin_movement
 	 LONG_SRL	t8, a1, 0x7
 
 #ifdef USE_DOUBLE
@@ -168,51 +172,55 @@ oword_align:
 	PTR_ADDU	src, src, 0x10
 	LONG_SRL	t8, a1, 0x7
 
-begin_movement:
+.Lbegin_movement:
 	beqz	t8, 1f
 	 andi	t2, a1, 0x40
 
-move_128bytes:
+.Lmove_128bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
 	LONG_SUBU	t8, t8, 0x01
-	bnez	t8, move_128bytes
-	 PTR_ADDU	src, src, 0x80
+	.set	reorder				/* DADDI_WAR */
+	PTR_ADDU	src, src, 0x80
+	bnez	t8, .Lmove_128bytes
+	.set	noreorder
 
 1:
 	beqz	t2, 1f
 	 andi	t2, a1, 0x20
 
-move_64bytes:
+.Lmove_64bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 	PTR_ADDU	src, src, 0x40
 
 1:
-	beqz	t2, do_end_words
+	beqz	t2, .Ldo_end_words
 	 andi	t8, a1, 0x1c
 
-move_32bytes:
+.Lmove_32bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 	andi	t8, a1, 0x1c
 	PTR_ADDU	src, src, 0x20
 
-do_end_words:
-	beqz	t8, small_csumcpy
+.Ldo_end_words:
+	beqz	t8, .Lsmall_csumcpy
 	 andi	t2, a1, 0x3
 	LONG_SRL	t8, t8, 0x2
 
-end_words:
+.Lend_words:
 	lw	t0, (src)
 	LONG_SUBU	t8, t8, 0x1
 	ADDC(sum, t0)
-	bnez	t8, end_words
-	 PTR_ADDU	src, src, 0x4
+	.set	reorder				/* DADDI_WAR */
+	PTR_ADDU	src, src, 0x4
+	bnez	t8, .Lend_words
+	.set	noreorder
 
 /* unknown src alignment and < 8 bytes to go */
-small_csumcpy:
+.Lsmall_csumcpy:
 	move	a1, t2
 
 	andi	t0, a1, 4
@@ -246,6 +254,8 @@ small_csumcpy:
 1:	ADDC(sum, t1)
 
 	/* fold checksum */
+	.set	push
+	.set	noat
 #ifdef USE_DOUBLE
 	dsll32	v1, sum, 0
 	daddu	sum, v1
@@ -266,6 +276,7 @@ small_csumcpy:
 	srl	sum, sum, 8
 	or	sum, v1
 	andi	sum, 0xffff
+	.set	pop
 1:
 	.set	reorder
 	/* Add the passed partial csum. */
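
The fold sequence above reduces the wide accumulator to a 16-bit result: under USE_DOUBLE the 64-bit sum is first folded into 32 bits (dsll32/daddu), the 16-bit halves are then added together, and the final sll/srl/or/andi run swaps the two result bytes when the buffer started at an odd address. A hedged C equivalent of the 32-bit part (illustrative names, not the kernel's code):

	#include <stdint.h>

	/* Fold a 32-bit ones'-complement accumulator to 16 bits. */
	static uint16_t csum_fold32(uint32_t sum, int odd)
	{
		/* add the 16-bit halves twice so any carry is absorbed */
		sum = (sum >> 16) + (sum & 0xffff);
		sum = (sum >> 16) + (sum & 0xffff);
		if (odd)	/* odd source buffer: swap result bytes */
			sum = ((sum & 0xff) << 8) | ((sum >> 8) & 0xff);
		return (uint16_t)sum;
	}
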
@@ -373,7 +384,11 @@ small_csumcpy:
 
 #define ADDRMASK (NBYTES-1)
 
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	.set	noat
+#else
+	.set	at=v1
+#endif
 
 LEAF(__csum_partial_copy_user)
 	PTR_ADDU	AT, src, len	/* See (1) above. */
@@ -398,95 +413,101 @@ FEXPORT(csum_partial_copy_nocheck)
 	 */
 	sltu	t2, len, NBYTES
 	and	t1, dst, ADDRMASK
-	bnez	t2, copy_bytes_checklen
+	bnez	t2, .Lcopy_bytes_checklen
 	 and	t0, src, ADDRMASK
 	andi	odd, dst, 0x1			/* odd buffer? */
-	bnez	t1, dst_unaligned
+	bnez	t1, .Ldst_unaligned
 	 nop
-	bnez	t0, src_unaligned_dst_aligned
+	bnez	t0, .Lsrc_unaligned_dst_aligned
 	/*
 	 * use delay slot for fall-through
 	 * src and dst are aligned; need to compute rem
 	 */
-both_aligned:
+.Lboth_aligned:
 	 SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
-	beqz	t0, cleanup_both_aligned	# len < 8*NBYTES
+	beqz	t0, .Lcleanup_both_aligned	# len < 8*NBYTES
 	 nop
 	SUB	len, 8*NBYTES			# subtract here for bgez loop
 	.align	4
 1:
-EXC(	LOAD	t0, UNIT(0)(src), l_exc)
-EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
-EXC(	LOAD	t4, UNIT(4)(src), l_exc_copy)
-EXC(	LOAD	t5, UNIT(5)(src), l_exc_copy)
-EXC(	LOAD	t6, UNIT(6)(src), l_exc_copy)
-EXC(	LOAD	t7, UNIT(7)(src), l_exc_copy)
+EXC(	LOAD	t0, UNIT(0)(src), .Ll_exc)
+EXC(	LOAD	t1, UNIT(1)(src), .Ll_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src), .Ll_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src), .Ll_exc_copy)
+EXC(	LOAD	t4, UNIT(4)(src), .Ll_exc_copy)
+EXC(	LOAD	t5, UNIT(5)(src), .Ll_exc_copy)
+EXC(	LOAD	t6, UNIT(6)(src), .Ll_exc_copy)
+EXC(	LOAD	t7, UNIT(7)(src), .Ll_exc_copy)
 	SUB	len, len, 8*NBYTES
 	ADD	src, src, 8*NBYTES
-EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+EXC(	STORE	t0, UNIT(0)(dst), .Ls_exc)
 	ADDC(sum, t0)
-EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+EXC(	STORE	t1, UNIT(1)(dst), .Ls_exc)
 	ADDC(sum, t1)
-EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+EXC(	STORE	t2, UNIT(2)(dst), .Ls_exc)
 	ADDC(sum, t2)
-EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+EXC(	STORE	t3, UNIT(3)(dst), .Ls_exc)
 	ADDC(sum, t3)
-EXC(	STORE	t4, UNIT(4)(dst), s_exc)
+EXC(	STORE	t4, UNIT(4)(dst), .Ls_exc)
 	ADDC(sum, t4)
-EXC(	STORE	t5, UNIT(5)(dst), s_exc)
+EXC(	STORE	t5, UNIT(5)(dst), .Ls_exc)
 	ADDC(sum, t5)
-EXC(	STORE	t6, UNIT(6)(dst), s_exc)
+EXC(	STORE	t6, UNIT(6)(dst), .Ls_exc)
 	ADDC(sum, t6)
-EXC(	STORE	t7, UNIT(7)(dst), s_exc)
+EXC(	STORE	t7, UNIT(7)(dst), .Ls_exc)
 	ADDC(sum, t7)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 8*NBYTES
 	bgez	len, 1b
-	 ADD	dst, dst, 8*NBYTES
+	.set	noreorder
 	ADD	len, 8*NBYTES			# revert len (see above)
 
 	/*
 	 * len == the number of bytes left to copy < 8*NBYTES
 	 */
-cleanup_both_aligned:
+.Lcleanup_both_aligned:
 #define rem t7
-	beqz	len, done
+	beqz	len, .Ldone
 	 sltu	t0, len, 4*NBYTES
-	bnez	t0, less_than_4units
+	bnez	t0, .Lless_than_4units
 	 and	rem, len, (NBYTES-1)		# rem = len % NBYTES
 	/*
 	 * len >= 4*NBYTES
 	 */
-EXC(	LOAD	t0, UNIT(0)(src), l_exc)
-EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
+EXC(	LOAD	t0, UNIT(0)(src), .Ll_exc)
+EXC(	LOAD	t1, UNIT(1)(src), .Ll_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src), .Ll_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src), .Ll_exc_copy)
 	SUB	len, len, 4*NBYTES
 	ADD	src, src, 4*NBYTES
-EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+EXC(	STORE	t0, UNIT(0)(dst), .Ls_exc)
 	ADDC(sum, t0)
-EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+EXC(	STORE	t1, UNIT(1)(dst), .Ls_exc)
 	ADDC(sum, t1)
-EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+EXC(	STORE	t2, UNIT(2)(dst), .Ls_exc)
 	ADDC(sum, t2)
-EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+EXC(	STORE	t3, UNIT(3)(dst), .Ls_exc)
 	ADDC(sum, t3)
-	beqz	len, done
-	 ADD	dst, dst, 4*NBYTES
-less_than_4units:
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
+	beqz	len, .Ldone
+	.set	noreorder
+.Lless_than_4units:
 	/*
 	 * rem = len % NBYTES
 	 */
-	beq	rem, len, copy_bytes
+	beq	rem, len, .Lcopy_bytes
 	 nop
 1:
-EXC(	LOAD	t0, 0(src), l_exc)
+EXC(	LOAD	t0, 0(src), .Ll_exc)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
-EXC(	STORE	t0, 0(dst), s_exc)
+EXC(	STORE	t0, 0(dst), .Ls_exc)
 	ADDC(sum, t0)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	rem, len, 1b
-	 ADD	dst, dst, NBYTES
+	.set	noreorder
 
 	/*
 	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -500,20 +521,20 @@ EXC(	STORE	t0, 0(dst), s_exc)
 	 * more instruction-level parallelism.
 	 */
 #define bits t2
-	beqz	len, done
+	beqz	len, .Ldone
 	 ADD	t1, dst, len	# t1 is just past last byte of dst
 	li	bits, 8*NBYTES
 	SLL	rem, len, 3	# rem = number of bits to keep
-EXC(	LOAD	t0, 0(src), l_exc)
+EXC(	LOAD	t0, 0(src), .Ll_exc)
 	SUB	bits, bits, rem	# bits = number of bits to discard
 	SHIFT_DISCARD t0, t0, bits
-EXC(	STREST	t0, -1(t1), s_exc)
+EXC(	STREST	t0, -1(t1), .Ls_exc)
 	SHIFT_DISCARD_REVERT t0, t0, bits
 	.set	reorder
 	ADDC(sum, t0)
-	b	done
+	b	.Ldone
 	.set	noreorder
-dst_unaligned:
+.Ldst_unaligned:
 	/*
 	 * dst is unaligned
 	 * t0 = src & ADDRMASK
@@ -524,25 +545,25 @@ dst_unaligned:
 	 * Set match = (src and dst have same alignment)
 	 */
 #define match rem
-EXC(	LDFIRST	t3, FIRST(0)(src), l_exc)
+EXC(	LDFIRST	t3, FIRST(0)(src), .Ll_exc)
 	ADD	t2, zero, NBYTES
-EXC(	LDREST	t3, REST(0)(src), l_exc_copy)
+EXC(	LDREST	t3, REST(0)(src), .Ll_exc_copy)
 	SUB	t2, t2, t1	# t2 = number of bytes copied
 	xor	match, t0, t1
-EXC(	STFIRST t3, FIRST(0)(dst), s_exc)
+EXC(	STFIRST t3, FIRST(0)(dst), .Ls_exc)
 	SLL	t4, t1, 3	# t4 = number of bits to discard
 	SHIFT_DISCARD t3, t3, t4
 	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
 	ADDC(sum, t3)
-	beq	len, t2, done
+	beq	len, t2, .Ldone
 	 SUB	len, len, t2
 	ADD	dst, dst, t2
-	beqz	match, both_aligned
+	beqz	match, .Lboth_aligned
 	 ADD	src, src, t2
 
-src_unaligned_dst_aligned:
+.Lsrc_unaligned_dst_aligned:
 	SRL	t0, len, LOG_NBYTES+2		# +2 for 4 units/iter
-	beqz	t0, cleanup_src_unaligned
+	beqz	t0, .Lcleanup_src_unaligned
 	 and	rem, len, (4*NBYTES-1)		# rem = len % 4*NBYTES
 1:
 /*
@@ -551,49 +572,53 @@ src_unaligned_dst_aligned:
  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
  * are to the same unit (unless src is aligned, but it's not).
  */
-EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
-EXC(	LDFIRST	t1, FIRST(1)(src), l_exc_copy)
+EXC(	LDFIRST	t0, FIRST(0)(src), .Ll_exc)
+EXC(	LDFIRST	t1, FIRST(1)(src), .Ll_exc_copy)
 	SUB	len, len, 4*NBYTES
-EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
-EXC(	LDREST	t1, REST(1)(src), l_exc_copy)
-EXC(	LDFIRST	t2, FIRST(2)(src), l_exc_copy)
-EXC(	LDFIRST	t3, FIRST(3)(src), l_exc_copy)
-EXC(	LDREST	t2, REST(2)(src), l_exc_copy)
-EXC(	LDREST	t3, REST(3)(src), l_exc_copy)
+EXC(	LDREST	t0, REST(0)(src), .Ll_exc_copy)
+EXC(	LDREST	t1, REST(1)(src), .Ll_exc_copy)
+EXC(	LDFIRST	t2, FIRST(2)(src), .Ll_exc_copy)
+EXC(	LDFIRST	t3, FIRST(3)(src), .Ll_exc_copy)
+EXC(	LDREST	t2, REST(2)(src), .Ll_exc_copy)
+EXC(	LDREST	t3, REST(3)(src), .Ll_exc_copy)
 	ADD	src, src, 4*NBYTES
 #ifdef CONFIG_CPU_SB1
 	nop				# improves slotting
 #endif
-EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+EXC(	STORE	t0, UNIT(0)(dst), .Ls_exc)
 	ADDC(sum, t0)
-EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+EXC(	STORE	t1, UNIT(1)(dst), .Ls_exc)
 	ADDC(sum, t1)
-EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+EXC(	STORE	t2, UNIT(2)(dst), .Ls_exc)
 	ADDC(sum, t2)
-EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+EXC(	STORE	t3, UNIT(3)(dst), .Ls_exc)
 	ADDC(sum, t3)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-	 ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 
-cleanup_src_unaligned:
-	beqz	len, done
+.Lcleanup_src_unaligned:
+	beqz	len, .Ldone
 	 and	rem, len, NBYTES-1	# rem = len % NBYTES
-	beq	rem, len, copy_bytes
+	beq	rem, len, .Lcopy_bytes
 	 nop
 1:
-EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
-EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
+EXC(	LDFIRST	t0, FIRST(0)(src), .Ll_exc)
+EXC(	LDREST	t0, REST(0)(src), .Ll_exc_copy)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
-EXC(	STORE	t0, 0(dst), s_exc)
+EXC(	STORE	t0, 0(dst), .Ls_exc)
 	ADDC(sum, t0)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	len, rem, 1b
-	 ADD	dst, dst, NBYTES
+	.set	noreorder
 
-copy_bytes_checklen:
-	beqz	len, done
+.Lcopy_bytes_checklen:
+	beqz	len, .Ldone
 	 nop
-copy_bytes:
+.Lcopy_bytes:
 	/* 0 < len < NBYTES */
 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 #define SHIFT_START 0
@@ -604,14 +629,14 @@ copy_bytes:
 #endif
 	move	t2, zero	# partial word
 	li	t3, SHIFT_START	# shift
-/* use l_exc_copy here to return correct sum on fault */
+/* use .Ll_exc_copy here to return correct sum on fault */
 #define COPY_BYTE(N)			\
-EXC(	lbu	t0, N(src), l_exc_copy);	\
+EXC(	lbu	t0, N(src), .Ll_exc_copy);	\
 	SUB	len, len, 1;		\
-EXC(	sb	t0, N(dst), s_exc);	\
+EXC(	sb	t0, N(dst), .Ls_exc);	\
 	SLLV	t0, t0, t3;		\
 	addu	t3, SHIFT_INC;		\
-	beqz	len, copy_bytes_done;	\
+	beqz	len, .Lcopy_bytes_done;	\
 	 or	t2, t0
 
 	COPY_BYTE(0)
@@ -622,15 +647,17 @@ EXC(	sb	t0, N(dst), s_exc);	\
 	COPY_BYTE(4)
 	COPY_BYTE(5)
 #endif
-EXC(	lbu	t0, NBYTES-2(src), l_exc_copy)
+EXC(	lbu	t0, NBYTES-2(src), .Ll_exc_copy)
 	SUB	len, len, 1
-EXC(	sb	t0, NBYTES-2(dst), s_exc)
+EXC(	sb	t0, NBYTES-2(dst), .Ls_exc)
 	SLLV	t0, t0, t3
 	or	t2, t0
-copy_bytes_done:
+.Lcopy_bytes_done:
 	ADDC(sum, t2)
-done:
+.Ldone:
 	/* fold checksum */
+	.set	push
+	.set	noat
 #ifdef USE_DOUBLE
 	dsll32	v1, sum, 0
 	daddu	sum, v1
@@ -651,13 +678,14 @@ done:
 	srl	sum, sum, 8
 	or	sum, v1
 	andi	sum, 0xffff
+	.set	pop
 1:
 	.set	reorder
 	ADDC(sum, psum)
 	jr	ra
 	.set	noreorder
 
-l_exc_copy:
+.Ll_exc_copy:
 	/*
 	 * Copy bytes from src until faulting load address (or until a
 	 * lb faults)
@@ -672,15 +700,17 @@ l_exc_copy:
 	li	t2, SHIFT_START
 	LOAD	t0, THREAD_BUADDR(t0)
 1:
-EXC(	lbu	t1, 0(src), l_exc)
+EXC(	lbu	t1, 0(src), .Ll_exc)
 	ADD	src, src, 1
 	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
 	SLLV	t1, t1, t2
 	addu	t2, SHIFT_INC
 	ADDC(sum, t1)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 1
 	bne	src, t0, 1b
-	 ADD	dst, dst, 1
-l_exc:
+	.set	noreorder
+.Ll_exc:
 	LOAD	t0, TI_TASK($28)
 	nop
 	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
@@ -697,19 +727,30 @@ l_exc:
 	 * Clear len bytes starting at dst.  Can't call __bzero because it
 	 * might modify len.  An inefficient loop for these rare times...
 	 */
-	beqz	len, done
+	.set	reorder				/* DADDI_WAR */
 	 SUB	src, len, 1
+	beqz	len, .Ldone
+	.set	noreorder
 1:	sb	zero, 0(dst)
 	ADD	dst, dst, 1
+	.set	push
+	.set	noat
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	bnez	src, 1b
 	 SUB	src, src, 1
+#else
+	li	v1, 1
+	bnez	src, 1b
+	 SUB	src, src, v1
+#endif
 	li	v1, -EFAULT
-	b	done
+	b	.Ldone
 	 sw	v1, (errptr)
 
-s_exc:
+.Ls_exc:
 	li	v0, -1	/* invalid checksum */
 	li	v1, -EFAULT
 	jr	ra
 	 sw	v1, (errptr)
+	.set	pop
 	END(__csum_partial_copy_user)
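
Taken together, the fault paths leave the calling convention unchanged: .Ll_exc zero-fills the remainder of the destination and stores -EFAULT through errptr, while .Ls_exc additionally returns ~0 as the checksum. A hedged sketch of a caller going through the csum_partial_copy_from_user wrapper of that era (the helper name below is illustrative; signature as in 2.6-era <asm/checksum.h>):

	#include <asm/checksum.h>
	#include <asm/uaccess.h>

	/* Copy len bytes from user space and checksum them in one pass. */
	static __wsum copy_and_csum(const void __user *src, void *dst, int len)
	{
		int err = 0;
		__wsum sum;

		sum = csum_partial_copy_from_user(src, dst, len, 0, &err);
		if (err)	/* err holds -EFAULT; tail of dst was zeroed */
			return 0;
		return sum;
	}
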