Diffstat (limited to 'arch/mips/lib')

 -rw-r--r--  arch/mips/lib/csum_partial.S     | 275
 -rw-r--r--  arch/mips/lib/memcpy-inatomic.S  | 141
 -rw-r--r--  arch/mips/lib/memcpy.S           | 250
 -rw-r--r--  arch/mips/lib/memset.S           |  44
 -rw-r--r--  arch/mips/lib/strlen_user.S      |   6
 -rw-r--r--  arch/mips/lib/strncpy_user.S     |  15
 -rw-r--r--  arch/mips/lib/strnlen_user.S     |   7
 -rw-r--r--  arch/mips/lib/uncached.c         |  12

 8 files changed, 433 insertions(+), 317 deletions(-)
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index c0a77fe038be..8d7784122c14 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -7,6 +7,7 @@
7 * 7 *
8 * Copyright (C) 1998, 1999 Ralf Baechle 8 * Copyright (C) 1998, 1999 Ralf Baechle
9 * Copyright (C) 1999 Silicon Graphics, Inc. 9 * Copyright (C) 1999 Silicon Graphics, Inc.
10 * Copyright (C) 2007 Maciej W. Rozycki
10 */ 11 */
11#include <linux/errno.h> 12#include <linux/errno.h>
12#include <asm/asm.h> 13#include <asm/asm.h>
@@ -52,9 +53,12 @@
52#define UNIT(unit) ((unit)*NBYTES) 53#define UNIT(unit) ((unit)*NBYTES)
53 54
54#define ADDC(sum,reg) \ 55#define ADDC(sum,reg) \
56 .set push; \
57 .set noat; \
55 ADD sum, reg; \ 58 ADD sum, reg; \
56 sltu v1, sum, reg; \ 59 sltu v1, sum, reg; \
57 ADD sum, v1 60 ADD sum, v1; \
61 .set pop
58 62
59#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3) \ 63#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3) \
60 LOAD _t0, (offset + UNIT(0))(src); \ 64 LOAD _t0, (offset + UNIT(0))(src); \
@@ -92,13 +96,13 @@ LEAF(csum_partial)
92 move t7, zero 96 move t7, zero
93 97
94 sltiu t8, a1, 0x8 98 sltiu t8, a1, 0x8
95 bnez t8, small_csumcpy /* < 8 bytes to copy */ 99 bnez t8, .Lsmall_csumcpy /* < 8 bytes to copy */
96 move t2, a1 100 move t2, a1
97 101
98 andi t7, src, 0x1 /* odd buffer? */ 102 andi t7, src, 0x1 /* odd buffer? */
99 103
100hword_align: 104.Lhword_align:
101 beqz t7, word_align 105 beqz t7, .Lword_align
102 andi t8, src, 0x2 106 andi t8, src, 0x2
103 107
104 lbu t0, (src) 108 lbu t0, (src)
@@ -110,8 +114,8 @@ hword_align:
110 PTR_ADDU src, src, 0x1 114 PTR_ADDU src, src, 0x1
111 andi t8, src, 0x2 115 andi t8, src, 0x2
112 116
113word_align: 117.Lword_align:
114 beqz t8, dword_align 118 beqz t8, .Ldword_align
115 sltiu t8, a1, 56 119 sltiu t8, a1, 56
116 120
117 lhu t0, (src) 121 lhu t0, (src)
@@ -120,12 +124,12 @@ word_align:
120 sltiu t8, a1, 56 124 sltiu t8, a1, 56
121 PTR_ADDU src, src, 0x2 125 PTR_ADDU src, src, 0x2
122 126
123dword_align: 127.Ldword_align:
124 bnez t8, do_end_words 128 bnez t8, .Ldo_end_words
125 move t8, a1 129 move t8, a1
126 130
127 andi t8, src, 0x4 131 andi t8, src, 0x4
128 beqz t8, qword_align 132 beqz t8, .Lqword_align
129 andi t8, src, 0x8 133 andi t8, src, 0x8
130 134
131 lw t0, 0x00(src) 135 lw t0, 0x00(src)
@@ -134,8 +138,8 @@ dword_align:
134 PTR_ADDU src, src, 0x4 138 PTR_ADDU src, src, 0x4
135 andi t8, src, 0x8 139 andi t8, src, 0x8
136 140
137qword_align: 141.Lqword_align:
138 beqz t8, oword_align 142 beqz t8, .Loword_align
139 andi t8, src, 0x10 143 andi t8, src, 0x10
140 144
141#ifdef USE_DOUBLE 145#ifdef USE_DOUBLE
@@ -152,8 +156,8 @@ qword_align:
152 PTR_ADDU src, src, 0x8 156 PTR_ADDU src, src, 0x8
153 andi t8, src, 0x10 157 andi t8, src, 0x10
154 158
155oword_align: 159.Loword_align:
156 beqz t8, begin_movement 160 beqz t8, .Lbegin_movement
157 LONG_SRL t8, a1, 0x7 161 LONG_SRL t8, a1, 0x7
158 162
159#ifdef USE_DOUBLE 163#ifdef USE_DOUBLE
@@ -168,51 +172,55 @@ oword_align:
168 PTR_ADDU src, src, 0x10 172 PTR_ADDU src, src, 0x10
169 LONG_SRL t8, a1, 0x7 173 LONG_SRL t8, a1, 0x7
170 174
171begin_movement: 175.Lbegin_movement:
172 beqz t8, 1f 176 beqz t8, 1f
173 andi t2, a1, 0x40 177 andi t2, a1, 0x40
174 178
175move_128bytes: 179.Lmove_128bytes:
176 CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) 180 CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
177 CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4) 181 CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
178 CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4) 182 CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
179 CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4) 183 CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
180 LONG_SUBU t8, t8, 0x01 184 LONG_SUBU t8, t8, 0x01
181 bnez t8, move_128bytes 185 .set reorder /* DADDI_WAR */
182 PTR_ADDU src, src, 0x80 186 PTR_ADDU src, src, 0x80
187 bnez t8, .Lmove_128bytes
188 .set noreorder
183 189
1841: 1901:
185 beqz t2, 1f 191 beqz t2, 1f
186 andi t2, a1, 0x20 192 andi t2, a1, 0x20
187 193
188move_64bytes: 194.Lmove_64bytes:
189 CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) 195 CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
190 CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4) 196 CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
191 PTR_ADDU src, src, 0x40 197 PTR_ADDU src, src, 0x40
192 198
1931: 1991:
194 beqz t2, do_end_words 200 beqz t2, .Ldo_end_words
195 andi t8, a1, 0x1c 201 andi t8, a1, 0x1c
196 202
197move_32bytes: 203.Lmove_32bytes:
198 CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) 204 CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
199 andi t8, a1, 0x1c 205 andi t8, a1, 0x1c
200 PTR_ADDU src, src, 0x20 206 PTR_ADDU src, src, 0x20
201 207
202do_end_words: 208.Ldo_end_words:
203 beqz t8, small_csumcpy 209 beqz t8, .Lsmall_csumcpy
204 andi t2, a1, 0x3 210 andi t2, a1, 0x3
205 LONG_SRL t8, t8, 0x2 211 LONG_SRL t8, t8, 0x2
206 212
207end_words: 213.Lend_words:
208 lw t0, (src) 214 lw t0, (src)
209 LONG_SUBU t8, t8, 0x1 215 LONG_SUBU t8, t8, 0x1
210 ADDC(sum, t0) 216 ADDC(sum, t0)
211 bnez t8, end_words 217 .set reorder /* DADDI_WAR */
212 PTR_ADDU src, src, 0x4 218 PTR_ADDU src, src, 0x4
219 bnez t8, .Lend_words
220 .set noreorder
213 221
214/* unknown src alignment and < 8 bytes to go */ 222/* unknown src alignment and < 8 bytes to go */
215small_csumcpy: 223.Lsmall_csumcpy:
216 move a1, t2 224 move a1, t2
217 225
218 andi t0, a1, 4 226 andi t0, a1, 4
@@ -246,6 +254,8 @@ small_csumcpy:
2461: ADDC(sum, t1) 2541: ADDC(sum, t1)
247 255
248 /* fold checksum */ 256 /* fold checksum */
257 .set push
258 .set noat
249#ifdef USE_DOUBLE 259#ifdef USE_DOUBLE
250 dsll32 v1, sum, 0 260 dsll32 v1, sum, 0
251 daddu sum, v1 261 daddu sum, v1
@@ -266,6 +276,7 @@ small_csumcpy:
266 srl sum, sum, 8 276 srl sum, sum, 8
267 or sum, v1 277 or sum, v1
268 andi sum, 0xffff 278 andi sum, 0xffff
279 .set pop
2691: 2801:
270 .set reorder 281 .set reorder
271 /* Add the passed partial csum. */ 282 /* Add the passed partial csum. */
@@ -373,7 +384,11 @@ small_csumcpy:
373 384
374#define ADDRMASK (NBYTES-1) 385#define ADDRMASK (NBYTES-1)
375 386
387#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
376 .set noat 388 .set noat
389#else
390 .set at=v1
391#endif
377 392
378LEAF(__csum_partial_copy_user) 393LEAF(__csum_partial_copy_user)
379 PTR_ADDU AT, src, len /* See (1) above. */ 394 PTR_ADDU AT, src, len /* See (1) above. */
@@ -398,95 +413,101 @@ FEXPORT(csum_partial_copy_nocheck)
398 */ 413 */
399 sltu t2, len, NBYTES 414 sltu t2, len, NBYTES
400 and t1, dst, ADDRMASK 415 and t1, dst, ADDRMASK
401 bnez t2, copy_bytes_checklen 416 bnez t2, .Lcopy_bytes_checklen
402 and t0, src, ADDRMASK 417 and t0, src, ADDRMASK
403 andi odd, dst, 0x1 /* odd buffer? */ 418 andi odd, dst, 0x1 /* odd buffer? */
404 bnez t1, dst_unaligned 419 bnez t1, .Ldst_unaligned
405 nop 420 nop
406 bnez t0, src_unaligned_dst_aligned 421 bnez t0, .Lsrc_unaligned_dst_aligned
407 /* 422 /*
408 * use delay slot for fall-through 423 * use delay slot for fall-through
409 * src and dst are aligned; need to compute rem 424 * src and dst are aligned; need to compute rem
410 */ 425 */
411both_aligned: 426.Lboth_aligned:
412 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter 427 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
413 beqz t0, cleanup_both_aligned # len < 8*NBYTES 428 beqz t0, .Lcleanup_both_aligned # len < 8*NBYTES
414 nop 429 nop
415 SUB len, 8*NBYTES # subtract here for bgez loop 430 SUB len, 8*NBYTES # subtract here for bgez loop
416 .align 4 431 .align 4
4171: 4321:
418EXC( LOAD t0, UNIT(0)(src), l_exc) 433EXC( LOAD t0, UNIT(0)(src), .Ll_exc)
419EXC( LOAD t1, UNIT(1)(src), l_exc_copy) 434EXC( LOAD t1, UNIT(1)(src), .Ll_exc_copy)
420EXC( LOAD t2, UNIT(2)(src), l_exc_copy) 435EXC( LOAD t2, UNIT(2)(src), .Ll_exc_copy)
421EXC( LOAD t3, UNIT(3)(src), l_exc_copy) 436EXC( LOAD t3, UNIT(3)(src), .Ll_exc_copy)
422EXC( LOAD t4, UNIT(4)(src), l_exc_copy) 437EXC( LOAD t4, UNIT(4)(src), .Ll_exc_copy)
423EXC( LOAD t5, UNIT(5)(src), l_exc_copy) 438EXC( LOAD t5, UNIT(5)(src), .Ll_exc_copy)
424EXC( LOAD t6, UNIT(6)(src), l_exc_copy) 439EXC( LOAD t6, UNIT(6)(src), .Ll_exc_copy)
425EXC( LOAD t7, UNIT(7)(src), l_exc_copy) 440EXC( LOAD t7, UNIT(7)(src), .Ll_exc_copy)
426 SUB len, len, 8*NBYTES 441 SUB len, len, 8*NBYTES
427 ADD src, src, 8*NBYTES 442 ADD src, src, 8*NBYTES
428EXC( STORE t0, UNIT(0)(dst), s_exc) 443EXC( STORE t0, UNIT(0)(dst), .Ls_exc)
429 ADDC(sum, t0) 444 ADDC(sum, t0)
430EXC( STORE t1, UNIT(1)(dst), s_exc) 445EXC( STORE t1, UNIT(1)(dst), .Ls_exc)
431 ADDC(sum, t1) 446 ADDC(sum, t1)
432EXC( STORE t2, UNIT(2)(dst), s_exc) 447EXC( STORE t2, UNIT(2)(dst), .Ls_exc)
433 ADDC(sum, t2) 448 ADDC(sum, t2)
434EXC( STORE t3, UNIT(3)(dst), s_exc) 449EXC( STORE t3, UNIT(3)(dst), .Ls_exc)
435 ADDC(sum, t3) 450 ADDC(sum, t3)
436EXC( STORE t4, UNIT(4)(dst), s_exc) 451EXC( STORE t4, UNIT(4)(dst), .Ls_exc)
437 ADDC(sum, t4) 452 ADDC(sum, t4)
438EXC( STORE t5, UNIT(5)(dst), s_exc) 453EXC( STORE t5, UNIT(5)(dst), .Ls_exc)
439 ADDC(sum, t5) 454 ADDC(sum, t5)
440EXC( STORE t6, UNIT(6)(dst), s_exc) 455EXC( STORE t6, UNIT(6)(dst), .Ls_exc)
441 ADDC(sum, t6) 456 ADDC(sum, t6)
442EXC( STORE t7, UNIT(7)(dst), s_exc) 457EXC( STORE t7, UNIT(7)(dst), .Ls_exc)
443 ADDC(sum, t7) 458 ADDC(sum, t7)
459 .set reorder /* DADDI_WAR */
460 ADD dst, dst, 8*NBYTES
444 bgez len, 1b 461 bgez len, 1b
445 ADD dst, dst, 8*NBYTES 462 .set noreorder
446 ADD len, 8*NBYTES # revert len (see above) 463 ADD len, 8*NBYTES # revert len (see above)
447 464
448 /* 465 /*
449 * len == the number of bytes left to copy < 8*NBYTES 466 * len == the number of bytes left to copy < 8*NBYTES
450 */ 467 */
451cleanup_both_aligned: 468.Lcleanup_both_aligned:
452#define rem t7 469#define rem t7
453 beqz len, done 470 beqz len, .Ldone
454 sltu t0, len, 4*NBYTES 471 sltu t0, len, 4*NBYTES
455 bnez t0, less_than_4units 472 bnez t0, .Lless_than_4units
456 and rem, len, (NBYTES-1) # rem = len % NBYTES 473 and rem, len, (NBYTES-1) # rem = len % NBYTES
457 /* 474 /*
458 * len >= 4*NBYTES 475 * len >= 4*NBYTES
459 */ 476 */
460EXC( LOAD t0, UNIT(0)(src), l_exc) 477EXC( LOAD t0, UNIT(0)(src), .Ll_exc)
461EXC( LOAD t1, UNIT(1)(src), l_exc_copy) 478EXC( LOAD t1, UNIT(1)(src), .Ll_exc_copy)
462EXC( LOAD t2, UNIT(2)(src), l_exc_copy) 479EXC( LOAD t2, UNIT(2)(src), .Ll_exc_copy)
463EXC( LOAD t3, UNIT(3)(src), l_exc_copy) 480EXC( LOAD t3, UNIT(3)(src), .Ll_exc_copy)
464 SUB len, len, 4*NBYTES 481 SUB len, len, 4*NBYTES
465 ADD src, src, 4*NBYTES 482 ADD src, src, 4*NBYTES
466EXC( STORE t0, UNIT(0)(dst), s_exc) 483EXC( STORE t0, UNIT(0)(dst), .Ls_exc)
467 ADDC(sum, t0) 484 ADDC(sum, t0)
468EXC( STORE t1, UNIT(1)(dst), s_exc) 485EXC( STORE t1, UNIT(1)(dst), .Ls_exc)
469 ADDC(sum, t1) 486 ADDC(sum, t1)
470EXC( STORE t2, UNIT(2)(dst), s_exc) 487EXC( STORE t2, UNIT(2)(dst), .Ls_exc)
471 ADDC(sum, t2) 488 ADDC(sum, t2)
472EXC( STORE t3, UNIT(3)(dst), s_exc) 489EXC( STORE t3, UNIT(3)(dst), .Ls_exc)
473 ADDC(sum, t3) 490 ADDC(sum, t3)
474 beqz len, done 491 .set reorder /* DADDI_WAR */
475 ADD dst, dst, 4*NBYTES 492 ADD dst, dst, 4*NBYTES
476less_than_4units: 493 beqz len, .Ldone
494 .set noreorder
495.Lless_than_4units:
477 /* 496 /*
478 * rem = len % NBYTES 497 * rem = len % NBYTES
479 */ 498 */
480 beq rem, len, copy_bytes 499 beq rem, len, .Lcopy_bytes
481 nop 500 nop
4821: 5011:
483EXC( LOAD t0, 0(src), l_exc) 502EXC( LOAD t0, 0(src), .Ll_exc)
484 ADD src, src, NBYTES 503 ADD src, src, NBYTES
485 SUB len, len, NBYTES 504 SUB len, len, NBYTES
486EXC( STORE t0, 0(dst), s_exc) 505EXC( STORE t0, 0(dst), .Ls_exc)
487 ADDC(sum, t0) 506 ADDC(sum, t0)
507 .set reorder /* DADDI_WAR */
508 ADD dst, dst, NBYTES
488 bne rem, len, 1b 509 bne rem, len, 1b
489 ADD dst, dst, NBYTES 510 .set noreorder
490 511
491 /* 512 /*
492 * src and dst are aligned, need to copy rem bytes (rem < NBYTES) 513 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -500,20 +521,20 @@ EXC( STORE t0, 0(dst), s_exc)
500 * more instruction-level parallelism. 521 * more instruction-level parallelism.
501 */ 522 */
502#define bits t2 523#define bits t2
503 beqz len, done 524 beqz len, .Ldone
504 ADD t1, dst, len # t1 is just past last byte of dst 525 ADD t1, dst, len # t1 is just past last byte of dst
505 li bits, 8*NBYTES 526 li bits, 8*NBYTES
506 SLL rem, len, 3 # rem = number of bits to keep 527 SLL rem, len, 3 # rem = number of bits to keep
507EXC( LOAD t0, 0(src), l_exc) 528EXC( LOAD t0, 0(src), .Ll_exc)
508 SUB bits, bits, rem # bits = number of bits to discard 529 SUB bits, bits, rem # bits = number of bits to discard
509 SHIFT_DISCARD t0, t0, bits 530 SHIFT_DISCARD t0, t0, bits
510EXC( STREST t0, -1(t1), s_exc) 531EXC( STREST t0, -1(t1), .Ls_exc)
511 SHIFT_DISCARD_REVERT t0, t0, bits 532 SHIFT_DISCARD_REVERT t0, t0, bits
512 .set reorder 533 .set reorder
513 ADDC(sum, t0) 534 ADDC(sum, t0)
514 b done 535 b .Ldone
515 .set noreorder 536 .set noreorder
516dst_unaligned: 537.Ldst_unaligned:
517 /* 538 /*
518 * dst is unaligned 539 * dst is unaligned
519 * t0 = src & ADDRMASK 540 * t0 = src & ADDRMASK
@@ -524,25 +545,25 @@ dst_unaligned:
524 * Set match = (src and dst have same alignment) 545 * Set match = (src and dst have same alignment)
525 */ 546 */
526#define match rem 547#define match rem
527EXC( LDFIRST t3, FIRST(0)(src), l_exc) 548EXC( LDFIRST t3, FIRST(0)(src), .Ll_exc)
528 ADD t2, zero, NBYTES 549 ADD t2, zero, NBYTES
529EXC( LDREST t3, REST(0)(src), l_exc_copy) 550EXC( LDREST t3, REST(0)(src), .Ll_exc_copy)
530 SUB t2, t2, t1 # t2 = number of bytes copied 551 SUB t2, t2, t1 # t2 = number of bytes copied
531 xor match, t0, t1 552 xor match, t0, t1
532EXC( STFIRST t3, FIRST(0)(dst), s_exc) 553EXC( STFIRST t3, FIRST(0)(dst), .Ls_exc)
533 SLL t4, t1, 3 # t4 = number of bits to discard 554 SLL t4, t1, 3 # t4 = number of bits to discard
534 SHIFT_DISCARD t3, t3, t4 555 SHIFT_DISCARD t3, t3, t4
535 /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */ 556 /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
536 ADDC(sum, t3) 557 ADDC(sum, t3)
537 beq len, t2, done 558 beq len, t2, .Ldone
538 SUB len, len, t2 559 SUB len, len, t2
539 ADD dst, dst, t2 560 ADD dst, dst, t2
540 beqz match, both_aligned 561 beqz match, .Lboth_aligned
541 ADD src, src, t2 562 ADD src, src, t2
542 563
543src_unaligned_dst_aligned: 564.Lsrc_unaligned_dst_aligned:
544 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter 565 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
545 beqz t0, cleanup_src_unaligned 566 beqz t0, .Lcleanup_src_unaligned
546 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES 567 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
5471: 5681:
548/* 569/*
@@ -551,49 +572,53 @@ src_unaligned_dst_aligned:
551 * It's OK to load FIRST(N+1) before REST(N) because the two addresses 572 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
552 * are to the same unit (unless src is aligned, but it's not). 573 * are to the same unit (unless src is aligned, but it's not).
553 */ 574 */
554EXC( LDFIRST t0, FIRST(0)(src), l_exc) 575EXC( LDFIRST t0, FIRST(0)(src), .Ll_exc)
555EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy) 576EXC( LDFIRST t1, FIRST(1)(src), .Ll_exc_copy)
556 SUB len, len, 4*NBYTES 577 SUB len, len, 4*NBYTES
557EXC( LDREST t0, REST(0)(src), l_exc_copy) 578EXC( LDREST t0, REST(0)(src), .Ll_exc_copy)
558EXC( LDREST t1, REST(1)(src), l_exc_copy) 579EXC( LDREST t1, REST(1)(src), .Ll_exc_copy)
559EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy) 580EXC( LDFIRST t2, FIRST(2)(src), .Ll_exc_copy)
560EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy) 581EXC( LDFIRST t3, FIRST(3)(src), .Ll_exc_copy)
561EXC( LDREST t2, REST(2)(src), l_exc_copy) 582EXC( LDREST t2, REST(2)(src), .Ll_exc_copy)
562EXC( LDREST t3, REST(3)(src), l_exc_copy) 583EXC( LDREST t3, REST(3)(src), .Ll_exc_copy)
563 ADD src, src, 4*NBYTES 584 ADD src, src, 4*NBYTES
564#ifdef CONFIG_CPU_SB1 585#ifdef CONFIG_CPU_SB1
565 nop # improves slotting 586 nop # improves slotting
566#endif 587#endif
567EXC( STORE t0, UNIT(0)(dst), s_exc) 588EXC( STORE t0, UNIT(0)(dst), .Ls_exc)
568 ADDC(sum, t0) 589 ADDC(sum, t0)
569EXC( STORE t1, UNIT(1)(dst), s_exc) 590EXC( STORE t1, UNIT(1)(dst), .Ls_exc)
570 ADDC(sum, t1) 591 ADDC(sum, t1)
571EXC( STORE t2, UNIT(2)(dst), s_exc) 592EXC( STORE t2, UNIT(2)(dst), .Ls_exc)
572 ADDC(sum, t2) 593 ADDC(sum, t2)
573EXC( STORE t3, UNIT(3)(dst), s_exc) 594EXC( STORE t3, UNIT(3)(dst), .Ls_exc)
574 ADDC(sum, t3) 595 ADDC(sum, t3)
596 .set reorder /* DADDI_WAR */
597 ADD dst, dst, 4*NBYTES
575 bne len, rem, 1b 598 bne len, rem, 1b
576 ADD dst, dst, 4*NBYTES 599 .set noreorder
577 600
578cleanup_src_unaligned: 601.Lcleanup_src_unaligned:
579 beqz len, done 602 beqz len, .Ldone
580 and rem, len, NBYTES-1 # rem = len % NBYTES 603 and rem, len, NBYTES-1 # rem = len % NBYTES
581 beq rem, len, copy_bytes 604 beq rem, len, .Lcopy_bytes
582 nop 605 nop
5831: 6061:
584EXC( LDFIRST t0, FIRST(0)(src), l_exc) 607EXC( LDFIRST t0, FIRST(0)(src), .Ll_exc)
585EXC( LDREST t0, REST(0)(src), l_exc_copy) 608EXC( LDREST t0, REST(0)(src), .Ll_exc_copy)
586 ADD src, src, NBYTES 609 ADD src, src, NBYTES
587 SUB len, len, NBYTES 610 SUB len, len, NBYTES
588EXC( STORE t0, 0(dst), s_exc) 611EXC( STORE t0, 0(dst), .Ls_exc)
589 ADDC(sum, t0) 612 ADDC(sum, t0)
613 .set reorder /* DADDI_WAR */
614 ADD dst, dst, NBYTES
590 bne len, rem, 1b 615 bne len, rem, 1b
591 ADD dst, dst, NBYTES 616 .set noreorder
592 617
593copy_bytes_checklen: 618.Lcopy_bytes_checklen:
594 beqz len, done 619 beqz len, .Ldone
595 nop 620 nop
596copy_bytes: 621.Lcopy_bytes:
597 /* 0 < len < NBYTES */ 622 /* 0 < len < NBYTES */
598#ifdef CONFIG_CPU_LITTLE_ENDIAN 623#ifdef CONFIG_CPU_LITTLE_ENDIAN
599#define SHIFT_START 0 624#define SHIFT_START 0
@@ -604,14 +629,14 @@ copy_bytes:
604#endif 629#endif
605 move t2, zero # partial word 630 move t2, zero # partial word
606 li t3, SHIFT_START # shift 631 li t3, SHIFT_START # shift
607/* use l_exc_copy here to return correct sum on fault */ 632/* use .Ll_exc_copy here to return correct sum on fault */
608#define COPY_BYTE(N) \ 633#define COPY_BYTE(N) \
609EXC( lbu t0, N(src), l_exc_copy); \ 634EXC( lbu t0, N(src), .Ll_exc_copy); \
610 SUB len, len, 1; \ 635 SUB len, len, 1; \
611EXC( sb t0, N(dst), s_exc); \ 636EXC( sb t0, N(dst), .Ls_exc); \
612 SLLV t0, t0, t3; \ 637 SLLV t0, t0, t3; \
613 addu t3, SHIFT_INC; \ 638 addu t3, SHIFT_INC; \
614 beqz len, copy_bytes_done; \ 639 beqz len, .Lcopy_bytes_done; \
615 or t2, t0 640 or t2, t0
616 641
617 COPY_BYTE(0) 642 COPY_BYTE(0)
@@ -622,15 +647,17 @@ EXC( sb t0, N(dst), s_exc); \
622 COPY_BYTE(4) 647 COPY_BYTE(4)
623 COPY_BYTE(5) 648 COPY_BYTE(5)
624#endif 649#endif
625EXC( lbu t0, NBYTES-2(src), l_exc_copy) 650EXC( lbu t0, NBYTES-2(src), .Ll_exc_copy)
626 SUB len, len, 1 651 SUB len, len, 1
627EXC( sb t0, NBYTES-2(dst), s_exc) 652EXC( sb t0, NBYTES-2(dst), .Ls_exc)
628 SLLV t0, t0, t3 653 SLLV t0, t0, t3
629 or t2, t0 654 or t2, t0
630copy_bytes_done: 655.Lcopy_bytes_done:
631 ADDC(sum, t2) 656 ADDC(sum, t2)
632done: 657.Ldone:
633 /* fold checksum */ 658 /* fold checksum */
659 .set push
660 .set noat
634#ifdef USE_DOUBLE 661#ifdef USE_DOUBLE
635 dsll32 v1, sum, 0 662 dsll32 v1, sum, 0
636 daddu sum, v1 663 daddu sum, v1
@@ -651,13 +678,14 @@ done:
651 srl sum, sum, 8 678 srl sum, sum, 8
652 or sum, v1 679 or sum, v1
653 andi sum, 0xffff 680 andi sum, 0xffff
681 .set pop
6541: 6821:
655 .set reorder 683 .set reorder
656 ADDC(sum, psum) 684 ADDC(sum, psum)
657 jr ra 685 jr ra
658 .set noreorder 686 .set noreorder
659 687
660l_exc_copy: 688.Ll_exc_copy:
661 /* 689 /*
662 * Copy bytes from src until faulting load address (or until a 690 * Copy bytes from src until faulting load address (or until a
663 * lb faults) 691 * lb faults)
@@ -672,15 +700,17 @@ l_exc_copy:
672 li t2, SHIFT_START 700 li t2, SHIFT_START
673 LOAD t0, THREAD_BUADDR(t0) 701 LOAD t0, THREAD_BUADDR(t0)
6741: 7021:
675EXC( lbu t1, 0(src), l_exc) 703EXC( lbu t1, 0(src), .Ll_exc)
676 ADD src, src, 1 704 ADD src, src, 1
677 sb t1, 0(dst) # can't fault -- we're copy_from_user 705 sb t1, 0(dst) # can't fault -- we're copy_from_user
678 SLLV t1, t1, t2 706 SLLV t1, t1, t2
679 addu t2, SHIFT_INC 707 addu t2, SHIFT_INC
680 ADDC(sum, t1) 708 ADDC(sum, t1)
709 .set reorder /* DADDI_WAR */
710 ADD dst, dst, 1
681 bne src, t0, 1b 711 bne src, t0, 1b
682 ADD dst, dst, 1 712 .set noreorder
683l_exc: 713.Ll_exc:
684 LOAD t0, TI_TASK($28) 714 LOAD t0, TI_TASK($28)
685 nop 715 nop
686 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address 716 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
@@ -697,19 +727,30 @@ l_exc:
697 * Clear len bytes starting at dst. Can't call __bzero because it 727 * Clear len bytes starting at dst. Can't call __bzero because it
698 * might modify len. An inefficient loop for these rare times... 728 * might modify len. An inefficient loop for these rare times...
699 */ 729 */
700 beqz len, done 730 .set reorder /* DADDI_WAR */
701 SUB src, len, 1 731 SUB src, len, 1
732 beqz len, .Ldone
733 .set noreorder
7021: sb zero, 0(dst) 7341: sb zero, 0(dst)
703 ADD dst, dst, 1 735 ADD dst, dst, 1
736 .set push
737 .set noat
738#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
704 bnez src, 1b 739 bnez src, 1b
705 SUB src, src, 1 740 SUB src, src, 1
741#else
742 li v1, 1
743 bnez src, 1b
744 SUB src, src, v1
745#endif
706 li v1, -EFAULT 746 li v1, -EFAULT
707 b done 747 b .Ldone
708 sw v1, (errptr) 748 sw v1, (errptr)
709 749
710s_exc: 750.Ls_exc:
711 li v0, -1 /* invalid checksum */ 751 li v0, -1 /* invalid checksum */
712 li v1, -EFAULT 752 li v1, -EFAULT
713 jr ra 753 jr ra
714 sw v1, (errptr) 754 sw v1, (errptr)
755 .set pop
715 END(__csum_partial_copy_user) 756 END(__csum_partial_copy_user)
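
The ADDC() macro that this patch wraps in .set push / .set noat accumulates the checksum with an end-around carry (ADD, then sltu to detect the carry, then ADD it back in), and the "fold checksum" blocks collapse the running sum to 16 bits. A minimal C sketch of the same arithmetic, with illustrative helper names rather than kernel API:

	#include <stdint.h>

	/* Accumulate one 32-bit word with end-around carry, as ADDC() does
	 * with its ADD / sltu / ADD sequence. */
	static uint32_t addc(uint32_t sum, uint32_t word)
	{
		sum += word;
		if (sum < word)		/* carry out of bit 31 */
			sum += 1;	/* feed it back in (ones' complement add) */
		return sum;
	}

	/* Collapse the running sum to 16 bits, as the "fold checksum"
	 * blocks do with their shift/add sequences. */
	static uint16_t csum_fold16(uint32_t sum)
	{
		sum = (sum >> 16) + (sum & 0xffff);
		sum += sum >> 16;	/* absorb the carry the first fold may produce */
		return (uint16_t)sum;
	}
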
diff --git a/arch/mips/lib/memcpy-inatomic.S b/arch/mips/lib/memcpy-inatomic.S
index 3a534b2baa0f..736d0fb56a94 100644
--- a/arch/mips/lib/memcpy-inatomic.S
+++ b/arch/mips/lib/memcpy-inatomic.S
@@ -9,6 +9,7 @@
9 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc. 9 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
10 * Copyright (C) 2002 Broadcom, Inc. 10 * Copyright (C) 2002 Broadcom, Inc.
11 * memcpy/copy_user author: Mark Vandevoorde 11 * memcpy/copy_user author: Mark Vandevoorde
12 * Copyright (C) 2007 Maciej W. Rozycki
12 * 13 *
13 * Mnemonic names for arguments to memcpy/__copy_user 14 * Mnemonic names for arguments to memcpy/__copy_user
14 */ 15 */
@@ -175,7 +176,11 @@
175 176
176 .text 177 .text
177 .set noreorder 178 .set noreorder
179#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
178 .set noat 180 .set noat
181#else
182 .set at=v1
183#endif
179 184
180/* 185/*
181 * A combined memcpy/__copy_user 186 * A combined memcpy/__copy_user
@@ -204,36 +209,36 @@ LEAF(__copy_user_inatomic)
204 and t1, dst, ADDRMASK 209 and t1, dst, ADDRMASK
205 PREF( 0, 1*32(src) ) 210 PREF( 0, 1*32(src) )
206 PREF( 1, 1*32(dst) ) 211 PREF( 1, 1*32(dst) )
207 bnez t2, copy_bytes_checklen 212 bnez t2, .Lcopy_bytes_checklen
208 and t0, src, ADDRMASK 213 and t0, src, ADDRMASK
209 PREF( 0, 2*32(src) ) 214 PREF( 0, 2*32(src) )
210 PREF( 1, 2*32(dst) ) 215 PREF( 1, 2*32(dst) )
211 bnez t1, dst_unaligned 216 bnez t1, .Ldst_unaligned
212 nop 217 nop
213 bnez t0, src_unaligned_dst_aligned 218 bnez t0, .Lsrc_unaligned_dst_aligned
214 /* 219 /*
215 * use delay slot for fall-through 220 * use delay slot for fall-through
216 * src and dst are aligned; need to compute rem 221 * src and dst are aligned; need to compute rem
217 */ 222 */
218both_aligned: 223.Lboth_aligned:
219 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter 224 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
220 beqz t0, cleanup_both_aligned # len < 8*NBYTES 225 beqz t0, .Lcleanup_both_aligned # len < 8*NBYTES
221 and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES) 226 and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
222 PREF( 0, 3*32(src) ) 227 PREF( 0, 3*32(src) )
223 PREF( 1, 3*32(dst) ) 228 PREF( 1, 3*32(dst) )
224 .align 4 229 .align 4
2251: 2301:
226EXC( LOAD t0, UNIT(0)(src), l_exc) 231EXC( LOAD t0, UNIT(0)(src), .Ll_exc)
227EXC( LOAD t1, UNIT(1)(src), l_exc_copy) 232EXC( LOAD t1, UNIT(1)(src), .Ll_exc_copy)
228EXC( LOAD t2, UNIT(2)(src), l_exc_copy) 233EXC( LOAD t2, UNIT(2)(src), .Ll_exc_copy)
229EXC( LOAD t3, UNIT(3)(src), l_exc_copy) 234EXC( LOAD t3, UNIT(3)(src), .Ll_exc_copy)
230 SUB len, len, 8*NBYTES 235 SUB len, len, 8*NBYTES
231EXC( LOAD t4, UNIT(4)(src), l_exc_copy) 236EXC( LOAD t4, UNIT(4)(src), .Ll_exc_copy)
232EXC( LOAD t7, UNIT(5)(src), l_exc_copy) 237EXC( LOAD t7, UNIT(5)(src), .Ll_exc_copy)
233 STORE t0, UNIT(0)(dst) 238 STORE t0, UNIT(0)(dst)
234 STORE t1, UNIT(1)(dst) 239 STORE t1, UNIT(1)(dst)
235EXC( LOAD t0, UNIT(6)(src), l_exc_copy) 240EXC( LOAD t0, UNIT(6)(src), .Ll_exc_copy)
236EXC( LOAD t1, UNIT(7)(src), l_exc_copy) 241EXC( LOAD t1, UNIT(7)(src), .Ll_exc_copy)
237 ADD src, src, 8*NBYTES 242 ADD src, src, 8*NBYTES
238 ADD dst, dst, 8*NBYTES 243 ADD dst, dst, 8*NBYTES
239 STORE t2, UNIT(-6)(dst) 244 STORE t2, UNIT(-6)(dst)
@@ -250,39 +255,43 @@ EXC( LOAD t1, UNIT(7)(src), l_exc_copy)
250 /* 255 /*
251 * len == rem == the number of bytes left to copy < 8*NBYTES 256 * len == rem == the number of bytes left to copy < 8*NBYTES
252 */ 257 */
253cleanup_both_aligned: 258.Lcleanup_both_aligned:
254 beqz len, done 259 beqz len, .Ldone
255 sltu t0, len, 4*NBYTES 260 sltu t0, len, 4*NBYTES
256 bnez t0, less_than_4units 261 bnez t0, .Lless_than_4units
257 and rem, len, (NBYTES-1) # rem = len % NBYTES 262 and rem, len, (NBYTES-1) # rem = len % NBYTES
258 /* 263 /*
259 * len >= 4*NBYTES 264 * len >= 4*NBYTES
260 */ 265 */
261EXC( LOAD t0, UNIT(0)(src), l_exc) 266EXC( LOAD t0, UNIT(0)(src), .Ll_exc)
262EXC( LOAD t1, UNIT(1)(src), l_exc_copy) 267EXC( LOAD t1, UNIT(1)(src), .Ll_exc_copy)
263EXC( LOAD t2, UNIT(2)(src), l_exc_copy) 268EXC( LOAD t2, UNIT(2)(src), .Ll_exc_copy)
264EXC( LOAD t3, UNIT(3)(src), l_exc_copy) 269EXC( LOAD t3, UNIT(3)(src), .Ll_exc_copy)
265 SUB len, len, 4*NBYTES 270 SUB len, len, 4*NBYTES
266 ADD src, src, 4*NBYTES 271 ADD src, src, 4*NBYTES
267 STORE t0, UNIT(0)(dst) 272 STORE t0, UNIT(0)(dst)
268 STORE t1, UNIT(1)(dst) 273 STORE t1, UNIT(1)(dst)
269 STORE t2, UNIT(2)(dst) 274 STORE t2, UNIT(2)(dst)
270 STORE t3, UNIT(3)(dst) 275 STORE t3, UNIT(3)(dst)
271 beqz len, done 276 .set reorder /* DADDI_WAR */
272 ADD dst, dst, 4*NBYTES 277 ADD dst, dst, 4*NBYTES
273less_than_4units: 278 beqz len, .Ldone
279 .set noreorder
280.Lless_than_4units:
274 /* 281 /*
275 * rem = len % NBYTES 282 * rem = len % NBYTES
276 */ 283 */
277 beq rem, len, copy_bytes 284 beq rem, len, .Lcopy_bytes
278 nop 285 nop
2791: 2861:
280EXC( LOAD t0, 0(src), l_exc) 287EXC( LOAD t0, 0(src), .Ll_exc)
281 ADD src, src, NBYTES 288 ADD src, src, NBYTES
282 SUB len, len, NBYTES 289 SUB len, len, NBYTES
283 STORE t0, 0(dst) 290 STORE t0, 0(dst)
291 .set reorder /* DADDI_WAR */
292 ADD dst, dst, NBYTES
284 bne rem, len, 1b 293 bne rem, len, 1b
285 ADD dst, dst, NBYTES 294 .set noreorder
286 295
287 /* 296 /*
288 * src and dst are aligned, need to copy rem bytes (rem < NBYTES) 297 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -296,17 +305,17 @@ EXC( LOAD t0, 0(src), l_exc)
296 * more instruction-level parallelism. 305 * more instruction-level parallelism.
297 */ 306 */
298#define bits t2 307#define bits t2
299 beqz len, done 308 beqz len, .Ldone
300 ADD t1, dst, len # t1 is just past last byte of dst 309 ADD t1, dst, len # t1 is just past last byte of dst
301 li bits, 8*NBYTES 310 li bits, 8*NBYTES
302 SLL rem, len, 3 # rem = number of bits to keep 311 SLL rem, len, 3 # rem = number of bits to keep
303EXC( LOAD t0, 0(src), l_exc) 312EXC( LOAD t0, 0(src), .Ll_exc)
304 SUB bits, bits, rem # bits = number of bits to discard 313 SUB bits, bits, rem # bits = number of bits to discard
305 SHIFT_DISCARD t0, t0, bits 314 SHIFT_DISCARD t0, t0, bits
306 STREST t0, -1(t1) 315 STREST t0, -1(t1)
307 jr ra 316 jr ra
308 move len, zero 317 move len, zero
309dst_unaligned: 318.Ldst_unaligned:
310 /* 319 /*
311 * dst is unaligned 320 * dst is unaligned
312 * t0 = src & ADDRMASK 321 * t0 = src & ADDRMASK
@@ -317,22 +326,22 @@ dst_unaligned:
317 * Set match = (src and dst have same alignment) 326 * Set match = (src and dst have same alignment)
318 */ 327 */
319#define match rem 328#define match rem
320EXC( LDFIRST t3, FIRST(0)(src), l_exc) 329EXC( LDFIRST t3, FIRST(0)(src), .Ll_exc)
321 ADD t2, zero, NBYTES 330 ADD t2, zero, NBYTES
322EXC( LDREST t3, REST(0)(src), l_exc_copy) 331EXC( LDREST t3, REST(0)(src), .Ll_exc_copy)
323 SUB t2, t2, t1 # t2 = number of bytes copied 332 SUB t2, t2, t1 # t2 = number of bytes copied
324 xor match, t0, t1 333 xor match, t0, t1
325 STFIRST t3, FIRST(0)(dst) 334 STFIRST t3, FIRST(0)(dst)
326 beq len, t2, done 335 beq len, t2, .Ldone
327 SUB len, len, t2 336 SUB len, len, t2
328 ADD dst, dst, t2 337 ADD dst, dst, t2
329 beqz match, both_aligned 338 beqz match, .Lboth_aligned
330 ADD src, src, t2 339 ADD src, src, t2
331 340
332src_unaligned_dst_aligned: 341.Lsrc_unaligned_dst_aligned:
333 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter 342 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
334 PREF( 0, 3*32(src) ) 343 PREF( 0, 3*32(src) )
335 beqz t0, cleanup_src_unaligned 344 beqz t0, .Lcleanup_src_unaligned
336 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES 345 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
337 PREF( 1, 3*32(dst) ) 346 PREF( 1, 3*32(dst) )
3381: 3471:
@@ -342,15 +351,15 @@ src_unaligned_dst_aligned:
342 * It's OK to load FIRST(N+1) before REST(N) because the two addresses 351 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
343 * are to the same unit (unless src is aligned, but it's not). 352 * are to the same unit (unless src is aligned, but it's not).
344 */ 353 */
345EXC( LDFIRST t0, FIRST(0)(src), l_exc) 354EXC( LDFIRST t0, FIRST(0)(src), .Ll_exc)
346EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy) 355EXC( LDFIRST t1, FIRST(1)(src), .Ll_exc_copy)
347 SUB len, len, 4*NBYTES 356 SUB len, len, 4*NBYTES
348EXC( LDREST t0, REST(0)(src), l_exc_copy) 357EXC( LDREST t0, REST(0)(src), .Ll_exc_copy)
349EXC( LDREST t1, REST(1)(src), l_exc_copy) 358EXC( LDREST t1, REST(1)(src), .Ll_exc_copy)
350EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy) 359EXC( LDFIRST t2, FIRST(2)(src), .Ll_exc_copy)
351EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy) 360EXC( LDFIRST t3, FIRST(3)(src), .Ll_exc_copy)
352EXC( LDREST t2, REST(2)(src), l_exc_copy) 361EXC( LDREST t2, REST(2)(src), .Ll_exc_copy)
353EXC( LDREST t3, REST(3)(src), l_exc_copy) 362EXC( LDREST t3, REST(3)(src), .Ll_exc_copy)
354 PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed) 363 PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
355 ADD src, src, 4*NBYTES 364 ADD src, src, 4*NBYTES
356#ifdef CONFIG_CPU_SB1 365#ifdef CONFIG_CPU_SB1
@@ -361,32 +370,36 @@ EXC( LDREST t3, REST(3)(src), l_exc_copy)
361 STORE t2, UNIT(2)(dst) 370 STORE t2, UNIT(2)(dst)
362 STORE t3, UNIT(3)(dst) 371 STORE t3, UNIT(3)(dst)
363 PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed) 372 PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
373 .set reorder /* DADDI_WAR */
374 ADD dst, dst, 4*NBYTES
364 bne len, rem, 1b 375 bne len, rem, 1b
365 ADD dst, dst, 4*NBYTES 376 .set noreorder
366 377
367cleanup_src_unaligned: 378.Lcleanup_src_unaligned:
368 beqz len, done 379 beqz len, .Ldone
369 and rem, len, NBYTES-1 # rem = len % NBYTES 380 and rem, len, NBYTES-1 # rem = len % NBYTES
370 beq rem, len, copy_bytes 381 beq rem, len, .Lcopy_bytes
371 nop 382 nop
3721: 3831:
373EXC( LDFIRST t0, FIRST(0)(src), l_exc) 384EXC( LDFIRST t0, FIRST(0)(src), .Ll_exc)
374EXC( LDREST t0, REST(0)(src), l_exc_copy) 385EXC( LDREST t0, REST(0)(src), .Ll_exc_copy)
375 ADD src, src, NBYTES 386 ADD src, src, NBYTES
376 SUB len, len, NBYTES 387 SUB len, len, NBYTES
377 STORE t0, 0(dst) 388 STORE t0, 0(dst)
389 .set reorder /* DADDI_WAR */
390 ADD dst, dst, NBYTES
378 bne len, rem, 1b 391 bne len, rem, 1b
379 ADD dst, dst, NBYTES 392 .set noreorder
380 393
381copy_bytes_checklen: 394.Lcopy_bytes_checklen:
382 beqz len, done 395 beqz len, .Ldone
383 nop 396 nop
384copy_bytes: 397.Lcopy_bytes:
385 /* 0 < len < NBYTES */ 398 /* 0 < len < NBYTES */
386#define COPY_BYTE(N) \ 399#define COPY_BYTE(N) \
387EXC( lb t0, N(src), l_exc); \ 400EXC( lb t0, N(src), .Ll_exc); \
388 SUB len, len, 1; \ 401 SUB len, len, 1; \
389 beqz len, done; \ 402 beqz len, .Ldone; \
390 sb t0, N(dst) 403 sb t0, N(dst)
391 404
392 COPY_BYTE(0) 405 COPY_BYTE(0)
@@ -397,16 +410,16 @@ EXC( lb t0, N(src), l_exc); \
397 COPY_BYTE(4) 410 COPY_BYTE(4)
398 COPY_BYTE(5) 411 COPY_BYTE(5)
399#endif 412#endif
400EXC( lb t0, NBYTES-2(src), l_exc) 413EXC( lb t0, NBYTES-2(src), .Ll_exc)
401 SUB len, len, 1 414 SUB len, len, 1
402 jr ra 415 jr ra
403 sb t0, NBYTES-2(dst) 416 sb t0, NBYTES-2(dst)
404done: 417.Ldone:
405 jr ra 418 jr ra
406 nop 419 nop
407 END(__copy_user_inatomic) 420 END(__copy_user_inatomic)
408 421
409l_exc_copy: 422.Ll_exc_copy:
410 /* 423 /*
411 * Copy bytes from src until faulting load address (or until a 424 * Copy bytes from src until faulting load address (or until a
412 * lb faults) 425 * lb faults)
@@ -421,12 +434,14 @@ l_exc_copy:
421 nop 434 nop
422 LOAD t0, THREAD_BUADDR(t0) 435 LOAD t0, THREAD_BUADDR(t0)
4231: 4361:
424EXC( lb t1, 0(src), l_exc) 437EXC( lb t1, 0(src), .Ll_exc)
425 ADD src, src, 1 438 ADD src, src, 1
426 sb t1, 0(dst) # can't fault -- we're copy_from_user 439 sb t1, 0(dst) # can't fault -- we're copy_from_user
440 .set reorder /* DADDI_WAR */
441 ADD dst, dst, 1
427 bne src, t0, 1b 442 bne src, t0, 1b
428 ADD dst, dst, 1 443 .set noreorder
429l_exc: 444.Ll_exc:
430 LOAD t0, TI_TASK($28) 445 LOAD t0, TI_TASK($28)
431 nop 446 nop
432 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address 447 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
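
The .Ll_exc_copy path above (and its counterpart in memcpy.S below) recovers from a faulting wide load by re-copying byte by byte up to the bad address the fault handler recorded in THREAD_BUADDR, so every byte that precedes the fault still reaches the destination. Roughly, as a conceptual C model rather than actual kernel code:

	/* Conceptual model of the .Ll_exc_copy recovery loop: salvage every
	 * byte before the recorded faulting address (names are illustrative). */
	static unsigned long salvage_before_fault(unsigned char *dst,
						  const unsigned char *src,
						  const unsigned char *fault_addr)
	{
		unsigned long copied = 0;

		while (src < fault_addr) {
			*dst++ = *src++;	/* these loads are known good */
			copied++;
		}
		return copied;
	}
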
diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
index a526c62cb76a..c06cccf60bec 100644
--- a/arch/mips/lib/memcpy.S
+++ b/arch/mips/lib/memcpy.S
@@ -9,6 +9,7 @@
9 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc. 9 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
10 * Copyright (C) 2002 Broadcom, Inc. 10 * Copyright (C) 2002 Broadcom, Inc.
11 * memcpy/copy_user author: Mark Vandevoorde 11 * memcpy/copy_user author: Mark Vandevoorde
12 * Copyright (C) 2007 Maciej W. Rozycki
12 * 13 *
13 * Mnemonic names for arguments to memcpy/__copy_user 14 * Mnemonic names for arguments to memcpy/__copy_user
14 */ 15 */
@@ -175,7 +176,11 @@
175 176
176 .text 177 .text
177 .set noreorder 178 .set noreorder
179#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
178 .set noat 180 .set noat
181#else
182 .set at=v1
183#endif
179 184
180/* 185/*
181 * A combined memcpy/__copy_user 186 * A combined memcpy/__copy_user
@@ -186,7 +191,7 @@
186 .align 5 191 .align 5
187LEAF(memcpy) /* a0=dst a1=src a2=len */ 192LEAF(memcpy) /* a0=dst a1=src a2=len */
188 move v0, dst /* return value */ 193 move v0, dst /* return value */
189__memcpy: 194.L__memcpy:
190FEXPORT(__copy_user) 195FEXPORT(__copy_user)
191 /* 196 /*
192 * Note: dst & src may be unaligned, len may be 0 197 * Note: dst & src may be unaligned, len may be 0
@@ -194,6 +199,7 @@ FEXPORT(__copy_user)
194 */ 199 */
195#define rem t8 200#define rem t8
196 201
202 R10KCBARRIER(0(ra))
197 /* 203 /*
198 * The "issue break"s below are very approximate. 204 * The "issue break"s below are very approximate.
199 * Issue delays for dcache fills will perturb the schedule, as will 205 * Issue delays for dcache fills will perturb the schedule, as will
@@ -207,44 +213,45 @@ FEXPORT(__copy_user)
207 and t1, dst, ADDRMASK 213 and t1, dst, ADDRMASK
208 PREF( 0, 1*32(src) ) 214 PREF( 0, 1*32(src) )
209 PREF( 1, 1*32(dst) ) 215 PREF( 1, 1*32(dst) )
210 bnez t2, copy_bytes_checklen 216 bnez t2, .Lcopy_bytes_checklen
211 and t0, src, ADDRMASK 217 and t0, src, ADDRMASK
212 PREF( 0, 2*32(src) ) 218 PREF( 0, 2*32(src) )
213 PREF( 1, 2*32(dst) ) 219 PREF( 1, 2*32(dst) )
214 bnez t1, dst_unaligned 220 bnez t1, .Ldst_unaligned
215 nop 221 nop
216 bnez t0, src_unaligned_dst_aligned 222 bnez t0, .Lsrc_unaligned_dst_aligned
217 /* 223 /*
218 * use delay slot for fall-through 224 * use delay slot for fall-through
219 * src and dst are aligned; need to compute rem 225 * src and dst are aligned; need to compute rem
220 */ 226 */
221both_aligned: 227.Lboth_aligned:
222 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter 228 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
223 beqz t0, cleanup_both_aligned # len < 8*NBYTES 229 beqz t0, .Lcleanup_both_aligned # len < 8*NBYTES
224 and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES) 230 and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
225 PREF( 0, 3*32(src) ) 231 PREF( 0, 3*32(src) )
226 PREF( 1, 3*32(dst) ) 232 PREF( 1, 3*32(dst) )
227 .align 4 233 .align 4
2281: 2341:
229EXC( LOAD t0, UNIT(0)(src), l_exc) 235 R10KCBARRIER(0(ra))
230EXC( LOAD t1, UNIT(1)(src), l_exc_copy) 236EXC( LOAD t0, UNIT(0)(src), .Ll_exc)
231EXC( LOAD t2, UNIT(2)(src), l_exc_copy) 237EXC( LOAD t1, UNIT(1)(src), .Ll_exc_copy)
232EXC( LOAD t3, UNIT(3)(src), l_exc_copy) 238EXC( LOAD t2, UNIT(2)(src), .Ll_exc_copy)
239EXC( LOAD t3, UNIT(3)(src), .Ll_exc_copy)
233 SUB len, len, 8*NBYTES 240 SUB len, len, 8*NBYTES
234EXC( LOAD t4, UNIT(4)(src), l_exc_copy) 241EXC( LOAD t4, UNIT(4)(src), .Ll_exc_copy)
235EXC( LOAD t7, UNIT(5)(src), l_exc_copy) 242EXC( LOAD t7, UNIT(5)(src), .Ll_exc_copy)
236EXC( STORE t0, UNIT(0)(dst), s_exc_p8u) 243EXC( STORE t0, UNIT(0)(dst), .Ls_exc_p8u)
237EXC( STORE t1, UNIT(1)(dst), s_exc_p7u) 244EXC( STORE t1, UNIT(1)(dst), .Ls_exc_p7u)
238EXC( LOAD t0, UNIT(6)(src), l_exc_copy) 245EXC( LOAD t0, UNIT(6)(src), .Ll_exc_copy)
239EXC( LOAD t1, UNIT(7)(src), l_exc_copy) 246EXC( LOAD t1, UNIT(7)(src), .Ll_exc_copy)
240 ADD src, src, 8*NBYTES 247 ADD src, src, 8*NBYTES
241 ADD dst, dst, 8*NBYTES 248 ADD dst, dst, 8*NBYTES
242EXC( STORE t2, UNIT(-6)(dst), s_exc_p6u) 249EXC( STORE t2, UNIT(-6)(dst), .Ls_exc_p6u)
243EXC( STORE t3, UNIT(-5)(dst), s_exc_p5u) 250EXC( STORE t3, UNIT(-5)(dst), .Ls_exc_p5u)
244EXC( STORE t4, UNIT(-4)(dst), s_exc_p4u) 251EXC( STORE t4, UNIT(-4)(dst), .Ls_exc_p4u)
245EXC( STORE t7, UNIT(-3)(dst), s_exc_p3u) 252EXC( STORE t7, UNIT(-3)(dst), .Ls_exc_p3u)
246EXC( STORE t0, UNIT(-2)(dst), s_exc_p2u) 253EXC( STORE t0, UNIT(-2)(dst), .Ls_exc_p2u)
247EXC( STORE t1, UNIT(-1)(dst), s_exc_p1u) 254EXC( STORE t1, UNIT(-1)(dst), .Ls_exc_p1u)
248 PREF( 0, 8*32(src) ) 255 PREF( 0, 8*32(src) )
249 PREF( 1, 8*32(dst) ) 256 PREF( 1, 8*32(dst) )
250 bne len, rem, 1b 257 bne len, rem, 1b
@@ -253,39 +260,45 @@ EXC( STORE t1, UNIT(-1)(dst), s_exc_p1u)
253 /* 260 /*
254 * len == rem == the number of bytes left to copy < 8*NBYTES 261 * len == rem == the number of bytes left to copy < 8*NBYTES
255 */ 262 */
256cleanup_both_aligned: 263.Lcleanup_both_aligned:
257 beqz len, done 264 beqz len, .Ldone
258 sltu t0, len, 4*NBYTES 265 sltu t0, len, 4*NBYTES
259 bnez t0, less_than_4units 266 bnez t0, .Lless_than_4units
260 and rem, len, (NBYTES-1) # rem = len % NBYTES 267 and rem, len, (NBYTES-1) # rem = len % NBYTES
261 /* 268 /*
262 * len >= 4*NBYTES 269 * len >= 4*NBYTES
263 */ 270 */
264EXC( LOAD t0, UNIT(0)(src), l_exc) 271EXC( LOAD t0, UNIT(0)(src), .Ll_exc)
265EXC( LOAD t1, UNIT(1)(src), l_exc_copy) 272EXC( LOAD t1, UNIT(1)(src), .Ll_exc_copy)
266EXC( LOAD t2, UNIT(2)(src), l_exc_copy) 273EXC( LOAD t2, UNIT(2)(src), .Ll_exc_copy)
267EXC( LOAD t3, UNIT(3)(src), l_exc_copy) 274EXC( LOAD t3, UNIT(3)(src), .Ll_exc_copy)
268 SUB len, len, 4*NBYTES 275 SUB len, len, 4*NBYTES
269 ADD src, src, 4*NBYTES 276 ADD src, src, 4*NBYTES
270EXC( STORE t0, UNIT(0)(dst), s_exc_p4u) 277 R10KCBARRIER(0(ra))
271EXC( STORE t1, UNIT(1)(dst), s_exc_p3u) 278EXC( STORE t0, UNIT(0)(dst), .Ls_exc_p4u)
272EXC( STORE t2, UNIT(2)(dst), s_exc_p2u) 279EXC( STORE t1, UNIT(1)(dst), .Ls_exc_p3u)
273EXC( STORE t3, UNIT(3)(dst), s_exc_p1u) 280EXC( STORE t2, UNIT(2)(dst), .Ls_exc_p2u)
274 beqz len, done 281EXC( STORE t3, UNIT(3)(dst), .Ls_exc_p1u)
275 ADD dst, dst, 4*NBYTES 282 .set reorder /* DADDI_WAR */
276less_than_4units: 283 ADD dst, dst, 4*NBYTES
284 beqz len, .Ldone
285 .set noreorder
286.Lless_than_4units:
277 /* 287 /*
278 * rem = len % NBYTES 288 * rem = len % NBYTES
279 */ 289 */
280 beq rem, len, copy_bytes 290 beq rem, len, .Lcopy_bytes
281 nop 291 nop
2821: 2921:
283EXC( LOAD t0, 0(src), l_exc) 293 R10KCBARRIER(0(ra))
294EXC( LOAD t0, 0(src), .Ll_exc)
284 ADD src, src, NBYTES 295 ADD src, src, NBYTES
285 SUB len, len, NBYTES 296 SUB len, len, NBYTES
286EXC( STORE t0, 0(dst), s_exc_p1u) 297EXC( STORE t0, 0(dst), .Ls_exc_p1u)
298 .set reorder /* DADDI_WAR */
299 ADD dst, dst, NBYTES
287 bne rem, len, 1b 300 bne rem, len, 1b
288 ADD dst, dst, NBYTES 301 .set noreorder
289 302
290 /* 303 /*
291 * src and dst are aligned, need to copy rem bytes (rem < NBYTES) 304 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -299,17 +312,17 @@ EXC( STORE t0, 0(dst), s_exc_p1u)
299 * more instruction-level parallelism. 312 * more instruction-level parallelism.
300 */ 313 */
301#define bits t2 314#define bits t2
302 beqz len, done 315 beqz len, .Ldone
303 ADD t1, dst, len # t1 is just past last byte of dst 316 ADD t1, dst, len # t1 is just past last byte of dst
304 li bits, 8*NBYTES 317 li bits, 8*NBYTES
305 SLL rem, len, 3 # rem = number of bits to keep 318 SLL rem, len, 3 # rem = number of bits to keep
306EXC( LOAD t0, 0(src), l_exc) 319EXC( LOAD t0, 0(src), .Ll_exc)
307 SUB bits, bits, rem # bits = number of bits to discard 320 SUB bits, bits, rem # bits = number of bits to discard
308 SHIFT_DISCARD t0, t0, bits 321 SHIFT_DISCARD t0, t0, bits
309EXC( STREST t0, -1(t1), s_exc) 322EXC( STREST t0, -1(t1), .Ls_exc)
310 jr ra 323 jr ra
311 move len, zero 324 move len, zero
312dst_unaligned: 325.Ldst_unaligned:
313 /* 326 /*
314 * dst is unaligned 327 * dst is unaligned
315 * t0 = src & ADDRMASK 328 * t0 = src & ADDRMASK
@@ -320,22 +333,23 @@ dst_unaligned:
320 * Set match = (src and dst have same alignment) 333 * Set match = (src and dst have same alignment)
321 */ 334 */
322#define match rem 335#define match rem
323EXC( LDFIRST t3, FIRST(0)(src), l_exc) 336EXC( LDFIRST t3, FIRST(0)(src), .Ll_exc)
324 ADD t2, zero, NBYTES 337 ADD t2, zero, NBYTES
325EXC( LDREST t3, REST(0)(src), l_exc_copy) 338EXC( LDREST t3, REST(0)(src), .Ll_exc_copy)
326 SUB t2, t2, t1 # t2 = number of bytes copied 339 SUB t2, t2, t1 # t2 = number of bytes copied
327 xor match, t0, t1 340 xor match, t0, t1
328EXC( STFIRST t3, FIRST(0)(dst), s_exc) 341 R10KCBARRIER(0(ra))
329 beq len, t2, done 342EXC( STFIRST t3, FIRST(0)(dst), .Ls_exc)
343 beq len, t2, .Ldone
330 SUB len, len, t2 344 SUB len, len, t2
331 ADD dst, dst, t2 345 ADD dst, dst, t2
332 beqz match, both_aligned 346 beqz match, .Lboth_aligned
333 ADD src, src, t2 347 ADD src, src, t2
334 348
335src_unaligned_dst_aligned: 349.Lsrc_unaligned_dst_aligned:
336 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter 350 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
337 PREF( 0, 3*32(src) ) 351 PREF( 0, 3*32(src) )
338 beqz t0, cleanup_src_unaligned 352 beqz t0, .Lcleanup_src_unaligned
339 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES 353 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
340 PREF( 1, 3*32(dst) ) 354 PREF( 1, 3*32(dst) )
3411: 3551:
@@ -345,52 +359,59 @@ src_unaligned_dst_aligned:
345 * It's OK to load FIRST(N+1) before REST(N) because the two addresses 359 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
346 * are to the same unit (unless src is aligned, but it's not). 360 * are to the same unit (unless src is aligned, but it's not).
347 */ 361 */
348EXC( LDFIRST t0, FIRST(0)(src), l_exc) 362 R10KCBARRIER(0(ra))
349EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy) 363EXC( LDFIRST t0, FIRST(0)(src), .Ll_exc)
364EXC( LDFIRST t1, FIRST(1)(src), .Ll_exc_copy)
350 SUB len, len, 4*NBYTES 365 SUB len, len, 4*NBYTES
351EXC( LDREST t0, REST(0)(src), l_exc_copy) 366EXC( LDREST t0, REST(0)(src), .Ll_exc_copy)
352EXC( LDREST t1, REST(1)(src), l_exc_copy) 367EXC( LDREST t1, REST(1)(src), .Ll_exc_copy)
353EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy) 368EXC( LDFIRST t2, FIRST(2)(src), .Ll_exc_copy)
354EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy) 369EXC( LDFIRST t3, FIRST(3)(src), .Ll_exc_copy)
355EXC( LDREST t2, REST(2)(src), l_exc_copy) 370EXC( LDREST t2, REST(2)(src), .Ll_exc_copy)
356EXC( LDREST t3, REST(3)(src), l_exc_copy) 371EXC( LDREST t3, REST(3)(src), .Ll_exc_copy)
357 PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed) 372 PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
358 ADD src, src, 4*NBYTES 373 ADD src, src, 4*NBYTES
359#ifdef CONFIG_CPU_SB1 374#ifdef CONFIG_CPU_SB1
360 nop # improves slotting 375 nop # improves slotting
361#endif 376#endif
362EXC( STORE t0, UNIT(0)(dst), s_exc_p4u) 377EXC( STORE t0, UNIT(0)(dst), .Ls_exc_p4u)
363EXC( STORE t1, UNIT(1)(dst), s_exc_p3u) 378EXC( STORE t1, UNIT(1)(dst), .Ls_exc_p3u)
364EXC( STORE t2, UNIT(2)(dst), s_exc_p2u) 379EXC( STORE t2, UNIT(2)(dst), .Ls_exc_p2u)
365EXC( STORE t3, UNIT(3)(dst), s_exc_p1u) 380EXC( STORE t3, UNIT(3)(dst), .Ls_exc_p1u)
366 PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed) 381 PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
382 .set reorder /* DADDI_WAR */
383 ADD dst, dst, 4*NBYTES
367 bne len, rem, 1b 384 bne len, rem, 1b
368 ADD dst, dst, 4*NBYTES 385 .set noreorder
369 386
370cleanup_src_unaligned: 387.Lcleanup_src_unaligned:
371 beqz len, done 388 beqz len, .Ldone
372 and rem, len, NBYTES-1 # rem = len % NBYTES 389 and rem, len, NBYTES-1 # rem = len % NBYTES
373 beq rem, len, copy_bytes 390 beq rem, len, .Lcopy_bytes
374 nop 391 nop
3751: 3921:
376EXC( LDFIRST t0, FIRST(0)(src), l_exc) 393 R10KCBARRIER(0(ra))
377EXC( LDREST t0, REST(0)(src), l_exc_copy) 394EXC( LDFIRST t0, FIRST(0)(src), .Ll_exc)
395EXC( LDREST t0, REST(0)(src), .Ll_exc_copy)
378 ADD src, src, NBYTES 396 ADD src, src, NBYTES
379 SUB len, len, NBYTES 397 SUB len, len, NBYTES
380EXC( STORE t0, 0(dst), s_exc_p1u) 398EXC( STORE t0, 0(dst), .Ls_exc_p1u)
399 .set reorder /* DADDI_WAR */
400 ADD dst, dst, NBYTES
381 bne len, rem, 1b 401 bne len, rem, 1b
382 ADD dst, dst, NBYTES 402 .set noreorder
383 403
384copy_bytes_checklen: 404.Lcopy_bytes_checklen:
385 beqz len, done 405 beqz len, .Ldone
386 nop 406 nop
387copy_bytes: 407.Lcopy_bytes:
388 /* 0 < len < NBYTES */ 408 /* 0 < len < NBYTES */
409 R10KCBARRIER(0(ra))
389#define COPY_BYTE(N) \ 410#define COPY_BYTE(N) \
390EXC( lb t0, N(src), l_exc); \ 411EXC( lb t0, N(src), .Ll_exc); \
391 SUB len, len, 1; \ 412 SUB len, len, 1; \
392 beqz len, done; \ 413 beqz len, .Ldone; \
393EXC( sb t0, N(dst), s_exc_p1) 414EXC( sb t0, N(dst), .Ls_exc_p1)
394 415
395 COPY_BYTE(0) 416 COPY_BYTE(0)
396 COPY_BYTE(1) 417 COPY_BYTE(1)
@@ -400,16 +421,16 @@ EXC( sb t0, N(dst), s_exc_p1)
400 COPY_BYTE(4) 421 COPY_BYTE(4)
401 COPY_BYTE(5) 422 COPY_BYTE(5)
402#endif 423#endif
403EXC( lb t0, NBYTES-2(src), l_exc) 424EXC( lb t0, NBYTES-2(src), .Ll_exc)
404 SUB len, len, 1 425 SUB len, len, 1
405 jr ra 426 jr ra
406EXC( sb t0, NBYTES-2(dst), s_exc_p1) 427EXC( sb t0, NBYTES-2(dst), .Ls_exc_p1)
407done: 428.Ldone:
408 jr ra 429 jr ra
409 nop 430 nop
410 END(memcpy) 431 END(memcpy)
411 432
412l_exc_copy: 433.Ll_exc_copy:
413 /* 434 /*
414 * Copy bytes from src until faulting load address (or until a 435 * Copy bytes from src until faulting load address (or until a
415 * lb faults) 436 * lb faults)
@@ -424,12 +445,14 @@ l_exc_copy:
424 nop 445 nop
425 LOAD t0, THREAD_BUADDR(t0) 446 LOAD t0, THREAD_BUADDR(t0)
4261: 4471:
427EXC( lb t1, 0(src), l_exc) 448EXC( lb t1, 0(src), .Ll_exc)
428 ADD src, src, 1 449 ADD src, src, 1
429 sb t1, 0(dst) # can't fault -- we're copy_from_user 450 sb t1, 0(dst) # can't fault -- we're copy_from_user
451 .set reorder /* DADDI_WAR */
452 ADD dst, dst, 1
430 bne src, t0, 1b 453 bne src, t0, 1b
431 ADD dst, dst, 1 454 .set noreorder
432l_exc: 455.Ll_exc:
433 LOAD t0, TI_TASK($28) 456 LOAD t0, TI_TASK($28)
434 nop 457 nop
435 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address 458 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
@@ -446,20 +469,33 @@ l_exc:
446 * Clear len bytes starting at dst. Can't call __bzero because it 469 * Clear len bytes starting at dst. Can't call __bzero because it
447 * might modify len. An inefficient loop for these rare times... 470 * might modify len. An inefficient loop for these rare times...
448 */ 471 */
449 beqz len, done 472 .set reorder /* DADDI_WAR */
450 SUB src, len, 1 473 SUB src, len, 1
474 beqz len, .Ldone
475 .set noreorder
4511: sb zero, 0(dst) 4761: sb zero, 0(dst)
452 ADD dst, dst, 1 477 ADD dst, dst, 1
478#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
453 bnez src, 1b 479 bnez src, 1b
454 SUB src, src, 1 480 SUB src, src, 1
481#else
482 .set push
483 .set noat
484 li v1, 1
485 bnez src, 1b
486 SUB src, src, v1
487 .set pop
488#endif
455 jr ra 489 jr ra
456 nop 490 nop
457 491
458 492
459#define SEXC(n) \ 493#define SEXC(n) \
460s_exc_p ## n ## u: \ 494 .set reorder; /* DADDI_WAR */ \
461 jr ra; \ 495.Ls_exc_p ## n ## u: \
462 ADD len, len, n*NBYTES 496 ADD len, len, n*NBYTES; \
497 jr ra; \
498 .set noreorder
463 499
464SEXC(8) 500SEXC(8)
465SEXC(7) 501SEXC(7)
@@ -470,10 +506,12 @@ SEXC(3)
470SEXC(2) 506SEXC(2)
471SEXC(1) 507SEXC(1)
472 508
473s_exc_p1: 509.Ls_exc_p1:
510 .set reorder /* DADDI_WAR */
511 ADD len, len, 1
474 jr ra 512 jr ra
475 ADD len, len, 1 513 .set noreorder
476s_exc: 514.Ls_exc:
477 jr ra 515 jr ra
478 nop 516 nop
479 517
@@ -484,38 +522,44 @@ LEAF(memmove)
484 sltu t0, a1, t0 # dst + len <= src -> memcpy 522 sltu t0, a1, t0 # dst + len <= src -> memcpy
485 sltu t1, a0, t1 # dst >= src + len -> memcpy 523 sltu t1, a0, t1 # dst >= src + len -> memcpy
486 and t0, t1 524 and t0, t1
487 beqz t0, __memcpy 525 beqz t0, .L__memcpy
488 move v0, a0 /* return value */ 526 move v0, a0 /* return value */
489 beqz a2, r_out 527 beqz a2, .Lr_out
490 END(memmove) 528 END(memmove)
491 529
492 /* fall through to __rmemcpy */ 530 /* fall through to __rmemcpy */
493LEAF(__rmemcpy) /* a0=dst a1=src a2=len */ 531LEAF(__rmemcpy) /* a0=dst a1=src a2=len */
494 sltu t0, a1, a0 532 sltu t0, a1, a0
495 beqz t0, r_end_bytes_up # src >= dst 533 beqz t0, .Lr_end_bytes_up # src >= dst
496 nop 534 nop
497 ADD a0, a2 # dst = dst + len 535 ADD a0, a2 # dst = dst + len
498 ADD a1, a2 # src = src + len 536 ADD a1, a2 # src = src + len
499 537
500r_end_bytes: 538.Lr_end_bytes:
539 R10KCBARRIER(0(ra))
501 lb t0, -1(a1) 540 lb t0, -1(a1)
502 SUB a2, a2, 0x1 541 SUB a2, a2, 0x1
503 sb t0, -1(a0) 542 sb t0, -1(a0)
504 SUB a1, a1, 0x1 543 SUB a1, a1, 0x1
505 bnez a2, r_end_bytes 544 .set reorder /* DADDI_WAR */
506 SUB a0, a0, 0x1 545 SUB a0, a0, 0x1
546 bnez a2, .Lr_end_bytes
547 .set noreorder
507 548
508r_out: 549.Lr_out:
509 jr ra 550 jr ra
510 move a2, zero 551 move a2, zero
511 552
512r_end_bytes_up: 553.Lr_end_bytes_up:
554 R10KCBARRIER(0(ra))
513 lb t0, (a1) 555 lb t0, (a1)
514 SUB a2, a2, 0x1 556 SUB a2, a2, 0x1
515 sb t0, (a0) 557 sb t0, (a0)
516 ADD a1, a1, 0x1 558 ADD a1, a1, 0x1
517 bnez a2, r_end_bytes_up 559 .set reorder /* DADDI_WAR */
518 ADD a0, a0, 0x1 560 ADD a0, a0, 0x1
561 bnez a2, .Lr_end_bytes_up
562 .set noreorder
519 563
520 jr ra 564 jr ra
521 move a2, zero 565 move a2, zero
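
The reworked SEXC(n) and .Ls_exc_p1 fixups above keep the usual contract: when a store faults, the outstanding n*NBYTES (or 1) is added back to len before returning, so the caller learns how many bytes were not copied. A hedged sketch of how C code consumes that kind of return value, using the well-known copy_from_user() convention:

	#include <linux/types.h>
	#include <linux/errno.h>
	#include <linux/uaccess.h>

	/* The .Ls_exc_p<N>u fixups preserve the "bytes NOT copied" return
	 * convention that copy_from_user() exposes to C callers. */
	static ssize_t read_exact(void *dst, const void __user *src, size_t n)
	{
		unsigned long left = copy_from_user(dst, src, n);

		if (left)		/* faulted; only the first (n - left) bytes are valid */
			return -EFAULT;
		return n;
	}
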
diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S
index 3f8b8b3d0b23..77dc3b20110a 100644
--- a/arch/mips/lib/memset.S
+++ b/arch/mips/lib/memset.S
@@ -5,6 +5,7 @@
5 * 5 *
6 * Copyright (C) 1998, 1999, 2000 by Ralf Baechle 6 * Copyright (C) 1998, 1999, 2000 by Ralf Baechle
7 * Copyright (C) 1999, 2000 Silicon Graphics, Inc. 7 * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
8 * Copyright (C) 2007 Maciej W. Rozycki
8 */ 9 */
9#include <asm/asm.h> 10#include <asm/asm.h>
10#include <asm/asm-offsets.h> 11#include <asm/asm-offsets.h>
@@ -71,34 +72,45 @@ LEAF(memset)
71 72
72FEXPORT(__bzero) 73FEXPORT(__bzero)
73 sltiu t0, a2, LONGSIZE /* very small region? */ 74 sltiu t0, a2, LONGSIZE /* very small region? */
74 bnez t0, small_memset 75 bnez t0, .Lsmall_memset
75 andi t0, a0, LONGMASK /* aligned? */ 76 andi t0, a0, LONGMASK /* aligned? */
76 77
78#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
77 beqz t0, 1f 79 beqz t0, 1f
78 PTR_SUBU t0, LONGSIZE /* alignment in bytes */ 80 PTR_SUBU t0, LONGSIZE /* alignment in bytes */
81#else
82 .set noat
83 li AT, LONGSIZE
84 beqz t0, 1f
85 PTR_SUBU t0, AT /* alignment in bytes */
86 .set at
87#endif
79 88
89 R10KCBARRIER(0(ra))
80#ifdef __MIPSEB__ 90#ifdef __MIPSEB__
81 EX(LONG_S_L, a1, (a0), first_fixup) /* make word/dword aligned */ 91 EX(LONG_S_L, a1, (a0), .Lfirst_fixup) /* make word/dword aligned */
82#endif 92#endif
83#ifdef __MIPSEL__ 93#ifdef __MIPSEL__
84 EX(LONG_S_R, a1, (a0), first_fixup) /* make word/dword aligned */ 94 EX(LONG_S_R, a1, (a0), .Lfirst_fixup) /* make word/dword aligned */
85#endif 95#endif
86 PTR_SUBU a0, t0 /* long align ptr */ 96 PTR_SUBU a0, t0 /* long align ptr */
87 PTR_ADDU a2, t0 /* correct size */ 97 PTR_ADDU a2, t0 /* correct size */
88 98
891: ori t1, a2, 0x3f /* # of full blocks */ 991: ori t1, a2, 0x3f /* # of full blocks */
90 xori t1, 0x3f 100 xori t1, 0x3f
91 beqz t1, memset_partial /* no block to fill */ 101 beqz t1, .Lmemset_partial /* no block to fill */
92 andi t0, a2, 0x40-LONGSIZE 102 andi t0, a2, 0x40-LONGSIZE
93 103
94 PTR_ADDU t1, a0 /* end address */ 104 PTR_ADDU t1, a0 /* end address */
95 .set reorder 105 .set reorder
961: PTR_ADDIU a0, 64 1061: PTR_ADDIU a0, 64
97 f_fill64 a0, -64, a1, fwd_fixup 107 R10KCBARRIER(0(ra))
108 f_fill64 a0, -64, a1, .Lfwd_fixup
98 bne t1, a0, 1b 109 bne t1, a0, 1b
99 .set noreorder 110 .set noreorder
100 111
101memset_partial: 112.Lmemset_partial:
113 R10KCBARRIER(0(ra))
102 PTR_LA t1, 2f /* where to start */ 114 PTR_LA t1, 2f /* where to start */
103#if LONGSIZE == 4 115#if LONGSIZE == 4
104 PTR_SUBU t1, t0 116 PTR_SUBU t1, t0
@@ -106,7 +118,7 @@ memset_partial:
106 .set noat 118 .set noat
107 LONG_SRL AT, t0, 1 119 LONG_SRL AT, t0, 1
108 PTR_SUBU t1, AT 120 PTR_SUBU t1, AT
109 .set noat 121 .set at
110#endif 122#endif
111 jr t1 123 jr t1
112 PTR_ADDU a0, t0 /* dest ptr */ 124 PTR_ADDU a0, t0 /* dest ptr */
@@ -114,26 +126,28 @@ memset_partial:
114 .set push 126 .set push
115 .set noreorder 127 .set noreorder
116 .set nomacro 128 .set nomacro
117 f_fill64 a0, -64, a1, partial_fixup /* ... but first do longs ... */ 129 f_fill64 a0, -64, a1, .Lpartial_fixup /* ... but first do longs ... */
1182: .set pop 1302: .set pop
119 andi a2, LONGMASK /* At most one long to go */ 131 andi a2, LONGMASK /* At most one long to go */
120 132
121 beqz a2, 1f 133 beqz a2, 1f
122 PTR_ADDU a0, a2 /* What's left */ 134 PTR_ADDU a0, a2 /* What's left */
135 R10KCBARRIER(0(ra))
123#ifdef __MIPSEB__ 136#ifdef __MIPSEB__
124 EX(LONG_S_R, a1, -1(a0), last_fixup) 137 EX(LONG_S_R, a1, -1(a0), .Llast_fixup)
125#endif 138#endif
126#ifdef __MIPSEL__ 139#ifdef __MIPSEL__
127 EX(LONG_S_L, a1, -1(a0), last_fixup) 140 EX(LONG_S_L, a1, -1(a0), .Llast_fixup)
128#endif 141#endif
1291: jr ra 1421: jr ra
130 move a2, zero 143 move a2, zero
131 144
132small_memset: 145.Lsmall_memset:
133 beqz a2, 2f 146 beqz a2, 2f
134 PTR_ADDU t1, a0, a2 147 PTR_ADDU t1, a0, a2
135 148
1361: PTR_ADDIU a0, 1 /* fill bytewise */ 1491: PTR_ADDIU a0, 1 /* fill bytewise */
150 R10KCBARRIER(0(ra))
137 bne t1, a0, 1b 151 bne t1, a0, 1b
138 sb a1, -1(a0) 152 sb a1, -1(a0)
139 153
@@ -141,11 +155,11 @@ small_memset:
141 move a2, zero 155 move a2, zero
142 END(memset) 156 END(memset)
143 157
144first_fixup: 158.Lfirst_fixup:
145 jr ra 159 jr ra
146 nop 160 nop
147 161
148fwd_fixup: 162.Lfwd_fixup:
149 PTR_L t0, TI_TASK($28) 163 PTR_L t0, TI_TASK($28)
150 LONG_L t0, THREAD_BUADDR(t0) 164 LONG_L t0, THREAD_BUADDR(t0)
151 andi a2, 0x3f 165 andi a2, 0x3f
@@ -153,7 +167,7 @@ fwd_fixup:
153 jr ra 167 jr ra
154 LONG_SUBU a2, t0 168 LONG_SUBU a2, t0
155 169
156partial_fixup: 170.Lpartial_fixup:
157 PTR_L t0, TI_TASK($28) 171 PTR_L t0, TI_TASK($28)
158 LONG_L t0, THREAD_BUADDR(t0) 172 LONG_L t0, THREAD_BUADDR(t0)
159 andi a2, LONGMASK 173 andi a2, LONGMASK
@@ -161,6 +175,6 @@ partial_fixup:
161 jr ra 175 jr ra
162 LONG_SUBU a2, t0 176 LONG_SUBU a2, t0
163 177
164last_fixup: 178.Llast_fixup:
165 jr ra 179 jr ra
166 andi v1, a2, LONGMASK 180 andi v1, a2, LONGMASK
diff --git a/arch/mips/lib/strlen_user.S b/arch/mips/lib/strlen_user.S
index eca558d83a37..fdbb970f670d 100644
--- a/arch/mips/lib/strlen_user.S
+++ b/arch/mips/lib/strlen_user.S
@@ -24,16 +24,16 @@
24LEAF(__strlen_user_asm) 24LEAF(__strlen_user_asm)
25 LONG_L v0, TI_ADDR_LIMIT($28) # pointer ok? 25 LONG_L v0, TI_ADDR_LIMIT($28) # pointer ok?
26 and v0, a0 26 and v0, a0
27 bnez v0, fault 27 bnez v0, .Lfault
28 28
29FEXPORT(__strlen_user_nocheck_asm) 29FEXPORT(__strlen_user_nocheck_asm)
30 move v0, a0 30 move v0, a0
311: EX(lb, t0, (v0), fault) 311: EX(lb, t0, (v0), .Lfault)
32 PTR_ADDIU v0, 1 32 PTR_ADDIU v0, 1
33 bnez t0, 1b 33 bnez t0, 1b
34 PTR_SUBU v0, a0 34 PTR_SUBU v0, a0
35 jr ra 35 jr ra
36 END(__strlen_user_asm) 36 END(__strlen_user_asm)
37 37
38fault: move v0, zero 38.Lfault: move v0, zero
39 jr ra 39 jr ra
diff --git a/arch/mips/lib/strncpy_user.S b/arch/mips/lib/strncpy_user.S
index d16c76fbfac7..7201b2ff08c8 100644
--- a/arch/mips/lib/strncpy_user.S
+++ b/arch/mips/lib/strncpy_user.S
@@ -30,29 +30,30 @@
30LEAF(__strncpy_from_user_asm) 30LEAF(__strncpy_from_user_asm)
31 LONG_L v0, TI_ADDR_LIMIT($28) # pointer ok? 31 LONG_L v0, TI_ADDR_LIMIT($28) # pointer ok?
32 and v0, a1 32 and v0, a1
33 bnez v0, fault 33 bnez v0, .Lfault
34 34
35FEXPORT(__strncpy_from_user_nocheck_asm) 35FEXPORT(__strncpy_from_user_nocheck_asm)
36 move v0, zero 36 move v0, zero
37 move v1, a1 37 move v1, a1
38 .set noreorder 38 .set noreorder
391: EX(lbu, t0, (v1), fault) 391: EX(lbu, t0, (v1), .Lfault)
40 PTR_ADDIU v1, 1 40 PTR_ADDIU v1, 1
41 R10KCBARRIER(0(ra))
41 beqz t0, 2f 42 beqz t0, 2f
42 sb t0, (a0) 43 sb t0, (a0)
43 PTR_ADDIU v0, 1 44 PTR_ADDIU v0, 1
44 bne v0, a2, 1b
45 PTR_ADDIU a0, 1
46 .set reorder 45 .set reorder
46 PTR_ADDIU a0, 1
47 bne v0, a2, 1b
472: PTR_ADDU t0, a1, v0 482: PTR_ADDU t0, a1, v0
48 xor t0, a1 49 xor t0, a1
49 bltz t0, fault 50 bltz t0, .Lfault
50 jr ra # return n 51 jr ra # return n
51 END(__strncpy_from_user_asm) 52 END(__strncpy_from_user_asm)
52 53
53fault: li v0, -EFAULT 54.Lfault: li v0, -EFAULT
54 jr ra 55 jr ra
55 56
56 .section __ex_table,"a" 57 .section __ex_table,"a"
57 PTR 1b, fault 58 PTR 1b, .Lfault
58 .previous 59 .previous
diff --git a/arch/mips/lib/strnlen_user.S b/arch/mips/lib/strnlen_user.S
index c0ea15194a0e..c768e3000616 100644
--- a/arch/mips/lib/strnlen_user.S
+++ b/arch/mips/lib/strnlen_user.S
@@ -28,18 +28,19 @@
28LEAF(__strnlen_user_asm) 28LEAF(__strnlen_user_asm)
29 LONG_L v0, TI_ADDR_LIMIT($28) # pointer ok? 29 LONG_L v0, TI_ADDR_LIMIT($28) # pointer ok?
30 and v0, a0 30 and v0, a0
31 bnez v0, fault 31 bnez v0, .Lfault
32 32
33FEXPORT(__strnlen_user_nocheck_asm) 33FEXPORT(__strnlen_user_nocheck_asm)
34 move v0, a0 34 move v0, a0
35 PTR_ADDU a1, a0 # stop pointer 35 PTR_ADDU a1, a0 # stop pointer
361: beq v0, a1, 1f # limit reached? 361: beq v0, a1, 1f # limit reached?
37 EX(lb, t0, (v0), fault) 37 EX(lb, t0, (v0), .Lfault)
38 PTR_ADDU v0, 1 38 PTR_ADDU v0, 1
39 bnez t0, 1b 39 bnez t0, 1b
401: PTR_SUBU v0, a0 401: PTR_SUBU v0, a0
41 jr ra 41 jr ra
42 END(__strnlen_user_asm) 42 END(__strnlen_user_asm)
43 43
44fault: move v0, zero 44.Lfault:
45 move v0, zero
45 jr ra 46 jr ra
diff --git a/arch/mips/lib/uncached.c b/arch/mips/lib/uncached.c
index 58d14f4d9349..27b012d4341c 100644
--- a/arch/mips/lib/uncached.c
+++ b/arch/mips/lib/uncached.c
@@ -46,9 +46,9 @@ unsigned long __init run_uncached(void *func)
46 if (sp >= (long)CKSEG0 && sp < (long)CKSEG2) 46 if (sp >= (long)CKSEG0 && sp < (long)CKSEG2)
47 usp = CKSEG1ADDR(sp); 47 usp = CKSEG1ADDR(sp);
48#ifdef CONFIG_64BIT 48#ifdef CONFIG_64BIT
49 else if ((long long)sp >= (long long)PHYS_TO_XKPHYS(0LL, 0) && 49 else if ((long long)sp >= (long long)PHYS_TO_XKPHYS(0, 0) &&
50 (long long)sp < (long long)PHYS_TO_XKPHYS(8LL, 0)) 50 (long long)sp < (long long)PHYS_TO_XKPHYS(8, 0))
51 usp = PHYS_TO_XKPHYS((long long)K_CALG_UNCACHED, 51 usp = PHYS_TO_XKPHYS(K_CALG_UNCACHED,
52 XKPHYS_TO_PHYS((long long)sp)); 52 XKPHYS_TO_PHYS((long long)sp));
53#endif 53#endif
54 else { 54 else {
@@ -58,9 +58,9 @@ unsigned long __init run_uncached(void *func)
58 if (lfunc >= (long)CKSEG0 && lfunc < (long)CKSEG2) 58 if (lfunc >= (long)CKSEG0 && lfunc < (long)CKSEG2)
59 ufunc = CKSEG1ADDR(lfunc); 59 ufunc = CKSEG1ADDR(lfunc);
60#ifdef CONFIG_64BIT 60#ifdef CONFIG_64BIT
61 else if ((long long)lfunc >= (long long)PHYS_TO_XKPHYS(0LL, 0) && 61 else if ((long long)lfunc >= (long long)PHYS_TO_XKPHYS(0, 0) &&
62 (long long)lfunc < (long long)PHYS_TO_XKPHYS(8LL, 0)) 62 (long long)lfunc < (long long)PHYS_TO_XKPHYS(8, 0))
63 ufunc = PHYS_TO_XKPHYS((long long)K_CALG_UNCACHED, 63 ufunc = PHYS_TO_XKPHYS(K_CALG_UNCACHED,
64 XKPHYS_TO_PHYS((long long)lfunc)); 64 XKPHYS_TO_PHYS((long long)lfunc));
65#endif 65#endif
66 else { 66 else {