Diffstat (limited to 'arch/mips/lib/memcpy.S')
-rw-r--r--  arch/mips/lib/memcpy.S  250
1 file changed, 147 insertions(+), 103 deletions(-)
diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
index a526c62cb76a..c06cccf60bec 100644
--- a/arch/mips/lib/memcpy.S
+++ b/arch/mips/lib/memcpy.S
@@ -9,6 +9,7 @@
  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  * Copyright (C) 2002 Broadcom, Inc.
  *   memcpy/copy_user author: Mark Vandevoorde
+ * Copyright (C) 2007 Maciej W. Rozycki
  *
  * Mnemonic names for arguments to memcpy/__copy_user
  */
@@ -175,7 +176,11 @@
 
         .text
         .set    noreorder
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
         .set    noat
+#else
+        .set    at=v1
+#endif
 
 /*
  * A combined memcpy/__copy_user
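A note on the `.set at=v1` alternative above: the file otherwise runs under `.set noat`, but with CONFIG_CPU_DADDI_WORKAROUNDS enabled the ADD/SUB macros may need an assembler temporary, so a register the code never holds live data in must be designated. A rough, illustrative sketch of the kind of expansion involved (the exact sequence is the assembler's choice):

# Sketch: with the R4000/R4400 daddiu erratum workaround in effect,
# a macro add with an immediate operand, e.g.
#       ADD     dst, dst, 4*NBYTES
# may assemble to a multi-instruction sequence such as
#       li      v1, 4*NBYTES            # immediate into the temporary
#       daddu   dst, dst, v1            # register form avoids daddiu
# which is why the temporary register must be explicitly designated.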
@@ -186,7 +191,7 @@
         .align  5
 LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
         move    v0, dst                         /* return value */
-__memcpy:
+.L__memcpy:
 FEXPORT(__copy_user)
         /*
          * Note: dst & src may be unaligned, len may be 0
@@ -194,6 +199,7 @@ FEXPORT(__copy_user)
          */
 #define rem t8
 
+        R10KCBARRIER(0(ra))
         /*
          * The "issue break"s below are very approximate.
          * Issue delays for dcache fills will perturb the schedule, as will
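R10KCBARRIER is the other macro threaded through the copy loops below. Assuming the usual asm.h definition, it costs nothing on most configurations:

# Sketch, assuming the asm.h definition: on configurations where
# R10000-family speculative stores are unsafe (e.g. SGI IP28) the
# macro expands to a cache barrier operation,
#       cache   Cache_Barrier, 0(ra)
# which inhibits speculative execution of the stores that follow;
# everywhere else it expands to nothing.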
@@ -207,44 +213,45 @@ FEXPORT(__copy_user)
         and     t1, dst, ADDRMASK
         PREF(   0, 1*32(src) )
         PREF(   1, 1*32(dst) )
-        bnez    t2, copy_bytes_checklen
+        bnez    t2, .Lcopy_bytes_checklen
         and     t0, src, ADDRMASK
         PREF(   0, 2*32(src) )
         PREF(   1, 2*32(dst) )
-        bnez    t1, dst_unaligned
+        bnez    t1, .Ldst_unaligned
         nop
-        bnez    t0, src_unaligned_dst_aligned
+        bnez    t0, .Lsrc_unaligned_dst_aligned
         /*
          * use delay slot for fall-through
          * src and dst are aligned; need to compute rem
          */
-both_aligned:
+.Lboth_aligned:
         SRL     t0, len, LOG_NBYTES+3           # +3 for 8 units/iter
-        beqz    t0, cleanup_both_aligned        # len < 8*NBYTES
+        beqz    t0, .Lcleanup_both_aligned      # len < 8*NBYTES
         and     rem, len, (8*NBYTES-1)          # rem = len % (8*NBYTES)
         PREF(   0, 3*32(src) )
         PREF(   1, 3*32(dst) )
         .align  4
 1:
-EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
-EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
-EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
-EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
+        R10KCBARRIER(0(ra))
+EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
+EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
+EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
+EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
         SUB     len, len, 8*NBYTES
-EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
-EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
-EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
-EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
-EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)
-EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
+EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
+EXC(    LOAD    t7, UNIT(5)(src),       .Ll_exc_copy)
+EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p8u)
+EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p7u)
+EXC(    LOAD    t0, UNIT(6)(src),       .Ll_exc_copy)
+EXC(    LOAD    t1, UNIT(7)(src),       .Ll_exc_copy)
         ADD     src, src, 8*NBYTES
         ADD     dst, dst, 8*NBYTES
-EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
-EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
-EXC(    STORE   t4, UNIT(-4)(dst),      s_exc_p4u)
-EXC(    STORE   t7, UNIT(-3)(dst),      s_exc_p3u)
-EXC(    STORE   t0, UNIT(-2)(dst),      s_exc_p2u)
-EXC(    STORE   t1, UNIT(-1)(dst),      s_exc_p1u)
+EXC(    STORE   t2, UNIT(-6)(dst),      .Ls_exc_p6u)
+EXC(    STORE   t3, UNIT(-5)(dst),      .Ls_exc_p5u)
+EXC(    STORE   t4, UNIT(-4)(dst),      .Ls_exc_p4u)
+EXC(    STORE   t7, UNIT(-3)(dst),      .Ls_exc_p3u)
+EXC(    STORE   t0, UNIT(-2)(dst),      .Ls_exc_p2u)
+EXC(    STORE   t1, UNIT(-1)(dst),      .Ls_exc_p1u)
         PREF(   0, 8*32(src) )
         PREF(   1, 8*32(dst) )
         bne     len, rem, 1b
@@ -253,39 +260,45 @@ EXC(    STORE   t1, UNIT(-1)(dst),      .Ls_exc_p1u)
         /*
          * len == rem == the number of bytes left to copy < 8*NBYTES
          */
-cleanup_both_aligned:
-        beqz    len, done
+.Lcleanup_both_aligned:
+        beqz    len, .Ldone
         sltu    t0, len, 4*NBYTES
-        bnez    t0, less_than_4units
+        bnez    t0, .Lless_than_4units
         and     rem, len, (NBYTES-1)    # rem = len % NBYTES
         /*
          * len >= 4*NBYTES
          */
-EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
-EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
-EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
-EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
+EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
+EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
+EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
+EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
         SUB     len, len, 4*NBYTES
         ADD     src, src, 4*NBYTES
-EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
-EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
-EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
-EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
-        beqz    len, done
-        ADD     dst, dst, 4*NBYTES
-less_than_4units:
+        R10KCBARRIER(0(ra))
+EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
+EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
+EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
+EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
+        .set    reorder                 /* DADDI_WAR */
+        ADD     dst, dst, 4*NBYTES
+        beqz    len, .Ldone
+        .set    noreorder
+.Lless_than_4units:
         /*
          * rem = len % NBYTES
          */
-        beq     rem, len, copy_bytes
+        beq     rem, len, .Lcopy_bytes
         nop
 1:
-EXC(    LOAD    t0, 0(src),             l_exc)
+        R10KCBARRIER(0(ra))
+EXC(    LOAD    t0, 0(src),             .Ll_exc)
         ADD     src, src, NBYTES
         SUB     len, len, NBYTES
-EXC(    STORE   t0, 0(dst),             s_exc_p1u)
+EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
+        .set    reorder                 /* DADDI_WAR */
+        ADD     dst, dst, NBYTES
         bne     rem, len, 1b
-        ADD     dst, dst, NBYTES
+        .set    noreorder
 
         /*
          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -299,17 +312,17 @@ EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
          * more instruction-level parallelism.
          */
 #define bits t2
-        beqz    len, done
+        beqz    len, .Ldone
         ADD     t1, dst, len            # t1 is just past last byte of dst
         li      bits, 8*NBYTES
         SLL     rem, len, 3             # rem = number of bits to keep
-EXC(    LOAD    t0, 0(src),             l_exc)
+EXC(    LOAD    t0, 0(src),             .Ll_exc)
         SUB     bits, bits, rem         # bits = number of bits to discard
         SHIFT_DISCARD t0, t0, bits
-EXC(    STREST  t0, -1(t1),             s_exc)
+EXC(    STREST  t0, -1(t1),             .Ls_exc)
         jr      ra
         move    len, zero
-dst_unaligned:
+.Ldst_unaligned:
         /*
          * dst is unaligned
          * t0 = src & ADDRMASK
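A worked example of the bit arithmetic above, assuming a 64-bit kernel where NBYTES is 8:

# Worked example (sketch): NBYTES = 8 and, say, len = 3 bytes remain.
#       rem  = len << 3       = 24      # bits to keep
#       bits = 8*NBYTES - rem = 40      # bits to discard
# SHIFT_DISCARD drops the 40 unwanted bits from the loaded doubleword,
# and the single unaligned STREST ending at -1(t1), the last byte of
# dst, stores exactly the 3 remaining bytes.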
@@ -320,22 +333,23 @@ dst_unaligned:
          * Set match = (src and dst have same alignment)
          */
 #define match rem
-EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
+EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
         ADD     t2, zero, NBYTES
-EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
+EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
         SUB     t2, t2, t1              # t2 = number of bytes copied
         xor     match, t0, t1
-EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
-        beq     len, t2, done
+        R10KCBARRIER(0(ra))
+EXC(    STFIRST t3, FIRST(0)(dst),      .Ls_exc)
+        beq     len, t2, .Ldone
         SUB     len, len, t2
         ADD     dst, dst, t2
-        beqz    match, both_aligned
+        beqz    match, .Lboth_aligned
         ADD     src, src, t2
 
-src_unaligned_dst_aligned:
+.Lsrc_unaligned_dst_aligned:
         SRL     t0, len, LOG_NBYTES+2   # +2 for 4 units/iter
         PREF(   0, 3*32(src) )
-        beqz    t0, cleanup_src_unaligned
+        beqz    t0, .Lcleanup_src_unaligned
         and     rem, len, (4*NBYTES-1)  # rem = len % 4*NBYTES
         PREF(   1, 3*32(dst) )
 1:
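The match test above packs a lot into one XOR; spelled out:

# Sketch of the match test: t0 = src & ADDRMASK and t1 = dst & ADDRMASK
# are the two misalignments, so after
#       xor     match, t0, t1
# match == 0 exactly when src and dst are misaligned by the same amount.
# Copying the first partial word aligns dst, and if match was zero the
# same adjustment aligned src too, so .Lboth_aligned can take over.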
@@ -345,52 +359,59 @@ src_unaligned_dst_aligned:
          * It's OK to load FIRST(N+1) before REST(N) because the two addresses
          * are to the same unit (unless src is aligned, but it's not).
          */
-EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
-EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
+        R10KCBARRIER(0(ra))
+EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
+EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
         SUB     len, len, 4*NBYTES
-EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
-EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
-EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
-EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
-EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
-EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
+EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
+EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
+EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
+EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
+EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
+EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
         PREF(   0, 9*32(src) )          # 0 is PREF_LOAD (not streamed)
         ADD     src, src, 4*NBYTES
 #ifdef CONFIG_CPU_SB1
         nop                             # improves slotting
 #endif
-EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
-EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
-EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
-EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
+EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
+EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
+EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
+EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
         PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
+        .set    reorder                 /* DADDI_WAR */
+        ADD     dst, dst, 4*NBYTES
         bne     len, rem, 1b
-        ADD     dst, dst, 4*NBYTES
+        .set    noreorder
 
-cleanup_src_unaligned:
-        beqz    len, done
+.Lcleanup_src_unaligned:
+        beqz    len, .Ldone
         and     rem, len, NBYTES-1      # rem = len % NBYTES
-        beq     rem, len, copy_bytes
+        beq     rem, len, .Lcopy_bytes
         nop
 1:
-EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
-EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
+        R10KCBARRIER(0(ra))
+EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
+EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
         ADD     src, src, NBYTES
         SUB     len, len, NBYTES
-EXC(    STORE   t0, 0(dst),             s_exc_p1u)
+EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)
+        .set    reorder                 /* DADDI_WAR */
+        ADD     dst, dst, NBYTES
         bne     len, rem, 1b
-        ADD     dst, dst, NBYTES
+        .set    noreorder
 
-copy_bytes_checklen:
-        beqz    len, done
+.Lcopy_bytes_checklen:
+        beqz    len, .Ldone
         nop
-copy_bytes:
+.Lcopy_bytes:
         /* 0 < len < NBYTES */
+        R10KCBARRIER(0(ra))
 #define COPY_BYTE(N)                    \
-EXC(    lb      t0, N(src), l_exc);     \
+EXC(    lb      t0, N(src), .Ll_exc);   \
         SUB     len, len, 1;            \
-        beqz    len, done;              \
-EXC(     sb     t0, N(dst), s_exc_p1)
+        beqz    len, .Ldone;            \
+EXC(     sb     t0, N(dst), .Ls_exc_p1)
 
         COPY_BYTE(0)
         COPY_BYTE(1)
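The `.set reorder`/`.set noreorder` bracketing that now surrounds each pointer update is the heart of the DADDI_WAR changes; the pattern, restated:

# DADDI_WAR pattern (sketch): with CONFIG_CPU_DADDI_WORKAROUNDS, the
# ADD/SUB macros may expand to more than one instruction and so must
# not be placed in a branch delay slot.
#       .set    reorder                 # let the assembler schedule
#       ADD     dst, dst, 4*NBYTES      # may be multiple instructions
#       bne     len, rem, 1b            # assembler fills the delay slot
#       .set    noreorder               # back to manual scheduling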
@@ -400,16 +421,16 @@ EXC(     sb     t0, N(dst), s_exc_p1)
         COPY_BYTE(4)
         COPY_BYTE(5)
 #endif
-EXC(    lb      t0, NBYTES-2(src), l_exc)
+EXC(    lb      t0, NBYTES-2(src), .Ll_exc)
         SUB     len, len, 1
         jr      ra
-EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
-done:
+EXC(     sb     t0, NBYTES-2(dst), .Ls_exc_p1)
+.Ldone:
         jr      ra
         nop
         END(memcpy)
 
-l_exc_copy:
+.Ll_exc_copy:
         /*
          * Copy bytes from src until faulting load address (or until a
          * lb faults)
@@ -424,12 +445,14 @@ l_exc_copy:
         nop
         LOAD    t0, THREAD_BUADDR(t0)
 1:
-EXC(    lb      t1, 0(src),     l_exc)
+EXC(    lb      t1, 0(src),     .Ll_exc)
         ADD     src, src, 1
         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
+        .set    reorder                 /* DADDI_WAR */
+        ADD     dst, dst, 1
         bne     src, t0, 1b
-        ADD     dst, dst, 1
-l_exc:
+        .set    noreorder
+.Ll_exc:
         LOAD    t0, TI_TASK($28)
         nop
         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
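A summary of the load-fault path above, since it is easy to misread:

# Fixup sketch: THREAD_BUADDR holds the faulting address recorded by
# the exception handler.  .Ll_exc_copy replays the copy one byte at a
# time until src reaches that address, then falls into .Ll_exc, which
# computes how much was left, zeroes the untouched tail of dst, and
# returns the not-copied count in len.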
@@ -446,20 +469,33 @@ l_exc:
          * Clear len bytes starting at dst.  Can't call __bzero because it
          * might modify len.  An inefficient loop for these rare times...
          */
-        beqz    len, done
+        .set    reorder                 /* DADDI_WAR */
         SUB     src, len, 1
+        beqz    len, .Ldone
+        .set    noreorder
 1:      sb      zero, 0(dst)
         ADD     dst, dst, 1
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
         bnez    src, 1b
         SUB     src, src, 1
+#else
+        .set    push
+        .set    noat
+        li      v1, 1
+        bnez    src, 1b
+        SUB     src, src, v1
+        .set    pop
+#endif
         jr      ra
         nop
 
 
 #define SEXC(n)                                                 \
-s_exc_p ## n ## u:                                              \
-        jr      ra;                                             \
-        ADD     len, len, n*NBYTES
+        .set    reorder;                /* DADDI_WAR */         \
+.Ls_exc_p ## n ## u:                                            \
+        ADD     len, len, n*NBYTES;                             \
+        jr      ra;                                             \
+        .set    noreorder
 
 SEXC(8)
 SEXC(7)
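Each SEXC(n) stamps out one store-fault landing pad; with the new macro body, SEXC(4) for instance expands to roughly:

#       .set    reorder                 /* DADDI_WAR */
# .Ls_exc_p4u:
#       ADD     len, len, 4*NBYTES      # those 4 units were never stored
#       jr      ra                      # caller sees the correct residue
#       .set    noreorder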
@@ -470,10 +506,12 @@ SEXC(3)
 SEXC(2)
 SEXC(1)
 
-s_exc_p1:
+.Ls_exc_p1:
+        .set    reorder                 /* DADDI_WAR */
+        ADD     len, len, 1
         jr      ra
-        ADD     len, len, 1
-s_exc:
+        .set    noreorder
+.Ls_exc:
         jr      ra
         nop
 
@@ -484,38 +522,44 @@ LEAF(memmove)
         sltu    t0, a1, t0                      # dst + len <= src -> memcpy
         sltu    t1, a0, t1                      # dst >= src + len -> memcpy
         and     t0, t1
-        beqz    t0, __memcpy
+        beqz    t0, .L__memcpy
         move    v0, a0                          /* return value */
-        beqz    a2, r_out
+        beqz    a2, .Lr_out
         END(memmove)
 
         /* fall through to __rmemcpy */
 LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
         sltu    t0, a1, a0
-        beqz    t0, r_end_bytes_up              # src >= dst
+        beqz    t0, .Lr_end_bytes_up            # src >= dst
         nop
         ADD     a0, a2                          # dst = dst + len
         ADD     a1, a2                          # src = src + len
 
-r_end_bytes:
+.Lr_end_bytes:
+        R10KCBARRIER(0(ra))
         lb      t0, -1(a1)
         SUB     a2, a2, 0x1
         sb      t0, -1(a0)
         SUB     a1, a1, 0x1
-        bnez    a2, r_end_bytes
+        .set    reorder                 /* DADDI_WAR */
         SUB     a0, a0, 0x1
+        bnez    a2, .Lr_end_bytes
+        .set    noreorder
 
-r_out:
+.Lr_out:
         jr      ra
         move    a2, zero
 
-r_end_bytes_up:
+.Lr_end_bytes_up:
+        R10KCBARRIER(0(ra))
         lb      t0, (a1)
         SUB     a2, a2, 0x1
         sb      t0, (a0)
         ADD     a1, a1, 0x1
-        bnez    a2, r_end_bytes_up
+        .set    reorder                 /* DADDI_WAR */
         ADD     a0, a0, 0x1
+        bnez    a2, .Lr_end_bytes_up
+        .set    noreorder
 
         jr      ra
         move    a2, zero
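The disjointness test that lets memmove fall through to memcpy, restated:

# Overlap sketch: memmove branches to the forward .L__memcpy when the
# buffers are disjoint, i.e. when
#       dst + len <= src  ||  src + len <= dst
# Both sltu results are ANDed, so t0 == 0 means "no overlap".  When
# they do overlap, __rmemcpy picks a safe direction: backwards from
# the end if dst > src, forwards otherwise.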