aboutsummaryrefslogtreecommitdiffstats
path: root/arch/xtensa/lib
diff options
context:
space:
mode:
authorChris Zankel <chris@zankel.net>2012-10-16 00:41:19 -0400
committerChris Zankel <chris@zankel.net>2012-10-16 00:41:19 -0400
commiteae8a416afe140df4b054c448476654db0d46bde (patch)
treebd5f48ace717e58d535bb3776c13b70360605f46 /arch/xtensa/lib
parentc88d8df0cc69fe0238f2c805a87cc67fb27a43fe (diff)
xtensa: fix memmove(), bcopy(), and memcpy().
- fix memmove to correctly handle overlapping src and dst;
- fix memcpy loop ending conditions from signed '<' (blt) to '!=' (bne);
- modify bcopy to call memmove.

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Chris Zankel <chris@zankel.net>
Diffstat (limited to 'arch/xtensa/lib')
-rw-r--r--arch/xtensa/lib/memcopy.S309
1 files changed, 284 insertions, 25 deletions
diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S
index ea59dcd03866..c48b80acb5f0 100644
--- a/arch/xtensa/lib/memcopy.S
+++ b/arch/xtensa/lib/memcopy.S
@@ -6,7 +6,7 @@
6 * License. See the file "COPYING" in the main directory of this archive 6 * License. See the file "COPYING" in the main directory of this archive
7 * for more details. 7 * for more details.
8 * 8 *
9 * Copyright (C) 2002 - 2005 Tensilica Inc. 9 * Copyright (C) 2002 - 2012 Tensilica Inc.
10 */ 10 */
11 11
12#include <variant/core.h> 12#include <variant/core.h>
@@ -27,14 +27,11 @@
27#endif 27#endif
28 .endm 28 .endm
29 29
30
31/* 30/*
32 * void *memcpy(void *dst, const void *src, size_t len); 31 * void *memcpy(void *dst, const void *src, size_t len);
33 * void *memmove(void *dst, const void *src, size_t len);
34 * void *bcopy(const void *src, void *dst, size_t len);
35 * 32 *
36 * This function is intended to do the same thing as the standard 33 * This function is intended to do the same thing as the standard
37 * library function memcpy() (or bcopy()) for most cases. 34 * library function memcpy() for most cases.
38 * However, where the source and/or destination references 35 * However, where the source and/or destination references
39 * an instruction RAM or ROM or a data RAM or ROM, that 36 * an instruction RAM or ROM or a data RAM or ROM, that
40 * source and/or destination will always be accessed with 37 * source and/or destination will always be accessed with
@@ -45,9 +42,6 @@
45 * !!!!!!! Handling of IRAM/IROM has not yet 42 * !!!!!!! Handling of IRAM/IROM has not yet
46 * !!!!!!! been implemented. 43 * !!!!!!! been implemented.
47 * 44 *
48 * The bcopy version is provided here to avoid the overhead
49 * of an extra call, for callers that require this convention.
50 *
51 * The (general case) algorithm is as follows: 45 * The (general case) algorithm is as follows:
52 * If destination is unaligned, align it by conditionally 46 * If destination is unaligned, align it by conditionally
53 * copying 1 and 2 bytes. 47 * copying 1 and 2 bytes.
@@ -76,17 +70,6 @@
76 */ 70 */
77 71
78 .text 72 .text
79 .align 4
80 .global bcopy
81 .type bcopy,@function
82bcopy:
83 entry sp, 16 # minimal stack frame
84 # a2=src, a3=dst, a4=len
85 mov a5, a3 # copy dst so that a2 is return value
86 mov a3, a2
87 mov a2, a5
88 j .Lcommon # go to common code for memcpy+bcopy
89
90 73
91/* 74/*
92 * Byte by byte copy 75 * Byte by byte copy
@@ -107,7 +90,7 @@ bcopy:
107 s8i a6, a5, 0 90 s8i a6, a5, 0
108 addi a5, a5, 1 91 addi a5, a5, 1
109#if !XCHAL_HAVE_LOOPS 92#if !XCHAL_HAVE_LOOPS
110 blt a3, a7, .Lnextbyte 93 bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
111#endif /* !XCHAL_HAVE_LOOPS */ 94#endif /* !XCHAL_HAVE_LOOPS */
112.Lbytecopydone: 95.Lbytecopydone:
113 retw 96 retw
@@ -144,9 +127,6 @@ bcopy:
144 .global memcpy 127 .global memcpy
145 .type memcpy,@function 128 .type memcpy,@function
146memcpy: 129memcpy:
147 .global memmove
148 .type memmove,@function
149memmove:
150 130
151 entry sp, 16 # minimal stack frame 131 entry sp, 16 # minimal stack frame
152 # a2/ dst, a3/ src, a4/ len 132 # a2/ dst, a3/ src, a4/ len
@@ -182,7 +162,7 @@ memmove:
182 s32i a7, a5, 12 162 s32i a7, a5, 12
183 addi a5, a5, 16 163 addi a5, a5, 16
184#if !XCHAL_HAVE_LOOPS 164#if !XCHAL_HAVE_LOOPS
185 blt a3, a8, .Loop1 165 bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end
186#endif /* !XCHAL_HAVE_LOOPS */ 166#endif /* !XCHAL_HAVE_LOOPS */
187.Loop1done: 167.Loop1done:
188 bbci.l a4, 3, .L2 168 bbci.l a4, 3, .L2
@@ -260,7 +240,7 @@ memmove:
260 s32i a9, a5, 12 240 s32i a9, a5, 12
261 addi a5, a5, 16 241 addi a5, a5, 16
262#if !XCHAL_HAVE_LOOPS 242#if !XCHAL_HAVE_LOOPS
263 blt a3, a10, .Loop2 243 bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
264#endif /* !XCHAL_HAVE_LOOPS */ 244#endif /* !XCHAL_HAVE_LOOPS */
265.Loop2done: 245.Loop2done:
266 bbci.l a4, 3, .L12 246 bbci.l a4, 3, .L12
@@ -305,6 +285,285 @@ memmove:
305 l8ui a6, a3, 0 285 l8ui a6, a3, 0
306 s8i a6, a5, 0 286 s8i a6, a5, 0
307 retw 287 retw
288
289
290/*
291 * void bcopy(const void *src, void *dest, size_t n);
292 */
293 .align 4
294 .global bcopy
295 .type bcopy,@function
296bcopy:
297 entry sp, 16 # minimal stack frame
298 # a2=src, a3=dst, a4=len; swapped below into memmove order (a2=dst, a3=src)
299 mov a5, a3 # a5 = dst (a5 is the working dst pointer at .Lmovecommon)
300 mov a3, a2 # a3 = src
301 mov a2, a5 # a2 = dst
302 j .Lmovecommon # go to common code for memmove+bcopy
303
304/*
305 * void *memmove(void *dst, const void *src, size_t len);
306 *
307 * This function is intended to do the same thing as the standard
308 * library function memmove() for most cases.
309 * However, where the source and/or destination references
310 * an instruction RAM or ROM or a data RAM or ROM, that
311 * source and/or destination will always be accessed with
312 * 32-bit load and store instructions (as required for these
313 * types of devices).
314 *
315 * !!!!!!! XTFIXME:
316 * !!!!!!! Handling of IRAM/IROM has not yet
317 * !!!!!!! been implemented.
318 *
319 * The (general case) algorithm is as follows:
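320 * If dst does not lie inside [src, src + len), i.e. the unsigned difference dst - src is >= len, a forward copy is safe: use memcpy.
321 * Otherwise copy backwards, starting from the end of both buffers.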
322 *
323 * Register use:
324 * a0/ return address
325 * a1/ stack pointer
326 * a2/ return value
327 * a3/ src
328 * a4/ length
329 * a5/ dst
330 * a6/ tmp
331 * a7/ tmp
332 * a8/ tmp
333 * a9/ tmp
334 * a10/ tmp
335 * a11/ tmp
336 */
337
338/*
339 * Byte by byte copy, backwards: a3 (src) and a5 (dst) point just past the next byte to copy, a4 = bytes left
340 */
341 .align 4
342 .byte 0 # 1 mod 4 alignment for LOOPNEZ
343 # (0 mod 4 alignment for LBEG)
344.Lbackbytecopy:
345#if XCHAL_HAVE_LOOPS
346 loopnez a4, .Lbackbytecopydone # loop body runs a4 times; skipped entirely when a4 == 0
347#else /* !XCHAL_HAVE_LOOPS */
348 beqz a4, .Lbackbytecopydone # nothing left to copy
349 sub a7, a3, a4 # a7 = start address for source
350#endif /* !XCHAL_HAVE_LOOPS */
351.Lbacknextbyte:
352 addi a3, a3, -1 # pre-decrement src
353 l8ui a6, a3, 0 # a6 = *src
354 addi a5, a5, -1 # pre-decrement dst
355 s8i a6, a5, 0 # *dst = a6
356#if !XCHAL_HAVE_LOOPS
357 bne a3, a7, .Lbacknextbyte # continue loop if
358 # $a3:src != $a7:src_start
359#endif /* !XCHAL_HAVE_LOOPS */
360.Lbackbytecopydone:
361 retw
362
363/*
364 * Destination is unaligned: copy 1 and/or 2 bytes backwards until the dst end pointer (a5) is word-aligned
365 */
366
367 .align 4
368.Lbackdst1mod2: # dst is only byte aligned
369 _bltui a4, 7, .Lbackbytecopy # do short copies byte by byte
370
371 # copy 1 byte
372 addi a3, a3, -1 # pre-decrement src
373 l8ui a6, a3, 0
374 addi a5, a5, -1 # pre-decrement dst; bit 0 of a5 is now clear
375 s8i a6, a5, 0
376 addi a4, a4, -1 # one byte fewer remaining
377 _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
378 # return to main algorithm
379.Lbackdst2mod4: # dst 16-bit aligned
380 # copy 2 bytes
381 _bltui a4, 6, .Lbackbytecopy # do short copies byte by byte
382 addi a3, a3, -2 # step src back over the two bytes
383 l8ui a6, a3, 0 # two single-byte loads (src alignment is only tested later,
384 l8ui a7, a3, 1 # at .Lbackdstaligned)
385 addi a5, a5, -2 # step dst back over the two bytes
386 s8i a6, a5, 0
387 s8i a7, a5, 1
388 addi a4, a4, -2 # two bytes fewer remaining
389 j .Lbackdstaligned # dst is now aligned,
390 # return to main algorithm
391
392 .align 4
393 .global memmove
394 .type memmove,@function
395memmove:
396
397 entry sp, 16 # minimal stack frame
398 # a2/ dst, a3/ src, a4/ len
399 mov a5, a2 # copy dst so that a2 is return value
400.Lmovecommon: # bcopy joins here with a2 = a5 = dst, a3 = src, a4 = len
401 sub a6, a5, a3 # a6 = dst - src
402 bgeu a6, a4, .Lcommon # unsigned (dst - src) >= len: take the forward path (.Lcommon is defined outside this view)
403
404 add a5, a5, a4 # a5 = dst + len (one past the last dst byte)
405 add a3, a3, a4 # a3 = src + len (one past the last src byte)
406
407 _bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
408 _bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
409.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
410 srli a7, a4, 4 # number of loop iterations with 16B
411 # per iteration
412 movi a8, 3 # if source is not aligned,
413 _bany a3, a8, .Lbacksrcunaligned # then use shifting copy
414 /*
415 * Destination and source are word-aligned, use word copy.
416 */
417 # copy 16 bytes per iteration for word-aligned dst and word-aligned src
418#if XCHAL_HAVE_LOOPS
419 loopnez a7, .backLoop1done # loop body runs a7 times; skipped when a7 == 0
420#else /* !XCHAL_HAVE_LOOPS */
421 beqz a7, .backLoop1done # fewer than 16 bytes left: go straight to the tail
422 slli a8, a7, 4 # a8 = bytes covered by the 16B loop
423 sub a8, a3, a8 # a8 = start of first 16B source chunk
424#endif /* !XCHAL_HAVE_LOOPS */
425.backLoop1: # NOTE(review): no ".L" prefix, unlike the other labels here; confirm this is intended
426 addi a3, a3, -16 # step src back one 16-byte chunk
427 l32i a7, a3, 12 # a7 is reused as data: it is not read as a count again
428 l32i a6, a3, 8
429 addi a5, a5, -16 # step dst back one 16-byte chunk
430 s32i a7, a5, 12 # words are stored from highest offset to lowest
431 l32i a7, a3, 4
432 s32i a6, a5, 8
433 l32i a6, a3, 0
434 s32i a7, a5, 4
435 s32i a6, a5, 0
436#if !XCHAL_HAVE_LOOPS
437 bne a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
438#endif /* !XCHAL_HAVE_LOOPS */
439.backLoop1done:
440 bbci.l a4, 3, .Lback2 # bit 3 of remaining len clear: no 8-byte piece
441 # copy 8 bytes
442 addi a3, a3, -8
443 l32i a6, a3, 0
444 l32i a7, a3, 4
445 addi a5, a5, -8
446 s32i a6, a5, 0
447 s32i a7, a5, 4
448.Lback2:
449 bbsi.l a4, 2, .Lback3 # bit 2 set: 4-byte piece
450 bbsi.l a4, 1, .Lback4 # bit 1 set: 2-byte piece
451 bbsi.l a4, 0, .Lback5 # bit 0 set: final single byte
452 retw
453.Lback3:
454 # copy 4 bytes
455 addi a3, a3, -4
456 l32i a6, a3, 0
457 addi a5, a5, -4
458 s32i a6, a5, 0
459 bbsi.l a4, 1, .Lback4 # bit 1 set: 2-byte piece
460 bbsi.l a4, 0, .Lback5 # bit 0 set: final single byte
461 retw
462.Lback4:
463 # copy 2 bytes
464 addi a3, a3, -2
465 l16ui a6, a3, 0 # 16-bit load
466 addi a5, a5, -2
467 s16i a6, a5, 0 # 16-bit store
468 bbsi.l a4, 0, .Lback5 # bit 0 set: final single byte
469 retw
470.Lback5:
471 # copy 1 byte
472 addi a3, a3, -1
473 l8ui a6, a3, 0
474 addi a5, a5, -1
475 s8i a6, a5, 0
476 retw
477
478/*
479 * Destination is aligned, Source is unaligned: backward shifting copy built from whole source words
480 */
481
482 .align 4
483.Lbacksrcunaligned:
484 _beqz a4, .Lbackdone # avoid loading anything for zero-length copies
485 # copy 16 bytes per iteration for word-aligned dst and unaligned src
486 ssa8 a3 # set shift amount from byte offset
487#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with
488 * the lint or ferret client, or 0
489 * to save a few cycles */
490#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
491 and a11, a3, a8 # save unalignment offset for below (a8 == 3, set at .Lbackdstaligned)
492 sub a3, a3, a11 # align a3
493#endif
494 l32i a6, a3, 0 # load first word
495#if XCHAL_HAVE_LOOPS
496 loopnez a7, .backLoop2done # loop body runs a7 times; skipped when a7 == 0
497#else /* !XCHAL_HAVE_LOOPS */
498 beqz a7, .backLoop2done # fewer than 16 bytes left: go straight to the tail
499 slli a10, a7, 4 # a10 = bytes covered by the 16B loop
500 sub a10, a3, a10 # a10 = start of first 16B source chunk
501#endif /* !XCHAL_HAVE_LOOPS */
502.backLoop2: # on entry a6 holds the word loaded from the current a3
503 addi a3, a3, -16 # step src back one 16-byte chunk
504 l32i a7, a3, 12 # a7 is reused as data: it is not read as a count again
505 l32i a8, a3, 8
506 addi a5, a5, -16 # step dst back one 16-byte chunk
507 src_b a6, a7, a6 # combine a7 (lower address) with a6 (higher address); src_b is defined outside this view
508 s32i a6, a5, 12
509 l32i a9, a3, 4
510 src_b a7, a8, a7
511 s32i a7, a5, 8
512 l32i a6, a3, 0 # lowest word of the chunk; carried in a6 to the next step
513 src_b a8, a9, a8
514 s32i a8, a5, 4
515 src_b a9, a6, a9
516 s32i a9, a5, 0
517#if !XCHAL_HAVE_LOOPS
518 bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
519#endif /* !XCHAL_HAVE_LOOPS */
520.backLoop2done:
521 bbci.l a4, 3, .Lback12 # bit 3 of remaining len clear: no 8-byte piece
522 # copy 8 bytes
523 addi a3, a3, -8
524 l32i a7, a3, 4
525 l32i a8, a3, 0
526 addi a5, a5, -8
527 src_b a6, a7, a6
528 s32i a6, a5, 4
529 src_b a7, a8, a7
530 s32i a7, a5, 0
531 mov a6, a8 # keep the lowest loaded word in a6 for the next step
532.Lback12:
533 bbci.l a4, 2, .Lback13 # bit 2 clear: no 4-byte piece
534 # copy 4 bytes
535 addi a3, a3, -4
536 l32i a7, a3, 0
537 addi a5, a5, -4
538 src_b a6, a7, a6
539 s32i a6, a5, 0
540 mov a6, a7 # keep the lowest loaded word in a6
541.Lback13:
542#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
543 add a3, a3, a11 # readjust a3 with correct misalignment
544#endif
545 bbsi.l a4, 1, .Lback14 # bit 1 set: 2-byte piece
546 bbsi.l a4, 0, .Lback15 # bit 0 set: final single byte
547.Lbackdone:
548 retw
549.Lback14:
550 # copy 2 bytes
551 addi a3, a3, -2
552 l8ui a6, a3, 0 # the remaining tail is copied with byte loads/stores
553 l8ui a7, a3, 1
554 addi a5, a5, -2
555 s8i a6, a5, 0
556 s8i a7, a5, 1
557 bbsi.l a4, 0, .Lback15 # bit 0 set: final single byte
558 retw
559.Lback15:
560 # copy 1 byte
561 addi a3, a3, -1
562 addi a5, a5, -1
563 l8ui a6, a3, 0
564 s8i a6, a5, 0
565 retw
566
308 567
309/* 568/*
310 * Local Variables: 569 * Local Variables: