xtensa: fix memmove(), bcopy(), and memcpy().

- fix memmove to correctly handle overlapping src and dst;
- change the memcpy loop-continuation conditions from signed '<' (blt) to '!=' (bne);
- modify bcopy to call memmove.

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Chris Zankel <chris@zankel.net>
author: Chris Zankel <chris@zankel.net> 2012-10-16 00:41:19 -0400
committer: Chris Zankel <chris@zankel.net> 2012-10-16 00:41:19 -0400
commit: eae8a416afe140df4b054c448476654db0d46bde (patch)
tree: bd5f48ace717e58d535bb3776c13b70360605f46 /arch/xtensa/lib
parent: c88d8df0cc69fe0238f2c805a87cc67fb27a43fe (diff)
1 files changed, 284 insertions, 25 deletions
diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S
index ea59dcd03866..c48b80acb5f0 100644
--- a/arch/xtensa/lib/memcopy.S
+++ b/arch/xtensa/lib/memcopy.S
@@ -6,7 +6,7 @@
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
- * Copyright (C) 2002 - 2005 Tensilica Inc.
+ * Copyright (C) 2002 - 2012 Tensilica Inc.
 */
 #include <variant/core.h>
@@ -27,14 +27,11 @@
 #endif
        .endm
 /*
 * void *memcpy(void *dst, const void *src, size_t len);
- * void *memmove(void *dst, const void *src, size_t len);
- * void *bcopy(const void *src, void *dst, size_t len);
 *
 * This function is intended to do the same thing as the standard
- * library function memcpy() (or bcopy()) for most cases.
+ * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
@@ -45,9 +42,6 @@
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
- * The bcopy version is provided here to avoid the overhead
- * of an extra call, for callers that require this convention.
- *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
@@ -76,17 +70,6 @@
 */
        .text
-        .align  4
-        .global bcopy
-        .type   bcopy,@function
-bcopy:
-        entry   sp, 16          # minimal stack frame
-        # a2=src, a3=dst, a4=len
-        mov     a5, a3          # copy dst so that a2 is return value
-        mov     a3, a2
-        mov     a2, a5
-        j       .Lcommon        # go to common code for memcpy+bcopy
 /*
 * Byte by byte copy
@@ -107,7 +90,7 @@ bcopy:
        s8i     a6, a5, 0
        addi    a5, a5, 1
 #if !XCHAL_HAVE_LOOPS
-        blt     a3, a7, .Lnextbyte
+        bne     a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
 #endif /* !XCHAL_HAVE_LOOPS */
 .Lbytecopydone:
        retw
@@ -144,9 +127,6 @@ bcopy:
        .global memcpy
        .type   memcpy,@function
 memcpy:
-        .global memmove
-        .type   memmove,@function
-memmove:
        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
@@ -182,7 +162,7 @@ memmove:
        s32i    a7, a5, 12
        addi    a5, a5, 16
 #if !XCHAL_HAVE_LOOPS
-        blt     a3, a8, .Loop1
+        bne     a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop1done:
        bbci.l  a4, 3, .L2
@@ -260,7 +240,7 @@ memmove:
        s32i    a9, a5, 12
        addi    a5, a5, 16
 #if !XCHAL_HAVE_LOOPS
-        blt     a3, a10, .Loop2
+        bne     a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop2done:
        bbci.l  a4, 3, .L12
@@ -305,6 +285,285 @@ memmove:
        l8ui    a6, a3,  0
        s8i     a6, a5,  0
        retw
+/*
+ * void bcopy(const void *src, void *dest, size_t n);
+ */
+        .align  4
+        .global bcopy
+        .type   bcopy,@function
+bcopy:
+        entry   sp, 16          # minimal stack frame
+        # in: a2=src, a3=dst, a4=len; rearranged below into memmove order
+        mov     a5, a3          # a5 = dst (working dst pointer at .Lmovecommon)
+        mov     a3, a2          # a3 = src
+        mov     a2, a5          # a2 = dst, as on the memmove entry path
+        j       .Lmovecommon    # go to common code for memmove+bcopy
+/*
+ * void *memmove(void *dst, const void *src, size_t len);
+ *
+ * This function is intended to do the same thing as the standard
+ * library function memmove() for most cases.
+ * However, where the source and/or destination references
+ * an instruction RAM or ROM or a data RAM or ROM, that
+ * source and/or destination will always be accessed with
+ * 32-bit load and store instructions (as required for these
+ * types of devices).
+ *
+ * !!!!!!!  XTFIXME:
+ * !!!!!!!  Handling of IRAM/IROM has not yet
+ * !!!!!!!  been implemented.
+ *
+ * The (general case) algorithm is as follows:
+ *   If dst - src >= len (unsigned), a forward copy is safe: use memcpy.
+ *   Otherwise copy backwards, starting from the end of both buffers.
+ *
+ * Register use:
+ *      a0/ return address
+ *      a1/ stack pointer
+ *      a2/ return value
+ *      a3/ src
+ *      a4/ length
+ *      a5/ dst
+ *      a6/ tmp
+ *      a7/ tmp
+ *      a8/ tmp
+ *      a9/ tmp
+ *      a10/ tmp
+ *      a11/ tmp
+ */
+/*
+ * Backward byte copy of a4 bytes; a3/a5 = one past the bytes still to copy.
+ */
+        .align  4
+        .byte   0               # 1 mod 4 alignment for LOOPNEZ
+                                # (0 mod 4 alignment for LBEG)
+.Lbackbytecopy:
+#if XCHAL_HAVE_LOOPS
+        loopnez a4, .Lbackbytecopydone  # loop a4 times; skip if a4 == 0
+#else /* !XCHAL_HAVE_LOOPS */
+        beqz    a4, .Lbackbytecopydone  # nothing to copy
+        sub     a7, a3, a4      # a7 = start address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbacknextbyte:
+        addi    a3, a3, -1      # step src back to the next byte to read
+        l8ui    a6, a3, 0       # a6 = that source byte
+        addi    a5, a5, -1      # step dst back to the matching byte
+        s8i     a6, a5, 0       # store it
+#if !XCHAL_HAVE_LOOPS
+        bne     a3, a7, .Lbacknextbyte # continue loop if
+                                       # $a3:src != $a7:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbackbytecopydone:
+        retw
+/*
+ * End of destination (a5 = dst + len) is unaligned: copy 1 and/or 2 bytes.
+ */
+        .align  4
+.Lbackdst1mod2: # dst is only byte aligned
+        _bltui  a4, 7, .Lbackbytecopy   # do short copies byte by byte
+        # copy 1 byte
+        addi    a3, a3, -1      # step src back one byte
+        l8ui    a6, a3,  0
+        addi    a5, a5, -1      # step dst back one byte; bit 0 of a5 is now 0
+        s8i     a6, a5,  0
+        addi    a4, a4, -1      # one byte fewer left to copy
+        _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
+                                        # return to main algorithm
+.Lbackdst2mod4: # dst 16-bit aligned
+        # copy 2 bytes
+        _bltui  a4, 6, .Lbackbytecopy   # do short copies byte by byte
+        addi    a3, a3, -2      # step src back two bytes
+        l8ui    a6, a3,  0      # byte loads: no alignment needed on src
+        l8ui    a7, a3,  1
+        addi    a5, a5, -2      # step dst back two bytes
+        s8i     a6, a5,  0
+        s8i     a7, a5,  1
+        addi    a4, a4, -2      # two bytes fewer left to copy
+        j       .Lbackdstaligned        # dst is now aligned,
+                                        # return to main algorithm
+        .align  4
+        .global memmove
+        .type   memmove,@function
+memmove:
+        entry   sp, 16          # minimal stack frame
+        # a2/ dst, a3/ src, a4/ len
+        mov     a5, a2          # copy dst so that a2 is return value
+.Lmovecommon:                   # bcopy joins here with a5=dst, a3=src, a4=len
+        sub     a6, a5, a3      # a6 = dst - src (wraps when dst < src)
+        bgeu    a6, a4, .Lcommon # unsigned dst - src >= len: copy forwards
+        add     a5, a5, a4      # a5 = dst + len (one past the end of dst)
+        add     a3, a3, a4      # a3 = src + len (one past the end of src)
+        _bbsi.l a5, 0, .Lbackdst1mod2   # if dst is 1 mod 2
+        _bbsi.l a5, 1, .Lbackdst2mod4   # if dst is 2 mod 4
+.Lbackdstaligned:       # return here from .Lbackdst?mod? once dst is aligned
+        srli    a7, a4, 4       # number of loop iterations with 16B
+                                # per iteration
+        movi    a8, 3           # if source is not aligned,
+        _bany   a3, a8, .Lbacksrcunaligned      # then use shifting copy
+        /*
+         * Destination and source are word-aligned, use word copy.
+         */
+        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+        loopnez a7, .backLoop1done
+#else /* !XCHAL_HAVE_LOOPS */
+        beqz    a7, .backLoop1done      # fewer than 16 bytes: tail only
+        slli    a8, a7, 4               # a8 = bytes covered by the 16B loop
+        sub     a8, a3, a8      # a8 = src address where the 16B loop stops
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1:             # NOTE(review): no .L prefix, unlike the other labels
+        addi    a3, a3, -16     # step src back one 16B chunk
+        l32i    a7, a3, 12      # a7 is reused as a data temporary from here on
+        l32i    a6, a3,  8
+        addi    a5, a5, -16     # step dst back one 16B chunk
+        s32i    a7, a5, 12      # words are moved highest address first
+        l32i    a7, a3,  4
+        s32i    a6, a5,  8
+        l32i    a6, a3,  0
+        s32i    a7, a5,  4
+        s32i    a6, a5,  0
+#if !XCHAL_HAVE_LOOPS
+        bne     a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1done:
+        bbci.l  a4, 3, .Lback2  # bit 3 of a4 clear: no 8-byte piece
+        # copy 8 bytes
+        addi    a3, a3, -8
+        l32i    a6, a3,  0      # both words are loaded before either store
+        l32i    a7, a3,  4
+        addi    a5, a5, -8
+        s32i    a6, a5,  0
+        s32i    a7, a5,  4
+.Lback2:
+        bbsi.l  a4, 2, .Lback3  # bit 2 of a4 set: 4 more bytes
+        bbsi.l  a4, 1, .Lback4  # bit 1 of a4 set: 2 more bytes
+        bbsi.l  a4, 0, .Lback5  # bit 0 of a4 set: 1 more byte
+        retw                    # no tail bytes left
+.Lback3:
+        # copy 4 bytes
+        addi    a3, a3, -4
+        l32i    a6, a3,  0
+        addi    a5, a5, -4
+        s32i    a6, a5,  0
+        bbsi.l  a4, 1, .Lback4  # bit 1 of a4 set: 2 more bytes
+        bbsi.l  a4, 0, .Lback5  # bit 0 of a4 set: 1 more byte
+        retw
+.Lback4:
+        # copy 2 bytes
+        addi    a3, a3, -2
+        l16ui   a6, a3,  0      # 16-bit access: src and dst are aligned here
+        addi    a5, a5, -2
+        s16i    a6, a5,  0
+        bbsi.l  a4, 0, .Lback5  # bit 0 of a4 set: 1 more byte
+        retw
+.Lback5:
+        # copy 1 byte
+        addi    a3, a3, -1
+        l8ui    a6, a3,  0
+        addi    a5, a5, -1
+        s8i     a6, a5,  0
+        retw
+/*
+ * Destination is aligned, Source is unaligned (backward copy; a3/a5 = ends)
+ */
+        .align  4
+.Lbacksrcunaligned:
+        _beqz   a4, .Lbackdone  # avoid loading anything for zero-length copies
+        # copy 16 bytes per iteration for word-aligned dst and unaligned src
+        ssa8    a3              # set shift amount from byte offset
+#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS with
+                                         * the lint or ferret client, or 0
+                                         * to save a few cycles */
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+        and     a11, a3, a8     # save unalignment offset (a8 is still 3)
+        sub     a3, a3, a11     # align a3
+#endif
+        l32i    a6, a3, 0       # load first word
+#if XCHAL_HAVE_LOOPS
+        loopnez a7, .backLoop2done
+#else /* !XCHAL_HAVE_LOOPS */
+        beqz    a7, .backLoop2done      # fewer than 16 bytes: tail only
+        slli    a10, a7, 4              # a10 = bytes covered by the 16B loop
+        sub     a10, a3, a10    # a10 = src address where the 16B loop stops
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2:             # NOTE(review): no .L prefix, unlike the other labels
+        addi    a3, a3, -16     # step src back one 16B chunk
+        l32i    a7, a3, 12      # a7 is reused as a data temporary from here on
+        l32i    a8, a3,  8      # a8 (the mask 3) is overwritten; a11 holds it
+        addi    a5, a5, -16     # step dst back one 16B chunk
+        src_b   a6, a7, a6      # src_b macro is defined outside this excerpt
+        s32i    a6, a5, 12      # store the word src_b formed from a7 and a6
+        l32i    a9, a3,  4
+        src_b   a7, a8, a7
+        s32i    a7, a5,  8
+        l32i    a6, a3,  0      # lowest word; kept in a6 for the next step
+        src_b   a8, a9, a8
+        s32i    a8, a5,  4
+        src_b   a9, a6, a9
+        s32i    a9, a5,  0
+#if !XCHAL_HAVE_LOOPS
+        bne     a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2done:
+        bbci.l  a4, 3, .Lback12 # bit 3 of a4 clear: no 8-byte piece
+        # copy 8 bytes
+        addi    a3, a3, -8
+        l32i    a7, a3,  4
+        l32i    a8, a3,  0
+        addi    a5, a5, -8
+        src_b   a6, a7, a6
+        s32i    a6, a5,  4
+        src_b   a7, a8, a7
+        s32i    a7, a5,  0
+        mov     a6, a8          # keep the lowest loaded word in a6
+.Lback12:
+        bbci.l  a4, 2, .Lback13 # bit 2 of a4 clear: no 4-byte piece
+        # copy 4 bytes
+        addi    a3, a3, -4
+        l32i    a7, a3,  0
+        addi    a5, a5, -4
+        src_b   a6, a7, a6
+        s32i    a6, a5,  0
+        mov     a6, a7          # keep the lowest loaded word in a6
+.Lback13:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+        add     a3, a3, a11     # readjust a3 with correct misalignment
+#endif
+        bbsi.l  a4, 1, .Lback14 # bit 1 of a4 set: 2 more bytes
+        bbsi.l  a4, 0, .Lback15 # bit 0 of a4 set: 1 more byte
+.Lbackdone:
+        retw
+.Lback14:
+        # copy 2 bytes
+        addi    a3, a3, -2
+        l8ui    a6, a3,  0      # byte accesses: a3 is not word-aligned here
+        l8ui    a7, a3,  1
+        addi    a5, a5, -2
+        s8i     a6, a5,  0
+        s8i     a7, a5,  1
+        bbsi.l  a4, 0, .Lback15 # bit 0 of a4 set: 1 more byte
+        retw
+.Lback15:
+        # copy 1 byte
+        addi    a3, a3, -1
+        addi    a5, a5, -1
+        l8ui    a6, a3,  0
+        s8i     a6, a5,  0
+        retw
 /*
 * Local Variables:
author	Chris Zankel <chris@zankel.net>	2012-10-16 00:41:19 -0400
committer	Chris Zankel <chris@zankel.net>	2012-10-16 00:41:19 -0400
commit	eae8a416afe140df4b054c448476654db0d46bde (patch)
tree	bd5f48ace717e58d535bb3776c13b70360605f46 /arch/xtensa/lib
parent	c88d8df0cc69fe0238f2c805a87cc67fb27a43fe (diff)

diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S
index ea59dcd03866..c48b80acb5f0 100644
--- a/arch/xtensa/lib/memcopy.S
+++ b/arch/xtensa/lib/memcopy.S
@@ -6,7 +6,7 @@
6	* License. See the file "COPYING" in the main directory of this archive	6	* License. See the file "COPYING" in the main directory of this archive
7	* for more details.	7	* for more details.
8	*	8	*
9	* Copyright (C) 2002 - 2005 Tensilica Inc.	9	* Copyright (C) 2002 - 2012 Tensilica Inc.
10	*/	10	*/
11		11
12	#include <variant/core.h>	12	#include <variant/core.h>
@@ -27,14 +27,11 @@
27	#endif	27	#endif
28	.endm	28	.endm
29		29
30
31	/*	30	/*
32	* void *memcpy(void *dst, const void *src, size_t len);	31	* void *memcpy(void *dst, const void *src, size_t len);
33	* void *memmove(void *dst, const void *src, size_t len);
34	* void *bcopy(const void *src, void *dst, size_t len);
35	*	32	*
36	* This function is intended to do the same thing as the standard	33	* This function is intended to do the same thing as the standard
37	* library function memcpy() (or bcopy()) for most cases.	34	* library function memcpy() for most cases.
38	* However, where the source and/or destination references	35	* However, where the source and/or destination references
39	* an instruction RAM or ROM or a data RAM or ROM, that	36	* an instruction RAM or ROM or a data RAM or ROM, that
40	* source and/or destination will always be accessed with	37	* source and/or destination will always be accessed with
@@ -45,9 +42,6 @@
45	* !!!!!!! Handling of IRAM/IROM has not yet	42	* !!!!!!! Handling of IRAM/IROM has not yet
46	* !!!!!!! been implemented.	43	* !!!!!!! been implemented.
47	*	44	*
48	* The bcopy version is provided here to avoid the overhead
49	* of an extra call, for callers that require this convention.
50	*
51	* The (general case) algorithm is as follows:	45	* The (general case) algorithm is as follows:
52	* If destination is unaligned, align it by conditionally	46	* If destination is unaligned, align it by conditionally
53	* copying 1 and 2 bytes.	47	* copying 1 and 2 bytes.
@@ -76,17 +70,6 @@
76	*/	70	*/
77		71
78	.text	72	.text
79	.align 4
80	.global bcopy
81	.type bcopy,@function
82	bcopy:
83	entry sp, 16 # minimal stack frame
84	# a2=src, a3=dst, a4=len
85	mov a5, a3 # copy dst so that a2 is return value
86	mov a3, a2
87	mov a2, a5
88	j .Lcommon # go to common code for memcpy+bcopy
89
90		73
91	/*	74	/*
92	* Byte by byte copy	75	* Byte by byte copy
@@ -107,7 +90,7 @@ bcopy:
107	s8i a6, a5, 0	90	s8i a6, a5, 0
108	addi a5, a5, 1	91	addi a5, a5, 1
109	#if !XCHAL_HAVE_LOOPS	92	#if !XCHAL_HAVE_LOOPS
110	blt a3, a7, .Lnextbyte	93	bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
111	#endif /* !XCHAL_HAVE_LOOPS */	94	#endif /* !XCHAL_HAVE_LOOPS */
112	.Lbytecopydone:	95	.Lbytecopydone:
113	retw	96	retw
@@ -144,9 +127,6 @@ bcopy:
144	.global memcpy	127	.global memcpy
145	.type memcpy,@function	128	.type memcpy,@function
146	memcpy:	129	memcpy:
147	.global memmove
148	.type memmove,@function
149	memmove:
150		130
151	entry sp, 16 # minimal stack frame	131	entry sp, 16 # minimal stack frame
152	# a2/ dst, a3/ src, a4/ len	132	# a2/ dst, a3/ src, a4/ len
@@ -182,7 +162,7 @@ memmove:
182	s32i a7, a5, 12	162	s32i a7, a5, 12
183	addi a5, a5, 16	163	addi a5, a5, 16
184	#if !XCHAL_HAVE_LOOPS	164	#if !XCHAL_HAVE_LOOPS
185	blt a3, a8, .Loop1	165	bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end
186	#endif /* !XCHAL_HAVE_LOOPS */	166	#endif /* !XCHAL_HAVE_LOOPS */
187	.Loop1done:	167	.Loop1done:
188	bbci.l a4, 3, .L2	168	bbci.l a4, 3, .L2
@@ -260,7 +240,7 @@ memmove:
260	s32i a9, a5, 12	240	s32i a9, a5, 12
261	addi a5, a5, 16	241	addi a5, a5, 16
262	#if !XCHAL_HAVE_LOOPS	242	#if !XCHAL_HAVE_LOOPS
263	blt a3, a10, .Loop2	243	bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
264	#endif /* !XCHAL_HAVE_LOOPS */	244	#endif /* !XCHAL_HAVE_LOOPS */
265	.Loop2done:	245	.Loop2done:
266	bbci.l a4, 3, .L12	246	bbci.l a4, 3, .L12
@@ -305,6 +285,285 @@ memmove:
305	l8ui a6, a3, 0	285	l8ui a6, a3, 0
306	s8i a6, a5, 0	286	s8i a6, a5, 0
307	retw	287	retw
		288
		289
		290	/*
		291	* void bcopy(const void *src, void *dest, size_t n);
		292	*/
		293	.align 4
		294	.global bcopy
		295	.type bcopy,@function
		296	bcopy:
		297	entry sp, 16 # minimal stack frame
		298	# a2=src, a3=dst, a4=len
		299	mov a5, a3
		300	mov a3, a2
		301	mov a2, a5
		302	j .Lmovecommon # go to common code for memmove+bcopy
		303
		304	/*
		305	* void *memmove(void *dst, const void *src, size_t len);
		306	*
		307	* This function is intended to do the same thing as the standard
		308	* library function memmove() for most cases.
		309	* However, where the source and/or destination references
		310	* an instruction RAM or ROM or a data RAM or ROM, that
		311	* source and/or destination will always be accessed with
		312	* 32-bit load and store instructions (as required for these
		313	* types of devices).
		314	*
		315	* !!!!!!! XTFIXME:
		316	* !!!!!!! Handling of IRAM/IROM has not yet
		317	* !!!!!!! been implemented.
		318	*
		319	* The (general case) algorithm is as follows:
		320	* If end of source doesn't overlap destination then use memcpy.
		321	* Otherwise do memcpy backwards.
		322	*
		323	* Register use:
		324	* a0/ return address
		325	* a1/ stack pointer
		326	* a2/ return value
		327	* a3/ src
		328	* a4/ length
		329	* a5/ dst
		330	* a6/ tmp
		331	* a7/ tmp
		332	* a8/ tmp
		333	* a9/ tmp
		334	* a10/ tmp
		335	* a11/ tmp
		336	*/
		337
		338	/*
		339	* Byte by byte copy
		340	*/
		341	.align 4
		342	.byte 0 # 1 mod 4 alignment for LOOPNEZ
		343	# (0 mod 4 alignment for LBEG)
		344	.Lbackbytecopy:
		345	#if XCHAL_HAVE_LOOPS
		346	loopnez a4, .Lbackbytecopydone
		347	#else /* !XCHAL_HAVE_LOOPS */
		348	beqz a4, .Lbackbytecopydone
		349	sub a7, a3, a4 # a7 = start address for source
		350	#endif /* !XCHAL_HAVE_LOOPS */
		351	.Lbacknextbyte:
		352	addi a3, a3, -1
		353	l8ui a6, a3, 0
		354	addi a5, a5, -1
		355	s8i a6, a5, 0
		356	#if !XCHAL_HAVE_LOOPS
		357	bne a3, a7, .Lbacknextbyte # continue loop if
		358	# $a3:src != $a7:src_start
		359	#endif /* !XCHAL_HAVE_LOOPS */
		360	.Lbackbytecopydone:
		361	retw
		362
		363	/*
		364	* Destination is unaligned
		365	*/
		366
		367	.align 4
		368	.Lbackdst1mod2: # dst is only byte aligned
		369	_bltui a4, 7, .Lbackbytecopy # do short copies byte by byte
		370
		371	# copy 1 byte
		372	addi a3, a3, -1
		373	l8ui a6, a3, 0
		374	addi a5, a5, -1
		375	s8i a6, a5, 0
		376	addi a4, a4, -1
		377	_bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
		378	# return to main algorithm
		379	.Lbackdst2mod4: # dst 16-bit aligned
		380	# copy 2 bytes
		381	_bltui a4, 6, .Lbackbytecopy # do short copies byte by byte
		382	addi a3, a3, -2
		383	l8ui a6, a3, 0
		384	l8ui a7, a3, 1
		385	addi a5, a5, -2
		386	s8i a6, a5, 0
		387	s8i a7, a5, 1
		388	addi a4, a4, -2
		389	j .Lbackdstaligned # dst is now aligned,
		390	# return to main algorithm
		391
		392	.align 4
		393	.global memmove
		394	.type memmove,@function
		395	memmove:
		396
		397	entry sp, 16 # minimal stack frame
		398	# a2/ dst, a3/ src, a4/ len
		399	mov a5, a2 # copy dst so that a2 is return value
		400	.Lmovecommon:
		401	sub a6, a5, a3
		402	bgeu a6, a4, .Lcommon
		403
		404	add a5, a5, a4
		405	add a3, a3, a4
		406
		407	_bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
		408	_bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
		409	.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
		410	srli a7, a4, 4 # number of loop iterations with 16B
		411	# per iteration
		412	movi a8, 3 # if source is not aligned,
		413	_bany a3, a8, .Lbacksrcunaligned # then use shifting copy
		414	/*
		415	* Destination and source are word-aligned, use word copy.
		416	*/
		417	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
		418	#if XCHAL_HAVE_LOOPS
		419	loopnez a7, .backLoop1done
		420	#else /* !XCHAL_HAVE_LOOPS */
		421	beqz a7, .backLoop1done
		422	slli a8, a7, 4
		423	sub a8, a3, a8 # a8 = start of first 16B source chunk
		424	#endif /* !XCHAL_HAVE_LOOPS */
		425	.backLoop1:
		426	addi a3, a3, -16
		427	l32i a7, a3, 12
		428	l32i a6, a3, 8
		429	addi a5, a5, -16
		430	s32i a7, a5, 12
		431	l32i a7, a3, 4
		432	s32i a6, a5, 8
		433	l32i a6, a3, 0
		434	s32i a7, a5, 4
		435	s32i a6, a5, 0
		436	#if !XCHAL_HAVE_LOOPS
		437	bne a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
		438	#endif /* !XCHAL_HAVE_LOOPS */
		439	.backLoop1done:
		440	bbci.l a4, 3, .Lback2
		441	# copy 8 bytes
		442	addi a3, a3, -8
		443	l32i a6, a3, 0
		444	l32i a7, a3, 4
		445	addi a5, a5, -8
		446	s32i a6, a5, 0
		447	s32i a7, a5, 4
		448	.Lback2:
		449	bbsi.l a4, 2, .Lback3
		450	bbsi.l a4, 1, .Lback4
		451	bbsi.l a4, 0, .Lback5
		452	retw
		453	.Lback3:
		454	# copy 4 bytes
		455	addi a3, a3, -4
		456	l32i a6, a3, 0
		457	addi a5, a5, -4
		458	s32i a6, a5, 0
		459	bbsi.l a4, 1, .Lback4
		460	bbsi.l a4, 0, .Lback5
		461	retw
		462	.Lback4:
		463	# copy 2 bytes
		464	addi a3, a3, -2
		465	l16ui a6, a3, 0
		466	addi a5, a5, -2
		467	s16i a6, a5, 0
		468	bbsi.l a4, 0, .Lback5
		469	retw
		470	.Lback5:
		471	# copy 1 byte
		472	addi a3, a3, -1
		473	l8ui a6, a3, 0
		474	addi a5, a5, -1
		475	s8i a6, a5, 0
		476	retw
		477
		478	/*
		479	* Destination is aligned, Source is unaligned
		480	*/
		481
		482	.align 4
		483	.Lbacksrcunaligned:
		484	_beqz a4, .Lbackdone # avoid loading anything for zero-length copies
		485	# copy 16 bytes per iteration for word-aligned dst and unaligned src
		486	ssa8 a3 # set shift amount from byte offset
		487	#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with
		488	* the lint or ferret client, or 0
		489	* to save a few cycles */
		490	#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
		491	and a11, a3, a8 # save unalignment offset for below
		492	sub a3, a3, a11 # align a3
		493	#endif
		494	l32i a6, a3, 0 # load first word
		495	#if XCHAL_HAVE_LOOPS
		496	loopnez a7, .backLoop2done
		497	#else /* !XCHAL_HAVE_LOOPS */
		498	beqz a7, .backLoop2done
		499	slli a10, a7, 4
		500	sub a10, a3, a10 # a10 = start of first 16B source chunk
		501	#endif /* !XCHAL_HAVE_LOOPS */
		502	.backLoop2:
		503	addi a3, a3, -16
		504	l32i a7, a3, 12
		505	l32i a8, a3, 8
		506	addi a5, a5, -16
		507	src_b a6, a7, a6
		508	s32i a6, a5, 12
		509	l32i a9, a3, 4
		510	src_b a7, a8, a7
		511	s32i a7, a5, 8
		512	l32i a6, a3, 0
		513	src_b a8, a9, a8
		514	s32i a8, a5, 4
		515	src_b a9, a6, a9
		516	s32i a9, a5, 0
		517	#if !XCHAL_HAVE_LOOPS
		518	bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
		519	#endif /* !XCHAL_HAVE_LOOPS */
		520	.backLoop2done:
		521	bbci.l a4, 3, .Lback12
		522	# copy 8 bytes
		523	addi a3, a3, -8
		524	l32i a7, a3, 4
		525	l32i a8, a3, 0
		526	addi a5, a5, -8
		527	src_b a6, a7, a6
		528	s32i a6, a5, 4
		529	src_b a7, a8, a7
		530	s32i a7, a5, 0
		531	mov a6, a8
		532	.Lback12:
		533	bbci.l a4, 2, .Lback13
		534	# copy 4 bytes
		535	addi a3, a3, -4
		536	l32i a7, a3, 0
		537	addi a5, a5, -4
		538	src_b a6, a7, a6
		539	s32i a6, a5, 0
		540	mov a6, a7
		541	.Lback13:
		542	#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
		543	add a3, a3, a11 # readjust a3 with correct misalignment
		544	#endif
		545	bbsi.l a4, 1, .Lback14
		546	bbsi.l a4, 0, .Lback15
		547	.Lbackdone:
		548	retw
		549	.Lback14:
		550	# copy 2 bytes
		551	addi a3, a3, -2
		552	l8ui a6, a3, 0
		553	l8ui a7, a3, 1
		554	addi a5, a5, -2
		555	s8i a6, a5, 0
		556	s8i a7, a5, 1
		557	bbsi.l a4, 0, .Lback15
		558	retw
		559	.Lback15:
		560	# copy 1 byte
		561	addi a3, a3, -1
		562	addi a5, a5, -1
		563	l8ui a6, a3, 0
		564	s8i a6, a5, 0
		565	retw
		566
308		567
309	/*	568	/*
310	* Local Variables:	569	* Local Variables: