1 files changed, 410 insertions, 0 deletions
diff --git a/arch/xtensa/lib/checksum.S b/arch/xtensa/lib/checksum.S
new file mode 100644
index 000000000000..e2d64dfd530c
--- /dev/null
+++ b/arch/xtensa/lib/checksum.S
@@ -0,0 +1,410 @@
+/*
+ * INET         An implementation of the TCP/IP protocol suite for the LINUX
+ *              operating system.  INET is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              IP/TCP/UDP checksumming routines
+ *
+ * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
+ *                  Optimized by Joe Taylor
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ */
+#include <asm/errno.h>
+#include <linux/linkage.h>
+#define _ASMLANGUAGE
+#include <xtensa/config/core.h>
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+/*
+ * unsigned int csum_partial(const unsigned char *buf, int len,
+ *                           unsigned int sum);
+ *    a2 = buf
+ *    a3 = len
+ *    a4 = sum
+ *
+ * This function is optimized for 2- or 4-byte aligned buffers.  Odd addresses
+ * are handled too, but only by a much slower byte-wise path.
+ */
+/*
+ * ONES_ADD(sum, val): sum = sum + val using ones-complement (end-around
+ * carry) arithmetic on top of the twos-complement 'add'.
+ *
+ * After the add, an unsigned result smaller than 'val' means the 32-bit
+ * addition wrapped, so the lost carry is folded back in by adding 1.
+ * 'sum' and 'val' must be different registers: if they alias, the bgeu
+ * compares the result with itself and the carry is never added.
+ * Uses the numeric local label 99; no other register is written.
+ */
+#define ONES_ADD(sum, val)        \
+        add     sum, sum, val   ; \
+        bgeu    sum, val, 99f   ; \
+        addi    sum, sum, 1     ; \
+99:                             ;
+.text
+/*
+ * csum_partial - add 'len' bytes starting at 'buf' to the running 32-bit
+ * ones-complement sum 'sum'.
+ *
+ *    In:   a2 = buf, a3 = len, a4 = sum
+ *    Out:  a2 = updated partial sum
+ *    Temp: a5 (chunk count / end address), a6-a8 (loaded data)
+ *
+ * Layout: label 1 handles a 4-byte aligned buf (32-byte chunks, then
+ * 4-byte chunks, then an optional 2-byte and 1-byte tail).  The first
+ * label 8 handles a buf that is only 2-byte aligned by consuming one
+ * halfword and rejoining label 1.  The second label 8 handles an odd buf
+ * by assembling each word from byte and halfword loads.
+ *
+ * Without XCHAL_HAVE_LOOPS the loops compare the running pointer against
+ * an end address.  Both are addresses, so the compare is unsigned (bltu);
+ * a signed blt would end the loop early for a buffer that crosses the
+ * 0x7fffffff/0x80000000 boundary.
+ */
+ENTRY(csum_partial)
+          /*
+           * Experiments with Ethernet and SLIP connections show that buf
+           * is aligned on either a 2-byte or 4-byte boundary.
+           */
+        entry   sp, 32
+        extui   a5, a2, 0, 2    /* a5 = buf & 3 */
+        bnez    a5, 8f          /* branch if buf is not 4-byte aligned */
+        /* Fall-through on common case, 4-byte alignment */
+1:
+        srli    a5, a3, 5       /* 32-byte chunks */
+#if XCHAL_HAVE_LOOPS
+        loopgtz a5, 2f          /* loop body ends at label 2 */
+#else
+        beqz    a5, 2f
+        slli    a5, a5, 5
+        add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
+.Loop1:
+#endif
+        l32i    a6, a2, 0
+        l32i    a7, a2, 4
+        ONES_ADD(a4, a6)
+        ONES_ADD(a4, a7)
+        l32i    a6, a2, 8
+        l32i    a7, a2, 12
+        ONES_ADD(a4, a6)
+        ONES_ADD(a4, a7)
+        l32i    a6, a2, 16
+        l32i    a7, a2, 20
+        ONES_ADD(a4, a6)
+        ONES_ADD(a4, a7)
+        l32i    a6, a2, 24
+        l32i    a7, a2, 28
+        ONES_ADD(a4, a6)
+        ONES_ADD(a4, a7)
+        addi    a2, a2, 4*8
+#if !XCHAL_HAVE_LOOPS
+        bltu    a2, a5, .Loop1  /* unsigned: a2 and a5 are addresses */
+#endif
+2:
+        extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
+#if XCHAL_HAVE_LOOPS
+        loopgtz a5, 3f          /* loop body ends at label 3 */
+#else
+        beqz    a5, 3f
+        slli    a5, a5, 2
+        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
+.Loop2:
+#endif
+        l32i    a6, a2, 0
+        ONES_ADD(a4, a6)
+        addi    a2, a2, 4
+#if !XCHAL_HAVE_LOOPS
+        bltu    a2, a5, .Loop2  /* unsigned: a2 and a5 are addresses */
+#endif
+3:
+        _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
+        l16ui   a6, a2, 0
+        ONES_ADD(a4, a6)
+        addi    a2, a2, 2
+5:
+        _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
+6:      l8ui    a6, a2, 0       /* also entered directly when len == 1 */
+#ifdef __XTENSA_EB__
+        slli    a6, a6, 8       /* load byte into bits 8..15 */
+#endif
+        ONES_ADD(a4, a6)
+7:
+        mov     a2, a4          /* return the accumulated sum */
+        retw
+        /* uncommon case, buf is not 4-byte aligned */
+8:
+        beqz    a3, 7b          /* branch if len == 0 */
+        beqi    a3, 1, 6b       /* branch if len == 1 */
+        extui   a5, a2, 0, 1
+        bnez    a5, 8f          /* branch if buf is at an odd address */
+        l16ui   a6, a2, 0       /* common case, len >= 2 */
+        ONES_ADD(a4, a6)
+        addi    a2, a2, 2       /* adjust buf */
+        addi    a3, a3, -2      /* adjust len */
+        j       1b              /* now buf is 4-byte aligned */
+        /* case: odd-byte aligned, len > 1
+         * This case is dog slow, so don't give us an odd address.
+         * (I don't think this ever happens, but just in case.)
+         */
+8:
+        srli    a5, a3, 2       /* 4-byte chunks */
+#if XCHAL_HAVE_LOOPS
+        loopgtz a5, 2f          /* loop body ends at the next label 2 */
+#else
+        beqz    a5, 2f
+        slli    a5, a5, 2
+        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
+.Loop3:
+#endif
+        /* a2 is odd, so a2+1 is halfword aligned for the l16ui below */
+        l8ui    a6, a2, 0       /* byte 0:    bits 24..31 (BE),  0.. 7 (LE) */
+        l16ui   a7, a2, 1       /* bytes 1-2: bits  8..23 */
+        l8ui    a8, a2, 3       /* byte 3:    bits  0.. 7 (BE), 24..31 (LE) */
+#ifdef  __XTENSA_EB__
+        slli    a6, a6, 24
+#else
+        slli    a8, a8, 24
+#endif
+        slli    a7, a7, 8
+        or      a7, a7, a6
+        or      a7, a7, a8      /* a7 = the 32-bit word starting at a2 */
+        ONES_ADD(a4, a7)
+        addi    a2, a2, 4
+#if !XCHAL_HAVE_LOOPS
+        bltu    a2, a5, .Loop3  /* unsigned: a2 and a5 are addresses */
+#endif
+2:
+        _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
+        l8ui    a6, a2, 0
+        l8ui    a7, a2, 1
+#ifdef  __XTENSA_EB__
+        slli    a6, a6, 8
+#else
+        slli    a7, a7, 8
+#endif
+        or      a7, a7, a6      /* a7 = the 16-bit value starting at a2 */
+        ONES_ADD(a4, a7)
+        addi    a2, a2, 2
+3:
+        j       5b              /* branch to handle the remaining byte */
+/*
+ * Copy from src to dst while checksumming, otherwise like csum_partial.
+ *
+ * The macros SRC and DST specify the type of access for the instruction,
+ * so that a separate exception (fixup) handler can be selected for each
+ * access type.
+ *
+ * Each macro emits the wrapped instruction at local label 9999 and adds an
+ * __ex_table entry pairing that instruction's address with a fixup label:
+ * 6001 for SRC (source access) and 6002 for DST (destination access).
+ * Both fixup labels are forward references, so they must be defined after
+ * every use of these macros.
+ */
+#define SRC(y...)                       \
+        9999: y;                        \
+        .section __ex_table, "a";       \
+        .long 9999b, 6001f      ;       \
+        .previous
+#define DST(y...)                       \
+        9999: y;                        \
+        .section __ex_table, "a";       \
+        .long 9999b, 6002f      ;       \
+        .previous
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
+                                        int sum, int *src_err_ptr, int *dst_err_ptr)
+    Copies len bytes from src to dst and adds them to the running
+    ones-complement sum.  The updated sum is returned in a2.
+        a2  = src
+        a3  = dst
+        a4  = len
+        a5  = sum
+        a6  = src_err_ptr
+        a7  = dst_err_ptr
+        a8  = temp
+        a9  = temp
+        a10 = temp
+        a11 = original len for exception handling
+        a12 = original dst for exception handling
+    Every load is wrapped in SRC() and every store in DST(), so a faulting
+    access is redirected to fixup label 6001 (source) or 6002 (destination).
+    This function is optimized for 4-byte aligned addresses.  Other
+    alignments work, but not nearly as efficiently.
+    Without XCHAL_HAVE_LOOPS the loops compare the running src pointer
+    against an end address.  Both are addresses, so the compare is unsigned
+    (bltu); a signed blt would end the loop early for a buffer that crosses
+    the 0x7fffffff/0x80000000 boundary.
+ */
+ENTRY(csum_partial_copy_generic)
+        entry   sp, 32
+        mov     a12, a3         /* keep original dst for the fixup code */
+        mov     a11, a4         /* keep original len for the fixup code */
+        or      a10, a2, a3     /* low bits set if either address is misaligned */
+        /* We optimize the following alignment tests for the 4-byte
+        aligned case.  Two bbsi.l instructions might seem more optimal
+        (commented out below).  However, both labels 5: and 3: are out
+        of the imm8 range, so the assembler relaxes them into
+        equivalent bbci.l, j combinations, which is actually
+        slower. */
+        extui   a9, a10, 0, 2
+        beqz    a9, 1f          /* branch if both are 4-byte aligned */
+        bbsi.l  a10, 0, 5f      /* branch if one address is odd */
+        j       3f              /* one address is 2-byte aligned */
+/*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
+/*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */
+1:
+        /* src and dst are both 4-byte aligned */
+        srli    a10, a4, 5      /* 32-byte chunks */
+#if XCHAL_HAVE_LOOPS
+        loopgtz a10, 2f         /* loop body ends at label 2 */
+#else
+        beqz    a10, 2f
+        slli    a10, a10, 5
+        add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
+.Loop5:
+#endif
+SRC(    l32i    a9, a2, 0       )
+SRC(    l32i    a8, a2, 4       )
+DST(    s32i    a9, a3, 0       )
+DST(    s32i    a8, a3, 4       )
+        ONES_ADD(a5, a9)
+        ONES_ADD(a5, a8)
+SRC(    l32i    a9, a2, 8       )
+SRC(    l32i    a8, a2, 12      )
+DST(    s32i    a9, a3, 8       )
+DST(    s32i    a8, a3, 12      )
+        ONES_ADD(a5, a9)
+        ONES_ADD(a5, a8)
+SRC(    l32i    a9, a2, 16      )
+SRC(    l32i    a8, a2, 20      )
+DST(    s32i    a9, a3, 16      )
+DST(    s32i    a8, a3, 20      )
+        ONES_ADD(a5, a9)
+        ONES_ADD(a5, a8)
+SRC(    l32i    a9, a2, 24      )
+SRC(    l32i    a8, a2, 28      )
+DST(    s32i    a9, a3, 24      )
+DST(    s32i    a8, a3, 28      )
+        ONES_ADD(a5, a9)
+        ONES_ADD(a5, a8)
+        addi    a2, a2, 32
+        addi    a3, a3, 32
+#if !XCHAL_HAVE_LOOPS
+        bltu    a2, a10, .Loop5 /* unsigned: a2 and a10 are addresses */
+#endif
+2:
+        extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
+        extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
+#if XCHAL_HAVE_LOOPS
+        loopgtz a10, 3f         /* loop body ends at label 3 */
+#else
+        beqz    a10, 3f
+        slli    a10, a10, 2
+        add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
+.Loop6:
+#endif
+SRC(    l32i    a9, a2, 0       )
+DST(    s32i    a9, a3, 0       )
+        ONES_ADD(a5, a9)
+        addi    a2, a2, 4
+        addi    a3, a3, 4
+#if !XCHAL_HAVE_LOOPS
+        bltu    a2, a10, .Loop6 /* unsigned: a2 and a10 are addresses */
+#endif
+3:
+        /*
+        Control comes to here in two cases: (1) It may fall through
+        to here from the 4-byte alignment case to process, at most,
+        one 2-byte chunk.  (2) It branches to here from above if
+        either src or dst is 2-byte aligned, and we process all bytes
+        here, except for perhaps a trailing odd byte.  It's
+        inefficient, so align your addresses to 4-byte boundaries.
+        a2 = src
+        a3 = dst
+        a4 = len
+        a5 = sum
+        */
+        srli    a10, a4, 1      /* 2-byte chunks */
+#if XCHAL_HAVE_LOOPS
+        loopgtz a10, 4f         /* loop body ends at label 4 */
+#else
+        beqz    a10, 4f
+        slli    a10, a10, 1
+        add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
+.Loop7:
+#endif
+SRC(    l16ui   a9, a2, 0       )
+DST(    s16i    a9, a3, 0       )
+        ONES_ADD(a5, a9)
+        addi    a2, a2, 2
+        addi    a3, a3, 2
+#if !XCHAL_HAVE_LOOPS
+        bltu    a2, a10, .Loop7 /* unsigned: a2 and a10 are addresses */
+#endif
+4:
+        /* This section processes a possible trailing odd byte. */
+        _bbci.l a4, 0, 8f       /* 1-byte chunk */
+SRC(    l8ui    a9, a2, 0       )
+DST(    s8i     a9, a3, 0       )
+#ifdef __XTENSA_EB__
+        slli    a9, a9, 8       /* shift byte to bits 8..15 */
+#endif
+        ONES_ADD(a5, a9)
+8:
+        mov     a2, a5          /* return the accumulated sum */
+        retw
+5:
+        /* Control branches to here when either src or dst is odd.  We
+        process all bytes using 8-bit accesses.  Grossly inefficient,
+        so don't feed us an odd address. */
+        srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
+#if XCHAL_HAVE_LOOPS
+        loopgtz a10, 6f         /* loop body ends at label 6 */
+#else
+        beqz    a10, 6f
+        slli    a10, a10, 1
+        add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
+.Loop8:
+#endif
+SRC(    l8ui    a9, a2, 0       )
+SRC(    l8ui    a8, a2, 1       )
+DST(    s8i     a9, a3, 0       )
+DST(    s8i     a8, a3, 1       )
+#ifdef __XTENSA_EB__
+        slli    a9, a9, 8       /* combine into a single 16-bit value */
+#else                           /* for checksum computation */
+        slli    a8, a8, 8
+#endif
+        or      a9, a9, a8
+        ONES_ADD(a5, a9)
+        addi    a2, a2, 2
+        addi    a3, a3, 2
+#if !XCHAL_HAVE_LOOPS
+        bltu    a2, a10, .Loop8 /* unsigned: a2 and a10 are addresses */
+#endif
+6:
+        j       4b              /* process the possible trailing odd byte */
+/* Exception handler: */
+.section .fixup, "ax"
+/*
+        Fixup code for csum_partial_copy_generic.  The __ex_table entries
+        emitted by SRC() point at 6001 and those emitted by DST() point at
+        6002.  Both handlers run in the frame of csum_partial_copy_generic
+        and return to its caller with retw and a2 = 0.
+        a6  = src_err_ptr
+        a7  = dst_err_ptr
+        a11 = original len for exception handling
+        a12 = original dst for exception handling
+*/
+/* 6001: a source access faulted: report -EFAULT and zero the destination */
+6001:
+        _movi   a2, -EFAULT
+        s32i    a2, a6, 0       /* src_err_ptr */
+        /* clear the complete destination - computing the rest
+           is too much work */
+        /* NOTE(review): the s8i below has no __ex_table entry of its own */
+        movi    a2, 0           /* fill value, and also the return value */
+#if XCHAL_HAVE_LOOPS
+        loopgtz a11, 2f         /* loop body ends at label 2 */
+#else
+        beqz    a11, 2f
+        add     a11, a11, a12   /* a11 = ending address */
+.Leloop:
+#endif
+        s8i     a2, a12, 0
+        addi    a12, a12, 1
+#if !XCHAL_HAVE_LOOPS
+        bltu    a12, a11, .Leloop       /* unsigned: both are addresses */
+#endif
+2:
+        retw
+/* 6002: a destination access faulted: report -EFAULT and return 0 */
+6002:
+        movi    a2, -EFAULT
+        s32i    a2, a7, 0       /* dst_err_ptr */
+        movi    a2, 0
+        retw
+.previous

diff --git a/arch/xtensa/lib/checksum.S b/arch/xtensa/lib/checksum.S new file mode 100644 index 000000000000..e2d64dfd530c --- /dev/null +++ b/arch/xtensa/lib/checksum.S
@@ -0,0 +1,410 @@
	1	/*
	2	* INET An implementation of the TCP/IP protocol suite for the LINUX
	3	* operating system. INET is implemented using the BSD Socket
	4	* interface as the means of communication with the user level.
	5	*
	6	* IP/TCP/UDP checksumming routines
	7	*
	8	* Xtensa version: Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
	9	* Optimized by Joe Taylor
	10	*
	11	* This program is free software; you can redistribute it and/or
	12	* modify it under the terms of the GNU General Public License
	13	* as published by the Free Software Foundation; either version
	14	* 2 of the License, or (at your option) any later version.
	15	*/
	16
	17	#include <asm/errno.h>
	18	#include <linux/linkage.h>
	19	#define _ASMLANGUAGE
	20	#include <xtensa/config/core.h>
	21
	22	/*
	23	* computes a partial checksum, e.g. for TCP/UDP fragments
	24	*/
	25
	26	/*
	27	* unsigned int csum_partial(const unsigned char *buf, int len,
	28	* unsigned int sum);
	29	* a2 = buf
	30	* a3 = len
	31	* a4 = sum
	32	*
	33	* This function is optimized for 2- or 4-byte alignment; odd addresses take a slow byte-wise path.
	34	*/
	35
	36	/* ONES_ADD(sum, val): sum += val with end-around carry (ones-complement add); sum and val must be different registers. */
	37	#define ONES_ADD(sum, val) \
	38	add sum, sum, val ; \
	39	bgeu sum, val, 99f ; \
	40	addi sum, sum, 1 ; \
	41	99: ;
	42
	43	.text
	44	ENTRY(csum_partial)
	45	/* In: a2 = buf, a3 = len, a4 = sum.  Out: a2 = updated partial sum.
	46	* Experiments with Ethernet and SLIP connections show that buf
	47	* is aligned on either a 2-byte or 4-byte boundary.
	48	*/
	49	entry sp, 32
	50	extui a5, a2, 0, 2 /* a5 = buf & 3 */
	51	bnez a5, 8f /* branch if buf is not 4-byte aligned */
	52	/* Fall-through on common case, 4-byte alignment */
	53	1:
	54	srli a5, a3, 5 /* 32-byte chunks */
	55	#if XCHAL_HAVE_LOOPS
	56	loopgtz a5, 2f /* loop body ends at label 2 */
	57	#else
	58	beqz a5, 2f
	59	slli a5, a5, 5
	60	add a5, a5, a2 /* a5 = end of last 32-byte chunk */
	61	.Loop1:
	62	#endif
	63	l32i a6, a2, 0
	64	l32i a7, a2, 4
	65	ONES_ADD(a4, a6)
	66	ONES_ADD(a4, a7)
	67	l32i a6, a2, 8
	68	l32i a7, a2, 12
	69	ONES_ADD(a4, a6)
	70	ONES_ADD(a4, a7)
	71	l32i a6, a2, 16
	72	l32i a7, a2, 20
	73	ONES_ADD(a4, a6)
	74	ONES_ADD(a4, a7)
	75	l32i a6, a2, 24
	76	l32i a7, a2, 28
	77	ONES_ADD(a4, a6)
	78	ONES_ADD(a4, a7)
	79	addi a2, a2, 4*8
	80	#if !XCHAL_HAVE_LOOPS
	81	bltu a2, a5, .Loop1 /* unsigned: a2 and a5 are addresses */
	82	#endif
	83	2:
	84	extui a5, a3, 2, 3 /* remaining 4-byte chunks */
	85	#if XCHAL_HAVE_LOOPS
	86	loopgtz a5, 3f /* loop body ends at label 3 */
	87	#else
	88	beqz a5, 3f
	89	slli a5, a5, 2
	90	add a5, a5, a2 /* a5 = end of last 4-byte chunk */
	91	.Loop2:
	92	#endif
	93	l32i a6, a2, 0
	94	ONES_ADD(a4, a6)
	95	addi a2, a2, 4
	96	#if !XCHAL_HAVE_LOOPS
	97	bltu a2, a5, .Loop2 /* unsigned: a2 and a5 are addresses */
	98	#endif
	99	3:
	100	_bbci.l a3, 1, 5f /* remaining 2-byte chunk */
	101	l16ui a6, a2, 0
	102	ONES_ADD(a4, a6)
	103	addi a2, a2, 2
	104	5:
	105	_bbci.l a3, 0, 7f /* remaining 1-byte chunk */
	106	6: l8ui a6, a2, 0 /* also entered directly when len == 1 */
	107	#ifdef __XTENSA_EB__
	108	slli a6, a6, 8 /* load byte into bits 8..15 */
	109	#endif
	110	ONES_ADD(a4, a6)
	111	7:
	112	mov a2, a4 /* return the accumulated sum */
	113	retw
	114
	115	/* uncommon case, buf is not 4-byte aligned */
	116	8:
	117	beqz a3, 7b /* branch if len == 0 */
	118	beqi a3, 1, 6b /* branch if len == 1 */
	119
	120	extui a5, a2, 0, 1
	121	bnez a5, 8f /* branch if buf is at an odd address */
	122
	123	l16ui a6, a2, 0 /* common case, len >= 2 */
	124	ONES_ADD(a4, a6)
	125	addi a2, a2, 2 /* adjust buf */
	126	addi a3, a3, -2 /* adjust len */
	127	j 1b /* now buf is 4-byte aligned */
	128
	129	/* case: odd-byte aligned, len > 1
	130	* This case is dog slow, so don't give us an odd address.
	131	* (I don't think this ever happens, but just in case.)
	132	*/
	133	8:
	134	srli a5, a3, 2 /* 4-byte chunks */
	135	#if XCHAL_HAVE_LOOPS
	136	loopgtz a5, 2f /* loop body ends at the next label 2 */
	137	#else
	138	beqz a5, 2f
	139	slli a5, a5, 2
	140	add a5, a5, a2 /* a5 = end of last 4-byte chunk */
	141	.Loop3:
	142	#endif
	143	l8ui a6, a2, 0 /* byte 0: bits 24..31 (BE), 0..7 (LE) */
	144	l16ui a7, a2, 1 /* bytes 1-2: bits 8..23; a2+1 is even because a2 is odd */
	145	l8ui a8, a2, 3 /* byte 3: bits 0..7 (BE), 24..31 (LE) */
	146	#ifdef __XTENSA_EB__
	147	slli a6, a6, 24
	148	#else
	149	slli a8, a8, 24
	150	#endif
	151	slli a7, a7, 8
	152	or a7, a7, a6
	153	or a7, a7, a8 /* a7 = the 32-bit word starting at a2 */
	154	ONES_ADD(a4, a7)
	155	addi a2, a2, 4
	156	#if !XCHAL_HAVE_LOOPS
	157	bltu a2, a5, .Loop3 /* unsigned: a2 and a5 are addresses */
	158	#endif
	159	2:
	160	_bbci.l a3, 1, 3f /* remaining 2-byte chunk, still odd addr */
	161	l8ui a6, a2, 0
	162	l8ui a7, a2, 1
	163	#ifdef __XTENSA_EB__
	164	slli a6, a6, 8
	165	#else
	166	slli a7, a7, 8
	167	#endif
	168	or a7, a7, a6 /* a7 = the 16-bit value starting at a2 */
	169	ONES_ADD(a4, a7)
	170	addi a2, a2, 2
	171	3:
	172	j 5b /* branch to handle the remaining byte */
	173
	174
	175
	176	/*
	177	* Copy from src to dst while checksumming, otherwise like csum_partial
	178	*
	179	* The macros SRC and DST specify the type of access for the instruction.
	180	* Each adds an __ex_table entry for the wrapped instruction, so a fault goes to fixup 6001 (SRC) or 6002 (DST).
	181	*/
	182
	183	#define SRC(y...) \
	184	9999: y; \
	185	.section __ex_table, "a"; \
	186	.long 9999b, 6001f ; \
	187	.previous
	188
	189	#define DST(y...) \
	190	9999: y; \
	191	.section __ex_table, "a"; \
	192	.long 9999b, 6002f ; \
	193	.previous
	194
	195	/*
	196	unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
	197	int sum, int *src_err_ptr, int *dst_err_ptr)
	198	a2 = src
	199	a3 = dst
	200	a4 = len
	201	a5 = sum
	202	a6 = src_err_ptr
	203	a7 = dst_err_ptr
	204	a8 = temp
	205	a9 = temp
	206	a10 = temp
	207	a11 = original len for exception handling
	208	a12 = original dst for exception handling
	209	Copies len bytes from src to dst while summing them; the updated sum is returned in a2.
	210	This function is optimized for 4-byte aligned addresses. Other
	211	alignments work, but not nearly as efficiently.
	212	*/
	213
	214	ENTRY(csum_partial_copy_generic)
	215	entry sp, 32
	216	mov a12, a3 /* keep original dst for the fixup code */
	217	mov a11, a4 /* keep original len for the fixup code */
	218	or a10, a2, a3 /* low bits set if either address is misaligned */
	219
	220	/* We optimize the following alignment tests for the 4-byte
	221	aligned case. Two bbsi.l instructions might seem more optimal
	222	(commented out below). However, both labels 5: and 3: are out
	223	of the imm8 range, so the assembler relaxes them into
	224	equivalent bbci.l, j combinations, which is actually
	225	slower. */
	226
	227	extui a9, a10, 0, 2
	228	beqz a9, 1f /* branch if both are 4-byte aligned */
	229	bbsi.l a10, 0, 5f /* branch if one address is odd */
	230	j 3f /* one address is 2-byte aligned */
	231
	232	/* _bbsi.l a10, 0, 5f */ /* branch if odd address */
	233	/* _bbsi.l a10, 1, 3f */ /* branch if 2-byte-aligned address */
	234
	235	1:
	236	/* src and dst are both 4-byte aligned */
	237	srli a10, a4, 5 /* 32-byte chunks */
	238	#if XCHAL_HAVE_LOOPS
	239	loopgtz a10, 2f /* loop body ends at label 2 */
	240	#else
	241	beqz a10, 2f
	242	slli a10, a10, 5
	243	add a10, a10, a2 /* a10 = end of last 32-byte src chunk */
	244	.Loop5:
	245	#endif
	246	SRC( l32i a9, a2, 0 )
	247	SRC( l32i a8, a2, 4 )
	248	DST( s32i a9, a3, 0 )
	249	DST( s32i a8, a3, 4 )
	250	ONES_ADD(a5, a9)
	251	ONES_ADD(a5, a8)
	252	SRC( l32i a9, a2, 8 )
	253	SRC( l32i a8, a2, 12 )
	254	DST( s32i a9, a3, 8 )
	255	DST( s32i a8, a3, 12 )
	256	ONES_ADD(a5, a9)
	257	ONES_ADD(a5, a8)
	258	SRC( l32i a9, a2, 16 )
	259	SRC( l32i a8, a2, 20 )
	260	DST( s32i a9, a3, 16 )
	261	DST( s32i a8, a3, 20 )
	262	ONES_ADD(a5, a9)
	263	ONES_ADD(a5, a8)
	264	SRC( l32i a9, a2, 24 )
	265	SRC( l32i a8, a2, 28 )
	266	DST( s32i a9, a3, 24 )
	267	DST( s32i a8, a3, 28 )
	268	ONES_ADD(a5, a9)
	269	ONES_ADD(a5, a8)
	270	addi a2, a2, 32
	271	addi a3, a3, 32
	272	#if !XCHAL_HAVE_LOOPS
	273	bltu a2, a10, .Loop5 /* unsigned: a2 and a10 are addresses */
	274	#endif
	275	2:
	276	extui a10, a4, 2, 3 /* remaining 4-byte chunks */
	277	extui a4, a4, 0, 2 /* reset len for general-case, 2-byte chunks */
	278	#if XCHAL_HAVE_LOOPS
	279	loopgtz a10, 3f /* loop body ends at label 3 */
	280	#else
	281	beqz a10, 3f
	282	slli a10, a10, 2
	283	add a10, a10, a2 /* a10 = end of last 4-byte src chunk */
	284	.Loop6:
	285	#endif
	286	SRC( l32i a9, a2, 0 )
	287	DST( s32i a9, a3, 0 )
	288	ONES_ADD(a5, a9)
	289	addi a2, a2, 4
	290	addi a3, a3, 4
	291	#if !XCHAL_HAVE_LOOPS
	292	bltu a2, a10, .Loop6 /* unsigned: a2 and a10 are addresses */
	293	#endif
	294	3:
	295	/*
	296	Control comes to here in two cases: (1) It may fall through
	297	to here from the 4-byte alignment case to process, at most,
	298	one 2-byte chunk. (2) It branches to here from above if
	299	either src or dst is 2-byte aligned, and we process all bytes
	300	here, except for perhaps a trailing odd byte. It's
	301	inefficient, so align your addresses to 4-byte boundaries.
	302
	303	a2 = src
	304	a3 = dst
	305	a4 = len
	306	a5 = sum
	307	*/
	308	srli a10, a4, 1 /* 2-byte chunks */
	309	#if XCHAL_HAVE_LOOPS
	310	loopgtz a10, 4f /* loop body ends at label 4 */
	311	#else
	312	beqz a10, 4f
	313	slli a10, a10, 1
	314	add a10, a10, a2 /* a10 = end of last 2-byte src chunk */
	315	.Loop7:
	316	#endif
	317	SRC( l16ui a9, a2, 0 )
	318	DST( s16i a9, a3, 0 )
	319	ONES_ADD(a5, a9)
	320	addi a2, a2, 2
	321	addi a3, a3, 2
	322	#if !XCHAL_HAVE_LOOPS
	323	bltu a2, a10, .Loop7 /* unsigned: a2 and a10 are addresses */
	324	#endif
	325	4:
	326	/* This section processes a possible trailing odd byte. */
	327	_bbci.l a4, 0, 8f /* 1-byte chunk */
	328	SRC( l8ui a9, a2, 0 )
	329	DST( s8i a9, a3, 0 )
	330	#ifdef __XTENSA_EB__
	331	slli a9, a9, 8 /* shift byte to bits 8..15 */
	332	#endif
	333	ONES_ADD(a5, a9)
	334	8:
	335	mov a2, a5 /* return the accumulated sum */
	336	retw
	337
	338	5:
	339	/* Control branches to here when either src or dst is odd. We
	340	process all bytes using 8-bit accesses. Grossly inefficient,
	341	so don't feed us an odd address. */
	342
	343	srli a10, a4, 1 /* handle in pairs for 16-bit csum */
	344	#if XCHAL_HAVE_LOOPS
	345	loopgtz a10, 6f /* loop body ends at label 6 */
	346	#else
	347	beqz a10, 6f
	348	slli a10, a10, 1
	349	add a10, a10, a2 /* a10 = end of last odd-aligned, 2-byte src chunk */
	350	.Loop8:
	351	#endif
	352	SRC( l8ui a9, a2, 0 )
	353	SRC( l8ui a8, a2, 1 )
	354	DST( s8i a9, a3, 0 )
	355	DST( s8i a8, a3, 1 )
	356	#ifdef __XTENSA_EB__
	357	slli a9, a9, 8 /* combine into a single 16-bit value */
	358	#else /* for checksum computation */
	359	slli a8, a8, 8
	360	#endif
	361	or a9, a9, a8
	362	ONES_ADD(a5, a9)
	363	addi a2, a2, 2
	364	addi a3, a3, 2
	365	#if !XCHAL_HAVE_LOOPS
	366	bltu a2, a10, .Loop8 /* unsigned: a2 and a10 are addresses */
	367	#endif
	368	6:
	369	j 4b /* process the possible trailing odd byte */
	370
	371
	372	/* Exception handler: fixup code for csum_partial_copy_generic (6001 = SRC fault, 6002 = DST fault) */
	373	.section .fixup, "ax"
	374	/*
	375	a6 = src_err_ptr
	376	a7 = dst_err_ptr
	377	a11 = original len for exception handling
	378	a12 = original dst for exception handling
	379	*/
	380
	381	6001:
	382	_movi a2, -EFAULT
	383	s32i a2, a6, 0 /* src_err_ptr */
	384
	385	/* clear the complete destination - computing the rest */
	386	/* is too much work */
	387	movi a2, 0 /* fill value, and also the return value */
	388	#if XCHAL_HAVE_LOOPS
	389	loopgtz a11, 2f /* loop body ends at label 2 */
	390	#else
	391	beqz a11, 2f
	392	add a11, a11, a12 /* a11 = ending address */
	393	.Leloop:
	394	#endif
	395	s8i a2, a12, 0 /* NOTE(review): this store has no __ex_table entry of its own */
	396	addi a12, a12, 1
	397	#if !XCHAL_HAVE_LOOPS
	398	bltu a12, a11, .Leloop /* unsigned: both are addresses */
	399	#endif
	400	2:
	401	retw
	402
	403	6002:
	404	movi a2, -EFAULT
	405	s32i a2, a7, 0 /* dst_err_ptr */
	406	movi a2, 0 /* return 0 after a destination fault */
	407	retw
	408
	409	.previous
	410