Diffstat (limited to 'arch/mips/lib/memcpy-inatomic.S')
-rw-r--r--	arch/mips/lib/memcpy-inatomic.S	436
1 file changed, 436 insertions, 0 deletions
diff --git a/arch/mips/lib/memcpy-inatomic.S b/arch/mips/lib/memcpy-inatomic.S
new file mode 100644
index 000000000000..3a534b2baa0f
--- /dev/null
+++ b/arch/mips/lib/memcpy-inatomic.S
@@ -0,0 +1,436 @@
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
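/*
 * With CONFIG_CPU_HAS_PREFETCH undefined, the PREF() macro from <asm/asm.h>
 * is expected to expand to nothing, so the prefetch hints sprinkled through
 * this file simply vanish on the configurations singled out above.
 */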
#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
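/*
 * Roughly, and ignoring the non-standard register convention handled by
 * uaccess.h, a caller-level sketch of the contract for the routine in this
 * file (the variable name is illustrative only):
 *
 *	remaining = __copy_user_inatomic(dst, src, len);
 *	// remaining == 0 on success; otherwise it is an upper bound on the
 *	// number of bytes that were not copied because a load faulted.
 */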

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
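/*
 * Taken together, (2) and (3) mean that whenever a load faults, dst has
 * advanced by exactly as many bytes as src, so the l_exc_copy handler at
 * the bottom of this file can simply keep copying byte by byte from the
 * current src/dst up to the faulting address recorded in THREAD_BUADDR.
 */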

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
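
/*
 * By way of illustration (hand-expanded, assuming USE_DOUBLE below so that
 * LOAD is "ld" and UNIT(0) is 0):
 *
 *	EXC(	LOAD	t0, UNIT(0)(src), l_exc)
 *
 * becomes
 *
 * 9:	ld	t0, 0(src);
 *	.section __ex_table,"a";
 *	PTR	9b, l_exc;
 *	.previous
 *
 * i.e. an ordinary load plus an exception-table entry that routes a fault
 * taken at label 9 to the l_exc fixup code.
 */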

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD	ld
#define LOADL	ldl
#define LOADR	ldr
#define STOREL	sdl
#define STORER	sdr
#define STORE	sd
#define ADD	daddu
#define SUB	dsubu
#define SRL	dsrl
#define SRA	dsra
#define SLL	dsll
#define SLLV	dsllv
#define SRLV	dsrlv
#define NBYTES	8
#define LOG_NBYTES	3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register names from
 * the n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD	lw
#define LOADL	lwl
#define LOADR	lwr
#define STOREL	swl
#define STORER	swr
#define STORE	sw
#define ADD	addu
#define SUB	subu
#define SRL	srl
#define SLL	sll
#define SRA	sra
#define SLLV	sllv
#define SRLV	srlv
#define NBYTES	4
#define LOG_NBYTES	2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD	SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD	SRLV
#endif
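
/*
 * A brief orientation (hedged): LDFIRST/STFIRST are applied at the lowest
 * address of a register-sized unit (the FIRST() offsets below) and
 * LDREST/STREST at its last byte (the REST() offsets), so a pair of them
 * assembles or stores one full register's worth of data at an unaligned
 * address.  Which of the "left"/"right" instruction variants plays which
 * role depends on how the CPU's endianness maps byte addresses onto
 * register lanes, hence the two sets of definitions above.
 */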

#define FIRST(unit)	((unit)*NBYTES)
#define REST(unit)	(FIRST(unit)+NBYTES-1)
#define UNIT(unit)	FIRST(unit)

#define ADDRMASK	(NBYTES-1)

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(__copy_user_inatomic)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, copy_bytes_checklen
	and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, dst_unaligned
	nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned	# len < 8*NBYTES
	and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src), l_exc)
EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src), l_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src), l_exc_copy)
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
EXC(	LOAD	t0, UNIT(6)(src), l_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src), l_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
	STORE	t2, UNIT(-6)(dst)
	STORE	t3, UNIT(-5)(dst)
	STORE	t4, UNIT(-4)(dst)
	STORE	t7, UNIT(-3)(dst)
	STORE	t0, UNIT(-2)(dst)
	STORE	t1, UNIT(-1)(dst)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src), l_exc)
EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
less_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LOAD	t0, 0(src), l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	bne	rem, len, 1b
	ADD	dst, dst, NBYTES

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
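	/*
	 * A worked example of the trick below (hedged, assuming the 64-bit
	 * build where NBYTES == 8): with len == 3 bytes remaining, rem
	 * becomes 3*8 == 24 (bits to keep) and bits becomes 64 - 24 == 40
	 * (bits to discard); SHIFT_DISCARD shifts the unwanted bytes out of
	 * t0, and STREST at -1(t1) then stores exactly the surviving 3
	 * bytes, the last of them landing at dst + len - 1.
	 */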
#define bits t2
	beqz	len, done
	ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src), l_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST	t0, -1(t1)
	jr	ra
	move	len, zero
dst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
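	/*
	 * For instance (hedged, 64-bit case): if dst & ADDRMASK == 3, then
	 * t2 below ends up as NBYTES - 3 == 5, STFIRST writes those 5 bytes,
	 * and the incremented dst is NBYTES-aligned; when src had the same
	 * misalignment, match == 0 and the code rejoins the both_aligned
	 * path above.
	 */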
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src), l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src), l_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST t3, FIRST(0)(dst)
	beq	len, t2, done
	SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	ADD	src, src, t2

src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % (4*NBYTES)
	PREF(	1, 3*32(dst) )
1:
	/*
	 * Avoid consecutive LD*'s to the same register since some mips
	 * implementations can't issue them in the same cycle.
	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
	 */
EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src), l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
EXC(	LDREST	t1, REST(1)(src), l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src), l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src), l_exc_copy)
EXC(	LDREST	t2, REST(2)(src), l_exc_copy)
EXC(	LDREST	t3, REST(3)(src), l_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
	sb	t0, N(dst)

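/*
 * Each COPY_BYTE(N) moves one byte and decrements len; note that because
 * the file is assembled with .set noreorder, the sb sits in the beqz delay
 * slot and is executed even when the branch to done is taken, so the final
 * byte is still stored before we leave.
 */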
	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
	sb	t0, NBYTES-2(dst)
done:
	jr	ra
	nop
	END(__copy_user_inatomic)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src), l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	nop
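
/*
 * A closing note on the arithmetic above (hedged): per condition (1) near
 * the top of this file, uaccess.h loads AT with the address one byte past
 * the end of the source, and THREAD_BUADDR holds the faulting address, so
 * AT - t0 is the number of source bytes that were never copied, which is
 * exactly what __copy_user_inatomic is specified to return in len.
 */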