1 files changed, 508 insertions, 0 deletions
diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
new file mode 100644
index 000000000000..afa8eae18ff6
--- /dev/null
+++ b/arch/mips/lib/memcpy.S
@@ -0,0 +1,508 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Unified implementation of memcpy, memmove and the __copy_user backend.
+ *
+ * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
+ * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
+ * Copyright (C) 2002 Broadcom, Inc.
+ *   memcpy/copy_user author: Mark Vandevoorde
+ *
+ * Mnemonic names for arguments to memcpy/__copy_user
+ */
+#include <linux/config.h>
+#include <asm/asm.h>
+#include <asm/offset.h>
+#include <asm/regdef.h>
+#define dst a0
+#define src a1
+#define len a2
+/*
+ * Spec
+ *
+ * memcpy copies len bytes from src to dst and sets v0 to dst.
+ * It assumes that
+ *   - src and dst don't overlap
+ *   - src is readable
+ *   - dst is writable
+ * memcpy uses the standard calling convention
+ *
+ * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
+ * the number of uncopied bytes due to an exception caused by a read or write.
+ * __copy_user assumes that src and dst don't overlap, and that the call is
+ * implementing one of the following:
+ *   copy_to_user
+ *     - src is readable  (no exceptions when reading src)
+ *   copy_from_user
+ *     - dst is writable  (no exceptions when writing dst)
+ * __copy_user uses a non-standard calling convention; see
+ * include/asm-mips/uaccess.h
+ *
+ * When an exception happens on a load, the handler must
+ * ensure that all of the destination buffer is overwritten to prevent
+ * leaking information to user mode programs.
+ */
+/*
+ * Implementation
+ */
+/*
+ * The exception handler for loads requires that:
+ *  1- AT contain the address of the byte just past the end of the source
+ *     of the copy,
+ *  2- src_entry <= src < AT, and
+ *  3- (dst - src) == (dst_entry - src_entry),
+ * The _entry suffix denotes values when __copy_user was called.
+ *
+ * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
+ * (2) is met by incrementing src by the number of bytes copied
+ * (3) is met by not doing loads between a pair of increments of dst and src
+ *
+ * The exception handlers for stores adjust len (if necessary) and return.
+ * These handlers do not need to overwrite any data.
+ *
+ * For __rmemcpy and memmove an exception is always a kernel bug, therefore
+ * they're not protected.
+ *
+ * EXC(inst_reg, addr, handler) emits the instruction "inst_reg, addr" at
+ * local label 9 and adds one entry "9b, handler" to the __ex_table section,
+ * i.e. it pairs the address of that single instruction with the label of
+ * the fixup code to run if it faults.  PTR is not defined in this file;
+ * presumably it is the pointer-sized data directive from <asm/asm.h>.
+ * Because the instruction is split as "inst_reg" and "addr", a call looks
+ * like EXC(LOAD t0, UNIT(0)(src), l_exc): the comma after the register is
+ * what separates the first two macro arguments.
+ */
#define EXC(inst_reg,addr,handler)              \
+9:      inst_reg, addr;                         \
+        .section __ex_table,"a";                \
+        PTR     9b, handler;                    \
+        .previous
+/*
+ * Only on the 64-bit kernel can we make use of 64-bit registers.
+ *
+ * Everything below selects the "unit" the copy loops work in:
+ *   - LOAD/STORE, LOADL/LOADR, STOREL/STORER, ADD/SUB and the shifts map
+ *     to the doubleword (ld/sd/daddu/...) or word (lw/sw/addu/...) forms;
+ *   - NBYTES is the unit size in bytes and LOG_NBYTES its base-2 logarithm;
+ *   - LDFIRST/LDREST and STFIRST/STREST pick the left/right partial
+ *     load/store for the configured endianness.  They are used in pairs,
+ *     LDFIRST on FIRST(n) and LDREST on REST(n), to fetch one unaligned unit;
+ *   - SHIFT_DISCARD is the variable shift applied to a partially used unit
+ *     before it is written with STREST;
+ *   - FIRST(n)/UNIT(n) give the byte offset of the first byte of unit n,
+ *     REST(n) the offset of its last byte, ADDRMASK the misalignment mask.
+ */
+#ifdef CONFIG_MIPS64
+#define USE_DOUBLE
+#endif
+#ifdef USE_DOUBLE
+#define LOAD   ld
+#define LOADL  ldl
+#define LOADR  ldr
+#define STOREL sdl
+#define STORER sdr
+#define STORE  sd
+#define ADD    daddu
+#define SUB    dsubu
+#define SRL    dsrl
+#define SRA    dsra
+#define SLL    dsll
+#define SLLV   dsllv
+#define SRLV   dsrlv
+#define NBYTES 8
+#define LOG_NBYTES 3
+/*
+ * As we are sharing the code base with the mips32 tree (which uses the o32
+ * ABI register definitions), we need to redefine the register definitions
+ * from the n64 ABI register naming to the o32 ABI register naming.
+ */
+#undef t0
+#undef t1
+#undef t2
+#undef t3
+#define t0      $8
+#define t1      $9
+#define t2      $10
+#define t3      $11
+#define t4      $12
+#define t5      $13
+#define t6      $14
+#define t7      $15
+        
+#else
+#define LOAD   lw
+#define LOADL  lwl
+#define LOADR  lwr
+#define STOREL swl
+#define STORER swr
+#define STORE  sw
+#define ADD    addu
+#define SUB    subu
+#define SRL    srl
+#define SLL    sll
+#define SRA    sra
+#define SLLV   sllv
+#define SRLV   srlv
+#define NBYTES 4
+#define LOG_NBYTES 2
+#endif /* USE_DOUBLE */
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+#define LDFIRST LOADR
+#define LDREST  LOADL
+#define STFIRST STORER
+#define STREST  STOREL
+#define SHIFT_DISCARD SLLV
+#else
+#define LDFIRST LOADL
+#define LDREST  LOADR
+#define STFIRST STOREL
+#define STREST  STORER
+#define SHIFT_DISCARD SRLV
+#endif
+#define FIRST(unit) ((unit)*NBYTES)
+#define REST(unit)  (FIRST(unit)+NBYTES-1)
+#define UNIT(unit)  FIRST(unit)
+#define ADDRMASK (NBYTES-1)
+        .text
+        .set    noreorder
+        .set    noat
+/*
+ * A combined memcpy/__copy_user
+ * __copy_user sets len to 0 for success; else to an upper bound of
+ * the number of uncopied bytes.
+ * memcpy sets v0 to dst.
+ *
+ * Register use in this routine:
+ *   a0 = dst, a1 = src, a2 = len; all three are advanced as data is copied
+ *   t0-t4, t7 = data and scratch
+ *   t8 = rem, the byte count left over for the next, smaller-grained stage
+ *        (reused as "match" on the unaligned-dst path)
+ *   AT is only read, by the load fault handler; it is never written here.
+ *
+ * The code is assembled with ".set noreorder": the instruction following
+ * each branch or jump is its delay slot and is indented by one extra space.
+ *
+ * Fault handler naming: the first load of a group uses l_exc, later loads
+ * of the same group use l_exc_copy, and a store that still has N units of
+ * its group outstanding uses s_exc_pNu (which adds N*NBYTES back to len).
+ */
+        .align  5
+LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
+        move    v0, dst                         /* return value */
+__memcpy:                                       /* memmove branches here with v0 already set */
+FEXPORT(__copy_user)
+        /*
+         * Note: dst & src may be unaligned, len may be 0
+         * Temps
+         */
+#define rem t8
+        /*
+         * The "issue break"s below are very approximate.
+         * Issue delays for dcache fills will perturb the schedule, as will
+         * load queue full replay traps, etc.
+         *
+         * If len < NBYTES use byte operations.
+         */
+        PREF(   0, 0(src) )
+        PREF(   1, 0(dst) )
+        sltu    t2, len, NBYTES                 /* t2 = (len < NBYTES) */
+        and     t1, dst, ADDRMASK               /* t1 = misalignment of dst */
+        PREF(   0, 1*32(src) )
+        PREF(   1, 1*32(dst) )
+        bnez    t2, copy_bytes_checklen
+         and    t0, src, ADDRMASK               /* delay slot: t0 = misalignment of src */
+        PREF(   0, 2*32(src) )
+        PREF(   1, 2*32(dst) )
+        bnez    t1, dst_unaligned
+         nop
+        bnez    t0, src_unaligned_dst_aligned
+        /*
+         * use delay slot for fall-through
+         * src and dst are aligned; need to compute rem
+         *
+         * The SRL at both_aligned is the delay slot of the bnez above, so
+         * it also executes when that branch is taken; this is harmless
+         * because src_unaligned_dst_aligned recomputes t0 first thing.
+         */
+both_aligned:
+         SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
+        beqz    t0, cleanup_both_aligned # len < 8*NBYTES
+         and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
+        PREF(   0, 3*32(src) )
+        PREF(   1, 3*32(dst) )
+        .align  4
+1:                                              /* main loop: 8 aligned units per iteration */
+EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
+EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
+EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
+EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
+        SUB     len, len, 8*NBYTES              /* account for the whole group up front */
+EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
+EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
+EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
+EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
+EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)
+EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
+        ADD     src, src, 8*NBYTES
+        ADD     dst, dst, 8*NBYTES              /* remaining stores use negative offsets */
+EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
+EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
+EXC(    STORE   t4, UNIT(-4)(dst),      s_exc_p4u)
+EXC(    STORE   t7, UNIT(-3)(dst),      s_exc_p3u)
+EXC(    STORE   t0, UNIT(-2)(dst),      s_exc_p2u)
+EXC(    STORE   t1, UNIT(-1)(dst),      s_exc_p1u)
+        PREF(   0, 8*32(src) )
+        PREF(   1, 8*32(dst) )
+        bne     len, rem, 1b
+         nop
+        /*
+         * len == rem == the number of bytes left to copy < 8*NBYTES
+         */
+cleanup_both_aligned:
+        beqz    len, done
+         sltu   t0, len, 4*NBYTES               /* delay slot: t0 = (len < 4*NBYTES) */
+        bnez    t0, less_than_4units
+         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
+        /*
+         * len >= 4*NBYTES
+         */
+EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
+EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
+EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
+EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
+        SUB     len, len, 4*NBYTES
+        ADD     src, src, 4*NBYTES
+EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
+EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
+EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
+EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
+        beqz    len, done
+         ADD    dst, dst, 4*NBYTES              /* delay slot: dst catches up with src */
+less_than_4units:
+        /*
+         * rem = len % NBYTES
+         */
+        beq     rem, len, copy_bytes            /* len < NBYTES: no whole unit left */
+         nop
+1:                                              /* one aligned unit per iteration */
+EXC(    LOAD    t0, 0(src),             l_exc)
+        ADD     src, src, NBYTES
+        SUB     len, len, NBYTES
+EXC(    STORE   t0, 0(dst),             s_exc_p1u)
+        bne     rem, len, 1b
+         ADD    dst, dst, NBYTES
+        /*
+         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
+         * A loop would do only a byte at a time with possible branch
+         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
+         * because can't assume read-access to dst.  Instead, use
+         * STREST dst, which doesn't require read access to dst.
+         *
+         * This code should perform better than a simple loop on modern,
+         * wide-issue mips processors because the code has fewer branches and
+         * more instruction-level parallelism.
+         */
+#define bits t2
+        beqz    len, done
+         ADD    t1, dst, len    # t1 is just past last byte of dst
+        li      bits, 8*NBYTES
+        SLL     rem, len, 3     # rem = number of bits to keep
+EXC(    LOAD    t0, 0(src),             l_exc)  /* whole aligned unit; only len bytes are kept */
+        SUB     bits, bits, rem # bits = number of bits to discard
+        SHIFT_DISCARD t0, t0, bits
+EXC(    STREST  t0, -1(t1),             s_exc)  /* len not yet adjusted, so plain s_exc */
+        jr      ra
+         move   len, zero                       /* delay slot: report everything copied */
+dst_unaligned:
+        /*
+         * dst is unaligned
+         * t0 = src & ADDRMASK
+         * t1 = dst & ADDRMASK; t1 > 0
+         * len >= NBYTES
+         *
+         * Copy enough bytes to align dst
+         * Set match = (src and dst have same alignment)
+         * (match is the xor of the two misalignments, so it is zero
+         * exactly when they are equal)
+         */
+#define match rem
+EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
+        ADD     t2, zero, NBYTES
+EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
+        SUB     t2, t2, t1      # t2 = number of bytes copied
+        xor     match, t0, t1
+EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
+        beq     len, t2, done
+         SUB    len, len, t2
+        ADD     dst, dst, t2
+        beqz    match, both_aligned             /* src is now aligned as well */
+         ADD    src, src, t2
+src_unaligned_dst_aligned:
+        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
+        PREF(   0, 3*32(src) )
+        beqz    t0, cleanup_src_unaligned
+         and    rem, len, (4*NBYTES-1)   # rem = len % (4*NBYTES)
+        PREF(   1, 3*32(dst) )
+1:
+/*
+ * Avoid consecutive LD*'s to the same register since some mips
+ * implementations can't issue them in the same cycle.
+ * It's OK to load FIRST(N+1) before REST(N) because the two addresses
+ * are to the same unit (unless src is aligned, but it's not).
+ */
+EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
+EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
+        SUB     len, len, 4*NBYTES
+EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
+EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
+EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
+EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
+EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
+EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
+        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
+        ADD     src, src, 4*NBYTES
+#ifdef CONFIG_CPU_SB1
+        nop                             # improves slotting
+#endif
+EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
+EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
+EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
+EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
+        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
+        bne     len, rem, 1b
+         ADD    dst, dst, 4*NBYTES
+cleanup_src_unaligned:
+        beqz    len, done
+         and    rem, len, NBYTES-1  # rem = len % NBYTES
+        beq     rem, len, copy_bytes            /* len < NBYTES: no whole unit left */
+         nop
+1:                                              /* one unaligned-source unit per iteration */
+EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
+EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
+        ADD     src, src, NBYTES
+        SUB     len, len, NBYTES
+EXC(    STORE   t0, 0(dst),             s_exc_p1u)
+        bne     len, rem, 1b
+         ADD    dst, dst, NBYTES
+copy_bytes_checklen:                            /* the loop above falls through to here */
+        beqz    len, done
+         nop
+copy_bytes:
+        /*
+         * 0 < len < NBYTES
+         *
+         * COPY_BYTE(N) loads byte N, decrements len and branches to done
+         * once len reaches zero; the sb sits in that branch's delay slot,
+         * so the byte is stored whether or not the branch is taken.
+         * Since len < NBYTES, at most NBYTES-1 bytes remain: offsets
+         * 0..NBYTES-3 go through COPY_BYTE and the last possible byte,
+         * offset NBYTES-2, is copied unconditionally before returning.
+         */
+#define COPY_BYTE(N)                    \
+EXC(    lb      t0, N(src), l_exc);     \
+        SUB     len, len, 1;            \
+        beqz    len, done;              \
+EXC(     sb     t0, N(dst), s_exc_p1)
+        COPY_BYTE(0)
+        COPY_BYTE(1)
+#ifdef USE_DOUBLE
+        COPY_BYTE(2)
+        COPY_BYTE(3)
+        COPY_BYTE(4)
+        COPY_BYTE(5)
+#endif
+EXC(    lb      t0, NBYTES-2(src), l_exc)
+        SUB     len, len, 1
+        jr      ra
+EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)    /* delay slot of the jr above */
+done:
+        jr      ra
+         nop
+        END(memcpy)
+l_exc_copy:
+        /*
+         * Fixup for a faulting load that is not the first load of its
+         * group: data loaded earlier in the group may not have been
+         * stored yet, so redo the copy one byte at a time and then fall
+         * through to l_exc.
+         *
+         * Copy bytes from src until faulting load address (or until a
+         * lb faults)
+         *
+         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
+         * may be more than a byte beyond the last address.
+         * Hence, the lb below may get an exception.
+         *
+         * Assumes src < THREAD_BUADDR($28)
+         *
+         * "THREAD_BUADDR($28)" is shorthand: the code first loads the task
+         * pointer from TI_TASK($28) and then reads its THREAD_BUADDR
+         * field.  Presumably the fault path stores the bad address
+         * there; that code is not part of this file.
+         */
+        LOAD    t0, TI_TASK($28)
+         nop                                    /* let the load complete before t0 is used */
+        LOAD    t0, THREAD_BUADDR(t0)
+1:
+EXC(    lb      t1, 0(src),     l_exc)
+        ADD     src, src, 1
+        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
+        bne     src, t0, 1b
+         ADD    dst, dst, 1
+l_exc:                                          /* load fixup: compute len, zero the rest of dst */
+        LOAD    t0, TI_TASK($28)
+         nop
+        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
+         nop
+        SUB     len, AT, t0             # len number of uncopied bytes
+        /*
+         * Here's where we rely on src and dst being incremented in tandem,
+         *   See (3) above.
+         * dst += (fault addr - src) to put dst at first byte to clear
+         */
+        ADD     dst, t0                 # compute start address in a0 (dst)
+        SUB     dst, src
+        /*
+         * Clear len bytes starting at dst.  Can't call __bzero because it
+         * might modify len.  An inefficient loop for these rare times...
+         * src is no longer needed as a pointer and is reused as the
+         * down-counter (len-1 .. 0), so len itself is returned unchanged.
+         */
+        beqz    len, done
+         SUB    src, len, 1
+1:      sb      zero, 0(dst)
+        ADD     dst, dst, 1
+        bnez    src, 1b
+         SUB    src, src, 1
+        jr      ra
+         nop
+#define SEXC(n)                         \
+s_exc_p ## n ## u:                      \
+        jr      ra;                     \
+         ADD    len, len, n*NBYTES
+SEXC(8)                                 /* s_exc_p8u .. s_exc_p1u: fixups for a faulting
+                                         * STORE.  The copy loops subtract a whole group
+                                         * from len before storing it, so the handler for
+                                         * a store with n units still outstanding adds
+                                         * n*NBYTES back (in the jr delay slot) and
+                                         * returns to the caller of the copy routine. */
+SEXC(7)
+SEXC(6)
+SEXC(5)
+SEXC(4)
+SEXC(3)
+SEXC(2)
+SEXC(1)
+s_exc_p1:                               /* byte store faulted after len was decremented */
+        jr      ra
+         ADD    len, len, 1              /* delay slot: give the one byte back */
+s_exc:                                  /* store faulted before len was touched */
+        jr      ra
+         nop
+        .align  5
+LEAF(memmove)                                   /* a0=dst a1=src a2=len; v0 = dst on return */
+        ADD     t0, a0, a2                      /* t0 = dst + len */
+        ADD     t1, a1, a2                      /* t1 = src + len */
+        sltu    t0, a1, t0                      # dst + len <= src -> memcpy
+        sltu    t1, a0, t1                      # dst >= src + len -> memcpy
+        and     t0, t1                          /* both set: the two regions overlap */
+        beqz    t0, __memcpy                    /* no overlap: take the fast copy path */
+         move   v0, a0                          /* return value */
+        beqz    a2, r_out                       /* its delay slot is the sltu in __rmemcpy */
+        END(memmove)
+        /* fall through to __rmemcpy */
+LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
+         sltu   t0, a1, a0                      /* t0 = (src < dst) */
+        beqz    t0, r_end_bytes_up              # src >= dst
+         nop
+        ADD     a0, a2                          # dst = dst + len
+        ADD     a1, a2                          # src = src + len
+r_end_bytes:                                    /* src < dst: copy from the last byte down */
+        lb      t0, -1(a1)
+        SUB     a2, a2, 0x1
+        sb      t0, -1(a0)
+        SUB     a1, a1, 0x1
+        bnez    a2, r_end_bytes
+         SUB    a0, a0, 0x1
+r_out:
+        jr      ra
+         move   a2, zero                        /* delay slot: len = 0 on return */
+r_end_bytes_up:                                 /* src >= dst: copy from the first byte up */
+        lb      t0, (a1)
+        SUB     a2, a2, 0x1                     /* NOTE(review): a2 is decremented before it is
+                                                 * tested and __rmemcpy has no len == 0 check
+                                                 * of its own; only memmove tests a2 first */
+        sb      t0, (a0)
+        ADD     a1, a1, 0x1
+        bnez    a2, r_end_bytes_up
+         ADD    a0, a0, 0x1
+        jr      ra
+         move   a2, zero
+        END(__rmemcpy)

diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S new file mode 100644 index 000000000000..afa8eae18ff6 --- /dev/null +++ b/arch/mips/lib/memcpy.S
@@ -0,0 +1,508 @@
	1	/*
	2	* This file is subject to the terms and conditions of the GNU General Public
	3	* License. See the file "COPYING" in the main directory of this archive
	4	* for more details.
	5	*
	6	* Unified implementation of memcpy, memmove and the __copy_user backend.
	7	*
	8	* Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
	9	* Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
	10	* Copyright (C) 2002 Broadcom, Inc.
	11	* memcpy/copy_user author: Mark Vandevoorde
	12	*
	13	* Mnemonic names for arguments to memcpy/__copy_user
	14	*/
	15	#include <linux/config.h>
	16	#include <asm/asm.h>
	17	#include <asm/offset.h>
	18	#include <asm/regdef.h>
	19
	20	#define dst a0
	21	#define src a1
	22	#define len a2
	23
	24	/*
	25	* Spec
	26	*
	27	* memcpy copies len bytes from src to dst and sets v0 to dst.
	28	* It assumes that
	29	* - src and dst don't overlap
	30	* - src is readable
	31	* - dst is writable
	32	* memcpy uses the standard calling convention
	33	*
	34	* __copy_user copies up to len bytes from src to dst and sets a2 (len) to
	35	* the number of uncopied bytes due to an exception caused by a read or write.
	36	* __copy_user assumes that src and dst don't overlap, and that the call is
	37	* implementing one of the following:
	38	* copy_to_user
	39	* - src is readable (no exceptions when reading src)
	40	* copy_from_user
	41	* - dst is writable (no exceptions when writing dst)
	42	* __copy_user uses a non-standard calling convention; see
	43	* include/asm-mips/uaccess.h
	44	*
	45	* When an exception happens on a load, the handler must
	46	* ensure that all of the destination buffer is overwritten to prevent
	47	* leaking information to user mode programs.
	48	*/
	49
	50	/*
	51	* Implementation
	52	*/
	53
	54	/*
	55	* The exception handler for loads requires that:
	56	* 1- AT contain the address of the byte just past the end of the source
	57	* of the copy,
	58	* 2- src_entry <= src < AT, and
	59	* 3- (dst - src) == (dst_entry - src_entry),
	60	* The _entry suffix denotes values when __copy_user was called.
	61	*
	62	* (1) is set up by uaccess.h and maintained by not writing AT in copy_user
	63	* (2) is met by incrementing src by the number of bytes copied
	64	* (3) is met by not doing loads between a pair of increments of dst and src
	65	*
	66	* The exception handlers for stores adjust len (if necessary) and return.
	67	* These handlers do not need to overwrite any data.
	68	*
	69	* For __rmemcpy and memmove an exception is always a kernel bug, therefore
	70	* they're not protected.
	71	*/
	72
	73	#define EXC(inst_reg,addr,handler) \
	74	9: inst_reg, addr; \
	75	.section __ex_table,"a"; \
	76	PTR 9b, handler; \
	77	.previous
	78
	79	/*
	80	* Only on the 64-bit kernel can we make use of 64-bit registers.
	81	*/
	82	#ifdef CONFIG_MIPS64
	83	#define USE_DOUBLE
	84	#endif
	85
	86	#ifdef USE_DOUBLE
	87
	88	#define LOAD ld
	89	#define LOADL ldl
	90	#define LOADR ldr
	91	#define STOREL sdl
	92	#define STORER sdr
	93	#define STORE sd
	94	#define ADD daddu
	95	#define SUB dsubu
	96	#define SRL dsrl
	97	#define SRA dsra
	98	#define SLL dsll
	99	#define SLLV dsllv
	100	#define SRLV dsrlv
	101	#define NBYTES 8
	102	#define LOG_NBYTES 3
	103
	104	/*
	105	* As we are sharing the code base with the mips32 tree (which uses the o32
	106	* ABI register definitions), we need to redefine the register definitions
	107	* from the n64 ABI register naming to the o32 ABI register naming.
	108	*/
	109	#undef t0
	110	#undef t1
	111	#undef t2
	112	#undef t3
	113	#define t0 $8
	114	#define t1 $9
	115	#define t2 $10
	116	#define t3 $11
	117	#define t4 $12
	118	#define t5 $13
	119	#define t6 $14
	120	#define t7 $15
	121
	122	#else
	123
	124	#define LOAD lw
	125	#define LOADL lwl
	126	#define LOADR lwr
	127	#define STOREL swl
	128	#define STORER swr
	129	#define STORE sw
	130	#define ADD addu
	131	#define SUB subu
	132	#define SRL srl
	133	#define SLL sll
	134	#define SRA sra
	135	#define SLLV sllv
	136	#define SRLV srlv
	137	#define NBYTES 4
	138	#define LOG_NBYTES 2
	139
	140	#endif /* USE_DOUBLE */
	141
	142	#ifdef CONFIG_CPU_LITTLE_ENDIAN
	143	#define LDFIRST LOADR
	144	#define LDREST LOADL
	145	#define STFIRST STORER
	146	#define STREST STOREL
	147	#define SHIFT_DISCARD SLLV
	148	#else
	149	#define LDFIRST LOADL
	150	#define LDREST LOADR
	151	#define STFIRST STOREL
	152	#define STREST STORER
	153	#define SHIFT_DISCARD SRLV
	154	#endif
	155
	156	#define FIRST(unit) ((unit)*NBYTES)
	157	#define REST(unit) (FIRST(unit)+NBYTES-1)
	158	#define UNIT(unit) FIRST(unit)
	159
	160	#define ADDRMASK (NBYTES-1)
	161
	162	.text
	163	.set noreorder
	164	.set noat
	165
	166	/*
	167	* A combined memcpy/__copy_user
	168	* __copy_user sets len to 0 for success; else to an upper bound of
	169	* the number of uncopied bytes.
	170	* memcpy sets v0 to dst.
	171	*/
	172	.align 5
	173	LEAF(memcpy) /* a0=dst a1=src a2=len */
	174	move v0, dst /* return value */
	175	__memcpy:
	176	FEXPORT(__copy_user)
	177	/*
	178	* Note: dst & src may be unaligned, len may be 0
	179	* Temps
	180	*/
	181	#define rem t8
	182
	183	/*
	184	* The "issue break"s below are very approximate.
	185	* Issue delays for dcache fills will perturb the schedule, as will
	186	* load queue full replay traps, etc.
	187	*
	188	* If len < NBYTES use byte operations.
	189	*/
	190	PREF( 0, 0(src) )
	191	PREF( 1, 0(dst) )
	192	sltu t2, len, NBYTES
	193	and t1, dst, ADDRMASK
	194	PREF( 0, 1*32(src) )
	195	PREF( 1, 1*32(dst) )
	196	bnez t2, copy_bytes_checklen
	197	and t0, src, ADDRMASK
	198	PREF( 0, 2*32(src) )
	199	PREF( 1, 2*32(dst) )
	200	bnez t1, dst_unaligned
	201	nop
	202	bnez t0, src_unaligned_dst_aligned
	203	/*
	204	* use delay slot for fall-through
	205	* src and dst are aligned; need to compute rem
	206	*/
	207	both_aligned:
	208	SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
	209	beqz t0, cleanup_both_aligned # len < 8*NBYTES
	210	and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
	211	PREF( 0, 3*32(src) )
	212	PREF( 1, 3*32(dst) )
	213	.align 4
	214	1:
	215	EXC( LOAD t0, UNIT(0)(src), l_exc)
	216	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
	217	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
	218	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
	219	SUB len, len, 8*NBYTES
	220	EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
	221	EXC( LOAD t7, UNIT(5)(src), l_exc_copy)
	222	EXC( STORE t0, UNIT(0)(dst), s_exc_p8u)
	223	EXC( STORE t1, UNIT(1)(dst), s_exc_p7u)
	224	EXC( LOAD t0, UNIT(6)(src), l_exc_copy)
	225	EXC( LOAD t1, UNIT(7)(src), l_exc_copy)
	226	ADD src, src, 8*NBYTES
	227	ADD dst, dst, 8*NBYTES
	228	EXC( STORE t2, UNIT(-6)(dst), s_exc_p6u)
	229	EXC( STORE t3, UNIT(-5)(dst), s_exc_p5u)
	230	EXC( STORE t4, UNIT(-4)(dst), s_exc_p4u)
	231	EXC( STORE t7, UNIT(-3)(dst), s_exc_p3u)
	232	EXC( STORE t0, UNIT(-2)(dst), s_exc_p2u)
	233	EXC( STORE t1, UNIT(-1)(dst), s_exc_p1u)
	234	PREF( 0, 8*32(src) )
	235	PREF( 1, 8*32(dst) )
	236	bne len, rem, 1b
	237	nop
	238
	239	/*
	240	* len == rem == the number of bytes left to copy < 8*NBYTES
	241	*/
	242	cleanup_both_aligned:
	243	beqz len, done
	244	sltu t0, len, 4*NBYTES
	245	bnez t0, less_than_4units
	246	and rem, len, (NBYTES-1) # rem = len % NBYTES
	247	/*
	248	* len >= 4*NBYTES
	249	*/
	250	EXC( LOAD t0, UNIT(0)(src), l_exc)
	251	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
	252	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
	253	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
	254	SUB len, len, 4*NBYTES
	255	ADD src, src, 4*NBYTES
	256	EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
	257	EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
	258	EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
	259	EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
	260	beqz len, done
	261	ADD dst, dst, 4*NBYTES
	262	less_than_4units:
	263	/*
	264	* rem = len % NBYTES
	265	*/
	266	beq rem, len, copy_bytes
	267	nop
	268	1:
	269	EXC( LOAD t0, 0(src), l_exc)
	270	ADD src, src, NBYTES
	271	SUB len, len, NBYTES
	272	EXC( STORE t0, 0(dst), s_exc_p1u)
	273	bne rem, len, 1b
	274	ADD dst, dst, NBYTES
	275
	276	/*
	277	* src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	278	* A loop would do only a byte at a time with possible branch
	279	* mispredicts. Can't do an explicit LOAD dst,mask,or,STORE
	280	* because can't assume read-access to dst. Instead, use
	281	* STREST dst, which doesn't require read access to dst.
	282	*
	283	* This code should perform better than a simple loop on modern,
	284	* wide-issue mips processors because the code has fewer branches and
	285	* more instruction-level parallelism.
	286	*/
	287	#define bits t2
	288	beqz len, done
	289	ADD t1, dst, len # t1 is just past last byte of dst
	290	li bits, 8*NBYTES
	291	SLL rem, len, 3 # rem = number of bits to keep
	292	EXC( LOAD t0, 0(src), l_exc)
	293	SUB bits, bits, rem # bits = number of bits to discard
	294	SHIFT_DISCARD t0, t0, bits
	295	EXC( STREST t0, -1(t1), s_exc)
	296	jr ra
	297	move len, zero
	298	dst_unaligned:
	299	/*
	300	* dst is unaligned
	301	* t0 = src & ADDRMASK
	302	* t1 = dst & ADDRMASK; T1 > 0
	303	* len >= NBYTES
	304	*
	305	* Copy enough bytes to align dst
	306	* Set match = (src and dst have same alignment)
	307	*/
	308	#define match rem
	309	EXC( LDFIRST t3, FIRST(0)(src), l_exc)
	310	ADD t2, zero, NBYTES
	311	EXC( LDREST t3, REST(0)(src), l_exc_copy)
	312	SUB t2, t2, t1 # t2 = number of bytes copied
	313	xor match, t0, t1
	314	EXC( STFIRST t3, FIRST(0)(dst), s_exc)
	315	beq len, t2, done
	316	SUB len, len, t2
	317	ADD dst, dst, t2
	318	beqz match, both_aligned
	319	ADD src, src, t2
	320
	321	src_unaligned_dst_aligned:
	322	SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
	323	PREF( 0, 3*32(src) )
	324	beqz t0, cleanup_src_unaligned
	325	and rem, len, (4*NBYTES-1) # rem = len % (4*NBYTES)
	326	PREF( 1, 3*32(dst) )
	327	1:
	328	/*
	329	* Avoid consecutive LD*'s to the same register since some mips
	330	* implementations can't issue them in the same cycle.
	331	* It's OK to load FIRST(N+1) before REST(N) because the two addresses
	332	* are to the same unit (unless src is aligned, but it's not).
	333	*/
	334	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
	335	EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
	336	SUB len, len, 4*NBYTES
	337	EXC( LDREST t0, REST(0)(src), l_exc_copy)
	338	EXC( LDREST t1, REST(1)(src), l_exc_copy)
	339	EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
	340	EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
	341	EXC( LDREST t2, REST(2)(src), l_exc_copy)
	342	EXC( LDREST t3, REST(3)(src), l_exc_copy)
	343	PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
	344	ADD src, src, 4*NBYTES
	345	#ifdef CONFIG_CPU_SB1
	346	nop # improves slotting
	347	#endif
	348	EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
	349	EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
	350	EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
	351	EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
	352	PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
	353	bne len, rem, 1b
	354	ADD dst, dst, 4*NBYTES
	355
	356	cleanup_src_unaligned:
	357	beqz len, done
	358	and rem, len, NBYTES-1 # rem = len % NBYTES
	359	beq rem, len, copy_bytes
	360	nop
	361	1:
	362	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
	363	EXC( LDREST t0, REST(0)(src), l_exc_copy)
	364	ADD src, src, NBYTES
	365	SUB len, len, NBYTES
	366	EXC( STORE t0, 0(dst), s_exc_p1u)
	367	bne len, rem, 1b
	368	ADD dst, dst, NBYTES
	369
	370	copy_bytes_checklen:
	371	beqz len, done
	372	nop
	373	copy_bytes:
	374	/* 0 < len < NBYTES */
	375	#define COPY_BYTE(N) \
	376	EXC( lb t0, N(src), l_exc); \
	377	SUB len, len, 1; \
	378	beqz len, done; \
	379	EXC( sb t0, N(dst), s_exc_p1)
	380
	381	COPY_BYTE(0)
	382	COPY_BYTE(1)
	383	#ifdef USE_DOUBLE
	384	COPY_BYTE(2)
	385	COPY_BYTE(3)
	386	COPY_BYTE(4)
	387	COPY_BYTE(5)
	388	#endif
	389	EXC( lb t0, NBYTES-2(src), l_exc)
	390	SUB len, len, 1
	391	jr ra
	392	EXC( sb t0, NBYTES-2(dst), s_exc_p1)
	393	done:
	394	jr ra
	395	nop
	396	END(memcpy)
	397
	398	l_exc_copy:
	399	/*
	400	* Copy bytes from src until faulting load address (or until a
	401	* lb faults)
	402	*
	403	* When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	404	* may be more than a byte beyond the last address.
	405	* Hence, the lb below may get an exception.
	406	*
	407	* Assumes src < THREAD_BUADDR($28)
	408	*/
	409	LOAD t0, TI_TASK($28)
	410	nop
	411	LOAD t0, THREAD_BUADDR(t0)
	412	1:
	413	EXC( lb t1, 0(src), l_exc)
	414	ADD src, src, 1
	415	sb t1, 0(dst) # can't fault -- we're copy_from_user
	416	bne src, t0, 1b
	417	ADD dst, dst, 1
	418	l_exc:
	419	LOAD t0, TI_TASK($28)
	420	nop
	421	LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
	422	nop
	423	SUB len, AT, t0 # len number of uncopied bytes
	424	/*
	425	* Here's where we rely on src and dst being incremented in tandem,
	426	* See (3) above.
	427	* dst += (fault addr - src) to put dst at first byte to clear
	428	*/
	429	ADD dst, t0 # compute start address in a0 (dst)
	430	SUB dst, src
	431	/*
	432	* Clear len bytes starting at dst. Can't call __bzero because it
	433	* might modify len. An inefficient loop for these rare times...
	434	*/
	435	beqz len, done
	436	SUB src, len, 1
	437	1: sb zero, 0(dst)
	438	ADD dst, dst, 1
	439	bnez src, 1b
	440	SUB src, src, 1
	441	jr ra
	442	nop
	443
	444
	445	#define SEXC(n) \
	446	s_exc_p ## n ## u: \
	447	jr ra; \
	448	ADD len, len, n*NBYTES
	449
	450	SEXC(8)
	451	SEXC(7)
	452	SEXC(6)
	453	SEXC(5)
	454	SEXC(4)
	455	SEXC(3)
	456	SEXC(2)
	457	SEXC(1)
	458
	459	s_exc_p1:
	460	jr ra
	461	ADD len, len, 1
	462	s_exc:
	463	jr ra
	464	nop
	465
	466	.align 5
	467	LEAF(memmove)
	468	ADD t0, a0, a2
	469	ADD t1, a1, a2
	470	sltu t0, a1, t0 # dst + len <= src -> memcpy
	471	sltu t1, a0, t1 # dst >= src + len -> memcpy
	472	and t0, t1
	473	beqz t0, __memcpy
	474	move v0, a0 /* return value */
	475	beqz a2, r_out
	476	END(memmove)
	477
	478	/* fall through to __rmemcpy */
	479	LEAF(__rmemcpy) /* a0=dst a1=src a2=len */
	480	sltu t0, a1, a0
	481	beqz t0, r_end_bytes_up # src >= dst
	482	nop
	483	ADD a0, a2 # dst = dst + len
	484	ADD a1, a2 # src = src + len
	485
	486	r_end_bytes:
	487	lb t0, -1(a1)
	488	SUB a2, a2, 0x1
	489	sb t0, -1(a0)
	490	SUB a1, a1, 0x1
	491	bnez a2, r_end_bytes
	492	SUB a0, a0, 0x1
	493
	494	r_out:
	495	jr ra
	496	move a2, zero
	497
	498	r_end_bytes_up:
	499	lb t0, (a1)
	500	SUB a2, a2, 0x1
	501	sb t0, (a0)
	502	ADD a1, a1, 0x1
	503	bnez a2, r_end_bytes_up
	504	ADD a0, a0, 0x1
	505
	506	jr ra
	507	move a2, zero
	508	END(__rmemcpy)