[MIPS] Fixup copy_from_user_inatomic

From the 01408c4939479ec46c15aa7ef6e2406be50eeeca log message: The problem is that when we write to a file, the copy from userspace to pagecache is first done with preemption disabled, so if the source address is not immediately available the copy fails *and* *zeros* *the* *destination*. This is a problem because a concurrent read (which admittedly is an odd thing to do) might see zeros rather than what was there before the write, or what was there after, or some mixture of the two (any of these being a reasonable thing to see). If the copy did fail, it will immediately be retried with preemption re-enabled so any transient problem with accessing the source won't cause an error. The first copying does not need to zero any uncopied bytes, and doing so causes the problem. It uses copy_from_user_atomic rather than copy_from_user so the simple expedient is to change copy_from_user_atomic to *not* zero out bytes on failure. < --- end cite --- > This patch finally implements at least a not so pretty solution by duplicating the relevant part of __copy_user. Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
author: Ralf Baechle <ralf@linux-mips.org> 2007-02-19 11:59:24 -0500
committer: Ralf Baechle <ralf@linux-mips.org> 2007-02-19 20:26:42 -0500
commit: e03b526932a9ae1ff20b47459c040f3c6407f625 (patch)
tree: 6c1753fc5a0497621b05c7dae9d3d686503bc5d7 /arch
parent: 269dd2b2526d046d8b43554ff27b486e2ddb3f08 (diff)
2 files changed, 437 insertions, 1 deletions
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index 9e5d985936b3..2453ea244cb8 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -2,7 +2,7 @@
 # Makefile for MIPS-specific library files..
 #
-lib-y   += csum_partial.o memcpy.o memset.o promlib.o \
+lib-y   += csum_partial.o memcpy.o memcpy-inatomic.o memset.o promlib.o \
           strlen_user.o strncpy_user.o strnlen_user.o uncached.o
 obj-y                   += iomap.o
diff --git a/arch/mips/lib/memcpy-inatomic.S b/arch/mips/lib/memcpy-inatomic.S
new file mode 100644
index 000000000000..3a534b2baa0f
--- /dev/null
+++ b/arch/mips/lib/memcpy-inatomic.S
@@ -0,0 +1,436 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * __copy_user_inatomic, duplicated from the unified memcpy/__copy_user code.
+ *
+ * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
+ * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
+ * Copyright (C) 2002 Broadcom, Inc.
+ *   memcpy/copy_user author: Mark Vandevoorde
+ *
+ * Mnemonic names for arguments to memcpy/__copy_user
+ */
+/*
+ * Hack to resolve longstanding prefetch issue
+ *
+ * Prefetching may be fatal on some systems if we're prefetching beyond the
+ * end of memory.  It's also a seriously bad idea on systems that are not
+ * dma-coherent.
+ */
+#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
+#undef CONFIG_CPU_HAS_PREFETCH
+#endif
+#ifdef CONFIG_MIPS_MALTA
+#undef CONFIG_CPU_HAS_PREFETCH
+#endif
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/regdef.h>
+#define dst a0          /* destination address */
+#define src a1          /* source address */
+#define len a2          /* byte count on entry; uncopied byte count on return */
+/*
+ * Spec
+ *
+ * memcpy copies len bytes from src to dst and sets v0 to dst.
+ * It assumes that
+ *   - src and dst don't overlap
+ *   - src is readable
+ *   - dst is writable
+ * memcpy uses the standard calling convention
+ *
+ * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
+ * the number of uncopied bytes due to an exception caused by a read or write.
+ * __copy_user assumes that src and dst don't overlap, and that the call is
+ * implementing one of the following:
+ *   copy_to_user
+ *     - src is readable  (no exceptions when reading src)
+ *   copy_from_user
+ *     - dst is writable  (no exceptions when writing dst)
+ * __copy_user uses a non-standard calling convention; see
+ * include/asm-mips/uaccess.h
+ *
+ * When an exception happens on a load, the handler must
+ * ensure that all of the destination buffer is overwritten to prevent
+ * leaking information to user mode programs.
+ */
+/*
+ * Implementation
+ */
+/*
+ * The exception handler for loads requires that:
+ *  1- AT contain the address of the byte just past the end of the source
+ *     of the copy,
+ *  2- src_entry <= src < AT, and
+ *  3- (dst - src) == (dst_entry - src_entry),
+ * The _entry suffix denotes values when __copy_user was called.
+ *
+ * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
+ * (2) is met by incrementing src by the number of bytes copied
+ * (3) is met by not doing loads between a pair of increments of dst and src
+ *
+ * The exception handlers for stores adjust len (if necessary) and return.
+ * These handlers do not need to overwrite any data.
+ *
+ * For __rmemcpy and memmove an exception is always a kernel bug, therefore
+ * they're not protected.
+ */
+#define EXC(inst_reg,addr,handler)      /* guarded access     */ \
+9:      inst_reg, addr;                 /* the access itself  */ \
+        .section __ex_table,"a";        /* switch to table    */ \
+        PTR     9b, handler;            /* (insn, handler)    */ \
+        .previous                       /* back to prior sect */
+/*
+ * Only on the 64-bit kernel can we make use of 64-bit registers.
+ */
+#ifdef CONFIG_64BIT
+#define USE_DOUBLE      /* selects the 64-bit mnemonics below */
+#endif
+#ifdef USE_DOUBLE
+#define LOAD   ld
+#define LOADL  ldl
+#define LOADR  ldr
+#define STOREL sdl
+#define STORER sdr
+#define STORE  sd
+#define ADD    daddu
+#define SUB    dsubu
+#define SRL    dsrl
+#define SRA    dsra
+#define SLL    dsll
+#define SLLV   dsllv
+#define SRLV   dsrlv
+#define NBYTES 8        /* bytes moved by one LOAD/STORE */
+#define LOG_NBYTES 3    /* log2(NBYTES) */
+/*
+ * As we are sharing a code base with the mips32 tree (which uses the o32 ABI
+ * register definitions), we need to redefine the register definitions from
+ * the n64 ABI register naming to the o32 ABI register naming.
+ */
+#undef t0
+#undef t1
+#undef t2
+#undef t3
+#define t0      $8
+#define t1      $9
+#define t2      $10
+#define t3      $11
+#define t4      $12
+#define t5      $13
+#define t6      $14
+#define t7      $15
+#else   /* !USE_DOUBLE */
+#define LOAD   lw
+#define LOADL  lwl
+#define LOADR  lwr
+#define STOREL swl
+#define STORER swr
+#define STORE  sw
+#define ADD    addu
+#define SUB    subu
+#define SRL    srl
+#define SLL    sll
+#define SRA    sra
+#define SLLV   sllv
+#define SRLV   srlv
+#define NBYTES 4        /* bytes moved by one LOAD/STORE */
+#define LOG_NBYTES 2    /* log2(NBYTES) */
+#endif /* USE_DOUBLE */
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+#define LDFIRST LOADR           /* used with FIRST() offsets */
+#define LDREST  LOADL           /* used with REST() offsets */
+#define STFIRST STORER
+#define STREST  STOREL
+#define SHIFT_DISCARD SLLV      /* shift that drops the unwanted bits of a partial unit */
+#else   /* !CONFIG_CPU_LITTLE_ENDIAN */
+#define LDFIRST LOADL           /* used with FIRST() offsets */
+#define LDREST  LOADR           /* used with REST() offsets */
+#define STFIRST STOREL
+#define STREST  STORER
+#define SHIFT_DISCARD SRLV      /* shift that drops the unwanted bits of a partial unit */
+#endif
+#define FIRST(unit) ((unit)*NBYTES)             /* offset of the first byte of a unit */
+#define REST(unit)  (FIRST(unit)+NBYTES-1)      /* offset of the last byte of a unit */
+#define UNIT(unit)  FIRST(unit)                 /* offset used for whole-unit accesses */
+#define ADDRMASK (NBYTES-1)                     /* address bits below unit alignment */
+        .text
+        .set    noreorder
+        .set    noat
+/*
+ * __copy_user_inatomic, derived from the combined memcpy/__copy_user.
+ * It sets len to 0 for success; else to an upper bound of
+ * the number of uncopied bytes.
+ * Unlike memcpy, it does not set v0 to dst.
+ */
+        .align  5
+LEAF(__copy_user_inatomic)
+        /*
+         * Entry: dst (a0), src (a1), len (a2); dst & src may be unaligned,
+         * len may be 0.  Scratch: t0-t4, t7 and the named temps defined below.
+         */
+#define rem t8          /* byte count the current loop leaves for later stages */
+        /*
+         * The "issue break"s below are very approximate.
+         * Issue delays for dcache fills will perturb the schedule, as will
+         * load queue full replay traps, etc.
+         *
+         * If len < NBYTES use byte operations.
+         */
+        PREF(   0, 0(src) )
+        PREF(   1, 0(dst) )
+        sltu    t2, len, NBYTES         # t2 = (len < NBYTES)
+        and     t1, dst, ADDRMASK       # t1 = dst offset within a unit
+        PREF(   0, 1*32(src) )
+        PREF(   1, 1*32(dst) )
+        bnez    t2, copy_bytes_checklen
+         and    t0, src, ADDRMASK       # delay slot: t0 = src offset within a unit
+        PREF(   0, 2*32(src) )
+        PREF(   1, 2*32(dst) )
+        bnez    t1, dst_unaligned
+         nop
+        bnez    t0, src_unaligned_dst_aligned
+        /*
+         * use delay slot for fall-through
+         * src and dst are aligned; need to compute rem
+         */
+both_aligned:
+         SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
+        beqz    t0, cleanup_both_aligned # len < 8*NBYTES
+         and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
+        PREF(   0, 3*32(src) )
+        PREF(   1, 3*32(dst) )
+        .align  4
+1:
+EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
+EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
+EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
+EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
+        SUB     len, len, 8*NBYTES      # early decrement is fine: on a fault l_exc recomputes len
+EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
+EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
+        STORE   t0, UNIT(0)(dst)
+        STORE   t1, UNIT(1)(dst)
+EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)
+EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
+        ADD     src, src, 8*NBYTES
+        ADD     dst, dst, 8*NBYTES
+        STORE   t2, UNIT(-6)(dst)       # negative units: dst has already been advanced
+        STORE   t3, UNIT(-5)(dst)
+        STORE   t4, UNIT(-4)(dst)
+        STORE   t7, UNIT(-3)(dst)
+        STORE   t0, UNIT(-2)(dst)
+        STORE   t1, UNIT(-1)(dst)
+        PREF(   0, 8*32(src) )
+        PREF(   1, 8*32(dst) )
+        bne     len, rem, 1b            # loop until only rem bytes are left
+         nop
+        /*
+         * len == rem == the number of bytes left to copy < 8*NBYTES
+         */
+cleanup_both_aligned:
+        beqz    len, done
+         sltu   t0, len, 4*NBYTES       # delay slot: t0 = (len < 4*NBYTES)
+        bnez    t0, less_than_4units
+         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
+        /*
+         * len >= 4*NBYTES
+         */
+EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
+EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
+EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
+EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
+        SUB     len, len, 4*NBYTES
+        ADD     src, src, 4*NBYTES
+        STORE   t0, UNIT(0)(dst)
+        STORE   t1, UNIT(1)(dst)
+        STORE   t2, UNIT(2)(dst)
+        STORE   t3, UNIT(3)(dst)
+        beqz    len, done
+         ADD    dst, dst, 4*NBYTES      # delay slot: executed on both paths
+less_than_4units:
+        /*
+         * rem = len % NBYTES
+         */
+        beq     rem, len, copy_bytes    # no whole unit left: go copy single bytes
+         nop
+1:
+EXC(    LOAD    t0, 0(src),             l_exc)
+        ADD     src, src, NBYTES
+        SUB     len, len, NBYTES
+        STORE   t0, 0(dst)
+        bne     rem, len, 1b            # one unit per iteration until len == rem
+         ADD    dst, dst, NBYTES
+        /*
+         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
+         * A loop would do only a byte at a time with possible branch
+         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
+         * because can't assume read-access to dst.  Instead, use
+         * STREST dst, which doesn't require read access to dst.
+         *
+         * This code should perform better than a simple loop on modern,
+         * wide-issue mips processors because the code has fewer branches and
+         * more instruction-level parallelism.
+         */
+#define bits t2
+        beqz    len, done
+         ADD    t1, dst, len    # t1 is just past last byte of dst
+        li      bits, 8*NBYTES
+        SLL     rem, len, 3     # rem = number of bits to keep
+EXC(    LOAD    t0, 0(src),             l_exc)
+        SUB     bits, bits, rem # bits = number of bits to discard
+        SHIFT_DISCARD t0, t0, bits
+        STREST  t0, -1(t1)      # partial store ending at the last dst byte
+        jr      ra
+         move   len, zero       # delay slot: report success (len = 0)
+dst_unaligned:
+        /*
+         * dst is unaligned
+         * t0 = src & ADDRMASK
+         * t1 = dst & ADDRMASK; t1 > 0
+         * len >= NBYTES
+         *
+         * Copy enough bytes to align dst
+         * Set match = 0 iff src and dst have the same alignment (t0 ^ t1)
+         */
+#define match rem
+EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
+        ADD     t2, zero, NBYTES
+EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
+        SUB     t2, t2, t1      # t2 = number of bytes copied
+        xor     match, t0, t1   # zero when both offsets are equal
+        STFIRST t3, FIRST(0)(dst)       # stores the t2 leading bytes that align dst
+        beq     len, t2, done
+         SUB    len, len, t2    # delay slot: executed on both paths
+        ADD     dst, dst, t2
+        beqz    match, both_aligned
+         ADD    src, src, t2    # delay slot: executed on both paths
+src_unaligned_dst_aligned:
+        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
+        PREF(   0, 3*32(src) )
+        beqz    t0, cleanup_src_unaligned
+         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
+        PREF(   1, 3*32(dst) )
+1:
+/*
+ * Avoid consecutive LD*'s to the same register since some mips
+ * implementations can't issue them in the same cycle.
+ * It's OK to load FIRST(N+1) before REST(N) because the two addresses
+ * are to the same unit (unless src is aligned, but it's not).
+ */
+EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
+EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
+        SUB     len, len, 4*NBYTES
+EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
+EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
+EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
+EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
+EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
+EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
+        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
+        ADD     src, src, 4*NBYTES
+#ifdef CONFIG_CPU_SB1
+        nop                             # improves slotting
+#endif
+        STORE   t0, UNIT(0)(dst)        # dst is aligned on this path: whole-unit stores
+        STORE   t1, UNIT(1)(dst)
+        STORE   t2, UNIT(2)(dst)
+        STORE   t3, UNIT(3)(dst)
+        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
+        bne     len, rem, 1b
+         ADD    dst, dst, 4*NBYTES
+cleanup_src_unaligned:
+        beqz    len, done
+         and    rem, len, NBYTES-1  # rem = len % NBYTES
+        beq     rem, len, copy_bytes
+         nop
+1:
+EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
+EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
+        ADD     src, src, NBYTES
+        SUB     len, len, NBYTES
+        STORE   t0, 0(dst)
+        bne     len, rem, 1b
+         ADD    dst, dst, NBYTES
+copy_bytes_checklen:            # also entered by fall-through from the loop above
+        beqz    len, done
+         nop
+copy_bytes:
+        /* 0 < len < NBYTES: copy one byte at a time, unrolled */
+#define COPY_BYTE(N)                    \
+EXC(    lb      t0, N(src), l_exc);     \
+        SUB     len, len, 1;            \
+        beqz    len, done;              \
+         sb     t0, N(dst)
+        COPY_BYTE(0)
+        COPY_BYTE(1)
+#ifdef USE_DOUBLE
+        COPY_BYTE(2)
+        COPY_BYTE(3)
+        COPY_BYTE(4)
+        COPY_BYTE(5)
+#endif
+EXC(    lb      t0, NBYTES-2(src), l_exc)
+        SUB     len, len, 1     # last byte: len is 1 here given 0 < len < NBYTES on entry
+        jr      ra
+         sb     t0, NBYTES-2(dst)       # delay slot: store the final byte
+done:
+        jr      ra
+         nop
+        END(__copy_user_inatomic)
+l_exc_copy:
+        /*
+         * Copy bytes from src until faulting load address (or until a
+         * lb faults)
+         *
+         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
+         * may be more than a byte beyond the last address.
+         * Hence, the lb below may get an exception.
+         *
+         * Assumes src < THREAD_BUADDR($28)
+         */
+        LOAD    t0, TI_TASK($28)        # TI_TASK field relative to $28 (presumably the current task)
+         nop
+        LOAD    t0, THREAD_BUADDR(t0)   # t0 = recorded fault address described above
+1:
+EXC(    lb      t1, 0(src),     l_exc)
+        ADD     src, src, 1
+        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
+        bne     src, t0, 1b     # stop once src reaches the recorded address
+         ADD    dst, dst, 1     # delay slot; afterwards execution falls through into l_exc
+l_exc:
+        LOAD    t0, TI_TASK($28)
+         nop
+        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
+         nop
+        SUB     len, AT, t0             # len number of uncopied bytes
+        jr      ra                      # return; the rest of dst is left as is (no zeroing)
+         nop
author	Ralf Baechle <ralf@linux-mips.org>	2007-02-19 11:59:24 -0500
committer	Ralf Baechle <ralf@linux-mips.org>	2007-02-19 20:26:42 -0500
commit	e03b526932a9ae1ff20b47459c040f3c6407f625 (patch)
tree	6c1753fc5a0497621b05c7dae9d3d686503bc5d7 /arch
parent	269dd2b2526d046d8b43554ff27b486e2ddb3f08 (diff)

diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile index 9e5d985936b3..2453ea244cb8 100644 --- a/arch/mips/lib/Makefile +++ b/arch/mips/lib/Makefile
@@ -2,7 +2,7 @@
2	# Makefile for MIPS-specific library files..	2	# Makefile for MIPS-specific library files..
3	#	3	#
4		4
5	lib-y += csum_partial.o memcpy.o memset.o promlib.o \	5	lib-y += csum_partial.o memcpy.o memcpy-inatomic.o memset.o promlib.o \
6	strlen_user.o strncpy_user.o strnlen_user.o uncached.o	6	strlen_user.o strncpy_user.o strnlen_user.o uncached.o
7		7
8	obj-y += iomap.o	8	obj-y += iomap.o


diff --git a/arch/mips/lib/memcpy-inatomic.S b/arch/mips/lib/memcpy-inatomic.S new file mode 100644 index 000000000000..3a534b2baa0f --- /dev/null +++ b/arch/mips/lib/memcpy-inatomic.S
@@ -0,0 +1,436 @@
		1	/*
		2	* This file is subject to the terms and conditions of the GNU General Public
		3	* License. See the file "COPYING" in the main directory of this archive
		4	* for more details.
		5	*
		6	* Unified implementation of memcpy, memmove and the __copy_user backend.
		7	*
		8	* Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
		9	* Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
		10	* Copyright (C) 2002 Broadcom, Inc.
		11	* memcpy/copy_user author: Mark Vandevoorde
		12	*
		13	* Mnemonic names for arguments to memcpy/__copy_user
		14	*/
		15
		16	/*
		17	* Hack to resolve longstanding prefetch issue
		18	*
		19	* Prefetching may be fatal on some systems if we're prefetching beyond the
		20	* end of memory on some systems. It's also a seriously bad idea on non
		21	* dma-coherent systems.
		22	*/
		23	#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
		24	#undef CONFIG_CPU_HAS_PREFETCH
		25	#endif
		26	#ifdef CONFIG_MIPS_MALTA
		27	#undef CONFIG_CPU_HAS_PREFETCH
		28	#endif
		29
		30	#include <asm/asm.h>
		31	#include <asm/asm-offsets.h>
		32	#include <asm/regdef.h>
		33
		34	#define dst a0
		35	#define src a1
		36	#define len a2
		37
		38	/*
		39	* Spec
		40	*
		41	* memcpy copies len bytes from src to dst and sets v0 to dst.
		42	* It assumes that
		43	* - src and dst don't overlap
		44	* - src is readable
		45	* - dst is writable
		46	* memcpy uses the standard calling convention
		47	*
		48	* __copy_user copies up to len bytes from src to dst and sets a2 (len) to
		49	* the number of uncopied bytes due to an exception caused by a read or write.
		50	* __copy_user assumes that src and dst don't overlap, and that the call is
		51	* implementing one of the following:
		52	* copy_to_user
		53	* - src is readable (no exceptions when reading src)
		54	* copy_from_user
		55	* - dst is writable (no exceptions when writing dst)
		56	* __copy_user uses a non-standard calling convention; see
		57	* include/asm-mips/uaccess.h
		58	*
		59	* When an exception happens on a load, the handler must
		60	* ensure that all of the destination buffer is overwritten to prevent
		61	* leaking information to user mode programs.
		62	*/
		63
		64	/*
		65	* Implementation
		66	*/
		67
		68	/*
		69	* The exception handler for loads requires that:
		70	* 1- AT contain the address of the byte just past the end of the source
		71	* of the copy,
		72	* 2- src_entry <= src < AT, and
		73	* 3- (dst - src) == (dst_entry - src_entry),
		74	* The _entry suffix denotes values when __copy_user was called.
		75	*
		76	* (1) is set up by uaccess.h and maintained by not writing AT in copy_user
		77	* (2) is met by incrementing src by the number of bytes copied
		78	* (3) is met by not doing loads between a pair of increments of dst and src
		79	*
		80	* The exception handlers for stores adjust len (if necessary) and return.
		81	* These handlers do not need to overwrite any data.
		82	*
		83	* For __rmemcpy and memmove an exception is always a kernel bug, therefore
		84	* they're not protected.
		85	*/
		86
		87	#define EXC(inst_reg,addr,handler) \
		88	9: inst_reg, addr; \
		89	.section __ex_table,"a"; \
		90	PTR 9b, handler; \
		91	.previous
		92
		93	/*
		94	* Only on the 64-bit kernel can we make use of 64-bit registers.
		95	*/
		96	#ifdef CONFIG_64BIT
		97	#define USE_DOUBLE
		98	#endif
		99
		100	#ifdef USE_DOUBLE
		101
		102	#define LOAD ld
		103	#define LOADL ldl
		104	#define LOADR ldr
		105	#define STOREL sdl
		106	#define STORER sdr
		107	#define STORE sd
		108	#define ADD daddu
		109	#define SUB dsubu
		110	#define SRL dsrl
		111	#define SRA dsra
		112	#define SLL dsll
		113	#define SLLV dsllv
		114	#define SRLV dsrlv
		115	#define NBYTES 8
		116	#define LOG_NBYTES 3
		117
		118	/*
		119	* As we are sharing code base with the mips32 tree (which use the o32 ABI
		120	* register definitions). We need to redefine the register definitions from
		121	* the n64 ABI register naming to the o32 ABI register naming.
		122	*/
		123	#undef t0
		124	#undef t1
		125	#undef t2
		126	#undef t3
		127	#define t0 $8
		128	#define t1 $9
		129	#define t2 $10
		130	#define t3 $11
		131	#define t4 $12
		132	#define t5 $13
		133	#define t6 $14
		134	#define t7 $15
		135
		136	#else
		137
		138	#define LOAD lw
		139	#define LOADL lwl
		140	#define LOADR lwr
		141	#define STOREL swl
		142	#define STORER swr
		143	#define STORE sw
		144	#define ADD addu
		145	#define SUB subu
		146	#define SRL srl
		147	#define SLL sll
		148	#define SRA sra
		149	#define SLLV sllv
		150	#define SRLV srlv
		151	#define NBYTES 4
		152	#define LOG_NBYTES 2
		153
		154	#endif /* USE_DOUBLE */
		155
		156	#ifdef CONFIG_CPU_LITTLE_ENDIAN
		157	#define LDFIRST LOADR
		158	#define LDREST LOADL
		159	#define STFIRST STORER
		160	#define STREST STOREL
		161	#define SHIFT_DISCARD SLLV
		162	#else
		163	#define LDFIRST LOADL
		164	#define LDREST LOADR
		165	#define STFIRST STOREL
		166	#define STREST STORER
		167	#define SHIFT_DISCARD SRLV
		168	#endif
		169
		170	#define FIRST(unit) ((unit)*NBYTES)
		171	#define REST(unit) (FIRST(unit)+NBYTES-1)
		172	#define UNIT(unit) FIRST(unit)
		173
		174	#define ADDRMASK (NBYTES-1)
		175
		176	.text
		177	.set noreorder
		178	.set noat
		179
		180	/*
		181	* A combined memcpy/__copy_user
		182	* __copy_user sets len to 0 for success; else to an upper bound of
		183	* the number of uncopied bytes.
		184	* memcpy sets v0 to dst.
		185	*/
		186	.align 5
		187	LEAF(__copy_user_inatomic)
		188	/*
		189	* Note: dst & src may be unaligned, len may be 0
		190	* Temps
		191	*/
		192	#define rem t8
		193
		194	/*
		195	* The "issue break"s below are very approximate.
		196	* Issue delays for dcache fills will perturb the schedule, as will
		197	* load queue full replay traps, etc.
		198	*
		199	* If len < NBYTES use byte operations.
		200	*/
		201	PREF( 0, 0(src) )
		202	PREF( 1, 0(dst) )
		203	sltu t2, len, NBYTES
		204	and t1, dst, ADDRMASK
		205	PREF( 0, 1*32(src) )
		206	PREF( 1, 1*32(dst) )
		207	bnez t2, copy_bytes_checklen
		208	and t0, src, ADDRMASK
		209	PREF( 0, 2*32(src) )
		210	PREF( 1, 2*32(dst) )
		211	bnez t1, dst_unaligned
		212	nop
		213	bnez t0, src_unaligned_dst_aligned
		214	/*
		215	* use delay slot for fall-through
		216	* src and dst are aligned; need to compute rem
		217	*/
		218	both_aligned:
		219	SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
		220	beqz t0, cleanup_both_aligned # len < 8*NBYTES
		221	and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
		222	PREF( 0, 3*32(src) )
		223	PREF( 1, 3*32(dst) )
		224	.align 4
		225	1:
		226	EXC( LOAD t0, UNIT(0)(src), l_exc)
		227	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
		228	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
		229	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
		230	SUB len, len, 8*NBYTES
		231	EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
		232	EXC( LOAD t7, UNIT(5)(src), l_exc_copy)
		233	STORE t0, UNIT(0)(dst)
		234	STORE t1, UNIT(1)(dst)
		235	EXC( LOAD t0, UNIT(6)(src), l_exc_copy)
		236	EXC( LOAD t1, UNIT(7)(src), l_exc_copy)
		237	ADD src, src, 8*NBYTES
		238	ADD dst, dst, 8*NBYTES
		239	STORE t2, UNIT(-6)(dst)
		240	STORE t3, UNIT(-5)(dst)
		241	STORE t4, UNIT(-4)(dst)
		242	STORE t7, UNIT(-3)(dst)
		243	STORE t0, UNIT(-2)(dst)
		244	STORE t1, UNIT(-1)(dst)
		245	PREF( 0, 8*32(src) )
		246	PREF( 1, 8*32(dst) )
		247	bne len, rem, 1b
		248	nop
		249
		250	/*
		251	* len == rem == the number of bytes left to copy < 8*NBYTES
		252	*/
		253	cleanup_both_aligned:
		254	beqz len, done
		255	sltu t0, len, 4*NBYTES
		256	bnez t0, less_than_4units
		257	and rem, len, (NBYTES-1) # rem = len % NBYTES
		258	/*
		259	* len >= 4*NBYTES
		260	*/
		261	EXC( LOAD t0, UNIT(0)(src), l_exc)
		262	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
		263	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
		264	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
		265	SUB len, len, 4*NBYTES
		266	ADD src, src, 4*NBYTES
		267	STORE t0, UNIT(0)(dst)
		268	STORE t1, UNIT(1)(dst)
		269	STORE t2, UNIT(2)(dst)
		270	STORE t3, UNIT(3)(dst)
		271	beqz len, done
		272	ADD dst, dst, 4*NBYTES
		273	less_than_4units:
		274	/*
		275	* rem = len % NBYTES
		276	*/
		277	beq rem, len, copy_bytes
		278	nop
		279	1:
		280	EXC( LOAD t0, 0(src), l_exc)
		281	ADD src, src, NBYTES
		282	SUB len, len, NBYTES
		283	STORE t0, 0(dst)
		284	bne rem, len, 1b
		285	ADD dst, dst, NBYTES
		286
		287	/*
		288	* src and dst are aligned, need to copy rem bytes (rem < NBYTES)
		289	* A loop would do only a byte at a time with possible branch
		290	* mispredicts. Can't do an explicit LOAD dst,mask,or,STORE
		291	* because can't assume read-access to dst. Instead, use
		292	* STREST dst, which doesn't require read access to dst.
		293	*
		294	* This code should perform better than a simple loop on modern,
		295	* wide-issue mips processors because the code has fewer branches and
		296	* more instruction-level parallelism.
		297	*/
		298	#define bits t2
		299	beqz len, done
		300	ADD t1, dst, len # t1 is just past last byte of dst
		301	li bits, 8*NBYTES
		302	SLL rem, len, 3 # rem = number of bits to keep
		303	EXC( LOAD t0, 0(src), l_exc)
		304	SUB bits, bits, rem # bits = number of bits to discard
		305	SHIFT_DISCARD t0, t0, bits
		306	STREST t0, -1(t1)
		307	jr ra
		308	move len, zero
		309	dst_unaligned:
		310	/*
		311	* dst is unaligned
		312	* t0 = src & ADDRMASK
		313	* t1 = dst & ADDRMASK; T1 > 0
		314	* len >= NBYTES
		315	*
		316	* Copy enough bytes to align dst
		317	* Set match = (src and dst have same alignment)
		318	*/
		319	#define match rem
		320	EXC( LDFIRST t3, FIRST(0)(src), l_exc)
		321	ADD t2, zero, NBYTES
		322	EXC( LDREST t3, REST(0)(src), l_exc_copy)
		323	SUB t2, t2, t1 # t2 = number of bytes copied
		324	xor match, t0, t1
		325	STFIRST t3, FIRST(0)(dst)
		326	beq len, t2, done
		327	SUB len, len, t2
		328	ADD dst, dst, t2
		329	beqz match, both_aligned
		330	ADD src, src, t2
		331
		332	src_unaligned_dst_aligned:
		333	SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
		334	PREF( 0, 3*32(src) )
		335	beqz t0, cleanup_src_unaligned
		336	and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
		337	PREF( 1, 3*32(dst) )
		338	1:
		339	/*
		340	* Avoid consecutive LD*'s to the same register since some mips
		341	* implementations can't issue them in the same cycle.
		342	* It's OK to load FIRST(N+1) before REST(N) because the two addresses
		343	* are to the same unit (unless src is aligned, but it's not).
		344	*/
		345	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
		346	EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
		347	SUB len, len, 4*NBYTES
		348	EXC( LDREST t0, REST(0)(src), l_exc_copy)
		349	EXC( LDREST t1, REST(1)(src), l_exc_copy)
		350	EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
		351	EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
		352	EXC( LDREST t2, REST(2)(src), l_exc_copy)
		353	EXC( LDREST t3, REST(3)(src), l_exc_copy)
		354	PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
		355	ADD src, src, 4*NBYTES
		356	#ifdef CONFIG_CPU_SB1
		357	nop # improves slotting
		358	#endif
		359	STORE t0, UNIT(0)(dst)
		360	STORE t1, UNIT(1)(dst)
		361	STORE t2, UNIT(2)(dst)
		362	STORE t3, UNIT(3)(dst)
		363	PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
		364	bne len, rem, 1b
		365	ADD dst, dst, 4*NBYTES
		366
		367	cleanup_src_unaligned:
		368	beqz len, done
		369	and rem, len, NBYTES-1 # rem = len % NBYTES
		370	beq rem, len, copy_bytes
		371	nop
		372	1:
		373	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
		374	EXC( LDREST t0, REST(0)(src), l_exc_copy)
		375	ADD src, src, NBYTES
		376	SUB len, len, NBYTES
		377	STORE t0, 0(dst)
		378	bne len, rem, 1b
		379	ADD dst, dst, NBYTES
		380
		381	copy_bytes_checklen:
		382	beqz len, done
		383	nop
		384	copy_bytes:
		385	/* 0 < len < NBYTES */
		386	#define COPY_BYTE(N) \
		387	EXC( lb t0, N(src), l_exc); \
		388	SUB len, len, 1; \
		389	beqz len, done; \
		390	sb t0, N(dst)
		391
		392	COPY_BYTE(0)
		393	COPY_BYTE(1)
		394	#ifdef USE_DOUBLE
		395	COPY_BYTE(2)
		396	COPY_BYTE(3)
		397	COPY_BYTE(4)
		398	COPY_BYTE(5)
		399	#endif
		400	EXC( lb t0, NBYTES-2(src), l_exc)
		401	SUB len, len, 1
		402	jr ra
		403	sb t0, NBYTES-2(dst)
		404	done:
		405	jr ra
		406	nop
		407	END(__copy_user_inatomic)
		408
		409	l_exc_copy:
		410	/*
		411	* Copy bytes from src until faulting load address (or until a
		412	* lb faults)
		413	*
		414	* When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
		415	* may be more than a byte beyond the last address.
		416	* Hence, the lb below may get an exception.
		417	*
		418	* Assumes src < THREAD_BUADDR($28)
		419	*/
		420	LOAD t0, TI_TASK($28)
		421	nop
		422	LOAD t0, THREAD_BUADDR(t0)
		423	1:
		424	EXC( lb t1, 0(src), l_exc)
		425	ADD src, src, 1
		426	sb t1, 0(dst) # can't fault -- we're copy_from_user
		427	bne src, t0, 1b
		428	ADD dst, dst, 1
		429	l_exc:
		430	LOAD t0, TI_TASK($28)
		431	nop
		432	LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
		433	nop
		434	SUB len, AT, t0 # len number of uncopied bytes
		435	jr ra
		436	nop