author    David Daney <david.daney@cavium.com>    2012-06-06 18:00:31 -0400
committer Ralf Baechle <ralf@linux-mips.org>      2012-07-23 08:55:55 -0400
commit    bb0757ebb929d5d6ba484b4313976847285ba280 (patch)
tree      f3a138d260d52fca71d7170e42415235ce6cc46b /arch/mips/lib
parent    914f848077fb2ec0ec9c041af9ae1101ed0320f3 (diff)
MIPS: Unify memcpy.S and memcpy-inatomic.S
We can save the 451 lines of code that comprise memcpy-inatomic.S at
the expense of a single instruction in the memcpy prologue.

We also use an additional register (t6), so this may cause increased
register pressure in some places as well.  But I think the reduced
maintenance burden of not having two nearly identical implementations
makes it worth it.

Signed-off-by: David Daney <david.daney@cavium.com>
Cc: linux-mips@linux-mips.org
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
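In rough C terms, the unification amounts to giving the two exported entry
points a single shared body, with a flag (the job t6 does in the assembly)
selecting the inatomic behaviour on the fault path. The sketch below is
illustrative only; do_copy() and these signatures are invented for
exposition and are not the kernel's actual interfaces:

	#include <stddef.h>
	#include <string.h>

	/* Hypothetical primitive: copies and returns the number of bytes
	 * NOT copied (nonzero if a fault cut the copy short). */
	static size_t do_copy(void *dst, const void *src, size_t len)
	{
		memcpy(dst, src, len);	/* stand-in for the faultable copy loop */
		return 0;
	}

	static size_t copy_user_common(void *dst, const void *src, size_t len,
				       int inatomic)
	{
		size_t uncopied = do_copy(dst, src, len);

		/* The normal variant zero-fills the tail of dst after a
		 * faulting load so stale kernel data cannot leak to user
		 * space; the inatomic variant skips that and just reports
		 * the short copy. */
		if (uncopied && !inatomic)
			memset((char *)dst + (len - uncopied), 0, uncopied);
		return uncopied;
	}

	size_t copy_user(void *d, const void *s, size_t n)
	{
		return copy_user_common(d, s, n, 0);	/* li t6, 0 */
	}

	size_t copy_user_inatomic(void *d, const void *s, size_t n)
	{
		return copy_user_common(d, s, n, 1);	/* b __copy_user_common; li t6, 1 */
	}

The patch does exactly this at the assembly level: one extra li in the
shared prologue, plus a bnez in the fault handler to skip the zero-fill.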
Diffstat (limited to 'arch/mips/lib')
-rw-r--r--	arch/mips/lib/Makefile          |   2 +-
-rw-r--r--	arch/mips/lib/memcpy-inatomic.S | 451 ----------
-rw-r--r--	arch/mips/lib/memcpy.S          |  11 ++
3 files changed, 12 insertions(+), 452 deletions(-)
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index 2a7c74fc15fc..399a50a541d4 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -2,7 +2,7 @@
 # Makefile for MIPS-specific library files..
 #
 
-lib-y	+= csum_partial.o delay.o memcpy.o memcpy-inatomic.o memset.o \
+lib-y	+= csum_partial.o delay.o memcpy.o memset.o \
 	   strlen_user.o strncpy_user.o strnlen_user.o uncached.o
 
 obj-y	+= iomap.o
diff --git a/arch/mips/lib/memcpy-inatomic.S b/arch/mips/lib/memcpy-inatomic.S
deleted file mode 100644
index 68853a038d3f..000000000000
--- a/arch/mips/lib/memcpy-inatomic.S
+++ /dev/null
@@ -1,451 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Unified implementation of memcpy, memmove and the __copy_user backend.
- *
- * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
- * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
- * Copyright (C) 2002 Broadcom, Inc.
- *   memcpy/copy_user author: Mark Vandevoorde
- * Copyright (C) 2007 Maciej W. Rozycki
- *
- * Mnemonic names for arguments to memcpy/__copy_user
- */
-
-/*
- * Hack to resolve longstanding prefetch issue
- *
- * Prefetching may be fatal on some systems if we're prefetching beyond the
- * end of memory. It's also a seriously bad idea on non-dma-coherent
- * systems.
- */
-#ifdef CONFIG_DMA_NONCOHERENT
-#undef CONFIG_CPU_HAS_PREFETCH
-#endif
-#ifdef CONFIG_MIPS_MALTA
-#undef CONFIG_CPU_HAS_PREFETCH
-#endif
-
-#include <asm/asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/regdef.h>
-
-#define dst a0
-#define src a1
-#define len a2
-
-/*
- * Spec
- *
- * memcpy copies len bytes from src to dst and sets v0 to dst.
- * It assumes that
- *   - src and dst don't overlap
- *   - src is readable
- *   - dst is writable
- * memcpy uses the standard calling convention
- *
- * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
- * the number of uncopied bytes due to an exception caused by a read or write.
- * __copy_user assumes that src and dst don't overlap, and that the call is
- * implementing one of the following:
- *   copy_to_user
- *     - src is readable (no exceptions when reading src)
- *   copy_from_user
- *     - dst is writable (no exceptions when writing dst)
- * __copy_user uses a non-standard calling convention; see
- * include/asm-mips/uaccess.h
- *
- * When an exception happens on a load, the handler must
- * ensure that all of the destination buffer is overwritten to prevent
- * leaking information to user mode programs.
- */
-
-/*
- * Implementation
- */
-
-/*
- * The exception handler for loads requires that:
- *  1- AT contain the address of the byte just past the end of the source
- *     of the copy,
- *  2- src_entry <= src < AT, and
- *  3- (dst - src) == (dst_entry - src_entry),
- * The _entry suffix denotes values when __copy_user was called.
- *
- * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
- * (2) is met by incrementing src by the number of bytes copied
- * (3) is met by not doing loads between a pair of increments of dst and src
- *
- * The exception handlers for stores adjust len (if necessary) and return.
- * These handlers do not need to overwrite any data.
- *
- * For __rmemcpy and memmove an exception is always a kernel bug, therefore
- * they're not protected.
- */
-
-#define EXC(inst_reg,addr,handler)			\
-9:	inst_reg, addr;					\
-	.section __ex_table,"a";			\
-	PTR	9b, handler;				\
-	.previous
-
-/*
- * Only on the 64-bit kernel can we make use of 64-bit registers.
- */
-#ifdef CONFIG_64BIT
-#define USE_DOUBLE
-#endif
-
-#ifdef USE_DOUBLE
-
-#define LOAD	ld
-#define LOADL	ldl
-#define LOADR	ldr
-#define STOREL	sdl
-#define STORER	sdr
-#define STORE	sd
-#define ADD	daddu
-#define SUB	dsubu
-#define SRL	dsrl
-#define SRA	dsra
-#define SLL	dsll
-#define SLLV	dsllv
-#define SRLV	dsrlv
-#define NBYTES	8
-#define LOG_NBYTES 3
-
-/*
- * As we are sharing code base with the mips32 tree (which uses the o32 ABI
- * register definitions), we need to redefine the register definitions from
- * the n64 ABI register naming to the o32 ABI register naming.
- */
-#undef t0
-#undef t1
-#undef t2
-#undef t3
-#define t0	$8
-#define t1	$9
-#define t2	$10
-#define t3	$11
-#define t4	$12
-#define t5	$13
-#define t6	$14
-#define t7	$15
-
-#else
-
-#define LOAD	lw
-#define LOADL	lwl
-#define LOADR	lwr
-#define STOREL	swl
-#define STORER	swr
-#define STORE	sw
-#define ADD	addu
-#define SUB	subu
-#define SRL	srl
-#define SLL	sll
-#define SRA	sra
-#define SLLV	sllv
-#define SRLV	srlv
-#define NBYTES	4
-#define LOG_NBYTES 2
-
-#endif /* USE_DOUBLE */
-
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-#define LDFIRST	LOADR
-#define LDREST	LOADL
-#define STFIRST	STORER
-#define STREST	STOREL
-#define SHIFT_DISCARD	SLLV
-#else
-#define LDFIRST	LOADL
-#define LDREST	LOADR
-#define STFIRST	STOREL
-#define STREST	STORER
-#define SHIFT_DISCARD	SRLV
-#endif
-
-#define FIRST(unit) ((unit)*NBYTES)
-#define REST(unit)  (FIRST(unit)+NBYTES-1)
-#define UNIT(unit)  FIRST(unit)
-
-#define ADDRMASK (NBYTES-1)
-
-	.text
-	.set	noreorder
-#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
-	.set	noat
-#else
-	.set	at=v1
-#endif
-
-/*
- * A combined memcpy/__copy_user
- * __copy_user sets len to 0 for success; else to an upper bound of
- * the number of uncopied bytes.
- * memcpy sets v0 to dst.
- */
-	.align	5
-LEAF(__copy_user_inatomic)
-	/*
-	 * Note: dst & src may be unaligned, len may be 0
-	 * Temps
-	 */
-#define rem t8
-
-	/*
-	 * The "issue break"s below are very approximate.
-	 * Issue delays for dcache fills will perturb the schedule, as will
-	 * load queue full replay traps, etc.
-	 *
-	 * If len < NBYTES use byte operations.
-	 */
-	PREF(	0, 0(src) )
-	PREF(	1, 0(dst) )
-	sltu	t2, len, NBYTES
-	and	t1, dst, ADDRMASK
-	PREF(	0, 1*32(src) )
-	PREF(	1, 1*32(dst) )
-	bnez	t2, .Lcopy_bytes_checklen
-	 and	t0, src, ADDRMASK
-	PREF(	0, 2*32(src) )
-	PREF(	1, 2*32(dst) )
-	bnez	t1, .Ldst_unaligned
-	 nop
-	bnez	t0, .Lsrc_unaligned_dst_aligned
-	/*
-	 * use delay slot for fall-through
-	 * src and dst are aligned; need to compute rem
-	 */
-.Lboth_aligned:
-	 SRL	t0, len, LOG_NBYTES+3	# +3 for 8 units/iter
-	beqz	t0, .Lcleanup_both_aligned # len < 8*NBYTES
-	 and	rem, len, (8*NBYTES-1)	# rem = len % (8*NBYTES)
-	PREF(	0, 3*32(src) )
-	PREF(	1, 3*32(dst) )
-	.align	4
-1:
-EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
-EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
-	SUB	len, len, 8*NBYTES
-EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
-EXC(	LOAD	t7, UNIT(5)(src),	.Ll_exc_copy)
-	STORE	t0, UNIT(0)(dst)
-	STORE	t1, UNIT(1)(dst)
-EXC(	LOAD	t0, UNIT(6)(src),	.Ll_exc_copy)
-EXC(	LOAD	t1, UNIT(7)(src),	.Ll_exc_copy)
-	ADD	src, src, 8*NBYTES
-	ADD	dst, dst, 8*NBYTES
-	STORE	t2, UNIT(-6)(dst)
-	STORE	t3, UNIT(-5)(dst)
-	STORE	t4, UNIT(-4)(dst)
-	STORE	t7, UNIT(-3)(dst)
-	STORE	t0, UNIT(-2)(dst)
-	STORE	t1, UNIT(-1)(dst)
-	PREF(	0, 8*32(src) )
-	PREF(	1, 8*32(dst) )
-	bne	len, rem, 1b
-	 nop
-
-	/*
-	 * len == rem == the number of bytes left to copy < 8*NBYTES
-	 */
-.Lcleanup_both_aligned:
-	beqz	len, .Ldone
-	 sltu	t0, len, 4*NBYTES
-	bnez	t0, .Lless_than_4units
-	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
-	/*
-	 * len >= 4*NBYTES
-	 */
-EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
-EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
-	SUB	len, len, 4*NBYTES
-	ADD	src, src, 4*NBYTES
-	STORE	t0, UNIT(0)(dst)
-	STORE	t1, UNIT(1)(dst)
-	STORE	t2, UNIT(2)(dst)
-	STORE	t3, UNIT(3)(dst)
-	.set	reorder				/* DADDI_WAR */
-	ADD	dst, dst, 4*NBYTES
-	beqz	len, .Ldone
-	.set	noreorder
-.Lless_than_4units:
-	/*
-	 * rem = len % NBYTES
-	 */
-	beq	rem, len, .Lcopy_bytes
-	 nop
-1:
-EXC(	LOAD	t0, 0(src),		.Ll_exc)
-	ADD	src, src, NBYTES
-	SUB	len, len, NBYTES
-	STORE	t0, 0(dst)
-	.set	reorder				/* DADDI_WAR */
-	ADD	dst, dst, NBYTES
-	bne	rem, len, 1b
-	.set	noreorder
-
-	/*
-	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
-	 * A loop would do only a byte at a time with possible branch
-	 * mispredicts. Can't do an explicit LOAD dst,mask,or,STORE
-	 * because can't assume read-access to dst. Instead, use
-	 * STREST dst, which doesn't require read access to dst.
-	 *
-	 * This code should perform better than a simple loop on modern,
-	 * wide-issue mips processors because the code has fewer branches and
-	 * more instruction-level parallelism.
-	 */
-#define bits t2
-	beqz	len, .Ldone
-	 ADD	t1, dst, len	# t1 is just past last byte of dst
-	li	bits, 8*NBYTES
-	SLL	rem, len, 3	# rem = number of bits to keep
-EXC(	LOAD	t0, 0(src),		.Ll_exc)
-	SUB	bits, bits, rem	# bits = number of bits to discard
-	SHIFT_DISCARD t0, t0, bits
-	STREST	t0, -1(t1)
-	jr	ra
-	 move	len, zero
-.Ldst_unaligned:
-	/*
-	 * dst is unaligned
-	 * t0 = src & ADDRMASK
-	 * t1 = dst & ADDRMASK; T1 > 0
-	 * len >= NBYTES
-	 *
-	 * Copy enough bytes to align dst
-	 * Set match = (src and dst have same alignment)
-	 */
-#define match rem
-EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
-	ADD	t2, zero, NBYTES
-EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
-	SUB	t2, t2, t1	# t2 = number of bytes copied
-	xor	match, t0, t1
-	STFIRST	t3, FIRST(0)(dst)
-	beq	len, t2, .Ldone
-	 SUB	len, len, t2
-	ADD	dst, dst, t2
-	beqz	match, .Lboth_aligned
-	 ADD	src, src, t2
-
-.Lsrc_unaligned_dst_aligned:
-	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
-	PREF(	0, 3*32(src) )
-	beqz	t0, .Lcleanup_src_unaligned
-	 and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
-	PREF(	1, 3*32(dst) )
-1:
-/*
- * Avoid consecutive LD*'s to the same register since some mips
- * implementations can't issue them in the same cycle.
- * It's OK to load FIRST(N+1) before REST(N) because the two addresses
- * are to the same unit (unless src is aligned, but it's not).
- */
-EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
-EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
-	SUB	len, len, 4*NBYTES
-EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
-EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
-EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
-EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
-EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
-EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
-	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
-	ADD	src, src, 4*NBYTES
-#ifdef CONFIG_CPU_SB1
-	nop				# improves slotting
-#endif
-	STORE	t0, UNIT(0)(dst)
-	STORE	t1, UNIT(1)(dst)
-	STORE	t2, UNIT(2)(dst)
-	STORE	t3, UNIT(3)(dst)
-	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
-	.set	reorder				/* DADDI_WAR */
-	ADD	dst, dst, 4*NBYTES
-	bne	len, rem, 1b
-	.set	noreorder
-
-.Lcleanup_src_unaligned:
-	beqz	len, .Ldone
-	 and	rem, len, NBYTES-1	# rem = len % NBYTES
-	beq	rem, len, .Lcopy_bytes
-	 nop
-1:
-EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
-EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
-	ADD	src, src, NBYTES
-	SUB	len, len, NBYTES
-	STORE	t0, 0(dst)
-	.set	reorder				/* DADDI_WAR */
-	ADD	dst, dst, NBYTES
-	bne	len, rem, 1b
-	.set	noreorder
-
-.Lcopy_bytes_checklen:
-	beqz	len, .Ldone
-	 nop
-.Lcopy_bytes:
-	/* 0 < len < NBYTES */
-#define COPY_BYTE(N)			\
-EXC(	lb	t0, N(src), .Ll_exc);	\
-	SUB	len, len, 1;		\
-	beqz	len, .Ldone;		\
-	 sb	t0, N(dst)
-
-	COPY_BYTE(0)
-	COPY_BYTE(1)
-#ifdef USE_DOUBLE
-	COPY_BYTE(2)
-	COPY_BYTE(3)
-	COPY_BYTE(4)
-	COPY_BYTE(5)
-#endif
-EXC(	lb	t0, NBYTES-2(src), .Ll_exc)
-	SUB	len, len, 1
-	jr	ra
-	 sb	t0, NBYTES-2(dst)
-.Ldone:
-	jr	ra
-	 nop
-	END(__copy_user_inatomic)
-
-.Ll_exc_copy:
-	/*
-	 * Copy bytes from src until faulting load address (or until a
-	 * lb faults)
-	 *
-	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
-	 * may be more than a byte beyond the last address.
-	 * Hence, the lb below may get an exception.
-	 *
-	 * Assumes src < THREAD_BUADDR($28)
-	 */
-	LOAD	t0, TI_TASK($28)
-	 nop
-	LOAD	t0, THREAD_BUADDR(t0)
-1:
-EXC(	lb	t1, 0(src),	.Ll_exc)
-	ADD	src, src, 1
-	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
-	.set	reorder				/* DADDI_WAR */
-	ADD	dst, dst, 1
-	bne	src, t0, 1b
-	.set	noreorder
-.Ll_exc:
-	LOAD	t0, TI_TASK($28)
-	 nop
-	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
-	 nop
-	SUB	len, AT, t0		# len number of uncopied bytes
-	jr	ra
-	 nop
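The EXC() macro that threads through the deleted file is what made each
access recoverable: every load or store that may fault gets a local 9:
label, and a (label address, fixup handler) pair is emitted into the
__ex_table section, which the trap handler searches to find where to
resume. Conceptually each record looks like the sketch below (the field
names follow the MIPS kernel's exception_table_entry, but take this as an
illustration rather than the authoritative definition):

	struct exception_table_entry {
		unsigned long insn;	/* address of the 9: instruction that may fault */
		unsigned long nextinsn;	/* fixup to resume at, e.g. .Ll_exc_copy */
	};

	/* On a fault, the trap handler does, in effect:
	 *	fixup = search_exception_tables(regs->cp0_epc);
	 *	if (fixup)
	 *		regs->cp0_epc = fixup->nextinsn;
	 */

The same mechanism survives unchanged in the unified memcpy.S below; only
the handlers' tail behaviour now depends on the t6 flag.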
diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
index 56a1f85a1ce8..65192c06781e 100644
--- a/arch/mips/lib/memcpy.S
+++ b/arch/mips/lib/memcpy.S
@@ -183,6 +183,14 @@
 #endif
 
 /*
+ * t6 is used as a flag to note inatomic mode.
+ */
+LEAF(__copy_user_inatomic)
+	b	__copy_user_common
+	 li	t6, 1
+	END(__copy_user_inatomic)
+
+/*
  * A combined memcpy/__copy_user
  * __copy_user sets len to 0 for success; else to an upper bound of
  * the number of uncopied bytes.
@@ -193,6 +201,8 @@ LEAF(memcpy) /* a0=dst a1=src a2=len */
 	move	v0, dst				/* return value */
 .L__memcpy:
 FEXPORT(__copy_user)
+	li	t6, 0 /* not inatomic */
+__copy_user_common:
 	/*
 	 * Note: dst & src may be unaligned, len may be 0
 	 * Temps
@@ -458,6 +468,7 @@ EXC( lb t1, 0(src), .Ll_exc)
 	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
 	 nop
 	SUB	len, AT, t0		# len number of uncopied bytes
+	bnez	t6, .Ldone	/* Skip the zeroing part if inatomic */
 	/*
 	 * Here's where we rely on src and dst being incremented in tandem,
 	 *  See (3) above.
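After this change __copy_user_inatomic is simply __copy_user entered with
t6 = 1, so the only behavioural difference is the zero-fill skipped by the
bnez above. A typical caller-side pattern for the inatomic variant,
sketched and simplified rather than lifted from any one call site, looks
like:

	#include <linux/uaccess.h>
	#include <linux/errno.h>

	/* Read one word from user space without being allowed to sleep. */
	static int read_user_word(unsigned int *val,
				  const unsigned int __user *uaddr)
	{
		unsigned long left;

		pagefault_disable();	/* a fault here must not sleep */
		left = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
		pagefault_enable();

		/* Short copy: unlike __copy_user, nothing was zero-filled,
		 * so the caller must retry with faults enabled or bail out. */
		return left ? -EFAULT : 0;
	}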