13 files changed, 1980 insertions, 0 deletions
diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile
new file mode 100644
index 000000000000..b5681e3f9684
--- /dev/null
+++ b/arch/sh/lib/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for SuperH-specific library files.
+#
+# Objects that are always added to lib-y.
+lib-y  = delay.o memset.o memmove.o memchr.o \
+         checksum.o strcasecmp.o strlen.o div64.o udivdi3.o \
+         div64-generic.o
+# memcpy selection: memcpy.o is the default.  When CONFIG_CPU_SH4 expands
+# to "y", the second assignment overwrites memcpy-y with memcpy-sh4.o.
+memcpy-y                        := memcpy.o
+memcpy-$(CONFIG_CPU_SH4)        := memcpy-sh4.o
+lib-y   += $(memcpy-y)
diff --git a/arch/sh/lib/checksum.S b/arch/sh/lib/checksum.S
new file mode 100644
index 000000000000..7c50dfe68c07
--- /dev/null
+++ b/arch/sh/lib/checksum.S
@@ -0,0 +1,385 @@
+/* $Id: checksum.S,v 1.10 2001/07/06 13:11:32 gniibe Exp $
+ *
+ * INET         An implementation of the TCP/IP protocol suite for the LINUX
+ *              operating system.  INET is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              IP/TCP/UDP checksumming routines
+ *
+ * Authors:     Jorge Cwik, <jorge@laser.satlink.net>
+ *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *              Tom May, <ftom@netcom.com>
+ *              Pentium Pro/II routines:
+ *              Alexander Kjeldaas <astor@guardian.no>
+ *              Finn Arne Gangstad <finnag@guardian.no>
+ *              Lots of code moved from tcp.c and ip.c; see those files
+ *              for more names.
+ *
+ * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ *                           handling.
+ *              Andi Kleen,  add zeroing on error
+ *                   converted to pure assembler
+ *
+ * SuperH version:  Copyright (C) 1999  Niibe Yutaka
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ */
+#include <asm/errno.h>
+#include <linux/linkage.h>
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+/*      
+ * unsigned int csum_partial(const unsigned char *buf, int len,
+ *                           unsigned int sum);
+ */
+.text
+/*
+ * csum_partial
+ *
+ * In:   r4 = buf, r5 = len, r6 = sum (running checksum)
+ * Out:  r0 = updated 32-bit sum
+ * Uses: r0-r3 as scratch; r1 keeps the byte count for the tail handling.
+ *
+ * Structure: an optional leading 16-bit word to reach 4-byte alignment,
+ * then 32-byte unrolled chunks (label 3), then the remaining whole long
+ * words (label 5), then a 0-3 byte tail (labels 6-8).  Carries out of r6
+ * are folded back in with addc.
+ * NOTE(review): only bit 1 of buf is tested, so an odd buf address is not
+ * realigned here - confirm callers never pass one.
+ */
+ENTRY(csum_partial)
+          /*
+           * Experiments with Ethernet and SLIP connections show that buff
+           * is aligned on either a 2-byte or 4-byte boundary.  We get at
+           * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+           * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+           * alignment for the unrolled loop.
+           */
+        mov     r5, r1          ! r1 = len, kept for the tail handling
+        mov     r4, r0
+        tst     #2, r0          ! Check alignment.
+        bt      2f              ! Jump if alignment is ok.
+        !
+        add     #-2, r5         ! Alignment uses up two bytes.
+        cmp/pz  r5              !
+        bt/s    1f              ! Jump if we had at least two bytes.
+         clrt
+        bra     6f
+         add    #2, r5          ! r5 was < 2.  Deal with it.
+1:
+        mov     r5, r1          ! Save new len for later use.
+        mov.w   @r4+, r0
+        extu.w  r0, r0
+        addc    r0, r6
+        bf      2f
+        add     #1, r6          ! fold the carry back into the sum
+2:
+        mov     #-5, r0
+        shld    r0, r5          ! r5 = len >> 5 = number of 32-byte chunks
+        tst     r5, r5
+        bt/s    4f              ! if it's =0, go to 4f
+         clrt
+        .align  2
+3:
+        mov.l   @r4+, r0
+        mov.l   @r4+, r2
+        mov.l   @r4+, r3
+        addc    r0, r6
+        mov.l   @r4+, r0
+        addc    r2, r6
+        mov.l   @r4+, r2
+        addc    r3, r6
+        mov.l   @r4+, r3
+        addc    r0, r6
+        mov.l   @r4+, r0
+        addc    r2, r6
+        mov.l   @r4+, r2
+        addc    r3, r6
+        addc    r0, r6
+        addc    r2, r6
+        movt    r0              ! save the carry: dt below overwrites T
+        dt      r5
+        bf/s    3b
+         cmp/eq #1, r0          ! (delay slot) put the saved carry back in T
+        ! here, we know r5==0
+        addc    r5, r6                  ! add carry to r6
+4:
+        mov     r1, r0
+        and     #0x1c, r0       ! bytes left in whole long words
+        tst     r0, r0
+        bt/s    6f
+         mov    r0, r5
+        shlr2   r5              ! r5 = number of remaining long words
+        mov     #0, r2
+5:
+        addc    r2, r6
+        mov.l   @r4+, r2
+        movt    r0              ! save the carry across dt
+        dt      r5
+        bf/s    5b
+         cmp/eq #1, r0          ! (delay slot) restore the carry into T
+        addc    r2, r6
+        addc    r5, r6          ! r5==0 here, so it means add carry-bit
+6:
+        mov     r1, r5
+        mov     #3, r0
+        and     r0, r5          ! r5 = len & 3 = trailing byte count
+        tst     r5, r5
+        bt      9f              ! if it's =0 go to 9f
+        mov     #2, r1
+        cmp/hs  r1, r5
+        bf      7f              ! only one byte left
+        mov.w   @r4+, r0
+        extu.w  r0, r0
+        cmp/eq  r1, r5
+        bt/s    8f              ! exactly two bytes: add the word and finish
+         clrt
+        shll16  r0              ! three bytes: add the word, then the byte
+        addc    r0, r6
+7:
+        mov.b   @r4+, r0
+        extu.b  r0, r0
+#ifndef __LITTLE_ENDIAN__
+        shll8   r0
+#endif
+8:
+        addc    r0, r6
+        mov     #0, r0
+        addc    r0, r6 
+9:
+        rts
+         mov    r6, r0
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, 
+                                        int sum, int *src_err_ptr, int *dst_err_ptr)
+ */ 
+/*
+ * Copy from ds while checksumming, otherwise like csum_partial
+ *
+ * The macros SRC and DST specify the type of access for the instruction.
+ * thus we can call a custom exception handler for all access types.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ *        DST definitions? It's damn hard to trigger all cases.  I hope I got
+ *        them all but there's no guarantee.
+ */
+/*
+ * SRC(insn): emit insn at local label 9999 and add an __ex_table entry
+ * pairing that address with fixup label 6001 (the source-side handler).
+ */
+#define SRC(...)                        \
+        9999: __VA_ARGS__ ;             \
+        .section __ex_table, "a";       \
+        .long 9999b, 6001f      ;       \
+        .previous
+/*
+ * DST(insn): same as SRC, but the __ex_table entry points at fixup label
+ * 6002 (the destination-side handler).
+ */
+#define DST(...)                        \
+        9999: __VA_ARGS__ ;             \
+        .section __ex_table, "a";       \
+        .long 9999b, 6002f      ;       \
+        .previous
+!
+! r4:   const char *SRC
+! r5:   char *DST
+! r6:   int LEN
+! r7:   int SUM
+!
+! on stack:
+! int *SRC_ERR_PTR
+! int *DST_ERR_PTR
+!
+ENTRY(csum_partial_copy_generic)
+        ! Copies len bytes from src to dst while adding them into sum, and
+        ! returns the updated sum in r0.  Loads and stores are wrapped in
+        ! SRC()/DST() so that a fault lands in the fixup code at 6001/6002,
+        ! which stores -EFAULT through the matching error pointer.
+        !
+        ! Stack layout after the two pushes below (used by the fixups):
+        !   @(0,r15)  len              @(4,r15)  dst
+        !   @(8,r15)  src_err_ptr      @(12,r15) dst_err_ptr
+        !
+        ! r2 holds the byte count for the tail code at labels 2:/4:, so
+        ! every path that reaches them must have set it.
+        mov.l   r5,@-r15
+        mov.l   r6,@-r15
+        mov     #3,r0           ! Check src and dest are equally aligned
+        mov     r4,r1
+        and     r0,r1
+        and     r5,r0
+        cmp/eq  r1,r0
+        bf      3f              ! Different alignments, use slow version
+        tst     #1,r0           ! Check dest word aligned
+        bf      3f              ! If not, do it the slow way
+        mov     #2,r0
+        tst     r0,r5           ! Check dest alignment.
+        bt      2f              ! Jump if alignment is ok.
+        add     #-2,r6          ! Alignment uses up two bytes.
+        cmp/pz  r6              ! Jump if we had at least two bytes.
+        bt/s    1f
+         clrt
+        add     #2,r6           ! r6 was < 2.   Deal with it.
+        bra     4f
+         mov    r6,r2           ! 4: reloads len from r2; it was not set yet
+3:      ! Handle different src and dest alignments.
+        ! This is not common, so simple byte by byte copy will do.
+        mov     r6,r2
+        shlr    r6              ! r6 = number of byte pairs
+        tst     r6,r6
+        bt      4f
+        clrt
+        .align  2
+5:
+SRC(    mov.b   @r4+,r1         )
+SRC(    mov.b   @r4+,r0         )
+        extu.b  r1,r1
+DST(    mov.b   r1,@r5          )
+DST(    mov.b   r0,@(1,r5)      )
+        extu.b  r0,r0
+        add     #2,r5
+#ifdef  __LITTLE_ENDIAN__
+        shll8   r0
+#else
+        shll8   r1
+#endif
+        or      r1,r0           ! r0 = the two bytes as one 16-bit word
+        addc    r0,r7
+        movt    r0              ! save the carry: dt below overwrites T
+        dt      r6
+        bf/s    5b
+         cmp/eq #1,r0           ! (delay slot) put the saved carry back in T
+        mov     #0,r0
+        addc    r0, r7
+        mov     r2, r0
+        tst     #1, r0
+        bt      7f              ! even length: done
+        bra     5f              ! odd length: copy the last byte
+         clrt
+        ! src and dest equally aligned, but to a two byte boundary.
+        ! Handle first two bytes as a special case
+        .align  2
+1:
+SRC(    mov.w   @r4+,r0         )
+DST(    mov.w   r0,@r5          )
+        add     #2,r5
+        extu.w  r0,r0
+        addc    r0,r7
+        mov     #0,r0
+        addc    r0,r7
+2:
+        mov     r6,r2           ! r2 = len for the tail code
+        mov     #-5,r0
+        shld    r0,r6           ! r6 = len >> 5 = number of 32-byte chunks
+        tst     r6,r6
+        bt/s    2f
+         clrt
+        .align  2
+1:
+SRC(    mov.l   @r4+,r0         )
+SRC(    mov.l   @r4+,r1         )
+        addc    r0,r7
+DST(    mov.l   r0,@r5          )
+DST(    mov.l   r1,@(4,r5)      )
+        addc    r1,r7
+SRC(    mov.l   @r4+,r0         )
+SRC(    mov.l   @r4+,r1         )
+        addc    r0,r7
+DST(    mov.l   r0,@(8,r5)      )
+DST(    mov.l   r1,@(12,r5)     )
+        addc    r1,r7
+SRC(    mov.l   @r4+,r0         )
+SRC(    mov.l   @r4+,r1         )
+        addc    r0,r7
+DST(    mov.l   r0,@(16,r5)     )
+DST(    mov.l   r1,@(20,r5)     )
+        addc    r1,r7
+SRC(    mov.l   @r4+,r0         )
+SRC(    mov.l   @r4+,r1         )
+        addc    r0,r7
+DST(    mov.l   r0,@(24,r5)     )
+DST(    mov.l   r1,@(28,r5)     )
+        addc    r1,r7
+        add     #32,r5
+        movt    r0              ! save the carry across dt
+        dt      r6
+        bf/s    1b
+         cmp/eq #1,r0           ! (delay slot) restore the carry into T
+        mov     #0,r0
+        addc    r0,r7
+2:      mov     r2,r6
+        mov     #0x1c,r0
+        and     r0,r6           ! bytes left in whole long words
+        cmp/pl  r6
+        bf/s    4f
+         clrt
+        shlr2   r6              ! r6 = number of remaining long words
+3:
+SRC(    mov.l   @r4+,r0 )
+        addc    r0,r7
+DST(    mov.l   r0,@r5  )
+        add     #4,r5
+        movt    r0
+        dt      r6
+        bf/s    3b
+         cmp/eq #1,r0
+        mov     #0,r0
+        addc    r0,r7
+4:      mov     r2,r6
+        mov     #3,r0
+        and     r0,r6           ! r6 = len & 3 = trailing byte count
+        cmp/pl  r6
+        bf      7f
+        mov     #2,r1
+        cmp/hs  r1,r6
+        bf      5f              ! only one byte left
+SRC(    mov.w   @r4+,r0 )
+DST(    mov.w   r0,@r5  )
+        extu.w  r0,r0
+        add     #2,r5
+        cmp/eq  r1,r6
+        bt/s    6f              ! exactly two bytes: add the word and finish
+         clrt
+        shll16  r0              ! three bytes: add the word, then the byte
+        addc    r0,r7
+5:
+SRC(    mov.b   @r4+,r0 )
+DST(    mov.b   r0,@r5  )
+        extu.b  r0,r0
+#ifndef __LITTLE_ENDIAN__
+        shll8   r0
+#endif
+6:      addc    r0,r7
+        mov     #0,r0
+        addc    r0,r7
+7:
+5000:
+# Exception handler:
+.section .fixup, "ax"
+6001:
+        ! Fault on a source access: report -EFAULT, clear dst, resume at 5000.
+        mov.l   @(8,r15),r0                     ! src_err_ptr
+        mov     #-EFAULT,r1
+        mov.l   r1,@r0
+        ! zero the complete destination - computing the rest
+        ! is too much work
+        mov.l   @(4,r15),r5             ! dst
+        mov.l   @r15,r6                 ! len
+        mov     #0,r7
+1:      mov.b   r7,@r5
+        dt      r6
+        bf/s    1b
+         add    #1,r5
+        mov.l   8000f,r0
+        jmp     @r0
+         nop
+        .align  2
+8000:   .long   5000b
+6002:
+        ! Fault on a destination access: report -EFAULT, resume at 5000.
+        mov.l   @(12,r15),r0                    ! dst_err_ptr
+        mov     #-EFAULT,r1
+        mov.l   r1,@r0
+        mov.l   8001f,r0
+        jmp     @r0
+         nop
+        .align  2
+8001:   .long   5000b
+.previous
+        add     #8,r15          ! drop the saved dst and len
+        rts
+         mov    r7,r0           ! (delay slot) return the sum
diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c
new file mode 100644
index 000000000000..50b36037d86b
--- /dev/null
+++ b/arch/sh/lib/delay.c
@@ -0,0 +1,41 @@
+/*
+ *      Precise Delay Loops for SuperH
+ *
+ *      Copyright (C) 1999 Niibe Yutaka & Kaz Kojima
+ */
+#include <linux/sched.h>
+#include <linux/delay.h>
+/*
+ * Busy-wait by counting 'loops' down in a two-instruction loop.
+ *
+ * tst sets T when loops is zero; bf/s branches back while T is clear and
+ * runs dt (decrement, T = (result == 0)) in its delay slot.  Because the
+ * delay slot always executes, dt runs once even when loops is 0.
+ */
+void __delay(unsigned long loops)
+{
+        __asm__ __volatile__(
+                "tst    %0, %0\n\t"
+                "1:\t"
+                "bf/s   1b\n\t"
+                " dt    %0"
+                : "=r" (loops)
+                : "0" (loops)
+                : "t");
+}
+/*
+ * Delay for a pre-scaled duration.
+ *
+ * dmulu.l forms the 64-bit unsigned product of xloops and this CPU's
+ * loops_per_jiffy; sts mach keeps only the upper 32 bits, i.e.
+ * (xloops * loops_per_jiffy) >> 32.  That value multiplied by HZ is the
+ * loop count handed to __delay().
+ */
+inline void __const_udelay(unsigned long xloops)
+{
+        __asm__("dmulu.l        %0, %2\n\t"
+                "sts    mach, %0"
+                : "=r" (xloops)
+                : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy)
+                : "macl", "mach");
+        __delay(xloops * HZ);
+}
+/* Delay for 'usecs' microseconds by pre-scaling for __const_udelay(). */
+void __udelay(unsigned long usecs)
+{
+        /* 0x10c6 (4294) is 2**32 / 1000000, truncated. */
+        unsigned long xloops = usecs * 0x000010c6;
+
+        __const_udelay(xloops);
+}
+/* Delay for 'nsecs' nanoseconds by pre-scaling for __const_udelay(). */
+void __ndelay(unsigned long nsecs)
+{
+        /* Scale factor 5; presumably 2**32 / 1000000000 (~4.29) rounded up. */
+        unsigned long xloops = nsecs * 0x00000005;
+
+        __const_udelay(xloops);
+}
diff --git a/arch/sh/lib/div64-generic.c b/arch/sh/lib/div64-generic.c
new file mode 100644
index 000000000000..c02473afd581
--- /dev/null
+++ b/arch/sh/lib/div64-generic.c
@@ -0,0 +1,19 @@
+/*
+ * Generic __div64_32 wrapper for __xdiv64_32.
+ */
+#include <linux/types.h>
+extern u64 __xdiv64_32(u64 n, u32 d);
+/*
+ * Divide the 64-bit value at *xp by the 32-bit divisor y.
+ * On return *xp holds the quotient; the remainder is the return value.
+ */
+u64 __div64_32(u64 *xp, u32 y)
+{
+        const u64 dividend = *xp;
+        const u64 quotient = __xdiv64_32(dividend, y);
+
+        *xp = quotient;
+        return dividend - quotient * y;
+}
diff --git a/arch/sh/lib/div64.S b/arch/sh/lib/div64.S
new file mode 100644
index 000000000000..eefc275d64a7
--- /dev/null
+++ b/arch/sh/lib/div64.S
@@ -0,0 +1,46 @@
+/*      
+ * unsigned long long __xdiv64_32(unsigned long long n, unsigned long d); 
+ */
+#include <linux/linkage.h>
+.text
+/*
+ * In:   64-bit dividend n in r4/r5 (word order depends on endianness,
+ *       see the #ifdef below), 32-bit divisor d in r6
+ * Out:  64-bit quotient in r0/r1 (same word order as the argument)
+ *
+ * Long division in two 32-bit steps, with r1 = high word and r0 = low
+ * word of n.  If high >= d, the high quotient word is computed first
+ * (into r2) and r1 is reduced to high - d * (high / d); otherwise the high
+ * quotient word is 0.  The second step divides r1:r0 by d, leaving the
+ * low quotient word in r0.
+ * NOTE(review): there is no check for d == 0 in this routine.
+ */
+ENTRY(__xdiv64_32)
+#ifdef  __LITTLE_ENDIAN__
+        mov     r4, r0
+        mov     r5, r1
+#else
+        mov     r4, r1
+        mov     r5, r0
+#endif
+        cmp/hs  r6, r1          ! T = (high >= d), unsigned
+        bf.s    1f              ! high < d: high quotient word stays 0
+         mov    #0, r2
+        mov     r1, r2
+        mov     #0, r3
+        div0u
+        .rept   32
+        rotcl   r2
+        div1    r6, r3
+        .endr
+        rotcl   r2              ! r2 = high / d
+        mul.l   r6, r2
+        sts     macl, r3        ! r3 = d * (high / d)
+        sub     r3, r1          ! r1 = high % d
+1:
+        div0u
+        .rept   32
+        rotcl   r0
+        div1    r6, r1
+        .endr
+#ifdef  __LITTLE_ENDIAN__
+        mov     r2, r1
+        rts
+         rotcl  r0              ! (delay slot) shift in the last quotient bit
+#else
+        rotcl   r0              ! shift in the last quotient bit
+        mov     r0, r1
+        rts
+         mov    r2, r0
+#endif
diff --git a/arch/sh/lib/memchr.S b/arch/sh/lib/memchr.S
new file mode 100644
index 000000000000..bc6036ad5706
--- /dev/null
+++ b/arch/sh/lib/memchr.S
@@ -0,0 +1,26 @@
+/* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
+ *
+ * "memchr" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+/*
+ * void *memchr(const void *s, int c, size_t n);
+ */
+#include <linux/linkage.h>
+/*
+ * In:   r4 = s, r5 = c, r6 = n
+ * Out:  r0 = address of the first byte equal to c, or 0 if none of the
+ *       first n bytes matches
+ */
+ENTRY(memchr)
+        tst     r6,r6
+        bt/s    2f              ! n == 0: return 0
+         exts.b r5,r5           ! (delay slot) sign-extend c, as mov.b does
+1:      mov.b   @r4,r1          ! mov.b sign-extends the loaded byte
+        cmp/eq  r1,r5
+        bt/s    3f              ! match: return the current r4
+         dt     r6              ! (delay slot) --n, T = (n == 0)
+        bf/s    1b              ! bytes left: keep scanning
+         add    #1,r4           ! (delay slot) advance s
+2:      mov     #0,r4
+3:      rts
+         mov    r4,r0
diff --git a/arch/sh/lib/memcpy-sh4.S b/arch/sh/lib/memcpy-sh4.S
new file mode 100644
index 000000000000..55f227441f9e
--- /dev/null
+++ b/arch/sh/lib/memcpy-sh4.S
@@ -0,0 +1,800 @@
+/*
+ * "memcpy" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ * Copyright (c) 2002  STMicroelectronics Ltd
+ *   Modified from memcpy.S and micro-optimised for SH4
+ *   Stuart Menefy (stuart.menefy@st.com)
+ *
+ */
+#include <linux/linkage.h>
+#include <linux/config.h>
+/*
+ * void *memcpy(void *dst, const void *src, size_t n);
+ *
+ * It is assumed that there is no overlap between src and dst.
+ * If there is an overlap, then the results are undefined.
+ */
+        !
+        !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+        !
+        ! Size is 16 or greater, and may have trailing bytes
+        !
+        ! Entered from memcpy with r0 = end of dst (long word aligned),
+        ! r4 = start of dst and r5 = src - dst, where (r5 & 3) == 1.
+        ! The copy runs from the end of the buffers towards the start.
+        .balign 32
+.Lcase1:
+        ! Read a long word and write a long word at once
+        ! At the start of each iteration, r7 contains last long load
+        add     #-1,r5          !  79 EX
+        mov     r4,r2           !   5 MT (0 cycles latency)
+        mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
+        add     #-4,r5          !  50 EX
+        add     #7,r2           !  79 EX
+        !
+        ! r2 = r4 + 7 is the loop bound: iterate while r0 > r2
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+        ! 6 cycles, 4 bytes per iteration
+3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
+        mov     r7, r3          !   5 MT (latency=0)    ! RQPO
+        cmp/hi  r2,r0           !  57 MT
+        shll16  r3              ! 103 EX
+        mov     r1,r6           !   5 MT (latency=0)
+        shll8   r3              ! 102 EX                ! Oxxx
+        shlr8   r6              ! 106 EX                ! xNML
+        mov     r1, r7          !   5 MT (latency=0)
+        or      r6,r3           !  82 EX                ! ONML
+        bt/s    3b              ! 109 BR
+         mov.l  r3,@-r0         !  30 LS
+#else
+3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! KLMN
+        mov     r7,r3           !   5 MT (latency=0)    ! OPQR
+        cmp/hi  r2,r0           !  57 MT
+        shlr16  r3              ! 107 EX
+        shlr8   r3              ! 106 EX                ! xxxO
+        mov     r1,r6           !   5 MT (latency=0)
+        shll8   r6              ! 102 EX                ! LMNx
+        mov     r1,r7           !   5 MT (latency=0)
+        or      r6,r3           !  82 EX                ! LMNO
+        bt/s    3b              ! 109 BR
+         mov.l  r3,@-r0         !  30 LS
+#endif
+        ! Finally, copy a byte at once, if necessary
+        ! r5 goes back to (src - dst) - 1 so that @(r0,r5) pairs with the
+        ! pre-decrementing store; r2 becomes r4 + 1, the byte loop bound.
+        add     #4,r5           !  50 EX
+        cmp/eq  r4,r0           !  54 MT
+        add     #-6,r2          !  50 EX
+        bt      9f              ! 109 BR
+8:      cmp/hi  r2,r0           !  57 MT
+        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
+        bt/s    8b              ! 109 BR
+         mov.b  r1,@-r0         !  29 LS
+9:      rts
+         nop
+        !
+        !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+        !
+        ! Size is 16 or greater, and may have trailing bytes
+        !
+        ! Entered from memcpy with r0 = end of dst (long word aligned),
+        ! r4 = start of dst and r5 = src - dst, where (r5 & 3) == 3.
+        ! The copy runs from the end of the buffers towards the start.
+        .balign 32
+.Lcase3:
+        ! Read a long word and write a long word at once
+        ! At the start of each iteration, r7 contains last long load
+        add     #-3,r5          ! 79 EX
+        mov     r4,r2           !  5 MT (0 cycles latency)
+        mov.l   @(r0,r5),r7     ! 21 LS (2 cycles latency)
+        add     #-4,r5          ! 50 EX
+        add     #7,r2           !  79 EX
+        !
+        ! r2 = r4 + 7 is the loop bound: iterate while r0 > r2
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+        ! 6 cycles, 4 bytes per iteration
+3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
+        mov     r7, r3          !   5 MT (latency=0)    ! RQPO
+        cmp/hi  r2,r0           !  57 MT
+        shll8   r3              ! 102 EX                ! QPOx
+        mov     r1,r6           !   5 MT (latency=0)
+        shlr16  r6              ! 107 EX
+        shlr8   r6              ! 106 EX                ! xxxN
+        mov     r1, r7          !   5 MT (latency=0)
+        or      r6,r3           !  82 EX                ! QPON
+        bt/s    3b              ! 109 BR
+         mov.l  r3,@-r0         !  30 LS
+#else
+        ! The previously loaded long word is carried in r7, which the load
+        ! above primes before the first iteration (same register as the
+        ! little-endian loop).
+3:      mov     r7,r3           ! OPQR
+        shlr8   r3              ! xOPQ
+        mov.l   @(r0,r5),r7     ! KLMN
+        mov     r7,r6
+        shll16  r6
+        shll8   r6              ! Nxxx
+        or      r6,r3           ! NOPQ
+        cmp/hi  r2,r0
+        bt/s    3b
+         mov.l  r3,@-r0
+#endif
+        ! Finally, copy a byte at once, if necessary
+        ! r5 goes back to (src - dst) - 1 so that @(r0,r5) pairs with the
+        ! pre-decrementing store; r2 becomes r4 + 1, the byte loop bound.
+        add     #6,r5           !  50 EX
+        cmp/eq  r4,r0           !  54 MT
+        add     #-6,r2          !  50 EX
+        bt      9f              ! 109 BR
+8:      cmp/hi  r2,r0           !  57 MT
+        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
+        bt/s    8b              ! 109 BR
+         mov.b  r1,@-r0         !  29 LS
+9:      rts
+         nop
+ENTRY(memcpy)
+        ! In:   r4 = dst, r5 = src, r6 = n
+        !
+        ! The copy is performed backwards.  r0 walks down from dst + n with
+        ! pre-decrementing stores, and r5 is turned into the offset
+        ! (src - dst), with a small negative bias, so that @(r0,r5)
+        ! addresses the source data for the next store through @-r0.
+        ! The zero-length path returns dst in r0.
+        ! NOTE(review): the other paths return with "rts; nop", leaving r0
+        ! where the backwards copy stopped - presumably dst; confirm.
+        !
+        ! Calculate the invariants which will be used in the remainder
+        ! of the code:
+        !
+        !      r4   -->  [ ...  ] DST             [ ...  ] SRC
+        !                [ ...  ]                 [ ...  ]
+        !                  :                        :
+        !      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
+        !
+        !
+        ! Short circuit the common case of src, dst and len being 32 bit aligned
+        ! and test for zero length move
+        mov     r6, r0          !   5 MT (0 cycle latency)
+        or      r4, r0          !  82 EX
+        or      r5, r0          !  82 EX
+        tst     r6, r6          !  86 MT
+        bt/s    99f             ! 111 BR                (zero len)
+         tst    #3, r0          !  87 MT
+        mov     r4, r0          !   5 MT (0 cycle latency)
+        add     r6, r0          !  49 EX
+        mov     #16, r1         !   6 EX
+        bt/s    .Lcase00        ! 111 BR                (aligned)
+         sub    r4, r5          !  75 EX
+        ! Arguments are not nicely long word aligned or zero len.
+        ! Check for small copies, and if so do a simple byte at a time copy.
+        !
+        ! Deciding on an exact value of 'small' is not easy, as the point at which
+        ! using the optimised routines become worthwhile varies (these are the
+        ! cycle counts for different sizes using byte-at-a-time vs. optimised):
+        !       size    byte-at-time    long    word    byte
+        !       16      42              39-40   46-50   50-55
+        !       24      58              43-44   54-58   62-67
+        !       36      82              49-50   66-70   80-85
+        ! However the penalty for getting it 'wrong' is much higher for long word
+        ! aligned data (and this is more common), so use a value of 16.
+        cmp/gt  r6,r1           !  56 MT
+        add     #-1,r5          !  50 EX
+        bf/s    6f              ! 108 BR                (not small)
+         mov    r5, r3          !   5 MT (latency=0)
+        shlr    r6              ! 104 EX
+        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
+        bf/s    4f              ! 111 BR
+         add    #-1,r3          !  50 EX
+        tst     r6, r6          !  86 MT
+        bt/s    98f             ! 110 BR
+         mov.b  r1,@-r0         !  29 LS
+        ! 4 cycles, 2 bytes per iteration
+3:      mov.b   @(r0,r5),r1     !  20 LS (latency=2)
+4:      mov.b   @(r0,r3),r2     !  20 LS (latency=2)
+        dt      r6              !  67 EX
+        mov.b   r1,@-r0         !  29 LS
+        bf/s    3b              ! 111 BR
+         mov.b  r2,@-r0         !  29 LS
+98:
+        rts
+         nop
+99:     rts
+         mov    r4, r0
+        ! Size is not small, so it's worthwhile looking for optimisations.
+        ! First align destination to a long word boundary.
+        !
+        ! r5 = normal value -1
+6:      tst     #3, r0          !  87 MT
+        mov     #3, r3          !   6 EX
+        bt/s    2f              ! 111 BR
+         and    r0,r3           !  78 EX
+        ! 3 cycles, 1 byte per iteration
+1:      dt      r3              !  67 EX
+        mov.b   @(r0,r5),r1     !  19 LS (latency=2)
+        add     #-1, r6         !  79 EX
+        bf/s    1b              ! 109 BR
+         mov.b  r1,@-r0         !  28 LS
+2:      add     #1, r5          !  79 EX
+        ! Now select the appropriate bulk transfer code based on relative
+        ! alignment of src and dst.
+        ! The low two bits of r5 (src - dst) pick the case; r0 is borrowed
+        ! for the tst #imm tests and reloaded from r3 in each delay slot.
+        mov     r0, r3          !   5 MT (latency=0)
+        mov     r5, r0          !   5 MT (latency=0)
+        tst     #1, r0          !  87 MT
+        bf/s    1f              ! 111 BR
+         mov    #64, r7         !   6 EX
+        ! bit 0 clear
+        cmp/ge  r7, r6          !  55 MT
+        bt/s    2f              ! 111 BR
+         tst    #2, r0          !  87 MT
+        ! small
+        bt/s    .Lcase0
+         mov    r3, r0
+        bra     .Lcase2
+         nop
+        ! big
+2:      bt/s    .Lcase0b
+         mov    r3, r0
+        bra     .Lcase2b
+         nop
+        ! bit 0 set
+1:      tst     #2, r0          ! 87 MT
+        bt/s    .Lcase1
+         mov    r3, r0
+        bra     .Lcase3
+         nop
+        !
+        !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+        !
+        ! src, dst and size are all long word aligned
+        ! size is non-zero
+        !
+        ! Entered from memcpy with r0 = dst + len, r4 = dst,
+        ! r5 = src - dst and r6 = len.  Lengths of 64 or more are handed
+        ! to .Lcase00b; shorter ones are copied here two long words at a
+        ! time, after one single long word when the count is odd.
+        .balign 32
+.Lcase00:
+        mov     #64, r1         !   6 EX
+        mov     r5, r3          !   5 MT (latency=0)
+        cmp/gt  r6, r1          !  56 MT
+        add     #-4, r5         !  50 EX
+        bf      .Lcase00b       ! 108 BR                (big loop)
+        shlr2   r6              ! 105 EX
+        shlr    r6              ! 104 EX
+        ! r6 = len / 8 (long word pairs); T = bit 2 of len (odd count)
+        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+        bf/s    4f              ! 111 BR
+         add    #-8, r3         !  50 EX
+        tst     r6, r6          !  86 MT
+        bt/s    5f              ! 110 BR
+         mov.l  r1,@-r0         !  30 LS
+        ! 4 cycles, 2 long words per iteration
+3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
+        dt      r6              !  67 EX
+        mov.l   r1, @-r0        !  30 LS
+        bf/s    3b              ! 109 BR
+         mov.l  r2, @-r0        !  30 LS
+5:      rts
+         nop
+        ! Size is 16 or greater and less than 64, but may have trailing bytes
+        !
+        ! Entered from memcpy with r0 = end of dst (long word aligned),
+        ! r4 = dst, r5 = src - dst (bits 0 and 1 clear) and r6 = bytes
+        ! left.  Copies long words backwards, then the final 0-3 bytes.
+        .balign 32
+.Lcase0:
+        add     #-4, r5         !  50 EX
+        mov     r4, r7          !   5 MT (latency=0)
+        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+        mov     #4, r2          !   6 EX
+        add     #11, r7         !  50 EX
+        tst     r2, r6          !  86 MT
+        mov     r5, r3          !   5 MT (latency=0)
+        bt/s    4f              ! 111 BR
+         add    #-4, r3         !  50 EX
+        ! bit 2 of the count is set: store one long word before the pairs
+        mov.l   r1,@-r0         !  30 LS
+        ! 4 cycles, 2 long words per iteration
+3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
+        cmp/hi  r7, r0
+        mov.l   r1, @-r0        !  30 LS
+        bt/s    3b              ! 109 BR
+         mov.l  r2, @-r0        !  30 LS
+        ! Copy the final 0-3 bytes
+        ! r5 becomes (src - dst) - 1 for the byte loads; r7 becomes r4 + 1,
+        ! the byte loop bound.
+        add     #3,r5           !  50 EX
+        cmp/eq  r0, r4          !  54 MT
+        add     #-10, r7        !  50 EX
+        bt      9f              ! 110 BR
+        ! 3 cycles, 1 byte per iteration
+1:      mov.b   @(r0,r5),r1     !  19 LS
+        cmp/hi  r7,r0           !  57 MT
+        bt/s    1b              ! 111 BR
+         mov.b  r1,@-r0         !  28 LS
+9:      rts
+         nop
+        ! Size is at least 64 bytes, so will be going round the big loop at least once.
+        !
+        !   r2 = rounded up r4
+        !   r3 = rounded down r0
+        !
+        ! Two entry points: .Lcase0b first biases r5 by -4, while
+        ! .Lcase00b is entered from .Lcase00 with that bias already applied.
+        ! Phases: long words until r0 is 32-byte aligned, then 32-byte
+        ! blocks down to r2, then remaining long words and the final bytes.
+        .balign 32
+.Lcase0b:
+        add     #-4, r5         !  50 EX
+.Lcase00b:
+        mov     r0, r3          !   5 MT (latency=0)
+        mov     #(~0x1f), r1    !   6 EX
+        and     r1, r3          !  78 EX
+        mov     r4, r2          !   5 MT (latency=0)
+        cmp/eq  r3, r0          !  54 MT
+        add     #0x1f, r2       !  50 EX
+        bt/s    1f              ! 110 BR
+         and    r1, r2          !  78 EX
+        ! copy initial words until cache line aligned
+        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+        tst     #4, r0          !  87 MT
+        mov     r5, r6          !   5 MT (latency=0)
+        add     #-4, r6         !  50 EX
+        bt/s    4f              ! 111 BR
+         add    #8, r3          !  50 EX
+        ! r3 is now the aligned address + 8, because the pair loop below
+        ! compares r0 before its two stores.
+        tst     #0x18, r0       !  87 MT
+        bt/s    1f              ! 109 BR
+         mov.l  r1,@-r0         !  30 LS
+        ! 4 cycles, 2 long words per iteration
+3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+4:      mov.l   @(r0, r6), r7   !  21 LS (latency=2)
+        cmp/eq  r3, r0          !  54 MT
+        mov.l   r1, @-r0        !  30 LS
+        bf/s    3b              ! 109 BR
+         mov.l  r7, @-r0        !  30 LS
+        ! Copy the cache line aligned blocks
+        !
+        ! In use: r0, r2, r4, r5
+        ! Scratch: r1, r3, r6, r7
+        !
+        ! We could do this with the four scratch registers, but if src
+        ! and dest hit the same cache line, this will thrash, so make
+        ! use of additional registers.
+        !
+        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+        !   r5:  src (was r0+r5)
+        !   r1:  dest (was r0)
+        ! this can be reversed at the end, so we don't need to save any extra
+        ! state.
+        !
+1:      mov.l   r8, @-r15       !  30 LS
+        add     r0, r5          !  49 EX
+        mov.l   r9, @-r15       !  30 LS
+        mov     r0, r1          !   5 MT (latency=0)
+        mov.l   r10, @-r15      !  30 LS
+        add     #-0x1c, r5      !  50 EX
+        mov.l   r11, @-r15      !  30 LS
+        ! 16 cycles, 32 bytes per iteration
+2:      mov.l   @(0x00,r5),r0   ! 18 LS (latency=2)
+        add     #-0x20, r1      ! 50 EX
+        mov.l   @(0x04,r5),r3   ! 18 LS (latency=2)
+        mov.l   @(0x08,r5),r6   ! 18 LS (latency=2)
+        mov.l   @(0x0c,r5),r7   ! 18 LS (latency=2)
+        mov.l   @(0x10,r5),r8   ! 18 LS (latency=2)
+        mov.l   @(0x14,r5),r9   ! 18 LS (latency=2)
+        mov.l   @(0x18,r5),r10  ! 18 LS (latency=2)
+        mov.l   @(0x1c,r5),r11  ! 18 LS (latency=2)
+        movca.l r0,@r1          ! 40 LS (latency=3-7)
+        mov.l   r3,@(0x04,r1)   ! 33 LS
+        mov.l   r6,@(0x08,r1)   ! 33 LS
+        mov.l   r7,@(0x0c,r1)   ! 33 LS
+        mov.l   r8,@(0x10,r1)   ! 33 LS
+        add     #-0x20, r5      ! 50 EX
+        mov.l   r9,@(0x14,r1)   ! 33 LS
+        cmp/eq  r2,r1           ! 54 MT
+        mov.l   r10,@(0x18,r1)  !  33 LS
+        bf/s    2b              ! 109 BR
+         mov.l  r11,@(0x1c,r1)  !  33 LS
+        ! Restore the r0/r5 invariant and the saved registers.
+        mov     r1, r0          !   5 MT (latency=0)
+        mov.l   @r15+, r11      !  15 LS
+        sub     r1, r5          !  75 EX
+        mov.l   @r15+, r10      !  15 LS
+        cmp/eq  r4, r0          !  54 MT
+        bf/s    1f              ! 109 BR
+         mov.l   @r15+, r9      !  15 LS
+        ! The r8 restore at label 1 is both the delay slot of this rts
+        ! (copy finished) and the target of the bf/s above (bytes remain).
+        rts
+1:       mov.l  @r15+, r8       !  15 LS
+        sub     r4, r1          !  75 EX                (len remaining)
+        ! number of trailing bytes is non-zero
+        !
+        ! invariants restored (r5 already decremented by 4)
+        ! also r1=num bytes remaining
+        mov     #4, r2          !   6 EX
+        mov     r4, r7          !   5 MT (latency=0)
+        add     #0x1c, r5       !  50 EX                (back to -4)
+        cmp/hs  r2, r1          !  58 MT
+        bf/s    5f              ! 108 BR
+         add     #11, r7        !  50 EX
+        mov.l   @(r0, r5), r6   !  21 LS (latency=2)
+        tst     r2, r1          !  86 MT
+        mov     r5, r3          !   5 MT (latency=0)
+        bt/s    4f              ! 111 BR
+         add    #-4, r3         !  50 EX
+        cmp/hs  r2, r1          !  58 MT
+        bt/s    5f              ! 111 BR
+         mov.l  r6,@-r0         !  30 LS
+        ! 4 cycles, 2 long words per iteration
+3:      mov.l   @(r0, r5), r6   !  21 LS (latency=2)
+4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
+        cmp/hi  r7, r0
+        mov.l   r6, @-r0        !  30 LS
+        bt/s    3b              ! 109 BR
+         mov.l  r2, @-r0        !  30 LS
+        ! Copy the final 0-3 bytes
+5:      cmp/eq  r0, r4          !  54 MT
+        add     #-10, r7        !  50 EX
+        bt      9f              ! 110 BR
+        add     #3,r5           !  50 EX
+        ! 3 cycles, 1 byte per iteration
+1:      mov.b   @(r0,r5),r1     !  19 LS
+        cmp/hi  r7,r0           !  57 MT
+        bt/s    1b              ! 111 BR
+         mov.b  r1,@-r0         !  28 LS
+9:      rts
+         nop
+        !
+        !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+        !
+        ! Entered from memcpy with r0 = end of dst (long word aligned),
+        ! r4 = dst and r5 = src - dst, where (r5 & 3) == 2.
+        ! Copies 16-bit words backwards, two per iteration, while
+        ! r0 > r4 + 7, then branches to label 10 for the remainder.
+        .balign 32
+.Lcase2:
+        ! Size is 16 or greater and less than 64, but may have trailing bytes
+2:      mov     r5, r6          !   5 MT (latency=0)
+        add     #-2,r5          !  50 EX
+        mov     r4,r2           !   5 MT (latency=0)
+        add     #-4,r6          !  50 EX
+        add     #7,r2           !  50 EX
+3:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
+        mov.w   @(r0,r6),r3     !  20 LS (latency=2)
+        cmp/hi  r2,r0           !  57 MT
+        mov.w   r1,@-r0         !  29 LS
+        bt/s    3b              ! 111 BR
+         mov.w  r3,@-r0         !  29 LS
+        bra     10f
+         nop
+        .balign 32
+.Lcase2b:
+        ! Size is at least 64 bytes, so will be going round the big loop at least once.
+        !
+        !   r2 = rounded up r4
+        !   r3 = rounded down r0
+        mov     r0, r3          !   5 MT (latency=0)
+        mov     #(~0x1f), r1    !   6 EX
+        and     r1, r3          !  78 EX
+        mov     r4, r2          !   5 MT (latency=0)
+        cmp/eq  r3, r0          !  54 MT
+        add     #0x1f, r2       !  50 EX
+        add     #-2, r5         !  50 EX
+        bt/s    1f              ! 110 BR
+         and    r1, r2          !  78 EX
+        ! Copy a short word one at a time until we are cache line aligned
+        !   Normal values: r0, r2, r3, r4
+        !   Unused: r1, r6, r7
+        !   Mod: r5 (=r5-2)
+        !
+        add     #2, r3          !  50 EX
+2:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
+        cmp/eq  r3,r0           !  54 MT
+        bf/s    2b              ! 111 BR
+         mov.w  r1,@-r0         !  29 LS
+        ! Copy the cache line aligned blocks
+        !
+        ! In use: r0, r2, r4, r5 (=r5-2)
+        ! Scratch: r1, r3, r6, r7
+        !
+        ! We could do this with the four scratch registers, but if src
+        ! and dest hit the same cache line, this will thrash, so make
+        ! use of additional registers.
+        !
+        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+        !   r5:  src (was r0+r5)
+        !   r1:  dest (was r0)
+        ! this can be reversed at the end, so we don't need to save any extra
+        ! state.
+        !
+1:      mov.l   r8, @-r15       !  30 LS
+        add     r0, r5          !  49 EX
+        mov.l   r9, @-r15       !  30 LS
+        mov     r0, r1          !   5 MT (latency=0)
+        mov.l   r10, @-r15      !  30 LS
+        add     #-0x1e, r5      !  50 EX
+        mov.l   r11, @-r15      !  30 LS
+        mov.l   r12, @-r15      !  30 LS
+        ! 17 cycles, 32 bytes per iteration
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+2:      mov.w   @r5+, r0        !  14 LS (latency=2)            ..JI
+        add     #-0x20, r1      !  50 EX
+        mov.l   @r5+, r3        !  15 LS (latency=2)            NMLK
+        mov.l   @r5+, r6        !  15 LS (latency=2)            RQPO
+        shll16  r0              ! 103 EX                        JI..
+        mov.l   @r5+, r7        !  15 LS (latency=2)
+        xtrct   r3, r0          !  48 EX                        LKJI
+        mov.l   @r5+, r8        !  15 LS (latency=2)
+        xtrct   r6, r3          !  48 EX                        PONM
+        mov.l   @r5+, r9        !  15 LS (latency=2)
+        xtrct   r7, r6          !  48 EX
+        mov.l   @r5+, r10       !  15 LS (latency=2)
+        xtrct   r8, r7          !  48 EX
+        mov.l   @r5+, r11       !  15 LS (latency=2)
+        xtrct   r9, r8          !  48 EX
+        mov.w   @r5+, r12       !  15 LS (latency=2)
+        xtrct   r10, r9         !  48 EX
+        movca.l r0,@r1          !  40 LS (latency=3-7)
+        xtrct   r11, r10        !  48 EX
+        mov.l   r3, @(0x04,r1)  !  33 LS
+        xtrct   r12, r11        !  48 EX
+        mov.l   r6, @(0x08,r1)  !  33 LS
+        mov.l   r7, @(0x0c,r1)  !  33 LS
+        mov.l   r8, @(0x10,r1)  !  33 LS
+        add     #-0x40, r5      !  50 EX
+        mov.l   r9, @(0x14,r1)  !  33 LS
+        cmp/eq  r2,r1           !  54 MT
+        mov.l   r10, @(0x18,r1) !  33 LS
+        bf/s    2b              ! 109 BR
+         mov.l  r11, @(0x1c,r1) !  33 LS
+#else
+2:      mov.w   @(0x1e,r5), r0  !  17 LS (latency=2)
+        add     #-2, r5         !  50 EX
+        mov.l   @(0x1c,r5), r3  !  18 LS (latency=2)
+        add     #-4, r1         !  50 EX
+        mov.l   @(0x18,r5), r6  !  18 LS (latency=2)
+        shll16  r0              ! 103 EX
+        mov.l   @(0x14,r5), r7  !  18 LS (latency=2)
+        xtrct   r3, r0          !  48 EX
+        mov.l   @(0x10,r5), r8  !  18 LS (latency=2)
+        xtrct   r6, r3          !  48 EX
+        mov.l   @(0x0c,r5), r9  !  18 LS (latency=2)
+        xtrct   r7, r6          !  48 EX
+        mov.l   @(0x08,r5), r10 !  18 LS (latency=2)
+        xtrct   r8, r7          !  48 EX
+        mov.l   @(0x04,r5), r11 !  18 LS (latency=2)
+        xtrct   r9, r8          !  48 EX
+        mov.w   @(0x02,r5), r12 !  18 LS (latency=2)
+        xtrct   r10, r9         !  48 EX
+        movca.l r0,@r1          !  40 LS (latency=3-7)
+        add     #-0x1c, r1      !  50 EX
+        mov.l   r3, @(0x1c,r1)  !  33 LS
+        xtrct   r11, r10        !  48 EX
+        mov.l   r6, @(0x18,r1)  !  33 LS
+        xtrct   r12, r11        !  48 EX
+        mov.l   r7, @(0x14,r1)  !  33 LS
+        mov.l   r8, @(0x10,r1)  !  33 LS
+        add     #-0x3e, r5      !  50 EX
+        mov.l   r9, @(0x0c,r1)  !  33 LS
+        cmp/eq  r2,r1           !  54 MT
+        mov.l   r10, @(0x08,r1) !  33 LS
+        bf/s    2b              ! 109 BR
+         mov.l  r11, @(0x04,r1) !  33 LS
+#endif
+        mov.l   @r15+, r12
+        mov     r1, r0          !   5 MT (latency=0)
+        mov.l   @r15+, r11      !  15 LS
+        sub     r1, r5          !  75 EX
+        mov.l   @r15+, r10      !  15 LS
+        cmp/eq  r4, r0          !  54 MT
+        bf/s    1f              ! 109 BR
+         mov.l   @r15+, r9      !  15 LS
+        rts
+1:       mov.l  @r15+, r8       !  15 LS
+        add     #0x1e, r5       !  50 EX
+        ! Finish off a short word at a time
+        ! r5 must be invariant - 2
+10:     mov     r4,r2           !   5 MT (latency=0)
+        add     #1,r2           !  50 EX
+        cmp/hi  r2, r0          !  57 MT
+        bf/s    1f              ! 109 BR
+         add    #2, r2          !  50 EX
+3:      mov.w   @(r0,r5),r1     !  20 LS
+        cmp/hi  r2,r0           !  57 MT
+        bt/s    3b              ! 109 BR
+         mov.w  r1,@-r0         !  29 LS
+1:
+        !
+        ! Finally, copy the last byte if necessary
+        cmp/eq  r4,r0           !  54 MT
+        bt/s    9b
+         add    #1,r5
+        mov.b   @(r0,r5),r1
+        rts
+         mov.b  r1,@-r0
diff --git a/arch/sh/lib/memcpy.S b/arch/sh/lib/memcpy.S
new file mode 100644
index 000000000000..232fab34c261
--- /dev/null
+++ b/arch/sh/lib/memcpy.S
@@ -0,0 +1,227 @@
+/* $Id: memcpy.S,v 1.3 2001/07/27 11:50:52 gniibe Exp $
+ *
+ * "memcpy" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+/*
+ * void *memcpy(void *dst, const void *src, size_t n);
+ * The memory areas of DST and SRC are assumed not to overlap.
+ */
+#include <linux/linkage.h>
+ENTRY(memcpy)
+        ! In:  r4 = dst, r5 = src, r6 = n.
+        ! Out: r0 = dst on every exit path.
+        ! The copy runs from the END of the buffers towards the start:
+        ! r0 starts at dst+n and every store uses @-r0, while r5 is kept as
+        ! an offset so that @(r0,r5) addresses the matching source data.
+        ! r1, r2, r3 and r6 are used as scratch.
+        tst     r6,r6
+        bt/s    9f              ! if n=0, do nothing
+         mov    r4,r0           ! (delay slot) r0 = dst
+        sub     r4,r5           ! From here, r5 has the distance to r0
+        add     r6,r0           ! From here, r0 points the end of copying point
+        mov     #12,r1
+        cmp/gt  r6,r1           ! T = (12 > n), signed compare
+        bt/s    7f              ! if it's too small, copy a byte at once
+         add    #-1,r5          ! (delay slot) r5 = src-dst-1, as the byte loop expects
+        add     #1,r5           ! branch not taken: undo the delay slot, r5 = src-dst
+        !                       From here, r6 is free
+        !
+        !      r4   -->  [ ...  ] DST             [ ...  ] SRC
+        !                [ ...  ]                 [ ...  ]
+        !                  :                        :
+        !      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
+        !
+        !
+        ! Dispatch on (src - dst) & 3, i.e. on how far the source is
+        ! misaligned relative to the destination.
+        mov     r5,r1
+        mov     #3,r2           ! r2 = 3 stays live: case0, case1 and case3 reuse it as a mask
+        and     r2,r1           ! r1 = (src - dst) & 3
+        shll2   r1              ! scale to a byte offset into jmptable
+        mov     r0,r3           ! Save the value on R0 to R3
+        mova    jmptable,r0     ! mova can only write to r0
+        add     r1,r0
+        mov.l   @r0,r1          ! r1 = address of the selected case
+        jmp     @r1
+         mov    r3,r0           ! and back to R0
+        .balign 4
+jmptable:
+        .long   case0
+        .long   case1
+        .long   case2
+        .long   case3
+        ! copy a byte at once
+        ! Expects r5 = src-dst-1, so that @(r0,r5) is the source byte that
+        ! belongs at r0-1.  Label 8 is also the target of the case tails,
+        ! which set r2 = dst+1 themselves.
+7:      mov     r4,r2
+        add     #1,r2           ! r2 = dst+1
+8:
+        cmp/hi  r2,r0           ! T = (r0 > dst+1), unsigned
+        mov.b   @(r0,r5),r1
+        bt/s    8b                      ! while (r0>r2)
+         mov.b  r1,@-r0         ! (delay slot) executed on the final pass as well
+9:
+        rts
+         nop
+case0:
+        !
+        !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+        !
+        ! (src - dst) & 3 == 0: source and destination share the same alignment.
+        ! First, align to long word boundary
+        mov     r0,r3
+        and     r2,r3           ! r3 = r0 & 3 = number of bytes above the boundary
+        tst     r3,r3
+        bt/s    2f              ! already aligned
+         add    #-4,r5          ! (delay slot) r5 = src-dst-4 for the long word loop
+        add     #3,r5           ! r5 = src-dst-1 for the byte copies
+1:      dt      r3              ! r3 = r3 - 1, T = (r3 == 0)
+        mov.b   @(r0,r5),r1
+        bf/s    1b
+         mov.b  r1,@-r0
+        !
+        add     #-3,r5          ! back to r5 = src-dst-4
+2:      ! Second, copy a long word at once
+        mov     r4,r2
+        add     #7,r2           ! r2 = dst+7
+3:      mov.l   @(r0,r5),r1
+        cmp/hi  r2,r0           ! go round again while 4 or more bytes remain after this store
+        bt/s    3b
+         mov.l  r1,@-r0
+        !
+        ! Third, copy a byte at once, if necessary
+        cmp/eq  r4,r0
+        bt/s    9b              ! r0 == dst: nothing left
+         add    #3,r5           ! (delay slot) r5 = src-dst-1
+        bra     8b
+         add    #-6,r2          ! (delay slot) r2 = dst+1
+case1:
+        !
+        !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+        !
+        ! (src - dst) & 3 == 1.
+        ! First, align to long word boundary
+        mov     r0,r3
+        and     r2,r3           ! r3 = r0 & 3 = number of bytes above the boundary
+        tst     r3,r3
+        bt/s    2f
+         add    #-1,r5          ! (delay slot) r5 = src-dst-1
+1:      dt      r3
+        mov.b   @(r0,r5),r1
+        bf/s    1b
+         mov.b  r1,@-r0
+        !
+2:      ! Second, read a long word and write a long word at once
+        mov.l   @(r0,r5),r1     ! prime r1 with the source long word at r0+r5 (long aligned here)
+        add     #-4,r5          ! r5 = src-dst-5: each load now fetches the next lower long word
+        mov     r4,r2
+        add     #7,r2           ! r2 = dst+7
+        !
+        ! Each pass merges one byte of the previously loaded long word with
+        ! three bytes of the newly loaded one and stores the result.
+#ifdef __LITTLE_ENDIAN__
+3:      mov     r1,r3           ! RQPO
+        shll16  r3
+        shll8   r3              ! Oxxx
+        mov.l   @(r0,r5),r1     ! NMLK
+        mov     r1,r6
+        shlr8   r6              ! xNML
+        or      r6,r3           ! ONML
+        cmp/hi  r2,r0
+        bt/s    3b
+         mov.l  r3,@-r0
+#else
+3:      mov     r1,r3           ! OPQR
+        shlr16  r3
+        shlr8   r3              ! xxxO
+        mov.l   @(r0,r5),r1     ! KLMN
+        mov     r1,r6
+        shll8   r6              ! LMNx
+        or      r6,r3           ! LMNO
+        cmp/hi  r2,r0
+        bt/s    3b
+         mov.l  r3,@-r0
+#endif
+        !
+        ! Third, copy a byte at once, if necessary
+        cmp/eq  r4,r0
+        bt/s    9b              ! r0 == dst: nothing left
+         add    #4,r5           ! (delay slot) r5 = src-dst-1
+        bra     8b
+         add    #-6,r2          ! (delay slot) r2 = dst+1
+case2:
+        !
+        !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+        !
+        ! (src - dst) & 3 == 2: word accesses are aligned on both sides.
+        ! First, align to word boundary
+        tst     #1,r0           ! T = (r0 is even)
+        bt/s    2f
+         add    #-1,r5          ! (delay slot) r5 = src-dst-1
+        mov.b   @(r0,r5),r1
+        mov.b   r1,@-r0
+        !
+2:      ! Second, read a word and write a word at once
+        add     #-1,r5          ! r5 = src-dst-2
+        mov     r4,r2
+        add     #3,r2           ! r2 = dst+3
+        !
+3:      mov.w   @(r0,r5),r1
+        cmp/hi  r2,r0           ! go round again while 2 or more bytes remain after this store
+        bt/s    3b
+         mov.w  r1,@-r0
+        !
+        ! Third, copy a byte at once, if necessary
+        cmp/eq  r4,r0
+        bt/s    9b              ! r0 == dst: nothing left
+         add    #1,r5           ! (delay slot) r5 = src-dst-1
+        mov.b   @(r0,r5),r1     ! exactly one byte is left at this point
+        rts
+         mov.b  r1,@-r0         ! (delay slot) store it; r0 = dst
+case3:
+        !
+        !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+        !
+        ! (src - dst) & 3 == 3.
+        ! First, align to long word boundary
+        mov     r0,r3
+        and     r2,r3           ! r3 = r0 & 3 = number of bytes above the boundary
+        tst     r3,r3
+        bt/s    2f
+         add    #-1,r5          ! (delay slot) r5 = src-dst-1
+1:      dt      r3
+        mov.b   @(r0,r5),r1
+        bf/s    1b
+         mov.b  r1,@-r0
+        !
+2:      ! Second, read a long word and write a long word at once
+        add     #-2,r5          ! r5 = src-dst-3, which makes r0+r5 long aligned
+        mov.l   @(r0,r5),r1     ! prime r1 with the source long word at r0+r5
+        add     #-4,r5          ! r5 = src-dst-7: each load now fetches the next lower long word
+        mov     r4,r2
+        add     #7,r2           ! r2 = dst+7
+        !
+        ! Each pass merges three bytes of the previously loaded long word
+        ! with one byte of the newly loaded one and stores the result.
+#ifdef __LITTLE_ENDIAN__
+3:      mov     r1,r3           ! RQPO
+        shll8   r3              ! QPOx
+        mov.l   @(r0,r5),r1     ! NMLK
+        mov     r1,r6
+        shlr16  r6
+        shlr8   r6              ! xxxN
+        or      r6,r3           ! QPON
+        cmp/hi  r2,r0
+        bt/s    3b
+         mov.l  r3,@-r0
+#else
+3:      mov     r1,r3           ! OPQR
+        shlr8   r3              ! xOPQ
+        mov.l   @(r0,r5),r1     ! KLMN
+        mov     r1,r6
+        shll16  r6
+        shll8   r6              ! Nxxx
+        or      r6,r3           ! NOPQ
+        cmp/hi  r2,r0
+        bt/s    3b
+         mov.l  r3,@-r0
+#endif
+        !
+        ! Third, copy a byte at once, if necessary
+        cmp/eq  r4,r0
+        bt/s    9b              ! r0 == dst: nothing left
+         add    #6,r5           ! (delay slot) r5 = src-dst-1
+        bra     8b
+         add    #-6,r2          ! (delay slot) r2 = dst+1
diff --git a/arch/sh/lib/memmove.S b/arch/sh/lib/memmove.S
new file mode 100644
index 000000000000..5a2211f09202
--- /dev/null
+++ b/arch/sh/lib/memmove.S
@@ -0,0 +1,254 @@
+/* $Id: memmove.S,v 1.2 2001/07/27 11:51:09 gniibe Exp $
+ *
+ * "memmove" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+/*
+ * void *memmove(void *dst, const void *src, size_t n);
+ * The memory areas may overlap.
+ */
+#include <linux/linkage.h>
+ENTRY(memmove)
+        ! In:  r4 = dst, r5 = src, r6 = n.
+        ! dst > src (unsigned) is handed to memcpy.  Otherwise the copy is
+        ! done here in increasing address order: r0 is the read pointer,
+        ! r5 = src+n marks the end and r4 is kept as an offset so that
+        ! @(r0,r4) addresses the destination.  The exit at label 9 computes
+        ! r0 + r4 - r6, which is dst on the paths that reach it.
+        ! r1, r2 and r3 are used as scratch.
+        ! if dest > src, call memcpy (it copies in decreasing order)
+        cmp/hi  r5,r4           ! T = (dst > src), unsigned
+        bf      1f
+        mov.l   2f,r0           ! r0 = address of memcpy, from the literal below
+        jmp     @r0             ! jump, not call: memcpy returns directly to our caller
+         nop
+        .balign 4
+2:      .long   memcpy
+1:
+        sub     r5,r4           ! From here, r4 has the distance to r0
+        tst     r6,r6
+        bt/s    9f              ! if n=0, do nothing
+         mov    r5,r0           ! (delay slot) r0 = src
+        add     r6,r5           ! r5 = src+n, one past the last source byte
+        mov     #12,r1
+        cmp/gt  r6,r1           ! T = (12 > n), signed compare
+        bt/s    8f              ! if it's too small, copy a byte at once
+         add    #-1,r4          ! (delay slot) r4 = dst-src-1, as the byte loop expects
+        add     #1,r4           ! branch not taken: undo the delay slot, r4 = dst-src
+        !
+        !                [ ...  ] DST             [ ...  ] SRC
+        !                [ ...  ]                 [ ...  ]
+        !                  :                        :
+        !      r0+r4-->  [ ...  ]       r0    --> [ ...  ]
+        !                  :                        :
+        !                [ ...  ]                 [ ...  ]
+        !                               r5    -->
+        !
+        ! Dispatch on (dst - src) & 3.  This is the negated difference
+        ! compared with memcpy, which is why the case1 and case3 diagrams
+        ! are swapped relative to that routine.
+        mov     r4,r1
+        mov     #3,r2           ! r2 = 3 stays live: case0, case1 and case3 reuse it as a mask
+        and     r2,r1           ! r1 = (dst - src) & 3
+        shll2   r1              ! scale to a byte offset into jmptable
+        mov     r0,r3           ! Save the value on R0 to R3
+        mova    jmptable,r0     ! mova can only write to r0
+        add     r1,r0
+        mov.l   @r0,r1          ! r1 = address of the selected case
+        jmp     @r1
+         mov    r3,r0           ! and back to R0
+        .balign 4
+jmptable:
+        .long   case0
+        .long   case1
+        .long   case2
+        .long   case3
+        ! copy a byte at once
+        ! Expects r4 = dst-src-1: r0 has already been post-incremented when
+        ! the store executes, so @(r0,r4) is the destination of the byte
+        ! that was just read.
+8:      mov.b   @r0+,r1
+        cmp/hs  r5,r0           ! T = (r0 >= src+n), unsigned
+        bf/s    8b                      ! while (r0<r5)
+         mov.b  r1,@(r0,r4)     ! (delay slot) executed on the final pass as well
+        add     #1,r4           ! r4 = dst-src again
+9:
+        add     r4,r0
+        rts
+         sub    r6,r0           ! (delay slot) r0 = r0 + r4 - n
+case_none:
+        ! Nothing in memmove.S as shown here references this label.
+        bra     8b
+         add    #-1,r4
+case0:
+        !
+        !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+        !
+        ! (dst - src) & 3 == 0: source and destination share the same alignment.
+        ! First, align to long word boundary
+        mov     r0,r3
+        and     r2,r3           ! r3 = r0 & 3 (r2 still holds 3)
+        tst     r3,r3
+        bt/s    2f              ! already aligned
+         add    #-1,r4          ! (delay slot) r4 = dst-src-1
+        mov     #4,r2
+        sub     r3,r2           ! r2 = 4 - (r0 & 3) = bytes up to the boundary
+1:      dt      r2              ! r2 = r2 - 1, T = (r2 == 0)
+        mov.b   @r0+,r1
+        bf/s    1b
+         mov.b  r1,@(r0,r4)
+        !
+2:      ! Second, copy a long word at once
+        add     #-3,r4          ! r4 = dst-src-4: the store runs after r0 has advanced by 4
+        add     #-3,r5          ! r5 = src+n-3: leave the loop once fewer than 4 bytes remain
+3:      mov.l   @r0+,r1
+        cmp/hs  r5,r0
+        bf/s    3b
+         mov.l  r1,@(r0,r4)
+        add     #3,r5           ! restore r5 = src+n
+        !
+        ! Third, copy a byte at once, if necessary
+        cmp/eq  r5,r0
+        bt/s    9b              ! r0 == src+n: nothing left
+         add    #4,r4           ! (delay slot) r4 = dst-src
+        bra     8b
+         add    #-1,r4          ! (delay slot) r4 = dst-src-1
+case3:
+        !
+        !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+        !
+        ! (dst - src) & 3 == 3.
+        ! First, align to long word boundary
+        mov     r0,r3
+        and     r2,r3           ! r3 = r0 & 3 (r2 still holds 3)
+        tst     r3,r3
+        bt/s    2f
+         add    #-1,r4          ! (delay slot) r4 = dst-src-1
+        mov     #4,r2
+        sub     r3,r2           ! r2 = 4 - (r0 & 3) = bytes up to the boundary
+1:      dt      r2
+        mov.b   @r0+,r1
+        bf/s    1b
+         mov.b  r1,@(r0,r4)
+        !
+2:      ! Second, read a long word and write a long word at once
+        add     #-2,r4          ! r4 = dst-src-3, which makes r0+r4 long aligned
+        mov.l   @(r0,r4),r1     ! prime r1 from the DESTINATION long word at r0+r4
+        add     #-7,r5          ! r5 = src+n-7
+        add     #-4,r4          ! r4 = dst-src-7: the store runs after r0 has advanced by 4
+        !
+        ! Each pass stores three carried bytes plus the first byte of the
+        ! newly loaded source long word.  On the first pass the carried
+        ! bytes are the three destination bytes read by the priming load.
+        ! NOTE(review): those three bytes sit just below the current
+        ! destination position and are written back with the values read.
+#ifdef __LITTLE_ENDIAN__
+        shll8   r1
+3:      mov     r1,r3           ! JIHG
+        shlr8   r3              ! xJIH
+        mov.l   @r0+,r1         ! NMLK
+        mov     r1,r2
+        shll16  r2
+        shll8   r2              ! Kxxx
+        or      r2,r3           ! KJIH
+        cmp/hs  r5,r0
+        bf/s    3b
+         mov.l  r3,@(r0,r4)
+#else
+        shlr8   r1
+3:      mov     r1,r3           ! GHIJ
+        shll8   r3              ! HIJx
+        mov.l   @r0+,r1         ! KLMN
+        mov     r1,r2
+        shlr16  r2
+        shlr8   r2              ! xxxK
+        or      r2,r3           ! HIJK
+        cmp/hs  r5,r0
+        bf/s    3b
+         mov.l  r3,@(r0,r4)
+#endif
+        add     #7,r5           ! restore r5 = src+n
+        !
+        ! Third, copy a byte at once, if necessary
+        cmp/eq  r5,r0
+        bt/s    9b
+         add    #7,r4           ! (delay slot) r4 = dst-src
+        add     #-3,r0          ! only one byte of the last long word loaded was stored
+        bra     8b
+         add    #-1,r4          ! (delay slot) r4 = dst-src-1
+case2:
+        !
+        !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+        !
+        ! (dst - src) & 3 == 2: word accesses are aligned on both sides.
+        ! First, align to word boundary
+        tst     #1,r0           ! T = (r0 is even)
+        bt/s    2f
+         add    #-1,r4          ! (delay slot) r4 = dst-src-1
+        mov.b   @r0+,r1
+        mov.b   r1,@(r0,r4)
+        !
+2:      ! Second, read a word and write a word at once
+        add     #-1,r4          ! r4 = dst-src-2: the store runs after r0 has advanced by 2
+        add     #-1,r5          ! r5 = src+n-1: leave the loop once fewer than 2 bytes remain
+        !
+3:      mov.w   @r0+,r1
+        cmp/hs  r5,r0
+        bf/s    3b
+         mov.w  r1,@(r0,r4)
+        add     #1,r5           ! restore r5 = src+n
+        !
+        ! Third, copy a byte at once, if necessary
+        cmp/eq  r5,r0
+        bt/s    9b              ! r0 == src+n: nothing left
+         add    #2,r4           ! (delay slot) r4 = dst-src
+        mov.b   @r0,r1          ! exactly one byte is left at this point
+        mov.b   r1,@(r0,r4)
+        bra     9b
+         add    #1,r0           ! (delay slot) r0 = src+n, as label 9 expects
+case1:
+        !
+        !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+        !
+        ! (dst - src) & 3 == 1.
+        ! First, align to long word boundary
+        mov     r0,r3
+        and     r2,r3           ! r3 = r0 & 3 (r2 still holds 3)
+        tst     r3,r3
+        bt/s    2f
+         add    #-1,r4          ! (delay slot) r4 = dst-src-1, which makes r0+r4 long aligned
+        mov     #4,r2
+        sub     r3,r2           ! r2 = 4 - (r0 & 3) = bytes up to the boundary
+1:      dt      r2
+        mov.b   @r0+,r1
+        bf/s    1b
+         mov.b  r1,@(r0,r4)
+        !
+2:      ! Second, read a long word and write a long word at once
+        mov.l   @(r0,r4),r1     ! prime r1 from the DESTINATION long word at r0+r4
+        add     #-7,r5          ! r5 = src+n-7
+        add     #-4,r4          ! r4 = dst-src-5: the store runs after r0 has advanced by 4
+        !
+        ! Each pass stores one carried byte plus the first three bytes of
+        ! the newly loaded source long word.  On the first pass the carried
+        ! byte is the destination byte read by the priming load.
+        ! NOTE(review): that byte sits just below the current destination
+        ! position and is written back with the value read.
+#ifdef __LITTLE_ENDIAN__
+        shll16  r1
+        shll8   r1
+3:      mov     r1,r3           ! JIHG
+        shlr16  r3
+        shlr8   r3              ! xxxJ
+        mov.l   @r0+,r1         ! NMLK
+        mov     r1,r2
+        shll8   r2              ! MLKx
+        or      r2,r3           ! MLKJ
+        cmp/hs  r5,r0
+        bf/s    3b
+         mov.l  r3,@(r0,r4)
+#else
+        shlr16  r1
+        shlr8   r1
+3:      mov     r1,r3           ! GHIJ
+        shll16  r3
+        shll8   r3              ! Jxxx
+        mov.l   @r0+,r1         ! KLMN
+        mov     r1,r2
+        shlr8   r2              ! xKLM
+        or      r2,r3           ! JKLM
+        cmp/hs  r5,r0
+        bf/s    3b              ! while(r0<r5)
+         mov.l  r3,@(r0,r4)
+#endif
+        add     #7,r5           ! restore r5 = src+n
+        !
+        ! Third, copy a byte at once, if necessary
+        cmp/eq  r5,r0
+        bt/s    9b
+         add    #5,r4           ! (delay slot) r4 = dst-src
+        add     #-3,r0          ! restart the byte loop 3 below r0; 2 of those bytes were already stored
+        bra     8b
+         add    #-1,r4          ! (delay slot) r4 = dst-src-1
diff --git a/arch/sh/lib/memset.S b/arch/sh/lib/memset.S
new file mode 100644
index 000000000000..95670090680e
--- /dev/null
+++ b/arch/sh/lib/memset.S
@@ -0,0 +1,57 @@
+/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
+ *
+ * "memset" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+/*
+ *            void *memset(void *s, int c, size_t n);
+ */
+#include <linux/linkage.h>
+ENTRY(memset)
+        ! In:  r4 = s, r5 = c, r6 = n.
+        ! Out: r0 = s.
+        ! The buffer is filled from its END towards the start: r4 is moved
+        ! to s+n first and every store uses @-r4, so r4 is back at s when
+        ! the fill is complete.  r0 is scratch until the final delay slot.
+        tst     r6,r6
+        bt/s    5f              ! if n=0, do nothing
+         add    r6,r4           ! (delay slot) r4 = s+n
+        mov     #12,r0
+        cmp/gt  r6,r0           ! T = (12 > n), signed compare
+        bt/s    4f              ! if it's too small, set a byte at once
+         mov    r4,r0
+        and     #3,r0           ! r0 = number of bytes above the last long word boundary
+        cmp/eq  #0,r0
+        bt/s    2f              ! It's aligned
+         sub    r0,r6           ! (delay slot) those bytes are handled by loop 1
+1:
+        dt      r0              ! r0 = r0 - 1, T = (r0 == 0)
+        bf/s    1b
+         mov.b  r5,@-r4
+2:                              ! make VVVV
+        ! c is an int and may carry sign-extension or other bits above
+        ! bit 7.  Reduce it to the fill byte first, otherwise the swap/or
+        ! sequence below smears those bits into the long word pattern and
+        ! the long word stores write a different value than the byte stores.
+        extu.b  r5,r5           ! 000V
+        swap.b  r5,r0           !   V0
+        or      r0,r5           !   VV
+        swap.w  r5,r0           ! VV00
+        or      r0,r5           ! VVVV
+        !
+        mov     r6,r0
+        shlr2   r0
+        shlr    r0              ! r0 = r6 >> 3
+        ! n >= 12 and at most 3 bytes were removed above, so r0 >= 1 here.
+3:
+        dt      r0
+        mov.l   r5,@-r4         ! set 8-byte at once
+        bf/s    3b
+         mov.l  r5,@-r4
+        !
+        mov     #7,r0
+        and     r0,r6           ! r6 = bytes left over after the 8-byte loop
+        tst     r6,r6
+        bt      5f
+        ! fill bytes
+4:
+        dt      r6
+        bf/s    4b
+         mov.b  r5,@-r4         ! (delay slot) executed on the final pass as well
+5:
+        rts
+         mov    r4,r0           ! (delay slot) return s
diff --git a/arch/sh/lib/strcasecmp.c b/arch/sh/lib/strcasecmp.c
new file mode 100644
index 000000000000..4e57a216feaf
--- /dev/null
+++ b/arch/sh/lib/strcasecmp.c
@@ -0,0 +1,26 @@
+/*
+ *  linux/arch/alpha/lib/strcasecmp.c
+ */
+#include <linux/string.h>
+/*
+ * Case-insensitive comparison of two NUL-terminated strings.
+ *
+ * Only the C locale is handled: the upper-case letters 'A'..'Z' are folded
+ * to lower case and every other byte is compared as is.  That is good
+ * enough because the single user passes strings known to be 7-bit ASCII.
+ *
+ * Returns the difference between the first pair of folded bytes that
+ * differ, or 0 when both strings end without a difference.
+ */
+int strcasecmp(const char *a, const char *b)
+{
+        for (;;) {
+                int lhs = *a++ & 0xff;
+                int rhs = *b++ & 0xff;
+                if (lhs >= 'A' && lhs <= 'Z')
+                        lhs += 'a' - 'A';
+                if (rhs >= 'A' && rhs <= 'Z')
+                        rhs += 'a' - 'A';
+                /* Stop at the first mismatch or at the common terminator. */
+                if (lhs != rhs || lhs == '\0')
+                        return lhs - rhs;
+        }
+}
diff --git a/arch/sh/lib/strlen.S b/arch/sh/lib/strlen.S
new file mode 100644
index 000000000000..f8ab296047b3
--- /dev/null
+++ b/arch/sh/lib/strlen.S
@@ -0,0 +1,70 @@
+/* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $
+ *
+ * "strlen" implementation of SuperH
+ *
+ * Copyright (C) 1999  Kaz Kojima
+ *
+ */
+/* size_t strlen (const char *s)  */
+#include <linux/linkage.h>
+ENTRY(strlen)
+        ! In:  r4 = s.
+        ! Out: r0 = length.
+        ! r2 accumulates the length, r1 holds the byte or long word under
+        ! test.  Bytes are checked one at a time up to a long word
+        ! boundary, then four at a time with cmp/str.
+        mov     r4,r0
+        and     #3,r0           ! r0 = s & 3
+        tst     r0,r0
+        bt/s    1f              ! already long word aligned
+         mov    #0,r2           ! (delay slot) length = 0
+        add     #-1,r0
+        shll2   r0
+        shll    r0              ! r0 = ((s & 3) - 1) * 8
+        ! Each group of four instructions below is 8 bytes long, so the
+        ! braf skips (s & 3) - 1 of them and 4 - (s & 3) bytes get checked.
+        braf    r0
+         nop
+        mov.b   @r4+,r1         ! entry for s & 3 == 1
+        tst     r1,r1
+        bt      8f
+        add     #1,r2
+        mov.b   @r4+,r1         ! entry for s & 3 == 2
+        tst     r1,r1
+        bt      8f
+        add     #1,r2
+        mov.b   @r4+,r1         ! entry for s & 3 == 3
+        tst     r1,r1
+        bt      8f
+        add     #1,r2           ! r4 is long word aligned from here on
+1:
+        mov     #0,r3           ! r3 = 0, the pattern cmp/str compares against
+2:
+        mov.l   @r4+,r1
+        cmp/str r3,r1           ! T = 1 when any byte of r1 equals the same byte of r3, i.e. is 0
+        bf/s    2b
+         add    #4,r2           ! (delay slot) executed on the exiting pass as well
+        add     #-4,r2          ! take back the 4 added for the word holding the terminator
+        ! Locate the zero byte inside r1, lowest address first.
+#ifndef __LITTLE_ENDIAN__
+        swap.b  r1,r1           ! the three swaps reverse the byte order, so the
+        swap.w  r1,r1           ! lowest-addressed byte ends up in bits 7..0
+        swap.b  r1,r1
+#endif
+        extu.b  r1,r0           ! first byte of the word
+        tst     r0,r0
+        bt/s    8f
+         shlr8  r1              ! (delay slot) move the next byte into bits 7..0
+        add     #1,r2
+        extu.b  r1,r0           ! second byte
+        tst     r0,r0
+        bt/s    8f
+         shlr8  r1              ! (delay slot) move the next byte into bits 7..0
+        add     #1,r2
+        extu.b  r1,r0           ! third byte
+        tst     r0,r0
+        bt      8f
+        add     #1,r2           ! first three bytes are non-zero: the zero is the fourth
+8:
+        rts
+         mov    r2,r0           ! (delay slot) return the length
diff --git a/arch/sh/lib/udivdi3.c b/arch/sh/lib/udivdi3.c
new file mode 100644
index 000000000000..68f038bf3c50
--- /dev/null
+++ b/arch/sh/lib/udivdi3.c
@@ -0,0 +1,16 @@
+/*
+ * Simple __udivdi3 function which doesn't use FPU.
+ */
+#include <linux/types.h>
+extern u64 __xdiv64_32(u64 n, u32 d);
+extern void panic(const char * fmt, ...);
+/*
+ * __udivdi3 - unsigned 64-bit division restricted to 32-bit divisors.
+ * @n: dividend
+ * @d: divisor; its upper 32 bits must be zero
+ *
+ * Hands the work to __xdiv64_32() and returns its result.  A divisor that
+ * does not fit in 32 bits cannot be handled here and panics.
+ *
+ * The range test is written as a shift on purpose: the constant 0xffffffff
+ * has type unsigned int (32 bits here), so ~0xffffffff evaluates to 0 and
+ * masking d with it can never be non-zero.
+ */
+u64 __udivdi3(u64 n, u64 d)
+{
+        if (d >> 32)
+                panic("Need true 64-bit/64-bit division");
+        return __xdiv64_32(n, (u32)d);
+}