1 files changed, 800 insertions, 0 deletions
diff --git a/arch/sh/lib/memcpy-sh4.S b/arch/sh/lib/memcpy-sh4.S
new file mode 100644
index 000000000000..55f227441f9e
--- /dev/null
+++ b/arch/sh/lib/memcpy-sh4.S
@@ -0,0 +1,800 @@
+/*
+ * "memcpy" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ * Copyright (c) 2002  STMicroelectronics Ltd
+ *   Modified from memcpy.S and micro-optimised for SH4
+ *   Stuart Menefy (stuart.menefy@st.com)
+ *
+ */
+#include <linux/linkage.h>
+#include <linux/config.h>
+/*
+ * void *memcpy(void *dst, const void *src, size_t n);
+ *
+ * It is assumed that there is no overlap between src and dst.
+ * If there is an overlap, then the results are undefined.
+ */
+        !
+        !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+        !
+        ! Size is 16 or greater, and may have trailing bytes
+        !
+        ! Bulk copy used when the source sits one byte above a long word
+        ! boundary relative to the destination.  The copy runs downwards
+        ! from the end of both buffers.  As set up by memcpy:
+        !   r0 = current end of destination (long word aligned)
+        !   r4 = start of destination
+        !   r5 = src - dst, so @(r0,r5) is the source byte matching @r0
+        ! r1, r2, r3, r6 and r7 are scratch here; r5 is modified.
+        .balign 32
+.Lcase1:
+        ! Read a long word and write a long word at once
+        ! At the start of each iteration, r7 contains last long load
+        ! (biasing r5 by -1 makes r0+r5 a long word aligned source address)
+        add     #-1,r5          !  79 EX
+        mov     r4,r2           !   5 MT (0 cycles latency)
+        mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
+        add     #-4,r5          !  50 EX
+        ! r2 = r4 + 7 is the bound for the long word loop (runs while r0 > r2)
+        add     #7,r2           !  79 EX
+        !
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+        ! 6 cycles, 4 bytes per iteration
+3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
+        mov     r7, r3          !   5 MT (latency=0)    ! RQPO
+        cmp/hi  r2,r0           !  57 MT
+        shll16  r3              ! 103 EX
+        mov     r1,r6           !   5 MT (latency=0)
+        shll8   r3              ! 102 EX                ! Oxxx
+        shlr8   r6              ! 106 EX                ! xNML
+        mov     r1, r7          !   5 MT (latency=0)
+        or      r6,r3           !  82 EX                ! ONML
+        bt/s    3b              ! 109 BR
+         mov.l  r3,@-r0         !  30 LS
+#else
+3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! KLMN
+        mov     r7,r3           !   5 MT (latency=0)    ! OPQR
+        cmp/hi  r2,r0           !  57 MT
+        shlr16  r3              ! 107 EX
+        shlr8   r3              ! 106 EX                ! xxxO
+        mov     r1,r6           !   5 MT (latency=0)
+        shll8   r6              ! 102 EX                ! LMNx
+        mov     r1,r7           !   5 MT (latency=0)
+        or      r6,r3           !  82 EX                ! LMNO
+        bt/s    3b              ! 109 BR
+         mov.l  r3,@-r0         !  30 LS
+#endif
+        ! Finally, copy a byte at once, if necessary
+        ! r5 becomes (src - dst) - 1 and r2 becomes r4 + 1 for the byte loop;
+        ! nothing is left to do when r0 has already reached r4.
+        add     #4,r5           !  50 EX
+        cmp/eq  r4,r0           !  54 MT
+        add     #-6,r2          !  50 EX
+        bt      9f              ! 109 BR
+8:      cmp/hi  r2,r0           !  57 MT
+        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
+        bt/s    8b              ! 109 BR
+         mov.b  r1,@-r0         !  29 LS
+9:      rts
+         nop
+        !
+        !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+        !
+        ! Size is 16 or greater, and may have trailing bytes
+        !
+        ! Bulk copy used when the source sits three bytes above a long word
+        ! boundary relative to the destination.  The copy runs downwards
+        ! from the end of both buffers.  As set up by memcpy:
+        !   r0 = current end of destination (long word aligned)
+        !   r4 = start of destination
+        !   r5 = src - dst, so @(r0,r5) is the source byte matching @r0
+        ! r1, r2, r3, r6 and r7 are scratch here; r5 is modified.
+        .balign 32
+.Lcase3:
+        ! Read a long word and write a long word at once
+        ! At the start of each iteration, r7 contains last long load
+        ! (biasing r5 by -3 makes r0+r5 a long word aligned source address)
+        add     #-3,r5          ! 79 EX
+        mov     r4,r2           !  5 MT (0 cycles latency)
+        mov.l   @(r0,r5),r7     ! 21 LS (2 cycles latency)
+        add     #-4,r5          ! 50 EX
+        ! r2 = r4 + 7 is the bound for the long word loop (runs while r0 > r2)
+        add     #7,r2           !  79 EX
+        !
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+        ! 6 cycles, 4 bytes per iteration
+3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
+        mov     r7, r3          !   5 MT (latency=0)    ! RQPO
+        cmp/hi  r2,r0           !  57 MT
+        shll8   r3              ! 102 EX                ! QPOx
+        mov     r1,r6           !   5 MT (latency=0)
+        shlr16  r6              ! 107 EX
+        shlr8   r6              ! 106 EX                ! xxxN
+        mov     r1, r7          !   5 MT (latency=0)
+        or      r6,r3           !  82 EX                ! QPON
+        bt/s    3b              ! 109 BR
+         mov.l  r3,@-r0         !  30 LS
+#else
+        ! The previous long word is carried in r7, the register the
+        ! preamble above loads.  Taking it from r1 instead would build the
+        ! first stored long word from whatever r1 held on entry.
+3:      mov     r7,r3           ! OPQR
+        shlr8   r3              ! xOPQ
+        mov.l   @(r0,r5),r7     ! KLMN
+        mov     r7,r6
+        shll16  r6
+        shll8   r6              ! Nxxx
+        or      r6,r3           ! NOPQ
+        cmp/hi  r2,r0
+        bt/s    3b
+         mov.l  r3,@-r0
+#endif
+        ! Finally, copy a byte at once, if necessary
+        ! r5 becomes (src - dst) - 1 and r2 becomes r4 + 1 for the byte loop;
+        ! nothing is left to do when r0 has already reached r4.
+        add     #6,r5           !  50 EX
+        cmp/eq  r4,r0           !  54 MT
+        add     #-6,r2          !  50 EX
+        bt      9f              ! 109 BR
+8:      cmp/hi  r2,r0           !  57 MT
+        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
+        bt/s    8b              ! 109 BR
+         mov.b  r1,@-r0         !  29 LS
+9:      rts
+         nop
+ENTRY(memcpy)
+        ! Arguments as used below: r4 = dst, r5 = src, r6 = n.
+        ! The copy is done backwards, from the end of the buffers towards
+        ! the start; the zero-length exit explicitly returns dst in r0.
+        !
+        ! Calculate the invariants which will be used in the remainder
+        ! of the code:
+        !
+        !      r4   -->  [ ...  ] DST             [ ...  ] SRC
+        !                [ ...  ]                 [ ...  ]
+        !                  :                        :
+        !      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
+        !
+        !
+        ! Short circuit the common case of src, dst and len being 32 bit aligned
+        ! and test for zero length move
+        mov     r6, r0          !   5 MT (0 cycle latency)
+        or      r4, r0          !  82 EX
+        or      r5, r0          !  82 EX
+        tst     r6, r6          !  86 MT
+        bt/s    99f             ! 111 BR                (zero len)
+         tst    #3, r0          !  87 MT
+        ! T from the delay slot above (dst, src and n all long word aligned)
+        ! is still live at the .Lcase00 branch: nothing in between alters it.
+        ! r0 becomes dst + n and, in the delay slot, r5 becomes src - dst.
+        mov     r4, r0          !   5 MT (0 cycle latency)
+        add     r6, r0          !  49 EX
+        mov     #16, r1         !   6 EX
+        bt/s    .Lcase00        ! 111 BR                (aligned)
+         sub    r4, r5          !  75 EX
+        ! Arguments are not nicely long word aligned or zero len.
+        ! Check for small copies, and if so do a simple byte at a time copy.
+        !
+        ! Deciding on an exact value of 'small' is not easy, as the point at which
+        ! using the optimised routines become worthwhile varies (these are the
+        ! cycle counts for different sizes using byte-at-a-time vs. optimised):
+        !       size    byte-at-time    long    word    byte
+        !       16      42              39-40   46-50   50-55
+        !       24      58              43-44   54-58   62-67
+        !       36      82              49-50   66-70   80-85
+        ! However the penalty for getting it 'wrong' is much higher for long word
+        ! aligned data (and this is more common), so use a value of 16.
+        ! T = (16 > n), i.e. the copy is small.  r5 is biased by -1 so that
+        ! @(r0,r5) reads the source byte that the following @-r0 store writes.
+        cmp/gt  r6,r1           !  56 MT
+        add     #-1,r5          !  50 EX
+        bf/s    6f              ! 108 BR                (not small)
+         mov    r5, r3          !   5 MT (latency=0)
+        ! Small copy: r6 = number of byte pairs, T = low bit of n (odd length)
+        shlr    r6              ! 104 EX
+        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
+        bf/s    4f              ! 111 BR
+         add    #-1,r3          !  50 EX
+        ! Odd length: store the single byte first; done if no pairs remain
+        tst     r6, r6          !  86 MT
+        bt/s    98f             ! 110 BR
+         mov.b  r1,@-r0         !  29 LS
+        ! 4 cycles, 2 bytes per iteration
+3:      mov.b   @(r0,r5),r1     !  20 LS (latency=2)
+4:      mov.b   @(r0,r3),r2     !  20 LS (latency=2)
+        dt      r6              !  67 EX
+        mov.b   r1,@-r0         !  29 LS
+        bf/s    3b              ! 111 BR
+         mov.b  r2,@-r0         !  29 LS
+98:
+        rts
+         nop
+        ! Zero length: return dst without touching memory
+99:     rts
+         mov    r4, r0
+        ! Size is not small, so it's worthwhile looking for optimisations.
+        ! First align destination to a long word boundary.
+        !
+        ! r5 = normal value -1
+        ! r3 = r0 & 3 is the number of bytes to copy before r0 is aligned;
+        ! r6 is reduced by one for every byte copied here.
+6:      tst     #3, r0          !  87 MT
+        mov     #3, r3          !   6 EX
+        bt/s    2f              ! 111 BR
+         and    r0,r3           !  78 EX
+        ! 3 cycles, 1 byte per iteration
+1:      dt      r3              !  67 EX
+        mov.b   @(r0,r5),r1     !  19 LS (latency=2)
+        add     #-1, r6         !  79 EX
+        bf/s    1b              ! 109 BR
+         mov.b  r1,@-r0         !  28 LS
+        ! Undo the -1 bias: r5 is src - dst again
+2:      add     #1, r5          !  79 EX
+        ! Now select the appropriate bulk transfer code based on relative
+        ! alignment of src and dst.
+        ! r0 is parked in r3 while the low bits of r5 are tested through r0;
+        ! it is restored in the delay slot of each dispatching branch.
+        mov     r0, r3          !   5 MT (latency=0)
+        mov     r5, r0          !   5 MT (latency=0)
+        tst     #1, r0          !  87 MT
+        bf/s    1f              ! 111 BR
+         mov    #64, r7         !   6 EX
+        ! bit 0 clear
+        ! The branch below acts on r6 >= 64; its delay slot then sets T from
+        ! bit 1 of r5, which the next branch on either path consumes.
+        cmp/ge  r7, r6          !  55 MT
+        bt/s    2f              ! 111 BR
+         tst    #2, r0          !  87 MT
+        ! small
+        bt/s    .Lcase0
+         mov    r3, r0
+        bra     .Lcase2
+         nop
+        ! big
+2:      bt/s    .Lcase0b
+         mov    r3, r0
+        bra     .Lcase2b
+         nop
+        ! bit 0 set
+1:      tst     #2, r0          ! 87 MT
+        bt/s    .Lcase1
+         mov    r3, r0
+        bra     .Lcase3
+         nop
+        !
+        !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+        !
+        ! src, dst and size are all long word aligned
+        ! size is non-zero
+        !
+        ! On entry (from the top of memcpy): r0 = dst + n, r4 = dst,
+        ! r5 = src - dst, r6 = n.
+        .balign 32
+.Lcase00:
+        mov     #64, r1         !   6 EX
+        mov     r5, r3          !   5 MT (latency=0)
+        cmp/gt  r6, r1          !  56 MT
+        add     #-4, r5         !  50 EX
+        bf      .Lcase00b       ! 108 BR                (big loop)
+        ! Fewer than 64 bytes: r6 becomes the number of long word pairs, and
+        ! the second shift leaves T set when an odd long word must go first.
+        shlr2   r6              ! 105 EX
+        shlr    r6              ! 104 EX
+        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+        bf/s    4f              ! 111 BR
+         add    #-8, r3         !  50 EX
+        ! Odd long word: store it, and finish if no pairs remain
+        tst     r6, r6          !  86 MT
+        bt/s    5f              ! 110 BR
+         mov.l  r1,@-r0         !  30 LS
+        ! 4 cycles, 2 long words per iteration
+3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
+        dt      r6              !  67 EX
+        mov.l   r1, @-r0        !  30 LS
+        bf/s    3b              ! 109 BR
+         mov.l  r2, @-r0        !  30 LS
+5:      rts
+         nop
+        ! Size is 16 or greater and less than 64, but may have trailing bytes
+        !
+        ! Reached from the memcpy dispatch when bits 0 and 1 of src - dst are
+        ! clear and r6 < 64.  r0 = destination end (long word aligned),
+        ! r4 = destination start, r5 = src - dst, r6 = bytes remaining.
+        .balign 32
+.Lcase0:
+        add     #-4, r5         !  50 EX
+        mov     r4, r7          !   5 MT (latency=0)
+        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+        mov     #4, r2          !   6 EX
+        ! r7 = r4 + 11 bounds the pair loop; bit 2 of r6 set means an odd
+        ! number of long words, so one is stored before entering the loop.
+        add     #11, r7         !  50 EX
+        tst     r2, r6          !  86 MT
+        mov     r5, r3          !   5 MT (latency=0)
+        bt/s    4f              ! 111 BR
+         add    #-4, r3         !  50 EX
+        mov.l   r1,@-r0         !  30 LS
+        ! 4 cycles, 2 long words per iteration
+3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
+        cmp/hi  r7, r0
+        mov.l   r1, @-r0        !  30 LS
+        bt/s    3b              ! 109 BR
+         mov.l  r2, @-r0        !  30 LS
+        ! Copy the final 0-3 bytes
+        ! r5 becomes (src - dst) - 1 and r7 becomes r4 + 1 for the byte loop
+        add     #3,r5           !  50 EX
+        cmp/eq  r0, r4          !  54 MT
+        add     #-10, r7        !  50 EX
+        bt      9f              ! 110 BR
+        ! 3 cycles, 1 byte per iteration
+1:      mov.b   @(r0,r5),r1     !  19 LS
+        cmp/hi  r7,r0           !  57 MT
+        bt/s    1b              ! 111 BR
+         mov.b  r1,@-r0         !  28 LS
+9:      rts
+         nop
+        ! Size is at least 64 bytes, so will be going round the big loop at least once.
+        !
+        !   r2 = rounded up r4
+        !   r3 = rounded down r0
+        !
+        ! .Lcase0b is entered from the memcpy dispatch with r5 = src - dst;
+        ! .Lcase00b is entered from .Lcase00, which has already subtracted 4
+        ! from r5.  Both rounds are to a 32 byte boundary (mask ~0x1f).
+        .balign 32
+.Lcase0b:
+        add     #-4, r5         !  50 EX
+.Lcase00b:
+        mov     r0, r3          !   5 MT (latency=0)
+        mov     #(~0x1f), r1    !   6 EX
+        and     r1, r3          !  78 EX
+        mov     r4, r2          !   5 MT (latency=0)
+        cmp/eq  r3, r0          !  54 MT
+        add     #0x1f, r2       !  50 EX
+        bt/s    1f              ! 110 BR
+         and    r1, r2          !  78 EX
+        ! copy initial words until cache line aligned
+        ! Bit 2 of r0 clear means an even number of long words to go, so the
+        ! pair loop is entered at 4:.  Otherwise one long word is stored first,
+        ! and if that alone reaches the boundary the pair loop is skipped.
+        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+        tst     #4, r0          !  87 MT
+        mov     r5, r6          !   5 MT (latency=0)
+        add     #-4, r6         !  50 EX
+        bt/s    4f              ! 111 BR
+         add    #8, r3          !  50 EX
+        tst     #0x18, r0       !  87 MT
+        bt/s    1f              ! 109 BR
+         mov.l  r1,@-r0         !  30 LS
+        ! 4 cycles, 2 long words per iteration
+3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
+4:      mov.l   @(r0, r6), r7   !  21 LS (latency=2)
+        cmp/eq  r3, r0          !  54 MT
+        mov.l   r1, @-r0        !  30 LS
+        bf/s    3b              ! 109 BR
+         mov.l  r7, @-r0        !  30 LS
+        ! Copy the cache line aligned blocks
+        !
+        ! In use: r0, r2, r4, r5
+        ! Scratch: r1, r3, r6, r7
+        !
+        ! We could do this with the four scratch registers, but if src
+        ! and dest hit the same cache line, this will thrash, so make
+        ! use of additional registers.
+        !
+        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+        !   r5:  src (was r0+r5)
+        !   r1:  dest (was r0)
+        ! this can be reversed at the end, so we don't need to save any extra
+        ! state.
+        !
+        ! r8-r11 are saved on the stack here and restored after the loop.
+1:      mov.l   r8, @-r15       !  30 LS
+        add     r0, r5          !  49 EX
+        mov.l   r9, @-r15       !  30 LS
+        mov     r0, r1          !   5 MT (latency=0)
+        mov.l   r10, @-r15      !  30 LS
+        add     #-0x1c, r5      !  50 EX
+        mov.l   r11, @-r15      !  30 LS
+        ! 16 cycles, 32 bytes per iteration
+2:      mov.l   @(0x00,r5),r0   ! 18 LS (latency=2)
+        add     #-0x20, r1      ! 50 EX
+        mov.l   @(0x04,r5),r3   ! 18 LS (latency=2)
+        mov.l   @(0x08,r5),r6   ! 18 LS (latency=2)
+        mov.l   @(0x0c,r5),r7   ! 18 LS (latency=2)
+        mov.l   @(0x10,r5),r8   ! 18 LS (latency=2)
+        mov.l   @(0x14,r5),r9   ! 18 LS (latency=2)
+        mov.l   @(0x18,r5),r10  ! 18 LS (latency=2)
+        mov.l   @(0x1c,r5),r11  ! 18 LS (latency=2)
+        movca.l r0,@r1          ! 40 LS (latency=3-7)
+        mov.l   r3,@(0x04,r1)   ! 33 LS
+        mov.l   r6,@(0x08,r1)   ! 33 LS
+        mov.l   r7,@(0x0c,r1)   ! 33 LS
+        mov.l   r8,@(0x10,r1)   ! 33 LS
+        add     #-0x20, r5      ! 50 EX
+        mov.l   r9,@(0x14,r1)   ! 33 LS
+        cmp/eq  r2,r1           ! 54 MT
+        mov.l   r10,@(0x18,r1)  !  33 LS
+        bf/s    2b              ! 109 BR
+         mov.l  r11,@(0x1c,r1)  !  33 LS
+        ! Restore the invariant (r0 = dest, r5 = src - dest) and r8-r11.
+        ! The restore of r8 sits in the rts delay slot and is also the
+        ! target of the bf/s, so it executes on both paths.
+        mov     r1, r0          !   5 MT (latency=0)
+        mov.l   @r15+, r11      !  15 LS
+        sub     r1, r5          !  75 EX
+        mov.l   @r15+, r10      !  15 LS
+        cmp/eq  r4, r0          !  54 MT
+        bf/s    1f              ! 109 BR
+         mov.l   @r15+, r9      !  15 LS
+        rts
+1:       mov.l  @r15+, r8       !  15 LS
+        sub     r4, r1          !  75 EX                (len remaining)
+        ! number of trailing bytes is non-zero
+        !
+        ! invariants restored (r5 already decremented by 4)
+        ! also r1=num bytes remaining
+        mov     #4, r2          !   6 EX
+        mov     r4, r7          !   5 MT (latency=0)
+        add     #0x1c, r5       !  50 EX                (back to -4)
+        cmp/hs  r2, r1          !  58 MT
+        bf/s    5f              ! 108 BR
+         add     #11, r7        !  50 EX
+        mov.l   @(r0, r5), r6   !  21 LS (latency=2)
+        tst     r2, r1          !  86 MT
+        mov     r5, r3          !   5 MT (latency=0)
+        bt/s    4f              ! 111 BR
+         add    #-4, r3         !  50 EX
+        ! NOTE(review): r1 >= 4 was already established by the bf/s above and
+        ! neither r1 nor r2 has changed, so this compare is always true: with
+        ! an odd number of long words left, one is stored and the rest is
+        ! finished a byte at a time at 5:.  Correct, but slower than the
+        ! pair loop - confirm whether a compare against 8 was intended.
+        cmp/hs  r2, r1          !  58 MT
+        bt/s    5f              ! 111 BR
+         mov.l  r6,@-r0         !  30 LS
+        ! 4 cycles, 2 long words per iteration
+3:      mov.l   @(r0, r5), r6   !  21 LS (latency=2)
+4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
+        cmp/hi  r7, r0
+        mov.l   r6, @-r0        !  30 LS
+        bt/s    3b              ! 109 BR
+         mov.l  r2, @-r0        !  30 LS
+        ! Copy the final 0-3 bytes
+        ! r7 becomes r4 + 1 and r5 becomes (src - dst) - 1 for the byte loop
+5:      cmp/eq  r0, r4          !  54 MT
+        add     #-10, r7        !  50 EX
+        bt      9f              ! 110 BR
+        add     #3,r5           !  50 EX
+        ! 3 cycles, 1 byte per iteration
+1:      mov.b   @(r0,r5),r1     !  19 LS
+        cmp/hi  r7,r0           !  57 MT
+        bt/s    1b              ! 111 BR
+         mov.b  r1,@-r0         !  28 LS
+9:      rts
+         nop
+        !
+        !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+        !
+        ! Reached from the memcpy dispatch when bit 0 of src - dst is clear,
+        ! bit 1 is set and r6 < 64.  r0 = destination end (long word
+        ! aligned), r4 = destination start, r5 = src - dst.
+        .balign 32
+.Lcase2:
+        ! Size is 16 or greater and less than 64, but may have trailing bytes
+        ! r5 and r6 become (src - dst) - 2 and (src - dst) - 4, so each
+        ! iteration moves two short words; r2 = r4 + 7 bounds the loop.
+2:      mov     r5, r6          !   5 MT (latency=0)
+        add     #-2,r5          !  50 EX
+        mov     r4,r2           !   5 MT (latency=0)
+        add     #-4,r6          !  50 EX
+        add     #7,r2           !  50 EX
+3:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
+        mov.w   @(r0,r6),r3     !  20 LS (latency=2)
+        cmp/hi  r2,r0           !  57 MT
+        mov.w   r1,@-r0         !  29 LS
+        bt/s    3b              ! 111 BR
+         mov.w  r3,@-r0         !  29 LS
+        ! The short word / byte tail at 10: expects r5 = (src - dst) - 2
+        bra     10f
+         nop
+        .balign 32
+.Lcase2b:
+        ! Size is at least 64 bytes, so will be going round the big loop at least once.
+        !
+        !   r2 = rounded up r4
+        !   r3 = rounded down r0
+        !
+        ! Reached from the memcpy dispatch when bit 0 of src - dst is clear
+        ! and bit 1 is set.  r0 = destination end (long word aligned),
+        ! r4 = destination start, r5 = src - dst.  Both rounds are to a
+        ! 32 byte boundary (mask ~0x1f).
+        mov     r0, r3          !   5 MT (latency=0)
+        mov     #(~0x1f), r1    !   6 EX
+        and     r1, r3          !  78 EX
+        mov     r4, r2          !   5 MT (latency=0)
+        cmp/eq  r3, r0          !  54 MT
+        add     #0x1f, r2       !  50 EX
+        add     #-2, r5         !  50 EX
+        bt/s    1f              ! 110 BR
+         and    r1, r2          !  78 EX
+        ! Copy a short word one at a time until we are cache line aligned
+        !   Normal values: r0, r2, r3, r4
+        !   Unused: r1, r6, r7
+        !   Mod: r5 (=r5-2)
+        !
+        add     #2, r3          !  50 EX
+2:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
+        cmp/eq  r3,r0           !  54 MT
+        bf/s    2b              ! 111 BR
+         mov.w  r1,@-r0         !  29 LS
+        ! Copy the cache line aligned blocks
+        !
+        ! In use: r0, r2, r4, r5 (=r5-2)
+        ! Scratch: r1, r3, r6, r7
+        !
+        ! We could do this with the four scratch registers, but if src
+        ! and dest hit the same cache line, this will thrash, so make
+        ! use of additional registers.
+        !
+        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+        !   r5:  src (was r0+r5)
+        !   r1:  dest (was r0)
+        ! this can be reversed at the end, so we don't need to save any extra
+        ! state.
+        !
+        ! r8-r12 are saved on the stack here and restored after the loop.
+        ! r5 ends up pointing at the lowest source byte of the 32 byte block
+        ! that belongs to the destination line just below r1.
+1:      mov.l   r8, @-r15       !  30 LS
+        add     r0, r5          !  49 EX
+        mov.l   r9, @-r15       !  30 LS
+        mov     r0, r1          !   5 MT (latency=0)
+        mov.l   r10, @-r15      !  30 LS
+        add     #-0x1e, r5      !  50 EX
+        mov.l   r11, @-r15      !  30 LS
+        mov.l   r12, @-r15      !  30 LS
+        ! 17 cycles, 32 bytes per iteration
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+        ! The post-increment loads advance r5 by 0x20 in total, so the
+        ! add #-0x40 below leaves it one 32 byte block lower.
+2:      mov.w   @r5+, r0        !  14 LS (latency=2)            ..JI
+        add     #-0x20, r1      !  50 EX
+        mov.l   @r5+, r3        !  15 LS (latency=2)            NMLK
+        mov.l   @r5+, r6        !  15 LS (latency=2)            RQPO
+        shll16  r0              ! 103 EX                        JI..
+        mov.l   @r5+, r7        !  15 LS (latency=2)
+        xtrct   r3, r0          !  48 EX                        LKJI
+        mov.l   @r5+, r8        !  15 LS (latency=2)
+        xtrct   r6, r3          !  48 EX                        PONM
+        mov.l   @r5+, r9        !  15 LS (latency=2)
+        xtrct   r7, r6          !  48 EX
+        mov.l   @r5+, r10       !  15 LS (latency=2)
+        xtrct   r8, r7          !  48 EX
+        mov.l   @r5+, r11       !  15 LS (latency=2)
+        xtrct   r9, r8          !  48 EX
+        mov.w   @r5+, r12       !  15 LS (latency=2)
+        xtrct   r10, r9         !  48 EX
+        movca.l r0,@r1          !  40 LS (latency=3-7)
+        xtrct   r11, r10        !  48 EX
+        mov.l   r3, @(0x04,r1)  !  33 LS
+        xtrct   r12, r11        !  48 EX
+        mov.l   r6, @(0x08,r1)  !  33 LS
+        mov.l   r7, @(0x0c,r1)  !  33 LS
+        mov.l   r8, @(0x10,r1)  !  33 LS
+        add     #-0x40, r5      !  50 EX
+        mov.l   r9, @(0x14,r1)  !  33 LS
+        cmp/eq  r2,r1           !  54 MT
+        mov.l   r10, @(0x18,r1) !  33 LS
+        bf/s    2b              ! 109 BR
+         mov.l  r11, @(0x1c,r1) !  33 LS
+#else
+        ! Big endian: the block is assembled from its top long word down.
+        ! After the first short word load r5 is biased by -2, which makes
+        ! every mov.l below long word aligned.  The merged registers map to
+        ! the destination line as r0 -> 0x1c, r3 -> 0x18, ... r11 -> 0x00.
+2:      mov.w   @(0x1e,r5), r0  !  17 LS (latency=2)
+        add     #-2, r5         !  50 EX
+        mov.l   @(0x1c,r5), r3  !  18 LS (latency=2)
+        add     #-4, r1         !  50 EX
+        mov.l   @(0x18,r5), r6  !  18 LS (latency=2)
+        shll16  r0              ! 103 EX
+        mov.l   @(0x14,r5), r7  !  18 LS (latency=2)
+        xtrct   r3, r0          !  48 EX
+        mov.l   @(0x10,r5), r8  !  18 LS (latency=2)
+        xtrct   r6, r3          !  48 EX
+        mov.l   @(0x0c,r5), r9  !  18 LS (latency=2)
+        xtrct   r7, r6          !  48 EX
+        mov.l   @(0x08,r5), r10 !  18 LS (latency=2)
+        xtrct   r8, r7          !  48 EX
+        mov.l   @(0x04,r5), r11 !  18 LS (latency=2)
+        xtrct   r9, r8          !  48 EX
+        ! Only the low 16 bits of r12 feed the final xtrct, so an aligned
+        ! long word load is used: the displacement form of mov.w can only
+        ! load into r0.  This reads the two bytes just below the block,
+        ! which share an aligned long word with the block's first two bytes.
+        mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
+        xtrct   r10, r9         !  48 EX
+        ! r0 goes to the top long word of the line (offset 0x1c); r1 then
+        ! drops to the line base, so the remaining stores use 0x18 .. 0x00.
+        movca.l r0,@r1          !  40 LS (latency=3-7)
+        add     #-0x1c, r1      !  50 EX
+        mov.l   r3, @(0x18,r1)  !  33 LS
+        xtrct   r11, r10        !  48 EX
+        mov.l   r6, @(0x14,r1)  !  33 LS
+        xtrct   r12, r11        !  48 EX
+        mov.l   r7, @(0x10,r1)  !  33 LS
+        mov.l   r8, @(0x0c,r1)  !  33 LS
+        ! -2 above plus -0x1e here moves the source down exactly one 32
+        ! byte block per iteration, in step with r1.
+        add     #-0x1e, r5      !  50 EX
+        mov.l   r9, @(0x08,r1)  !  33 LS
+        cmp/eq  r2,r1           !  54 MT
+        mov.l   r10, @(0x04,r1) !  33 LS
+        bf/s    2b              ! 109 BR
+         mov.l  r11, @(0x00,r1) !  33 LS
+#endif
+        ! Restore the invariant (r0 = dest, r5 = src - dest) and r8-r12.
+        ! The restore of r8 sits in the rts delay slot and is also the
+        ! target of the bf/s, so it executes on both paths.
+        mov.l   @r15+, r12
+        mov     r1, r0          !   5 MT (latency=0)
+        mov.l   @r15+, r11      !  15 LS
+        sub     r1, r5          !  75 EX
+        mov.l   @r15+, r10      !  15 LS
+        cmp/eq  r4, r0          !  54 MT
+        bf/s    1f              ! 109 BR
+         mov.l   @r15+, r9      !  15 LS
+        rts
+1:       mov.l  @r15+, r8       !  15 LS
+        add     #0x1e, r5       !  50 EX
+        ! Finish off a short word at a time
+        ! r5 must be invariant - 2
+        ! (.Lcase2 also branches here once its loop is done)
+10:     mov     r4,r2           !   5 MT (latency=0)
+        add     #1,r2           !  50 EX
+        cmp/hi  r2, r0          !  57 MT
+        bf/s    1f              ! 109 BR
+         add    #2, r2          !  50 EX
+3:      mov.w   @(r0,r5),r1     !  20 LS
+        cmp/hi  r2,r0           !  57 MT
+        bt/s    3b              ! 109 BR
+         mov.w  r1,@-r0         !  29 LS
+1:
+        !
+        ! Finally, copy the last byte if necessary
+        ! (9b is the nearest preceding "9: rts / nop" pair)
+        cmp/eq  r4,r0           !  54 MT
+        bt/s    9b
+         add    #1,r5
+        mov.b   @(r0,r5),r1
+        rts
+         mov.b  r1,@-r0

diff --git a/arch/sh/lib/memcpy-sh4.S b/arch/sh/lib/memcpy-sh4.S new file mode 100644 index 000000000000..55f227441f9e --- /dev/null +++ b/arch/sh/lib/memcpy-sh4.S
@@ -0,0 +1,800 @@
	1	/*
	2	* "memcpy" implementation of SuperH
	3	*
	4	* Copyright (C) 1999 Niibe Yutaka
	5	* Copyright (c) 2002 STMicroelectronics Ltd
	6	* Modified from memcpy.S and micro-optimised for SH4
	7	* Stuart Menefy (stuart.menefy@st.com)
	8	*
	9	*/
	10	#include <linux/linkage.h>
	11	#include <linux/config.h>
	12
	13	/*
	14	* void *memcpy(void *dst, const void *src, size_t n);
	15	*
	16	* It is assumed that there is no overlap between src and dst.
	17	* If there is an overlap, then the results are undefined.
	18	*/
	19
	20	!
	21	! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
	22	!
	23	! Source is one byte above a long word boundary relative to the destination.
	24	! Size is 16 or greater, and may have trailing bytes
	25	! Copies downwards: r0 = destination end, r4 = destination start, r5 = src - dst.
	26	.balign 32
	27	.Lcase1:
	28	! Read a long word and write a long word at once
	29	! At the start of each iteration, r7 contains last long load
	30	add #-1,r5 ! 79 EX
	31	mov r4,r2 ! 5 MT (0 cycles latency)
	32
	33	mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
	34	add #-4,r5 ! 50 EX
	35	! r2 = r4 + 7 is the bound for the long word loop (runs while r0 > r2)
	36	add #7,r2 ! 79 EX
	37	!
	38	#ifdef CONFIG_CPU_LITTLE_ENDIAN
	39	! 6 cycles, 4 bytes per iteration
	40	3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
	41	mov r7, r3 ! 5 MT (latency=0) ! RQPO
	42
	43	cmp/hi r2,r0 ! 57 MT
	44	shll16 r3 ! 103 EX
	45
	46	mov r1,r6 ! 5 MT (latency=0)
	47	shll8 r3 ! 102 EX ! Oxxx
	48
	49	shlr8 r6 ! 106 EX ! xNML
	50	mov r1, r7 ! 5 MT (latency=0)
	51
	52	or r6,r3 ! 82 EX ! ONML
	53	bt/s 3b ! 109 BR
	54
	55	mov.l r3,@-r0 ! 30 LS
	56	#else
	57	3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
	58	mov r7,r3 ! 5 MT (latency=0) ! OPQR
	59
	60	cmp/hi r2,r0 ! 57 MT
	61	shlr16 r3 ! 107 EX
	62
	63	shlr8 r3 ! 106 EX ! xxxO
	64	mov r1,r6 ! 5 MT (latency=0)
	65
	66	shll8 r6 ! 102 EX ! LMNx
	67	mov r1,r7 ! 5 MT (latency=0)
	68
	69	or r6,r3 ! 82 EX ! LMNO
	70	bt/s 3b ! 109 BR
	71
	72	mov.l r3,@-r0 ! 30 LS
	73	#endif
	74	! Finally, copy a byte at once, if necessary
	75	! r5 becomes (src - dst) - 1 and r2 becomes r4 + 1 for the byte loop
	76	add #4,r5 ! 50 EX
	77	cmp/eq r4,r0 ! 54 MT
	78
	79	add #-6,r2 ! 50 EX
	80	bt 9f ! 109 BR
	81
	82	8: cmp/hi r2,r0 ! 57 MT
	83	mov.b @(r0,r5),r1 ! 20 LS (latency=2)
	84
	85	bt/s 8b ! 109 BR
	86
	87	mov.b r1,@-r0 ! 29 LS
	88
	89	9: rts
	90	nop
	91
	92
	93	!
	94	! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
	95	!
	96	! Source is three bytes above a long word boundary relative to the destination.
	97	! Size is 16 or greater, and may have trailing bytes
	98	! Copies downwards: r0 = destination end, r4 = destination start, r5 = src - dst.
	99	.balign 32
	100	.Lcase3:
	101	! Read a long word and write a long word at once
	102	! At the start of each iteration, r7 contains last long load
	103	add #-3,r5 ! 79 EX
	104	mov r4,r2 ! 5 MT (0 cycles latency)
	105
	106	mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
	107	add #-4,r5 ! 50 EX
	108	! r2 = r4 + 7 is the bound for the long word loop (runs while r0 > r2)
	109	add #7,r2 ! 79 EX
	110	!
	111	#ifdef CONFIG_CPU_LITTLE_ENDIAN
	112	! 6 cycles, 4 bytes per iteration
	113	3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
	114	mov r7, r3 ! 5 MT (latency=0) ! RQPO
	115
	116	cmp/hi r2,r0 ! 57 MT
	117	shll8 r3 ! 102 EX ! QPOx
	118
	119	mov r1,r6 ! 5 MT (latency=0)
	120	shlr16 r6 ! 107 EX
	121
	122	shlr8 r6 ! 106 EX ! xxxN
	123	mov r1, r7 ! 5 MT (latency=0)
	124
	125	or r6,r3 ! 82 EX ! QPON
	126	bt/s 3b ! 109 BR
	127
	128	mov.l r3,@-r0 ! 30 LS
	129	#else
	130	3: mov r7,r3 ! OPQR (previous long word is carried in r7, as loaded above; r1 is stale on entry)
	131	shlr8 r3 ! xOPQ
	132	mov.l @(r0,r5),r7 ! KLMN
	133	mov r7,r6
	134	shll16 r6
	135	shll8 r6 ! Nxxx
	136	or r6,r3 ! NOPQ
	137	cmp/hi r2,r0
	138	bt/s 3b
	139	mov.l r3,@-r0
	140	#endif
	141
	142	! Finally, copy a byte at once, if necessary
	143	! r5 becomes (src - dst) - 1 and r2 becomes r4 + 1 for the byte loop
	144	add #6,r5 ! 50 EX
	145	cmp/eq r4,r0 ! 54 MT
	146
	147	add #-6,r2 ! 50 EX
	148	bt 9f ! 109 BR
	149
	150	8: cmp/hi r2,r0 ! 57 MT
	151	mov.b @(r0,r5),r1 ! 20 LS (latency=2)
	152
	153	bt/s 8b ! 109 BR
	154
	155	mov.b r1,@-r0 ! 29 LS
	156
	157	9: rts
	158	nop
	159
	160	ENTRY(memcpy)
	161	! Arguments as used below: r4 = dst, r5 = src, r6 = n.
	162	! Calculate the invariants which will be used in the remainder
	163	! of the code:
	164	!
	165	! r4 --> [ ... ] DST [ ... ] SRC
	166	! [ ... ] [ ... ]
	167	! : :
	168	! r0 --> [ ... ] r0+r5 --> [ ... ]
	169	!
	170	!
	171
	172	! Short circuit the common case of src, dst and len being 32 bit aligned
	173	! and test for zero length move
	174
	175	mov r6, r0 ! 5 MT (0 cycle latency)
	176	or r4, r0 ! 82 EX
	177
	178	or r5, r0 ! 82 EX
	179	tst r6, r6 ! 86 MT
	180
	181	bt/s 99f ! 111 BR (zero len)
	182	tst #3, r0 ! 87 MT
	183	! T from the tst #3 above is still live at the .Lcase00 branch below
	184	mov r4, r0 ! 5 MT (0 cycle latency)
	185	add r6, r0 ! 49 EX
	186
	187	mov #16, r1 ! 6 EX
	188	bt/s .Lcase00 ! 111 BR (aligned)
	189
	190	sub r4, r5 ! 75 EX
	191
	192	! Arguments are not nicely long word aligned or zero len.
	193	! Check for small copies, and if so do a simple byte at a time copy.
	194	!
	195	! Deciding on an exact value of 'small' is not easy, as the point at which
	196	! using the optimised routines become worthwhile varies (these are the
	197	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	198	! size byte-at-time long word byte
	199	! 16 42 39-40 46-50 50-55
	200	! 24 58 43-44 54-58 62-67
	201	! 36 82 49-50 66-70 80-85
	202	! However the penalty for getting it 'wrong' is much higher for long word
	203	! aligned data (and this is more common), so use a value of 16.
	204	! T = (16 > n), i.e. the copy is small
	205	cmp/gt r6,r1 ! 56 MT
	206
	207	add #-1,r5 ! 50 EX
	208	bf/s 6f ! 108 BR (not small)
	209
	210	mov r5, r3 ! 5 MT (latency=0)
	211	shlr r6 ! 104 EX
	212
	213	mov.b @(r0,r5),r1 ! 20 LS (latency=2)
	214	bf/s 4f ! 111 BR
	215
	216	add #-1,r3 ! 50 EX
	217	tst r6, r6 ! 86 MT
	218
	219	bt/s 98f ! 110 BR
	220	mov.b r1,@-r0 ! 29 LS
	221
	222	! 4 cycles, 2 bytes per iteration
	223	3: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
	224
	225	4: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
	226	dt r6 ! 67 EX
	227
	228	mov.b r1,@-r0 ! 29 LS
	229	bf/s 3b ! 111 BR
	230
	231	mov.b r2,@-r0 ! 29 LS
	232	98:
	233	rts
	234	nop
	235	! Zero length: return dst without touching memory
	236	99: rts
	237	mov r4, r0
	238
	239	! Size is not small, so it's worthwhile looking for optimisations.
	240	! First align destination to a long word boundary.
	241	!
	242	! r5 = normal value -1
	243
	244	6: tst #3, r0 ! 87 MT
	245	mov #3, r3 ! 6 EX
	246
	247	bt/s 2f ! 111 BR
	248	and r0,r3 ! 78 EX
	249
	250	! 3 cycles, 1 byte per iteration
	251	1: dt r3 ! 67 EX
	252	mov.b @(r0,r5),r1 ! 19 LS (latency=2)
	253
	254	add #-1, r6 ! 79 EX
	255	bf/s 1b ! 109 BR
	256
	257	mov.b r1,@-r0 ! 28 LS
	258	! Undo the -1 bias: r5 is src - dst again
	259	2: add #1, r5 ! 79 EX
	260
	261	! Now select the appropriate bulk transfer code based on relative
	262	! alignment of src and dst.
	263	! r0 is parked in r3 while the low bits of r5 are tested through r0
	264	mov r0, r3 ! 5 MT (latency=0)
	265
	266	mov r5, r0 ! 5 MT (latency=0)
	267	tst #1, r0 ! 87 MT
	268
	269	bf/s 1f ! 111 BR
	270	mov #64, r7 ! 6 EX
	271
	272	! bit 0 clear
	273
	274	cmp/ge r7, r6 ! 55 MT
	275
	276	bt/s 2f ! 111 BR
	277	tst #2, r0 ! 87 MT
	278
	279	! small
	280	bt/s .Lcase0
	281	mov r3, r0
	282
	283	bra .Lcase2
	284	nop
	285
	286	! big
	287	2: bt/s .Lcase0b
	288	mov r3, r0
	289
	290	bra .Lcase2b
	291	nop
	292
	293	! bit 0 set
	294	1: tst #2, r0 ! 87 MT
	295
	296	bt/s .Lcase1
	297	mov r3, r0
	298
	299	bra .Lcase3
	300	nop
	301
	302
	303	!
	304	! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
	305	!
	306
	307	! src, dst and size are all long word aligned
	308	! size is non-zero
	309	! On entry r0 = dst + n, r4 = dst, r5 = src - dst, r6 = n.
	310	.balign 32
	311	.Lcase00:
	312	mov #64, r1 ! 6 EX
	313	mov r5, r3 ! 5 MT (latency=0)
	314
	315	cmp/gt r6, r1 ! 56 MT
	316	add #-4, r5 ! 50 EX
	317
	318	bf .Lcase00b ! 108 BR (big loop)
	319	shlr2 r6 ! 105 EX
	320	! r6 becomes the number of long word pairs; T is set for an odd long word
	321	shlr r6 ! 104 EX
	322	mov.l @(r0, r5), r1 ! 21 LS (latency=2)
	323
	324	bf/s 4f ! 111 BR
	325	add #-8, r3 ! 50 EX
	326	! Odd long word: store it, and finish if no pairs remain
	327	tst r6, r6 ! 86 MT
	328	bt/s 5f ! 110 BR
	329
	330	mov.l r1,@-r0 ! 30 LS
	331
	332	! 4 cycles, 2 long words per iteration
	333	3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
	334
	335	4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
	336	dt r6 ! 67 EX
	337
	338	mov.l r1, @-r0 ! 30 LS
	339	bf/s 3b ! 109 BR
	340
	341	mov.l r2, @-r0 ! 30 LS
	342
	343	5: rts
	344	nop
	345
	346
	347	! Size is 16 or greater and less than 64, but may have trailing bytes
	348	! On entry r0 = destination end, r4 = destination start, r5 = src - dst, r6 = bytes left.
	349	.balign 32
	350	.Lcase0:
	351	add #-4, r5 ! 50 EX
	352	mov r4, r7 ! 5 MT (latency=0)
	353
	354	mov.l @(r0, r5), r1 ! 21 LS (latency=2)
	355	mov #4, r2 ! 6 EX
	356	! r7 = r4 + 11 bounds the pair loop; bit 2 of r6 set means an odd long word goes first
	357	add #11, r7 ! 50 EX
	358	tst r2, r6 ! 86 MT
	359
	360	mov r5, r3 ! 5 MT (latency=0)
	361	bt/s 4f ! 111 BR
	362
	363	add #-4, r3 ! 50 EX
	364	mov.l r1,@-r0 ! 30 LS
	365
	366	! 4 cycles, 2 long words per iteration
	367	3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
	368
	369	4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
	370	cmp/hi r7, r0
	371
	372	mov.l r1, @-r0 ! 30 LS
	373	bt/s 3b ! 109 BR
	374
	375	mov.l r2, @-r0 ! 30 LS
	376
	377	! Copy the final 0-3 bytes
	378	! r5 becomes (src - dst) - 1 and r7 becomes r4 + 1 for the byte loop
	379	add #3,r5 ! 50 EX
	380
	381	cmp/eq r0, r4 ! 54 MT
	382	add #-10, r7 ! 50 EX
	383
	384	bt 9f ! 110 BR
	385
	386	! 3 cycles, 1 byte per iteration
	387	1: mov.b @(r0,r5),r1 ! 19 LS
	388	cmp/hi r7,r0 ! 57 MT
	389
	390	bt/s 1b ! 111 BR
	391	mov.b r1,@-r0 ! 28 LS
	392
	393	9: rts
	394	nop
	395
	396	! Size is at least 64 bytes, so will be going round the big loop at least once.
	397	!
	398	! r2 = rounded up r4
	399	! r3 = rounded down r0
	400	! (both to a multiple of 32, using the ~0x1f mask below)
	401	.balign 32
	402	.Lcase0b:
	403	add #-4, r5 ! 50 EX
	404	! .Lcase00 branches to the label below with r5 already reduced by 4.
	405	.Lcase00b:
	406	mov r0, r3 ! 5 MT (latency=0)
	407	mov #(~0x1f), r1 ! 6 EX
	408
	409	and r1, r3 ! 78 EX
	410	mov r4, r2 ! 5 MT (latency=0)
	411
	412	cmp/eq r3, r0 ! 54 MT - T = 1 when r0 is already a multiple of 32
	413	add #0x1f, r2 ! 50 EX
	414
	415	bt/s 1f ! 110 BR - already aligned: straight to the cache line loop
	416	and r1, r2 ! 78 EX (delay slot)
	417
	418	! copy initial words until cache line aligned
	419
	420	mov.l @(r0, r5), r1 ! 21 LS (latency=2)
	421	tst #4, r0 ! 87 MT - T = 1 when bit 2 of r0 is clear
	422
	423	mov r5, r6 ! 5 MT (latency=0)
	424	add #-4, r6 ! 50 EX - r6 = r5 - 4, index of the second long of each pair
	425
	426	bt/s 4f ! 111 BR - bit 2 clear: enter the pair loop at its second load
	427	add #8, r3 ! 50 EX (delay slot) r3 = rounded-down r0 + 8, the value r0 has entering the last pair
	428
	429	tst #0x18, r0 ! 87 MT - T = 1 when bits 3 and 4 of r0 are clear
	430
	431	bt/s 1f ! 109 BR - the single long stored below completes the alignment
	432	mov.l r1,@-r0 ! 30 LS (delay slot) bit 2 set: store one long on its own
	433
	434	! 4 cycles, 2 long words per iteration
	435	3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
	436
	437	4: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
	438	cmp/eq r3, r0 ! 54 MT - sampled before the two stores
	439
	440	mov.l r1, @-r0 ! 30 LS
	441	bf/s 3b ! 109 BR
	442
	443	mov.l r7, @-r0 ! 30 LS (delay slot)
	444
	445	! Copy the cache line aligned blocks
	446	!
	447	! In use: r0, r2, r4, r5
	448	! Scratch: r1, r3, r6, r7
	449	!
	450	! We could do this with the four scratch registers, but if src
	451	! and dest hit the same cache line, this will thrash, so make
	452	! use of additional registers.
	453	!
	454	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	455	! r5: src (was r0+r5)
	456	! r1: dest (was r0)
	457	! this can be reversed at the end, so we don't need to save any extra
	458	! state.
	459	!
	460	1: mov.l r8, @-r15 ! 30 LS - r8-r11 are pushed here and popped after the loop
	461	add r0, r5 ! 49 EX - r5 becomes an absolute address
	462
	463	mov.l r9, @-r15 ! 30 LS
	464	mov r0, r1 ! 5 MT (latency=0)
	465
	466	mov.l r10, @-r15 ! 30 LS
	467	add #-0x1c, r5 ! 50 EX - together with the -4 already in r5: 0x20 below the copy position
	468
	469	mov.l r11, @-r15 ! 30 LS
	470	! Each pass loads eight longs from r5 and stores them to the 32 bytes below r1, until r1 == r2.
	471	! 16 cycles, 32 bytes per iteration
	472	2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
	473	add #-0x20, r1 ! 50 EX
	474	mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
	475	mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
	476	mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
	477	mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
	478	mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
	479	mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
	480	mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
	481	movca.l r0,@r1 ! 40 LS (latency=3-7) - first store to the line; movca.l takes its data from r0
	482	mov.l r3,@(0x04,r1) ! 33 LS
	483	mov.l r6,@(0x08,r1) ! 33 LS
	484	mov.l r7,@(0x0c,r1) ! 33 LS
	485
	486	mov.l r8,@(0x10,r1) ! 33 LS
	487	add #-0x20, r5 ! 50 EX - source moves down one line
	488
	489	mov.l r9,@(0x14,r1) ! 33 LS
	490	cmp/eq r2,r1 ! 54 MT - T = 1 when r1 has reached rounded-up r4
	491
	492	mov.l r10,@(0x18,r1) ! 33 LS
	493	bf/s 2b ! 109 BR
	494
	495	mov.l r11,@(0x1c,r1) ! 33 LS (delay slot)
	496
	497	mov r1, r0 ! 5 MT (latency=0) - r0 is the copy position again
	498
	499	mov.l @r15+, r11 ! 15 LS
	500	sub r1, r5 ! 75 EX - r5 back to an index relative to r0
	501
	502	mov.l @r15+, r10 ! 15 LS
	503	cmp/eq r4, r0 ! 54 MT - T = 1 when nothing is left to copy
	504
	505	bf/s 1f ! 109 BR
	506	mov.l @r15+, r9 ! 15 LS (delay slot)
	507	! The r8 restore is both the rts delay slot and the 1f target, so it runs on either path.
	508	rts
	509	1: mov.l @r15+, r8 ! 15 LS
	510	sub r4, r1 ! 75 EX (len remaining)
	511
	512	! number of trailing bytes is non-zero
	513	!
	514	! invariants restored (r5 already decremented by 4)
	515	! also r1=num bytes remaining
	516
	517	mov #4, r2 ! 6 EX
	518	mov r4, r7 ! 5 MT (latency=0)
	519
	520	add #0x1c, r5 ! 50 EX (back to -4)
	521	cmp/hs r2, r1 ! 58 MT - T = 1 when r1 >= 4 (unsigned)
	522
	523	bf/s 5f ! 108 BR - fewer than 4 bytes left: byte loop only
	524	add #11, r7 ! 50 EX (delay slot) r7 = r4 + 11, bound for the pair loop
	525
	526	mov.l @(r0, r5), r6 ! 21 LS (latency=2)
	527	tst r2, r1 ! 86 MT - T = 1 when bit 2 of r1 is clear
	528
	529	mov r5, r3 ! 5 MT (latency=0)
	530	bt/s 4f ! 111 BR - bit 2 clear: enter the pair loop at its second load
	531
	532	add #-4, r3 ! 50 EX (delay slot) r3 = r5 - 4
	533	cmp/hs r2, r1 ! 58 MT
	534	! NOTE(review): r1 >= 4 already held at the bf/s 5f above and r1/r2 are unchanged since, so this bt/s looks always taken (one long, then bytes) - confirm intent.
	535	bt/s 5f ! 111 BR
	536	mov.l r6,@-r0 ! 30 LS (delay slot)
	537
	538	! 4 cycles, 2 long words per iteration
	539	3: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
	540
	541	4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
	542	cmp/hi r7, r0 ! T = 1 while r0 > r4 + 11 (unsigned), sampled before the two stores
	543
	544	mov.l r6, @-r0 ! 30 LS
	545	bt/s 3b ! 109 BR
	546
	547	mov.l r2, @-r0 ! 30 LS (delay slot)
	548
	549	! Copy the final 0-3 bytes
	550
	551	5: cmp/eq r0, r4 ! 54 MT - T = 1 when r0 == r4, i.e. nothing left
	552	add #-10, r7 ! 50 EX - r7 = r4 + 1, bound for the byte loop
	553
	554	bt 9f ! 110 BR
	555	add #3,r5 ! 50 EX - index bias goes from -4 (long) to -1 (byte)
	556
	557	! 3 cycles, 1 byte per iteration
	558	1: mov.b @(r0,r5),r1 ! 19 LS
	559	cmp/hi r7,r0 ! 57 MT - T = 1 while r0 > r4 + 1, sampled before the store
	560
	561	bt/s 1b ! 111 BR
	562	mov.b r1,@-r0 ! 28 LS (delay slot)
	563
	564	9: rts
	565	nop
	566
	567	!
	568	! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
	569	!
	570
	571	.balign 32
	572	.Lcase2:
	573	! Size is 16 or greater and less than 64, but may have trailing bytes
	574	! Copies two short words per pass, then branches to label 10 to finish.
	575	2: mov r5, r6 ! 5 MT (latency=0)
	576	add #-2,r5 ! 50 EX - index of the first short word of each pair
	577
	578	mov r4,r2 ! 5 MT (latency=0)
	579	add #-4,r6 ! 50 EX - index of the second short word of each pair
	580
	581	add #7,r2 ! 50 EX - r2 = r4 + 7, loop bound
	582	3: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
	583
	584	mov.w @(r0,r6),r3 ! 20 LS (latency=2)
	585	cmp/hi r2,r0 ! 57 MT - T = 1 while r0 > r4 + 7 (unsigned), sampled before the two stores
	586
	587	mov.w r1,@-r0 ! 29 LS
	588	bt/s 3b ! 111 BR
	589
	590	mov.w r3,@-r0 ! 29 LS (delay slot)
	591
	592	bra 10f
	593	nop
	594
	595
	596	.balign 32
	597	.Lcase2b:
	598	! Size is at least 64 bytes, so will be going round the big loop at least once.
	599	!
	600	! r2 = rounded up r4
	601	! r3 = rounded down r0
	602	! (both to a multiple of 32, using the ~0x1f mask below)
	603	mov r0, r3 ! 5 MT (latency=0)
	604	mov #(~0x1f), r1 ! 6 EX
	605
	606	and r1, r3 ! 78 EX
	607	mov r4, r2 ! 5 MT (latency=0)
	608
	609	cmp/eq r3, r0 ! 54 MT - T = 1 when r0 is already a multiple of 32
	610	add #0x1f, r2 ! 50 EX
	611
	612	add #-2, r5 ! 50 EX
	613	bt/s 1f ! 110 BR - already aligned: straight to the cache line loop
	614	and r1, r2 ! 78 EX (delay slot)
	615
	616	! Copy a short word one at a time until we are cache line aligned
	617	! Normal values: r0, r2, r3, r4
	618	! Unused: r1, r6, r7
	619	! Mod: r5 (=r5-2)
	620	!
	621	add #2, r3 ! 50 EX - r3 = rounded-down r0 + 2, the value r0 has entering the last pass
	622
	623	2: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
	624	cmp/eq r3,r0 ! 54 MT - sampled before the store
	625
	626	bf/s 2b ! 111 BR
	627
	628	mov.w r1,@-r0 ! 29 LS (delay slot)
	629
	630	! Copy the cache line aligned blocks
	631	!
	632	! In use: r0, r2, r4, r5 (=r5-2)
	633	! Scratch: r1, r3, r6, r7
	634	!
	635	! We could do this with the four scratch registers, but if src
	636	! and dest hit the same cache line, this will thrash, so make
	637	! use of additional registers.
	638	!
	639	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	640	! r5: src (was r0+r5)
	641	! r1: dest (was r0)
	642	! this can be reversed at the end, so we don't need to save any extra
	643	! state.
	644	!
	645	1: mov.l r8, @-r15 ! 30 LS - r8-r12 are pushed here and popped after the loop
	646	add r0, r5 ! 49 EX - r5 becomes an absolute address
	647
	648	mov.l r9, @-r15 ! 30 LS
	649	mov r0, r1 ! 5 MT (latency=0)
	650
	651	mov.l r10, @-r15 ! 30 LS
	652	add #-0x1e, r5 ! 50 EX - together with the -2 already in r5: 0x20 below the copy position
	653
	654	mov.l r11, @-r15 ! 30 LS
	655
	656	mov.l r12, @-r15 ! 30 LS
	657
	658	! 17 cycles, 32 bytes per iteration
	659	#ifdef CONFIG_CPU_LITTLE_ENDIAN
	660	2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
	661	add #-0x20, r1 ! 50 EX
	662
	663	mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
	664
	665	mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
	666	shll16 r0 ! 103 EX JI..
	667
	668	mov.l @r5+, r7 ! 15 LS (latency=2)
	669	xtrct r3, r0 ! 48 EX LKJI
	670
	671	mov.l @r5+, r8 ! 15 LS (latency=2)
	672	xtrct r6, r3 ! 48 EX PONM
	673
	674	mov.l @r5+, r9 ! 15 LS (latency=2)
	675	xtrct r7, r6 ! 48 EX
	676
	677	mov.l @r5+, r10 ! 15 LS (latency=2)
	678	xtrct r8, r7 ! 48 EX
	679
	680	mov.l @r5+, r11 ! 15 LS (latency=2)
	681	xtrct r9, r8 ! 48 EX
	682
	683	mov.w @r5+, r12 ! 15 LS (latency=2)
	684	xtrct r10, r9 ! 48 EX
	685
	686	movca.l r0,@r1 ! 40 LS (latency=3-7)
	687	xtrct r11, r10 ! 48 EX
	688
	689	mov.l r3, @(0x04,r1) ! 33 LS
	690	xtrct r12, r11 ! 48 EX
	691
	692	mov.l r6, @(0x08,r1) ! 33 LS
	693
	694	mov.l r7, @(0x0c,r1) ! 33 LS
	695
	696	mov.l r8, @(0x10,r1) ! 33 LS
	697	add #-0x40, r5 ! 50 EX - the loads advanced r5 by 0x20; net 0x20 lower for the next pass
	698
	699	mov.l r9, @(0x14,r1) ! 33 LS
	700	cmp/eq r2,r1 ! 54 MT - T = 1 when r1 has reached rounded-up r4
	701
	702	mov.l r10, @(0x18,r1) ! 33 LS
	703	bf/s 2b ! 109 BR
	704
	705	mov.l r11, @(0x1c,r1) ! 33 LS (delay slot)
	706	#else
	707	2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) - top short word of the 32 source bytes
	708	add #-2, r5 ! 50 EX - the mov.l loads below are relative to this r5
	709
	710	mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
	711	add #-4, r1 ! 50 EX - r1 = last long of the destination line (offset 0x1c)
	712
	713	mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
	714	shll16 r0 ! 103 EX
	715
	716	mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
	717	xtrct r3, r0 ! 48 EX - r0 = long for offset 0x1c
	718
	719	mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
	720	xtrct r6, r3 ! 48 EX - r3 = long for offset 0x18
	721
	722	mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
	723	xtrct r7, r6 ! 48 EX
	724
	725	mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
	726	xtrct r8, r7 ! 48 EX
	727
	728	mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
	729	xtrct r9, r8 ! 48 EX
	730	! mov.w @(disp,Rm) can only load into r0, so fetch the whole long; xtrct uses only its low 16 bits.
	731	mov.l @(0x00,r5), r12 ! 18 LS (latency=2)
	732	xtrct r10, r9 ! 48 EX
	733
	734	movca.l r0,@r1 ! 40 LS (latency=3-7) - writes offset 0x1c of the line
	735	add #-0x1c, r1 ! 50 EX - r1 = start of the line; remaining stores fill 0x18 down to 0x00
	736
	737	mov.l r3, @(0x18,r1) ! 33 LS
	738	xtrct r11, r10 ! 48 EX
	739
	740	mov.l r6, @(0x14,r1) ! 33 LS
	741	xtrct r12, r11 ! 48 EX - r11 = long for offset 0x00
	742
	743	mov.l r7, @(0x10,r1) ! 33 LS
	744
	745	mov.l r8, @(0x0c,r1) ! 33 LS
	746	add #-0x1e, r5 ! 50 EX - with the -2 above: net 0x20 lower for the next pass
	747
	748	mov.l r9, @(0x08,r1) ! 33 LS
	749	cmp/eq r2,r1 ! 54 MT - T = 1 when r1 has reached rounded-up r4
	750
	751	mov.l r10, @(0x04,r1) ! 33 LS
	752	bf/s 2b ! 109 BR
	753
	754	mov.l r11, @(0x00,r1) ! 33 LS (delay slot)
	755	#endif
	756
	757	mov.l @r15+, r12
	758	mov r1, r0 ! 5 MT (latency=0) - r0 is the copy position again
	759
	760	mov.l @r15+, r11 ! 15 LS
	761	sub r1, r5 ! 75 EX - r5 back to an index relative to r0
	762
	763	mov.l @r15+, r10 ! 15 LS
	764	cmp/eq r4, r0 ! 54 MT - T = 1 when nothing is left to copy
	765
	766	bf/s 1f ! 109 BR
	767	mov.l @r15+, r9 ! 15 LS (delay slot)
	768	! The r8 restore is both the rts delay slot and the 1f target, so it runs on either path.
	769	rts
	770	1: mov.l @r15+, r8 ! 15 LS
	771
	772	add #0x1e, r5 ! 50 EX - undoes the -0x1e applied before the cache line loop
	773
	774	! Finish off a short word at a time
	775	! r5 must be invariant - 2
	776	10: mov r4,r2 ! 5 MT (latency=0)
	777	add #1,r2 ! 50 EX
	778
	779	cmp/hi r2, r0 ! 57 MT - T = 1 when r0 > r4 + 1 (unsigned), i.e. at least 2 bytes left
	780	bf/s 1f ! 109 BR
	781
	782	add #2, r2 ! 50 EX (delay slot) r2 = r4 + 3, bound for the short word loop
	783
	784	3: mov.w @(r0,r5),r1 ! 20 LS
	785	cmp/hi r2,r0 ! 57 MT - T = 1 while r0 > r4 + 3, sampled before the store
	786
	787	bt/s 3b ! 109 BR
	788
	789	mov.w r1,@-r0 ! 29 LS (delay slot)
	790	1:
	791
	792	!
	793	! Finally, copy the last byte if necessary
	794	cmp/eq r4,r0 ! 54 MT - T = 1 when r0 == r4, i.e. nothing left
	795	bt/s 9b ! nothing left: return through the preceding 9: rts
	796	add #1,r5 ! (delay slot) index bias goes from -2 (short word) to -1 (byte)
	797	mov.b @(r0,r5),r1
	798	rts
	799	mov.b r1,@-r0 ! (delay slot) store the last byte
	800