2 files changed, 345 insertions, 317 deletions
diff --git a/arch/cris/arch-v10/lib/string.c b/arch/cris/arch-v10/lib/string.c
index 7161a2bef4fe..c7bd6ebdc93c 100644
--- a/arch/cris/arch-v10/lib/string.c
+++ b/arch/cris/arch-v10/lib/string.c
@@ -1,55 +1,59 @@
-/*#************************************************************************#*/
+/* A memcpy for CRIS.
-/*#-------------------------------------------------------------------------*/
+   Copyright (C) 1994-2005 Axis Communications.
-/*#                                                                         */
+   All rights reserved.
-/*# FUNCTION NAME: memcpy()                                                 */
-/*#                                                                         */
+   Redistribution and use in source and binary forms, with or without
-/*# PARAMETERS:  void* dst;   Destination address.                          */
+   modification, are permitted provided that the following conditions
-/*#              void* src;   Source address.                               */
+   are met:
-/*#              int   len;   Number of bytes to copy.                      */
-/*#                                                                         */
+   1. Redistributions of source code must retain the above copyright
-/*# RETURNS:     dst.                                                       */
+      notice, this list of conditions and the following disclaimer.
-/*#                                                                         */
-/*# DESCRIPTION: Copies len bytes of memory from src to dst.  No guarantees */
+   2. Neither the name of Axis Communications nor the names of its
-/*#              about copying of overlapping memory areas. This routine is */
+      contributors may be used to endorse or promote products derived
-/*#              very sensitive to compiler changes in register allocation. */
+      from this software without specific prior written permission.
-/*#              Should really be rewritten to avoid this problem.          */
-/*#                                                                         */
+   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
-/*#-------------------------------------------------------------------------*/
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-/*#                                                                         */
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-/*# HISTORY                                                                 */
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
-/*#                                                                         */
+   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-/*# DATE      NAME            CHANGES                                       */
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-/*# ----      ----            -------                                       */
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-/*# 941007    Kenny R         Creation                                      */
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-/*# 941011    Kenny R         Lots of optimizations and inlining.           */
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-/*# 941129    Ulf A           Adapted for use in libc.                      */
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-/*# 950216    HP              N==0 forgotten if non-aligned src/dst.        */
+   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/*#                           Added some optimizations.                     */
+   POSSIBILITY OF SUCH DAMAGE.  */
-/*# 001025    HP              Make src and dst char *.  Align dst to        */
-/*#                           dword, not just word-if-both-src-and-dst-     */
+/* FIXME: This file should really only be used for reference, as the
-/*#                           are-misaligned.                               */
+   result depends somewhat on gcc generating what we expect rather
-/*#                                                                         */
+   than what we describe.  An assembly file should be used instead.  */
-/*#-------------------------------------------------------------------------*/
+#include <stddef.h>
-#include <linux/types.h>
+/* Break even between movem and move16 is really at 38.7 * 2, but
-void *memcpy(void *pdst,
+   modulo 44, so up to the next multiple of 44, we use ordinary code.  */
-             const void *psrc,
+#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2)
-             size_t pn)
+/* No name ambiguities in this file.  */
+__asm__ (".syntax no_register_prefix");
+void *
+memcpy(void *pdst, const void *psrc, size_t pn)
 {
-  /* Ok.  Now we want the parameters put in special registers.
+  /* Now we want the parameters put in special registers.
     Make sure the compiler is able to make something useful of this.
-      As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
+     As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
-     If gcc was alright, it really would need no temporaries, and no
+     If gcc was alright, it really would need no temporaries, and no
-     stack space to save stuff on. */
+     stack space to save stuff on.  */
  register void *return_dst __asm__ ("r10") = pdst;
-  register char *dst __asm__ ("r13") = pdst;
+  register unsigned char *dst __asm__ ("r13") = pdst;
-  register const char *src __asm__ ("r11") = psrc;
+  register unsigned const char *src __asm__ ("r11") = psrc;
  register int n __asm__ ("r12") = pn;
-  
- 
  /* When src is aligned but not dst, this makes a few extra needless
     cycles.  I believe it would take as many to check that the
     re-alignment was unnecessary.  */
@@ -59,167 +63,174 @@ void *memcpy(void *pdst,
      && n >= 3)
  {
    if ((unsigned long) dst & 1)
-    {
+      {
-      n--;
+        n--;
-      *(char*)dst = *(char*)src;
+        *dst = *src;
-      src++;
+        src++;
-      dst++;
+        dst++;
-    }
+      }
    if ((unsigned long) dst & 2)
-    {
+      {
-      n -= 2;
+        n -= 2;
-      *(short*)dst = *(short*)src;
+        *(short *) dst = *(short *) src;
-      src += 2;
+        src += 2;
-      dst += 2;
+        dst += 2;
-    }
+      }
  }
-  /* Decide which copying method to use. */
+  /* Decide which copying method to use.  */
-  if (n >= 44*2)                /* Break even between movem and
+  if (n >= MEMCPY_BY_BLOCK_THRESHOLD)
-                                   move16 is at 38.7*2, but modulo 44. */
+    {
-  {
+      /* It is not optimal to tell the compiler about clobbering any
-    /* For large copies we use 'movem' */
+         registers; that will move the saving/restoring of those registers
+         to the function prologue/epilogue, and make non-movem sizes
-  /* It is not optimal to tell the compiler about clobbering any
+         suboptimal.  */
-     registers; that will move the saving/restoring of those registers
+      __asm__ volatile
-     to the function prologue/epilogue, and make non-movem sizes
+        ("\
-     suboptimal.
+         ;; GCC does promise correct register allocations, but let's    \n\
+         ;; make sure it keeps its promises.                            \n\
-      This method is not foolproof; it assumes that the "asm reg"
+         .ifnc %0-%1-%2,$r13-$r11-$r12                                  \n\
-     declarations at the beginning of the function really are used
+         .error \"GCC reg alloc bug: %0-%1-%2 != $r13-$r11-$r12\"       \n\
-     here (beware: they may be moved to temporary registers).
+         .endif                                                         \n\
-      This way, we do not have to save/move the registers around into
+                                                                        \n\
-     temporaries; we can safely use them straight away.
+         ;; Save the registers we'll use in the movem process           \n\
+         ;; on the stack.                                               \n\
-      If you want to check that the allocation was right; then
+         subq   11*4,sp                                                 \n\
-      check the equalities in the first comment.  It should say
+         movem  r10,[sp]                                                \n\
-      "r13=r13, r11=r11, r12=r12" */
-    __asm__ volatile ("\n\
-        ;; Check that the following is true (same register names on     \n\
-        ;; both sides of equal sign, as in r8=r8):                      \n\
-        ;; %0=r13, %1=r11, %2=r12                                       \n\
-        ;;                                                              \n\
-        ;; Save the registers we'll use in the movem process            \n\
-        ;; on the stack.                                                \n\
-        subq    11*4,$sp                                                \n\
-        movem   $r10,[$sp]                                              \n\
                                                                        \n\
-        ;; Now we've got this:                                          \n\
+         ;; Now we've got this:                                         \n\
-        ;; r11 - src                                                    \n\
+         ;; r11 - src                                                   \n\
-        ;; r13 - dst                                                    \n\
+         ;; r13 - dst                                                   \n\
-        ;; r12 - n                                                      \n\
+         ;; r12 - n                                                     \n\
                                                                        \n\
-        ;; Update n for the first loop                                  \n\
+         ;; Update n for the first loop.                                \n\
-        subq    44,$r12                                                 \n\
+         subq    44,r12                                                 \n\
 0:                                                                      \n\
-        movem   [$r11+],$r10                                            \n\
+"
-        subq    44,$r12                                                 \n\
+#ifdef __arch_common_v10_v32
-        bge     0b                                                      \n\
+         /* Cater to branch offset difference between v32 and v10.  We
-        movem   $r10,[$r13+]                                            \n\
+            assume the branch below has an 8-bit offset.  */
+"        setf\n"
+#endif
+"        movem  [r11+],r10                                              \n\
+         subq   44,r12                                                  \n\
+         bge     0b                                                     \n\
+         movem  r10,[r13+]                                              \n\
                                                                        \n\
-        addq    44,$r12 ;; compensate for last loop underflowing n      \n\
+         ;; Compensate for last loop underflowing n.                    \n\
+         addq   44,r12                                                  \n\
                                                                        \n\
-        ;; Restore registers from stack                                 \n\
+         ;; Restore registers from stack.                               \n\
-        movem   [$sp+],$r10"
+         movem [sp+],r10"
-     /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) 
+         /* Outputs.  */
-     /* Inputs */ : "0" (dst), "1" (src), "2" (n));
+         : "=r" (dst), "=r" (src), "=r" (n)
-    
-  }
-  /* Either we directly starts copying, using dword copying
+         /* Inputs.  */
-     in a loop, or we copy as much as possible with 'movem' 
+         : "0" (dst), "1" (src), "2" (n));
-     and then the last block (<44 bytes) is copied here.
+    }
-     This will work since 'movem' will have updated src,dst,n. */
-  while ( n >= 16 )
+  while (n >= 16)
-  {
+    {
-    *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-    *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-    *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-    *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-    n -= 16;
-  }
+      n -= 16;
+    }
-  /* A switch() is definitely the fastest although it takes a LOT of code.
-   * Particularly if you inline code this.
-   */
  switch (n)
-  {
+    {
    case 0:
      break;
    case 1:
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 2:
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
      break;
    case 3:
-      *((short*)dst)++ = *((short*)src)++;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 4:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
      break;
    case 5:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 6:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
      break;
    case 7:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((short*)dst)++ = *((short*)src)++;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 8:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
      break;
    case 9:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 10:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
      break;
    case 11:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((short*)dst)++ = *((short*)src)++;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 12:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
      break;
    case 13:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 14:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
      break;
    case 15:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((short*)dst)++ = *((short*)src)++;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
-  }
+    }
-  return return_dst; /* destination pointer. */
+  return return_dst;
-} /* memcpy() */
+}
diff --git a/arch/cris/arch-v32/lib/string.c b/arch/cris/arch-v32/lib/string.c
index 6740b2cebae5..c7bd6ebdc93c 100644
--- a/arch/cris/arch-v32/lib/string.c
+++ b/arch/cris/arch-v32/lib/string.c
@@ -1,55 +1,59 @@
-/*#************************************************************************#*/
+/* A memcpy for CRIS.
-/*#-------------------------------------------------------------------------*/
+   Copyright (C) 1994-2005 Axis Communications.
-/*#                                                                         */
+   All rights reserved.
-/*# FUNCTION NAME: memcpy()                                                 */
-/*#                                                                         */
+   Redistribution and use in source and binary forms, with or without
-/*# PARAMETERS:  void* dst;   Destination address.                          */
+   modification, are permitted provided that the following conditions
-/*#              void* src;   Source address.                               */
+   are met:
-/*#              int   len;   Number of bytes to copy.                      */
-/*#                                                                         */
+   1. Redistributions of source code must retain the above copyright
-/*# RETURNS:     dst.                                                       */
+      notice, this list of conditions and the following disclaimer.
-/*#                                                                         */
-/*# DESCRIPTION: Copies len bytes of memory from src to dst.  No guarantees */
+   2. Neither the name of Axis Communications nor the names of its
-/*#              about copying of overlapping memory areas. This routine is */
+      contributors may be used to endorse or promote products derived
-/*#              very sensitive to compiler changes in register allocation. */
+      from this software without specific prior written permission.
-/*#              Should really be rewritten to avoid this problem.          */
-/*#                                                                         */
+   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
-/*#-------------------------------------------------------------------------*/
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-/*#                                                                         */
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-/*# HISTORY                                                                 */
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
-/*#                                                                         */
+   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-/*# DATE      NAME            CHANGES                                       */
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-/*# ----      ----            -------                                       */
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-/*# 941007    Kenny R         Creation                                      */
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-/*# 941011    Kenny R         Lots of optimizations and inlining.           */
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-/*# 941129    Ulf A           Adapted for use in libc.                      */
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-/*# 950216    HP              N==0 forgotten if non-aligned src/dst.        */
+   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-/*#                           Added some optimizations.                     */
+   POSSIBILITY OF SUCH DAMAGE.  */
-/*# 001025    HP              Make src and dst char *.  Align dst to        */
-/*#                           dword, not just word-if-both-src-and-dst-     */
+/* FIXME: This file should really only be used for reference, as the
-/*#                           are-misaligned.                               */
+   result depends somewhat on gcc generating what we expect rather
-/*#                                                                         */
+   than what we describe.  An assembly file should be used instead.  */
-/*#-------------------------------------------------------------------------*/
+#include <stddef.h>
-#include <linux/types.h>
+/* Break even between movem and move16 is really at 38.7 * 2, but
-void *memcpy(void *pdst,
+   modulo 44, so up to the next multiple of 44, we use ordinary code.  */
-             const void *psrc,
+#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2)
-             size_t pn)
+/* No name ambiguities in this file.  */
+__asm__ (".syntax no_register_prefix");
+void *
+memcpy(void *pdst, const void *psrc, size_t pn)
 {
-  /* Ok.  Now we want the parameters put in special registers.
+  /* Now we want the parameters put in special registers.
     Make sure the compiler is able to make something useful of this.
-      As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
+     As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
-     If gcc was alright, it really would need no temporaries, and no
+     If gcc was alright, it really would need no temporaries, and no
-     stack space to save stuff on. */
+     stack space to save stuff on.  */
  register void *return_dst __asm__ ("r10") = pdst;
-  register char *dst __asm__ ("r13") = pdst;
+  register unsigned char *dst __asm__ ("r13") = pdst;
-  register const char *src __asm__ ("r11") = psrc;
+  register unsigned const char *src __asm__ ("r11") = psrc;
  register int n __asm__ ("r12") = pn;
  /* When src is aligned but not dst, this makes a few extra needless
     cycles.  I believe it would take as many to check that the
     re-alignment was unnecessary.  */
@@ -59,161 +63,174 @@ void *memcpy(void *pdst,
      && n >= 3)
  {
    if ((unsigned long) dst & 1)
-    {
+      {
-      n--;
+        n--;
-      *(char*)dst = *(char*)src;
+        *dst = *src;
-      src++;
+        src++;
-      dst++;
+        dst++;
-    }
+      }
    if ((unsigned long) dst & 2)
-    {
+      {
-      n -= 2;
+        n -= 2;
-      *(short*)dst = *(short*)src;
+        *(short *) dst = *(short *) src;
-      src += 2;
+        src += 2;
-      dst += 2;
+        dst += 2;
-    }
+      }
  }
-  /* Decide which copying method to use.  Movem is dirt cheap, so the
+  /* Decide which copying method to use.  */
-     overheap is low enough to always use the minimum block size as the
+  if (n >= MEMCPY_BY_BLOCK_THRESHOLD)
-     threshold.  */
+    {
-  if (n >= 44)
+      /* It is not optimal to tell the compiler about clobbering any
-  {
+         registers; that will move the saving/restoring of those registers
-    /* For large copies we use 'movem' */
+         to the function prologue/epilogue, and make non-movem sizes
+         suboptimal.  */
-  /* It is not optimal to tell the compiler about clobbering any
+      __asm__ volatile
-     registers; that will move the saving/restoring of those registers
+        ("\
-     to the function prologue/epilogue, and make non-movem sizes
+         ;; GCC does promise correct register allocations, but let's    \n\
-     suboptimal.  */
+         ;; make sure it keeps its promises.                            \n\
-    __asm__ volatile ("                                                 \n\
+         .ifnc %0-%1-%2,$r13-$r11-$r12                                  \n\
-        ;; Check that the register asm declaration got right.           \n\
+         .error \"GCC reg alloc bug: %0-%1-%2 != $r13-$r11-$r12\"       \n\
-        ;; The GCC manual explicitly says TRT will happen.              \n\
+         .endif                                                         \n\
-        .ifnc %0-%1-%2,$r13-$r11-$r12                                   \n\
-        .err                                                            \n\
-        .endif                                                          \n\
-                                                                        \n\
-        ;; Save the registers we'll use in the movem process            \n\
                                                                        \n\
-        ;; on the stack.                                                \n\
+         ;; Save the registers we'll use in the movem process           \n\
-        subq    11*4,$sp                                                \n\
+         ;; on the stack.                                               \n\
-        movem   $r10,[$sp]                                              \n\
+         subq   11*4,sp                                                 \n\
+         movem  r10,[sp]                                                \n\
                                                                        \n\
-        ;; Now we've got this:                                          \n\
+         ;; Now we've got this:                                         \n\
-        ;; r11 - src                                                    \n\
+         ;; r11 - src                                                   \n\
-        ;; r13 - dst                                                    \n\
+         ;; r13 - dst                                                   \n\
-        ;; r12 - n                                                      \n\
+         ;; r12 - n                                                     \n\
                                                                        \n\
-        ;; Update n for the first loop                                  \n\
+         ;; Update n for the first loop.                                \n\
-        subq    44,$r12                                                 \n\
+         subq    44,r12                                                 \n\
 0:                                                                      \n\
-        movem   [$r11+],$r10                                            \n\
+"
-        subq   44,$r12                                                  \n\
+#ifdef __arch_common_v10_v32
-        bge     0b                                                      \n\
+         /* Cater to branch offset difference between v32 and v10.  We
-        movem   $r10,[$r13+]                                            \n\
+            assume the branch below has an 8-bit offset.  */
+"        setf\n"
+#endif
+"        movem  [r11+],r10                                              \n\
+         subq   44,r12                                                  \n\
+         bge     0b                                                     \n\
+         movem  r10,[r13+]                                              \n\
                                                                        \n\
-        addq   44,$r12  ;; compensate for last loop underflowing n      \n\
+         ;; Compensate for last loop underflowing n.                    \n\
+         addq   44,r12                                                  \n\
                                                                        \n\
-        ;; Restore registers from stack                                 \n\
+         ;; Restore registers from stack.                               \n\
-        movem [$sp+],$r10"
+         movem [sp+],r10"
-     /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n)
+         /* Outputs.  */
-     /* Inputs */ : "0" (dst), "1" (src), "2" (n));
+         : "=r" (dst), "=r" (src), "=r" (n)
-  }
+         /* Inputs.  */
+         : "0" (dst), "1" (src), "2" (n));
+    }
-  /* Either we directly starts copying, using dword copying
+  while (n >= 16)
-     in a loop, or we copy as much as possible with 'movem'
+    {
-     and then the last block (<44 bytes) is copied here.
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-     This will work since 'movem' will have updated src,dst,n. */
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-  while ( n >= 16 )
+      n -= 16;
-  {
+    }
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    n -= 16;
-  }
-  /* A switch() is definitely the fastest although it takes a LOT of code.
-   * Particularly if you inline code this.
-   */
  switch (n)
-  {
+    {
    case 0:
      break;
    case 1:
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 2:
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
      break;
    case 3:
-      *((short*)dst)++ = *((short*)src)++;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 4:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
      break;
    case 5:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 6:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
      break;
    case 7:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((short*)dst)++ = *((short*)src)++;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 8:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
      break;
    case 9:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 10:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
      break;
    case 11:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((short*)dst)++ = *((short*)src)++;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 12:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
      break;
    case 13:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
    case 14:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
      break;
    case 15:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
-      *((short*)dst)++ = *((short*)src)++;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
-      *(char*)dst = *(char*)src;
+      *dst = *src;
      break;
-  }
+    }
-  return return_dst; /* destination pointer. */
+  return return_dst;
-} /* memcpy() */
+}