author		Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit		1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree		0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/parisc/lib/memcpy.c
tag		v2.6.12-rc2 (Linux-2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/parisc/lib/memcpy.c')
-rw-r--r--	arch/parisc/lib/memcpy.c	522
1 files changed, 522 insertions, 0 deletions
diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c
new file mode 100644
index 000000000000..feb1b9f42c2b
--- /dev/null
+++ b/arch/parisc/lib/memcpy.c
@@ -0,0 +1,522 @@
/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs. This is followed by loops that copy 32 or 16 bytes at a
 * time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using the shift-and-write method, or
 * in a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is
 * able to beat it by 30-40% for aligned copies because of the loop
 * unrolling, but in some cases the glibc version is still slightly faster.
 * This lends more credibility to the claim that gcc can generate very good
 * code as long as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */

#ifdef __KERNEL__
#include <linux/config.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label) do { \
	volatile int dummy; \
	/* The following branch is never taken, it's just here to */ \
	/* prevent gcc from optimizing away our exception code. */ \
	if (unlikely(dummy != dummy)) \
		goto label; \
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

#define MERGE(w0, sh_1, w1, sh_2) ({ \
	unsigned int _r; \
	asm volatile ( \
	"mtsar %3\n" \
	"shrpw %1, %2, %%sar, %0\n" \
	: "=r"(_r) \
	: "r"(w0), "r"(w1), "r"(sh_2) \
	); \
	_r; \
})
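
/*
 * For illustration only: shrpw shifts the 64-bit concatenation of two
 * words right by %sar bits and keeps the low word, so on this
 * big-endian machine MERGE(w0, sh_1, w1, sh_2) is equivalent to the
 * portable C below, given sh_1 + sh_2 == 32 and sh_1 > 0 as set up in
 * copy_dstaligned.  The name merge_model is ours, not the kernel's.
 */
static inline unsigned int merge_model(unsigned int w0, int sh_1,
				       unsigned int w1, int sh_2)
{
	/* Last sh_1 bits of w0 followed by first sh_2 bits of w1: the
	 * bytes that a word-sized access straddling two aligned words
	 * sees.  E.g. src misaligned by one byte (sh_1 = 8, sh_2 = 24):
	 * merge_model(0xaabbccdd, 8, 0xeeff0011, 24) == 0xbbccddee. */
	return (w0 << sh_1) | (w1 >> sh_2);
}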
#define THRESHOLD 16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#ifndef __LP64__
#define EXC_WORD ".word"
#else
#define EXC_WORD ".dword"
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n" \
	"\t.section __ex_table,\"aw\"\n" \
	"\t" EXC_WORD "\t1b\n" \
	"\t" EXC_WORD "\t" #_e "\n" \
	"\t.previous\n" \
	: _tt(_t), "+r"(_a) \
	: \
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n" \
	"\t.section __ex_table,\"aw\"\n" \
	"\t" EXC_WORD "\t1b\n" \
	"\t" EXC_WORD "\t" #_e "\n" \
	"\t.previous\n" \
	: "+r"(_a) \
	: _tt(_t) \
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n" \
	"\t.section __ex_table,\"aw\"\n" \
	"\t" EXC_WORD "\t1b\n" \
	"\t" EXC_WORD "\t" #_e "\n" \
	"\t.previous\n" \
	: _tt(_t) \
	: "r"(_a) \
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n" \
	"\t.section __ex_table,\"aw\"\n" \
	"\t" EXC_WORD "\t1b\n" \
	"\t" EXC_WORD "\t" #_e "\n" \
	"\t.previous\n" \
	: \
	: _tt(_t), "r"(_a) \
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)
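
/*
 * What the ".section __ex_table" blocks above build: a table of
 * (faulting insn address, fixup address) pairs.  On a fault, the trap
 * handler looks up the faulting PC, records the fault address in the
 * per-cpu exception_data, and resumes at the fixup label (cda_ldw_exc
 * etc.) instead of killing the kernel.  A simplified model of that
 * lookup, for illustration only (the real kernel sorts the table and
 * binary-searches it; the _model names are ours):
 */
struct exception_table_entry_model {
	unsigned long insn;	/* address of the faulting load/store */
	unsigned long fixup;	/* label to branch to on a fault */
};

static unsigned long search_ex_table_model(
	const struct exception_table_entry_model *start,
	const struct exception_table_entry_model *end,
	unsigned long faulting_pc)
{
	const struct exception_table_entry_model *e;

	for (e = start; e < end; e++)
		if (e->insn == faulting_pc)
			return e->fixup;
	return 0;	/* no entry: the fault is genuine */
}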

#ifdef CONFIG_PREFETCH
extern inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

extern inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr)
#define prefetch_dst(addr)
#endif

/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop. This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
		unsigned long len, unsigned long o_dst, unsigned long o_src,
		unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate how to shift a word read at the memory operation
	   aligned srcp to make it aligned for copy. */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down. */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return 0;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -= -1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -= -2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;	/* No-op. */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return 0;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}
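
/*
 * A portable model of the loop above, for clarity only (names are
 * ours): the do0..do4 entry points exist so the len % 4 leftover words
 * can reuse the same four unrolled load/merge/store pairs, and a0..a3
 * rotate so every word loaded is merged into two stores.  This model
 * copies one word per iteration instead, assumes big-endian words and
 * a word-misaligned src (sh_1 > 0), and does no fault handling.
 */
static void copy_dstaligned_model(unsigned int *dst, unsigned long src,
				  unsigned long wordlen)
{
	int sh_1 = 8 * (src % sizeof(unsigned int));
	int sh_2 = 8 * sizeof(unsigned int) - sh_1;
	const unsigned int *s =
		(const unsigned int *)(src & -sizeof(unsigned int));
	unsigned int prev = *s++;	/* preload, like the switch above */

	while (wordlen--) {
		unsigned int next = *s++;
		*dst++ = (prev << sh_1) | (next >> sh_2);
		prev = next;
	}
}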

/* Returns 0 for success; otherwise returns the number of bytes not
 * transferred. */
unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret = 0;
	unsigned long o_dst, o_src, o_len;
	struct exception_data *d;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	o_dst = dst; o_src = src; o_len = len;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return 0;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination. */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
		o_dst, o_src, o_len);
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}
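
/*
 * The fixup arithmetic with hypothetical numbers: a 100-byte copy from
 * o_src = 0x1000 that faults on a load at data address 0x103c has
 * consumed 0x3c = 60 source bytes, so the handler above returns
 *
 *	o_len - d->fault_addr + o_src = 100 - 60 = 40
 *
 * bytes not transferred.
 */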

#ifdef __KERNEL__
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}
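
/*
 * Typical use of the "bytes not copied" return convention, as a sketch
 * (kbuf/ubuf/n are our example names, not part of this file):
 *
 *	if (copy_from_user(kbuf, ubuf, n))
 *		return -EFAULT;
 *
 * i.e. any nonzero return means the copy faulted partway through.
 */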

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}


void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif