author	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit	c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree	ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/tile/lib
parent	ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent	6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'arch/tile/lib')
 arch/tile/lib/Makefile                                            |   5
 arch/tile/lib/atomic_32.c                                         |  25
 arch/tile/lib/atomic_asm_32.S                                     |   4
 arch/tile/lib/cacheflush.c                                        | 120
 arch/tile/lib/delay.c                                             |  21
 arch/tile/lib/exports.c                                           |   7
 arch/tile/lib/mb_incoherent.S                                     |  34
 arch/tile/lib/memchr_32.c                                         |  35
 arch/tile/lib/memchr_64.c                                         |  71
 arch/tile/lib/memcpy_32.S                                         | 206
 arch/tile/lib/memcpy_64.c                                         | 220
 arch/tile/lib/memcpy_tile64.c                                     |  15
 arch/tile/lib/memcpy_user_64.c                                    |  86
 arch/tile/lib/memmove.c (renamed from arch/tile/lib/memmove_32.c) |   0
 arch/tile/lib/memset_32.c                                         |   1
 arch/tile/lib/memset_64.c                                         | 145
 arch/tile/lib/spinlock_32.c                                       | 190
 arch/tile/lib/spinlock_64.c                                       | 104
 arch/tile/lib/strchr_64.c                                         |  67
 arch/tile/lib/strlen_32.c                                         |   2
 arch/tile/lib/strlen_64.c                                         |  38
 arch/tile/lib/usercopy_64.S                                       | 196
 22 files changed, 1336 insertions(+), 256 deletions(-)
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 746dc81ed3c4..0c26086ecbef 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -2,9 +2,8 @@
 # Makefile for TILE-specific library files..
 #
 
-lib-y = cacheflush.o checksum.o cpumask.o delay.o \
-	mb_incoherent.o uaccess.o \
-	memcpy_$(BITS).o memchr_$(BITS).o memmove_$(BITS).o memset_$(BITS).o \
+lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
+	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
 	strchr_$(BITS).o strlen_$(BITS).o
 
 ifeq ($(CONFIG_TILEGX),y)
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 8040b42a8eea..46570211df52 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -46,14 +46,13 @@ struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 /* This page is remapped on startup to be hash-for-home. */
-int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */]
-  __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned")));
+int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 static inline int *__atomic_hashed_lock(volatile void *v)
 {
-	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */
+	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
 #if ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	unsigned long i =
 		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
@@ -203,32 +202,32 @@ static inline int *__futex_setup(int __user *v)
 	return __atomic_hashed_lock((int __force *)v);
 }
 
-struct __get_user futex_set(int __user *v, int i)
+struct __get_user futex_set(u32 __user *v, int i)
 {
 	return __atomic_xchg((int __force *)v, __futex_setup(v), i);
 }
 
-struct __get_user futex_add(int __user *v, int n)
+struct __get_user futex_add(u32 __user *v, int n)
 {
 	return __atomic_xchg_add((int __force *)v, __futex_setup(v), n);
 }
 
-struct __get_user futex_or(int __user *v, int n)
+struct __get_user futex_or(u32 __user *v, int n)
 {
 	return __atomic_or((int __force *)v, __futex_setup(v), n);
 }
 
-struct __get_user futex_andn(int __user *v, int n)
+struct __get_user futex_andn(u32 __user *v, int n)
 {
 	return __atomic_andn((int __force *)v, __futex_setup(v), n);
 }
 
-struct __get_user futex_xor(int __user *v, int n)
+struct __get_user futex_xor(u32 __user *v, int n)
 {
 	return __atomic_xor((int __force *)v, __futex_setup(v), n);
 }
 
-struct __get_user futex_cmpxchg(int __user *v, int o, int n)
+struct __get_user futex_cmpxchg(u32 __user *v, int o, int n)
 {
 	return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n);
 }
@@ -300,7 +299,7 @@ void __init __init_atomic_per_cpu(void)
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 	/* Validate power-of-two and "bigger than cpus" assumption */
-	BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
+	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
 	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
 
 	/*
@@ -314,17 +313,17 @@ void __init __init_atomic_per_cpu(void)
 	BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0);
 
 	/* The locks must all fit on one page. */
-	BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE);
+	BUILD_BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE);
 
 	/*
 	 * We use the page offset of the atomic value's address as
 	 * an index into atomic_locks, excluding the low 3 bits.
 	 * That should not produce more indices than ATOMIC_HASH_SIZE.
 	 */
-	BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
+	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 	/* The futex code makes this assumption, so we validate it here. */
-	BUG_ON(sizeof(atomic_t) != sizeof(int));
+	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 }
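
A note on the BUG_ON() to BUILD_BUG_ON() conversions in the hunks above: assertions whose operands are compile-time constants (ATOMIC_HASH_SIZE, PAGE_SIZE, sizeof(atomic_t)) now fail the build instead of failing at boot, while the comparison against nr_cpu_ids must remain a runtime BUG_ON() because that value is only known once the kernel is up. A minimal sketch of the distinction, standalone and not part of the patch:

	/* Compile-time: the build is rejected if the condition can be true. */
	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));

	/* Runtime: nr_cpu_ids is a boot-time variable, so check it at boot. */
	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
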
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 5a5514b77e78..24448734f6f1 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -14,7 +14,7 @@
  * Support routines for atomic operations.  Each function takes:
  *
  * r0: address to manipulate
- * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
+ * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
  * r2: new value to write, or for cmpxchg/add_unless, value to compare against
  * r3: (cmpxchg/xchg_add_unless) new value to write or add;
  *     (atomic64 ops) high word of value to write
@@ -59,7 +59,7 @@
  * bad kernel addresses).
  *
  * Note that if the value we would store is the same as what we
- * loaded, we bypass the load.  Other platforms with true atomics can
+ * loaded, we bypass the store.  Other platforms with true atomics can
  * make the guarantee that a non-atomic __clear_bit(), for example,
  * can safely race with an atomic test_and_set_bit(); this example is
  * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 11b6164c2097..8928aace7a64 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -15,9 +15,129 @@
15#include <asm/page.h> 15#include <asm/page.h>
16#include <asm/cacheflush.h> 16#include <asm/cacheflush.h>
17#include <arch/icache.h> 17#include <arch/icache.h>
18#include <arch/spr_def.h>
18 19
19 20
20void __flush_icache_range(unsigned long start, unsigned long end) 21void __flush_icache_range(unsigned long start, unsigned long end)
21{ 22{
22 invalidate_icache((const void *)start, end - start, PAGE_SIZE); 23 invalidate_icache((const void *)start, end - start, PAGE_SIZE);
23} 24}
25
26
27/* Force a load instruction to issue. */
28static inline void force_load(char *p)
29{
30 *(volatile char *)p;
31}
32
33/*
34 * Flush and invalidate a VA range that is homed remotely on a single
35 * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
36 * until the memory controller holds the flushed values.
37 */
38void finv_buffer_remote(void *buffer, size_t size, int hfh)
39{
40 char *p, *base;
41 size_t step_size, load_count;
42 const unsigned long STRIPE_WIDTH = 8192;
43#ifdef __tilegx__
44 /*
45 * On TILE-Gx, we must disable the dstream prefetcher before doing
46 * a cache flush; otherwise, we could end up with data in the cache
47 * that we don't want there. Note that normally we'd do an mf
48 * after the SPR write to disabling the prefetcher, but we do one
49 * below, before any further loads, so there's no need to do it
50 * here.
51 */
52 uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF);
53 __insn_mtspr(SPR_DSTREAM_PF, 0);
54#endif
55
56 /*
57 * Flush and invalidate the buffer out of the local L1/L2
58 * and request the home cache to flush and invalidate as well.
59 */
60 __finv_buffer(buffer, size);
61
62 /*
63 * Wait for the home cache to acknowledge that it has processed
64 * all the flush-and-invalidate requests. This does not mean
65 * that the flushed data has reached the memory controller yet,
66 * but it does mean the home cache is processing the flushes.
67 */
68 __insn_mf();
69
70 /*
71 * Issue a load to the last cache line, which can't complete
72 * until all the previously-issued flushes to the same memory
73 * controller have also completed. If we weren't striping
74 * memory, that one load would be sufficient, but since we may
75 * be, we also need to back up to the last load issued to
76 * another memory controller, which would be the point where
77 * we crossed an 8KB boundary (the granularity of striping
78 * across memory controllers). Keep backing up and doing this
79 * until we are before the beginning of the buffer, or have
80 * hit all the controllers.
81 *
82 * If we are flushing a hash-for-home buffer, it's even worse.
83 * Each line may be homed on a different tile, and each tile
84 * may have up to four lines that are on different
85 * controllers. So as we walk backwards, we have to touch
86 * enough cache lines to satisfy these constraints. In
87 * practice this ends up being close enough to "load from
88 * every cache line on a full memory stripe on each
89 * controller" that we simply do that, to simplify the logic.
90 *
91 * FIXME: See bug 9535 for some issues with this code.
92 */
93 if (hfh) {
94 step_size = L2_CACHE_BYTES;
95 load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) *
96 (1 << CHIP_LOG_NUM_MSHIMS());
97 } else {
98 step_size = STRIPE_WIDTH;
99 load_count = (1 << CHIP_LOG_NUM_MSHIMS());
100 }
101
102 /* Load the last byte of the buffer. */
103 p = (char *)buffer + size - 1;
104 force_load(p);
105
106 /* Bump down to the end of the previous stripe or cache line. */
107 p -= step_size;
108 p = (char *)((unsigned long)p | (step_size - 1));
109
110 /* Figure out how far back we need to go. */
111 base = p - (step_size * (load_count - 2));
112 if ((long)base < (long)buffer)
113 base = buffer;
114
115 /*
116 * Fire all the loads we need. The MAF only has eight entries
117 * so we can have at most eight outstanding loads, so we
118 * unroll by that amount.
119 */
120#pragma unroll 8
121 for (; p >= base; p -= step_size)
122 force_load(p);
123
124 /*
125 * Repeat, but with inv's instead of loads, to get rid of the
126 * data we just loaded into our own cache and the old home L3.
127 * No need to unroll since inv's don't target a register.
128 */
129 p = (char *)buffer + size - 1;
130 __insn_inv(p);
131 p -= step_size;
132 p = (char *)((unsigned long)p | (step_size - 1));
133 for (; p >= base; p -= step_size)
134 __insn_inv(p);
135
136 /* Wait for the load+inv's (and thus finvs) to have completed. */
137 __insn_mf();
138
139#ifdef __tilegx__
140 /* Reenable the prefetcher. */
141 __insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf);
142#endif
143}
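
To make the hfh/non-hfh bookkeeping above concrete: assuming, purely for illustration, a 64-byte L2 line and four memory controllers (CHIP_LOG_NUM_MSHIMS() == 2), a hash-for-home flush uses step_size = 64 and load_count = (8192 / 64) * 4 = 512, i.e. one load per cache line across a full stripe on every controller, while a single-homed flush only needs step_size = 8192 and load_count = 4, one load per controller stripe. The real values come from the chip headers; these numbers are only an example.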
diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c
index 5801b03c13ef..cdacdd11d360 100644
--- a/arch/tile/lib/delay.c
+++ b/arch/tile/lib/delay.c
@@ -15,20 +15,31 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/thread_info.h>
-#include <asm/fixmap.h>
-#include <hv/hypervisor.h>
+#include <asm/timex.h>
 
 void __udelay(unsigned long usecs)
 {
-	hv_nanosleep(usecs * 1000);
+	if (usecs > ULONG_MAX / 1000) {
+		WARN_ON_ONCE(usecs > ULONG_MAX / 1000);
+		usecs = ULONG_MAX / 1000;
+	}
+	__ndelay(usecs * 1000);
 }
 EXPORT_SYMBOL(__udelay);
 
 void __ndelay(unsigned long nsecs)
 {
-	hv_nanosleep(nsecs);
+	cycles_t target = get_cycles();
+	target += ns2cycles(nsecs);
+	while (get_cycles() < target)
+		cpu_relax();
 }
 EXPORT_SYMBOL(__ndelay);
 
-/* FIXME: should be declared in a header somewhere. */
+void __delay(unsigned long cycles)
+{
+	cycles_t target = get_cycles() + cycles;
+	while (get_cycles() < target)
+		cpu_relax();
+}
 EXPORT_SYMBOL(__delay);
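
The clamp added to __udelay() protects the usecs * 1000 conversion: with a 32-bit unsigned long, ULONG_MAX / 1000 is roughly 4.29 million microseconds, so a request longer than about 4.3 seconds would otherwise overflow and busy-wait for far less time than asked. Delays of that magnitude belong in msleep() or similar anyway, hence the WARN_ON_ONCE() plus clamp rather than any attempt to honor the request.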
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index ce5dbf56578f..49284fae9d09 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -29,6 +29,9 @@ EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(clear_user_asm);
+EXPORT_SYMBOL(flush_user_asm);
+EXPORT_SYMBOL(inv_user_asm);
+EXPORT_SYMBOL(finv_user_asm);
 
 /* arch/tile/kernel/entry.S */
 #include <linux/kernel.h>
@@ -82,4 +85,8 @@ int64_t __muldi3(int64_t, int64_t);
 EXPORT_SYMBOL(__muldi3);
 uint64_t __lshrdi3(uint64_t, unsigned int);
 EXPORT_SYMBOL(__lshrdi3);
+uint64_t __ashrdi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashrdi3);
+uint64_t __ashldi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashldi3);
 #endif
diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S
deleted file mode 100644
index 989ad7b68d5a..000000000000
--- a/arch/tile/lib/mb_incoherent.S
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * Assembly code for invoking the HV's fence_incoherent syscall.
15 */
16
17#include <linux/linkage.h>
18#include <hv/syscall_public.h>
19#include <arch/abi.h>
20#include <arch/chip.h>
21
22#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
23
24/*
25 * Invoke the hypervisor's fence_incoherent syscall, which guarantees
26 * that all victims for cachelines homed on this tile have reached memory.
27 */
28STD_ENTRY(__mb_incoherent)
29 moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent
30 swint2
31 jrp lr
32 STD_ENDPROC(__mb_incoherent)
33
34#endif
diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c
index 6235283b4859..cc3d9badf030 100644
--- a/arch/tile/lib/memchr_32.c
+++ b/arch/tile/lib/memchr_32.c
@@ -18,12 +18,24 @@
 
 void *memchr(const void *s, int c, size_t n)
 {
+	const uint32_t *last_word_ptr;
+	const uint32_t *p;
+	const char *last_byte_ptr;
+	uintptr_t s_int;
+	uint32_t goal, before_mask, v, bits;
+	char *ret;
+
+	if (__builtin_expect(n == 0, 0)) {
+		/* Don't dereference any memory if the array is empty. */
+		return NULL;
+	}
+
 	/* Get an aligned pointer. */
-	const uintptr_t s_int = (uintptr_t) s;
-	const uint32_t *p = (const uint32_t *)(s_int & -4);
+	s_int = (uintptr_t) s;
+	p = (const uint32_t *)(s_int & -4);
 
 	/* Create four copies of the byte for which we are looking. */
-	const uint32_t goal = 0x01010101 * (uint8_t) c;
+	goal = 0x01010101 * (uint8_t) c;
 
 	/* Read the first word, but munge it so that bytes before the array
 	 * will not match goal.
@@ -31,23 +43,14 @@ void *memchr(const void *s, int c, size_t n)
 	 * Note that this shift count expression works because we know
 	 * shift counts are taken mod 32.
 	 */
-	const uint32_t before_mask = (1 << (s_int << 3)) - 1;
-	uint32_t v = (*p | before_mask) ^ (goal & before_mask);
+	before_mask = (1 << (s_int << 3)) - 1;
+	v = (*p | before_mask) ^ (goal & before_mask);
 
 	/* Compute the address of the last byte. */
-	const char *const last_byte_ptr = (const char *)s + n - 1;
+	last_byte_ptr = (const char *)s + n - 1;
 
 	/* Compute the address of the word containing the last byte. */
-	const uint32_t *const last_word_ptr =
-		(const uint32_t *)((uintptr_t) last_byte_ptr & -4);
-
-	uint32_t bits;
-	char *ret;
-
-	if (__builtin_expect(n == 0, 0)) {
-		/* Don't dereference any memory if the array is empty. */
-		return NULL;
-	}
+	last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4);
 
 	while ((bits = __insn_seqb(v, goal)) == 0) {
 		if (__builtin_expect(p == last_word_ptr, 0)) {
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
new file mode 100644
index 000000000000..84fdc8d8e735
--- /dev/null
+++ b/arch/tile/lib/memchr_64.c
@@ -0,0 +1,71 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19void *memchr(const void *s, int c, size_t n)
20{
21 const uint64_t *last_word_ptr;
22 const uint64_t *p;
23 const char *last_byte_ptr;
24 uintptr_t s_int;
25 uint64_t goal, before_mask, v, bits;
26 char *ret;
27
28 if (__builtin_expect(n == 0, 0)) {
29 /* Don't dereference any memory if the array is empty. */
30 return NULL;
31 }
32
33 /* Get an aligned pointer. */
34 s_int = (uintptr_t) s;
35 p = (const uint64_t *)(s_int & -8);
36
37 /* Create eight copies of the byte for which we are looking. */
38 goal = 0x0101010101010101ULL * (uint8_t) c;
39
40 /* Read the first word, but munge it so that bytes before the array
41 * will not match goal.
42 *
43 * Note that this shift count expression works because we know
44 * shift counts are taken mod 64.
45 */
46 before_mask = (1ULL << (s_int << 3)) - 1;
47 v = (*p | before_mask) ^ (goal & before_mask);
48
49 /* Compute the address of the last byte. */
50 last_byte_ptr = (const char *)s + n - 1;
51
52 /* Compute the address of the word containing the last byte. */
53 last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8);
54
55 while ((bits = __insn_v1cmpeq(v, goal)) == 0) {
56 if (__builtin_expect(p == last_word_ptr, 0)) {
57 /* We already read the last word in the array,
58 * so give up.
59 */
60 return NULL;
61 }
62 v = *++p;
63 }
64
65 /* We found a match, but it might be in a byte past the end
66 * of the array.
67 */
68 ret = ((char *)p) + (__insn_ctz(bits) >> 3);
69 return (ret <= last_byte_ptr) ? ret : NULL;
70}
71EXPORT_SYMBOL(memchr);
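
Both memchr implementations above rely on the same word-at-a-time idea: replicate the target byte across a word (0x01010101... * c), compare all byte lanes at once, and mask off any bytes that sit before the start of the buffer. The Tile intrinsics (__insn_seqb, __insn_v1cmpeq, __insn_ctz) do the lane compare and bit scan in hardware; a rough portable sketch of just the comparison step, using the classic "has zero byte" trick and nothing Tile-specific, looks like this:

	#include <stdint.h>

	/* Nonzero iff some byte of x equals the byte replicated through goal. */
	static inline uint64_t any_byte_matches(uint64_t x, uint64_t goal)
	{
		uint64_t t = x ^ goal;		/* matching bytes become 0x00 */
		return (t - 0x0101010101010101ULL) & ~t & 0x8080808080808080ULL;
	}

On a little-endian machine the position of the first match can then be recovered with a count-trailing-zeros divided by 8, which is what the __insn_ctz(bits) >> 3 expression in the code above does with the hardware compare result.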
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 30c3b7ebb55d..2a419a6122db 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -10,14 +10,16 @@
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for 11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details. 12 * more details.
13 *
14 * This file shares the implementation of the userspace memcpy and
15 * the kernel's memcpy, copy_to_user and copy_from_user.
16 */ 13 */
17 14
18#include <arch/chip.h> 15#include <arch/chip.h>
19 16
20 17
18/*
19 * This file shares the implementation of the userspace memcpy and
20 * the kernel's memcpy, copy_to_user and copy_from_user.
21 */
22
21#include <linux/linkage.h> 23#include <linux/linkage.h>
22 24
23/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ 25/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
@@ -53,9 +55,9 @@
53 */ 55 */
54ENTRY(__copy_from_user_inatomic) 56ENTRY(__copy_from_user_inatomic)
55.type __copy_from_user_inatomic, @function 57.type __copy_from_user_inatomic, @function
56 FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \ 58 FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
57 .text.memcpy_common, \ 59 .text.memcpy_common, \
58 .Lend_memcpy_common - __copy_from_user_inatomic) 60 .Lend_memcpy_common - __copy_from_user_inatomic)
59 { movei r29, IS_COPY_FROM_USER; j memcpy_common } 61 { movei r29, IS_COPY_FROM_USER; j memcpy_common }
60 .size __copy_from_user_inatomic, . - __copy_from_user_inatomic 62 .size __copy_from_user_inatomic, . - __copy_from_user_inatomic
61 63
@@ -64,7 +66,7 @@ ENTRY(__copy_from_user_inatomic)
64 */ 66 */
65ENTRY(__copy_from_user_zeroing) 67ENTRY(__copy_from_user_zeroing)
66.type __copy_from_user_zeroing, @function 68.type __copy_from_user_zeroing, @function
67 FEEDBACK_REENTER(__copy_from_user_inatomic) 69 FEEDBACK_REENTER(__copy_from_user_inatomic)
68 { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common } 70 { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
69 .size __copy_from_user_zeroing, . - __copy_from_user_zeroing 71 .size __copy_from_user_zeroing, . - __copy_from_user_zeroing
70 72
@@ -74,13 +76,13 @@ ENTRY(__copy_from_user_zeroing)
74 */ 76 */
75ENTRY(__copy_to_user_inatomic) 77ENTRY(__copy_to_user_inatomic)
76.type __copy_to_user_inatomic, @function 78.type __copy_to_user_inatomic, @function
77 FEEDBACK_REENTER(__copy_from_user_inatomic) 79 FEEDBACK_REENTER(__copy_from_user_inatomic)
78 { movei r29, IS_COPY_TO_USER; j memcpy_common } 80 { movei r29, IS_COPY_TO_USER; j memcpy_common }
79 .size __copy_to_user_inatomic, . - __copy_to_user_inatomic 81 .size __copy_to_user_inatomic, . - __copy_to_user_inatomic
80 82
81ENTRY(memcpy) 83ENTRY(memcpy)
82.type memcpy, @function 84.type memcpy, @function
83 FEEDBACK_REENTER(__copy_from_user_inatomic) 85 FEEDBACK_REENTER(__copy_from_user_inatomic)
84 { movei r29, IS_MEMCPY } 86 { movei r29, IS_MEMCPY }
85 .size memcpy, . - memcpy 87 .size memcpy, . - memcpy
86 /* Fall through */ 88 /* Fall through */
@@ -157,35 +159,35 @@ EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
157 { addi r3, r1, 60; andi r9, r9, -64 } 159 { addi r3, r1, 60; andi r9, r9, -64 }
158 160
159#if CHIP_HAS_WH64() 161#if CHIP_HAS_WH64()
160 /* No need to prefetch dst, we'll just do the wh64 162 /* No need to prefetch dst, we'll just do the wh64
161 * right before we copy a line. 163 * right before we copy a line.
162 */ 164 */
163#endif 165#endif
164 166
165EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 } 167EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
166 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 168 /* Intentionally stall for a few cycles to leave L2 cache alone. */
167 { bnzt zero, .; move r27, lr } 169 { bnzt zero, .; move r27, lr }
168EX: { lw r6, r3; addi r3, r3, 64 } 170EX: { lw r6, r3; addi r3, r3, 64 }
169 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 171 /* Intentionally stall for a few cycles to leave L2 cache alone. */
170 { bnzt zero, . } 172 { bnzt zero, . }
171EX: { lw r7, r3; addi r3, r3, 64 } 173EX: { lw r7, r3; addi r3, r3, 64 }
172#if !CHIP_HAS_WH64() 174#if !CHIP_HAS_WH64()
173 /* Prefetch the dest */ 175 /* Prefetch the dest */
174 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 176 /* Intentionally stall for a few cycles to leave L2 cache alone. */
175 { bnzt zero, . } 177 { bnzt zero, . }
176 /* Use a real load to cause a TLB miss if necessary. We aren't using 178 /* Use a real load to cause a TLB miss if necessary. We aren't using
177 * r28, so this should be fine. 179 * r28, so this should be fine.
178 */ 180 */
179EX: { lw r28, r9; addi r9, r9, 64 } 181EX: { lw r28, r9; addi r9, r9, 64 }
180 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 182 /* Intentionally stall for a few cycles to leave L2 cache alone. */
181 { bnzt zero, . } 183 { bnzt zero, . }
182 { prefetch r9; addi r9, r9, 64 } 184 { prefetch r9; addi r9, r9, 64 }
183 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 185 /* Intentionally stall for a few cycles to leave L2 cache alone. */
184 { bnzt zero, . } 186 { bnzt zero, . }
185 { prefetch r9; addi r9, r9, 64 } 187 { prefetch r9; addi r9, r9, 64 }
186#endif 188#endif
187 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 189 /* Intentionally stall for a few cycles to leave L2 cache alone. */
188 { bz zero, .Lbig_loop2 } 190 { bz zero, .Lbig_loop2 }
189 191
190 /* On entry to this loop: 192 /* On entry to this loop:
191 * - r0 points to the start of dst line 0 193 * - r0 points to the start of dst line 0
@@ -197,7 +199,7 @@ EX: { lw r28, r9; addi r9, r9, 64 }
197 * to some "safe" recently loaded address. 199 * to some "safe" recently loaded address.
198 * - r5 contains *(r1 + 60) [i.e. last word of source line 0] 200 * - r5 contains *(r1 + 60) [i.e. last word of source line 0]
199 * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1] 201 * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1]
200 * - r9 contains ((r0 + 63) & -64) 202 * - r9 contains ((r0 + 63) & -64)
201 * [start of next dst cache line.] 203 * [start of next dst cache line.]
202 */ 204 */
203 205
@@ -208,137 +210,137 @@ EX: { lw r28, r9; addi r9, r9, 64 }
208 /* Copy line 0, first stalling until r5 is ready. */ 210 /* Copy line 0, first stalling until r5 is ready. */
209EX: { move r12, r5; lw r16, r1 } 211EX: { move r12, r5; lw r16, r1 }
210 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } 212 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
211 /* Prefetch several lines ahead. */ 213 /* Prefetch several lines ahead. */
212EX: { lw r5, r3; addi r3, r3, 64 } 214EX: { lw r5, r3; addi r3, r3, 64 }
213 { jal .Lcopy_line } 215 { jal .Lcopy_line }
214 216
215 /* Copy line 1, first stalling until r6 is ready. */ 217 /* Copy line 1, first stalling until r6 is ready. */
216EX: { move r12, r6; lw r16, r1 } 218EX: { move r12, r6; lw r16, r1 }
217 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } 219 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
218 /* Prefetch several lines ahead. */ 220 /* Prefetch several lines ahead. */
219EX: { lw r6, r3; addi r3, r3, 64 } 221EX: { lw r6, r3; addi r3, r3, 64 }
220 { jal .Lcopy_line } 222 { jal .Lcopy_line }
221 223
222 /* Copy line 2, first stalling until r7 is ready. */ 224 /* Copy line 2, first stalling until r7 is ready. */
223EX: { move r12, r7; lw r16, r1 } 225EX: { move r12, r7; lw r16, r1 }
224 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } 226 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
225 /* Prefetch several lines ahead. */ 227 /* Prefetch several lines ahead. */
226EX: { lw r7, r3; addi r3, r3, 64 } 228EX: { lw r7, r3; addi r3, r3, 64 }
227 /* Use up a caches-busy cycle by jumping back to the top of the 229 /* Use up a caches-busy cycle by jumping back to the top of the
228 * loop. Might as well get it out of the way now. 230 * loop. Might as well get it out of the way now.
229 */ 231 */
230 { j .Lbig_loop } 232 { j .Lbig_loop }
231 233
232 234
233 /* On entry: 235 /* On entry:
234 * - r0 points to the destination line. 236 * - r0 points to the destination line.
235 * - r1 points to the source line. 237 * - r1 points to the source line.
236 * - r3 is the next prefetch address. 238 * - r3 is the next prefetch address.
237 * - r9 holds the last address used for wh64. 239 * - r9 holds the last address used for wh64.
238 * - r12 = WORD_15 240 * - r12 = WORD_15
239 * - r16 = WORD_0. 241 * - r16 = WORD_0.
240 * - r17 == r1 + 16. 242 * - r17 == r1 + 16.
241 * - r27 holds saved lr to restore. 243 * - r27 holds saved lr to restore.
242 * 244 *
243 * On exit: 245 * On exit:
244 * - r0 is incremented by 64. 246 * - r0 is incremented by 64.
245 * - r1 is incremented by 64, unless that would point to a word 247 * - r1 is incremented by 64, unless that would point to a word
246 * beyond the end of the source array, in which case it is redirected 248 * beyond the end of the source array, in which case it is redirected
247 * to point to an arbitrary word already in the cache. 249 * to point to an arbitrary word already in the cache.
248 * - r2 is decremented by 64. 250 * - r2 is decremented by 64.
249 * - r3 is unchanged, unless it points to a word beyond the 251 * - r3 is unchanged, unless it points to a word beyond the
250 * end of the source array, in which case it is redirected 252 * end of the source array, in which case it is redirected
251 * to point to an arbitrary word already in the cache. 253 * to point to an arbitrary word already in the cache.
252 * Redirecting is OK since if we are that close to the end 254 * Redirecting is OK since if we are that close to the end
253 * of the array we will not come back to this subroutine 255 * of the array we will not come back to this subroutine
254 * and use the contents of the prefetched address. 256 * and use the contents of the prefetched address.
255 * - r4 is nonzero iff r2 >= 64. 257 * - r4 is nonzero iff r2 >= 64.
256 * - r9 is incremented by 64, unless it points beyond the 258 * - r9 is incremented by 64, unless it points beyond the
257 * end of the last full destination cache line, in which 259 * end of the last full destination cache line, in which
258 * case it is redirected to a "safe address" that can be 260 * case it is redirected to a "safe address" that can be
259 * clobbered (sp - 64) 261 * clobbered (sp - 64)
260 * - lr contains the value in r27. 262 * - lr contains the value in r27.
261 */ 263 */
262 264
263/* r26 unused */ 265/* r26 unused */
264 266
265.Lcopy_line: 267.Lcopy_line:
266 /* TODO: when r3 goes past the end, we would like to redirect it 268 /* TODO: when r3 goes past the end, we would like to redirect it
267 * to prefetch the last partial cache line (if any) just once, for the 269 * to prefetch the last partial cache line (if any) just once, for the
268 * benefit of the final cleanup loop. But we don't want to 270 * benefit of the final cleanup loop. But we don't want to
269 * prefetch that line more than once, or subsequent prefetches 271 * prefetch that line more than once, or subsequent prefetches
270 * will go into the RTF. But then .Lbig_loop should unconditionally 272 * will go into the RTF. But then .Lbig_loop should unconditionally
271 * branch to top of loop to execute final prefetch, and its 273 * branch to top of loop to execute final prefetch, and its
272 * nop should become a conditional branch. 274 * nop should become a conditional branch.
273 */ 275 */
274 276
275 /* We need two non-memory cycles here to cover the resources 277 /* We need two non-memory cycles here to cover the resources
276 * used by the loads initiated by the caller. 278 * used by the loads initiated by the caller.
277 */ 279 */
278 { add r15, r1, r2 } 280 { add r15, r1, r2 }
279.Lcopy_line2: 281.Lcopy_line2:
280 { slt_u r13, r3, r15; addi r17, r1, 16 } 282 { slt_u r13, r3, r15; addi r17, r1, 16 }
281 283
282 /* NOTE: this will stall for one cycle as L1 is busy. */ 284 /* NOTE: this will stall for one cycle as L1 is busy. */
283 285
284 /* Fill second L1D line. */ 286 /* Fill second L1D line. */
285EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ 287EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
286 288
287#if CHIP_HAS_WH64() 289#if CHIP_HAS_WH64()
288 /* Prepare destination line for writing. */ 290 /* Prepare destination line for writing. */
289EX: { wh64 r9; addi r9, r9, 64 } 291EX: { wh64 r9; addi r9, r9, 64 }
290#else 292#else
291 /* Prefetch dest line */ 293 /* Prefetch dest line */
292 { prefetch r9; addi r9, r9, 64 } 294 { prefetch r9; addi r9, r9, 64 }
293#endif 295#endif
294 /* Load seven words that are L1D hits to cover wh64 L2 usage. */ 296 /* Load seven words that are L1D hits to cover wh64 L2 usage. */
295 297
296 /* Load the three remaining words from the last L1D line, which 298 /* Load the three remaining words from the last L1D line, which
297 * we know has already filled the L1D. 299 * we know has already filled the L1D.
298 */ 300 */
299EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */ 301EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */
300EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */ 302EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */
301EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */ 303EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */
302 304
303 /* Load the three remaining words from the first L1D line, first 305 /* Load the three remaining words from the first L1D line, first
304 * stalling until it has filled by "looking at" r16. 306 * stalling until it has filled by "looking at" r16.
305 */ 307 */
306EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */ 308EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */
307EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */ 309EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */
308EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */ 310EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */
309 311
310 /* Load second word from the second L1D line, first 312 /* Load second word from the second L1D line, first
311 * stalling until it has filled by "looking at" r17. 313 * stalling until it has filled by "looking at" r17.
312 */ 314 */
313EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */ 315EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */
314 316
315 /* Store last word to the destination line, potentially dirtying it 317 /* Store last word to the destination line, potentially dirtying it
316 * for the first time, which keeps the L2 busy for two cycles. 318 * for the first time, which keeps the L2 busy for two cycles.
317 */ 319 */
318EX: { sw r10, r12 } /* store(WORD_15) */ 320EX: { sw r10, r12 } /* store(WORD_15) */
319 321
320 /* Use two L1D hits to cover the sw L2 access above. */ 322 /* Use two L1D hits to cover the sw L2 access above. */
321EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */ 323EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */
322EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */ 324EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */
323 325
324 /* Fill third L1D line. */ 326 /* Fill third L1D line. */
325EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ 327EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */
326 328
327 /* Store first L1D line. */ 329 /* Store first L1D line. */
328EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ 330EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
329EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ 331EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
330EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ 332EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
331#if CHIP_HAS_WH64() 333#if CHIP_HAS_WH64()
332EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ 334EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
333#else 335#else
334 /* Back up the r9 to a cache line we are already storing to 336 /* Back up the r9 to a cache line we are already storing to
335 * if it gets past the end of the dest vector. Strictly speaking, 337 * if it gets past the end of the dest vector. Strictly speaking,
336 * we don't need to back up to the start of a cache line, but it's free 338 * we don't need to back up to the start of a cache line, but it's free
337 * and tidy, so why not? 339 * and tidy, so why not?
338 */ 340 */
339EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ 341EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
340#endif 342#endif
341 /* Store second L1D line. */ 343 /* Store second L1D line. */
342EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ 344EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
343EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */ 345EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
344EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */ 346EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */
@@ -348,30 +350,30 @@ EX: { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */
348EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */ 350EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */
349EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */ 351EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */
350 352
351 /* Store third L1D line. */ 353 /* Store third L1D line. */
352EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */ 354EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */
353EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */ 355EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */
354EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */ 356EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */
355EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */ 357EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */
356 358
357 /* Store rest of fourth L1D line. */ 359 /* Store rest of fourth L1D line. */
358EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */ 360EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */
359 { 361 {
360EX: sw r0, r8 /* store(WORD_13) */ 362EX: sw r0, r8 /* store(WORD_13) */
361 addi r0, r0, 4 363 addi r0, r0, 4
362 /* Will r2 be > 64 after we subtract 64 below? */ 364 /* Will r2 be > 64 after we subtract 64 below? */
363 shri r4, r2, 7 365 shri r4, r2, 7
364 } 366 }
365 { 367 {
366EX: sw r0, r11 /* store(WORD_14) */ 368EX: sw r0, r11 /* store(WORD_14) */
367 addi r0, r0, 8 369 addi r0, r0, 8
368 /* Record 64 bytes successfully copied. */ 370 /* Record 64 bytes successfully copied. */
369 addi r2, r2, -64 371 addi r2, r2, -64
370 } 372 }
371 373
372 { jrp lr; move lr, r27 } 374 { jrp lr; move lr, r27 }
373 375
374 /* Convey to the backtrace library that the stack frame is size 376 /* Convey to the backtrace library that the stack frame is size
375 * zero, and the real return address is on the stack rather than 377 * zero, and the real return address is on the stack rather than
376 * in 'lr'. 378 * in 'lr'.
377 */ 379 */
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
new file mode 100644
index 000000000000..3fab9a6a2bbe
--- /dev/null
+++ b/arch/tile/lib/memcpy_64.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18#define __memcpy memcpy
19/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
20
21/* Must be 8 bytes in size. */
22#define word_t uint64_t
23
24#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
25#error "Assumes 64 or 128 byte line size"
26#endif
27
28/* How many cache lines ahead should we prefetch? */
29#define PREFETCH_LINES_AHEAD 3
30
31/*
32 * Provide "base versions" of load and store for the normal code path.
33 * The kernel provides other versions for userspace copies.
34 */
35#define ST(p, v) (*(p) = (v))
36#define LD(p) (*(p))
37
38#ifndef USERCOPY_FUNC
39#define ST1 ST
40#define ST2 ST
41#define ST4 ST
42#define ST8 ST
43#define LD1 LD
44#define LD2 LD
45#define LD4 LD
46#define LD8 LD
47#define RETVAL dstv
48void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
49#else
50/*
51 * Special kernel version will provide implementation of the LDn/STn
52 * macros to return a count of uncopied bytes due to mm fault.
53 */
54#define RETVAL 0
55int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
56#endif
57{
58 char *__restrict dst1 = (char *)dstv;
59 const char *__restrict src1 = (const char *)srcv;
60 const char *__restrict src1_end;
61 const char *__restrict prefetch;
62 word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
63 word_t final; /* Final bytes to write to trailing word, if any */
64 long i;
65
66 if (n < 16) {
67 for (; n; n--)
68 ST1(dst1++, LD1(src1++));
69 return RETVAL;
70 }
71
72 /*
73 * Locate the end of source memory we will copy. Don't
74 * prefetch past this.
75 */
76 src1_end = src1 + n - 1;
77
78 /* Prefetch ahead a few cache lines, but not past the end. */
79 prefetch = src1;
80 for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
81 __insn_prefetch(prefetch);
82 prefetch += CHIP_L2_LINE_SIZE();
83 prefetch = (prefetch > src1_end) ? prefetch : src1;
84 }
85
86 /* Copy bytes until dst is word-aligned. */
87 for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
88 ST1(dst1++, LD1(src1++));
89
90 /* 8-byte pointer to destination memory. */
91 dst8 = (word_t *)dst1;
92
93 if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
94 /*
95 * Misaligned copy. Copy 8 bytes at a time, but don't
96 * bother with other fanciness.
97 *
98 * TODO: Consider prefetching and using wh64 as well.
99 */
100
101 /* Create an aligned src8. */
102 const word_t *__restrict src8 =
103 (const word_t *)((uintptr_t)src1 & -sizeof(word_t));
104 word_t b;
105
106 word_t a = LD8(src8++);
107 for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
108 b = LD8(src8++);
109 a = __insn_dblalign(a, b, src1);
110 ST8(dst8++, a);
111 a = b;
112 }
113
114 if (n == 0)
115 return RETVAL;
116
117 b = ((const char *)src8 <= src1_end) ? *src8 : 0;
118
119 /*
120 * Final source bytes to write to trailing partial
121 * word, if any.
122 */
123 final = __insn_dblalign(a, b, src1);
124 } else {
125 /* Aligned copy. */
126
127 const word_t* __restrict src8 = (const word_t *)src1;
128
129 /* src8 and dst8 are both word-aligned. */
130 if (n >= CHIP_L2_LINE_SIZE()) {
131 /* Copy until 'dst' is cache-line-aligned. */
132 for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
133 n -= sizeof(word_t))
134 ST8(dst8++, LD8(src8++));
135
136 for (; n >= CHIP_L2_LINE_SIZE(); ) {
137 __insn_wh64(dst8);
138
139 /*
140 * Prefetch and advance to next line
141 * to prefetch, but don't go past the end
142 */
143 __insn_prefetch(prefetch);
144 prefetch += CHIP_L2_LINE_SIZE();
145 prefetch = (prefetch > src1_end) ? prefetch :
146 (const char *)src8;
147
148 /*
149 * Copy an entire cache line. Manually
150 * unrolled to avoid idiosyncracies of
151 * compiler unrolling.
152 */
153#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
154 COPY_WORD(0);
155 COPY_WORD(1);
156 COPY_WORD(2);
157 COPY_WORD(3);
158 COPY_WORD(4);
159 COPY_WORD(5);
160 COPY_WORD(6);
161 COPY_WORD(7);
162#if CHIP_L2_LINE_SIZE() == 128
163 COPY_WORD(8);
164 COPY_WORD(9);
165 COPY_WORD(10);
166 COPY_WORD(11);
167 COPY_WORD(12);
168 COPY_WORD(13);
169 COPY_WORD(14);
170 COPY_WORD(15);
171#elif CHIP_L2_LINE_SIZE() != 64
172# error Fix code that assumes particular L2 cache line sizes
173#endif
174
175 dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
176 src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
177 }
178 }
179
180 for (; n >= sizeof(word_t); n -= sizeof(word_t))
181 ST8(dst8++, LD8(src8++));
182
183 if (__builtin_expect(n == 0, 1))
184 return RETVAL;
185
186 final = LD8(src8);
187 }
188
189 /* n != 0 if we get here. Write out any trailing bytes. */
190 dst1 = (char *)dst8;
191 if (n & 4) {
192 ST4((uint32_t *)dst1, final);
193 dst1 += 4;
194 final >>= 32;
195 n &= 3;
196 }
197 if (n & 2) {
198 ST2((uint16_t *)dst1, final);
199 dst1 += 2;
200 final >>= 16;
201 n &= 1;
202 }
203 if (n)
204 ST1((uint8_t *)dst1, final);
205
206 return RETVAL;
207}
208
209
210#ifdef USERCOPY_FUNC
211#undef ST1
212#undef ST2
213#undef ST4
214#undef ST8
215#undef LD1
216#undef LD2
217#undef LD4
218#undef LD8
219#undef USERCOPY_FUNC
220#endif
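
In the misaligned path above, __insn_dblalign stitches two aligned loads into the unaligned doubleword that the destination store needs, so the inner loop still issues only aligned loads. A rough portable approximation of that funnel shift, assuming little-endian byte order and 8-byte words and making no claim to match the intrinsic's exact semantics:

	#include <stdint.h>

	/* a holds the aligned word covering 'src', b the next aligned word;
	 * return the 8 bytes starting at the (possibly unaligned) src. */
	static inline uint64_t funnel_align(uint64_t a, uint64_t b, const void *src)
	{
		unsigned shift = ((uintptr_t)src & 7) * 8;
		return shift ? (a >> shift) | (b << (64 - shift)) : a;
	}
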
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
index dfedea7b266b..b2fe15e01075 100644
--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -54,7 +54,7 @@ typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
  * we must run with interrupts disabled to avoid the risk of some
  * other code seeing the incoherent data in our cache.  (Recall that
  * our cache is indexed by PA, so even if the other code doesn't use
- * our KM_MEMCPY virtual addresses, they'll still hit in cache using
+ * our kmap_atomic virtual addresses, they'll still hit in cache using
  * the normal VAs that aren't supposed to hit in cache.)
  */
 static void memcpy_multicache(void *dest, const void *source,
@@ -64,6 +64,7 @@ static void memcpy_multicache(void *dest, const void *source,
 	unsigned long flags, newsrc, newdst;
 	pmd_t *pmdp;
 	pte_t *ptep;
+	int type0, type1;
 	int cpu = get_cpu();
 
 	/*
@@ -77,7 +78,8 @@ static void memcpy_multicache(void *dest, const void *source,
 	sim_allow_multiple_caching(1);
 
 	/* Set up the new dest mapping */
-	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + KM_MEMCPY0;
+	type0 = kmap_atomic_idx_push();
+	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
 	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
 	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
 	ptep = pte_offset_kernel(pmdp, newdst);
@@ -87,13 +89,14 @@ static void memcpy_multicache(void *dest, const void *source,
 	}
 
 	/* Set up the new source mapping */
-	idx += (KM_MEMCPY0 - KM_MEMCPY1);
+	type1 = kmap_atomic_idx_push();
+	idx += (type0 - type1);
 	src_pte = hv_pte_set_nc(src_pte);
 	src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */
 	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
 	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
 	ptep = pte_offset_kernel(pmdp, newsrc);
-	*ptep = src_pte;   /* set_pte() would be confused by this */
+	__set_pte(ptep, src_pte); /* set_pte() would be confused by this */
 	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 
 	/* Actually move the data. */
@@ -106,7 +109,7 @@ static void memcpy_multicache(void *dest, const void *source,
 	 */
 	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
 	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-	*ptep = src_pte;   /* set_pte() would be confused by this */
+	__set_pte(ptep, src_pte); /* set_pte() would be confused by this */
 	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 
 	/*
@@ -119,6 +122,8 @@ static void memcpy_multicache(void *dest, const void *source,
 	 * We're done: notify the simulator that all is back to normal,
 	 * and re-enable interrupts and pre-emption.
 	 */
+	kmap_atomic_idx_pop();
+	kmap_atomic_idx_pop();
 	sim_allow_multiple_caching(0);
 	local_irq_restore(flags);
 	put_cpu();
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
new file mode 100644
index 000000000000..4763b3aff1cc
--- /dev/null
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -0,0 +1,86 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * Do memcpy(), but trap and return "n" when a load or store faults.
15 *
16 * Note: this idiom only works when memcpy() compiles to a leaf function.
17 * If "sp" is updated during memcpy, the "jrp lr" will be incorrect.
18 *
19 * Also note that we are capturing "n" from the containing scope here.
20 */
21
22#define _ST(p, inst, v) \
23 ({ \
24 asm("1: " #inst " %0, %1;" \
25 ".pushsection .coldtext.memcpy,\"ax\";" \
26 "2: { move r0, %2; jrp lr };" \
27 ".section __ex_table,\"a\";" \
28 ".quad 1b, 2b;" \
29 ".popsection" \
30 : "=m" (*(p)) : "r" (v), "r" (n)); \
31 })
32
33#define _LD(p, inst) \
34 ({ \
35 unsigned long __v; \
36 asm("1: " #inst " %0, %1;" \
37 ".pushsection .coldtext.memcpy,\"ax\";" \
38 "2: { move r0, %2; jrp lr };" \
39 ".section __ex_table,\"a\";" \
40 ".quad 1b, 2b;" \
41 ".popsection" \
42 : "=r" (__v) : "m" (*(p)), "r" (n)); \
43 __v; \
44 })
45
46#define USERCOPY_FUNC __copy_to_user_inatomic
47#define ST1(p, v) _ST((p), st1, (v))
48#define ST2(p, v) _ST((p), st2, (v))
49#define ST4(p, v) _ST((p), st4, (v))
50#define ST8(p, v) _ST((p), st, (v))
51#define LD1 LD
52#define LD2 LD
53#define LD4 LD
54#define LD8 LD
55#include "memcpy_64.c"
56
57#define USERCOPY_FUNC __copy_from_user_inatomic
58#define ST1 ST
59#define ST2 ST
60#define ST4 ST
61#define ST8 ST
62#define LD1(p) _LD((p), ld1u)
63#define LD2(p) _LD((p), ld2u)
64#define LD4(p) _LD((p), ld4u)
65#define LD8(p) _LD((p), ld)
66#include "memcpy_64.c"
67
68#define USERCOPY_FUNC __copy_in_user_inatomic
69#define ST1(p, v) _ST((p), st1, (v))
70#define ST2(p, v) _ST((p), st2, (v))
71#define ST4(p, v) _ST((p), st4, (v))
72#define ST8(p, v) _ST((p), st, (v))
73#define LD1(p) _LD((p), ld1u)
74#define LD2(p) _LD((p), ld2u)
75#define LD4(p) _LD((p), ld4u)
76#define LD8(p) _LD((p), ld)
77#include "memcpy_64.c"
78
79unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
80 unsigned long n)
81{
82 unsigned long rc = __copy_from_user_inatomic(to, from, n);
83 if (unlikely(rc))
84 memset(to + n - rc, 0, rc);
85 return rc;
86}
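
The file above compiles memcpy_64.c three more times, each with LD*/ST* macros that wrap the access in an exception-table fixup, so one copy loop serves memcpy() and the __copy_to/from/in_user_inatomic() variants. A stripped-down sketch of that "one body, many instantiations" pattern, with entirely made-up file and function names for illustration:

	/* copy_body.h: shared loop, parameterized by FUNC, LD and ST. */
	int FUNC(void *dst, const void *src, size_t n)
	{
		char *d = dst;
		const char *s = src;
		while (n--)
			ST(d++, LD(s++));
		return 0;		/* a fixup variant would return bytes left */
	}

	/* A user of the header instantiates one variant per inclusion: */
	#define FUNC	plain_copy
	#define LD(p)	(*(p))
	#define ST(p, v)	(*(p) = (v))
	#include "copy_body.h"
	#undef FUNC
	#undef LD
	#undef ST
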
diff --git a/arch/tile/lib/memmove_32.c b/arch/tile/lib/memmove.c
index fd615ae6ade7..fd615ae6ade7 100644
--- a/arch/tile/lib/memmove_32.c
+++ b/arch/tile/lib/memmove.c
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index d014c1fbcbc2..57dbb3a5bff8 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -18,6 +18,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
+#undef memset
 
 void *memset(void *s, int c, size_t n)
 {
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
new file mode 100644
index 000000000000..3873085711d5
--- /dev/null
+++ b/arch/tile/lib/memset_64.c
@@ -0,0 +1,145 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <arch/chip.h>
16
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/module.h>
20
21#undef memset
22
23void *memset(void *s, int c, size_t n)
24{
25 uint64_t *out64;
26 int n64, to_align64;
27 uint64_t v64;
28 uint8_t *out8 = s;
29
30 /* Experimentation shows that a trivial tight loop is a win up until
31 * around a size of 20, where writing a word at a time starts to win.
32 */
33#define BYTE_CUTOFF 20
34
35#if BYTE_CUTOFF < 7
36 /* This must be at least at least this big, or some code later
37 * on doesn't work.
38 */
39#error "BYTE_CUTOFF is too small"
40#endif
41
42 if (n < BYTE_CUTOFF) {
43 /* Strangely, this turns out to be the tightest way to
44 * write this loop.
45 */
46 if (n != 0) {
47 do {
48 /* Strangely, combining these into one line
49 * performs worse.
50 */
51 *out8 = c;
52 out8++;
53 } while (--n != 0);
54 }
55
56 return s;
57 }
58
59 /* Align 'out8'. We know n >= 7 so this won't write past the end. */
60 while (((uintptr_t) out8 & 7) != 0) {
61 *out8++ = c;
62 --n;
63 }
64
65 /* Align 'n'. */
66 while (n & 7)
67 out8[--n] = c;
68
69 out64 = (uint64_t *) out8;
70 n64 = n >> 3;
71
72 /* Tile input byte out to 64 bits. */
73 /* KLUDGE */
74 v64 = 0x0101010101010101ULL * (uint8_t)c;
75
76 /* This must be at least 8 or the following loop doesn't work. */
77#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
78
 79 /* Determine how many words we need to emit before the 'out64'
80 * pointer becomes aligned modulo the cache line size.
81 */
82 to_align64 = (-((uintptr_t)out64 >> 3)) &
83 (CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1);
84
85 /* Only bother aligning and using wh64 if there is at least
86 * one full cache line to process. This check also prevents
87 * overrunning the end of the buffer with alignment words.
88 */
89 if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) {
90 int lines_left;
91
92 /* Align out64 mod the cache line size so we can use wh64. */
93 n64 -= to_align64;
94 for (; to_align64 != 0; to_align64--) {
95 *out64 = v64;
96 out64++;
97 }
98
99 /* Use unsigned divide to turn this into a right shift. */
100 lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS;
101
102 do {
103 /* Only wh64 a few lines at a time, so we don't
104 * exceed the maximum number of victim lines.
105 */
106 int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
107 ? lines_left
108 : CHIP_MAX_OUTSTANDING_VICTIMS());
109 uint64_t *wh = out64;
110 int i = x;
111 int j;
112
113 lines_left -= x;
114
115 do {
116 __insn_wh64(wh);
117 wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS;
118 } while (--i);
119
120 for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4);
121 j != 0; j--) {
122 *out64++ = v64;
123 *out64++ = v64;
124 *out64++ = v64;
125 *out64++ = v64;
126 }
127 } while (lines_left != 0);
128
129 /* We processed all full lines above, so only this many
130 * words remain to be processed.
131 */
132 n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1;
133 }
134
135 /* Now handle any leftover values. */
136 if (n64 != 0) {
137 do {
138 *out64 = v64;
139 out64++;
140 } while (--n64 != 0);
141 }
142
143 return s;
144}
145EXPORT_SYMBOL(memset);
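A portable sketch of the core idea in the new memset_64.c, assuming only the byte-replication and word-store steps matter; the tile-specific wh64 cache-line priming is omitted and the function name is illustrative.

#include <stdint.h>
#include <stddef.h>

void *memset_word_sketch(void *s, int c, size_t n)
{
	uint8_t *out8 = s;
	uint64_t *out64;
	const uint64_t v64 = 0x0101010101010101ULL * (uint8_t)c; /* byte -> word */

	/* Byte stores until the pointer is 8-byte aligned (or n runs out). */
	while (n != 0 && ((uintptr_t)out8 & 7) != 0) {
		*out8++ = (uint8_t)c;
		n--;
	}

	/* Word stores for the bulk of the buffer. */
	out64 = (uint64_t *)out8;
	while (n >= 8) {
		*out64++ = v64;
		n -= 8;
	}

	/* Trailing bytes. */
	out8 = (uint8_t *)out64;
	while (n != 0) {
		*out8++ = (uint8_t)c;
		n--;
	}
	return s;
}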
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 485e24d62c6b..cb0999fb64b4 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -15,6 +15,7 @@
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <asm/processor.h> 17#include <asm/processor.h>
18#include <arch/spr_def.h>
18 19
19#include "spinlock_common.h" 20#include "spinlock_common.h"
20 21
@@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);
91#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1) 92#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1)
92 93
93 94
94/* Lock the word, spinning until there are no tns-ers. */ 95/*
95static inline u32 get_rwlock(arch_rwlock_t *rwlock) 96 * We can get the read lock if everything but the reader bits (which
96{ 97 * are in the high part of the word) is zero, i.e. no active or
97 u32 iterations = 0; 98 * waiting writers, no tns.
98 for (;;) { 99 *
99 u32 val = __insn_tns((int *)&rwlock->lock); 100 * We guard the tns/store-back with an interrupt critical section to
100 if (unlikely(val & 1)) { 101 * preserve the semantic that the same read lock can be acquired in an
101 delay_backoff(iterations++); 102 * interrupt context.
102 continue; 103 */
103 } 104inline int arch_read_trylock(arch_rwlock_t *rwlock)
104 return val;
105 }
106}
107
108int arch_read_trylock_slow(arch_rwlock_t *rwlock)
109{
110 u32 val = get_rwlock(rwlock);
111 int locked = (val << RD_COUNT_WIDTH) == 0;
112 rwlock->lock = val + (locked << RD_COUNT_SHIFT);
113 return locked;
114}
115EXPORT_SYMBOL(arch_read_trylock_slow);
116
117void arch_read_unlock_slow(arch_rwlock_t *rwlock)
118{
119 u32 val = get_rwlock(rwlock);
120 rwlock->lock = val - (1 << RD_COUNT_SHIFT);
121}
122EXPORT_SYMBOL(arch_read_unlock_slow);
123
124void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val)
125{ 105{
126 u32 eq, mask = 1 << WR_CURR_SHIFT; 106 u32 val;
127 while (unlikely(val & 1)) { 107 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
128 /* Limited backoff since we are the highest-priority task. */ 108 val = __insn_tns((int *)&rwlock->lock);
129 relax(4); 109 if (likely((val << _RD_COUNT_WIDTH) == 0)) {
130 val = __insn_tns((int *)&rwlock->lock); 110 val += 1 << RD_COUNT_SHIFT;
111 rwlock->lock = val;
112 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
113 BUG_ON(val == 0); /* we don't expect wraparound */
114 return 1;
131 } 115 }
132 val = __insn_addb(val, mask); 116 if ((val & 1) == 0)
133 eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); 117 rwlock->lock = val;
134 val = __insn_mz(eq & mask, val); 118 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
135 rwlock->lock = val; 119 return 0;
136} 120}
137EXPORT_SYMBOL(arch_write_unlock_slow); 121EXPORT_SYMBOL(arch_read_trylock);
138 122
139/* 123/*
140 * We spin until everything but the reader bits (which are in the high 124 * Spin doing arch_read_trylock() until we acquire the lock.
141 * part of the word) are zero, i.e. no active or waiting writers, no tns.
142 *
143 * ISSUE: This approach can permanently starve readers. A reader who sees 125 * ISSUE: This approach can permanently starve readers. A reader who sees
144 * a writer could instead take a ticket lock (just like a writer would), 126 * a writer could instead take a ticket lock (just like a writer would),
145 * and atomically enter read mode (with 1 reader) when it gets the ticket. 127 * and atomically enter read mode (with 1 reader) when it gets the ticket.
146 * This way both readers and writers will always make forward progress 128 * This way both readers and writers would always make forward progress
147 * in a finite time. 129 * in a finite time.
148 */ 130 */
149void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val) 131void arch_read_lock(arch_rwlock_t *rwlock)
150{ 132{
151 u32 iterations = 0; 133 u32 iterations = 0;
152 do { 134 while (unlikely(!arch_read_trylock(rwlock)))
153 if (!(val & 1))
154 rwlock->lock = val;
155 delay_backoff(iterations++); 135 delay_backoff(iterations++);
136}
137EXPORT_SYMBOL(arch_read_lock);
138
139void arch_read_unlock(arch_rwlock_t *rwlock)
140{
141 u32 val, iterations = 0;
142
143 mb(); /* guarantee anything modified under the lock is visible */
144 for (;;) {
145 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
156 val = __insn_tns((int *)&rwlock->lock); 146 val = __insn_tns((int *)&rwlock->lock);
 157 } while ((val << RD_COUNT_WIDTH) != 0); 147 if (likely((val & 1) == 0)) {
158 rwlock->lock = val + (1 << RD_COUNT_SHIFT); 148 rwlock->lock = val - (1 << _RD_COUNT_SHIFT);
149 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
150 break;
151 }
152 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
153 delay_backoff(iterations++);
154 }
159} 155}
160EXPORT_SYMBOL(arch_read_lock_slow); 156EXPORT_SYMBOL(arch_read_unlock);
161 157
162void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) 158/*
159 * We don't need an interrupt critical section here (unlike for
160 * arch_read_lock) since we should never use a bare write lock where
161 * it could be interrupted by code that could try to re-acquire it.
162 */
163void arch_write_lock(arch_rwlock_t *rwlock)
163{ 164{
164 /* 165 /*
165 * The trailing underscore on this variable (and curr_ below) 166 * The trailing underscore on this variable (and curr_ below)
@@ -167,23 +168,36 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
167 * when we compare them. 168 * when we compare them.
168 */ 169 */
169 u32 my_ticket_; 170 u32 my_ticket_;
171 u32 iterations = 0;
172 u32 val = __insn_tns((int *)&rwlock->lock);
170 173
171 /* Take out the next ticket; this will also stop would-be readers. */ 174 if (likely(val == 0)) {
172 if (val & 1) 175 rwlock->lock = 1 << _WR_NEXT_SHIFT;
173 val = get_rwlock(rwlock); 176 return;
174 rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); 177 }
175
176 /* Extract my ticket value from the original word. */
177 my_ticket_ = val >> WR_NEXT_SHIFT;
178 178
179 /* 179 /*
180 * Wait until the "current" field matches our ticket, and 180 * Wait until there are no readers, then bump up the next
181 * there are no remaining readers. 181 * field and capture the ticket value.
182 */ 182 */
183 for (;;) { 183 for (;;) {
184 if (!(val & 1)) {
185 if ((val >> RD_COUNT_SHIFT) == 0)
186 break;
187 rwlock->lock = val;
188 }
189 delay_backoff(iterations++);
190 val = __insn_tns((int *)&rwlock->lock);
191 }
192
193 /* Take out the next ticket and extract my ticket value. */
194 rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT);
195 my_ticket_ = val >> WR_NEXT_SHIFT;
196
197 /* Wait until the "current" field matches our ticket. */
198 for (;;) {
184 u32 curr_ = val >> WR_CURR_SHIFT; 199 u32 curr_ = val >> WR_CURR_SHIFT;
185 u32 readers = val >> RD_COUNT_SHIFT; 200 u32 delta = ((my_ticket_ - curr_) & WR_MASK);
186 u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers;
187 if (likely(delta == 0)) 201 if (likely(delta == 0))
188 break; 202 break;
189 203
@@ -199,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
199 relax(4); 213 relax(4);
200 } 214 }
201} 215}
202EXPORT_SYMBOL(arch_write_lock_slow); 216EXPORT_SYMBOL(arch_write_lock);
203 217
204int __tns_atomic_acquire(atomic_t *lock) 218int arch_write_trylock(arch_rwlock_t *rwlock)
205{ 219{
206 int ret; 220 u32 val = __insn_tns((int *)&rwlock->lock);
207 u32 iterations = 0;
208 221
209 BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION)); 222 /*
210 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); 223 * If a tns is in progress, or there's a waiting or active locker,
224 * or active readers, we can't take the lock, so give up.
225 */
226 if (unlikely(val != 0)) {
227 if (!(val & 1))
228 rwlock->lock = val;
229 return 0;
230 }
211 231
212 while ((ret = __insn_tns((void *)&lock->counter)) == 1) 232 /* Set the "next" field to mark it locked. */
213 delay_backoff(iterations++); 233 rwlock->lock = 1 << _WR_NEXT_SHIFT;
214 return ret; 234 return 1;
215} 235}
236EXPORT_SYMBOL(arch_write_trylock);
216 237
217void __tns_atomic_release(atomic_t *p, int v) 238void arch_write_unlock(arch_rwlock_t *rwlock)
218{ 239{
219 p->counter = v; 240 u32 val, eq, mask;
220 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); 241
242 mb(); /* guarantee anything modified under the lock is visible */
243 val = __insn_tns((int *)&rwlock->lock);
244 if (likely(val == (1 << _WR_NEXT_SHIFT))) {
245 rwlock->lock = 0;
246 return;
247 }
248 while (unlikely(val & 1)) {
249 /* Limited backoff since we are the highest-priority task. */
250 relax(4);
251 val = __insn_tns((int *)&rwlock->lock);
252 }
253 mask = 1 << WR_CURR_SHIFT;
254 val = __insn_addb(val, mask);
255 eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
256 val = __insn_mz(eq & mask, val);
257 rwlock->lock = val;
221} 258}
259EXPORT_SYMBOL(arch_write_unlock);
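A sketch of the rwlock word layout the new arch_read_trylock() path relies on. The field positions below are assumptions inferred from the shifts used in the code; the real definitions live in the tile spinlock headers.

#include <stdint.h>

/*
 * Assumed layout of the 32-bit rwlock word:
 *   bit 0       - tns marker: someone is busy updating the word
 *   middle bits - writer "current" and "next" ticket fields
 *   high bits   - reader count (RD_COUNT_SHIFT / RD_COUNT_WIDTH)
 */
#define RD_COUNT_SHIFT	24
#define RD_COUNT_WIDTH	8

/* A read lock may be taken only when everything below the reader count
 * is zero: no tns in progress, no active or waiting writer. */
static inline int read_lock_available(uint32_t val)
{
	return (val << RD_COUNT_WIDTH) == 0;
}

/* Taking or dropping a read lock just moves the count in the high bits. */
static inline uint32_t add_reader(uint32_t val)
{
	return val + (1u << RD_COUNT_SHIFT);
}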
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c
new file mode 100644
index 000000000000..d6fb9581e980
--- /dev/null
+++ b/arch/tile/lib/spinlock_64.c
@@ -0,0 +1,104 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <asm/processor.h>
18
19#include "spinlock_common.h"
20
21/*
22 * Read the spinlock value without allocating in our cache and without
23 * causing an invalidation to another cpu with a copy of the cacheline.
24 * This is important when we are spinning waiting for the lock.
25 */
26static inline u32 arch_spin_read_noalloc(void *lock)
27{
28 return atomic_cmpxchg((atomic_t *)lock, -1, -1);
29}
30
31/*
32 * Wait until the high bits (current) match my ticket.
33 * If we notice the overflow bit set on entry, we clear it.
34 */
35void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket)
36{
37 if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) {
38 __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW);
39 my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW;
40 }
41
42 for (;;) {
43 u32 val = arch_spin_read_noalloc(lock);
44 u32 delta = my_ticket - arch_spin_current(val);
45 if (delta == 0)
46 return;
47 relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
48 }
49}
50EXPORT_SYMBOL(arch_spin_lock_slow);
51
52/*
53 * Check the lock to see if it is plausible, and try to get it with cmpxchg().
54 */
55int arch_spin_trylock(arch_spinlock_t *lock)
56{
57 u32 val = arch_spin_read_noalloc(lock);
58 if (unlikely(arch_spin_current(val) != arch_spin_next(val)))
59 return 0;
60 return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW)
61 == val;
62}
63EXPORT_SYMBOL(arch_spin_trylock);
64
65void arch_spin_unlock_wait(arch_spinlock_t *lock)
66{
67 u32 iterations = 0;
68 while (arch_spin_is_locked(lock))
69 delay_backoff(iterations++);
70}
71EXPORT_SYMBOL(arch_spin_unlock_wait);
72
73/*
74 * If the read lock fails due to a writer, we retry periodically
75 * until the value is positive and we write our incremented reader count.
76 */
77void __read_lock_failed(arch_rwlock_t *rw)
78{
79 u32 val;
80 int iterations = 0;
81 do {
82 delay_backoff(iterations++);
83 val = __insn_fetchaddgez4(&rw->lock, 1);
84 } while (unlikely(arch_write_val_locked(val)));
85}
86EXPORT_SYMBOL(__read_lock_failed);
87
88/*
89 * If we failed because there were readers, clear the "writer" bit
90 * so we don't block additional readers. Otherwise, there was another
91 * writer anyway, so our "fetchor" made no difference. Then wait,
92 * issuing periodic fetchor instructions, till we get the lock.
93 */
94void __write_lock_failed(arch_rwlock_t *rw, u32 val)
95{
96 int iterations = 0;
97 do {
98 if (!arch_write_val_locked(val))
99 val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT);
100 delay_backoff(iterations++);
101 val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
102 } while (val != 0);
103}
104EXPORT_SYMBOL(__write_lock_failed);
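A portable sketch of the ticket shape that arch_spin_trylock() above depends on, assuming 16-bit "current"/"next" halves; the real tile field widths and the __ARCH_SPIN_NEXT_OVERFLOW handling differ, and the GCC __atomic builtins stand in for the tile cmpxchg.

#include <stdint.h>

typedef struct { uint32_t lock; } ticket_lock_t;	/* illustrative type */

static inline uint32_t ticket_current(uint32_t v) { return v >> 16; }
static inline uint32_t ticket_next(uint32_t v)    { return v & 0xffffu; }

/* Try once: succeed only if "current" and "next" agree, i.e. nobody
 * holds or is waiting for the lock.  The ticket we take is then served
 * immediately, and bumping "next" makes later arrivals queue behind us. */
static int ticket_trylock(ticket_lock_t *l)
{
	uint32_t v = __atomic_load_n(&l->lock, __ATOMIC_RELAXED);

	if (ticket_current(v) != ticket_next(v))
		return 0;
	return __atomic_compare_exchange_n(&l->lock, &v, v + 1, 0,
					   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}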
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
new file mode 100644
index 000000000000..617a9273aaa8
--- /dev/null
+++ b/arch/tile/lib/strchr_64.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19#undef strchr
20
21char *strchr(const char *s, int c)
22{
23 int z, g;
24
25 /* Get an aligned pointer. */
26 const uintptr_t s_int = (uintptr_t) s;
27 const uint64_t *p = (const uint64_t *)(s_int & -8);
28
29 /* Create eight copies of the byte for which we are looking. */
30 const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
31
32 /* Read the first aligned word, but force bytes before the string to
33 * match neither zero nor goal (we make sure the high bit of each
34 * byte is 1, and the low 7 bits are all the opposite of the goal
35 * byte).
36 *
37 * Note that this shift count expression works because we know shift
38 * counts are taken mod 64.
39 */
40 const uint64_t before_mask = (1ULL << (s_int << 3)) - 1;
41 uint64_t v = (*p | before_mask) ^
42 (goal & __insn_v1shrsi(before_mask, 1));
43
44 uint64_t zero_matches, goal_matches;
45 while (1) {
46 /* Look for a terminating '\0'. */
47 zero_matches = __insn_v1cmpeqi(v, 0);
48
49 /* Look for the goal byte. */
50 goal_matches = __insn_v1cmpeq(v, goal);
51
52 if (__builtin_expect((zero_matches | goal_matches) != 0, 0))
53 break;
54
55 v = *++p;
56 }
57
58 z = __insn_ctz(zero_matches);
59 g = __insn_ctz(goal_matches);
60
61 /* If we found c before '\0' we got a match. Note that if c == '\0'
62 * then g == z, and we correctly return the address of the '\0'
63 * rather than NULL.
64 */
65 return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
66}
67EXPORT_SYMBOL(strchr);
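The v1cmpeqi/v1cmpeq instructions used above compare all eight bytes of a word in one step; a portable stand-in is the classic SWAR zero-byte test, sketched here (constant and function names are illustrative).

#include <stdint.h>

#define ONES	0x0101010101010101ULL
#define HIGHS	0x8080808080808080ULL

/* Nonzero iff some byte of w is zero. */
static inline uint64_t has_zero_byte(uint64_t w)
{
	return (w - ONES) & ~w & HIGHS;
}

/* Nonzero iff some byte of w equals c: XOR turns matching bytes into
 * zero bytes, then the zero-byte test finds them. */
static inline uint64_t has_byte(uint64_t w, unsigned char c)
{
	return has_zero_byte(w ^ (ONES * c));
}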
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index f26f88e11e4a..4974292a5534 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,6 +16,8 @@
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#undef strlen
20
19size_t strlen(const char *s) 21size_t strlen(const char *s)
20{ 22{
21 /* Get an aligned pointer. */ 23 /* Get an aligned pointer. */
diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c
new file mode 100644
index 000000000000..1c92d46202a8
--- /dev/null
+++ b/arch/tile/lib/strlen_64.c
@@ -0,0 +1,38 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19#undef strlen
20
21size_t strlen(const char *s)
22{
23 /* Get an aligned pointer. */
24 const uintptr_t s_int = (uintptr_t) s;
25 const uint64_t *p = (const uint64_t *)(s_int & -8);
26
27 /* Read the first word, but force bytes before the string to be nonzero.
28 * This expression works because we know shift counts are taken mod 64.
29 */
30 uint64_t v = *p | ((1ULL << (s_int << 3)) - 1);
31
32 uint64_t bits;
33 while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
34 v = *++p;
35
36 return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
37}
38EXPORT_SYMBOL(strlen);
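The subtle step shared by strlen_64.c and strchr_64.c is masking the first aligned word so that bytes sitting before the start of the string cannot look like a terminator. A portable little-endian sketch follows; the kernel code shifts by (s_int << 3) and relies on tile's mod-64 shift counts, while the sketch computes the count explicitly, and the function name is illustrative.

#include <stdint.h>

static inline uint64_t first_word_masked(const char *s)
{
	uintptr_t s_int = (uintptr_t)s;
	/* Reading the aligned word may touch up to 7 bytes before s, but
	 * never crosses out of that word, mirroring the kernel routine. */
	const uint64_t *p = (const uint64_t *)(s_int & ~(uintptr_t)7);
	/* 8 bits per byte that precedes the string; skip is 0..56, so the
	 * shift below is always well defined. */
	unsigned skip = (unsigned)(s_int & 7) * 8;
	uint64_t before_mask = (1ULL << skip) - 1;

	/* Forcing those low bytes to 0xff means they can never match '\0'. */
	return *p | before_mask;
}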
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
new file mode 100644
index 000000000000..2ff44f87b78e
--- /dev/null
+++ b/arch/tile/lib/usercopy_64.S
@@ -0,0 +1,196 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/linkage.h>
16#include <asm/errno.h>
17#include <asm/cache.h>
18#include <arch/chip.h>
19
20/* Access user memory, but use MMU to avoid propagating kernel exceptions. */
21
22 .pushsection .fixup,"ax"
23
24get_user_fault:
25 { movei r1, -EFAULT; move r0, zero }
26 jrp lr
27 ENDPROC(get_user_fault)
28
29put_user_fault:
30 { movei r0, -EFAULT; jrp lr }
31 ENDPROC(put_user_fault)
32
33 .popsection
34
35/*
36 * __get_user_N functions take a pointer in r0, and return 0 in r1
37 * on success, with the value in r0; or else -EFAULT in r1.
38 */
39#define __get_user_N(bytes, LOAD) \
40 STD_ENTRY(__get_user_##bytes); \
411: { LOAD r0, r0; move r1, zero }; \
42 jrp lr; \
43 STD_ENDPROC(__get_user_##bytes); \
44 .pushsection __ex_table,"a"; \
45 .quad 1b, get_user_fault; \
46 .popsection
47
48__get_user_N(1, ld1u)
49__get_user_N(2, ld2u)
50__get_user_N(4, ld4u)
51__get_user_N(8, ld)
52
53/*
54 * __put_user_N functions take a value in r0 and a pointer in r1,
55 * and return 0 in r0 on success or -EFAULT on failure.
56 */
57#define __put_user_N(bytes, STORE) \
58 STD_ENTRY(__put_user_##bytes); \
591: { STORE r1, r0; move r0, zero }; \
60 jrp lr; \
61 STD_ENDPROC(__put_user_##bytes); \
62 .pushsection __ex_table,"a"; \
63 .quad 1b, put_user_fault; \
64 .popsection
65
66__put_user_N(1, st1)
67__put_user_N(2, st2)
68__put_user_N(4, st4)
69__put_user_N(8, st)
70
71/*
72 * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
73 * It returns the length, including the terminating NUL, or zero on exception.
74 * If length is greater than the bound, returns one plus the bound.
75 */
76STD_ENTRY(strnlen_user_asm)
77 { beqz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */
781: { ld1u r4, r0; addi r1, r1, -1 }
79 beqz r4, 2f
80 { bnezt r1, 1b; addi r0, r0, 1 }
812: { sub r0, r0, r3; jrp lr }
82 STD_ENDPROC(strnlen_user_asm)
83 .pushsection .fixup,"ax"
84strnlen_user_fault:
85 { move r0, zero; jrp lr }
86 ENDPROC(strnlen_user_fault)
87 .section __ex_table,"a"
88 .quad 1b, strnlen_user_fault
89 .popsection
90
91/*
92 * strncpy_from_user_asm takes the kernel target pointer in r0,
93 * the userspace source pointer in r1, and the length bound (including
94 * the trailing NUL) in r2. On success, it returns the string length
95 * (not including the trailing NUL), or -EFAULT on failure.
96 */
97STD_ENTRY(strncpy_from_user_asm)
98 { beqz r2, 2f; move r3, r0 }
991: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
100 { st1 r0, r4; addi r0, r0, 1 }
101 beqz r2, 2f
102 bnezt r4, 1b
103 addi r0, r0, -1 /* don't count the trailing NUL */
1042: { sub r0, r0, r3; jrp lr }
105 STD_ENDPROC(strncpy_from_user_asm)
106 .pushsection .fixup,"ax"
107strncpy_from_user_fault:
108 { movei r0, -EFAULT; jrp lr }
109 ENDPROC(strncpy_from_user_fault)
110 .section __ex_table,"a"
111 .quad 1b, strncpy_from_user_fault
112 .popsection
113
114/*
115 * clear_user_asm takes the user target address in r0 and the
116 * number of bytes to zero in r1.
117 * It returns the number of uncopiable bytes (hopefully zero) in r0.
118 * Note that we don't use a separate .fixup section here since we fall
119 * through into the "fixup" code as the last straight-line bundle anyway.
120 */
121STD_ENTRY(clear_user_asm)
122 { beqz r1, 2f; or r2, r0, r1 }
123 andi r2, r2, 7
124 beqzt r2, .Lclear_aligned_user_asm
1251: { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
126 bnezt r1, 1b
1272: { move r0, r1; jrp lr }
128 .pushsection __ex_table,"a"
129 .quad 1b, 2b
130 .popsection
131
132.Lclear_aligned_user_asm:
1331: { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 }
134 bnezt r1, 1b
1352: { move r0, r1; jrp lr }
136 STD_ENDPROC(clear_user_asm)
137 .pushsection __ex_table,"a"
138 .quad 1b, 2b
139 .popsection
140
141/*
142 * flush_user_asm takes the user target address in r0 and the
143 * number of bytes to flush in r1.
144 * It returns the number of unflushable bytes (hopefully zero) in r0.
145 */
146STD_ENTRY(flush_user_asm)
147 beqz r1, 2f
148 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
149 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
150 { and r0, r0, r2; and r1, r1, r2 }
151 { sub r1, r1, r0 }
1521: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
153 { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b }
1542: { move r0, r1; jrp lr }
155 STD_ENDPROC(flush_user_asm)
156 .pushsection __ex_table,"a"
157 .quad 1b, 2b
158 .popsection
159
160/*
161 * inv_user_asm takes the user target address in r0 and the
162 * number of bytes to invalidate in r1.
163 * It returns the number of not inv'able bytes (hopefully zero) in r0.
164 */
165STD_ENTRY(inv_user_asm)
166 beqz r1, 2f
167 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
168 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
169 { and r0, r0, r2; and r1, r1, r2 }
170 { sub r1, r1, r0 }
1711: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
172 { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
1732: { move r0, r1; jrp lr }
174 STD_ENDPROC(inv_user_asm)
175 .pushsection __ex_table,"a"
176 .quad 1b, 2b
177 .popsection
178
179/*
180 * finv_user_asm takes the user target address in r0 and the
181 * number of bytes to flush-invalidate in r1.
182 * It returns the number of not finv'able bytes (hopefully zero) in r0.
183 */
184STD_ENTRY(finv_user_asm)
185 beqz r1, 2f
186 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
187 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
188 { and r0, r0, r2; and r1, r1, r2 }
189 { sub r1, r1, r0 }
1901: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
191 { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b }
1922: { move r0, r1; jrp lr }
193 STD_ENDPROC(finv_user_asm)
194 .pushsection __ex_table,"a"
195 .quad 1b, 2b
196 .popsection
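The ".quad 1b, 2b" and ".quad 1b, <fault label>" directives above populate the kernel's exception table for these user accesses. A conceptual C sketch of that lookup follows; the structure and function names are illustrative, not the tile or generic kernel definitions.

#include <stdint.h>
#include <stddef.h>

struct ex_table_entry {
	uint64_t insn;	/* address of an instruction that may fault on user memory */
	uint64_t fixup;	/* where to resume if it does fault */
};

/* On a fault while touching user memory, the trap handler searches the
 * table for the faulting PC; if an entry exists, execution resumes at
 * the fixup (which typically returns -EFAULT or a residual byte count)
 * instead of treating the fault as a kernel crash. */
static uint64_t find_fixup(const struct ex_table_entry *tbl, size_t n,
			   uint64_t fault_pc)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tbl[i].insn == fault_pc)
			return tbl[i].fixup;
	return 0;	/* no fixup: genuine kernel bug */
}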