Diffstat (limited to 'arch/tile/lib')
-rw-r--r--   arch/tile/lib/Makefile          |    5
-rw-r--r--   arch/tile/lib/atomic_32.c       |    5
-rw-r--r--   arch/tile/lib/atomic_asm_32.S   |    2
-rw-r--r--   arch/tile/lib/cacheflush.c      |  102
-rw-r--r--   arch/tile/lib/delay.c           |   21
-rw-r--r--   arch/tile/lib/exports.c         |   10
-rw-r--r--   arch/tile/lib/mb_incoherent.S   |   34
-rw-r--r--   arch/tile/lib/memcpy_tile64.c   |    4
-rw-r--r--   arch/tile/lib/spinlock_32.c     |  161
9 files changed, 228 insertions, 116 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 93122d5b1558..0c26086ecbef 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -2,9 +2,8 @@
 # Makefile for TILE-specific library files..
 #
 
-lib-y = cacheflush.o checksum.o cpumask.o delay.o \
-        mb_incoherent.o uaccess.o memmove.o \
-        memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
+lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
+        memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
         strchr_$(BITS).o strlen_$(BITS).o
 
 ifeq ($(CONFIG_TILEGX),y)
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 7a5cc706ab62..f02040d3614e 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -46,14 +46,13 @@ struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 /* This page is remapped on startup to be hash-for-home. */
-int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */]
-        __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned")));
+int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 static inline int *__atomic_hashed_lock(volatile void *v)
 {
-        /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */
+        /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
 #if ATOMIC_LOCKS_FOUND_VIA_TABLE()
         unsigned long i =
                 (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
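
Context for the hunk above: __atomic_hashed_lock() picks a lock word for each atomic variable by hashing its address, either into per-cpu tables or into the page-aligned atomic_locks[] array now declared with __page_aligned_bss. The following userspace sketch illustrates only the general "address bits select a slot in a page-sized lock array" idea; PAGE_SIZE_SIM and the *_sim names are invented for illustration and loosely modeled on the masking expression visible in the context lines, not on the kernel's actual configuration.

/*
 * Illustrative sketch only: hash an address into a page-sized array of
 * lock words.  PAGE_SIZE_SIM is an assumption, not the kernel's value.
 */
#include <stdio.h>

#define PAGE_SIZE_SIM 65536UL

static int atomic_locks_sim[PAGE_SIZE_SIM / sizeof(int)];

static int *hashed_lock_sim(volatile void *v)
{
        /* Keep the page-offset bits, rounded down to an 8-byte boundary. */
        unsigned long i =
                (unsigned long)v & ((PAGE_SIZE_SIM - 1) & -sizeof(long long));
        return (int *)((unsigned long)atomic_locks_sim + i);
}

int main(void)
{
        long a, b;
        printf("&a -> slot %p\n", (void *)hashed_lock_sim(&a));
        printf("&b -> slot %p\n", (void *)hashed_lock_sim(&b));
        return 0;
}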
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 5a5514b77e78..82f64cc63658 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -14,7 +14,7 @@
  * Support routines for atomic operations.  Each function takes:
  *
  * r0: address to manipulate
- * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
+ * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
  * r2: new value to write, or for cmpxchg/add_unless, value to compare against
  * r3: (cmpxchg/xchg_add_unless) new value to write or add;
  *     (atomic64 ops) high word of value to write
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 11b6164c2097..35c1d8ca5f38 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -21,3 +21,105 @@ void __flush_icache_range(unsigned long start, unsigned long end)
 {
         invalidate_icache((const void *)start, end - start, PAGE_SIZE);
 }
+
+
+/* Force a load instruction to issue. */
+static inline void force_load(char *p)
+{
+        *(volatile char *)p;
+}
+
+/*
+ * Flush and invalidate a VA range that is homed remotely on a single
+ * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
+ * until the memory controller holds the flushed values.
+ */
+void finv_buffer_remote(void *buffer, size_t size, int hfh)
+{
+        char *p, *base;
+        size_t step_size, load_count;
+        const unsigned long STRIPE_WIDTH = 8192;
+
+        /*
+         * Flush and invalidate the buffer out of the local L1/L2
+         * and request the home cache to flush and invalidate as well.
+         */
+        __finv_buffer(buffer, size);
+
+        /*
+         * Wait for the home cache to acknowledge that it has processed
+         * all the flush-and-invalidate requests.  This does not mean
+         * that the flushed data has reached the memory controller yet,
+         * but it does mean the home cache is processing the flushes.
+         */
+        __insn_mf();
+
+        /*
+         * Issue a load to the last cache line, which can't complete
+         * until all the previously-issued flushes to the same memory
+         * controller have also completed.  If we weren't striping
+         * memory, that one load would be sufficient, but since we may
+         * be, we also need to back up to the last load issued to
+         * another memory controller, which would be the point where
+         * we crossed an 8KB boundary (the granularity of striping
+         * across memory controllers).  Keep backing up and doing this
+         * until we are before the beginning of the buffer, or have
+         * hit all the controllers.
+         *
+         * If we are flushing a hash-for-home buffer, it's even worse.
+         * Each line may be homed on a different tile, and each tile
+         * may have up to four lines that are on different
+         * controllers.  So as we walk backwards, we have to touch
+         * enough cache lines to satisfy these constraints.  In
+         * practice this ends up being close enough to "load from
+         * every cache line on a full memory stripe on each
+         * controller" that we simply do that, to simplify the logic.
+         *
+         * FIXME: See bug 9535 for some issues with this code.
+         */
+        if (hfh) {
+                step_size = L2_CACHE_BYTES;
+                load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) *
+                              (1 << CHIP_LOG_NUM_MSHIMS());
+        } else {
+                step_size = STRIPE_WIDTH;
+                load_count = (1 << CHIP_LOG_NUM_MSHIMS());
+        }
+
+        /* Load the last byte of the buffer. */
+        p = (char *)buffer + size - 1;
+        force_load(p);
+
+        /* Bump down to the end of the previous stripe or cache line. */
+        p -= step_size;
+        p = (char *)((unsigned long)p | (step_size - 1));
+
+        /* Figure out how far back we need to go. */
+        base = p - (step_size * (load_count - 2));
+        if ((long)base < (long)buffer)
+                base = buffer;
+
+        /*
+         * Fire all the loads we need.  The MAF only has eight entries
+         * so we can have at most eight outstanding loads, so we
+         * unroll by that amount.
+         */
+#pragma unroll 8
+        for (; p >= base; p -= step_size)
+                force_load(p);
+
+        /*
+         * Repeat, but with inv's instead of loads, to get rid of the
+         * data we just loaded into our own cache and the old home L3.
+         * No need to unroll since inv's don't target a register.
+         */
+        p = (char *)buffer + size - 1;
+        __insn_inv(p);
+        p -= step_size;
+        p = (char *)((unsigned long)p | (step_size - 1));
+        for (; p >= base; p -= step_size)
+                __insn_inv(p);
+
+        /* Wait for the load+inv's (and thus finvs) to have completed. */
+        __insn_mf();
+}
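
The comment block in the new finv_buffer_remote() explains the key idea: after the mf, one ordering load per memory controller proves that the earlier flushes have drained, and since physical memory is striped across controllers every 8KB (or per cache line for hash-for-home), the code walks the buffer backwards one step per stripe. The standalone sketch below illustrates only that backwards walk; STRIPE_WIDTH_SIM, NUM_CONTROLLERS_SIM and touch() are invented stand-ins for the Tile hardware values and are not part of the patch.

/*
 * Illustrative sketch (not kernel code): walk a buffer backwards,
 * touching one byte per simulated stripe so that each simulated memory
 * controller sees at least one ordering load.
 */
#include <stdlib.h>

#define STRIPE_WIDTH_SIM    8192UL
#define NUM_CONTROLLERS_SIM 4UL

static void touch(volatile char *p)
{
        (void)*p;       /* force a read, like force_load() in the patch */
}

static void finv_tail_loads(char *buffer, size_t size)
{
        unsigned long p = (unsigned long)buffer + size - 1;
        unsigned long base;

        touch((char *)p);       /* last byte: covers the final stripe */

        /* Back up to the end of the previous stripe. */
        p -= STRIPE_WIDTH_SIM;
        p |= STRIPE_WIDTH_SIM - 1;

        /* One load per remaining controller, clamped to the buffer start. */
        base = p - STRIPE_WIDTH_SIM * (NUM_CONTROLLERS_SIM - 2);
        if (base < (unsigned long)buffer)
                base = (unsigned long)buffer;
        for (; p >= base; p -= STRIPE_WIDTH_SIM)
                touch((char *)p);
}

int main(void)
{
        size_t size = 64 * 1024;
        char *buf = calloc(1, size);
        finv_tail_loads(buf, size);
        free(buf);
        return 0;
}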
diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c
index 5801b03c13ef..cdacdd11d360 100644
--- a/arch/tile/lib/delay.c
+++ b/arch/tile/lib/delay.c
@@ -15,20 +15,31 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/thread_info.h>
-#include <asm/fixmap.h>
-#include <hv/hypervisor.h>
+#include <asm/timex.h>
 
 void __udelay(unsigned long usecs)
 {
-        hv_nanosleep(usecs * 1000);
+        if (usecs > ULONG_MAX / 1000) {
+                WARN_ON_ONCE(usecs > ULONG_MAX / 1000);
+                usecs = ULONG_MAX / 1000;
+        }
+        __ndelay(usecs * 1000);
 }
 EXPORT_SYMBOL(__udelay);
 
 void __ndelay(unsigned long nsecs)
 {
-        hv_nanosleep(nsecs);
+        cycles_t target = get_cycles();
+        target += ns2cycles(nsecs);
+        while (get_cycles() < target)
+                cpu_relax();
 }
 EXPORT_SYMBOL(__ndelay);
 
-/* FIXME: should be declared in a header somewhere. */
+void __delay(unsigned long cycles)
+{
+        cycles_t target = get_cycles() + cycles;
+        while (get_cycles() < target)
+                cpu_relax();
+}
 EXPORT_SYMBOL(__delay);
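
The rewritten delay routines above drop the hv_nanosleep() hypervisor call and instead spin on the cycle counter: convert the requested time to cycles, then busy-wait until get_cycles() passes the target, clamping __udelay()'s argument so the multiply cannot overflow. A hedged userspace sketch of the same structure follows; clock_gettime() stands in for get_cycles()/ns2cycles(), and the *_sim names are invented for illustration, not the kernel API.

/*
 * Illustrative sketch (not the kernel implementation): busy-wait until a
 * monotonic nanosecond counter reaches the target, mirroring the shape
 * of the new __ndelay()/__udelay().
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t get_cycles_sim(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void ndelay_sim(unsigned long nsecs)
{
        uint64_t target = get_cycles_sim() + nsecs;
        while (get_cycles_sim() < target)
                ;       /* cpu_relax() in the kernel version */
}

static void udelay_sim(unsigned long usecs)
{
        /* Clamp so the multiply cannot overflow, as the patch does. */
        if (usecs > ULONG_MAX / 1000)
                usecs = ULONG_MAX / 1000;
        ndelay_sim(usecs * 1000);
}

int main(void)
{
        uint64_t t0 = get_cycles_sim();
        udelay_sim(500);        /* ~0.5 ms busy-wait */
        printf("waited ~%llu ns\n",
               (unsigned long long)(get_cycles_sim() - t0));
        return 0;
}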
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 1509c5597653..49284fae9d09 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -29,6 +29,9 @@ EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(clear_user_asm);
+EXPORT_SYMBOL(flush_user_asm);
+EXPORT_SYMBOL(inv_user_asm);
+EXPORT_SYMBOL(finv_user_asm);
 
 /* arch/tile/kernel/entry.S */
 #include <linux/kernel.h>
@@ -45,9 +48,6 @@ EXPORT_SYMBOL(__copy_from_user_zeroing);
 EXPORT_SYMBOL(__copy_in_user_inatomic);
 #endif
 
-/* arch/tile/lib/mb_incoherent.S */
-EXPORT_SYMBOL(__mb_incoherent);
-
 /* hypervisor glue */
 #include <hv/hypervisor.h>
 EXPORT_SYMBOL(hv_dev_open);
@@ -85,4 +85,8 @@ int64_t __muldi3(int64_t, int64_t);
 EXPORT_SYMBOL(__muldi3);
 uint64_t __lshrdi3(uint64_t, unsigned int);
 EXPORT_SYMBOL(__lshrdi3);
+uint64_t __ashrdi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashrdi3);
+uint64_t __ashldi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashldi3);
 #endif
diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S
deleted file mode 100644
index 989ad7b68d5a..000000000000
--- a/arch/tile/lib/mb_incoherent.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for
- * more details.
- *
- * Assembly code for invoking the HV's fence_incoherent syscall.
- */
-
-#include <linux/linkage.h>
-#include <hv/syscall_public.h>
-#include <arch/abi.h>
-#include <arch/chip.h>
-
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
-
-/*
- * Invoke the hypervisor's fence_incoherent syscall, which guarantees
- * that all victims for cachelines homed on this tile have reached memory.
- */
-STD_ENTRY(__mb_incoherent)
-        moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent
-        swint2
-        jrp lr
-        STD_ENDPROC(__mb_incoherent)
-
-#endif
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
index f7d4a6ad61e8..b2fe15e01075 100644
--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -96,7 +96,7 @@ static void memcpy_multicache(void *dest, const void *source,
         newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
         pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
         ptep = pte_offset_kernel(pmdp, newsrc);
-        *ptep = src_pte;   /* set_pte() would be confused by this */
+        __set_pte(ptep, src_pte); /* set_pte() would be confused by this */
         local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 
         /* Actually move the data. */
@@ -109,7 +109,7 @@ static void memcpy_multicache(void *dest, const void *source,
          */
         src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
         src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-        *ptep = src_pte;   /* set_pte() would be confused by this */
+        __set_pte(ptep, src_pte); /* set_pte() would be confused by this */
         local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 
         /*
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 5cd1c4004eca..cb0999fb64b4 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <asm/processor.h>
+#include <arch/spr_def.h>
 
 #include "spinlock_common.h"
 
@@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);
 #define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1)
 
 
-/* Lock the word, spinning until there are no tns-ers. */
-static inline u32 get_rwlock(arch_rwlock_t *rwlock)
-{
-        u32 iterations = 0;
-        for (;;) {
-                u32 val = __insn_tns((int *)&rwlock->lock);
-                if (unlikely(val & 1)) {
-                        delay_backoff(iterations++);
-                        continue;
-                }
-                return val;
-        }
-}
-
-int arch_read_trylock_slow(arch_rwlock_t *rwlock)
-{
-        u32 val = get_rwlock(rwlock);
-        int locked = (val << RD_COUNT_WIDTH) == 0;
-        rwlock->lock = val + (locked << RD_COUNT_SHIFT);
-        return locked;
-}
-EXPORT_SYMBOL(arch_read_trylock_slow);
-
-void arch_read_unlock_slow(arch_rwlock_t *rwlock)
-{
-        u32 val = get_rwlock(rwlock);
-        rwlock->lock = val - (1 << RD_COUNT_SHIFT);
-}
-EXPORT_SYMBOL(arch_read_unlock_slow);
-
-void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val)
+/*
+ * We can get the read lock if everything but the reader bits (which
+ * are in the high part of the word) is zero, i.e. no active or
+ * waiting writers, no tns.
+ *
+ * We guard the tns/store-back with an interrupt critical section to
+ * preserve the semantic that the same read lock can be acquired in an
+ * interrupt context.
+ */
+inline int arch_read_trylock(arch_rwlock_t *rwlock)
 {
-        u32 eq, mask = 1 << WR_CURR_SHIFT;
-        while (unlikely(val & 1)) {
-                /* Limited backoff since we are the highest-priority task. */
-                relax(4);
-                val = __insn_tns((int *)&rwlock->lock);
+        u32 val;
+        __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
+        val = __insn_tns((int *)&rwlock->lock);
+        if (likely((val << _RD_COUNT_WIDTH) == 0)) {
+                val += 1 << RD_COUNT_SHIFT;
+                rwlock->lock = val;
+                __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+                BUG_ON(val == 0);  /* we don't expect wraparound */
+                return 1;
         }
-        val = __insn_addb(val, mask);
-        eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
-        val = __insn_mz(eq & mask, val);
-        rwlock->lock = val;
+        if ((val & 1) == 0)
+                rwlock->lock = val;
+        __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+        return 0;
 }
-EXPORT_SYMBOL(arch_write_unlock_slow);
+EXPORT_SYMBOL(arch_read_trylock);
 
 /*
- * We spin until everything but the reader bits (which are in the high
- * part of the word) are zero, i.e. no active or waiting writers, no tns.
- *
+ * Spin doing arch_read_trylock() until we acquire the lock.
  * ISSUE: This approach can permanently starve readers.  A reader who sees
  * a writer could instead take a ticket lock (just like a writer would),
  * and atomically enter read mode (with 1 reader) when it gets the ticket.
- * This way both readers and writers will always make forward progress
+ * This way both readers and writers would always make forward progress
  * in a finite time.
  */
-void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val)
+void arch_read_lock(arch_rwlock_t *rwlock)
 {
         u32 iterations = 0;
-        do {
-                if (!(val & 1))
-                        rwlock->lock = val;
+        while (unlikely(!arch_read_trylock(rwlock)))
                 delay_backoff(iterations++);
+}
+EXPORT_SYMBOL(arch_read_lock);
+
+void arch_read_unlock(arch_rwlock_t *rwlock)
+{
+        u32 val, iterations = 0;
+
+        mb();  /* guarantee anything modified under the lock is visible */
+        for (;;) {
+                __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
                 val = __insn_tns((int *)&rwlock->lock);
-        } while ((val << RD_COUNT_WIDTH) != 0);
-        rwlock->lock = val + (1 << RD_COUNT_SHIFT);
+                if (likely(val & 1) == 0) {
+                        rwlock->lock = val - (1 << _RD_COUNT_SHIFT);
+                        __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+                        break;
+                }
+                __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+                delay_backoff(iterations++);
+        }
 }
-EXPORT_SYMBOL(arch_read_lock_slow);
+EXPORT_SYMBOL(arch_read_unlock);
 
-void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
+/*
+ * We don't need an interrupt critical section here (unlike for
+ * arch_read_lock) since we should never use a bare write lock where
+ * it could be interrupted by code that could try to re-acquire it.
+ */
+void arch_write_lock(arch_rwlock_t *rwlock)
 {
         /*
          * The trailing underscore on this variable (and curr_ below)
@@ -168,6 +169,12 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
          */
         u32 my_ticket_;
         u32 iterations = 0;
+        u32 val = __insn_tns((int *)&rwlock->lock);
+
+        if (likely(val == 0)) {
+                rwlock->lock = 1 << _WR_NEXT_SHIFT;
+                return;
+        }
 
         /*
          * Wait until there are no readers, then bump up the next
@@ -206,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
                 relax(4);
         }
 }
-EXPORT_SYMBOL(arch_write_lock_slow);
+EXPORT_SYMBOL(arch_write_lock);
 
-int __tns_atomic_acquire(atomic_t *lock)
+int arch_write_trylock(arch_rwlock_t *rwlock)
 {
-        int ret;
-        u32 iterations = 0;
+        u32 val = __insn_tns((int *)&rwlock->lock);
 
-        BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION));
-        __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
+        /*
+         * If a tns is in progress, or there's a waiting or active locker,
+         * or active readers, we can't take the lock, so give up.
+         */
+        if (unlikely(val != 0)) {
+                if (!(val & 1))
+                        rwlock->lock = val;
+                return 0;
+        }
 
-        while ((ret = __insn_tns((void *)&lock->counter)) == 1)
-                delay_backoff(iterations++);
-        return ret;
+        /* Set the "next" field to mark it locked. */
+        rwlock->lock = 1 << _WR_NEXT_SHIFT;
+        return 1;
 }
+EXPORT_SYMBOL(arch_write_trylock);
 
-void __tns_atomic_release(atomic_t *p, int v)
+void arch_write_unlock(arch_rwlock_t *rwlock)
 {
-        p->counter = v;
-        __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+        u32 val, eq, mask;
+
+        mb();  /* guarantee anything modified under the lock is visible */
+        val = __insn_tns((int *)&rwlock->lock);
+        if (likely(val == (1 << _WR_NEXT_SHIFT))) {
+                rwlock->lock = 0;
+                return;
+        }
+        while (unlikely(val & 1)) {
+                /* Limited backoff since we are the highest-priority task. */
+                relax(4);
+                val = __insn_tns((int *)&rwlock->lock);
+        }
+        mask = 1 << WR_CURR_SHIFT;
+        val = __insn_addb(val, mask);
+        eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
+        val = __insn_mz(eq & mask, val);
+        rwlock->lock = val;
 }
+EXPORT_SYMBOL(arch_write_unlock);
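
The rwlock rework above keeps the whole lock in one 32-bit word: the reader count lives in the high bits, the writer ticket fields sit below it, and bit 0 is the tns busy flag, with the tns/store-back pair wrapped in an interrupt critical section so the same core can still take the read lock from an interrupt handler. The sketch below is only a generic C11-atomics illustration of the "readers high, writer/busy low" layout, using compare-and-swap instead of Tile's tns instruction; RD_SHIFT_SIM and WR_MASK_SIM are invented field positions, not the kernel's, and the interrupt-critical-section detail is intentionally omitted.

/*
 * Illustrative sketch only: a one-word reader/writer state with the
 * reader count in the high bits and writer/busy state in the low bits,
 * acquired with compare-and-swap rather than Tile's tns.
 */
#include <stdatomic.h>
#include <stdio.h>

#define RD_SHIFT_SIM 24u                                /* readers in bits 24..31 */
#define WR_MASK_SIM  ((1u << RD_SHIFT_SIM) - 1)         /* anything below = writer/busy */

static atomic_uint lock_word;

static int read_trylock_sim(void)
{
        unsigned int val = atomic_load(&lock_word);

        /* Readers may enter only if no writer/busy bits are set. */
        if (val & WR_MASK_SIM)
                return 0;
        return atomic_compare_exchange_strong(&lock_word, &val,
                                              val + (1u << RD_SHIFT_SIM));
}

static void read_unlock_sim(void)
{
        atomic_fetch_sub(&lock_word, 1u << RD_SHIFT_SIM);
}

static int write_trylock_sim(void)
{
        unsigned int expected = 0;      /* no readers, no writer */
        return atomic_compare_exchange_strong(&lock_word, &expected, 1u);
}

static void write_unlock_sim(void)
{
        atomic_store(&lock_word, 0);
}

int main(void)
{
        printf("read  trylock: %d\n", read_trylock_sim());   /* 1 */
        printf("write trylock: %d\n", write_trylock_sim());  /* 0: reader held */
        read_unlock_sim();
        printf("write trylock: %d\n", write_trylock_sim());  /* 1 */
        write_unlock_sim();
        return 0;
}

Packing both roles into one word is what lets a single tns (or CAS) observe "no writers and no readers" atomically, which is the property the patch's arch_read_trylock() and arch_write_trylock() both rely on.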