arch/tile: finish enabling support for TILE-Gx 64-bit chip

This support was partially present in the existing code (look for "__tilegx__" ifdefs) but with this change you can build a working kernel using the TILE-Gx toolchain and ARCH=tilegx. Most of these files are new, generally adding a foo_64.c file where previously there was just a foo_32.c file. The ARCH=tilegx directive redirects to arch/tile, not arch/tilegx, using the existing SRCARCH mechanism in the top-level Makefile. Changes to existing files: - <asm/bitops.h> and <asm/bitops_32.h> changed to factor the include of <asm-generic/bitops/non-atomic.h> in the common header. - <asm/compat.h> and arch/tile/kernel/compat.c changed to remove the "const" markers I had put on compat_sys_execve() when trying to match some recent similar changes to the non-compat execve. It turns out the compat version wasn't "upgraded" to use const. - <asm/opcode-tile_64.h> and <asm/opcode_constants_64.h> were previously included accidentally, with the 32-bit contents. Now they have the proper 64-bit contents. Finally, I had to hack the existing hacky drivers/input/input-compat.h to add yet another "#ifdef" for INPUT_COMPAT_TEST (same as x86_64). Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> [drivers/input]
author: Chris Metcalf <cmetcalf@tilera.com> 2011-05-04 14:38:26 -0400
committer: Chris Metcalf <cmetcalf@tilera.com> 2011-05-12 15:52:12 -0400
commit: 18aecc2b645bbb07851b196452a2af314222069b (patch)
tree: 959f765f69af01046c6e26db12b45c3390799d3e /arch/tile/lib
parent: be84cb43833ee40a42e08f5425d20310f16229c7 (diff)
8 files changed, 927 insertions, 0 deletions
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
new file mode 100644
index 00000000000..84fdc8d8e73
--- /dev/null
+++ b/arch/tile/lib/memchr_64.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+/*
+ * memchr - locate the first occurrence of a byte in a memory area.
+ * @s: start of the area to search
+ * @c: value to look for; only its low 8 bits are compared
+ * @n: number of bytes in the area
+ *
+ * The area is scanned one aligned 64-bit word at a time, comparing all
+ * eight byte lanes against the wanted byte in one operation.
+ *
+ * Returns a pointer to the first matching byte, or NULL if the byte does
+ * not occur within the first @n bytes (or if @n is zero).
+ */
+void *memchr(const void *s, int c, size_t n)
+{
+        /* Numeric start address and the aligned word that contains it. */
+        const uintptr_t start = (uintptr_t) s;
+        const uint64_t *word = (const uint64_t *)(start & -8);
+        /* The wanted byte replicated into every byte lane. */
+        const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+        /* All-ones in every byte lane that lies before 's' in the first
+         * word.  This shift count expression relies on shift counts
+         * being taken mod 64.
+         */
+        const uint64_t before_mask = (1ULL << (start << 3)) - 1;
+        const uint64_t *final_word;
+        const char *final_byte;
+        uint64_t v, bits;
+        char *hit;
+        /* An empty area must not be dereferenced at all. */
+        if (__builtin_expect(n == 0, 0))
+                return NULL;
+        /* Load the first word and turn each byte before the area into
+         * the complement of the goal byte, so it can never compare equal.
+         */
+        v = (*word | before_mask) ^ (goal & before_mask);
+        /* Last byte of the area, and the aligned word that holds it. */
+        final_byte = (const char *)s + n - 1;
+        final_word = (const uint64_t *)((uintptr_t) final_byte & -8);
+        for (;;) {
+                bits = __insn_v1cmpeq(v, goal);
+                if (bits != 0)
+                        break;
+                /* Nothing in this word; stop once the last word is done. */
+                if (__builtin_expect(word == final_word, 0))
+                        return NULL;
+                v = *++word;
+        }
+        /* The lowest set bit identifies the first matching byte lane.
+         * That lane may still lie beyond the end of the area when the
+         * area ends part-way through the final word.
+         */
+        hit = ((char *)word) + (__insn_ctz(bits) >> 3);
+        return (hit <= final_byte) ? hit : NULL;
+}
+EXPORT_SYMBOL(memchr);
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
new file mode 100644
index 00000000000..3fab9a6a2bb
--- /dev/null
+++ b/arch/tile/lib/memcpy_64.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#define __memcpy memcpy
+/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
+/* Must be 8 bytes in size. */
+#define word_t uint64_t
+#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
+#error "Assumes 64 or 128 byte line size"
+#endif
+/* How many cache lines ahead should we prefetch? */
+#define PREFETCH_LINES_AHEAD 3
+/*
+ * Provide "base versions" of load and store for the normal code path.
+ * The kernel provides other versions for userspace copies.
+ */
+#define ST(p, v) (*(p) = (v))
+#define LD(p) (*(p))
+#ifndef USERCOPY_FUNC
+#define ST1 ST
+#define ST2 ST
+#define ST4 ST
+#define ST8 ST
+#define LD1 LD
+#define LD2 LD
+#define LD4 LD
+#define LD8 LD
+#define RETVAL dstv
+/*
+ * Copy n bytes from srcv to dstv (the regions must not overlap).
+ *
+ * This body is a template.  Built as-is it is the plain memcpy() and
+ * returns dstv.  When the including file defines USERCOPY_FUNC plus its
+ * own LDn/STn macros, the same body becomes a user-copy routine that
+ * returns 0 on success; in that variant every access to the source or
+ * destination must go through LDn/STn so that faults can be trapped.
+ */
+void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
+#else
+/*
+ * Special kernel version will provide implementation of the LDn/STn
+ * macros to return a count of uncopied bytes due to mm fault.
+ */
+#define RETVAL 0
+int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
+#endif
+{
+        char *__restrict dst1 = (char *)dstv;
+        const char *__restrict src1 = (const char *)srcv;
+        const char *__restrict src1_end;
+        const char *__restrict prefetch;
+        word_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
+        word_t final; /* Final bytes to write to trailing word, if any */
+        long i;
+        /* Short copies are not worth any setup: just move bytes. */
+        if (n < 16) {
+                for (; n; n--)
+                        ST1(dst1++, LD1(src1++));
+                return RETVAL;
+        }
+        /*
+         * Locate the end of source memory we will copy.  Don't
+         * prefetch past this.
+         */
+        src1_end = src1 + n - 1;
+        /*
+         * Prefetch ahead a few cache lines, but not past the end: once
+         * the next line would start at or beyond the last source byte,
+         * fall back to the start of the source instead.
+         */
+        prefetch = src1;
+        for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
+                __insn_prefetch(prefetch);
+                prefetch += CHIP_L2_LINE_SIZE();
+                prefetch = (prefetch < src1_end) ? prefetch : src1;
+        }
+        /* Copy bytes until dst is word-aligned. */
+        for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
+                ST1(dst1++, LD1(src1++));
+        /* 8-byte pointer to destination memory. */
+        dst8 = (word_t *)dst1;
+        if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
+                /*
+                 * Misaligned copy.  Copy 8 bytes at a time, but don't
+                 * bother with other fanciness.
+                 *
+                 * TODO: Consider prefetching and using wh64 as well.
+                 */
+                /* Create an aligned src8. */
+                const word_t *__restrict src8 =
+                        (const word_t *)((uintptr_t)src1 & -sizeof(word_t));
+                word_t b;
+                word_t a = LD8(src8++);
+                /*
+                 * Each destination word is assembled from two adjacent
+                 * aligned source words; 'a' always holds the older one.
+                 */
+                for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
+                        b = LD8(src8++);
+                        a = __insn_dblalign(a, b, src1);
+                        ST8(dst8++, a);
+                        a = b;
+                }
+                if (n == 0)
+                        return RETVAL;
+                /*
+                 * Only read one more source word if it still overlaps the
+                 * source buffer.  Go through LD8() so the user-copy
+                 * variants trap a fault here like every other load.
+                 */
+                b = ((const char *)src8 <= src1_end) ? LD8(src8) : 0;
+                /*
+                 * Final source bytes to write to trailing partial
+                 * word, if any.
+                 */
+                final = __insn_dblalign(a, b, src1);
+        } else {
+                /* Aligned copy. */
+                const word_t* __restrict src8 = (const word_t *)src1;
+                /* src8 and dst8 are both word-aligned. */
+                if (n >= CHIP_L2_LINE_SIZE()) {
+                        /* Copy until 'dst' is cache-line-aligned. */
+                        for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
+                             n -= sizeof(word_t))
+                                ST8(dst8++, LD8(src8++));
+                        for (; n >= CHIP_L2_LINE_SIZE(); ) {
+                                __insn_wh64(dst8);
+                                /*
+                                 * Prefetch and advance to next line
+                                 * to prefetch, but don't go past the end:
+                                 * when the next line would be at or beyond
+                                 * the last source byte, re-target the
+                                 * current source position instead.
+                                 */
+                                __insn_prefetch(prefetch);
+                                prefetch += CHIP_L2_LINE_SIZE();
+                                prefetch = (prefetch < src1_end) ? prefetch :
+                                        (const char *)src8;
+                                /*
+                                 * Copy an entire cache line.  Manually
+                                 * unrolled to avoid idiosyncrasies of
+                                 * compiler unrolling.
+                                 */
+#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
+                                COPY_WORD(0);
+                                COPY_WORD(1);
+                                COPY_WORD(2);
+                                COPY_WORD(3);
+                                COPY_WORD(4);
+                                COPY_WORD(5);
+                                COPY_WORD(6);
+                                COPY_WORD(7);
+#if CHIP_L2_LINE_SIZE() == 128
+                                COPY_WORD(8);
+                                COPY_WORD(9);
+                                COPY_WORD(10);
+                                COPY_WORD(11);
+                                COPY_WORD(12);
+                                COPY_WORD(13);
+                                COPY_WORD(14);
+                                COPY_WORD(15);
+#elif CHIP_L2_LINE_SIZE() != 64
+# error Fix code that assumes particular L2 cache line sizes
+#endif
+                                dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+                                src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+                        }
+                }
+                /* Whole words left over after the last full cache line. */
+                for (; n >= sizeof(word_t); n -= sizeof(word_t))
+                        ST8(dst8++, LD8(src8++));
+                if (__builtin_expect(n == 0, 1))
+                        return RETVAL;
+                final = LD8(src8);
+        }
+        /*
+         * n != 0 if we get here.  Write out any trailing bytes: peel
+         * 4, 2 and then 1 byte off the low end of 'final', shifting
+         * the consumed bytes out after each store.
+         */
+        dst1 = (char *)dst8;
+        if (n & 4) {
+                ST4((uint32_t *)dst1, final);
+                dst1 += 4;
+                final >>= 32;
+                n &= 3;
+        }
+        if (n & 2) {
+                ST2((uint16_t *)dst1, final);
+                dst1 += 2;
+                final >>= 16;
+                n &= 1;
+        }
+        if (n)
+                ST1((uint8_t *)dst1, final);
+        return RETVAL;
+}
+/*
+ * The user-copy variants include this file several times with different
+ * access macros, so drop them here to allow the next set of definitions.
+ */
+#ifdef USERCOPY_FUNC
+#undef ST1
+#undef ST2
+#undef ST4
+#undef ST8
+#undef LD1
+#undef LD2
+#undef LD4
+#undef LD8
+#undef USERCOPY_FUNC
+#endif
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
new file mode 100644
index 00000000000..4763b3aff1c
--- /dev/null
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Do memcpy(), but trap and return "n" when a load or store faults.
+ *
+ * Note: this idiom only works when memcpy() compiles to a leaf function.
+ * If "sp" is updated during memcpy, the "jrp lr" will be incorrect.
+ *
+ * Also note that we are capturing "n" from the containing scope here.
+ */
+/*
+ * _ST(p, inst, v): store v to *p using the store instruction 'inst'.
+ *
+ * The store is emitted at local label 1.  An __ex_table entry pairs
+ * label 1 with label 2, a stub placed in .coldtext.memcpy that moves the
+ * enclosing function's current 'n' into r0 and returns through lr.
+ */
+#define _ST(p, inst, v)                                         \
+        ({                                                      \
+                asm("1: " #inst " %0, %1;"                      \
+                    ".pushsection .coldtext.memcpy,\"ax\";"     \
+                    "2: { move r0, %2; jrp lr };"               \
+                    ".section __ex_table,\"a\";"                \
+                    ".quad 1b, 2b;"                             \
+                    ".popsection"                               \
+                    : "=m" (*(p)) : "r" (v), "r" (n));          \
+        })
+/*
+ * _LD(p, inst): load from *p using the load instruction 'inst' and
+ * yield the loaded value as an unsigned long.  It uses the same
+ * label-1 / label-2 __ex_table pairing as _ST, with the stub again
+ * returning the enclosing function's 'n' in r0.
+ */
+#define _LD(p, inst)                                            \
+        ({                                                      \
+                unsigned long __v;                              \
+                asm("1: " #inst " %0, %1;"                      \
+                    ".pushsection .coldtext.memcpy,\"ax\";"     \
+                    "2: { move r0, %2; jrp lr };"               \
+                    ".section __ex_table,\"a\";"                \
+                    ".quad 1b, 2b;"                             \
+                    ".popsection"                               \
+                    : "=r" (__v) : "m" (*(p)), "r" (n));        \
+                __v;                                            \
+        })
+/*
+ * Each section below instantiates the memcpy_64.c body under a new name
+ * with its own access macros.  memcpy_64.c #undefs USERCOPY_FUNC and the
+ * LDn/STn macros at its end, so they are defined afresh each time.
+ */
+/* Kernel -> user: stores use the trapping form, loads are plain. */
+#define USERCOPY_FUNC __copy_to_user_inatomic
+#define ST1(p, v) _ST((p), st1, (v))
+#define ST2(p, v) _ST((p), st2, (v))
+#define ST4(p, v) _ST((p), st4, (v))
+#define ST8(p, v) _ST((p), st, (v))
+#define LD1 LD
+#define LD2 LD
+#define LD4 LD
+#define LD8 LD
+#include "memcpy_64.c"
+/* User -> kernel: loads use the trapping form, stores are plain. */
+#define USERCOPY_FUNC __copy_from_user_inatomic
+#define ST1 ST
+#define ST2 ST
+#define ST4 ST
+#define ST8 ST
+#define LD1(p) _LD((p), ld1u)
+#define LD2(p) _LD((p), ld2u)
+#define LD4(p) _LD((p), ld4u)
+#define LD8(p) _LD((p), ld)
+#include "memcpy_64.c"
+/* User -> user: both loads and stores use the trapping form. */
+#define USERCOPY_FUNC __copy_in_user_inatomic
+#define ST1(p, v) _ST((p), st1, (v))
+#define ST2(p, v) _ST((p), st2, (v))
+#define ST4(p, v) _ST((p), st4, (v))
+#define ST8(p, v) _ST((p), st, (v))
+#define LD1(p) _LD((p), ld1u)
+#define LD2(p) _LD((p), ld2u)
+#define LD4(p) _LD((p), ld4u)
+#define LD8(p) _LD((p), ld)
+#include "memcpy_64.c"
+/*
+ * Copy n bytes from user space into 'to'.  If some bytes could not be
+ * copied, zero-fill the corresponding tail of the destination so the
+ * caller never sees stale data there.
+ *
+ * Returns the number of bytes that were NOT copied (0 on full success).
+ */
+unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
+                                       unsigned long n)
+{
+        unsigned long uncopied = __copy_from_user_inatomic(to, from, n);
+        if (unlikely(uncopied != 0)) {
+                /* The last 'uncopied' bytes of the destination. */
+                char *tail = (char *)to + (n - uncopied);
+                memset(tail, 0, uncopied);
+        }
+        return uncopied;
+}
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
new file mode 100644
index 00000000000..3873085711d
--- /dev/null
+++ b/arch/tile/lib/memset_64.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#include <arch/chip.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#undef memset
+/*
+ * memset - fill a memory area with a byte value.
+ * @s: start of the area
+ * @c: fill value; only its low 8 bits are stored
+ * @n: number of bytes to fill
+ *
+ * Small areas are filled a byte at a time.  Larger ones are aligned to
+ * 8 bytes at both ends and filled with 64-bit stores; when at least one
+ * full cache line is covered, those lines are first claimed with wh64.
+ *
+ * Returns s.
+ */
+void *memset(void *s, int c, size_t n)
+{
+        uint64_t *out64;
+        int n64, to_align64;
+        uint64_t v64;
+        uint8_t *out8 = s;
+        /* Experimentation shows that a trivial tight loop is a win up until
+         * around a size of 20, where writing a word at a time starts to win.
+         */
+#define BYTE_CUTOFF 20
+#if BYTE_CUTOFF < 7
+        /* This must be at least this big, or some code later
+         * on doesn't work.
+         */
+#error "BYTE_CUTOFF is too small"
+#endif
+        if (n < BYTE_CUTOFF) {
+                /* Strangely, this turns out to be the tightest way to
+                 * write this loop.
+                 */
+                if (n != 0) {
+                        do {
+                                /* Strangely, combining these into one line
+                                 * performs worse.
+                                 */
+                                *out8 = c;
+                                out8++;
+                        } while (--n != 0);
+                }
+                return s;
+        }
+        /* Align 'out8'.  At most 7 bytes are written here and we know
+         * n >= BYTE_CUTOFF, so this won't write past the end.
+         */
+        while (((uintptr_t) out8 & 7) != 0) {
+                *out8++ = c;
+                --n;
+        }
+        /* Align 'n': fill trailing bytes from the end of the area until
+         * the remaining length is a multiple of 8.
+         */
+        while (n & 7)
+                out8[--n] = c;
+        out64 = (uint64_t *) out8;
+        n64 = n >> 3;
+        /* Tile input byte out to 64 bits. */
+        /* KLUDGE */
+        v64 = 0x0101010101010101ULL * (uint8_t)c;
+        /* This must be at least 8 or the following loop doesn't work. */
+#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
+        /* Determine how many words we need to emit before the 'out64'
+         * pointer becomes aligned modulo the cache line size.
+         */
+        to_align64 = (-((uintptr_t)out64 >> 3)) &
+                (CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1);
+        /* Only bother aligning and using wh64 if there is at least
+         * one full cache line to process.  This check also prevents
+         * overrunning the end of the buffer with alignment words.
+         */
+        if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) {
+                int lines_left;
+                /* Align out64 mod the cache line size so we can use wh64. */
+                n64 -= to_align64;
+                for (; to_align64 != 0; to_align64--) {
+                        *out64 = v64;
+                        out64++;
+                }
+                /* Use unsigned divide to turn this into a right shift. */
+                lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS;
+                do {
+                        /* Only wh64 a few lines at a time, so we don't
+                         * exceed the maximum number of victim lines.
+                         */
+                        int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
+                                  ? lines_left
+                                  : CHIP_MAX_OUTSTANDING_VICTIMS());
+                        uint64_t *wh = out64;
+                        int i = x;
+                        int j;
+                        lines_left -= x;
+                        /* Issue wh64 on each of the next x lines... */
+                        do {
+                                __insn_wh64(wh);
+                                wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS;
+                        } while (--i);
+                        /* ...then fill those x lines, four words per
+                         * iteration.
+                         */
+                        for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4);
+                             j != 0; j--) {
+                                *out64++ = v64;
+                                *out64++ = v64;
+                                *out64++ = v64;
+                                *out64++ = v64;
+                        }
+                } while (lines_left != 0);
+                /* We processed all full lines above, so only this many
+                 * words remain to be processed.
+                 */
+                n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1;
+        }
+        /* Now handle any leftover values. */
+        if (n64 != 0) {
+                do {
+                        *out64 = v64;
+                        out64++;
+                } while (--n64 != 0);
+        }
+        return s;
+}
+EXPORT_SYMBOL(memset);
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c
new file mode 100644
index 00000000000..d6fb9581e98
--- /dev/null
+++ b/arch/tile/lib/spinlock_64.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include "spinlock_common.h"
+/*
+ * Fetch the current spinlock word by issuing a compare-and-exchange
+ * whose expected and replacement values are both -1, so the stored
+ * value is never altered.  The intent is to read the word without
+ * allocating it in our cache and without invalidating a copy of the
+ * cacheline held by another cpu, which matters while spinning on the
+ * lock.
+ */
+static inline u32 arch_spin_read_noalloc(void *lock)
+{
+        atomic_t *word = (atomic_t *)lock;
+        return atomic_cmpxchg(word, -1, -1);
+}
+/*
+ * Slow path of spin-lock acquisition: spin until it is our turn.
+ *
+ * Wait until the high bits (current) match my ticket.
+ * If we notice the overflow bit set on entry, we clear it.
+ */
+void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket)
+{
+        if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) {
+                /* Clear the overflow bit in the lock word and in our
+                 * local copy of the ticket.
+                 */
+                __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW);
+                my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW;
+        }
+        for (;;) {
+                u32 val = arch_spin_read_noalloc(lock);
+                /* Distance between our ticket and the value reported by
+                 * arch_spin_current(); zero means the lock is ours.
+                 */
+                u32 delta = my_ticket - arch_spin_current(val);
+                if (delta == 0)
+                        return;
+                /* Wait for a time proportional to that distance before
+                 * looking again.
+                 */
+                relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
+        }
+}
+EXPORT_SYMBOL(arch_spin_lock_slow);
+/*
+ * Try to take the lock without spinning.  The attempt is abandoned
+ * immediately unless the "current" and "next" fields of the lock word
+ * agree; otherwise a cmpxchg() tries to install the incremented word
+ * (with the overflow bit masked off).
+ *
+ * Returns nonzero if the lock was acquired, zero otherwise.
+ */
+int arch_spin_trylock(arch_spinlock_t *lock)
+{
+        u32 val = arch_spin_read_noalloc(lock);
+        u32 taken;
+        if (unlikely(arch_spin_current(val) != arch_spin_next(val)))
+                return 0;
+        /* The word we want to install if nobody raced with us. */
+        taken = (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW;
+        return cmpxchg(&lock->lock, val, taken) == val;
+}
+EXPORT_SYMBOL(arch_spin_trylock);
+/*
+ * Wait, without taking the lock, until arch_spin_is_locked() reports it
+ * free.  Each poll is preceded by a delay_backoff() call that is passed
+ * the number of polls made so far.
+ */
+void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+        u32 iterations;
+        for (iterations = 0; arch_spin_is_locked(lock); iterations++)
+                delay_backoff(iterations);
+}
+EXPORT_SYMBOL(arch_spin_unlock_wait);
+/*
+ * Slow path for taking a read lock.
+ *
+ * If the read lock fails due to a writer, we retry periodically
+ * until the value is positive and we write our incremented reader count.
+ */
+void __read_lock_failed(arch_rwlock_t *rw)
+{
+        u32 val;
+        int iterations = 0;
+        do {
+                /* Back off before each retry; the delay is driven by the
+                 * number of attempts made so far.
+                 */
+                delay_backoff(iterations++);
+                /* Try again to add ourselves as a reader. */
+                val = __insn_fetchaddgez4(&rw->lock, 1);
+        } while (unlikely(arch_write_val_locked(val)));
+}
+EXPORT_SYMBOL(__read_lock_failed);
+/*
+ * Slow path for taking a write lock; 'val' is the lock value the caller
+ * observed when its attempt failed.
+ *
+ * If we failed because there were readers, clear the "writer" bit
+ * so we don't block additional readers.  Otherwise, there was another
+ * writer anyway, so our "fetchor" made no difference.  Then wait,
+ * issuing periodic fetchor instructions, till we get the lock.
+ */
+void __write_lock_failed(arch_rwlock_t *rw, u32 val)
+{
+        int iterations = 0;
+        do {
+                /* The previous value had no writer recorded, so the
+                 * writer bit now set is ours: take it back out.
+                 */
+                if (!arch_write_val_locked(val))
+                        val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT);
+                delay_backoff(iterations++);
+                /* Set the writer bit again; a previous value of zero
+                 * means the lock was completely free and is now ours.
+                 */
+                val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
+        } while (val != 0);
+}
+EXPORT_SYMBOL(__write_lock_failed);
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
new file mode 100644
index 00000000000..617a9273aaa
--- /dev/null
+++ b/arch/tile/lib/strchr_64.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#undef strchr
+/*
+ * strchr - locate the first occurrence of a character in a string.
+ * @s: NUL-terminated string to search
+ * @c: character to look for; only its low 8 bits are compared
+ *
+ * The string is scanned one aligned 64-bit word at a time, testing all
+ * eight byte lanes for both the wanted byte and the terminating NUL.
+ *
+ * Returns a pointer to the first matching byte, or NULL if the
+ * terminator is reached first.  When @c is '\0' the address of the
+ * terminator itself is returned.
+ */
+char *strchr(const char *s, int c)
+{
+        int z, g;
+        /* Get an aligned pointer. */
+        const uintptr_t s_int = (uintptr_t) s;
+        const uint64_t *p = (const uint64_t *)(s_int & -8);
+        /* Create eight copies of the byte for which we are looking. */
+        const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+        /* Read the first aligned word, but force bytes before the string to
+         * match neither zero nor goal (we make sure the high bit of each
+         * byte is 1, and the low 7 bits are all the opposite of the goal
+         * byte).
+         *
+         * The per-byte shift must be the unsigned one so that each 0xff
+         * lane of before_mask becomes 0x7f.  A signed shift would leave
+         * 0xff, turning the leading bytes into ~goal, which is zero for a
+         * goal byte of 0xff and would be taken for the terminator.
+         *
+         * Note that this shift count expression works because we know shift
+         * counts are taken mod 64.
+         */
+        const uint64_t before_mask = (1ULL << (s_int << 3)) - 1;
+        uint64_t v = (*p | before_mask) ^
+                (goal & __insn_v1shrui(before_mask, 1));
+        uint64_t zero_matches, goal_matches;
+        while (1) {
+                /* Look for a terminating '\0'. */
+                zero_matches = __insn_v1cmpeqi(v, 0);
+                /* Look for the goal byte. */
+                goal_matches = __insn_v1cmpeq(v, goal);
+                if (__builtin_expect((zero_matches | goal_matches) != 0, 0))
+                        break;
+                v = *++p;
+        }
+        /* Bit positions of the first NUL and the first goal byte in the
+         * word where the scan stopped.
+         */
+        z = __insn_ctz(zero_matches);
+        g = __insn_ctz(goal_matches);
+        /* If we found c before '\0' we got a match. Note that if c == '\0'
+         * then g == z, and we correctly return the address of the '\0'
+         * rather than NULL.
+         */
+        return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
+}
+EXPORT_SYMBOL(strchr);
diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c
new file mode 100644
index 00000000000..1c92d46202a
--- /dev/null
+++ b/arch/tile/lib/strlen_64.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#undef strlen
+/*
+ * strlen - length of a NUL-terminated string, not counting the NUL.
+ * @s: the string to measure
+ *
+ * Scans aligned 64-bit words, testing all eight byte lanes for zero in
+ * one operation per word.
+ */
+size_t strlen(const char *s)
+{
+        /* Numeric start address and the aligned word that contains it. */
+        const uintptr_t start = (uintptr_t) s;
+        const uint64_t *word = (const uint64_t *)(start & -8);
+        /* All-ones in every byte lane that precedes the string in the
+         * first word.  This expression relies on shift counts being
+         * taken mod 64.
+         */
+        const uint64_t before_mask = (1ULL << (start << 3)) - 1;
+        /* First word, with the bytes before the string forced nonzero. */
+        uint64_t v = *word | before_mask;
+        uint64_t nul_hits;
+        for (;;) {
+                nul_hits = __insn_v1cmpeqi(v, 0);
+                if (nul_hits != 0)
+                        break;
+                v = *++word;
+        }
+        /* The lowest set bit identifies the lane holding the terminator. */
+        return ((const char *)word) + (__insn_ctz(nul_hits) >> 3) - s;
+}
+EXPORT_SYMBOL(strlen);
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
new file mode 100644
index 00000000000..2ff44f87b78
--- /dev/null
+++ b/arch/tile/lib/usercopy_64.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/cache.h>
+#include <arch/chip.h>
+/* Access user memory, but use MMU to avoid propagating kernel exceptions. */
+/*
+ * Shared fault landing pads for the __get_user_N / __put_user_N
+ * routines, referenced from their __ex_table entries.
+ */
+        .pushsection .fixup,"ax"
+/* Failed load: return -EFAULT in r1 and a zero value in r0. */
+get_user_fault:
+        { movei r1, -EFAULT; move r0, zero }
+        jrp lr
+        ENDPROC(get_user_fault)
+/* Failed store: return -EFAULT in r0. */
+put_user_fault:
+        { movei r0, -EFAULT; jrp lr }
+        ENDPROC(put_user_fault)
+        .popsection
+/*
+ * __get_user_N functions take a pointer in r0, and return 0 in r1
+ * on success, with the value in r0; or else -EFAULT in r1.
+ *
+ * The macro emits one function per access size; LOAD is the load
+ * instruction to use.  The __ex_table entry pairs the load bundle
+ * (label 1) with get_user_fault.
+ */
+#define __get_user_N(bytes, LOAD) \
+        STD_ENTRY(__get_user_##bytes); \
+1:      { LOAD r0, r0; move r1, zero }; \
+        jrp lr; \
+        STD_ENDPROC(__get_user_##bytes); \
+        .pushsection __ex_table,"a"; \
+        .quad 1b, get_user_fault; \
+        .popsection
+__get_user_N(1, ld1u)
+__get_user_N(2, ld2u)
+__get_user_N(4, ld4u)
+__get_user_N(8, ld)
+/*
+ * __put_user_N functions take a value in r0 and a pointer in r1,
+ * and return 0 in r0 on success or -EFAULT on failure.
+ *
+ * The macro emits one function per access size; STORE is the store
+ * instruction to use.  The __ex_table entry pairs the store bundle
+ * (label 1) with put_user_fault.
+ */
+#define __put_user_N(bytes, STORE) \
+        STD_ENTRY(__put_user_##bytes); \
+1:      { STORE r1, r0; move r0, zero }; \
+        jrp lr; \
+        STD_ENDPROC(__put_user_##bytes); \
+        .pushsection __ex_table,"a"; \
+        .quad 1b, put_user_fault; \
+        .popsection
+__put_user_N(1, st1)
+__put_user_N(2, st2)
+__put_user_N(4, st4)
+__put_user_N(8, st)
+/*
+ * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
+ * It returns the length, including the terminating NUL, or zero on exception.
+ * If length is greater than the bound, returns one plus the bound.
+ *
+ * Register use: r3 = start address minus one, r4 = byte just loaded.
+ * The result is the final r0 minus r3.
+ */
+STD_ENTRY(strnlen_user_asm)
+        { beqz r1, 2f; addi r3, r0, -1 }  /* bias down to include NUL */
+1:      { ld1u r4, r0; addi r1, r1, -1 }  /* load a byte, consume bound */
+        beqz r4, 2f                       /* found the NUL */
+        { bnezt r1, 1b; addi r0, r0, 1 }  /* advance; loop while bound left */
+2:      { sub r0, r0, r3; jrp lr }
+        STD_ENDPROC(strnlen_user_asm)
+        .pushsection .fixup,"ax"
+/* Landing pad for a faulting load at label 1: return zero. */
+strnlen_user_fault:
+        { move r0, zero; jrp lr }
+        ENDPROC(strnlen_user_fault)
+        .section __ex_table,"a"
+        .quad 1b, strnlen_user_fault
+        .popsection
+/*
+ * strncpy_from_user_asm takes the kernel target pointer in r0,
+ * the userspace source pointer in r1, and the length bound (including
+ * the trailing NUL) in r2.  On success, it returns the string length
+ * (not including the trailing NUL), or -EFAULT on failure.
+ *
+ * Register use: r3 = original target pointer, r4 = byte just loaded.
+ * If the bound runs out before a NUL is copied, the result is the
+ * number of bytes copied.  Only the user-space load at label 1 has an
+ * __ex_table entry.
+ */
+STD_ENTRY(strncpy_from_user_asm)
+        { beqz r2, 2f; move r3, r0 }
+1:      { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+        { st1 r0, r4; addi r0, r0, 1 }
+        beqz r2, 2f       /* bound exhausted */
+        bnezt r4, 1b      /* keep going until a NUL has been copied */
+        addi r0, r0, -1   /* don't count the trailing NUL */
+2:      { sub r0, r0, r3; jrp lr }
+        STD_ENDPROC(strncpy_from_user_asm)
+        .pushsection .fixup,"ax"
+/* Landing pad for a faulting load at label 1: return -EFAULT. */
+strncpy_from_user_fault:
+        { movei r0, -EFAULT; jrp lr }
+        ENDPROC(strncpy_from_user_fault)
+        .section __ex_table,"a"
+        .quad 1b, strncpy_from_user_fault
+        .popsection
+/*
+ * clear_user_asm takes the user target address in r0 and the
+ * number of bytes to zero in r1.
+ * It returns the number of uncopiable bytes (hopefully zero) in r0.
+ * Note that we don't use a separate .fixup section here since we fall
+ * through into the "fixup" code as the last straight-line bundle anyway.
+ *
+ * There are two loops: a byte-at-a-time loop, and an 8-byte loop used
+ * only when the address and the length are both multiples of 8.  Each
+ * loop has its own __ex_table entry pairing its store (label 1) with
+ * its exit bundle (label 2), which returns the remaining count in r1.
+ */
+STD_ENTRY(clear_user_asm)
+        { beqz r1, 2f; or r2, r0, r1 }
+        andi r2, r2, 7    /* low bits of (address | length) */
+        beqzt r2, .Lclear_aligned_user_asm
+1:      { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
+        bnezt r1, 1b
+2:      { move r0, r1; jrp lr }
+        .pushsection __ex_table,"a"
+        .quad 1b, 2b
+        .popsection
+.Lclear_aligned_user_asm:
+1:      { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 }
+        bnezt r1, 1b
+2:      { move r0, r1; jrp lr }
+        STD_ENDPROC(clear_user_asm)
+        .pushsection __ex_table,"a"
+        .quad 1b, 2b
+        .popsection
+/*
+ * flush_user_asm takes the user target address in r0 and the
+ * number of bytes to flush in r1.
+ * It returns the number of unflushable bytes (hopefully zero) in r0.
+ *
+ * The range is widened to whole L2 cache lines: r2 becomes
+ * -L2_CACHE_BYTES and is used as a mask to round the start down and
+ * the end up.  r1 then holds the rounded byte count and is reduced by
+ * CHIP_FLUSH_STRIDE() per flush.
+ * NOTE(review): the loop exits only when r1 reaches exactly zero, so
+ * the stride presumably divides the rounded length - confirm.
+ */
+STD_ENTRY(flush_user_asm)
+        beqz r1, 2f
+        { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+        { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+        { and r0, r0, r2; and r1, r1, r2 }
+        { sub r1, r1, r0 }
+1:      { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
+        { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b }
+2:      { move r0, r1; jrp lr }
+        STD_ENDPROC(flush_user_asm)
+        /* A fault in the flush bundle (label 1) resumes at label 2. */
+        .pushsection __ex_table,"a"
+        .quad 1b, 2b
+        .popsection
+/*
+ * inv_user_asm takes the user target address in r0 and the
+ * number of bytes to invalidate in r1.
+ * It returns the number of not inv'able bytes (hopefully zero) in r0.
+ *
+ * The range is widened to whole L2 cache lines: r2 becomes
+ * -L2_CACHE_BYTES and is used as a mask to round the start down and
+ * the end up.  r1 then holds the rounded byte count and is reduced by
+ * CHIP_INV_STRIDE() per inv.
+ * NOTE(review): the loop exits only when r1 reaches exactly zero, so
+ * the stride presumably divides the rounded length - confirm.
+ */
+STD_ENTRY(inv_user_asm)
+        beqz r1, 2f
+        { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+        { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+        { and r0, r0, r2; and r1, r1, r2 }
+        { sub r1, r1, r0 }
+1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
+        { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
+2:      { move r0, r1; jrp lr }
+        STD_ENDPROC(inv_user_asm)
+        /* A fault in the inv bundle (label 1) resumes at label 2. */
+        .pushsection __ex_table,"a"
+        .quad 1b, 2b
+        .popsection
+/*
+ * finv_user_asm takes the user target address in r0 and the
+ * number of bytes to flush-invalidate in r1.
+ * It returns the number of not finv'able bytes (hopefully zero) in r0.
+ *
+ * The range is widened to whole L2 cache lines: r2 becomes
+ * -L2_CACHE_BYTES and is used as a mask to round the start down and
+ * the end up.  r1 then holds the rounded byte count and is reduced by
+ * CHIP_FINV_STRIDE() per finv.
+ * NOTE(review): the loop exits only when r1 reaches exactly zero, so
+ * the stride presumably divides the rounded length - confirm.
+ */
+STD_ENTRY(finv_user_asm)
+        beqz r1, 2f
+        { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+        { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+        { and r0, r0, r2; and r1, r1, r2 }
+        { sub r1, r1, r0 }
+1:      { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
+        { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b }
+2:      { move r0, r1; jrp lr }
+        STD_ENDPROC(finv_user_asm)
+        /* A fault in the finv bundle (label 1) resumes at label 2. */
+        .pushsection __ex_table,"a"
+        .quad 1b, 2b
+        .popsection
author	Chris Metcalf <cmetcalf@tilera.com>	2011-05-04 14:38:26 -0400
committer	Chris Metcalf <cmetcalf@tilera.com>	2011-05-12 15:52:12 -0400
commit	18aecc2b645bbb07851b196452a2af314222069b (patch)
tree	959f765f69af01046c6e26db12b45c3390799d3e /arch/tile/lib
parent	be84cb43833ee40a42e08f5425d20310f16229c7 (diff)