Diffstat (limited to 'arch/tile/include/asm')
-rw-r--r--  arch/tile/include/asm/Kbuild          |   2
-rw-r--r--  arch/tile/include/asm/atomic_32.h     |  10
-rw-r--r--  arch/tile/include/asm/bitops.h        |  12
-rw-r--r--  arch/tile/include/asm/byteorder.h     |  20
-rw-r--r--  arch/tile/include/asm/cachectl.h      |  42
-rw-r--r--  arch/tile/include/asm/compat.h        |   3
-rw-r--r--  arch/tile/include/asm/elf.h           |   5
-rw-r--r--  arch/tile/include/asm/futex.h         | 143
-rw-r--r--  arch/tile/include/asm/hardwall.h      |  18
-rw-r--r--  arch/tile/include/asm/hugetlb.h       |  21
-rw-r--r--  arch/tile/include/asm/irqflags.h      |  34
-rw-r--r--  arch/tile/include/asm/kexec.h         |  12
-rw-r--r--  arch/tile/include/asm/mmu.h           |   2
-rw-r--r--  arch/tile/include/asm/mmu_context.h   |   8
-rw-r--r--  arch/tile/include/asm/module.h        |  40
-rw-r--r--  arch/tile/include/asm/page.h          |  18
-rw-r--r--  arch/tile/include/asm/pgalloc.h       |  92
-rw-r--r--  arch/tile/include/asm/pgtable.h       | 111
-rw-r--r--  arch/tile/include/asm/pgtable_32.h    |  40
-rw-r--r--  arch/tile/include/asm/pgtable_64.h    |  57
-rw-r--r--  arch/tile/include/asm/processor.h     |  17
-rw-r--r--  arch/tile/include/asm/setup.h         |  10
-rw-r--r--  arch/tile/include/asm/syscalls.h      |   3
-rw-r--r--  arch/tile/include/asm/tlbflush.h      |  17
-rw-r--r--  arch/tile/include/asm/uaccess.h       | 222
-rw-r--r--  arch/tile/include/asm/unistd.h        |   4
26 files changed, 669 insertions(+), 294 deletions(-)
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 0bb42642343a..143473e3a0bb 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -2,6 +2,7 @@ include include/asm-generic/Kbuild.asm
2 2
3header-y += ../arch/ 3header-y += ../arch/
4 4
5header-y += cachectl.h
5header-y += ucontext.h 6header-y += ucontext.h
6header-y += hardwall.h 7header-y += hardwall.h
7 8
@@ -21,7 +22,6 @@ generic-y += ipcbuf.h
21generic-y += irq_regs.h 22generic-y += irq_regs.h
22generic-y += kdebug.h 23generic-y += kdebug.h
23generic-y += local.h 24generic-y += local.h
24generic-y += module.h
25generic-y += msgbuf.h 25generic-y += msgbuf.h
26generic-y += mutex.h 26generic-y += mutex.h
27generic-y += param.h 27generic-y += param.h
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index 54d1da826f93..e7fb5cfb9597 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -303,7 +303,14 @@ void __init_atomic_per_cpu(void);
303void __atomic_fault_unlock(int *lock_ptr); 303void __atomic_fault_unlock(int *lock_ptr);
304#endif 304#endif
305 305
306/* Return a pointer to the lock for the given address. */
307int *__atomic_hashed_lock(volatile void *v);
308
306/* Private helper routines in lib/atomic_asm_32.S */ 309/* Private helper routines in lib/atomic_asm_32.S */
310struct __get_user {
311 unsigned long val;
312 int err;
313};
307extern struct __get_user __atomic_cmpxchg(volatile int *p, 314extern struct __get_user __atomic_cmpxchg(volatile int *p,
308 int *lock, int o, int n); 315 int *lock, int o, int n);
309extern struct __get_user __atomic_xchg(volatile int *p, int *lock, int n); 316extern struct __get_user __atomic_xchg(volatile int *p, int *lock, int n);
@@ -319,6 +326,9 @@ extern u64 __atomic64_xchg_add(volatile u64 *p, int *lock, u64 n);
319extern u64 __atomic64_xchg_add_unless(volatile u64 *p, 326extern u64 __atomic64_xchg_add_unless(volatile u64 *p,
320 int *lock, u64 o, u64 n); 327 int *lock, u64 o, u64 n);
321 328
329/* Return failure from the atomic wrappers. */
330struct __get_user __atomic_bad_address(int __user *addr);
331
322#endif /* !__ASSEMBLY__ */ 332#endif /* !__ASSEMBLY__ */
323 333
324#endif /* _ASM_TILE_ATOMIC_32_H */ 334#endif /* _ASM_TILE_ATOMIC_32_H */
diff --git a/arch/tile/include/asm/bitops.h b/arch/tile/include/asm/bitops.h
index 16f1fa51fea1..bd186c4eaa50 100644
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -77,6 +77,11 @@ static inline int ffs(int x)
77 return __builtin_ffs(x); 77 return __builtin_ffs(x);
78} 78}
79 79
80static inline int fls64(__u64 w)
81{
82 return (sizeof(__u64) * 8) - __builtin_clzll(w);
83}
84
80/** 85/**
81 * fls - find last set bit in word 86 * fls - find last set bit in word
82 * @x: the word to search 87 * @x: the word to search
@@ -90,12 +95,7 @@ static inline int ffs(int x)
90 */ 95 */
91static inline int fls(int x) 96static inline int fls(int x)
92{ 97{
93 return (sizeof(int) * 8) - __builtin_clz(x); 98 return fls64((unsigned int) x);
94}
95
96static inline int fls64(__u64 w)
97{
98 return (sizeof(__u64) * 8) - __builtin_clzll(w);
99} 99}
100 100
101static inline unsigned int __arch_hweight32(unsigned int w) 101static inline unsigned int __arch_hweight32(unsigned int w)
diff --git a/arch/tile/include/asm/byteorder.h b/arch/tile/include/asm/byteorder.h
index 9558416d578b..fb72ecf49218 100644
--- a/arch/tile/include/asm/byteorder.h
+++ b/arch/tile/include/asm/byteorder.h
@@ -1 +1,21 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#if defined (__BIG_ENDIAN__)
16#include <linux/byteorder/big_endian.h>
17#elif defined (__LITTLE_ENDIAN__)
1#include <linux/byteorder/little_endian.h> 18#include <linux/byteorder/little_endian.h>
19#else
20#error "__BIG_ENDIAN__ or __LITTLE_ENDIAN__ must be defined."
21#endif
diff --git a/arch/tile/include/asm/cachectl.h b/arch/tile/include/asm/cachectl.h
new file mode 100644
index 000000000000..af4c9f9154d1
--- /dev/null
+++ b/arch/tile/include/asm/cachectl.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#ifndef _ASM_TILE_CACHECTL_H
16#define _ASM_TILE_CACHECTL_H
17
18/*
19 * Options for cacheflush system call.
20 *
21 * The ICACHE flush is performed on all cores currently running the
22 * current process's address space. The intent is for user
23 * applications to be able to modify code, invoke the system call,
24 * then allow arbitrary other threads in the same address space to see
25 * the newly-modified code. Passing a length of CHIP_L1I_CACHE_SIZE()
26 * or more invalidates the entire icache on all cores in the address
27 * spaces. (Note: currently this option invalidates the entire icache
28 * regardless of the requested address and length, but we may choose
29 * to honor the arguments at some point.)
30 *
31 * Flush and invalidation of memory can normally be performed with the
32 * __insn_flush(), __insn_inv(), and __insn_finv() instructions from
33 * userspace. The DCACHE option to the system call allows userspace
34 * to flush the entire L1+L2 data cache from the core. In this case,
35 * the address and length arguments are not used. The DCACHE flush is
36 * restricted to the current core, not all cores in the address space.
37 */
38#define ICACHE (1<<0) /* invalidate L1 instruction cache */
39#define DCACHE (1<<1) /* flush and invalidate data cache */
40#define BCACHE (ICACHE|DCACHE) /* flush both caches */
41
42#endif /* _ASM_TILE_CACHECTL_H */
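
As a rough user-space sketch of the semantics described in the comment above (not part of this patch): a program that patches its own instructions could request the ICACHE flush roughly as follows. The syscall number __NR_cacheflush and the (addr, len, flags) argument order are assumptions here; only the ICACHE/DCACHE/BCACHE flag values come from cachectl.h itself.

#include <unistd.h>
#include <sys/syscall.h>
#include <asm/cachectl.h>

/* Make freshly written instructions visible to every core running this
 * address space, per the ICACHE description above. */
static int flush_patched_code(void *addr, size_t len)
{
        return syscall(__NR_cacheflush, addr, len, ICACHE);
}
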
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h
index 4b4b28969a65..69adc08d36a5 100644
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -242,9 +242,6 @@ long compat_sys_fallocate(int fd, int mode,
242long compat_sys_sched_rr_get_interval(compat_pid_t pid, 242long compat_sys_sched_rr_get_interval(compat_pid_t pid,
243 struct compat_timespec __user *interval); 243 struct compat_timespec __user *interval);
244 244
245/* Tilera Linux syscalls that don't have "compat" versions. */
246#define compat_sys_flush_cache sys_flush_cache
247
248/* These are the intvec_64.S trampolines. */ 245/* These are the intvec_64.S trampolines. */
249long _compat_sys_execve(const char __user *path, 246long _compat_sys_execve(const char __user *path,
250 const compat_uptr_t __user *argv, 247 const compat_uptr_t __user *argv,
diff --git a/arch/tile/include/asm/elf.h b/arch/tile/include/asm/elf.h
index 623a6bb741c1..d16d006d660e 100644
--- a/arch/tile/include/asm/elf.h
+++ b/arch/tile/include/asm/elf.h
@@ -44,7 +44,11 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
44#else 44#else
45#define ELF_CLASS ELFCLASS32 45#define ELF_CLASS ELFCLASS32
46#endif 46#endif
47#ifdef __BIG_ENDIAN__
48#define ELF_DATA ELFDATA2MSB
49#else
47#define ELF_DATA ELFDATA2LSB 50#define ELF_DATA ELFDATA2LSB
51#endif
48 52
49/* 53/*
50 * There seems to be a bug in how compat_binfmt_elf.c works: it 54 * There seems to be a bug in how compat_binfmt_elf.c works: it
@@ -59,6 +63,7 @@ enum { ELF_ARCH = CHIP_ELF_TYPE() };
59 */ 63 */
60#define elf_check_arch(x) \ 64#define elf_check_arch(x) \
61 ((x)->e_ident[EI_CLASS] == ELF_CLASS && \ 65 ((x)->e_ident[EI_CLASS] == ELF_CLASS && \
66 (x)->e_ident[EI_DATA] == ELF_DATA && \
62 (x)->e_machine == CHIP_ELF_TYPE()) 67 (x)->e_machine == CHIP_ELF_TYPE())
63 68
64/* The module loader only handles a few relocation types. */ 69/* The module loader only handles a few relocation types. */
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index d03ec124a598..5909ac3d7218 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -28,29 +28,81 @@
28#include <linux/futex.h> 28#include <linux/futex.h>
29#include <linux/uaccess.h> 29#include <linux/uaccess.h>
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <asm/atomic.h>
31 32
32extern struct __get_user futex_set(u32 __user *v, int i); 33/*
33extern struct __get_user futex_add(u32 __user *v, int n); 34 * Support macros for futex operations. Do not use these macros directly.
34extern struct __get_user futex_or(u32 __user *v, int n); 35 * They assume "ret", "val", "oparg", and "uaddr" in the lexical context.
35extern struct __get_user futex_andn(u32 __user *v, int n); 36 * __futex_cmpxchg() additionally assumes "oldval".
36extern struct __get_user futex_cmpxchg(u32 __user *v, int o, int n); 37 */
38
39#ifdef __tilegx__
40
41#define __futex_asm(OP) \
42 asm("1: {" #OP " %1, %3, %4; movei %0, 0 }\n" \
43 ".pushsection .fixup,\"ax\"\n" \
44 "0: { movei %0, %5; j 9f }\n" \
45 ".section __ex_table,\"a\"\n" \
46 ".quad 1b, 0b\n" \
47 ".popsection\n" \
48 "9:" \
49 : "=r" (ret), "=r" (val), "+m" (*(uaddr)) \
50 : "r" (uaddr), "r" (oparg), "i" (-EFAULT))
51
52#define __futex_set() __futex_asm(exch4)
53#define __futex_add() __futex_asm(fetchadd4)
54#define __futex_or() __futex_asm(fetchor4)
55#define __futex_andn() ({ oparg = ~oparg; __futex_asm(fetchand4); })
56#define __futex_cmpxchg() \
57 ({ __insn_mtspr(SPR_CMPEXCH_VALUE, oldval); __futex_asm(cmpexch4); })
58
59#define __futex_xor() \
60 ({ \
61 u32 oldval, n = oparg; \
62 if ((ret = __get_user(oldval, uaddr)) == 0) { \
63 do { \
64 oparg = oldval ^ n; \
65 __futex_cmpxchg(); \
66 } while (ret == 0 && oldval != val); \
67 } \
68 })
69
70/* No need to prefetch, since the atomic ops go to the home cache anyway. */
71#define __futex_prolog()
37 72
38#ifndef __tilegx__
39extern struct __get_user futex_xor(u32 __user *v, int n);
40#else 73#else
41static inline struct __get_user futex_xor(u32 __user *uaddr, int n) 74
42{ 75#define __futex_call(FN) \
43 struct __get_user asm_ret = __get_user_4(uaddr); 76 { \
44 if (!asm_ret.err) { 77 struct __get_user gu = FN((u32 __force *)uaddr, lock, oparg); \
45 int oldval, newval; 78 val = gu.val; \
46 do { 79 ret = gu.err; \
47 oldval = asm_ret.val;
48 newval = oldval ^ n;
49 asm_ret = futex_cmpxchg(uaddr, oldval, newval);
50 } while (asm_ret.err == 0 && oldval != asm_ret.val);
51 } 80 }
52 return asm_ret; 81
53} 82#define __futex_set() __futex_call(__atomic_xchg)
83#define __futex_add() __futex_call(__atomic_xchg_add)
84#define __futex_or() __futex_call(__atomic_or)
85#define __futex_andn() __futex_call(__atomic_andn)
86#define __futex_xor() __futex_call(__atomic_xor)
87
88#define __futex_cmpxchg() \
89 { \
90 struct __get_user gu = __atomic_cmpxchg((u32 __force *)uaddr, \
91 lock, oldval, oparg); \
92 val = gu.val; \
93 ret = gu.err; \
94 }
95
96/*
97 * Find the lock pointer for the atomic calls to use, and issue a
98 * prefetch to the user address to bring it into cache. Similar to
99 * __atomic_setup(), but we can't do a read into the L1 since it might
100 * fault; instead we do a prefetch into the L2.
101 */
102#define __futex_prolog() \
103 int *lock; \
104 __insn_prefetch(uaddr); \
105 lock = __atomic_hashed_lock((int __force *)uaddr)
54#endif 106#endif
55 107
56static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 108static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
@@ -59,8 +111,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
59 int cmp = (encoded_op >> 24) & 15; 111 int cmp = (encoded_op >> 24) & 15;
60 int oparg = (encoded_op << 8) >> 20; 112 int oparg = (encoded_op << 8) >> 20;
61 int cmparg = (encoded_op << 20) >> 20; 113 int cmparg = (encoded_op << 20) >> 20;
62 int ret; 114 int uninitialized_var(val), ret;
63 struct __get_user asm_ret; 115
116 __futex_prolog();
117
118 /* The 32-bit futex code makes this assumption, so validate it here. */
119 BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
64 120
65 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 121 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
66 oparg = 1 << oparg; 122 oparg = 1 << oparg;
@@ -71,46 +127,45 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
71 pagefault_disable(); 127 pagefault_disable();
72 switch (op) { 128 switch (op) {
73 case FUTEX_OP_SET: 129 case FUTEX_OP_SET:
74 asm_ret = futex_set(uaddr, oparg); 130 __futex_set();
75 break; 131 break;
76 case FUTEX_OP_ADD: 132 case FUTEX_OP_ADD:
77 asm_ret = futex_add(uaddr, oparg); 133 __futex_add();
78 break; 134 break;
79 case FUTEX_OP_OR: 135 case FUTEX_OP_OR:
80 asm_ret = futex_or(uaddr, oparg); 136 __futex_or();
81 break; 137 break;
82 case FUTEX_OP_ANDN: 138 case FUTEX_OP_ANDN:
83 asm_ret = futex_andn(uaddr, oparg); 139 __futex_andn();
84 break; 140 break;
85 case FUTEX_OP_XOR: 141 case FUTEX_OP_XOR:
86 asm_ret = futex_xor(uaddr, oparg); 142 __futex_xor();
87 break; 143 break;
88 default: 144 default:
89 asm_ret.err = -ENOSYS; 145 ret = -ENOSYS;
146 break;
90 } 147 }
91 pagefault_enable(); 148 pagefault_enable();
92 149
93 ret = asm_ret.err;
94
95 if (!ret) { 150 if (!ret) {
96 switch (cmp) { 151 switch (cmp) {
97 case FUTEX_OP_CMP_EQ: 152 case FUTEX_OP_CMP_EQ:
98 ret = (asm_ret.val == cmparg); 153 ret = (val == cmparg);
99 break; 154 break;
100 case FUTEX_OP_CMP_NE: 155 case FUTEX_OP_CMP_NE:
101 ret = (asm_ret.val != cmparg); 156 ret = (val != cmparg);
102 break; 157 break;
103 case FUTEX_OP_CMP_LT: 158 case FUTEX_OP_CMP_LT:
104 ret = (asm_ret.val < cmparg); 159 ret = (val < cmparg);
105 break; 160 break;
106 case FUTEX_OP_CMP_GE: 161 case FUTEX_OP_CMP_GE:
107 ret = (asm_ret.val >= cmparg); 162 ret = (val >= cmparg);
108 break; 163 break;
109 case FUTEX_OP_CMP_LE: 164 case FUTEX_OP_CMP_LE:
110 ret = (asm_ret.val <= cmparg); 165 ret = (val <= cmparg);
111 break; 166 break;
112 case FUTEX_OP_CMP_GT: 167 case FUTEX_OP_CMP_GT:
113 ret = (asm_ret.val > cmparg); 168 ret = (val > cmparg);
114 break; 169 break;
115 default: 170 default:
116 ret = -ENOSYS; 171 ret = -ENOSYS;
@@ -120,22 +175,20 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
120} 175}
121 176
122static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, 177static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
123 u32 oldval, u32 newval) 178 u32 oldval, u32 oparg)
124{ 179{
125 struct __get_user asm_ret; 180 int ret, val;
181
182 __futex_prolog();
126 183
127 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) 184 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
128 return -EFAULT; 185 return -EFAULT;
129 186
130 asm_ret = futex_cmpxchg(uaddr, oldval, newval); 187 __futex_cmpxchg();
131 *uval = asm_ret.val;
132 return asm_ret.err;
133}
134 188
135#ifndef __tilegx__ 189 *uval = val;
136/* Return failure from the atomic wrappers. */ 190 return ret;
137struct __get_user __atomic_bad_address(int __user *addr); 191}
138#endif
139 192
140#endif /* !__ASSEMBLY__ */ 193#endif /* !__ASSEMBLY__ */
141 194
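
For reference, a short worked example of the argument decoding in futex_atomic_op_inuser() above, assuming the standard FUTEX_OP() packing from <linux/futex.h> (that packing is an assumption; it is not part of this patch): FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0) encodes to 0x14001000, and the paired left/right shifts sign-extend the two 12-bit argument fields.

int encoded_op = 0x14001000;            /* FUTEX_OP(ADD, 1, CMP_GT, 0), assumed packing */
int cmp    = (encoded_op >> 24) & 15;   /* 4 == FUTEX_OP_CMP_GT */
int oparg  = (encoded_op << 8) >> 20;   /* 1, sign-extended from bits 12..23 */
int cmparg = (encoded_op << 20) >> 20;  /* 0, sign-extended from bits 0..11 */
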
diff --git a/arch/tile/include/asm/hardwall.h b/arch/tile/include/asm/hardwall.h
index 2ac422848c7d..47514a58d685 100644
--- a/arch/tile/include/asm/hardwall.h
+++ b/arch/tile/include/asm/hardwall.h
@@ -11,12 +11,14 @@
11 * NON INFRINGEMENT. See the GNU General Public License for 11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details. 12 * more details.
13 * 13 *
14 * Provide methods for the HARDWALL_FILE for accessing the UDN. 14 * Provide methods for access control of per-cpu resources like
15 * UDN, IDN, or IPI.
15 */ 16 */
16 17
17#ifndef _ASM_TILE_HARDWALL_H 18#ifndef _ASM_TILE_HARDWALL_H
18#define _ASM_TILE_HARDWALL_H 19#define _ASM_TILE_HARDWALL_H
19 20
21#include <arch/chip.h>
20#include <linux/ioctl.h> 22#include <linux/ioctl.h>
21 23
22#define HARDWALL_IOCTL_BASE 0xa2 24#define HARDWALL_IOCTL_BASE 0xa2
@@ -24,8 +26,9 @@
24/* 26/*
25 * The HARDWALL_CREATE() ioctl is a macro with a "size" argument. 27 * The HARDWALL_CREATE() ioctl is a macro with a "size" argument.
26 * The resulting ioctl value is passed to the kernel in conjunction 28 * The resulting ioctl value is passed to the kernel in conjunction
27 * with a pointer to a little-endian bitmask of cpus, which must be 29 * with a pointer to a standard kernel bitmask of cpus.
28 * physically in a rectangular configuration on the chip. 30 * For network resources (UDN or IDN) the bitmask must physically
31 * represent a rectangular configuration on the chip.
29 * The "size" is the number of bytes of cpu mask data. 32 * The "size" is the number of bytes of cpu mask data.
30 */ 33 */
31#define _HARDWALL_CREATE 1 34#define _HARDWALL_CREATE 1
@@ -44,13 +47,7 @@
44#define HARDWALL_GET_ID \ 47#define HARDWALL_GET_ID \
45 _IO(HARDWALL_IOCTL_BASE, _HARDWALL_GET_ID) 48 _IO(HARDWALL_IOCTL_BASE, _HARDWALL_GET_ID)
46 49
47#ifndef __KERNEL__ 50#ifdef __KERNEL__
48
49/* This is the canonical name expected by userspace. */
50#define HARDWALL_FILE "/dev/hardwall"
51
52#else
53
54/* /proc hooks for hardwall. */ 51/* /proc hooks for hardwall. */
55struct proc_dir_entry; 52struct proc_dir_entry;
56#ifdef CONFIG_HARDWALL 53#ifdef CONFIG_HARDWALL
@@ -59,7 +56,6 @@ int proc_pid_hardwall(struct task_struct *task, char *buffer);
59#else 56#else
60static inline void proc_tile_hardwall_init(struct proc_dir_entry *root) {} 57static inline void proc_tile_hardwall_init(struct proc_dir_entry *root) {}
61#endif 58#endif
62
63#endif 59#endif
64 60
65#endif /* _ASM_TILE_HARDWALL_H */ 61#endif /* _ASM_TILE_HARDWALL_H */
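
A hedged user-space sketch of the HARDWALL_CREATE() ioctl described above, not part of the patch: the device path is an assumption (the canonical "/dev/hardwall" name is dropped from this header by this change), and only the "size is the number of bytes of cpu mask data" convention and the rectangle requirement come from the comment itself.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <asm/hardwall.h>

/* Claim cpus 0-3; for UDN/IDN they must form a rectangle on the chip. */
int create_small_hardwall(void)
{
        unsigned long mask[1] = { 0xf };        /* standard kernel cpumask layout */
        int fd = open("/dev/hardwall", O_RDWR); /* device path: assumption */

        if (fd < 0)
                return -1;
        return ioctl(fd, HARDWALL_CREATE(sizeof(mask)), mask);
}
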
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index d396d1805163..b2042380a5aa 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -106,4 +106,25 @@ static inline void arch_release_hugepage(struct page *page)
106{ 106{
107} 107}
108 108
109#ifdef CONFIG_HUGETLB_SUPER_PAGES
110static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
111 struct page *page, int writable)
112{
113 size_t pagesize = huge_page_size(hstate_vma(vma));
114 if (pagesize != PUD_SIZE && pagesize != PMD_SIZE)
115 entry = pte_mksuper(entry);
116 return entry;
117}
118#define arch_make_huge_pte arch_make_huge_pte
119
120/* Sizes to scale up page size for PTEs with HV_PTE_SUPER bit. */
121enum {
122 HUGE_SHIFT_PGDIR = 0,
123 HUGE_SHIFT_PMD = 1,
124 HUGE_SHIFT_PAGE = 2,
125 HUGE_SHIFT_ENTRIES
126};
127extern int huge_shift[HUGE_SHIFT_ENTRIES];
128#endif
129
109#endif /* _ASM_TILE_HUGETLB_H */ 130#endif /* _ASM_TILE_HUGETLB_H */
diff --git a/arch/tile/include/asm/irqflags.h b/arch/tile/include/asm/irqflags.h
index 5db0ce54284d..b4e96fef2cf8 100644
--- a/arch/tile/include/asm/irqflags.h
+++ b/arch/tile/include/asm/irqflags.h
@@ -28,10 +28,10 @@
28 */ 28 */
29#if CHIP_HAS_AUX_PERF_COUNTERS() 29#if CHIP_HAS_AUX_PERF_COUNTERS()
30#define LINUX_MASKABLE_INTERRUPTS_HI \ 30#define LINUX_MASKABLE_INTERRUPTS_HI \
31 (~(INT_MASK_HI(INT_PERF_COUNT) | INT_MASK_HI(INT_AUX_PERF_COUNT))) 31 (~(INT_MASK_HI(INT_PERF_COUNT) | INT_MASK_HI(INT_AUX_PERF_COUNT)))
32#else 32#else
33#define LINUX_MASKABLE_INTERRUPTS_HI \ 33#define LINUX_MASKABLE_INTERRUPTS_HI \
34 (~(INT_MASK_HI(INT_PERF_COUNT))) 34 (~(INT_MASK_HI(INT_PERF_COUNT)))
35#endif 35#endif
36 36
37#else 37#else
@@ -90,6 +90,14 @@
90 __insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_0, (unsigned long)(__m)); \ 90 __insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_0, (unsigned long)(__m)); \
91 __insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_1, (unsigned long)(__m>>32)); \ 91 __insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_1, (unsigned long)(__m>>32)); \
92} while (0) 92} while (0)
93#define interrupt_mask_save_mask() \
94 (__insn_mfspr(SPR_INTERRUPT_MASK_SET_K_0) | \
95 (((unsigned long long)__insn_mfspr(SPR_INTERRUPT_MASK_SET_K_1))<<32))
96#define interrupt_mask_restore_mask(mask) do { \
97 unsigned long long __m = (mask); \
98 __insn_mtspr(SPR_INTERRUPT_MASK_K_0, (unsigned long)(__m)); \
99 __insn_mtspr(SPR_INTERRUPT_MASK_K_1, (unsigned long)(__m>>32)); \
100} while (0)
93#else 101#else
94#define interrupt_mask_set(n) \ 102#define interrupt_mask_set(n) \
95 __insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (1UL << (n))) 103 __insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (1UL << (n)))
@@ -101,6 +109,10 @@
101 __insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (mask)) 109 __insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (mask))
102#define interrupt_mask_reset_mask(mask) \ 110#define interrupt_mask_reset_mask(mask) \
103 __insn_mtspr(SPR_INTERRUPT_MASK_RESET_K, (mask)) 111 __insn_mtspr(SPR_INTERRUPT_MASK_RESET_K, (mask))
112#define interrupt_mask_save_mask() \
113 __insn_mfspr(SPR_INTERRUPT_MASK_K)
114#define interrupt_mask_restore_mask(mask) \
115 __insn_mtspr(SPR_INTERRUPT_MASK_K, (mask))
104#endif 116#endif
105 117
106/* 118/*
@@ -122,7 +134,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
122 134
123/* Disable all interrupts, including NMIs. */ 135/* Disable all interrupts, including NMIs. */
124#define arch_local_irq_disable_all() \ 136#define arch_local_irq_disable_all() \
125 interrupt_mask_set_mask(-1UL) 137 interrupt_mask_set_mask(-1ULL)
126 138
127/* Re-enable all maskable interrupts. */ 139/* Re-enable all maskable interrupts. */
128#define arch_local_irq_enable() \ 140#define arch_local_irq_enable() \
@@ -179,7 +191,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
179#ifdef __tilegx__ 191#ifdef __tilegx__
180 192
181#if INT_MEM_ERROR != 0 193#if INT_MEM_ERROR != 0
182# error Fix IRQ_DISABLED() macro 194# error Fix IRQS_DISABLED() macro
183#endif 195#endif
184 196
185/* Return 0 or 1 to indicate whether interrupts are currently disabled. */ 197/* Return 0 or 1 to indicate whether interrupts are currently disabled. */
@@ -207,9 +219,10 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
207 mtspr SPR_INTERRUPT_MASK_SET_K, tmp 219 mtspr SPR_INTERRUPT_MASK_SET_K, tmp
208 220
209/* Enable interrupts. */ 221/* Enable interrupts. */
210#define IRQ_ENABLE(tmp0, tmp1) \ 222#define IRQ_ENABLE_LOAD(tmp0, tmp1) \
211 GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0); \ 223 GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0); \
212 ld tmp0, tmp0; \ 224 ld tmp0, tmp0
225#define IRQ_ENABLE_APPLY(tmp0, tmp1) \
213 mtspr SPR_INTERRUPT_MASK_RESET_K, tmp0 226 mtspr SPR_INTERRUPT_MASK_RESET_K, tmp0
214 227
215#else /* !__tilegx__ */ 228#else /* !__tilegx__ */
@@ -253,17 +266,22 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
253 mtspr SPR_INTERRUPT_MASK_SET_K_1, tmp 266 mtspr SPR_INTERRUPT_MASK_SET_K_1, tmp
254 267
255/* Enable interrupts. */ 268/* Enable interrupts. */
256#define IRQ_ENABLE(tmp0, tmp1) \ 269#define IRQ_ENABLE_LOAD(tmp0, tmp1) \
257 GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0); \ 270 GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0); \
258 { \ 271 { \
259 lw tmp0, tmp0; \ 272 lw tmp0, tmp0; \
260 addi tmp1, tmp0, 4 \ 273 addi tmp1, tmp0, 4 \
261 }; \ 274 }; \
262 lw tmp1, tmp1; \ 275 lw tmp1, tmp1
276#define IRQ_ENABLE_APPLY(tmp0, tmp1) \
263 mtspr SPR_INTERRUPT_MASK_RESET_K_0, tmp0; \ 277 mtspr SPR_INTERRUPT_MASK_RESET_K_0, tmp0; \
264 mtspr SPR_INTERRUPT_MASK_RESET_K_1, tmp1 278 mtspr SPR_INTERRUPT_MASK_RESET_K_1, tmp1
265#endif 279#endif
266 280
281#define IRQ_ENABLE(tmp0, tmp1) \
282 IRQ_ENABLE_LOAD(tmp0, tmp1); \
283 IRQ_ENABLE_APPLY(tmp0, tmp1)
284
267/* 285/*
268 * Do the CPU's IRQ-state tracing from assembly code. We call a 286 * Do the CPU's IRQ-state tracing from assembly code. We call a
269 * C function, but almost everywhere we do, we don't mind clobbering 287 * C function, but almost everywhere we do, we don't mind clobbering
diff --git a/arch/tile/include/asm/kexec.h b/arch/tile/include/asm/kexec.h
index c11a6cc73bb8..fc98ccfc98ac 100644
--- a/arch/tile/include/asm/kexec.h
+++ b/arch/tile/include/asm/kexec.h
@@ -19,12 +19,24 @@
19 19
20#include <asm/page.h> 20#include <asm/page.h>
21 21
22#ifndef __tilegx__
22/* Maximum physical address we can use pages from. */ 23/* Maximum physical address we can use pages from. */
23#define KEXEC_SOURCE_MEMORY_LIMIT TASK_SIZE 24#define KEXEC_SOURCE_MEMORY_LIMIT TASK_SIZE
24/* Maximum address we can reach in physical address mode. */ 25/* Maximum address we can reach in physical address mode. */
25#define KEXEC_DESTINATION_MEMORY_LIMIT TASK_SIZE 26#define KEXEC_DESTINATION_MEMORY_LIMIT TASK_SIZE
26/* Maximum address we can use for the control code buffer. */ 27/* Maximum address we can use for the control code buffer. */
27#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE 28#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
29#else
30/* We need to limit the memory below PGDIR_SIZE since
31 * we only setup page table for [0, PGDIR_SIZE) before final kexec.
32 */
33/* Maximum physical address we can use pages from. */
34#define KEXEC_SOURCE_MEMORY_LIMIT PGDIR_SIZE
35/* Maximum address we can reach in physical address mode. */
36#define KEXEC_DESTINATION_MEMORY_LIMIT PGDIR_SIZE
37/* Maximum address we can use for the control code buffer. */
38#define KEXEC_CONTROL_MEMORY_LIMIT PGDIR_SIZE
39#endif
28 40
29#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE 41#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE
30 42
diff --git a/arch/tile/include/asm/mmu.h b/arch/tile/include/asm/mmu.h
index 92f94c77b6e4..e2c789096795 100644
--- a/arch/tile/include/asm/mmu.h
+++ b/arch/tile/include/asm/mmu.h
@@ -21,7 +21,7 @@ struct mm_context {
21 * Written under the mmap_sem semaphore; read without the 21 * Written under the mmap_sem semaphore; read without the
22 * semaphore but atomically, but it is conservatively set. 22 * semaphore but atomically, but it is conservatively set.
23 */ 23 */
24 unsigned int priority_cached; 24 unsigned long priority_cached;
25}; 25};
26 26
27typedef struct mm_context mm_context_t; 27typedef struct mm_context mm_context_t;
diff --git a/arch/tile/include/asm/mmu_context.h b/arch/tile/include/asm/mmu_context.h
index 15fb24641120..37f0b741dee7 100644
--- a/arch/tile/include/asm/mmu_context.h
+++ b/arch/tile/include/asm/mmu_context.h
@@ -30,11 +30,15 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
30 return 0; 30 return 0;
31} 31}
32 32
33/* Note that arch/tile/kernel/head.S also calls hv_install_context() */ 33/*
34 * Note that arch/tile/kernel/head_NN.S and arch/tile/mm/migrate_NN.S
35 * also call hv_install_context().
36 */
34static inline void __install_page_table(pgd_t *pgdir, int asid, pgprot_t prot) 37static inline void __install_page_table(pgd_t *pgdir, int asid, pgprot_t prot)
35{ 38{
36 /* FIXME: DIRECTIO should not always be set. FIXME. */ 39 /* FIXME: DIRECTIO should not always be set. FIXME. */
37 int rc = hv_install_context(__pa(pgdir), prot, asid, HV_CTX_DIRECTIO); 40 int rc = hv_install_context(__pa(pgdir), prot, asid,
41 HV_CTX_DIRECTIO | CTX_PAGE_FLAG);
38 if (rc < 0) 42 if (rc < 0)
39 panic("hv_install_context failed: %d", rc); 43 panic("hv_install_context failed: %d", rc);
40} 44}
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
new file mode 100644
index 000000000000..44ed07ccd3d2
--- /dev/null
+++ b/arch/tile/include/asm/module.h
@@ -0,0 +1,40 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#ifndef _ASM_TILE_MODULE_H
16#define _ASM_TILE_MODULE_H
17
18#include <arch/chip.h>
19
20#include <asm-generic/module.h>
21
22/* We can't use modules built with different page sizes. */
23#if defined(CONFIG_PAGE_SIZE_16KB)
24# define MODULE_PGSZ " 16KB"
25#elif defined(CONFIG_PAGE_SIZE_64KB)
26# define MODULE_PGSZ " 64KB"
27#else
28# define MODULE_PGSZ ""
29#endif
30
31/* We don't really support no-SMP so tag if someone tries. */
32#ifdef CONFIG_SMP
33#define MODULE_NOSMP ""
34#else
35#define MODULE_NOSMP " nosmp"
36#endif
37
38#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
39
40#endif /* _ASM_TILE_MODULE_H */
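
For illustration: with CONFIG_PAGE_SIZE_64KB and CONFIG_SMP set, MODULE_ARCH_VERMAGIC expands to CHIP_ARCH_NAME " 64KB", e.g. "tilegx 64KB" if <arch/chip.h> defines CHIP_ARCH_NAME that way (that value is an assumption, not shown in this patch); a non-SMP build would instead append " nosmp". Modules built with a different page size or SMP setting therefore fail the vermagic check at load time.
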
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index db93518fac03..9d9131e5c552 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -20,8 +20,17 @@
20#include <arch/chip.h> 20#include <arch/chip.h>
21 21
22/* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */ 22/* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */
23#define PAGE_SHIFT HV_LOG2_PAGE_SIZE_SMALL 23#if defined(CONFIG_PAGE_SIZE_16KB)
24#define HPAGE_SHIFT HV_LOG2_PAGE_SIZE_LARGE 24#define PAGE_SHIFT 14
25#define CTX_PAGE_FLAG HV_CTX_PG_SM_16K
26#elif defined(CONFIG_PAGE_SIZE_64KB)
27#define PAGE_SHIFT 16
28#define CTX_PAGE_FLAG HV_CTX_PG_SM_64K
29#else
30#define PAGE_SHIFT HV_LOG2_DEFAULT_PAGE_SIZE_SMALL
31#define CTX_PAGE_FLAG 0
32#endif
33#define HPAGE_SHIFT HV_LOG2_DEFAULT_PAGE_SIZE_LARGE
25 34
26#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) 35#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
27#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) 36#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
@@ -78,8 +87,7 @@ typedef HV_PTE pgprot_t;
78/* 87/*
79 * User L2 page tables are managed as one L2 page table per page, 88 * User L2 page tables are managed as one L2 page table per page,
80 * because we use the page allocator for them. This keeps the allocation 89 * because we use the page allocator for them. This keeps the allocation
81 * simple and makes it potentially useful to implement HIGHPTE at some point. 90 * simple, but it's also inefficient, since L2 page tables are much smaller
82 * However, it's also inefficient, since L2 page tables are much smaller
83 * than pages (currently 2KB vs 64KB). So we should revisit this. 91 * than pages (currently 2KB vs 64KB). So we should revisit this.
84 */ 92 */
85typedef struct page *pgtable_t; 93typedef struct page *pgtable_t;
@@ -128,7 +136,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
128 136
129#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) 137#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
130 138
131#define HUGE_MAX_HSTATE 2 139#define HUGE_MAX_HSTATE 6
132 140
133#ifdef CONFIG_HUGETLB_PAGE 141#ifdef CONFIG_HUGETLB_PAGE
134#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA 142#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/arch/tile/include/asm/pgalloc.h b/arch/tile/include/asm/pgalloc.h
index e919c0bdc22d..1b902508b664 100644
--- a/arch/tile/include/asm/pgalloc.h
+++ b/arch/tile/include/asm/pgalloc.h
@@ -19,24 +19,24 @@
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/mmzone.h> 20#include <linux/mmzone.h>
21#include <asm/fixmap.h> 21#include <asm/fixmap.h>
22#include <asm/page.h>
22#include <hv/hypervisor.h> 23#include <hv/hypervisor.h>
23 24
24/* Bits for the size of the second-level page table. */ 25/* Bits for the size of the second-level page table. */
25#define L2_KERNEL_PGTABLE_SHIFT \ 26#define L2_KERNEL_PGTABLE_SHIFT _HV_LOG2_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
26 (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL + HV_LOG2_PTE_SIZE) 27
28/* How big is a kernel L2 page table? */
29#define L2_KERNEL_PGTABLE_SIZE (1UL << L2_KERNEL_PGTABLE_SHIFT)
27 30
28/* We currently allocate user L2 page tables by page (unlike kernel L2s). */ 31/* We currently allocate user L2 page tables by page (unlike kernel L2s). */
29#if L2_KERNEL_PGTABLE_SHIFT < HV_LOG2_PAGE_SIZE_SMALL 32#if L2_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
30#define L2_USER_PGTABLE_SHIFT HV_LOG2_PAGE_SIZE_SMALL 33#define L2_USER_PGTABLE_SHIFT PAGE_SHIFT
31#else 34#else
32#define L2_USER_PGTABLE_SHIFT L2_KERNEL_PGTABLE_SHIFT 35#define L2_USER_PGTABLE_SHIFT L2_KERNEL_PGTABLE_SHIFT
33#endif 36#endif
34 37
35/* How many pages do we need, as an "order", for a user L2 page table? */ 38/* How many pages do we need, as an "order", for a user L2 page table? */
36#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - HV_LOG2_PAGE_SIZE_SMALL) 39#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - PAGE_SHIFT)
37
38/* How big is a kernel L2 page table? */
39#define L2_KERNEL_PGTABLE_SIZE (1 << L2_KERNEL_PGTABLE_SHIFT)
40 40
41static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 41static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
42{ 42{
@@ -50,14 +50,14 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
50static inline void pmd_populate_kernel(struct mm_struct *mm, 50static inline void pmd_populate_kernel(struct mm_struct *mm,
51 pmd_t *pmd, pte_t *ptep) 51 pmd_t *pmd, pte_t *ptep)
52{ 52{
53 set_pmd(pmd, ptfn_pmd(__pa(ptep) >> HV_LOG2_PAGE_TABLE_ALIGN, 53 set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(__pa(ptep)),
54 __pgprot(_PAGE_PRESENT))); 54 __pgprot(_PAGE_PRESENT)));
55} 55}
56 56
57static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, 57static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
58 pgtable_t page) 58 pgtable_t page)
59{ 59{
60 set_pmd(pmd, ptfn_pmd(HV_PFN_TO_PTFN(page_to_pfn(page)), 60 set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(PFN_PHYS(page_to_pfn(page))),
61 __pgprot(_PAGE_PRESENT))); 61 __pgprot(_PAGE_PRESENT)));
62} 62}
63 63
@@ -68,8 +68,20 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
68extern pgd_t *pgd_alloc(struct mm_struct *mm); 68extern pgd_t *pgd_alloc(struct mm_struct *mm);
69extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); 69extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
70 70
71extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address); 71extern pgtable_t pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
72extern void pte_free(struct mm_struct *mm, struct page *pte); 72 int order);
73extern void pgtable_free(struct mm_struct *mm, struct page *pte, int order);
74
75static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
76 unsigned long address)
77{
78 return pgtable_alloc_one(mm, address, L2_USER_PGTABLE_ORDER);
79}
80
81static inline void pte_free(struct mm_struct *mm, struct page *pte)
82{
83 pgtable_free(mm, pte, L2_USER_PGTABLE_ORDER);
84}
73 85
74#define pmd_pgtable(pmd) pmd_page(pmd) 86#define pmd_pgtable(pmd) pmd_page(pmd)
75 87
@@ -85,8 +97,13 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
85 pte_free(mm, virt_to_page(pte)); 97 pte_free(mm, virt_to_page(pte));
86} 98}
87 99
88extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, 100extern void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
89 unsigned long address); 101 unsigned long address, int order);
102static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
103 unsigned long address)
104{
105 __pgtable_free_tlb(tlb, pte, address, L2_USER_PGTABLE_ORDER);
106}
90 107
91#define check_pgt_cache() do { } while (0) 108#define check_pgt_cache() do { } while (0)
92 109
@@ -104,19 +121,44 @@ void shatter_pmd(pmd_t *pmd);
104void shatter_huge_page(unsigned long addr); 121void shatter_huge_page(unsigned long addr);
105 122
106#ifdef __tilegx__ 123#ifdef __tilegx__
107/* We share a single page allocator for both L1 and L2 page tables. */ 124
108#if HV_L1_SIZE != HV_L2_SIZE
109# error Rework assumption that L1 and L2 page tables are same size.
110#endif
111#define L1_USER_PGTABLE_ORDER L2_USER_PGTABLE_ORDER
112#define pud_populate(mm, pud, pmd) \ 125#define pud_populate(mm, pud, pmd) \
113 pmd_populate_kernel((mm), (pmd_t *)(pud), (pte_t *)(pmd)) 126 pmd_populate_kernel((mm), (pmd_t *)(pud), (pte_t *)(pmd))
114#define pmd_alloc_one(mm, addr) \ 127
115 ((pmd_t *)page_to_virt(pte_alloc_one((mm), (addr)))) 128/* Bits for the size of the L1 (intermediate) page table. */
116#define pmd_free(mm, pmdp) \ 129#define L1_KERNEL_PGTABLE_SHIFT _HV_LOG2_L1_SIZE(HPAGE_SHIFT)
117 pte_free((mm), virt_to_page(pmdp)) 130
118#define __pmd_free_tlb(tlb, pmdp, address) \ 131/* How big is a kernel L2 page table? */
119 __pte_free_tlb((tlb), virt_to_page(pmdp), (address)) 132#define L1_KERNEL_PGTABLE_SIZE (1UL << L1_KERNEL_PGTABLE_SHIFT)
133
134/* We currently allocate L1 page tables by page. */
135#if L1_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
136#define L1_USER_PGTABLE_SHIFT PAGE_SHIFT
137#else
138#define L1_USER_PGTABLE_SHIFT L1_KERNEL_PGTABLE_SHIFT
120#endif 139#endif
121 140
141/* How many pages do we need, as an "order", for an L1 page table? */
142#define L1_USER_PGTABLE_ORDER (L1_USER_PGTABLE_SHIFT - PAGE_SHIFT)
143
144static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
145{
146 struct page *p = pgtable_alloc_one(mm, address, L1_USER_PGTABLE_ORDER);
147 return (pmd_t *)page_to_virt(p);
148}
149
150static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp)
151{
152 pgtable_free(mm, virt_to_page(pmdp), L1_USER_PGTABLE_ORDER);
153}
154
155static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
156 unsigned long address)
157{
158 __pgtable_free_tlb(tlb, virt_to_page(pmdp), address,
159 L1_USER_PGTABLE_ORDER);
160}
161
162#endif /* __tilegx__ */
163
122#endif /* _ASM_TILE_PGALLOC_H */ 164#endif /* _ASM_TILE_PGALLOC_H */
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 67490910774d..73b1a4c9ad03 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -27,8 +27,10 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/spinlock.h> 29#include <linux/spinlock.h>
30#include <linux/pfn.h>
30#include <asm/processor.h> 31#include <asm/processor.h>
31#include <asm/fixmap.h> 32#include <asm/fixmap.h>
33#include <asm/page.h>
32 34
33struct mm_struct; 35struct mm_struct;
34struct vm_area_struct; 36struct vm_area_struct;
@@ -69,6 +71,7 @@ extern void set_page_homes(void);
69 71
70#define _PAGE_PRESENT HV_PTE_PRESENT 72#define _PAGE_PRESENT HV_PTE_PRESENT
71#define _PAGE_HUGE_PAGE HV_PTE_PAGE 73#define _PAGE_HUGE_PAGE HV_PTE_PAGE
74#define _PAGE_SUPER_PAGE HV_PTE_SUPER
72#define _PAGE_READABLE HV_PTE_READABLE 75#define _PAGE_READABLE HV_PTE_READABLE
73#define _PAGE_WRITABLE HV_PTE_WRITABLE 76#define _PAGE_WRITABLE HV_PTE_WRITABLE
74#define _PAGE_EXECUTABLE HV_PTE_EXECUTABLE 77#define _PAGE_EXECUTABLE HV_PTE_EXECUTABLE
@@ -85,6 +88,7 @@ extern void set_page_homes(void);
85#define _PAGE_ALL (\ 88#define _PAGE_ALL (\
86 _PAGE_PRESENT | \ 89 _PAGE_PRESENT | \
87 _PAGE_HUGE_PAGE | \ 90 _PAGE_HUGE_PAGE | \
91 _PAGE_SUPER_PAGE | \
88 _PAGE_READABLE | \ 92 _PAGE_READABLE | \
89 _PAGE_WRITABLE | \ 93 _PAGE_WRITABLE | \
90 _PAGE_EXECUTABLE | \ 94 _PAGE_EXECUTABLE | \
@@ -162,7 +166,7 @@ extern void set_page_homes(void);
162 (pgprot_t) { ((oldprot).val & ~_PAGE_ALL) | (newprot).val } 166 (pgprot_t) { ((oldprot).val & ~_PAGE_ALL) | (newprot).val }
163 167
164/* Just setting the PFN to zero suffices. */ 168/* Just setting the PFN to zero suffices. */
165#define pte_pgprot(x) hv_pte_set_pfn((x), 0) 169#define pte_pgprot(x) hv_pte_set_pa((x), 0)
166 170
167/* 171/*
168 * For PTEs and PDEs, we must clear the Present bit first when 172 * For PTEs and PDEs, we must clear the Present bit first when
@@ -187,6 +191,7 @@ static inline void __pte_clear(pte_t *ptep)
187 * Undefined behaviour if not.. 191 * Undefined behaviour if not..
188 */ 192 */
189#define pte_present hv_pte_get_present 193#define pte_present hv_pte_get_present
194#define pte_mknotpresent hv_pte_clear_present
190#define pte_user hv_pte_get_user 195#define pte_user hv_pte_get_user
191#define pte_read hv_pte_get_readable 196#define pte_read hv_pte_get_readable
192#define pte_dirty hv_pte_get_dirty 197#define pte_dirty hv_pte_get_dirty
@@ -194,6 +199,7 @@ static inline void __pte_clear(pte_t *ptep)
194#define pte_write hv_pte_get_writable 199#define pte_write hv_pte_get_writable
195#define pte_exec hv_pte_get_executable 200#define pte_exec hv_pte_get_executable
196#define pte_huge hv_pte_get_page 201#define pte_huge hv_pte_get_page
202#define pte_super hv_pte_get_super
197#define pte_rdprotect hv_pte_clear_readable 203#define pte_rdprotect hv_pte_clear_readable
198#define pte_exprotect hv_pte_clear_executable 204#define pte_exprotect hv_pte_clear_executable
199#define pte_mkclean hv_pte_clear_dirty 205#define pte_mkclean hv_pte_clear_dirty
@@ -206,6 +212,7 @@ static inline void __pte_clear(pte_t *ptep)
206#define pte_mkyoung hv_pte_set_accessed 212#define pte_mkyoung hv_pte_set_accessed
207#define pte_mkwrite hv_pte_set_writable 213#define pte_mkwrite hv_pte_set_writable
208#define pte_mkhuge hv_pte_set_page 214#define pte_mkhuge hv_pte_set_page
215#define pte_mksuper hv_pte_set_super
209 216
210#define pte_special(pte) 0 217#define pte_special(pte) 0
211#define pte_mkspecial(pte) (pte) 218#define pte_mkspecial(pte) (pte)
@@ -261,7 +268,7 @@ static inline int pte_none(pte_t pte)
261 268
262static inline unsigned long pte_pfn(pte_t pte) 269static inline unsigned long pte_pfn(pte_t pte)
263{ 270{
264 return hv_pte_get_pfn(pte); 271 return PFN_DOWN(hv_pte_get_pa(pte));
265} 272}
266 273
267/* Set or get the remote cache cpu in a pgprot with remote caching. */ 274/* Set or get the remote cache cpu in a pgprot with remote caching. */
@@ -270,7 +277,7 @@ extern int get_remote_cache_cpu(pgprot_t prot);
270 277
271static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) 278static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
272{ 279{
273 return hv_pte_set_pfn(prot, pfn); 280 return hv_pte_set_pa(prot, PFN_PHYS(pfn));
274} 281}
275 282
276/* Support for priority mappings. */ 283/* Support for priority mappings. */
@@ -312,7 +319,7 @@ extern void check_mm_caching(struct mm_struct *prev, struct mm_struct *next);
312 */ 319 */
313static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) 320static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
314{ 321{
315 return pfn_pte(hv_pte_get_pfn(pte), newprot); 322 return pfn_pte(pte_pfn(pte), newprot);
316} 323}
317 324
318/* 325/*
@@ -335,13 +342,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
335 */ 342 */
336#define pgd_offset_k(address) pgd_offset(&init_mm, address) 343#define pgd_offset_k(address) pgd_offset(&init_mm, address)
337 344
338#if defined(CONFIG_HIGHPTE)
339extern pte_t *pte_offset_map(pmd_t *, unsigned long address);
340#define pte_unmap(pte) kunmap_atomic(pte)
341#else
342#define pte_offset_map(dir, address) pte_offset_kernel(dir, address) 345#define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
343#define pte_unmap(pte) do { } while (0) 346#define pte_unmap(pte) do { } while (0)
344#endif
345 347
346/* Clear a non-executable kernel PTE and flush it from the TLB. */ 348/* Clear a non-executable kernel PTE and flush it from the TLB. */
347#define kpte_clear_flush(ptep, vaddr) \ 349#define kpte_clear_flush(ptep, vaddr) \
@@ -410,6 +412,46 @@ static inline unsigned long pmd_index(unsigned long address)
410 return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); 412 return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
411} 413}
412 414
415#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
416static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
417 unsigned long address,
418 pmd_t *pmdp)
419{
420 return ptep_test_and_clear_young(vma, address, pmdp_ptep(pmdp));
421}
422
423#define __HAVE_ARCH_PMDP_SET_WRPROTECT
424static inline void pmdp_set_wrprotect(struct mm_struct *mm,
425 unsigned long address, pmd_t *pmdp)
426{
427 ptep_set_wrprotect(mm, address, pmdp_ptep(pmdp));
428}
429
430
431#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
432static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
433 unsigned long address,
434 pmd_t *pmdp)
435{
436 return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp)));
437}
438
439static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
440{
441 set_pte(pmdp_ptep(pmdp), pmd_pte(pmdval));
442}
443
444#define set_pmd_at(mm, addr, pmdp, pmdval) __set_pmd(pmdp, pmdval)
445
446/* Create a pmd from a PTFN. */
447static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
448{
449 return pte_pmd(hv_pte_set_ptfn(prot, ptfn));
450}
451
452/* Return the page-table frame number (ptfn) that a pmd_t points at. */
453#define pmd_ptfn(pmd) hv_pte_get_ptfn(pmd_pte(pmd))
454
413/* 455/*
414 * A given kernel pmd_t maps to a specific virtual address (either a 456 * A given kernel pmd_t maps to a specific virtual address (either a
415 * kernel huge page or a kernel pte_t table). Since kernel pte_t 457 * kernel huge page or a kernel pte_t table). Since kernel pte_t
@@ -430,7 +472,48 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
430 * OK for pte_lockptr(), since we just end up with potentially one 472 * OK for pte_lockptr(), since we just end up with potentially one
431 * lock being used for several pte_t arrays. 473 * lock being used for several pte_t arrays.
432 */ 474 */
433#define pmd_page(pmd) pfn_to_page(HV_PTFN_TO_PFN(pmd_ptfn(pmd))) 475#define pmd_page(pmd) pfn_to_page(PFN_DOWN(HV_PTFN_TO_CPA(pmd_ptfn(pmd))))
476
477static inline void pmd_clear(pmd_t *pmdp)
478{
479 __pte_clear(pmdp_ptep(pmdp));
480}
481
482#define pmd_mknotpresent(pmd) pte_pmd(pte_mknotpresent(pmd_pte(pmd)))
483#define pmd_young(pmd) pte_young(pmd_pte(pmd))
484#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
485#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
486#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
487#define pmd_write(pmd) pte_write(pmd_pte(pmd))
488#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
489#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
490#define pmd_huge_page(pmd) pte_huge(pmd_pte(pmd))
491#define pmd_mkhuge(pmd) pte_pmd(pte_mkhuge(pmd_pte(pmd)))
492#define __HAVE_ARCH_PMD_WRITE
493
494#define pfn_pmd(pfn, pgprot) pte_pmd(pfn_pte((pfn), (pgprot)))
495#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
496#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
497
498static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
499{
500 return pfn_pmd(pmd_pfn(pmd), newprot);
501}
502
503#ifdef CONFIG_TRANSPARENT_HUGEPAGE
504#define has_transparent_hugepage() 1
505#define pmd_trans_huge pmd_huge_page
506
507static inline pmd_t pmd_mksplitting(pmd_t pmd)
508{
509 return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
510}
511
512static inline int pmd_trans_splitting(pmd_t pmd)
513{
514 return hv_pte_get_client2(pmd_pte(pmd));
515}
516#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
434 517
435/* 518/*
436 * The pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] 519 * The pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
@@ -448,17 +531,13 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
448 return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); 531 return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
449} 532}
450 533
451static inline int pmd_huge_page(pmd_t pmd)
452{
453 return pmd_val(pmd) & _PAGE_HUGE_PAGE;
454}
455
456#include <asm-generic/pgtable.h> 534#include <asm-generic/pgtable.h>
457 535
458/* Support /proc/NN/pgtable API. */ 536/* Support /proc/NN/pgtable API. */
459struct seq_file; 537struct seq_file;
460int arch_proc_pgtable_show(struct seq_file *m, struct mm_struct *mm, 538int arch_proc_pgtable_show(struct seq_file *m, struct mm_struct *mm,
461 unsigned long vaddr, pte_t *ptep, void **datap); 539 unsigned long vaddr, unsigned long pagesize,
540 pte_t *ptep, void **datap);
462 541
463#endif /* !__ASSEMBLY__ */ 542#endif /* !__ASSEMBLY__ */
464 543
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index 9f98529761fd..4ce4a7a99c24 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -20,11 +20,12 @@
20 * The level-1 index is defined by the huge page size. A PGD is composed 20 * The level-1 index is defined by the huge page size. A PGD is composed
21 * of PTRS_PER_PGD pgd_t's and is the top level of the page table. 21 * of PTRS_PER_PGD pgd_t's and is the top level of the page table.
22 */ 22 */
23#define PGDIR_SHIFT HV_LOG2_PAGE_SIZE_LARGE 23#define PGDIR_SHIFT HPAGE_SHIFT
24#define PGDIR_SIZE HV_PAGE_SIZE_LARGE 24#define PGDIR_SIZE HPAGE_SIZE
25#define PGDIR_MASK (~(PGDIR_SIZE-1)) 25#define PGDIR_MASK (~(PGDIR_SIZE-1))
26#define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) 26#define PTRS_PER_PGD _HV_L1_ENTRIES(HPAGE_SHIFT)
27#define SIZEOF_PGD (PTRS_PER_PGD * sizeof(pgd_t)) 27#define PGD_INDEX(va) _HV_L1_INDEX(va, HPAGE_SHIFT)
28#define SIZEOF_PGD _HV_L1_SIZE(HPAGE_SHIFT)
28 29
29/* 30/*
30 * The level-2 index is defined by the difference between the huge 31 * The level-2 index is defined by the difference between the huge
@@ -33,8 +34,9 @@
33 * Note that the hypervisor docs use PTE for what we call pte_t, so 34 * Note that the hypervisor docs use PTE for what we call pte_t, so
34 * this nomenclature is somewhat confusing. 35 * this nomenclature is somewhat confusing.
35 */ 36 */
36#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL)) 37#define PTRS_PER_PTE _HV_L2_ENTRIES(HPAGE_SHIFT, PAGE_SHIFT)
37#define SIZEOF_PTE (PTRS_PER_PTE * sizeof(pte_t)) 38#define PTE_INDEX(va) _HV_L2_INDEX(va, HPAGE_SHIFT, PAGE_SHIFT)
39#define SIZEOF_PTE _HV_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
38 40
39#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
40 42
@@ -111,24 +113,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
111 return pte; 113 return pte;
112} 114}
113 115
114static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval) 116/*
115{ 117 * pmds are wrappers around pgds, which are the same as ptes.
116 set_pte(&pmdp->pud.pgd, pmdval.pud.pgd); 118 * It's often convenient to "cast" back and forth and use the pte methods,
117} 119 * which are the methods supplied by the hypervisor.
118 120 */
119/* Create a pmd from a PTFN. */ 121#define pmd_pte(pmd) ((pmd).pud.pgd)
120static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot) 122#define pmdp_ptep(pmdp) (&(pmdp)->pud.pgd)
121{ 123#define pte_pmd(pte) ((pmd_t){ { (pte) } })
122 return (pmd_t){ { hv_pte_set_ptfn(prot, ptfn) } };
123}
124
125/* Return the page-table frame number (ptfn) that a pmd_t points at. */
126#define pmd_ptfn(pmd) hv_pte_get_ptfn((pmd).pud.pgd)
127
128static inline void pmd_clear(pmd_t *pmdp)
129{
130 __pte_clear(&pmdp->pud.pgd);
131}
132 124
133#endif /* __ASSEMBLY__ */ 125#endif /* __ASSEMBLY__ */
134 126
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index fd80328523b4..2492fa5478e7 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -21,17 +21,19 @@
21#define PGDIR_SIZE HV_L1_SPAN 21#define PGDIR_SIZE HV_L1_SPAN
22#define PGDIR_MASK (~(PGDIR_SIZE-1)) 22#define PGDIR_MASK (~(PGDIR_SIZE-1))
23#define PTRS_PER_PGD HV_L0_ENTRIES 23#define PTRS_PER_PGD HV_L0_ENTRIES
24#define SIZEOF_PGD (PTRS_PER_PGD * sizeof(pgd_t)) 24#define PGD_INDEX(va) HV_L0_INDEX(va)
25#define SIZEOF_PGD HV_L0_SIZE
25 26
26/* 27/*
27 * The level-1 index is defined by the huge page size. A PMD is composed 28 * The level-1 index is defined by the huge page size. A PMD is composed
28 * of PTRS_PER_PMD pgd_t's and is the middle level of the page table. 29 * of PTRS_PER_PMD pgd_t's and is the middle level of the page table.
29 */ 30 */
30#define PMD_SHIFT HV_LOG2_PAGE_SIZE_LARGE 31#define PMD_SHIFT HPAGE_SHIFT
31#define PMD_SIZE HV_PAGE_SIZE_LARGE 32#define PMD_SIZE HPAGE_SIZE
32#define PMD_MASK (~(PMD_SIZE-1)) 33#define PMD_MASK (~(PMD_SIZE-1))
33#define PTRS_PER_PMD (1 << (PGDIR_SHIFT - PMD_SHIFT)) 34#define PTRS_PER_PMD _HV_L1_ENTRIES(HPAGE_SHIFT)
34#define SIZEOF_PMD (PTRS_PER_PMD * sizeof(pmd_t)) 35#define PMD_INDEX(va) _HV_L1_INDEX(va, HPAGE_SHIFT)
36#define SIZEOF_PMD _HV_L1_SIZE(HPAGE_SHIFT)
35 37
36/* 38/*
37 * The level-2 index is defined by the difference between the huge 39 * The level-2 index is defined by the difference between the huge
@@ -40,17 +42,19 @@
40 * Note that the hypervisor docs use PTE for what we call pte_t, so 42 * Note that the hypervisor docs use PTE for what we call pte_t, so
41 * this nomenclature is somewhat confusing. 43 * this nomenclature is somewhat confusing.
42 */ 44 */
43#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL)) 45#define PTRS_PER_PTE _HV_L2_ENTRIES(HPAGE_SHIFT, PAGE_SHIFT)
44#define SIZEOF_PTE (PTRS_PER_PTE * sizeof(pte_t)) 46#define PTE_INDEX(va) _HV_L2_INDEX(va, HPAGE_SHIFT, PAGE_SHIFT)
47#define SIZEOF_PTE _HV_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
45 48
46/* 49/*
47 * Align the vmalloc area to an L2 page table, and leave a guard page 50 * Align the vmalloc area to an L2 page table. Omit guard pages at
48 * at the beginning and end. The vmalloc code also puts in an internal 51 * the beginning and end for simplicity (particularly in the per-cpu
52 * memory allocation code). The vmalloc code puts in an internal
49 * guard page between each allocation. 53 * guard page between each allocation.
50 */ 54 */
51#define _VMALLOC_END HUGE_VMAP_BASE 55#define _VMALLOC_END HUGE_VMAP_BASE
52#define VMALLOC_END (_VMALLOC_END - PAGE_SIZE) 56#define VMALLOC_END _VMALLOC_END
53#define VMALLOC_START (_VMALLOC_START + PAGE_SIZE) 57#define VMALLOC_START _VMALLOC_START
54 58
55#define HUGE_VMAP_END (HUGE_VMAP_BASE + PGDIR_SIZE) 59#define HUGE_VMAP_END (HUGE_VMAP_BASE + PGDIR_SIZE)
56 60
@@ -98,7 +102,7 @@ static inline int pud_bad(pud_t pud)
98 * A pud_t points to a pmd_t array. Since we can have multiple per 102 * A pud_t points to a pmd_t array. Since we can have multiple per
99 * page, we don't have a one-to-one mapping of pud_t's to pages. 103 * page, we don't have a one-to-one mapping of pud_t's to pages.
100 */ 104 */
101#define pud_page(pud) pfn_to_page(HV_PTFN_TO_PFN(pud_ptfn(pud))) 105#define pud_page(pud) pfn_to_page(PFN_DOWN(HV_PTFN_TO_CPA(pud_ptfn(pud))))
102 106
103static inline unsigned long pud_index(unsigned long address) 107static inline unsigned long pud_index(unsigned long address)
104{ 108{
@@ -108,28 +112,6 @@ static inline unsigned long pud_index(unsigned long address)
108#define pmd_offset(pud, address) \ 112#define pmd_offset(pud, address) \
109 ((pmd_t *)pud_page_vaddr(*(pud)) + pmd_index(address)) 113 ((pmd_t *)pud_page_vaddr(*(pud)) + pmd_index(address))
110 114
111static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
112{
113 set_pte(pmdp, pmdval);
114}
115
116/* Create a pmd from a PTFN and pgprot. */
117static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
118{
119 return hv_pte_set_ptfn(prot, ptfn);
120}
121
122/* Return the page-table frame number (ptfn) that a pmd_t points at. */
123static inline unsigned long pmd_ptfn(pmd_t pmd)
124{
125 return hv_pte_get_ptfn(pmd);
126}
127
128static inline void pmd_clear(pmd_t *pmdp)
129{
130 __pte_clear(pmdp);
131}
132
133/* Normalize an address to having the correct high bits set. */ 115/* Normalize an address to having the correct high bits set. */
134#define pgd_addr_normalize pgd_addr_normalize 116#define pgd_addr_normalize pgd_addr_normalize
135static inline unsigned long pgd_addr_normalize(unsigned long addr) 117static inline unsigned long pgd_addr_normalize(unsigned long addr)
@@ -170,6 +152,13 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
170 return hv_pte(__insn_exch(&ptep->val, 0UL)); 152 return hv_pte(__insn_exch(&ptep->val, 0UL));
171} 153}
172 154
155/*
156 * pmds are the same as pgds and ptes, so converting is a no-op.
157 */
158#define pmd_pte(pmd) (pmd)
159#define pmdp_ptep(pmdp) (pmdp)
160#define pte_pmd(pte) (pte)
161
173#endif /* __ASSEMBLY__ */ 162#endif /* __ASSEMBLY__ */
174 163
175#endif /* _ASM_TILE_PGTABLE_64_H */ 164#endif /* _ASM_TILE_PGTABLE_64_H */
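Since the new pmd_pte()/pmdp_ptep()/pte_pmd() macros make a pmd and a pte interchangeable on tilegx, pmd accessors can be written as thin wrappers over the existing pte accessors. A hypothetical wrapper in that style (the example_* names are not in the patch):

/* Illustrative sketch: pmd accessors built on pte accessors via the
 * no-op conversion macros above. The example_* names are hypothetical.
 */
static inline int example_pmd_young(pmd_t pmd)
{
	return pte_young(pmd_pte(pmd));
}

static inline pmd_t example_pmd_mkyoung(pmd_t pmd)
{
	return pte_pmd(pte_mkyoung(pmd_pte(pmd)));
}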
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 15cd8a4a06ce..8c4dd9ff91eb 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -76,6 +76,17 @@ struct async_tlb {
76 76
77#ifdef CONFIG_HARDWALL 77#ifdef CONFIG_HARDWALL
78struct hardwall_info; 78struct hardwall_info;
79struct hardwall_task {
80 /* Which hardwall is this task tied to? (or NULL if none) */
81 struct hardwall_info *info;
82 /* Chains this task into the list at info->task_head. */
83 struct list_head list;
84};
85#ifdef __tilepro__
86#define HARDWALL_TYPES 1 /* udn */
87#else
88#define HARDWALL_TYPES 3 /* udn, idn, and ipi */
89#endif
79#endif 90#endif
80 91
81struct thread_struct { 92struct thread_struct {
@@ -116,10 +127,8 @@ struct thread_struct {
116 unsigned long dstream_pf; 127 unsigned long dstream_pf;
117#endif 128#endif
118#ifdef CONFIG_HARDWALL 129#ifdef CONFIG_HARDWALL
119 /* Is this task tied to an activated hardwall? */ 130 /* Hardwall information for various resources. */
120 struct hardwall_info *hardwall; 131 struct hardwall_task hardwall[HARDWALL_TYPES];
121 /* Chains this task into the list at hardwall->list. */
122 struct list_head hardwall_list;
123#endif 132#endif
124#if CHIP_HAS_TILE_DMA() 133#if CHIP_HAS_TILE_DMA()
125 /* Async DMA TLB fault information */ 134 /* Async DMA TLB fault information */
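thread_struct now carries one hardwall_task per hardwall type (UDN only on tilepro; UDN, IDN and IPI on tilegx) instead of a single hardwall_info pointer plus list head, so code that used to test p->thread.hardwall has to consider each slot. A hypothetical helper illustrating the new layout (not part of the patch):

/* Hypothetical helper, assuming only the per-type layout shown above. */
static inline int example_task_has_hardwall(struct task_struct *p)
{
#ifdef CONFIG_HARDWALL
	int i;

	for (i = 0; i < HARDWALL_TYPES; i++)
		if (p->thread.hardwall[i].info != NULL)
			return 1;
#endif
	return 0;
}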
diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h
index e58613e0752f..c67eb70ea78e 100644
--- a/arch/tile/include/asm/setup.h
+++ b/arch/tile/include/asm/setup.h
@@ -41,15 +41,15 @@ void restrict_dma_mpls(void);
41#ifdef CONFIG_HARDWALL 41#ifdef CONFIG_HARDWALL
42/* User-level network management functions */ 42/* User-level network management functions */
43void reset_network_state(void); 43void reset_network_state(void);
44void grant_network_mpls(void);
45void restrict_network_mpls(void);
46struct task_struct; 44struct task_struct;
47int hardwall_deactivate(struct task_struct *task); 45void hardwall_switch_tasks(struct task_struct *prev, struct task_struct *next);
46void hardwall_deactivate_all(struct task_struct *task);
47int hardwall_ipi_valid(int cpu);
48 48
49/* Hook hardwall code into changes in affinity. */ 49/* Hook hardwall code into changes in affinity. */
50#define arch_set_cpus_allowed(p, new_mask) do { \ 50#define arch_set_cpus_allowed(p, new_mask) do { \
51 if (p->thread.hardwall && !cpumask_equal(&p->cpus_allowed, new_mask)) \ 51 if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
52 hardwall_deactivate(p); \ 52 hardwall_deactivate_all(p); \
53} while (0) 53} while (0)
54#endif 54#endif
55 55
diff --git a/arch/tile/include/asm/syscalls.h b/arch/tile/include/asm/syscalls.h
index 3b5507c31eae..06f0464cfed9 100644
--- a/arch/tile/include/asm/syscalls.h
+++ b/arch/tile/include/asm/syscalls.h
@@ -43,7 +43,8 @@ long sys32_fadvise64(int fd, u32 offset_lo, u32 offset_hi,
43 u32 len, int advice); 43 u32 len, int advice);
44int sys32_fadvise64_64(int fd, u32 offset_lo, u32 offset_hi, 44int sys32_fadvise64_64(int fd, u32 offset_lo, u32 offset_hi,
45 u32 len_lo, u32 len_hi, int advice); 45 u32 len_lo, u32 len_hi, int advice);
46long sys_flush_cache(void); 46long sys_cacheflush(unsigned long addr, unsigned long len,
47 unsigned long flags);
47#ifndef __tilegx__ /* No mmap() in the 32-bit kernel. */ 48#ifndef __tilegx__ /* No mmap() in the 32-bit kernel. */
48#define sys_mmap sys_mmap 49#define sys_mmap sys_mmap
49#endif 50#endif
diff --git a/arch/tile/include/asm/tlbflush.h b/arch/tile/include/asm/tlbflush.h
index 96199d214fb8..dcf91b25a1e5 100644
--- a/arch/tile/include/asm/tlbflush.h
+++ b/arch/tile/include/asm/tlbflush.h
@@ -38,16 +38,11 @@ DECLARE_PER_CPU(int, current_asid);
38/* The hypervisor tells us what ASIDs are available to us. */ 38/* The hypervisor tells us what ASIDs are available to us. */
39extern int min_asid, max_asid; 39extern int min_asid, max_asid;
40 40
41static inline unsigned long hv_page_size(const struct vm_area_struct *vma)
42{
43 return (vma->vm_flags & VM_HUGETLB) ? HPAGE_SIZE : PAGE_SIZE;
44}
45
46/* Pass as vma pointer for non-executable mapping, if no vma available. */ 41/* Pass as vma pointer for non-executable mapping, if no vma available. */
47#define FLUSH_NONEXEC ((const struct vm_area_struct *)-1UL) 42#define FLUSH_NONEXEC ((struct vm_area_struct *)-1UL)
48 43
49/* Flush a single user page on this cpu. */ 44/* Flush a single user page on this cpu. */
50static inline void local_flush_tlb_page(const struct vm_area_struct *vma, 45static inline void local_flush_tlb_page(struct vm_area_struct *vma,
51 unsigned long addr, 46 unsigned long addr,
52 unsigned long page_size) 47 unsigned long page_size)
53{ 48{
@@ -60,7 +55,7 @@ static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
60} 55}
61 56
62/* Flush range of user pages on this cpu. */ 57/* Flush range of user pages on this cpu. */
63static inline void local_flush_tlb_pages(const struct vm_area_struct *vma, 58static inline void local_flush_tlb_pages(struct vm_area_struct *vma,
64 unsigned long addr, 59 unsigned long addr,
65 unsigned long page_size, 60 unsigned long page_size,
66 unsigned long len) 61 unsigned long len)
@@ -117,10 +112,10 @@ extern void flush_tlb_all(void);
117extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); 112extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
118extern void flush_tlb_current_task(void); 113extern void flush_tlb_current_task(void);
119extern void flush_tlb_mm(struct mm_struct *); 114extern void flush_tlb_mm(struct mm_struct *);
120extern void flush_tlb_page(const struct vm_area_struct *, unsigned long); 115extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
121extern void flush_tlb_page_mm(const struct vm_area_struct *, 116extern void flush_tlb_page_mm(struct vm_area_struct *,
122 struct mm_struct *, unsigned long); 117 struct mm_struct *, unsigned long);
123extern void flush_tlb_range(const struct vm_area_struct *, 118extern void flush_tlb_range(struct vm_area_struct *,
124 unsigned long start, unsigned long end); 119 unsigned long start, unsigned long end);
125 120
126#define flush_tlb() flush_tlb_current_task() 121#define flush_tlb() flush_tlb_current_task()
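With hv_page_size() gone from this header, callers of the local flush helpers are expected to pass the page size explicitly instead of having it derived from the vma here. A hedged sketch of such a caller, using the generic is_vm_hugetlb_page() test that the removed helper effectively open-coded (example_flush_one_page() is hypothetical):

/* Illustrative kernel-context sketch, not part of the patch. */
static void example_flush_one_page(struct vm_area_struct *vma,
				   unsigned long addr)
{
	unsigned long page_size =
		is_vm_hugetlb_page(vma) ? HPAGE_SIZE : PAGE_SIZE;

	local_flush_tlb_page(vma, addr, page_size);
}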
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index ef34d2caa5b1..c3dd275f25e2 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -114,45 +114,75 @@ struct exception_table_entry {
114extern int fixup_exception(struct pt_regs *regs); 114extern int fixup_exception(struct pt_regs *regs);
115 115
116/* 116/*
117 * We return the __get_user_N function results in a structure, 117 * Support macros for __get_user().
118 * thus in r0 and r1. If "err" is zero, "val" is the result 118 *
119 * of the read; otherwise, "err" is -EFAULT. 119 * Implementation note: The "case 8" logic of casting to the type of
120 * 120 * the result of subtracting the value from itself is basically a way
121 * We rarely need 8-byte values on a 32-bit architecture, but 121 * of keeping all integer types the same, but casting any pointers to
122 * we size the structure to accommodate. In practice, for the 122 * ptrdiff_t, i.e. also an integer type. This way there are no
123 * the smaller reads, we can zero the high word for free, and 123 * questionable casts seen by the compiler on an ILP32 platform.
124 * the caller will ignore it by virtue of casting anyway. 124 *
125 * Note that __get_user() and __put_user() assume proper alignment.
125 */ 126 */
126struct __get_user {
127 unsigned long long val;
128 int err;
129};
130 127
131/* 128#ifdef __LP64__
132 * FIXME: we should express these as inline extended assembler, since 129#define _ASM_PTR ".quad"
133 * they're fundamentally just a variable dereference and some 130#else
134 * supporting exception_table gunk. Note that (a la i386) we can 131#define _ASM_PTR ".long"
135 * extend the copy_to_user and copy_from_user routines to call into 132#endif
136 * such extended assembler routines, though we will have to use a 133
137 * different return code in that case (1, 2, or 4, rather than -EFAULT). 134#define __get_user_asm(OP, x, ptr, ret) \
138 */ 135 asm volatile("1: {" #OP " %1, %2; movei %0, 0 }\n" \
139extern struct __get_user __get_user_1(const void __user *); 136 ".pushsection .fixup,\"ax\"\n" \
140extern struct __get_user __get_user_2(const void __user *); 137 "0: { movei %1, 0; movei %0, %3 }\n" \
141extern struct __get_user __get_user_4(const void __user *); 138 "j 9f\n" \
142extern struct __get_user __get_user_8(const void __user *); 139 ".section __ex_table,\"a\"\n" \
143extern int __put_user_1(long, void __user *); 140 _ASM_PTR " 1b, 0b\n" \
144extern int __put_user_2(long, void __user *); 141 ".popsection\n" \
145extern int __put_user_4(long, void __user *); 142 "9:" \
146extern int __put_user_8(long long, void __user *); 143 : "=r" (ret), "=r" (x) \
147 144 : "r" (ptr), "i" (-EFAULT))
148/* Unimplemented routines to cause linker failures */ 145
149extern struct __get_user __get_user_bad(void); 146#ifdef __tilegx__
150extern int __put_user_bad(void); 147#define __get_user_1(x, ptr, ret) __get_user_asm(ld1u, x, ptr, ret)
148#define __get_user_2(x, ptr, ret) __get_user_asm(ld2u, x, ptr, ret)
149#define __get_user_4(x, ptr, ret) __get_user_asm(ld4u, x, ptr, ret)
150#define __get_user_8(x, ptr, ret) __get_user_asm(ld, x, ptr, ret)
151#else
152#define __get_user_1(x, ptr, ret) __get_user_asm(lb_u, x, ptr, ret)
153#define __get_user_2(x, ptr, ret) __get_user_asm(lh_u, x, ptr, ret)
154#define __get_user_4(x, ptr, ret) __get_user_asm(lw, x, ptr, ret)
155#ifdef __LITTLE_ENDIAN
156#define __lo32(a, b) a
157#define __hi32(a, b) b
158#else
159#define __lo32(a, b) b
160#define __hi32(a, b) a
161#endif
162#define __get_user_8(x, ptr, ret) \
163 ({ \
164 unsigned int __a, __b; \
165 asm volatile("1: { lw %1, %3; addi %2, %3, 4 }\n" \
166 "2: { lw %2, %2; movei %0, 0 }\n" \
167 ".pushsection .fixup,\"ax\"\n" \
168 "0: { movei %1, 0; movei %2, 0 }\n" \
169 "{ movei %0, %4; j 9f }\n" \
170 ".section __ex_table,\"a\"\n" \
171 ".word 1b, 0b\n" \
172 ".word 2b, 0b\n" \
173 ".popsection\n" \
174 "9:" \
175 : "=r" (ret), "=r" (__a), "=&r" (__b) \
176 : "r" (ptr), "i" (-EFAULT)); \
177 (x) = (__typeof(x))(__typeof((x)-(x))) \
178 (((u64)__hi32(__a, __b) << 32) | \
179 __lo32(__a, __b)); \
180 })
181#endif
182
183extern int __get_user_bad(void)
184 __attribute__((warning("sizeof __get_user argument not 1, 2, 4 or 8")));
151 185
152/*
153 * Careful: we have to cast the result to the type of the pointer
154 * for sign reasons.
155 */
156/** 186/**
157 * __get_user: - Get a simple variable from user space, with less checking. 187 * __get_user: - Get a simple variable from user space, with less checking.
158 * @x: Variable to store result. 188 * @x: Variable to store result.
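The implementation note above concerns the (__typeof((x)-(x))) cast used in the 8-byte cases: subtracting a value from itself yields the same type for an integer but ptrdiff_t for a pointer, so the 64-bit intermediate can be converted back without a questionable pointer/integer cast on ILP32. A stand-alone GCC demonstration of that typeof behaviour (user-space, purely illustrative):

/* Stand-alone demo of the (__typeof((x)-(x))) trick; not kernel code. */
#include <stddef.h>

int main(void)
{
	unsigned int u = 5;
	int n = 7, *p = &n;

	/* For an integer, (u)-(u) keeps the integer type of u. */
	__typeof__((u) - (u)) still_uint = u;

	/* For a pointer, (p)-(p) is ptrdiff_t, i.e. still an integer type,
	 * so assigning a 64-bit value through it needs no pointer cast.
	 */
	__typeof__((p) - (p)) as_ptrdiff = 0;

	(void)still_uint;
	(void)as_ptrdiff;
	return 0;
}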
@@ -174,30 +204,62 @@ extern int __put_user_bad(void);
174 * function. 204 * function.
175 */ 205 */
176#define __get_user(x, ptr) \ 206#define __get_user(x, ptr) \
177({ struct __get_user __ret; \ 207 ({ \
178 __typeof__(*(ptr)) const __user *__gu_addr = (ptr); \ 208 int __ret; \
179 __chk_user_ptr(__gu_addr); \ 209 __chk_user_ptr(ptr); \
180 switch (sizeof(*(__gu_addr))) { \ 210 switch (sizeof(*(ptr))) { \
181 case 1: \ 211 case 1: __get_user_1(x, ptr, __ret); break; \
182 __ret = __get_user_1(__gu_addr); \ 212 case 2: __get_user_2(x, ptr, __ret); break; \
183 break; \ 213 case 4: __get_user_4(x, ptr, __ret); break; \
184 case 2: \ 214 case 8: __get_user_8(x, ptr, __ret); break; \
185 __ret = __get_user_2(__gu_addr); \ 215 default: __ret = __get_user_bad(); break; \
186 break; \ 216 } \
187 case 4: \ 217 __ret; \
188 __ret = __get_user_4(__gu_addr); \ 218 })
189 break; \ 219
190 case 8: \ 220/* Support macros for __put_user(). */
191 __ret = __get_user_8(__gu_addr); \ 221
192 break; \ 222#define __put_user_asm(OP, x, ptr, ret) \
193 default: \ 223 asm volatile("1: {" #OP " %1, %2; movei %0, 0 }\n" \
194 __ret = __get_user_bad(); \ 224 ".pushsection .fixup,\"ax\"\n" \
195 break; \ 225 "0: { movei %0, %3; j 9f }\n" \
196 } \ 226 ".section __ex_table,\"a\"\n" \
197 (x) = (__typeof__(*__gu_addr)) (__typeof__(*__gu_addr - *__gu_addr)) \ 227 _ASM_PTR " 1b, 0b\n" \
198 __ret.val; \ 228 ".popsection\n" \
199 __ret.err; \ 229 "9:" \
200}) 230 : "=r" (ret) \
231 : "r" (ptr), "r" (x), "i" (-EFAULT))
232
233#ifdef __tilegx__
234#define __put_user_1(x, ptr, ret) __put_user_asm(st1, x, ptr, ret)
235#define __put_user_2(x, ptr, ret) __put_user_asm(st2, x, ptr, ret)
236#define __put_user_4(x, ptr, ret) __put_user_asm(st4, x, ptr, ret)
237#define __put_user_8(x, ptr, ret) __put_user_asm(st, x, ptr, ret)
238#else
239#define __put_user_1(x, ptr, ret) __put_user_asm(sb, x, ptr, ret)
240#define __put_user_2(x, ptr, ret) __put_user_asm(sh, x, ptr, ret)
241#define __put_user_4(x, ptr, ret) __put_user_asm(sw, x, ptr, ret)
242#define __put_user_8(x, ptr, ret) \
243 ({ \
244 u64 __x = (__typeof((x)-(x)))(x); \
245 int __lo = (int) __x, __hi = (int) (__x >> 32); \
246 asm volatile("1: { sw %1, %2; addi %0, %1, 4 }\n" \
247 "2: { sw %0, %3; movei %0, 0 }\n" \
248 ".pushsection .fixup,\"ax\"\n" \
249 "0: { movei %0, %4; j 9f }\n" \
250 ".section __ex_table,\"a\"\n" \
251 ".word 1b, 0b\n" \
252 ".word 2b, 0b\n" \
253 ".popsection\n" \
254 "9:" \
255 : "=&r" (ret) \
256 : "r" (ptr), "r" (__lo32(__lo, __hi)), \
257 "r" (__hi32(__lo, __hi)), "i" (-EFAULT)); \
258 })
259#endif
260
261extern int __put_user_bad(void)
262 __attribute__((warning("sizeof __put_user argument not 1, 2, 4 or 8")));
201 263
202/** 264/**
203 * __put_user: - Write a simple value into user space, with less checking. 265 * __put_user: - Write a simple value into user space, with less checking.
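The rewritten __get_user() expands straight to the inline-asm loads above and now yields a plain int, 0 on success or -EFAULT on a faulting access, instead of unpacking a struct __get_user. A hedged caller sketch, with a hypothetical function name and the usual assumption that access_ok()/context checks are the caller's responsibility:

/* Hypothetical caller sketch; example_read_u32_from_user() is not in the patch. */
static int example_read_u32_from_user(const u32 __user *uptr, u32 *out)
{
	u32 val;

	if (__get_user(val, uptr))	/* 0 on success, -EFAULT on fault */
		return -EFAULT;

	*out = val;
	return 0;
}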
@@ -217,39 +279,19 @@ extern int __put_user_bad(void);
217 * function. 279 * function.
218 * 280 *
219 * Returns zero on success, or -EFAULT on error. 281 * Returns zero on success, or -EFAULT on error.
220 *
221 * Implementation note: The "case 8" logic of casting to the type of
222 * the result of subtracting the value from itself is basically a way
223 * of keeping all integer types the same, but casting any pointers to
224 * ptrdiff_t, i.e. also an integer type. This way there are no
225 * questionable casts seen by the compiler on an ILP32 platform.
226 */ 282 */
227#define __put_user(x, ptr) \ 283#define __put_user(x, ptr) \
228({ \ 284({ \
229 int __pu_err = 0; \ 285 int __ret; \
230 __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ 286 __chk_user_ptr(ptr); \
231 typeof(*__pu_addr) __pu_val = (x); \ 287 switch (sizeof(*(ptr))) { \
232 __chk_user_ptr(__pu_addr); \ 288 case 1: __put_user_1(x, ptr, __ret); break; \
233 switch (sizeof(__pu_val)) { \ 289 case 2: __put_user_2(x, ptr, __ret); break; \
234 case 1: \ 290 case 4: __put_user_4(x, ptr, __ret); break; \
235 __pu_err = __put_user_1((long)__pu_val, __pu_addr); \ 291 case 8: __put_user_8(x, ptr, __ret); break; \
236 break; \ 292 default: __ret = __put_user_bad(); break; \
237 case 2: \
238 __pu_err = __put_user_2((long)__pu_val, __pu_addr); \
239 break; \
240 case 4: \
241 __pu_err = __put_user_4((long)__pu_val, __pu_addr); \
242 break; \
243 case 8: \
244 __pu_err = \
245 __put_user_8((__typeof__(__pu_val - __pu_val))__pu_val,\
246 __pu_addr); \
247 break; \
248 default: \
249 __pu_err = __put_user_bad(); \
250 break; \
251 } \ 293 } \
252 __pu_err; \ 294 __ret; \
253}) 295})
254 296
255/* 297/*
@@ -378,7 +420,7 @@ static inline unsigned long __must_check copy_from_user(void *to,
378/** 420/**
379 * __copy_in_user() - copy data within user space, with less checking. 421 * __copy_in_user() - copy data within user space, with less checking.
380 * @to: Destination address, in user space. 422 * @to: Destination address, in user space.
381 * @from: Source address, in kernel space. 423 * @from: Source address, in user space.
382 * @n: Number of bytes to copy. 424 * @n: Number of bytes to copy.
383 * 425 *
384 * Context: User context only. This function may sleep. 426 * Context: User context only. This function may sleep.
diff --git a/arch/tile/include/asm/unistd.h b/arch/tile/include/asm/unistd.h
index f70bf1c541f1..a017246ca0ce 100644
--- a/arch/tile/include/asm/unistd.h
+++ b/arch/tile/include/asm/unistd.h
@@ -24,8 +24,8 @@
24#include <asm-generic/unistd.h> 24#include <asm-generic/unistd.h>
25 25
26/* Additional Tilera-specific syscalls. */ 26/* Additional Tilera-specific syscalls. */
27#define __NR_flush_cache (__NR_arch_specific_syscall + 1) 27#define __NR_cacheflush (__NR_arch_specific_syscall + 1)
28__SYSCALL(__NR_flush_cache, sys_flush_cache) 28__SYSCALL(__NR_cacheflush, sys_cacheflush)
29 29
30#ifndef __tilegx__ 30#ifndef __tilegx__
31/* "Fast" syscalls provide atomic support for 32-bit chips. */ 31/* "Fast" syscalls provide atomic support for 32-bit chips. */