author      Jiri Kosina <jkosina@suse.cz>   2014-11-20 08:42:02 -0500
committer   Jiri Kosina <jkosina@suse.cz>   2014-11-20 08:42:02 -0500
commit      a02001086bbfb4da35d1228bebc2f1b442db455f
tree        62ab47936cef06fd08657ca5b6cd1df98c19be57 /arch/powerpc/mm
parent      eff264efeeb0898408e8c9df72d8a32621035bed
parent      fc14f9c1272f62c3e8d01300f52467c0d9af50f9
Merge Linus' tree to be able to apply submitted patches to newer code than
current trivial.git base
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/Makefile                 5
-rw-r--r--  arch/powerpc/mm/copro_fault.c          148
-rw-r--r--  arch/powerpc/mm/dma-noncoherent.c        1
-rw-r--r--  arch/powerpc/mm/fault.c                 48
-rw-r--r--  arch/powerpc/mm/hash_native_64.c        46
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c        186
-rw-r--r--  arch/powerpc/mm/hugepage-hash64.c       88
-rw-r--r--  arch/powerpc/mm/init_32.c                6
-rw-r--r--  arch/powerpc/mm/init_64.c              129
-rw-r--r--  arch/powerpc/mm/mem.c                   71
-rw-r--r--  arch/powerpc/mm/mmu_context_hash32.c     2
-rw-r--r--  arch/powerpc/mm/numa.c                  84
-rw-r--r--  arch/powerpc/mm/pgtable.c                2
-rw-r--r--  arch/powerpc/mm/pgtable_32.c             2
-rw-r--r--  arch/powerpc/mm/pgtable_64.c            46
-rw-r--r--  arch/powerpc/mm/ppc_mmu_32.c             2
-rw-r--r--  arch/powerpc/mm/slb.c                    3
-rw-r--r--  arch/powerpc/mm/slice.c                 15
-rw-r--r--  arch/powerpc/mm/stab.c                 286
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c             6
-rw-r--r--  arch/powerpc/mm/tlb_low_64e.S           69
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c           111
22 files changed, 780 insertions, 576 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 51230ee6a407..325e861616a1 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -13,9 +13,7 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
13 tlb_nohash_low.o 13 tlb_nohash_low.o
14obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o 14obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
15hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o 15hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
16obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ 16obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y)
17 slb_low.o slb.o stab.o \
18 $(hash64-y)
19obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o 17obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
20obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ 18obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
21 tlb_hash$(CONFIG_WORD_SIZE).o \ 19 tlb_hash$(CONFIG_WORD_SIZE).o \
@@ -36,3 +34,4 @@ obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
36obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o 34obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
37obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o 35obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
38obj-$(CONFIG_HIGHMEM) += highmem.o 36obj-$(CONFIG_HIGHMEM) += highmem.o
37obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
new file mode 100644
index 000000000000..5a236f082c78
--- /dev/null
+++ b/arch/powerpc/mm/copro_fault.c
@@ -0,0 +1,148 @@
1/*
2 * CoProcessor (SPU/AFU) mm fault handler
3 *
4 * (C) Copyright IBM Deutschland Entwicklung GmbH 2007
5 *
6 * Author: Arnd Bergmann <arndb@de.ibm.com>
7 * Author: Jeremy Kerr <jk@ozlabs.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 */
23#include <linux/sched.h>
24#include <linux/mm.h>
25#include <linux/export.h>
26#include <asm/reg.h>
27#include <asm/copro.h>
28#include <asm/spu.h>
29#include <misc/cxl.h>
30
31/*
32 * This ought to be kept in sync with the powerpc specific do_page_fault
33 * function. Currently, there are a few corner cases that we haven't had
34 * to handle fortunately.
35 */
36int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
37 unsigned long dsisr, unsigned *flt)
38{
39 struct vm_area_struct *vma;
40 unsigned long is_write;
41 int ret;
42
43 if (mm == NULL)
44 return -EFAULT;
45
46 if (mm->pgd == NULL)
47 return -EFAULT;
48
49 down_read(&mm->mmap_sem);
50 ret = -EFAULT;
51 vma = find_vma(mm, ea);
52 if (!vma)
53 goto out_unlock;
54
55 if (ea < vma->vm_start) {
56 if (!(vma->vm_flags & VM_GROWSDOWN))
57 goto out_unlock;
58 if (expand_stack(vma, ea))
59 goto out_unlock;
60 }
61
62 is_write = dsisr & DSISR_ISSTORE;
63 if (is_write) {
64 if (!(vma->vm_flags & VM_WRITE))
65 goto out_unlock;
66 } else {
67 if (dsisr & DSISR_PROTFAULT)
68 goto out_unlock;
69 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
70 goto out_unlock;
71 }
72
73 ret = 0;
74 *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0);
75 if (unlikely(*flt & VM_FAULT_ERROR)) {
76 if (*flt & VM_FAULT_OOM) {
77 ret = -ENOMEM;
78 goto out_unlock;
79 } else if (*flt & VM_FAULT_SIGBUS) {
80 ret = -EFAULT;
81 goto out_unlock;
82 }
83 BUG();
84 }
85
86 if (*flt & VM_FAULT_MAJOR)
87 current->maj_flt++;
88 else
89 current->min_flt++;
90
91out_unlock:
92 up_read(&mm->mmap_sem);
93 return ret;
94}
95EXPORT_SYMBOL_GPL(copro_handle_mm_fault);
96
97int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
98{
99 u64 vsid;
100 int psize, ssize;
101
102 switch (REGION_ID(ea)) {
103 case USER_REGION_ID:
104 pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea);
105 psize = get_slice_psize(mm, ea);
106 ssize = user_segment_size(ea);
107 vsid = get_vsid(mm->context.id, ea, ssize);
108 break;
109 case VMALLOC_REGION_ID:
110 pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea);
111 if (ea < VMALLOC_END)
112 psize = mmu_vmalloc_psize;
113 else
114 psize = mmu_io_psize;
115 ssize = mmu_kernel_ssize;
116 vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
117 break;
118 case KERNEL_REGION_ID:
119 pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea);
120 psize = mmu_linear_psize;
121 ssize = mmu_kernel_ssize;
122 vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
123 break;
124 default:
125 pr_debug("%s: invalid region access at %016llx\n", __func__, ea);
126 return 1;
127 }
128
129 vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER;
130
131 vsid |= mmu_psize_defs[psize].sllp |
132 ((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);
133
134 slb->esid = (ea & (ssize == MMU_SEGSIZE_1T ? ESID_MASK_1T : ESID_MASK)) | SLB_ESID_V;
135 slb->vsid = vsid;
136
137 return 0;
138}
139EXPORT_SYMBOL_GPL(copro_calculate_slb);
140
141void copro_flush_all_slbs(struct mm_struct *mm)
142{
143#ifdef CONFIG_SPU_BASE
144 spu_flush_all_slbs(mm);
145#endif
146 cxl_slbia(mm);
147}
148EXPORT_SYMBOL_GPL(copro_flush_all_slbs);
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 7b6c10750179..d85e86aac7fb 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -33,6 +33,7 @@
33#include <linux/export.h> 33#include <linux/export.h>
34 34
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include <asm/dma.h>
36 37
37#include "mmu_decl.h" 38#include "mmu_decl.h"
38 39
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51ab9e7e6c39..08d659a9fcdb 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -30,9 +30,9 @@
30#include <linux/kprobes.h> 30#include <linux/kprobes.h>
31#include <linux/kdebug.h> 31#include <linux/kdebug.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/magic.h>
34#include <linux/ratelimit.h> 33#include <linux/ratelimit.h>
35#include <linux/context_tracking.h> 34#include <linux/context_tracking.h>
35#include <linux/hugetlb.h>
36 36
37#include <asm/firmware.h> 37#include <asm/firmware.h>
38#include <asm/page.h> 38#include <asm/page.h>
@@ -114,22 +114,37 @@ static int store_updates_sp(struct pt_regs *regs)
114#define MM_FAULT_CONTINUE -1 114#define MM_FAULT_CONTINUE -1
115#define MM_FAULT_ERR(sig) (sig) 115#define MM_FAULT_ERR(sig) (sig)
116 116
117static int do_sigbus(struct pt_regs *regs, unsigned long address) 117static int do_sigbus(struct pt_regs *regs, unsigned long address,
118 unsigned int fault)
118{ 119{
119 siginfo_t info; 120 siginfo_t info;
121 unsigned int lsb = 0;
120 122
121 up_read(&current->mm->mmap_sem); 123 up_read(&current->mm->mmap_sem);
122 124
123 if (user_mode(regs)) { 125 if (!user_mode(regs))
124 current->thread.trap_nr = BUS_ADRERR; 126 return MM_FAULT_ERR(SIGBUS);
125 info.si_signo = SIGBUS; 127
126 info.si_errno = 0; 128 current->thread.trap_nr = BUS_ADRERR;
127 info.si_code = BUS_ADRERR; 129 info.si_signo = SIGBUS;
128 info.si_addr = (void __user *)address; 130 info.si_errno = 0;
129 force_sig_info(SIGBUS, &info, current); 131 info.si_code = BUS_ADRERR;
130 return MM_FAULT_RETURN; 132 info.si_addr = (void __user *)address;
133#ifdef CONFIG_MEMORY_FAILURE
134 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
135 pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
136 current->comm, current->pid, address);
137 info.si_code = BUS_MCEERR_AR;
131 } 138 }
132 return MM_FAULT_ERR(SIGBUS); 139
140 if (fault & VM_FAULT_HWPOISON_LARGE)
141 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
142 if (fault & VM_FAULT_HWPOISON)
143 lsb = PAGE_SHIFT;
144#endif
145 info.si_addr_lsb = lsb;
146 force_sig_info(SIGBUS, &info, current);
147 return MM_FAULT_RETURN;
133} 148}
134 149
135static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) 150static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
@@ -170,11 +185,8 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
170 return MM_FAULT_RETURN; 185 return MM_FAULT_RETURN;
171 } 186 }
172 187
173 /* Bus error. x86 handles HWPOISON here, we'll add this if/when 188 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
174 * we support the feature in HW 189 return do_sigbus(regs, addr, fault);
175 */
176 if (fault & VM_FAULT_SIGBUS)
177 return do_sigbus(regs, addr);
178 190
179 /* We don't understand the fault code, this is fatal */ 191 /* We don't understand the fault code, this is fatal */
180 BUG(); 192 BUG();
@@ -508,7 +520,6 @@ bail:
508void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) 520void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
509{ 521{
510 const struct exception_table_entry *entry; 522 const struct exception_table_entry *entry;
511 unsigned long *stackend;
512 523
513 /* Are we prepared to handle this fault? */ 524 /* Are we prepared to handle this fault? */
514 if ((entry = search_exception_tables(regs->nip)) != NULL) { 525 if ((entry = search_exception_tables(regs->nip)) != NULL) {
@@ -537,8 +548,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
537 printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", 548 printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
538 regs->nip); 549 regs->nip);
539 550
540 stackend = end_of_stack(current); 551 if (task_stack_end_corrupted(current))
541 if (current != &init_task && *stackend != STACK_END_MAGIC)
542 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); 552 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
543 553
544 die("Kernel access of bad area", regs, sig); 554 die("Kernel access of bad area", regs, sig);
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index cf1d325eae8b..ae4962a06476 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -29,6 +29,8 @@
29#include <asm/kexec.h> 29#include <asm/kexec.h>
30#include <asm/ppc-opcode.h> 30#include <asm/ppc-opcode.h>
31 31
32#include <misc/cxl.h>
33
32#ifdef DEBUG_LOW 34#ifdef DEBUG_LOW
33#define DBG_LOW(fmt...) udbg_printf(fmt) 35#define DBG_LOW(fmt...) udbg_printf(fmt)
34#else 36#else
@@ -149,9 +151,11 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
149static inline void tlbie(unsigned long vpn, int psize, int apsize, 151static inline void tlbie(unsigned long vpn, int psize, int apsize,
150 int ssize, int local) 152 int ssize, int local)
151{ 153{
152 unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL); 154 unsigned int use_local;
153 int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); 155 int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
154 156
157 use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use();
158
155 if (use_local) 159 if (use_local)
156 use_local = mmu_psize_defs[psize].tlbiel; 160 use_local = mmu_psize_defs[psize].tlbiel;
157 if (lock_tlbie && !use_local) 161 if (lock_tlbie && !use_local)
@@ -412,18 +416,18 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
412 local_irq_restore(flags); 416 local_irq_restore(flags);
413} 417}
414 418
415static void native_hugepage_invalidate(struct mm_struct *mm, 419static void native_hugepage_invalidate(unsigned long vsid,
420 unsigned long addr,
416 unsigned char *hpte_slot_array, 421 unsigned char *hpte_slot_array,
417 unsigned long addr, int psize) 422 int psize, int ssize)
418{ 423{
419 int ssize = 0, i; 424 int i;
420 int lock_tlbie;
421 struct hash_pte *hptep; 425 struct hash_pte *hptep;
422 int actual_psize = MMU_PAGE_16M; 426 int actual_psize = MMU_PAGE_16M;
423 unsigned int max_hpte_count, valid; 427 unsigned int max_hpte_count, valid;
424 unsigned long flags, s_addr = addr; 428 unsigned long flags, s_addr = addr;
425 unsigned long hpte_v, want_v, shift; 429 unsigned long hpte_v, want_v, shift;
426 unsigned long hidx, vpn = 0, vsid, hash, slot; 430 unsigned long hidx, vpn = 0, hash, slot;
427 431
428 shift = mmu_psize_defs[psize].shift; 432 shift = mmu_psize_defs[psize].shift;
429 max_hpte_count = 1U << (PMD_SHIFT - shift); 433 max_hpte_count = 1U << (PMD_SHIFT - shift);
@@ -437,15 +441,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
437 441
438 /* get the vpn */ 442 /* get the vpn */
439 addr = s_addr + (i * (1ul << shift)); 443 addr = s_addr + (i * (1ul << shift));
440 if (!is_kernel_addr(addr)) {
441 ssize = user_segment_size(addr);
442 vsid = get_vsid(mm->context.id, addr, ssize);
443 WARN_ON(vsid == 0);
444 } else {
445 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
446 ssize = mmu_kernel_ssize;
447 }
448
449 vpn = hpt_vpn(addr, vsid, ssize); 444 vpn = hpt_vpn(addr, vsid, ssize);
450 hash = hpt_hash(vpn, shift, ssize); 445 hash = hpt_hash(vpn, shift, ssize);
451 if (hidx & _PTEIDX_SECONDARY) 446 if (hidx & _PTEIDX_SECONDARY)
@@ -465,22 +460,13 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
465 else 460 else
466 /* Invalidate the hpte. NOTE: this also unlocks it */ 461 /* Invalidate the hpte. NOTE: this also unlocks it */
467 hptep->v = 0; 462 hptep->v = 0;
463 /*
464 * We need to do tlb invalidate for all the address, tlbie
465 * instruction compares entry_VA in tlb with the VA specified
466 * here
467 */
468 tlbie(vpn, psize, actual_psize, ssize, 0);
468 } 469 }
469 /*
470 * Since this is a hugepage, we just need a single tlbie.
471 * use the last vpn.
472 */
473 lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
474 if (lock_tlbie)
475 raw_spin_lock(&native_tlbie_lock);
476
477 asm volatile("ptesync":::"memory");
478 __tlbie(vpn, psize, actual_psize, ssize);
479 asm volatile("eieio; tlbsync; ptesync":::"memory");
480
481 if (lock_tlbie)
482 raw_spin_unlock(&native_tlbie_lock);
483
484 local_irq_restore(flags); 470 local_irq_restore(flags);
485} 471}
486 472
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 88fdd9d25077..d5339a3b9945 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -51,7 +51,7 @@
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/cputable.h> 52#include <asm/cputable.h>
53#include <asm/sections.h> 53#include <asm/sections.h>
54#include <asm/spu.h> 54#include <asm/copro.h>
55#include <asm/udbg.h> 55#include <asm/udbg.h>
56#include <asm/code-patching.h> 56#include <asm/code-patching.h>
57#include <asm/fadump.h> 57#include <asm/fadump.h>
@@ -92,12 +92,14 @@ extern unsigned long dart_tablebase;
92 92
93static unsigned long _SDR1; 93static unsigned long _SDR1;
94struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; 94struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
95EXPORT_SYMBOL_GPL(mmu_psize_defs);
95 96
96struct hash_pte *htab_address; 97struct hash_pte *htab_address;
97unsigned long htab_size_bytes; 98unsigned long htab_size_bytes;
98unsigned long htab_hash_mask; 99unsigned long htab_hash_mask;
99EXPORT_SYMBOL_GPL(htab_hash_mask); 100EXPORT_SYMBOL_GPL(htab_hash_mask);
100int mmu_linear_psize = MMU_PAGE_4K; 101int mmu_linear_psize = MMU_PAGE_4K;
102EXPORT_SYMBOL_GPL(mmu_linear_psize);
101int mmu_virtual_psize = MMU_PAGE_4K; 103int mmu_virtual_psize = MMU_PAGE_4K;
102int mmu_vmalloc_psize = MMU_PAGE_4K; 104int mmu_vmalloc_psize = MMU_PAGE_4K;
103#ifdef CONFIG_SPARSEMEM_VMEMMAP 105#ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -105,6 +107,7 @@ int mmu_vmemmap_psize = MMU_PAGE_4K;
105#endif 107#endif
106int mmu_io_psize = MMU_PAGE_4K; 108int mmu_io_psize = MMU_PAGE_4K;
107int mmu_kernel_ssize = MMU_SEGSIZE_256M; 109int mmu_kernel_ssize = MMU_SEGSIZE_256M;
110EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
108int mmu_highuser_ssize = MMU_SEGSIZE_256M; 111int mmu_highuser_ssize = MMU_SEGSIZE_256M;
109u16 mmu_slb_size = 64; 112u16 mmu_slb_size = 64;
110EXPORT_SYMBOL_GPL(mmu_slb_size); 113EXPORT_SYMBOL_GPL(mmu_slb_size);
@@ -243,7 +246,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
243} 246}
244 247
245#ifdef CONFIG_MEMORY_HOTPLUG 248#ifdef CONFIG_MEMORY_HOTPLUG
246static int htab_remove_mapping(unsigned long vstart, unsigned long vend, 249int htab_remove_mapping(unsigned long vstart, unsigned long vend,
247 int psize, int ssize) 250 int psize, int ssize)
248{ 251{
249 unsigned long vaddr; 252 unsigned long vaddr;
@@ -333,70 +336,69 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
333 return 0; 336 return 0;
334 337
335 prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); 338 prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
336 if (prop != NULL) { 339 if (!prop)
337 pr_info("Page sizes from device-tree:\n"); 340 return 0;
338 size /= 4; 341
339 cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); 342 pr_info("Page sizes from device-tree:\n");
340 while(size > 0) { 343 size /= 4;
341 unsigned int base_shift = be32_to_cpu(prop[0]); 344 cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
342 unsigned int slbenc = be32_to_cpu(prop[1]); 345 while(size > 0) {
343 unsigned int lpnum = be32_to_cpu(prop[2]); 346 unsigned int base_shift = be32_to_cpu(prop[0]);
344 struct mmu_psize_def *def; 347 unsigned int slbenc = be32_to_cpu(prop[1]);
345 int idx, base_idx; 348 unsigned int lpnum = be32_to_cpu(prop[2]);
346 349 struct mmu_psize_def *def;
347 size -= 3; prop += 3; 350 int idx, base_idx;
348 base_idx = get_idx_from_shift(base_shift); 351
349 if (base_idx < 0) { 352 size -= 3; prop += 3;
350 /* 353 base_idx = get_idx_from_shift(base_shift);
351 * skip the pte encoding also 354 if (base_idx < 0) {
352 */ 355 /* skip the pte encoding also */
353 prop += lpnum * 2; size -= lpnum * 2; 356 prop += lpnum * 2; size -= lpnum * 2;
357 continue;
358 }
359 def = &mmu_psize_defs[base_idx];
360 if (base_idx == MMU_PAGE_16M)
361 cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
362
363 def->shift = base_shift;
364 if (base_shift <= 23)
365 def->avpnm = 0;
366 else
367 def->avpnm = (1 << (base_shift - 23)) - 1;
368 def->sllp = slbenc;
369 /*
370 * We don't know for sure what's up with tlbiel, so
371 * for now we only set it for 4K and 64K pages
372 */
373 if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
374 def->tlbiel = 1;
375 else
376 def->tlbiel = 0;
377
378 while (size > 0 && lpnum) {
379 unsigned int shift = be32_to_cpu(prop[0]);
380 int penc = be32_to_cpu(prop[1]);
381
382 prop += 2; size -= 2;
383 lpnum--;
384
385 idx = get_idx_from_shift(shift);
386 if (idx < 0)
354 continue; 387 continue;
355 } 388
356 def = &mmu_psize_defs[base_idx]; 389 if (penc == -1)
357 if (base_idx == MMU_PAGE_16M) 390 pr_err("Invalid penc for base_shift=%d "
358 cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; 391 "shift=%d\n", base_shift, shift);
359 392
360 def->shift = base_shift; 393 def->penc[idx] = penc;
361 if (base_shift <= 23) 394 pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
362 def->avpnm = 0; 395 " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
363 else 396 base_shift, shift, def->sllp,
364 def->avpnm = (1 << (base_shift - 23)) - 1; 397 def->avpnm, def->tlbiel, def->penc[idx]);
365 def->sllp = slbenc;
366 /*
367 * We don't know for sure what's up with tlbiel, so
368 * for now we only set it for 4K and 64K pages
369 */
370 if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
371 def->tlbiel = 1;
372 else
373 def->tlbiel = 0;
374
375 while (size > 0 && lpnum) {
376 unsigned int shift = be32_to_cpu(prop[0]);
377 int penc = be32_to_cpu(prop[1]);
378
379 prop += 2; size -= 2;
380 lpnum--;
381
382 idx = get_idx_from_shift(shift);
383 if (idx < 0)
384 continue;
385
386 if (penc == -1)
387 pr_err("Invalid penc for base_shift=%d "
388 "shift=%d\n", base_shift, shift);
389
390 def->penc[idx] = penc;
391 pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
392 " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
393 base_shift, shift, def->sllp,
394 def->avpnm, def->tlbiel, def->penc[idx]);
395 }
396 } 398 }
397 return 1;
398 } 399 }
399 return 0; 400
401 return 1;
400} 402}
401 403
402#ifdef CONFIG_HUGETLB_PAGE 404#ifdef CONFIG_HUGETLB_PAGE
@@ -821,21 +823,14 @@ static void __init htab_initialize(void)
821 823
822void __init early_init_mmu(void) 824void __init early_init_mmu(void)
823{ 825{
824 /* Setup initial STAB address in the PACA */
825 get_paca()->stab_real = __pa((u64)&initial_stab);
826 get_paca()->stab_addr = (u64)&initial_stab;
827
828 /* Initialize the MMU Hash table and create the linear mapping 826 /* Initialize the MMU Hash table and create the linear mapping
829 * of memory. Has to be done before stab/slb initialization as 827 * of memory. Has to be done before SLB initialization as this is
830 * this is currently where the page size encoding is obtained 828 * currently where the page size encoding is obtained.
831 */ 829 */
832 htab_initialize(); 830 htab_initialize();
833 831
834 /* Initialize stab / SLB management */ 832 /* Initialize SLB management */
835 if (mmu_has_feature(MMU_FTR_SLB)) 833 slb_initialize();
836 slb_initialize();
837 else
838 stab_initialize(get_paca()->stab_real);
839} 834}
840 835
841#ifdef CONFIG_SMP 836#ifdef CONFIG_SMP
@@ -845,13 +840,8 @@ void early_init_mmu_secondary(void)
845 if (!firmware_has_feature(FW_FEATURE_LPAR)) 840 if (!firmware_has_feature(FW_FEATURE_LPAR))
846 mtspr(SPRN_SDR1, _SDR1); 841 mtspr(SPRN_SDR1, _SDR1);
847 842
848 /* Initialize STAB/SLB. We use a virtual address as it works 843 /* Initialize SLB */
849 * in real mode on pSeries. 844 slb_initialize();
850 */
851 if (mmu_has_feature(MMU_FTR_SLB))
852 slb_initialize();
853 else
854 stab_initialize(get_paca()->stab_addr);
855} 845}
856#endif /* CONFIG_SMP */ 846#endif /* CONFIG_SMP */
857 847
@@ -879,7 +869,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
879} 869}
880 870
881#ifdef CONFIG_PPC_MM_SLICES 871#ifdef CONFIG_PPC_MM_SLICES
882unsigned int get_paca_psize(unsigned long addr) 872static unsigned int get_paca_psize(unsigned long addr)
883{ 873{
884 u64 lpsizes; 874 u64 lpsizes;
885 unsigned char *hpsizes; 875 unsigned char *hpsizes;
@@ -913,10 +903,8 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
913 if (get_slice_psize(mm, addr) == MMU_PAGE_4K) 903 if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
914 return; 904 return;
915 slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); 905 slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
916#ifdef CONFIG_SPU_BASE 906 copro_flush_all_slbs(mm);
917 spu_flush_all_slbs(mm); 907 if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
918#endif
919 if (get_paca_psize(addr) != MMU_PAGE_4K) {
920 get_paca()->context = mm->context; 908 get_paca()->context = mm->context;
921 slb_flush_and_rebolt(); 909 slb_flush_and_rebolt();
922 } 910 }
@@ -1001,12 +989,11 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
1001 * -1 - critical hash insertion error 989 * -1 - critical hash insertion error
1002 * -2 - access not permitted by subpage protection mechanism 990 * -2 - access not permitted by subpage protection mechanism
1003 */ 991 */
1004int hash_page(unsigned long ea, unsigned long access, unsigned long trap) 992int hash_page_mm(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap)
1005{ 993{
1006 enum ctx_state prev_state = exception_enter(); 994 enum ctx_state prev_state = exception_enter();
1007 pgd_t *pgdir; 995 pgd_t *pgdir;
1008 unsigned long vsid; 996 unsigned long vsid;
1009 struct mm_struct *mm;
1010 pte_t *ptep; 997 pte_t *ptep;
1011 unsigned hugeshift; 998 unsigned hugeshift;
1012 const struct cpumask *tmp; 999 const struct cpumask *tmp;
@@ -1020,7 +1007,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
1020 switch (REGION_ID(ea)) { 1007 switch (REGION_ID(ea)) {
1021 case USER_REGION_ID: 1008 case USER_REGION_ID:
1022 user_region = 1; 1009 user_region = 1;
1023 mm = current->mm;
1024 if (! mm) { 1010 if (! mm) {
1025 DBG_LOW(" user region with no mm !\n"); 1011 DBG_LOW(" user region with no mm !\n");
1026 rc = 1; 1012 rc = 1;
@@ -1031,7 +1017,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
1031 vsid = get_vsid(mm->context.id, ea, ssize); 1017 vsid = get_vsid(mm->context.id, ea, ssize);
1032 break; 1018 break;
1033 case VMALLOC_REGION_ID: 1019 case VMALLOC_REGION_ID:
1034 mm = &init_mm;
1035 vsid = get_kernel_vsid(ea, mmu_kernel_ssize); 1020 vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
1036 if (ea < VMALLOC_END) 1021 if (ea < VMALLOC_END)
1037 psize = mmu_vmalloc_psize; 1022 psize = mmu_vmalloc_psize;
@@ -1116,7 +1101,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
1116 WARN_ON(1); 1101 WARN_ON(1);
1117 } 1102 }
1118#endif 1103#endif
1119 check_paca_psize(ea, mm, psize, user_region); 1104 if (current->mm == mm)
1105 check_paca_psize(ea, mm, psize, user_region);
1120 1106
1121 goto bail; 1107 goto bail;
1122 } 1108 }
@@ -1153,13 +1139,12 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
1153 "to 4kB pages because of " 1139 "to 4kB pages because of "
1154 "non-cacheable mapping\n"); 1140 "non-cacheable mapping\n");
1155 psize = mmu_vmalloc_psize = MMU_PAGE_4K; 1141 psize = mmu_vmalloc_psize = MMU_PAGE_4K;
1156#ifdef CONFIG_SPU_BASE 1142 copro_flush_all_slbs(mm);
1157 spu_flush_all_slbs(mm);
1158#endif
1159 } 1143 }
1160 } 1144 }
1161 1145
1162 check_paca_psize(ea, mm, psize, user_region); 1146 if (current->mm == mm)
1147 check_paca_psize(ea, mm, psize, user_region);
1163#endif /* CONFIG_PPC_64K_PAGES */ 1148#endif /* CONFIG_PPC_64K_PAGES */
1164 1149
1165#ifdef CONFIG_PPC_HAS_HASH_64K 1150#ifdef CONFIG_PPC_HAS_HASH_64K
@@ -1194,6 +1179,17 @@ bail:
1194 exception_exit(prev_state); 1179 exception_exit(prev_state);
1195 return rc; 1180 return rc;
1196} 1181}
1182EXPORT_SYMBOL_GPL(hash_page_mm);
1183
1184int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
1185{
1186 struct mm_struct *mm = current->mm;
1187
1188 if (REGION_ID(ea) == VMALLOC_REGION_ID)
1189 mm = &init_mm;
1190
1191 return hash_page_mm(mm, ea, access, trap);
1192}
1197EXPORT_SYMBOL_GPL(hash_page); 1193EXPORT_SYMBOL_GPL(hash_page);
1198 1194
1199void hash_preload(struct mm_struct *mm, unsigned long ea, 1195void hash_preload(struct mm_struct *mm, unsigned long ea,
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 826893fcb3a7..5f5e6328c21c 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -18,6 +18,57 @@
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <asm/machdep.h> 19#include <asm/machdep.h>
20 20
21static void invalidate_old_hpte(unsigned long vsid, unsigned long addr,
22 pmd_t *pmdp, unsigned int psize, int ssize)
23{
24 int i, max_hpte_count, valid;
25 unsigned long s_addr;
26 unsigned char *hpte_slot_array;
27 unsigned long hidx, shift, vpn, hash, slot;
28
29 s_addr = addr & HPAGE_PMD_MASK;
30 hpte_slot_array = get_hpte_slot_array(pmdp);
31 /*
32 * IF we try to do a HUGE PTE update after a withdraw is done.
33 * we will find the below NULL. This happens when we do
34 * split_huge_page_pmd
35 */
36 if (!hpte_slot_array)
37 return;
38
39 if (ppc_md.hugepage_invalidate)
40 return ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
41 psize, ssize);
42 /*
43 * No bluk hpte removal support, invalidate each entry
44 */
45 shift = mmu_psize_defs[psize].shift;
46 max_hpte_count = HPAGE_PMD_SIZE >> shift;
47 for (i = 0; i < max_hpte_count; i++) {
48 /*
49 * 8 bits per each hpte entries
50 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
51 */
52 valid = hpte_valid(hpte_slot_array, i);
53 if (!valid)
54 continue;
55 hidx = hpte_hash_index(hpte_slot_array, i);
56
57 /* get the vpn */
58 addr = s_addr + (i * (1ul << shift));
59 vpn = hpt_vpn(addr, vsid, ssize);
60 hash = hpt_hash(vpn, shift, ssize);
61 if (hidx & _PTEIDX_SECONDARY)
62 hash = ~hash;
63
64 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
65 slot += hidx & _PTEIDX_GROUP_IX;
66 ppc_md.hpte_invalidate(slot, vpn, psize,
67 MMU_PAGE_16M, ssize, 0);
68 }
69}
70
71
21int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, 72int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
22 pmd_t *pmdp, unsigned long trap, int local, int ssize, 73 pmd_t *pmdp, unsigned long trap, int local, int ssize,
23 unsigned int psize) 74 unsigned int psize)
@@ -33,7 +84,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
33 * atomically mark the linux large page PMD busy and dirty 84 * atomically mark the linux large page PMD busy and dirty
34 */ 85 */
35 do { 86 do {
36 old_pmd = pmd_val(*pmdp); 87 pmd_t pmd = ACCESS_ONCE(*pmdp);
88
89 old_pmd = pmd_val(pmd);
37 /* If PMD busy, retry the access */ 90 /* If PMD busy, retry the access */
38 if (unlikely(old_pmd & _PAGE_BUSY)) 91 if (unlikely(old_pmd & _PAGE_BUSY))
39 return 0; 92 return 0;
@@ -85,6 +138,15 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
85 vpn = hpt_vpn(ea, vsid, ssize); 138 vpn = hpt_vpn(ea, vsid, ssize);
86 hash = hpt_hash(vpn, shift, ssize); 139 hash = hpt_hash(vpn, shift, ssize);
87 hpte_slot_array = get_hpte_slot_array(pmdp); 140 hpte_slot_array = get_hpte_slot_array(pmdp);
141 if (psize == MMU_PAGE_4K) {
142 /*
143 * invalidate the old hpte entry if we have that mapped via 64K
144 * base page size. This is because demote_segment won't flush
145 * hash page table entries.
146 */
147 if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO))
148 invalidate_old_hpte(vsid, ea, pmdp, MMU_PAGE_64K, ssize);
149 }
88 150
89 valid = hpte_valid(hpte_slot_array, index); 151 valid = hpte_valid(hpte_slot_array, index);
90 if (valid) { 152 if (valid) {
@@ -107,11 +169,8 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
107 * safely update this here. 169 * safely update this here.
108 */ 170 */
109 valid = 0; 171 valid = 0;
110 new_pmd &= ~_PAGE_HPTEFLAGS;
111 hpte_slot_array[index] = 0; 172 hpte_slot_array[index] = 0;
112 } else 173 }
113 /* clear the busy bits and set the hash pte bits */
114 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
115 } 174 }
116 175
117 if (!valid) { 176 if (!valid) {
@@ -119,11 +178,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
119 178
120 /* insert new entry */ 179 /* insert new entry */
121 pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; 180 pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
122repeat: 181 new_pmd |= _PAGE_HASHPTE;
123 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
124
125 /* clear the busy bits and set the hash pte bits */
126 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
127 182
128 /* Add in WIMG bits */ 183 /* Add in WIMG bits */
129 rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | 184 rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
@@ -132,6 +187,8 @@ repeat:
132 * enable the memory coherence always 187 * enable the memory coherence always
133 */ 188 */
134 rflags |= HPTE_R_M; 189 rflags |= HPTE_R_M;
190repeat:
191 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
135 192
136 /* Insert into the hash table, primary slot */ 193 /* Insert into the hash table, primary slot */
137 slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, 194 slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -172,8 +229,17 @@ repeat:
172 mark_hpte_slot_valid(hpte_slot_array, index, slot); 229 mark_hpte_slot_valid(hpte_slot_array, index, slot);
173 } 230 }
174 /* 231 /*
175 * No need to use ldarx/stdcx here 232 * Mark the pte with _PAGE_COMBO, if we are trying to hash it with
233 * base page size 4k.
234 */
235 if (psize == MMU_PAGE_4K)
236 new_pmd |= _PAGE_COMBO;
237 /*
238 * The hpte valid is stored in the pgtable whose address is in the
239 * second half of the PMD. Order this against clearing of the busy bit in
240 * huge pmd.
176 */ 241 */
242 smp_wmb();
177 *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); 243 *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
178 return 0; 244 return 0;
179} 245}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index cff59f1bec23..415a51b028b9 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -103,14 +103,14 @@ unsigned long __max_low_memory = MAX_LOW_MEM;
103/* 103/*
104 * Check for command-line options that affect what MMU_init will do. 104 * Check for command-line options that affect what MMU_init will do.
105 */ 105 */
106void MMU_setup(void) 106void __init MMU_setup(void)
107{ 107{
108 /* Check for nobats option (used in mapin_ram). */ 108 /* Check for nobats option (used in mapin_ram). */
109 if (strstr(cmd_line, "nobats")) { 109 if (strstr(boot_command_line, "nobats")) {
110 __map_without_bats = 1; 110 __map_without_bats = 1;
111 } 111 }
112 112
113 if (strstr(cmd_line, "noltlbs")) { 113 if (strstr(boot_command_line, "noltlbs")) {
114 __map_without_ltlbs = 1; 114 __map_without_ltlbs = 1;
115 } 115 }
116#ifdef CONFIG_DEBUG_PAGEALLOC 116#ifdef CONFIG_DEBUG_PAGEALLOC
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index e3734edffa69..3481556a1880 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -175,9 +175,10 @@ static unsigned long __meminit vmemmap_section_start(unsigned long page)
175static int __meminit vmemmap_populated(unsigned long start, int page_size) 175static int __meminit vmemmap_populated(unsigned long start, int page_size)
176{ 176{
177 unsigned long end = start + page_size; 177 unsigned long end = start + page_size;
178 start = (unsigned long)(pfn_to_page(vmemmap_section_start(start)));
178 179
179 for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page))) 180 for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
180 if (pfn_valid(vmemmap_section_start(start))) 181 if (pfn_valid(page_to_pfn((struct page *)start)))
181 return 1; 182 return 1;
182 183
183 return 0; 184 return 0;
@@ -212,6 +213,13 @@ static void __meminit vmemmap_create_mapping(unsigned long start,
212 for (i = 0; i < page_size; i += PAGE_SIZE) 213 for (i = 0; i < page_size; i += PAGE_SIZE)
213 BUG_ON(map_kernel_page(start + i, phys, flags)); 214 BUG_ON(map_kernel_page(start + i, phys, flags));
214} 215}
216
217#ifdef CONFIG_MEMORY_HOTPLUG
218static void vmemmap_remove_mapping(unsigned long start,
219 unsigned long page_size)
220{
221}
222#endif
215#else /* CONFIG_PPC_BOOK3E */ 223#else /* CONFIG_PPC_BOOK3E */
216static void __meminit vmemmap_create_mapping(unsigned long start, 224static void __meminit vmemmap_create_mapping(unsigned long start,
217 unsigned long page_size, 225 unsigned long page_size,
@@ -223,17 +231,39 @@ static void __meminit vmemmap_create_mapping(unsigned long start,
223 mmu_kernel_ssize); 231 mmu_kernel_ssize);
224 BUG_ON(mapped < 0); 232 BUG_ON(mapped < 0);
225} 233}
234
235#ifdef CONFIG_MEMORY_HOTPLUG
236static void vmemmap_remove_mapping(unsigned long start,
237 unsigned long page_size)
238{
239 int mapped = htab_remove_mapping(start, start + page_size,
240 mmu_vmemmap_psize,
241 mmu_kernel_ssize);
242 BUG_ON(mapped < 0);
243}
244#endif
245
226#endif /* CONFIG_PPC_BOOK3E */ 246#endif /* CONFIG_PPC_BOOK3E */
227 247
228struct vmemmap_backing *vmemmap_list; 248struct vmemmap_backing *vmemmap_list;
249static struct vmemmap_backing *next;
250static int num_left;
251static int num_freed;
229 252
230static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) 253static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
231{ 254{
232 static struct vmemmap_backing *next; 255 struct vmemmap_backing *vmem_back;
233 static int num_left; 256 /* get from freed entries first */
257 if (num_freed) {
258 num_freed--;
259 vmem_back = next;
260 next = next->list;
261
262 return vmem_back;
263 }
234 264
235 /* allocate a page when required and hand out chunks */ 265 /* allocate a page when required and hand out chunks */
236 if (!next || !num_left) { 266 if (!num_left) {
237 next = vmemmap_alloc_block(PAGE_SIZE, node); 267 next = vmemmap_alloc_block(PAGE_SIZE, node);
238 if (unlikely(!next)) { 268 if (unlikely(!next)) {
239 WARN_ON(1); 269 WARN_ON(1);
@@ -296,10 +326,85 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
296 return 0; 326 return 0;
297} 327}
298 328
299void vmemmap_free(unsigned long start, unsigned long end) 329#ifdef CONFIG_MEMORY_HOTPLUG
330static unsigned long vmemmap_list_free(unsigned long start)
300{ 331{
332 struct vmemmap_backing *vmem_back, *vmem_back_prev;
333
334 vmem_back_prev = vmem_back = vmemmap_list;
335
336 /* look for it with prev pointer recorded */
337 for (; vmem_back; vmem_back = vmem_back->list) {
338 if (vmem_back->virt_addr == start)
339 break;
340 vmem_back_prev = vmem_back;
341 }
342
343 if (unlikely(!vmem_back)) {
344 WARN_ON(1);
345 return 0;
346 }
347
348 /* remove it from vmemmap_list */
349 if (vmem_back == vmemmap_list) /* remove head */
350 vmemmap_list = vmem_back->list;
351 else
352 vmem_back_prev->list = vmem_back->list;
353
354 /* next point to this freed entry */
355 vmem_back->list = next;
356 next = vmem_back;
357 num_freed++;
358
359 return vmem_back->phys;
301} 360}
302 361
362void __ref vmemmap_free(unsigned long start, unsigned long end)
363{
364 unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
365
366 start = _ALIGN_DOWN(start, page_size);
367
368 pr_debug("vmemmap_free %lx...%lx\n", start, end);
369
370 for (; start < end; start += page_size) {
371 unsigned long addr;
372
373 /*
374 * the section has already be marked as invalid, so
375 * vmemmap_populated() true means some other sections still
376 * in this page, so skip it.
377 */
378 if (vmemmap_populated(start, page_size))
379 continue;
380
381 addr = vmemmap_list_free(start);
382 if (addr) {
383 struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
384
385 if (PageReserved(page)) {
386 /* allocated from bootmem */
387 if (page_size < PAGE_SIZE) {
388 /*
389 * this shouldn't happen, but if it is
390 * the case, leave the memory there
391 */
392 WARN_ON_ONCE(1);
393 } else {
394 unsigned int nr_pages =
395 1 << get_order(page_size);
396 while (nr_pages--)
397 free_reserved_page(page++);
398 }
399 } else
400 free_pages((unsigned long)(__va(addr)),
401 get_order(page_size));
402
403 vmemmap_remove_mapping(start, page_size);
404 }
405 }
406}
407#endif
303void register_page_bootmem_memmap(unsigned long section_nr, 408void register_page_bootmem_memmap(unsigned long section_nr,
304 struct page *start_page, unsigned long size) 409 struct page *start_page, unsigned long size)
305{ 410{
@@ -331,16 +436,16 @@ struct page *realmode_pfn_to_page(unsigned long pfn)
331 if (pg_va < vmem_back->virt_addr) 436 if (pg_va < vmem_back->virt_addr)
332 continue; 437 continue;
333 438
334 /* Check that page struct is not split between real pages */ 439 /* After vmemmap_list entry free is possible, need check all */
335 if ((pg_va + sizeof(struct page)) > 440 if ((pg_va + sizeof(struct page)) <=
336 (vmem_back->virt_addr + page_size)) 441 (vmem_back->virt_addr + page_size)) {
337 return NULL; 442 page = (struct page *) (vmem_back->phys + pg_va -
338
339 page = (struct page *) (vmem_back->phys + pg_va -
340 vmem_back->virt_addr); 443 vmem_back->virt_addr);
341 return page; 444 return page;
445 }
342 } 446 }
343 447
448 /* Probably that page struct is split between real pages */
344 return NULL; 449 return NULL;
345} 450}
346EXPORT_SYMBOL_GPL(realmode_pfn_to_page); 451EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 2c8e90f5789e..8ebaac75c940 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -128,7 +128,8 @@ int arch_add_memory(int nid, u64 start, u64 size)
128 return -EINVAL; 128 return -EINVAL;
129 129
130 /* this should work for most non-highmem platforms */ 130 /* this should work for most non-highmem platforms */
131 zone = pgdata->node_zones; 131 zone = pgdata->node_zones +
132 zone_for_memory(nid, start, size, 0);
132 133
133 return __add_pages(nid, zone, start_pfn, nr_pages); 134 return __add_pages(nid, zone, start_pfn, nr_pages);
134} 135}
@@ -259,6 +260,60 @@ static int __init mark_nonram_nosave(void)
259 } 260 }
260 return 0; 261 return 0;
261} 262}
263#else /* CONFIG_NEED_MULTIPLE_NODES */
264static int __init mark_nonram_nosave(void)
265{
266 return 0;
267}
268#endif
269
270static bool zone_limits_final;
271
272static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
273 [0 ... MAX_NR_ZONES - 1] = ~0UL
274};
275
276/*
277 * Restrict the specified zone and all more restrictive zones
278 * to be below the specified pfn. May not be called after
279 * paging_init().
280 */
281void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit)
282{
283 int i;
284
285 if (WARN_ON(zone_limits_final))
286 return;
287
288 for (i = zone; i >= 0; i--) {
289 if (max_zone_pfns[i] > pfn_limit)
290 max_zone_pfns[i] = pfn_limit;
291 }
292}
293
294/*
295 * Find the least restrictive zone that is entirely below the
296 * specified pfn limit. Returns < 0 if no suitable zone is found.
297 *
298 * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit
299 * systems -- the DMA limit can be higher than any possible real pfn.
300 */
301int dma_pfn_limit_to_zone(u64 pfn_limit)
302{
303 enum zone_type top_zone = ZONE_NORMAL;
304 int i;
305
306#ifdef CONFIG_HIGHMEM
307 top_zone = ZONE_HIGHMEM;
308#endif
309
310 for (i = top_zone; i >= 0; i--) {
311 if (max_zone_pfns[i] <= pfn_limit)
312 return i;
313 }
314
315 return -EPERM;
316}
262 317
263/* 318/*
264 * paging_init() sets up the page tables - in fact we've already done this. 319 * paging_init() sets up the page tables - in fact we've already done this.
@@ -267,7 +322,7 @@ void __init paging_init(void)
267{ 322{
268 unsigned long long total_ram = memblock_phys_mem_size(); 323 unsigned long long total_ram = memblock_phys_mem_size();
269 phys_addr_t top_of_ram = memblock_end_of_DRAM(); 324 phys_addr_t top_of_ram = memblock_end_of_DRAM();
270 unsigned long max_zone_pfns[MAX_NR_ZONES]; 325 enum zone_type top_zone;
271 326
272#ifdef CONFIG_PPC32 327#ifdef CONFIG_PPC32
273 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); 328 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
@@ -289,18 +344,20 @@ void __init paging_init(void)
289 (unsigned long long)top_of_ram, total_ram); 344 (unsigned long long)top_of_ram, total_ram);
290 printk(KERN_DEBUG "Memory hole size: %ldMB\n", 345 printk(KERN_DEBUG "Memory hole size: %ldMB\n",
291 (long int)((top_of_ram - total_ram) >> 20)); 346 (long int)((top_of_ram - total_ram) >> 20));
292 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 347
293#ifdef CONFIG_HIGHMEM 348#ifdef CONFIG_HIGHMEM
294 max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT; 349 top_zone = ZONE_HIGHMEM;
295 max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT; 350 limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
296#else 351#else
297 max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; 352 top_zone = ZONE_NORMAL;
298#endif 353#endif
354
355 limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
356 zone_limits_final = true;
299 free_area_init_nodes(max_zone_pfns); 357 free_area_init_nodes(max_zone_pfns);
300 358
301 mark_nonram_nosave(); 359 mark_nonram_nosave();
302} 360}
303#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
304 361
305static void __init register_page_bootmem_info(void) 362static void __init register_page_bootmem_info(void)
306{ 363{
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c
index 78fef6726e10..aa5a7fd89461 100644
--- a/arch/powerpc/mm/mmu_context_hash32.c
+++ b/arch/powerpc/mm/mmu_context_hash32.c
@@ -2,7 +2,7 @@
2 * This file contains the routines for handling the MMU on those 2 * This file contains the routines for handling the MMU on those
3 * PowerPC implementations where the MMU substantially follows the 3 * PowerPC implementations where the MMU substantially follows the
4 * architecture specification. This includes the 6xx, 7xx, 7xxx, 4 * architecture specification. This includes the 6xx, 7xx, 7xxx,
5 * 8260, and POWER3 implementations but excludes the 8xx and 4xx. 5 * and 8260 implementations but excludes the 8xx and 4xx.
6 * -- paulus 6 * -- paulus
7 * 7 *
8 * Derived from arch/ppc/mm/init.c: 8 * Derived from arch/ppc/mm/init.c:
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3b181b22cd46..b9d1dfdbe5bb 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -8,6 +8,8 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11#define pr_fmt(fmt) "numa: " fmt
12
11#include <linux/threads.h> 13#include <linux/threads.h>
12#include <linux/bootmem.h> 14#include <linux/bootmem.h>
13#include <linux/init.h> 15#include <linux/init.h>
@@ -538,7 +540,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
538 */ 540 */
539static int numa_setup_cpu(unsigned long lcpu) 541static int numa_setup_cpu(unsigned long lcpu)
540{ 542{
541 int nid; 543 int nid = -1;
542 struct device_node *cpu; 544 struct device_node *cpu;
543 545
544 /* 546 /*
@@ -555,19 +557,21 @@ static int numa_setup_cpu(unsigned long lcpu)
555 557
556 if (!cpu) { 558 if (!cpu) {
557 WARN_ON(1); 559 WARN_ON(1);
558 nid = 0; 560 if (cpu_present(lcpu))
559 goto out; 561 goto out_present;
562 else
563 goto out;
560 } 564 }
561 565
562 nid = of_node_to_nid_single(cpu); 566 nid = of_node_to_nid_single(cpu);
563 567
568out_present:
564 if (nid < 0 || !node_online(nid)) 569 if (nid < 0 || !node_online(nid))
565 nid = first_online_node; 570 nid = first_online_node;
566out:
567 map_cpu_to_node(lcpu, nid);
568 571
572 map_cpu_to_node(lcpu, nid);
569 of_node_put(cpu); 573 of_node_put(cpu);
570 574out:
571 return nid; 575 return nid;
572} 576}
573 577
@@ -611,8 +615,8 @@ static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
611 case CPU_UP_CANCELED: 615 case CPU_UP_CANCELED:
612 case CPU_UP_CANCELED_FROZEN: 616 case CPU_UP_CANCELED_FROZEN:
613 unmap_cpu_from_node(lcpu); 617 unmap_cpu_from_node(lcpu);
614 break;
615 ret = NOTIFY_OK; 618 ret = NOTIFY_OK;
619 break;
616#endif 620#endif
617 } 621 }
618 return ret; 622 return ret;
@@ -1049,7 +1053,7 @@ static void __init mark_reserved_regions_for_nid(int nid)
1049 1053
1050void __init do_init_bootmem(void) 1054void __init do_init_bootmem(void)
1051{ 1055{
1052 int nid; 1056 int nid, cpu;
1053 1057
1054 min_low_pfn = 0; 1058 min_low_pfn = 0;
1055 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; 1059 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
@@ -1122,16 +1126,14 @@ void __init do_init_bootmem(void)
1122 1126
1123 reset_numa_cpu_lookup_table(); 1127 reset_numa_cpu_lookup_table();
1124 register_cpu_notifier(&ppc64_numa_nb); 1128 register_cpu_notifier(&ppc64_numa_nb);
1125 cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, 1129 /*
1126 (void *)(unsigned long)boot_cpuid); 1130 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
1127} 1131 * even before we online them, so that we can use cpu_to_{node,mem}
1128 1132 * early in boot, cf. smp_prepare_cpus().
1129void __init paging_init(void) 1133 */
1130{ 1134 for_each_present_cpu(cpu) {
1131 unsigned long max_zone_pfns[MAX_NR_ZONES]; 1135 numa_setup_cpu((unsigned long)cpu);
1132 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 1136 }
1133 max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
1134 free_area_init_nodes(max_zone_pfns);
1135} 1137}
1136 1138
1137static int __init early_numa(char *p) 1139static int __init early_numa(char *p)
@@ -1153,6 +1155,22 @@ static int __init early_numa(char *p)
1153} 1155}
1154early_param("numa", early_numa); 1156early_param("numa", early_numa);
1155 1157
1158static bool topology_updates_enabled = true;
1159
1160static int __init early_topology_updates(char *p)
1161{
1162 if (!p)
1163 return 0;
1164
1165 if (!strcmp(p, "off")) {
1166 pr_info("Disabling topology updates\n");
1167 topology_updates_enabled = false;
1168 }
1169
1170 return 0;
1171}
1172early_param("topology_updates", early_topology_updates);
1173
1156#ifdef CONFIG_MEMORY_HOTPLUG 1174#ifdef CONFIG_MEMORY_HOTPLUG
1157/* 1175/*
1158 * Find the node associated with a hot added memory section for 1176 * Find the node associated with a hot added memory section for
@@ -1442,8 +1460,11 @@ static long hcall_vphn(unsigned long cpu, __be32 *associativity)
1442 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; 1460 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
1443 u64 flags = 1; 1461 u64 flags = 1;
1444 int hwcpu = get_hard_smp_processor_id(cpu); 1462 int hwcpu = get_hard_smp_processor_id(cpu);
1463 int i;
1445 1464
1446 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); 1465 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
1466 for (i = 0; i < 6; i++)
1467 retbuf[i] = cpu_to_be64(retbuf[i]);
1447 vphn_unpack_associativity(retbuf, associativity); 1468 vphn_unpack_associativity(retbuf, associativity);
1448 1469
1449 return rc; 1470 return rc;
@@ -1488,11 +1509,14 @@ static int update_cpu_topology(void *data)
1488 cpu = smp_processor_id(); 1509 cpu = smp_processor_id();
1489 1510
1490 for (update = data; update; update = update->next) { 1511 for (update = data; update; update = update->next) {
1512 int new_nid = update->new_nid;
1491 if (cpu != update->cpu) 1513 if (cpu != update->cpu)
1492 continue; 1514 continue;
1493 1515
1494 unmap_cpu_from_node(update->cpu); 1516 unmap_cpu_from_node(cpu);
1495 map_cpu_to_node(update->cpu, update->new_nid); 1517 map_cpu_to_node(cpu, new_nid);
1518 set_cpu_numa_node(cpu, new_nid);
1519 set_cpu_numa_mem(cpu, local_memory_node(new_nid));
1496 vdso_getcpu_init(); 1520 vdso_getcpu_init();
1497 } 1521 }
1498 1522
@@ -1539,6 +1563,9 @@ int arch_update_cpu_topology(void)
1539 struct device *dev; 1563 struct device *dev;
1540 int weight, new_nid, i = 0; 1564 int weight, new_nid, i = 0;
1541 1565
1566 if (!prrn_enabled && !vphn_enabled)
1567 return 0;
1568
1542 weight = cpumask_weight(&cpu_associativity_changes_mask); 1569 weight = cpumask_weight(&cpu_associativity_changes_mask);
1543 if (!weight) 1570 if (!weight)
1544 return 0; 1571 return 0;
@@ -1592,6 +1619,15 @@ int arch_update_cpu_topology(void)
1592 cpu = cpu_last_thread_sibling(cpu); 1619 cpu = cpu_last_thread_sibling(cpu);
1593 } 1620 }
1594 1621
1622 pr_debug("Topology update for the following CPUs:\n");
1623 if (cpumask_weight(&updated_cpus)) {
1624 for (ud = &updates[0]; ud; ud = ud->next) {
1625 pr_debug("cpu %d moving from node %d "
1626 "to %d\n", ud->cpu,
1627 ud->old_nid, ud->new_nid);
1628 }
1629 }
1630
1595 /* 1631 /*
1596 * In cases where we have nothing to update (because the updates list 1632 * In cases where we have nothing to update (because the updates list
1597 * is too short or because the new topology is same as the old one), 1633 * is too short or because the new topology is same as the old one),
@@ -1800,8 +1836,12 @@ static const struct file_operations topology_ops = {
1800 1836
1801static int topology_update_init(void) 1837static int topology_update_init(void)
1802{ 1838{
1803 start_topology_update(); 1839 /* Do not poll for changes if disabled at boot */
1804 proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops); 1840 if (topology_updates_enabled)
1841 start_topology_update();
1842
1843 if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
1844 return -ENOMEM;
1805 1845
1806 return 0; 1846 return 0;
1807} 1847}
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index c695943a513c..c90e602677c9 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -48,7 +48,7 @@ static inline int pte_looks_normal(pte_t pte)
48 (_PAGE_PRESENT | _PAGE_USER); 48 (_PAGE_PRESENT | _PAGE_USER);
49} 49}
50 50
51struct page * maybe_pte_to_page(pte_t pte) 51static struct page *maybe_pte_to_page(pte_t pte)
52{ 52{
53 unsigned long pfn = pte_pfn(pte); 53 unsigned long pfn = pte_pfn(pte);
54 struct page *page; 54 struct page *page;
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 343a87fa78b5..cf11342bf519 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -41,7 +41,7 @@ unsigned long ioremap_base;
41unsigned long ioremap_bot; 41unsigned long ioremap_bot;
42EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ 42EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */
43 43
44#if defined(CONFIG_6xx) || defined(CONFIG_POWER3) 44#ifdef CONFIG_6xx
45#define HAVE_BATS 1 45#define HAVE_BATS 1
46#endif 46#endif
47 47
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index f6ce1f111f5b..c8d709ab489d 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -54,6 +54,9 @@
54 54
55#include "mmu_decl.h" 55#include "mmu_decl.h"
56 56
57#define CREATE_TRACE_POINTS
58#include <trace/events/thp.h>
59
57/* Some sanity checking */ 60/* Some sanity checking */
58#if TASK_SIZE_USER64 > PGTABLE_RANGE 61#if TASK_SIZE_USER64 > PGTABLE_RANGE
59#error TASK_SIZE_USER64 exceeds pagetable range 62#error TASK_SIZE_USER64 exceeds pagetable range
@@ -68,7 +71,7 @@
68unsigned long ioremap_bot = IOREMAP_BASE; 71unsigned long ioremap_bot = IOREMAP_BASE;
69 72
70#ifdef CONFIG_PPC_MMU_NOHASH 73#ifdef CONFIG_PPC_MMU_NOHASH
71static void *early_alloc_pgtable(unsigned long size) 74static __ref void *early_alloc_pgtable(unsigned long size)
72{ 75{
73 void *pt; 76 void *pt;
74 77
@@ -537,8 +540,9 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
537 old = pmd_val(*pmdp); 540 old = pmd_val(*pmdp);
538 *pmdp = __pmd((old & ~clr) | set); 541 *pmdp = __pmd((old & ~clr) | set);
539#endif 542#endif
543 trace_hugepage_update(addr, old, clr, set);
540 if (old & _PAGE_HASHPTE) 544 if (old & _PAGE_HASHPTE)
541 hpte_do_hugepage_flush(mm, addr, pmdp); 545 hpte_do_hugepage_flush(mm, addr, pmdp, old);
542 return old; 546 return old;
543} 547}
544 548
@@ -642,10 +646,11 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
642 * If we didn't had the splitting flag set, go and flush the 646 * If we didn't had the splitting flag set, go and flush the
643 * HPTE entries. 647 * HPTE entries.
644 */ 648 */
649 trace_hugepage_splitting(address, old);
645 if (!(old & _PAGE_SPLITTING)) { 650 if (!(old & _PAGE_SPLITTING)) {
646 /* We need to flush the hpte */ 651 /* We need to flush the hpte */
647 if (old & _PAGE_HASHPTE) 652 if (old & _PAGE_HASHPTE)
648 hpte_do_hugepage_flush(vma->vm_mm, address, pmdp); 653 hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
649 } 654 }
650 /* 655 /*
651 * This ensures that generic code that rely on IRQ disabling 656 * This ensures that generic code that rely on IRQ disabling
@@ -709,6 +714,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
709 assert_spin_locked(&mm->page_table_lock); 714 assert_spin_locked(&mm->page_table_lock);
710 WARN_ON(!pmd_trans_huge(pmd)); 715 WARN_ON(!pmd_trans_huge(pmd));
711#endif 716#endif
717 trace_hugepage_set_pmd(addr, pmd);
712 return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); 718 return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
713} 719}
714 720
@@ -723,7 +729,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
723 * needs to be flushed. 729 * needs to be flushed.
724 */ 730 */
725void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, 731void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
726 pmd_t *pmdp) 732 pmd_t *pmdp, unsigned long old_pmd)
727{ 733{
728 int ssize, i; 734 int ssize, i;
729 unsigned long s_addr; 735 unsigned long s_addr;
@@ -745,12 +751,29 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
745 if (!hpte_slot_array) 751 if (!hpte_slot_array)
746 return; 752 return;
747 753
748 /* get the base page size */ 754 /* get the base page size, vsid and segment size */
755#ifdef CONFIG_DEBUG_VM
749 psize = get_slice_psize(mm, s_addr); 756 psize = get_slice_psize(mm, s_addr);
757 BUG_ON(psize == MMU_PAGE_16M);
758#endif
759 if (old_pmd & _PAGE_COMBO)
760 psize = MMU_PAGE_4K;
761 else
762 psize = MMU_PAGE_64K;
763
764 if (!is_kernel_addr(s_addr)) {
765 ssize = user_segment_size(s_addr);
766 vsid = get_vsid(mm->context.id, s_addr, ssize);
767 WARN_ON(vsid == 0);
768 } else {
769 vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize);
770 ssize = mmu_kernel_ssize;
771 }
750 772
751 if (ppc_md.hugepage_invalidate) 773 if (ppc_md.hugepage_invalidate)
752 return ppc_md.hugepage_invalidate(mm, hpte_slot_array, 774 return ppc_md.hugepage_invalidate(vsid, s_addr,
753 s_addr, psize); 775 hpte_slot_array,
776 psize, ssize);
754 /* 777 /*
755 * No bulk hpte removal support, invalidate each entry 778 * No bulk hpte removal support, invalidate each entry
756 */ 779 */
@@ -768,15 +791,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
768 791
769 /* get the vpn */ 792 /* get the vpn */
770 addr = s_addr + (i * (1ul << shift)); 793 addr = s_addr + (i * (1ul << shift));
771 if (!is_kernel_addr(addr)) {
772 ssize = user_segment_size(addr);
773 vsid = get_vsid(mm->context.id, addr, ssize);
774 WARN_ON(vsid == 0);
775 } else {
776 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
777 ssize = mmu_kernel_ssize;
778 }
779
780 vpn = hpt_vpn(addr, vsid, ssize); 794 vpn = hpt_vpn(addr, vsid, ssize);
781 hash = hpt_hash(vpn, shift, ssize); 795 hash = hpt_hash(vpn, shift, ssize);
782 if (hidx & _PTEIDX_SECONDARY) 796 if (hidx & _PTEIDX_SECONDARY)
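The hpte_do_hugepage_flush() hunks above are heavily interleaved; gathered in one place, the '+' side of those hunks (no new code, just the added lines extracted for readability) is:

	/* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
	psize = get_slice_psize(mm, s_addr);
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & _PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

	if (!is_kernel_addr(s_addr)) {
		ssize = user_segment_size(s_addr);
		vsid = get_vsid(mm->context.id, s_addr, ssize);
		WARN_ON(vsid == 0);
	} else {
		vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize);
		ssize = mmu_kernel_ssize;
	}

	if (ppc_md.hugepage_invalidate)
		return ppc_md.hugepage_invalidate(vsid, s_addr,
						  hpte_slot_array,
						  psize, ssize);

The base page size is now derived from the old PMD value (_PAGE_COMBO means the hugepage is backed by 4K HPTEs, otherwise 64K), and vsid/ssize are computed once per hugepage instead of once per subpage inside the invalidation loop, which is why ppc_md.hugepage_invalidate() and hpte_do_hugepage_flush() grew the extra parameters.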
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 11571e118831..5029dc19b517 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -2,7 +2,7 @@
2 * This file contains the routines for handling the MMU on those 2 * This file contains the routines for handling the MMU on those
3 * PowerPC implementations where the MMU substantially follows the 3 * PowerPC implementations where the MMU substantially follows the
4 * architecture specification. This includes the 6xx, 7xx, 7xxx, 4 * architecture specification. This includes the 6xx, 7xx, 7xxx,
5 * 8260, and POWER3 implementations but excludes the 8xx and 4xx. 5 * and 8260 implementations but excludes the 8xx and 4xx.
6 * -- paulus 6 * -- paulus
7 * 7 *
8 * Derived from arch/ppc/mm/init.c: 8 * Derived from arch/ppc/mm/init.c:
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 0399a6702958..6e450ca66526 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -46,9 +46,6 @@ static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
46 return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; 46 return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
47} 47}
48 48
49#define slb_vsid_shift(ssize) \
50 ((ssize) == MMU_SEGSIZE_256M? SLB_VSID_SHIFT: SLB_VSID_SHIFT_1T)
51
52static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, 49static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
53 unsigned long flags) 50 unsigned long flags)
54{ 51{
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index b0c75cc15efc..ded0ea1afde4 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -30,9 +30,11 @@
30#include <linux/err.h> 30#include <linux/err.h>
31#include <linux/spinlock.h> 31#include <linux/spinlock.h>
32#include <linux/export.h> 32#include <linux/export.h>
33#include <linux/hugetlb.h>
33#include <asm/mman.h> 34#include <asm/mman.h>
34#include <asm/mmu.h> 35#include <asm/mmu.h>
35#include <asm/spu.h> 36#include <asm/copro.h>
37#include <asm/hugetlb.h>
36 38
37/* some sanity checks */ 39/* some sanity checks */
38#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE 40#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
@@ -232,9 +234,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
232 234
233 spin_unlock_irqrestore(&slice_convert_lock, flags); 235 spin_unlock_irqrestore(&slice_convert_lock, flags);
234 236
235#ifdef CONFIG_SPU_BASE 237 copro_flush_all_slbs(mm);
236 spu_flush_all_slbs(mm);
237#endif
238} 238}
239 239
240/* 240/*
@@ -671,9 +671,7 @@ void slice_set_psize(struct mm_struct *mm, unsigned long address,
671 671
672 spin_unlock_irqrestore(&slice_convert_lock, flags); 672 spin_unlock_irqrestore(&slice_convert_lock, flags);
673 673
674#ifdef CONFIG_SPU_BASE 674 copro_flush_all_slbs(mm);
675 spu_flush_all_slbs(mm);
676#endif
677} 675}
678 676
679void slice_set_range_psize(struct mm_struct *mm, unsigned long start, 677void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
@@ -684,6 +682,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
684 slice_convert(mm, mask, psize); 682 slice_convert(mm, mask, psize);
685} 683}
686 684
685#ifdef CONFIG_HUGETLB_PAGE
687/* 686/*
688 * is_hugepage_only_range() is used by generic code to verify whether 687 * is_hugepage_only_range() is used by generic code to verify whether
689 * a normal mmap mapping (non hugetlbfs) is valid on a given area. 688 * a normal mmap mapping (non hugetlbfs) is valid on a given area.
@@ -728,4 +727,4 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
728#endif 727#endif
729 return !slice_check_fit(mask, available); 728 return !slice_check_fit(mask, available);
730} 729}
731 730#endif
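The two slice.c hunks drop the #ifdef CONFIG_SPU_BASE guards because copro_flush_all_slbs() is expected to compile away when coprocessor support is not built in. A minimal sketch of that usual header idiom, assuming asm/copro.h follows the common optional-subsystem pattern (the actual header is not part of this diff):

/* sketch only -- modelled on the usual optional-subsystem pattern */
struct mm_struct;

#ifdef CONFIG_PPC_COPRO_BASE
void copro_flush_all_slbs(struct mm_struct *mm);
#else
static inline void copro_flush_all_slbs(struct mm_struct *mm) { }
#endif

With a static inline stub in the !CONFIG_PPC_COPRO_BASE case, callers such as slice_convert() and slice_set_psize() stay free of preprocessor clutter and the compiler removes the call entirely.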
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
deleted file mode 100644
index 3f8efa6f2997..000000000000
--- a/arch/powerpc/mm/stab.c
+++ /dev/null
@@ -1,286 +0,0 @@
1/*
2 * PowerPC64 Segment Translation Support.
3 *
4 * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
5 * Copyright (c) 2001 Dave Engebretsen
6 *
7 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/memblock.h>
16
17#include <asm/pgtable.h>
18#include <asm/mmu.h>
19#include <asm/mmu_context.h>
20#include <asm/paca.h>
21#include <asm/cputable.h>
22#include <asm/prom.h>
23
24struct stab_entry {
25 unsigned long esid_data;
26 unsigned long vsid_data;
27};
28
29#define NR_STAB_CACHE_ENTRIES 8
30static DEFINE_PER_CPU(long, stab_cache_ptr);
31static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
32
33/*
34 * Create a segment table entry for the given esid/vsid pair.
35 */
36static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
37{
38 unsigned long esid_data, vsid_data;
39 unsigned long entry, group, old_esid, castout_entry, i;
40 unsigned int global_entry;
41 struct stab_entry *ste, *castout_ste;
42 unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET;
43
44 vsid_data = vsid << STE_VSID_SHIFT;
45 esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
46 if (! kernel_segment)
47 esid_data |= STE_ESID_KS;
48
49 /* Search the primary group first. */
50 global_entry = (esid & 0x1f) << 3;
51 ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
52
53 /* Find an empty entry, if one exists. */
54 for (group = 0; group < 2; group++) {
55 for (entry = 0; entry < 8; entry++, ste++) {
56 if (!(ste->esid_data & STE_ESID_V)) {
57 ste->vsid_data = vsid_data;
58 eieio();
59 ste->esid_data = esid_data;
60 return (global_entry | entry);
61 }
62 }
63 /* Now search the secondary group. */
64 global_entry = ((~esid) & 0x1f) << 3;
65 ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
66 }
67
68 /*
69 * Could not find empty entry, pick one with a round robin selection.
70 * Search all entries in the two groups.
71 */
72 castout_entry = get_paca()->stab_rr;
73 for (i = 0; i < 16; i++) {
74 if (castout_entry < 8) {
75 global_entry = (esid & 0x1f) << 3;
76 ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
77 castout_ste = ste + castout_entry;
78 } else {
79 global_entry = ((~esid) & 0x1f) << 3;
80 ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
81 castout_ste = ste + (castout_entry - 8);
82 }
83
84 /* Don't cast out the first kernel segment */
85 if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET)
86 break;
87
88 castout_entry = (castout_entry + 1) & 0xf;
89 }
90
91 get_paca()->stab_rr = (castout_entry + 1) & 0xf;
92
93 /* Modify the old entry to the new value. */
94
95 /* Force previous translations to complete. DRENG */
96 asm volatile("isync" : : : "memory");
97
98 old_esid = castout_ste->esid_data >> SID_SHIFT;
99 castout_ste->esid_data = 0; /* Invalidate old entry */
100
101 asm volatile("sync" : : : "memory"); /* Order update */
102
103 castout_ste->vsid_data = vsid_data;
104 eieio(); /* Order update */
105 castout_ste->esid_data = esid_data;
106
107 asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT));
108 /* Ensure completion of slbie */
109 asm volatile("sync" : : : "memory");
110
111 return (global_entry | (castout_entry & 0x7));
112}
113
114/*
115 * Allocate a segment table entry for the given ea and mm
116 */
117static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
118{
119 unsigned long vsid;
120 unsigned char stab_entry;
121 unsigned long offset;
122
123 /* Kernel or user address? */
124 if (is_kernel_addr(ea)) {
125 vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M);
126 } else {
127 if ((ea >= TASK_SIZE_USER64) || (! mm))
128 return 1;
129
130 vsid = get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M);
131 }
132
133 stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
134
135 if (!is_kernel_addr(ea)) {
136 offset = __get_cpu_var(stab_cache_ptr);
137 if (offset < NR_STAB_CACHE_ENTRIES)
138 __get_cpu_var(stab_cache[offset++]) = stab_entry;
139 else
140 offset = NR_STAB_CACHE_ENTRIES+1;
141 __get_cpu_var(stab_cache_ptr) = offset;
142
143 /* Order update */
144 asm volatile("sync":::"memory");
145 }
146
147 return 0;
148}
149
150int ste_allocate(unsigned long ea)
151{
152 return __ste_allocate(ea, current->mm);
153}
154
155/*
156 * Do the segment table work for a context switch: flush all user
157 * entries from the table, then preload some probably useful entries
158 * for the new task
159 */
160void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
161{
162 struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
163 struct stab_entry *ste;
164 unsigned long offset;
165 unsigned long pc = KSTK_EIP(tsk);
166 unsigned long stack = KSTK_ESP(tsk);
167 unsigned long unmapped_base;
168
169 /* Force previous translations to complete. DRENG */
170 asm volatile("isync" : : : "memory");
171
172 /*
173 * We need interrupts hard-disabled here, not just soft-disabled,
174 * so that a PMU interrupt can't occur, which might try to access
175 * user memory (to get a stack trace) and possibly cause an STAB miss
176 * which would update the stab_cache/stab_cache_ptr per-cpu variables.
177 */
178 hard_irq_disable();
179
180 offset = __get_cpu_var(stab_cache_ptr);
181 if (offset <= NR_STAB_CACHE_ENTRIES) {
182 int i;
183
184 for (i = 0; i < offset; i++) {
185 ste = stab + __get_cpu_var(stab_cache[i]);
186 ste->esid_data = 0; /* invalidate entry */
187 }
188 } else {
189 unsigned long entry;
190
191 /* Invalidate all entries. */
192 ste = stab;
193
194 /* Never flush the first entry. */
195 ste += 1;
196 for (entry = 1;
197 entry < (HW_PAGE_SIZE / sizeof(struct stab_entry));
198 entry++, ste++) {
199 unsigned long ea;
200 ea = ste->esid_data & ESID_MASK;
201 if (!is_kernel_addr(ea)) {
202 ste->esid_data = 0;
203 }
204 }
205 }
206
207 asm volatile("sync; slbia; sync":::"memory");
208
209 __get_cpu_var(stab_cache_ptr) = 0;
210
211 /* Now preload some entries for the new task */
212 if (test_tsk_thread_flag(tsk, TIF_32BIT))
213 unmapped_base = TASK_UNMAPPED_BASE_USER32;
214 else
215 unmapped_base = TASK_UNMAPPED_BASE_USER64;
216
217 __ste_allocate(pc, mm);
218
219 if (GET_ESID(pc) == GET_ESID(stack))
220 return;
221
222 __ste_allocate(stack, mm);
223
224 if ((GET_ESID(pc) == GET_ESID(unmapped_base))
225 || (GET_ESID(stack) == GET_ESID(unmapped_base)))
226 return;
227
228 __ste_allocate(unmapped_base, mm);
229
230 /* Order update */
231 asm volatile("sync" : : : "memory");
232}
233
234/*
235 * Allocate segment tables for secondary CPUs. These must all go in
236 * the first (bolted) segment, so that do_stab_bolted won't get a
237 * recursive segment miss on the segment table itself.
238 */
239void __init stabs_alloc(void)
240{
241 int cpu;
242
243 if (mmu_has_feature(MMU_FTR_SLB))
244 return;
245
246 for_each_possible_cpu(cpu) {
247 unsigned long newstab;
248
249 if (cpu == 0)
250 continue; /* stab for CPU 0 is statically allocated */
251
252 newstab = memblock_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
253 1<<SID_SHIFT);
254 newstab = (unsigned long)__va(newstab);
255
256 memset((void *)newstab, 0, HW_PAGE_SIZE);
257
258 paca[cpu].stab_addr = newstab;
259 paca[cpu].stab_real = __pa(newstab);
260 printk(KERN_INFO "Segment table for CPU %d at 0x%llx "
261 "virtual, 0x%llx absolute\n",
262 cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
263 }
264}
265
266/*
267 * Build an entry for the base kernel segment and put it into
268 * the segment table or SLB. All other segment table or SLB
269 * entries are faulted in.
270 */
271void stab_initialize(unsigned long stab)
272{
273 unsigned long vsid = get_kernel_vsid(PAGE_OFFSET, MMU_SEGSIZE_256M);
274 unsigned long stabreal;
275
276 asm volatile("isync; slbia; isync":::"memory");
277 make_ste(stab, GET_ESID(PAGE_OFFSET), vsid);
278
279 /* Order update */
280 asm volatile("sync":::"memory");
281
282 /* Set ASR */
283 stabreal = get_paca()->stab_real | 0x1ul;
284
285 mtspr(SPRN_ASR, stabreal);
286}
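For readers unfamiliar with the segment-table code being deleted above, the group selection in make_ste() is simple arithmetic: a 4 KB segment table holds 32 groups of eight 16-byte entries, the primary group is indexed by the low five ESID bits and the secondary group by their complement. A small standalone model of that arithmetic (plain userspace C, illustrative values only):

#include <stdio.h>

#define STE_SIZE       16UL    /* sizeof(struct stab_entry): two longs */
#define STE_PER_GROUP  8UL

/* byte offset of the primary group for an ESID, as in make_ste() */
static unsigned long primary_group_offset(unsigned long esid)
{
	return (esid & 0x1f) * STE_PER_GROUP * STE_SIZE;
}

/* byte offset of the secondary group: same hash on the complemented ESID */
static unsigned long secondary_group_offset(unsigned long esid)
{
	return (~esid & 0x1f) * STE_PER_GROUP * STE_SIZE;
}

int main(void)
{
	unsigned long esid = 0x123;    /* arbitrary example ESID */

	printf("primary   group at stab+0x%lx, first global entry %lu\n",
	       primary_group_offset(esid), (esid & 0x1f) << 3);
	printf("secondary group at stab+0x%lx, first global entry %lu\n",
	       secondary_group_offset(esid), (~esid & 0x1f) << 3);
	return 0;
}

make_ste() ORs these offsets onto the page-aligned stab base, scans both groups for a free slot, and only falls back to the round-robin castout tracked in get_paca()->stab_rr when all sixteen candidates are valid.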
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index c99f6510a0b2..d2a94b85dbc2 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -30,6 +30,8 @@
30#include <asm/tlb.h> 30#include <asm/tlb.h>
31#include <asm/bug.h> 31#include <asm/bug.h>
32 32
33#include <trace/events/thp.h>
34
33DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); 35DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
34 36
35/* 37/*
@@ -213,10 +215,12 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
213 if (ptep == NULL) 215 if (ptep == NULL)
214 continue; 216 continue;
215 pte = pte_val(*ptep); 217 pte = pte_val(*ptep);
218 if (hugepage_shift)
219 trace_hugepage_invalidate(start, pte_val(pte));
216 if (!(pte & _PAGE_HASHPTE)) 220 if (!(pte & _PAGE_HASHPTE))
217 continue; 221 continue;
218 if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) 222 if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
219 hpte_do_hugepage_flush(mm, start, (pmd_t *)pte); 223 hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
220 else 224 else
221 hpte_need_flush(mm, start, ptep, pte, 0); 225 hpte_need_flush(mm, start, ptep, pte, 0);
222 } 226 }
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 356e8b41fb09..89bf95bd63b1 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -296,9 +296,12 @@ itlb_miss_fault_bolted:
296 * r14 = page table base 296 * r14 = page table base
297 * r13 = PACA 297 * r13 = PACA
298 * r11 = tlb_per_core ptr 298 * r11 = tlb_per_core ptr
299 * r10 = cpu number 299 * r10 = crap (free to use)
300 */ 300 */
301tlb_miss_common_e6500: 301tlb_miss_common_e6500:
302 crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */
303
304BEGIN_FTR_SECTION /* CPU_FTR_SMT */
302 /* 305 /*
303 * Search if we already have an indirect entry for that virtual 306 * Search if we already have an indirect entry for that virtual
304 * address, and if we do, bail out. 307 * address, and if we do, bail out.
@@ -309,6 +312,7 @@ tlb_miss_common_e6500:
309 lhz r10,PACAPACAINDEX(r13) 312 lhz r10,PACAPACAINDEX(r13)
310 cmpdi r15,0 313 cmpdi r15,0
311 cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ 314 cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */
315 addi r10,r10,1
312 bne 2f 316 bne 2f
313 stbcx. r10,0,r11 317 stbcx. r10,0,r11
314 bne 1b 318 bne 1b
@@ -322,18 +326,62 @@ tlb_miss_common_e6500:
322 b 1b 326 b 1b
323 .previous 327 .previous
324 328
329 /*
330 * Erratum A-008139 says that we can't use tlbwe to change
331 * an indirect entry in any way (including replacing or
332 * invalidating) if the other thread could be in the process
333 * of a lookup. The workaround is to invalidate the entry
334 * with tlbilx before overwriting.
335 */
336
337 lbz r15,TCD_ESEL_NEXT(r11)
338 rlwinm r10,r15,16,0xff0000
339 oris r10,r10,MAS0_TLBSEL(1)@h
340 mtspr SPRN_MAS0,r10
341 isync
342 tlbre
343 mfspr r15,SPRN_MAS1
344 andis. r15,r15,MAS1_VALID@h
345 beq 5f
346
347BEGIN_FTR_SECTION_NESTED(532)
348 mfspr r10,SPRN_MAS8
349 rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */
350 mtspr SPRN_MAS5,r10
351END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532)
352
353 mfspr r10,SPRN_MAS1
354 rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */
355 rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */
356 mfspr r10,SPRN_MAS6
357 mtspr SPRN_MAS6,r15
358
325 mfspr r15,SPRN_MAS2 359 mfspr r15,SPRN_MAS2
360 isync
361 tlbilxva 0,r15
362 isync
363
364 mtspr SPRN_MAS6,r10
365
3665:
367BEGIN_FTR_SECTION_NESTED(532)
368 li r10,0
369 mtspr SPRN_MAS8,r10
370 mtspr SPRN_MAS5,r10
371END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532)
326 372
327 tlbsx 0,r16 373 tlbsx 0,r16
328 mfspr r10,SPRN_MAS1 374 mfspr r10,SPRN_MAS1
329 andis. r10,r10,MAS1_VALID@h 375 andis. r15,r10,MAS1_VALID@h
330 bne tlb_miss_done_e6500 376 bne tlb_miss_done_e6500
331 377FTR_SECTION_ELSE
332 /* Undo MAS-damage from the tlbsx */
333 mfspr r10,SPRN_MAS1 378 mfspr r10,SPRN_MAS1
379ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
380
334 oris r10,r10,MAS1_VALID@h 381 oris r10,r10,MAS1_VALID@h
335 mtspr SPRN_MAS1,r10 382 beq cr2,4f
336 mtspr SPRN_MAS2,r15 383 rlwinm r10,r10,0,16,1 /* Clear TID */
3844: mtspr SPRN_MAS1,r10
337 385
338 /* Now, we need to walk the page tables. First check if we are in 386 /* Now, we need to walk the page tables. First check if we are in
339 * range. 387 * range.
@@ -394,11 +442,13 @@ tlb_miss_common_e6500:
394 442
395tlb_miss_done_e6500: 443tlb_miss_done_e6500:
396 .macro tlb_unlock_e6500 444 .macro tlb_unlock_e6500
445BEGIN_FTR_SECTION
397 beq cr1,1f /* no unlock if lock was recursively grabbed */ 446 beq cr1,1f /* no unlock if lock was recursively grabbed */
398 li r15,0 447 li r15,0
399 isync 448 isync
400 stb r15,0(r11) 449 stb r15,0(r11)
4011: 4501:
451END_FTR_SECTION_IFSET(CPU_FTR_SMT)
402 .endm 452 .endm
403 453
404 tlb_unlock_e6500 454 tlb_unlock_e6500
@@ -407,12 +457,9 @@ tlb_miss_done_e6500:
407 rfi 457 rfi
408 458
409tlb_miss_kernel_e6500: 459tlb_miss_kernel_e6500:
410 mfspr r10,SPRN_MAS1
411 ld r14,PACA_KERNELPGD(r13) 460 ld r14,PACA_KERNELPGD(r13)
412 cmpldi cr0,r15,8 /* Check for vmalloc region */ 461 cmpldi cr1,r15,8 /* Check for vmalloc region */
413 rlwinm r10,r10,0,16,1 /* Clear TID */ 462 beq+ cr1,tlb_miss_common_e6500
414 mtspr SPRN_MAS1,r10
415 beq+ tlb_miss_common_e6500
416 463
417tlb_miss_fault_e6500: 464tlb_miss_fault_e6500:
418 tlb_unlock_e6500 465 tlb_unlock_e6500
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 92cb18d52ea8..f38ea4df6a85 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -581,42 +581,10 @@ static void setup_mmu_htw(void)
581/* 581/*
582 * Early initialization of the MMU TLB code 582 * Early initialization of the MMU TLB code
583 */ 583 */
584static void __early_init_mmu(int boot_cpu) 584static void early_init_this_mmu(void)
585{ 585{
586 unsigned int mas4; 586 unsigned int mas4;
587 587
588 /* XXX This will have to be decided at runtime, but right
589 * now our boot and TLB miss code hard wires it. Ideally
590 * we should find out a suitable page size and patch the
591 * TLB miss code (either that or use the PACA to store
592 * the value we want)
593 */
594 mmu_linear_psize = MMU_PAGE_1G;
595
596 /* XXX This should be decided at runtime based on supported
597 * page sizes in the TLB, but for now let's assume 16M is
598 * always there and a good fit (which it probably is)
599 *
600 * Freescale booke only supports 4K pages in TLB0, so use that.
601 */
602 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
603 mmu_vmemmap_psize = MMU_PAGE_4K;
604 else
605 mmu_vmemmap_psize = MMU_PAGE_16M;
606
607 /* XXX This code only checks for TLB 0 capabilities and doesn't
608 * check what page size combos are supported by the HW. It
609 * also doesn't handle the case where a separate array holds
610 * the IND entries from the array loaded by the PT.
611 */
612 if (boot_cpu) {
613 /* Look for supported page sizes */
614 setup_page_sizes();
615
616 /* Look for HW tablewalk support */
617 setup_mmu_htw();
618 }
619
620 /* Set MAS4 based on page table setting */ 588 /* Set MAS4 based on page table setting */
621 589
622 mas4 = 0x4 << MAS4_WIMGED_SHIFT; 590 mas4 = 0x4 << MAS4_WIMGED_SHIFT;
@@ -650,11 +618,6 @@ static void __early_init_mmu(int boot_cpu)
650 } 618 }
651 mtspr(SPRN_MAS4, mas4); 619 mtspr(SPRN_MAS4, mas4);
652 620
653 /* Set the global containing the top of the linear mapping
654 * for use by the TLB miss code
655 */
656 linear_map_top = memblock_end_of_DRAM();
657
658#ifdef CONFIG_PPC_FSL_BOOK3E 621#ifdef CONFIG_PPC_FSL_BOOK3E
659 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { 622 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
660 unsigned int num_cams; 623 unsigned int num_cams;
@@ -662,10 +625,49 @@ static void __early_init_mmu(int boot_cpu)
662 /* use a quarter of the TLBCAM for bolted linear map */ 625 /* use a quarter of the TLBCAM for bolted linear map */
663 num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; 626 num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
664 linear_map_top = map_mem_in_cams(linear_map_top, num_cams); 627 linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
628 }
629#endif
665 630
666 /* limit memory so we dont have linear faults */ 631 /* A sync won't hurt us after mucking around with
667 memblock_enforce_memory_limit(linear_map_top); 632 * the MMU configuration
633 */
634 mb();
635}
668 636
637static void __init early_init_mmu_global(void)
638{
639 /* XXX This will have to be decided at runtime, but right
640 * now our boot and TLB miss code hard wires it. Ideally
641 * we should find out a suitable page size and patch the
642 * TLB miss code (either that or use the PACA to store
643 * the value we want)
644 */
645 mmu_linear_psize = MMU_PAGE_1G;
646
647 /* XXX This should be decided at runtime based on supported
648 * page sizes in the TLB, but for now let's assume 16M is
649 * always there and a good fit (which it probably is)
650 *
651 * Freescale booke only supports 4K pages in TLB0, so use that.
652 */
653 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
654 mmu_vmemmap_psize = MMU_PAGE_4K;
655 else
656 mmu_vmemmap_psize = MMU_PAGE_16M;
657
658 /* XXX This code only checks for TLB 0 capabilities and doesn't
659 * check what page size combos are supported by the HW. It
660 * also doesn't handle the case where a separate array holds
661 * the IND entries from the array loaded by the PT.
662 */
663 /* Look for supported page sizes */
664 setup_page_sizes();
665
666 /* Look for HW tablewalk support */
667 setup_mmu_htw();
668
669#ifdef CONFIG_PPC_FSL_BOOK3E
670 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
669 if (book3e_htw_mode == PPC_HTW_NONE) { 671 if (book3e_htw_mode == PPC_HTW_NONE) {
670 extlb_level_exc = EX_TLB_SIZE; 672 extlb_level_exc = EX_TLB_SIZE;
671 patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); 673 patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
@@ -675,22 +677,41 @@ static void __early_init_mmu(int boot_cpu)
675 } 677 }
676#endif 678#endif
677 679
678 /* A sync won't hurt us after mucking around with 680 /* Set the global containing the top of the linear mapping
679 * the MMU configuration 681 * for use by the TLB miss code
680 */ 682 */
681 mb(); 683 linear_map_top = memblock_end_of_DRAM();
684}
685
686static void __init early_mmu_set_memory_limit(void)
687{
688#ifdef CONFIG_PPC_FSL_BOOK3E
689 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
690 /*
691 * Limit memory so we don't have linear faults.
692 * Unlike memblock_set_current_limit, which limits
693 * memory available during early boot, this permanently
694 * reduces the memory available to Linux. We need to
695 * do this because highmem is not supported on 64-bit.
696 */
697 memblock_enforce_memory_limit(linear_map_top);
698 }
699#endif
682 700
683 memblock_set_current_limit(linear_map_top); 701 memblock_set_current_limit(linear_map_top);
684} 702}
685 703
704/* boot cpu only */
686void __init early_init_mmu(void) 705void __init early_init_mmu(void)
687{ 706{
688 __early_init_mmu(1); 707 early_init_mmu_global();
708 early_init_this_mmu();
709 early_mmu_set_memory_limit();
689} 710}
690 711
691void early_init_mmu_secondary(void) 712void early_init_mmu_secondary(void)
692{ 713{
693 __early_init_mmu(0); 714 early_init_this_mmu();
694} 715}
695 716
696void setup_initial_memory_limit(phys_addr_t first_memblock_base, 717void setup_initial_memory_limit(phys_addr_t first_memblock_base,
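The tlb_nohash.c refactoring is spread across several interleaved hunks; the resulting shape of the code, reduced to a skeleton with the bodies summarized as comments, is roughly:

static void early_init_this_mmu(void)
{
	/* per-CPU setup: program MAS4 defaults, map the bolted TLBCAM
	 * linear mapping on FSL Book3E, then sync */
}

static void __init early_init_mmu_global(void)
{
	/* one-time decisions: linear/vmemmap page sizes, supported page
	 * sizes, HW tablewalk mode, exception patching, linear_map_top */
}

static void __init early_mmu_set_memory_limit(void)
{
	/* on FSL Book3E, permanently clamp memblock to linear_map_top so
	 * the bolted mapping covers all memory (no highmem on 64-bit) */
}

/* boot cpu only */
void __init early_init_mmu(void)
{
	early_init_mmu_global();
	early_init_this_mmu();
	early_mmu_set_memory_limit();
}

void early_init_mmu_secondary(void)
{
	early_init_this_mmu();
}

Secondary CPUs now run only the per-CPU half, replacing the old __early_init_mmu(boot_cpu) flag juggling.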