Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/Makefile              |    1
-rw-r--r--  arch/powerpc/mm/fsl_booke_mmu.c       |    2
-rw-r--r--  arch/powerpc/mm/hash_low_32.S         |    4
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c         |    8
-rw-r--r--  arch/powerpc/mm/init_32.c             |    2
-rw-r--r--  arch/powerpc/mm/init_64.c             |   55
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c  |   96
-rw-r--r--  arch/powerpc/mm/mmu_decl.h            |   37
-rw-r--r--  arch/powerpc/mm/pgtable.c             |   14
-rw-r--r--  arch/powerpc/mm/pgtable_64.c          |   59
-rw-r--r--  arch/powerpc/mm/slb.c                 |   67
-rw-r--r--  arch/powerpc/mm/stab.c                |   11
-rw-r--r--  arch/powerpc/mm/tlb_hash32.c          |    3
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c          |   20
-rw-r--r--  arch/powerpc/mm/tlb_low_64e.S         |  734
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c          |  268
-rw-r--r--  arch/powerpc/mm/tlb_nohash_low.S      |   85
17 files changed, 1356 insertions(+), 110 deletions(-)
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3e68363405b7..6fb8fc8d2fea 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -13,6 +13,7 @@ obj-y := fault.o mem.o pgtable.o gup.o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
+obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
 obj-$(CONFIG_PPC64)		+= mmap_64.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o \
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index bb3d65998e6b..dc93e95b256e 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -161,7 +161,7 @@ unsigned long __init mmu_mapin_ram(void)
 	unsigned long virt = PAGE_OFFSET;
 	phys_addr_t phys = memstart_addr;
 
-	while (cam[tlbcam_index] && tlbcam_index < ARRAY_SIZE(cam)) {
+	while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) {
 		settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0);
 		virt += cam[tlbcam_index];
 		phys += cam[tlbcam_index];
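
Note on the fsl_booke_mmu.c change above: it reorders the loop condition so the bounds test runs before the array subscript; with C's short-circuit &&, cam[tlbcam_index] is never evaluated once tlbcam_index reaches ARRAY_SIZE(cam), so the old out-of-bounds read goes away. A minimal userspace sketch of the same idiom (the CAM sizes below are invented for illustration):

    #include <stdio.h>

    #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

    int main(void)
    {
        /* hypothetical CAM sizes; a zero entry terminates the list early */
        unsigned long cam[4] = { 0x1000000, 0x1000000, 0, 0 };
        unsigned int i = 0;

        /* bounds test first: the subscript is only evaluated when i is valid */
        while (i < ARRAY_SIZE(cam) && cam[i]) {
            printf("entry %u: %#lx\n", i, cam[i]);
            i++;
        }
        return 0;
    }
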
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 14af8cedab70..b13d58932bf6 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -40,7 +40,7 @@ mmu_hash_lock:
  * The address is in r4, and r3 contains an access flag:
  * _PAGE_RW (0x400) if a write.
  * r9 contains the SRR1 value, from which we use the MSR_PR bit.
- * SPRG3 contains the physical address of the current task's thread.
+ * SPRG_THREAD contains the physical address of the current task's thread.
  *
  * Returns to the caller if the access is illegal or there is no
  * mapping for the address. Otherwise it places an appropriate PTE
@@ -68,7 +68,7 @@ _GLOBAL(hash_page)
 	/* Get PTE (linux-style) and check access */
 	lis	r0,KERNELBASE@h		/* check if kernel address */
 	cmplw	0,r4,r0
-	mfspr	r8,SPRN_SPRG3		/* current task's THREAD (phys) */
+	mfspr	r8,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
 	ori	r3,r3,_PAGE_USER|_PAGE_PRESENT	/* test low addresses as user */
 	lwz	r5,PGDIR(r8)		/* virt page-table root */
 	blt+	112f			/* assume user more likely */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index c46ef2ffa3d9..90df6ffe3a43 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -57,8 +57,10 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 #define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
 
 static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-	"unused_4K", "hugepte_cache_64K", "unused_64K_AP",
-	"hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
+	[MMU_PAGE_64K]	= "hugepte_cache_64K",
+	[MMU_PAGE_1M]	= "hugepte_cache_1M",
+	[MMU_PAGE_16M]	= "hugepte_cache_16M",
+	[MMU_PAGE_16G]	= "hugepte_cache_16G",
 };
 
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
@@ -700,6 +702,8 @@ static void __init set_huge_psize(int psize)
 	if (mmu_huge_psizes[psize] ||
 	    mmu_psize_defs[psize].shift == PAGE_SHIFT)
 		return;
+	if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
+		return;
 	hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
 	switch (mmu_psize_defs[psize].shift) {
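
The hugetlbpage.c hunks above switch the cache-name table to designated initializers keyed by page-size constant instead of fragile positional order; unnamed slots default to NULL, which is exactly what the added WARN_ON checks for. A small sketch of the pattern (the enum values are stand-ins, not the kernel's MMU_PAGE_* numbering):

    #include <stdio.h>

    enum page_size { PSZ_4K, PSZ_64K, PSZ_1M, PSZ_16M, PSZ_COUNT };

    /* slots not named here are implicitly NULL */
    static const char *cache_name[PSZ_COUNT] = {
        [PSZ_64K] = "hugepte_cache_64K",
        [PSZ_1M]  = "hugepte_cache_1M",
    };

    int main(void)
    {
        for (int i = 0; i < PSZ_COUNT; i++)
            printf("%d -> %s\n", i, cache_name[i] ? cache_name[i] : "(none)");
        return 0;
    }
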
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 3de6a0d93824..3ef5084b90ca 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -54,8 +54,6 @@
 #endif
 #define MAX_LOW_MEM	CONFIG_LOWMEM_SIZE
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 phys_addr_t total_memory;
 phys_addr_t total_lowmem;
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 68a821add28d..31582329cd67 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -205,6 +205,47 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size)
 	return 0;
 }
 
+/* On hash-based CPUs, the vmemmap is bolted in the hash table.
+ *
+ * On Book3E CPUs, the vmemmap is currently mapped in the top half of
+ * the vmalloc space using normal page tables, though the size of
+ * pages encoded in the PTEs can be different
+ */
+
+#ifdef CONFIG_PPC_BOOK3E
+static void __meminit vmemmap_create_mapping(unsigned long start,
+					     unsigned long page_size,
+					     unsigned long phys)
+{
+	/* Create a PTE encoding without page size */
+	unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
+		_PAGE_KERNEL_RW;
+
+	/* PTEs only contain page size encodings up to 32M */
+	BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+
+	/* Encode the size in the PTE */
+	flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+
+	/* For each PTE for that area, map things. Note that we don't
+	 * increment phys because all PTEs are of the large size and
+	 * thus must have the low bits clear
+	 */
+	for (i = 0; i < page_size; i += PAGE_SIZE)
+		BUG_ON(map_kernel_page(start + i, phys, flags));
+}
+#else /* CONFIG_PPC_BOOK3E */
+static void __meminit vmemmap_create_mapping(unsigned long start,
+					     unsigned long page_size,
+					     unsigned long phys)
+{
+	int mapped = htab_bolt_mapping(start, start + page_size, phys,
+				       PAGE_KERNEL, mmu_vmemmap_psize,
+				       mmu_kernel_ssize);
+	BUG_ON(mapped < 0);
+}
+#endif /* CONFIG_PPC_BOOK3E */
+
 int __meminit vmemmap_populate(struct page *start_page,
 			       unsigned long nr_pages, int node)
 {
@@ -215,8 +256,11 @@ int __meminit vmemmap_populate(struct page *start_page,
 	/* Align to the page size of the linear mapping. */
 	start = _ALIGN_DOWN(start, page_size);
 
+	pr_debug("vmemmap_populate page %p, %ld pages, node %d\n",
+		 start_page, nr_pages, node);
+	pr_debug(" -> map %lx..%lx\n", start, end);
+
 	for (; start < end; start += page_size) {
-		int mapped;
 		void *p;
 
 		if (vmemmap_populated(start, page_size))
@@ -226,13 +270,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 		if (!p)
 			return -ENOMEM;
 
-		pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n",
-			 start, p, __pa(p));
+		pr_debug("      * %016lx..%016lx allocated at %p\n",
+			 start, start + page_size, p);
 
-		mapped = htab_bolt_mapping(start, start + page_size, __pa(p),
-					   pgprot_val(PAGE_KERNEL),
-					   mmu_vmemmap_psize, mmu_kernel_ssize);
-		BUG_ON(mapped < 0);
+		vmemmap_create_mapping(start, page_size, __pa(p));
 	}
 
 	return 0;
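
In the Book3E variant of vmemmap_create_mapping() above, one "large" page is represented by many base-size PTEs that all carry the same physical address plus a size encoding in the PTE flags; phys is never incremented because the mapping is naturally aligned and the low bits must stay clear. A sketch of that loop shape, with invented constants standing in for the kernel's flag layout:

    #include <stdio.h>

    #define PAGE_SIZE  0x1000UL          /* 4K base pages, an assumption */
    #define PSIZE_ENC  5UL               /* invented page-size encoding */

    /* stand-in for map_kernel_page(): just prints what would be mapped */
    static int map_kernel_page(unsigned long ea, unsigned long pa,
                               unsigned long flags)
    {
        printf("map ea=%#lx -> pa=%#lx flags=%#lx\n", ea, pa, flags);
        return 0;
    }

    int main(void)
    {
        unsigned long start = 0xf000000000000000UL; /* invented vmemmap EA */
        unsigned long phys = 0x20000000UL;          /* aligned large page */
        unsigned long page_size = 0x10000UL;        /* one 64K "large" page */
        unsigned long flags = PSIZE_ENC << 8;       /* size encoded in PTE */

        /* every PTE covers the same large page: phys is NOT incremented */
        for (unsigned long i = 0; i < page_size; i += PAGE_SIZE)
            map_kernel_page(start + i, phys, flags);
        return 0;
    }
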
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index b1a727def15b..c2f93dc470e6 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -25,10 +25,20 @@
  * also clear mm->cpu_vm_mask bits when processes are migrated
  */
 
-#undef DEBUG
-#define DEBUG_STEAL_ONLY
-#undef DEBUG_MAP_CONSISTENCY
-/*#define DEBUG_CLAMP_LAST_CONTEXT   15 */
+#define DEBUG_MAP_CONSISTENCY
+#define DEBUG_CLAMP_LAST_CONTEXT   31
+//#define DEBUG_HARDER
+
+/* We don't use DEBUG because it tends to be compiled in always nowadays
+ * and this would generate way too much output
+ */
+#ifdef DEBUG_HARDER
+#define pr_hard(args...)	printk(KERN_DEBUG args)
+#define pr_hardcont(args...)	printk(KERN_CONT args)
+#else
+#define pr_hard(args...)	do { } while(0)
+#define pr_hardcont(args...)	do { } while(0)
+#endif
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -71,7 +81,7 @@ static DEFINE_SPINLOCK(context_lock);
 static unsigned int steal_context_smp(unsigned int id)
 {
 	struct mm_struct *mm;
-	unsigned int cpu, max;
+	unsigned int cpu, max, i;
 
 	max = last_context - first_context;
 
@@ -89,15 +99,22 @@ static unsigned int steal_context_smp(unsigned int id)
 			id = first_context;
 			continue;
 		}
-		pr_devel("[%d] steal context %d from mm @%p\n",
-			 smp_processor_id(), id, mm);
+		pr_hardcont(" | steal %d from 0x%p", id, mm);
 
 		/* Mark this mm as having no context anymore */
 		mm->context.id = MMU_NO_CONTEXT;
 
-		/* Mark it stale on all CPUs that used this mm */
-		for_each_cpu(cpu, mm_cpumask(mm))
-			__set_bit(id, stale_map[cpu]);
+		/* Mark it stale on all CPUs that used this mm. For threaded
+		 * implementations, we set it on all threads on each core
+		 * represented in the mask. A future implementation will use
+		 * a core map instead but this will do for now.
+		 */
+		for_each_cpu(cpu, mm_cpumask(mm)) {
+			for (i = cpu_first_thread_in_core(cpu);
+			     i <= cpu_last_thread_in_core(cpu); i++)
+				__set_bit(id, stale_map[i]);
+			cpu = i - 1;
+		}
 		return id;
 	}
 
@@ -126,7 +143,7 @@ static unsigned int steal_context_up(unsigned int id)
 	/* Pick up the victim mm */
 	mm = context_mm[id];
 
-	pr_devel("[%d] steal context %d from mm @%p\n", cpu, id, mm);
+	pr_hardcont(" | steal %d from 0x%p", id, mm);
 
 	/* Flush the TLB for that context */
 	local_flush_tlb_mm(mm);
@@ -173,25 +190,20 @@ static void context_check_map(void) { }
 
 void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 {
-	unsigned int id, cpu = smp_processor_id();
+	unsigned int i, id, cpu = smp_processor_id();
 	unsigned long *map;
 
 	/* No lockless fast path .. yet */
 	spin_lock(&context_lock);
 
-#ifndef DEBUG_STEAL_ONLY
-	pr_devel("[%d] activating context for mm @%p, active=%d, id=%d\n",
-		 cpu, next, next->context.active, next->context.id);
-#endif
+	pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
+		cpu, next, next->context.active, next->context.id);
 
 #ifdef CONFIG_SMP
 	/* Mark us active and the previous one not anymore */
 	next->context.active++;
 	if (prev) {
-#ifndef DEBUG_STEAL_ONLY
-		pr_devel(" old context %p active was: %d\n",
-			 prev, prev->context.active);
-#endif
+		pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active);
 		WARN_ON(prev->context.active < 1);
 		prev->context.active--;
 	}
@@ -201,8 +213,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 
 	/* If we already have a valid assigned context, skip all that */
 	id = next->context.id;
-	if (likely(id != MMU_NO_CONTEXT))
+	if (likely(id != MMU_NO_CONTEXT)) {
+#ifdef DEBUG_MAP_CONSISTENCY
+		if (context_mm[id] != next)
+			pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n",
+			       next, id, id, context_mm[id]);
+#endif
 		goto ctxt_ok;
+	}
 
 	/* We really don't have a context, let's try to acquire one */
 	id = next_context;
@@ -235,11 +253,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	next_context = id + 1;
 	context_mm[id] = next;
 	next->context.id = id;
-
-#ifndef DEBUG_STEAL_ONLY
-	pr_devel("[%d] picked up new id %d, nrf is now %d\n",
-		 cpu, id, nr_free_contexts);
-#endif
+	pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts);
 
 	context_check_map();
  ctxt_ok:
@@ -248,15 +262,21 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	 * local TLB for it and unmark it before we use it
 	 */
 	if (test_bit(id, stale_map[cpu])) {
-		pr_devel("[%d] flushing stale context %d for mm @%p !\n",
-			 cpu, id, next);
+		pr_hardcont(" | stale flush %d [%d..%d]",
+			    id, cpu_first_thread_in_core(cpu),
+			    cpu_last_thread_in_core(cpu));
+
 		local_flush_tlb_mm(next);
 
 		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
-		__clear_bit(id, stale_map[cpu]);
+		for (i = cpu_first_thread_in_core(cpu);
+		     i <= cpu_last_thread_in_core(cpu); i++) {
+			__clear_bit(id, stale_map[i]);
+		}
 	}
 
 	/* Flick the MMU and release lock */
+	pr_hardcont(" -> %d\n", id);
 	set_context(id, next->pgd);
 	spin_unlock(&context_lock);
 }
@@ -266,6 +286,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
  */
 int init_new_context(struct task_struct *t, struct mm_struct *mm)
 {
+	pr_hard("initing context for mm @%p\n", mm);
+
 	mm->context.id = MMU_NO_CONTEXT;
 	mm->context.active = 0;
 
@@ -305,7 +327,9 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
 					    unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
-
+#ifdef CONFIG_HOTPLUG_CPU
+	struct task_struct *p;
+#endif
 	/* We don't touch CPU 0 map, it's allocated at boot and kept
 	 * around forever
 	 */
@@ -324,8 +348,16 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
 		pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
 		kfree(stale_map[cpu]);
 		stale_map[cpu] = NULL;
-		break;
-#endif
+
+		/* We also clear the cpu_vm_mask bits of CPUs going away */
+		read_lock(&tasklist_lock);
+		for_each_process(p) {
+			if (p->mm)
+				cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm));
+		}
+		read_unlock(&tasklist_lock);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
 	}
 	return NOTIFY_OK;
 }
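
The mmu_context_nohash.c changes above mark and clear the stale-context bit for every hardware thread of a core, since threads on one core share a TLB. On powerpc, cpu_first_thread_in_core()/cpu_last_thread_in_core() reduce to masking the CPU number with the threads-per-core count; a sketch of that arithmetic, assuming threads_per_core is a power of two:

    #include <stdio.h>

    static unsigned int threads_per_core = 4;   /* assumed power of two */

    static unsigned int first_thread_in_core(unsigned int cpu)
    {
        return cpu & ~(threads_per_core - 1);
    }

    static unsigned int last_thread_in_core(unsigned int cpu)
    {
        return first_thread_in_core(cpu) + threads_per_core - 1;
    }

    int main(void)
    {
        unsigned int cpu = 6;

        /* cpu 6 with 4 threads/core -> the core spans cpus 4..7 */
        for (unsigned int i = first_thread_in_core(cpu);
             i <= last_thread_in_core(cpu); i++)
            printf("would touch stale_map[%u]\n", i);
        return 0;
    }
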
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index d1f9c62dc177..d2e5321d5ea6 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -36,21 +36,37 @@ static inline void _tlbil_pid(unsigned int pid)
 {
 	asm volatile ("sync; tlbia; isync" : : : "memory");
 }
+#define _tlbil_pid_noind(pid)	_tlbil_pid(pid)
+
 #else /* CONFIG_40x || CONFIG_8xx */
 extern void _tlbil_all(void);
 extern void _tlbil_pid(unsigned int pid);
+#ifdef CONFIG_PPC_BOOK3E
+extern void _tlbil_pid_noind(unsigned int pid);
+#else
+#define _tlbil_pid_noind(pid)	_tlbil_pid(pid)
+#endif
 #endif /* !(CONFIG_40x || CONFIG_8xx) */
 
 /*
  * On 8xx, we directly inline tlbie, on others, it's extern
  */
 #ifdef CONFIG_8xx
-static inline void _tlbil_va(unsigned long address, unsigned int pid)
+static inline void _tlbil_va(unsigned long address, unsigned int pid,
+			     unsigned int tsize, unsigned int ind)
 {
 	asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
 }
-#else /* CONFIG_8xx */
-extern void _tlbil_va(unsigned long address, unsigned int pid);
+#elif defined(CONFIG_PPC_BOOK3E)
+extern void _tlbil_va(unsigned long address, unsigned int pid,
+		      unsigned int tsize, unsigned int ind);
+#else
+extern void __tlbil_va(unsigned long address, unsigned int pid);
+static inline void _tlbil_va(unsigned long address, unsigned int pid,
+			     unsigned int tsize, unsigned int ind)
+{
+	__tlbil_va(address, pid);
+}
 #endif /* CONFIG_8xx */
 
 /*
@@ -58,10 +74,16 @@ extern void _tlbil_va(unsigned long address, unsigned int pid);
  * implementation. When that becomes the case, this will be
  * an extern.
  */
-static inline void _tlbivax_bcast(unsigned long address, unsigned int pid)
+#ifdef CONFIG_PPC_BOOK3E
+extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
+			   unsigned int tsize, unsigned int ind);
+#else
+static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
+				  unsigned int tsize, unsigned int ind)
 {
 	BUG();
 }
+#endif
 
 #else /* CONFIG_PPC_MMU_NOHASH */
 
@@ -99,7 +121,12 @@ extern unsigned int rtas_data, rtas_size;
 struct hash_pte;
 extern struct hash_pte *Hash, *Hash_end;
 extern unsigned long Hash_size, Hash_mask;
-#endif
+
+#endif /* CONFIG_PPC32 */
+
+#ifdef CONFIG_PPC64
+extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags);
+#endif /* CONFIG_PPC64 */
 
 extern unsigned long ioremap_bot;
 extern unsigned long __max_low_memory;
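
The mmu_decl.h hunks widen the TLB-invalidate entry points with tsize/ind arguments while keeping legacy backends unchanged behind an inline shim, so every caller can pass the extra parameters unconditionally. A userspace sketch of that wrapper pattern (the names mirror the header but the bodies are stand-ins):

    #include <stdio.h>

    /* narrow backend that predates the wider interface */
    static void __tlbil_va(unsigned long address, unsigned int pid)
    {
        printf("invalidate va=%#lx pid=%u\n", address, pid);
    }

    /* wide wrapper: callers always pass tsize/ind; legacy backends drop them */
    static inline void _tlbil_va(unsigned long address, unsigned int pid,
                                 unsigned int tsize, unsigned int ind)
    {
        (void)tsize;
        (void)ind;
        __tlbil_va(address, pid);
    }

    int main(void)
    {
        _tlbil_va(0xc0000000UL, 1, 0, 0);
        return 0;
    }
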
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 627767d6169b..b6b32487e740 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -30,6 +30,16 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+#ifdef CONFIG_SMP
+
+/*
+ * Handle batching of page table freeing on SMP. Page tables are
+ * queued up and sent to be freed later by RCU in order to avoid
+ * freeing a page table page that is being walked without locks
+ */
+
 static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 static unsigned long pte_freelist_forced_free;
 
@@ -116,6 +126,8 @@ void pte_free_finish(void)
 	*batchp = NULL;
 }
 
+#endif /* CONFIG_SMP */
+
 /*
  * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags()
  */
@@ -242,7 +254,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, addr);
 	BUG_ON(!pmd_present(*pmd));
-	BUG_ON(!spin_is_locked(pte_lockptr(mm, pmd)));
+	assert_spin_locked(pte_lockptr(mm, pmd));
 }
 #endif /* CONFIG_DEBUG_VM */
 
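
The pgtable.c hunks wrap the page-table free batching in CONFIG_SMP, since the batching exists only to defer frees while other CPUs may still be walking the tables locklessly. A stripped-down sketch of the batching shape (no RCU, no kernel types; purely illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    #define BATCH_MAX 8

    struct freelist_batch {
        unsigned int index;
        void *pages[BATCH_MAX];
    };

    static struct freelist_batch *cur;

    /* queue a page; flush the whole batch once it fills up */
    static void batch_free(void *page)
    {
        if (!cur) {
            cur = calloc(1, sizeof(*cur));
            if (!cur) {              /* fallback: free immediately */
                free(page);
                return;
            }
        }
        cur->pages[cur->index++] = page;
        if (cur->index == BATCH_MAX) {
            for (unsigned int i = 0; i < BATCH_MAX; i++)
                free(cur->pages[i]);
            free(cur);
            cur = NULL;
        }
    }

    int main(void)
    {
        for (int i = 0; i < 20; i++)
            batch_free(malloc(64));
        /* a real implementation would also flush the partial batch here */
        return 0;
    }
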
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index bfa7db6b2fd5..853d5565eed5 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -33,6 +33,8 @@
 #include <linux/stddef.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/lmb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -55,19 +57,36 @@
 
 unsigned long ioremap_bot = IOREMAP_BASE;
 
+
+#ifdef CONFIG_PPC_MMU_NOHASH
+static void *early_alloc_pgtable(unsigned long size)
+{
+	void *pt;
+
+	if (init_bootmem_done)
+		pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
+	else
+		pt = __va(lmb_alloc_base(size, size,
+					 __pa(MAX_DMA_ADDRESS)));
+	memset(pt, 0, size);
+
+	return pt;
+}
+#endif /* CONFIG_PPC_MMU_NOHASH */
+
 /*
- * map_io_page currently only called by __ioremap
- * map_io_page adds an entry to the ioremap page table
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
  * and adds an entry to the HPT, possibly bolting it
  */
-static int map_io_page(unsigned long ea, unsigned long pa, int flags)
+int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
 {
 	pgd_t *pgdp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
 
-	if (mem_init_done) {
+	if (slab_is_available()) {
 		pgdp = pgd_offset_k(ea);
 		pudp = pud_alloc(&init_mm, pgdp, ea);
 		if (!pudp)
@@ -81,6 +100,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
 	} else {
+#ifdef CONFIG_PPC_MMU_NOHASH
+		/* Warning ! This will blow up if bootmem is not initialized,
+		 * which our ppc64 code is keen to do; we'll need to fix it
+		 * and/or be more careful
+		 */
+		pgdp = pgd_offset_k(ea);
+#ifdef PUD_TABLE_SIZE
+		if (pgd_none(*pgdp)) {
+			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+			BUG_ON(pudp == NULL);
+			pgd_populate(&init_mm, pgdp, pudp);
+		}
+#endif /* PUD_TABLE_SIZE */
+		pudp = pud_offset(pgdp, ea);
+		if (pud_none(*pudp)) {
+			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+			BUG_ON(pmdp == NULL);
+			pud_populate(&init_mm, pudp, pmdp);
+		}
+		pmdp = pmd_offset(pudp, ea);
+		if (!pmd_present(*pmdp)) {
+			ptep = early_alloc_pgtable(PAGE_SIZE);
+			BUG_ON(ptep == NULL);
+			pmd_populate_kernel(&init_mm, pmdp, ptep);
+		}
+		ptep = pte_offset_kernel(pmdp, ea);
+		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+						       __pgprot(flags)));
+#else /* CONFIG_PPC_MMU_NOHASH */
 		/*
 		 * If the mm subsystem is not fully up, we cannot create a
 		 * linux page table entry for this mapping.  Simply bolt an
@@ -93,6 +141,7 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
93 "memory at %016lx !\n", pa); 141 "memory at %016lx !\n", pa);
94 return -ENOMEM; 142 return -ENOMEM;
95 } 143 }
144#endif /* !CONFIG_PPC_MMU_NOHASH */
96 } 145 }
97 return 0; 146 return 0;
98} 147}
@@ -124,7 +173,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
 	WARN_ON(size & ~PAGE_MASK);
 
 	for (i = 0; i < size; i += PAGE_SIZE)
-		if (map_io_page((unsigned long)ea+i, pa+i, flags))
+		if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
 			return NULL;
 
 	return (void __iomem *)ea;
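
early_alloc_pgtable() above picks whichever boot-time allocator is currently usable and always returns zeroed memory, so the page-table populate code never cares which phase of boot it runs in. The shape of that fallback, with stand-in allocators replacing bootmem/lmb:

    #include <stdlib.h>
    #include <string.h>

    static int late_allocator_ready;    /* stands in for init_bootmem_done */

    static void *alloc_late(size_t size)  { return malloc(size); }
    static void *alloc_early(size_t size) { return malloc(size); }

    /* pick whichever allocator is up, then zero the block either way */
    static void *early_alloc_pgtable(size_t size)
    {
        void *pt;

        if (late_allocator_ready)
            pt = alloc_late(size);
        else
            pt = alloc_early(size);
        if (pt)
            memset(pt, 0, size);
        return pt;
    }

    int main(void)
    {
        void *pt = early_alloc_pgtable(4096);
        free(pt);
        return 0;
    }
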
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5b7038f248b6..07961c5c169e 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
92 : "memory" ); 92 : "memory" );
93} 93}
94 94
95void slb_flush_and_rebolt(void) 95static void __slb_flush_and_rebolt(void)
96{ 96{
97 /* If you change this make sure you change SLB_NUM_BOLTED 97 /* If you change this make sure you change SLB_NUM_BOLTED
98 * appropriately too. */ 98 * appropriately too. */
99 unsigned long linear_llp, vmalloc_llp, lflags, vflags; 99 unsigned long linear_llp, vmalloc_llp, lflags, vflags;
100 unsigned long ksp_esid_data, ksp_vsid_data; 100 unsigned long ksp_esid_data, ksp_vsid_data;
101 101
102 WARN_ON(!irqs_disabled());
103
104 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; 102 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
105 vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; 103 vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
106 lflags = SLB_VSID_KERNEL | linear_llp; 104 lflags = SLB_VSID_KERNEL | linear_llp;
@@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void)
 		ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
 	}
 
-	/*
-	 * We can't take a PMU exception in the following code, so hard
-	 * disable interrupts.
-	 */
-	hard_irq_disable();
-
 	/* We need to do this all in asm, so we're sure we don't touch
 	 * the stack between the slbia and rebolting it. */
 	asm volatile("isync\n"
@@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void)
139 : "memory"); 131 : "memory");
140} 132}
141 133
134void slb_flush_and_rebolt(void)
135{
136
137 WARN_ON(!irqs_disabled());
138
139 /*
140 * We can't take a PMU exception in the following code, so hard
141 * disable interrupts.
142 */
143 hard_irq_disable();
144
145 __slb_flush_and_rebolt();
146 get_paca()->slb_cache_ptr = 0;
147}
148
142void slb_vmalloc_update(void) 149void slb_vmalloc_update(void)
143{ 150{
144 unsigned long vflags; 151 unsigned long vflags;
@@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
 /* Flush all user entries from the segment table of the current processor. */
 void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 {
-	unsigned long offset = get_paca()->slb_cache_ptr;
+	unsigned long offset;
 	unsigned long slbie_data = 0;
 	unsigned long pc = KSTK_EIP(tsk);
 	unsigned long stack = KSTK_ESP(tsk);
-	unsigned long unmapped_base;
+	unsigned long exec_base;
 
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possibly cause an SLB miss
+	 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+	 */
+	hard_irq_disable();
+	offset = get_paca()->slb_cache_ptr;
 	if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
 	    offset <= SLB_CACHE_ENTRIES) {
 		int i;
@@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		}
 		asm volatile("isync" : : : "memory");
 	} else {
-		slb_flush_and_rebolt();
+		__slb_flush_and_rebolt();
 	}
 
 	/* Workaround POWER5 < DD2.1 issue */
@@ -212,29 +227,23 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 
 	/*
 	 * preload some userspace segments into the SLB.
+	 * Almost all 32 and 64bit PowerPC executables are linked at
+	 * 0x10000000 so it makes sense to preload this segment.
 	 */
-	if (test_tsk_thread_flag(tsk, TIF_32BIT))
-		unmapped_base = TASK_UNMAPPED_BASE_USER32;
-	else
-		unmapped_base = TASK_UNMAPPED_BASE_USER64;
+	exec_base = 0x10000000;
 
-	if (is_kernel_addr(pc))
-		return;
-	slb_allocate(pc);
-
-	if (esids_match(pc,stack))
+	if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
+	    is_kernel_addr(exec_base))
 		return;
 
-	if (is_kernel_addr(stack))
-		return;
-	slb_allocate(stack);
+	slb_allocate(pc);
 
-	if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base))
-		return;
+	if (!esids_match(pc, stack))
+		slb_allocate(stack);
 
-	if (is_kernel_addr(unmapped_base))
-		return;
-	slb_allocate(unmapped_base);
+	if (!esids_match(pc, exec_base) &&
+	    !esids_match(stack, exec_base))
+		slb_allocate(exec_base);
 }
 
 static inline void patch_slb_encoding(unsigned int *insn_addr,
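
The preload logic above calls slb_allocate() at most once per distinct segment: pc, stack and the 0x10000000 link address share an SLB entry whenever they fall under the same ESID. For 256MB segments the ESID is just the address shifted down by 28 bits; a sketch of the comparison under that assumption (the kernel's esids_match() also handles 1T segments):

    #include <stdio.h>

    #define SID_SHIFT 28   /* 256MB segments */

    static int esids_match(unsigned long a, unsigned long b)
    {
        return (a >> SID_SHIFT) == (b >> SID_SHIFT);
    }

    int main(void)
    {
        unsigned long pc = 0x10001234UL;        /* typical text address */
        unsigned long stack = 0xffff1234UL;     /* some stack address */
        unsigned long exec_base = 0x10000000UL;

        /* pc and exec_base share a segment -> only one SLB preload needed */
        printf("pc/stack match: %d\n", esids_match(pc, stack));
        printf("pc/exec_base match: %d\n", esids_match(pc, exec_base));
        return 0;
    }
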
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 98cd1dc2ae75..ab5fb48b3e90 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
 	struct stab_entry *ste;
-	unsigned long offset = __get_cpu_var(stab_cache_ptr);
+	unsigned long offset;
 	unsigned long pc = KSTK_EIP(tsk);
 	unsigned long stack = KSTK_ESP(tsk);
 	unsigned long unmapped_base;
@@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 	/* Force previous translations to complete. DRENG */
 	asm volatile("isync" : : : "memory");
 
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possibly cause a STAB miss
+	 * which would update the stab_cache/stab_cache_ptr per-cpu variables.
+	 */
+	hard_irq_disable();
+
+	offset = __get_cpu_var(stab_cache_ptr);
 	if (offset <= NR_STAB_CACHE_ENTRIES) {
 		int i;
 
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index 65190587a365..8aaa8b7eb324 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -71,6 +71,9 @@ void tlb_flush(struct mmu_gather *tlb)
 	 */
 		_tlbia();
 	}
+
+	/* Push out batch of freed page tables */
+	pte_free_finish();
 }
 
 /*
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 937eb90677d9..2b2f35f6985e 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -33,11 +33,6 @@
 
 DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
-/* This is declared as we are using the more or less generic
- * arch/powerpc/include/asm/tlb.h file -- tgall
- */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 /*
  * A linux PTE was changed and the corresponding hash table entry
  * needs to be flushed. This function will either perform the flush
@@ -154,6 +149,21 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 	batch->index = 0;
 }
 
+void tlb_flush(struct mmu_gather *tlb)
+{
+	struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch);
+
+	/* If there's a TLB batch pending, then we must flush it because the
+	 * pages are going to be freed and we really don't want to have a CPU
+	 * access a freed page because it has a stale TLB
+	 */
+	if (tlbbatch->index)
+		__flush_tlb_pending(tlbbatch);
+
+	/* Push out batch of freed page tables */
+	pte_free_finish();
+}
+
 /**
  * __flush_hash_table_range - Flush all HPTEs for a given address range
  *                            from the hash table (and the TLB). But keeps
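
The new tlb_flush() above makes the teardown ordering explicit: flush any pending hash/TLB invalidation batch first, then release the queued page-table pages, so no CPU can hit a stale translation into memory that is about to be freed. A sketch of that ordering with stand-in types:

    #include <stdio.h>

    struct tlb_batch {
        unsigned int index;   /* number of queued invalidations */
    };

    static void flush_tlb_pending(struct tlb_batch *b)
    {
        printf("flushing %u pending TLB entries\n", b->index);
        b->index = 0;
    }

    static void pte_free_finish(void)
    {
        printf("freeing queued page-table pages\n");
    }

    /* end-of-unmap hook: invalidate first, free second */
    static void tlb_flush(struct tlb_batch *b)
    {
        if (b->index)
            flush_tlb_pending(b);
        pte_free_finish();
    }

    int main(void)
    {
        struct tlb_batch b = { .index = 3 };
        tlb_flush(&b);
        return 0;
    }
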
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
new file mode 100644
index 000000000000..10d524ded7b2
--- /dev/null
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -0,0 +1,734 @@
1/*
2 * Low leve TLB miss handlers for Book3E
3 *
4 * Copyright (C) 2008-2009
5 * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <asm/processor.h>
14#include <asm/reg.h>
15#include <asm/page.h>
16#include <asm/mmu.h>
17#include <asm/ppc_asm.h>
18#include <asm/asm-offsets.h>
19#include <asm/cputable.h>
20#include <asm/pgtable.h>
21#include <asm/reg.h>
22#include <asm/exception-64e.h>
23#include <asm/ppc-opcode.h>
24
25#ifdef CONFIG_PPC_64K_PAGES
26#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1)
27#else
28#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE)
29#endif
30#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
31#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
32#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
33
34
35/**********************************************************************
36 * *
37 * TLB miss handling for Book3E with TLB reservation and HES support *
38 * *
39 **********************************************************************/
40
41
42/* Data TLB miss */
43 START_EXCEPTION(data_tlb_miss)
44 TLB_MISS_PROLOG
45
46 /* Now we handle the fault proper. We only save DEAR in normal
47 * fault case since that's the only interesting values here.
48 * We could probably also optimize by not saving SRR0/1 in the
49 * linear mapping case but I'll leave that for later
50 */
51 mfspr r14,SPRN_ESR
52 mfspr r16,SPRN_DEAR /* get faulting address */
53 srdi r15,r16,60 /* get region */
54 cmpldi cr0,r15,0xc /* linear mapping ? */
55 TLB_MISS_STATS_SAVE_INFO
56 beq tlb_load_linear /* yes -> go to linear map load */
57
58 /* The page tables are mapped virtually linear. At this point, though,
59 * we don't know whether we are trying to fault in a first level
60 * virtual address or a virtual page table address. We can get that
61 * from bit 0x1 of the region ID which we have set for a page table
62 */
63 andi. r10,r15,0x1
64 bne- virt_page_table_tlb_miss
65
66 std r14,EX_TLB_ESR(r12); /* save ESR */
67 std r16,EX_TLB_DEAR(r12); /* save DEAR */
68
69 /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
70 li r11,_PAGE_PRESENT
71 oris r11,r11,_PAGE_ACCESSED@h
72
73 /* We do the user/kernel test for the PID here along with the RW test
74 */
75 cmpldi cr0,r15,0 /* Check for user region */
76
77 /* We pre-test some combination of permissions to avoid double
78 * faults:
79 *
80 * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
81 * ESR_ST is 0x00800000
82 * _PAGE_BAP_SW is 0x00000010
83 * So the shift is >> 19. This tests for supervisor writeability.
84 * If the page happens to be supervisor writeable and not user
85 * writeable, we will take a new fault later, but that should be
86 * a rare enough case.
87 *
88 * We also move ESR_ST in _PAGE_DIRTY position
89 * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
90 *
91 * MAS1 is preset for all we need except for TID that needs to
92 * be cleared for kernel translations
93 */
94 rlwimi r11,r14,32-19,27,27
95 rlwimi r11,r14,32-16,19,19
96 beq normal_tlb_miss
97 /* XXX replace the RMW cycles with immediate loads + writes */
981: mfspr r10,SPRN_MAS1
99 cmpldi cr0,r15,8 /* Check for vmalloc region */
100 rlwinm r10,r10,0,16,1 /* Clear TID */
101 mtspr SPRN_MAS1,r10
102 beq+ normal_tlb_miss
103
104 /* We got a crappy address, just fault with whatever DEAR and ESR
105 * are here
106 */
107 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
108 TLB_MISS_EPILOG_ERROR
109 b exc_data_storage_book3e
110
111/* Instruction TLB miss */
112 START_EXCEPTION(instruction_tlb_miss)
113 TLB_MISS_PROLOG
114
115 /* If we take a recursive fault, the second level handler may need
116 * to know whether we are handling a data or instruction fault in
117 * order to get to the right store fault handler. We provide that
118 * info by writing a crazy value in ESR in our exception frame
119 */
120 li r14,-1 /* store to exception frame is done later */
121
122 /* Now we handle the fault proper. We only save DEAR in the non
123 * linear mapping case since we know the linear mapping case will
124 * not re-enter. We could indeed optimize and also not save SRR0/1
125 * in the linear mapping case but I'll leave that for later
126 *
127 * Faulting address is SRR0 which is already in r16
128 */
129 srdi r15,r16,60 /* get region */
130 cmpldi cr0,r15,0xc /* linear mapping ? */
131 TLB_MISS_STATS_SAVE_INFO
132 beq tlb_load_linear /* yes -> go to linear map load */
133
134 /* We do the user/kernel test for the PID here along with the RW test
135 */
136 li r11,_PAGE_PRESENT|_PAGE_HWEXEC /* Base perm */
137 oris r11,r11,_PAGE_ACCESSED@h
138
139 cmpldi cr0,r15,0 /* Check for user region */
140 std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */
141 beq normal_tlb_miss
142 /* XXX replace the RMW cycles with immediate loads + writes */
1431: mfspr r10,SPRN_MAS1
144 cmpldi cr0,r15,8 /* Check for vmalloc region */
145 rlwinm r10,r10,0,16,1 /* Clear TID */
146 mtspr SPRN_MAS1,r10
147 beq+ normal_tlb_miss
148
149 /* We got a crappy address, just fault */
150 TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
151 TLB_MISS_EPILOG_ERROR
152 b exc_instruction_storage_book3e
153
154/*
155 * This is the guts of the first-level TLB miss handler for direct
156 * misses. We are entered with:
157 *
158 * r16 = faulting address
159 * r15 = region ID
160 * r14 = crap (free to use)
161 * r13 = PACA
162 * r12 = TLB exception frame in PACA
163 * r11 = PTE permission mask
164 * r10 = crap (free to use)
165 */
166normal_tlb_miss:
167 /* So we first construct the page table address. We do that by
168 * shifting the bottom of the address (not the region ID) by
169 * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and
170 * or'ing the fourth high bit.
171 *
172 * NOTE: For 64K pages, we do things slightly differently in
173 * order to handle the weird page table format used by linux
174 */
175 ori r10,r15,0x1
176#ifdef CONFIG_PPC_64K_PAGES
177 /* For the top bits, 16 bytes per PTE */
178 rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4
179 /* Now create the bottom bits as 0 in position 0x8000 and
180 * the rest calculated for 8 bytes per PTE
181 */
182 rldicl r15,r16,64-(PAGE_SHIFT-3),64-15
183 /* Insert the bottom bits in */
184 rlwimi r14,r15,0,16,31
185#else
186 rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
187#endif
188 sldi r15,r10,60
189 clrrdi r14,r14,3
190 or r10,r15,r14
191
192 /* Set the TLB reservation and seach for existing entry. Then load
193 * the entry.
194 */
195 PPC_TLBSRX_DOT(0,r16)
196 ld r14,0(r10)
197 beq normal_tlb_miss_done
198
199finish_normal_tlb_miss:
200 /* Check if required permissions are met */
201 andc. r15,r11,r14
202 bne- normal_tlb_miss_access_fault
203
204 /* Now we build the MAS:
205 *
206 * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
207 * MAS 1 : Almost fully setup
208 * - PID already updated by caller if necessary
209 * - TSIZE need change if !base page size, not
210 * yet implemented for now
211 * MAS 2 : Defaults not useful, need to be redone
212 * MAS 3+7 : Needs to be done
213 *
214 * TODO: mix up code below for better scheduling
215 */
216 clrrdi r11,r16,12 /* Clear low crap in EA */
217 rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */
218 mtspr SPRN_MAS2,r11
219
220 /* Check page size, if not standard, update MAS1 */
221 rldicl r11,r14,64-8,64-8
222#ifdef CONFIG_PPC_64K_PAGES
223 cmpldi cr0,r11,BOOK3E_PAGESZ_64K
224#else
225 cmpldi cr0,r11,BOOK3E_PAGESZ_4K
226#endif
227 beq- 1f
228 mfspr r11,SPRN_MAS1
229 rlwimi r11,r14,31,21,24
230 rlwinm r11,r11,0,21,19
231 mtspr SPRN_MAS1,r11
2321:
233 /* Move RPN in position */
234 rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
235 clrldi r15,r11,12 /* Clear crap at the top */
236 rlwimi r15,r14,32-8,22,25 /* Move in U bits */
237 rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
238
239 /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
240 andi. r11,r14,_PAGE_DIRTY
241 bne 1f
242 li r11,MAS3_SW|MAS3_UW
243 andc r15,r15,r11
2441: mtspr SPRN_MAS7_MAS3,r15
245
246 tlbwe
247
248normal_tlb_miss_done:
249 /* We don't bother with restoring DEAR or ESR since we know we are
250 * level 0 and just going back to userland. They are only needed
251 * if you are going to take an access fault
252 */
253 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
254 TLB_MISS_EPILOG_SUCCESS
255 rfi
256
257normal_tlb_miss_access_fault:
258 /* We need to check if it was an instruction miss */
259 andi. r10,r11,_PAGE_HWEXEC
260 bne 1f
261 ld r14,EX_TLB_DEAR(r12)
262 ld r15,EX_TLB_ESR(r12)
263 mtspr SPRN_DEAR,r14
264 mtspr SPRN_ESR,r15
265 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
266 TLB_MISS_EPILOG_ERROR
267 b exc_data_storage_book3e
2681: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
269 TLB_MISS_EPILOG_ERROR
270 b exc_instruction_storage_book3e
271
272
273/*
274 * This is the guts of the second-level TLB miss handler for direct
275 * misses. We are entered with:
276 *
277 * r16 = virtual page table faulting address
278 * r15 = region (top 4 bits of address)
279 * r14 = crap (free to use)
280 * r13 = PACA
281 * r12 = TLB exception frame in PACA
282 * r11 = crap (free to use)
283 * r10 = crap (free to use)
284 *
285 * Note that this should only ever be called as a second level handler
286 * with the current scheme when using SW load.
287 * That means we can always get the original fault DEAR at
288 * EX_TLB_DEAR-EX_TLB_SIZE(r12)
289 *
290 * It can be re-entered by the linear mapping miss handler. However, to
291 * avoid too much complication, it will restart the whole fault at level
292 * 0 so we don't care too much about clobbers
293 *
294 * XXX That code was written back when we couldn't clobber r14. We can now,
295 * so we could probably optimize things a bit
296 */
297virt_page_table_tlb_miss:
298 /* Are we hitting a kernel page table ? */
299 andi. r10,r15,0x8
300
301 /* The cool thing now is that r10 contains 0 for user and 8 for kernel,
302 * and we happen to have the swapper_pg_dir at offset 8 from the user
303 * pgdir in the PACA :-).
304 */
305 add r11,r10,r13
306
307 /* If kernel, we need to clear MAS1 TID */
308 beq 1f
309 /* XXX replace the RMW cycles with immediate loads + writes */
310 mfspr r10,SPRN_MAS1
311 rlwinm r10,r10,0,16,1 /* Clear TID */
312 mtspr SPRN_MAS1,r10
3131:
314 /* Search if we already have a TLB entry for that virtual address, and
315 * if we do, bail out.
316 */
317 PPC_TLBSRX_DOT(0,r16)
318 beq virt_page_table_tlb_miss_done
319
320 /* Now, we need to walk the page tables. First check if we are in
321 * range.
322 */
323 rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
324 bne- virt_page_table_tlb_miss_fault
325
326 /* Get the PGD pointer */
327 ld r15,PACAPGD(r11)
328 cmpldi cr0,r15,0
329 beq- virt_page_table_tlb_miss_fault
330
331 /* Get to PGD entry */
332 rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
333 clrrdi r10,r11,3
334 ldx r15,r10,r15
335 cmpldi cr0,r15,0
336 beq virt_page_table_tlb_miss_fault
337
338#ifndef CONFIG_PPC_64K_PAGES
339 /* Get to PUD entry */
340 rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
341 clrrdi r10,r11,3
342 ldx r15,r10,r15
343 cmpldi cr0,r15,0
344 beq virt_page_table_tlb_miss_fault
345#endif /* CONFIG_PPC_64K_PAGES */
346
347 /* Get to PMD entry */
348 rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
349 clrrdi r10,r11,3
350 ldx r15,r10,r15
351 cmpldi cr0,r15,0
352 beq virt_page_table_tlb_miss_fault
353
354 /* Ok, we're all right, we can now create a kernel translation for
355 * a 4K or 64K page from r16 -> r15.
356 */
357 /* Now we build the MAS:
358 *
359 * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
360 * MAS 1 : Almost fully setup
361 * - PID already updated by caller if necessary
362 * - TSIZE for now is base page size always
363 * MAS 2 : Use defaults
364 * MAS 3+7 : Needs to be done
365 *
366 * So we only do MAS 2 and 3 for now...
367 */
368 clrldi r11,r15,4 /* remove region ID from RPN */
369 ori r10,r11,1 /* Or-in SR */
370 mtspr SPRN_MAS7_MAS3,r10
371
372 tlbwe
373
374virt_page_table_tlb_miss_done:
375
376 /* We have overriden MAS2:EPN but currently our primary TLB miss
377 * handler will always restore it so that should not be an issue,
378 * if we ever optimize the primary handler to not write MAS2 on
379 * some cases, we'll have to restore MAS2:EPN here based on the
380 * original fault's DEAR. If we do that we have to modify the
381 * ITLB miss handler to also store SRR0 in the exception frame
382 * as DEAR.
383 *
384 * However, one nasty thing we did is we cleared the reservation
385 * (well, potentially we did). We do a trick here thus if we
386 * are not a level 0 exception (we interrupted the TLB miss) we
387 * offset the return address by -4 in order to replay the tlbsrx
388 * instruction there
389 */
390 subf r10,r13,r12
391 cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
392 bne- 1f
393 ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
394 addi r10,r11,-4
395 std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
3961:
397 /* Return to caller, normal case */
398 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK);
399 TLB_MISS_EPILOG_SUCCESS
400 rfi
401
402virt_page_table_tlb_miss_fault:
403 /* If we fault here, things are a little bit tricky. We need to call
404 * either data or instruction store fault, and we need to retreive
405 * the original fault address and ESR (for data).
406 *
407 * The thing is, we know that in normal circumstances, this is
408 * always called as a second level tlb miss for SW load or as a first
409 * level TLB miss for HW load, so we should be able to peek at the
410 * relevant informations in the first exception frame in the PACA.
411 *
412 * However, we do need to double check that, because we may just hit
413 * a stray kernel pointer or a userland attack trying to hit those
414 * areas. If that is the case, we do a data fault. (We can't get here
415 * from an instruction tlb miss anyway).
416 *
417 * Note also that when going to a fault, we must unwind the previous
418 * level as well. Since we are doing that, we don't need to clear or
419 * restore the TLB reservation neither.
420 */
421 subf r10,r13,r12
422 cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
423 bne- virt_page_table_tlb_miss_whacko_fault
424
425 /* We dig the original DEAR and ESR from slot 0 */
426 ld r15,EX_TLB_DEAR+PACA_EXTLB(r13)
427 ld r16,EX_TLB_ESR+PACA_EXTLB(r13)
428
429 /* We check for the "special" ESR value for instruction faults */
430 cmpdi cr0,r16,-1
431 beq 1f
432 mtspr SPRN_DEAR,r15
433 mtspr SPRN_ESR,r16
434 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT);
435 TLB_MISS_EPILOG_ERROR
436 b exc_data_storage_book3e
4371: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT);
438 TLB_MISS_EPILOG_ERROR
439 b exc_instruction_storage_book3e
440
441virt_page_table_tlb_miss_whacko_fault:
442 /* The linear fault will restart everything so ESR and DEAR will
443 * not have been clobbered, let's just fault with what we have
444 */
445 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT);
446 TLB_MISS_EPILOG_ERROR
447 b exc_data_storage_book3e
448
449
450/**************************************************************
451 * *
452 * TLB miss handling for Book3E with hw page table support *
453 * *
454 **************************************************************/
455
456
457/* Data TLB miss */
458 START_EXCEPTION(data_tlb_miss_htw)
459 TLB_MISS_PROLOG
460
461 /* Now we handle the fault proper. We only save DEAR in normal
462 * fault case since that's the only interesting values here.
463 * We could probably also optimize by not saving SRR0/1 in the
464 * linear mapping case but I'll leave that for later
465 */
466 mfspr r14,SPRN_ESR
467 mfspr r16,SPRN_DEAR /* get faulting address */
468 srdi r11,r16,60 /* get region */
469 cmpldi cr0,r11,0xc /* linear mapping ? */
470 TLB_MISS_STATS_SAVE_INFO
471 beq tlb_load_linear /* yes -> go to linear map load */
472
473 /* We do the user/kernel test for the PID here along with the RW test
474 */
475 cmpldi cr0,r11,0 /* Check for user region */
476 ld r15,PACAPGD(r13) /* Load user pgdir */
477 beq htw_tlb_miss
478
479 /* XXX replace the RMW cycles with immediate loads + writes */
4801: mfspr r10,SPRN_MAS1
481 cmpldi cr0,r11,8 /* Check for vmalloc region */
482 rlwinm r10,r10,0,16,1 /* Clear TID */
483 mtspr SPRN_MAS1,r10
484 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
485 beq+ htw_tlb_miss
486
487 /* We got a crappy address, just fault with whatever DEAR and ESR
488 * are here
489 */
490 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
491 TLB_MISS_EPILOG_ERROR
492 b exc_data_storage_book3e
493
494/* Instruction TLB miss */
495 START_EXCEPTION(instruction_tlb_miss_htw)
496 TLB_MISS_PROLOG
497
498 /* If we take a recursive fault, the second level handler may need
499 * to know whether we are handling a data or instruction fault in
500 * order to get to the right store fault handler. We provide that
501 * info by keeping a crazy value for ESR in r14
502 */
503 li r14,-1 /* store to exception frame is done later */
504
505 /* Now we handle the fault proper. We only save DEAR in the non
506 * linear mapping case since we know the linear mapping case will
507 * not re-enter. We could indeed optimize and also not save SRR0/1
508 * in the linear mapping case but I'll leave that for later
509 *
510 * Faulting address is SRR0 which is already in r16
511 */
512 srdi r11,r16,60 /* get region */
513 cmpldi cr0,r11,0xc /* linear mapping ? */
514 TLB_MISS_STATS_SAVE_INFO
515 beq tlb_load_linear /* yes -> go to linear map load */
516
517 /* We do the user/kernel test for the PID here along with the RW test
518 */
519 cmpldi cr0,r11,0 /* Check for user region */
520 ld r15,PACAPGD(r13) /* Load user pgdir */
521 beq htw_tlb_miss
522
523 /* XXX replace the RMW cycles with immediate loads + writes */
5241: mfspr r10,SPRN_MAS1
525 cmpldi cr0,r11,8 /* Check for vmalloc region */
526 rlwinm r10,r10,0,16,1 /* Clear TID */
527 mtspr SPRN_MAS1,r10
528 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
529 beq+ htw_tlb_miss
530
531 /* We got a crappy address, just fault */
532 TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
533 TLB_MISS_EPILOG_ERROR
534 b exc_instruction_storage_book3e
535
536
537/*
538 * This is the guts of the second-level TLB miss handler for direct
539 * misses. We are entered with:
540 *
541 * r16 = virtual page table faulting address
542 * r15 = PGD pointer
543 * r14 = ESR
544 * r13 = PACA
545 * r12 = TLB exception frame in PACA
546 * r11 = crap (free to use)
547 * r10 = crap (free to use)
548 *
549 * It can be re-entered by the linear mapping miss handler. However, to
550 * avoid too much complication, it will save/restore things for us
551 */
552htw_tlb_miss:
553 /* Search if we already have a TLB entry for that virtual address, and
554 * if we do, bail out.
555 *
556 * MAS1:IND should be already set based on MAS4
557 */
558 PPC_TLBSRX_DOT(0,r16)
559 beq htw_tlb_miss_done
560
561 /* Now, we need to walk the page tables. First check if we are in
562 * range.
563 */
564 rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
565 bne- htw_tlb_miss_fault
566
567 /* Get the PGD pointer */
568 cmpldi cr0,r15,0
569 beq- htw_tlb_miss_fault
570
571 /* Get to PGD entry */
572 rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
573 clrrdi r10,r11,3
574 ldx r15,r10,r15
575 cmpldi cr0,r15,0
576 beq htw_tlb_miss_fault
577
578#ifndef CONFIG_PPC_64K_PAGES
579 /* Get to PUD entry */
580 rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
581 clrrdi r10,r11,3
582 ldx r15,r10,r15
583 cmpldi cr0,r15,0
584 beq htw_tlb_miss_fault
585#endif /* CONFIG_PPC_64K_PAGES */
586
587 /* Get to PMD entry */
588 rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
589 clrrdi r10,r11,3
590 ldx r15,r10,r15
591 cmpldi cr0,r15,0
592 beq htw_tlb_miss_fault
593
594 /* Ok, we're all right, we can now create an indirect entry for
595 * a 1M or 256M page.
596 *
597 * The last trick is now that because we use "half" pages for
598 * the HTW (1M IND is 2K and 256M IND is 32K) we need to account
599 * for an added LSB bit to the RPN. For 64K pages, there is no
600 * problem as we already use 32K arrays (half PTE pages), but for
601 * 4K page we need to extract a bit from the virtual address and
602 * insert it into the "PA52" bit of the RPN.
603 */
604#ifndef CONFIG_PPC_64K_PAGES
605 rlwimi r15,r16,32-9,20,20
606#endif
607 /* Now we build the MAS:
608 *
609 * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
610 * MAS 1 : Almost fully setup
611 * - PID already updated by caller if necessary
612 * - TSIZE for now is base ind page size always
613 * MAS 2 : Use defaults
614 * MAS 3+7 : Needs to be done
615 */
616#ifdef CONFIG_PPC_64K_PAGES
617 ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT)
618#else
619 ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
620#endif
621 mtspr SPRN_MAS7_MAS3,r10
622
623 tlbwe
624
625htw_tlb_miss_done:
626 /* We don't bother restoring DEAR or ESR since we know we are
627 * at level 0 and just going back to userland. They are only needed
628 * if you are going to take an access fault.
629 */
630 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK)
631 TLB_MISS_EPILOG_SUCCESS
632 rfi
633
634htw_tlb_miss_fault:
635 /* We need to check if it was an instruction miss; we can tell
636 * because r14 would then contain -1
637 */
638 cmpdi cr0,r14,-1
639 beq 1f
640 mtspr SPRN_DEAR,r16
641 mtspr SPRN_ESR,r14
642 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT)
643 TLB_MISS_EPILOG_ERROR
644 b exc_data_storage_book3e
6451: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT)
646 TLB_MISS_EPILOG_ERROR
647 b exc_instruction_storage_book3e
648
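
For reference, the walk performed by htw_tlb_miss above is roughly
equivalent to the following C sketch (hypothetical helper, not kernel
API; the tlbsrx. reservation and MAS programming are omitted, and the
real code walks physical addresses rather than dereferencing pointers):

	/* Illustrative sketch of the htw_tlb_miss software walk */
	static unsigned long htw_walk_sketch(unsigned long *pgd, unsigned long ea)
	{
		unsigned long e = pgd[(ea >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)];

		if (!e)
			return 0;	/* no PGD entry -> fault */
	#ifndef CONFIG_PPC_64K_PAGES
		e = ((unsigned long *)e)[(ea >> PUD_SHIFT) & (PTRS_PER_PUD - 1)];
		if (!e)
			return 0;	/* no PUD entry -> fault */
	#endif
		e = ((unsigned long *)e)[(ea >> PMD_SHIFT) & (PTRS_PER_PMD - 1)];
		if (!e)
			return 0;	/* no PMD entry -> fault */
	#ifndef CONFIG_PPC_64K_PAGES
		/* Half-page trick: a 4K PTE page backs two 2K IND entries,
		 * so the VA bit selecting the 1M half (bit 20) becomes the
		 * 2K bit of the RPN (bit 11, "PA52"), as the rlwimi does.
		 */
		e |= (ea >> 9) & 0x800;
	#endif
		return e;	/* written to MAS7_MAS3 with SPSIZE or'ed in */
	}
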
649/*
650 * This is the guts of "any" level TLB miss handler for kernel linear
651 * mapping misses. We are entered with:
652 *
653 *
654 * r16 = faulting address
655 * r15 = crap (free to use)
656 * r14 = ESR (data) or -1 (instruction)
657 * r13 = PACA
658 * r12 = TLB exception frame in PACA
659 * r11 = crap (free to use)
660 * r10 = crap (free to use)
661 *
662 * In addition, we know that we will not re-enter, so in theory we could
663 * use a simpler epilog that doesn't restore SRR0/1 etc., but we'll do that later.
664 *
665 * We also need to be careful about the MAS registers and the TLB reservation
666 * here, as we know we'll have clobbered them if we interrupted the main TLB miss
667 * handlers; in that case we probably want to do a full restart at level
668 * 0 rather than saving / restoring the MAS.
669 *
670 * Note: If we care about performance of that core, we can easily shuffle
671 * a few things around
672 */
673tlb_load_linear:
674 /* For now, we assume the linear mapping is contiguous and stops at
675 * linear_map_top. We also assume the size is a multiple of 1G, thus
676 * we only use 1G pages for now. That might have to be changed in a
677 * final implementation, especially when dealing with hypervisors
678 */
679 ld r11,PACATOC(r13)
680 ld r11,linear_map_top@got(r11)
681 ld r10,0(r11)
682 cmpld cr0,r10,r16
683 bge tlb_load_linear_fault
684
685 /* MAS1 needs a whole new setup. */
686 li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT)
687 oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */
688 mtspr SPRN_MAS1,r15
689
690 /* Already somebody there ? */
691 PPC_TLBSRX_DOT(0,r16)
692 beq tlb_load_linear_done
693
694 /* Now we build the remaining MAS. MAS0 and 2 should be fine
695 * with their defaults, which leaves us with MAS 3 and 7. The
696 * mapping is linear, so we just take the address, clear the
697 * region bits, and OR in the permission bits, which are currently
698 * hardwired
699 */
700 clrrdi r10,r16,30 /* 1G page index */
701 clrldi r10,r10,4 /* clear region bits */
702 ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX
703 mtspr SPRN_MAS7_MAS3,r10
704
705 tlbwe
706
707tlb_load_linear_done:
708 /* We use the "error" epilog for success as we do want to
709 * restore to the initial faulting context, whatever it was.
710 * We do that because we can't resume a fault within a TLB
711 * miss handler, due to MAS and TLB reservation being clobbered.
712 */
713 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR)
714 TLB_MISS_EPILOG_ERROR
715 rfi
716
717tlb_load_linear_fault:
718 /* We keep the DEAR and ESR around; this shouldn't have happened */
719 cmpdi cr0,r14,-1
720 beq 1f
721 TLB_MISS_EPILOG_ERROR_SPECIAL
722 b exc_data_storage_book3e
7231: TLB_MISS_EPILOG_ERROR_SPECIAL
724 b exc_instruction_storage_book3e
725
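
In C terms, the MAS3/MAS7 value built by tlb_load_linear is roughly the
following (sketch only, assuming the contiguous 1:1 mapping described in
the comment above):

	/* 1G-aligned physical address: drop the 4 region bits of the EA */
	unsigned long mas7_3 = (ea & ~((1ul << 30) - 1))  /* clrrdi ea,30 */
				& 0x0ffffffffffffffful;   /* clrldi ,4    */
	mas7_3 |= MAS3_SR | MAS3_SW | MAS3_SX;	/* hardwired RWX perms */
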
726
727#ifdef CONFIG_BOOK3E_MMU_TLB_STATS
728.tlb_stat_inc:
7291: ldarx r8,0,r9
730 addi r8,r8,1
731 stdcx. r8,0,r9
732 bne- 1b
733 blr
734#endif
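
The .tlb_stat_inc helper is an atomic increment: the ldarx/stdcx. loop
retries until the store-conditional succeeds. In C11 terms it is roughly
(sketch; tlb_stat stands in for the counter r9 points at):

	#include <stdatomic.h>

	_Atomic unsigned long tlb_stat;	/* hypothetical counter */

	atomic_fetch_add_explicit(&tlb_stat, 1, memory_order_relaxed);
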
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ad2eb4d34dd4..2fbc680c2c71 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -7,8 +7,8 @@
7 * 7 *
8 * -- BenH 8 * -- BenH
9 * 9 *
10 * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org> 10 * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
11 * IBM Corp. 11 * IBM Corp.
12 * 12 *
13 * Derived from arch/ppc/mm/init.c: 13 * Derived from arch/ppc/mm/init.c:
14 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 14 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -34,12 +34,71 @@
34#include <linux/pagemap.h> 34#include <linux/pagemap.h>
35#include <linux/preempt.h> 35#include <linux/preempt.h>
36#include <linux/spinlock.h> 36#include <linux/spinlock.h>
37#include <linux/lmb.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include <asm/tlb.h> 40#include <asm/tlb.h>
41#include <asm/code-patching.h>
40 42
41#include "mmu_decl.h" 43#include "mmu_decl.h"
42 44
45#ifdef CONFIG_PPC_BOOK3E
46struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
47 [MMU_PAGE_4K] = {
48 .shift = 12,
49 .enc = BOOK3E_PAGESZ_4K,
50 },
51 [MMU_PAGE_16K] = {
52 .shift = 14,
53 .enc = BOOK3E_PAGESZ_16K,
54 },
55 [MMU_PAGE_64K] = {
56 .shift = 16,
57 .enc = BOOK3E_PAGESZ_64K,
58 },
59 [MMU_PAGE_1M] = {
60 .shift = 20,
61 .enc = BOOK3E_PAGESZ_1M,
62 },
63 [MMU_PAGE_16M] = {
64 .shift = 24,
65 .enc = BOOK3E_PAGESZ_16M,
66 },
67 [MMU_PAGE_256M] = {
68 .shift = 28,
69 .enc = BOOK3E_PAGESZ_256M,
70 },
71 [MMU_PAGE_1G] = {
72 .shift = 30,
73 .enc = BOOK3E_PAGESZ_1GB,
74 },
75};
76static inline int mmu_get_tsize(int psize)
77{
78 return mmu_psize_defs[psize].enc;
79}
80#else
81static inline int mmu_get_tsize(int psize)
82{
83 /* This isn't used on !Book3E for now */
84 return 0;
85}
86#endif
87
88/* The variables below are currently only used on 64-bit Book3E,
89 * though this will probably be made common with other nohash
90 * implementations at some point
91 */
92#ifdef CONFIG_PPC64
93
94int mmu_linear_psize; /* Page size used for the linear mapping */
95int mmu_pte_psize; /* Page size used for PTE pages */
96int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
97int book3e_htw_enabled; /* Is HW tablewalk enabled ? */
98unsigned long linear_map_top; /* Top of linear mapping */
99
100#endif /* CONFIG_PPC64 */
101
43/* 102/*
44 * Base TLB flushing operations: 103 * Base TLB flushing operations:
45 * 104 *
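
mmu_get_tsize() translates a Linux page size index into the Book3E
hardware TSIZE encoding used in MAS1/MAS6. A typical use, with the new
_tlbil_va signature from this patch (sketch):

	/* flush one direct (non-IND) 4K translation for this PID */
	_tlbil_va(addr, pid, mmu_get_tsize(MMU_PAGE_4K), 0);
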
@@ -67,18 +126,24 @@ void local_flush_tlb_mm(struct mm_struct *mm)
67} 126}
68EXPORT_SYMBOL(local_flush_tlb_mm); 127EXPORT_SYMBOL(local_flush_tlb_mm);
69 128
70void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 129void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
130 int tsize, int ind)
71{ 131{
72 unsigned int pid; 132 unsigned int pid;
73 133
74 preempt_disable(); 134 preempt_disable();
75 pid = vma ? vma->vm_mm->context.id : 0; 135 pid = mm ? mm->context.id : 0;
76 if (pid != MMU_NO_CONTEXT) 136 if (pid != MMU_NO_CONTEXT)
77 _tlbil_va(vmaddr, pid); 137 _tlbil_va(vmaddr, pid, tsize, ind);
78 preempt_enable(); 138 preempt_enable();
79} 139}
80EXPORT_SYMBOL(local_flush_tlb_page);
81 140
141void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
142{
143 __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
144 mmu_get_tsize(mmu_virtual_psize), 0);
145}
146EXPORT_SYMBOL(local_flush_tlb_page);
82 147
83/* 148/*
84 * And here are the SMP non-local implementations 149 * And here are the SMP non-local implementations
@@ -87,9 +152,17 @@ EXPORT_SYMBOL(local_flush_tlb_page);
87 152
88static DEFINE_SPINLOCK(tlbivax_lock); 153static DEFINE_SPINLOCK(tlbivax_lock);
89 154
155static int mm_is_core_local(struct mm_struct *mm)
156{
157 return cpumask_subset(mm_cpumask(mm),
158 topology_thread_cpumask(smp_processor_id()));
159}
160
90struct tlb_flush_param { 161struct tlb_flush_param {
91 unsigned long addr; 162 unsigned long addr;
92 unsigned int pid; 163 unsigned int pid;
164 unsigned int tsize;
165 unsigned int ind;
93}; 166};
94 167
95static void do_flush_tlb_mm_ipi(void *param) 168static void do_flush_tlb_mm_ipi(void *param)
@@ -103,7 +176,7 @@ static void do_flush_tlb_page_ipi(void *param)
103{ 176{
104 struct tlb_flush_param *p = param; 177 struct tlb_flush_param *p = param;
105 178
106 _tlbil_va(p->addr, p->pid); 179 _tlbil_va(p->addr, p->pid, p->tsize, p->ind);
107} 180}
108 181
109 182
@@ -131,7 +204,7 @@ void flush_tlb_mm(struct mm_struct *mm)
131 pid = mm->context.id; 204 pid = mm->context.id;
132 if (unlikely(pid == MMU_NO_CONTEXT)) 205 if (unlikely(pid == MMU_NO_CONTEXT))
133 goto no_context; 206 goto no_context;
134 if (!cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { 207 if (!mm_is_core_local(mm)) {
135 struct tlb_flush_param p = { .pid = pid }; 208 struct tlb_flush_param p = { .pid = pid };
136 /* Ignores smp_processor_id() even if set. */ 209 /* Ignores smp_processor_id() even if set. */
137 smp_call_function_many(mm_cpumask(mm), 210 smp_call_function_many(mm_cpumask(mm),
@@ -143,37 +216,49 @@ void flush_tlb_mm(struct mm_struct *mm)
143} 216}
144EXPORT_SYMBOL(flush_tlb_mm); 217EXPORT_SYMBOL(flush_tlb_mm);
145 218
146void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 219void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
220 int tsize, int ind)
147{ 221{
148 struct cpumask *cpu_mask; 222 struct cpumask *cpu_mask;
149 unsigned int pid; 223 unsigned int pid;
150 224
151 preempt_disable(); 225 preempt_disable();
152 pid = vma ? vma->vm_mm->context.id : 0; 226 pid = mm ? mm->context.id : 0;
153 if (unlikely(pid == MMU_NO_CONTEXT)) 227 if (unlikely(pid == MMU_NO_CONTEXT))
154 goto bail; 228 goto bail;
155 cpu_mask = mm_cpumask(vma->vm_mm); 229 cpu_mask = mm_cpumask(mm);
156 if (!cpumask_equal(cpu_mask, cpumask_of(smp_processor_id()))) { 230 if (!mm_is_core_local(mm)) {
157 /* If broadcast tlbivax is supported, use it */ 231 /* If broadcast tlbivax is supported, use it */
158 if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { 232 if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
159 int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); 233 int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
160 if (lock) 234 if (lock)
161 spin_lock(&tlbivax_lock); 235 spin_lock(&tlbivax_lock);
162 _tlbivax_bcast(vmaddr, pid); 236 _tlbivax_bcast(vmaddr, pid, tsize, ind);
163 if (lock) 237 if (lock)
164 spin_unlock(&tlbivax_lock); 238 spin_unlock(&tlbivax_lock);
165 goto bail; 239 goto bail;
166 } else { 240 } else {
167 struct tlb_flush_param p = { .pid = pid, .addr = vmaddr }; 241 struct tlb_flush_param p = {
242 .pid = pid,
243 .addr = vmaddr,
244 .tsize = tsize,
245 .ind = ind,
246 };
168 /* Ignores smp_processor_id() even if set in cpu_mask */ 247 /* Ignores smp_processor_id() even if set in cpu_mask */
169 smp_call_function_many(cpu_mask, 248 smp_call_function_many(cpu_mask,
170 do_flush_tlb_page_ipi, &p, 1); 249 do_flush_tlb_page_ipi, &p, 1);
171 } 250 }
172 } 251 }
173 _tlbil_va(vmaddr, pid); 252 _tlbil_va(vmaddr, pid, tsize, ind);
174 bail: 253 bail:
175 preempt_enable(); 254 preempt_enable();
176} 255}
256
257void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
258{
259 __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
260 mmu_get_tsize(mmu_virtual_psize), 0);
261}
177EXPORT_SYMBOL(flush_tlb_page); 262EXPORT_SYMBOL(flush_tlb_page);
178 263
179#endif /* CONFIG_SMP */ 264#endif /* CONFIG_SMP */
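
The new tsize/ind parameters let callers flush indirect (IND) entries as
well as normal ones; flush_tlb_page() is now just a wrapper passing
mmu_virtual_psize and ind=0. A hypothetical direct use, assuming HW
tablewalk with 1M IND pages (sketch):

	/* invalidate the IND entry covering 'addr' on all relevant CPUs */
	__flush_tlb_page(mm, addr & PMD_MASK, mmu_get_tsize(MMU_PAGE_1M), 1);
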
@@ -207,3 +292,156 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
207 flush_tlb_mm(vma->vm_mm); 292 flush_tlb_mm(vma->vm_mm);
208} 293}
209EXPORT_SYMBOL(flush_tlb_range); 294EXPORT_SYMBOL(flush_tlb_range);
295
296void tlb_flush(struct mmu_gather *tlb)
297{
298 flush_tlb_mm(tlb->mm);
299
300 /* Push out batch of freed page tables */
301 pte_free_finish();
302}
303
304/*
305 * Below are functions specific to the 64-bit variant of Book3E, though that
306 * may change in the future
307 */
308
309#ifdef CONFIG_PPC64
310
311/*
312 * Handling of virtual linear page tables or indirect TLB entries
313 * flushing when PTE pages are freed
314 */
315void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
316{
317 int tsize = mmu_psize_defs[mmu_pte_psize].enc;
318
319 if (book3e_htw_enabled) {
320 unsigned long start = address & PMD_MASK;
321 unsigned long end = address + PMD_SIZE;
322 unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
323
324 /* This isn't the most optimal; ideally we would factor out the
325 * whole preempt & CPU mask mucking around, or even the IPI, but
326 * it will do for now
327 */
328 while (start < end) {
329 __flush_tlb_page(tlb->mm, start, tsize, 1);
330 start += size;
331 }
332 } else {
333 unsigned long rmask = 0xf000000000000000ul;
334 unsigned long rid = (address & rmask) | 0x1000000000000000ul;
335 unsigned long vpte = address & ~rmask;
336
337#ifdef CONFIG_PPC_64K_PAGES
338 vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
339#else
340 vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
341#endif
342 vpte |= rid;
343 __flush_tlb_page(tlb->mm, vpte, tsize, 0);
344 }
345}
346
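
A worked example of the non-HTW branch, with 4K pages (PAGE_SHIFT = 12)
and the VPTE layout assumed above: for a user address such as 0x10000000
in region 0, the virtual PTE address that gets flushed comes out as

	rid  = 0x1000000000000000ul;	/* region 0 PTEs live in region 1 */
	vpte = ((0x10000000ul >> 9) & ~0xffful) | rid;
	     /* = 0x1000000000080000 */
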
347/*
348 * Early initialization of the MMU TLB code
349 */
350static void __early_init_mmu(int boot_cpu)
351{
352 extern unsigned int interrupt_base_book3e;
353 extern unsigned int exc_data_tlb_miss_htw_book3e;
354 extern unsigned int exc_instruction_tlb_miss_htw_book3e;
355
356 unsigned int *ibase = &interrupt_base_book3e;
357 unsigned int mas4;
358
359 /* XXX This will have to be decided at runtime, but right
360 * now our boot and TLB miss code hard wires it. Ideally
361 * we should find out a suitable page size and patch the
362 * TLB miss code (either that or use the PACA to store
363 * the value we want)
364 */
365 mmu_linear_psize = MMU_PAGE_1G;
366
367 /* XXX This should be decided at runtime based on supported
368 * page sizes in the TLB, but for now let's assume 16M is
369 * always there and a good fit (which it probably is)
370 */
371 mmu_vmemmap_psize = MMU_PAGE_16M;
372
373 /* Check if HW tablewalk is present, and if yes, enable it by:
374 *
375 * - patching the TLB miss handlers to branch to the
376 * ones dedicated to it
377 *
378 * - setting the global book3e_htw_enabled
379 *
380 * - setting MAS4:INDD and the default page size
381 */
382
383 /* XXX This code only checks for TLB 0 capabilities and doesn't
384 * check what page size combos are supported by the HW. It
385 * also doesn't handle the case where the IND entries are held
386 * in a separate array from the one loaded by the PT.
387 */
388 if (boot_cpu) {
389 unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
390
391 /* Check if HW loader is supported */
392 if ((tlb0cfg & TLBnCFG_IND) &&
393 (tlb0cfg & TLBnCFG_PT)) {
394 patch_branch(ibase + (0x1c0 / 4),
395 (unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
396 patch_branch(ibase + (0x1e0 / 4),
397 (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
398 book3e_htw_enabled = 1;
399 }
400 pr_info("MMU: Book3E Page Tables %s\n",
401 book3e_htw_enabled ? "Enabled" : "Disabled");
402 }
403
404 /* Set MAS4 based on page table setting */
405
406 mas4 = 0x4 << MAS4_WIMGED_SHIFT;
407 if (book3e_htw_enabled) {
408 mas4 |= MAS4_INDD;
409#ifdef CONFIG_PPC_64K_PAGES
410 mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
411 mmu_pte_psize = MMU_PAGE_256M;
412#else
413 mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
414 mmu_pte_psize = MMU_PAGE_1M;
415#endif
416 } else {
417#ifdef CONFIG_PPC_64K_PAGES
418 mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
419#else
420 mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
421#endif
422 mmu_pte_psize = mmu_virtual_psize;
423 }
424 mtspr(SPRN_MAS4, mas4);
425
426 /* Set the global containing the top of the linear mapping
427 * for use by the TLB miss code
428 */
429 linear_map_top = lmb_end_of_DRAM();
430
431 /* A sync won't hurt us after mucking around with
432 * the MMU configuration
433 */
434 mb();
435}
436
437void __init early_init_mmu(void)
438{
439 __early_init_mmu(1);
440}
441
442void __cpuinit early_init_mmu_secondary(void)
443{
444 __early_init_mmu(0);
445}
446
447#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 3037911279b1..7bcd9fbf6cc6 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -39,7 +39,7 @@
39/* 39/*
40 * 40x implementation needs only tlbil_va 40 * 40x implementation needs only tlbil_va
41 */ 41 */
42_GLOBAL(_tlbil_va) 42_GLOBAL(__tlbil_va)
43 /* We run the search with interrupts disabled because we have to change 43 /* We run the search with interrupts disabled because we have to change
44 * the PID and I don't want to preempt when that happens. 44 * the PID and I don't want to preempt when that happens.
45 */ 45 */
@@ -71,7 +71,7 @@ _GLOBAL(_tlbil_va)
71 * 440 implementation uses tlbsx/we for tlbil_va and a full sweep 71 * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
72 * of the TLB for everything else. 72 * of the TLB for everything else.
73 */ 73 */
74_GLOBAL(_tlbil_va) 74_GLOBAL(__tlbil_va)
75 mfspr r5,SPRN_MMUCR 75 mfspr r5,SPRN_MMUCR
76 rlwimi r5,r4,0,24,31 /* Set TID */ 76 rlwimi r5,r4,0,24,31 /* Set TID */
77 77
@@ -170,7 +170,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
170 * Flush MMU TLB for a particular address, but only on the local processor 170 * Flush MMU TLB for a particular address, but only on the local processor
171 * (no broadcast) 171 * (no broadcast)
172 */ 172 */
173_GLOBAL(_tlbil_va) 173_GLOBAL(__tlbil_va)
174 mfmsr r10 174 mfmsr r10
175 wrteei 0 175 wrteei 0
176 slwi r4,r4,16 176 slwi r4,r4,16
@@ -191,6 +191,85 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
191 isync 191 isync
1921: wrtee r10 1921: wrtee r10
193 blr 193 blr
194#elif defined(CONFIG_PPC_BOOK3E)
195/*
196 * New Book3E (>= 2.06) implementation
197 *
198 * Note: We may be able to get away without the interrupt masking stuff
199 * if we save/restore MAS6 on exceptions that might modify it
200 */
201_GLOBAL(_tlbil_pid)
202 slwi r4,r3,MAS6_SPID_SHIFT
203 mfmsr r10
204 wrteei 0
205 mtspr SPRN_MAS6,r4
206 PPC_TLBILX_PID(0,0)
207 wrtee r10
208 msync
209 isync
210 blr
211
212_GLOBAL(_tlbil_pid_noind)
213 slwi r4,r3,MAS6_SPID_SHIFT
214 mfmsr r10
215 ori r4,r4,MAS6_SIND
216 wrteei 0
217 mtspr SPRN_MAS6,r4
218 PPC_TLBILX_PID(0,0)
219 wrtee r10
220 msync
221 isync
222 blr
223
224_GLOBAL(_tlbil_all)
225 PPC_TLBILX_ALL(0,0)
226 msync
227 isync
228 blr
229
230_GLOBAL(_tlbil_va)
231 mfmsr r10
232 wrteei 0
233 cmpwi cr0,r6,0
234 slwi r4,r4,MAS6_SPID_SHIFT
235 rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
236 beq 1f
237 rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
2381: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
239 PPC_TLBILX_VA(0,r3)
240 msync
241 isync
242 wrtee r10
243 blr
244
245_GLOBAL(_tlbivax_bcast)
246 mfmsr r10
247 wrteei 0
248 cmpwi cr0,r6,0
249 slwi r4,r4,MAS6_SPID_SHIFT
250 rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
251 beq 1f
252 rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
2531: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
254 PPC_TLBIVAX(0,r3)
255 eieio
256 tlbsync
257 sync
258 wrtee r10
259 blr
260
261_GLOBAL(set_context)
262#ifdef CONFIG_BDI_SWITCH
263 /* Context switch the PTE pointer for the Abatron BDI2000.
264 * The PGDIR is the second parameter.
265 */
266 lis r5, abatron_pteptrs@h
267 ori r5, r5, abatron_pteptrs@l
268 stw r4, 0x4(r5)
269#endif
270 mtspr SPRN_PID,r3
271 isync /* Force context change */
272 blr
194#else 273#else
195#error Unsupported processor type ! 274#error Unsupported processor type !
196#endif 275#endif
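
For reference, the MAS6 value that the Book3E _tlbil_va above programs
before the tlbilx is, in C terms (sketch only):

	unsigned int mas6 = (pid << MAS6_SPID_SHIFT) |
			    ((tsize << MAS6_ISIZE_SHIFT) & MAS6_ISIZE_MASK);
	if (ind)
		mas6 |= MAS6_SIND;	/* target indirect entries */
	/* mtspr SPRN_MAS6, mas6; then PPC_TLBILX_VA(0, ea) */
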