Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/40x_mmu.c            |   6
-rw-r--r--  arch/powerpc/mm/44x_mmu.c            |   2
-rw-r--r--  arch/powerpc/mm/Makefile             |   5
-rw-r--r--  arch/powerpc/mm/dma-noncoherent.c    |   1
-rw-r--r--  arch/powerpc/mm/fault.c              |   8
-rw-r--r--  arch/powerpc/mm/fsl_booke_mmu.c      | 137
-rw-r--r--  arch/powerpc/mm/gup.c                | 149
-rw-r--r--  arch/powerpc/mm/hash_native_64.c     |  19
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c      |  58
-rw-r--r--  arch/powerpc/mm/hugetlbpage-hash64.c | 139
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c        | 793
-rw-r--r--  arch/powerpc/mm/init_32.c            |  12
-rw-r--r--  arch/powerpc/mm/init_64.c            |  77
-rw-r--r--  arch/powerpc/mm/mem.c                |  28
-rw-r--r--  arch/powerpc/mm/mmap_64.c            |   4
-rw-r--r--  arch/powerpc/mm/mmu_context_hash64.c |  35
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c |  17
-rw-r--r--  arch/powerpc/mm/mmu_decl.h           |  28
-rw-r--r--  arch/powerpc/mm/numa.c               |  23
-rw-r--r--  arch/powerpc/mm/pgtable.c            |  26
-rw-r--r--  arch/powerpc/mm/pgtable_32.c         |  39
-rw-r--r--  arch/powerpc/mm/pgtable_64.c         |   1
-rw-r--r--  arch/powerpc/mm/ppc_mmu_32.c         |   4
-rw-r--r--  arch/powerpc/mm/subpage-prot.c       |  16
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c         |  16
-rw-r--r--  arch/powerpc/mm/tlb_low_64e.S        |   2
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c         |   6
27 files changed, 818 insertions, 833 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
index f5e7b9ce63dd..65abfcfaaa9e 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -84,14 +84,14 @@ void __init MMU_init_hw(void)
84 * vectors and the kernel live in real-mode. 84 * vectors and the kernel live in real-mode.
85 */ 85 */
86 86
87 mtspr(SPRN_DCCR, 0xF0000000); /* 512 MB of data space at 0x0. */ 87 mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */
88 mtspr(SPRN_ICCR, 0xF0000000); /* 512 MB of instr. space at 0x0. */ 88 mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */
89} 89}
90 90
91#define LARGE_PAGE_SIZE_16M (1<<24) 91#define LARGE_PAGE_SIZE_16M (1<<24)
92#define LARGE_PAGE_SIZE_4M (1<<22) 92#define LARGE_PAGE_SIZE_4M (1<<22)
93 93
94unsigned long __init mmu_mapin_ram(void) 94unsigned long __init mmu_mapin_ram(unsigned long top)
95{ 95{
96 unsigned long v, s, mapped; 96 unsigned long v, s, mapped;
97 phys_addr_t p; 97 phys_addr_t p;
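
The 40x change above widens the cacheability map rather than the mapping itself: each bit of DCCR/ICCR marks one 128 MB region of the address space as cacheable, so 0xF0000000 (4 bits) covers 512 MB while 0xFFFF0000 (16 bits) covers 2 GB, matching the updated comments. A minimal stand-alone sketch of that arithmetic (plain C; only the 128 MB-per-bit granularity is taken from the 40x register layout, the helper and its name are illustrative):

#include <stdio.h>
#include <stdint.h>

/* Each set bit in DCCR/ICCR marks one 128 MB region as cacheable. */
static unsigned long cacheable_mbytes(uint32_t ccr)
{
    return (unsigned long)__builtin_popcount(ccr) * 128;
}

int main(void)
{
    /* Old value: 4 bits -> 512 MB.  New value: 16 bits -> 2 GB. */
    printf("0xF0000000 covers %lu MB\n", cacheable_mbytes(0xF0000000));
    printf("0xFFFF0000 covers %lu MB\n", cacheable_mbytes(0xFFFF0000));
    return 0;
}
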
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 98052ac96580..3986264b0993 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -88,7 +88,7 @@ void __init MMU_init_hw(void)
88 flush_instruction_cache(); 88 flush_instruction_cache();
89} 89}
90 90
91unsigned long __init mmu_mapin_ram(void) 91unsigned long __init mmu_mapin_ram(unsigned long top)
92{ 92{
93 unsigned long addr; 93 unsigned long addr;
94 94
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 6fb8fc8d2fea..ce68708bbad5 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -28,7 +28,10 @@ obj-$(CONFIG_44x) += 44x_mmu.o
28obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o 28obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o
29obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o 29obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
30obj-$(CONFIG_PPC_MM_SLICES) += slice.o 30obj-$(CONFIG_PPC_MM_SLICES) += slice.o
31obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 31ifeq ($(CONFIG_HUGETLB_PAGE),y)
32obj-y += hugetlbpage.o
33obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
34endif
32obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o 35obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
33obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o 36obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
34obj-$(CONFIG_HIGHMEM) += highmem.o 37obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 36692f5c9a76..757c0bed9a91 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/sched.h> 25#include <linux/sched.h>
26#include <linux/slab.h>
26#include <linux/kernel.h> 27#include <linux/kernel.h>
27#include <linux/errno.h> 28#include <linux/errno.h>
28#include <linux/string.h> 29#include <linux/string.h>
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index e7dae82c1285..26fb6b990b0a 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -40,7 +40,7 @@
40#include <asm/uaccess.h> 40#include <asm/uaccess.h>
41#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
42#include <asm/siginfo.h> 42#include <asm/siginfo.h>
43 43#include <mm/mmu_decl.h>
44 44
45#ifdef CONFIG_KPROBES 45#ifdef CONFIG_KPROBES
46static inline int notify_page_fault(struct pt_regs *regs) 46static inline int notify_page_fault(struct pt_regs *regs)
@@ -246,6 +246,12 @@ good_area:
246 goto bad_area; 246 goto bad_area;
247#endif /* CONFIG_6xx */ 247#endif /* CONFIG_6xx */
248#if defined(CONFIG_8xx) 248#if defined(CONFIG_8xx)
249 /* 8xx sometimes need to load a invalid/non-present TLBs.
250 * These must be invalidated separately as linux mm don't.
251 */
252 if (error_code & 0x40000000) /* no translation? */
253 _tlbil_va(address, 0, 0, 0);
254
249 /* The MPC8xx seems to always set 0x80000000, which is 255 /* The MPC8xx seems to always set 0x80000000, which is
250 * "undefined". Of those that can be set, this is the only 256 * "undefined". Of those that can be set, this is the only
251 * one which seems bad. 257 * one which seems bad.
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index dc93e95b256e..1ed6b52f3031 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -54,26 +54,35 @@
54 54
55#include "mmu_decl.h" 55#include "mmu_decl.h"
56 56
57extern void loadcam_entry(unsigned int index);
58unsigned int tlbcam_index; 57unsigned int tlbcam_index;
59static unsigned long cam[CONFIG_LOWMEM_CAM_NUM];
60 58
61#define NUM_TLBCAMS (16) 59#define NUM_TLBCAMS (64)
62 60
63#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) 61#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
64#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" 62#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
65#endif 63#endif
66 64
67struct tlbcam TLBCAM[NUM_TLBCAMS]; 65struct tlbcam {
66 u32 MAS0;
67 u32 MAS1;
68 unsigned long MAS2;
69 u32 MAS3;
70 u32 MAS7;
71} TLBCAM[NUM_TLBCAMS];
68 72
69struct tlbcamrange { 73struct tlbcamrange {
70 unsigned long start; 74 unsigned long start;
71 unsigned long limit; 75 unsigned long limit;
72 phys_addr_t phys; 76 phys_addr_t phys;
73} tlbcam_addrs[NUM_TLBCAMS]; 77} tlbcam_addrs[NUM_TLBCAMS];
74 78
75extern unsigned int tlbcam_index; 79extern unsigned int tlbcam_index;
76 80
81unsigned long tlbcam_sz(int idx)
82{
83 return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
84}
85
77/* 86/*
78 * Return PA for this VA if it is mapped by a CAM, or 0 87 * Return PA for this VA if it is mapped by a CAM, or 0
79 */ 88 */
@@ -94,23 +103,36 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
94 int b; 103 int b;
95 for (b = 0; b < tlbcam_index; ++b) 104 for (b = 0; b < tlbcam_index; ++b)
96 if (pa >= tlbcam_addrs[b].phys 105 if (pa >= tlbcam_addrs[b].phys
97 && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) 106 && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
98 +tlbcam_addrs[b].phys) 107 +tlbcam_addrs[b].phys)
99 return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); 108 return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys);
100 return 0; 109 return 0;
101} 110}
102 111
112void loadcam_entry(int idx)
113{
114 mtspr(SPRN_MAS0, TLBCAM[idx].MAS0);
115 mtspr(SPRN_MAS1, TLBCAM[idx].MAS1);
116 mtspr(SPRN_MAS2, TLBCAM[idx].MAS2);
117 mtspr(SPRN_MAS3, TLBCAM[idx].MAS3);
118
119 if (mmu_has_feature(MMU_FTR_BIG_PHYS))
120 mtspr(SPRN_MAS7, TLBCAM[idx].MAS7);
121
122 asm volatile("isync;tlbwe;isync" : : : "memory");
123}
124
103/* 125/*
104 * Set up one of the I/D BAT (block address translation) register pairs. 126 * Set up one of the I/D BAT (block address translation) register pairs.
105 * The parameters are not checked; in particular size must be a power 127 * The parameters are not checked; in particular size must be a power
106 * of 4 between 4k and 256M. 128 * of 4 between 4k and 256M.
107 */ 129 */
108void settlbcam(int index, unsigned long virt, phys_addr_t phys, 130static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
109 unsigned int size, int flags, unsigned int pid) 131 unsigned long size, unsigned long flags, unsigned int pid)
110{ 132{
111 unsigned int tsize, lz; 133 unsigned int tsize, lz;
112 134
113 asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size)); 135 asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (size));
114 tsize = 21 - lz; 136 tsize = 21 - lz;
115 137
116#ifdef CONFIG_SMP 138#ifdef CONFIG_SMP
@@ -128,18 +150,15 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
128 TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; 150 TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
129 TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; 151 TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
130 152
131 TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR; 153 TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
132 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); 154 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
155 if (mmu_has_feature(MMU_FTR_BIG_PHYS))
156 TLBCAM[index].MAS7 = (u64)phys >> 32;
133 157
134#ifndef CONFIG_KGDB /* want user access for breakpoints */
135 if (flags & _PAGE_USER) { 158 if (flags & _PAGE_USER) {
136 TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; 159 TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
137 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); 160 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
138 } 161 }
139#else
140 TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
141 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
142#endif
143 162
144 tlbcam_addrs[index].start = virt; 163 tlbcam_addrs[index].start = virt;
145 tlbcam_addrs[index].limit = virt + size - 1; 164 tlbcam_addrs[index].limit = virt + size - 1;
@@ -148,27 +167,44 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
148 loadcam_entry(index); 167 loadcam_entry(index);
149} 168}
150 169
151void invalidate_tlbcam_entry(int index) 170unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
152{
153 TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index);
154 TLBCAM[index].MAS1 = ~MAS1_VALID;
155
156 loadcam_entry(index);
157}
158
159unsigned long __init mmu_mapin_ram(void)
160{ 171{
172 int i;
161 unsigned long virt = PAGE_OFFSET; 173 unsigned long virt = PAGE_OFFSET;
162 phys_addr_t phys = memstart_addr; 174 phys_addr_t phys = memstart_addr;
175 unsigned long amount_mapped = 0;
176 unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf;
177
178 /* Convert (4^max) kB to (2^max) bytes */
179 max_cam = max_cam * 2 + 10;
180
181 /* Calculate CAM values */
182 for (i = 0; ram && i < max_cam_idx; i++) {
183 unsigned int camsize = __ilog2(ram) & ~1U;
184 unsigned int align = __ffs(virt | phys) & ~1U;
185 unsigned long cam_sz;
186
187 if (camsize > align)
188 camsize = align;
189 if (camsize > max_cam)
190 camsize = max_cam;
191
192 cam_sz = 1UL << camsize;
193 settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0);
163 194
164 while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) { 195 ram -= cam_sz;
165 settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0); 196 amount_mapped += cam_sz;
166 virt += cam[tlbcam_index]; 197 virt += cam_sz;
167 phys += cam[tlbcam_index]; 198 phys += cam_sz;
168 tlbcam_index++;
169 } 199 }
200 tlbcam_index = i;
201
202 return amount_mapped;
203}
170 204
171 return virt - PAGE_OFFSET; 205unsigned long __init mmu_mapin_ram(unsigned long top)
206{
207 return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
172} 208}
173 209
174/* 210/*
@@ -179,46 +215,21 @@ void __init MMU_init_hw(void)
179 flush_instruction_cache(); 215 flush_instruction_cache();
180} 216}
181 217
182void __init 218void __init adjust_total_lowmem(void)
183adjust_total_lowmem(void)
184{ 219{
185 phys_addr_t ram; 220 unsigned long ram;
186 unsigned int max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xff;
187 char buf[ARRAY_SIZE(cam) * 5 + 1], *p = buf;
188 int i; 221 int i;
189 unsigned long virt = PAGE_OFFSET & 0xffffffffUL;
190 unsigned long phys = memstart_addr & 0xffffffffUL;
191
192 /* Convert (4^max) kB to (2^max) bytes */
193 max_cam = max_cam * 2 + 10;
194 222
195 /* adjust lowmem size to __max_low_memory */ 223 /* adjust lowmem size to __max_low_memory */
196 ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); 224 ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
197 225
198 /* Calculate CAM values */ 226 __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
199 __max_low_memory = 0;
200 for (i = 0; ram && i < ARRAY_SIZE(cam); i++) {
201 unsigned int camsize = __ilog2(ram) & ~1U;
202 unsigned int align = __ffs(virt | phys) & ~1U;
203 227
204 if (camsize > align) 228 pr_info("Memory CAM mapping: ");
205 camsize = align; 229 for (i = 0; i < tlbcam_index - 1; i++)
206 if (camsize > max_cam) 230 pr_cont("%lu/", tlbcam_sz(i) >> 20);
207 camsize = max_cam; 231 pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20,
208
209 cam[i] = 1UL << camsize;
210 ram -= cam[i];
211 __max_low_memory += cam[i];
212 virt += cam[i];
213 phys += cam[i];
214
215 p += sprintf(p, "%lu/", cam[i] >> 20);
216 }
217 for (; i < ARRAY_SIZE(cam); i++)
218 p += sprintf(p, "0/");
219 p[-1] = '\0';
220
221 pr_info("Memory CAM mapping: %s Mb, residual: %dMb\n", buf,
222 (unsigned int)((total_lowmem - __max_low_memory) >> 20)); 232 (unsigned int)((total_lowmem - __max_low_memory) >> 20));
233
223 __initial_memory_limit_addr = memstart_addr + __max_low_memory; 234 __initial_memory_limit_addr = memstart_addr + __max_low_memory;
224} 235}
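
The sizing loop that moved into map_mem_in_cams() above picks, for each CAM entry, the largest even power-of-two size that does not exceed the remaining RAM, the alignment of the current virtual/physical addresses, or the hardware maximum from TLB1CFG (whose max-size field is a power of 4 in KB, hence the "max_cam * 2 + 10" conversion to a byte shift). A small userspace model of that selection, with compiler builtins standing in for the kernel's __ilog2()/__ffs() and an invented TLB1CFG field value (64-bit host assumed):

#include <stdio.h>

/* Userspace stand-ins for the kernel helpers (64-bit host assumed). */
static unsigned int ilog2_ul(unsigned long x) { return 63 - __builtin_clzl(x); }
static unsigned int ffs_ul(unsigned long x)   { return __builtin_ctzl(x); }

/* Pick the shift for one CAM entry, mirroring the loop in map_mem_in_cams(). */
static unsigned int pick_camsize(unsigned long ram, unsigned long virt,
                                 unsigned long phys, unsigned int max_cam)
{
    unsigned int camsize = ilog2_ul(ram) & ~1U;       /* even power of two */
    unsigned int align   = ffs_ul(virt | phys) & ~1U; /* address alignment */

    if (camsize > align)
        camsize = align;
    if (camsize > max_cam)
        camsize = max_cam;
    return camsize;
}

int main(void)
{
    /* Invented TLB1CFG max-size field of 7: 4^7 KB == 2^24 bytes == 16 MB. */
    unsigned int max_cam = 7 * 2 + 10;

    /* 768 MB of RAM, kernel virtual base 0xc0000000, physical base 0. */
    unsigned int shift = pick_camsize(768UL << 20, 0xc0000000UL, 0, max_cam);

    printf("first CAM entry: 2^%u bytes (%lu MB)\n", shift, (1UL << shift) >> 20);
    return 0;
}
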
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc122a120bf0..d7efdbf640c7 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
55 return 1; 55 return 1;
56} 56}
57 57
58#ifdef CONFIG_HUGETLB_PAGE
59static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
60 unsigned long *addr, unsigned long end,
61 int write, struct page **pages, int *nr)
62{
63 unsigned long mask;
64 unsigned long pte_end;
65 struct page *head, *page;
66 pte_t pte;
67 int refs;
68
69 pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
70 if (pte_end < end)
71 end = pte_end;
72
73 pte = *ptep;
74 mask = _PAGE_PRESENT|_PAGE_USER;
75 if (write)
76 mask |= _PAGE_RW;
77 if ((pte_val(pte) & mask) != mask)
78 return 0;
79 /* hugepages are never "special" */
80 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
81
82 refs = 0;
83 head = pte_page(pte);
84 page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
85 do {
86 VM_BUG_ON(compound_head(page) != head);
87 pages[*nr] = page;
88 (*nr)++;
89 page++;
90 refs++;
91 } while (*addr += PAGE_SIZE, *addr != end);
92
93 if (!page_cache_add_speculative(head, refs)) {
94 *nr -= refs;
95 return 0;
96 }
97 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
98 /* Could be optimized better */
99 while (*nr) {
100 put_page(page);
101 (*nr)--;
102 }
103 }
104
105 return 1;
106}
107#endif /* CONFIG_HUGETLB_PAGE */
108
109static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, 58static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
110 int write, struct page **pages, int *nr) 59 int write, struct page **pages, int *nr)
111{ 60{
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
119 next = pmd_addr_end(addr, end); 68 next = pmd_addr_end(addr, end);
120 if (pmd_none(pmd)) 69 if (pmd_none(pmd))
121 return 0; 70 return 0;
122 if (!gup_pte_range(pmd, addr, next, write, pages, nr)) 71 if (is_hugepd(pmdp)) {
72 if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
73 addr, next, write, pages, nr))
74 return 0;
75 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
123 return 0; 76 return 0;
124 } while (pmdp++, addr = next, addr != end); 77 } while (pmdp++, addr = next, addr != end);
125 78
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
139 next = pud_addr_end(addr, end); 92 next = pud_addr_end(addr, end);
140 if (pud_none(pud)) 93 if (pud_none(pud))
141 return 0; 94 return 0;
142 if (!gup_pmd_range(pud, addr, next, write, pages, nr)) 95 if (is_hugepd(pudp)) {
96 if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
97 addr, next, write, pages, nr))
98 return 0;
99 } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
143 return 0; 100 return 0;
144 } while (pudp++, addr = next, addr != end); 101 } while (pudp++, addr = next, addr != end);
145 102
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
154 unsigned long next; 111 unsigned long next;
155 pgd_t *pgdp; 112 pgd_t *pgdp;
156 int nr = 0; 113 int nr = 0;
157#ifdef CONFIG_PPC64
158 unsigned int shift;
159 int psize;
160#endif
161 114
162 pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); 115 pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
163 116
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
172 125
173 pr_devel(" aligned: %lx .. %lx\n", start, end); 126 pr_devel(" aligned: %lx .. %lx\n", start, end);
174 127
175#ifdef CONFIG_HUGETLB_PAGE
176 /* We bail out on slice boundary crossing when hugetlb is
177 * enabled in order to not have to deal with two different
178 * page table formats
179 */
180 if (addr < SLICE_LOW_TOP) {
181 if (end > SLICE_LOW_TOP)
182 goto slow_irqon;
183
184 if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
185 GET_LOW_SLICE_INDEX(end - 1)))
186 goto slow_irqon;
187 } else {
188 if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
189 GET_HIGH_SLICE_INDEX(end - 1)))
190 goto slow_irqon;
191 }
192#endif /* CONFIG_HUGETLB_PAGE */
193
194 /* 128 /*
195 * XXX: batch / limit 'nr', to avoid large irq off latency 129 * XXX: batch / limit 'nr', to avoid large irq off latency
196 * needs some instrumenting to determine the common sizes used by 130 * needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
210 */ 144 */
211 local_irq_disable(); 145 local_irq_disable();
212 146
213#ifdef CONFIG_PPC64 147 pgdp = pgd_offset(mm, addr);
214 /* Those bits are related to hugetlbfs implementation and only exist 148 do {
215 * on 64-bit for now 149 pgd_t pgd = *pgdp;
216 */ 150
217 psize = get_slice_psize(mm, addr); 151 pr_devel(" %016lx: normal pgd %p\n", addr,
218 shift = mmu_psize_defs[psize].shift; 152 (void *)pgd_val(pgd));
219#endif /* CONFIG_PPC64 */ 153 next = pgd_addr_end(addr, end);
220 154 if (pgd_none(pgd))
221#ifdef CONFIG_HUGETLB_PAGE 155 goto slow;
222 if (unlikely(mmu_huge_psizes[psize])) { 156 if (is_hugepd(pgdp)) {
223 pte_t *ptep; 157 if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
224 unsigned long a = addr; 158 addr, next, write, pages, &nr))
225 unsigned long sz = ((1UL) << shift);
226 struct hstate *hstate = size_to_hstate(sz);
227
228 BUG_ON(!hstate);
229 /*
230 * XXX: could be optimized to avoid hstate
231 * lookup entirely (just use shift)
232 */
233
234 do {
235 VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
236 ptep = huge_pte_offset(mm, a);
237 pr_devel(" %016lx: huge ptep %p\n", a, ptep);
238 if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
239 &nr))
240 goto slow;
241 } while (a != end);
242 } else
243#endif /* CONFIG_HUGETLB_PAGE */
244 {
245 pgdp = pgd_offset(mm, addr);
246 do {
247 pgd_t pgd = *pgdp;
248
249#ifdef CONFIG_PPC64
250 VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
251#endif
252 pr_devel(" %016lx: normal pgd %p\n", addr,
253 (void *)pgd_val(pgd));
254 next = pgd_addr_end(addr, end);
255 if (pgd_none(pgd))
256 goto slow;
257 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
258 goto slow; 159 goto slow;
259 } while (pgdp++, addr = next, addr != end); 160 } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
260 } 161 goto slow;
162 } while (pgdp++, addr = next, addr != end);
163
261 local_irq_enable(); 164 local_irq_enable();
262 165
263 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); 166 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
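
The rewritten walk above drops the slice-psize special casing: every pgd/pud/pmd slot is simply tested with is_hugepd(), and a hugepd entry packs the address of a hugepte table together with its page shift in one word (the allocator in hugetlbpage.c stores "pointer | pshift", and hugepd_shift()/hugepd_page() split them apart again). A stand-alone toy of that packing scheme; the 6-bit mask and the alignment used here are assumptions, and the real kernel additionally fixes up the kernel-linear-map high bits:

#include <stdio.h>
#include <stdlib.h>

#define SHIFT_MASK 0x3fUL   /* assumed width of the packed shift field */

typedef struct { unsigned long pd; } hugepd_t;

/* Pack an aligned hugepte-table pointer together with its page shift. */
static hugepd_t hugepd_pack(void *table, unsigned int pshift)
{
    hugepd_t hpd = { ((unsigned long)table & ~SHIFT_MASK) | pshift };
    return hpd;
}

static unsigned int hugepd_shift(hugepd_t hpd) { return hpd.pd & SHIFT_MASK; }

static void *hugepd_table(hugepd_t hpd)
{
    return (void *)(hpd.pd & ~SHIFT_MASK);
}

int main(void)
{
    /* 64-byte alignment keeps the low bits free for the shift. */
    void *table = aligned_alloc(64, 4096);
    if (!table)
        return 1;

    hugepd_t hpd = hugepd_pack(table, 24);   /* 24 == shift of a 16 MB page */
    printf("shift=%u table=%p (original %p)\n",
           hugepd_shift(hpd), hugepd_table(hpd), table);
    free(table);
    return 0;
}
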
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 056d23a1b105..784a400e0781 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -37,7 +37,7 @@
37 37
38#define HPTE_LOCK_BIT 3 38#define HPTE_LOCK_BIT 3
39 39
40static DEFINE_SPINLOCK(native_tlbie_lock); 40static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
41 41
42static inline void __tlbie(unsigned long va, int psize, int ssize) 42static inline void __tlbie(unsigned long va, int psize, int ssize)
43{ 43{
@@ -104,7 +104,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local)
104 if (use_local) 104 if (use_local)
105 use_local = mmu_psize_defs[psize].tlbiel; 105 use_local = mmu_psize_defs[psize].tlbiel;
106 if (lock_tlbie && !use_local) 106 if (lock_tlbie && !use_local)
107 spin_lock(&native_tlbie_lock); 107 raw_spin_lock(&native_tlbie_lock);
108 asm volatile("ptesync": : :"memory"); 108 asm volatile("ptesync": : :"memory");
109 if (use_local) { 109 if (use_local) {
110 __tlbiel(va, psize, ssize); 110 __tlbiel(va, psize, ssize);
@@ -114,7 +114,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local)
114 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 114 asm volatile("eieio; tlbsync; ptesync": : :"memory");
115 } 115 }
116 if (lock_tlbie && !use_local) 116 if (lock_tlbie && !use_local)
117 spin_unlock(&native_tlbie_lock); 117 raw_spin_unlock(&native_tlbie_lock);
118} 118}
119 119
120static inline void native_lock_hpte(struct hash_pte *hptep) 120static inline void native_lock_hpte(struct hash_pte *hptep)
@@ -122,7 +122,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
122 unsigned long *word = &hptep->v; 122 unsigned long *word = &hptep->v;
123 123
124 while (1) { 124 while (1) {
125 if (!test_and_set_bit(HPTE_LOCK_BIT, word)) 125 if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
126 break; 126 break;
127 while(test_bit(HPTE_LOCK_BIT, word)) 127 while(test_bit(HPTE_LOCK_BIT, word))
128 cpu_relax(); 128 cpu_relax();
@@ -133,8 +133,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep)
133{ 133{
134 unsigned long *word = &hptep->v; 134 unsigned long *word = &hptep->v;
135 135
136 asm volatile("lwsync":::"memory"); 136 clear_bit_unlock(HPTE_LOCK_BIT, word);
137 clear_bit(HPTE_LOCK_BIT, word);
138} 137}
139 138
140static long native_hpte_insert(unsigned long hpte_group, unsigned long va, 139static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
@@ -434,7 +433,7 @@ static void native_hpte_clear(void)
434 /* we take the tlbie lock and hold it. Some hardware will 433 /* we take the tlbie lock and hold it. Some hardware will
435 * deadlock if we try to tlbie from two processors at once. 434 * deadlock if we try to tlbie from two processors at once.
436 */ 435 */
437 spin_lock(&native_tlbie_lock); 436 raw_spin_lock(&native_tlbie_lock);
438 437
439 slots = pteg_count * HPTES_PER_GROUP; 438 slots = pteg_count * HPTES_PER_GROUP;
440 439
@@ -458,7 +457,7 @@ static void native_hpte_clear(void)
458 } 457 }
459 458
460 asm volatile("eieio; tlbsync; ptesync":::"memory"); 459 asm volatile("eieio; tlbsync; ptesync":::"memory");
461 spin_unlock(&native_tlbie_lock); 460 raw_spin_unlock(&native_tlbie_lock);
462 local_irq_restore(flags); 461 local_irq_restore(flags);
463} 462}
464 463
@@ -521,7 +520,7 @@ static void native_flush_hash_range(unsigned long number, int local)
521 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); 520 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
522 521
523 if (lock_tlbie) 522 if (lock_tlbie)
524 spin_lock(&native_tlbie_lock); 523 raw_spin_lock(&native_tlbie_lock);
525 524
526 asm volatile("ptesync":::"memory"); 525 asm volatile("ptesync":::"memory");
527 for (i = 0; i < number; i++) { 526 for (i = 0; i < number; i++) {
@@ -536,7 +535,7 @@ static void native_flush_hash_range(unsigned long number, int local)
536 asm volatile("eieio; tlbsync; ptesync":::"memory"); 535 asm volatile("eieio; tlbsync; ptesync":::"memory");
537 536
538 if (lock_tlbie) 537 if (lock_tlbie)
539 spin_unlock(&native_tlbie_lock); 538 raw_spin_unlock(&native_tlbie_lock);
540 } 539 }
541 540
542 local_irq_restore(flags); 541 local_irq_restore(flags);
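
Two locking cleanups run through hash_native_64.c above: the HPTE lock now uses test_and_set_bit_lock()/clear_bit_unlock(), whose built-in acquire/release ordering makes the explicit lwsync before clear_bit() unnecessary, and native_tlbie_lock becomes a raw spinlock, presumably so it remains a true busy-wait lock even where ordinary spinlocks can sleep (e.g. under PREEMPT_RT). A userspace sketch of the same acquire/release bit-lock idea, written with GCC atomic builtins rather than the kernel bitops:

#include <stdio.h>

#define HPTE_LOCK_BIT 3   /* same bit number the HPTE code uses */

/* Acquire: atomically set the bit; ordering keeps later accesses after it. */
static void bit_lock(unsigned long *word)
{
    while (__atomic_fetch_or(word, 1UL << HPTE_LOCK_BIT,
                             __ATOMIC_ACQUIRE) & (1UL << HPTE_LOCK_BIT))
        ;   /* spin until the previous value had the bit clear */
}

/* Release: clear the bit; ordering keeps earlier accesses before it. */
static void bit_unlock(unsigned long *word)
{
    __atomic_fetch_and(word, ~(1UL << HPTE_LOCK_BIT), __ATOMIC_RELEASE);
}

int main(void)
{
    unsigned long hpte_v = 0;   /* stand-in for hptep->v */

    bit_lock(&hpte_v);
    printf("locked:   0x%lx\n", hpte_v);
    bit_unlock(&hpte_v);
    printf("unlocked: 0x%lx\n", hpte_v);
    return 0;
}
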
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1ade7eb6ae00..3ecdcec0a39e 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -92,6 +92,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
92struct hash_pte *htab_address; 92struct hash_pte *htab_address;
93unsigned long htab_size_bytes; 93unsigned long htab_size_bytes;
94unsigned long htab_hash_mask; 94unsigned long htab_hash_mask;
95EXPORT_SYMBOL_GPL(htab_hash_mask);
95int mmu_linear_psize = MMU_PAGE_4K; 96int mmu_linear_psize = MMU_PAGE_4K;
96int mmu_virtual_psize = MMU_PAGE_4K; 97int mmu_virtual_psize = MMU_PAGE_4K;
97int mmu_vmalloc_psize = MMU_PAGE_4K; 98int mmu_vmalloc_psize = MMU_PAGE_4K;
@@ -102,6 +103,7 @@ int mmu_io_psize = MMU_PAGE_4K;
102int mmu_kernel_ssize = MMU_SEGSIZE_256M; 103int mmu_kernel_ssize = MMU_SEGSIZE_256M;
103int mmu_highuser_ssize = MMU_SEGSIZE_256M; 104int mmu_highuser_ssize = MMU_SEGSIZE_256M;
104u16 mmu_slb_size = 64; 105u16 mmu_slb_size = 64;
106EXPORT_SYMBOL_GPL(mmu_slb_size);
105#ifdef CONFIG_HUGETLB_PAGE 107#ifdef CONFIG_HUGETLB_PAGE
106unsigned int HPAGE_SHIFT; 108unsigned int HPAGE_SHIFT;
107#endif 109#endif
@@ -338,7 +340,7 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
338 else 340 else
339 def->tlbiel = 0; 341 def->tlbiel = 0;
340 342
341 DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, " 343 DBG(" %d: shift=%02x, sllp=%04lx, avpnm=%08lx, "
342 "tlbiel=%d, penc=%d\n", 344 "tlbiel=%d, penc=%d\n",
343 idx, shift, def->sllp, def->avpnm, def->tlbiel, 345 idx, shift, def->sllp, def->avpnm, def->tlbiel,
344 def->penc); 346 def->penc);
@@ -481,16 +483,6 @@ static void __init htab_init_page_sizes(void)
481#ifdef CONFIG_HUGETLB_PAGE 483#ifdef CONFIG_HUGETLB_PAGE
482 /* Reserve 16G huge page memory sections for huge pages */ 484 /* Reserve 16G huge page memory sections for huge pages */
483 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); 485 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
484
485/* Set default large page size. Currently, we pick 16M or 1M depending
486 * on what is available
487 */
488 if (mmu_psize_defs[MMU_PAGE_16M].shift)
489 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
490 /* With 4k/4level pagetables, we can't (for now) cope with a
491 * huge page size < PMD_SIZE */
492 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
493 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
494#endif /* CONFIG_HUGETLB_PAGE */ 486#endif /* CONFIG_HUGETLB_PAGE */
495} 487}
496 488
@@ -671,7 +663,7 @@ static void __init htab_initialize(void)
671 base = (unsigned long)__va(lmb.memory.region[i].base); 663 base = (unsigned long)__va(lmb.memory.region[i].base);
672 size = lmb.memory.region[i].size; 664 size = lmb.memory.region[i].size;
673 665
674 DBG("creating mapping for region: %lx..%lx (prot: %x)\n", 666 DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
675 base, size, prot); 667 base, size, prot);
676 668
677#ifdef CONFIG_U3_DART 669#ifdef CONFIG_U3_DART
@@ -785,7 +777,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
785 /* page is dirty */ 777 /* page is dirty */
786 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { 778 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
787 if (trap == 0x400) { 779 if (trap == 0x400) {
788 __flush_dcache_icache(page_address(page)); 780 flush_dcache_icache_page(page);
789 set_bit(PG_arch_1, &page->flags); 781 set_bit(PG_arch_1, &page->flags);
790 } else 782 } else
791 pp |= HPTE_R_N; 783 pp |= HPTE_R_N;
@@ -843,9 +835,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
843 * Result is 0: full permissions, _PAGE_RW: read-only, 835 * Result is 0: full permissions, _PAGE_RW: read-only,
844 * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. 836 * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
845 */ 837 */
846static int subpage_protection(pgd_t *pgdir, unsigned long ea) 838static int subpage_protection(struct mm_struct *mm, unsigned long ea)
847{ 839{
848 struct subpage_prot_table *spt = pgd_subpage_prot(pgdir); 840 struct subpage_prot_table *spt = &mm->context.spt;
849 u32 spp = 0; 841 u32 spp = 0;
850 u32 **sbpm, *sbpp; 842 u32 **sbpm, *sbpp;
851 843
@@ -873,7 +865,7 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea)
873} 865}
874 866
875#else /* CONFIG_PPC_SUBPAGE_PROT */ 867#else /* CONFIG_PPC_SUBPAGE_PROT */
876static inline int subpage_protection(pgd_t *pgdir, unsigned long ea) 868static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
877{ 869{
878 return 0; 870 return 0;
879} 871}
@@ -887,10 +879,11 @@ static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
887 */ 879 */
888int hash_page(unsigned long ea, unsigned long access, unsigned long trap) 880int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
889{ 881{
890 void *pgdir; 882 pgd_t *pgdir;
891 unsigned long vsid; 883 unsigned long vsid;
892 struct mm_struct *mm; 884 struct mm_struct *mm;
893 pte_t *ptep; 885 pte_t *ptep;
886 unsigned hugeshift;
894 const struct cpumask *tmp; 887 const struct cpumask *tmp;
895 int rc, user_region = 0, local = 0; 888 int rc, user_region = 0, local = 0;
896 int psize, ssize; 889 int psize, ssize;
@@ -943,30 +936,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
943 if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) 936 if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
944 local = 1; 937 local = 1;
945 938
946#ifdef CONFIG_HUGETLB_PAGE
947 /* Handle hugepage regions */
948 if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
949 DBG_LOW(" -> huge page !\n");
950 return hash_huge_page(mm, access, ea, vsid, local, trap);
951 }
952#endif /* CONFIG_HUGETLB_PAGE */
953
954#ifndef CONFIG_PPC_64K_PAGES 939#ifndef CONFIG_PPC_64K_PAGES
955 /* If we use 4K pages and our psize is not 4K, then we are hitting 940 /* If we use 4K pages and our psize is not 4K, then we might
956 * a special driver mapping, we need to align the address before 941 * be hitting a special driver mapping, and need to align the
957 * we fetch the PTE 942 * address before we fetch the PTE.
943 *
944 * It could also be a hugepage mapping, in which case this is
945 * not necessary, but it's not harmful, either.
958 */ 946 */
959 if (psize != MMU_PAGE_4K) 947 if (psize != MMU_PAGE_4K)
960 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); 948 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
961#endif /* CONFIG_PPC_64K_PAGES */ 949#endif /* CONFIG_PPC_64K_PAGES */
962 950
963 /* Get PTE and page size from page tables */ 951 /* Get PTE and page size from page tables */
964 ptep = find_linux_pte(pgdir, ea); 952 ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
965 if (ptep == NULL || !pte_present(*ptep)) { 953 if (ptep == NULL || !pte_present(*ptep)) {
966 DBG_LOW(" no PTE !\n"); 954 DBG_LOW(" no PTE !\n");
967 return 1; 955 return 1;
968 } 956 }
969 957
958#ifdef CONFIG_HUGETLB_PAGE
959 if (hugeshift)
960 return __hash_page_huge(ea, access, vsid, ptep, trap, local,
961 ssize, hugeshift, psize);
962#endif /* CONFIG_HUGETLB_PAGE */
963
970#ifndef CONFIG_PPC_64K_PAGES 964#ifndef CONFIG_PPC_64K_PAGES
971 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); 965 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
972#else 966#else
@@ -1031,7 +1025,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
1031 else 1025 else
1032#endif /* CONFIG_PPC_HAS_HASH_64K */ 1026#endif /* CONFIG_PPC_HAS_HASH_64K */
1033 { 1027 {
1034 int spp = subpage_protection(pgdir, ea); 1028 int spp = subpage_protection(mm, ea);
1035 if (access & spp) 1029 if (access & spp)
1036 rc = -2; 1030 rc = -2;
1037 else 1031 else
@@ -1121,7 +1115,7 @@ void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
1121{ 1115{
1122 unsigned long hash, index, shift, hidx, slot; 1116 unsigned long hash, index, shift, hidx, slot;
1123 1117
1124 DBG_LOW("flush_hash_page(va=%016x)\n", va); 1118 DBG_LOW("flush_hash_page(va=%016lx)\n", va);
1125 pte_iterate_hashed_subpages(pte, psize, va, index, shift) { 1119 pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
1126 hash = hpt_hash(va, shift, ssize); 1120 hash = hpt_hash(va, shift, ssize);
1127 hidx = __rpte_to_hidx(pte, index); 1121 hidx = __rpte_to_hidx(pte, index);
@@ -1129,7 +1123,7 @@ void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
1129 hash = ~hash; 1123 hash = ~hash;
1130 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1124 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1131 slot += hidx & _PTEIDX_GROUP_IX; 1125 slot += hidx & _PTEIDX_GROUP_IX;
1132 DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx); 1126 DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
1133 ppc_md.hpte_invalidate(slot, va, psize, ssize, local); 1127 ppc_md.hpte_invalidate(slot, va, psize, ssize, local);
1134 } pte_iterate_hashed_end(); 1128 } pte_iterate_hashed_end();
1135} 1129}
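
With the hugepage path folded into the generic fault handler above, hash_page() makes a single find_linux_pte_or_hugepte() lookup that returns a PTE pointer and, via an out parameter, the hugepage shift (0 for an ordinary page), then branches on that shift. A tiny stand-alone model of that calling convention; the table layout and names below are invented purely to illustrate the pattern:

#include <stdio.h>

#define BASE_PAGE_SHIFT 12   /* invented: an ordinary 4 KB base page */

/* Toy page-table entry: a frame number plus the shift of its mapping. */
struct toy_pte {
    unsigned long pfn;
    unsigned int  shift;
};

/*
 * Return the entry and report the hugepage shift through *shift, the way
 * find_linux_pte_or_hugepte() does; 0 means "ordinary base-page mapping".
 */
static struct toy_pte *toy_lookup(struct toy_pte *table, unsigned long idx,
                                  unsigned int *shift)
{
    struct toy_pte *pte = &table[idx];

    if (shift)
        *shift = (pte->shift == BASE_PAGE_SHIFT) ? 0 : pte->shift;
    return pte;
}

int main(void)
{
    struct toy_pte table[2] = { { 0x1234, 12 }, { 0x40, 24 } }; /* 4 KB, 16 MB */
    unsigned int hugeshift;

    for (unsigned long i = 0; i < 2; i++) {
        struct toy_pte *pte = toy_lookup(table, i, &hugeshift);

        if (hugeshift)
            printf("entry %lu: hugepage, shift %u\n", i, hugeshift);
        else
            printf("entry %lu: normal page, pfn 0x%lx\n", i, pte->pfn);
    }
    return 0;
}
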
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
new file mode 100644
index 000000000000..199539882f92
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -0,0 +1,139 @@
1/*
2 * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
3 *
4 * Copyright (C) 2003 David Gibson, IBM Corporation.
5 *
6 * Based on the IA-32 version:
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8 */
9
10#include <linux/mm.h>
11#include <linux/hugetlb.h>
12#include <asm/pgtable.h>
13#include <asm/pgalloc.h>
14#include <asm/cacheflush.h>
15#include <asm/machdep.h>
16
17int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
18 pte_t *ptep, unsigned long trap, int local, int ssize,
19 unsigned int shift, unsigned int mmu_psize)
20{
21 unsigned long old_pte, new_pte;
22 unsigned long va, rflags, pa, sz;
23 long slot;
24 int err = 1;
25
26 BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
27
28 /* Search the Linux page table for a match with va */
29 va = hpt_va(ea, vsid, ssize);
30
31 /*
32 * Check the user's access rights to the page. If access should be
33 * prevented then send the problem up to do_page_fault.
34 */
35 if (unlikely(access & ~pte_val(*ptep)))
36 goto out;
37 /*
38 * At this point, we have a pte (old_pte) which can be used to build
39 * or update an HPTE. There are 2 cases:
40 *
41 * 1. There is a valid (present) pte with no associated HPTE (this is
42 * the most common case)
43 * 2. There is a valid (present) pte with an associated HPTE. The
44 * current values of the pp bits in the HPTE prevent access
45 * because we are doing software DIRTY bit management and the
46 * page is currently not DIRTY.
47 */
48
49
50 do {
51 old_pte = pte_val(*ptep);
52 if (old_pte & _PAGE_BUSY)
53 goto out;
54 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
55 } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
56 old_pte, new_pte));
57
58 rflags = 0x2 | (!(new_pte & _PAGE_RW));
59 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
60 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
61 sz = ((1UL) << shift);
62 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
63 /* No CPU has hugepages but lacks no execute, so we
64 * don't need to worry about that case */
65 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
66
67 /* Check if pte already has an hpte (case 2) */
68 if (unlikely(old_pte & _PAGE_HASHPTE)) {
69 /* There MIGHT be an HPTE for this pte */
70 unsigned long hash, slot;
71
72 hash = hpt_hash(va, shift, ssize);
73 if (old_pte & _PAGE_F_SECOND)
74 hash = ~hash;
75 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
76 slot += (old_pte & _PAGE_F_GIX) >> 12;
77
78 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
79 ssize, local) == -1)
80 old_pte &= ~_PAGE_HPTEFLAGS;
81 }
82
83 if (likely(!(old_pte & _PAGE_HASHPTE))) {
84 unsigned long hash = hpt_hash(va, shift, ssize);
85 unsigned long hpte_group;
86
87 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
88
89repeat:
90 hpte_group = ((hash & htab_hash_mask) *
91 HPTES_PER_GROUP) & ~0x7UL;
92
93 /* clear HPTE slot informations in new PTE */
94#ifdef CONFIG_PPC_64K_PAGES
95 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
96#else
97 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
98#endif
99 /* Add in WIMG bits */
100 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
101 _PAGE_COHERENT | _PAGE_GUARDED));
102
103 /* Insert into the hash table, primary slot */
104 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
105 mmu_psize, ssize);
106
107 /* Primary is full, try the secondary */
108 if (unlikely(slot == -1)) {
109 hpte_group = ((~hash & htab_hash_mask) *
110 HPTES_PER_GROUP) & ~0x7UL;
111 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
112 HPTE_V_SECONDARY,
113 mmu_psize, ssize);
114 if (slot == -1) {
115 if (mftb() & 0x1)
116 hpte_group = ((hash & htab_hash_mask) *
117 HPTES_PER_GROUP)&~0x7UL;
118
119 ppc_md.hpte_remove(hpte_group);
120 goto repeat;
121 }
122 }
123
124 if (unlikely(slot == -2))
125 panic("hash_huge_page: pte_insert failed\n");
126
127 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
128 }
129
130 /*
131 * No need to use ldarx/stdcx here
132 */
133 *ptep = __pte(new_pte & ~_PAGE_BUSY);
134
135 err = 0;
136
137 out:
138 return err;
139}
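
The new __hash_page_huge() above follows the standard HPT insertion dance: hash the VA, try the primary group of 8 slots at (hash & htab_hash_mask) * HPTES_PER_GROUP, fall back to the secondary group derived from ~hash, and record the chosen group and 3-bit slot index in the PTE's _PAGE_F_SECOND/_PAGE_F_GIX bits so a later update or invalidate can find the HPTE again. The group arithmetic in isolation, using an invented hash value and table mask:

#include <stdio.h>

#define HPTES_PER_GROUP 8UL

int main(void)
{
    /* Invented values: a hash table with 2^17 groups and an arbitrary hash. */
    unsigned long htab_hash_mask = (1UL << 17) - 1;
    unsigned long hash = 0x2345678UL;

    /* Primary group: 8 consecutive slots, 8-slot aligned. */
    unsigned long primary = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

    /* Secondary group comes from the complemented hash. */
    unsigned long secondary = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

    printf("primary group starts at slot   %lu\n", primary);
    printf("secondary group starts at slot %lu\n", secondary);

    /* If the insert landed in slot 5 of the primary group, the PTE only
     * needs the 3-bit group index (plus a "secondary" flag) to find it. */
    unsigned long gix = 5;
    printf("HPTE to update/invalidate later: slot %lu\n", primary + gix);
    return 0;
}
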
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 90df6ffe3a43..9bb249c3046e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -7,29 +7,18 @@
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8 */ 8 */
9 9
10#include <linux/init.h>
11#include <linux/fs.h>
12#include <linux/mm.h> 10#include <linux/mm.h>
13#include <linux/hugetlb.h> 11#include <linux/io.h>
14#include <linux/pagemap.h>
15#include <linux/slab.h> 12#include <linux/slab.h>
16#include <linux/err.h> 13#include <linux/hugetlb.h>
17#include <linux/sysctl.h> 14#include <asm/pgtable.h>
18#include <asm/mman.h>
19#include <asm/pgalloc.h> 15#include <asm/pgalloc.h>
20#include <asm/tlb.h> 16#include <asm/tlb.h>
21#include <asm/tlbflush.h>
22#include <asm/mmu_context.h>
23#include <asm/machdep.h>
24#include <asm/cputable.h>
25#include <asm/spu.h>
26 17
27#define PAGE_SHIFT_64K 16 18#define PAGE_SHIFT_64K 16
28#define PAGE_SHIFT_16M 24 19#define PAGE_SHIFT_16M 24
29#define PAGE_SHIFT_16G 34 20#define PAGE_SHIFT_16G 34
30 21
31#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
32#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
33#define MAX_NUMBER_GPAGES 1024 22#define MAX_NUMBER_GPAGES 1024
34 23
35/* Tracks the 16G pages after the device tree is scanned and before the 24/* Tracks the 16G pages after the device tree is scanned and before the
@@ -37,53 +26,17 @@
37static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; 26static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
38static unsigned nr_gpages; 27static unsigned nr_gpages;
39 28
40/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
41 * stored for the huge page sizes that are valid.
42 */
43unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
44
45#define hugepte_shift mmu_huge_psizes
46#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize])
47#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize])
48
49#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \
50 + hugepte_shift[psize])
51#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize))
52#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1))
53
54/* Subtract one from array size because we don't need a cache for 4K since
55 * is not a huge page size */
56#define HUGE_PGTABLE_INDEX(psize) (HUGEPTE_CACHE_NUM + psize - 1)
57#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize])
58
59static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
60 [MMU_PAGE_64K] = "hugepte_cache_64K",
61 [MMU_PAGE_1M] = "hugepte_cache_1M",
62 [MMU_PAGE_16M] = "hugepte_cache_16M",
63 [MMU_PAGE_16G] = "hugepte_cache_16G",
64};
65
66/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() 29/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
67 * will choke on pointers to hugepte tables, which is handy for 30 * will choke on pointers to hugepte tables, which is handy for
68 * catching screwups early. */ 31 * catching screwups early. */
69#define HUGEPD_OK 0x1
70
71typedef struct { unsigned long pd; } hugepd_t;
72
73#define hugepd_none(hpd) ((hpd).pd == 0)
74 32
75static inline int shift_to_mmu_psize(unsigned int shift) 33static inline int shift_to_mmu_psize(unsigned int shift)
76{ 34{
77 switch (shift) { 35 int psize;
78#ifndef CONFIG_PPC_64K_PAGES 36
79 case PAGE_SHIFT_64K: 37 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
80 return MMU_PAGE_64K; 38 if (mmu_psize_defs[psize].shift == shift)
81#endif 39 return psize;
82 case PAGE_SHIFT_16M:
83 return MMU_PAGE_16M;
84 case PAGE_SHIFT_16G:
85 return MMU_PAGE_16G;
86 }
87 return -1; 40 return -1;
88} 41}
89 42
@@ -94,71 +47,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
94 BUG(); 47 BUG();
95} 48}
96 49
50#define hugepd_none(hpd) ((hpd).pd == 0)
51
97static inline pte_t *hugepd_page(hugepd_t hpd) 52static inline pte_t *hugepd_page(hugepd_t hpd)
98{ 53{
99 BUG_ON(!(hpd.pd & HUGEPD_OK)); 54 BUG_ON(!hugepd_ok(hpd));
100 return (pte_t *)(hpd.pd & ~HUGEPD_OK); 55 return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
56}
57
58static inline unsigned int hugepd_shift(hugepd_t hpd)
59{
60 return hpd.pd & HUGEPD_SHIFT_MASK;
101} 61}
102 62
103static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, 63static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
104 struct hstate *hstate)
105{ 64{
106 unsigned int shift = huge_page_shift(hstate); 65 unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
107 int psize = shift_to_mmu_psize(shift);
108 unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
109 pte_t *dir = hugepd_page(*hpdp); 66 pte_t *dir = hugepd_page(*hpdp);
110 67
111 return dir + idx; 68 return dir + idx;
112} 69}
113 70
71pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
72{
73 pgd_t *pg;
74 pud_t *pu;
75 pmd_t *pm;
76 hugepd_t *hpdp = NULL;
77 unsigned pdshift = PGDIR_SHIFT;
78
79 if (shift)
80 *shift = 0;
81
82 pg = pgdir + pgd_index(ea);
83 if (is_hugepd(pg)) {
84 hpdp = (hugepd_t *)pg;
85 } else if (!pgd_none(*pg)) {
86 pdshift = PUD_SHIFT;
87 pu = pud_offset(pg, ea);
88 if (is_hugepd(pu))
89 hpdp = (hugepd_t *)pu;
90 else if (!pud_none(*pu)) {
91 pdshift = PMD_SHIFT;
92 pm = pmd_offset(pu, ea);
93 if (is_hugepd(pm))
94 hpdp = (hugepd_t *)pm;
95 else if (!pmd_none(*pm)) {
96 return pte_offset_map(pm, ea);
97 }
98 }
99 }
100
101 if (!hpdp)
102 return NULL;
103
104 if (shift)
105 *shift = hugepd_shift(*hpdp);
106 return hugepte_offset(hpdp, ea, pdshift);
107}
108
109pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
110{
111 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
112}
113
114static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 114static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
115 unsigned long address, unsigned int psize) 115 unsigned long address, unsigned pdshift, unsigned pshift)
116{ 116{
117 pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], 117 pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
118 GFP_KERNEL|__GFP_REPEAT); 118 GFP_KERNEL|__GFP_REPEAT);
119
120 BUG_ON(pshift > HUGEPD_SHIFT_MASK);
121 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
119 122
120 if (! new) 123 if (! new)
121 return -ENOMEM; 124 return -ENOMEM;
122 125
123 spin_lock(&mm->page_table_lock); 126 spin_lock(&mm->page_table_lock);
124 if (!hugepd_none(*hpdp)) 127 if (!hugepd_none(*hpdp))
125 kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new); 128 kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
126 else 129 else
127 hpdp->pd = (unsigned long)new | HUGEPD_OK; 130 hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
128 spin_unlock(&mm->page_table_lock); 131 spin_unlock(&mm->page_table_lock);
129 return 0; 132 return 0;
130} 133}
131 134
132 135pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
133static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
134{
135 if (huge_page_shift(hstate) < PUD_SHIFT)
136 return pud_offset(pgd, addr);
137 else
138 return (pud_t *) pgd;
139}
140static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
141 struct hstate *hstate)
142{
143 if (huge_page_shift(hstate) < PUD_SHIFT)
144 return pud_alloc(mm, pgd, addr);
145 else
146 return (pud_t *) pgd;
147}
148static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
149{ 136{
150 if (huge_page_shift(hstate) < PMD_SHIFT) 137 pgd_t *pg;
151 return pmd_offset(pud, addr); 138 pud_t *pu;
152 else 139 pmd_t *pm;
153 return (pmd_t *) pud; 140 hugepd_t *hpdp = NULL;
154} 141 unsigned pshift = __ffs(sz);
155static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, 142 unsigned pdshift = PGDIR_SHIFT;
156 struct hstate *hstate) 143
157{ 144 addr &= ~(sz-1);
158 if (huge_page_shift(hstate) < PMD_SHIFT) 145
159 return pmd_alloc(mm, pud, addr); 146 pg = pgd_offset(mm, addr);
160 else 147 if (pshift >= PUD_SHIFT) {
161 return (pmd_t *) pud; 148 hpdp = (hugepd_t *)pg;
149 } else {
150 pdshift = PUD_SHIFT;
151 pu = pud_alloc(mm, pg, addr);
152 if (pshift >= PMD_SHIFT) {
153 hpdp = (hugepd_t *)pu;
154 } else {
155 pdshift = PMD_SHIFT;
156 pm = pmd_alloc(mm, pu, addr);
157 hpdp = (hugepd_t *)pm;
158 }
159 }
160
161 if (!hpdp)
162 return NULL;
163
164 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
165
166 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
167 return NULL;
168
169 return hugepte_offset(hpdp, addr, pdshift);
162} 170}
163 171
164/* Build list of addresses of gigantic pages. This function is used in early 172/* Build list of addresses of gigantic pages. This function is used in early
@@ -192,94 +200,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
192 return 1; 200 return 1;
193} 201}
194 202
195
196/* Modelled after find_linux_pte() */
197pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
198{
199 pgd_t *pg;
200 pud_t *pu;
201 pmd_t *pm;
202
203 unsigned int psize;
204 unsigned int shift;
205 unsigned long sz;
206 struct hstate *hstate;
207 psize = get_slice_psize(mm, addr);
208 shift = mmu_psize_to_shift(psize);
209 sz = ((1UL) << shift);
210 hstate = size_to_hstate(sz);
211
212 addr &= hstate->mask;
213
214 pg = pgd_offset(mm, addr);
215 if (!pgd_none(*pg)) {
216 pu = hpud_offset(pg, addr, hstate);
217 if (!pud_none(*pu)) {
218 pm = hpmd_offset(pu, addr, hstate);
219 if (!pmd_none(*pm))
220 return hugepte_offset((hugepd_t *)pm, addr,
221 hstate);
222 }
223 }
224
225 return NULL;
226}
227
228pte_t *huge_pte_alloc(struct mm_struct *mm,
229 unsigned long addr, unsigned long sz)
230{
231 pgd_t *pg;
232 pud_t *pu;
233 pmd_t *pm;
234 hugepd_t *hpdp = NULL;
235 struct hstate *hstate;
236 unsigned int psize;
237 hstate = size_to_hstate(sz);
238
239 psize = get_slice_psize(mm, addr);
240 BUG_ON(!mmu_huge_psizes[psize]);
241
242 addr &= hstate->mask;
243
244 pg = pgd_offset(mm, addr);
245 pu = hpud_alloc(mm, pg, addr, hstate);
246
247 if (pu) {
248 pm = hpmd_alloc(mm, pu, addr, hstate);
249 if (pm)
250 hpdp = (hugepd_t *)pm;
251 }
252
253 if (! hpdp)
254 return NULL;
255
256 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
257 return NULL;
258
259 return hugepte_offset(hpdp, addr, hstate);
260}
261
262int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 203int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
263{ 204{
264 return 0; 205 return 0;
265} 206}
266 207
267static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, 208static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
268 unsigned int psize) 209 unsigned long start, unsigned long end,
210 unsigned long floor, unsigned long ceiling)
269{ 211{
270 pte_t *hugepte = hugepd_page(*hpdp); 212 pte_t *hugepte = hugepd_page(*hpdp);
213 unsigned shift = hugepd_shift(*hpdp);
214 unsigned long pdmask = ~((1UL << pdshift) - 1);
215
216 start &= pdmask;
217 if (start < floor)
218 return;
219 if (ceiling) {
220 ceiling &= pdmask;
221 if (! ceiling)
222 return;
223 }
224 if (end - 1 > ceiling - 1)
225 return;
271 226
272 hpdp->pd = 0; 227 hpdp->pd = 0;
273 tlb->need_flush = 1; 228 tlb->need_flush = 1;
274 pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, 229 pgtable_free_tlb(tlb, hugepte, pdshift - shift);
275 HUGEPTE_CACHE_NUM+psize-1,
276 PGF_CACHENUM_MASK));
277} 230}
278 231
279static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 232static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
280 unsigned long addr, unsigned long end, 233 unsigned long addr, unsigned long end,
281 unsigned long floor, unsigned long ceiling, 234 unsigned long floor, unsigned long ceiling)
282 unsigned int psize)
283{ 235{
284 pmd_t *pmd; 236 pmd_t *pmd;
285 unsigned long next; 237 unsigned long next;
@@ -291,7 +243,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
291 next = pmd_addr_end(addr, end); 243 next = pmd_addr_end(addr, end);
292 if (pmd_none(*pmd)) 244 if (pmd_none(*pmd))
293 continue; 245 continue;
294 free_hugepte_range(tlb, (hugepd_t *)pmd, psize); 246 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
247 addr, next, floor, ceiling);
295 } while (pmd++, addr = next, addr != end); 248 } while (pmd++, addr = next, addr != end);
296 249
297 start &= PUD_MASK; 250 start &= PUD_MASK;
@@ -317,23 +270,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
317 pud_t *pud; 270 pud_t *pud;
318 unsigned long next; 271 unsigned long next;
319 unsigned long start; 272 unsigned long start;
320 unsigned int shift;
321 unsigned int psize = get_slice_psize(tlb->mm, addr);
322 shift = mmu_psize_to_shift(psize);
323 273
324 start = addr; 274 start = addr;
325 pud = pud_offset(pgd, addr); 275 pud = pud_offset(pgd, addr);
326 do { 276 do {
327 next = pud_addr_end(addr, end); 277 next = pud_addr_end(addr, end);
328 if (shift < PMD_SHIFT) { 278 if (!is_hugepd(pud)) {
329 if (pud_none_or_clear_bad(pud)) 279 if (pud_none_or_clear_bad(pud))
330 continue; 280 continue;
331 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 281 hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
332 ceiling, psize); 282 ceiling);
333 } else { 283 } else {
334 if (pud_none(*pud)) 284 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
335 continue; 285 addr, next, floor, ceiling);
336 free_hugepte_range(tlb, (hugepd_t *)pud, psize);
337 } 286 }
338 } while (pud++, addr = next, addr != end); 287 } while (pud++, addr = next, addr != end);
339 288
@@ -364,121 +313,56 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
364{ 313{
365 pgd_t *pgd; 314 pgd_t *pgd;
366 unsigned long next; 315 unsigned long next;
367 unsigned long start;
368 316
369 /* 317 /*
370 * Comments below take from the normal free_pgd_range(). They 318 * Because there are a number of different possible pagetable
371 * apply here too. The tests against HUGEPD_MASK below are 319 * layouts for hugepage ranges, we limit knowledge of how
372 * essential, because we *don't* test for this at the bottom 320 * things should be laid out to the allocation path
373 * level. Without them we'll attempt to free a hugepte table 321 * (huge_pte_alloc(), above). Everything else works out the
374 * when we unmap just part of it, even if there are other 322 * structure as it goes from information in the hugepd
375 * active mappings using it. 323 * pointers. That means that we can't here use the
376 * 324 * optimization used in the normal page free_pgd_range(), of
377 * The next few lines have given us lots of grief... 325 * checking whether we're actually covering a large enough
378 * 326 * range to have to do anything at the top level of the walk
379 * Why are we testing HUGEPD* at this top level? Because 327 * instead of at the bottom.
380 * often there will be no work to do at all, and we'd prefer
381 * not to go all the way down to the bottom just to discover
382 * that.
383 * 328 *
384 * Why all these "- 1"s? Because 0 represents both the bottom 329 * To make sense of this, you should probably go read the big
385 * of the address space and the top of it (using -1 for the 330 * block comment at the top of the normal free_pgd_range(),
386 * top wouldn't help much: the masks would do the wrong thing). 331 * too.
387 * The rule is that addr 0 and floor 0 refer to the bottom of
388 * the address space, but end 0 and ceiling 0 refer to the top
389 * Comparisons need to use "end - 1" and "ceiling - 1" (though
390 * that end 0 case should be mythical).
391 *
392 * Wherever addr is brought up or ceiling brought down, we
393 * must be careful to reject "the opposite 0" before it
394 * confuses the subsequent tests. But what about where end is
395 * brought down by HUGEPD_SIZE below? no, end can't go down to
396 * 0 there.
397 *
398 * Whereas we round start (addr) and ceiling down, by different
399 * masks at different levels, in order to test whether a table
400 * now has no other vmas using it, so can be freed, we don't
401 * bother to round floor or end up - the tests don't need that.
402 */ 332 */
403 unsigned int psize = get_slice_psize(tlb->mm, addr);
404
405 addr &= HUGEPD_MASK(psize);
406 if (addr < floor) {
407 addr += HUGEPD_SIZE(psize);
408 if (!addr)
409 return;
410 }
411 if (ceiling) {
412 ceiling &= HUGEPD_MASK(psize);
413 if (!ceiling)
414 return;
415 }
416 if (end - 1 > ceiling - 1)
417 end -= HUGEPD_SIZE(psize);
418 if (addr > end - 1)
419 return;
420 333
421 start = addr;
422 pgd = pgd_offset(tlb->mm, addr); 334 pgd = pgd_offset(tlb->mm, addr);
423 do { 335 do {
424 psize = get_slice_psize(tlb->mm, addr);
425 BUG_ON(!mmu_huge_psizes[psize]);
426 next = pgd_addr_end(addr, end); 336 next = pgd_addr_end(addr, end);
427 if (mmu_psize_to_shift(psize) < PUD_SHIFT) { 337 if (!is_hugepd(pgd)) {
428 if (pgd_none_or_clear_bad(pgd)) 338 if (pgd_none_or_clear_bad(pgd))
429 continue; 339 continue;
430 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); 340 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
431 } else { 341 } else {
432 if (pgd_none(*pgd)) 342 free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
433 continue; 343 addr, next, floor, ceiling);
434 free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
435 } 344 }
436 } while (pgd++, addr = next, addr != end); 345 } while (pgd++, addr = next, addr != end);
437} 346}
438 347
439void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep, pte_t pte)
441{
442 if (pte_present(*ptep)) {
443 /* We open-code pte_clear because we need to pass the right
444 * argument to hpte_need_flush (huge / !huge). Might not be
445 * necessary anymore if we make hpte_need_flush() get the
446 * page size from the slices
447 */
448 unsigned int psize = get_slice_psize(mm, addr);
449 unsigned int shift = mmu_psize_to_shift(psize);
450 unsigned long sz = ((1UL) << shift);
451 struct hstate *hstate = size_to_hstate(sz);
452 pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
453 }
454 *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
455}
456
457pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
458 pte_t *ptep)
459{
460 unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
461 return __pte(old);
462}
463
464struct page * 348struct page *
465follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 349follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
466{ 350{
467 pte_t *ptep; 351 pte_t *ptep;
468 struct page *page; 352 struct page *page;
469 unsigned int mmu_psize = get_slice_psize(mm, address); 353 unsigned shift;
354 unsigned long mask;
355
356 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
470 357
471 /* Verify it is a huge page else bail. */ 358 /* Verify it is a huge page else bail. */
472 if (!mmu_huge_psizes[mmu_psize]) 359 if (!ptep || !shift)
473 return ERR_PTR(-EINVAL); 360 return ERR_PTR(-EINVAL);
474 361
475 ptep = huge_pte_offset(mm, address); 362 mask = (1UL << shift) - 1;
476 page = pte_page(*ptep); 363 page = pte_page(*ptep);
477 if (page) { 364 if (page)
478 unsigned int shift = mmu_psize_to_shift(mmu_psize); 365 page += (address & mask) / PAGE_SIZE;
479 unsigned long sz = ((1UL) << shift);
480 page += (address % sz) / PAGE_SIZE;
481 }
482 366
483 return page; 367 return page;
484} 368}
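The rewritten follow_huge_addr() no longer consults the slice map for the page size; it gets the shift back from find_linux_pte_or_hugepte() and indexes into the compound page with (address & mask) / PAGE_SIZE. A quick standalone illustration of that arithmetic with example shift and page-size values:

#include <stdio.h>

#define EX_PAGE_SHIFT 12                        /* 4K base pages */
#define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)

int main(void)
{
        unsigned int shift = 24;                /* 16M huge page */
        unsigned long mask = (1UL << shift) - 1;
        unsigned long address = 0x10a3c123;

        /* index of the 4K sub-page within the 16M compound page */
        unsigned long idx = (address & mask) / EX_PAGE_SIZE;

        printf("address %#lx -> head page + %lu\n", address, idx);
        return 0;
}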
@@ -501,6 +385,82 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
501 return NULL; 385 return NULL;
502} 386}
503 387
388static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
389 unsigned long end, int write, struct page **pages, int *nr)
390{
391 unsigned long mask;
392 unsigned long pte_end;
393 struct page *head, *page;
394 pte_t pte;
395 int refs;
396
397 pte_end = (addr + sz) & ~(sz-1);
398 if (pte_end < end)
399 end = pte_end;
400
401 pte = *ptep;
402 mask = _PAGE_PRESENT | _PAGE_USER;
403 if (write)
404 mask |= _PAGE_RW;
405
406 if ((pte_val(pte) & mask) != mask)
407 return 0;
408
409 /* hugepages are never "special" */
410 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
411
412 refs = 0;
413 head = pte_page(pte);
414
415 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
416 do {
417 VM_BUG_ON(compound_head(page) != head);
418 pages[*nr] = page;
419 (*nr)++;
420 page++;
421 refs++;
422 } while (addr += PAGE_SIZE, addr != end);
423
424 if (!page_cache_add_speculative(head, refs)) {
425 *nr -= refs;
426 return 0;
427 }
428
429 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
430 /* Could be optimized better */
431 while (*nr) {
432 put_page(page);
433 (*nr)--;
434 }
435 }
436
437 return 1;
438}
439
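gup_hugepte() takes its references optimistically: snapshot the PTE, grab the refs, then re-read the PTE and back out if it changed underneath (page_cache_add_speculative() plus the pte_val() recheck). Below is a stripped-down userspace sketch of that shape using C11 atomics; none of the names are kernel symbols and the back-out path is simplified:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long fake_pte = 0x1000UL | 1;   /* "pfn" | present */
static _Atomic int refcount = 1;

static int grab_refs(int refs)
{
        unsigned long snap = atomic_load(&fake_pte);    /* 1. snapshot */

        if (!(snap & 1))                                /* not present: bail */
                return 0;

        atomic_fetch_add(&refcount, refs);              /* 2. take references */

        if (atomic_load(&fake_pte) != snap) {           /* 3. re-check */
                atomic_fetch_sub(&refcount, refs);      /*    raced: back out */
                return 0;
        }
        return 1;                                       /* refs are now safe */
}

int main(void)
{
        printf("grab_refs: %d, refcount now %d\n",
               grab_refs(4), atomic_load(&refcount));
        return 0;
}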
440static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
441 unsigned long sz)
442{
443 unsigned long __boundary = (addr + sz) & ~(sz-1);
444 return (__boundary - 1 < end - 1) ? __boundary : end;
445}
446
447int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
448 unsigned long addr, unsigned long end,
449 int write, struct page **pages, int *nr)
450{
451 pte_t *ptep;
452 unsigned long sz = 1UL << hugepd_shift(*hugepd);
453 unsigned long next;
454
455 ptep = hugepte_offset(hugepd, addr, pdshift);
456 do {
457 next = hugepte_addr_end(addr, end, sz);
458 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
459 return 0;
460 } while (ptep++, addr = next, addr != end);
461
462 return 1;
463}
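hugepte_addr_end() picks the smaller of the next sz-aligned boundary and end, using x - 1 comparisons so that an end of 0 (the very top of the address space) still compares as the largest value. A standalone copy of the helper with a small demo loop that mirrors how gup_hugepd() steps through a range:

#include <stdio.h>

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
{
        unsigned long boundary = (addr + sz) & ~(sz - 1);
        return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
        unsigned long sz = 1UL << 24;                   /* 16M steps */
        unsigned long addr = 0x10000000, end = 0x13800000;

        do {
                unsigned long next = hugepte_addr_end(addr, end, sz);
                printf("step [%#lx, %#lx)\n", addr, next);
                addr = next;
        } while (addr != end);
        return 0;
}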
504 464
505unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 465unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
506 unsigned long len, unsigned long pgoff, 466 unsigned long len, unsigned long pgoff,
@@ -509,8 +469,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
509 struct hstate *hstate = hstate_file(file); 469 struct hstate *hstate = hstate_file(file);
510 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); 470 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
511 471
512 if (!mmu_huge_psizes[mmu_psize])
513 return -EINVAL;
514 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); 472 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
515} 473}
516 474
@@ -521,229 +479,46 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
521 return 1UL << mmu_psize_to_shift(psize); 479 return 1UL << mmu_psize_to_shift(psize);
522} 480}
523 481
524/* 482static int __init add_huge_page_size(unsigned long long size)
525 * Called by asm hashtable.S for doing lazy icache flush
526 */
527static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
528 pte_t pte, int trap, unsigned long sz)
529{ 483{
530 struct page *page; 484 int shift = __ffs(size);
531 int i; 485 int mmu_psize;
532
533 if (!pfn_valid(pte_pfn(pte)))
534 return rflags;
535
536 page = pte_page(pte);
537
538 /* page is dirty */
539 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
540 if (trap == 0x400) {
541 for (i = 0; i < (sz / PAGE_SIZE); i++)
542 __flush_dcache_icache(page_address(page+i));
543 set_bit(PG_arch_1, &page->flags);
544 } else {
545 rflags |= HPTE_R_N;
546 }
547 }
548 return rflags;
549}
550 486
551int hash_huge_page(struct mm_struct *mm, unsigned long access, 487 /* Check that it is a page size supported by the hardware and
552 unsigned long ea, unsigned long vsid, int local, 488 * that it fits within pagetable and slice limits. */
553 unsigned long trap) 489 if (!is_power_of_2(size)
554{ 490 || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
555 pte_t *ptep; 491 return -EINVAL;
556 unsigned long old_pte, new_pte;
557 unsigned long va, rflags, pa, sz;
558 long slot;
559 int err = 1;
560 int ssize = user_segment_size(ea);
561 unsigned int mmu_psize;
562 int shift;
563 mmu_psize = get_slice_psize(mm, ea);
564
565 if (!mmu_huge_psizes[mmu_psize])
566 goto out;
567 ptep = huge_pte_offset(mm, ea);
568
569 /* Search the Linux page table for a match with va */
570 va = hpt_va(ea, vsid, ssize);
571 492
572 /* 493 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
573 * If no pte found or not present, send the problem up to 494 return -EINVAL;
574 * do_page_fault
575 */
576 if (unlikely(!ptep || pte_none(*ptep)))
577 goto out;
578 495
579 /* 496#ifdef CONFIG_SPU_FS_64K_LS
580 * Check the user's access rights to the page. If access should be 497 /* Disable support for 64K huge pages when 64K SPU local store
581 * prevented then send the problem up to do_page_fault. 498 * support is enabled as the current implementation conflicts.
582 */
583 if (unlikely(access & ~pte_val(*ptep)))
584 goto out;
585 /*
586 * At this point, we have a pte (old_pte) which can be used to build
587 * or update an HPTE. There are 2 cases:
588 *
589 * 1. There is a valid (present) pte with no associated HPTE (this is
590 * the most common case)
591 * 2. There is a valid (present) pte with an associated HPTE. The
592 * current values of the pp bits in the HPTE prevent access
593 * because we are doing software DIRTY bit management and the
594 * page is currently not DIRTY.
595 */ 499 */
500 if (shift == PAGE_SHIFT_64K)
501 return -EINVAL;
502#endif /* CONFIG_SPU_FS_64K_LS */
596 503
504 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
597 505
598 do { 506 /* Return if huge page size has already been setup */
599 old_pte = pte_val(*ptep); 507 if (size_to_hstate(size))
600 if (old_pte & _PAGE_BUSY) 508 return 0;
601 goto out;
602 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
603 } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
604 old_pte, new_pte));
605
606 rflags = 0x2 | (!(new_pte & _PAGE_RW));
607 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
608 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
609 shift = mmu_psize_to_shift(mmu_psize);
610 sz = ((1UL) << shift);
611 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
612 /* No CPU has hugepages but lacks no execute, so we
613 * don't need to worry about that case */
614 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
615 trap, sz);
616
617 /* Check if pte already has an hpte (case 2) */
618 if (unlikely(old_pte & _PAGE_HASHPTE)) {
619 /* There MIGHT be an HPTE for this pte */
620 unsigned long hash, slot;
621
622 hash = hpt_hash(va, shift, ssize);
623 if (old_pte & _PAGE_F_SECOND)
624 hash = ~hash;
625 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
626 slot += (old_pte & _PAGE_F_GIX) >> 12;
627
628 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
629 ssize, local) == -1)
630 old_pte &= ~_PAGE_HPTEFLAGS;
631 }
632
633 if (likely(!(old_pte & _PAGE_HASHPTE))) {
634 unsigned long hash = hpt_hash(va, shift, ssize);
635 unsigned long hpte_group;
636
637 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
638
639repeat:
640 hpte_group = ((hash & htab_hash_mask) *
641 HPTES_PER_GROUP) & ~0x7UL;
642
643 /* clear HPTE slot informations in new PTE */
644#ifdef CONFIG_PPC_64K_PAGES
645 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
646#else
647 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
648#endif
649 /* Add in WIMG bits */
650 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
651 _PAGE_COHERENT | _PAGE_GUARDED));
652
653 /* Insert into the hash table, primary slot */
654 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
655 mmu_psize, ssize);
656
657 /* Primary is full, try the secondary */
658 if (unlikely(slot == -1)) {
659 hpte_group = ((~hash & htab_hash_mask) *
660 HPTES_PER_GROUP) & ~0x7UL;
661 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
662 HPTE_V_SECONDARY,
663 mmu_psize, ssize);
664 if (slot == -1) {
665 if (mftb() & 0x1)
666 hpte_group = ((hash & htab_hash_mask) *
667 HPTES_PER_GROUP)&~0x7UL;
668
669 ppc_md.hpte_remove(hpte_group);
670 goto repeat;
671 }
672 }
673
674 if (unlikely(slot == -2))
675 panic("hash_huge_page: pte_insert failed\n");
676
677 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
678 }
679
680 /*
681 * No need to use ldarx/stdcx here
682 */
683 *ptep = __pte(new_pte & ~_PAGE_BUSY);
684
685 err = 0;
686 509
687 out: 510 hugetlb_add_hstate(shift - PAGE_SHIFT);
688 return err;
689}
690 511
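The removed hash_huge_page() located its HPTE group exactly like the regular hash fault path: the primary group index is derived from the hash of the virtual address, the secondary from its complement, each scaled by HPTES_PER_GROUP. A standalone sketch of just that index arithmetic, with an invented hash value and hash-table mask:

#include <stdio.h>

#define HPTES_PER_GROUP 8

int main(void)
{
        unsigned long htab_hash_mask = (1UL << 20) - 1; /* example table size */
        unsigned long hash = 0xbeefcafeUL;              /* example hash value */

        unsigned long primary =
                ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
        unsigned long secondary =
                ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

        printf("primary group at slot %#lx, secondary at %#lx\n",
               primary, secondary);
        return 0;
}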
691static void __init set_huge_psize(int psize) 512 return 0;
692{
693 /* Check that it is a page size supported by the hardware and
694 * that it fits within pagetable limits. */
695 if (mmu_psize_defs[psize].shift &&
696 mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
697 (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
698 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
699 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
700 /* Return if huge page size has already been setup or is the
701 * same as the base page size. */
702 if (mmu_huge_psizes[psize] ||
703 mmu_psize_defs[psize].shift == PAGE_SHIFT)
704 return;
705 if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
706 return;
707 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
708
709 switch (mmu_psize_defs[psize].shift) {
710 case PAGE_SHIFT_64K:
711 /* We only allow 64k hpages with 4k base page,
712 * which was checked above, and always put them
713 * at the PMD */
714 hugepte_shift[psize] = PMD_SHIFT;
715 break;
716 case PAGE_SHIFT_16M:
717 /* 16M pages can be at two different levels
718 * of pagestables based on base page size */
719 if (PAGE_SHIFT == PAGE_SHIFT_64K)
720 hugepte_shift[psize] = PMD_SHIFT;
721 else /* 4k base page */
722 hugepte_shift[psize] = PUD_SHIFT;
723 break;
724 case PAGE_SHIFT_16G:
725 /* 16G pages are always at PGD level */
726 hugepte_shift[psize] = PGDIR_SHIFT;
727 break;
728 }
729 hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
730 } else
731 hugepte_shift[psize] = 0;
732} 513}
733 514
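add_huge_page_size() boils down to deriving the shift with __ffs() and rejecting any size that is not a power of two or falls outside what the slices and pagetables can describe. A userspace sketch of that validation; the page-shift and upper-limit values are illustrative stand-ins, not the kernel's constants:

#include <stdio.h>

#define EX_PAGE_SHIFT   12      /* 4K base pages (illustrative) */
#define EX_MAX_SHIFT    34      /* stand-in for SLICE_HIGH_SHIFT */

/* Returns the page shift for a usable size, 0 if the size is rejected. */
static int huge_size_ok(unsigned long long size)
{
        int shift;

        if (!size || (size & (size - 1)))       /* must be a power of two */
                return 0;
        shift = __builtin_ctzll(size);          /* plays the role of __ffs() */
        if (shift <= EX_PAGE_SHIFT || shift > EX_MAX_SHIFT)
                return 0;
        return shift;
}

int main(void)
{
        printf("16M -> shift %d\n", huge_size_ok(16ULL << 20));  /* 24 */
        printf("3M  -> shift %d\n", huge_size_ok(3ULL << 20));   /* 0: rejected */
        return 0;
}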
734static int __init hugepage_setup_sz(char *str) 515static int __init hugepage_setup_sz(char *str)
735{ 516{
736 unsigned long long size; 517 unsigned long long size;
737 int mmu_psize;
738 int shift;
739 518
740 size = memparse(str, &str); 519 size = memparse(str, &str);
741 520
742 shift = __ffs(size); 521 if (add_huge_page_size(size) != 0)
743 mmu_psize = shift_to_mmu_psize(shift);
744 if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
745 set_huge_psize(mmu_psize);
746 else
747 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); 522 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
748 523
749 return 1; 524 return 1;
@@ -752,41 +527,55 @@ __setup("hugepagesz=", hugepage_setup_sz);
752 527
753static int __init hugetlbpage_init(void) 528static int __init hugetlbpage_init(void)
754{ 529{
755 unsigned int psize; 530 int psize;
756 531
757 if (!cpu_has_feature(CPU_FTR_16M_PAGE)) 532 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
758 return -ENODEV; 533 return -ENODEV;
759 534
760 /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE 535 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
761 * and adjust PTE_NONCACHE_NUM if the number of supported huge page 536 unsigned shift;
762 * sizes changes. 537 unsigned pdshift;
763 */
764 set_huge_psize(MMU_PAGE_16M);
765 set_huge_psize(MMU_PAGE_16G);
766 538
767 /* Temporarily disable support for 64K huge pages when 64K SPU local 539 if (!mmu_psize_defs[psize].shift)
768 * store support is enabled as the current implementation conflicts. 540 continue;
769 */
770#ifndef CONFIG_SPU_FS_64K_LS
771 set_huge_psize(MMU_PAGE_64K);
772#endif
773 541
774 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 542 shift = mmu_psize_to_shift(psize);
775 if (mmu_huge_psizes[psize]) { 543
776 pgtable_cache[HUGE_PGTABLE_INDEX(psize)] = 544 if (add_huge_page_size(1ULL << shift) < 0)
777 kmem_cache_create( 545 continue;
778 HUGEPTE_CACHE_NAME(psize), 546
779 HUGEPTE_TABLE_SIZE(psize), 547 if (shift < PMD_SHIFT)
780 HUGEPTE_TABLE_SIZE(psize), 548 pdshift = PMD_SHIFT;
781 0, 549 else if (shift < PUD_SHIFT)
782 NULL); 550 pdshift = PUD_SHIFT;
783 if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)]) 551 else
784 panic("hugetlbpage_init(): could not create %s"\ 552 pdshift = PGDIR_SHIFT;
785 "\n", HUGEPTE_CACHE_NAME(psize)); 553
786 } 554 pgtable_cache_add(pdshift - shift, NULL);
555 if (!PGT_CACHE(pdshift - shift))
556 panic("hugetlbpage_init(): could not create "
557 "pgtable cache for %d bit pagesize\n", shift);
787 } 558 }
788 559
560 /* Set default large page size. Currently, we pick 16M or 1M
561 * depending on what is available
562 */
563 if (mmu_psize_defs[MMU_PAGE_16M].shift)
564 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
565 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
566 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
567
789 return 0; 568 return 0;
790} 569}
791 570
792module_init(hugetlbpage_init); 571module_init(hugetlbpage_init);
572
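hugetlbpage_init() now places every supported huge page size at the lowest regular pagetable level large enough to hold it, and then needs a cache for hugepd tables of 2^(pdshift - shift) pointers. A sketch of that placement with illustrative level shifts (not the kernel's exact PMD/PUD/PGDIR values):

#include <stdio.h>

#define EX_PMD_SHIFT    24      /* illustrative, not the kernel's value */
#define EX_PUD_SHIFT    33
#define EX_PGDIR_SHIFT  37

int main(void)
{
        unsigned int shifts[] = { 16, 24, 34 };  /* e.g. 64K, 16M, 16G pages */

        for (int i = 0; i < 3; i++) {
                unsigned int shift = shifts[i], pdshift;

                if (shift < EX_PMD_SHIFT)
                        pdshift = EX_PMD_SHIFT;
                else if (shift < EX_PUD_SHIFT)
                        pdshift = EX_PUD_SHIFT;
                else
                        pdshift = EX_PGDIR_SHIFT;

                printf("page shift %2u -> level shift %u, %lu-entry hugepd\n",
                       shift, pdshift, 1UL << (pdshift - shift));
        }
        return 0;
}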
573void flush_dcache_icache_hugepage(struct page *page)
574{
575 int i;
576
577 BUG_ON(!PageCompound(page));
578
579 for (i = 0; i < (1UL << compound_order(page)); i++)
580 __flush_dcache_icache(page_address(page+i));
581}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 9ddcfb4dc139..767333005eb4 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -31,6 +31,7 @@
31#include <linux/initrd.h> 31#include <linux/initrd.h>
32#include <linux/pagemap.h> 32#include <linux/pagemap.h>
33#include <linux/lmb.h> 33#include <linux/lmb.h>
34#include <linux/gfp.h>
34 35
35#include <asm/pgalloc.h> 36#include <asm/pgalloc.h>
36#include <asm/prom.h> 37#include <asm/prom.h>
@@ -47,7 +48,7 @@
47#include "mmu_decl.h" 48#include "mmu_decl.h"
48 49
49#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) 50#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
50/* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */ 51/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
51#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET)) 52#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
52#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL" 53#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
53#endif 54#endif
@@ -82,6 +83,11 @@ extern struct task_struct *current_set[NR_CPUS];
82int __map_without_bats; 83int __map_without_bats;
83int __map_without_ltlbs; 84int __map_without_ltlbs;
84 85
86/*
87 * This tells the system to allow ioremapping memory marked as reserved.
88 */
89int __allow_ioremap_reserved;
90
85/* max amount of low RAM to map in */ 91/* max amount of low RAM to map in */
86unsigned long __max_low_memory = MAX_LOW_MEM; 92unsigned long __max_low_memory = MAX_LOW_MEM;
87 93
@@ -131,9 +137,13 @@ void __init MMU_init(void)
131 MMU_setup(); 137 MMU_setup();
132 138
133 if (lmb.memory.cnt > 1) { 139 if (lmb.memory.cnt > 1) {
140#ifndef CONFIG_WII
134 lmb.memory.cnt = 1; 141 lmb.memory.cnt = 1;
135 lmb_analyze(); 142 lmb_analyze();
136 printk(KERN_WARNING "Only using first contiguous memory region"); 143 printk(KERN_WARNING "Only using first contiguous memory region");
144#else
145 wii_memory_fixups();
146#endif
137 } 147 }
138 148
139 total_lowmem = total_memory = lmb_end_of_DRAM() - memstart_addr; 149 total_lowmem = total_memory = lmb_end_of_DRAM() - memstart_addr;
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 335c578b9cc3..d7fa50b09b4a 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -41,6 +41,8 @@
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/poison.h> 42#include <linux/poison.h>
43#include <linux/lmb.h> 43#include <linux/lmb.h>
44#include <linux/hugetlb.h>
45#include <linux/slab.h>
44 46
45#include <asm/pgalloc.h> 47#include <asm/pgalloc.h>
46#include <asm/page.h> 48#include <asm/page.h>
@@ -119,30 +121,63 @@ static void pmd_ctor(void *addr)
119 memset(addr, 0, PMD_TABLE_SIZE); 121 memset(addr, 0, PMD_TABLE_SIZE);
120} 122}
121 123
122static const unsigned int pgtable_cache_size[2] = { 124struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
123 PGD_TABLE_SIZE, PMD_TABLE_SIZE 125
124}; 126/*
125static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { 127 * Create a kmem_cache() for pagetables. This is not used for PTE
126#ifdef CONFIG_PPC_64K_PAGES 128 * pages - they're linked to struct page, come from the normal free
127 "pgd_cache", "pmd_cache", 129 * pages pool and have a different entry size (see real_pte_t) to
128#else 130 * everything else. Caches created by this function are used for all
129 "pgd_cache", "pud_pmd_cache", 131 * the higher level pagetables, and for hugepage pagetables.
130#endif /* CONFIG_PPC_64K_PAGES */ 132 */
131}; 133void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
132 134{
133#ifdef CONFIG_HUGETLB_PAGE 135 char *name;
134/* Hugepages need an extra cache per hugepagesize, initialized in 136 unsigned long table_size = sizeof(void *) << shift;
135 * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT 137 unsigned long align = table_size;
136 * is not compile time constant. */ 138
137struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; 139 /* When batching pgtable pointers for RCU freeing, we store
138#else 140 * the index size in the low bits. Table alignment must be
139struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; 141 * big enough to fit it.
140#endif 142 *
143 * Likewise, hugepage pagetable pointers contain a (different)
144 * shift value in the low bits. All tables must be aligned so
145 * as to leave enough 0 bits in the address to contain it. */
146 unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
147 HUGEPD_SHIFT_MASK + 1);
148 struct kmem_cache *new;
149
150 /* It would be nice if this was a BUILD_BUG_ON(), but at the
151 * moment, gcc doesn't seem to recognize is_power_of_2 as a
152 * constant expression, so so much for that. */
153 BUG_ON(!is_power_of_2(minalign));
154 BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
155
156 if (PGT_CACHE(shift))
157 return; /* Already have a cache of this size */
158
159 align = max_t(unsigned long, align, minalign);
160 name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
161 new = kmem_cache_create(name, table_size, align, 0, ctor);
162 PGT_CACHE(shift) = new;
163
164 pr_debug("Allocated pgtable cache for order %d\n", shift);
165}
166
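pgtable_cache_add() sizes each cache as sizeof(void *) << shift and raises the alignment to a minimum large enough that the low bits of every table address stay zero and can later carry a tag (see pgtable_free_tlb() further down). A worked example with invented constants standing in for MAX_PGTABLE_INDEX_SIZE and HUGEPD_SHIFT_MASK:

#include <stdio.h>

#define EX_MAX_INDEX_SIZE       0xf     /* stand-in for MAX_PGTABLE_INDEX_SIZE */
#define EX_HUGEPD_SHIFT_MASK    0x3f    /* stand-in for HUGEPD_SHIFT_MASK */

int main(void)
{
        unsigned long minalign = (EX_HUGEPD_SHIFT_MASK + 1 > EX_MAX_INDEX_SIZE + 1)
                                 ? EX_HUGEPD_SHIFT_MASK + 1 : EX_MAX_INDEX_SIZE + 1;

        for (unsigned int shift = 4; shift <= 12; shift += 4) {
                unsigned long size = sizeof(void *) << shift;
                unsigned long align = size > minalign ? size : minalign;

                printf("shift %2u: table %5lu bytes, aligned to %lu\n",
                       shift, size, align);
        }
        return 0;
}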
141 167
142void pgtable_cache_init(void) 168void pgtable_cache_init(void)
143{ 169{
144 pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); 170 pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
145 pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); 171 pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
172 if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
173 panic("Couldn't allocate pgtable caches");
174
175 /* In all current configs, when the PUD index exists it's the
176 * same size as either the pgd or pmd index. Verify that the
177 * initialization above has also created a PUD cache. This
178 * will need re-examination if we add new possibilities for
179 * the pagetable layout. */
180 BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
146} 181}
147 182
148#ifdef CONFIG_SPARSEMEM_VMEMMAP 183#ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 59736317bf0e..0f594d774bf7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -22,6 +22,7 @@
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/gfp.h>
25#include <linux/types.h> 26#include <linux/types.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/stddef.h> 28#include <linux/stddef.h>
@@ -32,6 +33,7 @@
32#include <linux/pagemap.h> 33#include <linux/pagemap.h>
33#include <linux/suspend.h> 34#include <linux/suspend.h>
34#include <linux/lmb.h> 35#include <linux/lmb.h>
36#include <linux/hugetlb.h>
35 37
36#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
37#include <asm/prom.h> 39#include <asm/prom.h>
@@ -47,6 +49,7 @@
47#include <asm/sparsemem.h> 49#include <asm/sparsemem.h>
48#include <asm/vdso.h> 50#include <asm/vdso.h>
49#include <asm/fixmap.h> 51#include <asm/fixmap.h>
52#include <asm/swiotlb.h>
50 53
51#include "mmu_decl.h" 54#include "mmu_decl.h"
52 55
@@ -319,6 +322,11 @@ void __init mem_init(void)
319 struct page *page; 322 struct page *page;
320 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; 323 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
321 324
325#ifdef CONFIG_SWIOTLB
326 if (ppc_swiotlb_enable)
327 swiotlb_init(1);
328#endif
329
322 num_physpages = lmb.memory.size >> PAGE_SHIFT; 330 num_physpages = lmb.memory.size >> PAGE_SHIFT;
323 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); 331 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
324 332
@@ -417,18 +425,26 @@ EXPORT_SYMBOL(flush_dcache_page);
417 425
418void flush_dcache_icache_page(struct page *page) 426void flush_dcache_icache_page(struct page *page)
419{ 427{
428#ifdef CONFIG_HUGETLB_PAGE
429 if (PageCompound(page)) {
430 flush_dcache_icache_hugepage(page);
431 return;
432 }
433#endif
420#ifdef CONFIG_BOOKE 434#ifdef CONFIG_BOOKE
421 void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); 435 {
422 __flush_dcache_icache(start); 436 void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
423 kunmap_atomic(start, KM_PPC_SYNC_ICACHE); 437 __flush_dcache_icache(start);
438 kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
439 }
424#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) 440#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
425 /* On 8xx there is no need to kmap since highmem is not supported */ 441 /* On 8xx there is no need to kmap since highmem is not supported */
426 __flush_dcache_icache(page_address(page)); 442 __flush_dcache_icache(page_address(page));
427#else 443#else
428 __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); 444 __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
429#endif 445#endif
430
431} 446}
447
432void clear_user_page(void *page, unsigned long vaddr, struct page *pg) 448void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
433{ 449{
434 clear_page(page); 450 clear_page(page);
@@ -485,13 +501,13 @@ EXPORT_SYMBOL(flush_icache_user_range);
485 * This must always be called with the pte lock held. 501 * This must always be called with the pte lock held.
486 */ 502 */
487void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, 503void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
488 pte_t pte) 504 pte_t *ptep)
489{ 505{
490#ifdef CONFIG_PPC_STD_MMU 506#ifdef CONFIG_PPC_STD_MMU
491 unsigned long access = 0, trap; 507 unsigned long access = 0, trap;
492 508
493 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ 509 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
494 if (!pte_young(pte) || address >= TASK_SIZE) 510 if (!pte_young(*ptep) || address >= TASK_SIZE)
495 return; 511 return;
496 512
497 /* We try to figure out if we are coming from an instruction 513 /* We try to figure out if we are coming from an instruction
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c
index 0d957a4c70fe..5a783d8e8e8e 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap_64.c
@@ -47,7 +47,7 @@ static inline int mmap_is_legacy(void)
47 if (current->personality & ADDR_COMPAT_LAYOUT) 47 if (current->personality & ADDR_COMPAT_LAYOUT)
48 return 1; 48 return 1;
49 49
50 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) 50 if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
51 return 1; 51 return 1;
52 52
53 return sysctl_legacy_va_layout; 53 return sysctl_legacy_va_layout;
@@ -77,7 +77,7 @@ static unsigned long mmap_rnd(void)
77 77
78static inline unsigned long mmap_base(void) 78static inline unsigned long mmap_base(void)
79{ 79{
80 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; 80 unsigned long gap = rlimit(RLIMIT_STACK);
81 81
82 if (gap < MIN_GAP) 82 if (gap < MIN_GAP)
83 gap = MIN_GAP; 83 gap = MIN_GAP;
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index dbeb86ac90cd..2535828aa84b 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -18,11 +18,13 @@
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/module.h>
22#include <linux/gfp.h>
21 23
22#include <asm/mmu_context.h> 24#include <asm/mmu_context.h>
23 25
24static DEFINE_SPINLOCK(mmu_context_lock); 26static DEFINE_SPINLOCK(mmu_context_lock);
25static DEFINE_IDR(mmu_context_idr); 27static DEFINE_IDA(mmu_context_ida);
26 28
27/* 29/*
28 * The proto-VSID space has 2^35 - 1 segments available for user mappings. 30 * The proto-VSID space has 2^35 - 1 segments available for user mappings.
@@ -32,17 +34,17 @@ static DEFINE_IDR(mmu_context_idr);
32#define NO_CONTEXT 0 34#define NO_CONTEXT 0
33#define MAX_CONTEXT ((1UL << 19) - 1) 35#define MAX_CONTEXT ((1UL << 19) - 1)
34 36
35int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 37int __init_new_context(void)
36{ 38{
37 int index; 39 int index;
38 int err; 40 int err;
39 41
40again: 42again:
41 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) 43 if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
42 return -ENOMEM; 44 return -ENOMEM;
43 45
44 spin_lock(&mmu_context_lock); 46 spin_lock(&mmu_context_lock);
45 err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); 47 err = ida_get_new_above(&mmu_context_ida, 1, &index);
46 spin_unlock(&mmu_context_lock); 48 spin_unlock(&mmu_context_lock);
47 49
48 if (err == -EAGAIN) 50 if (err == -EAGAIN)
@@ -52,27 +54,46 @@ again:
52 54
53 if (index > MAX_CONTEXT) { 55 if (index > MAX_CONTEXT) {
54 spin_lock(&mmu_context_lock); 56 spin_lock(&mmu_context_lock);
55 idr_remove(&mmu_context_idr, index); 57 ida_remove(&mmu_context_ida, index);
56 spin_unlock(&mmu_context_lock); 58 spin_unlock(&mmu_context_lock);
57 return -ENOMEM; 59 return -ENOMEM;
58 } 60 }
59 61
62 return index;
63}
64EXPORT_SYMBOL_GPL(__init_new_context);
65
66int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
67{
68 int index;
69
70 index = __init_new_context();
71 if (index < 0)
72 return index;
73
60 /* The old code would re-promote on fork, we don't do that 74 /* The old code would re-promote on fork, we don't do that
61 * when using slices as it could cause problem promoting slices 75 * when using slices as it could cause problem promoting slices
62 * that have been forced down to 4K 76 * that have been forced down to 4K
63 */ 77 */
64 if (slice_mm_new_context(mm)) 78 if (slice_mm_new_context(mm))
65 slice_set_user_psize(mm, mmu_virtual_psize); 79 slice_set_user_psize(mm, mmu_virtual_psize);
80 subpage_prot_init_new_context(mm);
66 mm->context.id = index; 81 mm->context.id = index;
67 82
68 return 0; 83 return 0;
69} 84}
70 85
71void destroy_context(struct mm_struct *mm) 86void __destroy_context(int context_id)
72{ 87{
73 spin_lock(&mmu_context_lock); 88 spin_lock(&mmu_context_lock);
74 idr_remove(&mmu_context_idr, mm->context.id); 89 ida_remove(&mmu_context_ida, context_id);
75 spin_unlock(&mmu_context_lock); 90 spin_unlock(&mmu_context_lock);
91}
92EXPORT_SYMBOL_GPL(__destroy_context);
76 93
94void destroy_context(struct mm_struct *mm)
95{
96 __destroy_context(mm->context.id);
97 subpage_prot_free(mm);
77 mm->context.id = NO_CONTEXT; 98 mm->context.id = NO_CONTEXT;
78} 99}
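__init_new_context() is essentially "hand out the lowest free id greater than 0, fail past MAX_CONTEXT", now split out and exported for use outside this file. A toy userspace equivalent using a plain array instead of the ida allocator; every name here is illustrative:

#include <stdio.h>

#define TOY_MAX_CONTEXT 512

static unsigned char used[TOY_MAX_CONTEXT + 1];

static int toy_new_context(void)
{
        for (int id = 1; id <= TOY_MAX_CONTEXT; id++) {
                if (!used[id]) {
                        used[id] = 1;
                        return id;
                }
        }
        return -1;                      /* stands in for -ENOMEM */
}

static void toy_destroy_context(int id)
{
        used[id] = 0;
}

int main(void)
{
        int a = toy_new_context(), b = toy_new_context();

        printf("contexts %d and %d\n", a, b);           /* 1 and 2 */
        toy_destroy_context(a);
        printf("reused: %d\n", toy_new_context());      /* 1 again */
        return 0;
}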
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index be4f34c30a0b..1f2d9ff09895 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -47,6 +47,7 @@
47#include <linux/bootmem.h> 47#include <linux/bootmem.h>
48#include <linux/notifier.h> 48#include <linux/notifier.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/slab.h>
50 51
51#include <asm/mmu_context.h> 52#include <asm/mmu_context.h>
52#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
@@ -56,7 +57,7 @@ static unsigned int next_context, nr_free_contexts;
56static unsigned long *context_map; 57static unsigned long *context_map;
57static unsigned long *stale_map[NR_CPUS]; 58static unsigned long *stale_map[NR_CPUS];
58static struct mm_struct **context_mm; 59static struct mm_struct **context_mm;
59static DEFINE_SPINLOCK(context_lock); 60static DEFINE_RAW_SPINLOCK(context_lock);
60 61
61#define CTX_MAP_SIZE \ 62#define CTX_MAP_SIZE \
62 (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1)) 63 (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
@@ -121,9 +122,9 @@ static unsigned int steal_context_smp(unsigned int id)
121 /* This will happen if you have more CPUs than available contexts, 122 /* This will happen if you have more CPUs than available contexts,
122 * all we can do here is wait a bit and try again 123 * all we can do here is wait a bit and try again
123 */ 124 */
124 spin_unlock(&context_lock); 125 raw_spin_unlock(&context_lock);
125 cpu_relax(); 126 cpu_relax();
126 spin_lock(&context_lock); 127 raw_spin_lock(&context_lock);
127 128
128 /* This will cause the caller to try again */ 129 /* This will cause the caller to try again */
129 return MMU_NO_CONTEXT; 130 return MMU_NO_CONTEXT;
@@ -194,7 +195,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
194 unsigned long *map; 195 unsigned long *map;
195 196
196 /* No lockless fast path .. yet */ 197 /* No lockless fast path .. yet */
197 spin_lock(&context_lock); 198 raw_spin_lock(&context_lock);
198 199
199 pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", 200 pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
200 cpu, next, next->context.active, next->context.id); 201 cpu, next, next->context.active, next->context.id);
@@ -278,7 +279,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
278 /* Flick the MMU and release lock */ 279 /* Flick the MMU and release lock */
279 pr_hardcont(" -> %d\n", id); 280 pr_hardcont(" -> %d\n", id);
280 set_context(id, next->pgd); 281 set_context(id, next->pgd);
281 spin_unlock(&context_lock); 282 raw_spin_unlock(&context_lock);
282} 283}
283 284
284/* 285/*
@@ -307,7 +308,7 @@ void destroy_context(struct mm_struct *mm)
307 308
308 WARN_ON(mm->context.active != 0); 309 WARN_ON(mm->context.active != 0);
309 310
310 spin_lock_irqsave(&context_lock, flags); 311 raw_spin_lock_irqsave(&context_lock, flags);
311 id = mm->context.id; 312 id = mm->context.id;
312 if (id != MMU_NO_CONTEXT) { 313 if (id != MMU_NO_CONTEXT) {
313 __clear_bit(id, context_map); 314 __clear_bit(id, context_map);
@@ -318,7 +319,7 @@ void destroy_context(struct mm_struct *mm)
318 context_mm[id] = NULL; 319 context_mm[id] = NULL;
319 nr_free_contexts++; 320 nr_free_contexts++;
320 } 321 }
321 spin_unlock_irqrestore(&context_lock, flags); 322 raw_spin_unlock_irqrestore(&context_lock, flags);
322} 323}
323 324
324#ifdef CONFIG_SMP 325#ifdef CONFIG_SMP
@@ -353,7 +354,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
353 read_lock(&tasklist_lock); 354 read_lock(&tasklist_lock);
354 for_each_process(p) { 355 for_each_process(p) {
355 if (p->mm) 356 if (p->mm)
356 cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm)); 357 cpumask_clear_cpu(cpu, mm_cpumask(p->mm));
357 } 358 }
358 read_unlock(&tasklist_lock); 359 read_unlock(&tasklist_lock);
359 break; 360 break;
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index d2e5321d5ea6..d49a77503e19 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -98,23 +98,13 @@ extern void _tlbia(void);
98 98
99#ifdef CONFIG_PPC32 99#ifdef CONFIG_PPC32
100 100
101struct tlbcam {
102 u32 MAS0;
103 u32 MAS1;
104 u32 MAS2;
105 u32 MAS3;
106 u32 MAS7;
107};
108
109extern void mapin_ram(void); 101extern void mapin_ram(void);
110extern int map_page(unsigned long va, phys_addr_t pa, int flags); 102extern int map_page(unsigned long va, phys_addr_t pa, int flags);
111extern void setbat(int index, unsigned long virt, phys_addr_t phys, 103extern void setbat(int index, unsigned long virt, phys_addr_t phys,
112 unsigned int size, int flags); 104 unsigned int size, int flags);
113extern void settlbcam(int index, unsigned long virt, phys_addr_t phys,
114 unsigned int size, int flags, unsigned int pid);
115extern void invalidate_tlbcam_entry(int index);
116 105
117extern int __map_without_bats; 106extern int __map_without_bats;
107extern int __allow_ioremap_reserved;
118extern unsigned long ioremap_base; 108extern unsigned long ioremap_base;
119extern unsigned int rtas_data, rtas_size; 109extern unsigned int rtas_data, rtas_size;
120 110
@@ -136,24 +126,32 @@ extern phys_addr_t total_lowmem;
136extern phys_addr_t memstart_addr; 126extern phys_addr_t memstart_addr;
137extern phys_addr_t lowmem_end_addr; 127extern phys_addr_t lowmem_end_addr;
138 128
129#ifdef CONFIG_WII
130extern unsigned long wii_hole_start;
131extern unsigned long wii_hole_size;
132
133extern unsigned long wii_mmu_mapin_mem2(unsigned long top);
134extern void wii_memory_fixups(void);
135#endif
136
139/* ...and now those things that may be slightly different between processor 137/* ...and now those things that may be slightly different between processor
140 * architectures. -- Dan 138 * architectures. -- Dan
141 */ 139 */
142#if defined(CONFIG_8xx) 140#if defined(CONFIG_8xx)
143#define MMU_init_hw() do { } while(0) 141#define MMU_init_hw() do { } while(0)
144#define mmu_mapin_ram() (0UL) 142#define mmu_mapin_ram(top) (0UL)
145 143
146#elif defined(CONFIG_4xx) 144#elif defined(CONFIG_4xx)
147extern void MMU_init_hw(void); 145extern void MMU_init_hw(void);
148extern unsigned long mmu_mapin_ram(void); 146extern unsigned long mmu_mapin_ram(unsigned long top);
149 147
150#elif defined(CONFIG_FSL_BOOKE) 148#elif defined(CONFIG_FSL_BOOKE)
151extern void MMU_init_hw(void); 149extern void MMU_init_hw(void);
152extern unsigned long mmu_mapin_ram(void); 150extern unsigned long mmu_mapin_ram(unsigned long top);
153extern void adjust_total_lowmem(void); 151extern void adjust_total_lowmem(void);
154 152
155#elif defined(CONFIG_PPC32) 153#elif defined(CONFIG_PPC32)
156/* anything 32-bit except 4xx or 8xx */ 154/* anything 32-bit except 4xx or 8xx */
157extern void MMU_init_hw(void); 155extern void MMU_init_hw(void);
158extern unsigned long mmu_mapin_ram(void); 156extern unsigned long mmu_mapin_ram(unsigned long top);
159#endif 157#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b037d95eeadc..eaa7633515b7 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -242,10 +242,11 @@ EXPORT_SYMBOL_GPL(of_node_to_nid);
242 */ 242 */
243static int __init find_min_common_depth(void) 243static int __init find_min_common_depth(void)
244{ 244{
245 int depth; 245 int depth, index;
246 const unsigned int *ref_points; 246 const unsigned int *ref_points;
247 struct device_node *rtas_root; 247 struct device_node *rtas_root;
248 unsigned int len; 248 unsigned int len;
249 struct device_node *options;
249 250
250 rtas_root = of_find_node_by_path("/rtas"); 251 rtas_root = of_find_node_by_path("/rtas");
251 252
@@ -258,11 +259,23 @@ static int __init find_min_common_depth(void)
258 * configuration (should be all 0's) and the second is for a normal 259 * configuration (should be all 0's) and the second is for a normal
259 * NUMA configuration. 260 * NUMA configuration.
260 */ 261 */
262 index = 1;
261 ref_points = of_get_property(rtas_root, 263 ref_points = of_get_property(rtas_root,
262 "ibm,associativity-reference-points", &len); 264 "ibm,associativity-reference-points", &len);
263 265
266 /*
267 * For type 1 affinity information we want the first field
268 */
269 options = of_find_node_by_path("/options");
270 if (options) {
271 const char *str;
272 str = of_get_property(options, "ibm,associativity-form", NULL);
273 if (str && !strcmp(str, "1"))
274 index = 0;
275 }
276
264 if ((len >= 2 * sizeof(unsigned int)) && ref_points) { 277 if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
265 depth = ref_points[1]; 278 depth = ref_points[index];
266 } else { 279 } else {
267 dbg("NUMA: ibm,associativity-reference-points not found.\n"); 280 dbg("NUMA: ibm,associativity-reference-points not found.\n");
268 depth = -1; 281 depth = -1;
@@ -451,7 +464,7 @@ static int __cpuinit numa_setup_cpu(unsigned long lcpu)
451 nid = of_node_to_nid_single(cpu); 464 nid = of_node_to_nid_single(cpu);
452 465
453 if (nid < 0 || !node_online(nid)) 466 if (nid < 0 || !node_online(nid))
454 nid = any_online_node(NODE_MASK_ALL); 467 nid = first_online_node;
455out: 468out:
456 map_cpu_to_node(lcpu, nid); 469 map_cpu_to_node(lcpu, nid);
457 470
@@ -1114,7 +1127,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
1114 int nid, found = 0; 1127 int nid, found = 0;
1115 1128
1116 if (!numa_enabled || (min_common_depth < 0)) 1129 if (!numa_enabled || (min_common_depth < 0))
1117 return any_online_node(NODE_MASK_ALL); 1130 return first_online_node;
1118 1131
1119 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1132 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1120 if (memory) { 1133 if (memory) {
@@ -1125,7 +1138,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
1125 } 1138 }
1126 1139
1127 if (nid < 0 || !node_online(nid)) 1140 if (nid < 0 || !node_online(nid))
1128 nid = any_online_node(NODE_MASK_ALL); 1141 nid = first_online_node;
1129 1142
1130 if (NODE_DATA(nid)->node_spanned_pages) 1143 if (NODE_DATA(nid)->node_spanned_pages)
1131 return nid; 1144 return nid;
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 53040931de32..ebc2f38eb381 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/gfp.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
26#include <linux/init.h> 27#include <linux/init.h>
27#include <linux/percpu.h> 28#include <linux/percpu.h>
@@ -49,12 +50,12 @@ struct pte_freelist_batch
49{ 50{
50 struct rcu_head rcu; 51 struct rcu_head rcu;
51 unsigned int index; 52 unsigned int index;
52 pgtable_free_t tables[0]; 53 unsigned long tables[0];
53}; 54};
54 55
55#define PTE_FREELIST_SIZE \ 56#define PTE_FREELIST_SIZE \
56 ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ 57 ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
57 / sizeof(pgtable_free_t)) 58 / sizeof(unsigned long))
58 59
59static void pte_free_smp_sync(void *arg) 60static void pte_free_smp_sync(void *arg)
60{ 61{
@@ -64,13 +65,13 @@ static void pte_free_smp_sync(void *arg)
64/* This is only called when we are critically out of memory 65/* This is only called when we are critically out of memory
65 * (and fail to get a page in pte_free_tlb). 66 * (and fail to get a page in pte_free_tlb).
66 */ 67 */
67static void pgtable_free_now(pgtable_free_t pgf) 68static void pgtable_free_now(void *table, unsigned shift)
68{ 69{
69 pte_freelist_forced_free++; 70 pte_freelist_forced_free++;
70 71
71 smp_call_function(pte_free_smp_sync, NULL, 1); 72 smp_call_function(pte_free_smp_sync, NULL, 1);
72 73
73 pgtable_free(pgf); 74 pgtable_free(table, shift);
74} 75}
75 76
76static void pte_free_rcu_callback(struct rcu_head *head) 77static void pte_free_rcu_callback(struct rcu_head *head)
@@ -79,8 +80,12 @@ static void pte_free_rcu_callback(struct rcu_head *head)
79 container_of(head, struct pte_freelist_batch, rcu); 80 container_of(head, struct pte_freelist_batch, rcu);
80 unsigned int i; 81 unsigned int i;
81 82
82 for (i = 0; i < batch->index; i++) 83 for (i = 0; i < batch->index; i++) {
83 pgtable_free(batch->tables[i]); 84 void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
85 unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
86
87 pgtable_free(table, shift);
88 }
84 89
85 free_page((unsigned long)batch); 90 free_page((unsigned long)batch);
86} 91}
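The freelist entries above pack a table's index size into the low bits of its address, which the alignment enforced by pgtable_cache_add() guarantees are free. A self-contained sketch of that encode/decode, using an aligned heap buffer in place of a real pagetable and 0xf standing in for MAX_PGTABLE_INDEX_SIZE:

#include <stdio.h>
#include <stdlib.h>

#define EX_INDEX_MASK 0xfUL     /* stand-in for MAX_PGTABLE_INDEX_SIZE */

int main(void)
{
        unsigned long shift = 9;                        /* 512-byte "table" */
        void *table = aligned_alloc(64, 1UL << shift);  /* alignment > mask */

        if (!table)
                return 1;

        unsigned long pgf = (unsigned long)table | shift;       /* encode */

        void *dec_table = (void *)(pgf & ~EX_INDEX_MASK);       /* decode */
        unsigned long dec_shift = pgf & EX_INDEX_MASK;

        printf("table %p shift %lu -> %p %lu\n",
               table, shift, dec_table, dec_shift);
        free(table);
        return 0;
}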
@@ -91,25 +96,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
91 call_rcu(&batch->rcu, pte_free_rcu_callback); 96 call_rcu(&batch->rcu, pte_free_rcu_callback);
92} 97}
93 98
94void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) 99void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
95{ 100{
96 /* This is safe since tlb_gather_mmu has disabled preemption */ 101 /* This is safe since tlb_gather_mmu has disabled preemption */
97 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); 102 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
103 unsigned long pgf;
98 104
99 if (atomic_read(&tlb->mm->mm_users) < 2 || 105 if (atomic_read(&tlb->mm->mm_users) < 2 ||
100 cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ 106 cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
101 pgtable_free(pgf); 107 pgtable_free(table, shift);
102 return; 108 return;
103 } 109 }
104 110
105 if (*batchp == NULL) { 111 if (*batchp == NULL) {
106 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); 112 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
107 if (*batchp == NULL) { 113 if (*batchp == NULL) {
108 pgtable_free_now(pgf); 114 pgtable_free_now(table, shift);
109 return; 115 return;
110 } 116 }
111 (*batchp)->index = 0; 117 (*batchp)->index = 0;
112 } 118 }
119 BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
120 pgf = (unsigned long)table | shift;
113 (*batchp)->tables[(*batchp)->index++] = pgf; 121 (*batchp)->tables[(*batchp)->index++] = pgf;
114 if ((*batchp)->index == PTE_FREELIST_SIZE) { 122 if ((*batchp)->index == PTE_FREELIST_SIZE) {
115 pte_free_submit(*batchp); 123 pte_free_submit(*batchp);
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index cb96cb2e17cc..b9243e7557ae 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -26,6 +26,8 @@
26#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/lmb.h>
30#include <linux/slab.h>
29 31
30#include <asm/pgtable.h> 32#include <asm/pgtable.h>
31#include <asm/pgalloc.h> 33#include <asm/pgalloc.h>
@@ -191,7 +193,8 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
191 * Don't allow anybody to remap normal RAM that we're using. 193 * Don't allow anybody to remap normal RAM that we're using.
192 * mem_init() sets high_memory so only do the check after that. 194 * mem_init() sets high_memory so only do the check after that.
193 */ 195 */
194 if (mem_init_done && (p < virt_to_phys(high_memory))) { 196 if (mem_init_done && (p < virt_to_phys(high_memory)) &&
197 !(__allow_ioremap_reserved && lmb_is_region_reserved(p, size))) {
195 printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n", 198 printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n",
196 (unsigned long long)p, __builtin_return_address(0)); 199 (unsigned long long)p, __builtin_return_address(0));
197 return NULL; 200 return NULL;
@@ -283,18 +286,18 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
283} 286}
284 287
285/* 288/*
286 * Map in a big chunk of physical memory starting at PAGE_OFFSET. 289 * Map in a chunk of physical memory starting at start.
287 */ 290 */
288void __init mapin_ram(void) 291void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
289{ 292{
290 unsigned long v, s, f; 293 unsigned long v, s, f;
291 phys_addr_t p; 294 phys_addr_t p;
292 int ktext; 295 int ktext;
293 296
294 s = mmu_mapin_ram(); 297 s = offset;
295 v = PAGE_OFFSET + s; 298 v = PAGE_OFFSET + s;
296 p = memstart_addr + s; 299 p = memstart_addr + s;
297 for (; s < total_lowmem; s += PAGE_SIZE) { 300 for (; s < top; s += PAGE_SIZE) {
298 ktext = ((char *) v >= _stext && (char *) v < etext); 301 ktext = ((char *) v >= _stext && (char *) v < etext);
299 f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL; 302 f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL;
300 map_page(v, p, f); 303 map_page(v, p, f);
@@ -307,6 +310,30 @@ void __init mapin_ram(void)
307 } 310 }
308} 311}
309 312
313void __init mapin_ram(void)
314{
315 unsigned long s, top;
316
317#ifndef CONFIG_WII
318 top = total_lowmem;
319 s = mmu_mapin_ram(top);
320 __mapin_ram_chunk(s, top);
321#else
322 if (!wii_hole_size) {
323 s = mmu_mapin_ram(total_lowmem);
324 __mapin_ram_chunk(s, total_lowmem);
325 } else {
326 top = wii_hole_start;
327 s = mmu_mapin_ram(top);
328 __mapin_ram_chunk(s, top);
329
330 top = lmb_end_of_DRAM();
331 s = wii_mmu_mapin_mem2(top);
332 __mapin_ram_chunk(s, top);
333 }
334#endif
335}
336
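The new mapin_ram() copes with the Wii's memory hole by mapping two separate chunks, one below the hole and one from the end of the hole up to the end of RAM. A simplified standalone sketch of that split; map_chunk() is an invented stub that only prints what would be mapped, and the BAT-mapped prefix returned by mmu_mapin_ram() is ignored:

#include <stdio.h>

static void map_chunk(unsigned long start, unsigned long top)
{
        printf("map [%#lx, %#lx)\n", start, top);
}

static void map_ram(unsigned long ram_top,
                    unsigned long hole_start, unsigned long hole_size)
{
        if (!hole_size) {
                map_chunk(0, ram_top);
        } else {
                map_chunk(0, hole_start);
                map_chunk(hole_start + hole_size, ram_top);
        }
}

int main(void)
{
        map_ram(0x08000000, 0x01800000, 0x00800000);    /* 128M, 8M hole */
        return 0;
}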
310/* Scan the real Linux page tables and return a PTE pointer for 337/* Scan the real Linux page tables and return a PTE pointer for
311 * a virtual address in a context. 338 * a virtual address in a context.
312 * Returns true (1) if PTE was found, zero otherwise. The pointer to 339 * Returns true (1) if PTE was found, zero otherwise. The pointer to
@@ -356,7 +383,7 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
356 return 0; 383 return 0;
357 if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) 384 if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
358 return -EINVAL; 385 return -EINVAL;
359 set_pte_at(&init_mm, address, kpte, mk_pte(page, prot)); 386 __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
360 wmb(); 387 wmb();
361#ifdef CONFIG_PPC_STD_MMU 388#ifdef CONFIG_PPC_STD_MMU
362 flush_hash_pages(0, address, pmd_val(*kpmd), 1); 389 flush_hash_pages(0, address, pmd_val(*kpmd), 1);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 853d5565eed5..d95679a5fb29 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -35,6 +35,7 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/bootmem.h> 36#include <linux/bootmem.h>
37#include <linux/lmb.h> 37#include <linux/lmb.h>
38#include <linux/slab.h>
38 39
39#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
40#include <asm/page.h> 41#include <asm/page.h>
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 2d2a87e10154..f11c2cdcb0fe 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -72,7 +72,7 @@ unsigned long p_mapped_by_bats(phys_addr_t pa)
72 return 0; 72 return 0;
73} 73}
74 74
75unsigned long __init mmu_mapin_ram(void) 75unsigned long __init mmu_mapin_ram(unsigned long top)
76{ 76{
77 unsigned long tot, bl, done; 77 unsigned long tot, bl, done;
78 unsigned long max_size = (256<<20); 78 unsigned long max_size = (256<<20);
@@ -86,7 +86,7 @@ unsigned long __init mmu_mapin_ram(void)
86 86
87 /* Make sure we don't map a block larger than the 87 /* Make sure we don't map a block larger than the
88 smallest alignment of the physical address. */ 88 smallest alignment of the physical address. */
89 tot = total_lowmem; 89 tot = top;
90 for (bl = 128<<10; bl < max_size; bl <<= 1) { 90 for (bl = 128<<10; bl < max_size; bl <<= 1) {
91 if (bl * 2 > tot) 91 if (bl * 2 > tot)
92 break; 92 break;
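The BAT sizing loop above starts at 128K and keeps doubling until it reaches the 256M hardware maximum or until doubling again would overshoot the memory to be covered (the physical-alignment check mentioned in the comment is omitted here). A standalone copy of the loop with a couple of sample sizes:

#include <stdio.h>

static unsigned long pick_bat_size(unsigned long tot)
{
        unsigned long bl, max_size = 256UL << 20;       /* 256M BAT maximum */

        for (bl = 128 << 10; bl < max_size; bl <<= 1) {
                if (bl * 2 > tot)
                        break;
        }
        return bl;
}

int main(void)
{
        printf("48M  lowmem -> %luM BAT\n", pick_bat_size(48UL << 20) >> 20);
        printf("512M lowmem -> %luM BAT\n", pick_bat_size(512UL << 20) >> 20);
        return 0;
}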
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 4cafc0c33d0a..e4f8f1fc81a5 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -10,7 +10,6 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/slab.h>
14#include <linux/types.h> 13#include <linux/types.h>
15#include <linux/mm.h> 14#include <linux/mm.h>
16#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
@@ -24,9 +23,9 @@
24 * Also makes sure that the subpage_prot_table structure is 23 * Also makes sure that the subpage_prot_table structure is
25 * reinitialized for the next user. 24 * reinitialized for the next user.
26 */ 25 */
27void subpage_prot_free(pgd_t *pgd) 26void subpage_prot_free(struct mm_struct *mm)
28{ 27{
29 struct subpage_prot_table *spt = pgd_subpage_prot(pgd); 28 struct subpage_prot_table *spt = &mm->context.spt;
30 unsigned long i, j, addr; 29 unsigned long i, j, addr;
31 u32 **p; 30 u32 **p;
32 31
@@ -51,6 +50,13 @@ void subpage_prot_free(pgd_t *pgd)
51 spt->maxaddr = 0; 50 spt->maxaddr = 0;
52} 51}
53 52
53void subpage_prot_init_new_context(struct mm_struct *mm)
54{
55 struct subpage_prot_table *spt = &mm->context.spt;
56
57 memset(spt, 0, sizeof(*spt));
58}
59
54static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, 60static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
55 int npages) 61 int npages)
56{ 62{
@@ -87,7 +93,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
87static void subpage_prot_clear(unsigned long addr, unsigned long len) 93static void subpage_prot_clear(unsigned long addr, unsigned long len)
88{ 94{
89 struct mm_struct *mm = current->mm; 95 struct mm_struct *mm = current->mm;
90 struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); 96 struct subpage_prot_table *spt = &mm->context.spt;
91 u32 **spm, *spp; 97 u32 **spm, *spp;
92 int i, nw; 98 int i, nw;
93 unsigned long next, limit; 99 unsigned long next, limit;
@@ -136,7 +142,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
136long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) 142long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
137{ 143{
138 struct mm_struct *mm = current->mm; 144 struct mm_struct *mm = current->mm;
139 struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); 145 struct subpage_prot_table *spt = &mm->context.spt;
140 u32 **spm, *spp; 146 u32 **spm, *spp;
141 int i, nw; 147 int i, nw;
142 unsigned long next, limit; 148 unsigned long next, limit;
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 2b2f35f6985e..1ec06576f619 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
53 53
54 i = batch->index; 54 i = batch->index;
55 55
56 /* We mask the address for the base page size. Huge pages will
57 * have applied their own masking already
58 */
59 addr &= PAGE_MASK;
60
61 /* Get page size (maybe move back to caller). 56 /* Get page size (maybe move back to caller).
62 * 57 *
63 * NOTE: when using special 64K mappings in 4K environment like 58 * NOTE: when using special 64K mappings in 4K environment like
@@ -68,12 +63,21 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
68 if (huge) { 63 if (huge) {
69#ifdef CONFIG_HUGETLB_PAGE 64#ifdef CONFIG_HUGETLB_PAGE
70 psize = get_slice_psize(mm, addr); 65 psize = get_slice_psize(mm, addr);
66 /* Mask the address for the correct page size */
67 addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
71#else 68#else
72 BUG(); 69 BUG();
73 psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ 70 psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
74#endif 71#endif
75 } else 72 } else {
76 psize = pte_pagesize_index(mm, addr, pte); 73 psize = pte_pagesize_index(mm, addr, pte);
74 /* Mask the address for the standard page size. If we
75 * have a 64k page kernel, but the hardware does not
76 * support 64k pages, this might be different from the
77 * hardware page size encoded in the slice table. */
78 addr &= PAGE_MASK;
79 }
80
77 81
78 /* Build full vaddr */ 82 /* Build full vaddr */
79 if (!is_kernel_addr(addr)) { 83 if (!is_kernel_addr(addr)) {
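The hunk above stops masking huge-page addresses with the fixed PAGE_MASK and instead builds the mask from the page size actually backing the mapping. The arithmetic is simply addr & ~((1UL << shift) - 1); a few example shifts:

#include <stdio.h>

int main(void)
{
        unsigned long addr = 0x10a3c123;
        unsigned int shifts[] = { 12, 16, 24 };         /* 4K, 64K, 16M */

        for (int i = 0; i < 3; i++)
                printf("shift %2u: %#lx -> %#lx\n", shifts[i], addr,
                       addr & ~((1UL << shifts[i]) - 1));
        return 0;
}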
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index f288279e679d..8b04c54e596f 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -1,5 +1,5 @@
1/* 1/*
2 * Low leve TLB miss handlers for Book3E 2 * Low level TLB miss handlers for Book3E
3 * 3 *
4 * Copyright (C) 2008-2009 4 * Copyright (C) 2008-2009
5 * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. 5 * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 2fbc680c2c71..e81d5d67f834 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -150,7 +150,7 @@ EXPORT_SYMBOL(local_flush_tlb_page);
150 */ 150 */
151#ifdef CONFIG_SMP 151#ifdef CONFIG_SMP
152 152
153static DEFINE_SPINLOCK(tlbivax_lock); 153static DEFINE_RAW_SPINLOCK(tlbivax_lock);
154 154
155static int mm_is_core_local(struct mm_struct *mm) 155static int mm_is_core_local(struct mm_struct *mm)
156{ 156{
@@ -232,10 +232,10 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
232 if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { 232 if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
233 int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); 233 int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
234 if (lock) 234 if (lock)
235 spin_lock(&tlbivax_lock); 235 raw_spin_lock(&tlbivax_lock);
236 _tlbivax_bcast(vmaddr, pid, tsize, ind); 236 _tlbivax_bcast(vmaddr, pid, tsize, ind);
237 if (lock) 237 if (lock)
238 spin_unlock(&tlbivax_lock); 238 raw_spin_unlock(&tlbivax_lock);
239 goto bail; 239 goto bail;
240 } else { 240 } else {
241 struct tlb_flush_param p = { 241 struct tlb_flush_param p = {