Diffstat (limited to 'arch/ppc64/mm')
-rw-r--r--   arch/ppc64/mm/hash_low.S     |   4
-rw-r--r--   arch/ppc64/mm/hash_native.c  |   3
-rw-r--r--   arch/ppc64/mm/hash_utils.c   |   4
-rw-r--r--   arch/ppc64/mm/hugetlbpage.c  | 388
-rw-r--r--   arch/ppc64/mm/imalloc.c      |   2
-rw-r--r--   arch/ppc64/mm/init.c         |  96
-rw-r--r--   arch/ppc64/mm/numa.c         |   2
-rw-r--r--   arch/ppc64/mm/slb_low.S      |  35
-rw-r--r--   arch/ppc64/mm/tlb.c          |  95
9 files changed, 343 insertions(+), 286 deletions(-)
diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S
index fbff24827ae7..35eb49e1b890 100644
--- a/arch/ppc64/mm/hash_low.S
+++ b/arch/ppc64/mm/hash_low.S
@@ -129,12 +129,10 @@ _GLOBAL(__hash_page)
	 * code rather than call a C function...)
	 */
 BEGIN_FTR_SECTION
-BEGIN_FTR_SECTION
 	mr	r4,r30
 	mr	r5,r7
 	bl	.hash_page_do_lazy_icache
-END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE)
-END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE)
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 
 	/* At this point, r3 contains new PP bits, save them in
 	 * place of "access" in the param area (sic)
diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c
index a6abd3a979bf..7626bb59954d 100644
--- a/arch/ppc64/mm/hash_native.c
+++ b/arch/ppc64/mm/hash_native.c
@@ -51,7 +51,6 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va,
 			unsigned long prpn, unsigned long vflags,
 			unsigned long rflags)
 {
-	unsigned long arpn = physRpn_to_absRpn(prpn);
 	hpte_t *hptep = htab_address + hpte_group;
 	unsigned long hpte_v, hpte_r;
 	int i;
@@ -74,7 +73,7 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va,
 	hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
 	if (vflags & HPTE_V_LARGE)
 		va &= ~(1UL << HPTE_V_AVPN_SHIFT);
-	hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags;
+	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
 
 	hptep->r = hpte_r;
 	/* Guarantee the second dword is visible before the valid bit */
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
index 623b5d130c31..09475c8edf7c 100644
--- a/arch/ppc64/mm/hash_utils.c
+++ b/arch/ppc64/mm/hash_utils.c
@@ -210,7 +210,7 @@ void __init htab_initialize(void)
 
 	/* create bolted the linear mapping in the hash table */
 	for (i=0; i < lmb.memory.cnt; i++) {
-		base = lmb.memory.region[i].physbase + KERNELBASE;
+		base = lmb.memory.region[i].base + KERNELBASE;
 		size = lmb.memory.region[i].size;
 
 		DBG("creating mapping for region: %lx : %lx\n", base, size);
@@ -302,7 +302,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	int local = 0;
 	cpumask_t tmp;
 
-	if ((ea & ~REGION_MASK) > EADDR_MASK)
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
 		return 1;
 
 	switch (REGION_ID(ea)) {
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index f9524602818d..e7833c80eb68 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -27,124 +27,94 @@
 
 #include <linux/sysctl.h>
 
-#define HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
-#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
-#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))
+#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
+#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 
-#define HUGEPTE_INDEX_SIZE	9
-#define HUGEPGD_INDEX_SIZE	10
-
-#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
-#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
-
-static inline int hugepgd_index(unsigned long addr)
-{
-	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
-}
-
-static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	int index;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
 
-	if (! mm->context.huge_pgdir)
-		return NULL;
+	BUG_ON(! in_hugepage_area(mm->context, addr));
 
+	addr &= HPAGE_MASK;
+
+	pg = pgd_offset(mm, addr);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, addr);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, addr);
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
+		}
+	}
 
-	index = hugepgd_index(addr);
-	BUG_ON(index >= PTRS_PER_HUGEPGD);
-	return (pud_t *)(mm->context.huge_pgdir + index);
+	return NULL;
 }
 
-static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
-	int index;
-
-	if (pud_none(*dir))
-		return NULL;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
 
-	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
-	return (pte_t *)pud_page(*dir) + index;
-}
-
-static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
-{
 	BUG_ON(! in_hugepage_area(mm->context, addr));
 
-	if (! mm->context.huge_pgdir) {
-		pgd_t *new;
-		spin_unlock(&mm->page_table_lock);
-		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
+	addr &= HPAGE_MASK;
 
-	/*
-	 * Because we dropped the lock, we should re-check the
-	 * entry, as somebody else could have populated it..
-	 */
-	if (mm->context.huge_pgdir)
-		pgd_free(new);
-	else
-		mm->context.huge_pgdir = new;
-	}
-	return hugepgd_offset(mm, addr);
-}
+	pg = pgd_offset(mm, addr);
+	pu = pud_alloc(mm, pg, addr);
 
-static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
-{
-	if (! pud_present(*dir)) {
-		pte_t *new;
-
-		spin_unlock(&mm->page_table_lock);
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (pud_present(*dir)) {
-			if (new)
-				kmem_cache_free(zero_cache, new);
-		} else {
-			struct page *ptepage;
-
-			if (! new)
-				return NULL;
-			ptepage = virt_to_page(new);
-			ptepage->mapping = (void *) mm;
-			ptepage->index = addr & HUGEPGDIR_MASK;
-			pud_populate(mm, dir, new);
+	if (pu) {
+		pm = pmd_alloc(mm, pu, addr);
+		if (pm) {
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
 		}
 	}
 
-	return hugepte_offset(dir, addr);
+	return NULL;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-	pud_t *pud;
+#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
 
-	BUG_ON(! in_hugepage_area(mm->context, addr));
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte)
+{
+	int i;
 
-	pud = hugepgd_offset(mm, addr);
-	if (! pud)
-		return NULL;
+	if (pte_present(*ptep)) {
+		pte_clear(mm, addr, ptep);
+		flush_tlb_pending();
+	}
 
-	return hugepte_offset(pud, addr);
+	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+		ptep++;
+	}
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
 {
-	pud_t *pud;
+	unsigned long old = pte_update(ptep, ~0UL);
+	int i;
 
-	BUG_ON(! in_hugepage_area(mm->context, addr));
+	if (old & _PAGE_HASHPTE)
+		hpte_update(mm, addr, old, 0);
 
-	pud = hugepgd_alloc(mm, addr);
-	if (! pud)
-		return NULL;
+	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
+		ptep[i] = __pte(0);
 
-	return hugepte_alloc(mm, pud, addr);
+	return __pte(old);
 }
 
 /*
@@ -162,15 +132,17 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-static void flush_segments(void *parm)
+static void flush_low_segments(void *parm)
 {
-	u16 segs = (unsigned long) parm;
+	u16 areas = (unsigned long) parm;
 	unsigned long i;
 
 	asm volatile("isync" : : : "memory");
 
-	for (i = 0; i < 16; i++) {
-		if (! (segs & (1U << i)))
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+
+	for (i = 0; i < NUM_LOW_AREAS; i++) {
+		if (! (areas & (1U << i)))
 			continue;
 		asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
 	}
@@ -178,13 +150,33 @@ static void flush_segments(void *parm)
 	asm volatile("isync" : : : "memory");
 }
 
-static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
+static void flush_high_segments(void *parm)
 {
-	unsigned long start = seg << SID_SHIFT;
-	unsigned long end = (seg+1) << SID_SHIFT;
+	u16 areas = (unsigned long) parm;
+	unsigned long i, j;
+
+	asm volatile("isync" : : : "memory");
+
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++) {
+		if (! (areas & (1U << i)))
+			continue;
+		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
+			asm volatile("slbie %0"
+				     :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT)));
+	}
+
+	asm volatile("isync" : : : "memory");
+}
+
+static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << SID_SHIFT;
+	unsigned long end = (area+1) << SID_SHIFT;
 	struct vm_area_struct *vma;
 
-	BUG_ON(seg >= 16);
+	BUG_ON(area >= NUM_LOW_AREAS);
 
 	/* Check no VMAs are in the region */
 	vma = find_vma(mm, start);
@@ -194,20 +186,39 @@ static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
 	return 0;
 }
 
-static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
+static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << HTLB_AREA_SHIFT;
+	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
+	struct vm_area_struct *vma;
+
+	BUG_ON(area >= NUM_HIGH_AREAS);
+
+	/* Check no VMAs are in the region */
+	vma = find_vma(mm, start);
+	if (vma && (vma->vm_start < end))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
 	unsigned long i;
 
-	newsegs &= ~(mm->context.htlb_segs);
-	if (! newsegs)
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
+
+	newareas &= ~(mm->context.low_htlb_areas);
+	if (! newareas)
 		return 0; /* The segments we want are already open */
 
-	for (i = 0; i < 16; i++)
-		if ((1 << i) & newsegs)
-			if (prepare_low_seg_for_htlb(mm, i) != 0)
+	for (i = 0; i < NUM_LOW_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_low_area_for_htlb(mm, i) != 0)
 				return -EBUSY;
 
-	mm->context.htlb_segs |= newsegs;
+	mm->context.low_htlb_areas |= newareas;
 
 	/* update the paca copy of the context struct */
 	get_paca()->context = mm->context;
@@ -215,29 +226,63 @@ static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
 	/* the context change must make it to memory before the flush,
 	 * so that further SLB misses do the right thing. */
 	mb();
-	on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
+	on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	return 0;
+}
+
+static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+	unsigned long i;
+
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
+		     != NUM_HIGH_AREAS);
+
+	newareas &= ~(mm->context.high_htlb_areas);
+	if (! newareas)
+		return 0; /* The areas we want are already open */
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_high_area_for_htlb(mm, i) != 0)
+				return -EBUSY;
+
+	mm->context.high_htlb_areas |= newareas;
+
+	/* update the paca copy of the context struct */
+	get_paca()->context = mm->context;
+
+	/* the context change must make it to memory before the flush,
+	 * so that further SLB misses do the right thing. */
+	mb();
+	on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
 
 	return 0;
 }
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	if (within_hugepage_high_range(addr, len))
-		return 0;
-	else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
-		int err;
-		/* Yes, we need both tests, in case addr+len overflows
-		 * 64-bit arithmetic */
-		err = open_low_hpage_segs(current->mm,
+	int err;
+
+	if ( (addr+len) < addr )
+		return -EINVAL;
+
+	if ((addr + len) < 0x100000000UL)
+		err = open_low_hpage_areas(current->mm,
 					  LOW_ESID_MASK(addr, len));
-		if (err)
-			printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
-			       " failed (segs: 0x%04hx)\n", addr, len,
-			       LOW_ESID_MASK(addr, len));
+	else
+		err = open_high_hpage_areas(current->mm,
+					    HTLB_AREA_MASK(addr, len));
+	if (err) {
+		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
+		       addr, len,
+		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
 		return err;
 	}
 
-	return -EINVAL;
+	return 0;
 }
 
 struct page *
@@ -309,8 +354,8 @@ full_search:
 			vma = find_vma(mm, addr);
 			continue;
 		}
-		if (touches_hugepage_high_range(addr, len)) {
-			addr = TASK_HPAGE_END;
+		if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
 			vma = find_vma(mm, addr);
 			continue;
 		}
@@ -389,8 +434,9 @@ hugepage_recheck:
 	if (touches_hugepage_low_range(mm, addr, len)) {
 		addr = (addr & ((~0) << SID_SHIFT)) - len;
 		goto hugepage_recheck;
-	} else if (touches_hugepage_high_range(addr, len)) {
-		addr = TASK_HPAGE_BASE - len;
+	} else if (touches_hugepage_high_range(mm, addr, len)) {
+		addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
+		goto hugepage_recheck;
 	}
 
 	/*
@@ -481,23 +527,28 @@ static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
 	return -ENOMEM;
 }
 
-static unsigned long htlb_get_high_area(unsigned long len)
+static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
 {
-	unsigned long addr = TASK_HPAGE_BASE;
+	unsigned long addr = 0x100000000UL;
 	struct vm_area_struct *vma;
 
 	vma = find_vma(current->mm, addr);
-	for (vma = find_vma(current->mm, addr);
-	     addr + len <= TASK_HPAGE_END;
-	     vma = vma->vm_next) {
+	while (addr + len <= TASK_SIZE_USER64) {
 		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
-		BUG_ON(! within_hugepage_high_range(addr, len));
+
+		if (! __within_hugepage_high_range(addr, len, areamask)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+			vma = find_vma(current->mm, addr);
+			continue;
+		}
 
 		if (!vma || (addr + len) <= vma->vm_start)
 			return addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-		/* Because we're in a hugepage region, this alignment
-		 * should not skip us over any VMAs */
+		/* Depending on segmask this might not be a confirmed
+		 * hugepage region, so the ALIGN could have skipped
+		 * some VMAs */
+		vma = find_vma(current->mm, addr);
 	}
 
 	return -ENOMEM;
@@ -507,6 +558,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
+	int lastshift;
+	u16 areamask, curareas;
+
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 
@@ -514,67 +568,49 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -EINVAL;
 
 	if (test_thread_flag(TIF_32BIT)) {
-		int lastshift = 0;
-		u16 segmask, cursegs = current->mm->context.htlb_segs;
+		curareas = current->mm->context.low_htlb_areas;
 
 		/* First see if we can do the mapping in the existing
-		 * low hpage segments */
-		addr = htlb_get_low_area(len, cursegs);
+		 * low areas */
+		addr = htlb_get_low_area(len, curareas);
 		if (addr != -ENOMEM)
 			return addr;
 
-		for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
-		     ! lastshift; segmask >>=1) {
-			if (segmask & 1)
+		lastshift = 0;
+		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
 				lastshift = 1;
 
-			addr = htlb_get_low_area(len, cursegs | segmask);
+			addr = htlb_get_low_area(len, curareas | areamask);
 			if ((addr != -ENOMEM)
-			    && open_low_hpage_segs(current->mm, segmask) == 0)
+			    && open_low_hpage_areas(current->mm, areamask) == 0)
 				return addr;
 		}
-		printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
-		       " enough segments\n");
-		return -ENOMEM;
 	} else {
-		return htlb_get_high_area(len);
-	}
-}
-
-void hugetlb_mm_free_pgd(struct mm_struct *mm)
-{
-	int i;
-	pgd_t *pgdir;
-
-	spin_lock(&mm->page_table_lock);
-
-	pgdir = mm->context.huge_pgdir;
-	if (! pgdir)
-		goto out;
-
-	mm->context.huge_pgdir = NULL;
+		curareas = current->mm->context.high_htlb_areas;
 
-	/* cleanup any hugepte pages leftover */
-	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
-		pud_t *pud = (pud_t *)(pgdir + i);
-
-		if (! pud_none(*pud)) {
-			pte_t *pte = (pte_t *)pud_page(*pud);
-			struct page *ptepage = virt_to_page(pte);
+		/* First see if we can do the mapping in the existing
+		 * high areas */
+		addr = htlb_get_high_area(len, curareas);
+		if (addr != -ENOMEM)
+			return addr;
 
-			ptepage->mapping = NULL;
+		lastshift = 0;
+		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
+				lastshift = 1;
 
-			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
-			kmem_cache_free(zero_cache, pte);
+			addr = htlb_get_high_area(len, curareas | areamask);
+			if ((addr != -ENOMEM)
+			    && open_high_hpage_areas(current->mm, areamask) == 0)
+				return addr;
 		}
-		pud_clear(pud);
 	}
-
-	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
-	kmem_cache_free(zero_cache, pgdir);
-
- out:
-	spin_unlock(&mm->page_table_lock);
+	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+	       " enough areas\n");
+	return -ENOMEM;
 }
 
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
index b6e75b891ac0..c65b87b92756 100644
--- a/arch/ppc64/mm/imalloc.c
+++ b/arch/ppc64/mm/imalloc.c
@@ -31,7 +31,7 @@ static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
 			break;
 		if ((unsigned long)tmp->addr >= ioremap_bot)
 			addr = tmp->size + (unsigned long) tmp->addr;
-		if (addr > IMALLOC_END-size)
+		if (addr >= IMALLOC_END-size)
 			return 1;
 	}
 	*im_addr = addr;
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index e58a24d42879..c02dc9809ca5 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -42,7 +42,6 @@
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
-#include <asm/abs_addr.h>
 #include <asm/prom.h>
 #include <asm/lmb.h>
 #include <asm/rtas.h>
@@ -66,6 +65,14 @@
 #include <asm/vdso.h>
 #include <asm/imalloc.h>
 
+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
+
 int mem_init_done;
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
@@ -159,7 +166,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 	ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
 	if (!ptep)
 		return -ENOMEM;
-	pa = abs_to_phys(pa);
 	set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 					      __pgprot(flags)));
 	spin_unlock(&init_mm.page_table_lock);
@@ -226,7 +232,7 @@ void __iomem * __ioremap(unsigned long addr, unsigned long size,
 	 * Before that, we map using addresses going
 	 * up from ioremap_bot.  imalloc will use
 	 * the addresses from ioremap_bot through
-	 * IMALLOC_END (0xE000001fffffffff)
+	 * IMALLOC_END
 	 * 
 	 */
 	pa = addr & PAGE_MASK;
@@ -417,12 +423,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	int index;
 	int err;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* We leave htlb_segs as it was, but for a fork, we need to
-	 * clear the huge_pgdir. */
-	mm->context.huge_pgdir = NULL;
-#endif
-
 again:
 	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
 		return -ENOMEM;
@@ -453,8 +453,6 @@ void destroy_context(struct mm_struct *mm)
 	spin_unlock(&mmu_context_lock);
 
 	mm->context.id = NO_CONTEXT;
-
-	hugetlb_mm_free_pgd(mm);
 }
 
 /*
@@ -484,9 +482,9 @@ void __init mm_init_ppc64(void)
 	for (i = 1; i < lmb.memory.cnt; i++) {
 		unsigned long base, prevbase, prevsize;
 
-		prevbase = lmb.memory.region[i-1].physbase;
+		prevbase = lmb.memory.region[i-1].base;
 		prevsize = lmb.memory.region[i-1].size;
-		base = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		if (base > (prevbase + prevsize)) {
 			io_hole_start = prevbase + prevsize;
 			io_hole_size = base - (prevbase + prevsize);
@@ -513,11 +511,8 @@ int page_is_ram(unsigned long pfn)
 	for (i=0; i < lmb.memory.cnt; i++) {
 		unsigned long base;
 
-#ifdef CONFIG_MSCHUNKS
-		base = lmb.memory.region[i].physbase;
-#else
 		base = lmb.memory.region[i].base;
-#endif
+
 		if ((paddr >= base) &&
 			(paddr < (base + lmb.memory.region[i].size))) {
 			return 1;
@@ -547,7 +542,7 @@ void __init do_init_bootmem(void)
 	 */
 	bootmap_pages = bootmem_bootmap_pages(total_pages);
 
-	start = abs_to_phys(lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE));
+	start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 	BUG_ON(!start);
 
 	boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
@@ -558,25 +553,25 @@ void __init do_init_bootmem(void)
 	 * present.
 	 */
 	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long physbase, size;
+		unsigned long base, size;
 		unsigned long start_pfn, end_pfn;
 
-		physbase = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		size = lmb.memory.region[i].size;
 
-		start_pfn = physbase >> PAGE_SHIFT;
+		start_pfn = base >> PAGE_SHIFT;
 		end_pfn = start_pfn + (size >> PAGE_SHIFT);
 		memory_present(0, start_pfn, end_pfn);
 
-		free_bootmem(physbase, size);
+		free_bootmem(base, size);
 	}
 
 	/* reserve the sections we're already using */
 	for (i=0; i < lmb.reserved.cnt; i++) {
-		unsigned long physbase = lmb.reserved.region[i].physbase;
+		unsigned long base = lmb.reserved.region[i].base;
 		unsigned long size = lmb.reserved.region[i].size;
 
-		reserve_bootmem(physbase, size);
+		reserve_bootmem(base, size);
 	}
 }
 
@@ -615,10 +610,10 @@ static int __init setup_kcore(void)
 	int i;
 
 	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long physbase, size;
+		unsigned long base, size;
 		struct kcore_list *kcore_mem;
 
-		physbase = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		size = lmb.memory.region[i].size;
 
 		/* GFP_ATOMIC to avoid might_sleep warnings during boot */
@@ -626,7 +621,7 @@ static int __init setup_kcore(void)
 		if (!kcore_mem)
 			panic("mem_init: kmalloc failed\n");
 
-		kclist_add(kcore_mem, __va(physbase), size);
+		kclist_add(kcore_mem, __va(base), size);
 	}
 
 	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
@@ -686,9 +681,6 @@ void __init mem_init(void)
 
 	mem_init_done = 1;
 
-#ifdef CONFIG_PPC_ISERIES
-	iommu_vio_init();
-#endif
 	/* Initialize the vDSO */
 	vdso_init();
 }
@@ -833,23 +825,43 @@ void __iomem * reserve_phb_iospace(unsigned long size)
 	return virt_addr;
 }
 
-kmem_cache_t *zero_cache;
-
-static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
 {
-	memset(pte, 0, PAGE_SIZE);
+	memset(addr, 0, kmem_cache_size(cache));
 }
 
+static const int pgtable_cache_size[2] = {
+	PTE_TABLE_SIZE, PMD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+	"pgd_pte_cache", "pud_pmd_cache",
+};
+
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+
 void pgtable_cache_init(void)
 {
-	zero_cache = kmem_cache_create("zero",
-				       PAGE_SIZE,
-				       0,
-				       SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
-				       zero_ctor,
-				       NULL);
-	if (!zero_cache)
-		panic("pgtable_cache_init(): could not create zero_cache!\n");
+	int i;
+
+	BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
+	BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
+	BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
+	BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
+
+	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
+		int size = pgtable_cache_size[i];
+		const char *name = pgtable_cache_name[i];
+
+		pgtable_cache[i] = kmem_cache_create(name,
+						     size, size,
+						     SLAB_HWCACHE_ALIGN
+						     | SLAB_MUST_HWCACHE_ALIGN,
+						     zero_ctor,
+						     NULL);
+		if (! pgtable_cache[i])
+			panic("pgtable_cache_init(): could not create %s!\n",
+			      name);
+	}
 }
 
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
index 0b191f2de016..c3116f0d788c 100644
--- a/arch/ppc64/mm/numa.c
+++ b/arch/ppc64/mm/numa.c
@@ -671,7 +671,7 @@ new_range:
 		 * Mark reserved regions on this node
 		 */
 		for (i = 0; i < lmb.reserved.cnt; i++) {
-			unsigned long physbase = lmb.reserved.region[i].physbase;
+			unsigned long physbase = lmb.reserved.region[i].base;
 			unsigned long size = lmb.reserved.region[i].size;
 
 			if (pa_to_nid(physbase) != nid &&
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
index 8379d678f70f..698d6b9ed6d1 100644
--- a/arch/ppc64/mm/slb_low.S
+++ b/arch/ppc64/mm/slb_low.S
@@ -89,32 +89,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 	b	9f
 
 0:	/* user address: proto-VSID = context<<15 | ESID */
-	li	r11,SLB_VSID_USER
-
-	srdi.	r9,r3,13
+	srdi.	r9,r3,USER_ESID_BITS
 	bne-	8f			/* invalid ea bits set */
 
 #ifdef CONFIG_HUGETLB_PAGE
 BEGIN_FTR_SECTION
-	/* check against the hugepage ranges */
-	cmpldi	r3,(TASK_HPAGE_END>>SID_SHIFT)
-	bge	6f			/* >= TASK_HPAGE_END */
-	cmpldi	r3,(TASK_HPAGE_BASE>>SID_SHIFT)
-	bge	5f			/* TASK_HPAGE_BASE..TASK_HPAGE_END */
-	cmpldi	r3,16
-	bge	6f			/* 4GB..TASK_HPAGE_BASE */
-
-	lhz	r9,PACAHTLBSEGS(r13)
-	srd	r9,r9,r3
-	andi.	r9,r9,1
-	beq	6f
-
-5:	/* this is a hugepage user address */
-	li	r11,(SLB_VSID_USER|SLB_VSID_L)
+	lhz	r9,PACAHIGHHTLBAREAS(r13)
+	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+	srd	r9,r9,r11
+	lhz	r11,PACALOWHTLBAREAS(r13)
+	srd	r11,r11,r3
+	or	r9,r9,r11
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+#endif /* CONFIG_HUGETLB_PAGE */
+
+	li	r11,SLB_VSID_USER
+
+#ifdef CONFIG_HUGETLB_PAGE
+BEGIN_FTR_SECTION
+	rldimi	r11,r9,8,55		/* shift masked bit into SLB_VSID_L */
 END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 #endif /* CONFIG_HUGETLB_PAGE */
 
-6:	ld	r9,PACACONTEXTID(r13)
+	ld	r9,PACACONTEXTID(r13)
 	rldimi	r3,r9,USER_ESID_BITS,0
 
 9:	/* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c
index 26f0172c4527..d8a6593a13f0 100644
--- a/arch/ppc64/mm/tlb.c
+++ b/arch/ppc64/mm/tlb.c
@@ -41,7 +41,58 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 unsigned long pte_freelist_forced_free;
 
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 {
 	/* This is safe as we are holding page_table_lock */
 	cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
@@ -49,19 +100,19 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-		pte_free(ptepage);
+		pgtable_free(pgf);
 		return;
 	}
 
 	if (*batchp == NULL) {
 		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
 		if (*batchp == NULL) {
-			pte_free_now(ptepage);
+			pgtable_free_now(pgf);
 			return;
 		}
 		(*batchp)->index = 0;
 	}
-	(*batchp)->pages[(*batchp)->index++] = ptepage;
+	(*batchp)->tables[(*batchp)->index++] = pgf;
 	if ((*batchp)->index == PTE_FREELIST_SIZE) {
 		pte_free_submit(*batchp);
 		*batchp = NULL;
@@ -132,42 +183,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 	put_cpu();
 }
 
-#ifdef CONFIG_SMP
-static void pte_free_smp_sync(void *arg)
-{
-	/* Do nothing, just ensure we sync with all CPUs */
-}
-#endif
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-void pte_free_now(struct page *ptepage)
-{
-	pte_freelist_forced_free++;
-
-	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-
-	pte_free(ptepage);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-	struct pte_freelist_batch *batch =
-		container_of(head, struct pte_freelist_batch, rcu);
-	unsigned int i;
-
-	for (i = 0; i < batch->index; i++)
-		pte_free(batch->pages[i]);
-	free_page((unsigned long)batch);
-}
-
-void pte_free_submit(struct pte_freelist_batch *batch)
-{
-	INIT_RCU_HEAD(&batch->rcu);
-	call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
 void pte_free_finish(void)
 {
 	/* This is safe as we are holding page_table_lock */