path: root/arch/powerpc/mm
author     Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-05-09 15:56:01 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-05-09 15:56:01 -0400
commit     aabded9c3aab5160ae2ca3dd1fa0fa37f3d510e4 (patch)
tree       8544d546735bcb975b8dec296eb9b6dc6531fb2a /arch/powerpc/mm
parent     9a9136e270af14da506f66bcafcc506b86a86498 (diff)
parent     f1a1eb299a8422c3e8d41753095bec44b2493398 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc
* 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc:
  [POWERPC] Further fixes for the removal of 4level-fixup hack from ppc32
  [POWERPC] EEH: log all PCI-X and PCI-E AER registers
  [POWERPC] EEH: capture and log pci state on error
  [POWERPC] EEH: Split up long error msg
  [POWERPC] EEH: log error only after driver notification.
  [POWERPC] fsl_soc: Make mac_addr const in fs_enet_of_init().
  [POWERPC] Don't use SLAB/SLUB for PTE pages
  [POWERPC] Spufs support for 64K LS mappings on 4K kernels
  [POWERPC] Add ability to 4K kernel to hash in 64K pages
  [POWERPC] Introduce address space "slices"
  [POWERPC] Small fixes & cleanups in segment page size demotion
  [POWERPC] iSeries: Make HVC_ISERIES the default
  [POWERPC] iSeries: suppress build warning in lparmap.c
  [POWERPC] Mark pages that don't exist as nosave
  [POWERPC] swsusp: Introduce register_nosave_region_late
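Editor's note on the "address space slices" item above: the new arch/powerpc/mm/slice.c (full file further down in this diff) tracks one page-size index per slice of the user address space, packed four bits at a time into mm->context.low_slices_psize and high_slices_psize, and get_slice_psize()/slice_convert() read and rewrite those nibbles. The standalone C sketch below only illustrates that packing scheme; the constant values and helper names here are assumed stand-ins for illustration, not the kernel's own definitions.

/*
 * Illustrative sketch only -- NOT kernel code.  It mimics the
 * 4-bits-per-slice page-size packing used by get_slice_psize() and
 * slice_convert() in the new arch/powerpc/mm/slice.c shown below.
 */
#include <stdint.h>
#include <stdio.h>

#define SLICE_LOW_SHIFT	28	/* assumed: 256MB "low" slices */
#define SLICE_NUM_LOW	16	/* low slices cover 0..4GB */

/* example page-size indices; the kernel's MMU_PAGE_* values may differ */
#define PSIZE_4K	0
#define PSIZE_64K	1

/* Rough equivalent of the low-address half of get_slice_psize() */
static unsigned int low_slice_psize(uint64_t low_slices_psize, uint64_t addr)
{
	int index = addr >> SLICE_LOW_SHIFT;	/* like GET_LOW_SLICE_INDEX() */

	return (low_slices_psize >> (index * 4)) & 0xf;
}

/* Rough equivalent of one iteration of the low-slice loop in slice_convert() */
static uint64_t set_low_slice_psize(uint64_t low_slices_psize, int index,
				    unsigned int psize)
{
	low_slices_psize &= ~(0xfull << (index * 4));	/* clear old nibble */
	return low_slices_psize | ((uint64_t)psize << (index * 4));
}

int main(void)
{
	uint64_t lpsizes = 0;
	int i;

	/* start every low slice at 64K, then demote slice 3 to 4K */
	for (i = 0; i < SLICE_NUM_LOW; i++)
		lpsizes = set_low_slice_psize(lpsizes, i, PSIZE_64K);
	lpsizes = set_low_slice_psize(lpsizes, 3, PSIZE_4K);

	printf("psize index at 0x30000000 = %u\n",
	       low_slice_psize(lpsizes, 0x30000000ull));	/* prints 0 */
	return 0;
}

In the real code the same nibble arrays are also consulted by the SLB miss handler (see the slb_low.S hunk), so each segment receives the SLB encoding that matches its slice's page size.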
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/Makefile            1
-rw-r--r--  arch/powerpc/mm/hash_low_64.S       5
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c   142
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c     548
-rw-r--r--  arch/powerpc/mm/init_64.c          17
-rw-r--r--  arch/powerpc/mm/mem.c              25
-rw-r--r--  arch/powerpc/mm/mmu_context_64.c   10
-rw-r--r--  arch/powerpc/mm/ppc_mmu_32.c        2
-rw-r--r--  arch/powerpc/mm/slb.c              11
-rw-r--r--  arch/powerpc/mm/slb_low.S          52
-rw-r--r--  arch/powerpc/mm/slice.c           633
-rw-r--r--  arch/powerpc/mm/tlb_32.c            4
-rw-r--r--  arch/powerpc/mm/tlb_64.c           12
13 files changed, 817 insertions, 645 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 38a81967ca07..4f839c6a9768 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -18,4 +18,5 @@ obj-$(CONFIG_40x) += 4xx_mmu.o
 obj-$(CONFIG_44x) += 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
+obj-$(CONFIG_PPC_MM_SLICES) += slice.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index e64ce3eec36e..4762ff7c14df 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -615,6 +615,9 @@ htab_pte_insert_failure:
 	li	r3,-1
 	b	htab_bail
 
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_PPC_HAS_HASH_64K
 
 /*****************************************************************************
  *                                                                           *
@@ -870,7 +873,7 @@ ht64_pte_insert_failure:
 	b	ht64_bail
 
 
-#endif /* CONFIG_PPC_64K_PAGES */
+#endif /* CONFIG_PPC_HAS_HASH_64K */
 
 
 /*****************************************************************************
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 9b226fa7006f..028ba4ed03d2 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -51,6 +51,7 @@
 #include <asm/cputable.h>
 #include <asm/abs_addr.h>
 #include <asm/sections.h>
+#include <asm/spu.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -419,7 +420,7 @@ static void __init htab_finish_init(void)
 	extern unsigned int *htab_call_hpte_remove;
 	extern unsigned int *htab_call_hpte_updatepp;
 
-#ifdef CONFIG_PPC_64K_PAGES
+#ifdef CONFIG_PPC_HAS_HASH_64K
 	extern unsigned int *ht64_call_hpte_insert1;
 	extern unsigned int *ht64_call_hpte_insert2;
 	extern unsigned int *ht64_call_hpte_remove;
@@ -596,22 +597,23 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
596 * Demote a segment to using 4k pages. 597 * Demote a segment to using 4k pages.
597 * For now this makes the whole process use 4k pages. 598 * For now this makes the whole process use 4k pages.
598 */ 599 */
599void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
600{
601#ifdef CONFIG_PPC_64K_PAGES 600#ifdef CONFIG_PPC_64K_PAGES
601static void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
602{
602 if (mm->context.user_psize == MMU_PAGE_4K) 603 if (mm->context.user_psize == MMU_PAGE_4K)
603 return; 604 return;
605#ifdef CONFIG_PPC_MM_SLICES
606 slice_set_user_psize(mm, MMU_PAGE_4K);
607#else /* CONFIG_PPC_MM_SLICES */
604 mm->context.user_psize = MMU_PAGE_4K; 608 mm->context.user_psize = MMU_PAGE_4K;
605 mm->context.sllp = SLB_VSID_USER | mmu_psize_defs[MMU_PAGE_4K].sllp; 609 mm->context.sllp = SLB_VSID_USER | mmu_psize_defs[MMU_PAGE_4K].sllp;
606 get_paca()->context = mm->context; 610#endif /* CONFIG_PPC_MM_SLICES */
607 slb_flush_and_rebolt(); 611
608#ifdef CONFIG_SPE_BASE 612#ifdef CONFIG_SPE_BASE
609 spu_flush_all_slbs(mm); 613 spu_flush_all_slbs(mm);
610#endif 614#endif
611#endif
612} 615}
613 616#endif /* CONFIG_PPC_64K_PAGES */
614EXPORT_SYMBOL_GPL(demote_segment_4k);
615 617
616/* Result code is: 618/* Result code is:
617 * 0 - handled 619 * 0 - handled
@@ -646,7 +648,11 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
646 return 1; 648 return 1;
647 } 649 }
648 vsid = get_vsid(mm->context.id, ea); 650 vsid = get_vsid(mm->context.id, ea);
651#ifdef CONFIG_PPC_MM_SLICES
652 psize = get_slice_psize(mm, ea);
653#else
649 psize = mm->context.user_psize; 654 psize = mm->context.user_psize;
655#endif
650 break; 656 break;
651 case VMALLOC_REGION_ID: 657 case VMALLOC_REGION_ID:
652 mm = &init_mm; 658 mm = &init_mm;
@@ -674,11 +680,22 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
674 if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) 680 if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
675 local = 1; 681 local = 1;
676 682
683#ifdef CONFIG_HUGETLB_PAGE
677 /* Handle hugepage regions */ 684 /* Handle hugepage regions */
678 if (unlikely(in_hugepage_area(mm->context, ea))) { 685 if (HPAGE_SHIFT && psize == mmu_huge_psize) {
679 DBG_LOW(" -> huge page !\n"); 686 DBG_LOW(" -> huge page !\n");
680 return hash_huge_page(mm, access, ea, vsid, local, trap); 687 return hash_huge_page(mm, access, ea, vsid, local, trap);
681 } 688 }
689#endif /* CONFIG_HUGETLB_PAGE */
690
691#ifndef CONFIG_PPC_64K_PAGES
692 /* If we use 4K pages and our psize is not 4K, then we are hitting
693 * a special driver mapping, we need to align the address before
694 * we fetch the PTE
695 */
696 if (psize != MMU_PAGE_4K)
697 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
698#endif /* CONFIG_PPC_64K_PAGES */
682 699
683 /* Get PTE and page size from page tables */ 700 /* Get PTE and page size from page tables */
684 ptep = find_linux_pte(pgdir, ea); 701 ptep = find_linux_pte(pgdir, ea);
@@ -702,54 +719,56 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
702 } 719 }
703 720
704 /* Do actual hashing */ 721 /* Do actual hashing */
705#ifndef CONFIG_PPC_64K_PAGES 722#ifdef CONFIG_PPC_64K_PAGES
706 rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
707#else
708 /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ 723 /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
709 if (pte_val(*ptep) & _PAGE_4K_PFN) { 724 if (pte_val(*ptep) & _PAGE_4K_PFN) {
710 demote_segment_4k(mm, ea); 725 demote_segment_4k(mm, ea);
711 psize = MMU_PAGE_4K; 726 psize = MMU_PAGE_4K;
712 } 727 }
713 728
714 if (mmu_ci_restrictions) { 729 /* If this PTE is non-cacheable and we have restrictions on
715 /* If this PTE is non-cacheable, switch to 4k */ 730 * using non cacheable large pages, then we switch to 4k
716 if (psize == MMU_PAGE_64K && 731 */
717 (pte_val(*ptep) & _PAGE_NO_CACHE)) { 732 if (mmu_ci_restrictions && psize == MMU_PAGE_64K &&
718 if (user_region) { 733 (pte_val(*ptep) & _PAGE_NO_CACHE)) {
719 demote_segment_4k(mm, ea); 734 if (user_region) {
720 psize = MMU_PAGE_4K; 735 demote_segment_4k(mm, ea);
721 } else if (ea < VMALLOC_END) { 736 psize = MMU_PAGE_4K;
722 /* 737 } else if (ea < VMALLOC_END) {
723 * some driver did a non-cacheable mapping 738 /*
724 * in vmalloc space, so switch vmalloc 739 * some driver did a non-cacheable mapping
725 * to 4k pages 740 * in vmalloc space, so switch vmalloc
726 */ 741 * to 4k pages
727 printk(KERN_ALERT "Reducing vmalloc segment " 742 */
728 "to 4kB pages because of " 743 printk(KERN_ALERT "Reducing vmalloc segment "
729 "non-cacheable mapping\n"); 744 "to 4kB pages because of "
730 psize = mmu_vmalloc_psize = MMU_PAGE_4K; 745 "non-cacheable mapping\n");
731 } 746 psize = mmu_vmalloc_psize = MMU_PAGE_4K;
732#ifdef CONFIG_SPE_BASE 747#ifdef CONFIG_SPE_BASE
733 spu_flush_all_slbs(mm); 748 spu_flush_all_slbs(mm);
734#endif 749#endif
735 } 750 }
736 if (user_region) { 751 }
737 if (psize != get_paca()->context.user_psize) { 752 if (user_region) {
738 get_paca()->context = mm->context; 753 if (psize != get_paca()->context.user_psize) {
739 slb_flush_and_rebolt(); 754 get_paca()->context.user_psize =
740 } 755 mm->context.user_psize;
741 } else if (get_paca()->vmalloc_sllp !=
742 mmu_psize_defs[mmu_vmalloc_psize].sllp) {
743 get_paca()->vmalloc_sllp =
744 mmu_psize_defs[mmu_vmalloc_psize].sllp;
745 slb_flush_and_rebolt(); 756 slb_flush_and_rebolt();
746 } 757 }
758 } else if (get_paca()->vmalloc_sllp !=
759 mmu_psize_defs[mmu_vmalloc_psize].sllp) {
760 get_paca()->vmalloc_sllp =
761 mmu_psize_defs[mmu_vmalloc_psize].sllp;
762 slb_flush_and_rebolt();
747 } 763 }
764#endif /* CONFIG_PPC_64K_PAGES */
765
766#ifdef CONFIG_PPC_HAS_HASH_64K
748 if (psize == MMU_PAGE_64K) 767 if (psize == MMU_PAGE_64K)
749 rc = __hash_page_64K(ea, access, vsid, ptep, trap, local); 768 rc = __hash_page_64K(ea, access, vsid, ptep, trap, local);
750 else 769 else
770#endif /* CONFIG_PPC_HAS_HASH_64K */
751 rc = __hash_page_4K(ea, access, vsid, ptep, trap, local); 771 rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
752#endif /* CONFIG_PPC_64K_PAGES */
753 772
754#ifndef CONFIG_PPC_64K_PAGES 773#ifndef CONFIG_PPC_64K_PAGES
755 DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); 774 DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
@@ -772,42 +791,55 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
772 unsigned long flags; 791 unsigned long flags;
773 int local = 0; 792 int local = 0;
774 793
775 /* We don't want huge pages prefaulted for now 794 BUG_ON(REGION_ID(ea) != USER_REGION_ID);
776 */ 795
777 if (unlikely(in_hugepage_area(mm->context, ea))) 796#ifdef CONFIG_PPC_MM_SLICES
797 /* We only prefault standard pages for now */
798 if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize));
778 return; 799 return;
800#endif
779 801
780 DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," 802 DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
781 " trap=%lx\n", mm, mm->pgd, ea, access, trap); 803 " trap=%lx\n", mm, mm->pgd, ea, access, trap);
782 804
783 /* Get PTE, VSID, access mask */ 805 /* Get Linux PTE if available */
784 pgdir = mm->pgd; 806 pgdir = mm->pgd;
785 if (pgdir == NULL) 807 if (pgdir == NULL)
786 return; 808 return;
787 ptep = find_linux_pte(pgdir, ea); 809 ptep = find_linux_pte(pgdir, ea);
788 if (!ptep) 810 if (!ptep)
789 return; 811 return;
812
813#ifdef CONFIG_PPC_64K_PAGES
814 /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
815 * a 64K kernel), then we don't preload, hash_page() will take
816 * care of it once we actually try to access the page.
817 * That way we don't have to duplicate all of the logic for segment
818 * page size demotion here
819 */
820 if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
821 return;
822#endif /* CONFIG_PPC_64K_PAGES */
823
824 /* Get VSID */
790 vsid = get_vsid(mm->context.id, ea); 825 vsid = get_vsid(mm->context.id, ea);
791 826
792 /* Hash it in */ 827 /* Hash doesn't like irqs */
793 local_irq_save(flags); 828 local_irq_save(flags);
829
830 /* Is that local to this CPU ? */
794 mask = cpumask_of_cpu(smp_processor_id()); 831 mask = cpumask_of_cpu(smp_processor_id());
795 if (cpus_equal(mm->cpu_vm_mask, mask)) 832 if (cpus_equal(mm->cpu_vm_mask, mask))
796 local = 1; 833 local = 1;
797#ifndef CONFIG_PPC_64K_PAGES 834
798 __hash_page_4K(ea, access, vsid, ptep, trap, local); 835 /* Hash it in */
799#else 836#ifdef CONFIG_PPC_HAS_HASH_64K
800 if (mmu_ci_restrictions) {
801 /* If this PTE is non-cacheable, switch to 4k */
802 if (mm->context.user_psize == MMU_PAGE_64K &&
803 (pte_val(*ptep) & _PAGE_NO_CACHE))
804 demote_segment_4k(mm, ea);
805 }
806 if (mm->context.user_psize == MMU_PAGE_64K) 837 if (mm->context.user_psize == MMU_PAGE_64K)
807 __hash_page_64K(ea, access, vsid, ptep, trap, local); 838 __hash_page_64K(ea, access, vsid, ptep, trap, local);
808 else 839 else
809 __hash_page_4K(ea, access, vsid, ptep, trap, local);
810#endif /* CONFIG_PPC_64K_PAGES */ 840#endif /* CONFIG_PPC_64K_PAGES */
841 __hash_page_4K(ea, access, vsid, ptep, trap, local);
842
811 local_irq_restore(flags); 843 local_irq_restore(flags);
812} 844}
813 845
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fb959264c104..92a1b16fb7e3 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -91,7 +91,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
91 pgd_t *pg; 91 pgd_t *pg;
92 pud_t *pu; 92 pud_t *pu;
93 93
94 BUG_ON(! in_hugepage_area(mm->context, addr)); 94 BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
95 95
96 addr &= HPAGE_MASK; 96 addr &= HPAGE_MASK;
97 97
@@ -119,7 +119,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
119 pud_t *pu; 119 pud_t *pu;
120 hugepd_t *hpdp = NULL; 120 hugepd_t *hpdp = NULL;
121 121
122 BUG_ON(! in_hugepage_area(mm->context, addr)); 122 BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
123 123
124 addr &= HPAGE_MASK; 124 addr &= HPAGE_MASK;
125 125
@@ -302,7 +302,7 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb,
302 start = addr; 302 start = addr;
303 pgd = pgd_offset((*tlb)->mm, addr); 303 pgd = pgd_offset((*tlb)->mm, addr);
304 do { 304 do {
305 BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr)); 305 BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
306 next = pgd_addr_end(addr, end); 306 next = pgd_addr_end(addr, end);
307 if (pgd_none_or_clear_bad(pgd)) 307 if (pgd_none_or_clear_bad(pgd))
308 continue; 308 continue;
@@ -331,203 +331,13 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
331 return __pte(old); 331 return __pte(old);
332} 332}
333 333
334struct slb_flush_info {
335 struct mm_struct *mm;
336 u16 newareas;
337};
338
339static void flush_low_segments(void *parm)
340{
341 struct slb_flush_info *fi = parm;
342 unsigned long i;
343
344 BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);
345
346 if (current->active_mm != fi->mm)
347 return;
348
349 /* Only need to do anything if this CPU is working in the same
350 * mm as the one which has changed */
351
352 /* update the paca copy of the context struct */
353 get_paca()->context = current->active_mm->context;
354
355 asm volatile("isync" : : : "memory");
356 for (i = 0; i < NUM_LOW_AREAS; i++) {
357 if (! (fi->newareas & (1U << i)))
358 continue;
359 asm volatile("slbie %0"
360 : : "r" ((i << SID_SHIFT) | SLBIE_C));
361 }
362 asm volatile("isync" : : : "memory");
363}
364
365static void flush_high_segments(void *parm)
366{
367 struct slb_flush_info *fi = parm;
368 unsigned long i, j;
369
370
371 BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);
372
373 if (current->active_mm != fi->mm)
374 return;
375
376 /* Only need to do anything if this CPU is working in the same
377 * mm as the one which has changed */
378
379 /* update the paca copy of the context struct */
380 get_paca()->context = current->active_mm->context;
381
382 asm volatile("isync" : : : "memory");
383 for (i = 0; i < NUM_HIGH_AREAS; i++) {
384 if (! (fi->newareas & (1U << i)))
385 continue;
386 for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
387 asm volatile("slbie %0"
388 :: "r" (((i << HTLB_AREA_SHIFT)
389 + (j << SID_SHIFT)) | SLBIE_C));
390 }
391 asm volatile("isync" : : : "memory");
392}
393
394static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
395{
396 unsigned long start = area << SID_SHIFT;
397 unsigned long end = (area+1) << SID_SHIFT;
398 struct vm_area_struct *vma;
399
400 BUG_ON(area >= NUM_LOW_AREAS);
401
402 /* Check no VMAs are in the region */
403 vma = find_vma(mm, start);
404 if (vma && (vma->vm_start < end))
405 return -EBUSY;
406
407 return 0;
408}
409
410static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
411{
412 unsigned long start = area << HTLB_AREA_SHIFT;
413 unsigned long end = (area+1) << HTLB_AREA_SHIFT;
414 struct vm_area_struct *vma;
415
416 BUG_ON(area >= NUM_HIGH_AREAS);
417
418 /* Hack, so that each addresses is controlled by exactly one
419 * of the high or low area bitmaps, the first high area starts
420 * at 4GB, not 0 */
421 if (start == 0)
422 start = 0x100000000UL;
423
424 /* Check no VMAs are in the region */
425 vma = find_vma(mm, start);
426 if (vma && (vma->vm_start < end))
427 return -EBUSY;
428
429 return 0;
430}
431
432static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
433{
434 unsigned long i;
435 struct slb_flush_info fi;
436
437 BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
438 BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
439
440 newareas &= ~(mm->context.low_htlb_areas);
441 if (! newareas)
442 return 0; /* The segments we want are already open */
443
444 for (i = 0; i < NUM_LOW_AREAS; i++)
445 if ((1 << i) & newareas)
446 if (prepare_low_area_for_htlb(mm, i) != 0)
447 return -EBUSY;
448
449 mm->context.low_htlb_areas |= newareas;
450
451 /* the context change must make it to memory before the flush,
452 * so that further SLB misses do the right thing. */
453 mb();
454
455 fi.mm = mm;
456 fi.newareas = newareas;
457 on_each_cpu(flush_low_segments, &fi, 0, 1);
458
459 return 0;
460}
461
462static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
463{
464 struct slb_flush_info fi;
465 unsigned long i;
466
467 BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
468 BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
469 != NUM_HIGH_AREAS);
470
471 newareas &= ~(mm->context.high_htlb_areas);
472 if (! newareas)
473 return 0; /* The areas we want are already open */
474
475 for (i = 0; i < NUM_HIGH_AREAS; i++)
476 if ((1 << i) & newareas)
477 if (prepare_high_area_for_htlb(mm, i) != 0)
478 return -EBUSY;
479
480 mm->context.high_htlb_areas |= newareas;
481
482 /* the context change must make it to memory before the flush,
483 * so that further SLB misses do the right thing. */
484 mb();
485
486 fi.mm = mm;
487 fi.newareas = newareas;
488 on_each_cpu(flush_high_segments, &fi, 0, 1);
489
490 return 0;
491}
492
493int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff)
494{
495 int err = 0;
496
497 if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT))
498 return -EINVAL;
499 if (len & ~HPAGE_MASK)
500 return -EINVAL;
501 if (addr & ~HPAGE_MASK)
502 return -EINVAL;
503
504 if (addr < 0x100000000UL)
505 err = open_low_hpage_areas(current->mm,
506 LOW_ESID_MASK(addr, len));
507 if ((addr + len) > 0x100000000UL)
508 err = open_high_hpage_areas(current->mm,
509 HTLB_AREA_MASK(addr, len));
510#ifdef CONFIG_SPE_BASE
511 spu_flush_all_slbs(current->mm);
512#endif
513 if (err) {
514 printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
515 " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
516 addr, len,
517 LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
518 return err;
519 }
520
521 return 0;
522}
523
524struct page * 334struct page *
525follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 335follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
526{ 336{
527 pte_t *ptep; 337 pte_t *ptep;
528 struct page *page; 338 struct page *page;
529 339
530 if (! in_hugepage_area(mm->context, address)) 340 if (get_slice_psize(mm, address) != mmu_huge_psize)
531 return ERR_PTR(-EINVAL); 341 return ERR_PTR(-EINVAL);
532 342
533 ptep = huge_pte_offset(mm, address); 343 ptep = huge_pte_offset(mm, address);
@@ -551,359 +361,13 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
551 return NULL; 361 return NULL;
552} 362}
553 363
554/* Because we have an exclusive hugepage region which lies within the
555 * normal user address space, we have to take special measures to make
556 * non-huge mmap()s evade the hugepage reserved regions. */
557unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
558 unsigned long len, unsigned long pgoff,
559 unsigned long flags)
560{
561 struct mm_struct *mm = current->mm;
562 struct vm_area_struct *vma;
563 unsigned long start_addr;
564
565 if (len > TASK_SIZE)
566 return -ENOMEM;
567
568 /* handle fixed mapping: prevent overlap with huge pages */
569 if (flags & MAP_FIXED) {
570 if (is_hugepage_only_range(mm, addr, len))
571 return -EINVAL;
572 return addr;
573 }
574
575 if (addr) {
576 addr = PAGE_ALIGN(addr);
577 vma = find_vma(mm, addr);
578 if (((TASK_SIZE - len) >= addr)
579 && (!vma || (addr+len) <= vma->vm_start)
580 && !is_hugepage_only_range(mm, addr,len))
581 return addr;
582 }
583 if (len > mm->cached_hole_size) {
584 start_addr = addr = mm->free_area_cache;
585 } else {
586 start_addr = addr = TASK_UNMAPPED_BASE;
587 mm->cached_hole_size = 0;
588 }
589
590full_search:
591 vma = find_vma(mm, addr);
592 while (TASK_SIZE - len >= addr) {
593 BUG_ON(vma && (addr >= vma->vm_end));
594
595 if (touches_hugepage_low_range(mm, addr, len)) {
596 addr = ALIGN(addr+1, 1<<SID_SHIFT);
597 vma = find_vma(mm, addr);
598 continue;
599 }
600 if (touches_hugepage_high_range(mm, addr, len)) {
601 addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
602 vma = find_vma(mm, addr);
603 continue;
604 }
605 if (!vma || addr + len <= vma->vm_start) {
606 /*
607 * Remember the place where we stopped the search:
608 */
609 mm->free_area_cache = addr + len;
610 return addr;
611 }
612 if (addr + mm->cached_hole_size < vma->vm_start)
613 mm->cached_hole_size = vma->vm_start - addr;
614 addr = vma->vm_end;
615 vma = vma->vm_next;
616 }
617
618 /* Make sure we didn't miss any holes */
619 if (start_addr != TASK_UNMAPPED_BASE) {
620 start_addr = addr = TASK_UNMAPPED_BASE;
621 mm->cached_hole_size = 0;
622 goto full_search;
623 }
624 return -ENOMEM;
625}
626
627/*
628 * This mmap-allocator allocates new areas top-down from below the
629 * stack's low limit (the base):
630 *
631 * Because we have an exclusive hugepage region which lies within the
632 * normal user address space, we have to take special measures to make
633 * non-huge mmap()s evade the hugepage reserved regions.
634 */
635unsigned long
636arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
637 const unsigned long len, const unsigned long pgoff,
638 const unsigned long flags)
639{
640 struct vm_area_struct *vma, *prev_vma;
641 struct mm_struct *mm = current->mm;
642 unsigned long base = mm->mmap_base, addr = addr0;
643 unsigned long largest_hole = mm->cached_hole_size;
644 int first_time = 1;
645
646 /* requested length too big for entire address space */
647 if (len > TASK_SIZE)
648 return -ENOMEM;
649
650 /* handle fixed mapping: prevent overlap with huge pages */
651 if (flags & MAP_FIXED) {
652 if (is_hugepage_only_range(mm, addr, len))
653 return -EINVAL;
654 return addr;
655 }
656
657 /* dont allow allocations above current base */
658 if (mm->free_area_cache > base)
659 mm->free_area_cache = base;
660
661 /* requesting a specific address */
662 if (addr) {
663 addr = PAGE_ALIGN(addr);
664 vma = find_vma(mm, addr);
665 if (TASK_SIZE - len >= addr &&
666 (!vma || addr + len <= vma->vm_start)
667 && !is_hugepage_only_range(mm, addr,len))
668 return addr;
669 }
670
671 if (len <= largest_hole) {
672 largest_hole = 0;
673 mm->free_area_cache = base;
674 }
675try_again:
676 /* make sure it can fit in the remaining address space */
677 if (mm->free_area_cache < len)
678 goto fail;
679
680 /* either no address requested or cant fit in requested address hole */
681 addr = (mm->free_area_cache - len) & PAGE_MASK;
682 do {
683hugepage_recheck:
684 if (touches_hugepage_low_range(mm, addr, len)) {
685 addr = (addr & ((~0) << SID_SHIFT)) - len;
686 goto hugepage_recheck;
687 } else if (touches_hugepage_high_range(mm, addr, len)) {
688 addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
689 goto hugepage_recheck;
690 }
691
692 /*
693 * Lookup failure means no vma is above this address,
694 * i.e. return with success:
695 */
696 if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
697 return addr;
698
699 /*
700 * new region fits between prev_vma->vm_end and
701 * vma->vm_start, use it:
702 */
703 if (addr+len <= vma->vm_start &&
704 (!prev_vma || (addr >= prev_vma->vm_end))) {
705 /* remember the address as a hint for next time */
706 mm->cached_hole_size = largest_hole;
707 return (mm->free_area_cache = addr);
708 } else {
709 /* pull free_area_cache down to the first hole */
710 if (mm->free_area_cache == vma->vm_end) {
711 mm->free_area_cache = vma->vm_start;
712 mm->cached_hole_size = largest_hole;
713 }
714 }
715
716 /* remember the largest hole we saw so far */
717 if (addr + largest_hole < vma->vm_start)
718 largest_hole = vma->vm_start - addr;
719
720 /* try just below the current vma->vm_start */
721 addr = vma->vm_start-len;
722 } while (len <= vma->vm_start);
723
724fail:
725 /*
726 * if hint left us with no space for the requested
727 * mapping then try again:
728 */
729 if (first_time) {
730 mm->free_area_cache = base;
731 largest_hole = 0;
732 first_time = 0;
733 goto try_again;
734 }
735 /*
736 * A failed mmap() very likely causes application failure,
737 * so fall back to the bottom-up function here. This scenario
738 * can happen with large stack limits and large mmap()
739 * allocations.
740 */
741 mm->free_area_cache = TASK_UNMAPPED_BASE;
742 mm->cached_hole_size = ~0UL;
743 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
744 /*
745 * Restore the topdown base:
746 */
747 mm->free_area_cache = base;
748 mm->cached_hole_size = ~0UL;
749
750 return addr;
751}
752
753static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
754{
755 struct vm_area_struct *vma;
756
757 vma = find_vma(current->mm, addr);
758 if (TASK_SIZE - len >= addr &&
759 (!vma || ((addr + len) <= vma->vm_start)))
760 return 0;
761
762 return -ENOMEM;
763}
764
765static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
766{
767 unsigned long addr = 0;
768 struct vm_area_struct *vma;
769
770 vma = find_vma(current->mm, addr);
771 while (addr + len <= 0x100000000UL) {
772 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
773
774 if (! __within_hugepage_low_range(addr, len, segmask)) {
775 addr = ALIGN(addr+1, 1<<SID_SHIFT);
776 vma = find_vma(current->mm, addr);
777 continue;
778 }
779
780 if (!vma || (addr + len) <= vma->vm_start)
781 return addr;
782 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
783 /* Depending on segmask this might not be a confirmed
784 * hugepage region, so the ALIGN could have skipped
785 * some VMAs */
786 vma = find_vma(current->mm, addr);
787 }
788
789 return -ENOMEM;
790}
791
792static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
793{
794 unsigned long addr = 0x100000000UL;
795 struct vm_area_struct *vma;
796
797 vma = find_vma(current->mm, addr);
798 while (addr + len <= TASK_SIZE_USER64) {
799 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
800
801 if (! __within_hugepage_high_range(addr, len, areamask)) {
802 addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
803 vma = find_vma(current->mm, addr);
804 continue;
805 }
806
807 if (!vma || (addr + len) <= vma->vm_start)
808 return addr;
809 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
810 /* Depending on segmask this might not be a confirmed
811 * hugepage region, so the ALIGN could have skipped
812 * some VMAs */
813 vma = find_vma(current->mm, addr);
814 }
815
816 return -ENOMEM;
817}
818 364
819unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 365unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
820 unsigned long len, unsigned long pgoff, 366 unsigned long len, unsigned long pgoff,
821 unsigned long flags) 367 unsigned long flags)
822{ 368{
823 int lastshift; 369 return slice_get_unmapped_area(addr, len, flags,
824 u16 areamask, curareas; 370 mmu_huge_psize, 1, 0);
825
826 if (HPAGE_SHIFT == 0)
827 return -EINVAL;
828 if (len & ~HPAGE_MASK)
829 return -EINVAL;
830 if (len > TASK_SIZE)
831 return -ENOMEM;
832
833 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
834 return -EINVAL;
835
836 /* Paranoia, caller should have dealt with this */
837 BUG_ON((addr + len) < addr);
838
839 /* Handle MAP_FIXED */
840 if (flags & MAP_FIXED) {
841 if (prepare_hugepage_range(addr, len, pgoff))
842 return -EINVAL;
843 return addr;
844 }
845
846 if (test_thread_flag(TIF_32BIT)) {
847 curareas = current->mm->context.low_htlb_areas;
848
849 /* First see if we can use the hint address */
850 if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
851 areamask = LOW_ESID_MASK(addr, len);
852 if (open_low_hpage_areas(current->mm, areamask) == 0)
853 return addr;
854 }
855
856 /* Next see if we can map in the existing low areas */
857 addr = htlb_get_low_area(len, curareas);
858 if (addr != -ENOMEM)
859 return addr;
860
861 /* Finally go looking for areas to open */
862 lastshift = 0;
863 for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
864 ! lastshift; areamask >>=1) {
865 if (areamask & 1)
866 lastshift = 1;
867
868 addr = htlb_get_low_area(len, curareas | areamask);
869 if ((addr != -ENOMEM)
870 && open_low_hpage_areas(current->mm, areamask) == 0)
871 return addr;
872 }
873 } else {
874 curareas = current->mm->context.high_htlb_areas;
875
876 /* First see if we can use the hint address */
877 /* We discourage 64-bit processes from doing hugepage
878 * mappings below 4GB (must use MAP_FIXED) */
879 if ((addr >= 0x100000000UL)
880 && (htlb_check_hinted_area(addr, len) == 0)) {
881 areamask = HTLB_AREA_MASK(addr, len);
882 if (open_high_hpage_areas(current->mm, areamask) == 0)
883 return addr;
884 }
885
886 /* Next see if we can map in the existing high areas */
887 addr = htlb_get_high_area(len, curareas);
888 if (addr != -ENOMEM)
889 return addr;
890
891 /* Finally go looking for areas to open */
892 lastshift = 0;
893 for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
894 ! lastshift; areamask >>=1) {
895 if (areamask & 1)
896 lastshift = 1;
897
898 addr = htlb_get_high_area(len, curareas | areamask);
899 if ((addr != -ENOMEM)
900 && open_high_hpage_areas(current->mm, areamask) == 0)
901 return addr;
902 }
903 }
904 printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
905 " enough areas\n");
906 return -ENOMEM;
907} 371}
908 372
909/* 373/*
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index fe1fe852181a..7312a265545f 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -146,21 +146,16 @@ static void zero_ctor(void *addr, struct kmem_cache *cache, unsigned long flags)
 	memset(addr, 0, kmem_cache_size(cache));
 }
 
-#ifdef CONFIG_PPC_64K_PAGES
-static const unsigned int pgtable_cache_size[3] = {
-	PTE_TABLE_SIZE, PMD_TABLE_SIZE, PGD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-	"pte_pmd_cache", "pmd_cache", "pgd_cache",
-};
-#else
 static const unsigned int pgtable_cache_size[2] = {
-	PTE_TABLE_SIZE, PMD_TABLE_SIZE
+	PGD_TABLE_SIZE, PMD_TABLE_SIZE
 };
 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-	"pgd_pte_cache", "pud_pmd_cache",
-};
+#ifdef CONFIG_PPC_64K_PAGES
+	"pgd_cache", "pmd_cache",
+#else
+	"pgd_cache", "pud_pmd_cache",
 #endif /* CONFIG_PPC_64K_PAGES */
+};
 
 #ifdef CONFIG_HUGETLB_PAGE
 /* Hugepages need one extra cache, initialized in hugetlbpage.c. We
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1a6e08f3298f..246eeea40ece 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -31,6 +31,7 @@
 #include <linux/highmem.h>
 #include <linux/initrd.h>
 #include <linux/pagemap.h>
+#include <linux/suspend.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -276,6 +277,28 @@ void __init do_init_bootmem(void)
 	init_bootmem_done = 1;
 }
 
+/* mark pages that don't exist as nosave */
+static int __init mark_nonram_nosave(void)
+{
+	unsigned long lmb_next_region_start_pfn,
+		      lmb_region_max_pfn;
+	int i;
+
+	for (i = 0; i < lmb.memory.cnt - 1; i++) {
+		lmb_region_max_pfn =
+			(lmb.memory.region[i].base >> PAGE_SHIFT) +
+			(lmb.memory.region[i].size >> PAGE_SHIFT);
+		lmb_next_region_start_pfn =
+			lmb.memory.region[i+1].base >> PAGE_SHIFT;
+
+		if (lmb_region_max_pfn < lmb_next_region_start_pfn)
+			register_nosave_region(lmb_region_max_pfn,
+					       lmb_next_region_start_pfn);
+	}
+
+	return 0;
+}
+
 /*
  * paging_init() sets up the page tables - in fact we've already done this.
  */
@@ -307,6 +330,8 @@ void __init paging_init(void)
 	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
 #endif
 	free_area_init_nodes(max_zone_pfns);
+
+	mark_nonram_nosave();
 }
 #endif /* ! CONFIG_NEED_MULTIPLE_NODES */
 
diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c
index 90a06ac02d5e..7a78cdc0515a 100644
--- a/arch/powerpc/mm/mmu_context_64.c
+++ b/arch/powerpc/mm/mmu_context_64.c
@@ -28,6 +28,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int index;
 	int err;
+	int new_context = (mm->context.id == 0);
 
 again:
 	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
@@ -50,9 +51,18 @@ again:
 	}
 
 	mm->context.id = index;
+#ifdef CONFIG_PPC_MM_SLICES
+	/* The old code would re-promote on fork, we don't do that
+	 * when using slices as it could cause problem promoting slices
+	 * that have been forced down to 4K
+	 */
+	if (new_context)
+		slice_set_user_psize(mm, mmu_virtual_psize);
+#else
 	mm->context.user_psize = mmu_virtual_psize;
 	mm->context.sllp = SLB_VSID_USER |
 		mmu_psize_defs[mmu_virtual_psize].sllp;
+#endif
 
 	return 0;
 }
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 05066674a7a0..ec1421a20aaa 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -185,7 +185,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 
 	if (Hash == 0)
 		return;
-	pmd = pmd_offset(pgd_offset(mm, ea), ea);
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
 	if (!pmd_none(*pmd))
 		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
 }
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 224e960650a0..304375a73574 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -198,12 +198,6 @@ void slb_initialize(void)
 	static int slb_encoding_inited;
 	extern unsigned int *slb_miss_kernel_load_linear;
 	extern unsigned int *slb_miss_kernel_load_io;
-#ifdef CONFIG_HUGETLB_PAGE
-	extern unsigned int *slb_miss_user_load_huge;
-	unsigned long huge_llp;
-
-	huge_llp = mmu_psize_defs[mmu_huge_psize].sllp;
-#endif
 
 	/* Prepare our SLB miss handler based on our page size */
 	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
@@ -220,11 +214,6 @@ void slb_initialize(void)
 
 	DBG("SLB: linear LLP = %04x\n", linear_llp);
 	DBG("SLB: io LLP = %04x\n", io_llp);
-#ifdef CONFIG_HUGETLB_PAGE
-	patch_slb_encoding(slb_miss_user_load_huge,
-			   SLB_VSID_USER | huge_llp);
-	DBG("SLB: huge LLP = %04x\n", huge_llp);
-#endif
 	}
 
 	get_paca()->stab_rr = SLB_NUM_BOLTED;
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index b10e4707d7c1..cd1a93d4948c 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -82,31 +82,45 @@ _GLOBAL(slb_miss_kernel_load_io)
82 srdi. r9,r10,USER_ESID_BITS 82 srdi. r9,r10,USER_ESID_BITS
83 bne- 8f /* invalid ea bits set */ 83 bne- 8f /* invalid ea bits set */
84 84
85 /* Figure out if the segment contains huge pages */ 85
86#ifdef CONFIG_HUGETLB_PAGE 86 /* when using slices, we extract the psize off the slice bitmaps
87BEGIN_FTR_SECTION 87 * and then we need to get the sllp encoding off the mmu_psize_defs
88 b 1f 88 * array.
89END_FTR_SECTION_IFCLR(CPU_FTR_16M_PAGE) 89 *
90 * XXX This is a bit inefficient especially for the normal case,
91 * so we should try to implement a fast path for the standard page
92 * size using the old sllp value so we avoid the array. We cannot
93 * really do dynamic patching unfortunately as processes might flip
94 * between 4k and 64k standard page size
95 */
96#ifdef CONFIG_PPC_MM_SLICES
90 cmpldi r10,16 97 cmpldi r10,16
91 98
92 lhz r9,PACALOWHTLBAREAS(r13) 99 /* Get the slice index * 4 in r11 and matching slice size mask in r9 */
93 mr r11,r10 100 ld r9,PACALOWSLICESPSIZE(r13)
101 sldi r11,r10,2
94 blt 5f 102 blt 5f
103 ld r9,PACAHIGHSLICEPSIZE(r13)
104 srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2)
105 andi. r11,r11,0x3c
95 106
96 lhz r9,PACAHIGHHTLBAREAS(r13) 1075: /* Extract the psize and multiply to get an array offset */
97 srdi r11,r10,(HTLB_AREA_SHIFT-SID_SHIFT) 108 srd r9,r9,r11
98 109 andi. r9,r9,0xf
995: srd r9,r9,r11 110 mulli r9,r9,MMUPSIZEDEFSIZE
100 andi. r9,r9,1
101 beq 1f
102_GLOBAL(slb_miss_user_load_huge)
103 li r11,0
104 b 2f
1051:
106#endif /* CONFIG_HUGETLB_PAGE */
107 111
112 /* Now get to the array and obtain the sllp
113 */
114 ld r11,PACATOC(r13)
115 ld r11,mmu_psize_defs@got(r11)
116 add r11,r11,r9
117 ld r11,MMUPSIZESLLP(r11)
118 ori r11,r11,SLB_VSID_USER
119#else
120 /* paca context sllp already contains the SLB_VSID_USER bits */
108 lhz r11,PACACONTEXTSLLP(r13) 121 lhz r11,PACACONTEXTSLLP(r13)
1092: 122#endif /* CONFIG_PPC_MM_SLICES */
123
110 ld r9,PACACONTEXTID(r13) 124 ld r9,PACACONTEXTID(r13)
111 rldimi r10,r9,USER_ESID_BITS,0 125 rldimi r10,r9,USER_ESID_BITS,0
112 b slb_finish_load 126 b slb_finish_load
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
new file mode 100644
index 000000000000..f833dba2a028
--- /dev/null
+++ b/arch/powerpc/mm/slice.c
@@ -0,0 +1,633 @@
1/*
2 * address space "slices" (meta-segments) support
3 *
4 * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation.
5 *
6 * Based on hugetlb implementation
7 *
8 * Copyright (C) 2003 David Gibson, IBM Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25#undef DEBUG
26
27#include <linux/kernel.h>
28#include <linux/mm.h>
29#include <linux/pagemap.h>
30#include <linux/err.h>
31#include <linux/spinlock.h>
32#include <linux/module.h>
33#include <asm/mman.h>
34#include <asm/mmu.h>
35#include <asm/spu.h>
36
37static spinlock_t slice_convert_lock = SPIN_LOCK_UNLOCKED;
38
39
40#ifdef DEBUG
41int _slice_debug = 1;
42
43static void slice_print_mask(const char *label, struct slice_mask mask)
44{
45 char *p, buf[16 + 3 + 16 + 1];
46 int i;
47
48 if (!_slice_debug)
49 return;
50 p = buf;
51 for (i = 0; i < SLICE_NUM_LOW; i++)
52 *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
53 *(p++) = ' ';
54 *(p++) = '-';
55 *(p++) = ' ';
56 for (i = 0; i < SLICE_NUM_HIGH; i++)
57 *(p++) = (mask.high_slices & (1 << i)) ? '1' : '0';
58 *(p++) = 0;
59
60 printk(KERN_DEBUG "%s:%s\n", label, buf);
61}
62
63#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
64
65#else
66
67static void slice_print_mask(const char *label, struct slice_mask mask) {}
68#define slice_dbg(fmt...)
69
70#endif
71
72static struct slice_mask slice_range_to_mask(unsigned long start,
73 unsigned long len)
74{
75 unsigned long end = start + len - 1;
76 struct slice_mask ret = { 0, 0 };
77
78 if (start < SLICE_LOW_TOP) {
79 unsigned long mend = min(end, SLICE_LOW_TOP);
80 unsigned long mstart = min(start, SLICE_LOW_TOP);
81
82 ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
83 - (1u << GET_LOW_SLICE_INDEX(mstart));
84 }
85
86 if ((start + len) > SLICE_LOW_TOP)
87 ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1))
88 - (1u << GET_HIGH_SLICE_INDEX(start));
89
90 return ret;
91}
92
93static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
94 unsigned long len)
95{
96 struct vm_area_struct *vma;
97
98 if ((mm->task_size - len) < addr)
99 return 0;
100 vma = find_vma(mm, addr);
101 return (!vma || (addr + len) <= vma->vm_start);
102}
103
104static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
105{
106 return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT,
107 1ul << SLICE_LOW_SHIFT);
108}
109
110static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
111{
112 unsigned long start = slice << SLICE_HIGH_SHIFT;
113 unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
114
115 /* Hack, so that each addresses is controlled by exactly one
116 * of the high or low area bitmaps, the first high area starts
117 * at 4GB, not 0 */
118 if (start == 0)
119 start = SLICE_LOW_TOP;
120
121 return !slice_area_is_free(mm, start, end - start);
122}
123
124static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
125{
126 struct slice_mask ret = { 0, 0 };
127 unsigned long i;
128
129 for (i = 0; i < SLICE_NUM_LOW; i++)
130 if (!slice_low_has_vma(mm, i))
131 ret.low_slices |= 1u << i;
132
133 if (mm->task_size <= SLICE_LOW_TOP)
134 return ret;
135
136 for (i = 0; i < SLICE_NUM_HIGH; i++)
137 if (!slice_high_has_vma(mm, i))
138 ret.high_slices |= 1u << i;
139
140 return ret;
141}
142
143static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
144{
145 struct slice_mask ret = { 0, 0 };
146 unsigned long i;
147 u64 psizes;
148
149 psizes = mm->context.low_slices_psize;
150 for (i = 0; i < SLICE_NUM_LOW; i++)
151 if (((psizes >> (i * 4)) & 0xf) == psize)
152 ret.low_slices |= 1u << i;
153
154 psizes = mm->context.high_slices_psize;
155 for (i = 0; i < SLICE_NUM_HIGH; i++)
156 if (((psizes >> (i * 4)) & 0xf) == psize)
157 ret.high_slices |= 1u << i;
158
159 return ret;
160}
161
162static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
163{
164 return (mask.low_slices & available.low_slices) == mask.low_slices &&
165 (mask.high_slices & available.high_slices) == mask.high_slices;
166}
167
168static void slice_flush_segments(void *parm)
169{
170 struct mm_struct *mm = parm;
171 unsigned long flags;
172
173 if (mm != current->active_mm)
174 return;
175
176 /* update the paca copy of the context struct */
177 get_paca()->context = current->active_mm->context;
178
179 local_irq_save(flags);
180 slb_flush_and_rebolt();
181 local_irq_restore(flags);
182}
183
184static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
185{
186 /* Write the new slice psize bits */
187 u64 lpsizes, hpsizes;
188 unsigned long i, flags;
189
190 slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
191 slice_print_mask(" mask", mask);
192
193 /* We need to use a spinlock here to protect against
194 * concurrent 64k -> 4k demotion ...
195 */
196 spin_lock_irqsave(&slice_convert_lock, flags);
197
198 lpsizes = mm->context.low_slices_psize;
199 for (i = 0; i < SLICE_NUM_LOW; i++)
200 if (mask.low_slices & (1u << i))
201 lpsizes = (lpsizes & ~(0xful << (i * 4))) |
202 (((unsigned long)psize) << (i * 4));
203
204 hpsizes = mm->context.high_slices_psize;
205 for (i = 0; i < SLICE_NUM_HIGH; i++)
206 if (mask.high_slices & (1u << i))
207 hpsizes = (hpsizes & ~(0xful << (i * 4))) |
208 (((unsigned long)psize) << (i * 4));
209
210 mm->context.low_slices_psize = lpsizes;
211 mm->context.high_slices_psize = hpsizes;
212
213 slice_dbg(" lsps=%lx, hsps=%lx\n",
214 mm->context.low_slices_psize,
215 mm->context.high_slices_psize);
216
217 spin_unlock_irqrestore(&slice_convert_lock, flags);
218 mb();
219
220 /* XXX this is sub-optimal but will do for now */
221 on_each_cpu(slice_flush_segments, mm, 0, 1);
222#ifdef CONFIG_SPU_BASE
223 spu_flush_all_slbs(mm);
224#endif
225}
226
227static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
228 unsigned long len,
229 struct slice_mask available,
230 int psize, int use_cache)
231{
232 struct vm_area_struct *vma;
233 unsigned long start_addr, addr;
234 struct slice_mask mask;
235 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
236
237 if (use_cache) {
238 if (len <= mm->cached_hole_size) {
239 start_addr = addr = TASK_UNMAPPED_BASE;
240 mm->cached_hole_size = 0;
241 } else
242 start_addr = addr = mm->free_area_cache;
243 } else
244 start_addr = addr = TASK_UNMAPPED_BASE;
245
246full_search:
247 for (;;) {
248 addr = _ALIGN_UP(addr, 1ul << pshift);
249 if ((TASK_SIZE - len) < addr)
250 break;
251 vma = find_vma(mm, addr);
252 BUG_ON(vma && (addr >= vma->vm_end));
253
254 mask = slice_range_to_mask(addr, len);
255 if (!slice_check_fit(mask, available)) {
256 if (addr < SLICE_LOW_TOP)
257 addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT);
258 else
259 addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT);
260 continue;
261 }
262 if (!vma || addr + len <= vma->vm_start) {
263 /*
264 * Remember the place where we stopped the search:
265 */
266 if (use_cache)
267 mm->free_area_cache = addr + len;
268 return addr;
269 }
270 if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
271 mm->cached_hole_size = vma->vm_start - addr;
272 addr = vma->vm_end;
273 }
274
275 /* Make sure we didn't miss any holes */
276 if (use_cache && start_addr != TASK_UNMAPPED_BASE) {
277 start_addr = addr = TASK_UNMAPPED_BASE;
278 mm->cached_hole_size = 0;
279 goto full_search;
280 }
281 return -ENOMEM;
282}
283
284static unsigned long slice_find_area_topdown(struct mm_struct *mm,
285 unsigned long len,
286 struct slice_mask available,
287 int psize, int use_cache)
288{
289 struct vm_area_struct *vma;
290 unsigned long addr;
291 struct slice_mask mask;
292 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
293
294 /* check if free_area_cache is useful for us */
295 if (use_cache) {
296 if (len <= mm->cached_hole_size) {
297 mm->cached_hole_size = 0;
298 mm->free_area_cache = mm->mmap_base;
299 }
300
301 /* either no address requested or can't fit in requested
302 * address hole
303 */
304 addr = mm->free_area_cache;
305
306 /* make sure it can fit in the remaining address space */
307 if (addr > len) {
308 addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
309 mask = slice_range_to_mask(addr, len);
310 if (slice_check_fit(mask, available) &&
311 slice_area_is_free(mm, addr, len))
312 /* remember the address as a hint for
313 * next time
314 */
315 return (mm->free_area_cache = addr);
316 }
317 }
318
319 addr = mm->mmap_base;
320 while (addr > len) {
321 /* Go down by chunk size */
322 addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
323
324 /* Check for hit with different page size */
325 mask = slice_range_to_mask(addr, len);
326 if (!slice_check_fit(mask, available)) {
327 if (addr < SLICE_LOW_TOP)
328 addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
329 else if (addr < (1ul << SLICE_HIGH_SHIFT))
330 addr = SLICE_LOW_TOP;
331 else
332 addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
333 continue;
334 }
335
336 /*
337 * Lookup failure means no vma is above this address,
338 * else if new region fits below vma->vm_start,
339 * return with success:
340 */
341 vma = find_vma(mm, addr);
342 if (!vma || (addr + len) <= vma->vm_start) {
343 /* remember the address as a hint for next time */
344 if (use_cache)
345 mm->free_area_cache = addr;
346 return addr;
347 }
348
349 /* remember the largest hole we saw so far */
350 if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
351 mm->cached_hole_size = vma->vm_start - addr;
352
353 /* try just below the current vma->vm_start */
354 addr = vma->vm_start;
355 }
356
357 /*
358 * A failed mmap() very likely causes application failure,
359 * so fall back to the bottom-up function here. This scenario
360 * can happen with large stack limits and large mmap()
361 * allocations.
362 */
363 addr = slice_find_area_bottomup(mm, len, available, psize, 0);
364
365 /*
366 * Restore the topdown base:
367 */
368 if (use_cache) {
369 mm->free_area_cache = mm->mmap_base;
370 mm->cached_hole_size = ~0UL;
371 }
372
373 return addr;
374}
375
376
377static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
378 struct slice_mask mask, int psize,
379 int topdown, int use_cache)
380{
381 if (topdown)
382 return slice_find_area_topdown(mm, len, mask, psize, use_cache);
383 else
384 return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
385}
386
387unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
388 unsigned long flags, unsigned int psize,
389 int topdown, int use_cache)
390{
391 struct slice_mask mask;
392 struct slice_mask good_mask;
393 struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
394 int pmask_set = 0;
395 int fixed = (flags & MAP_FIXED);
396 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
397 struct mm_struct *mm = current->mm;
398
399 /* Sanity checks */
400 BUG_ON(mm->task_size == 0);
401
402 slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
403 slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n",
404 addr, len, flags, topdown, use_cache);
405
406 if (len > mm->task_size)
407 return -ENOMEM;
408 if (fixed && (addr & ((1ul << pshift) - 1)))
409 return -EINVAL;
410 if (fixed && addr > (mm->task_size - len))
411 return -EINVAL;
412
413 /* If hint, make sure it matches our alignment restrictions */
414 if (!fixed && addr) {
415 addr = _ALIGN_UP(addr, 1ul << pshift);
416 slice_dbg(" aligned addr=%lx\n", addr);
417 }
418
419 /* First makeup a "good" mask of slices that have the right size
420 * already
421 */
422 good_mask = slice_mask_for_size(mm, psize);
423 slice_print_mask(" good_mask", good_mask);
424
425 /* First check hint if it's valid or if we have MAP_FIXED */
426 if ((addr != 0 || fixed) && (mm->task_size - len) >= addr) {
427
428 /* Don't bother with hint if it overlaps a VMA */
429 if (!fixed && !slice_area_is_free(mm, addr, len))
430 goto search;
431
432 /* Build a mask for the requested range */
433 mask = slice_range_to_mask(addr, len);
434 slice_print_mask(" mask", mask);
435
436 /* Check if we fit in the good mask. If we do, we just return,
437 * nothing else to do
438 */
439 if (slice_check_fit(mask, good_mask)) {
440 slice_dbg(" fits good !\n");
441 return addr;
442 }
443
444 /* We don't fit in the good mask, check what other slices are
445 * empty and thus can be converted
446 */
447 potential_mask = slice_mask_for_free(mm);
448 potential_mask.low_slices |= good_mask.low_slices;
449 potential_mask.high_slices |= good_mask.high_slices;
450 pmask_set = 1;
451 slice_print_mask(" potential", potential_mask);
452 if (slice_check_fit(mask, potential_mask)) {
453 slice_dbg(" fits potential !\n");
454 goto convert;
455 }
456 }
457
458 /* If we have MAP_FIXED and failed the above step, then error out */
459 if (fixed)
460 return -EBUSY;
461
462 search:
463 slice_dbg(" search...\n");
464
465 /* Now let's see if we can find something in the existing slices
466 * for that size
467 */
468 addr = slice_find_area(mm, len, good_mask, psize, topdown, use_cache);
469 if (addr != -ENOMEM) {
470 /* Found within the good mask, we don't have to setup,
471 * we thus return directly
472 */
473 slice_dbg(" found area at 0x%lx\n", addr);
474 return addr;
475 }
476
477 /* Won't fit, check what can be converted */
478 if (!pmask_set) {
479 potential_mask = slice_mask_for_free(mm);
480 potential_mask.low_slices |= good_mask.low_slices;
481 potential_mask.high_slices |= good_mask.high_slices;
482 pmask_set = 1;
483 slice_print_mask(" potential", potential_mask);
484 }
485
486 /* Now let's see if we can find something in the existing slices
487 * for that size
488 */
489 addr = slice_find_area(mm, len, potential_mask, psize, topdown,
490 use_cache);
491 if (addr == -ENOMEM)
492 return -ENOMEM;
493
494 mask = slice_range_to_mask(addr, len);
495 slice_dbg(" found potential area at 0x%lx\n", addr);
496 slice_print_mask(" mask", mask);
497
498 convert:
499 slice_convert(mm, mask, psize);
500 return addr;
501
502}
503EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
504
505unsigned long arch_get_unmapped_area(struct file *filp,
506 unsigned long addr,
507 unsigned long len,
508 unsigned long pgoff,
509 unsigned long flags)
510{
511 return slice_get_unmapped_area(addr, len, flags,
512 current->mm->context.user_psize,
513 0, 1);
514}
515
516unsigned long arch_get_unmapped_area_topdown(struct file *filp,
517 const unsigned long addr0,
518 const unsigned long len,
519 const unsigned long pgoff,
520 const unsigned long flags)
521{
522 return slice_get_unmapped_area(addr0, len, flags,
523 current->mm->context.user_psize,
524 1, 1);
525}
526
527unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
528{
529 u64 psizes;
530 int index;
531
532 if (addr < SLICE_LOW_TOP) {
533 psizes = mm->context.low_slices_psize;
534 index = GET_LOW_SLICE_INDEX(addr);
535 } else {
536 psizes = mm->context.high_slices_psize;
537 index = GET_HIGH_SLICE_INDEX(addr);
538 }
539
540 return (psizes >> (index * 4)) & 0xf;
541}
542EXPORT_SYMBOL_GPL(get_slice_psize);
543
544/*
545 * This is called by hash_page when it needs to do a lazy conversion of
546 * an address space from real 64K pages to combo 4K pages (typically
547 * when hitting a non cacheable mapping on a processor or hypervisor
548 * that won't allow them for 64K pages).
549 *
550 * This is also called in init_new_context() to change back the user
551 * psize from whatever the parent context had it set to
552 *
553 * This function will only change the content of the {low,high)_slice_psize
554 * masks, it will not flush SLBs as this shall be handled lazily by the
555 * caller.
556 */
557void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
558{
559 unsigned long flags, lpsizes, hpsizes;
560 unsigned int old_psize;
561 int i;
562
563 slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize);
564
565 spin_lock_irqsave(&slice_convert_lock, flags);
566
567 old_psize = mm->context.user_psize;
568 slice_dbg(" old_psize=%d\n", old_psize);
569 if (old_psize == psize)
570 goto bail;
571
572 mm->context.user_psize = psize;
573 wmb();
574
575 lpsizes = mm->context.low_slices_psize;
576 for (i = 0; i < SLICE_NUM_LOW; i++)
577 if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
578 lpsizes = (lpsizes & ~(0xful << (i * 4))) |
579 (((unsigned long)psize) << (i * 4));
580
581 hpsizes = mm->context.high_slices_psize;
582 for (i = 0; i < SLICE_NUM_HIGH; i++)
583 if (((hpsizes >> (i * 4)) & 0xf) == old_psize)
584 hpsizes = (hpsizes & ~(0xful << (i * 4))) |
585 (((unsigned long)psize) << (i * 4));
586
587 mm->context.low_slices_psize = lpsizes;
588 mm->context.high_slices_psize = hpsizes;
589
590 slice_dbg(" lsps=%lx, hsps=%lx\n",
591 mm->context.low_slices_psize,
592 mm->context.high_slices_psize);
593
594 bail:
595 spin_unlock_irqrestore(&slice_convert_lock, flags);
596}
597
598/*
599 * is_hugepage_only_range() is used by generic code to verify wether
600 * a normal mmap mapping (non hugetlbfs) is valid on a given area.
601 *
602 * until the generic code provides a more generic hook and/or starts
603 * calling arch get_unmapped_area for MAP_FIXED (which our implementation
604 * here knows how to deal with), we hijack it to keep standard mappings
605 * away from us.
606 *
607 * because of that generic code limitation, MAP_FIXED mapping cannot
608 * "convert" back a slice with no VMAs to the standard page size, only
609 * get_unmapped_area() can. It would be possible to fix it here but I
610 * prefer working on fixing the generic code instead.
611 *
612 * WARNING: This will not work if hugetlbfs isn't enabled since the
613 * generic code will redefine that function as 0 in that. This is ok
614 * for now as we only use slices with hugetlbfs enabled. This should
615 * be fixed as the generic code gets fixed.
616 */
617int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
618 unsigned long len)
619{
620 struct slice_mask mask, available;
621
622 mask = slice_range_to_mask(addr, len);
623 available = slice_mask_for_size(mm, mm->context.user_psize);
624
625#if 0 /* too verbose */
626 slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
627 mm, addr, len);
628 slice_print_mask(" mask", mask);
629 slice_print_mask(" available", available);
630#endif
631 return !slice_check_fit(mask, available);
632}
633
diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c
index 925ff70be8ba..6a69417cbc0e 100644
--- a/arch/powerpc/mm/tlb_32.c
+++ b/arch/powerpc/mm/tlb_32.c
@@ -111,7 +111,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
 	if (start >= end)
 		return;
 	end = (end - 1) | ~PAGE_MASK;
-	pmd = pmd_offset(pgd_offset(mm, start), start);
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
 	for (;;) {
 		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
 		if (pmd_end > end)
@@ -169,7 +169,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 		return;
 	}
 	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
-	pmd = pmd_offset(pgd_offset(mm, vmaddr), vmaddr);
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
 	if (!pmd_none(*pmd))
 		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
 	FINISH_FLUSH;
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index fd8d08c325eb..2bfc4d7e1aa2 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -143,16 +143,22 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	 */
 	addr &= PAGE_MASK;
 
-	/* Get page size (maybe move back to caller) */
+	/* Get page size (maybe move back to caller).
+	 *
+	 * NOTE: when using special 64K mappings in 4K environment like
+	 * for SPEs, we obtain the page size from the slice, which thus
+	 * must still exist (and thus the VMA not reused) at the time
+	 * of this call
+	 */
 	if (huge) {
 #ifdef CONFIG_HUGETLB_PAGE
 		psize = mmu_huge_psize;
 #else
 		BUG();
-		psize = pte_pagesize_index(pte); /* shutup gcc */
+		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
 #endif
 	} else
-		psize = pte_pagesize_index(pte);
+		psize = pte_pagesize_index(mm, addr, pte);
 
 	/* Build full vaddr */
 	if (!is_kernel_addr(addr)) {