[PATCH] Four level pagetables for ppc64

Implement 4-level pagetables for ppc64. This patch implements full four-level page tables for ppc64, thereby extending the usable user address range to 44 bits (16T). The patch uses a full page for the tables at the bottom and top level, and a quarter page for the intermediate levels. It uses full 64-bit pointers at every level, thus also increasing the addressable range of physical memory. This patch also tweaks the VSID allocation to allow a matching range for user addresses (this halves the number of available contexts) and adds some #if and BUILD_BUG sanity checks. Signed-off-by: David Gibson <dwg@au1.ibm.com> Signed-off-by: Paul Mackerras <paulus@samba.org>
author: David Gibson <david@gibson.dropbear.id.au> 2005-08-05 05:39:06 -0400
committer: Paul Mackerras <paulus@samba.org> 2005-08-28 20:53:31 -0400
commit: e28f7faf05159f1cfd564596f5e6178edba6bd49 (patch)
tree: 45534d2c33bff8b64e3fd155fba55146cb7518e6 /arch
parent: decd300b30e499fe6be1bbfc5650fc971de8c1fa (diff)
6 files changed, 158 insertions, 192 deletions
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
index 623b5d130c31..65d6e8527948 100644
--- a/arch/ppc64/mm/hash_utils.c
+++ b/arch/ppc64/mm/hash_utils.c
@@ -302,7 +302,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
        int local = 0;
        cpumask_t tmp;
-        if ((ea & ~REGION_MASK) > EADDR_MASK)
+        if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
                return 1;
        switch (REGION_ID(ea)) {
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index f9524602818d..a13e44230a6f 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -27,124 +27,91 @@
 #include <linux/sysctl.h>
-#define HUGEPGDIR_SHIFT         (HPAGE_SHIFT + PAGE_SHIFT - 3)
+/* Modelled after find_linux_pte() */
-#define HUGEPGDIR_SIZE          (1UL << HUGEPGDIR_SHIFT)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-#define HUGEPGDIR_MASK          (~(HUGEPGDIR_SIZE-1))
-#define HUGEPTE_INDEX_SIZE      9
-#define HUGEPGD_INDEX_SIZE      10
-#define PTRS_PER_HUGEPTE        (1 << HUGEPTE_INDEX_SIZE)
-#define PTRS_PER_HUGEPGD        (1 << HUGEPGD_INDEX_SIZE)
-static inline int hugepgd_index(unsigned long addr)
-{
-        return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
-}
-static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
 {
-        int index;
+        pgd_t *pg;
+        pud_t *pu;
+        pmd_t *pm;
+        pte_t *pt;
-        if (! mm->context.huge_pgdir)
+        BUG_ON(! in_hugepage_area(mm->context, addr));
-                return NULL;
+        addr &= HPAGE_MASK;
+        pg = pgd_offset(mm, addr);
+        if (!pgd_none(*pg)) {
+                pu = pud_offset(pg, addr);
+                if (!pud_none(*pu)) {
+                        pm = pmd_offset(pu, addr);
+                        pt = (pte_t *)pm;
+                        BUG_ON(!pmd_none(*pm)
+                               && !(pte_present(*pt) && pte_huge(*pt)));
+                        return pt;
+                }
+        }
-        index = hugepgd_index(addr);
+        return NULL;
-        BUG_ON(index >= PTRS_PER_HUGEPGD);
-        return (pud_t *)(mm->context.huge_pgdir + index);
 }
-static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
-        int index;
+        pgd_t *pg;
+        pud_t *pu;
-        if (pud_none(*dir))
+        pmd_t *pm;
-                return NULL;
+        pte_t *pt;
-        index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
-        return (pte_t *)pud_page(*dir) + index;
-}
-static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
-{
        BUG_ON(! in_hugepage_area(mm->context, addr));
-        if (! mm->context.huge_pgdir) {
+        addr &= HPAGE_MASK;
-                pgd_t *new;
-                spin_unlock(&mm->page_table_lock);
-                /* Don't use pgd_alloc(), because we want __GFP_REPEAT */
-                new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-                BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-                spin_lock(&mm->page_table_lock);
-                /*
-                 * Because we dropped the lock, we should re-check the
-                 * entry, as somebody else could have populated it..
-                 */
-                if (mm->context.huge_pgdir)
-                        pgd_free(new);
-                else
-                        mm->context.huge_pgdir = new;
-        }
-        return hugepgd_offset(mm, addr);
-}
-static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
+        pg = pgd_offset(mm, addr);
-{
+        pu = pud_alloc(mm, pg, addr);
-        if (! pud_present(*dir)) {
-                pte_t *new;
-                spin_unlock(&mm->page_table_lock);
+        if (pu) {
-                new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
+                pm = pmd_alloc(mm, pu, addr);
-                BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
+                if (pm) {
-                spin_lock(&mm->page_table_lock);
+                        pt = (pte_t *)pm;
-                /*
+                        BUG_ON(!pmd_none(*pm)
-                 * Because we dropped the lock, we should re-check the
+                               && !(pte_present(*pt) && pte_huge(*pt)));
-                 * entry, as somebody else could have populated it..
+                        return pt;
-                 */
-                if (pud_present(*dir)) {
-                        if (new)
-                                kmem_cache_free(zero_cache, new);
-                } else {
-                        struct page *ptepage;
-                        if (! new)
-                                return NULL;
-                        ptepage = virt_to_page(new);
-                        ptepage->mapping = (void *) mm;
-                        ptepage->index = addr & HUGEPGDIR_MASK;
-                        pud_populate(mm, dir, new);
                }
        }
-        return hugepte_offset(dir, addr);
+        return NULL;
 }
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+#define HUGEPTE_BATCH_SIZE      (HPAGE_SIZE / PMD_SIZE)
-{
-        pud_t *pud;
-        BUG_ON(! in_hugepage_area(mm->context, addr));
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+                     pte_t *ptep, pte_t pte)
+{
+        int i;
-        pud = hugepgd_offset(mm, addr);
+        if (pte_present(*ptep)) {
-        if (! pud)
+                pte_clear(mm, addr, ptep);
-                return NULL;
+                flush_tlb_pending();
+        }
-        return hugepte_offset(pud, addr);
+        for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+                *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+                ptep++;
+        }
 }
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+                              pte_t *ptep)
 {
-        pud_t *pud;
+        unsigned long old = pte_update(ptep, ~0UL);
+        int i;
-        BUG_ON(! in_hugepage_area(mm->context, addr));
+        if (old & _PAGE_HASHPTE)
+                hpte_update(mm, addr, old, 0);
-        pud = hugepgd_alloc(mm, addr);
+        for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
-        if (! pud)
+                ptep[i] = __pte(0);
-                return NULL;
-        return hugepte_alloc(mm, pud, addr);
+        return __pte(old);
 }
 /*
@@ -541,42 +508,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        }
 }
-void hugetlb_mm_free_pgd(struct mm_struct *mm)
-{
-        int i;
-        pgd_t *pgdir;
-        spin_lock(&mm->page_table_lock);
-        pgdir = mm->context.huge_pgdir;
-        if (! pgdir)
-                goto out;
-        mm->context.huge_pgdir = NULL;
-        /* cleanup any hugepte pages leftover */
-        for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
-                pud_t *pud = (pud_t *)(pgdir + i);
-                if (! pud_none(*pud)) {
-                        pte_t *pte = (pte_t *)pud_page(*pud);
-                        struct page *ptepage = virt_to_page(pte);
-                        ptepage->mapping = NULL;
-                        BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
-                        kmem_cache_free(zero_cache, pte);
-                }
-                pud_clear(pud);
-        }
-        BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
-        kmem_cache_free(zero_cache, pgdir);
- out:
-        spin_unlock(&mm->page_table_lock);
-}
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local)
 {
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
index b6e75b891ac0..c65b87b92756 100644
--- a/arch/ppc64/mm/imalloc.c
+++ b/arch/ppc64/mm/imalloc.c
@@ -31,7 +31,7 @@ static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
                        break;
                if ((unsigned long)tmp->addr >= ioremap_bot)
                        addr = tmp->size + (unsigned long) tmp->addr;
-                if (addr > IMALLOC_END-size) 
+                if (addr >= IMALLOC_END-size)
                        return 1;
        }
        *im_addr = addr;
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index e58a24d42879..87f256df8de5 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -66,6 +66,14 @@
 #include <asm/vdso.h>
 #include <asm/imalloc.h>
+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
 int mem_init_done;
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
@@ -226,7 +234,7 @@ void __iomem * __ioremap(unsigned long addr, unsigned long size,
         * Before that, we map using addresses going
         * up from ioremap_bot.  imalloc will use
         * the addresses from ioremap_bot through
-         * IMALLOC_END (0xE000001fffffffff)
+         * IMALLOC_END
         * 
         */
        pa = addr & PAGE_MASK;
@@ -417,12 +425,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
        int index;
        int err;
-#ifdef CONFIG_HUGETLB_PAGE
-        /* We leave htlb_segs as it was, but for a fork, we need to
-         * clear the huge_pgdir. */
-        mm->context.huge_pgdir = NULL;
-#endif
 again:
        if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
                return -ENOMEM;
@@ -453,8 +455,6 @@ void destroy_context(struct mm_struct *mm)
        spin_unlock(&mmu_context_lock);
        mm->context.id = NO_CONTEXT;
-        hugetlb_mm_free_pgd(mm);
 }
 /*
@@ -833,23 +833,43 @@ void __iomem * reserve_phb_iospace(unsigned long size)
        return virt_addr;
 }
-kmem_cache_t *zero_cache;
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
-static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
 {
-        memset(pte, 0, PAGE_SIZE);
+        memset(addr, 0, kmem_cache_size(cache));
 }
+static const int pgtable_cache_size[2] = {
+        PTE_TABLE_SIZE, PMD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+        "pgd_pte_cache", "pud_pmd_cache",
+};
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
 void pgtable_cache_init(void)
 {
-        zero_cache = kmem_cache_create("zero",
+        int i;
-                                PAGE_SIZE,
-                                0,
+        BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
-                                SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+        BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
-                                zero_ctor,
+        BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
-                                NULL);
+        BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
-        if (!zero_cache)
-                panic("pgtable_cache_init(): could not create zero_cache!\n");
+        for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
+                int size = pgtable_cache_size[i];
+                const char *name = pgtable_cache_name[i];
+                pgtable_cache[i] = kmem_cache_create(name,
+                                                     size, size,
+                                                     SLAB_HWCACHE_ALIGN
+                                                     | SLAB_MUST_HWCACHE_ALIGN,
+                                                     zero_ctor,
+                                                     NULL);
+                if (! pgtable_cache[i])
+                        panic("pgtable_cache_init(): could not create %s!\n",
+                              name);
+        }
 }
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
index 8379d678f70f..f20fc52483a7 100644
--- a/arch/ppc64/mm/slb_low.S
+++ b/arch/ppc64/mm/slb_low.S
@@ -91,7 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 0:      /* user address: proto-VSID = context<<15 | ESID */
        li      r11,SLB_VSID_USER
-        srdi.   r9,r3,13
+        srdi.   r9,r3,USER_ESID_BITS
        bne-    8f                      /* invalid ea bits set */
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c
index 26f0172c4527..d8a6593a13f0 100644
--- a/arch/ppc64/mm/tlb.c
+++ b/arch/ppc64/mm/tlb.c
@@ -41,7 +41,58 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 unsigned long pte_freelist_forced_free;
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
+struct pte_freelist_batch
+{
+        struct rcu_head rcu;
+        unsigned int    index;
+        pgtable_free_t  tables[0];
+};
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+#define PTE_FREELIST_SIZE \
+        ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+          / sizeof(pgtable_free_t))
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+        /* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+        pte_freelist_forced_free++;
+        smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+        pgtable_free(pgf);
+}
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+        struct pte_freelist_batch *batch =
+                container_of(head, struct pte_freelist_batch, rcu);
+        unsigned int i;
+        for (i = 0; i < batch->index; i++)
+                pgtable_free(batch->tables[i]);
+        free_page((unsigned long)batch);
+}
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+        INIT_RCU_HEAD(&batch->rcu);
+        call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 {
        /* This is safe as we are holding page_table_lock */
        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
@@ -49,19 +100,19 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
        if (atomic_read(&tlb->mm->mm_users) < 2 ||
            cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-                pte_free(ptepage);
+                pgtable_free(pgf);
                return;
        }
        if (*batchp == NULL) {
                *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
                if (*batchp == NULL) {
-                        pte_free_now(ptepage);
+                        pgtable_free_now(pgf);
                        return;
                }
                (*batchp)->index = 0;
        }
-        (*batchp)->pages[(*batchp)->index++] = ptepage;
+        (*batchp)->tables[(*batchp)->index++] = pgf;
        if ((*batchp)->index == PTE_FREELIST_SIZE) {
                pte_free_submit(*batchp);
                *batchp = NULL;
@@ -132,42 +183,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
        put_cpu();
 }
-#ifdef CONFIG_SMP
-static void pte_free_smp_sync(void *arg)
-{
-        /* Do nothing, just ensure we sync with all CPUs */
-}
-#endif
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-void pte_free_now(struct page *ptepage)
-{
-        pte_freelist_forced_free++;
-        smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-        pte_free(ptepage);
-}
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-        struct pte_freelist_batch *batch =
-                container_of(head, struct pte_freelist_batch, rcu);
-        unsigned int i;
-        for (i = 0; i < batch->index; i++)
-                pte_free(batch->pages[i]);
-        free_page((unsigned long)batch);
-}
-void pte_free_submit(struct pte_freelist_batch *batch)
-{
-        INIT_RCU_HEAD(&batch->rcu);
-        call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
 void pte_free_finish(void)
 {
        /* This is safe as we are holding page_table_lock */
author	David Gibson <david@gibson.dropbear.id.au>	2005-08-05 05:39:06 -0400
committer	Paul Mackerras <paulus@samba.org>	2005-08-28 20:53:31 -0400
commit	e28f7faf05159f1cfd564596f5e6178edba6bd49 (patch)
tree	45534d2c33bff8b64e3fd155fba55146cb7518e6 /arch
parent	decd300b30e499fe6be1bbfc5650fc971de8c1fa (diff)

diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c index 623b5d130c31..65d6e8527948 100644 --- a/arch/ppc64/mm/hash_utils.c +++ b/arch/ppc64/mm/hash_utils.c
@@ -302,7 +302,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
302	int local = 0;	302	int local = 0;
303	cpumask_t tmp;	303	cpumask_t tmp;
304		304
305	if ((ea & ~REGION_MASK) > EADDR_MASK)	305	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
306	return 1;	306	return 1;
307		307
308	switch (REGION_ID(ea)) {	308	switch (REGION_ID(ea)) {


diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c index f9524602818d..a13e44230a6f 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c
@@ -27,124 +27,91 @@
27		27
28	#include <linux/sysctl.h>	28	#include <linux/sysctl.h>
29		29
30	#define HUGEPGDIR_SHIFT (HPAGE_SHIFT + PAGE_SHIFT - 3)	30	/* Modelled after find_linux_pte() */
31	#define HUGEPGDIR_SIZE (1UL << HUGEPGDIR_SHIFT)	31	pte_t huge_pte_offset(struct mm_struct mm, unsigned long addr)
32	#define HUGEPGDIR_MASK (~(HUGEPGDIR_SIZE-1))
33
34	#define HUGEPTE_INDEX_SIZE 9
35	#define HUGEPGD_INDEX_SIZE 10
36
37	#define PTRS_PER_HUGEPTE (1 << HUGEPTE_INDEX_SIZE)
38	#define PTRS_PER_HUGEPGD (1 << HUGEPGD_INDEX_SIZE)
39
40	static inline int hugepgd_index(unsigned long addr)
41	{
42	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
43	}
44
45	static pud_t hugepgd_offset(struct mm_struct mm, unsigned long addr)
46	{	32	{
47	int index;	33	pgd_t *pg;
		34	pud_t *pu;
		35	pmd_t *pm;
		36	pte_t *pt;
48		37
49	if (! mm->context.huge_pgdir)	38	BUG_ON(! in_hugepage_area(mm->context, addr));
50	return NULL;
51		39
		40	addr &= HPAGE_MASK;
		41
		42	pg = pgd_offset(mm, addr);
		43	if (!pgd_none(*pg)) {
		44	pu = pud_offset(pg, addr);
		45	if (!pud_none(*pu)) {
		46	pm = pmd_offset(pu, addr);
		47	pt = (pte_t *)pm;
		48	BUG_ON(!pmd_none(*pm)
		49	&& !(pte_present(pt) && pte_huge(pt)));
		50	return pt;
		51	}
		52	}
52		53
53	index = hugepgd_index(addr);	54	return NULL;
54	BUG_ON(index >= PTRS_PER_HUGEPGD);
55	return (pud_t *)(mm->context.huge_pgdir + index);
56	}	55	}
57		56
58	static inline pte_t hugepte_offset(pud_t dir, unsigned long addr)	57	pte_t huge_pte_alloc(struct mm_struct mm, unsigned long addr)
59	{	58	{
60	int index;	59	pgd_t *pg;
61		60	pud_t *pu;
62	if (pud_none(*dir))	61	pmd_t *pm;
63	return NULL;	62	pte_t *pt;
64
65	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
66	return (pte_t )pud_page(dir) + index;
67	}
68		63
69	static pud_t hugepgd_alloc(struct mm_struct mm, unsigned long addr)
70	{
71	BUG_ON(! in_hugepage_area(mm->context, addr));	64	BUG_ON(! in_hugepage_area(mm->context, addr));
72		65
73	if (! mm->context.huge_pgdir) {	66	addr &= HPAGE_MASK;
74	pgd_t *new;
75	spin_unlock(&mm->page_table_lock);
76	/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
77	new = kmem_cache_alloc(zero_cache, GFP_KERNEL \| __GFP_REPEAT);
78	BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
79	spin_lock(&mm->page_table_lock);
80
81	/*
82	* Because we dropped the lock, we should re-check the
83	* entry, as somebody else could have populated it..
84	*/
85	if (mm->context.huge_pgdir)
86	pgd_free(new);
87	else
88	mm->context.huge_pgdir = new;
89	}
90	return hugepgd_offset(mm, addr);
91	}
92		67
93	static pte_t hugepte_alloc(struct mm_struct mm, pud_t *dir, unsigned long addr)	68	pg = pgd_offset(mm, addr);
94	{	69	pu = pud_alloc(mm, pg, addr);
95	if (! pud_present(*dir)) {
96	pte_t *new;
97		70
98	spin_unlock(&mm->page_table_lock);	71	if (pu) {
99	new = kmem_cache_alloc(zero_cache, GFP_KERNEL \| __GFP_REPEAT);	72	pm = pmd_alloc(mm, pu, addr);
100	BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));	73	if (pm) {
101	spin_lock(&mm->page_table_lock);	74	pt = (pte_t *)pm;
102	/*	75	BUG_ON(!pmd_none(*pm)
103	* Because we dropped the lock, we should re-check the	76	&& !(pte_present(pt) && pte_huge(pt)));
104	* entry, as somebody else could have populated it..	77	return pt;
105	*/
106	if (pud_present(*dir)) {
107	if (new)
108	kmem_cache_free(zero_cache, new);
109	} else {
110	struct page *ptepage;
111
112	if (! new)
113	return NULL;
114	ptepage = virt_to_page(new);
115	ptepage->mapping = (void *) mm;
116	ptepage->index = addr & HUGEPGDIR_MASK;
117	pud_populate(mm, dir, new);
118	}	78	}
119	}	79	}
120		80
121	return hugepte_offset(dir, addr);	81	return NULL;
122	}	82	}
123		83
124	pte_t huge_pte_offset(struct mm_struct mm, unsigned long addr)	84	#define HUGEPTE_BATCH_SIZE (HPAGE_SIZE / PMD_SIZE)
125	{
126	pud_t *pud;
127		85
128	BUG_ON(! in_hugepage_area(mm->context, addr));	86	void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		87	pte_t *ptep, pte_t pte)
		88	{
		89	int i;
129		90
130	pud = hugepgd_offset(mm, addr);	91	if (pte_present(*ptep)) {
131	if (! pud)	92	pte_clear(mm, addr, ptep);
132	return NULL;	93	flush_tlb_pending();
		94	}
133		95
134	return hugepte_offset(pud, addr);	96	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
		97	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
		98	ptep++;
		99	}
135	}	100	}
136		101
137	pte_t huge_pte_alloc(struct mm_struct mm, unsigned long addr)	102	pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
		103	pte_t *ptep)
138	{	104	{
139	pud_t *pud;	105	unsigned long old = pte_update(ptep, ~0UL);
		106	int i;
140		107
141	BUG_ON(! in_hugepage_area(mm->context, addr));	108	if (old & _PAGE_HASHPTE)
		109	hpte_update(mm, addr, old, 0);
142		110
143	pud = hugepgd_alloc(mm, addr);	111	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
144	if (! pud)	112	ptep[i] = __pte(0);
145	return NULL;
146		113
147	return hugepte_alloc(mm, pud, addr);	114	return __pte(old);
148	}	115	}
149		116
150	/*	117	/*
@@ -541,42 +508,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
541	}	508	}
542	}	509	}
543		510
544	void hugetlb_mm_free_pgd(struct mm_struct *mm)
545	{
546	int i;
547	pgd_t *pgdir;
548
549	spin_lock(&mm->page_table_lock);
550
551	pgdir = mm->context.huge_pgdir;
552	if (! pgdir)
553	goto out;
554
555	mm->context.huge_pgdir = NULL;
556
557	/* cleanup any hugepte pages leftover */
558	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
559	pud_t pud = (pud_t )(pgdir + i);
560
561	if (! pud_none(*pud)) {
562	pte_t pte = (pte_t )pud_page(*pud);
563	struct page *ptepage = virt_to_page(pte);
564
565	ptepage->mapping = NULL;
566
567	BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
568	kmem_cache_free(zero_cache, pte);
569	}
570	pud_clear(pud);
571	}
572
573	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
574	kmem_cache_free(zero_cache, pgdir);
575
576	out:
577	spin_unlock(&mm->page_table_lock);
578	}
579
580	int hash_huge_page(struct mm_struct *mm, unsigned long access,	511	int hash_huge_page(struct mm_struct *mm, unsigned long access,
581	unsigned long ea, unsigned long vsid, int local)	512	unsigned long ea, unsigned long vsid, int local)
582	{	513	{


diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c index b6e75b891ac0..c65b87b92756 100644 --- a/arch/ppc64/mm/imalloc.c +++ b/arch/ppc64/mm/imalloc.c
@@ -31,7 +31,7 @@ static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
31	break;	31	break;
32	if ((unsigned long)tmp->addr >= ioremap_bot)	32	if ((unsigned long)tmp->addr >= ioremap_bot)
33	addr = tmp->size + (unsigned long) tmp->addr;	33	addr = tmp->size + (unsigned long) tmp->addr;
34	if (addr > IMALLOC_END-size)	34	if (addr >= IMALLOC_END-size)
35	return 1;	35	return 1;
36	}	36	}
37	*im_addr = addr;	37	*im_addr = addr;


diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index e58a24d42879..87f256df8de5 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c
@@ -66,6 +66,14 @@
66	#include <asm/vdso.h>	66	#include <asm/vdso.h>
67	#include <asm/imalloc.h>	67	#include <asm/imalloc.h>
68		68
		69	#if PGTABLE_RANGE > USER_VSID_RANGE
		70	#warning Limited user VSID range means pagetable space is wasted
		71	#endif
		72
		73	#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
		74	#warning TASK_SIZE is smaller than it needs to be.
		75	#endif
		76
69	int mem_init_done;	77	int mem_init_done;
70	unsigned long ioremap_bot = IMALLOC_BASE;	78	unsigned long ioremap_bot = IMALLOC_BASE;
71	static unsigned long phbs_io_bot = PHBS_IO_BASE;	79	static unsigned long phbs_io_bot = PHBS_IO_BASE;
@@ -226,7 +234,7 @@ void __iomem * __ioremap(unsigned long addr, unsigned long size,
226	* Before that, we map using addresses going	234	* Before that, we map using addresses going
227	* up from ioremap_bot. imalloc will use	235	* up from ioremap_bot. imalloc will use
228	* the addresses from ioremap_bot through	236	* the addresses from ioremap_bot through
229	* IMALLOC_END (0xE000001fffffffff)	237	* IMALLOC_END
230	*	238	*
231	*/	239	*/
232	pa = addr & PAGE_MASK;	240	pa = addr & PAGE_MASK;
@@ -417,12 +425,6 @@ int init_new_context(struct task_struct tsk, struct mm_struct mm)
417	int index;	425	int index;
418	int err;	426	int err;
419		427
420	#ifdef CONFIG_HUGETLB_PAGE
421	/* We leave htlb_segs as it was, but for a fork, we need to
422	* clear the huge_pgdir. */
423	mm->context.huge_pgdir = NULL;
424	#endif
425
426	again:	428	again:
427	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))	429	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
428	return -ENOMEM;	430	return -ENOMEM;
@@ -453,8 +455,6 @@ void destroy_context(struct mm_struct *mm)
453	spin_unlock(&mmu_context_lock);	455	spin_unlock(&mmu_context_lock);
454		456
455	mm->context.id = NO_CONTEXT;	457	mm->context.id = NO_CONTEXT;
456
457	hugetlb_mm_free_pgd(mm);
458	}	458	}
459		459
460	/*	460	/*
@@ -833,23 +833,43 @@ void __iomem * reserve_phb_iospace(unsigned long size)
833	return virt_addr;	833	return virt_addr;
834	}	834	}
835		835
836	kmem_cache_t *zero_cache;	836	static void zero_ctor(void addr, kmem_cache_t cache, unsigned long flags)
837
838	static void zero_ctor(void pte, kmem_cache_t cache, unsigned long flags)
839	{	837	{
840	memset(pte, 0, PAGE_SIZE);	838	memset(addr, 0, kmem_cache_size(cache));
841	}	839	}
842		840
		841	static const int pgtable_cache_size[2] = {
		842	PTE_TABLE_SIZE, PMD_TABLE_SIZE
		843	};
		844	static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
		845	"pgd_pte_cache", "pud_pmd_cache",
		846	};
		847
		848	kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
		849
843	void pgtable_cache_init(void)	850	void pgtable_cache_init(void)
844	{	851	{
845	zero_cache = kmem_cache_create("zero",	852	int i;
846	PAGE_SIZE,	853
847	0,	854	BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
848	SLAB_HWCACHE_ALIGN \| SLAB_MUST_HWCACHE_ALIGN,	855	BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
849	zero_ctor,	856	BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
850	NULL);	857	BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
851	if (!zero_cache)	858
852	panic("pgtable_cache_init(): could not create zero_cache!\n");	859	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
		860	int size = pgtable_cache_size[i];
		861	const char *name = pgtable_cache_name[i];
		862
		863	pgtable_cache[i] = kmem_cache_create(name,
		864	size, size,
		865	SLAB_HWCACHE_ALIGN
		866	\| SLAB_MUST_HWCACHE_ALIGN,
		867	zero_ctor,
		868	NULL);
		869	if (! pgtable_cache[i])
		870	panic("pgtable_cache_init(): could not create %s!\n",
		871	name);
		872	}
853	}	873	}
854		874
855	pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,	875	pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,


diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S index 8379d678f70f..f20fc52483a7 100644 --- a/arch/ppc64/mm/slb_low.S +++ b/arch/ppc64/mm/slb_low.S
@@ -91,7 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
91	0: /* user address: proto-VSID = context<<15 \| ESID */	91	0: /* user address: proto-VSID = context<<15 \| ESID */
92	li r11,SLB_VSID_USER	92	li r11,SLB_VSID_USER
93		93
94	srdi. r9,r3,13	94	srdi. r9,r3,USER_ESID_BITS
95	bne- 8f /* invalid ea bits set */	95	bne- 8f /* invalid ea bits set */
96		96
97	#ifdef CONFIG_HUGETLB_PAGE	97	#ifdef CONFIG_HUGETLB_PAGE


diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c index 26f0172c4527..d8a6593a13f0 100644 --- a/arch/ppc64/mm/tlb.c +++ b/arch/ppc64/mm/tlb.c
@@ -41,7 +41,58 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
41	DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);	41	DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
42	unsigned long pte_freelist_forced_free;	42	unsigned long pte_freelist_forced_free;
43		43
44	void __pte_free_tlb(struct mmu_gather tlb, struct page ptepage)	44	struct pte_freelist_batch
		45	{
		46	struct rcu_head rcu;
		47	unsigned int index;
		48	pgtable_free_t tables[0];
		49	};
		50
		51	DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
		52	unsigned long pte_freelist_forced_free;
		53
		54	#define PTE_FREELIST_SIZE \
		55	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
		56	/ sizeof(pgtable_free_t))
		57
		58	#ifdef CONFIG_SMP
		59	static void pte_free_smp_sync(void *arg)
		60	{
		61	/* Do nothing, just ensure we sync with all CPUs */
		62	}
		63	#endif
		64
		65	/* This is only called when we are critically out of memory
		66	* (and fail to get a page in pte_free_tlb).
		67	*/
		68	static void pgtable_free_now(pgtable_free_t pgf)
		69	{
		70	pte_freelist_forced_free++;
		71
		72	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
		73
		74	pgtable_free(pgf);
		75	}
		76
		77	static void pte_free_rcu_callback(struct rcu_head *head)
		78	{
		79	struct pte_freelist_batch *batch =
		80	container_of(head, struct pte_freelist_batch, rcu);
		81	unsigned int i;
		82
		83	for (i = 0; i < batch->index; i++)
		84	pgtable_free(batch->tables[i]);
		85
		86	free_page((unsigned long)batch);
		87	}
		88
		89	static void pte_free_submit(struct pte_freelist_batch *batch)
		90	{
		91	INIT_RCU_HEAD(&batch->rcu);
		92	call_rcu(&batch->rcu, pte_free_rcu_callback);
		93	}
		94
		95	void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
45	{	96	{
46	/* This is safe as we are holding page_table_lock */	97	/* This is safe as we are holding page_table_lock */
47	cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());	98	cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
@@ -49,19 +100,19 @@ void __pte_free_tlb(struct mmu_gather tlb, struct page ptepage)
49		100
50	if (atomic_read(&tlb->mm->mm_users) < 2 \|\|	101	if (atomic_read(&tlb->mm->mm_users) < 2 \|\|
51	cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {	102	cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
52	pte_free(ptepage);	103	pgtable_free(pgf);
53	return;	104	return;
54	}	105	}
55		106
56	if (*batchp == NULL) {	107	if (*batchp == NULL) {
57	batchp = (struct pte_freelist_batch )__get_free_page(GFP_ATOMIC);	108	batchp = (struct pte_freelist_batch )__get_free_page(GFP_ATOMIC);
58	if (*batchp == NULL) {	109	if (*batchp == NULL) {
59	pte_free_now(ptepage);	110	pgtable_free_now(pgf);
60	return;	111	return;
61	}	112	}
62	(*batchp)->index = 0;	113	(*batchp)->index = 0;
63	}	114	}
64	(batchp)->pages[(batchp)->index++] = ptepage;	115	(batchp)->tables[(batchp)->index++] = pgf;
65	if ((*batchp)->index == PTE_FREELIST_SIZE) {	116	if ((*batchp)->index == PTE_FREELIST_SIZE) {
66	pte_free_submit(*batchp);	117	pte_free_submit(*batchp);
67	*batchp = NULL;	118	*batchp = NULL;
@@ -132,42 +183,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
132	put_cpu();	183	put_cpu();
133	}	184	}
134		185
135	#ifdef CONFIG_SMP
136	static void pte_free_smp_sync(void *arg)
137	{
138	/* Do nothing, just ensure we sync with all CPUs */
139	}
140	#endif
141
142	/* This is only called when we are critically out of memory
143	* (and fail to get a page in pte_free_tlb).
144	*/
145	void pte_free_now(struct page *ptepage)
146	{
147	pte_freelist_forced_free++;
148
149	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
150
151	pte_free(ptepage);
152	}
153
154	static void pte_free_rcu_callback(struct rcu_head *head)
155	{
156	struct pte_freelist_batch *batch =
157	container_of(head, struct pte_freelist_batch, rcu);
158	unsigned int i;
159
160	for (i = 0; i < batch->index; i++)
161	pte_free(batch->pages[i]);
162	free_page((unsigned long)batch);
163	}
164
165	void pte_free_submit(struct pte_freelist_batch *batch)
166	{
167	INIT_RCU_HEAD(&batch->rcu);
168	call_rcu(&batch->rcu, pte_free_rcu_callback);
169	}
170
171	void pte_free_finish(void)	186	void pte_free_finish(void)
172	{	187	{
173	/* This is safe as we are holding page_table_lock */	188	/* This is safe as we are holding page_table_lock */