author     David Gibson <david@gibson.dropbear.id.au>     2006-04-28 01:02:51 -0400
committer  Paul Mackerras <paulus@samba.org>              2006-04-28 01:02:51 -0400
commit     f10a04c034c7285a1b15dfa4a83d3e56578e34e8 (patch)
tree       ee6bd0c670b6606017cbd88b56a1247ff241e00e /arch/powerpc/mm/hugetlbpage.c
parent     37e53db8aa233c65142d63b496277bf5be9c0ade (diff)
[PATCH] powerpc: Fix pagetable bloat for hugepages
At present, ARCH=powerpc kernels can waste considerable space in
pagetables when making large hugepage mappings.  Hugepage PTEs go in
PMD pages, but each PMD page maps 256M and so contains only 16
hugepage PTEs (128 bytes of data) while taking up a 1024-byte
allocation.  With CONFIG_PPC_64K_PAGES enabled (64k base page size)
the situation is worse: hugepage PTEs then sit at the PTE page level
(also mapping 256M), so we store 16 hugepage PTEs in a 64k allocation.

The PowerPC MMU already requires that any 256M region be either all
hugepage or all normal pages.  Thus, with some care, we can use a
separate allocation for the hugepage PTE tables and allocate only the
128 bytes actually needed.
Signed-off-by: Paul Mackerras <paulus@samba.org>
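
For a concrete sense of the numbers quoted above, here is a small stand-alone C sketch that reproduces the sizing arithmetic the patch introduces (PTRS_PER_HUGEPTE and HUGEPTE_TABLE_SIZE). The shift constants are assumptions for a ppc64 kernel of this era (16M hugepages, 256M covered per hugepage directory entry), not values read out of this tree; only the formulas mirror the patch.

/*
 * Stand-alone sketch of the hugepage PTE table sizing described above.
 * HPAGE_SHIFT/PUD_SHIFT/PMD_SHIFT are assumed values, not taken from
 * this file; only the PTRS_PER_HUGEPTE / HUGEPTE_TABLE_SIZE formulas
 * mirror the patch.
 */
#include <stdio.h>

#define HPAGE_SHIFT 24   /* assumed: 16M hugepages */
#define PUD_SHIFT   28   /* assumed: 256M per PUD entry (4K base pages) */
#define PMD_SHIFT   28   /* assumed: 256M per PMD entry (64K base pages) */
#define PTE_SIZE    8    /* sizeof(pte_t) on ppc64 */

static void report(const char *config, unsigned int hugepte_index_size)
{
        unsigned long ptrs_per_hugepte = 1UL << hugepte_index_size;
        unsigned long hugepte_table_size =
                (unsigned long)PTE_SIZE << hugepte_index_size;

        printf("%s: %lu hugepage PTEs, %lu-byte table\n",
               config, ptrs_per_hugepte, hugepte_table_size);
}

int main(void)
{
        /* 4K base pages: hugepage PTEs hang off the PUD level */
        report("CONFIG_PPC_64K_PAGES=n", PUD_SHIFT - HPAGE_SHIFT);
        /* 64K base pages: hugepage PTEs hang off the PMD level */
        report("CONFIG_PPC_64K_PAGES=y", PMD_SHIFT - HPAGE_SHIFT);
        return 0;
}

Both configurations come out at 16 PTEs and a 128-byte table, which is the saving over the 1024-byte (4K base pages) and 64k (64K base pages) allocations quoted in the message above.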
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c  295
1 file changed, 259 insertions(+), 36 deletions(-)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7370f9f33e29..266b8b2ceac9 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -30,13 +30,66 @@
 #define NUM_LOW_AREAS    (0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS   (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 
+#ifdef CONFIG_PPC_64K_PAGES
+#define HUGEPTE_INDEX_SIZE   (PMD_SHIFT-HPAGE_SHIFT)
+#else
+#define HUGEPTE_INDEX_SIZE   (PUD_SHIFT-HPAGE_SHIFT)
+#endif
+#define PTRS_PER_HUGEPTE     (1 << HUGEPTE_INDEX_SIZE)
+#define HUGEPTE_TABLE_SIZE   (sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+
+#define HUGEPD_SHIFT         (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SIZE          (1UL << HUGEPD_SHIFT)
+#define HUGEPD_MASK          (~(HUGEPD_SIZE-1))
+
+#define huge_pgtable_cache   (pgtable_cache[HUGEPTE_CACHE_NUM])
+
+/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK            0x1
+
+typedef struct { unsigned long pd; } hugepd_t;
+
+#define hugepd_none(hpd)     ((hpd).pd == 0)
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+        BUG_ON(!(hpd.pd & HUGEPD_OK));
+        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+{
+        unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+        pte_t *dir = hugepd_page(*hpdp);
+
+        return dir + idx;
+}
+
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+                           unsigned long address)
+{
+        pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+                                      GFP_KERNEL|__GFP_REPEAT);
+
+        if (! new)
+                return -ENOMEM;
+
+        spin_lock(&mm->page_table_lock);
+        if (!hugepd_none(*hpdp))
+                kmem_cache_free(huge_pgtable_cache, new);
+        else
+                hpdp->pd = (unsigned long)new | HUGEPD_OK;
+        spin_unlock(&mm->page_table_lock);
+        return 0;
+}
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
         pgd_t *pg;
         pud_t *pu;
-        pmd_t *pm;
-        pte_t *pt;
 
         BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -46,26 +99,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
         if (!pgd_none(*pg)) {
                 pu = pud_offset(pg, addr);
                 if (!pud_none(*pu)) {
-                        pm = pmd_offset(pu, addr);
 #ifdef CONFIG_PPC_64K_PAGES
-                        /* Currently, we use the normal PTE offset within full
-                         * size PTE pages, thus our huge PTEs are scattered in
-                         * the PTE page and we do waste some. We may change
-                         * that in the future, but the current mecanism keeps
-                         * things much simpler
-                         */
-                        if (!pmd_none(*pm)) {
-                                /* Note: pte_offset_* are all equivalent on
-                                 * ppc64 as we don't have HIGHMEM
-                                 */
-                                pt = pte_offset_kernel(pm, addr);
-                                return pt;
-                        }
-#else /* CONFIG_PPC_64K_PAGES */
-                        /* On 4k pages, we put huge PTEs in the PMD page */
-                        pt = (pte_t *)pm;
-                        return pt;
-#endif /* CONFIG_PPC_64K_PAGES */
+                        pmd_t *pm;
+                        pm = pmd_offset(pu, addr);
+                        if (!pmd_none(*pm))
+                                return hugepte_offset((hugepd_t *)pm, addr);
+#else
+                        return hugepte_offset((hugepd_t *)pu, addr);
+#endif
                 }
         }
 
@@ -76,8 +117,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
         pgd_t *pg;
         pud_t *pu;
-        pmd_t *pm;
-        pte_t *pt;
+        hugepd_t *hpdp = NULL;
 
         BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -87,23 +127,182 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
         pu = pud_alloc(mm, pg, addr);
 
         if (pu) {
+#ifdef CONFIG_PPC_64K_PAGES
+                pmd_t *pm;
                 pm = pmd_alloc(mm, pu, addr);
-                if (pm) {
+                if (pm)
+                        hpdp = (hugepd_t *)pm;
+#else
+                hpdp = (hugepd_t *)pu;
+#endif
+        }
+
+        if (! hpdp)
+                return NULL;
+
+        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+                return NULL;
+
+        return hugepte_offset(hpdp, addr);
+}
+
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+{
+        pte_t *hugepte = hugepd_page(*hpdp);
+
+        hpdp->pd = 0;
+        tlb->need_flush = 1;
+        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+                                                 HUGEPTE_TABLE_SIZE-1));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
-                /* See comment in huge_pte_offset.  Note that if we ever
-                 * want to put the page size in the PMD, we would have
-                 * to open code our own pte_alloc* function in order
-                 * to populate and set the size atomically
-                 */
-                pt = pte_alloc_map(mm, pm, addr);
-#else /* CONFIG_PPC_64K_PAGES */
-                pt = (pte_t *)pm;
-#endif /* CONFIG_PPC_64K_PAGES */
-                return pt;
-        }
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+                                   unsigned long addr, unsigned long end,
+                                   unsigned long floor, unsigned long ceiling)
+{
+        pmd_t *pmd;
+        unsigned long next;
+        unsigned long start;
+
+        start = addr;
+        pmd = pmd_offset(pud, addr);
+        do {
+                next = pmd_addr_end(addr, end);
+                if (pmd_none(*pmd))
+                        continue;
+                free_hugepte_range(tlb, (hugepd_t *)pmd);
+        } while (pmd++, addr = next, addr != end);
+
+        start &= PUD_MASK;
+        if (start < floor)
+                return;
+        if (ceiling) {
+                ceiling &= PUD_MASK;
+                if (!ceiling)
+                        return;
         }
+        if (end - 1 > ceiling - 1)
+                return;
 
-        return NULL;
+        pmd = pmd_offset(pud, start);
+        pud_clear(pud);
+        pmd_free_tlb(tlb, pmd);
+}
+#endif
+
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+                                   unsigned long addr, unsigned long end,
+                                   unsigned long floor, unsigned long ceiling)
+{
+        pud_t *pud;
+        unsigned long next;
+        unsigned long start;
+
+        start = addr;
+        pud = pud_offset(pgd, addr);
+        do {
+                next = pud_addr_end(addr, end);
+#ifdef CONFIG_PPC_64K_PAGES
+                if (pud_none_or_clear_bad(pud))
+                        continue;
+                hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+#else
+                if (pud_none(*pud))
+                        continue;
+                free_hugepte_range(tlb, (hugepd_t *)pud);
+#endif
+        } while (pud++, addr = next, addr != end);
+
+        start &= PGDIR_MASK;
+        if (start < floor)
+                return;
+        if (ceiling) {
+                ceiling &= PGDIR_MASK;
+                if (!ceiling)
+                        return;
+        }
+        if (end - 1 > ceiling - 1)
+                return;
+
+        pud = pud_offset(pgd, start);
+        pgd_clear(pgd);
+        pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+                            unsigned long addr, unsigned long end,
+                            unsigned long floor, unsigned long ceiling)
+{
+        pgd_t *pgd;
+        unsigned long next;
+        unsigned long start;
+
+        /*
+         * Comments below take from the normal free_pgd_range().  They
+         * apply here too.  The tests against HUGEPD_MASK below are
+         * essential, because we *don't* test for this at the bottom
+         * level.  Without them we'll attempt to free a hugepte table
+         * when we unmap just part of it, even if there are other
+         * active mappings using it.
+         *
+         * The next few lines have given us lots of grief...
+         *
+         * Why are we testing HUGEPD* at this top level?  Because
+         * often there will be no work to do at all, and we'd prefer
+         * not to go all the way down to the bottom just to discover
+         * that.
+         *
+         * Why all these "- 1"s?  Because 0 represents both the bottom
+         * of the address space and the top of it (using -1 for the
+         * top wouldn't help much: the masks would do the wrong thing).
+         * The rule is that addr 0 and floor 0 refer to the bottom of
+         * the address space, but end 0 and ceiling 0 refer to the top
+         * Comparisons need to use "end - 1" and "ceiling - 1" (though
+         * that end 0 case should be mythical).
+         *
+         * Wherever addr is brought up or ceiling brought down, we
+         * must be careful to reject "the opposite 0" before it
+         * confuses the subsequent tests.  But what about where end is
+         * brought down by HUGEPD_SIZE below? no, end can't go down to
+         * 0 there.
+         *
+         * Whereas we round start (addr) and ceiling down, by different
+         * masks at different levels, in order to test whether a table
+         * now has no other vmas using it, so can be freed, we don't
+         * bother to round floor or end up - the tests don't need that.
+         */
+
+        addr &= HUGEPD_MASK;
+        if (addr < floor) {
+                addr += HUGEPD_SIZE;
+                if (!addr)
+                        return;
+        }
+        if (ceiling) {
+                ceiling &= HUGEPD_MASK;
+                if (!ceiling)
+                        return;
+        }
+        if (end - 1 > ceiling - 1)
+                end -= HUGEPD_SIZE;
+        if (addr > end - 1)
+                return;
+
+        start = addr;
+        pgd = pgd_offset((*tlb)->mm, addr);
+        do {
+                BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
+                next = pgd_addr_end(addr, end);
+                if (pgd_none_or_clear_bad(pgd))
+                        continue;
+                hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+        } while (pgd++, addr = next, addr != end);
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -841,3 +1040,27 @@ repeat:
 out:
         return err;
 }
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+        memset(addr, 0, kmem_cache_size(cache));
+}
+
+static int __init hugetlbpage_init(void)
+{
+        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+                return -ENODEV;
+
+        huge_pgtable_cache = kmem_cache_create("hugepte_cache",
+                                               HUGEPTE_TABLE_SIZE,
+                                               HUGEPTE_TABLE_SIZE,
+                                               SLAB_HWCACHE_ALIGN |
+                                               SLAB_MUST_HWCACHE_ALIGN,
+                                               zero_ctor, NULL);
+        if (! huge_pgtable_cache)
+                panic("hugetlbpage_init(): could not create hugepte cache\n");
+
+        return 0;
+}
+
+module_init(hugetlbpage_init);
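
The central trick in the patch is the hugepd_t tagged pointer: the address of a small hugepage PTE table is stored in a PMD/PUD slot with its low bit set (HUGEPD_OK), so pmd_bad()/pud_bad() trip on it and generic page-table walkers cannot mistake it for a normal table. Below is a stand-alone user-space model of that encoding and of the hugepte_offset() index calculation; the malloc-backed table and the shift constants are stand-ins for the kernel's kmem_cache allocation and config-derived values, not kernel APIs.

/*
 * User-space model of the patch's hugepd_t encoding and lookup.
 * HPAGE_SHIFT and HUGEPTE_INDEX_SIZE are assumed (16M hugepages,
 * 16-entry tables); pte_t is modelled as a plain unsigned long.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long pte_t;                    /* stand-in for the kernel type */
typedef struct { unsigned long pd; } hugepd_t;  /* as defined in the patch */

#define HUGEPD_OK           0x1UL
#define HPAGE_SHIFT         24   /* assumed: 16M hugepages */
#define HUGEPTE_INDEX_SIZE  4    /* assumed: 16 hugepage PTEs per table */
#define PTRS_PER_HUGEPTE    (1UL << HUGEPTE_INDEX_SIZE)

static int hugepd_none(hugepd_t hpd)
{
        return hpd.pd == 0;
}

static pte_t *hugepd_page(hugepd_t hpd)
{
        assert(hpd.pd & HUGEPD_OK);  /* plays the role of the kernel's BUG_ON() */
        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
        unsigned long idx = (addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE - 1);

        return hugepd_page(*hpdp) + idx;
}

int main(void)
{
        hugepd_t hpd = { 0 };
        pte_t *table;

        assert(hugepd_none(hpd));  /* empty slot: no hugepte table yet */

        /* stand-in for kmem_cache_alloc(huge_pgtable_cache, ...) */
        table = calloc(PTRS_PER_HUGEPTE, sizeof(pte_t));
        if (!table)
                return 1;
        hpd.pd = (uintptr_t)table | HUGEPD_OK;  /* publish the tagged pointer */

        /* 0x13000000 is hugepage number 0x13, so it lands in slot 0x13 & 0xf == 3 */
        printf("slot %ld\n", (long)(hugepte_offset(&hpd, 0x13000000UL) - table));

        free(table);
        return 0;
}

free_hugepte_range() in the patch does the inverse: it strips the tag via hugepd_page(), zeroes the slot, and hands the 128-byte table back to the hugepte slab cache set up in hugetlbpage_init().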