author	David Gibson <david@gibson.dropbear.id.au>	2006-04-28 01:02:51 -0400
committer	Paul Mackerras <paulus@samba.org>	2006-04-28 01:02:51 -0400
commit	f10a04c034c7285a1b15dfa4a83d3e56578e34e8 (patch)
tree	ee6bd0c670b6606017cbd88b56a1247ff241e00e /arch
parent	37e53db8aa233c65142d63b496277bf5be9c0ade (diff)
[PATCH] powerpc: Fix pagetable bloat for hugepages
At present, ARCH=powerpc kernels can waste considerable space in pagetables when making large hugepage mappings. Hugepage PTEs go in PMD pages, but each PMD page maps 256M and so contains only 16 hugepage PTEs (128 bytes of data), but takes up a 1024 byte allocation. With CONFIG_PPC_64K_PAGES enabled (64k base page size), the situation is worse. Now hugepage PTEs are at the PTE page level (also mapping 256M), so we store 16 hugepage PTEs in a 64k allocation.

The PowerPC MMU already means that any 256M region is either all hugepage, or all normal pages. Thus, with some care, we can use a different allocation for the hugepage PTE tables and only allocate the 128 bytes necessary.

Signed-off-by: Paul Mackerras <paulus@samba.org>
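Editor's note: to make the mechanism concrete, here is a small user-space C sketch (not the kernel code itself) of the pointer-tagging idea behind hugepd_t in this patch: a directory entry stores the address of a 16-entry hugepte table with its low bit (HUGEPD_OK) set, and hugepte_offset() indexes the table by hugepage number within the 256M region. The constants assume the 4k-base-page configuration (16M hugepages); the stand-in pte_t, the calloc() allocation and the example address are invented for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HPAGE_SHIFT		24	/* 16M hugepages (4k base page config) */
#define HUGEPD_SHIFT		28	/* each hugepte table covers 256M */
#define HUGEPTE_INDEX_SIZE	(HUGEPD_SHIFT - HPAGE_SHIFT)	/* 4 -> 16 entries */
#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define HUGEPD_OK		0x1UL

typedef uint64_t pte_t;				/* stand-in for the real pte_t */
typedef struct { unsigned long pd; } hugepd_t;

/* Recover the hugepte table address; the flag bit doubles as a sanity check. */
static pte_t *hugepd_page(hugepd_t hpd)
{
	assert(hpd.pd & HUGEPD_OK);
	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

/* Index into the table by hugepage number within the 256M region. */
static pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
	unsigned long idx = (addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE - 1);
	return hugepd_page(*hpdp) + idx;
}

int main(void)
{
	/* 16 PTEs of 8 bytes each: the 128-byte table the commit message cites. */
	pte_t *table = calloc(PTRS_PER_HUGEPTE, sizeof(pte_t));
	if (!table)
		return 1;

	hugepd_t hpd = { .pd = (unsigned long)table | HUGEPD_OK };

	printf("hugepte table: %d entries, %zu bytes\n",
	       PTRS_PER_HUGEPTE, PTRS_PER_HUGEPTE * sizeof(pte_t));
	printf("slot for addr 0x1b000000: index %ld\n",
	       (long)(hugepte_offset(&hpd, 0x1b000000UL) - hugepd_page(hpd)));

	free(table);
	return 0;
}

With 16 entries of 8 bytes each, this is the 128-byte allocation the commit message cites, versus the 1024-byte PMD page (or 64k PTE page) used before the patch.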
Diffstat (limited to 'arch')
-rw-r--r--	arch/powerpc/mm/hugetlbpage.c	295
-rw-r--r--	arch/powerpc/mm/init_64.c	7
2 files changed, 266 insertions, 36 deletions
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7370f9f33e29..266b8b2ceac9 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -30,13 +30,66 @@
 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 
+#ifdef CONFIG_PPC_64K_PAGES
+#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
+#else
+#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
+#endif
+#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
+#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+
+#define HUGEPD_SHIFT	(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SIZE	(1UL << HUGEPD_SHIFT)
+#define HUGEPD_MASK	(~(HUGEPD_SIZE-1))
+
+#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])
+
+/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK	0x1
+
+typedef struct { unsigned long pd; } hugepd_t;
+
+#define hugepd_none(hpd)	((hpd).pd == 0)
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+	BUG_ON(!(hpd.pd & HUGEPD_OK));
+	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+{
+	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+	pte_t *dir = hugepd_page(*hpdp);
+
+	return dir + idx;
+}
+
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+			   unsigned long address)
+{
+	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+				      GFP_KERNEL|__GFP_REPEAT);
+
+	if (! new)
+		return -ENOMEM;
+
+	spin_lock(&mm->page_table_lock);
+	if (!hugepd_none(*hpdp))
+		kmem_cache_free(huge_pgtable_cache, new);
+	else
+		hpdp->pd = (unsigned long)new | HUGEPD_OK;
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pg;
 	pud_t *pu;
-	pmd_t *pm;
-	pte_t *pt;
 
 	BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -46,26 +99,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	if (!pgd_none(*pg)) {
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
-			pm = pmd_offset(pu, addr);
 #ifdef CONFIG_PPC_64K_PAGES
-			/* Currently, we use the normal PTE offset within full
-			 * size PTE pages, thus our huge PTEs are scattered in
-			 * the PTE page and we do waste some. We may change
-			 * that in the future, but the current mecanism keeps
-			 * things much simpler
-			 */
-			if (!pmd_none(*pm)) {
-				/* Note: pte_offset_* are all equivalent on
-				 * ppc64 as we don't have HIGHMEM
-				 */
-				pt = pte_offset_kernel(pm, addr);
-				return pt;
-			}
-#else /* CONFIG_PPC_64K_PAGES */
-			/* On 4k pages, we put huge PTEs in the PMD page */
-			pt = (pte_t *)pm;
-			return pt;
-#endif /* CONFIG_PPC_64K_PAGES */
+			pmd_t *pm;
+			pm = pmd_offset(pu, addr);
+			if (!pmd_none(*pm))
+				return hugepte_offset((hugepd_t *)pm, addr);
+#else
+			return hugepte_offset((hugepd_t *)pu, addr);
+#endif
 		}
 	}
 
@@ -76,8 +117,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pg;
 	pud_t *pu;
-	pmd_t *pm;
-	pte_t *pt;
+	hugepd_t *hpdp = NULL;
 
 	BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -87,23 +127,182 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	pu = pud_alloc(mm, pg, addr);
 
 	if (pu) {
+#ifdef CONFIG_PPC_64K_PAGES
+		pmd_t *pm;
 		pm = pmd_alloc(mm, pu, addr);
-		if (pm) {
+		if (pm)
+			hpdp = (hugepd_t *)pm;
+#else
+		hpdp = (hugepd_t *)pu;
+#endif
+	}
+
+	if (! hpdp)
+		return NULL;
+
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+		return NULL;
+
+	return hugepte_offset(hpdp, addr);
+}
+
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+{
+	pte_t *hugepte = hugepd_page(*hpdp);
+
+	hpdp->pd = 0;
+	tlb->need_flush = 1;
+	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+						 HUGEPTE_TABLE_SIZE-1));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
-			/* See comment in huge_pte_offset.  Note that if we ever
-			 * want to put the page size in the PMD, we would have
-			 * to open code our own pte_alloc* function in order
-			 * to populate and set the size atomically
-			 */
-			pt = pte_alloc_map(mm, pm, addr);
-#else /* CONFIG_PPC_64K_PAGES */
-			pt = (pte_t *)pm;
-#endif /* CONFIG_PPC_64K_PAGES */
-			return pt;
-		}
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				   unsigned long addr, unsigned long end,
+				   unsigned long floor, unsigned long ceiling)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	unsigned long start;
+
+	start = addr;
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(*pmd))
+			continue;
+		free_hugepte_range(tlb, (hugepd_t *)pmd);
+	} while (pmd++, addr = next, addr != end);
+
+	start &= PUD_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PUD_MASK;
+		if (!ceiling)
+			return;
 	}
+	if (end - 1 > ceiling - 1)
+		return;
 
-	return NULL;
+	pmd = pmd_offset(pud, start);
+	pud_clear(pud);
+	pmd_free_tlb(tlb, pmd);
+}
+#endif
+
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				   unsigned long addr, unsigned long end,
+				   unsigned long floor, unsigned long ceiling)
+{
+	pud_t *pud;
+	unsigned long next;
+	unsigned long start;
+
+	start = addr;
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+#ifdef CONFIG_PPC_64K_PAGES
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+#else
+		if (pud_none(*pud))
+			continue;
+		free_hugepte_range(tlb, (hugepd_t *)pud);
+#endif
+	} while (pud++, addr = next, addr != end);
+
+	start &= PGDIR_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PGDIR_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pud = pud_offset(pgd, start);
+	pgd_clear(pgd);
+	pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+			    unsigned long addr, unsigned long end,
+			    unsigned long floor, unsigned long ceiling)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long start;
+
+	/*
+	 * Comments below take from the normal free_pgd_range().  They
+	 * apply here too.  The tests against HUGEPD_MASK below are
+	 * essential, because we *don't* test for this at the bottom
+	 * level.  Without them we'll attempt to free a hugepte table
+	 * when we unmap just part of it, even if there are other
+	 * active mappings using it.
+	 *
+	 * The next few lines have given us lots of grief...
+	 *
+	 * Why are we testing HUGEPD* at this top level?  Because
+	 * often there will be no work to do at all, and we'd prefer
+	 * not to go all the way down to the bottom just to discover
+	 * that.
+	 *
+	 * Why all these "- 1"s?  Because 0 represents both the bottom
+	 * of the address space and the top of it (using -1 for the
+	 * top wouldn't help much: the masks would do the wrong thing).
+	 * The rule is that addr 0 and floor 0 refer to the bottom of
+	 * the address space, but end 0 and ceiling 0 refer to the top
+	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
+	 * that end 0 case should be mythical).
+	 *
+	 * Wherever addr is brought up or ceiling brought down, we
+	 * must be careful to reject "the opposite 0" before it
+	 * confuses the subsequent tests.  But what about where end is
+	 * brought down by HUGEPD_SIZE below? no, end can't go down to
+	 * 0 there.
+	 *
+	 * Whereas we round start (addr) and ceiling down, by different
+	 * masks at different levels, in order to test whether a table
+	 * now has no other vmas using it, so can be freed, we don't
+	 * bother to round floor or end up - the tests don't need that.
+	 */
+
+	addr &= HUGEPD_MASK;
+	if (addr < floor) {
+		addr += HUGEPD_SIZE;
+		if (!addr)
+			return;
+	}
+	if (ceiling) {
+		ceiling &= HUGEPD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		end -= HUGEPD_SIZE;
+	if (addr > end - 1)
+		return;
+
+	start = addr;
+	pgd = pgd_offset((*tlb)->mm, addr);
+	do {
+		BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+	} while (pgd++, addr = next, addr != end);
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -841,3 +1040,27 @@ repeat:
  out:
 	return err;
 }
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+	memset(addr, 0, kmem_cache_size(cache));
+}
+
+static int __init hugetlbpage_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+		return -ENODEV;
+
+	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
+					       HUGEPTE_TABLE_SIZE,
+					       HUGEPTE_TABLE_SIZE,
+					       SLAB_HWCACHE_ALIGN |
+					       SLAB_MUST_HWCACHE_ALIGN,
+					       zero_ctor, NULL);
+	if (! huge_pgtable_cache)
+		panic("hugetlbpage_init(): could not create hugepte cache\n");
+
+	return 0;
+}
+
+module_init(hugetlbpage_init);
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index babebd15bdc4..9e30f968c184 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -162,7 +162,14 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 };
 #endif /* CONFIG_PPC_64K_PAGES */
 
+#ifdef CONFIG_HUGETLB_PAGE
+/* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
+ * can't put into the tables above, because HPAGE_SHIFT is not compile
+ * time constant. */
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+#else
 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+#endif
 
 void pgtable_cache_init(void)
 {