path: root/arch/powerpc/mm/hugetlbpage.c
author     David Gibson <david@gibson.dropbear.id.au>  2009-10-26 15:24:31 -0400
committer  Benjamin Herrenschmidt <benh@kernel.crashing.org>  2009-10-30 02:20:58 -0400
commit     a4fe3ce7699bfe1bd88f816b55d42d8fe1dac655 (patch)
tree       b72c982ffbb9f05d78a952288d60c4dc2d31a4d9 /arch/powerpc/mm/hugetlbpage.c
parent     a0668cdc154e54bf0c85182e0535eea237d53146 (diff)
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different pagetable layout: that is, the bottom level table of pointers to hugepages is a different size, and may branch off from the normal page tables at a different level. Every hugepage aware path that needs to walk the pagetables must therefore look up the hugepage size from the slice info first, and work out the correct way to walk the pagetables accordingly. Future hardware is likely to add more possible hugepage sizes, more layout options and more mess.

This patch, therefore, reworks the handling of hugepage pagetables to reduce this complexity. In the new scheme, instead of having to consult the slice mask, pagetable walking code can check a flag in the PGD/PUD/PMD entries to see where to branch off to hugepage pagetables, and the entry also contains the information (essentially the hugepage shift) necessary to then interpret that table without recourse to the slice mask. This scheme can be extended neatly to handle multiple levels of self-describing "special" hugepage pagetables, although for now we assume only one level exists.

This approach means that only the pagetable allocation path needs to know how the pagetables should be set out. All other (hugepage) pagetable walking paths can just interpret the structure as they go.

There already was a flag bit in PGD/PUD/PMD entries for hugepage directory pointers, but it was only used for debug. We alter that flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable pointer (normally it would be 1, since the pointer lies in the linear mapping). This means that asm pagetable walking can test for (and punt on) hugepage pointers with the same test that checks for unpopulated page directory entries (beq becomes bge), since hugepage pointers will always be positive, and normal pointers always negative.

While we're at it, we get rid of the confusing (and grep-defeating) #defining of hugepte_shift to be the same thing as mmu_huge_psizes.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
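To make the entry encoding described above concrete, here is a minimal stand-alone C sketch of packing and unpacking a hugepage directory entry the way the message describes: the table pointer has its top bit cleared (so the asm walker's beq test can become bge), and the hugepage shift is stashed in the low bits of the aligned pointer. The SHIFT_MASK and LINEAR_MAP_BASE values and the helper names are illustrative assumptions modelled on the patch below, not a drop-in kernel API.

    #include <assert.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the kernel's HUGEPD_SHIFT_MASK and the
     * start of the powerpc linear mapping; both are assumptions taken
     * from the patch below, not a public interface. */
    #define SHIFT_MASK      0x3fUL
    #define LINEAR_MAP_BASE 0xc000000000000000UL

    typedef struct { unsigned long pd; } hugepd_t;

    /* Pack: the hugepte table is at least 64-byte aligned, so the low
     * bits are free to hold the hugepage shift; clearing the top bit is
     * what makes a hugepd pointer look "positive" to the walker. */
    static hugepd_t hugepd_pack(unsigned long table, unsigned int pshift)
    {
        assert((table & SHIFT_MASK) == 0 && pshift <= SHIFT_MASK);
        return (hugepd_t){ (table & ~(1UL << 63)) | pshift };
    }

    /* Unpack: mask off the shift, then restore the linear-mapping bits. */
    static unsigned long hugepd_table(hugepd_t hpd)
    {
        return (hpd.pd & ~SHIFT_MASK) | LINEAR_MAP_BASE;
    }

    static unsigned int hugepd_shift(hugepd_t hpd)
    {
        return hpd.pd & SHIFT_MASK;
    }

    int main(void)
    {
        /* Pretend hugepte table sitting in the linear mapping. */
        unsigned long table = LINEAR_MAP_BASE | 0x12340UL;
        hugepd_t hpd = hugepd_pack(table, 24);  /* 16M page -> shift 24 */

        assert((long)hpd.pd > 0);               /* MSB clear: beq becomes bge */
        assert(hugepd_table(hpd) == table);
        assert(hugepd_shift(hpd) == 24);
        printf("entry %#lx -> table %#lx, shift %u\n",
               hpd.pd, hugepd_table(hpd), hugepd_shift(hpd));
        return 0;
    }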
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 473
1 files changed, 228 insertions, 245 deletions
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7230d7a4fbd9..95220a5dee58 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -40,25 +40,11 @@ static unsigned nr_gpages;
 /* Array of valid huge page sizes - non-zero value(hugepte_shift) is
  * stored for the huge page sizes that are valid.
  */
-unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
-
-#define hugepte_shift mmu_huge_psizes
-#define HUGEPTE_INDEX_SIZE(psize) (mmu_huge_psizes[(psize)])
-#define PTRS_PER_HUGEPTE(psize) (1 << mmu_huge_psizes[psize])
-
-#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \
-                             + HUGEPTE_INDEX_SIZE(psize))
-#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize))
-#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1))
+static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
-#define HUGEPD_OK 0x1
-
-typedef struct { unsigned long pd; } hugepd_t;
-
-#define hugepd_none(hpd) ((hpd).pd == 0)
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
@@ -82,71 +68,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
     BUG();
 }
 
+#define hugepd_none(hpd) ((hpd).pd == 0)
+
 static inline pte_t *hugepd_page(hugepd_t hpd)
 {
-    BUG_ON(!(hpd.pd & HUGEPD_OK));
-    return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+    BUG_ON(!hugepd_ok(hpd));
+    return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
+}
+
+static inline unsigned int hugepd_shift(hugepd_t hpd)
+{
+    return hpd.pd & HUGEPD_SHIFT_MASK;
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
-                                    struct hstate *hstate)
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
 {
-    unsigned int shift = huge_page_shift(hstate);
-    int psize = shift_to_mmu_psize(shift);
-    unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
+    unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
     pte_t *dir = hugepd_page(*hpdp);
 
     return dir + idx;
 }
 
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+    pgd_t *pg;
+    pud_t *pu;
+    pmd_t *pm;
+    hugepd_t *hpdp = NULL;
+    unsigned pdshift = PGDIR_SHIFT;
+
+    if (shift)
+        *shift = 0;
+
+    pg = pgdir + pgd_index(ea);
+    if (is_hugepd(pg)) {
+        hpdp = (hugepd_t *)pg;
+    } else if (!pgd_none(*pg)) {
+        pdshift = PUD_SHIFT;
+        pu = pud_offset(pg, ea);
+        if (is_hugepd(pu))
+            hpdp = (hugepd_t *)pu;
+        else if (!pud_none(*pu)) {
+            pdshift = PMD_SHIFT;
+            pm = pmd_offset(pu, ea);
+            if (is_hugepd(pm))
+                hpdp = (hugepd_t *)pm;
+            else if (!pmd_none(*pm)) {
+                return pte_offset_map(pm, ea);
+            }
+        }
+    }
+
+    if (!hpdp)
+        return NULL;
+
+    if (shift)
+        *shift = hugepd_shift(*hpdp);
+    return hugepte_offset(hpdp, ea, pdshift);
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+    return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+}
+
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-                           unsigned long address, unsigned int psize)
+                           unsigned long address, unsigned pdshift, unsigned pshift)
 {
-    pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
+    pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
                                    GFP_KERNEL|__GFP_REPEAT);
 
+    BUG_ON(pshift > HUGEPD_SHIFT_MASK);
+    BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
+
     if (! new)
         return -ENOMEM;
 
     spin_lock(&mm->page_table_lock);
     if (!hugepd_none(*hpdp))
-        kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
+        kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
     else
-        hpdp->pd = (unsigned long)new | HUGEPD_OK;
+        hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
     spin_unlock(&mm->page_table_lock);
     return 0;
 }
 
-
-static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
-    if (huge_page_shift(hstate) < PUD_SHIFT)
-        return pud_offset(pgd, addr);
-    else
-        return (pud_t *) pgd;
-}
-static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
-                         struct hstate *hstate)
-{
-    if (huge_page_shift(hstate) < PUD_SHIFT)
-        return pud_alloc(mm, pgd, addr);
-    else
-        return (pud_t *) pgd;
-}
-static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
-{
-    if (huge_page_shift(hstate) < PMD_SHIFT)
-        return pmd_offset(pud, addr);
-    else
-        return (pmd_t *) pud;
-}
-static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
-                         struct hstate *hstate)
-{
-    if (huge_page_shift(hstate) < PMD_SHIFT)
-        return pmd_alloc(mm, pud, addr);
-    else
-        return (pmd_t *) pud;
+    pgd_t *pg;
+    pud_t *pu;
+    pmd_t *pm;
+    hugepd_t *hpdp = NULL;
+    unsigned pshift = __ffs(sz);
+    unsigned pdshift = PGDIR_SHIFT;
+
+    addr &= ~(sz-1);
+
+    pg = pgd_offset(mm, addr);
+    if (pshift >= PUD_SHIFT) {
+        hpdp = (hugepd_t *)pg;
+    } else {
+        pdshift = PUD_SHIFT;
+        pu = pud_alloc(mm, pg, addr);
+        if (pshift >= PMD_SHIFT) {
+            hpdp = (hugepd_t *)pu;
+        } else {
+            pdshift = PMD_SHIFT;
+            pm = pmd_alloc(mm, pu, addr);
+            hpdp = (hugepd_t *)pm;
+        }
+    }
+
+    if (!hpdp)
+        return NULL;
+
+    BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+    if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+        return NULL;
+
+    return hugepte_offset(hpdp, addr, pdshift);
 }
 
 /* Build list of addresses of gigantic pages. This function is used in early
@@ -180,92 +221,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
     return 1;
 }
 
-
-/* Modelled after find_linux_pte() */
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-    pgd_t *pg;
-    pud_t *pu;
-    pmd_t *pm;
-
-    unsigned int psize;
-    unsigned int shift;
-    unsigned long sz;
-    struct hstate *hstate;
-    psize = get_slice_psize(mm, addr);
-    shift = mmu_psize_to_shift(psize);
-    sz = ((1UL) << shift);
-    hstate = size_to_hstate(sz);
-
-    addr &= hstate->mask;
-
-    pg = pgd_offset(mm, addr);
-    if (!pgd_none(*pg)) {
-        pu = hpud_offset(pg, addr, hstate);
-        if (!pud_none(*pu)) {
-            pm = hpmd_offset(pu, addr, hstate);
-            if (!pmd_none(*pm))
-                return hugepte_offset((hugepd_t *)pm, addr,
-                                      hstate);
-        }
-    }
-
-    return NULL;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm,
-                      unsigned long addr, unsigned long sz)
-{
-    pgd_t *pg;
-    pud_t *pu;
-    pmd_t *pm;
-    hugepd_t *hpdp = NULL;
-    struct hstate *hstate;
-    unsigned int psize;
-    hstate = size_to_hstate(sz);
-
-    psize = get_slice_psize(mm, addr);
-    BUG_ON(!mmu_huge_psizes[psize]);
-
-    addr &= hstate->mask;
-
-    pg = pgd_offset(mm, addr);
-    pu = hpud_alloc(mm, pg, addr, hstate);
-
-    if (pu) {
-        pm = hpmd_alloc(mm, pu, addr, hstate);
-        if (pm)
-            hpdp = (hugepd_t *)pm;
-    }
-
-    if (! hpdp)
-        return NULL;
-
-    if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
-        return NULL;
-
-    return hugepte_offset(hpdp, addr, hstate);
-}
-
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
     return 0;
 }
 
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
-                               unsigned int psize)
+static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
+                              unsigned long start, unsigned long end,
+                              unsigned long floor, unsigned long ceiling)
 {
     pte_t *hugepte = hugepd_page(*hpdp);
+    unsigned shift = hugepd_shift(*hpdp);
+    unsigned long pdmask = ~((1UL << pdshift) - 1);
+
+    start &= pdmask;
+    if (start < floor)
+        return;
+    if (ceiling) {
+        ceiling &= pdmask;
+        if (! ceiling)
+            return;
+    }
+    if (end - 1 > ceiling - 1)
+        return;
 
     hpdp->pd = 0;
     tlb->need_flush = 1;
-    pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
+    pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                    unsigned long addr, unsigned long end,
-                                   unsigned long floor, unsigned long ceiling,
-                                   unsigned int psize)
+                                   unsigned long floor, unsigned long ceiling)
 {
     pmd_t *pmd;
     unsigned long next;
@@ -277,7 +264,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
         next = pmd_addr_end(addr, end);
         if (pmd_none(*pmd))
             continue;
-        free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
+        free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
+                          addr, next, floor, ceiling);
     } while (pmd++, addr = next, addr != end);
 
     start &= PUD_MASK;
@@ -303,23 +291,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
     pud_t *pud;
     unsigned long next;
     unsigned long start;
-    unsigned int shift;
-    unsigned int psize = get_slice_psize(tlb->mm, addr);
-    shift = mmu_psize_to_shift(psize);
 
     start = addr;
     pud = pud_offset(pgd, addr);
     do {
         next = pud_addr_end(addr, end);
-        if (shift < PMD_SHIFT) {
+        if (!is_hugepd(pud)) {
             if (pud_none_or_clear_bad(pud))
                 continue;
             hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
-                                   ceiling, psize);
+                                   ceiling);
         } else {
-            if (pud_none(*pud))
-                continue;
-            free_hugepte_range(tlb, (hugepd_t *)pud, psize);
+            free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
+                              addr, next, floor, ceiling);
         }
     } while (pud++, addr = next, addr != end);
 
@@ -350,74 +334,34 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 {
     pgd_t *pgd;
     unsigned long next;
-    unsigned long start;
 
     /*
-     * Comments below take from the normal free_pgd_range(). They
-     * apply here too. The tests against HUGEPD_MASK below are
-     * essential, because we *don't* test for this at the bottom
-     * level. Without them we'll attempt to free a hugepte table
-     * when we unmap just part of it, even if there are other
-     * active mappings using it.
-     *
-     * The next few lines have given us lots of grief...
-     *
-     * Why are we testing HUGEPD* at this top level? Because
-     * often there will be no work to do at all, and we'd prefer
-     * not to go all the way down to the bottom just to discover
-     * that.
-     *
-     * Why all these "- 1"s? Because 0 represents both the bottom
-     * of the address space and the top of it (using -1 for the
-     * top wouldn't help much: the masks would do the wrong thing).
-     * The rule is that addr 0 and floor 0 refer to the bottom of
-     * the address space, but end 0 and ceiling 0 refer to the top
-     * Comparisons need to use "end - 1" and "ceiling - 1" (though
-     * that end 0 case should be mythical).
-     *
-     * Wherever addr is brought up or ceiling brought down, we
-     * must be careful to reject "the opposite 0" before it
-     * confuses the subsequent tests. But what about where end is
-     * brought down by HUGEPD_SIZE below? no, end can't go down to
-     * 0 there.
+     * Because there are a number of different possible pagetable
+     * layouts for hugepage ranges, we limit knowledge of how
+     * things should be laid out to the allocation path
+     * (huge_pte_alloc(), above). Everything else works out the
+     * structure as it goes from information in the hugepd
+     * pointers. That means that we can't here use the
+     * optimization used in the normal page free_pgd_range(), of
+     * checking whether we're actually covering a large enough
+     * range to have to do anything at the top level of the walk
+     * instead of at the bottom.
      *
-     * Whereas we round start (addr) and ceiling down, by different
-     * masks at different levels, in order to test whether a table
-     * now has no other vmas using it, so can be freed, we don't
-     * bother to round floor or end up - the tests don't need that.
+     * To make sense of this, you should probably go read the big
+     * block comment at the top of the normal free_pgd_range(),
+     * too.
      */
-    unsigned int psize = get_slice_psize(tlb->mm, addr);
-
-    addr &= HUGEPD_MASK(psize);
-    if (addr < floor) {
-        addr += HUGEPD_SIZE(psize);
-        if (!addr)
-            return;
-    }
-    if (ceiling) {
-        ceiling &= HUGEPD_MASK(psize);
-        if (!ceiling)
-            return;
-    }
-    if (end - 1 > ceiling - 1)
-        end -= HUGEPD_SIZE(psize);
-    if (addr > end - 1)
-        return;
 
-    start = addr;
     pgd = pgd_offset(tlb->mm, addr);
     do {
-        psize = get_slice_psize(tlb->mm, addr);
-        BUG_ON(!mmu_huge_psizes[psize]);
         next = pgd_addr_end(addr, end);
-        if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
+        if (!is_hugepd(pgd)) {
             if (pgd_none_or_clear_bad(pgd))
                 continue;
             hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
         } else {
-            if (pgd_none(*pgd))
-                continue;
-            free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
+            free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+                              addr, next, floor, ceiling);
         }
     } while (pgd++, addr = next, addr != end);
 }
@@ -448,19 +392,19 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
     pte_t *ptep;
     struct page *page;
-    unsigned int mmu_psize = get_slice_psize(mm, address);
+    unsigned shift;
+    unsigned long mask;
+
+    ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
     /* Verify it is a huge page else bail. */
-    if (!mmu_huge_psizes[mmu_psize])
+    if (!ptep || !shift)
         return ERR_PTR(-EINVAL);
 
-    ptep = huge_pte_offset(mm, address);
+    mask = (1UL << shift) - 1;
     page = pte_page(*ptep);
-    if (page) {
-        unsigned int shift = mmu_psize_to_shift(mmu_psize);
-        unsigned long sz = ((1UL) << shift);
-        page += (address % sz) / PAGE_SIZE;
-    }
+    if (page)
+        page += (address & mask) / PAGE_SIZE;
 
     return page;
 }
@@ -483,6 +427,73 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
     return NULL;
 }
 
+static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+                                unsigned long end, int write, struct page **pages, int *nr)
+{
+    unsigned long mask;
+    unsigned long pte_end;
+    struct page *head, *page;
+    pte_t pte;
+    int refs;
+
+    pte_end = (addr + sz) & ~(sz-1);
+    if (pte_end < end)
+        end = pte_end;
+
+    pte = *ptep;
+    mask = _PAGE_PRESENT | _PAGE_USER;
+    if (write)
+        mask |= _PAGE_RW;
+
+    if ((pte_val(pte) & mask) != mask)
+        return 0;
+
+    /* hugepages are never "special" */
+    VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+    refs = 0;
+    head = pte_page(pte);
+
+    page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+    do {
+        VM_BUG_ON(compound_head(page) != head);
+        pages[*nr] = page;
+        (*nr)++;
+        page++;
+        refs++;
+    } while (addr += PAGE_SIZE, addr != end);
+
+    if (!page_cache_add_speculative(head, refs)) {
+        *nr -= refs;
+        return 0;
+    }
+
+    if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+        /* Could be optimized better */
+        while (*nr) {
+            put_page(page);
+            (*nr)--;
+        }
+    }
+
+    return 1;
+}
+
+int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
+               unsigned long addr, unsigned long end,
+               int write, struct page **pages, int *nr)
+{
+    pte_t *ptep;
+    unsigned long sz = 1UL << hugepd_shift(*hugepd);
+
+    ptep = hugepte_offset(hugepd, addr, pdshift);
+    do {
+        if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+            return 0;
+    } while (ptep++, addr += sz, addr != end);
+
+    return 1;
+}
+
 
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                         unsigned long len, unsigned long pgoff,
@@ -530,34 +541,20 @@ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
     return rflags;
 }
 
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
-                   unsigned long ea, unsigned long vsid, int local,
-                   unsigned long trap)
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+                     pte_t *ptep, unsigned long trap, int local, int ssize,
+                     unsigned int shift, unsigned int mmu_psize)
 {
-    pte_t *ptep;
     unsigned long old_pte, new_pte;
     unsigned long va, rflags, pa, sz;
     long slot;
     int err = 1;
-    int ssize = user_segment_size(ea);
-    unsigned int mmu_psize;
-    int shift;
-    mmu_psize = get_slice_psize(mm, ea);
 
-    if (!mmu_huge_psizes[mmu_psize])
-        goto out;
-    ptep = huge_pte_offset(mm, ea);
+    BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
 
     /* Search the Linux page table for a match with va */
     va = hpt_va(ea, vsid, ssize);
 
-    /*
-     * If no pte found or not present, send the problem up to
-     * do_page_fault
-     */
-    if (unlikely(!ptep || pte_none(*ptep)))
-        goto out;
-
     /*
      * Check the user's access rights to the page. If access should be
      * prevented then send the problem up to do_page_fault.
@@ -588,7 +585,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
     rflags = 0x2 | (!(new_pte & _PAGE_RW));
     /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
     rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-    shift = mmu_psize_to_shift(mmu_psize);
     sz = ((1UL) << shift);
     if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
         /* No CPU has hugepages but lacks no execute, so we
@@ -672,6 +668,8 @@ repeat:
 
 static void __init set_huge_psize(int psize)
 {
+    unsigned pdshift;
+
     /* Check that it is a page size supported by the hardware and
      * that it fits within pagetable limits. */
     if (mmu_psize_defs[psize].shift &&
@@ -686,29 +684,14 @@ static void __init set_huge_psize(int psize)
             return;
         hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
-        switch (mmu_psize_defs[psize].shift) {
-        case PAGE_SHIFT_64K:
-            /* We only allow 64k hpages with 4k base page,
-             * which was checked above, and always put them
-             * at the PMD */
-            hugepte_shift[psize] = PMD_SHIFT;
-            break;
-        case PAGE_SHIFT_16M:
-            /* 16M pages can be at two different levels
-             * of pagestables based on base page size */
-            if (PAGE_SHIFT == PAGE_SHIFT_64K)
-                hugepte_shift[psize] = PMD_SHIFT;
-            else /* 4k base page */
-                hugepte_shift[psize] = PUD_SHIFT;
-            break;
-        case PAGE_SHIFT_16G:
-            /* 16G pages are always at PGD level */
-            hugepte_shift[psize] = PGDIR_SHIFT;
-            break;
-        }
-        hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
-    } else
-        hugepte_shift[psize] = 0;
+        if (mmu_psize_defs[psize].shift < PMD_SHIFT)
+            pdshift = PMD_SHIFT;
+        else if (mmu_psize_defs[psize].shift < PUD_SHIFT)
+            pdshift = PUD_SHIFT;
+        else
+            pdshift = PGDIR_SHIFT;
+        mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift;
+    }
 }
 
 static int __init hugepage_setup_sz(char *str)
@@ -732,7 +715,7 @@ __setup("hugepagesz=", hugepage_setup_sz);
 
 static int __init hugetlbpage_init(void)
 {
-    unsigned int psize;
+    int psize;
 
     if (!cpu_has_feature(CPU_FTR_16M_PAGE))
         return -ENODEV;
@@ -753,8 +736,8 @@ static int __init hugetlbpage_init(void)
 
     for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
         if (mmu_huge_psizes[psize]) {
-            pgtable_cache_add(hugepte_shift[psize], NULL);
-            if (!PGT_CACHE(hugepte_shift[psize]))
+            pgtable_cache_add(mmu_huge_psizes[psize], NULL);
+            if (!PGT_CACHE(mmu_huge_psizes[psize]))
                 panic("hugetlbpage_init(): could not create "
                       "pgtable cache for %d bit pagesize\n",
                       mmu_psize_to_shift(psize));