path: root/arch/powerpc/mm/hugetlbpage.c
author		Linus Torvalds <torvalds@linux-foundation.org>	2009-12-12 17:27:24 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-12 17:27:24 -0500
commit		09cea96caa59fabab3030c53bd698b9b568d959a (patch)
tree		a991cdc0c887fdcda37f4b751ee98d3db9559f4e /arch/powerpc/mm/hugetlbpage.c
parent		6eb7365db6f3a4a9d8d9922bb0b800f9cbaad641 (diff)
parent		e090aa80321b64c3b793f3b047e31ecf1af9538d (diff)
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (151 commits)
  powerpc: Fix usage of 64-bit instruction in 32-bit altivec code
  MAINTAINERS: Add PowerPC patterns
  powerpc/pseries: Track previous CPPR values to correctly EOI interrupts
  powerpc/pseries: Correct pseries/dlpar.c build break without CONFIG_SMP
  powerpc: Make "intspec" pointers in irq_host->xlate() const
  powerpc/8xx: DTLB Miss cleanup
  powerpc/8xx: Remove DIRTY pte handling in DTLB Error.
  powerpc/8xx: Start using dcbX instructions in various copy routines
  powerpc/8xx: Restore _PAGE_WRITETHRU
  powerpc/8xx: Add missing Guarded setting in DTLB Error.
  powerpc/8xx: Fixup DAR from buggy dcbX instructions.
  powerpc/8xx: Tag DAR with 0x00f0 to catch buggy instructions.
  powerpc/8xx: Update TLB asm so it behaves as linux mm expects.
  powerpc/8xx: Invalidate non present TLBs
  powerpc/pseries: Serialize cpu hotplug operations during deactivate Vs deallocate
  pseries/pseries: Add code to online/offline CPUs of a DLPAR node
  powerpc: stop_this_cpu: remove the cpu from the online map.
  powerpc/pseries: Add kernel based CPU DLPAR handling
  sysfs/cpu: Add probe/release files
  powerpc/pseries: Kernel DLPAR Infrastructure
  ...
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
-rw-r--r--	arch/powerpc/mm/hugetlbpage.c	792
1 files changed, 290 insertions, 502 deletions
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 90df6ffe3a43..123f7070238a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -7,29 +7,17 @@
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
  */
 
-#include <linux/init.h>
-#include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 #include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
+#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/machdep.h>
-#include <asm/cputable.h>
-#include <asm/spu.h>
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
 #define PAGE_SHIFT_16G	34
 
-#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
-#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 #define MAX_NUMBER_GPAGES	1024
 
 /* Tracks the 16G pages after the device tree is scanned and before the
@@ -37,53 +25,17 @@
 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
 
-/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
- * stored for the huge page sizes that are valid.
- */
-unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
-
-#define hugepte_shift			mmu_huge_psizes
-#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
-#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])
-
-#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
-					 + hugepte_shift[psize])
-#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
-#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
-
-/* Subtract one from array size because we don't need a cache for 4K since
- * is not a huge page size */
-#define HUGE_PGTABLE_INDEX(psize)	(HUGEPTE_CACHE_NUM + psize - 1)
-#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
-
-static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-	[MMU_PAGE_64K]	= "hugepte_cache_64K",
-	[MMU_PAGE_1M]	= "hugepte_cache_1M",
-	[MMU_PAGE_16M]	= "hugepte_cache_16M",
-	[MMU_PAGE_16G]	= "hugepte_cache_16G",
-};
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
-#define HUGEPD_OK	0x1
-
-typedef struct { unsigned long pd; } hugepd_t;
-
-#define hugepd_none(hpd)	((hpd).pd == 0)
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
-	switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
-	case PAGE_SHIFT_64K:
-		return MMU_PAGE_64K;
-#endif
-	case PAGE_SHIFT_16M:
-		return MMU_PAGE_16M;
-	case PAGE_SHIFT_16G:
-		return MMU_PAGE_16G;
-	}
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
+		if (mmu_psize_defs[psize].shift == shift)
+			return psize;
 	return -1;
 }
 
@@ -94,71 +46,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 	BUG();
 }
 
+#define hugepd_none(hpd)	((hpd).pd == 0)
+
 static inline pte_t *hugepd_page(hugepd_t hpd)
 {
-	BUG_ON(!(hpd.pd & HUGEPD_OK));
-	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+	BUG_ON(!hugepd_ok(hpd));
+	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
+}
+
+static inline unsigned int hugepd_shift(hugepd_t hpd)
+{
+	return hpd.pd & HUGEPD_SHIFT_MASK;
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
-				    struct hstate *hstate)
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
 {
-	unsigned int shift = huge_page_shift(hstate);
-	int psize = shift_to_mmu_psize(shift);
-	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
+	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
 	pte_t *dir = hugepd_page(*hpdp);
 
 	return dir + idx;
 }
 
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
+	unsigned pdshift = PGDIR_SHIFT;
+
+	if (shift)
+		*shift = 0;
+
+	pg = pgdir + pgd_index(ea);
+	if (is_hugepd(pg)) {
+		hpdp = (hugepd_t *)pg;
+	} else if (!pgd_none(*pg)) {
+		pdshift = PUD_SHIFT;
+		pu = pud_offset(pg, ea);
+		if (is_hugepd(pu))
+			hpdp = (hugepd_t *)pu;
+		else if (!pud_none(*pu)) {
+			pdshift = PMD_SHIFT;
+			pm = pmd_offset(pu, ea);
+			if (is_hugepd(pm))
+				hpdp = (hugepd_t *)pm;
+			else if (!pmd_none(*pm)) {
+				return pte_offset_map(pm, ea);
+			}
+		}
+	}
+
+	if (!hpdp)
+		return NULL;
+
+	if (shift)
+		*shift = hugepd_shift(*hpdp);
+	return hugepte_offset(hpdp, ea, pdshift);
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+}
+
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-			   unsigned long address, unsigned int psize)
+			   unsigned long address, unsigned pdshift, unsigned pshift)
 {
-	pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
+	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
 				      GFP_KERNEL|__GFP_REPEAT);
 
+	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
+	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
+
 	if (! new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
+		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
 	else
-		hpdp->pd = (unsigned long)new | HUGEPD_OK;
+		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 
-
-static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PUD_SHIFT)
-		return pud_offset(pgd, addr);
-	else
-		return (pud_t *) pgd;
-}
-static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
-			 struct hstate *hstate)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
-	if (huge_page_shift(hstate) < PUD_SHIFT)
-		return pud_alloc(mm, pgd, addr);
-	else
-		return (pud_t *) pgd;
-}
-static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PMD_SHIFT)
-		return pmd_offset(pud, addr);
-	else
-		return (pmd_t *) pud;
-}
-static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
-			 struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PMD_SHIFT)
-		return pmd_alloc(mm, pud, addr);
-	else
-		return (pmd_t *) pud;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
+	unsigned pshift = __ffs(sz);
+	unsigned pdshift = PGDIR_SHIFT;
+
+	addr &= ~(sz-1);
+
+	pg = pgd_offset(mm, addr);
+	if (pshift >= PUD_SHIFT) {
+		hpdp = (hugepd_t *)pg;
+	} else {
+		pdshift = PUD_SHIFT;
+		pu = pud_alloc(mm, pg, addr);
+		if (pshift >= PMD_SHIFT) {
+			hpdp = (hugepd_t *)pu;
+		} else {
+			pdshift = PMD_SHIFT;
+			pm = pmd_alloc(mm, pu, addr);
+			hpdp = (hugepd_t *)pm;
+		}
+	}
+
+	if (!hpdp)
+		return NULL;
+
+	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+		return NULL;
+
+	return hugepte_offset(hpdp, addr, pdshift);
 }
 
 /* Build list of addresses of gigantic pages.  This function is used in early
@@ -192,94 +199,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 	return 1;
 }
 
-
-/* Modelled after find_linux_pte() */
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-
-	unsigned int psize;
-	unsigned int shift;
-	unsigned long sz;
-	struct hstate *hstate;
-	psize = get_slice_psize(mm, addr);
-	shift = mmu_psize_to_shift(psize);
-	sz = ((1UL) << shift);
-	hstate = size_to_hstate(sz);
-
-	addr &= hstate->mask;
-
-	pg = pgd_offset(mm, addr);
-	if (!pgd_none(*pg)) {
-		pu = hpud_offset(pg, addr, hstate);
-		if (!pud_none(*pu)) {
-			pm = hpmd_offset(pu, addr, hstate);
-			if (!pmd_none(*pm))
-				return hugepte_offset((hugepd_t *)pm, addr,
-						      hstate);
-		}
-	}
-
-	return NULL;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm,
-			unsigned long addr, unsigned long sz)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-	hugepd_t *hpdp = NULL;
-	struct hstate *hstate;
-	unsigned int psize;
-	hstate = size_to_hstate(sz);
-
-	psize = get_slice_psize(mm, addr);
-	BUG_ON(!mmu_huge_psizes[psize]);
-
-	addr &= hstate->mask;
-
-	pg = pgd_offset(mm, addr);
-	pu = hpud_alloc(mm, pg, addr, hstate);
-
-	if (pu) {
-		pm = hpmd_alloc(mm, pu, addr, hstate);
-		if (pm)
-			hpdp = (hugepd_t *)pm;
-	}
-
-	if (! hpdp)
-		return NULL;
-
-	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
-		return NULL;
-
-	return hugepte_offset(hpdp, addr, hstate);
-}
-
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
 
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
-			       unsigned int psize)
+static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
+			      unsigned long start, unsigned long end,
+			      unsigned long floor, unsigned long ceiling)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
+	unsigned shift = hugepd_shift(*hpdp);
+	unsigned long pdmask = ~((1UL << pdshift) - 1);
+
+	start &= pdmask;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= pdmask;
+		if (! ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
 
 	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
-						 HUGEPTE_CACHE_NUM+psize-1,
-						 PGF_CACHENUM_MASK));
+	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				   unsigned long addr, unsigned long end,
-				   unsigned long floor, unsigned long ceiling,
-				   unsigned int psize)
+				   unsigned long floor, unsigned long ceiling)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -291,7 +242,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(*pmd))
 			continue;
-		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
+		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
+				  addr, next, floor, ceiling);
 	} while (pmd++, addr = next, addr != end);
 
 	start &= PUD_MASK;
@@ -317,23 +269,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud_t *pud;
 	unsigned long next;
 	unsigned long start;
-	unsigned int shift;
-	unsigned int psize = get_slice_psize(tlb->mm, addr);
-	shift = mmu_psize_to_shift(psize);
 
 	start = addr;
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		if (shift < PMD_SHIFT) {
+		if (!is_hugepd(pud)) {
 			if (pud_none_or_clear_bad(pud))
 				continue;
 			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
-					       ceiling, psize);
+					       ceiling);
 		} else {
-			if (pud_none(*pud))
-				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
+			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
+					  addr, next, floor, ceiling);
 		}
 	} while (pud++, addr = next, addr != end);
 
@@ -364,121 +312,56 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long start;
 
 	/*
-	 * Comments below take from the normal free_pgd_range().  They
-	 * apply here too.  The tests against HUGEPD_MASK below are
-	 * essential, because we *don't* test for this at the bottom
-	 * level.  Without them we'll attempt to free a hugepte table
-	 * when we unmap just part of it, even if there are other
-	 * active mappings using it.
-	 *
-	 * The next few lines have given us lots of grief...
-	 *
-	 * Why are we testing HUGEPD* at this top level?  Because
-	 * often there will be no work to do at all, and we'd prefer
-	 * not to go all the way down to the bottom just to discover
-	 * that.
-	 *
-	 * Why all these "- 1"s?  Because 0 represents both the bottom
-	 * of the address space and the top of it (using -1 for the
-	 * top wouldn't help much: the masks would do the wrong thing).
-	 * The rule is that addr 0 and floor 0 refer to the bottom of
-	 * the address space, but end 0 and ceiling 0 refer to the top
-	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
-	 * that end 0 case should be mythical).
+	 * Because there are a number of different possible pagetable
+	 * layouts for hugepage ranges, we limit knowledge of how
+	 * things should be laid out to the allocation path
+	 * (huge_pte_alloc(), above).  Everything else works out the
+	 * structure as it goes from information in the hugepd
+	 * pointers.  That means that we can't here use the
+	 * optimization used in the normal page free_pgd_range(), of
+	 * checking whether we're actually covering a large enough
+	 * range to have to do anything at the top level of the walk
+	 * instead of at the bottom.
 	 *
-	 * Wherever addr is brought up or ceiling brought down, we
-	 * must be careful to reject "the opposite 0" before it
-	 * confuses the subsequent tests.  But what about where end is
-	 * brought down by HUGEPD_SIZE below?  no, end can't go down to
-	 * 0 there.
-	 *
-	 * Whereas we round start (addr) and ceiling down, by different
-	 * masks at different levels, in order to test whether a table
-	 * now has no other vmas using it, so can be freed, we don't
-	 * bother to round floor or end up - the tests don't need that.
+	 * To make sense of this, you should probably go read the big
+	 * block comment at the top of the normal free_pgd_range(),
+	 * too.
 	 */
-	unsigned int psize = get_slice_psize(tlb->mm, addr);
-
-	addr &= HUGEPD_MASK(psize);
-	if (addr < floor) {
-		addr += HUGEPD_SIZE(psize);
-		if (!addr)
-			return;
-	}
-	if (ceiling) {
-		ceiling &= HUGEPD_MASK(psize);
-		if (!ceiling)
-			return;
-	}
-	if (end - 1 > ceiling - 1)
-		end -= HUGEPD_SIZE(psize);
-	if (addr > end - 1)
-		return;
 
-	start = addr;
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
-		psize = get_slice_psize(tlb->mm, addr);
-		BUG_ON(!mmu_huge_psizes[psize]);
 		next = pgd_addr_end(addr, end);
-		if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
+		if (!is_hugepd(pgd)) {
 			if (pgd_none_or_clear_bad(pgd))
 				continue;
 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 		} else {
-			if (pgd_none(*pgd))
-				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
+			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+					  addr, next, floor, ceiling);
 		}
 	} while (pgd++, addr = next, addr != end);
 }
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t pte)
-{
-	if (pte_present(*ptep)) {
-		/* We open-code pte_clear because we need to pass the right
-		 * argument to hpte_need_flush (huge / !huge). Might not be
-		 * necessary anymore if we make hpte_need_flush() get the
-		 * page size from the slices
-		 */
-		unsigned int psize = get_slice_psize(mm, addr);
-		unsigned int shift = mmu_psize_to_shift(psize);
-		unsigned long sz = ((1UL) << shift);
-		struct hstate *hstate = size_to_hstate(sz);
-		pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
-	}
-	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-}
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep)
-{
-	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
-	return __pte(old);
-}
-
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
 	pte_t *ptep;
 	struct page *page;
-	unsigned int mmu_psize = get_slice_psize(mm, address);
+	unsigned shift;
+	unsigned long mask;
+
+	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
 	/* Verify it is a huge page else bail. */
-	if (!mmu_huge_psizes[mmu_psize])
+	if (!ptep || !shift)
 		return ERR_PTR(-EINVAL);
 
-	ptep = huge_pte_offset(mm, address);
+	mask = (1UL << shift) - 1;
 	page = pte_page(*ptep);
-	if (page) {
-		unsigned int shift = mmu_psize_to_shift(mmu_psize);
-		unsigned long sz = ((1UL) << shift);
-		page += (address % sz) / PAGE_SIZE;
-	}
+	if (page)
+		page += (address & mask) / PAGE_SIZE;
 
 	return page;
 }
@@ -501,6 +384,82 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
+static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		       unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	unsigned long pte_end;
+	struct page *head, *page;
+	pte_t pte;
+	int refs;
+
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = *ptep;
+	mask = _PAGE_PRESENT | _PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+
+	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		/* Could be optimized better */
+		while (*nr) {
+			put_page(page);
+			(*nr)--;
+		}
+	}
+
+	return 1;
+}
+
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+				      unsigned long sz)
+{
+	unsigned long __boundary = (addr + sz) & ~(sz-1);
+	return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
+	       unsigned long addr, unsigned long end,
+	       int write, struct page **pages, int *nr)
+{
+	pte_t *ptep;
+	unsigned long sz = 1UL << hugepd_shift(*hugepd);
+	unsigned long next;
+
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	do {
+		next = hugepte_addr_end(addr, end, sz);
+		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+			return 0;
+	} while (ptep++, addr = next, addr != end);
+
+	return 1;
+}
 
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
@@ -509,8 +468,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct hstate *hstate = hstate_file(file);
 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
-	if (!mmu_huge_psizes[mmu_psize])
-		return -EINVAL;
 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 }
 
@@ -521,229 +478,46 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 	return 1UL << mmu_psize_to_shift(psize);
 }
 
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
-					pte_t pte, int trap, unsigned long sz)
+static int __init add_huge_page_size(unsigned long long size)
 {
-	struct page *page;
-	int i;
-
-	if (!pfn_valid(pte_pfn(pte)))
-		return rflags;
-
-	page = pte_page(pte);
-
-	/* page is dirty */
-	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
-		if (trap == 0x400) {
-			for (i = 0; i < (sz / PAGE_SIZE); i++)
-				__flush_dcache_icache(page_address(page+i));
-			set_bit(PG_arch_1, &page->flags);
-		} else {
-			rflags |= HPTE_R_N;
-		}
-	}
-	return rflags;
-}
+	int shift = __ffs(size);
+	int mmu_psize;
 
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
-		   unsigned long ea, unsigned long vsid, int local,
-		   unsigned long trap)
-{
-	pte_t *ptep;
-	unsigned long old_pte, new_pte;
-	unsigned long va, rflags, pa, sz;
-	long slot;
-	int err = 1;
-	int ssize = user_segment_size(ea);
-	unsigned int mmu_psize;
-	int shift;
-	mmu_psize = get_slice_psize(mm, ea);
-
-	if (!mmu_huge_psizes[mmu_psize])
-		goto out;
-	ptep = huge_pte_offset(mm, ea);
-
-	/* Search the Linux page table for a match with va */
-	va = hpt_va(ea, vsid, ssize);
-
-	/*
-	 * If no pte found or not present, send the problem up to
-	 * do_page_fault
-	 */
-	if (unlikely(!ptep || pte_none(*ptep)))
-		goto out;
+	/* Check that it is a page size supported by the hardware and
+	 * that it fits within pagetable and slice limits. */
+	if (!is_power_of_2(size)
+	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
+		return -EINVAL;
+
+	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
+		return -EINVAL;
 
-	/*
-	 * Check the user's access rights to the page. If access should be
-	 * prevented then send the problem up to do_page_fault.
+#ifdef CONFIG_SPU_FS_64K_LS
+	/* Disable support for 64K huge pages when 64K SPU local store
+	 * support is enabled as the current implementation conflicts.
 	 */
-	if (unlikely(access & ~pte_val(*ptep)))
-		goto out;
-	/*
-	 * At this point, we have a pte (old_pte) which can be used to build
-	 * or update an HPTE. There are 2 cases:
-	 *
-	 * 1. There is a valid (present) pte with no associated HPTE (this is
-	 *	the most common case)
-	 * 2. There is a valid (present) pte with an associated HPTE. The
-	 *	current values of the pp bits in the HPTE prevent access
-	 *	because we are doing software DIRTY bit management and the
-	 *	page is currently not DIRTY.
-	 */
-
-
-	do {
-		old_pte = pte_val(*ptep);
-		if (old_pte & _PAGE_BUSY)
-			goto out;
-		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
-					 old_pte, new_pte));
-
-	rflags = 0x2 | (!(new_pte & _PAGE_RW));
-	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	shift = mmu_psize_to_shift(mmu_psize);
-	sz = ((1UL) << shift);
-	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-		/* No CPU has hugepages but lacks no execute, so we
-		 * don't need to worry about that case */
-		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
-						       trap, sz);
-
-	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(old_pte & _PAGE_HASHPTE)) {
-		/* There MIGHT be an HPTE for this pte */
-		unsigned long hash, slot;
-
-		hash = hpt_hash(va, shift, ssize);
-		if (old_pte & _PAGE_F_SECOND)
-			hash = ~hash;
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & _PAGE_F_GIX) >> 12;
-
-		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
-					 ssize, local) == -1)
-			old_pte &= ~_PAGE_HPTEFLAGS;
-	}
-
-	if (likely(!(old_pte & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(va, shift, ssize);
-		unsigned long hpte_group;
-
-		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
-repeat:
-		hpte_group = ((hash & htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
-
-		/* clear HPTE slot informations in new PTE */
-#ifdef CONFIG_PPC_64K_PAGES
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
-#else
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-#endif
-		/* Add in WIMG bits */
-		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				      _PAGE_COHERENT | _PAGE_GUARDED));
-
-		/* Insert into the hash table, primary slot */
-		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
-					  mmu_psize, ssize);
-
-		/* Primary is full, try the secondary */
-		if (unlikely(slot == -1)) {
-			hpte_group = ((~hash & htab_hash_mask) *
-				      HPTES_PER_GROUP) & ~0x7UL;
-			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
-						  HPTE_V_SECONDARY,
-						  mmu_psize, ssize);
-			if (slot == -1) {
-				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_hash_mask) *
-						      HPTES_PER_GROUP)&~0x7UL;
-
-				ppc_md.hpte_remove(hpte_group);
-				goto repeat;
-			}
-		}
-
-		if (unlikely(slot == -2))
-			panic("hash_huge_page: pte_insert failed\n");
-
-		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
-	}
+	if (shift == PAGE_SHIFT_64K)
+		return -EINVAL;
+#endif /* CONFIG_SPU_FS_64K_LS */
 
-	/*
-	 * No need to use ldarx/stdcx here
-	 */
-	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
-	err = 0;
+	/* Return if huge page size has already been setup */
+	if (size_to_hstate(size))
+		return 0;
 
- out:
-	return err;
-}
+	hugetlb_add_hstate(shift - PAGE_SHIFT);
 
-static void __init set_huge_psize(int psize)
-{
-	/* Check that it is a page size supported by the hardware and
-	 * that it fits within pagetable limits. */
-	if (mmu_psize_defs[psize].shift &&
-		mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
-		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
-		 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
-		 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
-		/* Return if huge page size has already been setup or is the
-		 * same as the base page size. */
-		if (mmu_huge_psizes[psize] ||
-		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
-			return;
-		if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
-			return;
-		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
-
-		switch (mmu_psize_defs[psize].shift) {
-		case PAGE_SHIFT_64K:
-			/* We only allow 64k hpages with 4k base page,
-			 * which was checked above, and always put them
-			 * at the PMD */
-			hugepte_shift[psize] = PMD_SHIFT;
-			break;
-		case PAGE_SHIFT_16M:
-			/* 16M pages can be at two different levels
-			 * of pagestables based on base page size */
-			if (PAGE_SHIFT == PAGE_SHIFT_64K)
-				hugepte_shift[psize] = PMD_SHIFT;
-			else /* 4k base page */
-				hugepte_shift[psize] = PUD_SHIFT;
-			break;
-		case PAGE_SHIFT_16G:
-			/* 16G pages are always at PGD level */
-			hugepte_shift[psize] = PGDIR_SHIFT;
-			break;
-		}
-		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
-	} else
-		hugepte_shift[psize] = 0;
+	return 0;
 }
 
 static int __init hugepage_setup_sz(char *str)
 {
 	unsigned long long size;
-	int mmu_psize;
-	int shift;
 
 	size = memparse(str, &str);
 
-	shift = __ffs(size);
-	mmu_psize = shift_to_mmu_psize(shift);
-	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
-		set_huge_psize(mmu_psize);
-	else
+	if (add_huge_page_size(size) != 0)
 		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 
 	return 1;
@@ -752,41 +526,55 @@ __setup("hugepagesz=", hugepage_setup_sz);
 
 static int __init hugetlbpage_init(void)
 {
-	unsigned int psize;
+	int psize;
 
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
 
-	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
-	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
-	 * sizes changes.
-	 */
-	set_huge_psize(MMU_PAGE_16M);
-	set_huge_psize(MMU_PAGE_16G);
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
+		unsigned pdshift;
 
-	/* Temporarily disable support for 64K huge pages when 64K SPU local
-	 * store support is enabled as the current implementation conflicts.
-	 */
-#ifndef CONFIG_SPU_FS_64K_LS
-	set_huge_psize(MMU_PAGE_64K);
-#endif
+		if (!mmu_psize_defs[psize].shift)
+			continue;
 
-	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
-		if (mmu_huge_psizes[psize]) {
-			pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
-				kmem_cache_create(
-						HUGEPTE_CACHE_NAME(psize),
-						HUGEPTE_TABLE_SIZE(psize),
-						HUGEPTE_TABLE_SIZE(psize),
-						0,
-						NULL);
-			if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
-				panic("hugetlbpage_init(): could not create %s"\
-					"\n", HUGEPTE_CACHE_NAME(psize));
-		}
+		shift = mmu_psize_to_shift(psize);
+
+		if (add_huge_page_size(1ULL << shift) < 0)
+			continue;
+
+		if (shift < PMD_SHIFT)
+			pdshift = PMD_SHIFT;
+		else if (shift < PUD_SHIFT)
+			pdshift = PUD_SHIFT;
+		else
+			pdshift = PGDIR_SHIFT;
+
+		pgtable_cache_add(pdshift - shift, NULL);
+		if (!PGT_CACHE(pdshift - shift))
+			panic("hugetlbpage_init(): could not create "
+			      "pgtable cache for %d bit pagesize\n", shift);
 	}
 
+	/* Set default large page size. Currently, we pick 16M or 1M
+	 * depending on what is available
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+
 	return 0;
 }
 
 module_init(hugetlbpage_init);
+
+void flush_dcache_icache_hugepage(struct page *page)
+{
+	int i;
+
+	BUG_ON(!PageCompound(page));
+
+	for (i = 0; i < (1UL << compound_order(page)); i++)
+		__flush_dcache_icache(page_address(page+i));
+}