diff options
author | David Gibson <david@gibson.dropbear.id.au> | 2009-10-26 15:24:31 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2009-10-30 02:20:58 -0400 |
commit | a4fe3ce7699bfe1bd88f816b55d42d8fe1dac655 (patch) | |
tree | b72c982ffbb9f05d78a952288d60c4dc2d31a4d9 /arch/powerpc/mm/hugetlbpage.c | |
parent | a0668cdc154e54bf0c85182e0535eea237d53146 (diff) |
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottom level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (essentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
-rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 473 |
1 files changed, 228 insertions, 245 deletions
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7230d7a4fbd9..95220a5dee58 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c | |||
@@ -40,25 +40,11 @@ static unsigned nr_gpages; | |||
40 | /* Array of valid huge page sizes - non-zero value(hugepte_shift) is | 40 | /* Array of valid huge page sizes - non-zero value(hugepte_shift) is |
41 | * stored for the huge page sizes that are valid. | 41 | * stored for the huge page sizes that are valid. |
42 | */ | 42 | */ |
43 | unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ | 43 | static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ |
44 | |||
45 | #define hugepte_shift mmu_huge_psizes | ||
46 | #define HUGEPTE_INDEX_SIZE(psize) (mmu_huge_psizes[(psize)]) | ||
47 | #define PTRS_PER_HUGEPTE(psize) (1 << mmu_huge_psizes[psize]) | ||
48 | |||
49 | #define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \ | ||
50 | + HUGEPTE_INDEX_SIZE(psize)) | ||
51 | #define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize)) | ||
52 | #define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1)) | ||
53 | 44 | ||
54 | /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() | 45 | /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() |
55 | * will choke on pointers to hugepte tables, which is handy for | 46 | * will choke on pointers to hugepte tables, which is handy for |
56 | * catching screwups early. */ | 47 | * catching screwups early. */ |
57 | #define HUGEPD_OK 0x1 | ||
58 | |||
59 | typedef struct { unsigned long pd; } hugepd_t; | ||
60 | |||
61 | #define hugepd_none(hpd) ((hpd).pd == 0) | ||
62 | 48 | ||
63 | static inline int shift_to_mmu_psize(unsigned int shift) | 49 | static inline int shift_to_mmu_psize(unsigned int shift) |
64 | { | 50 | { |
@@ -82,71 +68,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) | |||
82 | BUG(); | 68 | BUG(); |
83 | } | 69 | } |
84 | 70 | ||
71 | #define hugepd_none(hpd) ((hpd).pd == 0) | ||
72 | |||
85 | static inline pte_t *hugepd_page(hugepd_t hpd) | 73 | static inline pte_t *hugepd_page(hugepd_t hpd) |
86 | { | 74 | { |
87 | BUG_ON(!(hpd.pd & HUGEPD_OK)); | 75 | BUG_ON(!hugepd_ok(hpd)); |
88 | return (pte_t *)(hpd.pd & ~HUGEPD_OK); | 76 | return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000); |
77 | } | ||
78 | |||
79 | static inline unsigned int hugepd_shift(hugepd_t hpd) | ||
80 | { | ||
81 | return hpd.pd & HUGEPD_SHIFT_MASK; | ||
89 | } | 82 | } |
90 | 83 | ||
91 | static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, | 84 | static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) |
92 | struct hstate *hstate) | ||
93 | { | 85 | { |
94 | unsigned int shift = huge_page_shift(hstate); | 86 | unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); |
95 | int psize = shift_to_mmu_psize(shift); | ||
96 | unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1)); | ||
97 | pte_t *dir = hugepd_page(*hpdp); | 87 | pte_t *dir = hugepd_page(*hpdp); |
98 | 88 | ||
99 | return dir + idx; | 89 | return dir + idx; |
100 | } | 90 | } |
101 | 91 | ||
92 | pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) | ||
93 | { | ||
94 | pgd_t *pg; | ||
95 | pud_t *pu; | ||
96 | pmd_t *pm; | ||
97 | hugepd_t *hpdp = NULL; | ||
98 | unsigned pdshift = PGDIR_SHIFT; | ||
99 | |||
100 | if (shift) | ||
101 | *shift = 0; | ||
102 | |||
103 | pg = pgdir + pgd_index(ea); | ||
104 | if (is_hugepd(pg)) { | ||
105 | hpdp = (hugepd_t *)pg; | ||
106 | } else if (!pgd_none(*pg)) { | ||
107 | pdshift = PUD_SHIFT; | ||
108 | pu = pud_offset(pg, ea); | ||
109 | if (is_hugepd(pu)) | ||
110 | hpdp = (hugepd_t *)pu; | ||
111 | else if (!pud_none(*pu)) { | ||
112 | pdshift = PMD_SHIFT; | ||
113 | pm = pmd_offset(pu, ea); | ||
114 | if (is_hugepd(pm)) | ||
115 | hpdp = (hugepd_t *)pm; | ||
116 | else if (!pmd_none(*pm)) { | ||
117 | return pte_offset_map(pm, ea); | ||
118 | } | ||
119 | } | ||
120 | } | ||
121 | |||
122 | if (!hpdp) | ||
123 | return NULL; | ||
124 | |||
125 | if (shift) | ||
126 | *shift = hugepd_shift(*hpdp); | ||
127 | return hugepte_offset(hpdp, ea, pdshift); | ||
128 | } | ||
129 | |||
130 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | ||
131 | { | ||
132 | return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); | ||
133 | } | ||
134 | |||
102 | static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, | 135 | static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, |
103 | unsigned long address, unsigned int psize) | 136 | unsigned long address, unsigned pdshift, unsigned pshift) |
104 | { | 137 | { |
105 | pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]), | 138 | pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift), |
106 | GFP_KERNEL|__GFP_REPEAT); | 139 | GFP_KERNEL|__GFP_REPEAT); |
107 | 140 | ||
141 | BUG_ON(pshift > HUGEPD_SHIFT_MASK); | ||
142 | BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); | ||
143 | |||
108 | if (! new) | 144 | if (! new) |
109 | return -ENOMEM; | 145 | return -ENOMEM; |
110 | 146 | ||
111 | spin_lock(&mm->page_table_lock); | 147 | spin_lock(&mm->page_table_lock); |
112 | if (!hugepd_none(*hpdp)) | 148 | if (!hugepd_none(*hpdp)) |
113 | kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new); | 149 | kmem_cache_free(PGT_CACHE(pdshift - pshift), new); |
114 | else | 150 | else |
115 | hpdp->pd = (unsigned long)new | HUGEPD_OK; | 151 | hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift; |
116 | spin_unlock(&mm->page_table_lock); | 152 | spin_unlock(&mm->page_table_lock); |
117 | return 0; | 153 | return 0; |
118 | } | 154 | } |
119 | 155 | ||
120 | 156 | pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) | |
121 | static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate) | ||
122 | { | 157 | { |
123 | if (huge_page_shift(hstate) < PUD_SHIFT) | 158 | pgd_t *pg; |
124 | return pud_offset(pgd, addr); | 159 | pud_t *pu; |
125 | else | 160 | pmd_t *pm; |
126 | return (pud_t *) pgd; | 161 | hugepd_t *hpdp = NULL; |
127 | } | 162 | unsigned pshift = __ffs(sz); |
128 | static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, | 163 | unsigned pdshift = PGDIR_SHIFT; |
129 | struct hstate *hstate) | 164 | |
130 | { | 165 | addr &= ~(sz-1); |
131 | if (huge_page_shift(hstate) < PUD_SHIFT) | 166 | |
132 | return pud_alloc(mm, pgd, addr); | 167 | pg = pgd_offset(mm, addr); |
133 | else | 168 | if (pshift >= PUD_SHIFT) { |
134 | return (pud_t *) pgd; | 169 | hpdp = (hugepd_t *)pg; |
135 | } | 170 | } else { |
136 | static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) | 171 | pdshift = PUD_SHIFT; |
137 | { | 172 | pu = pud_alloc(mm, pg, addr); |
138 | if (huge_page_shift(hstate) < PMD_SHIFT) | 173 | if (pshift >= PMD_SHIFT) { |
139 | return pmd_offset(pud, addr); | 174 | hpdp = (hugepd_t *)pu; |
140 | else | 175 | } else { |
141 | return (pmd_t *) pud; | 176 | pdshift = PMD_SHIFT; |
142 | } | 177 | pm = pmd_alloc(mm, pu, addr); |
143 | static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, | 178 | hpdp = (hugepd_t *)pm; |
144 | struct hstate *hstate) | 179 | } |
145 | { | 180 | } |
146 | if (huge_page_shift(hstate) < PMD_SHIFT) | 181 | |
147 | return pmd_alloc(mm, pud, addr); | 182 | if (!hpdp) |
148 | else | 183 | return NULL; |
149 | return (pmd_t *) pud; | 184 | |
185 | BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); | ||
186 | |||
187 | if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) | ||
188 | return NULL; | ||
189 | |||
190 | return hugepte_offset(hpdp, addr, pdshift); | ||
150 | } | 191 | } |
151 | 192 | ||
152 | /* Build list of addresses of gigantic pages. This function is used in early | 193 | /* Build list of addresses of gigantic pages. This function is used in early |
@@ -180,92 +221,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate) | |||
180 | return 1; | 221 | return 1; |
181 | } | 222 | } |
182 | 223 | ||
183 | |||
184 | /* Modelled after find_linux_pte() */ | ||
185 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | ||
186 | { | ||
187 | pgd_t *pg; | ||
188 | pud_t *pu; | ||
189 | pmd_t *pm; | ||
190 | |||
191 | unsigned int psize; | ||
192 | unsigned int shift; | ||
193 | unsigned long sz; | ||
194 | struct hstate *hstate; | ||
195 | psize = get_slice_psize(mm, addr); | ||
196 | shift = mmu_psize_to_shift(psize); | ||
197 | sz = ((1UL) << shift); | ||
198 | hstate = size_to_hstate(sz); | ||
199 | |||
200 | addr &= hstate->mask; | ||
201 | |||
202 | pg = pgd_offset(mm, addr); | ||
203 | if (!pgd_none(*pg)) { | ||
204 | pu = hpud_offset(pg, addr, hstate); | ||
205 | if (!pud_none(*pu)) { | ||
206 | pm = hpmd_offset(pu, addr, hstate); | ||
207 | if (!pmd_none(*pm)) | ||
208 | return hugepte_offset((hugepd_t *)pm, addr, | ||
209 | hstate); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | return NULL; | ||
214 | } | ||
215 | |||
216 | pte_t *huge_pte_alloc(struct mm_struct *mm, | ||
217 | unsigned long addr, unsigned long sz) | ||
218 | { | ||
219 | pgd_t *pg; | ||
220 | pud_t *pu; | ||
221 | pmd_t *pm; | ||
222 | hugepd_t *hpdp = NULL; | ||
223 | struct hstate *hstate; | ||
224 | unsigned int psize; | ||
225 | hstate = size_to_hstate(sz); | ||
226 | |||
227 | psize = get_slice_psize(mm, addr); | ||
228 | BUG_ON(!mmu_huge_psizes[psize]); | ||
229 | |||
230 | addr &= hstate->mask; | ||
231 | |||
232 | pg = pgd_offset(mm, addr); | ||
233 | pu = hpud_alloc(mm, pg, addr, hstate); | ||
234 | |||
235 | if (pu) { | ||
236 | pm = hpmd_alloc(mm, pu, addr, hstate); | ||
237 | if (pm) | ||
238 | hpdp = (hugepd_t *)pm; | ||
239 | } | ||
240 | |||
241 | if (! hpdp) | ||
242 | return NULL; | ||
243 | |||
244 | if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize)) | ||
245 | return NULL; | ||
246 | |||
247 | return hugepte_offset(hpdp, addr, hstate); | ||
248 | } | ||
249 | |||
250 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | 224 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) |
251 | { | 225 | { |
252 | return 0; | 226 | return 0; |
253 | } | 227 | } |
254 | 228 | ||
255 | static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, | 229 | static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, |
256 | unsigned int psize) | 230 | unsigned long start, unsigned long end, |
231 | unsigned long floor, unsigned long ceiling) | ||
257 | { | 232 | { |
258 | pte_t *hugepte = hugepd_page(*hpdp); | 233 | pte_t *hugepte = hugepd_page(*hpdp); |
234 | unsigned shift = hugepd_shift(*hpdp); | ||
235 | unsigned long pdmask = ~((1UL << pdshift) - 1); | ||
236 | |||
237 | start &= pdmask; | ||
238 | if (start < floor) | ||
239 | return; | ||
240 | if (ceiling) { | ||
241 | ceiling &= pdmask; | ||
242 | if (! ceiling) | ||
243 | return; | ||
244 | } | ||
245 | if (end - 1 > ceiling - 1) | ||
246 | return; | ||
259 | 247 | ||
260 | hpdp->pd = 0; | 248 | hpdp->pd = 0; |
261 | tlb->need_flush = 1; | 249 | tlb->need_flush = 1; |
262 | pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]); | 250 | pgtable_free_tlb(tlb, hugepte, pdshift - shift); |
263 | } | 251 | } |
264 | 252 | ||
265 | static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 253 | static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, |
266 | unsigned long addr, unsigned long end, | 254 | unsigned long addr, unsigned long end, |
267 | unsigned long floor, unsigned long ceiling, | 255 | unsigned long floor, unsigned long ceiling) |
268 | unsigned int psize) | ||
269 | { | 256 | { |
270 | pmd_t *pmd; | 257 | pmd_t *pmd; |
271 | unsigned long next; | 258 | unsigned long next; |
@@ -277,7 +264,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
277 | next = pmd_addr_end(addr, end); | 264 | next = pmd_addr_end(addr, end); |
278 | if (pmd_none(*pmd)) | 265 | if (pmd_none(*pmd)) |
279 | continue; | 266 | continue; |
280 | free_hugepte_range(tlb, (hugepd_t *)pmd, psize); | 267 | free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, |
268 | addr, next, floor, ceiling); | ||
281 | } while (pmd++, addr = next, addr != end); | 269 | } while (pmd++, addr = next, addr != end); |
282 | 270 | ||
283 | start &= PUD_MASK; | 271 | start &= PUD_MASK; |
@@ -303,23 +291,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
303 | pud_t *pud; | 291 | pud_t *pud; |
304 | unsigned long next; | 292 | unsigned long next; |
305 | unsigned long start; | 293 | unsigned long start; |
306 | unsigned int shift; | ||
307 | unsigned int psize = get_slice_psize(tlb->mm, addr); | ||
308 | shift = mmu_psize_to_shift(psize); | ||
309 | 294 | ||
310 | start = addr; | 295 | start = addr; |
311 | pud = pud_offset(pgd, addr); | 296 | pud = pud_offset(pgd, addr); |
312 | do { | 297 | do { |
313 | next = pud_addr_end(addr, end); | 298 | next = pud_addr_end(addr, end); |
314 | if (shift < PMD_SHIFT) { | 299 | if (!is_hugepd(pud)) { |
315 | if (pud_none_or_clear_bad(pud)) | 300 | if (pud_none_or_clear_bad(pud)) |
316 | continue; | 301 | continue; |
317 | hugetlb_free_pmd_range(tlb, pud, addr, next, floor, | 302 | hugetlb_free_pmd_range(tlb, pud, addr, next, floor, |
318 | ceiling, psize); | 303 | ceiling); |
319 | } else { | 304 | } else { |
320 | if (pud_none(*pud)) | 305 | free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, |
321 | continue; | 306 | addr, next, floor, ceiling); |
322 | free_hugepte_range(tlb, (hugepd_t *)pud, psize); | ||
323 | } | 307 | } |
324 | } while (pud++, addr = next, addr != end); | 308 | } while (pud++, addr = next, addr != end); |
325 | 309 | ||
@@ -350,74 +334,34 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, | |||
350 | { | 334 | { |
351 | pgd_t *pgd; | 335 | pgd_t *pgd; |
352 | unsigned long next; | 336 | unsigned long next; |
353 | unsigned long start; | ||
354 | 337 | ||
355 | /* | 338 | /* |
356 | * Comments below take from the normal free_pgd_range(). They | 339 | * Because there are a number of different possible pagetable |
357 | * apply here too. The tests against HUGEPD_MASK below are | 340 | * layouts for hugepage ranges, we limit knowledge of how |
358 | * essential, because we *don't* test for this at the bottom | 341 | * things should be laid out to the allocation path |
359 | * level. Without them we'll attempt to free a hugepte table | 342 | * (huge_pte_alloc(), above). Everything else works out the |
360 | * when we unmap just part of it, even if there are other | 343 | * structure as it goes from information in the hugepd |
361 | * active mappings using it. | 344 | * pointers. That means that we can't here use the |
362 | * | 345 | * optimization used in the normal page free_pgd_range(), of |
363 | * The next few lines have given us lots of grief... | 346 | * checking whether we're actually covering a large enough |
364 | * | 347 | * range to have to do anything at the top level of the walk |
365 | * Why are we testing HUGEPD* at this top level? Because | 348 | * instead of at the bottom. |
366 | * often there will be no work to do at all, and we'd prefer | ||
367 | * not to go all the way down to the bottom just to discover | ||
368 | * that. | ||
369 | * | ||
370 | * Why all these "- 1"s? Because 0 represents both the bottom | ||
371 | * of the address space and the top of it (using -1 for the | ||
372 | * top wouldn't help much: the masks would do the wrong thing). | ||
373 | * The rule is that addr 0 and floor 0 refer to the bottom of | ||
374 | * the address space, but end 0 and ceiling 0 refer to the top | ||
375 | * Comparisons need to use "end - 1" and "ceiling - 1" (though | ||
376 | * that end 0 case should be mythical). | ||
377 | * | ||
378 | * Wherever addr is brought up or ceiling brought down, we | ||
379 | * must be careful to reject "the opposite 0" before it | ||
380 | * confuses the subsequent tests. But what about where end is | ||
381 | * brought down by HUGEPD_SIZE below? no, end can't go down to | ||
382 | * 0 there. | ||
383 | * | 349 | * |
384 | * Whereas we round start (addr) and ceiling down, by different | 350 | * To make sense of this, you should probably go read the big |
385 | * masks at different levels, in order to test whether a table | 351 | * block comment at the top of the normal free_pgd_range(), |
386 | * now has no other vmas using it, so can be freed, we don't | 352 | * too. |
387 | * bother to round floor or end up - the tests don't need that. | ||
388 | */ | 353 | */ |
389 | unsigned int psize = get_slice_psize(tlb->mm, addr); | ||
390 | |||
391 | addr &= HUGEPD_MASK(psize); | ||
392 | if (addr < floor) { | ||
393 | addr += HUGEPD_SIZE(psize); | ||
394 | if (!addr) | ||
395 | return; | ||
396 | } | ||
397 | if (ceiling) { | ||
398 | ceiling &= HUGEPD_MASK(psize); | ||
399 | if (!ceiling) | ||
400 | return; | ||
401 | } | ||
402 | if (end - 1 > ceiling - 1) | ||
403 | end -= HUGEPD_SIZE(psize); | ||
404 | if (addr > end - 1) | ||
405 | return; | ||
406 | 354 | ||
407 | start = addr; | ||
408 | pgd = pgd_offset(tlb->mm, addr); | 355 | pgd = pgd_offset(tlb->mm, addr); |
409 | do { | 356 | do { |
410 | psize = get_slice_psize(tlb->mm, addr); | ||
411 | BUG_ON(!mmu_huge_psizes[psize]); | ||
412 | next = pgd_addr_end(addr, end); | 357 | next = pgd_addr_end(addr, end); |
413 | if (mmu_psize_to_shift(psize) < PUD_SHIFT) { | 358 | if (!is_hugepd(pgd)) { |
414 | if (pgd_none_or_clear_bad(pgd)) | 359 | if (pgd_none_or_clear_bad(pgd)) |
415 | continue; | 360 | continue; |
416 | hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); | 361 | hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); |
417 | } else { | 362 | } else { |
418 | if (pgd_none(*pgd)) | 363 | free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, |
419 | continue; | 364 | addr, next, floor, ceiling); |
420 | free_hugepte_range(tlb, (hugepd_t *)pgd, psize); | ||
421 | } | 365 | } |
422 | } while (pgd++, addr = next, addr != end); | 366 | } while (pgd++, addr = next, addr != end); |
423 | } | 367 | } |
@@ -448,19 +392,19 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | |||
448 | { | 392 | { |
449 | pte_t *ptep; | 393 | pte_t *ptep; |
450 | struct page *page; | 394 | struct page *page; |
451 | unsigned int mmu_psize = get_slice_psize(mm, address); | 395 | unsigned shift; |
396 | unsigned long mask; | ||
397 | |||
398 | ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); | ||
452 | 399 | ||
453 | /* Verify it is a huge page else bail. */ | 400 | /* Verify it is a huge page else bail. */ |
454 | if (!mmu_huge_psizes[mmu_psize]) | 401 | if (!ptep || !shift) |
455 | return ERR_PTR(-EINVAL); | 402 | return ERR_PTR(-EINVAL); |
456 | 403 | ||
457 | ptep = huge_pte_offset(mm, address); | 404 | mask = (1UL << shift) - 1; |
458 | page = pte_page(*ptep); | 405 | page = pte_page(*ptep); |
459 | if (page) { | 406 | if (page) |
460 | unsigned int shift = mmu_psize_to_shift(mmu_psize); | 407 | page += (address & mask) / PAGE_SIZE; |
461 | unsigned long sz = ((1UL) << shift); | ||
462 | page += (address % sz) / PAGE_SIZE; | ||
463 | } | ||
464 | 408 | ||
465 | return page; | 409 | return page; |
466 | } | 410 | } |
@@ -483,6 +427,73 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
483 | return NULL; | 427 | return NULL; |
484 | } | 428 | } |
485 | 429 | ||
430 | static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | ||
431 | unsigned long end, int write, struct page **pages, int *nr) | ||
432 | { | ||
433 | unsigned long mask; | ||
434 | unsigned long pte_end; | ||
435 | struct page *head, *page; | ||
436 | pte_t pte; | ||
437 | int refs; | ||
438 | |||
439 | pte_end = (addr + sz) & ~(sz-1); | ||
440 | if (pte_end < end) | ||
441 | end = pte_end; | ||
442 | |||
443 | pte = *ptep; | ||
444 | mask = _PAGE_PRESENT | _PAGE_USER; | ||
445 | if (write) | ||
446 | mask |= _PAGE_RW; | ||
447 | |||
448 | if ((pte_val(pte) & mask) != mask) | ||
449 | return 0; | ||
450 | |||
451 | /* hugepages are never "special" */ | ||
452 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
453 | |||
454 | refs = 0; | ||
455 | head = pte_page(pte); | ||
456 | |||
457 | page = head + ((addr & (sz-1)) >> PAGE_SHIFT); | ||
458 | do { | ||
459 | VM_BUG_ON(compound_head(page) != head); | ||
460 | pages[*nr] = page; | ||
461 | (*nr)++; | ||
462 | page++; | ||
463 | refs++; | ||
464 | } while (addr += PAGE_SIZE, addr != end); | ||
465 | |||
466 | if (!page_cache_add_speculative(head, refs)) { | ||
467 | *nr -= refs; | ||
468 | return 0; | ||
469 | } | ||
470 | |||
471 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
472 | /* Could be optimized better */ | ||
473 | while (*nr) { | ||
474 | put_page(page); | ||
475 | (*nr)--; | ||
476 | } | ||
477 | } | ||
478 | |||
479 | return 1; | ||
480 | } | ||
481 | |||
482 | int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, | ||
483 | unsigned long addr, unsigned long end, | ||
484 | int write, struct page **pages, int *nr) | ||
485 | { | ||
486 | pte_t *ptep; | ||
487 | unsigned long sz = 1UL << hugepd_shift(*hugepd); | ||
488 | |||
489 | ptep = hugepte_offset(hugepd, addr, pdshift); | ||
490 | do { | ||
491 | if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) | ||
492 | return 0; | ||
493 | } while (ptep++, addr += sz, addr != end); | ||
494 | |||
495 | return 1; | ||
496 | } | ||
486 | 497 | ||
487 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | 498 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, |
488 | unsigned long len, unsigned long pgoff, | 499 | unsigned long len, unsigned long pgoff, |
@@ -530,34 +541,20 @@ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, | |||
530 | return rflags; | 541 | return rflags; |
531 | } | 542 | } |
532 | 543 | ||
533 | int hash_huge_page(struct mm_struct *mm, unsigned long access, | 544 | int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, |
534 | unsigned long ea, unsigned long vsid, int local, | 545 | pte_t *ptep, unsigned long trap, int local, int ssize, |
535 | unsigned long trap) | 546 | unsigned int shift, unsigned int mmu_psize) |
536 | { | 547 | { |
537 | pte_t *ptep; | ||
538 | unsigned long old_pte, new_pte; | 548 | unsigned long old_pte, new_pte; |
539 | unsigned long va, rflags, pa, sz; | 549 | unsigned long va, rflags, pa, sz; |
540 | long slot; | 550 | long slot; |
541 | int err = 1; | 551 | int err = 1; |
542 | int ssize = user_segment_size(ea); | ||
543 | unsigned int mmu_psize; | ||
544 | int shift; | ||
545 | mmu_psize = get_slice_psize(mm, ea); | ||
546 | 552 | ||
547 | if (!mmu_huge_psizes[mmu_psize]) | 553 | BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); |
548 | goto out; | ||
549 | ptep = huge_pte_offset(mm, ea); | ||
550 | 554 | ||
551 | /* Search the Linux page table for a match with va */ | 555 | /* Search the Linux page table for a match with va */ |
552 | va = hpt_va(ea, vsid, ssize); | 556 | va = hpt_va(ea, vsid, ssize); |
553 | 557 | ||
554 | /* | ||
555 | * If no pte found or not present, send the problem up to | ||
556 | * do_page_fault | ||
557 | */ | ||
558 | if (unlikely(!ptep || pte_none(*ptep))) | ||
559 | goto out; | ||
560 | |||
561 | /* | 558 | /* |
562 | * Check the user's access rights to the page. If access should be | 559 | * Check the user's access rights to the page. If access should be |
563 | * prevented then send the problem up to do_page_fault. | 560 | * prevented then send the problem up to do_page_fault. |
@@ -588,7 +585,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access, | |||
588 | rflags = 0x2 | (!(new_pte & _PAGE_RW)); | 585 | rflags = 0x2 | (!(new_pte & _PAGE_RW)); |
589 | /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ | 586 | /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ |
590 | rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); | 587 | rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); |
591 | shift = mmu_psize_to_shift(mmu_psize); | ||
592 | sz = ((1UL) << shift); | 588 | sz = ((1UL) << shift); |
593 | if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | 589 | if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) |
594 | /* No CPU has hugepages but lacks no execute, so we | 590 | /* No CPU has hugepages but lacks no execute, so we |
@@ -672,6 +668,8 @@ repeat: | |||
672 | 668 | ||
673 | static void __init set_huge_psize(int psize) | 669 | static void __init set_huge_psize(int psize) |
674 | { | 670 | { |
671 | unsigned pdshift; | ||
672 | |||
675 | /* Check that it is a page size supported by the hardware and | 673 | /* Check that it is a page size supported by the hardware and |
676 | * that it fits within pagetable limits. */ | 674 | * that it fits within pagetable limits. */ |
677 | if (mmu_psize_defs[psize].shift && | 675 | if (mmu_psize_defs[psize].shift && |
@@ -686,29 +684,14 @@ static void __init set_huge_psize(int psize) | |||
686 | return; | 684 | return; |
687 | hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); | 685 | hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); |
688 | 686 | ||
689 | switch (mmu_psize_defs[psize].shift) { | 687 | if (mmu_psize_defs[psize].shift < PMD_SHIFT) |
690 | case PAGE_SHIFT_64K: | 688 | pdshift = PMD_SHIFT; |
691 | /* We only allow 64k hpages with 4k base page, | 689 | else if (mmu_psize_defs[psize].shift < PUD_SHIFT) |
692 | * which was checked above, and always put them | 690 | pdshift = PUD_SHIFT; |
693 | * at the PMD */ | 691 | else |
694 | hugepte_shift[psize] = PMD_SHIFT; | 692 | pdshift = PGDIR_SHIFT; |
695 | break; | 693 | mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift; |
696 | case PAGE_SHIFT_16M: | 694 | } |
697 | /* 16M pages can be at two different levels | ||
698 | * of pagestables based on base page size */ | ||
699 | if (PAGE_SHIFT == PAGE_SHIFT_64K) | ||
700 | hugepte_shift[psize] = PMD_SHIFT; | ||
701 | else /* 4k base page */ | ||
702 | hugepte_shift[psize] = PUD_SHIFT; | ||
703 | break; | ||
704 | case PAGE_SHIFT_16G: | ||
705 | /* 16G pages are always at PGD level */ | ||
706 | hugepte_shift[psize] = PGDIR_SHIFT; | ||
707 | break; | ||
708 | } | ||
709 | hugepte_shift[psize] -= mmu_psize_defs[psize].shift; | ||
710 | } else | ||
711 | hugepte_shift[psize] = 0; | ||
712 | } | 695 | } |
713 | 696 | ||
714 | static int __init hugepage_setup_sz(char *str) | 697 | static int __init hugepage_setup_sz(char *str) |
@@ -732,7 +715,7 @@ __setup("hugepagesz=", hugepage_setup_sz); | |||
732 | 715 | ||
733 | static int __init hugetlbpage_init(void) | 716 | static int __init hugetlbpage_init(void) |
734 | { | 717 | { |
735 | unsigned int psize; | 718 | int psize; |
736 | 719 | ||
737 | if (!cpu_has_feature(CPU_FTR_16M_PAGE)) | 720 | if (!cpu_has_feature(CPU_FTR_16M_PAGE)) |
738 | return -ENODEV; | 721 | return -ENODEV; |
@@ -753,8 +736,8 @@ static int __init hugetlbpage_init(void) | |||
753 | 736 | ||
754 | for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { | 737 | for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { |
755 | if (mmu_huge_psizes[psize]) { | 738 | if (mmu_huge_psizes[psize]) { |
756 | pgtable_cache_add(hugepte_shift[psize], NULL); | 739 | pgtable_cache_add(mmu_huge_psizes[psize], NULL); |
757 | if (!PGT_CACHE(hugepte_shift[psize])) | 740 | if (!PGT_CACHE(mmu_huge_psizes[psize])) |
758 | panic("hugetlbpage_init(): could not create " | 741 | panic("hugetlbpage_init(): could not create " |
759 | "pgtable cache for %d bit pagesize\n", | 742 | "pgtable cache for %d bit pagesize\n", |
760 | mmu_psize_to_shift(psize)); | 743 | mmu_psize_to_shift(psize)); |