author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ppc64/mm/hugetlbpage.c
tag       v2.6.12-rc2 (Linux-2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/ppc64/mm/hugetlbpage.c')

-rw-r--r--  arch/ppc64/mm/hugetlbpage.c  904
1 file changed, 904 insertions(+), 0 deletions(-)

diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
new file mode 100644
index 000000000000..c62ddaff0720
--- /dev/null
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -0,0 +1,904 @@
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>

#define HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))

#define HUGEPTE_INDEX_SIZE	9
#define HUGEPGD_INDEX_SIZE	10

#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
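
/*
 * A quick sanity check of the layout above, assuming the usual ppc64
 * values HPAGE_SHIFT == 24 (16M huge pages) and PAGE_SHIFT == 12 (4k
 * base pages): HUGEPGDIR_SHIFT = 24 + 12 - 3 = 33, so each top-level
 * slot spans 8G of address space; one 4k page holds PTRS_PER_HUGEPTE
 * == 512 huge ptes, and 512 * 16M is exactly that same 8G.  The top
 * level directory has PTRS_PER_HUGEPGD == 1024 slots.
 */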

static inline int hugepgd_index(unsigned long addr)
{
	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
}

static pgd_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
{
	int index;

	if (! mm->context.huge_pgdir)
		return NULL;

	index = hugepgd_index(addr);
	BUG_ON(index >= PTRS_PER_HUGEPGD);
	return mm->context.huge_pgdir + index;
}

static inline pte_t *hugepte_offset(pgd_t *dir, unsigned long addr)
{
	int index;

	if (pgd_none(*dir))
		return NULL;

	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
	return (pte_t *)pgd_page(*dir) + index;
}

static pgd_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
{
	BUG_ON(! in_hugepage_area(mm->context, addr));

	if (! mm->context.huge_pgdir) {
		pgd_t *new;
		spin_unlock(&mm->page_table_lock);
		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
		BUG_ON(new && memcmp(new, empty_zero_page, PAGE_SIZE));
		spin_lock(&mm->page_table_lock);
		/* allocation can fail even with __GFP_REPEAT */
		if (! new)
			return NULL;

		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (mm->context.huge_pgdir)
			pgd_free(new);
		else
			mm->context.huge_pgdir = new;
	}
	return hugepgd_offset(mm, addr);
}

static pte_t *hugepte_alloc(struct mm_struct *mm, pgd_t *dir,
			    unsigned long addr)
{
	if (! pgd_present(*dir)) {
		pte_t *new;

		spin_unlock(&mm->page_table_lock);
		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
		BUG_ON(new && memcmp(new, empty_zero_page, PAGE_SIZE));
		spin_lock(&mm->page_table_lock);
		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (pgd_present(*dir)) {
			if (new)
				kmem_cache_free(zero_cache, new);
		} else {
			struct page *ptepage;

			if (! new)
				return NULL;
			ptepage = virt_to_page(new);
			ptepage->mapping = (void *) mm;
			ptepage->index = addr & HUGEPGDIR_MASK;
			pgd_populate(mm, dir, new);
		}
	}

	return hugepte_offset(dir, addr);
}

static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	pgd = hugepgd_offset(mm, addr);
	if (! pgd)
		return NULL;

	return hugepte_offset(pgd, addr);
}

static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	pgd = hugepgd_alloc(mm, addr);
	if (! pgd)
		return NULL;

	return hugepte_alloc(mm, pgd, addr);
}

static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
			 unsigned long addr, struct page *page,
			 pte_t *ptep, int write_access)
{
	pte_t entry;

	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
	if (write_access) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
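	/* Note the pte is made dirty up front when writable: hugetlb
	 * pages are prefaulted (see hugetlb_prefault() below), so no
	 * later write fault comes along to set the dirty bit lazily. */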
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	set_pte_at(mm, addr, ptep, entry);
}

/*
 * This function checks for proper alignment of input addr and len parameters.
 */
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
{
	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (addr & ~HPAGE_MASK)
		return -EINVAL;
	if (! (within_hugepage_low_range(addr, len)
	       || within_hugepage_high_range(addr, len)))
		return -EINVAL;
	return 0;
}

static void flush_segments(void *parm)
{
	u16 segs = (unsigned long) parm;
	unsigned long i;

	asm volatile("isync" : : : "memory");

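	/* Example: segs == 0x0018 names segments 3 and 4, so the loop
	 * below issues slbie on effective addresses 3 << SID_SHIFT and
	 * 4 << SID_SHIFT -- 0x30000000 and 0x40000000 with the usual
	 * 256M segments (SID_SHIFT == 28). */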
	for (i = 0; i < 16; i++) {
		if (! (segs & (1U << i)))
			continue;
		asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
	}

	asm volatile("isync" : : : "memory");
}

static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
{
	unsigned long start = seg << SID_SHIFT;
	unsigned long end = (seg+1) << SID_SHIFT;
	struct vm_area_struct *vma;
	unsigned long addr;
	struct mmu_gather *tlb;

	BUG_ON(seg >= 16);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	/* Clean up any leftover PTE pages in the region */
	spin_lock(&mm->page_table_lock);
	tlb = tlb_gather_mmu(mm, 0);
	for (addr = start; addr < end; addr += PMD_SIZE) {
		pgd_t *pgd = pgd_offset(mm, addr);
		pmd_t *pmd;
		struct page *page;
		pte_t *pte;
		int i;

		if (pgd_none(*pgd))
			continue;
		pmd = pmd_offset(pgd, addr);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (pmd_bad(*pmd)) {
			pmd_ERROR(*pmd);
			pmd_clear(pmd);
			continue;
		}
		pte = (pte_t *)pmd_page_kernel(*pmd);
		/* No VMAs, so there should be no PTEs, check just in case. */
		for (i = 0; i < PTRS_PER_PTE; i++) {
			BUG_ON(!pte_none(*pte));
			pte++;
		}
		page = pmd_page(*pmd);
		pmd_clear(pmd);
		mm->nr_ptes--;
		dec_page_state(nr_page_table_pages);
		pte_free_tlb(tlb, page);
	}
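	/* (With the usual 4k base pages PMD_SIZE is 2M, so the loop
	 *  above covers a 256M segment in 128 steps, each able to free
	 *  one leftover pte page.) */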
	tlb_finish_mmu(tlb, start, end);
	spin_unlock(&mm->page_table_lock);

	return 0;
}

static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
{
	unsigned long i;

	newsegs &= ~(mm->context.htlb_segs);
	if (! newsegs)
		return 0; /* The segments we want are already open */

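	/* e.g. newsegs == 0x000c asks for segments 2 and 3; any segment
	 * already open has just been masked off above */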
	for (i = 0; i < 16; i++)
		if ((1 << i) & newsegs)
			if (prepare_low_seg_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.htlb_segs |= newsegs;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();
	on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);

	return 0;
}

int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	if (within_hugepage_high_range(addr, len))
		return 0;
	else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
		int err;
		/* Yes, we need both tests, in case addr+len overflows
		 * 64-bit arithmetic */
		err = open_low_hpage_segs(current->mm,
					  LOW_ESID_MASK(addr, len));
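		/* (e.g. 32M at 0x40000000 gives LOW_ESID_MASK == 0x0010,
		 *  i.e. just segment 4) */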
		if (err)
			printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
			       " failed (segs: 0x%04hx)\n", addr, len,
			       LOW_ESID_MASK(addr, len));
		return err;
	}

	return -EINVAL;
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	int err = -ENOMEM;

	while (addr < end) {
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto out;

		src_pte = huge_pte_offset(src, addr);
		entry = *src_pte;

		ptepage = pte_page(entry);
		get_page(ptepage);
		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
		set_pte_at(dst, addr, dst_pte, entry);

		addr += HPAGE_SIZE;
	}

	err = 0;
out:
	return err;
}

int
follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
		    struct page **pages, struct vm_area_struct **vmas,
		    unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	WARN_ON(!is_vm_hugetlb_page(vma));

	vpfn = vaddr/PAGE_SIZE;
	while (vaddr < vma->vm_end && remainder) {
		if (pages) {
			pte_t *pte;
			struct page *page;

			pte = huge_pte_offset(mm, vaddr);

			/* hugetlb should be locked, and hence, prefaulted */
			WARN_ON(!pte || pte_none(*pte));

			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
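			/* (one of HPAGE_SIZE/PAGE_SIZE == 4096 4k subpages
			 * of the compound page, for 16M huge pages) */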

			WARN_ON(!PageCompound(page));

			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}

	*length = remainder;
	*position = vaddr;

	return i;
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

void unmap_hugepage_range(struct vm_area_struct *vma,
			  unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON((start % HPAGE_SIZE) != 0);
	BUG_ON((end % HPAGE_SIZE) != 0);

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t pte;

		ptep = huge_pte_offset(mm, addr);
		if (!ptep || pte_none(*ptep))
			continue;

		pte = *ptep;
		page = pte_page(pte);
		pte_clear(mm, addr, ptep);

		put_page(page);
	}
	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
	flush_tlb_pending();
}

void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
			   unsigned long start, unsigned long end)
{
	/* Because the huge pgtables are only 2 level, they can take
	 * at most around 4M, much less than one hugepage which the
	 * process is presumably entitled to use.  So we don't bother
	 * freeing up the pagetables on unmap, and wait until
	 * destroy_context() to clean up the lot. */
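	/* (Roughly: 1024 top-level slots, each holding at most one 4k
	 *  page of huge ptes, comes to about 4M, assuming 4k base
	 *  pages.) */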
}

int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr;
	int ret = 0;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON((vma->vm_start % HPAGE_SIZE) != 0);
	BUG_ON((vma->vm_end % HPAGE_SIZE) != 0);

	spin_lock(&mm->page_table_lock);
	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		unsigned long idx;
		pte_t *pte = huge_pte_alloc(mm, addr);
		struct page *page;

		if (!pte) {
			ret = -ENOMEM;
			goto out;
		}
		if (! pte_none(*pte))
			continue;

		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
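		/* (vm_pgoff stays in 4k units, so with 16M huge pages a
		 * pgoff of 4096 -- 16M into the file -- adds 4096 >> 12
		 * == 1 to the hugepage index) */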
		page = find_get_page(mapping, idx);
		if (!page) {
			/* charge the fs quota first */
			if (hugetlb_get_quota(mapping)) {
				ret = -ENOMEM;
				goto out;
			}
			page = alloc_huge_page();
			if (!page) {
				hugetlb_put_quota(mapping);
				ret = -ENOMEM;
				goto out;
			}
			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
			if (! ret) {
				unlock_page(page);
			} else {
				hugetlb_put_quota(mapping);
				free_huge_page(page);
				goto out;
			}
		}
		set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE);
	}
out:
	spin_unlock(&mm->page_table_lock);
	return ret;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
				     unsigned long len, unsigned long pgoff,
				     unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (((TASK_SIZE - len) >= addr)
		    && (!vma || (addr+len) <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr,len))
			return addr;
	}
	start_addr = addr = mm->free_area_cache;

full_search:
	vma = find_vma(mm, addr);
	while (TASK_SIZE - len >= addr) {
		BUG_ON(vma && (addr >= vma->vm_end));

		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
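			/* e.g. a candidate at 0x2f000000 that touches an
			 * open hugepage segment is bumped to the next 256M
			 * boundary, 0x30000000 */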
			vma = find_vma(mm, addr);
			continue;
		}
		if (touches_hugepage_high_range(addr, len)) {
			addr = TASK_HPAGE_END;
			vma = find_vma(mm, addr);
			continue;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		addr = vma->vm_end;
		vma = vma->vm_next;
	}

	/* Make sure we didn't miss any holes */
	if (start_addr != TASK_UNMAPPED_BASE) {
		start_addr = addr = TASK_UNMAPPED_BASE;
		goto full_search;
	}
	return -ENOMEM;
}

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			       const unsigned long len, const unsigned long pgoff,
			       const unsigned long flags)
{
	struct vm_area_struct *vma, *prev_vma;
	struct mm_struct *mm = current->mm;
	unsigned long base = mm->mmap_base, addr = addr0;
	int first_time = 1;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr,len))
			return addr;
	}

try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & PAGE_MASK;
	do {
hugepage_recheck:
		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = (addr & ((~0) << SID_SHIFT)) - len;
			goto hugepage_recheck;
		} else if (touches_hugepage_high_range(addr, len)) {
			addr = TASK_HPAGE_BASE - len;
		}

		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr+len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end)))
			/* remember the address as a hint for next time */
			return (mm->free_area_cache = addr);
		else
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end)
				mm->free_area_cache = vma->vm_start;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;

	return addr;
}

static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
	unsigned long addr = 0;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= 0x100000000UL) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_low_range(addr, len, segmask)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on segmask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len)
{
	unsigned long addr = TASK_HPAGE_BASE;
	struct vm_area_struct *vma;

	for (vma = find_vma(current->mm, addr);
	     addr + len <= TASK_HPAGE_END;
	     vma = vma->vm_next) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
		BUG_ON(! within_hugepage_high_range(addr, len));

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Because we're in a hugepage region, this alignment
		 * should not skip us over any VMAs */
	}

	return -ENOMEM;
}

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	if (len & ~HPAGE_MASK)
		return -EINVAL;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -EINVAL;

	if (test_thread_flag(TIF_32BIT)) {
		int lastshift = 0;
		u16 segmask, cursegs = current->mm->context.htlb_segs;

		/* First see if we can do the mapping in the existing
		 * low hpage segments */
		addr = htlb_get_low_area(len, cursegs);
		if (addr != -ENOMEM)
			return addr;

		for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
		     ! lastshift; segmask >>=1) {
			if (segmask & 1)
				lastshift = 1;

			addr = htlb_get_low_area(len, cursegs | segmask);
			if ((addr != -ENOMEM)
			    && open_low_hpage_segs(current->mm, segmask) == 0)
				return addr;
		}
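		/* (For a 16M request the walk starts at segmask == 0x8000,
		 * segment 15 alone, and slides the window down one segment
		 * per iteration until bit 0 has been tried.) */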
		printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
		       " enough segments\n");
		return -ENOMEM;
	} else {
		return htlb_get_high_area(len);
	}
}

void hugetlb_mm_free_pgd(struct mm_struct *mm)
{
	int i;
	pgd_t *pgdir;

	spin_lock(&mm->page_table_lock);

	pgdir = mm->context.huge_pgdir;
	if (! pgdir)
		goto out;

	mm->context.huge_pgdir = NULL;

	/* cleanup any hugepte pages leftover */
	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
		pgd_t *pgd = pgdir + i;

		if (! pgd_none(*pgd)) {
			pte_t *pte = (pte_t *)pgd_page(*pgd);
			struct page *ptepage = virt_to_page(pte);

			ptepage->mapping = NULL;

			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
			kmem_cache_free(zero_cache, pte);
		}
		pgd_clear(pgd);
	}

	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
	kmem_cache_free(zero_cache, pgdir);

out:
	spin_unlock(&mm->page_table_lock);
}

int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local)
{
	pte_t *ptep;
	unsigned long va, vpn;
	pte_t old_pte, new_pte;
	unsigned long hpteflags, prpn;
	long slot;
	int err = 1;

	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = (vsid << 28) | (ea & 0x0fffffff);
	vpn = va >> HPAGE_SHIFT;

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/* BUG_ON(pte_bad(*ptep)); */

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE.  There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE.  The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	old_pte = *ptep;
	new_pte = old_pte;

	hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	hpteflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(vpn, 1);
		if (pte_val(old_pte) & _PAGE_SECONDARY)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
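		/* (_PAGE_GROUP_IX keeps the 3-bit slot-within-group index
		 * at pte bits 12-14, hence the shift by 12 here and when
		 * the new slot is stored below) */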

		if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(vpn, 1);
		unsigned long hpte_group;

		prpn = pte_pfn(old_pte);

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* Update the linux pte with the HPTE slot */
		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
		pte_val(new_pte) |= _PAGE_HASHPTE;

		/* Add in WIMG bits */
		/* XXX We should store these in the pte */
		hpteflags |= _PAGE_COHERENT;

		slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
					  hpteflags, 0, 1);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			pte_val(new_pte) |= _PAGE_SECONDARY;
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
						  1, hpteflags, 0, 1);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP) & ~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;

		/*
		 * No need to use ldarx/stdcx here because all who
		 * might be updating the pte will hold the
		 * page_table_lock
		 */
		*ptep = new_pte;
	}

	err = 0;

out:
	spin_unlock(&mm->page_table_lock);

	return err;
}