Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--   mm/hugetlb.c   192
1 file changed, 139 insertions, 53 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471b..f4c43d7980ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -36,18 +38,21 @@ static void enqueue_huge_page(struct page *page)
         free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+                                unsigned long address)
 {
         int nid = numa_node_id();
         struct page *page = NULL;
+        struct zonelist *zonelist = huge_zonelist(vma, address);
+        struct zone **z;
 
-        if (list_empty(&hugepage_freelists[nid])) {
-                for (nid = 0; nid < MAX_NUMNODES; ++nid)
-                        if (!list_empty(&hugepage_freelists[nid]))
-                                break;
-        }
-        if (nid >= 0 && nid < MAX_NUMNODES &&
-            !list_empty(&hugepage_freelists[nid])) {
+        for (z = zonelist->zones; *z; z++) {
+                nid = (*z)->zone_pgdat->node_id;
+                if (!list_empty(&hugepage_freelists[nid]))
+                        break;
+        }
+
+        if (*z) {
                 page = list_entry(hugepage_freelists[nid].next,
                                   struct page, lru);
                 list_del(&page->lru);
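Note: in the hunk above, dequeue_huge_page() now walks the zonelist that huge_zonelist() builds from the faulting VMA's NUMA policy and takes the first node with a free huge page, instead of the old linear scan over all nodes. A minimal user-space sketch of just that selection step follows; the zonelist-as-array and the per-node counters are illustrative stand-ins, not kernel structures.

#include <stdio.h>

#define MAX_NUMNODES 4

/* Hypothetical stand-ins: a "zonelist" is modelled as a policy-ordered,
 * -1-terminated array of node ids, and the per-node counters replace the
 * kernel's hugepage_freelists[]. */
static int free_huge_pages_node[MAX_NUMNODES] = { 0, 0, 3, 1 };

/* Return the first node in policy order that has a free huge page,
 * or -1 if every node on the list is empty (the role of "if (*z)"). */
static int pick_node(const int *zonelist)
{
        int i;

        for (i = 0; zonelist[i] >= 0; i++)
                if (free_huge_pages_node[zonelist[i]] > 0)
                        return zonelist[i];
        return -1;
}

int main(void)
{
        /* Policy prefers node 1, then 2, then 0. */
        int zonelist[] = { 1, 2, 0, -1 };

        printf("allocate from node %d\n", pick_node(zonelist)); /* node 2 */
        return 0;
}

If no node on the policy's list has a free huge page, the allocation fails rather than falling back to an arbitrary node, which is the behavioural change this hunk introduces.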
@@ -85,13 +90,13 @@ void free_huge_page(struct page *page)
         spin_unlock(&hugetlb_lock);
 }
 
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
         struct page *page;
         int i;
 
         spin_lock(&hugetlb_lock);
-        page = dequeue_huge_page();
+        page = dequeue_huge_page(vma, addr);
         if (!page) {
                 spin_unlock(&hugetlb_lock);
                 return NULL;
@@ -194,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
         spin_lock(&hugetlb_lock);
         try_to_free_low(count);
         while (count < nr_huge_pages) {
-                struct page *page = dequeue_huge_page();
+                struct page *page = dequeue_huge_page(NULL, 0);
                 if (!page)
                         break;
                 update_and_free_page(page);
@@ -261,11 +266,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
         .nopage = hugetlb_nopage,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+                                int writable)
 {
         pte_t entry;
 
-        if (vma->vm_flags & VM_WRITE) {
+        if (writable) {
                 entry =
                     pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
         } else {
@@ -277,12 +283,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
         return entry;
 }
 
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+                                   unsigned long address, pte_t *ptep)
+{
+        pte_t entry;
+
+        entry = pte_mkwrite(pte_mkdirty(*ptep));
+        ptep_set_access_flags(vma, address, ptep, entry, 1);
+        update_mmu_cache(vma, address, entry);
+        lazy_mmu_prot_update(entry);
+}
+
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                             struct vm_area_struct *vma)
 {
         pte_t *src_pte, *dst_pte, entry;
         struct page *ptepage;
         unsigned long addr;
+        int cow;
+
+        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
         for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                 src_pte = huge_pte_offset(src, addr);
@@ -294,6 +315,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                 spin_lock(&dst->page_table_lock);
                 spin_lock(&src->page_table_lock);
                 if (!pte_none(*src_pte)) {
+                        if (cow)
+                                ptep_set_wrprotect(src, addr, src_pte);
                         entry = *src_pte;
                         ptepage = pte_page(entry);
                         get_page(ptepage);
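The new cow flag computed in copy_hugetlb_page_range() is true only for private mappings that may gain write permission; for those, the parent's PTE is write-protected before the child copies it, so either side's first write faults into the COW path. A small stand-alone sketch of that predicate, with flag values invented for illustration (the real VM_* bits live in <linux/mm.h>):

#include <stdio.h>

/* Hypothetical flag values for illustration only. */
#define VM_SHARED       0x1
#define VM_MAYWRITE     0x2

/* A mapping needs copy-on-write handling at fork time only when it is
 * private (not VM_SHARED) but could be written later (VM_MAYWRITE) --
 * the same test copy_hugetlb_page_range() performs. */
static int needs_cow(unsigned long vm_flags)
{
        return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

int main(void)
{
        printf("private read-write: %d\n", needs_cow(VM_MAYWRITE));             /* 1 */
        printf("shared read-write:  %d\n", needs_cow(VM_SHARED | VM_MAYWRITE)); /* 0 */
        printf("private read-only:  %d\n", needs_cow(0));                       /* 0 */
        return 0;
}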
@@ -345,57 +368,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
         flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_lock_huge_page(struct address_space *mapping,
-                        unsigned long idx)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+                        unsigned long address, pte_t *ptep, pte_t pte)
 {
-        struct page *page;
-        int err;
-        struct inode *inode = mapping->host;
-        unsigned long size;
+        struct page *old_page, *new_page;
+        int i, avoidcopy;
 
-retry:
-        page = find_lock_page(mapping, idx);
-        if (page)
-                goto out;
+        old_page = pte_page(pte);
 
-        /* Check to make sure the mapping hasn't been truncated */
-        size = i_size_read(inode) >> HPAGE_SHIFT;
-        if (idx >= size)
-                goto out;
+        /* If no-one else is actually using this page, avoid the copy
+         * and just make the page writable */
+        avoidcopy = (page_count(old_page) == 1);
+        if (avoidcopy) {
+                set_huge_ptep_writable(vma, address, ptep);
+                return VM_FAULT_MINOR;
+        }
 
-        if (hugetlb_get_quota(mapping))
-                goto out;
-        page = alloc_huge_page();
-        if (!page) {
-                hugetlb_put_quota(mapping);
-                goto out;
+        page_cache_get(old_page);
+        new_page = alloc_huge_page(vma, address);
+
+        if (!new_page) {
+                page_cache_release(old_page);
+
+                /* Logically this is OOM, not a SIGBUS, but an OOM
+                 * could cause the kernel to go killing other
+                 * processes which won't help the hugepage situation
+                 * at all (?) */
+                return VM_FAULT_SIGBUS;
         }
 
-        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-        if (err) {
-                put_page(page);
-                hugetlb_put_quota(mapping);
-                if (err == -EEXIST)
-                        goto retry;
-                page = NULL;
+        spin_unlock(&mm->page_table_lock);
+        for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+                copy_user_highpage(new_page + i, old_page + i,
+                                   address + i*PAGE_SIZE);
+        spin_lock(&mm->page_table_lock);
+
+        ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+        if (likely(pte_same(*ptep, pte))) {
+                /* Break COW */
+                set_huge_pte_at(mm, address, ptep,
+                                make_huge_pte(vma, new_page, 1));
+                /* Make the old page be freed below */
+                new_page = old_page;
         }
-out:
-        return page;
+        page_cache_release(new_page);
+        page_cache_release(old_page);
+        return VM_FAULT_MINOR;
 }
 
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-                        unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                        unsigned long address, pte_t *ptep, int write_access)
 {
         int ret = VM_FAULT_SIGBUS;
         unsigned long idx;
         unsigned long size;
-        pte_t *pte;
         struct page *page;
         struct address_space *mapping;
-
-        pte = huge_pte_alloc(mm, address);
-        if (!pte)
-                goto out;
+        pte_t new_pte;
 
         mapping = vma->vm_file->f_mapping;
         idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
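In the hunk above, hugetlb_cow() either makes the existing page writable (when page_count() shows no other user) or allocates a fresh huge page, copies it one base page at a time with the page_table_lock dropped, and re-checks the PTE before installing the copy. The following user-space model keeps only the copy-vs-reuse decision and the subpage copy loop; struct hpage, the sizes, and the refcount handling are simplifications, not kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy geometry standing in for HPAGE_SIZE/PAGE_SIZE; the values are made up. */
#define PAGE_SIZE       16
#define HPAGE_SIZE      (4 * PAGE_SIZE)

struct hpage {
        int  refcount;
        char data[HPAGE_SIZE];
};

/* Simplified model of the hugetlb_cow() decision: if the faulting task is
 * the only user, the existing page can simply be made writable; otherwise
 * the whole huge page is copied one base page at a time and the mapping is
 * switched to the private copy. */
static struct hpage *break_cow(struct hpage *old, int *writable)
{
        struct hpage *new;
        int i;

        if (old->refcount == 1) {       /* the "avoidcopy" case */
                *writable = 1;
                return old;
        }

        new = malloc(sizeof(*new));
        if (!new)
                return NULL;            /* the kernel returns VM_FAULT_SIGBUS here */

        for (i = 0; i < HPAGE_SIZE / PAGE_SIZE; i++)    /* the copy_user_highpage() loop */
                memcpy(new->data + i * PAGE_SIZE,
                       old->data + i * PAGE_SIZE, PAGE_SIZE);

        new->refcount = 1;
        old->refcount--;        /* drop our reference to the shared original */
        *writable = 1;
        return new;
}

int main(void)
{
        struct hpage shared = { .refcount = 2 };
        struct hpage *mine;
        int writable = 0;

        strcpy(shared.data, "hello");
        mine = break_cow(&shared, &writable);
        printf("copied=%d writable=%d data=%s\n",
               mine != &shared, writable, mine->data);  /* copied=1 writable=1 data=hello */
        if (mine != &shared)
                free(mine);
        return 0;
}

The re-check of the PTE after re-taking the lock (pte_same() in the real code) has no analogue in this single-threaded sketch; it exists because another thread may have resolved the fault while the copy ran unlocked.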
@@ -405,9 +434,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
-        page = find_lock_huge_page(mapping, idx);
-        if (!page)
-                goto out;
+retry:
+        page = find_lock_page(mapping, idx);
+        if (!page) {
+                if (hugetlb_get_quota(mapping))
+                        goto out;
+                page = alloc_huge_page(vma, address);
+                if (!page) {
+                        hugetlb_put_quota(mapping);
+                        goto out;
+                }
+
+                if (vma->vm_flags & VM_SHARED) {
+                        int err;
+
+                        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+                        if (err) {
+                                put_page(page);
+                                hugetlb_put_quota(mapping);
+                                if (err == -EEXIST)
+                                        goto retry;
+                                goto out;
+                        }
+                } else
+                        lock_page(page);
+        }
 
         spin_lock(&mm->page_table_lock);
         size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +466,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto backout;
 
         ret = VM_FAULT_MINOR;
-        if (!pte_none(*pte))
+        if (!pte_none(*ptep))
                 goto backout;
 
         add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-        set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+                                && (vma->vm_flags & VM_SHARED)));
+        set_huge_pte_at(mm, address, ptep, new_pte);
+
+        if (write_access && !(vma->vm_flags & VM_SHARED)) {
+                /* Optimization, do the COW without a second fault */
+                ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+        }
+
         spin_unlock(&mm->page_table_lock);
         unlock_page(page);
 out:
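hugetlb_no_page() now installs a writable PTE only when the mapping is both VM_WRITE and VM_SHARED; a private mapping gets a read-only PTE and, when the fault itself was a write, goes straight into hugetlb_cow() rather than taking a second fault. A tiny sketch of those two decisions, with flag values invented for illustration:

#include <stdio.h>

/* Hypothetical flag values for illustration only; not the real VM_* constants. */
#define VM_WRITE        0x1
#define VM_SHARED       0x2

/* A writable PTE is installed up front only for shared writable mappings;
 * private mappings start read-only so that writes go through the COW path. */
static int initial_pte_writable(unsigned long vm_flags)
{
        return (vm_flags & VM_WRITE) && (vm_flags & VM_SHARED);
}

/* For a private mapping, a write fault triggers the COW copy immediately,
 * saving the second fault the read-only PTE would otherwise cause. */
static int cow_on_first_fault(unsigned long vm_flags, int write_access)
{
        return write_access && !(vm_flags & VM_SHARED);
}

int main(void)
{
        unsigned long private_rw = VM_WRITE;
        unsigned long shared_rw = VM_WRITE | VM_SHARED;

        printf("private rw: writable=%d cow-now=%d\n",
               initial_pte_writable(private_rw), cow_on_first_fault(private_rw, 1)); /* 0 1 */
        printf("shared rw:  writable=%d cow-now=%d\n",
               initial_pte_writable(shared_rw), cow_on_first_fault(shared_rw, 1));   /* 1 0 */
        return 0;
}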
@@ -433,6 +492,33 @@ backout:
         goto out;
 }
 
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+                        unsigned long address, int write_access)
+{
+        pte_t *ptep;
+        pte_t entry;
+        int ret;
+
+        ptep = huge_pte_alloc(mm, address);
+        if (!ptep)
+                return VM_FAULT_OOM;
+
+        entry = *ptep;
+        if (pte_none(entry))
+                return hugetlb_no_page(mm, vma, address, ptep, write_access);
+
+        ret = VM_FAULT_MINOR;
+
+        spin_lock(&mm->page_table_lock);
+        /* Check for a racing update before calling hugetlb_cow */
+        if (likely(pte_same(entry, *ptep)))
+                if (write_access && !pte_write(entry))
+                        ret = hugetlb_cow(mm, vma, address, ptep, entry);
+        spin_unlock(&mm->page_table_lock);
+
+        return ret;
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, int *length, int i)
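Taken together, the new hugetlb_fault() is only a dispatcher: a missing PTE goes to hugetlb_no_page(), a write to a present read-only PTE goes to hugetlb_cow(), and everything else is treated as a minor fault. A stand-alone model of that dispatch, using a toy PTE encoding rather than the kernel's pte_t helpers:

#include <stdio.h>

/* Toy PTE model for illustration: 0 means "no entry", otherwise bit 1
 * marks the entry present and bit 0 grants write permission. */
#define PTE_NONE        0UL
#define PTE_PRESENT     0x2UL
#define PTE_WRITE       0x1UL

enum fault_result { FAULT_MINOR, FAULT_NO_PAGE, FAULT_COW };

/* Mirrors the hugetlb_fault() dispatch: missing entry -> populate
 * (hugetlb_no_page), write to a read-only entry -> break COW
 * (hugetlb_cow), anything else is a minor fault. */
static enum fault_result dispatch(unsigned long pte, int write_access)
{
        if (pte == PTE_NONE)
                return FAULT_NO_PAGE;
        if (write_access && !(pte & PTE_WRITE))
                return FAULT_COW;
        return FAULT_MINOR;
}

int main(void)
{
        printf("%d\n", dispatch(PTE_NONE, 1));                  /* FAULT_NO_PAGE (1) */
        printf("%d\n", dispatch(PTE_PRESENT, 1));               /* FAULT_COW (2) */
        printf("%d\n", dispatch(PTE_PRESENT | PTE_WRITE, 1));   /* FAULT_MINOR (0) */
        return 0;
}

The pte_same() re-check under page_table_lock in the real function guards against a concurrent fault resolving the entry between the unlocked read and the COW call; the sketch omits locking entirely.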