| author | Kirill A. Shutemov <kirill.shutemov@linux.intel.com> | 2014-04-03 17:48:13 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-04-03 19:21:03 -0400 |
| commit | f0c6d4d295e4ea9a47375304420baa38ca279542 (patch) | |
| tree | dde84d183f17f94772500f408177486d509d85c7 /mm/memory.c | |
| parent | ec47c3b9543054f6f255d027100fa8214e637003 (diff) | |
mm: introduce do_shared_fault() and drop do_fault()
Introduce do_shared_fault(). The function does what do_fault() does for
write faults to shared mappings.
Unlike do_fault(), do_shared_fault() is relatively clean and
straightforward.
Old do_fault() is not needed anymore. Let it die.
[lliubbo@gmail.com: fix NULL pointer dereference]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
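For orientation, here is a minimal sketch of the fault dispatch once this patch lands. The last two branches are taken verbatim from the hunks below; the read-fault branch and the exact body of do_linear_fault() are an assumption reconstructed from the sibling patches in this series (do_read_fault() and do_cow_fault() are introduced there, not here):

```c
/*
 * Sketch (not verbatim kernel source): after this series, each fault
 * kind gets its own helper. Write faults to shared mappings now go to
 * do_shared_fault(); private (COW) writes to do_cow_fault(); reads to
 * do_read_fault().
 */
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	pgoff_t pgoff = (((address & PAGE_MASK)
			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	pte_unmap(page_table);
	if (!(flags & FAULT_FLAG_WRITE))
		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	if (!(vma->vm_flags & VM_SHARED))
		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
```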
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 226
1 file changed, 62 insertions(+), 164 deletions(-)
```diff
diff --git a/mm/memory.c b/mm/memory.c
index 5be13e794a7c..d4320e42989d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2748,7 +2748,7 @@ reuse:
 		 * bit after it clear all dirty ptes, but before a racing
 		 * do_wp_page installs a dirty pte.
 		 *
-		 * do_fault is protected similarly.
+		 * do_shared_fault is protected similarly.
 		 */
 		if (!page_mkwrite) {
 			wait_on_page_locked(dirty_page);
@@ -3410,188 +3410,86 @@ uncharge_out:
 	return ret;
 }
 
-/*
- * do_fault() tries to create a new page mapping. It aggressively
- * tries to share with existing pages, but makes a separate copy if
- * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
- * the next page fault.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte neither mapped nor locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- */
-static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
-	pte_t *page_table;
+	struct page *fault_page;
+	struct address_space *mapping;
 	spinlock_t *ptl;
-	struct page *page, *fault_page;
-	struct page *cow_page;
-	pte_t entry;
-	int anon = 0;
-	struct page *dirty_page = NULL;
-	int ret;
-	int page_mkwrite = 0;
-
-	/*
-	 * If we do COW later, allocate page befor taking lock_page()
-	 * on the file cache page. This will reduce lock holding time.
-	 */
-	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-
-		if (unlikely(anon_vma_prepare(vma)))
-			return VM_FAULT_OOM;
-
-		cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!cow_page)
-			return VM_FAULT_OOM;
-
-		if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
-			page_cache_release(cow_page);
-			return VM_FAULT_OOM;
-		}
-	} else
-		cow_page = NULL;
+	pte_t entry, *pte;
+	int dirtied = 0;
+	struct vm_fault vmf;
+	int ret, tmp;
 
 	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
-		goto uncharge_out;
+		return ret;
 
 	/*
-	 * Should we do an early C-O-W break?
+	 * Check if the backing address space wants to know that the page is
+	 * about to become writable
 	 */
-	page = fault_page;
-	if (flags & FAULT_FLAG_WRITE) {
-		if (!(vma->vm_flags & VM_SHARED)) {
-			page = cow_page;
-			anon = 1;
-			copy_user_highpage(page, fault_page, address, vma);
-			__SetPageUptodate(page);
-		} else {
-			/*
-			 * If the page will be shareable, see if the backing
-			 * address space wants to know that the page is about
-			 * to become writable
-			 */
-			if (vma->vm_ops->page_mkwrite) {
-				struct vm_fault vmf;
-				int tmp;
-
-				vmf.virtual_address =
-					(void __user *)(address & PAGE_MASK);
-				vmf.pgoff = pgoff;
-				vmf.flags = flags;
-				vmf.page = fault_page;
-
-				unlock_page(page);
-				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
-				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
-				if (unlikely(tmp &
-					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-					ret = tmp;
-					goto unwritable_page;
-				}
-				if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
-					lock_page(page);
-					if (!page->mapping) {
-						ret = 0; /* retry the fault */
-						unlock_page(page);
-						goto unwritable_page;
-					}
-				} else
-					VM_BUG_ON_PAGE(!PageLocked(page), page);
-				page_mkwrite = 1;
-			}
-		}
-
-	}
+	if (!vma->vm_ops->page_mkwrite)
+		goto set_pte;
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	unlock_page(fault_page);
+	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+	vmf.pgoff = pgoff;
+	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+	vmf.page = fault_page;
 
-	/*
-	 * This silly early PAGE_DIRTY setting removes a race
-	 * due to the bad i386 page protection. But it's valid
-	 * for other architectures too.
-	 *
-	 * Note that if FAULT_FLAG_WRITE is set, we either now have
-	 * an exclusive copy of the page, or this is a shared mapping,
-	 * so we can make it writable and dirty to avoid having to
-	 * handle that later.
-	 */
-	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_same(*page_table, orig_pte))) {
-		flush_icache_page(vma, page);
-		entry = mk_pte(page, vma->vm_page_prot);
-		if (flags & FAULT_FLAG_WRITE)
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte))
-			pte_mksoft_dirty(entry);
-		if (anon) {
-			inc_mm_counter_fast(mm, MM_ANONPAGES);
-			page_add_new_anon_rmap(page, vma, address);
-		} else {
-			inc_mm_counter_fast(mm, MM_FILEPAGES);
-			page_add_file_rmap(page);
-			if (flags & FAULT_FLAG_WRITE) {
-				dirty_page = page;
-				get_page(dirty_page);
-			}
+	tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+	if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+		page_cache_release(fault_page);
+		return tmp;
+	}
+
+	if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+		lock_page(fault_page);
+		if (!fault_page->mapping) {
+			unlock_page(fault_page);
+			page_cache_release(fault_page);
+			return 0; /* retry */
 		}
-		set_pte_at(mm, address, page_table, entry);
-
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, address, page_table);
-	} else {
-		if (cow_page)
-			mem_cgroup_uncharge_page(cow_page);
-		if (anon)
-			page_cache_release(page);
-		else
-			anon = 1; /* no anon but release faulted_page */
+	} else
+		VM_BUG_ON_PAGE(!PageLocked(fault_page), fault_page);
+set_pte:
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*pte, orig_pte))) {
+		pte_unmap_unlock(pte, ptl);
+		unlock_page(fault_page);
+		page_cache_release(fault_page);
+		return ret;
 	}
 
-	pte_unmap_unlock(page_table, ptl);
-
-	if (dirty_page) {
-		struct address_space *mapping = page->mapping;
-		int dirtied = 0;
+	flush_icache_page(vma, fault_page);
+	entry = mk_pte(fault_page, vma->vm_page_prot);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	inc_mm_counter_fast(mm, MM_FILEPAGES);
+	page_add_file_rmap(fault_page);
+	set_pte_at(mm, address, pte, entry);
 
-		if (set_page_dirty(dirty_page))
-			dirtied = 1;
-		unlock_page(dirty_page);
-		put_page(dirty_page);
-		if ((dirtied || page_mkwrite) && mapping) {
-			/*
-			 * Some device drivers do not set page.mapping but still
-			 * dirty their pages
-			 */
-			balance_dirty_pages_ratelimited(mapping);
-		}
+	/* no need to invalidate: a not-present page won't be cached */
+	update_mmu_cache(vma, address, pte);
+	pte_unmap_unlock(pte, ptl);
 
-		/* file_update_time outside page_lock */
-		if (vma->vm_file && !page_mkwrite)
-			file_update_time(vma->vm_file);
-	} else {
-		unlock_page(fault_page);
-		if (anon)
-			page_cache_release(fault_page);
+	if (set_page_dirty(fault_page))
+		dirtied = 1;
+	mapping = fault_page->mapping;
+	unlock_page(fault_page);
+	if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
+		/*
+		 * Some device drivers do not set page.mapping but still
+		 * dirty their pages
+		 */
+		balance_dirty_pages_ratelimited(mapping);
 	}
 
-	return ret;
+	/* file_update_time outside page_lock */
+	if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+		file_update_time(vma->vm_file);
 
-unwritable_page:
-	page_cache_release(page);
-	return ret;
-uncharge_out:
-	/* fs's fault handler get error */
-	if (cow_page) {
-		mem_cgroup_uncharge_page(cow_page);
-		page_cache_release(cow_page);
-	}
 	return ret;
 }
 
@@ -3609,7 +3507,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!(vma->vm_flags & VM_SHARED))
 		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
 				orig_pte);
-	return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
 /*
@@ -3647,7 +3545,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!(vma->vm_flags & VM_SHARED))
 		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
 				orig_pte);
-	return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
```
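The page_mkwrite() hook that do_shared_fault() honours is supplied by the filesystem or driver backing the mapping. Below is a hypothetical sketch of such a callback against the vm_operations_struct interface of this era; the example_* names are invented for illustration, and only the VM_FAULT_LOCKED / truncation-retry protocol shown in the hunk above is taken from the patch:

```c
#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Hypothetical page_mkwrite() implementation (not from this patch).
 * do_shared_fault() calls this before making a shared file page
 * writable; returning VM_FAULT_LOCKED tells it we kept the page
 * locked, and returning 0 with the page unlocked makes it retry.
 */
static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	if (page->mapping != vma->vm_file->f_mapping) {
		/* Page was truncated out from under us: retry the fault. */
		unlock_page(page);
		return 0;
	}
	/* Reserve blocks / mark backing storage dirty here if needed. */
	return VM_FAULT_LOCKED;
}

static const struct vm_operations_struct example_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= example_page_mkwrite,
};
```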