author		Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2014-04-03 17:48:13 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-04-03 19:21:03 -0400
commit		f0c6d4d295e4ea9a47375304420baa38ca279542 (patch)
tree		dde84d183f17f94772500f408177486d509d85c7	/mm/memory.c
parent		ec47c3b9543054f6f255d027100fa8214e637003 (diff)
mm: introduce do_shared_fault() and drop do_fault()
Introduce do_shared_fault(). The function does what do_fault() does for
write faults to shared mappings.

Unlike do_fault(), do_shared_fault() is relatively clean and
straight-forward.

Old do_fault() is not needed anymore. Let it die.

[lliubbo@gmail.com: fix NULL pointer dereference]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
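For orientation, the dispatch that this patch completes can be sketched as below. This is a minimal illustration based on the do_linear_fault() hunk further down, not kernel code as-is: the function name is hypothetical and the do_read_fault() branch is assumed from the earlier patches in this series (it is not part of this diff).

static int linear_fault_dispatch_sketch(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	/* Read faults need neither COW nor ->page_mkwrite notification. */
	if (!(flags & FAULT_FLAG_WRITE))
		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	/* Write fault on a private mapping: COW the file page. */
	if (!(vma->vm_flags & VM_SHARED))
		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	/* Write fault on a shared mapping: the helper introduced here. */
	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}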
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	226
1 file changed, 62 insertions(+), 164 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 5be13e794a7c..d4320e42989d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2748,7 +2748,7 @@ reuse:
 		 * bit after it clear all dirty ptes, but before a racing
 		 * do_wp_page installs a dirty pte.
 		 *
-		 * do_fault is protected similarly.
+		 * do_shared_fault is protected similarly.
 		 */
 		if (!page_mkwrite) {
 			wait_on_page_locked(dirty_page);
@@ -3410,188 +3410,86 @@ uncharge_out:
 	return ret;
 }
 
-/*
- * do_fault() tries to create a new page mapping. It aggressively
- * tries to share with existing pages, but makes a separate copy if
- * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
- * the next page fault.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte neither mapped nor locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- */
-static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
-	pte_t *page_table;
+	struct page *fault_page;
+	struct address_space *mapping;
 	spinlock_t *ptl;
-	struct page *page, *fault_page;
-	struct page *cow_page;
-	pte_t entry;
-	int anon = 0;
-	struct page *dirty_page = NULL;
-	int ret;
-	int page_mkwrite = 0;
-
-	/*
-	 * If we do COW later, allocate page befor taking lock_page()
-	 * on the file cache page. This will reduce lock holding time.
-	 */
-	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-
-		if (unlikely(anon_vma_prepare(vma)))
-			return VM_FAULT_OOM;
-
-		cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!cow_page)
-			return VM_FAULT_OOM;
-
-		if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
-			page_cache_release(cow_page);
-			return VM_FAULT_OOM;
-		}
-	} else
-		cow_page = NULL;
+	pte_t entry, *pte;
+	int dirtied = 0;
+	struct vm_fault vmf;
+	int ret, tmp;
 
 	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
-		goto uncharge_out;
+		return ret;
 
 	/*
-	 * Should we do an early C-O-W break?
+	 * Check if the backing address space wants to know that the page is
+	 * about to become writable
 	 */
-	page = fault_page;
-	if (flags & FAULT_FLAG_WRITE) {
-		if (!(vma->vm_flags & VM_SHARED)) {
-			page = cow_page;
-			anon = 1;
-			copy_user_highpage(page, fault_page, address, vma);
-			__SetPageUptodate(page);
-		} else {
-			/*
-			 * If the page will be shareable, see if the backing
-			 * address space wants to know that the page is about
-			 * to become writable
-			 */
-			if (vma->vm_ops->page_mkwrite) {
-				struct vm_fault vmf;
-				int tmp;
-
-				vmf.virtual_address =
-					(void __user *)(address & PAGE_MASK);
-				vmf.pgoff = pgoff;
-				vmf.flags = flags;
-				vmf.page = fault_page;
-
-				unlock_page(page);
-				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
-				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
-				if (unlikely(tmp &
-					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-					ret = tmp;
-					goto unwritable_page;
-				}
-				if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
-					lock_page(page);
-					if (!page->mapping) {
-						ret = 0; /* retry the fault */
-						unlock_page(page);
-						goto unwritable_page;
-					}
-				} else
-					VM_BUG_ON_PAGE(!PageLocked(page), page);
-				page_mkwrite = 1;
-			}
-		}
+	if (!vma->vm_ops->page_mkwrite)
+		goto set_pte;
 
-	}
+	unlock_page(fault_page);
+	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+	vmf.pgoff = pgoff;
+	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+	vmf.page = fault_page;
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+	if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+		page_cache_release(fault_page);
+		return tmp;
+	}
 
-	/*
-	 * This silly early PAGE_DIRTY setting removes a race
-	 * due to the bad i386 page protection. But it's valid
-	 * for other architectures too.
-	 *
-	 * Note that if FAULT_FLAG_WRITE is set, we either now have
-	 * an exclusive copy of the page, or this is a shared mapping,
-	 * so we can make it writable and dirty to avoid having to
-	 * handle that later.
-	 */
-	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_same(*page_table, orig_pte))) {
-		flush_icache_page(vma, page);
-		entry = mk_pte(page, vma->vm_page_prot);
-		if (flags & FAULT_FLAG_WRITE)
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte))
-			pte_mksoft_dirty(entry);
-		if (anon) {
-			inc_mm_counter_fast(mm, MM_ANONPAGES);
-			page_add_new_anon_rmap(page, vma, address);
-		} else {
-			inc_mm_counter_fast(mm, MM_FILEPAGES);
-			page_add_file_rmap(page);
-			if (flags & FAULT_FLAG_WRITE) {
-				dirty_page = page;
-				get_page(dirty_page);
-			}
+	if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+		lock_page(fault_page);
+		if (!fault_page->mapping) {
+			unlock_page(fault_page);
+			page_cache_release(fault_page);
+			return 0; /* retry */
 		}
-		set_pte_at(mm, address, page_table, entry);
-
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, address, page_table);
-	} else {
-		if (cow_page)
-			mem_cgroup_uncharge_page(cow_page);
-		if (anon)
-			page_cache_release(page);
-		else
-			anon = 1; /* no anon but release faulted_page */
+	} else
+		VM_BUG_ON_PAGE(!PageLocked(fault_page), fault_page);
+set_pte:
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*pte, orig_pte))) {
+		pte_unmap_unlock(pte, ptl);
+		unlock_page(fault_page);
+		page_cache_release(fault_page);
+		return ret;
 	}
 
-	pte_unmap_unlock(page_table, ptl);
-
-	if (dirty_page) {
-		struct address_space *mapping = page->mapping;
-		int dirtied = 0;
+	flush_icache_page(vma, fault_page);
+	entry = mk_pte(fault_page, vma->vm_page_prot);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	inc_mm_counter_fast(mm, MM_FILEPAGES);
+	page_add_file_rmap(fault_page);
+	set_pte_at(mm, address, pte, entry);
 
-		if (set_page_dirty(dirty_page))
-			dirtied = 1;
-		unlock_page(dirty_page);
-		put_page(dirty_page);
-		if ((dirtied || page_mkwrite) && mapping) {
-			/*
-			 * Some device drivers do not set page.mapping but still
-			 * dirty their pages
-			 */
-			balance_dirty_pages_ratelimited(mapping);
-		}
+	/* no need to invalidate: a not-present page won't be cached */
+	update_mmu_cache(vma, address, pte);
+	pte_unmap_unlock(pte, ptl);
 
-		/* file_update_time outside page_lock */
-		if (vma->vm_file && !page_mkwrite)
-			file_update_time(vma->vm_file);
-	} else {
-		unlock_page(fault_page);
-		if (anon)
-			page_cache_release(fault_page);
+	if (set_page_dirty(fault_page))
+		dirtied = 1;
+	mapping = fault_page->mapping;
+	unlock_page(fault_page);
+	if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
+		/*
+		 * Some device drivers do not set page.mapping but still
+		 * dirty their pages
+		 */
+		balance_dirty_pages_ratelimited(mapping);
 	}
 
-	return ret;
+	/* file_update_time outside page_lock */
+	if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+		file_update_time(vma->vm_file);
 
-unwritable_page:
-	page_cache_release(page);
-	return ret;
-uncharge_out:
-	/* fs's fault handler get error */
-	if (cow_page) {
-		mem_cgroup_uncharge_page(cow_page);
-		page_cache_release(cow_page);
-	}
 	return ret;
 }
 
@@ -3609,7 +3507,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!(vma->vm_flags & VM_SHARED))
 		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
 				orig_pte);
-	return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
 /*
@@ -3647,7 +3545,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!(vma->vm_flags & VM_SHARED))
 		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
 				orig_pte);
-	return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,