aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2009-04-30 18:08:16 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-05-02 18:36:09 -0400
commitb827e496c893de0c0f142abfaeb8730a2fd6b37f (patch)
treea86aecd5d811f9306b9662ceb5a5a45091b62b97 /mm
parenta5fc1abe438b87a9d128beebc377f78e2681a76d (diff)
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page locked, and also change its callers (in page fault paths) to hold the lock until the page is marked dirty. This allows the filesystem to have full control of page dirtying events coming from the VM. Rather than simply hold the page locked over the page_mkwrite call, we call page_mkwrite with the page unlocked and allow callers to return with it locked, so filesystems can avoid LOR conditions with page lock. The problem with the current scheme is this: a filesystem that wants to associate some metadata with a page as long as the page is dirty, will perform this manipulation in its ->page_mkwrite. It currently then must return with the page unlocked and may not hold any other locks (according to existing page_mkwrite convention). In this window, the VM could write out the page, clearing page-dirty. The filesystem has no good way to detect that a dirty pte is about to be attached, so it will happily write out the page, at which point, the filesystem may manipulate the metadata to reflect that the page is no longer dirty. It is not always possible to perform the required metadata manipulation in ->set_page_dirty, because that function cannot block or fail. The filesystem may need to allocate some data structure, for example. And the VM cannot mark the pte dirty before page_mkwrite, because page_mkwrite is allowed to fail, so we must not allow any window where the page could be written to if page_mkwrite does fail. This solution of holding the page locked over the 3 critical operations (page_mkwrite, setting the pte dirty, and finally setting the page dirty) closes out races nicely, preventing page cleaning for writeout being initiated in that window. This provides the filesystem with a strong synchronisation against the VM here. - Sage needs this race closed for ceph filesystem. - Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913). - I need it for fsblock. 
- I suspect other filesystems may need it too (eg. btrfs). - I have converted buffer.c to the new locking. Even simple block allocation under dirty pages might be susceptible to i_size changing under partial page at the end of file (we also have a buffer.c-side problem here, but it cannot be fixed properly without this patch). - Other filesystems (eg. NFS, maybe btrfs) will need to change their page_mkwrite functions themselves. [ This also moves page_mkwrite another step closer to fault, which should eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a filesystem calldown and page lock/unlock cycle in __do_fault. ] [akpm@linux-foundation.org: fix derefs of NULL ->mapping] Cc: Sage Weil <sage@newdream.net> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Nick Piggin <npiggin@suse.de> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memory.c108
1 files changed, 76 insertions, 32 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 6a4ef0fd0711..4126dd16778c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1971 ret = tmp; 1971 ret = tmp;
1972 goto unwritable_page; 1972 goto unwritable_page;
1973 } 1973 }
1974 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
1975 lock_page(old_page);
1976 if (!old_page->mapping) {
1977 ret = 0; /* retry the fault */
1978 unlock_page(old_page);
1979 goto unwritable_page;
1980 }
1981 } else
1982 VM_BUG_ON(!PageLocked(old_page));
1974 1983
1975 /* 1984 /*
1976 * Since we dropped the lock we need to revalidate 1985 * Since we dropped the lock we need to revalidate
@@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1980 */ 1989 */
1981 page_table = pte_offset_map_lock(mm, pmd, address, 1990 page_table = pte_offset_map_lock(mm, pmd, address,
1982 &ptl); 1991 &ptl);
1983 page_cache_release(old_page); 1992 if (!pte_same(*page_table, orig_pte)) {
1984 if (!pte_same(*page_table, orig_pte)) 1993 unlock_page(old_page);
1994 page_cache_release(old_page);
1985 goto unlock; 1995 goto unlock;
1996 }
1986 1997
1987 page_mkwrite = 1; 1998 page_mkwrite = 1;
1988 } 1999 }
@@ -2094,9 +2105,6 @@ gotten:
2094unlock: 2105unlock:
2095 pte_unmap_unlock(page_table, ptl); 2106 pte_unmap_unlock(page_table, ptl);
2096 if (dirty_page) { 2107 if (dirty_page) {
2097 if (vma->vm_file)
2098 file_update_time(vma->vm_file);
2099
2100 /* 2108 /*
2101 * Yes, Virginia, this is actually required to prevent a race 2109 * Yes, Virginia, this is actually required to prevent a race
2102 * with clear_page_dirty_for_io() from clearing the page dirty 2110 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2113,41 @@ unlock:
2105 * 2113 *
2106 * do_no_page is protected similarly. 2114 * do_no_page is protected similarly.
2107 */ 2115 */
2108 wait_on_page_locked(dirty_page); 2116 if (!page_mkwrite) {
2109 set_page_dirty_balance(dirty_page, page_mkwrite); 2117 wait_on_page_locked(dirty_page);
2118 set_page_dirty_balance(dirty_page, page_mkwrite);
2119 }
2110 put_page(dirty_page); 2120 put_page(dirty_page);
2121 if (page_mkwrite) {
2122 struct address_space *mapping = dirty_page->mapping;
2123
2124 set_page_dirty(dirty_page);
2125 unlock_page(dirty_page);
2126 page_cache_release(dirty_page);
2127 if (mapping) {
2128 /*
2129 * Some device drivers do not set page.mapping
2130 * but still dirty their pages
2131 */
2132 balance_dirty_pages_ratelimited(mapping);
2133 }
2134 }
2135
2136 /* file_update_time outside page_lock */
2137 if (vma->vm_file)
2138 file_update_time(vma->vm_file);
2111 } 2139 }
2112 return ret; 2140 return ret;
2113oom_free_new: 2141oom_free_new:
2114 page_cache_release(new_page); 2142 page_cache_release(new_page);
2115oom: 2143oom:
2116 if (old_page) 2144 if (old_page) {
2145 if (page_mkwrite) {
2146 unlock_page(old_page);
2147 page_cache_release(old_page);
2148 }
2117 page_cache_release(old_page); 2149 page_cache_release(old_page);
2150 }
2118 return VM_FAULT_OOM; 2151 return VM_FAULT_OOM;
2119 2152
2120unwritable_page: 2153unwritable_page:
@@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2664 int tmp; 2697 int tmp;
2665 2698
2666 unlock_page(page); 2699 unlock_page(page);
2667 vmf.flags |= FAULT_FLAG_MKWRITE; 2700 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2668 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 2701 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2669 if (unlikely(tmp & 2702 if (unlikely(tmp &
2670 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 2703 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2671 ret = tmp; 2704 ret = tmp;
2672 anon = 1; /* no anon but release vmf.page */ 2705 goto unwritable_page;
2673 goto out_unlocked;
2674 }
2675 lock_page(page);
2676 /*
2677 * XXX: this is not quite right (racy vs
2678 * invalidate) to unlock and relock the page
2679 * like this, however a better fix requires
2680 * reworking page_mkwrite locking API, which
2681 * is better done later.
2682 */
2683 if (!page->mapping) {
2684 ret = 0;
2685 anon = 1; /* no anon but release vmf.page */
2686 goto out;
2687 } 2706 }
2707 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2708 lock_page(page);
2709 if (!page->mapping) {
2710 ret = 0; /* retry the fault */
2711 unlock_page(page);
2712 goto unwritable_page;
2713 }
2714 } else
2715 VM_BUG_ON(!PageLocked(page));
2688 page_mkwrite = 1; 2716 page_mkwrite = 1;
2689 } 2717 }
2690 } 2718 }
@@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2736 pte_unmap_unlock(page_table, ptl); 2764 pte_unmap_unlock(page_table, ptl);
2737 2765
2738out: 2766out:
2739 unlock_page(vmf.page); 2767 if (dirty_page) {
2740out_unlocked: 2768 struct address_space *mapping = page->mapping;
2741 if (anon)
2742 page_cache_release(vmf.page);
2743 else if (dirty_page) {
2744 if (vma->vm_file)
2745 file_update_time(vma->vm_file);
2746 2769
2747 set_page_dirty_balance(dirty_page, page_mkwrite); 2770 if (set_page_dirty(dirty_page))
2771 page_mkwrite = 1;
2772 unlock_page(dirty_page);
2748 put_page(dirty_page); 2773 put_page(dirty_page);
2774 if (page_mkwrite && mapping) {
2775 /*
2776 * Some device drivers do not set page.mapping but still
2777 * dirty their pages
2778 */
2779 balance_dirty_pages_ratelimited(mapping);
2780 }
2781
2782 /* file_update_time outside page_lock */
2783 if (vma->vm_file)
2784 file_update_time(vma->vm_file);
2785 } else {
2786 unlock_page(vmf.page);
2787 if (anon)
2788 page_cache_release(vmf.page);
2749 } 2789 }
2750 2790
2751 return ret; 2791 return ret;
2792
2793unwritable_page:
2794 page_cache_release(page);
2795 return ret;
2752} 2796}
2753 2797
2754static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2798static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,