From e15f8c01af924e611bc7be1e45449c4a74e5dfdd Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Feb 2011 15:01:32 -0800 Subject: mlock: fix race when munlocking pages in do_wp_page() vmscan can lazily find pages that are mapped within VM_LOCKED vmas, and set the PageMlocked bit on these pages, transfering them onto the unevictable list. When do_wp_page() breaks COW within a VM_LOCKED vma, it may need to clear PageMlocked on the old page and set it on the new page instead. This change fixes an issue where do_wp_page() was clearing PageMlocked on the old page while the pte was still pointing to it (as well as rmap). Therefore, we were not protected against vmscan immediately transfering the old page back onto the unevictable list. This could cause pages to get stranded there forever. I propose to move the corresponding code to the end of do_wp_page(), after the pte (and rmap) have been pointed to the new page. Additionally, we can use munlock_vma_page() instead of clear_page_mlock(), so that the old page stays mlocked if there are still other VM_LOCKED vmas mapping it. Signed-off-by: Michel Lespinasse Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 31250faff390..32df03cf13a5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } page_cache_release(old_page); @@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } @@ -2367,16 +2365,6 @@ gotten: } __SetPageUptodate(new_page); - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2444,10 +2432,20 @@ gotten: if (new_page) page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (old_page) { + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); + } + page_cache_release(old_page); + } return ret; oom_free_new: page_cache_release(new_page); -- cgit v1.2.2 From 419d8c96dbfa558f00e623023917d0a5afc46129 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Feb 2011 15:01:33 -0800 Subject: mlock: do not munlock pages in __do_fault() If the page is going to be written to, __do_page needs to break COW. However, the old page (before breaking COW) was never mapped mapped into the current pte (__do_fault is only called when the pte is not present), so vmscan can't have marked the old page as PageMlocked due to being mapped in __do_fault's VMA. Therefore, __do_fault() does not need to worry about clearing PageMlocked() on the old page. Signed-off-by: Michel Lespinasse Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 32df03cf13a5..8e8c18324863 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3051,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { -- cgit v1.2.2 From 2aa15890f3c191326678f1bd68af61ec6b8753ec Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Feb 2011 13:49:47 +0100 Subject: mm: prevent concurrent unmap_mapping_range() on the same inode Michael Leun reported that running parallel opens on a fuse filesystem can trigger a "kernel BUG at mm/truncate.c:475" Gurudas Pai reported the same bug on NFS. The reason is, unmap_mapping_range() is not prepared for more than one concurrent invocation per inode. For example: thread1: going through a big range, stops in the middle of a vma and stores the restart address in vm_truncate_count. thread2: comes in with a small (e.g. single page) unmap request on the same vma, somewhere before restart_address, finds that the vma was already unmapped up to the restart address and happily returns without doing anything. Another scenario would be two big unmap requests, both having to restart the unmapping and each one setting vm_truncate_count to its own value. This could go on forever without any of them being able to finish. Truncate and hole punching already serialize with i_mutex. Other callers of unmap_mapping_range() do not, and it's difficult to get i_mutex protection for all callers. In particular ->d_revalidate(), which calls invalidate_inode_pages2_range() in fuse, may be called with or without i_mutex. This patch adds a new mutex to 'struct address_space' to prevent running multiple concurrent unmap_mapping_range() on the same mapping. [ We'll hopefully get rid of all this with the upcoming mm preemptibility series by Peter Zijlstra, the "mm: Remove i_mmap_mutex lockbreak" patch in particular. But that is for 2.6.39 ] Signed-off-by: Miklos Szeredi Reported-by: Michael Leun Reported-by: Gurudas Pai Tested-by: Gurudas Pai Acked-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 8e8c18324863..5823698c2b71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.2