author     Andrea Arcangeli <aarcange@redhat.com>          2009-09-21 20:02:22 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-09-22 10:17:32 -0400
commit     1c2fb7a4c2ca7a958b02bc1e615d0254990bba8d (patch)
tree       489a97bd453b8002f2234f7e736548103315fa38
parent     9ba6929480088a85c1ff60a4b1f1c9fc80dbd2b7 (diff)
ksm: fix deadlock with munlock in exit_mmap
Rawhide users have reported a hang at startup when cryptsetup is run: the
same problem can be simply reproduced by running a program

	int main() { mlockall(MCL_CURRENT | MCL_FUTURE); return 0; }

The problem is that exit_mmap() applies munlock_vma_pages_all() to clean up
VM_LOCKED areas, and its current implementation (stupidly) tries to fault in
absent pages, for example where PROT_NONE prevented them from being faulted
in when mlocking.  Whereas the "ksm: fix oom deadlock" patch, knowing there
is a race by which KSM might try to fault in pages after exit_mmap() had
finally zapped the range, backs out of such faults and does nothing when its
ksm_test_exit() notices mm_users is 0.

So revert the part of "ksm: fix oom deadlock" which moved the ksm_exit()
call from before exit_mmap() to the middle of exit_mmap(); and remove those
ksm_test_exit() checks from the page fault paths, allowing the munlocking to
proceed without interference.

ksm_exit, if there are rmap_items still chained on this mm slot, takes
mmap_sem write side: so preventing KSM from working on an mm while
exit_mmap runs.  And KSM will bail out as soon as it notices that mm_users
is already zero, thanks to its internal ksm_test_exit checks.  So when a
task is killed by the OOM killer or by the user, KSM will not indefinitely
prevent it from running exit_mmap to release its memory.

This does break a part of what "ksm: fix oom deadlock" was trying to
achieve.  When unmerging KSM (echo 2 >/sys/kernel/mm/ksm/run), and even when
ksmd itself has to cancel a KSM page, it is possible that the first OOM-kill
victim would be the KSM process being faulted: then its memory won't be
freed until a second victim has been selected (freeing memory for the
unmerging fault to complete).

But the OOM killer is already liable to kill a second victim once the
intended victim's p->mm goes to NULL: so there's not much point in rejecting
this KSM patch before fixing that OOM behaviour.  It is very much more
important to allow KSM users to boot up than to haggle over an unlikely and
poorly supported OOM case.

We also intend to fix munlocking so that it does not fault in pages, at
which point this patch _could_ be reverted; but that would be controversial,
so we hope to find a better solution.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Justin M. Forbes <jforbes@redhat.com>
Acked-for-now-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
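For reference, a compilable sketch of that reproducer; the headers, the
return-value check with perror(), and the arbitrary file name in the build
comment are additions for illustration, only the mlockall(MCL_CURRENT |
MCL_FUTURE) call comes from the report above:

	/* Build with: cc -o mlockall-repro mlockall-repro.c
	 * On an affected kernel the process hangs in exit_mmap() when it
	 * exits, per the report above. */
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		/* Lock all current and future mappings into RAM. */
		if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
			perror("mlockall");
		return 0;
	}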
-rw-r--r--  include/linux/ksm.h  | 11
-rw-r--r--  kernel/fork.c        |  1
-rw-r--r--  mm/ksm.c             |  5
-rw-r--r--  mm/memory.c          |  4
-rw-r--r--  mm/mmap.c            |  7
5 files changed, 8 insertions, 20 deletions
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 2d64ff30c0de..0e26de6adb51 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -18,8 +18,7 @@ struct mmu_gather;
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
 int __ksm_enter(struct mm_struct *mm);
-void __ksm_exit(struct mm_struct *mm,
-		struct mmu_gather **tlbp, unsigned long end);
+void __ksm_exit(struct mm_struct *mm);
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
@@ -41,11 +40,10 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 	return atomic_read(&mm->mm_users) == 0;
 }
 
-static inline void ksm_exit(struct mm_struct *mm,
-			    struct mmu_gather **tlbp, unsigned long end)
+static inline void ksm_exit(struct mm_struct *mm)
 {
 	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-		__ksm_exit(mm, tlbp, end);
+		__ksm_exit(mm);
 }
 
 /*
@@ -86,8 +84,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 	return 0;
 }
 
-static inline void ksm_exit(struct mm_struct *mm,
-			    struct mmu_gather **tlbp, unsigned long end)
+static inline void ksm_exit(struct mm_struct *mm)
 {
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 42f20f565b16..73a442b7be6d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -501,6 +501,7 @@ void mmput(struct mm_struct *mm)
 
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
+		ksm_exit(mm);
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
diff --git a/mm/ksm.c b/mm/ksm.c
index 722e3f2a8dc5..92034eb47eba 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1416,8 +1416,7 @@ int __ksm_enter(struct mm_struct *mm)
 	return 0;
 }
 
-void __ksm_exit(struct mm_struct *mm,
-		struct mmu_gather **tlbp, unsigned long end)
+void __ksm_exit(struct mm_struct *mm)
 {
 	struct mm_slot *mm_slot;
 	int easy_to_free = 0;
@@ -1450,10 +1449,8 @@ void __ksm_exit(struct mm_struct *mm,
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
 		mmdrop(mm);
 	} else if (mm_slot) {
-		tlb_finish_mmu(*tlbp, 0, end);
 		down_write(&mm->mmap_sem);
 		up_write(&mm->mmap_sem);
-		*tlbp = tlb_gather_mmu(mm, 1);
 	}
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index f47ffe971012..05feaa11d87c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2648,7 +2648,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (!pte_none(*page_table) || ksm_test_exit(mm))
+	if (!pte_none(*page_table))
 		goto release;
 
 	inc_mm_counter(mm, anon_rss);
@@ -2792,7 +2792,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_same(*page_table, orig_pte) && !ksm_test_exit(mm))) {
+	if (likely(pte_same(*page_table, orig_pte))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)
diff --git a/mm/mmap.c b/mm/mmap.c
index e02f1aa66a1a..ffd6c6c9bcf4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2113,13 +2113,6 @@ void exit_mmap(struct mm_struct *mm)
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 
-	/*
-	 * For KSM to handle OOM without deadlock when it's breaking COW in a
-	 * likely victim of the OOM killer, we must serialize with ksm_exit()
-	 * after freeing mm's pages but before freeing its page tables.
-	 */
-	ksm_exit(mm, &tlb, end);
-
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 