author		Hugh Dickins <hugh.dickins@tiscali.co.uk>	2009-09-21 20:02:20 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-22 10:17:32 -0400
commit		9ba6929480088a85c1ff60a4b1f1c9fc80dbd2b7 (patch)
tree		39aab8cdffae598b55e35c578f70820712286ab4 /mm
parent		cd551f97519d35855be5a8720a47cc802ee4fd06 (diff)
ksm: fix oom deadlock
There's a now-obvious deadlock in KSM's out-of-memory handling: imagine ksmd or KSM_RUN_UNMERGE handling, holding ksm_thread_mutex, trying to allocate a page to break KSM in an mm which becomes the OOM victim (quite likely in the unmerge case): the victim is killed and goes to exit, and hangs there waiting to acquire ksm_thread_mutex.

Clearly we must not require ksm_thread_mutex in __ksm_exit, simple though that made everything else: perhaps use mmap_sem somehow? Part of the answer lies in the comments on unmerge_ksm_pages: __ksm_exit should also leave all the rmap_item removal to ksmd.

But there's a fundamental problem: KSM relies upon mmap_sem to guarantee the consistency of the mm it's dealing with, yet exit_mmap tears down an mm without taking mmap_sem. And bumping mm_users won't help at all; that just ensures that the pages the OOM killer assumes are on their way to being freed will not be freed.

The best answer seems to be to move the ksm_exit callout from just before exit_mmap to the middle of exit_mmap: after the mm's pages have been freed (if the mmu_gather is flushed), but before its page tables and vma structures have been freed; and to down_write then up_write mmap_sem there to serialize with KSM's own reliance on mmap_sem.

But KSM then needs to be careful, whenever it downs mmap_sem, to check that the mm is not already exiting: there's a danger of using find_vma on a layout that's being torn apart, or of writing into page tables which have been freed for reuse; and even do_anonymous_page and __do_fault need to check that they're not being called by break_ksm to reinstate a pte after zap_pte_range has zapped that page table.

Though it might be clearer to add an exiting flag, set while holding mmap_sem in __ksm_exit, that wouldn't cover the issue of reinstating a zapped pte. All we need is to check whether mm_users is 0; but we must remember that ksmd may detect that before __ksm_exit is reached. So ksm_test_exit(mm) is added to mark such checks on mm->mm_users.

__ksm_exit now has to leave clearing up the rmap_items to ksmd, which needs ksm_thread_mutex; but it shifts the exiting mm just after the ksm_scan cursor so that it will soon be dealt with. __ksm_enter raises mm_count to hold the mm_struct, and ksmd's exit processing (exactly like its processing when it finds all VM_MERGEABLEs unmapped) mmdrops it; a similar procedure applies for KSM_RUN_UNMERGE (which has stopped ksmd).

__ksm_exit also gets a fast path: when there's no complication (no rmap_items attached to the mm and it's not at the ksm_scan cursor), it can safely do all the exiting work itself. This is not just an optimization: when ksmd is not running, the raised mm_count would otherwise leak mm_structs.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/ksm.c	| 144
-rw-r--r--	mm/memory.c	|   5
-rw-r--r--	mm/mmap.c	|   9
3 files changed, 110 insertions(+), 48 deletions(-)
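The ksm_test_exit() checks and the ksm_exit() call seen throughout the diff below rely on helpers declared outside mm/ (the diffstat above is limited to 'mm', so include/linux/ksm.h does not appear here). As a rough sketch of what those helpers amount to, going only by the commit message above; treat the exact form and location as an assumption rather than part of this diff:

/* Sketch of the include/linux/ksm.h side (assumed, not shown in this diff) */
static inline int ksm_test_exit(struct mm_struct *mm)
{
	/* exit_mmap() runs once mm_users has already dropped to 0 */
	return atomic_read(&mm->mm_users) == 0;
}

static inline void ksm_exit(struct mm_struct *mm, struct mmu_gather **tlbp,
			    unsigned long end)
{
	/* only mms that passed through __ksm_enter() need the slow path */
	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
		__ksm_exit(mm, tlbp, end);
}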
diff --git a/mm/ksm.c b/mm/ksm.c
index 7e4d255dadc0..722e3f2a8dc5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -32,6 +32,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/ksm.h>
 
+#include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
 /*
@@ -347,6 +348,8 @@ static void break_cow(struct mm_struct *mm, unsigned long addr)
 	struct vm_area_struct *vma;
 
 	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
 	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
@@ -365,6 +368,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 	struct page *page;
 
 	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
 	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
@@ -439,11 +444,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 	} else if (rmap_item->address & NODE_FLAG) {
 		unsigned char age;
 		/*
-		 * ksm_thread can and must skip the rb_erase, because
+		 * Usually ksmd can and must skip the rb_erase, because
 		 * root_unstable_tree was already reset to RB_ROOT.
-		 * But __ksm_exit has to be careful: do the rb_erase
-		 * if it's interrupting a scan, and this rmap_item was
-		 * inserted by this scan rather than left from before.
+		 * But be careful when an mm is exiting: do the rb_erase
+		 * if this rmap_item was inserted by this scan, rather
+		 * than left over from before.
 		 */
 		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
 		BUG_ON(age > 1);
@@ -491,6 +496,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
 	int err = 0;
 
 	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+		if (ksm_test_exit(vma->vm_mm))
+			break;
 		if (signal_pending(current))
 			err = -ERESTARTSYS;
 		else
@@ -507,34 +514,50 @@ static int unmerge_and_remove_all_rmap_items(void)
 	int err = 0;
 
 	spin_lock(&ksm_mmlist_lock);
-	mm_slot = list_entry(ksm_mm_head.mm_list.next,
+	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
 						struct mm_slot, mm_list);
 	spin_unlock(&ksm_mmlist_lock);
 
-	while (mm_slot != &ksm_mm_head) {
+	for (mm_slot = ksm_scan.mm_slot;
+			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
 		mm = mm_slot->mm;
 		down_read(&mm->mmap_sem);
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (ksm_test_exit(mm))
+				break;
 			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
 				continue;
 			err = unmerge_ksm_pages(vma,
 						vma->vm_start, vma->vm_end);
-			if (err) {
-				up_read(&mm->mmap_sem);
-				goto out;
-			}
+			if (err)
+				goto error;
 		}
+
 		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
-		up_read(&mm->mmap_sem);
 
 		spin_lock(&ksm_mmlist_lock);
-		mm_slot = list_entry(mm_slot->mm_list.next,
+		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
 						struct mm_slot, mm_list);
-		spin_unlock(&ksm_mmlist_lock);
+		if (ksm_test_exit(mm)) {
+			hlist_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			spin_unlock(&ksm_mmlist_lock);
+
+			free_mm_slot(mm_slot);
+			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+			up_read(&mm->mmap_sem);
+			mmdrop(mm);
+		} else {
+			spin_unlock(&ksm_mmlist_lock);
+			up_read(&mm->mmap_sem);
+		}
 	}
 
 	ksm_scan.seqnr = 0;
-out:
+	return 0;
+
+error:
+	up_read(&mm->mmap_sem);
 	spin_lock(&ksm_mmlist_lock);
 	ksm_scan.mm_slot = &ksm_mm_head;
 	spin_unlock(&ksm_mmlist_lock);
@@ -755,6 +778,9 @@ static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
 	int err = -EFAULT;
 
 	down_read(&mm1->mmap_sem);
+	if (ksm_test_exit(mm1))
+		goto out;
+
 	vma = find_vma(mm1, addr1);
 	if (!vma || vma->vm_start > addr1)
 		goto out;
@@ -796,6 +822,10 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
 		return err;
 
 	down_read(&mm1->mmap_sem);
+	if (ksm_test_exit(mm1)) {
+		up_read(&mm1->mmap_sem);
+		goto out;
+	}
 	vma = find_vma(mm1, addr1);
 	if (!vma || vma->vm_start > addr1) {
 		up_read(&mm1->mmap_sem);
@@ -1174,7 +1204,12 @@ next_mm:
 
 	mm = slot->mm;
 	down_read(&mm->mmap_sem);
-	for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
+	if (ksm_test_exit(mm))
+		vma = NULL;
+	else
+		vma = find_vma(mm, ksm_scan.address);
+
+	for (; vma; vma = vma->vm_next) {
 		if (!(vma->vm_flags & VM_MERGEABLE))
 			continue;
 		if (ksm_scan.address < vma->vm_start)
@@ -1183,6 +1218,8 @@ next_mm:
 			ksm_scan.address = vma->vm_end;
 
 		while (ksm_scan.address < vma->vm_end) {
+			if (ksm_test_exit(mm))
+				break;
 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
 			if (*page && PageAnon(*page)) {
 				flush_anon_page(vma, *page, ksm_scan.address);
@@ -1205,6 +1242,11 @@ next_mm:
 		}
 	}
 
+	if (ksm_test_exit(mm)) {
+		ksm_scan.address = 0;
+		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
+						struct rmap_item, link);
+	}
 	/*
 	 * Nuke all the rmap_items that are above this current rmap:
 	 * because there were no VM_MERGEABLE vmas with such addresses.
@@ -1219,24 +1261,29 @@ next_mm:
 		 * We've completed a full scan of all vmas, holding mmap_sem
 		 * throughout, and found no VM_MERGEABLE: so do the same as
 		 * __ksm_exit does to remove this mm from all our lists now.
+		 * This applies either when cleaning up after __ksm_exit
+		 * (but beware: we can reach here even before __ksm_exit),
+		 * or when all VM_MERGEABLE areas have been unmapped (and
+		 * mmap_sem then protects against race with MADV_MERGEABLE).
 		 */
 		hlist_del(&slot->link);
 		list_del(&slot->mm_list);
+		spin_unlock(&ksm_mmlist_lock);
+
 		free_mm_slot(slot);
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		up_read(&mm->mmap_sem);
+		mmdrop(mm);
+	} else {
+		spin_unlock(&ksm_mmlist_lock);
+		up_read(&mm->mmap_sem);
 	}
-	spin_unlock(&ksm_mmlist_lock);
-	up_read(&mm->mmap_sem);
 
 	/* Repeat until we've completed scanning the whole list */
 	slot = ksm_scan.mm_slot;
 	if (slot != &ksm_mm_head)
 		goto next_mm;
 
-	/*
-	 * Bump seqnr here rather than at top, so that __ksm_exit
-	 * can skip rb_erase on unstable tree until we run again.
-	 */
 	ksm_scan.seqnr++;
 	return NULL;
 }
@@ -1361,6 +1408,7 @@ int __ksm_enter(struct mm_struct *mm)
 	spin_unlock(&ksm_mmlist_lock);
 
 	set_bit(MMF_VM_MERGEABLE, &mm->flags);
+	atomic_inc(&mm->mm_count);
 
 	if (needs_wakeup)
 		wake_up_interruptible(&ksm_thread_wait);
@@ -1368,41 +1416,45 @@ int __ksm_enter(struct mm_struct *mm)
 	return 0;
 }
 
-void __ksm_exit(struct mm_struct *mm)
+void __ksm_exit(struct mm_struct *mm,
+		struct mmu_gather **tlbp, unsigned long end)
 {
 	struct mm_slot *mm_slot;
+	int easy_to_free = 0;
 
 	/*
-	 * This process is exiting: doesn't hold and doesn't need mmap_sem;
-	 * but we do need to exclude ksmd and other exiters while we modify
-	 * the various lists and trees.
+	 * This process is exiting: if it's straightforward (as is the
+	 * case when ksmd was never running), free mm_slot immediately.
+	 * But if it's at the cursor or has rmap_items linked to it, use
+	 * mmap_sem to synchronize with any break_cows before pagetables
+	 * are freed, and leave the mm_slot on the list for ksmd to free.
+	 * Beware: ksm may already have noticed it exiting and freed the slot.
 	 */
-	mutex_lock(&ksm_thread_mutex);
+
 	spin_lock(&ksm_mmlist_lock);
 	mm_slot = get_mm_slot(mm);
-	if (!list_empty(&mm_slot->rmap_list)) {
-		spin_unlock(&ksm_mmlist_lock);
-		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
-		spin_lock(&ksm_mmlist_lock);
-	}
-
-	if (ksm_scan.mm_slot == mm_slot) {
-		ksm_scan.mm_slot = list_entry(
-			mm_slot->mm_list.next, struct mm_slot, mm_list);
-		ksm_scan.address = 0;
-		ksm_scan.rmap_item = list_entry(
-			&ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
-		if (ksm_scan.mm_slot == &ksm_mm_head)
-			ksm_scan.seqnr++;
+	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+		if (list_empty(&mm_slot->rmap_list)) {
+			hlist_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			easy_to_free = 1;
+		} else {
+			list_move(&mm_slot->mm_list,
+				  &ksm_scan.mm_slot->mm_list);
+		}
 	}
-
-	hlist_del(&mm_slot->link);
-	list_del(&mm_slot->mm_list);
 	spin_unlock(&ksm_mmlist_lock);
 
-	free_mm_slot(mm_slot);
-	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
-	mutex_unlock(&ksm_thread_mutex);
+	if (easy_to_free) {
+		free_mm_slot(mm_slot);
+		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		mmdrop(mm);
+	} else if (mm_slot) {
+		tlb_finish_mmu(*tlbp, 0, end);
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+		*tlbp = tlb_gather_mmu(mm, 1);
+	}
 }
 
 #define KSM_ATTR_RO(_name) \
diff --git a/mm/memory.c b/mm/memory.c
index 1a435b81876c..f47ffe971012 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2648,8 +2648,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (!pte_none(*page_table))
+	if (!pte_none(*page_table) || ksm_test_exit(mm))
 		goto release;
+
 	inc_mm_counter(mm, anon_rss);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
@@ -2791,7 +2792,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_same(*page_table, orig_pte))) {
+	if (likely(pte_same(*page_table, orig_pte) && !ksm_test_exit(mm))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)
diff --git a/mm/mmap.c b/mm/mmap.c
index 376492ed08f4..e02f1aa66a1a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/ksm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
 
@@ -2111,6 +2112,14 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
+
+	/*
+	 * For KSM to handle OOM without deadlock when it's breaking COW in a
+	 * likely victim of the OOM killer, we must serialize with ksm_exit()
+	 * after freeing mm's pages but before freeing its page tables.
+	 */
+	ksm_exit(mm, &tlb, end);
+
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 