author     Hugh Dickins <hugh.dickins@tiscali.co.uk>        2009-09-21 20:02:16 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-09-22 10:17:32 -0400
commit     d952b79136a6c32a3f97e0628ca78340f1d5c6f9
tree       d46b096fa097c39faa21c89f329d0c84bd700062  /mm/ksm.c
parent     81464e30609cdbd3d96d8dd6991e7481195a89a1
ksm: fix endless loop on oom
break_ksm has been looping endlessly ignoring VM_FAULT_OOM: that should
only be a problem for ksmd when a memory control group imposes limits
(normally the OOM killer will kill others with an mm until it succeeds);
but in general (especially for MADV_UNMERGEABLE and KSM_RUN_UNMERGE) we
do need to route the error (or kill) back to the caller (or sighandling).
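
For a caller, the visible effect is that madvise(MADV_UNMERGEABLE) can now fail instead of silently leaving merged pages behind. A minimal userspace sketch of checking for that, assuming a KSM-enabled kernel (the fallback #defines are only for headers that predate the KSM advice values; the mapping size is arbitrary):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

#ifndef MADV_MERGEABLE                  /* fallback for pre-KSM headers */
#define MADV_MERGEABLE   12
#define MADV_UNMERGEABLE 13
#endif

int main(void)
{
        size_t len = 16 * 4096;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 0, len);                    /* identical pages: KSM merge candidates */
        madvise(buf, len, MADV_MERGEABLE);      /* register the area with ksmd */

        /*
         * With this patch, an error from unmerge_ksm_pages()/break_ksm()
         * (e.g. -ENOMEM under a mem_cgroup limit) reaches userspace here
         * instead of being swallowed.
         */
        if (madvise(buf, len, MADV_UNMERGEABLE) != 0)
                fprintf(stderr, "MADV_UNMERGEABLE: %s\n", strerror(errno));

        munmap(buf, len);
        return 0;
}
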
Test signal_pending in unmerge_ksm_pages, which could be a lengthy
procedure if it has to spill into swap: returning -ERESTARTSYS so that
trivial signals will restart but fatals will terminate (is that right?
we do different things in different places in mm, none exactly this).
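
In practice -ERESTARTSYS means the madvise() is restarted transparently when the interrupting signal's handler was installed with SA_RESTART (or when there is no handler and the signal is not fatal); a handler without SA_RESTART makes the call return EINTR to userspace. A sketch of how a caller might allow for that; unmerge_region() is a hypothetical wrapper, not something this patch adds:

#include <errno.h>
#include <sys/mman.h>

#ifndef MADV_UNMERGEABLE
#define MADV_UNMERGEABLE 13             /* fallback for pre-KSM headers */
#endif

/* Hypothetical helper: retry MADV_UNMERGEABLE while it is interrupted by
 * a non-fatal signal whose handler was installed without SA_RESTART. */
static int unmerge_region(void *addr, size_t len)
{
        int ret;

        do {
                ret = madvise(addr, len, MADV_UNMERGEABLE);
        } while (ret != 0 && errno == EINTR);

        return ret;     /* 0 on success; -1 with errno set (e.g. ENOMEM) */
}
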
unmerge_and_remove_all_rmap_items was forgetting to lock when going
down the mm_list: fix that. Whether it's successful or not, reset
ksm_scan cursor to head; but only if it's successful, reset seqnr
(shown in full_scans) - page counts will have gone down to zero.
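
ksm_scan.seqnr is what /sys/kernel/mm/ksm/full_scans reports, so after a fully successful unmerge full_scans reads 0 again and pages_shared should also be back to 0. A small sketch for observing that, assuming the standard KSM sysfs knobs (error handling kept minimal):

#include <stdio.h>

/* Read one of the KSM counters from sysfs; returns -1 on any failure. */
static long read_ksm_knob(const char *name)
{
        char path[128];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
        f = fopen(path, "r");
        if (f) {
                if (fscanf(f, "%ld", &val) != 1)
                        val = -1;
                fclose(f);
        }
        return val;
}

int main(void)
{
        /* After a successful KSM_RUN_UNMERGE both should read 0. */
        printf("full_scans   = %ld\n", read_ksm_knob("full_scans"));
        printf("pages_shared = %ld\n", read_ksm_knob("pages_shared"));
        return 0;
}
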
This patch leaves a significant OOM deadlock, but it's a good step
on the way, and that deadlock is fixed in a subsequent patch.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/ksm.c')
-rw-r--r--  mm/ksm.c  108
1 file changed, 85 insertions, 23 deletions
@@ -294,10 +294,10 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
  * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
  * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
  */
-static void break_ksm(struct vm_area_struct *vma, unsigned long addr)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 {
         struct page *page;
-        int ret;
+        int ret = 0;
 
         do {
                 cond_resched();
@@ -310,9 +310,36 @@ static void break_ksm(struct vm_area_struct *vma, unsigned long addr)
                 else
                         ret = VM_FAULT_WRITE;
                 put_page(page);
-        } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS)));
-
-        /* Which leaves us looping there if VM_FAULT_OOM: hmmm... */
+        } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
+        /*
+         * We must loop because handle_mm_fault() may back out if there's
+         * any difficulty e.g. if pte accessed bit gets updated concurrently.
+         *
+         * VM_FAULT_WRITE is what we have been hoping for: it indicates that
+         * COW has been broken, even if the vma does not permit VM_WRITE;
+         * but note that a concurrent fault might break PageKsm for us.
+         *
+         * VM_FAULT_SIGBUS could occur if we race with truncation of the
+         * backing file, which also invalidates anonymous pages: that's
+         * okay, that truncation will have unmapped the PageKsm for us.
+         *
+         * VM_FAULT_OOM: at the time of writing (late July 2009), setting
+         * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
+         * current task has TIF_MEMDIE set, and will be OOM killed on return
+         * to user; and ksmd, having no mm, would never be chosen for that.
+         *
+         * But if the mm is in a limited mem_cgroup, then the fault may fail
+         * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
+         * even ksmd can fail in this way - though it's usually breaking ksm
+         * just to undo a merge it made a moment before, so unlikely to oom.
+         *
+         * That's a pity: we might therefore have more kernel pages allocated
+         * than we're counting as nodes in the stable tree; but ksm_do_scan
+         * will retry to break_cow on each pass, so should recover the page
+         * in due course. The important thing is to not let VM_MERGEABLE
+         * be cleared while any such pages might remain in the area.
+         */
+        return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
 }
 
 static void break_cow(struct mm_struct *mm, unsigned long addr)
@@ -462,39 +489,61 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
  * to the next pass of ksmd - consider, for example, how ksmd might be
  * in cmp_and_merge_page on one of the rmap_items we would be removing.
  */
-static void unmerge_ksm_pages(struct vm_area_struct *vma,
-                              unsigned long start, unsigned long end)
+static int unmerge_ksm_pages(struct vm_area_struct *vma,
+                             unsigned long start, unsigned long end)
 {
         unsigned long addr;
+        int err = 0;
 
-        for (addr = start; addr < end; addr += PAGE_SIZE)
-                break_ksm(vma, addr);
+        for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+                if (signal_pending(current))
+                        err = -ERESTARTSYS;
+                else
+                        err = break_ksm(vma, addr);
+        }
+        return err;
 }
 
-static void unmerge_and_remove_all_rmap_items(void)
+static int unmerge_and_remove_all_rmap_items(void)
 {
         struct mm_slot *mm_slot;
         struct mm_struct *mm;
         struct vm_area_struct *vma;
+        int err = 0;
+
+        spin_lock(&ksm_mmlist_lock);
+        mm_slot = list_entry(ksm_mm_head.mm_list.next,
+                             struct mm_slot, mm_list);
+        spin_unlock(&ksm_mmlist_lock);
 
-        list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) {
+        while (mm_slot != &ksm_mm_head) {
                 mm = mm_slot->mm;
                 down_read(&mm->mmap_sem);
                 for (vma = mm->mmap; vma; vma = vma->vm_next) {
                         if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
                                 continue;
-                        unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
+                        err = unmerge_ksm_pages(vma,
+                                                vma->vm_start, vma->vm_end);
+                        if (err) {
+                                up_read(&mm->mmap_sem);
+                                goto out;
+                        }
                 }
                 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
                 up_read(&mm->mmap_sem);
+
+                spin_lock(&ksm_mmlist_lock);
+                mm_slot = list_entry(mm_slot->mm_list.next,
+                                     struct mm_slot, mm_list);
+                spin_unlock(&ksm_mmlist_lock);
         }
 
+        ksm_scan.seqnr = 0;
+out:
         spin_lock(&ksm_mmlist_lock);
-        if (ksm_scan.mm_slot != &ksm_mm_head) {
-                ksm_scan.mm_slot = &ksm_mm_head;
-                ksm_scan.seqnr++;
-        }
+        ksm_scan.mm_slot = &ksm_mm_head;
         spin_unlock(&ksm_mmlist_lock);
+        return err;
 }
 
 static void remove_mm_from_lists(struct mm_struct *mm)
@@ -1051,6 +1100,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
         /*
          * A ksm page might have got here by fork, but its other
          * references have already been removed from the stable tree.
+         * Or it might be left over from a break_ksm which failed
+         * when the mem_cgroup had reached its limit: try again now.
          */
         if (PageKsm(page))
                 break_cow(rmap_item->mm, rmap_item->address);
@@ -1286,6 +1337,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                 unsigned long end, int advice, unsigned long *vm_flags)
 {
         struct mm_struct *mm = vma->vm_mm;
+        int err;
 
         switch (advice) {
         case MADV_MERGEABLE:
@@ -1298,9 +1350,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                                  VM_MIXEDMAP | VM_SAO))
                         return 0;               /* just ignore the advice */
 
-                if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
-                        if (__ksm_enter(mm) < 0)
-                                return -EAGAIN;
+                if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+                        err = __ksm_enter(mm);
+                        if (err)
+                                return err;
+                }
 
                 *vm_flags |= VM_MERGEABLE;
                 break;
@@ -1309,8 +1363,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                 if (!(*vm_flags & VM_MERGEABLE))
                         return 0;               /* just ignore the advice */
 
-                if (vma->anon_vma)
-                        unmerge_ksm_pages(vma, start, end);
+                if (vma->anon_vma) {
+                        err = unmerge_ksm_pages(vma, start, end);
+                        if (err)
+                                return err;
+                }
 
                 *vm_flags &= ~VM_MERGEABLE;
                 break;
@@ -1441,8 +1498,13 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
         mutex_lock(&ksm_thread_mutex);
         if (ksm_run != flags) {
                 ksm_run = flags;
-                if (flags & KSM_RUN_UNMERGE)
-                        unmerge_and_remove_all_rmap_items();
+                if (flags & KSM_RUN_UNMERGE) {
+                        err = unmerge_and_remove_all_rmap_items();
+                        if (err) {
+                                ksm_run = KSM_RUN_STOP;
+                                count = err;
+                        }
+                }
         }
         mutex_unlock(&ksm_thread_mutex);
 
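
The run_store() change above is where this becomes visible through sysfs: writing 2 (KSM_RUN_UNMERGE) to /sys/kernel/mm/ksm/run can now fail, leaving ksm_run at KSM_RUN_STOP rather than pretending the unmerge completed. A sketch of driving that from userspace and reporting the propagated error (the reporting details are illustrative):

#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/mm/ksm/run", "w");

        if (!f) {
                perror("/sys/kernel/mm/ksm/run");
                return 1;
        }
        /*
         * "2" is KSM_RUN_UNMERGE.  With this patch the write can fail,
         * e.g. with ENOMEM under a mem_cgroup limit or EINTR on a signal;
         * the error surfaces when the buffered data is flushed on fclose().
         */
        if (fprintf(f, "2\n") < 0 || fclose(f) != 0) {
                fprintf(stderr, "KSM_RUN_UNMERGE failed: %s\n", strerror(errno));
                return 1;
        }
        return 0;
}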