aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Hugh Dickins <hugh.dickins@tiscali.co.uk> 2009-09-21 20:02:16 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-09-22 10:17:32 -0400
commit: d952b79136a6c32a3f97e0628ca78340f1d5c6f9 (patch)
tree: d46b096fa097c39faa21c89f329d0c84bd700062
parent: 81464e30609cdbd3d96d8dd6991e7481195a89a1 (diff)
ksm: fix endless loop on oom
break_ksm has been looping endlessly ignoring VM_FAULT_OOM: that should only be a problem for ksmd when a memory control group imposes limits (normally the OOM killer will kill others with an mm until it succeeds); but in general (especially for MADV_UNMERGEABLE and KSM_RUN_UNMERGE) we do need to route the error (or kill) back to the caller (or sighandling).

Test signal_pending in unmerge_ksm_pages, which could be a lengthy procedure if it has to spill into swap: returning -ERESTARTSYS so that trivial signals will restart but fatals will terminate (is that right? we do different things in different places in mm, none exactly this).

unmerge_and_remove_all_rmap_items was forgetting to lock when going down the mm_list: fix that. Whether it's successful or not, reset ksm_scan cursor to head; but only if it's successful, reset seqnr (shown in full_scans) - page counts will have gone down to zero.

This patch leaves a significant OOM deadlock, but it's a good step on the way, and that deadlock is fixed in a subsequent patch.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- mm/ksm.c 108
1 files changed, 85 insertions, 23 deletions
diff --git a/mm/ksm.c b/mm/ksm.c
index c49bb7156a1d..d9e3cfcc150c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -294,10 +294,10 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
294 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP 294 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
295 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. 295 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
296 */ 296 */
297static void break_ksm(struct vm_area_struct *vma, unsigned long addr) 297static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
298{ 298{
299 struct page *page; 299 struct page *page;
300 int ret; 300 int ret = 0;
301 301
302 do { 302 do {
303 cond_resched(); 303 cond_resched();
@@ -310,9 +310,36 @@ static void break_ksm(struct vm_area_struct *vma, unsigned long addr)
310 else 310 else
311 ret = VM_FAULT_WRITE; 311 ret = VM_FAULT_WRITE;
312 put_page(page); 312 put_page(page);
313 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS))); 313 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
314 314 /*
315 /* Which leaves us looping there if VM_FAULT_OOM: hmmm... */ 315 * We must loop because handle_mm_fault() may back out if there's
316 * any difficulty e.g. if pte accessed bit gets updated concurrently.
317 *
318 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
319 * COW has been broken, even if the vma does not permit VM_WRITE;
320 * but note that a concurrent fault might break PageKsm for us.
321 *
322 * VM_FAULT_SIGBUS could occur if we race with truncation of the
323 * backing file, which also invalidates anonymous pages: that's
324 * okay, that truncation will have unmapped the PageKsm for us.
325 *
326 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
327 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
328 * current task has TIF_MEMDIE set, and will be OOM killed on return
329 * to user; and ksmd, having no mm, would never be chosen for that.
330 *
331 * But if the mm is in a limited mem_cgroup, then the fault may fail
332 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
333 * even ksmd can fail in this way - though it's usually breaking ksm
334 * just to undo a merge it made a moment before, so unlikely to oom.
335 *
336 * That's a pity: we might therefore have more kernel pages allocated
337 * than we're counting as nodes in the stable tree; but ksm_do_scan
338 * will retry to break_cow on each pass, so should recover the page
339 * in due course. The important thing is to not let VM_MERGEABLE
340 * be cleared while any such pages might remain in the area.
341 */
342 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
316} 343}
317 344
318static void break_cow(struct mm_struct *mm, unsigned long addr) 345static void break_cow(struct mm_struct *mm, unsigned long addr)
@@ -462,39 +489,61 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
462 * to the next pass of ksmd - consider, for example, how ksmd might be 489 * to the next pass of ksmd - consider, for example, how ksmd might be
463 * in cmp_and_merge_page on one of the rmap_items we would be removing. 490 * in cmp_and_merge_page on one of the rmap_items we would be removing.
464 */ 491 */
465static void unmerge_ksm_pages(struct vm_area_struct *vma, 492static int unmerge_ksm_pages(struct vm_area_struct *vma,
466 unsigned long start, unsigned long end) 493 unsigned long start, unsigned long end)
467{ 494{
468 unsigned long addr; 495 unsigned long addr;
496 int err = 0;
469 497
470 for (addr = start; addr < end; addr += PAGE_SIZE) 498 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
471 break_ksm(vma, addr); 499 if (signal_pending(current))
500 err = -ERESTARTSYS;
501 else
502 err = break_ksm(vma, addr);
503 }
504 return err;
472} 505}
473 506
474static void unmerge_and_remove_all_rmap_items(void) 507static int unmerge_and_remove_all_rmap_items(void)
475{ 508{
476 struct mm_slot *mm_slot; 509 struct mm_slot *mm_slot;
477 struct mm_struct *mm; 510 struct mm_struct *mm;
478 struct vm_area_struct *vma; 511 struct vm_area_struct *vma;
512 int err = 0;
513
514 spin_lock(&ksm_mmlist_lock);
515 mm_slot = list_entry(ksm_mm_head.mm_list.next,
516 struct mm_slot, mm_list);
517 spin_unlock(&ksm_mmlist_lock);
479 518
480 list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) { 519 while (mm_slot != &ksm_mm_head) {
481 mm = mm_slot->mm; 520 mm = mm_slot->mm;
482 down_read(&mm->mmap_sem); 521 down_read(&mm->mmap_sem);
483 for (vma = mm->mmap; vma; vma = vma->vm_next) { 522 for (vma = mm->mmap; vma; vma = vma->vm_next) {
484 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) 523 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
485 continue; 524 continue;
486 unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end); 525 err = unmerge_ksm_pages(vma,
526 vma->vm_start, vma->vm_end);
527 if (err) {
528 up_read(&mm->mmap_sem);
529 goto out;
530 }
487 } 531 }
488 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); 532 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
489 up_read(&mm->mmap_sem); 533 up_read(&mm->mmap_sem);
534
535 spin_lock(&ksm_mmlist_lock);
536 mm_slot = list_entry(mm_slot->mm_list.next,
537 struct mm_slot, mm_list);
538 spin_unlock(&ksm_mmlist_lock);
490 } 539 }
491 540
541 ksm_scan.seqnr = 0;
542out:
492 spin_lock(&ksm_mmlist_lock); 543 spin_lock(&ksm_mmlist_lock);
493 if (ksm_scan.mm_slot != &ksm_mm_head) { 544 ksm_scan.mm_slot = &ksm_mm_head;
494 ksm_scan.mm_slot = &ksm_mm_head;
495 ksm_scan.seqnr++;
496 }
497 spin_unlock(&ksm_mmlist_lock); 545 spin_unlock(&ksm_mmlist_lock);
546 return err;
498} 547}
499 548
500static void remove_mm_from_lists(struct mm_struct *mm) 549static void remove_mm_from_lists(struct mm_struct *mm)
@@ -1051,6 +1100,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1051 /* 1100 /*
1052 * A ksm page might have got here by fork, but its other 1101 * A ksm page might have got here by fork, but its other
1053 * references have already been removed from the stable tree. 1102 * references have already been removed from the stable tree.
1103 * Or it might be left over from a break_ksm which failed
1104 * when the mem_cgroup had reached its limit: try again now.
1054 */ 1105 */
1055 if (PageKsm(page)) 1106 if (PageKsm(page))
1056 break_cow(rmap_item->mm, rmap_item->address); 1107 break_cow(rmap_item->mm, rmap_item->address);
@@ -1286,6 +1337,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1286 unsigned long end, int advice, unsigned long *vm_flags) 1337 unsigned long end, int advice, unsigned long *vm_flags)
1287{ 1338{
1288 struct mm_struct *mm = vma->vm_mm; 1339 struct mm_struct *mm = vma->vm_mm;
1340 int err;
1289 1341
1290 switch (advice) { 1342 switch (advice) {
1291 case MADV_MERGEABLE: 1343 case MADV_MERGEABLE:
@@ -1298,9 +1350,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1298 VM_MIXEDMAP | VM_SAO)) 1350 VM_MIXEDMAP | VM_SAO))
1299 return 0; /* just ignore the advice */ 1351 return 0; /* just ignore the advice */
1300 1352
1301 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) 1353 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1302 if (__ksm_enter(mm) < 0) 1354 err = __ksm_enter(mm);
1303 return -EAGAIN; 1355 if (err)
1356 return err;
1357 }
1304 1358
1305 *vm_flags |= VM_MERGEABLE; 1359 *vm_flags |= VM_MERGEABLE;
1306 break; 1360 break;
@@ -1309,8 +1363,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1309 if (!(*vm_flags & VM_MERGEABLE)) 1363 if (!(*vm_flags & VM_MERGEABLE))
1310 return 0; /* just ignore the advice */ 1364 return 0; /* just ignore the advice */
1311 1365
1312 if (vma->anon_vma) 1366 if (vma->anon_vma) {
1313 unmerge_ksm_pages(vma, start, end); 1367 err = unmerge_ksm_pages(vma, start, end);
1368 if (err)
1369 return err;
1370 }
1314 1371
1315 *vm_flags &= ~VM_MERGEABLE; 1372 *vm_flags &= ~VM_MERGEABLE;
1316 break; 1373 break;
@@ -1441,8 +1498,13 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1441 mutex_lock(&ksm_thread_mutex); 1498 mutex_lock(&ksm_thread_mutex);
1442 if (ksm_run != flags) { 1499 if (ksm_run != flags) {
1443 ksm_run = flags; 1500 ksm_run = flags;
1444 if (flags & KSM_RUN_UNMERGE) 1501 if (flags & KSM_RUN_UNMERGE) {
1445 unmerge_and_remove_all_rmap_items(); 1502 err = unmerge_and_remove_all_rmap_items();
1503 if (err) {
1504 ksm_run = KSM_RUN_STOP;
1505 count = err;
1506 }
1507 }
1446 } 1508 }
1447 mutex_unlock(&ksm_thread_mutex); 1509 mutex_unlock(&ksm_thread_mutex);
1448 1510