diff options
author | Hugh Dickins <hugh.dickins@tiscali.co.uk> | 2009-09-21 20:02:16 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-22 10:17:32 -0400 |
commit | d952b79136a6c32a3f97e0628ca78340f1d5c6f9 (patch) | |
tree | d46b096fa097c39faa21c89f329d0c84bd700062 | |
parent | 81464e30609cdbd3d96d8dd6991e7481195a89a1 (diff) |
ksm: fix endless loop on oom

break_ksm has been looping endlessly ignoring VM_FAULT_OOM: that should
only be a problem for ksmd when a memory control group imposes limits
(normally the OOM killer will kill others with an mm until it succeeds);
but in general (especially for MADV_UNMERGEABLE and KSM_RUN_UNMERGE) we
do need to route the error (or kill) back to the caller (or sighandling).

Test signal_pending in unmerge_ksm_pages, which could be a lengthy
procedure if it has to spill into swap: returning -ERESTARTSYS so that
trivial signals will restart but fatals will terminate (is that right?
we do different things in different places in mm, none exactly this).

unmerge_and_remove_all_rmap_items was forgetting to lock when going
down the mm_list: fix that. Whether it's successful or not, reset
ksm_scan cursor to head; but only if it's successful, reset seqnr
(shown in full_scans) - page counts will have gone down to zero.

This patch leaves a significant OOM deadlock, but it's a good step
on the way, and that deadlock is fixed in a subsequent patch.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/ksm.c | 108 |
1 files changed, 85 insertions, 23 deletions
@@ -294,10 +294,10 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) | |||
294 | * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP | 294 | * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP |
295 | * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. | 295 | * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. |
296 | */ | 296 | */ |
297 | static void break_ksm(struct vm_area_struct *vma, unsigned long addr) | 297 | static int break_ksm(struct vm_area_struct *vma, unsigned long addr) |
298 | { | 298 | { |
299 | struct page *page; | 299 | struct page *page; |
300 | int ret; | 300 | int ret = 0; |
301 | 301 | ||
302 | do { | 302 | do { |
303 | cond_resched(); | 303 | cond_resched(); |
@@ -310,9 +310,36 @@ static void break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
310 | else | 310 | else |
311 | ret = VM_FAULT_WRITE; | 311 | ret = VM_FAULT_WRITE; |
312 | put_page(page); | 312 | put_page(page); |
313 | } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS))); | 313 | } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); |
314 | 314 | /* | |
315 | /* Which leaves us looping there if VM_FAULT_OOM: hmmm... */ | 315 | * We must loop because handle_mm_fault() may back out if there's |
316 | * any difficulty e.g. if pte accessed bit gets updated concurrently. | ||
317 | * | ||
318 | * VM_FAULT_WRITE is what we have been hoping for: it indicates that | ||
319 | * COW has been broken, even if the vma does not permit VM_WRITE; | ||
320 | * but note that a concurrent fault might break PageKsm for us. | ||
321 | * | ||
322 | * VM_FAULT_SIGBUS could occur if we race with truncation of the | ||
323 | * backing file, which also invalidates anonymous pages: that's | ||
324 | * okay, that truncation will have unmapped the PageKsm for us. | ||
325 | * | ||
326 | * VM_FAULT_OOM: at the time of writing (late July 2009), setting | ||
327 | * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the | ||
328 | * current task has TIF_MEMDIE set, and will be OOM killed on return | ||
329 | * to user; and ksmd, having no mm, would never be chosen for that. | ||
330 | * | ||
331 | * But if the mm is in a limited mem_cgroup, then the fault may fail | ||
332 | * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and | ||
333 | * even ksmd can fail in this way - though it's usually breaking ksm | ||
334 | * just to undo a merge it made a moment before, so unlikely to oom. | ||
335 | * | ||
336 | * That's a pity: we might therefore have more kernel pages allocated | ||
337 | * than we're counting as nodes in the stable tree; but ksm_do_scan | ||
338 | * will retry to break_cow on each pass, so should recover the page | ||
339 | * in due course. The important thing is to not let VM_MERGEABLE | ||
340 | * be cleared while any such pages might remain in the area. | ||
341 | */ | ||
342 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | ||
316 | } | 343 | } |
317 | 344 | ||
318 | static void break_cow(struct mm_struct *mm, unsigned long addr) | 345 | static void break_cow(struct mm_struct *mm, unsigned long addr) |
@@ -462,39 +489,61 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | |||
462 | * to the next pass of ksmd - consider, for example, how ksmd might be | 489 | * to the next pass of ksmd - consider, for example, how ksmd might be |
463 | * in cmp_and_merge_page on one of the rmap_items we would be removing. | 490 | * in cmp_and_merge_page on one of the rmap_items we would be removing. |
464 | */ | 491 | */ |
465 | static void unmerge_ksm_pages(struct vm_area_struct *vma, | 492 | static int unmerge_ksm_pages(struct vm_area_struct *vma, |
466 | unsigned long start, unsigned long end) | 493 | unsigned long start, unsigned long end) |
467 | { | 494 | { |
468 | unsigned long addr; | 495 | unsigned long addr; |
496 | int err = 0; | ||
469 | 497 | ||
470 | for (addr = start; addr < end; addr += PAGE_SIZE) | 498 | for (addr = start; addr < end && !err; addr += PAGE_SIZE) { |
471 | break_ksm(vma, addr); | 499 | if (signal_pending(current)) |
500 | err = -ERESTARTSYS; | ||
501 | else | ||
502 | err = break_ksm(vma, addr); | ||
503 | } | ||
504 | return err; | ||
472 | } | 505 | } |
473 | 506 | ||
474 | static void unmerge_and_remove_all_rmap_items(void) | 507 | static int unmerge_and_remove_all_rmap_items(void) |
475 | { | 508 | { |
476 | struct mm_slot *mm_slot; | 509 | struct mm_slot *mm_slot; |
477 | struct mm_struct *mm; | 510 | struct mm_struct *mm; |
478 | struct vm_area_struct *vma; | 511 | struct vm_area_struct *vma; |
512 | int err = 0; | ||
513 | |||
514 | spin_lock(&ksm_mmlist_lock); | ||
515 | mm_slot = list_entry(ksm_mm_head.mm_list.next, | ||
516 | struct mm_slot, mm_list); | ||
517 | spin_unlock(&ksm_mmlist_lock); | ||
479 | 518 | ||
480 | list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) { | 519 | while (mm_slot != &ksm_mm_head) { |
481 | mm = mm_slot->mm; | 520 | mm = mm_slot->mm; |
482 | down_read(&mm->mmap_sem); | 521 | down_read(&mm->mmap_sem); |
483 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 522 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
484 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | 523 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) |
485 | continue; | 524 | continue; |
486 | unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end); | 525 | err = unmerge_ksm_pages(vma, |
526 | vma->vm_start, vma->vm_end); | ||
527 | if (err) { | ||
528 | up_read(&mm->mmap_sem); | ||
529 | goto out; | ||
530 | } | ||
487 | } | 531 | } |
488 | remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); | 532 | remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); |
489 | up_read(&mm->mmap_sem); | 533 | up_read(&mm->mmap_sem); |
534 | |||
535 | spin_lock(&ksm_mmlist_lock); | ||
536 | mm_slot = list_entry(mm_slot->mm_list.next, | ||
537 | struct mm_slot, mm_list); | ||
538 | spin_unlock(&ksm_mmlist_lock); | ||
490 | } | 539 | } |
491 | 540 | ||
541 | ksm_scan.seqnr = 0; | ||
542 | out: | ||
492 | spin_lock(&ksm_mmlist_lock); | 543 | spin_lock(&ksm_mmlist_lock); |
493 | if (ksm_scan.mm_slot != &ksm_mm_head) { | 544 | ksm_scan.mm_slot = &ksm_mm_head; |
494 | ksm_scan.mm_slot = &ksm_mm_head; | ||
495 | ksm_scan.seqnr++; | ||
496 | } | ||
497 | spin_unlock(&ksm_mmlist_lock); | 545 | spin_unlock(&ksm_mmlist_lock); |
546 | return err; | ||
498 | } | 547 | } |
499 | 548 | ||
500 | static void remove_mm_from_lists(struct mm_struct *mm) | 549 | static void remove_mm_from_lists(struct mm_struct *mm) |
@@ -1051,6 +1100,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1051 | /* | 1100 | /* |
1052 | * A ksm page might have got here by fork, but its other | 1101 | * A ksm page might have got here by fork, but its other |
1053 | * references have already been removed from the stable tree. | 1102 | * references have already been removed from the stable tree. |
1103 | * Or it might be left over from a break_ksm which failed | ||
1104 | * when the mem_cgroup had reached its limit: try again now. | ||
1054 | */ | 1105 | */ |
1055 | if (PageKsm(page)) | 1106 | if (PageKsm(page)) |
1056 | break_cow(rmap_item->mm, rmap_item->address); | 1107 | break_cow(rmap_item->mm, rmap_item->address); |
@@ -1286,6 +1337,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1286 | unsigned long end, int advice, unsigned long *vm_flags) | 1337 | unsigned long end, int advice, unsigned long *vm_flags) |
1287 | { | 1338 | { |
1288 | struct mm_struct *mm = vma->vm_mm; | 1339 | struct mm_struct *mm = vma->vm_mm; |
1340 | int err; | ||
1289 | 1341 | ||
1290 | switch (advice) { | 1342 | switch (advice) { |
1291 | case MADV_MERGEABLE: | 1343 | case MADV_MERGEABLE: |
@@ -1298,9 +1350,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1298 | VM_MIXEDMAP | VM_SAO)) | 1350 | VM_MIXEDMAP | VM_SAO)) |
1299 | return 0; /* just ignore the advice */ | 1351 | return 0; /* just ignore the advice */ |
1300 | 1352 | ||
1301 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) | 1353 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
1302 | if (__ksm_enter(mm) < 0) | 1354 | err = __ksm_enter(mm); |
1303 | return -EAGAIN; | 1355 | if (err) |
1356 | return err; | ||
1357 | } | ||
1304 | 1358 | ||
1305 | *vm_flags |= VM_MERGEABLE; | 1359 | *vm_flags |= VM_MERGEABLE; |
1306 | break; | 1360 | break; |
@@ -1309,8 +1363,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1309 | if (!(*vm_flags & VM_MERGEABLE)) | 1363 | if (!(*vm_flags & VM_MERGEABLE)) |
1310 | return 0; /* just ignore the advice */ | 1364 | return 0; /* just ignore the advice */ |
1311 | 1365 | ||
1312 | if (vma->anon_vma) | 1366 | if (vma->anon_vma) { |
1313 | unmerge_ksm_pages(vma, start, end); | 1367 | err = unmerge_ksm_pages(vma, start, end); |
1368 | if (err) | ||
1369 | return err; | ||
1370 | } | ||
1314 | 1371 | ||
1315 | *vm_flags &= ~VM_MERGEABLE; | 1372 | *vm_flags &= ~VM_MERGEABLE; |
1316 | break; | 1373 | break; |
@@ -1441,8 +1498,13 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1441 | mutex_lock(&ksm_thread_mutex); | 1498 | mutex_lock(&ksm_thread_mutex); |
1442 | if (ksm_run != flags) { | 1499 | if (ksm_run != flags) { |
1443 | ksm_run = flags; | 1500 | ksm_run = flags; |
1444 | if (flags & KSM_RUN_UNMERGE) | 1501 | if (flags & KSM_RUN_UNMERGE) { |
1445 | unmerge_and_remove_all_rmap_items(); | 1502 | err = unmerge_and_remove_all_rmap_items(); |
1503 | if (err) { | ||
1504 | ksm_run = KSM_RUN_STOP; | ||
1505 | count = err; | ||
1506 | } | ||
1507 | } | ||
1446 | } | 1508 | } |
1447 | mutex_unlock(&ksm_thread_mutex); | 1509 | mutex_unlock(&ksm_thread_mutex); |
1448 | 1510 | ||