aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.cz>2015-02-11 18:26:24 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-11 20:06:03 -0500
commitc32b3cbe0d067a9cfae85aa70ba1e97ceba0ced7 (patch)
treeea807199ce92eed21239e5279033dbeb83b9dde1 /mm
parent401e4a7cf67d993bae02efdf1a234d7e2dbd2df2 (diff)
oom, PM: make OOM detection in the freezer path raceless
Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend") has left a race window when OOM killer manages to note_oom_kill after freeze_processes checks the counter. The race window is quite small and really unlikely, and a partial solution was deemed sufficient at the time of submission. Tejun wasn't happy about this partial solution though and insisted on a full solution. That requires the full OOM and freezer's task freezing exclusion, though. This is done by this patch which introduces an oom_sem RW lock and turns oom_killer_disable() into a full OOM barrier. The oom_killer_disabled check is moved from the allocation path to the OOM level and we take oom_sem for reading for both the check and the whole OOM invocation. oom_killer_disable() takes oom_sem for writing so it waits for all currently running OOM killer invocations. Then it disables all further OOMs by setting oom_killer_disabled and checks for any oom victims. Victims are counted via mark_tsk_oom_victim resp. unmark_oom_victim. The last victim wakes up all waiters enqueued by oom_killer_disable(). Therefore this function acts as the full OOM barrier. The page fault path is covered now as well, although it was assumed to be safe before. As per Tejun, "We used to have freezing points deep in file system code which may be reachable from page fault." so it would be better and more robust to not rely on freezing points here. The same applies to the memcg OOM killer. out_of_memory tells the caller whether the OOM was allowed to trigger and the callers are supposed to handle the situation. The page allocation path simply fails the allocation the same as before. The page fault path will retry the fault (more on that later) and the Sysrq OOM trigger will simply complain to the log. Normally there wouldn't be any unfrozen user tasks after try_to_freeze_tasks so the function will not block. But if there was an OOM killer racing with try_to_freeze_tasks and the OOM victim didn't finish yet then we have to wait for it.
This should complete in a finite time, though, because - the victim cannot loop in the page fault handler (it would die on the way out from the exception) - it cannot loop in the page allocator because all the further allocation would fail and __GFP_NOFAIL allocations are not acceptable at this stage - it shouldn't be blocked on any locks held by frozen tasks (try_to_freeze expects lockless context) and kernel threads and work queues are not frozen yet Signed-off-by: Michal Hocko <mhocko@suse.cz> Suggested-by: Tejun Heo <tj@kernel.org> Cc: David Rientjes <rientjes@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/oom_kill.c132
-rw-r--r--mm/page_alloc.c17
3 files changed, 115 insertions, 36 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe4d258ef32b..fbf64e6f64e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
1930 if (!memcg) 1930 if (!memcg)
1931 return false; 1931 return false;
1932 1932
1933 if (!handle) 1933 if (!handle || oom_killer_disabled)
1934 goto cleanup; 1934 goto cleanup;
1935 1935
1936 owait.memcg = memcg; 1936 owait.memcg = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3cbd76b8c13b..b8df76ee2be3 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
398} 398}
399 399
400/* 400/*
401 * Number of OOM killer invocations (including memcg OOM killer). 401 * Number of OOM victims in flight
402 * Primarily used by PM freezer to check for potential races with
403 * OOM killed frozen task.
404 */ 402 */
405static atomic_t oom_kills = ATOMIC_INIT(0); 403static atomic_t oom_victims = ATOMIC_INIT(0);
404static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
406 405
407int oom_kills_count(void) 406bool oom_killer_disabled __read_mostly;
408{ 407static DECLARE_RWSEM(oom_sem);
409 return atomic_read(&oom_kills);
410}
411
412void note_oom_kill(void)
413{
414 atomic_inc(&oom_kills);
415}
416 408
417/** 409/**
418 * mark_tsk_oom_victim - marks the given task as OOM victim. 410 * mark_tsk_oom_victim - marks the given task as OOM victim.
419 * @tsk: task to mark 411 * @tsk: task to mark
412 *
413 * Has to be called with oom_sem taken for read and never after
414 * oom has been disabled already.
420 */ 415 */
421void mark_tsk_oom_victim(struct task_struct *tsk) 416void mark_tsk_oom_victim(struct task_struct *tsk)
422{ 417{
423 set_tsk_thread_flag(tsk, TIF_MEMDIE); 418 WARN_ON(oom_killer_disabled);
424 419 /* OOM killer might race with memcg OOM */
420 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
421 return;
425 /* 422 /*
426 * Make sure that the task is woken up from uninterruptible sleep 423 * Make sure that the task is woken up from uninterruptible sleep
427 * if it is frozen because OOM killer wouldn't be able to free 424 * if it is frozen because OOM killer wouldn't be able to free
@@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
429 * that TIF_MEMDIE tasks should be ignored. 426 * that TIF_MEMDIE tasks should be ignored.
430 */ 427 */
431 __thaw_task(tsk); 428 __thaw_task(tsk);
429 atomic_inc(&oom_victims);
432} 430}
433 431
434/** 432/**
435 * unmark_oom_victim - unmarks the current task as OOM victim. 433 * unmark_oom_victim - unmarks the current task as OOM victim.
434 *
435 * Wakes up all waiters in oom_killer_disable()
436 */ 436 */
437void unmark_oom_victim(void) 437void unmark_oom_victim(void)
438{ 438{
439 clear_thread_flag(TIF_MEMDIE); 439 if (!test_and_clear_thread_flag(TIF_MEMDIE))
440 return;
441
442 down_read(&oom_sem);
443 /*
444 * There is no need to signal the last oom_victim if there
445 * is nobody who cares.
446 */
447 if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
448 wake_up_all(&oom_victims_wait);
449 up_read(&oom_sem);
450}
451
452/**
453 * oom_killer_disable - disable OOM killer
454 *
455 * Forces all page allocations to fail rather than trigger OOM killer.
456 * Will block and wait until all OOM victims are killed.
457 *
458 * The function cannot be called when there are runnable user tasks because
459 * the userspace would see unexpected allocation failures as a result. Any
460 * new usage of this function should be consulted with MM people.
461 *
462 * Returns true if successful and false if the OOM killer cannot be
463 * disabled.
464 */
465bool oom_killer_disable(void)
466{
467 /*
468 * Make sure to not race with an ongoing OOM killer
469 * and that the current is not the victim.
470 */
471 down_write(&oom_sem);
472 if (test_thread_flag(TIF_MEMDIE)) {
473 up_write(&oom_sem);
474 return false;
475 }
476
477 oom_killer_disabled = true;
478 up_write(&oom_sem);
479
480 wait_event(oom_victims_wait, !atomic_read(&oom_victims));
481
482 return true;
483}
484
485/**
486 * oom_killer_enable - enable OOM killer
487 */
488void oom_killer_enable(void)
489{
490 down_write(&oom_sem);
491 oom_killer_disabled = false;
492 up_write(&oom_sem);
440} 493}
441 494
442#define K(x) ((x) << (PAGE_SHIFT-10)) 495#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
637} 690}
638 691
639/** 692/**
640 * out_of_memory - kill the "best" process when we run out of memory 693 * __out_of_memory - kill the "best" process when we run out of memory
641 * @zonelist: zonelist pointer 694 * @zonelist: zonelist pointer
642 * @gfp_mask: memory allocation flags 695 * @gfp_mask: memory allocation flags
643 * @order: amount of memory being requested as a power of 2 696 * @order: amount of memory being requested as a power of 2
@@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
649 * OR try to be smart about which process to kill. Note that we 702 * OR try to be smart about which process to kill. Note that we
650 * don't have to be perfect here, we just have to be good. 703 * don't have to be perfect here, we just have to be good.
651 */ 704 */
652void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 705static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
653 int order, nodemask_t *nodemask, bool force_kill) 706 int order, nodemask_t *nodemask, bool force_kill)
654{ 707{
655 const nodemask_t *mpol_mask; 708 const nodemask_t *mpol_mask;
@@ -718,6 +771,32 @@ out:
718 schedule_timeout_killable(1); 771 schedule_timeout_killable(1);
719} 772}
720 773
774/**
775 * out_of_memory - tries to invoke OOM killer.
776 * @zonelist: zonelist pointer
777 * @gfp_mask: memory allocation flags
778 * @order: amount of memory being requested as a power of 2
779 * @nodemask: nodemask passed to page allocator
780 * @force_kill: true if a task must be killed, even if others are exiting
781 *
782 * Invokes __out_of_memory if the OOM killer has not been disabled by
783 * oom_killer_disable(); returns false when disabled, true otherwise.
784 */
785bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
786 int order, nodemask_t *nodemask, bool force_kill)
787{
788 bool ret = false;
789
790 down_read(&oom_sem);
791 if (!oom_killer_disabled) {
792 __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
793 ret = true;
794 }
795 up_read(&oom_sem);
796
797 return ret;
798}
799
721/* 800/*
722 * The pagefault handler calls here because it is out of memory, so kill a 801 * The pagefault handler calls here because it is out of memory, so kill a
723 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a 802 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
@@ -727,12 +806,25 @@ void pagefault_out_of_memory(void)
727{ 806{
728 struct zonelist *zonelist; 807 struct zonelist *zonelist;
729 808
809 down_read(&oom_sem);
730 if (mem_cgroup_oom_synchronize(true)) 810 if (mem_cgroup_oom_synchronize(true))
731 return; 811 goto unlock;
732 812
733 zonelist = node_zonelist(first_memory_node, GFP_KERNEL); 813 zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
734 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { 814 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
735 out_of_memory(NULL, 0, 0, NULL, false); 815 if (!oom_killer_disabled)
816 __out_of_memory(NULL, 0, 0, NULL, false);
817 else
818 /*
819 * There shouldn't be any user tasks runnable while the
820 * OOM killer is disabled so the current task has to
821 * be a racing OOM victim for which oom_killer_disable()
822 * is waiting.
823 */
824 WARN_ON(test_thread_flag(TIF_MEMDIE));
825
736 oom_zonelist_unlock(zonelist, GFP_KERNEL); 826 oom_zonelist_unlock(zonelist, GFP_KERNEL);
737 } 827 }
828unlock:
829 up_read(&oom_sem);
738} 830}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 641d5a9a8617..134e25525044 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
244 PB_migrate, PB_migrate_end); 244 PB_migrate, PB_migrate_end);
245} 245}
246 246
247bool oom_killer_disabled __read_mostly;
248
249#ifdef CONFIG_DEBUG_VM 247#ifdef CONFIG_DEBUG_VM
250static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 248static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
251{ 249{
@@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2317 2315
2318 *did_some_progress = 0; 2316 *did_some_progress = 0;
2319 2317
2320 if (oom_killer_disabled)
2321 return NULL;
2322
2323 /* 2318 /*
2324 * Acquire the per-zone oom lock for each zone. If that 2319 * Acquire the per-zone oom lock for each zone. If that
2325 * fails, somebody else is making progress for us. 2320 * fails, somebody else is making progress for us.
@@ -2331,14 +2326,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2331 } 2326 }
2332 2327
2333 /* 2328 /*
2334 * PM-freezer should be notified that there might be an OOM killer on
2335 * its way to kill and wake somebody up. This is too early and we might
2336 * end up not killing anything but false positives are acceptable.
2337 * See freeze_processes.
2338 */
2339 note_oom_kill();
2340
2341 /*
2342 * Go through the zonelist yet one more time, keep very high watermark 2329 * Go through the zonelist yet one more time, keep very high watermark
2343 * here, this is only to catch a parallel oom killing, we must fail if 2330 * here, this is only to catch a parallel oom killing, we must fail if
2344 * we're still under heavy pressure. 2331 * we're still under heavy pressure.
@@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2372 goto out; 2359 goto out;
2373 } 2360 }
2374 /* Exhausted what can be done so it's blamo time */ 2361 /* Exhausted what can be done so it's blamo time */
2375 out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false); 2362 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
2376 *did_some_progress = 1; 2363 *did_some_progress = 1;
2377out: 2364out:
2378 oom_zonelist_unlock(ac->zonelist, gfp_mask); 2365 oom_zonelist_unlock(ac->zonelist, gfp_mask);
2379 return page; 2366 return page;