author	Johannes Weiner <hannes@cmpxchg.org>	2015-06-24 19:57:19 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-06-24 20:49:43 -0400
commit	dc56401fc9f25e8f93899991ec858c98a331d88c (patch)
tree	4222adb342f9595ef6bff62f26a33497647c3ded
parent	da51b14adb671829077da3aeb9e9edd6f8c80afe (diff)
mm: oom_kill: simplify OOM killer locking
The zonelist locking and the oom_sem are two overlapping locks that are used to serialize global OOM killing against different things.

The historical zonelist locking serializes OOM kills from allocations with overlapping zonelists against each other to prevent killing more tasks than necessary in the same memory domain.  Only when neither tasklists nor zonelists from two concurrent OOM kills overlap (tasks in separate memcgs bound to separate nodes) are OOM kills allowed to execute in parallel.

The younger oom_sem is a read-write lock to serialize OOM killing against the PM code trying to disable the OOM killer altogether.

However, the OOM killer is a fairly cold error path, so there is really no reason to optimize for highly performant and concurrent OOM kills.  And the oom_sem is just flat-out redundant.

Replace both locking schemes with a single global mutex serializing OOM kills regardless of context.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
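Before the hunks themselves, a condensed sketch of the call pattern this patch converges on may help: the page allocator and the pagefault path back off with a trylock when another OOM kill is already in flight, while the sysrq, memcg and PM (oom_killer_disable) paths block on the same mutex. This is only an orientation aid, not code from the patch; the helper names alloc_may_oom_sketch() and moom_sketch() are invented for the example and their bodies are stubs.

	#include <linux/mm_types.h>
	#include <linux/mutex.h>

	/* One global lock now serializes every OOM kill, regardless of context. */
	DEFINE_MUTEX(oom_lock);

	/* Allocation path (__alloc_pages_may_oom): if the lock is already held,
	 * somebody else is making progress for us, so back off and retry. */
	static struct page *alloc_may_oom_sketch(void)
	{
		struct page *page = NULL;

		if (!mutex_trylock(&oom_lock))
			return NULL;		/* parallel OOM kill in flight */
		/* ... pick and kill a victim via out_of_memory() ... */
		mutex_unlock(&oom_lock);
		return page;
	}

	/* Sysrq, memcg charge failure and oom_killer_disable() block instead. */
	static void moom_sketch(void)
	{
		mutex_lock(&oom_lock);
		/* ... out_of_memory(...), or flip oom_killer_disabled ... */
		mutex_unlock(&oom_lock);
	}

The trylock callers keep the old back-off behaviour of oom_zonelist_trylock(): a loser of the race reports progress and sleeps briefly (see the mm/page_alloc.c hunk) instead of piling a second kill on top of the one already in progress.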
-rw-r--r--	drivers/tty/sysrq.c	2
-rw-r--r--	include/linux/oom.h	5
-rw-r--r--	mm/memcontrol.c	18
-rw-r--r--	mm/oom_kill.c	127
-rw-r--r--	mm/page_alloc.c	8
5 files changed, 46 insertions(+), 114 deletions(-)
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 843f2cdc280b..b20d2c0ec451 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -356,9 +356,11 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
+	mutex_lock(&oom_lock);
 	if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
 			   GFP_KERNEL, 0, NULL, true))
 		pr_info("OOM request ignored because killer is disabled\n");
+	mutex_unlock(&oom_lock);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index a8e6a498cbcb..7deecb7bca5e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -32,6 +32,8 @@ enum oom_scan_t {
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
+extern struct mutex oom_lock;
+
 static inline void set_current_oom_origin(void)
 {
 	current->signal->oom_flags |= OOM_FLAG_ORIGIN;
@@ -60,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
 			     const char *message);
 
-extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
-
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			       int order, const nodemask_t *nodemask,
 			       struct mem_cgroup *memcg);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a7e874f719..8da44a083397 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1530,6 +1530,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned int points = 0;
 	struct task_struct *chosen = NULL;
 
+	mutex_lock(&oom_lock);
+
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
 	 * select it.  The goal is to allow it to allocate so that it may
@@ -1537,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 */
 	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		return;
+		goto unlock;
 	}
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
@@ -1564,7 +1566,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			mem_cgroup_iter_break(memcg, iter);
 			if (chosen)
 				put_task_struct(chosen);
-			return;
+			goto unlock;
 		case OOM_SCAN_OK:
 			break;
 		};
@@ -1585,11 +1587,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		css_task_iter_end(&it);
 	}
 
-	if (!chosen)
-		return;
-	points = chosen_points * 1000 / totalpages;
-	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
-			 NULL, "Memory cgroup out of memory");
+	if (chosen) {
+		points = chosen_points * 1000 / totalpages;
+		oom_kill_process(chosen, gfp_mask, order, points, totalpages,
+				 memcg, NULL, "Memory cgroup out of memory");
+	}
+unlock:
+	mutex_unlock(&oom_lock);
 }
 
 #if MAX_NUMNODES > 1
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d3490b019d46..5cfda39b3268 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,7 +42,8 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
-static DEFINE_SPINLOCK(zone_scan_lock);
+
+DEFINE_MUTEX(oom_lock);
 
 #ifdef CONFIG_NUMA
 /**
@@ -405,13 +406,12 @@ static atomic_t oom_victims = ATOMIC_INIT(0);
 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
 bool oom_killer_disabled __read_mostly;
-static DECLARE_RWSEM(oom_sem);
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
  *
- * Has to be called with oom_sem taken for read and never after
+ * Has to be called with oom_lock held and never after
  * oom has been disabled already.
  */
 void mark_oom_victim(struct task_struct *tsk)
@@ -460,14 +460,14 @@ bool oom_killer_disable(void)
 	 * Make sure to not race with an ongoing OOM killer
 	 * and that the current is not the victim.
 	 */
-	down_write(&oom_sem);
+	mutex_lock(&oom_lock);
 	if (test_thread_flag(TIF_MEMDIE)) {
-		up_write(&oom_sem);
+		mutex_unlock(&oom_lock);
 		return false;
 	}
 
 	oom_killer_disabled = true;
-	up_write(&oom_sem);
+	mutex_unlock(&oom_lock);
 
 	wait_event(oom_victims_wait, !atomic_read(&oom_victims));
 
@@ -634,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
- */
-bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-	bool ret = true;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-		if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
-			ret = false;
-			goto out;
-		}
-
-	/*
-	 * Lock each zone in the zonelist under zone_scan_lock so a parallel
-	 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
-	 */
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-		set_bit(ZONE_OOM_LOCKED, &zone->flags);
-
-out:
-	spin_unlock(&zone_scan_lock);
-	return ret;
-}
-
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-		clear_bit(ZONE_OOM_LOCKED, &zone->flags);
-	spin_unlock(&zone_scan_lock);
-}
-
 /**
- * __out_of_memory - kill the "best" process when we run out of memory
+ * out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
@@ -693,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-			    int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		   int order, nodemask_t *nodemask, bool force_kill)
 {
 	const nodemask_t *mpol_mask;
 	struct task_struct *p;
@@ -704,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 	int killed = 0;
 
+	if (oom_killer_disabled)
+		return false;
+
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
 		/* Got some memory back in the last second. */
-		return;
+		goto out;
 
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
@@ -720,7 +677,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	if (current->mm &&
 	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
 		mark_oom_victim(current);
-		return;
+		goto out;
 	}
 
 	/*
@@ -760,32 +717,8 @@ out:
 	 */
 	if (killed)
 		schedule_timeout_killable(1);
-}
-
-/**
- * out_of_memory - tries to invoke OOM killer.
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
- *
- * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
- * when it returns false. Otherwise returns true.
- */
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		   int order, nodemask_t *nodemask, bool force_kill)
-{
-	bool ret = false;
-
-	down_read(&oom_sem);
-	if (!oom_killer_disabled) {
-		__out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
-		ret = true;
-	}
-	up_read(&oom_sem);
 
-	return ret;
+	return true;
 }
 
 /*
@@ -795,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist;
-
-	down_read(&oom_sem);
 	if (mem_cgroup_oom_synchronize(true))
-		goto unlock;
+		return;
 
-	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
-	if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
-		if (!oom_killer_disabled)
-			__out_of_memory(NULL, 0, 0, NULL, false);
-		else
-			/*
-			 * There shouldn't be any user tasks runable while the
-			 * OOM killer is disabled so the current task has to
-			 * be a racing OOM victim for which oom_killer_disable()
-			 * is waiting for.
-			 */
-			WARN_ON(test_thread_flag(TIF_MEMDIE));
+	if (!mutex_trylock(&oom_lock))
+		return;
 
-		oom_zonelist_unlock(zonelist, GFP_KERNEL);
+	if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+		/*
+		 * There shouldn't be any user tasks runnable while the
+		 * OOM killer is disabled, so the current task has to
+		 * be a racing OOM victim for which oom_killer_disable()
+		 * is waiting for.
+		 */
+		WARN_ON(test_thread_flag(TIF_MEMDIE));
 	}
-unlock:
-	up_read(&oom_sem);
+
+	mutex_unlock(&oom_lock);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3b02be4def90..cae21dc9d54e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2360,10 +2360,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	*did_some_progress = 0;
 
 	/*
-	 * Acquire the per-zone oom lock for each zone.  If that
-	 * fails, somebody else is making progress for us.
+	 * Acquire the oom lock.  If that fails, somebody else is
+	 * making progress for us.
 	 */
-	if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
+	if (!mutex_trylock(&oom_lock)) {
 		*did_some_progress = 1;
 		schedule_timeout_uninterruptible(1);
 		return NULL;
@@ -2408,7 +2408,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 		|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
 		*did_some_progress = 1;
 out:
-	oom_zonelist_unlock(ac->zonelist, gfp_mask);
+	mutex_unlock(&oom_lock);
 	return page;
 }
 