aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.cz>2011-07-26 19:08:23 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-26 19:49:42 -0400
commit79dfdaccd1d5b40ff7cf4a35a0e63696ebb78b4d (patch)
tree27eeb38c9ca690a19f05e06a2a46f094b20bfd52 /mm
parentbb2a0de92c891b8feeedc0178acb3ae009d899a8 (diff)
memcg: make oom_lock 0 and 1 based rather than counter
Commit 867578cb ("memcg: fix oom kill behavior") introduced a oom_lock counter which is incremented by mem_cgroup_oom_lock when we are about to handle memcg OOM situation. mem_cgroup_handle_oom falls back to a sleep if oom_lock > 1 to prevent from multiple oom kills at the same time. The counter is then decremented by mem_cgroup_oom_unlock called from the same function. This works correctly but it can lead to serious starvations when we have many processes triggering OOM and many CPUs available for them (I have tested with 16 CPUs). Consider a process (call it A) which gets the oom_lock (the first one that got to mem_cgroup_handle_oom and grabbed memcg_oom_mutex) and other processes that are blocked on the mutex. While A releases the mutex and calls mem_cgroup_out_of_memory others will wake up (one after another) and increase the counter and fall into sleep (memcg_oom_waitq). Once A finishes mem_cgroup_out_of_memory it takes the mutex again and decreases oom_lock and wakes other tasks (if releasing memory by somebody else - e.g. killed process - hasn't done it yet). A testcase would look like: Assume malloc XXX is a program allocating XXX Megabytes of memory which touches all allocated pages in a tight loop # swapoff SWAP_DEVICE # cgcreate -g memory:A # cgset -r memory.oom_control=0 A # cgset -r memory.limit_in_bytes= 200M # for i in `seq 100` # do # cgexec -g memory:A malloc 10 & # done The main problem here is that all processes still race for the mutex and there is no guarantee that we will get counter back to 0 for those that got back to mem_cgroup_handle_oom. In the end the whole convoy in/decreases the counter but we do not get to 1 that would enable killing so nothing useful can be done. The time is basically unbounded because it highly depends on scheduling and ordering on mutex (I have seen this taking hours...). This patch replaces the counter by a simple {un}lock semantic. As mem_cgroup_oom_{un}lock works on the a subtree of a hierarchy we have to make sure that nobody else races with us which is guaranteed by the memcg_oom_mutex. We have to be careful while locking subtrees because we can encounter a subtree which is already locked: hierarchy: A / \ B \ /\ \ C D E B - C - D tree might be already locked. While we want to enable locking E subtree because OOM situations cannot influence each other we definitely do not want to allow locking A. Therefore we have to refuse lock if any subtree is already locked and clear up the lock for all nodes that have been set up to the failure point. On the other hand we have to make sure that the rest of the world will recognize that a group is under OOM even though it doesn't have a lock. Therefore we have to introduce under_oom variable which is incremented and decremented for the whole subtree when we enter resp. leave mem_cgroup_handle_oom. under_oom, unlike oom_lock, doesn't need be updated under memcg_oom_mutex because its users only check a single group and they use atomic operations for that. This can be checked easily by the following test case: # cgcreate -g memory:A # cgset -r memory.use_hierarchy=1 A # cgset -r memory.oom_control=1 A # cgset -r memory.limit_in_bytes= 100M # cgset -r memory.memsw.limit_in_bytes= 100M # cgcreate -g memory:A/B # cgset -r memory.oom_control=1 A/B # cgset -r memory.limit_in_bytes=20M # cgset -r memory.memsw.limit_in_bytes=20M # cgexec -g memory:A/B malloc 30 & #->this will be blocked by OOM of group B # cgexec -g memory:A malloc 80 & #->this will be blocked by OOM of group A While B gets oom_lock A will not get it. Both of them go into sleep and wait for an external action. We can make the limit higher for A to enforce waking it up # cgset -r memory.memsw.limit_in_bytes=300M A # cgset -r memory.limit_in_bytes=300M A malloc in A has to wake up even though it doesn't have oom_lock. Finally, the unlock path is very easy because we always unlock only the subtree we have locked previously while we always decrement under_oom. Signed-off-by: Michal Hocko <mhocko@suse.cz> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <bsingharora@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c86
1 files changed, 70 insertions, 16 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85599662bd90..95d6c256b54c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -246,7 +246,10 @@ struct mem_cgroup {
246 * Should the accounting and control be hierarchical, per subtree? 246 * Should the accounting and control be hierarchical, per subtree?
247 */ 247 */
248 bool use_hierarchy; 248 bool use_hierarchy;
249 atomic_t oom_lock; 249
250 bool oom_lock;
251 atomic_t under_oom;
252
250 atomic_t refcnt; 253 atomic_t refcnt;
251 254
252 int swappiness; 255 int swappiness;
@@ -1722,37 +1725,83 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1722/* 1725/*
1723 * Check OOM-Killer is already running under our hierarchy. 1726 * Check OOM-Killer is already running under our hierarchy.
1724 * If someone is running, return false. 1727 * If someone is running, return false.
1728 * Has to be called with memcg_oom_mutex
1725 */ 1729 */
1726static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1730static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1727{ 1731{
1728 int x, lock_count = 0; 1732 int lock_count = -1;
1729 struct mem_cgroup *iter; 1733 struct mem_cgroup *iter, *failed = NULL;
1734 bool cond = true;
1730 1735
1731 for_each_mem_cgroup_tree(iter, mem) { 1736 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1732 x = atomic_inc_return(&iter->oom_lock); 1737 bool locked = iter->oom_lock;
1733 lock_count = max(x, lock_count); 1738
1739 iter->oom_lock = true;
1740 if (lock_count == -1)
1741 lock_count = iter->oom_lock;
1742 else if (lock_count != locked) {
1743 /*
1744 * this subtree of our hierarchy is already locked
1745 * so we cannot give a lock.
1746 */
1747 lock_count = 0;
1748 failed = iter;
1749 cond = false;
1750 }
1734 } 1751 }
1735 1752
1736 if (lock_count == 1) 1753 if (!failed)
1737 return true; 1754 goto done;
1738 return false; 1755
1756 /*
1757 * OK, we failed to lock the whole subtree so we have to clean up
1758 * what we set up to the failing subtree
1759 */
1760 cond = true;
1761 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1762 if (iter == failed) {
1763 cond = false;
1764 continue;
1765 }
1766 iter->oom_lock = false;
1767 }
1768done:
1769 return lock_count;
1739} 1770}
1740 1771
1772/*
1773 * Has to be called with memcg_oom_mutex
1774 */
1741static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1775static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1742{ 1776{
1743 struct mem_cgroup *iter; 1777 struct mem_cgroup *iter;
1744 1778
1779 for_each_mem_cgroup_tree(iter, mem)
1780 iter->oom_lock = false;
1781 return 0;
1782}
1783
1784static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
1785{
1786 struct mem_cgroup *iter;
1787
1788 for_each_mem_cgroup_tree(iter, mem)
1789 atomic_inc(&iter->under_oom);
1790}
1791
1792static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1793{
1794 struct mem_cgroup *iter;
1795
1745 /* 1796 /*
1746 * When a new child is created while the hierarchy is under oom, 1797 * When a new child is created while the hierarchy is under oom,
1747 * mem_cgroup_oom_lock() may not be called. We have to use 1798 * mem_cgroup_oom_lock() may not be called. We have to use
1748 * atomic_add_unless() here. 1799 * atomic_add_unless() here.
1749 */ 1800 */
1750 for_each_mem_cgroup_tree(iter, mem) 1801 for_each_mem_cgroup_tree(iter, mem)
1751 atomic_add_unless(&iter->oom_lock, -1, 0); 1802 atomic_add_unless(&iter->under_oom, -1, 0);
1752 return 0;
1753} 1803}
1754 1804
1755
1756static DEFINE_MUTEX(memcg_oom_mutex); 1805static DEFINE_MUTEX(memcg_oom_mutex);
1757static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1806static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1758 1807
@@ -1794,7 +1843,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
1794 1843
1795static void memcg_oom_recover(struct mem_cgroup *mem) 1844static void memcg_oom_recover(struct mem_cgroup *mem)
1796{ 1845{
1797 if (mem && atomic_read(&mem->oom_lock)) 1846 if (mem && atomic_read(&mem->under_oom))
1798 memcg_wakeup_oom(mem); 1847 memcg_wakeup_oom(mem);
1799} 1848}
1800 1849
@@ -1812,6 +1861,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1812 owait.wait.private = current; 1861 owait.wait.private = current;
1813 INIT_LIST_HEAD(&owait.wait.task_list); 1862 INIT_LIST_HEAD(&owait.wait.task_list);
1814 need_to_kill = true; 1863 need_to_kill = true;
1864 mem_cgroup_mark_under_oom(mem);
1865
1815 /* At first, try to OOM lock hierarchy under mem.*/ 1866 /* At first, try to OOM lock hierarchy under mem.*/
1816 mutex_lock(&memcg_oom_mutex); 1867 mutex_lock(&memcg_oom_mutex);
1817 locked = mem_cgroup_oom_lock(mem); 1868 locked = mem_cgroup_oom_lock(mem);
@@ -1835,10 +1886,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1835 finish_wait(&memcg_oom_waitq, &owait.wait); 1886 finish_wait(&memcg_oom_waitq, &owait.wait);
1836 } 1887 }
1837 mutex_lock(&memcg_oom_mutex); 1888 mutex_lock(&memcg_oom_mutex);
1838 mem_cgroup_oom_unlock(mem); 1889 if (locked)
1890 mem_cgroup_oom_unlock(mem);
1839 memcg_wakeup_oom(mem); 1891 memcg_wakeup_oom(mem);
1840 mutex_unlock(&memcg_oom_mutex); 1892 mutex_unlock(&memcg_oom_mutex);
1841 1893
1894 mem_cgroup_unmark_under_oom(mem);
1895
1842 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1896 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1843 return false; 1897 return false;
1844 /* Give chance to dying process */ 1898 /* Give chance to dying process */
@@ -4505,7 +4559,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4505 list_add(&event->list, &memcg->oom_notify); 4559 list_add(&event->list, &memcg->oom_notify);
4506 4560
4507 /* already in OOM ? */ 4561 /* already in OOM ? */
4508 if (atomic_read(&memcg->oom_lock)) 4562 if (atomic_read(&memcg->under_oom))
4509 eventfd_signal(eventfd, 1); 4563 eventfd_signal(eventfd, 1);
4510 mutex_unlock(&memcg_oom_mutex); 4564 mutex_unlock(&memcg_oom_mutex);
4511 4565
@@ -4540,7 +4594,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4540 4594
4541 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4595 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4542 4596
4543 if (atomic_read(&mem->oom_lock)) 4597 if (atomic_read(&mem->under_oom))
4544 cb->fill(cb, "under_oom", 1); 4598 cb->fill(cb, "under_oom", 1);
4545 else 4599 else
4546 cb->fill(cb, "under_oom", 0); 4600 cb->fill(cb, "under_oom", 0);