path: root/mm/memcontrol.c
author		Johannes Weiner <hannes@cmpxchg.org>	2013-09-12 18:13:44 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-12 18:38:02 -0400
commit		3812c8c8f3953921ef18544110dafc3505c1ac62 (patch)
tree		8e5efc15fec4700644774df5fb5302f5c82f4a31 /mm/memcontrol.c
parent		fb2a6fc56be66c169f8b80e07ed999ba453a2db2 (diff)
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock.  When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds.  Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved.  The
problem is that these tasks may hold filesystem locks and the mmap_sem;
locks that the selected OOM victim may need to exit.

For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex.  The OOM killer selected a task that was just entering
truncate() and trying to acquire the i_mutex:

OOM invoking task:
  mem_cgroup_handle_oom+0x241/0x3b0
  mem_cgroup_cache_charge+0xbe/0xe0
  add_to_page_cache_locked+0x4c/0x140
  add_to_page_cache_lru+0x22/0x50
  grab_cache_page_write_begin+0x8b/0xe0
  ext3_write_begin+0x88/0x270
  generic_file_buffered_write+0x116/0x290
  __generic_file_aio_write+0x27c/0x480
  generic_file_aio_write+0x76/0xf0		# takes ->i_mutex
  do_sync_write+0xea/0x130
  vfs_write+0xf3/0x1f0
  sys_write+0x51/0x90
  system_call_fastpath+0x18/0x1d

OOM kill victim:
  do_truncate+0x58/0xa0				# takes i_mutex
  do_last+0x250/0xa30
  path_openat+0xd7/0x440
  do_filp_open+0x49/0xa0
  do_sys_open+0x106/0x240
  sys_open+0x20/0x30
  system_call_fastpath+0x18/0x1d

The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.

A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep
on the OOM waitqueue and wait for userspace to free resources or
increase the group's limit.  But a userspace OOM handler is prone to
deadlock itself on the locks held by the waiting tasks.  For example
one of the sleeping tasks may be stuck in a brk() call with the
mmap_sem held for writing but the userspace handler, in order to pick
an optimal victim, may need to read files from /proc/<pid>, which tries
to acquire the same mmap_sem for reading and deadlocks.

This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:

1. When OOMing in a user fault, invoke the OOM killer and restart the
   fault instead of looping on the charge attempt.  This way, the OOM
   victim can not get stuck on locks the looping task may hold.

2. When OOMing in a user fault but somebody else is handling it (either
   the kernel OOM killer or a userspace handler), don't go to sleep in
   the charge context.  Instead, remember the OOMing memcg in the task
   struct and then fully unwind the page fault stack with -ENOMEM.
   pagefault_out_of_memory() will then call back into the memcg code to
   check if the -ENOMEM came from the memcg, and then either put the
   task to sleep on the memcg's OOM waitqueue or just restart the
   fault.  The OOM victim can no longer get stuck on any lock a
   sleeping task may hold.

Debugged by Michal Hocko.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
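The per-task OOM state that point 2 above introduces is stored on the
task_struct.  That part of the patch lives in include/linux/sched.h and
is outside this mm/memcontrol.c-only view; the sketch below is
reconstructed from the fields the diff dereferences
(current->memcg_oom.*), so treat the exact layout as illustrative
rather than verbatim:

struct mem_cgroup;

/* Sketch of the per-task memcg OOM context (layout illustrative). */
struct memcg_oom_info {
        unsigned int may_oom:1;         /* set for user faults: OOM allowed */
        unsigned int in_memcg_oom:1;    /* charge failed under memcg OOM */
        unsigned int oom_locked:1;      /* this task holds the memcg OOM lock */
        int wakeups;                    /* memcg->oom_wakeups snapshot at OOM time */
        struct mem_cgroup *wait_on_memcg; /* memcg to sleep on, if any */
};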
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	154
1 file changed, 107 insertions(+), 47 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 04250cbf46c6..4b5cfb509270 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,7 @@ struct mem_cgroup {
 
         bool            oom_lock;
         atomic_t        under_oom;
+        atomic_t        oom_wakeups;
 
         int     swappiness;
         /* OOM-Killer disable */
@@ -2020,6 +2021,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+        atomic_inc(&memcg->oom_wakeups);
         /* for filtering, pass "memcg" as argument. */
         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
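The counter added here is half of a lost-wakeup guard: a charger
snapshots memcg->oom_wakeups before it becomes visible as an OOM
waiter, and later sleeps only if the counter is still unchanged (see
the mem_cgroup_oom() and mem_cgroup_oom_synchronize() hunks below).  A
minimal standalone sketch of the same pattern in userspace C follows;
all names are invented for illustration, this is not kernel code:

#include <pthread.h>
#include <stdatomic.h>

static atomic_int oom_wakeups;  /* plays the role of memcg->oom_wakeups */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;

/* Charge path: snapshot while still able to observe later wakeups. */
static int snapshot_wakeups(void)
{
        return atomic_load(&oom_wakeups);
}

/* End-of-fault path: sleep only if no wakeup arrived since the
 * snapshot.  A single wait (no predicate loop) mirrors the kernel
 * code: a spurious wakeup is harmless because the fault is simply
 * restarted. */
static void oom_synchronize(int snapshot)
{
        pthread_mutex_lock(&lock);
        if (atomic_load(&oom_wakeups) == snapshot)
                pthread_cond_wait(&waitq, &lock);       /* like schedule() */
        pthread_mutex_unlock(&lock);
}

/* Resolve path: bump the counter first, then wake all sleepers,
 * like memcg_wakeup_oom() above. */
static void wakeup_oom(void)
{
        pthread_mutex_lock(&lock);
        atomic_fetch_add(&oom_wakeups, 1);
        pthread_cond_broadcast(&waitq);
        pthread_mutex_unlock(&lock);
}

Because the counter is bumped before the wakeup is issued, a waiter
that re-checks the counter after publishing itself can never sleep
through a wakeup it raced with.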
@@ -2031,19 +2033,17 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 }
 
 /*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
  */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-                                  int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        struct oom_wait_info owait;
         bool locked;
+        int wakeups;
 
-        owait.memcg = memcg;
-        owait.wait.flags = 0;
-        owait.wait.func = memcg_oom_wake_function;
-        owait.wait.private = current;
-        INIT_LIST_HEAD(&owait.wait.task_list);
+        if (!current->memcg_oom.may_oom)
+                return;
+
+        current->memcg_oom.in_memcg_oom = 1;
 
         /*
          * As with any blocking lock, a contender needs to start
@@ -2051,12 +2051,8 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
          * otherwise it can miss the wakeup from the unlock and sleep
          * indefinitely. This is just open-coded because our locking
          * is so particular to memcg hierarchies.
-         *
-         * Even if signal_pending(), we can't quit charge() loop without
-         * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-         * under OOM is always welcomed, use TASK_KILLABLE here.
          */
-        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+        wakeups = atomic_read(&memcg->oom_wakeups);
         mem_cgroup_mark_under_oom(memcg);
 
         locked = mem_cgroup_oom_trylock(memcg);
@@ -2066,15 +2062,95 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 
         if (locked && !memcg->oom_kill_disable) {
                 mem_cgroup_unmark_under_oom(memcg);
-                finish_wait(&memcg_oom_waitq, &owait.wait);
                 mem_cgroup_out_of_memory(memcg, mask, order);
+                mem_cgroup_oom_unlock(memcg);
+                /*
+                 * There is no guarantee that an OOM-lock contender
+                 * sees the wakeups triggered by the OOM kill
+                 * uncharges. Wake any sleepers explicitly.
+                 */
+                memcg_oom_recover(memcg);
         } else {
-                schedule();
-                mem_cgroup_unmark_under_oom(memcg);
-                finish_wait(&memcg_oom_waitq, &owait.wait);
+                /*
+                 * A system call can just return -ENOMEM, but if this
+                 * is a page fault and somebody else is handling the
+                 * OOM already, we need to sleep on the OOM waitqueue
+                 * for this memcg until the situation is resolved.
+                 * Which can take some time because it might be
+                 * handled by a userspace task.
+                 *
+                 * However, this is the charge context, which means
+                 * that we may sit on a large call stack and hold
+                 * various filesystem locks, the mmap_sem etc. and we
+                 * don't want the OOM handler to deadlock on them
+                 * while we sit here and wait. Store the current OOM
+                 * context in the task_struct, then return -ENOMEM.
+                 * At the end of the page fault handler, with the
+                 * stack unwound, pagefault_out_of_memory() will check
+                 * back with us by calling
+                 * mem_cgroup_oom_synchronize(), possibly putting the
+                 * task to sleep.
+                 */
+                current->memcg_oom.oom_locked = locked;
+                current->memcg_oom.wakeups = wakeups;
+                css_get(&memcg->css);
+                current->memcg_oom.wait_on_memcg = memcg;
         }
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+        struct oom_wait_info owait;
+        struct mem_cgroup *memcg;
+
+        /* OOM is global, do not handle */
+        if (!current->memcg_oom.in_memcg_oom)
+                return false;
+
+        /*
+         * We invoked the OOM killer but there is a chance that a kill
+         * did not free up any charges. Everybody else might already
+         * be sleeping, so restart the fault and keep the rampage
+         * going until some charges are released.
+         */
+        memcg = current->memcg_oom.wait_on_memcg;
+        if (!memcg)
+                goto out;
+
+        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+                goto out_memcg;
+
+        owait.memcg = memcg;
+        owait.wait.flags = 0;
+        owait.wait.func = memcg_oom_wake_function;
+        owait.wait.private = current;
+        INIT_LIST_HEAD(&owait.wait.task_list);
 
-        if (locked) {
+        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+        /* Only sleep if we didn't miss any wakeups since OOM */
+        if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+                schedule();
+        finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+        mem_cgroup_unmark_under_oom(memcg);
+        if (current->memcg_oom.oom_locked) {
                 mem_cgroup_oom_unlock(memcg);
                 /*
                  * There is no guarantee that an OOM-lock contender
@@ -2083,11 +2159,10 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
                  */
                 memcg_oom_recover(memcg);
         }
-
-        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-                return false;
-        /* Give chance to dying process */
-        schedule_timeout_uninterruptible(1);
+        css_put(&memcg->css);
+        current->memcg_oom.wait_on_memcg = NULL;
+out:
+        current->memcg_oom.in_memcg_oom = 0;
         return true;
 }
 
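The consumer of mem_cgroup_oom_synchronize() is the generic fault path;
that hunk touches mm/oom_kill.c and is not shown in this
mm/memcontrol.c-limited view.  Paraphrasing the commit message, the
call site plausibly reduces to something like the following sketch
(structure assumed, not quoted from the patch):

void pagefault_out_of_memory(void)
{
        /* Did the -ENOMEM come from a memcg OOM? If so, this either
         * put us to sleep on the memcg's OOM waitqueue or decided to
         * just restart the fault; either way the global OOM killer
         * must not run. */
        if (mem_cgroup_oom_synchronize())
                return;

        /* ... global OOM killer invocation elided ... */
}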
@@ -2400,12 +2475,11 @@ enum {
         CHARGE_RETRY,           /* need to retry but retry is not bad */
         CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
         CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and no enough res. */
-        CHARGE_OOM_DIE,         /* the current is killed because of OOM */
 };
 
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                 unsigned int nr_pages, unsigned int min_pages,
-                                bool oom_check)
+                                bool invoke_oom)
 {
         unsigned long csize = nr_pages * PAGE_SIZE;
         struct mem_cgroup *mem_over_limit;
@@ -2462,14 +2536,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         if (mem_cgroup_wait_acct_move(mem_over_limit))
                 return CHARGE_RETRY;
 
-        /* If we don't need to call oom-killer at el, return immediately */
-        if (!oom_check || !current->memcg_oom.may_oom)
-                return CHARGE_NOMEM;
-        /* check OOM */
-        if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-                return CHARGE_OOM_DIE;
+        if (invoke_oom)
+                mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
-        return CHARGE_RETRY;
+        return CHARGE_NOMEM;
 }
 
 /*
@@ -2572,7 +2642,7 @@ again:
         }
 
         do {
-                bool oom_check;
+                bool invoke_oom = oom && !nr_oom_retries;
 
                 /* If killed, bypass charge */
                 if (fatal_signal_pending(current)) {
@@ -2580,14 +2650,8 @@ again:
                         goto bypass;
                 }
 
-                oom_check = false;
-                if (oom && !nr_oom_retries) {
-                        oom_check = true;
-                        nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-                }
-
-                ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-                                           oom_check);
+                ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+                                           nr_pages, invoke_oom);
                 switch (ret) {
                 case CHARGE_OK:
                         break;
@@ -2600,16 +2664,12 @@ again:
                         css_put(&memcg->css);
                         goto nomem;
                 case CHARGE_NOMEM: /* OOM routine works */
-                        if (!oom) {
+                        if (!oom || invoke_oom) {
                                 css_put(&memcg->css);
                                 goto nomem;
                         }
-                        /* If oom, we never return -ENOMEM */
                         nr_oom_retries--;
                         break;
-                case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-                        css_put(&memcg->css);
-                        goto bypass;
                 }
         } while (ret != CHARGE_OK);
 
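Taken together, the charge-loop changes above amount to a simple
contract: reclaim retries a bounded number of times, the OOM path is
entered at most once, and the charge then fails with -ENOMEM so the
fault handler can unwind and synchronize.  A condensed, compilable
model of that control flow follows; do_charge_once() and charge_model()
are invented stand-ins, not kernel functions:

#include <errno.h>
#include <stdbool.h>

enum { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM };

/* Stand-in for mem_cgroup_do_charge(): may enter the OOM path when
 * invoke_oom is true, but reports CHARGE_NOMEM either way. */
extern int do_charge_once(bool invoke_oom);

static int charge_model(bool oom, int nr_oom_retries)
{
        int ret;

        do {
                bool invoke_oom = oom && !nr_oom_retries;

                ret = do_charge_once(invoke_oom);
                if (ret == CHARGE_NOMEM) {
                        if (!oom || invoke_oom)
                                return -ENOMEM; /* unwind; fault end synchronizes */
                        nr_oom_retries--;
                }
        } while (ret != CHARGE_OK);

        return 0;
}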