author     Johannes Weiner <hannes@cmpxchg.org>            2013-10-16 16:46:59 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-10-17 00:35:53 -0400
commit     4942642080ea82d99ab5b653abb9a12b7ba31f4a (patch)
tree       7ec12b61f0bfdd1f1466e5233b67432828b25c33 /mm/memcontrol.c
parent     c88b05b2cd07221cdefd56f7f7422c1459eb60c9 (diff)
mm: memcg: handle non-error OOM situations more gracefully
Commit 3812c8c8f395 ("mm: memcg: do not trap chargers with full callstack on
OOM") assumed that only a few places that can trigger a memcg OOM situation do
not return VM_FAULT_OOM, like optional page cache readahead.  But there are
many more and it's impractical to annotate them all.

First of all, we don't want to invoke the OOM killer when the failed
allocation is gracefully handled, so defer the actual kill to the end of the
fault handling as well.  This simplifies the code quite a bit for added bonus.

Second, since a failed allocation might not be the abrupt end of the fault,
the memcg OOM handler needs to be re-entrant until the fault finishes for
subsequent allocation attempts.  If an allocation is attempted after the task
already OOMed, allow it to bypass the limit so that it can quickly finish the
fault and invoke the OOM killer.

Reported-by: azurIt <azurit@pobox.sk>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
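
For orientation, the mm/memcontrol.c hunks below only record the OOM context
in the task; the kill-or-wait decision is deferred until the fault has
unwound.  The following is a rough sketch of how that stored state is meant to
be consumed at the end of the fault, modeled on the companion mm/memory.c and
mm/oom_kill.c changes in this patch series.  fault_epilogue_sketch() is a
hypothetical name used only for illustration; task_in_memcg_oom(),
mem_cgroup_oom_synchronize() and pagefault_out_of_memory() are the real
interfaces involved.

/*
 * Illustrative sketch only: the real hooks live in mm/memory.c and
 * mm/oom_kill.c; fault_epilogue_sketch() is a made-up wrapper name.
 */
static int fault_epilogue_sketch(int ret)
{
        /*
         * The charge attempt may have recorded a memcg OOM on the task,
         * but if the fault completed without VM_FAULT_OOM the failure
         * was handled gracefully; only the recorded state is cleaned up.
         */
        if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                mem_cgroup_oom_synchronize(false);
        return ret;
}

void pagefault_out_of_memory(void)
{
        /*
         * The fault did return VM_FAULT_OOM.  With the stack unwound
         * and all locks released, kill or wait on the pending memcg
         * OOM; fall back to the global OOM killer only if no memcg
         * OOM was recorded.
         */
        if (mem_cgroup_oom_synchronize(true))
                return;
        /* ... global OOM handling ... */
}
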
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  139
1 file changed, 51 insertions(+), 88 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5335b2b6be77..65fc6a449841 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2161,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
         memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        bool locked;
-        int wakeups;
-
         if (!current->memcg_oom.may_oom)
                 return;
-
-        current->memcg_oom.in_memcg_oom = 1;
-
         /*
-         * As with any blocking lock, a contender needs to start
-         * listening for wakeups before attempting the trylock,
-         * otherwise it can miss the wakeup from the unlock and sleep
-         * indefinitely. This is just open-coded because our locking
-         * is so particular to memcg hierarchies.
+         * We are in the middle of the charge context here, so we
+         * don't want to block when potentially sitting on a callstack
+         * that holds all kinds of filesystem and mm locks.
+         *
+         * Also, the caller may handle a failed allocation gracefully
+         * (like optional page cache readahead) and so an OOM killer
+         * invocation might not even be necessary.
+         *
+         * That's why we don't do anything here except remember the
+         * OOM context and then deal with it at the end of the page
+         * fault when the stack is unwound, the locks are released,
+         * and when we know whether the fault was overall successful.
         */
-        wakeups = atomic_read(&memcg->oom_wakeups);
-        mem_cgroup_mark_under_oom(memcg);
-
-        locked = mem_cgroup_oom_trylock(memcg);
-
-        if (locked)
-                mem_cgroup_oom_notify(memcg);
-
-        if (locked && !memcg->oom_kill_disable) {
-                mem_cgroup_unmark_under_oom(memcg);
-                mem_cgroup_out_of_memory(memcg, mask, order);
-                mem_cgroup_oom_unlock(memcg);
-                /*
-                 * There is no guarantee that an OOM-lock contender
-                 * sees the wakeups triggered by the OOM kill
-                 * uncharges. Wake any sleepers explicitely.
-                 */
-                memcg_oom_recover(memcg);
-        } else {
-                /*
-                 * A system call can just return -ENOMEM, but if this
-                 * is a page fault and somebody else is handling the
-                 * OOM already, we need to sleep on the OOM waitqueue
-                 * for this memcg until the situation is resolved.
-                 * Which can take some time because it might be
-                 * handled by a userspace task.
-                 *
-                 * However, this is the charge context, which means
-                 * that we may sit on a large call stack and hold
-                 * various filesystem locks, the mmap_sem etc. and we
-                 * don't want the OOM handler to deadlock on them
-                 * while we sit here and wait. Store the current OOM
-                 * context in the task_struct, then return -ENOMEM.
-                 * At the end of the page fault handler, with the
-                 * stack unwound, pagefault_out_of_memory() will check
-                 * back with us by calling
-                 * mem_cgroup_oom_synchronize(), possibly putting the
-                 * task to sleep.
-                 */
-                current->memcg_oom.oom_locked = locked;
-                current->memcg_oom.wakeups = wakeups;
-                css_get(&memcg->css);
-                current->memcg_oom.wait_on_memcg = memcg;
-        }
+        css_get(&memcg->css);
+        current->memcg_oom.memcg = memcg;
+        current->memcg_oom.gfp_mask = mask;
+        current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation. Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+        struct mem_cgroup *memcg = current->memcg_oom.memcg;
         struct oom_wait_info owait;
-        struct mem_cgroup *memcg;
+        bool locked;
 
         /* OOM is global, do not handle */
-        if (!current->memcg_oom.in_memcg_oom)
-                return false;
-
-        /*
-         * We invoked the OOM killer but there is a chance that a kill
-         * did not free up any charges. Everybody else might already
-         * be sleeping, so restart the fault and keep the rampage
-         * going until some charges are released.
-         */
-        memcg = current->memcg_oom.wait_on_memcg;
         if (!memcg)
-                goto out;
+                return false;
 
-        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-                goto out_memcg;
+        if (!handle)
+                goto cleanup;
 
         owait.memcg = memcg;
         owait.wait.flags = 0;
@@ -2273,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
         INIT_LIST_HEAD(&owait.wait.task_list);
 
         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-        /* Only sleep if we didn't miss any wakeups since OOM */
-        if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+        mem_cgroup_mark_under_oom(memcg);
+
+        locked = mem_cgroup_oom_trylock(memcg);
+
+        if (locked)
+                mem_cgroup_oom_notify(memcg);
+
+        if (locked && !memcg->oom_kill_disable) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+                mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                         current->memcg_oom.order);
+        } else {
                 schedule();
-        finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-        mem_cgroup_unmark_under_oom(memcg);
-        if (current->memcg_oom.oom_locked) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+        }
+
+        if (locked) {
                 mem_cgroup_oom_unlock(memcg);
                 /*
                  * There is no guarantee that an OOM-lock contender
@@ -2288,10 +2249,9 @@ out_memcg:
                  */
                 memcg_oom_recover(memcg);
         }
+cleanup:
+        current->memcg_oom.memcg = NULL;
         css_put(&memcg->css);
-        current->memcg_oom.wait_on_memcg = NULL;
-out:
-        current->memcg_oom.in_memcg_oom = 0;
         return true;
 }
 
@@ -2705,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                     || fatal_signal_pending(current)))
                 goto bypass;
 
+        if (unlikely(task_in_memcg_oom(current)))
+                goto bypass;
+
         /*
          * We always charge the cgroup the mm_struct belongs to.
          * The mm_struct's mem_cgroup changes on task migration if the