| author | Johannes Weiner <hannes@cmpxchg.org> | 2013-10-16 16:46:59 -0400 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-10-17 00:35:53 -0400 |
| commit | 4942642080ea82d99ab5b653abb9a12b7ba31f4a | |
| tree | 7ec12b61f0bfdd1f1466e5233b67432828b25c33 /mm/memcontrol.c | |
| parent | c88b05b2cd07221cdefd56f7f7422c1459eb60c9 | |
mm: memcg: handle non-error OOM situations more gracefully
Commit 3812c8c8f395 ("mm: memcg: do not trap chargers with full
callstack on OOM") assumed that only a few places that can trigger a
memcg OOM situation, such as optional page cache readahead, do not
return VM_FAULT_OOM. But there are many more, and it's impractical to
annotate them all.
First of all, we don't want to invoke the OOM killer when the failed
allocation is gracefully handled, so defer the actual kill to the end of
the fault handling as well. As an added bonus, this simplifies the code
quite a bit.
Second, since a failed allocation might not be the abrupt end of the
fault, the memcg OOM handler needs to stay re-entrant for subsequent
allocation attempts until the fault finishes. If an allocation is
attempted after the task has already OOMed, allow it to bypass the limit
so that it can quickly finish the fault and invoke the OOM killer.
Reported-by: azurIt <azurit@pobox.sk>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
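The deferral described in the commit message has a second half outside mm/memcontrol.c: the page fault handler and pagefault_out_of_memory() decide whether to merely clean up the recorded OOM state or to actually kill/wait. Those hunks are excluded by the mm/memcontrol.c filter on this page, so the following is only an illustrative sketch of that hand-off; the wrapper name fault_epilogue() is invented for the example, while task_in_memcg_oom(), mem_cgroup_oom_synchronize() and VM_FAULT_OOM are taken from the patch below.

```c
#include <linux/memcontrol.h>	/* task_in_memcg_oom(), mem_cgroup_oom_synchronize() */
#include <linux/mm.h>		/* VM_FAULT_OOM */
#include <linux/sched.h>	/* current */

/*
 * End of a user-space page fault (sketch): if the task recorded a memcg
 * OOM but the failed charge was handled gracefully (no VM_FAULT_OOM),
 * only the per-task OOM state is cleared and nothing gets killed.
 * fault_epilogue() is an illustrative name, not a function in the patch.
 */
static int fault_epilogue(int ret)
{
	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
		mem_cgroup_oom_synchronize(false);	/* cleanup only */
	return ret;
}

/*
 * Only a fault that really fails with VM_FAULT_OOM ends up in
 * pagefault_out_of_memory().  With the stack unwound and the locks
 * released, it is now safe to kill or wait on behalf of the memcg.
 */
void pagefault_out_of_memory(void)
{
	if (mem_cgroup_oom_synchronize(true))	/* kill or wait */
		return;
	/* ... otherwise fall back to global OOM handling ... */
}
```

A %true return from mem_cgroup_oom_synchronize(true) means a memcg OOM was pending and has been handled, so the global OOM killer is skipped.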
Diffstat (limited to 'mm/memcontrol.c')
| -rw-r--r-- | mm/memcontrol.c | 139 |
1 file changed, 51 insertions(+), 88 deletions(-)
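The hunks below store the OOM context in current->memcg_oom and consult it again from the charge path. The task_struct side of that state lives in include/linux/sched.h and is outside this path-limited view; as a reading aid, here is a rough sketch of the per-task state implied by the accesses in the patch (the field list is inferred, not quoted from the sched.h hunk).

```c
#include <linux/gfp.h>		/* gfp_t */

struct mem_cgroup;		/* opaque here */

/*
 * Sketch of the per-task memcg OOM state used as current->memcg_oom
 * below; the authoritative definition is the task_struct change in
 * include/linux/sched.h, which this mm/memcontrol.c-limited diff
 * does not show.
 */
struct memcg_oom_info {
	struct mem_cgroup *memcg;	/* memcg under OOM, NULL when no OOM is pending */
	gfp_t gfp_mask;			/* gfp mask of the failed charge */
	int order;			/* allocation order of the failed charge */
	unsigned int may_oom:1;		/* set for user faults that may enter memcg OOM */
};
```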
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5335b2b6be77..65fc6a449841 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2161,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 		memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	bool locked;
-	int wakeups;
-
 	if (!current->memcg_oom.may_oom)
 		return;
-
-	current->memcg_oom.in_memcg_oom = 1;
-
 	/*
-	 * As with any blocking lock, a contender needs to start
-	 * listening for wakeups before attempting the trylock,
-	 * otherwise it can miss the wakeup from the unlock and sleep
-	 * indefinitely.  This is just open-coded because our locking
-	 * is so particular to memcg hierarchies.
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
 	 */
-	wakeups = atomic_read(&memcg->oom_wakeups);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges.  Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	} else {
-		/*
-		 * A system call can just return -ENOMEM, but if this
-		 * is a page fault and somebody else is handling the
-		 * OOM already, we need to sleep on the OOM waitqueue
-		 * for this memcg until the situation is resolved.
-		 * Which can take some time because it might be
-		 * handled by a userspace task.
-		 *
-		 * However, this is the charge context, which means
-		 * that we may sit on a large call stack and hold
-		 * various filesystem locks, the mmap_sem etc. and we
-		 * don't want the OOM handler to deadlock on them
-		 * while we sit here and wait.  Store the current OOM
-		 * context in the task_struct, then return -ENOMEM.
-		 * At the end of the page fault handler, with the
-		 * stack unwound, pagefault_out_of_memory() will check
-		 * back with us by calling
-		 * mem_cgroup_oom_synchronize(), possibly putting the
-		 * task to sleep.
-		 */
-		current->memcg_oom.oom_locked = locked;
-		current->memcg_oom.wakeups = wakeups;
-		css_get(&memcg->css);
-		current->memcg_oom.wait_on_memcg = memcg;
-	}
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation.  Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
 	struct oom_wait_info owait;
-	struct mem_cgroup *memcg;
+	bool locked;
 
 	/* OOM is global, do not handle */
-	if (!current->memcg_oom.in_memcg_oom)
-		return false;
-
-	/*
-	 * We invoked the OOM killer but there is a chance that a kill
-	 * did not free up any charges.  Everybody else might already
-	 * be sleeping, so restart the fault and keep the rampage
-	 * going until some charges are released.
-	 */
-	memcg = current->memcg_oom.wait_on_memcg;
 	if (!memcg)
-		goto out;
+		return false;
 
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		goto out_memcg;
+	if (!handle)
+		goto cleanup;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
@@ -2273,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
 	INIT_LIST_HEAD(&owait.wait.task_list);
 
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	/* Only sleep if we didn't miss any wakeups since OOM */
-	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
+	} else {
 		schedule();
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-	mem_cgroup_unmark_under_oom(memcg);
-	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2288,10 +2249,9 @@ out_memcg:
 		 */
 		memcg_oom_recover(memcg);
 	}
+cleanup:
+	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
-	current->memcg_oom.wait_on_memcg = NULL;
-out:
-	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2705,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	    || fatal_signal_pending(current)))
 		goto bypass;
 
+	if (unlikely(task_in_memcg_oom(current)))
+		goto bypass;
+
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the