Diffstat (limited to 'mm/memcontrol.c')

 -rw-r--r--  mm/memcontrol.c | 143
 1 file changed, 55 insertions, 88 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c52ddbc839b..34d3ca9572d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -866,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
         unsigned long val = 0;
         int cpu;
 
+        get_online_cpus();
         for_each_online_cpu(cpu)
                 val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -873,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
         val += memcg->nocpu_base.events[idx];
         spin_unlock(&memcg->pcp_counter_lock);
 #endif
+        put_online_cpus();
         return val;
 }
 
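For reference, here is the patched body of mem_cgroup_read_events(), reassembled purely from the context and '+' lines of the two hunks above; only the comments are added here:

        unsigned long val = 0;
        int cpu;

        get_online_cpus();      /* pin the online cpumask during the walk */
        for_each_online_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
        spin_lock(&memcg->pcp_counter_lock);
        /* counts parked here by the hotplug callback when a CPU died */
        val += memcg->nocpu_base.events[idx];
        spin_unlock(&memcg->pcp_counter_lock);
#endif
        put_online_cpus();
        return val;

Presumably the race being closed: without the get_online_cpus()/put_online_cpus() bracket, a CPU could go offline between the for_each_online_cpu() walk and the nocpu_base fold-in, and its events could be counted twice or not at all, depending on when the hotplug callback moves them.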
@@ -2159,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
         memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        bool locked;
-        int wakeups;
-
         if (!current->memcg_oom.may_oom)
                 return;
-
-        current->memcg_oom.in_memcg_oom = 1;
-
         /*
-         * As with any blocking lock, a contender needs to start
-         * listening for wakeups before attempting the trylock,
-         * otherwise it can miss the wakeup from the unlock and sleep
-         * indefinitely. This is just open-coded because our locking
-         * is so particular to memcg hierarchies.
+         * We are in the middle of the charge context here, so we
+         * don't want to block when potentially sitting on a callstack
+         * that holds all kinds of filesystem and mm locks.
+         *
+         * Also, the caller may handle a failed allocation gracefully
+         * (like optional page cache readahead) and so an OOM killer
+         * invocation might not even be necessary.
+         *
+         * That's why we don't do anything here except remember the
+         * OOM context and then deal with it at the end of the page
+         * fault when the stack is unwound, the locks are released,
+         * and when we know whether the fault was overall successful.
         */
-        wakeups = atomic_read(&memcg->oom_wakeups);
-        mem_cgroup_mark_under_oom(memcg);
-
-        locked = mem_cgroup_oom_trylock(memcg);
-
-        if (locked)
-                mem_cgroup_oom_notify(memcg);
-
-        if (locked && !memcg->oom_kill_disable) {
-                mem_cgroup_unmark_under_oom(memcg);
-                mem_cgroup_out_of_memory(memcg, mask, order);
-                mem_cgroup_oom_unlock(memcg);
-                /*
-                 * There is no guarantee that an OOM-lock contender
-                 * sees the wakeups triggered by the OOM kill
-                 * uncharges. Wake any sleepers explicitely.
-                 */
-                memcg_oom_recover(memcg);
-        } else {
-                /*
-                 * A system call can just return -ENOMEM, but if this
-                 * is a page fault and somebody else is handling the
-                 * OOM already, we need to sleep on the OOM waitqueue
-                 * for this memcg until the situation is resolved.
-                 * Which can take some time because it might be
-                 * handled by a userspace task.
-                 *
-                 * However, this is the charge context, which means
-                 * that we may sit on a large call stack and hold
-                 * various filesystem locks, the mmap_sem etc. and we
-                 * don't want the OOM handler to deadlock on them
-                 * while we sit here and wait. Store the current OOM
-                 * context in the task_struct, then return -ENOMEM.
-                 * At the end of the page fault handler, with the
-                 * stack unwound, pagefault_out_of_memory() will check
-                 * back with us by calling
-                 * mem_cgroup_oom_synchronize(), possibly putting the
-                 * task to sleep.
-                 */
-                current->memcg_oom.oom_locked = locked;
-                current->memcg_oom.wakeups = wakeups;
-                css_get(&memcg->css);
-                current->memcg_oom.wait_on_memcg = memcg;
-        }
+        css_get(&memcg->css);
+        current->memcg_oom.memcg = memcg;
+        current->memcg_oom.gfp_mask = mask;
+        current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation. Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+        struct mem_cgroup *memcg = current->memcg_oom.memcg;
         struct oom_wait_info owait;
-        struct mem_cgroup *memcg;
+        bool locked;
 
         /* OOM is global, do not handle */
-        if (!current->memcg_oom.in_memcg_oom)
-                return false;
-
-        /*
-         * We invoked the OOM killer but there is a chance that a kill
-         * did not free up any charges. Everybody else might already
-         * be sleeping, so restart the fault and keep the rampage
-         * going until some charges are released.
-         */
-        memcg = current->memcg_oom.wait_on_memcg;
         if (!memcg)
-                goto out;
+                return false;
 
-        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-                goto out_memcg;
+        if (!handle)
+                goto cleanup;
 
         owait.memcg = memcg;
         owait.wait.flags = 0;
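mem_cgroup_oom() is now reduced to recording the OOM context in the task. The task_struct side of the series is not in this file's diff; here is a sketch of the per-task state it implies, with field names taken from the usage in this patch (the struct name and exact layout are assumptions):

        /* sketch only -- embedded in task_struct, reached as current->memcg_oom */
        struct memcg_oom_info {
                struct mem_cgroup *memcg;       /* memcg under OOM; css reference held */
                gfp_t gfp_mask;                 /* gfp mask of the failed charge */
                int order;                      /* order of the failed charge */
                unsigned int may_oom:1;         /* set while the charge path allows OOM */
        } memcg_oom;

A non-NULL memcg field doubles as the "OOM pending" flag, which is why the old in_memcg_oom, wait_on_memcg, oom_locked and wakeups bookkeeping can all go away.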
@@ -2271,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
         INIT_LIST_HEAD(&owait.wait.task_list);
 
         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-        /* Only sleep if we didn't miss any wakeups since OOM */
-        if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+        mem_cgroup_mark_under_oom(memcg);
+
+        locked = mem_cgroup_oom_trylock(memcg);
+
+        if (locked)
+                mem_cgroup_oom_notify(memcg);
+
+        if (locked && !memcg->oom_kill_disable) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+                mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                         current->memcg_oom.order);
+        } else {
                 schedule();
-        finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-        mem_cgroup_unmark_under_oom(memcg);
-        if (current->memcg_oom.oom_locked) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+        }
+
+        if (locked) {
                 mem_cgroup_oom_unlock(memcg);
                 /*
                  * There is no guarantee that an OOM-lock contender
@@ -2286,10 +2249,9 @@ out_memcg:
                  */
                 memcg_oom_recover(memcg);
         }
+cleanup:
+        current->memcg_oom.memcg = NULL;
         css_put(&memcg->css);
-        current->memcg_oom.wait_on_memcg = NULL;
-out:
-        current->memcg_oom.in_memcg_oom = 0;
         return true;
 }
 
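The new @handle argument only makes sense together with the end-of-fault call sites, which live in other files of the series. A hypothetical epilogue illustrating the intended contract (fault_epilogue() and fault_ret are illustrative names, not kernel code; the removed comment above already names pagefault_out_of_memory() as the real kill/wait caller):

        static void fault_epilogue(int fault_ret)
        {
                if (!task_in_memcg_oom(current))
                        return;                         /* no memcg OOM recorded */
                if (fault_ret & VM_FAULT_OOM)
                        mem_cgroup_oom_synchronize(true);  /* kill or wait now */
                else
                        mem_cgroup_oom_synchronize(false); /* fault succeeded, drop state */
        }

Note that all the work that used to happen in the charge context (trylock, notify, mem_cgroup_out_of_memory() or schedule()) has moved wholesale into the handle=true path, where the stack is unwound and no locks are held.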
@@ -2703,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
             || fatal_signal_pending(current)))
                 goto bypass;
 
+        if (unlikely(task_in_memcg_oom(current)))
+                goto bypass;
+
         /*
          * We always charge the cgroup the mm_struct belongs to.
          * The mm_struct's mem_cgroup changes on task migration if the
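task_in_memcg_oom() is not defined in this hunk. Since current->memcg_oom.memcg is the OOM-pending marker (mem_cgroup_oom_synchronize() bails out when it is NULL), a plausible definition, offered as a sketch of what the header side of the series provides:

        static inline bool task_in_memcg_oom(struct task_struct *p)
        {
                /* non-NULL only while a memcg OOM is on record for @p */
                return p->memcg_oom.memcg;
        }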
@@ -2801,6 +2766,8 @@ done:
         return 0;
 nomem:
         *ptr = NULL;
+        if (gfp_mask & __GFP_NOFAIL)
+                return 0;
         return -ENOMEM;
 bypass:
         *ptr = root_mem_cgroup;
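Both __mem_cgroup_try_charge() hunks back the deferred-OOM scheme up: a task that already has a memcg OOM on record bypasses further charging (to root_mem_cgroup) so it can unwind to the end of the fault instead of re-entering OOM, and a __GFP_NOFAIL charger, which is not allowed to fail, now leaves the nomem path with success and *ptr left NULL rather than with an -ENOMEM it could not honor anyway.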