 include/linux/memcontrol.h |  50
 include/linux/sched.h      |   7
 mm/filemap.c               |  11
 mm/memcontrol.c            | 139
 mm/memory.c                |  18
 mm/oom_kill.c              |   2
 6 files changed, 79 insertions(+), 148 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ecc82b37c4cc..b3e7a667e03c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
 					struct page *newpage);
 
-/**
- * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
- * @new: true to enable, false to disable
- *
- * Toggle whether a failed memcg charge should invoke the OOM killer
- * or just return -ENOMEM.  Returns the previous toggle state.
- *
- * NOTE: Any path that enables the OOM killer before charging must
- *       call mem_cgroup_oom_synchronize() afterward to finalize the
- *       OOM handling and clean up.
- */
-static inline bool mem_cgroup_toggle_oom(bool new)
+static inline void mem_cgroup_oom_enable(void)
 {
-	bool old;
-
-	old = current->memcg_oom.may_oom;
-	current->memcg_oom.may_oom = new;
-
-	return old;
+	WARN_ON(current->memcg_oom.may_oom);
+	current->memcg_oom.may_oom = 1;
 }
 
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
-	bool old = mem_cgroup_toggle_oom(true);
-
-	WARN_ON(old == true);
-}
-
-static inline void mem_cgroup_disable_oom(void)
-{
-	bool old = mem_cgroup_toggle_oom(false);
-
-	WARN_ON(old == false);
+	WARN_ON(!current->memcg_oom.may_oom);
+	current->memcg_oom.may_oom = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
-	return p->memcg_oom.in_memcg_oom;
+	return p->memcg_oom.memcg;
 }
 
-bool mem_cgroup_oom_synchronize(void);
+bool mem_cgroup_oom_synchronize(bool wait);
 
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
@@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 {
 }
 
-static inline bool mem_cgroup_toggle_oom(bool new)
+static inline void mem_cgroup_oom_enable(void)
 {
-	return false;
-}
-
-static inline void mem_cgroup_enable_oom(void)
-{
 }
 
-static inline void mem_cgroup_disable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
 }
 
@@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 	return false;
 }
 
-static inline bool mem_cgroup_oom_synchronize(void)
+static inline bool mem_cgroup_oom_synchronize(bool wait)
 {
 	return false;
 }
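
[Commentary] The removed toggle API returned the previous state so callers could save and restore it; the new enable/disable pair is deliberately non-nestable, with the WARN_ONs catching unbalanced calls. A minimal userspace sketch of that invariant follows — assert() stands in for WARN_ON() and a plain static flag for current->memcg_oom.may_oom; this is an illustrative model, not kernel code:

	#include <assert.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Stand-in for current->memcg_oom.may_oom. */
	static bool may_oom;

	/* Mirrors mem_cgroup_oom_enable(). */
	static void oom_enable(void)
	{
		assert(!may_oom);	/* catches a nested or repeated enable */
		may_oom = true;
	}

	/* Mirrors mem_cgroup_oom_disable(). */
	static void oom_disable(void)
	{
		assert(may_oom);	/* catches a disable without enable */
		may_oom = false;
	}

	int main(void)
	{
		oom_enable();	/* fault entry, FAULT_FLAG_USER */
		/* ... charges happen here; mem_cgroup_oom() checks may_oom ... */
		oom_disable();	/* fault exit */
		puts("enable/disable correctly paired");
		return 0;
	}

The reading here is that the patch relies on user page faults not nesting in process context, so a strict on/off pair suffices and any imbalance is a bug worth warning about.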
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6682da36b293..e27baeeda3f4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1394,11 +1394,10 @@ struct task_struct {
 	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
 	struct memcg_oom_info {
+		struct mem_cgroup *memcg;
+		gfp_t gfp_mask;
+		int order;
 		unsigned int may_oom:1;
-		unsigned int in_memcg_oom:1;
-		unsigned int oom_locked:1;
-		int wakeups;
-		struct mem_cgroup *wait_on_memcg;
 	} memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..ae4846ff4849 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
-	bool memcg_oom;
 	pgoff_t size;
 	int ret = 0;
 
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Do we have something in the page cache already? Either
-	 * way, try readahead, but disable the memcg OOM killer for it
-	 * as readahead is optional and no errors are propagated up
-	 * the fault stack. The OOM killer is enabled while trying to
-	 * instantiate the faulting page individually below.
+	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 	} else if (!page) {
 		/* No page in the page cache at all */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_sync_mmap_readahead(vma, ra, file, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 		count_vm_event(PGMAJFAULT);
 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
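
[Commentary] The toggling around readahead becomes unnecessary under the deferred scheme: a failed charge during readahead no longer invokes the OOM killer in place, it only records state, and a fault that still succeeds discards that state. A before/after sketch (the "before" lines are the ones removed above; the behavior described in the "after" comment follows from the mm/memcontrol.c and mm/memory.c hunks below):

	/* Before: suppress the killer around optional allocations. */
	memcg_oom = mem_cgroup_toggle_oom(false);
	do_sync_mmap_readahead(vma, ra, file, offset);
	mem_cgroup_toggle_oom(memcg_oom);

	/* After: no guard needed. If a readahead charge fails,
	 * mem_cgroup_oom() merely records the OOM context; since the
	 * fault still succeeds (no VM_FAULT_OOM), handle_mm_fault()
	 * ends with mem_cgroup_oom_synchronize(false) and nothing
	 * gets killed. */
	do_sync_mmap_readahead(vma, ra, file, offset);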
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5335b2b6be77..65fc6a449841 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2161,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 	memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	bool locked;
-	int wakeups;
-
 	if (!current->memcg_oom.may_oom)
 		return;
-
-	current->memcg_oom.in_memcg_oom = 1;
-
 	/*
-	 * As with any blocking lock, a contender needs to start
-	 * listening for wakeups before attempting the trylock,
-	 * otherwise it can miss the wakeup from the unlock and sleep
-	 * indefinitely.  This is just open-coded because our locking
-	 * is so particular to memcg hierarchies.
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
 	 */
-	wakeups = atomic_read(&memcg->oom_wakeups);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges.  Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	} else {
-		/*
-		 * A system call can just return -ENOMEM, but if this
-		 * is a page fault and somebody else is handling the
-		 * OOM already, we need to sleep on the OOM waitqueue
-		 * for this memcg until the situation is resolved.
-		 * Which can take some time because it might be
-		 * handled by a userspace task.
-		 *
-		 * However, this is the charge context, which means
-		 * that we may sit on a large call stack and hold
-		 * various filesystem locks, the mmap_sem etc. and we
-		 * don't want the OOM handler to deadlock on them
-		 * while we sit here and wait.  Store the current OOM
-		 * context in the task_struct, then return -ENOMEM.
-		 * At the end of the page fault handler, with the
-		 * stack unwound, pagefault_out_of_memory() will check
-		 * back with us by calling
-		 * mem_cgroup_oom_synchronize(), possibly putting the
-		 * task to sleep.
-		 */
-		current->memcg_oom.oom_locked = locked;
-		current->memcg_oom.wakeups = wakeups;
-		css_get(&memcg->css);
-		current->memcg_oom.wait_on_memcg = memcg;
-	}
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
 */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
 	struct oom_wait_info owait;
-	struct mem_cgroup *memcg;
+	bool locked;
 
 	/* OOM is global, do not handle */
-	if (!current->memcg_oom.in_memcg_oom)
-		return false;
-
-	/*
-	 * We invoked the OOM killer but there is a chance that a kill
-	 * did not free up any charges.  Everybody else might already
-	 * be sleeping, so restart the fault and keep the rampage
-	 * going until some charges are released.
-	 */
-	memcg = current->memcg_oom.wait_on_memcg;
 	if (!memcg)
-		goto out;
+		return false;
 
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		goto out_memcg;
+	if (!handle)
+		goto cleanup;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
@@ -2273,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
 	INIT_LIST_HEAD(&owait.wait.task_list);
 
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	/* Only sleep if we didn't miss any wakeups since OOM */
-	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
+	} else {
 		schedule();
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-	mem_cgroup_unmark_under_oom(memcg);
-	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2288,10 +2249,9 @@ out_memcg:
 		 */
 		memcg_oom_recover(memcg);
 	}
+cleanup:
+	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
-	current->memcg_oom.wait_on_memcg = NULL;
-out:
-	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2705,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	    || fatal_signal_pending(current)))
 		goto bypass;
 
+	if (unlikely(task_in_memcg_oom(current)))
+		goto bypass;
+
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
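
[Commentary] To make the new division of labor concrete — mem_cgroup_oom() only records state and pins the memcg, mem_cgroup_oom_synchronize() later either acts on it or just cleans up — here is a compilable userspace model. The types, the css reference counting, and all names are simplified stand-ins; the waitqueue, trylock, and notification machinery are elided entirely:

	#include <stdbool.h>
	#include <stdio.h>

	struct mem_cgroup {
		int css_refs;		/* stand-in for the css reference count */
	};

	/* Stand-in for the reworked task_struct::memcg_oom. */
	static struct {
		struct mem_cgroup *memcg;
		unsigned int gfp_mask;
		int order;
		bool may_oom;
	} task_oom;

	/* mem_cgroup_oom(): runs deep in the charge path, so it must not
	 * block -- it only pins the memcg and records the OOM context. */
	static void record_oom(struct mem_cgroup *memcg, unsigned int mask,
			       int order)
	{
		if (!task_oom.may_oom)
			return;
		memcg->css_refs++;	/* css_get() */
		task_oom.memcg = memcg;
		task_oom.gfp_mask = mask;
		task_oom.order = order;
	}

	/* mem_cgroup_oom_synchronize(): runs after the fault stack is
	 * unwound; the real kill-or-sleep logic is elided here. */
	static bool oom_synchronize(bool handle)
	{
		struct mem_cgroup *memcg = task_oom.memcg;

		if (!memcg)
			return false;	/* OOM is global, not ours */
		if (!handle)
			goto cleanup;
		printf("would invoke the OOM killer or wait (order %d)\n",
		       task_oom.order);
	cleanup:
		task_oom.memcg = NULL;
		memcg->css_refs--;	/* css_put() */
		return true;
	}

	int main(void)
	{
		struct mem_cgroup cg = { 0 };

		task_oom.may_oom = true;	/* mem_cgroup_oom_enable() */
		record_oom(&cg, 0, 0);		/* a charge failed under OOM */
		task_oom.may_oom = false;	/* mem_cgroup_oom_disable() */

		/* The fault recovered (no VM_FAULT_OOM): clean up only. */
		oom_synchronize(false);
		printf("css refs after cleanup: %d (no leak)\n", cg.css_refs);
		return 0;
	}

Note how both exits of oom_synchronize() drop the reference taken in record_oom(), mirroring the cleanup label in the patched kernel function — the memcg pointer in the task doubles as the "OOM pending" flag and as the holder of the css pin.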
diff --git a/mm/memory.c b/mm/memory.c
index f7b7692c05ed..1311f26497e6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3865,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * space.  Kernel faults are handled more gracefully.
 	 */
 	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_enable_oom();
+		mem_cgroup_oom_enable();
 
 	ret = __handle_mm_fault(mm, vma, address, flags);
 
-	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_disable_oom();
-
-	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
-		mem_cgroup_oom_synchronize();
+	if (flags & FAULT_FLAG_USER) {
+		mem_cgroup_oom_disable();
+		/*
+		 * The task may have entered a memcg OOM situation but
+		 * if the allocation error was handled gracefully (no
+		 * VM_FAULT_OOM), there is no need to kill anything.
+		 * Just clean up the OOM state peacefully.
+		 */
+		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+			mem_cgroup_oom_synchronize(false);
+	}
 
 	return ret;
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d274381..6738c47f1f72 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
 {
 	struct zonelist *zonelist;
 
-	if (mem_cgroup_oom_synchronize())
+	if (mem_cgroup_oom_synchronize(true))
 		return;
 
 	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
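
[Commentary] Putting the pieces together, a sketch of the two possible call sequences after this patch — this is a reading of the hunks above, not additional code. Note also the new task_in_memcg_oom() bypass in __mem_cgroup_try_charge(), which keeps a task that already carries a pending OOM context from triggering fresh OOM handling on a retry:

	/*
	 * Fault handled gracefully (e.g. an optional readahead charge failed):
	 *
	 *   handle_mm_fault()
	 *     mem_cgroup_oom_enable()
	 *     __handle_mm_fault()
	 *       __mem_cgroup_try_charge() fails
	 *         mem_cgroup_oom()        <- records memcg, gfp_mask, order
	 *     mem_cgroup_oom_disable()
	 *     mem_cgroup_oom_synchronize(false)  <- no VM_FAULT_OOM: clean up
	 *
	 * Fault returns VM_FAULT_OOM:
	 *
	 *   handle_mm_fault()              <- OOM state left in place
	 *   pagefault_out_of_memory()
	 *     mem_cgroup_oom_synchronize(true)   <- trylock, notify, then
	 *                                           kill or sleep, clean up
	 */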