-rw-r--r--  include/linux/memcontrol.h |  50
-rw-r--r--  include/linux/sched.h      |   7
-rw-r--r--  mm/filemap.c               |  11
-rw-r--r--  mm/memcontrol.c            | 139
-rw-r--r--  mm/memory.c                |  18
-rw-r--r--  mm/oom_kill.c              |   2
6 files changed, 79 insertions, 148 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ecc82b37c4cc..b3e7a667e03c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
 					struct page *newpage);
 
-/**
- * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
- * @new: true to enable, false to disable
- *
- * Toggle whether a failed memcg charge should invoke the OOM killer
- * or just return -ENOMEM. Returns the previous toggle state.
- *
- * NOTE: Any path that enables the OOM killer before charging must
- *       call mem_cgroup_oom_synchronize() afterward to finalize the
- *       OOM handling and clean up.
- */
-static inline bool mem_cgroup_toggle_oom(bool new)
+static inline void mem_cgroup_oom_enable(void)
 {
-	bool old;
-
-	old = current->memcg_oom.may_oom;
-	current->memcg_oom.may_oom = new;
-
-	return old;
+	WARN_ON(current->memcg_oom.may_oom);
+	current->memcg_oom.may_oom = 1;
 }
 
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
-	bool old = mem_cgroup_toggle_oom(true);
-
-	WARN_ON(old == true);
-}
-
-static inline void mem_cgroup_disable_oom(void)
-{
-	bool old = mem_cgroup_toggle_oom(false);
-
-	WARN_ON(old == false);
+	WARN_ON(!current->memcg_oom.may_oom);
+	current->memcg_oom.may_oom = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
-	return p->memcg_oom.in_memcg_oom;
+	return p->memcg_oom.memcg;
 }
 
-bool mem_cgroup_oom_synchronize(void);
+bool mem_cgroup_oom_synchronize(bool wait);
 
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
@@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 {
 }
 
-static inline bool mem_cgroup_toggle_oom(bool new)
-{
-	return false;
-}
-
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_enable(void)
 {
 }
 
-static inline void mem_cgroup_disable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
 }
 
@@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 	return false;
 }
 
-static inline bool mem_cgroup_oom_synchronize(void)
+static inline bool mem_cgroup_oom_synchronize(bool wait)
 {
 	return false;
 }
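
The hunks above replace the save/restore helper mem_cgroup_toggle_oom() with a strict mem_cgroup_oom_enable()/mem_cgroup_oom_disable() pair, and make task_in_memcg_oom() report whether an OOM context has been recorded on the task rather than testing a separate flag. The following is a minimal userspace sketch of that contract, not kernel code: current->memcg_oom is modelled by a plain global and WARN_ON() by assert(), purely to illustrate that enable/disable must not nest and that "in OOM" now simply means "a memcg OOM context is recorded".

/* Userspace model of the reworked task-local memcg OOM interface. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct mem_cgroup { const char *name; };	/* stand-in for the kernel type */

static struct {
	struct mem_cgroup *memcg;	/* recorded OOM context, NULL if none */
	unsigned int gfp_mask;
	int order;
	bool may_oom;			/* set only for the duration of a user fault */
} memcg_oom;				/* models current->memcg_oom */

static void mem_cgroup_oom_enable(void)
{
	assert(!memcg_oom.may_oom);	/* models WARN_ON(): enable must not nest */
	memcg_oom.may_oom = true;
}

static void mem_cgroup_oom_disable(void)
{
	assert(memcg_oom.may_oom);
	memcg_oom.may_oom = false;
}

static bool task_in_memcg_oom(void)
{
	return memcg_oom.memcg != NULL;	/* "in OOM" == a context was recorded */
}

int main(void)
{
	struct mem_cgroup cg = { "example" };

	mem_cgroup_oom_enable();	/* start of a userspace page fault */
	memcg_oom.memcg = &cg;		/* a failed charge records, never kills here */
	memcg_oom.gfp_mask = 0;
	memcg_oom.order = 0;
	mem_cgroup_oom_disable();	/* fault body done */

	printf("memcg OOM pending after fault: %d\n", task_in_memcg_oom());
	return 0;
}
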
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6682da36b293..e27baeeda3f4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1394,11 +1394,10 @@ struct task_struct {
 	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
 	struct memcg_oom_info {
+		struct mem_cgroup *memcg;
+		gfp_t gfp_mask;
+		int order;
 		unsigned int may_oom:1;
-		unsigned int in_memcg_oom:1;
-		unsigned int oom_locked:1;
-		int wakeups;
-		struct mem_cgroup *wait_on_memcg;
 	} memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..ae4846ff4849 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
-	bool memcg_oom;
 	pgoff_t size;
 	int ret = 0;
 
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Do we have something in the page cache already? Either
-	 * way, try readahead, but disable the memcg OOM killer for it
-	 * as readahead is optional and no errors are propagated up
-	 * the fault stack. The OOM killer is enabled while trying to
-	 * instantiate the faulting page individually below.
+	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 	} else if (!page) {
 		/* No page in the page cache at all */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_sync_mmap_readahead(vma, ra, file, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 		count_vm_event(PGMAJFAULT);
 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5335b2b6be77..65fc6a449841 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2161,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 	memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	bool locked;
-	int wakeups;
-
 	if (!current->memcg_oom.may_oom)
 		return;
-
-	current->memcg_oom.in_memcg_oom = 1;
-
 	/*
-	 * As with any blocking lock, a contender needs to start
-	 * listening for wakeups before attempting the trylock,
-	 * otherwise it can miss the wakeup from the unlock and sleep
-	 * indefinitely. This is just open-coded because our locking
-	 * is so particular to memcg hierarchies.
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
 	 */
-	wakeups = atomic_read(&memcg->oom_wakeups);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges. Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	} else {
-		/*
-		 * A system call can just return -ENOMEM, but if this
-		 * is a page fault and somebody else is handling the
-		 * OOM already, we need to sleep on the OOM waitqueue
-		 * for this memcg until the situation is resolved.
-		 * Which can take some time because it might be
-		 * handled by a userspace task.
-		 *
-		 * However, this is the charge context, which means
-		 * that we may sit on a large call stack and hold
-		 * various filesystem locks, the mmap_sem etc. and we
-		 * don't want the OOM handler to deadlock on them
-		 * while we sit here and wait. Store the current OOM
-		 * context in the task_struct, then return -ENOMEM.
-		 * At the end of the page fault handler, with the
-		 * stack unwound, pagefault_out_of_memory() will check
-		 * back with us by calling
-		 * mem_cgroup_oom_synchronize(), possibly putting the
-		 * task to sleep.
-		 */
-		current->memcg_oom.oom_locked = locked;
-		current->memcg_oom.wakeups = wakeups;
-		css_get(&memcg->css);
-		current->memcg_oom.wait_on_memcg = memcg;
-	}
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation. Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
 	struct oom_wait_info owait;
-	struct mem_cgroup *memcg;
+	bool locked;
 
 	/* OOM is global, do not handle */
-	if (!current->memcg_oom.in_memcg_oom)
-		return false;
-
-	/*
-	 * We invoked the OOM killer but there is a chance that a kill
-	 * did not free up any charges. Everybody else might already
-	 * be sleeping, so restart the fault and keep the rampage
-	 * going until some charges are released.
-	 */
-	memcg = current->memcg_oom.wait_on_memcg;
 	if (!memcg)
-		goto out;
+		return false;
 
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		goto out_memcg;
+	if (!handle)
+		goto cleanup;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
@@ -2273,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
 	INIT_LIST_HEAD(&owait.wait.task_list);
 
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	/* Only sleep if we didn't miss any wakeups since OOM */
-	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
+	} else {
 		schedule();
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-	mem_cgroup_unmark_under_oom(memcg);
-	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2288,10 +2249,9 @@ out_memcg:
 		 */
 		memcg_oom_recover(memcg);
 	}
+cleanup:
+	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
-	current->memcg_oom.wait_on_memcg = NULL;
-out:
-	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2705,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		|| fatal_signal_pending(current)))
 		goto bypass;
 
+	if (unlikely(task_in_memcg_oom(current)))
+		goto bypass;
+
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
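
The memcontrol.c changes above split memcg OOM handling into two phases: mem_cgroup_oom() only records the OOM context on the task during the charge (and __mem_cgroup_try_charge() bypasses further charges for a task that already has one recorded), while mem_cgroup_oom_synchronize() does the actual kill-or-wait once the page fault stack has unwound. Below is a compressed userspace model of that two-phase flow; the OOM lock, waitqueue and notifier machinery are deliberately reduced to print statements, so this is an illustration of the control flow, not the kernel implementation.

/* Two-phase model: remember the OOM context while charging, act on it later. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct mem_cgroup { bool oom_kill_disable; };

static struct {
	struct mem_cgroup *memcg;	/* models current->memcg_oom.memcg */
	bool may_oom;			/* models current->memcg_oom.may_oom */
} memcg_oom;

/* Phase 1: called from the charge path; never blocks, never kills. */
static void mem_cgroup_oom(struct mem_cgroup *memcg)
{
	if (!memcg_oom.may_oom)
		return;
	memcg_oom.memcg = memcg;	/* just remember the context */
}

/* Phase 2: called after the fault unwinds, with no charge-path locks held. */
static bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = memcg_oom.memcg;

	if (!memcg)
		return false;		/* OOM is global, not ours to handle */
	if (handle) {
		if (!memcg->oom_kill_disable)
			printf("invoke the memcg OOM killer\n");
		else
			printf("sleep until a userspace OOM handler resolves it\n");
	}
	memcg_oom.memcg = NULL;		/* cleanup happens in both cases */
	return true;
}

int main(void)
{
	struct mem_cgroup cg = { .oom_kill_disable = false };

	memcg_oom.may_oom = true;	/* userspace fault in progress */
	mem_cgroup_oom(&cg);		/* the charge failed */
	memcg_oom.may_oom = false;	/* charge context unwound */
	mem_cgroup_oom_synchronize(true); /* fault ended in VM_FAULT_OOM */
	return 0;
}
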
diff --git a/mm/memory.c b/mm/memory.c
index f7b7692c05ed..1311f26497e6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3865,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * space. Kernel faults are handled more gracefully.
 	 */
 	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_enable_oom();
+		mem_cgroup_oom_enable();
 
 	ret = __handle_mm_fault(mm, vma, address, flags);
 
-	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_disable_oom();
-
-	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
-		mem_cgroup_oom_synchronize();
+	if (flags & FAULT_FLAG_USER) {
+		mem_cgroup_oom_disable();
+		/*
+		 * The task may have entered a memcg OOM situation but
+		 * if the allocation error was handled gracefully (no
+		 * VM_FAULT_OOM), there is no need to kill anything.
+		 * Just clean up the OOM state peacefully.
+		 */
+		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+			mem_cgroup_oom_synchronize(false);
+	}
 
 	return ret;
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d274381..6738c47f1f72 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
 {
 	struct zonelist *zonelist;
 
-	if (mem_cgroup_oom_synchronize())
+	if (mem_cgroup_oom_synchronize(true))
 		return;
 
 	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
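
Taken together, the memory.c and oom_kill.c hunks show the two callers of the new mem_cgroup_oom_synchronize(): handle_mm_fault() passes false to quietly drop a recorded OOM context when the fault still succeeded, and pagefault_out_of_memory() passes true to kill or wait before considering the global OOM killer. The sketch below models just that decision, with the recorded state reduced to a boolean (memcg_oom_pending stands in for task_in_memcg_oom(current)); it is an assumption-laden illustration, not kernel code.

/* Who calls mem_cgroup_oom_synchronize(), and with which argument. */
#include <stdbool.h>
#include <stdio.h>

#define VM_FAULT_OOM 0x0001

static bool memcg_oom_pending;		/* stands in for task_in_memcg_oom(current) */

static bool mem_cgroup_oom_synchronize(bool handle)
{
	if (!memcg_oom_pending)
		return false;
	fputs(handle ? "kill or wait on the memcg OOM\n"
		     : "drop the stale memcg OOM context\n", stdout);
	memcg_oom_pending = false;
	return true;
}

/* Tail of handle_mm_fault(): the fault was handled, so clean up quietly. */
static void fault_tail(int ret)
{
	if (memcg_oom_pending && !(ret & VM_FAULT_OOM))
		mem_cgroup_oom_synchronize(false);
}

/*
 * pagefault_out_of_memory(): the fault returned VM_FAULT_OOM, so handle the
 * memcg OOM for real and fall back to the global OOM killer only if the
 * failure was not a memcg one.
 */
static void pagefault_out_of_memory(void)
{
	if (mem_cgroup_oom_synchronize(true))
		return;
	puts("global OOM killer");
}

int main(void)
{
	memcg_oom_pending = true;	/* a charge failed during the fault */
	fault_tail(VM_FAULT_OOM);	/* nothing dropped: the fault is OOM */
	pagefault_out_of_memory();	/* the memcg OOM is handled here */
	return 0;
}
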