aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
author Johannes Weiner <hannes@cmpxchg.org> 2013-09-12 18:13:44 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2013-09-12 18:38:02 -0400
commit 3812c8c8f3953921ef18544110dafc3505c1ac62 (patch)
tree 8e5efc15fec4700644774df5fb5302f5c82f4a31 /include/linux
parent fb2a6fc56be66c169f8b80e07ed999ba453a2db2 (diff)
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a task fails to charge memory, it invokes the OOM killer and loops right there in the charge code until it succeeds. Comparably, any other task that enters the charge path at this point will go to a waitqueue right then and there and sleep until the OOM situation is resolved. The problem is that these tasks may hold filesystem locks and the mmap_sem; locks that the selected OOM victim may need to exit.

For example, in one reported case, the task invoking the OOM killer was about to charge a page cache page during a write(), which holds the i_mutex. The OOM killer selected a task that was just entering truncate() and trying to acquire the i_mutex:

OOM invoking task:
  mem_cgroup_handle_oom+0x241/0x3b0
  mem_cgroup_cache_charge+0xbe/0xe0
  add_to_page_cache_locked+0x4c/0x140
  add_to_page_cache_lru+0x22/0x50
  grab_cache_page_write_begin+0x8b/0xe0
  ext3_write_begin+0x88/0x270
  generic_file_buffered_write+0x116/0x290
  __generic_file_aio_write+0x27c/0x480
  generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
  do_sync_write+0xea/0x130
  vfs_write+0xf3/0x1f0
  sys_write+0x51/0x90
  system_call_fastpath+0x18/0x1d

OOM kill victim:
  do_truncate+0x58/0xa0 # takes i_mutex
  do_last+0x250/0xa30
  path_openat+0xd7/0x440
  do_filp_open+0x49/0xa0
  do_sys_open+0x106/0x240
  sys_open+0x20/0x30
  system_call_fastpath+0x18/0x1d

The OOM handling task will retry the charge indefinitely while the OOM killed task is not releasing any resources.

A similar scenario can happen when the kernel OOM killer for a memcg is disabled and a userspace task is in charge of resolving OOM situations. In this case, ALL tasks that enter the OOM path will be made to sleep on the OOM waitqueue and wait for userspace to free resources or increase the group's limit. But a userspace OOM handler is prone to deadlock itself on the locks held by the waiting tasks. For example, one of the sleeping tasks may be stuck in a brk() call with the mmap_sem held for writing but the userspace handler, in order to pick an optimal victim, may need to read files from /proc/<pid>, which tries to acquire the same mmap_sem for reading and deadlocks.

This patch changes the way tasks behave after detecting a memcg OOM and makes sure nobody loops or sleeps with locks held:

1. When OOMing in a user fault, invoke the OOM killer and restart the fault instead of looping on the charge attempt. This way, the OOM victim cannot get stuck on locks the looping task may hold.

2. When OOMing in a user fault but somebody else is handling it (either the kernel OOM killer or a userspace handler), don't go to sleep in the charge context. Instead, remember the OOMing memcg in the task struct and then fully unwind the page fault stack with -ENOMEM. pagefault_out_of_memory() will then call back into the memcg code to check if the -ENOMEM came from the memcg, and then either put the task to sleep on the memcg's OOM waitqueue or just restart the fault. The OOM victim can no longer get stuck on any lock a sleeping task may hold.

Debugged by Michal Hocko.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include/linux')
-rw-r--r-- include/linux/memcontrol.h 21
-rw-r--r-- include/linux/sched.h 4
2 files changed, 25 insertions, 0 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34ac6497d01a..89d576cfcc4c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -157,6 +157,10 @@ extern void mem_cgroup_replace_page_cache(struct page *oldpage,
157 * 157 *
158 * Toggle whether a failed memcg charge should invoke the OOM killer 158 * Toggle whether a failed memcg charge should invoke the OOM killer
159 * or just return -ENOMEM. Returns the previous toggle state. 159 * or just return -ENOMEM. Returns the previous toggle state.
160 *
161 * NOTE: Any path that enables the OOM killer before charging must
162 * call mem_cgroup_oom_synchronize() afterward to finalize the
163 * OOM handling and clean up.
160 */ 164 */
161static inline bool mem_cgroup_toggle_oom(bool new) 165static inline bool mem_cgroup_toggle_oom(bool new)
162{ 166{
@@ -182,6 +186,13 @@ static inline void mem_cgroup_disable_oom(void)
182 WARN_ON(old == false); 186 WARN_ON(old == false);
183} 187}
184 188
/*
 * task_in_memcg_oom - report the in_memcg_oom flag of task @p.
 * @p: task whose memcg_oom state is inspected.
 *
 * Returns the value of p->memcg_oom.in_memcg_oom converted to bool.
 *
 * NOTE(review): going by the commit message, the flag presumably marks a
 * task whose charge hit a memcg OOM and that is unwinding its page fault
 * with -ENOMEM -- confirm against the setter in mm/memcontrol.c.
 */
189static inline bool task_in_memcg_oom(struct task_struct *p)
190{
191 return p->memcg_oom.in_memcg_oom;
192}
193
/*
 * mem_cgroup_oom_synchronize - finalize memcg OOM handling for a path that
 * enabled the OOM killer before charging (see the NOTE on
 * mem_cgroup_toggle_oom()). Takes no arguments and returns a bool.
 *
 * NOTE(review): the meaning of the return value is not visible in this
 * header -- confirm against the definition in mm/memcontrol.c.
 */
194bool mem_cgroup_oom_synchronize(void);
195
185#ifdef CONFIG_MEMCG_SWAP 196#ifdef CONFIG_MEMCG_SWAP
186extern int do_swap_account; 197extern int do_swap_account;
187#endif 198#endif
@@ -427,6 +438,16 @@ static inline void mem_cgroup_disable_oom(void)
427{ 438{
428} 439}
429 440
/*
 * task_in_memcg_oom - stub variant: ignores @p and always returns false.
 *
 * NOTE(review): this hunk sits right after the empty
 * mem_cgroup_disable_oom() stub, so it is presumably the build without
 * memcg support -- confirm the enclosing #ifdef.
 */
441static inline bool task_in_memcg_oom(struct task_struct *p)
442{
443 return false;
444}
445
/*
 * mem_cgroup_oom_synchronize - stub variant: does nothing and always
 * returns false.
 *
 * NOTE(review): presumably the counterpart used when memcg support is
 * compiled out -- confirm the enclosing #ifdef.
 */
446static inline bool mem_cgroup_oom_synchronize(void)
447{
448 return false;
449}
450
430static inline void mem_cgroup_inc_page_stat(struct page *page, 451static inline void mem_cgroup_inc_page_stat(struct page *page,
431 enum mem_cgroup_page_stat_item idx) 452 enum mem_cgroup_page_stat_item idx)
432{ 453{
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9ce1fa53031f..6682da36b293 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1395,6 +1395,10 @@ struct task_struct {
1395 unsigned int memcg_kmem_skip_account; 1395 unsigned int memcg_kmem_skip_account;
/*
 * Per-task memcg OOM state, embedded in struct task_struct as memcg_oom.
 *
 * may_oom, in_memcg_oom and oom_locked are single-bit flags; wakeups is a
 * plain int; wait_on_memcg is a pointer to a struct mem_cgroup.
 * in_memcg_oom is the flag returned by task_in_memcg_oom().
 *
 * NOTE(review): wait_on_memcg presumably records the memcg whose OOM the
 * task has to wait on (the commit message speaks of remembering the OOMing
 * memcg in the task struct); the exact roles of oom_locked and wakeups are
 * not visible here -- confirm against mm/memcontrol.c.
 */
1396 struct memcg_oom_info { 1396 struct memcg_oom_info {
1397 unsigned int may_oom:1; 1397 unsigned int may_oom:1;
1398 unsigned int in_memcg_oom:1;
1399 unsigned int oom_locked:1;
1400 int wakeups;
1401 struct mem_cgroup *wait_on_memcg;
1398 } memcg_oom; 1402 } memcg_oom;
1399#endif 1403#endif
1400#ifdef CONFIG_UPROBES 1404#ifdef CONFIG_UPROBES