author:    Tejun Heo <tj@kernel.org>	2015-11-05 21:46:11 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org>	2015-11-05 22:34:48 -0500
commit:    b23afb93d317c65cef553b804f08dec8a7a0f7e1
tree:      41c1e46049d02c6649f2e773a7b9fbb26094e115
parent:    626ebc4100285be56fe3546f29b6afeb36b6871a
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped.  If a process performs only
speculative allocations, it can blow way past the high limit.  This is
actually easily reproducible by simply doing "find /".  The slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.

This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path.  If try_charge() detects that the high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high(), which performs
synchronous reclaim from the return-to-userland path.

As long as the kernel doesn't have a run-away allocation spree, this
should provide enough protection while making kmemcg behave more
consistently.  It also has the following benefits.

- All over-high reclaims can use GFP_KERNEL regardless of the specific
  gfp mask in use, e.g. GFP_NOFS, when the limit was breached.

- It copes with priority inversion.  Previously, a low-priority task
  with a small memory.high might perform over-high reclaim with a bunch
  of locks held.  If a higher-priority task needed any of these locks,
  it would have to wait until the low-priority task finished reclaim and
  released the locks.  By handing over-high reclaim to the
  return-to-userland path this issue can be avoided.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
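[Editor's note: the following is a minimal, userspace-only sketch of the
deferral pattern the patch applies, under the assumption that an analogy
helps readers unfamiliar with the resume hook.  Every name in it (struct
task, charge(), handle_over_high()) is invented for illustration and is
not a kernel API.  The hot path only records the overage in a per-task
counter and raises a flag; the expensive reclaim runs later, at a safe
point standing in for tracehook_notify_resume() in the patch below.]

	/* Sketch only: illustrative names, not kernel code. */
	#include <stdbool.h>
	#include <stdio.h>

	struct task {
		unsigned long charged;            /* pages charged so far */
		unsigned long high;               /* the "high" soft limit */
		unsigned long nr_pages_over_high; /* overage recorded for later */
		bool notify_resume;               /* stands in for TIF_NOTIFY_RESUME */
	};

	/* Hot path: never reclaims, only records how far we went over. */
	static void charge(struct task *t, unsigned long nr_pages)
	{
		t->charged += nr_pages;
		if (t->charged > t->high) {
			t->nr_pages_over_high += nr_pages;
			t->notify_resume = true;  /* request work on "return to userland" */
		}
	}

	/* Safe point: analogous to mem_cgroup_handle_over_high(). */
	static void handle_over_high(struct task *t)
	{
		unsigned long nr_pages = t->nr_pages_over_high;

		if (!nr_pages)
			return;
		/* The kernel reclaims here with GFP_KERNEL, no locks held. */
		printf("reclaiming %lu pages of overage\n", nr_pages);
		t->charged -= nr_pages;           /* pretend reclaim succeeded */
		t->nr_pages_over_high = 0;
		t->notify_resume = false;
	}

	int main(void)
	{
		struct task t = { .high = 100 };
		int i;

		for (i = 0; i < 8; i++)
			charge(&t, 20);           /* speculative allocations keep succeeding */

		if (t.notify_resume)
			handle_over_high(&t);     /* deferred work runs at the safe point */
		return 0;
	}

[In the real patch the counter is current->memcg_nr_pages_over_high, the
flag is set via set_notify_resume(), and the resume hook calls
mem_cgroup_handle_over_high(), which can always use GFP_KERNEL because it
runs in process context with no locks held.]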
-rw-r--r--  include/linux/memcontrol.h |  6
-rw-r--r--  include/linux/sched.h      |  3
-rw-r--r--  include/linux/tracehook.h  |  3
-rw-r--r--  mm/memcontrol.c            | 47
4 files changed, 51 insertions(+), 8 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 56174c7199ee..77bf42966200 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -401,6 +401,8 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 	return inactive * inactive_ratio < active;
 }
 
+void mem_cgroup_handle_over_high(void);
+
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 				struct task_struct *p);
 
@@ -620,6 +622,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 {
 }
 
+static inline void mem_cgroup_handle_over_high(void)
+{
+}
+
 static inline void mem_cgroup_oom_enable(void)
 {
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 17bf8b845aa0..055f2ee3b0f0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1809,6 +1809,9 @@ struct task_struct {
 	struct mem_cgroup *memcg_in_oom;
 	gfp_t memcg_oom_gfp_mask;
 	int memcg_oom_order;
+
+	/* number of pages to reclaim on returning to userland */
+	unsigned int memcg_nr_pages_over_high;
 #endif
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 84d497297c5f..26c152122a42 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -50,6 +50,7 @@
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/task_work.h>
+#include <linux/memcontrol.h>
 struct linux_binprm;
 
 /*
@@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 	smp_mb__after_atomic();
 	if (unlikely(current->task_works))
 		task_work_run();
+
+	mem_cgroup_handle_over_high();
 }
 
 #endif	/* <linux/tracehook.h> */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 47bd7f13f526..327dcda3ebf6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
 #include <linux/oom.h>
 #include <linux/lockdep.h>
 #include <linux/file.h>
+#include <linux/tracehook.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+	unsigned int nr_pages = current->memcg_nr_pages_over_high;
+	struct mem_cgroup *memcg, *pos;
+
+	if (likely(!nr_pages))
+		return;
+
+	pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+	do {
+		if (page_counter_read(&pos->memory) <= pos->high)
+			continue;
+		mem_cgroup_events(pos, MEMCG_HIGH, 1);
+		try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+	} while ((pos = parent_mem_cgroup(pos)));
+
+	css_put(&memcg->css);
+	current->memcg_nr_pages_over_high = 0;
+}
+
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		      unsigned int nr_pages)
 {
@@ -2080,17 +2106,22 @@ done_restock:
 	css_get_many(&memcg->css, batch);
 	if (batch > nr_pages)
 		refill_stock(memcg, batch - nr_pages);
-	if (!(gfp_mask & __GFP_WAIT))
-		goto done;
+
 	/*
-	 * If the hierarchy is above the normal consumption range,
-	 * make the charging task trim their excess contribution.
+	 * If the hierarchy is above the normal consumption range, schedule
+	 * reclaim on returning to userland.  We can perform reclaim here
+	 * if __GFP_WAIT but let's always punt for simplicity and so that
+	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
+	 * not recorded as it most likely matches current's and won't
+	 * change in the meantime.  As high limit is checked again before
+	 * reclaim, the cost of mismatch is negligible.
 	 */
 	do {
-		if (page_counter_read(&memcg->memory) <= memcg->high)
-			continue;
-		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
-		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+		if (page_counter_read(&memcg->memory) > memcg->high) {
+			current->memcg_nr_pages_over_high += nr_pages;
+			set_notify_resume(current);
+			break;
+		}
 	} while ((memcg = parent_mem_cgroup(memcg)));
 done:
 	return ret;