path: root/mm/memcontrol.c
author     Johannes Weiner <hannes@cmpxchg.org>  2014-10-09 18:28:56 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-10-09 22:25:59 -0400
commit     b70a2a21dc9d4ad455931b53131a0cb4fc01fafe (patch)
tree       95ad6c804009a5867ac991cc1edf414e163b40b4 /mm/memcontrol.c
parent     3fbe724424fb104aaca9973389b4a9df428c3e2a (diff)
mm: memcontrol: fix transparent huge page allocations under pressure
In a memcg with even just moderate cache pressure, success rates for
transparent huge page allocations drop to zero, wasting a lot of effort
that the allocator puts into assembling these pages.

The reason for this is that the memcg reclaim code was never designed
for higher-order charges.  It reclaims in small batches until there is
room for at least one page.  Huge page charges only succeed when these
batches add up over a series of huge faults, which is unlikely under
any significant load involving order-0 allocations in the group.

Remove that loop on the memcg side in favor of passing the actual
reclaim goal to direct reclaim, which is already set up and optimized
to meet higher-order goals efficiently.

This brings memcg's THP policy in line with the system policy: if the
allocator painstakingly assembles a hugepage, memcg will at least make
an honest effort to charge it.  As a result, transparent hugepage
allocation rates amid cache activity are drastically improved:

                                      vanilla                 patched
pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)

[ Note: this may in turn increase memory consumption from internal
  fragmentation, which is an inherent risk of transparent hugepages.
  Some setups may have to adjust the memcg limits accordingly to
  accommodate this - or, if the machine is already packed to capacity,
  disable the transparent huge page feature. ]

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Dave Hansen <dave@sr71.net>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
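For orientation, here is a rough before/after sketch of the charge-time reclaim
logic, paraphrased from the removed mem_cgroup_reclaim() helper and the new
try_charge() code in the diff below (locking, statistics, stock draining and
the OOM path are left out; this is an illustration, not the literal kernel
source):

    /* Before: reclaim in small, fixed-size batches until there is room for
     * at least one page.  A THP charge (512 base pages with 4K pages and 2M
     * huge pages) only fits if several such batches happen to add up before
     * concurrent order-0 allocations eat the margin again. */
    for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
            total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
            if (mem_cgroup_margin(memcg))   /* any margin at all, i.e. one page */
                    break;
    }

    /* After: hand the actual charge size to direct reclaim, which already
     * knows how to meet higher-order goals, and retry the charge only once
     * the margin covers the whole request. */
    nr_reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask,
                                                may_swap);
    if (mem_cgroup_margin(memcg) >= nr_pages)
            goto retry;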
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  69
1 file changed, 17 insertions, 52 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9cda99dfac4f..c86cc442ada4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -481,14 +481,6 @@ enum res_type {
 #define OOM_CONTROL (0)
 
 /*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
-#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-
-/*
  * The memcg_create_mutex will be held whenever a new cgroup is created.
  * As a consequence, any change that needs to protect against new child cgroups
  * appearing has to hold it as well.
@@ -1805,40 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                          NULL, "Memory cgroup out of memory");
 }
 
-static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
-                                        gfp_t gfp_mask,
-                                        unsigned long flags)
-{
-        unsigned long total = 0;
-        bool noswap = false;
-        int loop;
-
-        if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
-                noswap = true;
-
-        for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
-                if (loop)
-                        drain_all_stock_async(memcg);
-                total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
-                /*
-                 * Allow limit shrinkers, which are triggered directly
-                 * by userspace, to catch signals and stop reclaim
-                 * after minimal progress, regardless of the margin.
-                 */
-                if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
-                        break;
-                if (mem_cgroup_margin(memcg))
-                        break;
-                /*
-                 * If nothing was reclaimed after two attempts, there
-                 * may be no reclaimable pages in this hierarchy.
-                 */
-                if (loop && !total)
-                        break;
-        }
-        return total;
-}
-
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -2541,8 +2499,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         struct mem_cgroup *mem_over_limit;
         struct res_counter *fail_res;
         unsigned long nr_reclaimed;
-        unsigned long flags = 0;
         unsigned long long size;
+        bool may_swap = true;
+        bool drained = false;
         int ret = 0;
 
         if (mem_cgroup_is_root(memcg))
@@ -2561,7 +2520,7 @@ retry:
                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
         } else {
                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
-                flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+                may_swap = false;
         }
 
         if (batch > nr_pages) {
@@ -2586,11 +2545,18 @@ retry:
         if (!(gfp_mask & __GFP_WAIT))
                 goto nomem;
 
-        nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+                                                    gfp_mask, may_swap);
 
         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                 goto retry;
 
+        if (!drained) {
+                drain_all_stock_async(mem_over_limit);
+                drained = true;
+                goto retry;
+        }
+
         if (gfp_mask & __GFP_NORETRY)
                 goto nomem;
         /*
@@ -3666,8 +3632,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                 if (!ret)
                         break;
 
-                mem_cgroup_reclaim(memcg, GFP_KERNEL,
-                                   MEM_CGROUP_RECLAIM_SHRINK);
+                try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
+
                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
                 /* Usage is reduced ? */
                 if (curusage >= oldusage)
@@ -3717,9 +3683,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                 if (!ret)
                         break;
 
-                mem_cgroup_reclaim(memcg, GFP_KERNEL,
-                                   MEM_CGROUP_RECLAIM_NOSWAP |
-                                   MEM_CGROUP_RECLAIM_SHRINK);
+                try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
+
                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
                 /* Usage is reduced ? */
                 if (curusage >= oldusage)
@@ -3968,8 +3933,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
                 if (signal_pending(current))
                         return -EINTR;
 
-                progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
-                                                        false);
+                progress = try_to_free_mem_cgroup_pages(memcg, 1,
+                                                        GFP_KERNEL, true);
                 if (!progress) {
                         nr_retries--;
                         /* maybe some writeback is necessary */
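Taken together, the call sites touched here suggest that
try_to_free_mem_cgroup_pages() now takes the reclaim target and a
swap-permission flag directly (signature inferred from the hunks above, not
quoted from mm/vmscan.c).  A condensed view of how the different callers use
it after this patch:

    /* charge path: aim for the full charge size, e.g. 512 pages for a THP,
     * and disable swap when the memsw limit was the one that was hit */
    try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap);

    /* limit resizing: a single-page target per iteration, so the loop can
     * re-check usage and signals between small amounts of progress */
    try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);   /* memory limit */
    try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);  /* memsw limit  */

    /* force_empty: same single-page target, swap allowed */
    try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);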