path: root/mm/memcontrol.c
author     Johannes Weiner <hannes@cmpxchg.org>  2014-10-09 18:28:56 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-10-09 22:25:59 -0400
commit     b70a2a21dc9d4ad455931b53131a0cb4fc01fafe (patch)
tree       95ad6c804009a5867ac991cc1edf414e163b40b4 /mm/memcontrol.c
parent     3fbe724424fb104aaca9973389b4a9df428c3e2a (diff)
mm: memcontrol: fix transparent huge page allocations under pressure
In a memcg with even just moderate cache pressure, success rates for
transparent huge page allocations drop to zero, wasting a lot of effort
that the allocator puts into assembling these pages.

The reason for this is that the memcg reclaim code was never designed
for higher-order charges.  It reclaims in small batches until there is
room for at least one page.  Huge page charges only succeed when these
batches add up over a series of huge faults, which is unlikely under
any significant load involving order-0 allocations in the group.

Remove that loop on the memcg side in favor of passing the actual
reclaim goal to direct reclaim, which is already set up and optimized
to meet higher-order goals efficiently.

This brings memcg's THP policy in line with the system policy: if the
allocator painstakingly assembles a hugepage, memcg will at least make
an honest effort to charge it.  As a result, transparent hugepage
allocation rates amid cache activity are drastically improved:

                                      vanilla                 patched
pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)

[ Note: this may in turn increase memory consumption from internal
  fragmentation, which is an inherent risk of transparent hugepages.
  Some setups may have to adjust the memcg limits accordingly to
  accommodate this - or, if the machine is already packed to capacity,
  disable the transparent huge page feature. ]

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Dave Hansen <dave@sr71.net>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
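For orientation, here is a rough before/after sketch of the charge-time reclaim
logic, paraphrased from the removed mem_cgroup_reclaim() helper and the new
try_charge() code in the diff below (locking, statistics, stock draining and
the OOM path are left out; this is an illustration, not the literal kernel
source):

    /* Before: reclaim in small, fixed-size batches until there is room for
     * at least one page.  A THP charge (512 base pages with 4K pages and 2M
     * huge pages) only fits if several such batches happen to add up before
     * concurrent order-0 allocations eat the margin again. */
    for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
            total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
            if (mem_cgroup_margin(memcg))   /* any margin at all, i.e. one page */
                    break;
    }

    /* After: hand the actual charge size to direct reclaim, which already
     * knows how to meet higher-order goals, and retry the charge only once
     * the margin covers the whole request. */
    nr_reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask,
                                                may_swap);
    if (mem_cgroup_margin(memcg) >= nr_pages)
            goto retry;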
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  69
1 file changed, 17 insertions, 52 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9cda99dfac4f..c86cc442ada4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -481,14 +481,6 @@ enum res_type {
 #define OOM_CONTROL (0)
 
 /*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
-#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-
-/*
  * The memcg_create_mutex will be held whenever a new cgroup is created.
  * As a consequence, any change that needs to protect against new child cgroups
  * appearing has to hold it as well.
@@ -1805,40 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                          NULL, "Memory cgroup out of memory");
 }
 
-static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
-                                        gfp_t gfp_mask,
-                                        unsigned long flags)
-{
-        unsigned long total = 0;
-        bool noswap = false;
-        int loop;
-
-        if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
-                noswap = true;
-
-        for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
-                if (loop)
-                        drain_all_stock_async(memcg);
-                total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
-                /*
-                 * Allow limit shrinkers, which are triggered directly
-                 * by userspace, to catch signals and stop reclaim
-                 * after minimal progress, regardless of the margin.
-                 */
-                if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
-                        break;
-                if (mem_cgroup_margin(memcg))
-                        break;
-                /*
-                 * If nothing was reclaimed after two attempts, there
-                 * may be no reclaimable pages in this hierarchy.
-                 */
-                if (loop && !total)
-                        break;
-        }
-        return total;
-}
-
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -2541,8 +2499,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         struct mem_cgroup *mem_over_limit;
         struct res_counter *fail_res;
         unsigned long nr_reclaimed;
-        unsigned long flags = 0;
         unsigned long long size;
+        bool may_swap = true;
+        bool drained = false;
         int ret = 0;
 
         if (mem_cgroup_is_root(memcg))
@@ -2561,7 +2520,7 @@ retry:
                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
         } else {
                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
-                flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+                may_swap = false;
         }
 
         if (batch > nr_pages) {
@@ -2586,11 +2545,18 @@ retry:
         if (!(gfp_mask & __GFP_WAIT))
                 goto nomem;
 
-        nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+                                                    gfp_mask, may_swap);
 
         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                 goto retry;
 
+        if (!drained) {
+                drain_all_stock_async(mem_over_limit);
+                drained = true;
+                goto retry;
+        }
+
         if (gfp_mask & __GFP_NORETRY)
                 goto nomem;
         /*
@@ -3666,8 +3632,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                 if (!ret)
                         break;
 
-                mem_cgroup_reclaim(memcg, GFP_KERNEL,
-                                   MEM_CGROUP_RECLAIM_SHRINK);
+                try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
+
                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
                 /* Usage is reduced ? */
                 if (curusage >= oldusage)
@@ -3717,9 +3683,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                 if (!ret)
                         break;
 
-                mem_cgroup_reclaim(memcg, GFP_KERNEL,
-                                   MEM_CGROUP_RECLAIM_NOSWAP |
-                                   MEM_CGROUP_RECLAIM_SHRINK);
+                try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
+
                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
                 /* Usage is reduced ? */
                 if (curusage >= oldusage)
@@ -3968,8 +3933,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
                 if (signal_pending(current))
                         return -EINTR;
 
-                progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
-                                                        false);
+                progress = try_to_free_mem_cgroup_pages(memcg, 1,
+                                                        GFP_KERNEL, true);
                 if (!progress) {
                         nr_retries--;
                         /* maybe some writeback is necessary */
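Taken together, the call sites touched here suggest that
try_to_free_mem_cgroup_pages() now takes the reclaim target and a
swap-permission flag directly (signature inferred from the hunks above, not
quoted from mm/vmscan.c).  A condensed view of how the different callers use
it after this patch:

    /* charge path: aim for the full charge size, e.g. 512 pages for a THP,
     * and disable swap when the memsw limit was the one that was hit */
    try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap);

    /* limit resizing: a single-page target per iteration, so the loop can
     * re-check usage and signals between small amounts of progress */
    try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);   /* memory limit */
    try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);  /* memsw limit  */

    /* force_empty: same single-page target, swap allowed */
    try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);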