about summary refs log tree commit diff stats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
author Johannes Weiner <hannes@cmpxchg.org> 2014-09-05 08:43:57 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-09-05 11:19:02 -0400
commit ce00a967377baadf2481521e131771adc7652856 (patch)
tree df99fb2d2c1e2e0aa6873913decf53487e97391e /mm/memcontrol.c
parent 10096fb1088e5c89b10772a1dfbe9682ecae5cea (diff)
mm: memcontrol: revert use of root_mem_cgroup res_counter
Dave Hansen reports a massive scalability regression in an uncontained page fault benchmark with more than 30 concurrent threads, which he bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup res_counter") and pin-pointed on res_counter spinlock contention.

That change relied on the per-cpu charge caches to mostly swallow the res_counter costs, but it's apparent that the caches don't scale yet.

Revert memcg back to bypassing res_counters on the root level in order to restore performance for uncontained workloads.

Reported-by: Dave Hansen <dave@sr71.net>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- mm/memcontrol.c 103
1 files changed, 78 insertions, 25 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec4dcf1b9562..085dc6d2f876 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2534,6 +2534,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2534 unsigned long long size; 2534 unsigned long long size;
2535 int ret = 0; 2535 int ret = 0;
2536 2536
2537 if (mem_cgroup_is_root(memcg))
2538 goto done;
2537retry: 2539retry:
2538 if (consume_stock(memcg, nr_pages)) 2540 if (consume_stock(memcg, nr_pages))
2539 goto done; 2541 goto done;
@@ -2611,9 +2613,7 @@ nomem:
2611 if (!(gfp_mask & __GFP_NOFAIL)) 2613 if (!(gfp_mask & __GFP_NOFAIL))
2612 return -ENOMEM; 2614 return -ENOMEM;
2613bypass: 2615bypass:
2614 memcg = root_mem_cgroup; 2616 return -EINTR;
2615 ret = -EINTR;
2616 goto retry;
2617 2617
2618done_restock: 2618done_restock:
2619 if (batch > nr_pages) 2619 if (batch > nr_pages)
@@ -2626,6 +2626,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2626{ 2626{
2627 unsigned long bytes = nr_pages * PAGE_SIZE; 2627 unsigned long bytes = nr_pages * PAGE_SIZE;
2628 2628
2629 if (mem_cgroup_is_root(memcg))
2630 return;
2631
2629 res_counter_uncharge(&memcg->res, bytes); 2632 res_counter_uncharge(&memcg->res, bytes);
2630 if (do_swap_account) 2633 if (do_swap_account)
2631 res_counter_uncharge(&memcg->memsw, bytes); 2634 res_counter_uncharge(&memcg->memsw, bytes);
@@ -2640,6 +2643,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2640{ 2643{
2641 unsigned long bytes = nr_pages * PAGE_SIZE; 2644 unsigned long bytes = nr_pages * PAGE_SIZE;
2642 2645
2646 if (mem_cgroup_is_root(memcg))
2647 return;
2648
2643 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2649 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2644 if (do_swap_account) 2650 if (do_swap_account)
2645 res_counter_uncharge_until(&memcg->memsw, 2651 res_counter_uncharge_until(&memcg->memsw,
@@ -4093,6 +4099,46 @@ out:
4093 return retval; 4099 return retval;
4094} 4100}
4095 4101
4102static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4103 enum mem_cgroup_stat_index idx)
4104{
4105 struct mem_cgroup *iter;
4106 long val = 0;
4107
4108 /* Per-cpu values can be negative, use a signed accumulator */
4109 for_each_mem_cgroup_tree(iter, memcg)
4110 val += mem_cgroup_read_stat(iter, idx);
4111
4112 if (val < 0) /* race ? */
4113 val = 0;
4114 return val;
4115}
4116
4117static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4118{
4119 u64 val;
4120
4121 if (!mem_cgroup_is_root(memcg)) {
4122 if (!swap)
4123 return res_counter_read_u64(&memcg->res, RES_USAGE);
4124 else
4125 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4126 }
4127
4128 /*
4129 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4130 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4131 */
4132 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4133 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4134
4135 if (swap)
4136 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4137
4138 return val << PAGE_SHIFT;
4139}
4140
4141
4096static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4142static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4097 struct cftype *cft) 4143 struct cftype *cft)
4098{ 4144{
@@ -4102,8 +4148,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4102 4148
4103 switch (type) { 4149 switch (type) {
4104 case _MEM: 4150 case _MEM:
4151 if (name == RES_USAGE)
4152 return mem_cgroup_usage(memcg, false);
4105 return res_counter_read_u64(&memcg->res, name); 4153 return res_counter_read_u64(&memcg->res, name);
4106 case _MEMSWAP: 4154 case _MEMSWAP:
4155 if (name == RES_USAGE)
4156 return mem_cgroup_usage(memcg, true);
4107 return res_counter_read_u64(&memcg->memsw, name); 4157 return res_counter_read_u64(&memcg->memsw, name);
4108 case _KMEM: 4158 case _KMEM:
4109 return res_counter_read_u64(&memcg->kmem, name); 4159 return res_counter_read_u64(&memcg->kmem, name);
@@ -4572,10 +4622,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4572 if (!t) 4622 if (!t)
4573 goto unlock; 4623 goto unlock;
4574 4624
4575 if (!swap) 4625 usage = mem_cgroup_usage(memcg, swap);
4576 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
4577 else
4578 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4579 4626
4580 /* 4627 /*
4581 * current_threshold points to threshold just below or equal to usage. 4628 * current_threshold points to threshold just below or equal to usage.
@@ -4673,10 +4720,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4673 4720
4674 if (type == _MEM) { 4721 if (type == _MEM) {
4675 thresholds = &memcg->thresholds; 4722 thresholds = &memcg->thresholds;
4676 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4723 usage = mem_cgroup_usage(memcg, false);
4677 } else if (type == _MEMSWAP) { 4724 } else if (type == _MEMSWAP) {
4678 thresholds = &memcg->memsw_thresholds; 4725 thresholds = &memcg->memsw_thresholds;
4679 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4726 usage = mem_cgroup_usage(memcg, true);
4680 } else 4727 } else
4681 BUG(); 4728 BUG();
4682 4729
@@ -4762,10 +4809,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4762 4809
4763 if (type == _MEM) { 4810 if (type == _MEM) {
4764 thresholds = &memcg->thresholds; 4811 thresholds = &memcg->thresholds;
4765 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4812 usage = mem_cgroup_usage(memcg, false);
4766 } else if (type == _MEMSWAP) { 4813 } else if (type == _MEMSWAP) {
4767 thresholds = &memcg->memsw_thresholds; 4814 thresholds = &memcg->memsw_thresholds;
4768 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4815 usage = mem_cgroup_usage(memcg, true);
4769 } else 4816 } else
4770 BUG(); 4817 BUG();
4771 4818
@@ -5525,9 +5572,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5525 * core guarantees its existence. 5572 * core guarantees its existence.
5526 */ 5573 */
5527 } else { 5574 } else {
5528 res_counter_init(&memcg->res, &root_mem_cgroup->res); 5575 res_counter_init(&memcg->res, NULL);
5529 res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); 5576 res_counter_init(&memcg->memsw, NULL);
5530 res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); 5577 res_counter_init(&memcg->kmem, NULL);
5531 /* 5578 /*
5532 * Deeper hierachy with use_hierarchy == false doesn't make 5579 * Deeper hierachy with use_hierarchy == false doesn't make
5533 * much sense so let cgroup subsystem know about this 5580 * much sense so let cgroup subsystem know about this
@@ -5969,8 +6016,9 @@ static void __mem_cgroup_clear_mc(void)
5969 /* we must fixup refcnts and charges */ 6016 /* we must fixup refcnts and charges */
5970 if (mc.moved_swap) { 6017 if (mc.moved_swap) {
5971 /* uncharge swap account from the old cgroup */ 6018 /* uncharge swap account from the old cgroup */
5972 res_counter_uncharge(&mc.from->memsw, 6019 if (!mem_cgroup_is_root(mc.from))
5973 PAGE_SIZE * mc.moved_swap); 6020 res_counter_uncharge(&mc.from->memsw,
6021 PAGE_SIZE * mc.moved_swap);
5974 6022
5975 for (i = 0; i < mc.moved_swap; i++) 6023 for (i = 0; i < mc.moved_swap; i++)
5976 css_put(&mc.from->css); 6024 css_put(&mc.from->css);
@@ -5979,8 +6027,9 @@ static void __mem_cgroup_clear_mc(void)
5979 * we charged both to->res and to->memsw, so we should 6027 * we charged both to->res and to->memsw, so we should
5980 * uncharge to->res. 6028 * uncharge to->res.
5981 */ 6029 */
5982 res_counter_uncharge(&mc.to->res, 6030 if (!mem_cgroup_is_root(mc.to))
5983 PAGE_SIZE * mc.moved_swap); 6031 res_counter_uncharge(&mc.to->res,
6032 PAGE_SIZE * mc.moved_swap);
5984 /* we've already done css_get(mc.to) */ 6033 /* we've already done css_get(mc.to) */
5985 mc.moved_swap = 0; 6034 mc.moved_swap = 0;
5986 } 6035 }
@@ -6345,7 +6394,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
6345 rcu_read_lock(); 6394 rcu_read_lock();
6346 memcg = mem_cgroup_lookup(id); 6395 memcg = mem_cgroup_lookup(id);
6347 if (memcg) { 6396 if (memcg) {
6348 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 6397 if (!mem_cgroup_is_root(memcg))
6398 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
6349 mem_cgroup_swap_statistics(memcg, false); 6399 mem_cgroup_swap_statistics(memcg, false);
6350 css_put(&memcg->css); 6400 css_put(&memcg->css);
6351 } 6401 }
@@ -6509,12 +6559,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6509{ 6559{
6510 unsigned long flags; 6560 unsigned long flags;
6511 6561
6512 if (nr_mem) 6562 if (!mem_cgroup_is_root(memcg)) {
6513 res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); 6563 if (nr_mem)
6514 if (nr_memsw) 6564 res_counter_uncharge(&memcg->res,
6515 res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); 6565 nr_mem * PAGE_SIZE);
6516 6566 if (nr_memsw)
6517 memcg_oom_recover(memcg); 6567 res_counter_uncharge(&memcg->memsw,
6568 nr_memsw * PAGE_SIZE);
6569 memcg_oom_recover(memcg);
6570 }
6518 6571
6519 local_irq_save(flags); 6572 local_irq_save(flags);
6520 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 6573 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);