diff options
author | Johannes Weiner <hannes@cmpxchg.org> | 2014-09-05 08:43:57 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-09-05 11:19:02 -0400 |
commit | ce00a967377baadf2481521e131771adc7652856 (patch) | |
tree | df99fb2d2c1e2e0aa6873913decf53487e97391e /mm/memcontrol.c | |
parent | 10096fb1088e5c89b10772a1dfbe9682ecae5cea (diff) |
mm: memcontrol: revert use of root_mem_cgroup res_counter
Dave Hansen reports a massive scalability regression in an uncontained
page fault benchmark with more than 30 concurrent threads, which he
bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup
res_counter") and pin-pointed on res_counter spinlock contention.
That change relied on the per-cpu charge caches to mostly swallow the
res_counter costs, but it's apparent that the caches don't scale yet.
Revert memcg back to bypassing res_counters on the root level in order
to restore performance for uncontained workloads.
Reported-by: Dave Hansen <dave@sr71.net>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 103 |
1 files changed, 78 insertions, 25 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ec4dcf1b9562..085dc6d2f876 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -2534,6 +2534,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2534 | unsigned long long size; | 2534 | unsigned long long size; |
2535 | int ret = 0; | 2535 | int ret = 0; |
2536 | 2536 | ||
2537 | if (mem_cgroup_is_root(memcg)) | ||
2538 | goto done; | ||
2537 | retry: | 2539 | retry: |
2538 | if (consume_stock(memcg, nr_pages)) | 2540 | if (consume_stock(memcg, nr_pages)) |
2539 | goto done; | 2541 | goto done; |
@@ -2611,9 +2613,7 @@ nomem: | |||
2611 | if (!(gfp_mask & __GFP_NOFAIL)) | 2613 | if (!(gfp_mask & __GFP_NOFAIL)) |
2612 | return -ENOMEM; | 2614 | return -ENOMEM; |
2613 | bypass: | 2615 | bypass: |
2614 | memcg = root_mem_cgroup; | 2616 | return -EINTR; |
2615 | ret = -EINTR; | ||
2616 | goto retry; | ||
2617 | 2617 | ||
2618 | done_restock: | 2618 | done_restock: |
2619 | if (batch > nr_pages) | 2619 | if (batch > nr_pages) |
@@ -2626,6 +2626,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
2626 | { | 2626 | { |
2627 | unsigned long bytes = nr_pages * PAGE_SIZE; | 2627 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2628 | 2628 | ||
2629 | if (mem_cgroup_is_root(memcg)) | ||
2630 | return; | ||
2631 | |||
2629 | res_counter_uncharge(&memcg->res, bytes); | 2632 | res_counter_uncharge(&memcg->res, bytes); |
2630 | if (do_swap_account) | 2633 | if (do_swap_account) |
2631 | res_counter_uncharge(&memcg->memsw, bytes); | 2634 | res_counter_uncharge(&memcg->memsw, bytes); |
@@ -2640,6 +2643,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | |||
2640 | { | 2643 | { |
2641 | unsigned long bytes = nr_pages * PAGE_SIZE; | 2644 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2642 | 2645 | ||
2646 | if (mem_cgroup_is_root(memcg)) | ||
2647 | return; | ||
2648 | |||
2643 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); | 2649 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); |
2644 | if (do_swap_account) | 2650 | if (do_swap_account) |
2645 | res_counter_uncharge_until(&memcg->memsw, | 2651 | res_counter_uncharge_until(&memcg->memsw, |
@@ -4093,6 +4099,46 @@ out: | |||
4093 | return retval; | 4099 | return retval; |
4094 | } | 4100 | } |
4095 | 4101 | ||
4102 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, | ||
4103 | enum mem_cgroup_stat_index idx) | ||
4104 | { | ||
4105 | struct mem_cgroup *iter; | ||
4106 | long val = 0; | ||
4107 | |||
4108 | /* Per-cpu values can be negative, use a signed accumulator */ | ||
4109 | for_each_mem_cgroup_tree(iter, memcg) | ||
4110 | val += mem_cgroup_read_stat(iter, idx); | ||
4111 | |||
4112 | if (val < 0) /* race ? */ | ||
4113 | val = 0; | ||
4114 | return val; | ||
4115 | } | ||
4116 | |||
4117 | static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | ||
4118 | { | ||
4119 | u64 val; | ||
4120 | |||
4121 | if (!mem_cgroup_is_root(memcg)) { | ||
4122 | if (!swap) | ||
4123 | return res_counter_read_u64(&memcg->res, RES_USAGE); | ||
4124 | else | ||
4125 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
4126 | } | ||
4127 | |||
4128 | /* | ||
4129 | * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS | ||
4130 | * as well as in MEM_CGROUP_STAT_RSS_HUGE. | ||
4131 | */ | ||
4132 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
4133 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
4134 | |||
4135 | if (swap) | ||
4136 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); | ||
4137 | |||
4138 | return val << PAGE_SHIFT; | ||
4139 | } | ||
4140 | |||
4141 | |||
4096 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | 4142 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, |
4097 | struct cftype *cft) | 4143 | struct cftype *cft) |
4098 | { | 4144 | { |
@@ -4102,8 +4148,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | |||
4102 | 4148 | ||
4103 | switch (type) { | 4149 | switch (type) { |
4104 | case _MEM: | 4150 | case _MEM: |
4151 | if (name == RES_USAGE) | ||
4152 | return mem_cgroup_usage(memcg, false); | ||
4105 | return res_counter_read_u64(&memcg->res, name); | 4153 | return res_counter_read_u64(&memcg->res, name); |
4106 | case _MEMSWAP: | 4154 | case _MEMSWAP: |
4155 | if (name == RES_USAGE) | ||
4156 | return mem_cgroup_usage(memcg, true); | ||
4107 | return res_counter_read_u64(&memcg->memsw, name); | 4157 | return res_counter_read_u64(&memcg->memsw, name); |
4108 | case _KMEM: | 4158 | case _KMEM: |
4109 | return res_counter_read_u64(&memcg->kmem, name); | 4159 | return res_counter_read_u64(&memcg->kmem, name); |
@@ -4572,10 +4622,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
4572 | if (!t) | 4622 | if (!t) |
4573 | goto unlock; | 4623 | goto unlock; |
4574 | 4624 | ||
4575 | if (!swap) | 4625 | usage = mem_cgroup_usage(memcg, swap); |
4576 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); | ||
4577 | else | ||
4578 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
4579 | 4626 | ||
4580 | /* | 4627 | /* |
4581 | * current_threshold points to threshold just below or equal to usage. | 4628 | * current_threshold points to threshold just below or equal to usage. |
@@ -4673,10 +4720,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
4673 | 4720 | ||
4674 | if (type == _MEM) { | 4721 | if (type == _MEM) { |
4675 | thresholds = &memcg->thresholds; | 4722 | thresholds = &memcg->thresholds; |
4676 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); | 4723 | usage = mem_cgroup_usage(memcg, false); |
4677 | } else if (type == _MEMSWAP) { | 4724 | } else if (type == _MEMSWAP) { |
4678 | thresholds = &memcg->memsw_thresholds; | 4725 | thresholds = &memcg->memsw_thresholds; |
4679 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 4726 | usage = mem_cgroup_usage(memcg, true); |
4680 | } else | 4727 | } else |
4681 | BUG(); | 4728 | BUG(); |
4682 | 4729 | ||
@@ -4762,10 +4809,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |||
4762 | 4809 | ||
4763 | if (type == _MEM) { | 4810 | if (type == _MEM) { |
4764 | thresholds = &memcg->thresholds; | 4811 | thresholds = &memcg->thresholds; |
4765 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); | 4812 | usage = mem_cgroup_usage(memcg, false); |
4766 | } else if (type == _MEMSWAP) { | 4813 | } else if (type == _MEMSWAP) { |
4767 | thresholds = &memcg->memsw_thresholds; | 4814 | thresholds = &memcg->memsw_thresholds; |
4768 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 4815 | usage = mem_cgroup_usage(memcg, true); |
4769 | } else | 4816 | } else |
4770 | BUG(); | 4817 | BUG(); |
4771 | 4818 | ||
@@ -5525,9 +5572,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
5525 | * core guarantees its existence. | 5572 | * core guarantees its existence. |
5526 | */ | 5573 | */ |
5527 | } else { | 5574 | } else { |
5528 | res_counter_init(&memcg->res, &root_mem_cgroup->res); | 5575 | res_counter_init(&memcg->res, NULL); |
5529 | res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); | 5576 | res_counter_init(&memcg->memsw, NULL); |
5530 | res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); | 5577 | res_counter_init(&memcg->kmem, NULL); |
5531 | /* | 5578 | /* |
5532 | * Deeper hierachy with use_hierarchy == false doesn't make | 5579 | * Deeper hierachy with use_hierarchy == false doesn't make |
5533 | * much sense so let cgroup subsystem know about this | 5580 | * much sense so let cgroup subsystem know about this |
@@ -5969,8 +6016,9 @@ static void __mem_cgroup_clear_mc(void) | |||
5969 | /* we must fixup refcnts and charges */ | 6016 | /* we must fixup refcnts and charges */ |
5970 | if (mc.moved_swap) { | 6017 | if (mc.moved_swap) { |
5971 | /* uncharge swap account from the old cgroup */ | 6018 | /* uncharge swap account from the old cgroup */ |
5972 | res_counter_uncharge(&mc.from->memsw, | 6019 | if (!mem_cgroup_is_root(mc.from)) |
5973 | PAGE_SIZE * mc.moved_swap); | 6020 | res_counter_uncharge(&mc.from->memsw, |
6021 | PAGE_SIZE * mc.moved_swap); | ||
5974 | 6022 | ||
5975 | for (i = 0; i < mc.moved_swap; i++) | 6023 | for (i = 0; i < mc.moved_swap; i++) |
5976 | css_put(&mc.from->css); | 6024 | css_put(&mc.from->css); |
@@ -5979,8 +6027,9 @@ static void __mem_cgroup_clear_mc(void) | |||
5979 | * we charged both to->res and to->memsw, so we should | 6027 | * we charged both to->res and to->memsw, so we should |
5980 | * uncharge to->res. | 6028 | * uncharge to->res. |
5981 | */ | 6029 | */ |
5982 | res_counter_uncharge(&mc.to->res, | 6030 | if (!mem_cgroup_is_root(mc.to)) |
5983 | PAGE_SIZE * mc.moved_swap); | 6031 | res_counter_uncharge(&mc.to->res, |
6032 | PAGE_SIZE * mc.moved_swap); | ||
5984 | /* we've already done css_get(mc.to) */ | 6033 | /* we've already done css_get(mc.to) */ |
5985 | mc.moved_swap = 0; | 6034 | mc.moved_swap = 0; |
5986 | } | 6035 | } |
@@ -6345,7 +6394,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) | |||
6345 | rcu_read_lock(); | 6394 | rcu_read_lock(); |
6346 | memcg = mem_cgroup_lookup(id); | 6395 | memcg = mem_cgroup_lookup(id); |
6347 | if (memcg) { | 6396 | if (memcg) { |
6348 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 6397 | if (!mem_cgroup_is_root(memcg)) |
6398 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
6349 | mem_cgroup_swap_statistics(memcg, false); | 6399 | mem_cgroup_swap_statistics(memcg, false); |
6350 | css_put(&memcg->css); | 6400 | css_put(&memcg->css); |
6351 | } | 6401 | } |
@@ -6509,12 +6559,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | |||
6509 | { | 6559 | { |
6510 | unsigned long flags; | 6560 | unsigned long flags; |
6511 | 6561 | ||
6512 | if (nr_mem) | 6562 | if (!mem_cgroup_is_root(memcg)) { |
6513 | res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); | 6563 | if (nr_mem) |
6514 | if (nr_memsw) | 6564 | res_counter_uncharge(&memcg->res, |
6515 | res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); | 6565 | nr_mem * PAGE_SIZE); |
6516 | 6566 | if (nr_memsw) | |
6517 | memcg_oom_recover(memcg); | 6567 | res_counter_uncharge(&memcg->memsw, |
6568 | nr_memsw * PAGE_SIZE); | ||
6569 | memcg_oom_recover(memcg); | ||
6570 | } | ||
6518 | 6571 | ||
6519 | local_irq_save(flags); | 6572 | local_irq_save(flags); |
6520 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); | 6573 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); |