path: root/mm
author	Glauber Costa <glommer@parallels.com>	2012-12-18 17:22:38 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-18 18:02:13 -0500
commit	55007d849759252ddd573aeb36143b947202d509 (patch)
tree	d042bc2f717922fb73f9d526592eeb331c2f0f70 /mm
parent	2633d7a028239a738b793be5ca8fa6ac312f5793 (diff)
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original" caches, tied to the root memcg/no-memcg) will have an array that should be large enough to store a cache pointer for each memcg in the system. Theoretically, this is as high as 1 << sizeof(css_id), which is currently in the 64k pointers range. Most of the time, we won't be using that much.

What goes in this patch is a simple scheme to dynamically allocate such an array, in order to minimize memory usage for memcg caches. Because we would also like to avoid allocations all the time, at least for now, the array will only grow. It will tend to be big enough to hold the maximum number of kmem-limited memcgs ever achieved.

We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have more than that, we'll start doubling the size of this array every time the limit is reached. Because we are only considering kmem-limited memcgs, a natural point for this to happen is when we write to the limit. At that point, we already have set_limit_mutex held, so that will become our natural synchronization mechanism.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
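To make the growth policy described above concrete, the following stand-alone C program is a minimal user-space sketch of the sizing rule, not the patch itself: the doubling and the clamping mirror memcg_caches_array_size() and the MEMCG_CACHES_MIN_SIZE/MEMCG_CACHES_MAX_SIZE constants introduced in the diff below, while the main() driver is purely illustrative.

/*
 * User-space sketch of the grow-only sizing policy: the per-root-cache
 * array starts at MEMCG_CACHES_MIN_SIZE entries, is capped at
 * MEMCG_CACHES_MAX_SIZE, and is sized to twice the number of
 * kmem-limited memcgs so it does not have to be reallocated on every
 * new memcg.
 */
#include <stdio.h>

#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535

static long caches_array_size(int num_groups)
{
	long size;

	if (num_groups <= 0)
		return 0;

	size = 2L * num_groups;		/* leave headroom before the next resize */
	if (size < MEMCG_CACHES_MIN_SIZE)
		size = MEMCG_CACHES_MIN_SIZE;
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;

	return size;
}

int main(void)
{
	/* array size as successive kmem-limited memcgs appear */
	int counts[] = { 1, 3, 5, 9, 100, 70000 };

	for (unsigned int i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
		printf("%6d kmem-limited memcgs -> array of %5ld pointers\n",
		       counts[i], caches_array_size(counts[i]));
	return 0;
}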
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	207
-rw-r--r--	mm/slab_common.c	28
2 files changed, 219 insertions(+), 16 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3eafe6cf6ca4..db38b60e5f87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -378,6 +378,11 @@ static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
 	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 }
 
+static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
+{
+	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
 	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
@@ -549,12 +554,48 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
+/*
+ * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * There are two main reasons for not using the css_id for this:
+ *  1) this works better in sparse environments, where we have a lot of memcgs,
+ *     but only a few kmem-limited. Or also, if we have, for instance, 200
+ *     memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ *     200 entry array for that.
+ *
+ *  2) In order not to violate the cgroup API, we would like to do all memory
+ *     allocation in ->create(). At that point, we haven't yet allocated the
+ *     css_id. Having a separate index prevents us from messing with the cgroup
+ *     core for this
+ *
+ * The current size of the caches array is stored in
+ * memcg_limited_groups_array_size. It will double each time we have to
+ * increase it.
+ */
+static DEFINE_IDA(kmem_limited_groups);
+static int memcg_limited_groups_array_size;
+/*
+ * MIN_SIZE is different than 1, because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be a parameter or
+ * tunable, but that is strictly not necessary.
+ *
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * this constant directly from cgroup, but it is understandable that this is
+ * better kept as an internal representation in cgroup.c. In any case, the
+ * css_id space is not getting any smaller, and we don't have to necessarily
+ * increase ours as well if it increases.
+ */
+#define MEMCG_CACHES_MIN_SIZE 4
+#define MEMCG_CACHES_MAX_SIZE 65535
+
 struct static_key memcg_kmem_enabled_key;
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-	if (memcg_kmem_is_active(memcg))
+	if (memcg_kmem_is_active(memcg)) {
 		static_key_slow_dec(&memcg_kmem_enabled_key);
+		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+	}
 	/*
 	 * This check can't live in kmem destruction function,
 	 * since the charges will outlive the cgroup
@@ -2813,6 +2854,120 @@ int memcg_cache_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
+/*
+ * This ends up being protected by the set_limit mutex, during normal
+ * operation, because that is its main call site.
+ *
+ * But when we create a new cache, we can call this as well if its parent
+ * is kmem-limited. That will have to hold set_limit_mutex as well.
+ */
+int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+{
+	int num, ret;
+
+	num = ida_simple_get(&kmem_limited_groups,
+				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+	if (num < 0)
+		return num;
+	/*
+	 * After this point, kmem_accounted (that we test atomically in
+	 * the beginning of this conditional), is no longer 0. This
+	 * guarantees only one process will set the following boolean
+	 * to true. We don't need test_and_set because we're protected
+	 * by the set_limit_mutex anyway.
+	 */
+	memcg_kmem_set_activated(memcg);
+
+	ret = memcg_update_all_caches(num+1);
+	if (ret) {
+		ida_simple_remove(&kmem_limited_groups, num);
+		memcg_kmem_clear_activated(memcg);
+		return ret;
+	}
+
+	memcg->kmemcg_id = num;
+	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+	mutex_init(&memcg->slab_caches_mutex);
+	return 0;
+}
+
+static size_t memcg_caches_array_size(int num_groups)
+{
+	ssize_t size;
+	if (num_groups <= 0)
+		return 0;
+
+	size = 2 * num_groups;
+	if (size < MEMCG_CACHES_MIN_SIZE)
+		size = MEMCG_CACHES_MIN_SIZE;
+	else if (size > MEMCG_CACHES_MAX_SIZE)
+		size = MEMCG_CACHES_MAX_SIZE;
+
+	return size;
+}
+
+/*
+ * We should update the current array size iff all caches updates succeed. This
+ * can only be done from the slab side. The slab mutex needs to be held when
+ * calling this.
+ */
+void memcg_update_array_size(int num)
+{
+	if (num > memcg_limited_groups_array_size)
+		memcg_limited_groups_array_size = memcg_caches_array_size(num);
+}
+
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
+{
+	struct memcg_cache_params *cur_params = s->memcg_params;
+
+	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+
+	if (num_groups > memcg_limited_groups_array_size) {
+		int i;
+		ssize_t size = memcg_caches_array_size(num_groups);
+
+		size *= sizeof(void *);
+		size += sizeof(struct memcg_cache_params);
+
+		s->memcg_params = kzalloc(size, GFP_KERNEL);
+		if (!s->memcg_params) {
+			s->memcg_params = cur_params;
+			return -ENOMEM;
+		}
+
+		s->memcg_params->is_root_cache = true;
+
+		/*
+		 * There is the chance it will be bigger than
+		 * memcg_limited_groups_array_size, if we failed an allocation
+		 * in a cache, in which case all caches updated before it, will
+		 * have a bigger array.
+		 *
+		 * But if that is the case, the data after
+		 * memcg_limited_groups_array_size is certainly unused
+		 */
+		for (i = 0; i < memcg_limited_groups_array_size; i++) {
+			if (!cur_params->memcg_caches[i])
+				continue;
+			s->memcg_params->memcg_caches[i] =
+						cur_params->memcg_caches[i];
+		}
+
+		/*
+		 * Ideally, we would wait until all caches succeed, and only
+		 * then free the old one. But this is not worth the extra
+		 * pointer per-cache we'd have to have for this.
+		 *
+		 * It is not a big deal if some caches are left with a size
+		 * bigger than the others. And all updates will reset this
+		 * anyway.
+		 */
+		kfree(cur_params);
+	}
+	return 0;
+}
+
 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 {
 	size_t size = sizeof(struct memcg_cache_params);
@@ -2820,6 +2975,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 	if (!memcg_kmem_enabled())
 		return 0;
 
+	if (!memcg)
+		size += memcg_limited_groups_array_size * sizeof(void *);
+
 	s->memcg_params = kzalloc(size, GFP_KERNEL);
 	if (!s->memcg_params)
 		return -ENOMEM;
@@ -4326,14 +4484,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 	ret = res_counter_set_limit(&memcg->kmem, val);
 	VM_BUG_ON(ret);
 
-	/*
-	 * After this point, kmem_accounted (that we test atomically in
-	 * the beginning of this conditional), is no longer 0. This
-	 * guarantees only one process will set the following boolean
-	 * to true. We don't need test_and_set because we're protected
-	 * by the set_limit_mutex anyway.
-	 */
-	memcg_kmem_set_activated(memcg);
+	ret = memcg_update_cache_sizes(memcg);
+	if (ret) {
+		res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+		goto out;
+	}
 	must_inc_static_branch = true;
 	/*
 	 * kmem charges can outlive the cgroup. In the case of slab
@@ -4372,11 +4527,13 @@ out:
 	return ret;
 }
 
-static void memcg_propagate_kmem(struct mem_cgroup *memcg)
+static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
+	int ret = 0;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 	if (!parent)
-		return;
+		goto out;
+
 	memcg->kmem_account_flags = parent->kmem_account_flags;
 #ifdef CONFIG_MEMCG_KMEM
 	/*
@@ -4389,11 +4546,24 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg)
 	 * It is a lot simpler just to do static_key_slow_inc() on every child
 	 * that is accounted.
 	 */
-	if (memcg_kmem_is_active(memcg)) {
-		mem_cgroup_get(memcg);
-		static_key_slow_inc(&memcg_kmem_enabled_key);
-	}
+	if (!memcg_kmem_is_active(memcg))
+		goto out;
+
+	/*
+	 * destroy(), called if we fail, will issue static_key_slow_inc() and
+	 * mem_cgroup_put() if kmem is enabled. We have to either call them
+	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
+	 * this more consistent, since it always leads to the same destroy path
+	 */
+	mem_cgroup_get(memcg);
+	static_key_slow_inc(&memcg_kmem_enabled_key);
+
+	mutex_lock(&set_limit_mutex);
+	ret = memcg_update_cache_sizes(memcg);
+	mutex_unlock(&set_limit_mutex);
 #endif
+out:
+	return ret;
 }
 
 /*
@@ -5075,8 +5245,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
+	int ret;
+
 	memcg->kmemcg_id = -1;
-	memcg_propagate_kmem(memcg);
+	ret = memcg_propagate_kmem(memcg);
+	if (ret)
+		return ret;
 
 	return mem_cgroup_sockets_init(memcg, ss);
 };
@@ -5479,6 +5653,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 		res_counter_init(&memcg->res, &parent->res);
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		res_counter_init(&memcg->kmem, &parent->kmem);
+
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3031badcc577..1c424b6511bf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -81,6 +81,34 @@ static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
 }
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_update_all_caches(int num_memcgs)
+{
+	struct kmem_cache *s;
+	int ret = 0;
+	mutex_lock(&slab_mutex);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!is_root_cache(s))
+			continue;
+
+		ret = memcg_update_cache_size(s, num_memcgs);
+		/*
+		 * See comment in memcontrol.c, memcg_update_cache_size:
+		 * Instead of freeing the memory, we'll just leave the caches
+		 * up to this point in an updated state.
+		 */
+		if (ret)
+			goto out;
+	}
+
+	memcg_update_array_size(num_memcgs);
+out:
+	mutex_unlock(&slab_mutex);
+	return ret;
+}
+#endif
+
 /*
  * Figure out what the alignment of the objects will be given a set of
  * flags, a user specified alignment and the size of the objects.
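As a closing illustration of what the per-cache update amounts to, the sketch below is a simplified user-space analogue of the resize done by memcg_update_cache_size() for one root cache: allocate a larger zero-filled pointer array, copy the slots that are already populated, and free the old block. The struct layout, the per-cache nr_slots field, and the absence of slab_mutex/set_limit_mutex locking are simplifications for this sketch; in the kernel the array is embedded at the end of struct memcg_cache_params and the size is tracked globally in memcg_limited_groups_array_size.

/*
 * User-space analogue of the grow-only, copy-on-grow pointer array kept
 * per root cache. Not kernel code; names here are illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct root_cache {
	size_t nr_slots;	/* plays the role of memcg_limited_groups_array_size */
	void **memcg_caches;	/* one slot per kmem-limited memcg */
};

/* Grow the array to new_slots entries, preserving already-populated slots. */
static int grow_memcg_array(struct root_cache *rc, size_t new_slots)
{
	void **bigger;

	if (new_slots <= rc->nr_slots)
		return 0;			/* the array only ever grows */

	bigger = calloc(new_slots, sizeof(*bigger));
	if (!bigger)
		return -1;			/* keep the old array on failure */

	/* carry over the per-memcg cache pointers created so far */
	if (rc->memcg_caches)
		memcpy(bigger, rc->memcg_caches,
		       rc->nr_slots * sizeof(*bigger));

	free(rc->memcg_caches);
	rc->memcg_caches = bigger;
	rc->nr_slots = new_slots;
	return 0;
}

int main(void)
{
	struct root_cache rc = { 0, NULL };
	static char dummy_cache[] = "kmalloc-64:memcg1";

	grow_memcg_array(&rc, 4);		/* first kmem limit written */
	rc.memcg_caches[1] = dummy_cache;	/* memcg whose kmemcg_id is 1 */
	grow_memcg_array(&rc, 8);		/* more kmem-limited memcgs appeared */

	printf("slots=%zu, slot[1]=%s\n", rc.nr_slots,
	       (char *)rc.memcg_caches[1]);
	free(rc.memcg_caches);
	return 0;
}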