path: root/mm
author	Glauber Costa <glommer@parallels.com>	2012-12-18 17:22:38 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-18 18:02:13 -0500
commit	55007d849759252ddd573aeb36143b947202d509 (patch)
tree	d042bc2f717922fb73f9d526592eeb331c2f0f70 /mm
parent	2633d7a028239a738b793be5ca8fa6ac312f5793 (diff)
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original" caches, tied to the root memcg/no-memcg) will have an array that should be large enough to store a cache pointer for each memcg in the system. Theoretically, this is as high as 1 << sizeof(css_id), which is currently in the 64k pointers range. Most of the time, we won't be using that much.

What goes in this patch is a simple scheme to dynamically allocate such an array, in order to minimize memory usage for memcg caches. Because we would also like to avoid allocations all the time, at least for now, the array will only grow. It will tend to be big enough to hold the maximum number of kmem-limited memcgs ever achieved.

We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have more than that, we'll start doubling the size of this array every time the limit is reached. Because we are only considering kmem-limited memcgs, a natural point for this to happen is when we write to the limit. At that point, we already have set_limit_mutex held, so that will become our natural synchronization mechanism.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
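To make the growth policy described above concrete, the following stand-alone C program is a minimal user-space sketch of the sizing rule, not the patch itself: the doubling and the clamping mirror memcg_caches_array_size() and the MEMCG_CACHES_MIN_SIZE/MEMCG_CACHES_MAX_SIZE constants introduced in the diff below, while the main() driver is purely illustrative.

/*
 * User-space sketch of the grow-only sizing policy: the per-root-cache
 * array starts at MEMCG_CACHES_MIN_SIZE entries, is capped at
 * MEMCG_CACHES_MAX_SIZE, and is sized to twice the number of
 * kmem-limited memcgs so it does not have to be reallocated on every
 * new memcg.
 */
#include <stdio.h>

#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535

static long caches_array_size(int num_groups)
{
	long size;

	if (num_groups <= 0)
		return 0;

	size = 2L * num_groups;		/* leave headroom before the next resize */
	if (size < MEMCG_CACHES_MIN_SIZE)
		size = MEMCG_CACHES_MIN_SIZE;
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;

	return size;
}

int main(void)
{
	/* array size as successive kmem-limited memcgs appear */
	int counts[] = { 1, 3, 5, 9, 100, 70000 };

	for (unsigned int i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
		printf("%6d kmem-limited memcgs -> array of %5ld pointers\n",
		       counts[i], caches_array_size(counts[i]));
	return 0;
}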
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	207
-rw-r--r--	mm/slab_common.c	28
2 files changed, 219 insertions(+), 16 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3eafe6cf6ca4..db38b60e5f87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -378,6 +378,11 @@ static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
 	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 }
 
+static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
+{
+	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
 	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
@@ -549,12 +554,48 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
+/*
+ * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * There are two main reasons for not using the css_id for this:
+ *  1) this works better in sparse environments, where we have a lot of memcgs,
+ *     but only a few kmem-limited. Or also, if we have, for instance, 200
+ *     memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ *     200 entry array for that.
+ *
+ *  2) In order not to violate the cgroup API, we would like to do all memory
+ *     allocation in ->create(). At that point, we haven't yet allocated the
+ *     css_id. Having a separate index prevents us from messing with the cgroup
+ *     core for this
+ *
+ * The current size of the caches array is stored in
+ * memcg_limited_groups_array_size. It will double each time we have to
+ * increase it.
+ */
+static DEFINE_IDA(kmem_limited_groups);
+static int memcg_limited_groups_array_size;
+/*
+ * MIN_SIZE is different than 1, because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be a parameter or
+ * tunable, but that is strictly not necessary.
+ *
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * this constant directly from cgroup, but it is understandable that this is
+ * better kept as an internal representation in cgroup.c. In any case, the
+ * css_id space is not getting any smaller, and we don't have to necessarily
+ * increase ours as well if it increases.
+ */
+#define MEMCG_CACHES_MIN_SIZE 4
+#define MEMCG_CACHES_MAX_SIZE 65535
+
 struct static_key memcg_kmem_enabled_key;
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-	if (memcg_kmem_is_active(memcg))
+	if (memcg_kmem_is_active(memcg)) {
 		static_key_slow_dec(&memcg_kmem_enabled_key);
+		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+	}
 	/*
 	 * This check can't live in kmem destruction function,
 	 * since the charges will outlive the cgroup
@@ -2813,6 +2854,120 @@ int memcg_cache_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
+/*
+ * This ends up being protected by the set_limit mutex, during normal
+ * operation, because that is its main call site.
+ *
+ * But when we create a new cache, we can call this as well if its parent
+ * is kmem-limited. That will have to hold set_limit_mutex as well.
+ */
+int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+{
+	int num, ret;
+
+	num = ida_simple_get(&kmem_limited_groups,
+				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+	if (num < 0)
+		return num;
+	/*
+	 * After this point, kmem_accounted (that we test atomically in
+	 * the beginning of this conditional), is no longer 0. This
+	 * guarantees only one process will set the following boolean
+	 * to true. We don't need test_and_set because we're protected
+	 * by the set_limit_mutex anyway.
+	 */
+	memcg_kmem_set_activated(memcg);
+
+	ret = memcg_update_all_caches(num+1);
+	if (ret) {
+		ida_simple_remove(&kmem_limited_groups, num);
+		memcg_kmem_clear_activated(memcg);
+		return ret;
+	}
+
+	memcg->kmemcg_id = num;
+	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+	mutex_init(&memcg->slab_caches_mutex);
+	return 0;
+}
+
+static size_t memcg_caches_array_size(int num_groups)
+{
+	ssize_t size;
+	if (num_groups <= 0)
+		return 0;
+
+	size = 2 * num_groups;
+	if (size < MEMCG_CACHES_MIN_SIZE)
+		size = MEMCG_CACHES_MIN_SIZE;
+	else if (size > MEMCG_CACHES_MAX_SIZE)
+		size = MEMCG_CACHES_MAX_SIZE;
+
+	return size;
+}
+
+/*
+ * We should update the current array size iff all caches updates succeed. This
+ * can only be done from the slab side. The slab mutex needs to be held when
+ * calling this.
+ */
+void memcg_update_array_size(int num)
+{
+	if (num > memcg_limited_groups_array_size)
+		memcg_limited_groups_array_size = memcg_caches_array_size(num);
+}
+
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
+{
+	struct memcg_cache_params *cur_params = s->memcg_params;
+
+	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+
+	if (num_groups > memcg_limited_groups_array_size) {
+		int i;
+		ssize_t size = memcg_caches_array_size(num_groups);
+
+		size *= sizeof(void *);
+		size += sizeof(struct memcg_cache_params);
+
+		s->memcg_params = kzalloc(size, GFP_KERNEL);
+		if (!s->memcg_params) {
+			s->memcg_params = cur_params;
+			return -ENOMEM;
+		}
+
+		s->memcg_params->is_root_cache = true;
+
+		/*
+		 * There is the chance it will be bigger than
+		 * memcg_limited_groups_array_size, if we failed an allocation
+		 * in a cache, in which case all caches updated before it, will
+		 * have a bigger array.
+		 *
+		 * But if that is the case, the data after
+		 * memcg_limited_groups_array_size is certainly unused
+		 */
+		for (i = 0; i < memcg_limited_groups_array_size; i++) {
+			if (!cur_params->memcg_caches[i])
+				continue;
+			s->memcg_params->memcg_caches[i] =
+						cur_params->memcg_caches[i];
+		}
+
+		/*
+		 * Ideally, we would wait until all caches succeed, and only
+		 * then free the old one. But this is not worth the extra
+		 * pointer per-cache we'd have to have for this.
+		 *
+		 * It is not a big deal if some caches are left with a size
+		 * bigger than the others. And all updates will reset this
+		 * anyway.
+		 */
+		kfree(cur_params);
+	}
+	return 0;
+}
+
 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 {
 	size_t size = sizeof(struct memcg_cache_params);
@@ -2820,6 +2975,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 	if (!memcg_kmem_enabled())
 		return 0;
 
+	if (!memcg)
+		size += memcg_limited_groups_array_size * sizeof(void *);
+
 	s->memcg_params = kzalloc(size, GFP_KERNEL);
 	if (!s->memcg_params)
 		return -ENOMEM;
@@ -4326,14 +4484,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 	ret = res_counter_set_limit(&memcg->kmem, val);
 	VM_BUG_ON(ret);
 
-	/*
-	 * After this point, kmem_accounted (that we test atomically in
-	 * the beginning of this conditional), is no longer 0. This
-	 * guarantees only one process will set the following boolean
-	 * to true. We don't need test_and_set because we're protected
-	 * by the set_limit_mutex anyway.
-	 */
-	memcg_kmem_set_activated(memcg);
+	ret = memcg_update_cache_sizes(memcg);
+	if (ret) {
+		res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+		goto out;
+	}
 	must_inc_static_branch = true;
 	/*
 	 * kmem charges can outlive the cgroup. In the case of slab
@@ -4372,11 +4527,13 @@ out:
 	return ret;
 }
 
-static void memcg_propagate_kmem(struct mem_cgroup *memcg)
+static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
+	int ret = 0;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 	if (!parent)
-		return;
+		goto out;
+
 	memcg->kmem_account_flags = parent->kmem_account_flags;
 #ifdef CONFIG_MEMCG_KMEM
 	/*
@@ -4389,11 +4546,24 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg)
 	 * It is a lot simpler just to do static_key_slow_inc() on every child
 	 * that is accounted.
 	 */
-	if (memcg_kmem_is_active(memcg)) {
-		mem_cgroup_get(memcg);
-		static_key_slow_inc(&memcg_kmem_enabled_key);
-	}
+	if (!memcg_kmem_is_active(memcg))
+		goto out;
+
+	/*
+	 * destroy(), called if we fail, will issue static_key_slow_inc() and
+	 * mem_cgroup_put() if kmem is enabled. We have to either call them
+	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
+	 * this more consistent, since it always leads to the same destroy path
+	 */
+	mem_cgroup_get(memcg);
+	static_key_slow_inc(&memcg_kmem_enabled_key);
+
+	mutex_lock(&set_limit_mutex);
+	ret = memcg_update_cache_sizes(memcg);
+	mutex_unlock(&set_limit_mutex);
 #endif
+out:
+	return ret;
 }
 
 /*
@@ -5075,8 +5245,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
+	int ret;
+
 	memcg->kmemcg_id = -1;
-	memcg_propagate_kmem(memcg);
+	ret = memcg_propagate_kmem(memcg);
+	if (ret)
+		return ret;
 
 	return mem_cgroup_sockets_init(memcg, ss);
 };
@@ -5479,6 +5653,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 		res_counter_init(&memcg->res, &parent->res);
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		res_counter_init(&memcg->kmem, &parent->kmem);
+
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3031badcc577..1c424b6511bf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -81,6 +81,34 @@ static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
 }
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_update_all_caches(int num_memcgs)
+{
+	struct kmem_cache *s;
+	int ret = 0;
+	mutex_lock(&slab_mutex);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!is_root_cache(s))
+			continue;
+
+		ret = memcg_update_cache_size(s, num_memcgs);
+		/*
+		 * See comment in memcontrol.c, memcg_update_cache_size:
+		 * Instead of freeing the memory, we'll just leave the caches
+		 * up to this point in an updated state.
+		 */
+		if (ret)
+			goto out;
+	}
+
+	memcg_update_array_size(num_memcgs);
+out:
+	mutex_unlock(&slab_mutex);
+	return ret;
+}
+#endif
+
 /*
  * Figure out what the alignment of the objects will be given a set of
  * flags, a user specified alignment and the size of the objects.
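As a closing illustration of what the per-cache update amounts to, the sketch below is a simplified user-space analogue of the resize done by memcg_update_cache_size() for one root cache: allocate a larger zero-filled pointer array, copy the slots that are already populated, and free the old block. The struct layout, the per-cache nr_slots field, and the absence of slab_mutex/set_limit_mutex locking are simplifications for this sketch; in the kernel the array is embedded at the end of struct memcg_cache_params and the size is tracked globally in memcg_limited_groups_array_size.

/*
 * User-space analogue of the grow-only, copy-on-grow pointer array kept
 * per root cache. Not kernel code; names here are illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct root_cache {
	size_t nr_slots;	/* plays the role of memcg_limited_groups_array_size */
	void **memcg_caches;	/* one slot per kmem-limited memcg */
};

/* Grow the array to new_slots entries, preserving already-populated slots. */
static int grow_memcg_array(struct root_cache *rc, size_t new_slots)
{
	void **bigger;

	if (new_slots <= rc->nr_slots)
		return 0;			/* the array only ever grows */

	bigger = calloc(new_slots, sizeof(*bigger));
	if (!bigger)
		return -1;			/* keep the old array on failure */

	/* carry over the per-memcg cache pointers created so far */
	if (rc->memcg_caches)
		memcpy(bigger, rc->memcg_caches,
		       rc->nr_slots * sizeof(*bigger));

	free(rc->memcg_caches);
	rc->memcg_caches = bigger;
	rc->nr_slots = new_slots;
	return 0;
}

int main(void)
{
	struct root_cache rc = { 0, NULL };
	static char dummy_cache[] = "kmalloc-64:memcg1";

	grow_memcg_array(&rc, 4);		/* first kmem limit written */
	rc.memcg_caches[1] = dummy_cache;	/* memcg whose kmemcg_id is 1 */
	grow_memcg_array(&rc, 8);		/* more kmem-limited memcgs appeared */

	printf("slots=%zu, slot[1]=%s\n", rc.nr_slots,
	       (char *)rc.memcg_caches[1]);
	free(rc.memcg_caches);
	return 0;
}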