author    Glauber Costa <glommer@parallels.com>    2012-12-18 17:22:40 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2012-12-18 18:02:14 -0500
commit    d7f25f8a2f81252d1ac134470ba1d0a287cf8fcd (patch)
tree      ecde8b5d98762e15a6fa1984d098ddf86646942b /mm
parent    55007d849759252ddd573aeb36143b947202d509 (diff)
memcg: infrastructure to match an allocation to the right cache
The page allocator is able to bind a page to a memcg when it is allocated. But for the caches, we'd like to have as many objects as possible in a page belonging to the same cache.

This is done in this patch by calling memcg_kmem_get_cache at the beginning of every allocation function. This call is patched out by static branches when the kernel memory controller is not in use.

It assumes that the task doing the allocation, which determines the memcg in the page allocator, belongs to the same cgroup throughout the whole process. Misaccounting can happen if the task calls memcg_kmem_get_cache() while belonging to one cgroup and later moves to another. This is considered acceptable, and should only happen upon task migration.

Before the cache is created by the memcg core there is also a possible imbalance: the task belongs to a memcg, but the cache being allocated from is the global cache, since the child cache is not yet guaranteed to be ready. This case is fine as well, because GFP_KMEMCG will not be passed and the page allocator will not attempt any cgroup accounting.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
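The wrapper that the slab allocators actually call is introduced elsewhere in this series and is not part of this diff. As a rough, illustrative sketch of the fast path described above (memcg_kmem_enabled() and __memcg_kmem_get_cache() match symbols this patch exports; the exact set of bail-out checks is an assumption, not the series' definitive code):

/*
 * Illustrative sketch only -- not quoted from the series.  It shows the
 * shape of the hook the commit message describes: a static branch makes
 * the whole check disappear when the kernel memory controller is unused,
 * and contexts that cannot be accounted keep using the root cache.
 */
static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
        /* Patched out (returns false) while no memcg has kmem limits set. */
        if (!memcg_kmem_enabled())
                return cachep;
        /* No task context to charge: interrupts and kernel threads. */
        if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
                return cachep;
        /* Slow path: look up, or schedule creation of, the per-memcg cache. */
        return __memcg_kmem_get_cache(cachep, gfp);
}

An allocation function would then do cachep = memcg_kmem_get_cache(cachep, flags); before handing out an object, which is the "beginning of every allocation function" hook referred to above.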
Diffstat (limited to 'mm')
-rw-r--r--  mm/memcontrol.c  217
1 file changed, 217 insertions(+), 0 deletions(-)
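The hunks below hang a small amount of shared state off each kmem_cache: a memcg_params descriptor that either marks a root cache (and holds its per-memcg children, indexed by memcg_cache_id()) or points a child cache back at its memcg and its root. The real struct lives in the slab headers; the sketch below is a simplified approximation that only lists the fields this diff touches, not the definitive layout.

/*
 * Simplified approximation of the descriptor used in the diff below;
 * field names follow the ones referenced there (is_root_cache, memcg,
 * list, root_cache, memcg_caches[]).  Everything else is omitted.
 */
struct memcg_cache_params {
        bool is_root_cache;
        union {
                /* Root cache: child caches, indexed by memcg_cache_id(). */
                struct kmem_cache *memcg_caches[0];
                /* Child cache: back pointers to its owner and its root. */
                struct {
                        struct mem_cgroup *memcg;
                        struct list_head list;
                        struct kmem_cache *root_cache;
                };
        };
};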
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db38b60e5f87..efd26620a60b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -588,7 +588,14 @@ static int memcg_limited_groups_array_size;
 #define MEMCG_CACHES_MIN_SIZE 4
 #define MEMCG_CACHES_MAX_SIZE 65535
 
+/*
+ * A lot of the calls to the cache allocation functions are expected to be
+ * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * conditional to this static branch, we'll have to allow modules that does
+ * kmem_cache_alloc and the such to see this symbol as well
+ */
 struct static_key memcg_kmem_enabled_key;
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
@@ -2989,9 +2996,219 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 
 void memcg_release_cache(struct kmem_cache *s)
 {
+        struct kmem_cache *root;
+        struct mem_cgroup *memcg;
+        int id;
+
+        /*
+         * This happens, for instance, when a root cache goes away before we
+         * add any memcg.
+         */
+        if (!s->memcg_params)
+                return;
+
+        if (s->memcg_params->is_root_cache)
+                goto out;
+
+        memcg = s->memcg_params->memcg;
+        id = memcg_cache_id(memcg);
+
+        root = s->memcg_params->root_cache;
+        root->memcg_params->memcg_caches[id] = NULL;
+        mem_cgroup_put(memcg);
+
+        mutex_lock(&memcg->slab_caches_mutex);
+        list_del(&s->memcg_params->list);
+        mutex_unlock(&memcg->slab_caches_mutex);
+
+out:
         kfree(s->memcg_params);
 }
 
+static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
+{
+        char *name;
+        struct dentry *dentry;
+
+        rcu_read_lock();
+        dentry = rcu_dereference(memcg->css.cgroup->dentry);
+        rcu_read_unlock();
+
+        BUG_ON(dentry == NULL);
+
+        name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
+                         memcg_cache_id(memcg), dentry->d_name.name);
+
+        return name;
+}
+
+static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
+                                         struct kmem_cache *s)
+{
+        char *name;
+        struct kmem_cache *new;
+
+        name = memcg_cache_name(memcg, s);
+        if (!name)
+                return NULL;
+
+        new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
+                                      (s->flags & ~SLAB_PANIC), s->ctor);
+
+        kfree(name);
+        return new;
+}
+
+/*
+ * This lock protects updaters, not readers. We want readers to be as fast as
+ * they can, and they will either see NULL or a valid cache value. Our model
+ * allow them to see NULL, in which case the root memcg will be selected.
+ *
+ * We need this lock because multiple allocations to the same cache from a non
+ * will span more than one worker. Only one of them can create the cache.
+ */
+static DEFINE_MUTEX(memcg_cache_mutex);
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+                                                  struct kmem_cache *cachep)
+{
+        struct kmem_cache *new_cachep;
+        int idx;
+
+        BUG_ON(!memcg_can_account_kmem(memcg));
+
+        idx = memcg_cache_id(memcg);
+
+        mutex_lock(&memcg_cache_mutex);
+        new_cachep = cachep->memcg_params->memcg_caches[idx];
+        if (new_cachep)
+                goto out;
+
+        new_cachep = kmem_cache_dup(memcg, cachep);
+
+        if (new_cachep == NULL) {
+                new_cachep = cachep;
+                goto out;
+        }
+
+        mem_cgroup_get(memcg);
+        new_cachep->memcg_params->root_cache = cachep;
+
+        cachep->memcg_params->memcg_caches[idx] = new_cachep;
+        /*
+         * the readers won't lock, make sure everybody sees the updated value,
+         * so they won't put stuff in the queue again for no reason
+         */
+        wmb();
+out:
+        mutex_unlock(&memcg_cache_mutex);
+        return new_cachep;
+}
+
+struct create_work {
+        struct mem_cgroup *memcg;
+        struct kmem_cache *cachep;
+        struct work_struct work;
+};
+
+static void memcg_create_cache_work_func(struct work_struct *w)
+{
+        struct create_work *cw;
+
+        cw = container_of(w, struct create_work, work);
+        memcg_create_kmem_cache(cw->memcg, cw->cachep);
+        /* Drop the reference gotten when we enqueued. */
+        css_put(&cw->memcg->css);
+        kfree(cw);
+}
+
+/*
+ * Enqueue the creation of a per-memcg kmem_cache.
+ * Called with rcu_read_lock.
+ */
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+                                       struct kmem_cache *cachep)
+{
+        struct create_work *cw;
+
+        cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+        if (cw == NULL)
+                return;
+
+        /* The corresponding put will be done in the workqueue. */
+        if (!css_tryget(&memcg->css)) {
+                kfree(cw);
+                return;
+        }
+
+        cw->memcg = memcg;
+        cw->cachep = cachep;
+
+        INIT_WORK(&cw->work, memcg_create_cache_work_func);
+        schedule_work(&cw->work);
+}
+
+/*
+ * Return the kmem_cache we're supposed to use for a slab allocation.
+ * We try to use the current memcg's version of the cache.
+ *
+ * If the cache does not exist yet, if we are the first user of it,
+ * we either create it immediately, if possible, or create it asynchronously
+ * in a workqueue.
+ * In the latter case, we will let the current allocation go through with
+ * the original cache.
+ *
+ * Can't be called in interrupt context or from kernel threads.
+ * This function needs to be called with rcu_read_lock() held.
+ */
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
+                                          gfp_t gfp)
+{
+        struct mem_cgroup *memcg;
+        int idx;
+
+        VM_BUG_ON(!cachep->memcg_params);
+        VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+
+        rcu_read_lock();
+        memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+        rcu_read_unlock();
+
+        if (!memcg_can_account_kmem(memcg))
+                return cachep;
+
+        idx = memcg_cache_id(memcg);
+
+        /*
+         * barrier to mare sure we're always seeing the up to date value. The
+         * code updating memcg_caches will issue a write barrier to match this.
+         */
+        read_barrier_depends();
+        if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
+                /*
+                 * If we are in a safe context (can wait, and not in interrupt
+                 * context), we could be be predictable and return right away.
+                 * This would guarantee that the allocation being performed
+                 * already belongs in the new cache.
+                 *
+                 * However, there are some clashes that can arrive from locking.
+                 * For instance, because we acquire the slab_mutex while doing
+                 * kmem_cache_dup, this means no further allocation could happen
+                 * with the slab_mutex held.
+                 *
+                 * Also, because cache creation issue get_online_cpus(), this
+                 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+                 * that ends up reversed during cpu hotplug. (cpuset allocates
+                 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
+                 * better to defer everything.
+                 */
+                memcg_create_cache_enqueue(memcg, cachep);
+                return cachep;
+        }
+
+        return cachep->memcg_params->memcg_caches[idx];
+}
+EXPORT_SYMBOL(__memcg_kmem_get_cache);
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll