author      Vladimir Davydov <vdavydov@parallels.com>       2014-12-12 19:56:38 -0500
committer   Linus Torvalds <torvalds@linux-foundation.org>  2014-12-13 15:42:49 -0500
commit      8135be5a8012f4c7e95218563855e16c09a8271b
tree        49e85409f82f5973a0cbf21e3e3eac382daa515b /mm
parent      ae6e71d3d900c398bdb346ac25733b2efa9b3752
memcg: fix possible use-after-free in memcg_kmem_get_cache()
Suppose a task @t that belongs to a memory cgroup @memcg is going to
allocate an object from a kmem cache @c, and the copy of @c corresponding
to @memcg, call it @mc, is empty. Then, if kmem_cache_alloc races with
the destruction of the memory cgroup, we can access the memory cgroup's
copy of the cache after it has been destroyed:
CPU0                                    CPU1
----                                    ----
[ current=@t
  @mc->memcg_params->nr_pages=0 ]

kmem_cache_alloc(@c):
  call memcg_kmem_get_cache(@c);
  proceed to allocation from @mc:
    alloc a page for @mc:
      ...

                                        move @t from @memcg
                                        destroy @memcg:
                                          mem_cgroup_css_offline(@memcg):
                                            memcg_unregister_all_caches(@memcg):
                                              kmem_cache_destroy(@mc)

    add page to @mc
We could fix this issue by taking a reference to a per-memcg cache, but
that would require adding a per-cpu reference counter to per-memcg caches,
which would look cumbersome.
Instead, let's take a reference to the memory cgroup, which already has a
per-cpu reference counter, at the beginning of kmem_cache_alloc and drop
it at the end, and move per-memcg cache destruction from css offline to
css free. As a side effect, per-memcg caches will be destroyed not one
by one, but all at once, when the last page accounted to the memory cgroup
is freed. That doesn't seem like a high price to pay for code readability,
though.
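
In a simplified sketch (not the literal kernel code: the real hooks are
slab_pre_alloc_hook()/slab_post_alloc_hook() and
__memcg_kmem_get_cache()/__memcg_kmem_put_cache() in the diff below, and
do_slab_alloc() is a hypothetical stand-in for the allocator fast path),
the resulting get/put pairing looks like this:

/*
 * Sketch of the allocation path after this patch.  The css reference
 * pins the memcg, and with it @mc, for the whole allocation, so
 * kmem_cache_destroy(@mc) can no longer run concurrently: cache
 * destruction now happens at css free, after the last put.
 */
void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
{
        void *obj;

        c = memcg_kmem_get_cache(c, flags);     /* css_get(&memcg->css) */
        obj = do_slab_alloc(c, flags);          /* may sleep in alloc_page */
        memcg_kmem_put_cache(c);                /* css_put(&memcg->css) */
        return obj;
}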
Note that this patch does add some overhead to the kmem_cache_alloc hot
path, but it is pretty negligible: just a function call plus a per-cpu
counter decrement, which is comparable to what we already have in
memcg_kmem_get_cache. Besides, it is only relevant if there are memory
cgroups with kmem accounting enabled. I don't think we can find a way to
handle this race without it, because alloc_page called from
kmem_cache_alloc may sleep, so we cannot flush all pending kmallocs
without reference counting.
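
For reference, the per-cpu counter operation mentioned above is the
percpu_ref fast path used by css_get()/css_put(). A purely illustrative
sketch with simplified names (the real implementation lives in
lib/percpu-refcount.c and also handles the atomic mode entered once the
ref is killed):

struct percpu_ref_sketch {
        unsigned long __percpu *count;  /* per-cpu counters, one per CPU */
};

static inline void ref_get(struct percpu_ref_sketch *ref)
{
        this_cpu_inc(*ref->count);      /* no atomic RMW, no shared cache line */
}

static inline void ref_put(struct percpu_ref_sketch *ref)
{
        this_cpu_dec(*ref->count);      /* the "per-cpu counter decrement" above */
}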
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--   mm/memcontrol.c | 51
-rw-r--r--   mm/slab.c       |  2
-rw-r--r--   mm/slub.c       | 14
3 files changed, 27 insertions(+), 40 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dac81b975996..05e1584750ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2635,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
 	if (!cachep)
 		return;
 
-	css_get(&memcg->css);
 	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
 
 	/*
@@ -2669,9 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	list_del(&cachep->memcg_params->list);
 
 	kmem_cache_destroy(cachep);
-
-	/* drop the reference taken in memcg_register_cache */
-	css_put(&memcg->css);
 }
 
 int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2705,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			memcg_unregister_cache(cachep);
+		memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
 }
@@ -2742,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	struct memcg_register_cache_work *cw;
 
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
-	if (cw == NULL) {
-		css_put(&memcg->css);
+	if (!cw)
 		return;
-	}
+
+	css_get(&memcg->css);
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
@@ -2776,12 +2770,8 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 {
 	unsigned int nr_pages = 1 << order;
-	int res;
 
-	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-	if (!res)
-		atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
-	return res;
+	return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2789,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 	unsigned int nr_pages = 1 << order;
 
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-	atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
 }
 
 /*
@@ -2816,22 +2805,13 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (current->memcg_kmem_skip_account)
 		return cachep;
 
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+	memcg = get_mem_cgroup_from_mm(current->mm);
 	if (!memcg_kmem_is_active(memcg))
 		goto out;
 
 	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
-	if (likely(memcg_cachep)) {
-		cachep = memcg_cachep;
-		goto out;
-	}
-
-	/* The corresponding put will be done in the workqueue. */
-	if (!css_tryget_online(&memcg->css))
-		goto out;
-	rcu_read_unlock();
+	if (likely(memcg_cachep))
+		return memcg_cachep;
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2846,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * defer everything.
 	 */
 	memcg_schedule_register_cache(memcg, cachep);
-	return cachep;
 out:
-	rcu_read_unlock();
+	css_put(&memcg->css);
 	return cachep;
 }
 
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (!is_root_cache(cachep))
+		css_put(&cachep->memcg_params->memcg->css);
+}
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
@@ -2914,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	memcg_uncharge_kmem(memcg, 1 << order);
 	page->mem_cgroup = NULL;
 }
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -4188,6 +4169,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
+	memcg_unregister_all_caches(memcg);
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
@@ -4797,7 +4779,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	memcg_unregister_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
 
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 			memset(ptr, 0, cachep->object_size);
 	}
 
+	memcg_kmem_put_cache(cachep);
 	return ptr;
 }
 
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 			memset(objp, 0, cachep->object_size);
 	}
 
+	memcg_kmem_put_cache(cachep);
 	return objp;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
 	kmemleak_free(x);
 }
 
-static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+						     gfp_t flags)
 {
 	flags &= gfp_allowed_mask;
 	lockdep_trace_alloc(flags);
 	might_sleep_if(flags & __GFP_WAIT);
 
-	return should_failslab(s->object_size, flags, s->flags);
+	if (should_failslab(s->object_size, flags, s->flags))
+		return NULL;
+
+	return memcg_kmem_get_cache(s, flags);
 }
 
 static inline void slab_post_alloc_hook(struct kmem_cache *s,
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
 	flags &= gfp_allowed_mask;
 	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
+	memcg_kmem_put_cache(s);
 }
 
 static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -2384,10 +2389,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 	struct page *page;
 	unsigned long tid;
 
-	if (slab_pre_alloc_hook(s, gfpflags))
+	s = slab_pre_alloc_hook(s, gfpflags);
+	if (!s)
 		return NULL;
-
-	s = memcg_kmem_get_cache(s, gfpflags);
 redo:
 	/*
 	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is