author	Vladimir Davydov <vdavydov@parallels.com>	2014-12-12 19:56:38 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-13 15:42:49 -0500
commit	8135be5a8012f4c7e95218563855e16c09a8271b (patch)
tree	49e85409f82f5973a0cbf21e3e3eac382daa515b /mm
parent	ae6e71d3d900c398bdb346ac25733b2efa9b3752 (diff)
memcg: fix possible use-after-free in memcg_kmem_get_cache()
Suppose a task @t that belongs to a memory cgroup @memcg is going to allocate an object from a kmem cache @c, and the copy of @c corresponding to @memcg, @mc, is empty. Then, if kmem_cache_alloc races with the memory cgroup destruction, we can access the memory cgroup's copy of the cache after it was destroyed:

               CPU0                            CPU1
               ----                            ----
[ current=@t  @mc->memcg_params->nr_pages=0 ]

kmem_cache_alloc(@c):
  call memcg_kmem_get_cache(@c);
  proceed to allocation from @mc:
    alloc a page for @mc:
      ...
                                        move @t from @memcg
                                        destroy @memcg:
                                          mem_cgroup_css_offline(@memcg):
                                            memcg_unregister_all_caches(@memcg):
                                              kmem_cache_destroy(@mc)
    add page to @mc

We could fix this issue by taking a reference to a per-memcg cache, but that would require adding a per-cpu reference counter to per-memcg caches, which would look cumbersome. Instead, let's take a reference to the memory cgroup, which already has a per-cpu reference counter, at the beginning of kmem_cache_alloc and drop it at the end, and move per-memcg cache destruction from css offline to css free. As a side effect, per-memcg caches will be destroyed not one by one, but all at once when the last page accounted to the memory cgroup is freed. This doesn't sound like a high price to pay for code readability, though.

Note, this patch does add some overhead to the kmem_cache_alloc hot path, but it is pretty negligible - just a function call plus a per-cpu counter decrement, which is comparable to what we already have in memcg_kmem_get_cache. Besides, it is only relevant if there are memory cgroups with kmem accounting enabled. I don't think we can find a way to handle this race without it, because alloc_page called from kmem_cache_alloc may sleep, so we cannot flush all pending kmallocs without reference counting.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
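In essence, the patch turns every kmem allocation into a get/put pair on the memory cgroup's css reference counter: memcg_kmem_get_cache() pins the cgroup (now via get_mem_cgroup_from_mm()), and the new memcg_kmem_put_cache() unpins it once the object has been allocated, so cache destruction is deferred until the last in-flight allocation has finished. The following is only a minimal userspace sketch of that lifetime rule; the names cgroup_ref, cgroup_get, cgroup_put and cache_alloc_traced are made up for illustration and are not part of the kernel API.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for a memory cgroup with a reference counter
 * (the kernel uses a per-cpu counter inside struct cgroup_subsys_state). */
struct cgroup_ref {
	atomic_int refcount;
};

/* Rough analogue of css_get(): pin the cgroup so it cannot be freed. */
static void cgroup_get(struct cgroup_ref *cg)
{
	atomic_fetch_add(&cg->refcount, 1);
}

/* Rough analogue of css_put(): drop the pin; the last put frees the object
 * (in the kernel, this is where the per-memcg caches are now destroyed). */
static void cgroup_put(struct cgroup_ref *cg)
{
	if (atomic_fetch_sub(&cg->refcount, 1) == 1) {
		printf("last reference gone, destroying per-memcg caches\n");
		free(cg);
	}
}

/* Model of the patched allocation path: take a reference before touching the
 * per-memcg cache and drop it only after the allocation has finished, even if
 * the allocation itself sleeps. */
static void *cache_alloc_traced(struct cgroup_ref *cg, size_t size)
{
	void *obj;

	cgroup_get(cg);           /* memcg_kmem_get_cache() */
	obj = malloc(size);       /* may block; the cgroup stays alive */
	cgroup_put(cg);           /* memcg_kmem_put_cache() */
	return obj;
}

int main(void)
{
	struct cgroup_ref *cg = malloc(sizeof(*cg));

	atomic_init(&cg->refcount, 1);    /* reference held by the css itself */
	free(cache_alloc_traced(cg, 64));
	cgroup_put(cg);                   /* offline/free path drops the last ref */
	return 0;
}

With this ordering, concurrent destruction (the CPU1 column above) can only run after every in-flight allocation has dropped its reference.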
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	51
-rw-r--r--	mm/slab.c	2
-rw-r--r--	mm/slub.c	14
3 files changed, 27 insertions, 40 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dac81b975996..05e1584750ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2635,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
 	if (!cachep)
 		return;
 
-	css_get(&memcg->css);
 	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
 
 	/*
@@ -2669,9 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	list_del(&cachep->memcg_params->list);
 
 	kmem_cache_destroy(cachep);
-
-	/* drop the reference taken in memcg_register_cache */
-	css_put(&memcg->css);
 }
 
 int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2705,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			memcg_unregister_cache(cachep);
+		memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
 }
@@ -2742,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	struct memcg_register_cache_work *cw;
 
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
-	if (cw == NULL) {
-		css_put(&memcg->css);
+	if (!cw)
 		return;
-	}
+
+	css_get(&memcg->css);
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
@@ -2776,12 +2770,8 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 {
 	unsigned int nr_pages = 1 << order;
-	int res;
 
-	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-	if (!res)
-		atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
-	return res;
+	return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2789,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 	unsigned int nr_pages = 1 << order;
 
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-	atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
 }
 
 /*
@@ -2816,22 +2805,13 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (current->memcg_kmem_skip_account)
 		return cachep;
 
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+	memcg = get_mem_cgroup_from_mm(current->mm);
 	if (!memcg_kmem_is_active(memcg))
 		goto out;
 
 	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
-	if (likely(memcg_cachep)) {
-		cachep = memcg_cachep;
-		goto out;
-	}
-
-	/* The corresponding put will be done in the workqueue. */
-	if (!css_tryget_online(&memcg->css))
-		goto out;
-	rcu_read_unlock();
+	if (likely(memcg_cachep))
+		return memcg_cachep;
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2846,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * defer everything.
 	 */
 	memcg_schedule_register_cache(memcg, cachep);
-	return cachep;
 out:
-	rcu_read_unlock();
+	css_put(&memcg->css);
 	return cachep;
 }
 
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (!is_root_cache(cachep))
+		css_put(&cachep->memcg_params->memcg->css);
+}
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
@@ -2914,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	memcg_uncharge_kmem(memcg, 1 << order);
 	page->mem_cgroup = NULL;
 }
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -4188,6 +4169,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
+	memcg_unregister_all_caches(memcg);
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
@@ -4797,7 +4779,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	memcg_unregister_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index fee275b5b6b7..6042fe57cc60 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 		memset(ptr, 0, cachep->object_size);
 	}
 
+	memcg_kmem_put_cache(cachep);
 	return ptr;
 }
 
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 		memset(objp, 0, cachep->object_size);
 	}
 
+	memcg_kmem_put_cache(cachep);
 	return objp;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
index 765c5884d03d..fe4db9c17238 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
 	kmemleak_free(x);
 }
 
-static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+						     gfp_t flags)
 {
 	flags &= gfp_allowed_mask;
 	lockdep_trace_alloc(flags);
 	might_sleep_if(flags & __GFP_WAIT);
 
-	return should_failslab(s->object_size, flags, s->flags);
+	if (should_failslab(s->object_size, flags, s->flags))
+		return NULL;
+
+	return memcg_kmem_get_cache(s, flags);
 }
 
 static inline void slab_post_alloc_hook(struct kmem_cache *s,
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
 	flags &= gfp_allowed_mask;
 	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
+	memcg_kmem_put_cache(s);
 }
 
 static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -2384,10 +2389,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 	struct page *page;
 	unsigned long tid;
 
-	if (slab_pre_alloc_hook(s, gfpflags))
+	s = slab_pre_alloc_hook(s, gfpflags);
+	if (!s)
 		return NULL;
-
-	s = memcg_kmem_get_cache(s, gfpflags);
 redo:
 	/*
 	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is