author      Vladimir Davydov <vdavydov@parallels.com>       2014-12-12 19:56:38 -0500
committer   Linus Torvalds <torvalds@linux-foundation.org>  2014-12-13 15:42:49 -0500
commit      8135be5a8012f4c7e95218563855e16c09a8271b
tree        49e85409f82f5973a0cbf21e3e3eac382daa515b /mm
parent      ae6e71d3d900c398bdb346ac25733b2efa9b3752
memcg: fix possible use-after-free in memcg_kmem_get_cache()
Suppose a task @t that belongs to a memory cgroup @memcg is going to
allocate an object from a kmem cache @c, and the copy of @c corresponding
to @memcg, call it @mc, is empty. Then, if kmem_cache_alloc races with
the destruction of the memory cgroup, we can access the memory cgroup's
copy of the cache after it has been destroyed:
CPU0                                    CPU1
----                                    ----
[ current=@t
  @mc->memcg_params->nr_pages=0 ]

kmem_cache_alloc(@c):
  call memcg_kmem_get_cache(@c);
  proceed to allocation from @mc:
    alloc a page for @mc:
      ...

                                        move @t from @memcg
                                        destroy @memcg:
                                          mem_cgroup_css_offline(@memcg):
                                            memcg_unregister_all_caches(@memcg):
                                              kmem_cache_destroy(@mc)

    add page to @mc
We could fix this issue by taking a reference to a per-memcg cache, but
that would require adding a per-cpu reference counter to per-memcg caches,
which would look cumbersome.
Instead, let's take a reference to the memory cgroup, which already has a
per-cpu reference counter, at the beginning of kmem_cache_alloc and drop
it at the end, and move per-memcg cache destruction from css offline to
css free. As a side effect, per-memcg caches will be destroyed not one
by one, but all at once, when the last page accounted to the memory cgroup
is freed. That doesn't seem like a high price to pay for code readability,
though.
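
In a simplified sketch (not the literal kernel code: the real hooks are
slab_pre_alloc_hook()/slab_post_alloc_hook() and
__memcg_kmem_get_cache()/__memcg_kmem_put_cache() in the diff below, and
do_slab_alloc() is a hypothetical stand-in for the allocator fast path),
the resulting get/put pairing looks like this:

/*
 * Sketch of the allocation path after this patch.  The css reference
 * pins the memcg, and with it @mc, for the whole allocation, so
 * kmem_cache_destroy(@mc) can no longer run concurrently: cache
 * destruction now happens at css free, after the last put.
 */
void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
{
        void *obj;

        c = memcg_kmem_get_cache(c, flags);     /* css_get(&memcg->css) */
        obj = do_slab_alloc(c, flags);          /* may sleep in alloc_page */
        memcg_kmem_put_cache(c);                /* css_put(&memcg->css) */
        return obj;
}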
Note that this patch does add some overhead to the kmem_cache_alloc hot
path, but it is pretty negligible: just a function call plus a per-cpu
counter decrement, which is comparable to what we already have in
memcg_kmem_get_cache. Besides, it is only relevant if there are memory
cgroups with kmem accounting enabled. I don't think we can find a way to
handle this race without it, because alloc_page called from
kmem_cache_alloc may sleep, so we cannot flush all pending kmallocs
without reference counting.
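
For reference, the per-cpu counter operation mentioned above is the
percpu_ref fast path used by css_get()/css_put(). A purely illustrative
sketch with simplified names (the real implementation lives in
lib/percpu-refcount.c and also handles the atomic mode entered once the
ref is killed):

struct percpu_ref_sketch {
        unsigned long __percpu *count;  /* per-cpu counters, one per CPU */
};

static inline void ref_get(struct percpu_ref_sketch *ref)
{
        this_cpu_inc(*ref->count);      /* no atomic RMW, no shared cache line */
}

static inline void ref_put(struct percpu_ref_sketch *ref)
{
        this_cpu_dec(*ref->count);      /* the "per-cpu counter decrement" above */
}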
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--   mm/memcontrol.c | 51
-rw-r--r--   mm/slab.c       |  2
-rw-r--r--   mm/slub.c       | 14
3 files changed, 27 insertions(+), 40 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dac81b975996..05e1584750ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2635,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
 	if (!cachep)
 		return;
 
-	css_get(&memcg->css);
 	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
 
 	/*
@@ -2669,9 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	list_del(&cachep->memcg_params->list);
 
 	kmem_cache_destroy(cachep);
-
-	/* drop the reference taken in memcg_register_cache */
-	css_put(&memcg->css);
 }
 
 int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2705,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			memcg_unregister_cache(cachep);
+		memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
 }
@@ -2742,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	struct memcg_register_cache_work *cw;
 
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
-	if (cw == NULL) {
-		css_put(&memcg->css);
+	if (!cw)
 		return;
-	}
+
+	css_get(&memcg->css);
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
@@ -2776,12 +2770,8 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 {
 	unsigned int nr_pages = 1 << order;
-	int res;
 
-	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-	if (!res)
-		atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
-	return res;
+	return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2789,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 	unsigned int nr_pages = 1 << order;
 
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-	atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
 }
 
 /*
@@ -2816,22 +2805,13 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (current->memcg_kmem_skip_account)
 		return cachep;
 
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+	memcg = get_mem_cgroup_from_mm(current->mm);
 	if (!memcg_kmem_is_active(memcg))
 		goto out;
 
 	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
-	if (likely(memcg_cachep)) {
-		cachep = memcg_cachep;
-		goto out;
-	}
-
-	/* The corresponding put will be done in the workqueue. */
-	if (!css_tryget_online(&memcg->css))
-		goto out;
-	rcu_read_unlock();
+	if (likely(memcg_cachep))
+		return memcg_cachep;
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2846,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * defer everything.
 	 */
 	memcg_schedule_register_cache(memcg, cachep);
-	return cachep;
 out:
-	rcu_read_unlock();
+	css_put(&memcg->css);
 	return cachep;
 }
 
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (!is_root_cache(cachep))
+		css_put(&cachep->memcg_params->memcg->css);
+}
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
@@ -2914,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	memcg_uncharge_kmem(memcg, 1 << order);
 	page->mem_cgroup = NULL;
 }
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -4188,6 +4169,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
+	memcg_unregister_all_caches(memcg);
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
@@ -4797,7 +4779,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	memcg_unregister_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
 
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 			memset(ptr, 0, cachep->object_size);
 	}
 
+	memcg_kmem_put_cache(cachep);
 	return ptr;
 }
 
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 			memset(objp, 0, cachep->object_size);
 	}
 
+	memcg_kmem_put_cache(cachep);
 	return objp;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
 	kmemleak_free(x);
 }
 
-static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+						     gfp_t flags)
 {
 	flags &= gfp_allowed_mask;
 	lockdep_trace_alloc(flags);
 	might_sleep_if(flags & __GFP_WAIT);
 
-	return should_failslab(s->object_size, flags, s->flags);
+	if (should_failslab(s->object_size, flags, s->flags))
+		return NULL;
+
+	return memcg_kmem_get_cache(s, flags);
 }
 
 static inline void slab_post_alloc_hook(struct kmem_cache *s,
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
 	flags &= gfp_allowed_mask;
 	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
+	memcg_kmem_put_cache(s);
 }
 
 static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -2384,10 +2389,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 	struct page *page;
 	unsigned long tid;
 
-	if (slab_pre_alloc_hook(s, gfpflags))
+	s = slab_pre_alloc_hook(s, gfpflags);
+	if (!s)
 		return NULL;
-
-	s = memcg_kmem_get_cache(s, gfpflags);
 redo:
 	/*
 	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is