author	Roman Gushchin <guro@fb.com>	2019-07-11 23:56:27 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-07-12 14:05:44 -0400
commit	f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 (patch)
tree	254f501899a5e542a84043674dfe3df9dccf0cb4
parent	63b02ef7dc4ec239df45c018ac0adbd02ba30a0c (diff)
mm: memcg/slab: rework non-root kmem_cache lifecycle management
Currently each charged slab page holds a reference to the cgroup to which
it's charged. Kmem_caches are held by the memcg and are released all
together with the memory cgroup. It means that none of kmem_caches are
released unless at least one reference to the memcg exists, which is very
far from optimal.

Let's rework it in a way that allows releasing individual kmem_caches as
soon as the cgroup is offline, the kmem_cache is empty and there are no
pending allocations.

To make it possible, let's introduce a new percpu refcounter for non-root
kmem caches. The counter is initialized to the percpu mode, and is switched
to the atomic mode during kmem_cache deactivation. The counter is bumped
for every charged page and also for every running allocation. So the
kmem_cache can't be released unless all allocations complete.

To shutdown non-active empty kmem_caches, let's reuse the work queue,
previously used for the kmem_cache deactivation. Once the reference counter
reaches 0, let's schedule an asynchronous kmem_cache release.

* I used the following simple approach to test the performance
  (stolen from another patchset by T. Harding):

    time find / -name fname-no-exist
    echo 2 > /proc/sys/vm/drop_caches
    repeat 10 times

Results:

        orig                     patched

real    0m1.455s        real    0m1.355s
user    0m0.206s        user    0m0.219s
sys     0m0.855s        sys     0m0.807s

real    0m1.487s        real    0m1.699s
user    0m0.221s        user    0m0.256s
sys     0m0.806s        sys     0m0.948s

real    0m1.515s        real    0m1.505s
user    0m0.183s        user    0m0.215s
sys     0m0.876s        sys     0m0.858s

real    0m1.291s        real    0m1.380s
user    0m0.193s        user    0m0.198s
sys     0m0.843s        sys     0m0.786s

real    0m1.364s        real    0m1.374s
user    0m0.180s        user    0m0.182s
sys     0m0.868s        sys     0m0.806s

real    0m1.352s        real    0m1.312s
user    0m0.201s        user    0m0.212s
sys     0m0.820s        sys     0m0.761s

real    0m1.302s        real    0m1.349s
user    0m0.205s        user    0m0.203s
sys     0m0.803s        sys     0m0.792s

real    0m1.334s        real    0m1.301s
user    0m0.194s        user    0m0.201s
sys     0m0.806s        sys     0m0.779s

real    0m1.426s        real    0m1.434s
user    0m0.216s        user    0m0.181s
sys     0m0.824s        sys     0m0.864s

real    0m1.350s        real    0m1.295s
user    0m0.200s        user    0m0.190s
sys     0m0.842s        sys     0m0.811s

So it looks like the difference is not noticeable in this test.

[cai@lca.pw: fix an use-after-free in kmemcg_workfn()]
  Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
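For readers less familiar with the percpu_ref API, the scheme above boils
down to the self-contained sketch below. It is illustrative only and not
code from this patch: the example_cache structure and the example_* helpers
are invented, while percpu_ref_init(), percpu_ref_get_many()/
percpu_ref_put_many() and percpu_ref_tryget()/percpu_ref_put() are the real
kernel primitives the patch uses.

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>

/* Hypothetical stand-in for the memcg bits of a non-root kmem_cache. */
struct example_cache {
	struct percpu_ref refcnt;
};

/* Called once the last reference is dropped; the real patch schedules the
 * asynchronous kmem_cache shutdown from here (see mm/slab_common.c below). */
static void example_refcnt_release(struct percpu_ref *ref)
{
	struct example_cache *c = container_of(ref, struct example_cache, refcnt);

	(void)c;	/* shutdown work would be queued here */
}

/* Cache creation: the counter starts in percpu mode, so the hot-path
 * get/put operations below stay cheap. */
static int example_cache_init(struct example_cache *c)
{
	return percpu_ref_init(&c->refcnt, example_refcnt_release, 0, GFP_KERNEL);
}

/* Every charged slab page pins the cache with 1 << order references ... */
static void example_charge_page(struct example_cache *c, int order)
{
	percpu_ref_get_many(&c->refcnt, 1 << order);
}

/* ... which are dropped again when the page is uncharged. */
static void example_uncharge_page(struct example_cache *c, int order)
{
	percpu_ref_put_many(&c->refcnt, 1 << order);
}

/* A running allocation also pins the cache for its duration. */
static bool example_get_for_alloc(struct example_cache *c)
{
	return percpu_ref_tryget(&c->refcnt);
}

static void example_put_after_alloc(struct example_cache *c)
{
	percpu_ref_put(&c->refcnt);
}

The patch itself embeds the counter in struct memcg_cache_params (see the
include/linux/slab.h hunk below) and defers the actual shutdown work to
memcg_kmem_cache_wq.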
-rw-r--r--	include/linux/slab.h	3
-rw-r--r--	mm/memcontrol.c	50
-rw-r--r--	mm/slab.h	44
-rw-r--r--	mm/slab_common.c	78
4 files changed, 96 insertions, 79 deletions
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6008d884e621..bc189a43e680 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -16,6 +16,7 @@
 #include <linux/overflow.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/percpu-refcount.h>
 
 
 /*
@@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *);
 
 void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
 void memcg_deactivate_kmem_caches(struct mem_cgroup *);
-void memcg_destroy_kmem_caches(struct mem_cgroup *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -642,6 +642,7 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head children_node;
 			struct list_head kmem_caches_node;
+			struct percpu_ref refcnt;
 
 			void (*work_fn)(struct kmem_cache *);
 			union {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 25e35a8b8ba2..ce4ce5e7937b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2667,12 +2667,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
 	struct memcg_kmem_cache_create_work *cw;
 
+	if (!css_tryget_online(&memcg->css))
+		return;
+
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
 	if (!cw)
 		return;
 
-	css_get(&memcg->css);
-
 	cw->memcg = memcg;
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2707,6 +2708,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
+	struct memcg_cache_array *arr;
 	int kmemcg_id;
 
 	VM_BUG_ON(!is_root_cache(cachep));
@@ -2714,14 +2716,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_bypass())
 		return cachep;
 
-	memcg = get_mem_cgroup_from_current();
+	rcu_read_lock();
+
+	if (unlikely(current->active_memcg))
+		memcg = current->active_memcg;
+	else
+		memcg = mem_cgroup_from_task(current);
+
+	if (!memcg || memcg == root_mem_cgroup)
+		goto out_unlock;
+
 	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
 	if (kmemcg_id < 0)
-		goto out;
+		goto out_unlock;
+
+	arr = rcu_dereference(cachep->memcg_params.memcg_caches);
 
-	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
-	if (likely(memcg_cachep))
-		return memcg_cachep;
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match the data dependency
+	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+	 */
+	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2734,10 +2750,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * memcg_create_kmem_cache, this means no further allocation
 	 * could happen with the slab_mutex held. So it's better to
 	 * defer everything.
+	 *
+	 * If the memcg is dying or memcg_cache is about to be released,
+	 * don't bother creating new kmem_caches. Because memcg_cachep
+	 * is ZEROed as the first step of kmem offlining, we don't need
+	 * percpu_ref_tryget_live() here. css_tryget_online() check in
+	 * memcg_schedule_kmem_cache_create() will prevent us from
+	 * creation of a new kmem_cache.
 	 */
-	memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
-	css_put(&memcg->css);
+	if (unlikely(!memcg_cachep))
+		memcg_schedule_kmem_cache_create(memcg, cachep);
+	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+		cachep = memcg_cachep;
+out_unlock:
+	rcu_read_unlock();
 	return cachep;
 }
 
@@ -2748,7 +2774,7 @@ out:
 void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 	if (!is_root_cache(cachep))
-		css_put(&cachep->memcg_params.memcg->css);
+		percpu_ref_put(&cachep->memcg_params.refcnt);
 }
 
 /**
@@ -3295,7 +3321,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 	memcg_offline_kmem(memcg);
 
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
-		memcg_destroy_kmem_caches(memcg);
+		WARN_ON(!list_empty(&memcg->kmem_caches));
 		static_branch_dec(&memcg_kmem_enabled_key);
 		WARN_ON(page_counter_read(&memcg->kmem));
 	}
diff --git a/mm/slab.h b/mm/slab.h
index 46623a576a3c..5d2b8511e6fb 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -248,31 +248,6 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-/*
- * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away by either
- * taking a css reference to the owner cgroup, or holding the slab_mutex.
- */
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-	struct kmem_cache *cachep;
-	struct memcg_cache_array *arr;
-
-	rcu_read_lock();
-	arr = rcu_dereference(s->memcg_params.memcg_caches);
-
-	/*
-	 * Make sure we will access the up-to-date value. The code updating
-	 * memcg_caches issues a write barrier to match this (see
-	 * memcg_create_kmem_cache()).
-	 */
-	cachep = READ_ONCE(arr->entries[idx]);
-	rcu_read_unlock();
-
-	return cachep;
-}
-
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
@@ -284,14 +259,25 @@ static __always_inline int memcg_charge_slab(struct page *page,
 						    gfp_t gfp, int order,
 						    struct kmem_cache *s)
 {
+	int ret;
+
 	if (is_root_cache(s))
 		return 0;
-	return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+
+	ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+	if (ret)
+		return ret;
+
+	percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+
+	return 0;
 }
 
 static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 						struct kmem_cache *s)
 {
+	if (!is_root_cache(s))
+		percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
 	memcg_kmem_uncharge(page, order);
 }
 
@@ -323,12 +309,6 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-	return NULL;
-}
-
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	return s;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a15557776d7d..ee3971f7fabc 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -132,6 +132,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 LIST_HEAD(slab_root_caches);
 static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
 
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
+
 void slab_init_memcg_params(struct kmem_cache *s)
 {
 	s->memcg_params.root_cache = NULL;
@@ -146,6 +148,12 @@ static int init_memcg_params(struct kmem_cache *s,
 	struct memcg_cache_array *arr;
 
 	if (root_cache) {
+		int ret = percpu_ref_init(&s->memcg_params.refcnt,
+					  kmemcg_cache_shutdown,
+					  0, GFP_KERNEL);
+		if (ret)
+			return ret;
+
 		s->memcg_params.root_cache = root_cache;
 		INIT_LIST_HEAD(&s->memcg_params.children_node);
 		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
@@ -171,6 +179,8 @@ static void destroy_memcg_params(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
 		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+	else
+		percpu_ref_exit(&s->memcg_params.refcnt);
 }
 
 static void free_memcg_params(struct rcu_head *rcu)
@@ -226,6 +236,7 @@ void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
 	if (is_root_cache(s)) {
 		list_add(&s->root_caches_node, &slab_root_caches);
 	} else {
+		css_get(&memcg->css);
 		s->memcg_params.memcg = memcg;
 		list_add(&s->memcg_params.children_node,
 			 &s->memcg_params.root_cache->memcg_params.children);
@@ -241,6 +252,7 @@ static void memcg_unlink_cache(struct kmem_cache *s)
 	} else {
 		list_del(&s->memcg_params.children_node);
 		list_del(&s->memcg_params.kmem_caches_node);
+		css_put(&s->memcg_params.memcg->css);
 	}
 }
 #else
@@ -678,7 +690,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 	}
 
 	/*
-	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * Since readers won't lock (see memcg_kmem_get_cache()), we need a
 	 * barrier here to ensure nobody will see the kmem_cache partially
 	 * initialized.
 	 */
@@ -701,16 +713,11 @@ static void kmemcg_workfn(struct work_struct *work)
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
-
 	s->memcg_params.work_fn(s);
-
 	mutex_unlock(&slab_mutex);
 
 	put_online_mems();
 	put_online_cpus();
-
-	/* done, put the ref from kmemcg_cache_deactivate() */
-	css_put(&s->memcg_params.memcg->css);
 }
 
 static void kmemcg_rcufn(struct rcu_head *head)
@@ -727,10 +734,38 @@ static void kmemcg_rcufn(struct rcu_head *head)
 	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
 }
 
+static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
+{
+	WARN_ON(shutdown_cache(s));
+}
+
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
+{
+	struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
+					    memcg_params.refcnt);
+	unsigned long flags;
+
+	spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
+	if (s->memcg_params.root_cache->memcg_params.dying)
+		goto unlock;
+
+	s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
+	INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
+	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
+
+unlock:
+	spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
+}
+
+static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
+{
+	__kmemcg_cache_deactivate_after_rcu(s);
+	percpu_ref_kill(&s->memcg_params.refcnt);
+}
+
 static void kmemcg_cache_deactivate(struct kmem_cache *s)
 {
-	if (WARN_ON_ONCE(is_root_cache(s)) ||
-	    WARN_ON_ONCE(s->memcg_params.work_fn))
+	if (WARN_ON_ONCE(is_root_cache(s)))
 		return;
 
 	__kmemcg_cache_deactivate(s);
@@ -744,10 +779,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s)
 	if (s->memcg_params.root_cache->memcg_params.dying)
 		goto unlock;
 
-	/* pin memcg so that @s doesn't get destroyed in the middle */
-	css_get(&s->memcg_params.memcg->css);
-
-	s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu;
+	s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
 	call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
 unlock:
 	spin_unlock_irq(&memcg_kmem_wq_lock);
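Taken together, the hunks above implement the deactivation chain:
kmemcg_cache_deactivate() -> call_rcu() -> kmemcg_rcufn() queues
kmemcg_workfn() -> kmemcg_cache_deactivate_after_rcu() kills the percpu ref
-> once the last reference is dropped, kmemcg_cache_shutdown() schedules
kmemcg_cache_shutdown_fn(). Below is a minimal, self-contained sketch of
that ordering. It is illustrative only: the example_* names are invented,
and it uses two separate work items where the patch multiplexes a single
work_struct through ->work_fn.

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

/* refcnt is assumed to have been set up at cache creation with
 * percpu_ref_init(&c->refcnt, example_refcnt_release, 0, GFP_KERNEL). */
struct example_cache {
	struct percpu_ref refcnt;
	struct rcu_head rcu_head;
	struct work_struct kill_work;
	struct work_struct shutdown_work;
};

/* Step 5: actual cache destruction, in process context. */
static void example_shutdown_workfn(struct work_struct *work)
{
	/* shutdown_cache()-style cleanup would go here */
}

/* Step 4: the last page/allocation reference is gone.  This may fire in
 * atomic context, so the real shutdown is pushed to a workqueue, exactly
 * as kmemcg_cache_shutdown() does above. */
static void example_refcnt_release(struct percpu_ref *ref)
{
	struct example_cache *c = container_of(ref, struct example_cache, refcnt);

	queue_work(system_wq, &c->shutdown_work);
}

/* Step 3: in process context, switch the counter to atomic mode and drop
 * the initial reference; charged pages and in-flight allocations keep the
 * cache alive until they are done. */
static void example_kill_workfn(struct work_struct *work)
{
	struct example_cache *c = container_of(work, struct example_cache, kill_work);

	percpu_ref_kill(&c->refcnt);
}

/* Step 2: the RCU callback only bounces to a workqueue. */
static void example_deactivate_rcufn(struct rcu_head *head)
{
	struct example_cache *c = container_of(head, struct example_cache, rcu_head);

	queue_work(system_wq, &c->kill_work);
}

/* Step 1: the memory cgroup went offline; wait a grace period first. */
static void example_deactivate(struct example_cache *c)
{
	INIT_WORK(&c->kill_work, example_kill_workfn);
	INIT_WORK(&c->shutdown_work, example_shutdown_workfn);
	call_rcu(&c->rcu_head, example_deactivate_rcufn);
}

Doing the kill from workqueue context rather than directly in the RCU
callback follows the patch's structure, where kmemcg_workfn() also needs to
take slab_mutex.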
@@ -781,28 +813,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 	put_online_cpus();
 }
 
-void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
-{
-	struct kmem_cache *s, *s2;
-
-	get_online_cpus();
-	get_online_mems();
-
-	mutex_lock(&slab_mutex);
-	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
-				 memcg_params.kmem_caches_node) {
-		/*
-		 * The cgroup is about to be freed and therefore has no charges
-		 * left. Hence, all its caches must be empty by now.
-		 */
-		BUG_ON(shutdown_cache(s));
-	}
-	mutex_unlock(&slab_mutex);
-
-	put_online_mems();
-	put_online_cpus();
-}
-
 static int shutdown_memcg_caches(struct kmem_cache *s)
 {
 	struct memcg_cache_array *arr;