author	Roman Gushchin <guro@fb.com>	2019-07-11 23:56:27 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-07-12 14:05:44 -0400
commit	f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 (patch)
tree	254f501899a5e542a84043674dfe3df9dccf0cb4
parent	63b02ef7dc4ec239df45c018ac0adbd02ba30a0c (diff)
mm: memcg/slab: rework non-root kmem_cache lifecycle management
Currently each charged slab page holds a reference to the cgroup to which
it's charged. Kmem_caches are held by the memcg and are released all
together with the memory cgroup. It means that none of kmem_caches are
released unless at least one reference to the memcg exists, which is very
far from optimal.

Let's rework it in a way that allows releasing individual kmem_caches as
soon as the cgroup is offline, the kmem_cache is empty and there are no
pending allocations.

To make it possible, let's introduce a new percpu refcounter for non-root
kmem caches. The counter is initialized to the percpu mode, and is switched
to the atomic mode during kmem_cache deactivation. The counter is bumped
for every charged page and also for every running allocation. So the
kmem_cache can't be released unless all allocations complete.

To shutdown non-active empty kmem_caches, let's reuse the work queue,
previously used for the kmem_cache deactivation. Once the reference counter
reaches 0, let's schedule an asynchronous kmem_cache release.

* I used the following simple approach to test the performance
  (stolen from another patchset by T. Harding):

    time find / -name fname-no-exist
    echo 2 > /proc/sys/vm/drop_caches
    repeat 10 times

Results:

        orig                     patched

real    0m1.455s        real    0m1.355s
user    0m0.206s        user    0m0.219s
sys     0m0.855s        sys     0m0.807s

real    0m1.487s        real    0m1.699s
user    0m0.221s        user    0m0.256s
sys     0m0.806s        sys     0m0.948s

real    0m1.515s        real    0m1.505s
user    0m0.183s        user    0m0.215s
sys     0m0.876s        sys     0m0.858s

real    0m1.291s        real    0m1.380s
user    0m0.193s        user    0m0.198s
sys     0m0.843s        sys     0m0.786s

real    0m1.364s        real    0m1.374s
user    0m0.180s        user    0m0.182s
sys     0m0.868s        sys     0m0.806s

real    0m1.352s        real    0m1.312s
user    0m0.201s        user    0m0.212s
sys     0m0.820s        sys     0m0.761s

real    0m1.302s        real    0m1.349s
user    0m0.205s        user    0m0.203s
sys     0m0.803s        sys     0m0.792s

real    0m1.334s        real    0m1.301s
user    0m0.194s        user    0m0.201s
sys     0m0.806s        sys     0m0.779s

real    0m1.426s        real    0m1.434s
user    0m0.216s        user    0m0.181s
sys     0m0.824s        sys     0m0.864s

real    0m1.350s        real    0m1.295s
user    0m0.200s        user    0m0.190s
sys     0m0.842s        sys     0m0.811s

So it looks like the difference is not noticeable in this test.

[cai@lca.pw: fix an use-after-free in kmemcg_workfn()]
  Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
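For readers less familiar with the percpu_ref API, the scheme above boils
down to the self-contained sketch below. It is illustrative only and not
code from this patch: the example_cache structure and the example_* helpers
are invented, while percpu_ref_init(), percpu_ref_get_many()/
percpu_ref_put_many() and percpu_ref_tryget()/percpu_ref_put() are the real
kernel primitives the patch uses.

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>

/* Hypothetical stand-in for the memcg bits of a non-root kmem_cache. */
struct example_cache {
	struct percpu_ref refcnt;
};

/* Called once the last reference is dropped; the real patch schedules the
 * asynchronous kmem_cache shutdown from here (see mm/slab_common.c below). */
static void example_refcnt_release(struct percpu_ref *ref)
{
	struct example_cache *c = container_of(ref, struct example_cache, refcnt);

	(void)c;	/* shutdown work would be queued here */
}

/* Cache creation: the counter starts in percpu mode, so the hot-path
 * get/put operations below stay cheap. */
static int example_cache_init(struct example_cache *c)
{
	return percpu_ref_init(&c->refcnt, example_refcnt_release, 0, GFP_KERNEL);
}

/* Every charged slab page pins the cache with 1 << order references ... */
static void example_charge_page(struct example_cache *c, int order)
{
	percpu_ref_get_many(&c->refcnt, 1 << order);
}

/* ... which are dropped again when the page is uncharged. */
static void example_uncharge_page(struct example_cache *c, int order)
{
	percpu_ref_put_many(&c->refcnt, 1 << order);
}

/* A running allocation also pins the cache for its duration. */
static bool example_get_for_alloc(struct example_cache *c)
{
	return percpu_ref_tryget(&c->refcnt);
}

static void example_put_after_alloc(struct example_cache *c)
{
	percpu_ref_put(&c->refcnt);
}

The patch itself embeds the counter in struct memcg_cache_params (see the
include/linux/slab.h hunk below) and defers the actual shutdown work to
memcg_kmem_cache_wq.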
-rw-r--r--	include/linux/slab.h	3
-rw-r--r--	mm/memcontrol.c	50
-rw-r--r--	mm/slab.h	44
-rw-r--r--	mm/slab_common.c	78
4 files changed, 96 insertions, 79 deletions
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6008d884e621..bc189a43e680 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -16,6 +16,7 @@
 #include <linux/overflow.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/percpu-refcount.h>
 
 
 /*
@@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *);
 
 void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
 void memcg_deactivate_kmem_caches(struct mem_cgroup *);
-void memcg_destroy_kmem_caches(struct mem_cgroup *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -642,6 +642,7 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head children_node;
 			struct list_head kmem_caches_node;
+			struct percpu_ref refcnt;
 
 			void (*work_fn)(struct kmem_cache *);
 			union {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 25e35a8b8ba2..ce4ce5e7937b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2667,12 +2667,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
 	struct memcg_kmem_cache_create_work *cw;
 
+	if (!css_tryget_online(&memcg->css))
+		return;
+
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
 	if (!cw)
 		return;
 
-	css_get(&memcg->css);
-
 	cw->memcg = memcg;
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2707,6 +2708,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
+	struct memcg_cache_array *arr;
 	int kmemcg_id;
 
 	VM_BUG_ON(!is_root_cache(cachep));
@@ -2714,14 +2716,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_bypass())
 		return cachep;
 
-	memcg = get_mem_cgroup_from_current();
+	rcu_read_lock();
+
+	if (unlikely(current->active_memcg))
+		memcg = current->active_memcg;
+	else
+		memcg = mem_cgroup_from_task(current);
+
+	if (!memcg || memcg == root_mem_cgroup)
+		goto out_unlock;
+
 	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
 	if (kmemcg_id < 0)
-		goto out;
+		goto out_unlock;
+
+	arr = rcu_dereference(cachep->memcg_params.memcg_caches);
 
-	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
-	if (likely(memcg_cachep))
-		return memcg_cachep;
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match the data dependency
+	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+	 */
+	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2734,10 +2750,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * memcg_create_kmem_cache, this means no further allocation
 	 * could happen with the slab_mutex held. So it's better to
 	 * defer everything.
+	 *
+	 * If the memcg is dying or memcg_cache is about to be released,
+	 * don't bother creating new kmem_caches. Because memcg_cachep
+	 * is ZEROed as the first step of kmem offlining, we don't need
+	 * percpu_ref_tryget_live() here. css_tryget_online() check in
+	 * memcg_schedule_kmem_cache_create() will prevent us from
+	 * creation of a new kmem_cache.
 	 */
-	memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
-	css_put(&memcg->css);
+	if (unlikely(!memcg_cachep))
+		memcg_schedule_kmem_cache_create(memcg, cachep);
+	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+		cachep = memcg_cachep;
+out_unlock:
+	rcu_read_unlock();
 	return cachep;
 }
 
@@ -2748,7 +2774,7 @@ out:
 void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 	if (!is_root_cache(cachep))
-		css_put(&cachep->memcg_params.memcg->css);
+		percpu_ref_put(&cachep->memcg_params.refcnt);
 }
 
 /**
@@ -3295,7 +3321,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 	memcg_offline_kmem(memcg);
 
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
-		memcg_destroy_kmem_caches(memcg);
+		WARN_ON(!list_empty(&memcg->kmem_caches));
 		static_branch_dec(&memcg_kmem_enabled_key);
 		WARN_ON(page_counter_read(&memcg->kmem));
 	}
diff --git a/mm/slab.h b/mm/slab.h
index 46623a576a3c..5d2b8511e6fb 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -248,31 +248,6 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-/*
- * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away by either
- * taking a css reference to the owner cgroup, or holding the slab_mutex.
- */
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-	struct kmem_cache *cachep;
-	struct memcg_cache_array *arr;
-
-	rcu_read_lock();
-	arr = rcu_dereference(s->memcg_params.memcg_caches);
-
-	/*
-	 * Make sure we will access the up-to-date value. The code updating
-	 * memcg_caches issues a write barrier to match this (see
-	 * memcg_create_kmem_cache()).
-	 */
-	cachep = READ_ONCE(arr->entries[idx]);
-	rcu_read_unlock();
-
-	return cachep;
-}
-
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
@@ -284,14 +259,25 @@ static __always_inline int memcg_charge_slab(struct page *page,
 						    gfp_t gfp, int order,
 						    struct kmem_cache *s)
 {
+	int ret;
+
 	if (is_root_cache(s))
 		return 0;
-	return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+
+	ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+	if (ret)
+		return ret;
+
+	percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+
+	return 0;
 }
 
 static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 						struct kmem_cache *s)
 {
+	if (!is_root_cache(s))
+		percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
 	memcg_kmem_uncharge(page, order);
 }
 
@@ -323,12 +309,6 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-	return NULL;
-}
-
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	return s;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a15557776d7d..ee3971f7fabc 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -132,6 +132,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 LIST_HEAD(slab_root_caches);
 static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
 
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
+
 void slab_init_memcg_params(struct kmem_cache *s)
 {
 	s->memcg_params.root_cache = NULL;
@@ -146,6 +148,12 @@ static int init_memcg_params(struct kmem_cache *s,
 	struct memcg_cache_array *arr;
 
 	if (root_cache) {
+		int ret = percpu_ref_init(&s->memcg_params.refcnt,
+					  kmemcg_cache_shutdown,
+					  0, GFP_KERNEL);
+		if (ret)
+			return ret;
+
 		s->memcg_params.root_cache = root_cache;
 		INIT_LIST_HEAD(&s->memcg_params.children_node);
 		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
@@ -171,6 +179,8 @@ static void destroy_memcg_params(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
 		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+	else
+		percpu_ref_exit(&s->memcg_params.refcnt);
 }
 
 static void free_memcg_params(struct rcu_head *rcu)
@@ -226,6 +236,7 @@ void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
 	if (is_root_cache(s)) {
 		list_add(&s->root_caches_node, &slab_root_caches);
 	} else {
+		css_get(&memcg->css);
 		s->memcg_params.memcg = memcg;
 		list_add(&s->memcg_params.children_node,
 			 &s->memcg_params.root_cache->memcg_params.children);
@@ -241,6 +252,7 @@ static void memcg_unlink_cache(struct kmem_cache *s)
 	} else {
 		list_del(&s->memcg_params.children_node);
 		list_del(&s->memcg_params.kmem_caches_node);
+		css_put(&s->memcg_params.memcg->css);
 	}
 }
 #else
@@ -678,7 +690,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 	}
 
 	/*
-	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * Since readers won't lock (see memcg_kmem_get_cache()), we need a
 	 * barrier here to ensure nobody will see the kmem_cache partially
 	 * initialized.
 	 */
@@ -701,16 +713,11 @@ static void kmemcg_workfn(struct work_struct *work)
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
-
 	s->memcg_params.work_fn(s);
-
 	mutex_unlock(&slab_mutex);
 
 	put_online_mems();
 	put_online_cpus();
-
-	/* done, put the ref from kmemcg_cache_deactivate() */
-	css_put(&s->memcg_params.memcg->css);
 }
 
 static void kmemcg_rcufn(struct rcu_head *head)
@@ -727,10 +734,38 @@ static void kmemcg_rcufn(struct rcu_head *head)
 	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
 }
 
+static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
+{
+	WARN_ON(shutdown_cache(s));
+}
+
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
+{
+	struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
+					    memcg_params.refcnt);
+	unsigned long flags;
+
+	spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
+	if (s->memcg_params.root_cache->memcg_params.dying)
+		goto unlock;
+
+	s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
+	INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
+	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
+
+unlock:
+	spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
+}
+
+static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
+{
+	__kmemcg_cache_deactivate_after_rcu(s);
+	percpu_ref_kill(&s->memcg_params.refcnt);
+}
+
 static void kmemcg_cache_deactivate(struct kmem_cache *s)
 {
-	if (WARN_ON_ONCE(is_root_cache(s)) ||
-	    WARN_ON_ONCE(s->memcg_params.work_fn))
+	if (WARN_ON_ONCE(is_root_cache(s)))
 		return;
 
 	__kmemcg_cache_deactivate(s);
@@ -744,10 +779,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s)
 	if (s->memcg_params.root_cache->memcg_params.dying)
 		goto unlock;
 
-	/* pin memcg so that @s doesn't get destroyed in the middle */
-	css_get(&s->memcg_params.memcg->css);
-
-	s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu;
+	s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
 	call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
 unlock:
 	spin_unlock_irq(&memcg_kmem_wq_lock);
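Taken together, the hunks above implement the deactivation chain:
kmemcg_cache_deactivate() -> call_rcu() -> kmemcg_rcufn() queues
kmemcg_workfn() -> kmemcg_cache_deactivate_after_rcu() kills the percpu ref
-> once the last reference is dropped, kmemcg_cache_shutdown() schedules
kmemcg_cache_shutdown_fn(). Below is a minimal, self-contained sketch of
that ordering. It is illustrative only: the example_* names are invented,
and it uses two separate work items where the patch multiplexes a single
work_struct through ->work_fn.

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

/* refcnt is assumed to have been set up at cache creation with
 * percpu_ref_init(&c->refcnt, example_refcnt_release, 0, GFP_KERNEL). */
struct example_cache {
	struct percpu_ref refcnt;
	struct rcu_head rcu_head;
	struct work_struct kill_work;
	struct work_struct shutdown_work;
};

/* Step 5: actual cache destruction, in process context. */
static void example_shutdown_workfn(struct work_struct *work)
{
	/* shutdown_cache()-style cleanup would go here */
}

/* Step 4: the last page/allocation reference is gone.  This may fire in
 * atomic context, so the real shutdown is pushed to a workqueue, exactly
 * as kmemcg_cache_shutdown() does above. */
static void example_refcnt_release(struct percpu_ref *ref)
{
	struct example_cache *c = container_of(ref, struct example_cache, refcnt);

	queue_work(system_wq, &c->shutdown_work);
}

/* Step 3: in process context, switch the counter to atomic mode and drop
 * the initial reference; charged pages and in-flight allocations keep the
 * cache alive until they are done. */
static void example_kill_workfn(struct work_struct *work)
{
	struct example_cache *c = container_of(work, struct example_cache, kill_work);

	percpu_ref_kill(&c->refcnt);
}

/* Step 2: the RCU callback only bounces to a workqueue. */
static void example_deactivate_rcufn(struct rcu_head *head)
{
	struct example_cache *c = container_of(head, struct example_cache, rcu_head);

	queue_work(system_wq, &c->kill_work);
}

/* Step 1: the memory cgroup went offline; wait a grace period first. */
static void example_deactivate(struct example_cache *c)
{
	INIT_WORK(&c->kill_work, example_kill_workfn);
	INIT_WORK(&c->shutdown_work, example_shutdown_workfn);
	call_rcu(&c->rcu_head, example_deactivate_rcufn);
}

Doing the kill from workqueue context rather than directly in the RCU
callback follows the patch's structure, where kmemcg_workfn() also needs to
take slab_mutex.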
@@ -781,28 +813,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 	put_online_cpus();
 }
 
-void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
-{
-	struct kmem_cache *s, *s2;
-
-	get_online_cpus();
-	get_online_mems();
-
-	mutex_lock(&slab_mutex);
-	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
-				 memcg_params.kmem_caches_node) {
-		/*
-		 * The cgroup is about to be freed and therefore has no charges
-		 * left. Hence, all its caches must be empty by now.
-		 */
-		BUG_ON(shutdown_cache(s));
-	}
-	mutex_unlock(&slab_mutex);
-
-	put_online_mems();
-	put_online_cpus();
-}
-
 static int shutdown_memcg_caches(struct kmem_cache *s)
 {
 	struct memcg_cache_array *arr;