author     Tejun Heo <tj@kernel.org>                       2017-02-22 18:41:30 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-02-22 19:41:27 -0500
commit     01fb58bcba63f8fba37581c24c99e9a515dd0335 (patch)
tree       475ebac1b656204783280c52acf315dfd3caea03 /mm/slab_common.c
parent     c9fc586403e7c85eee06b2d5dea14ce71c00fcd8 (diff)
slab: remove synchronous synchronize_sched() from memcg cache deactivation path
With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near-empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is not
under memory pressure. When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code. This is one of the patches to address the issue.
slub uses synchronize_sched() to deactivate a memcg cache.
synchronize_sched() is an expensive and slow operation and doesn't scale
when a huge number of caches are destroyed back-to-back. While there
used to be a simple batching mechanism, the batching was too restricted
to be helpful.
This patch implements slab_deactivate_memcg_cache_rcu_sched(), which slub
can use to schedule a sched RCU callback instead of performing
synchronize_sched() synchronously while holding cgroup_mutex. While this
adds online cpus, mems and slab_mutex operations, taking these locks
back-to-back from the same kworker, which is what happens when there are
many caches to deactivate, is not expensive at all, and this gets rid of
the scalability problem completely.
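For context, the slub side of this series is expected to call the new helper
from its __kmemcg_cache_deactivate() hook. The sketch below only illustrates
that usage; it is not part of this diff, and the exact function bodies and the
kmemcg_cache_deact_after_rcu() callback name are assumptions made for
illustration.

/*
 * Illustrative sketch only -- the real slub change lives in a separate
 * patch of this series; the callback name and bodies here are assumed.
 */
static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
{
	/* invoked with online cpus, mems and slab_mutex held */
	__kmem_cache_shrink(s);
}

void __kmemcg_cache_deactivate(struct kmem_cache *s)
{
	/* stop filling per-cpu partial lists for this dying cache */
	s->cpu_partial = 0;
	s->min_partial = 0;

	/*
	 * Instead of calling synchronize_sched() here, defer the rest of
	 * the deactivation: the helper pins the memcg, waits out a sched
	 * RCU grace period asynchronously and then invokes the callback
	 * from a kworker with online cpus, mems and slab_mutex held.
	 */
	slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}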
Link: http://lkml.kernel.org/r/20170117235411.9408-9-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/slab_common.c')
-rw-r--r--   mm/slab_common.c   60
1 file changed, 60 insertions, 0 deletions
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 59e41bb81575..c549296c7981 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -627,6 +627,66 @@ out_unlock:
 	put_online_cpus();
 }
 
+static void kmemcg_deactivate_workfn(struct work_struct *work)
+{
+	struct kmem_cache *s = container_of(work, struct kmem_cache,
+					    memcg_params.deact_work);
+
+	get_online_cpus();
+	get_online_mems();
+
+	mutex_lock(&slab_mutex);
+
+	s->memcg_params.deact_fn(s);
+
+	mutex_unlock(&slab_mutex);
+
+	put_online_mems();
+	put_online_cpus();
+
+	/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
+	css_put(&s->memcg_params.memcg->css);
+}
+
+static void kmemcg_deactivate_rcufn(struct rcu_head *head)
+{
+	struct kmem_cache *s = container_of(head, struct kmem_cache,
+					    memcg_params.deact_rcu_head);
+
+	/*
+	 * We need to grab blocking locks.  Bounce to ->deact_work.  The
+	 * work item shares the space with the RCU head and can't be
+	 * initialized earlier.
+	 */
+	INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
+	schedule_work(&s->memcg_params.deact_work);
+}
+
+/**
+ * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
+ *					   sched RCU grace period
+ * @s: target kmem_cache
+ * @deact_fn: deactivation function to call
+ *
+ * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
+ * held after a sched RCU grace period.  The slab is guaranteed to stay
+ * alive until @deact_fn is finished.  This is to be used from
+ * __kmemcg_cache_deactivate().
+ */
+void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
+					   void (*deact_fn)(struct kmem_cache *))
+{
+	if (WARN_ON_ONCE(is_root_cache(s)) ||
+	    WARN_ON_ONCE(s->memcg_params.deact_fn))
+		return;
+
+	/* pin memcg so that @s doesn't get destroyed in the middle */
+	css_get(&s->memcg_params.memcg->css);
+
+	s->memcg_params.deact_fn = deact_fn;
+	call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
+}
+
 void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 {
 	int idx;