author     Tejun Heo <tj@kernel.org>                       2017-02-22 18:41:30 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-02-22 19:41:27 -0500
commit     01fb58bcba63f8fba37581c24c99e9a515dd0335 (patch)
tree       475ebac1b656204783280c52acf315dfd3caea03 /mm/slab_common.c
parent     c9fc586403e7c85eee06b2d5dea14ce71c00fcd8 (diff)
slab: remove synchronous synchronize_sched() from memcg cache deactivation path
With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near-empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is not
under memory pressure. When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code. This is one of the patches to address the issue.
slub uses synchronize_sched() to deactivate a memcg cache.
synchronize_sched() is an expensive and slow operation and doesn't scale
when a huge number of caches are destroyed back-to-back. While there
used to be a simple batching mechanism, the batching was too restricted
to be helpful.
This patch implements slab_deactivate_memcg_cache_rcu_sched(), which slub
can use to schedule a sched RCU callback instead of performing
synchronize_sched() synchronously while holding cgroup_mutex. While this
adds online cpus, mems and slab_mutex operations, taking these locks
back-to-back from the same kworker, which is what happens when there are
many caches to deactivate, is not expensive at all, and this gets rid of
the scalability problem completely.
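For context, the slub side of this series is expected to call the new helper
from its __kmemcg_cache_deactivate() hook. The sketch below only illustrates
that usage; it is not part of this diff, and the exact function bodies and the
kmemcg_cache_deact_after_rcu() callback name are assumptions made for
illustration.

/*
 * Illustrative sketch only -- the real slub change lives in a separate
 * patch of this series; the callback name and bodies here are assumed.
 */
static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
{
	/* invoked with online cpus, mems and slab_mutex held */
	__kmem_cache_shrink(s);
}

void __kmemcg_cache_deactivate(struct kmem_cache *s)
{
	/* stop filling per-cpu partial lists for this dying cache */
	s->cpu_partial = 0;
	s->min_partial = 0;

	/*
	 * Instead of calling synchronize_sched() here, defer the rest of
	 * the deactivation: the helper pins the memcg, waits out a sched
	 * RCU grace period asynchronously and then invokes the callback
	 * from a kworker with online cpus, mems and slab_mutex held.
	 */
	slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}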
Link: http://lkml.kernel.org/r/20170117235411.9408-9-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/slab_common.c')
-rw-r--r--   mm/slab_common.c   60
1 file changed, 60 insertions, 0 deletions
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 59e41bb81575..c549296c7981 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -627,6 +627,66 @@ out_unlock:
 	put_online_cpus();
 }
 
+static void kmemcg_deactivate_workfn(struct work_struct *work)
+{
+	struct kmem_cache *s = container_of(work, struct kmem_cache,
+					    memcg_params.deact_work);
+
+	get_online_cpus();
+	get_online_mems();
+
+	mutex_lock(&slab_mutex);
+
+	s->memcg_params.deact_fn(s);
+
+	mutex_unlock(&slab_mutex);
+
+	put_online_mems();
+	put_online_cpus();
+
+	/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
+	css_put(&s->memcg_params.memcg->css);
+}
+
+static void kmemcg_deactivate_rcufn(struct rcu_head *head)
+{
+	struct kmem_cache *s = container_of(head, struct kmem_cache,
+					    memcg_params.deact_rcu_head);
+
+	/*
+	 * We need to grab blocking locks.  Bounce to ->deact_work.  The
+	 * work item shares the space with the RCU head and can't be
+	 * initialized earlier.
+	 */
+	INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
+	schedule_work(&s->memcg_params.deact_work);
+}
+
+/**
+ * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
+ *					   sched RCU grace period
+ * @s: target kmem_cache
+ * @deact_fn: deactivation function to call
+ *
+ * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
+ * held after a sched RCU grace period.  The slab is guaranteed to stay
+ * alive until @deact_fn is finished.  This is to be used from
+ * __kmemcg_cache_deactivate().
+ */
+void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
+					   void (*deact_fn)(struct kmem_cache *))
+{
+	if (WARN_ON_ONCE(is_root_cache(s)) ||
+	    WARN_ON_ONCE(s->memcg_params.deact_fn))
+		return;
+
+	/* pin memcg so that @s doesn't get destroyed in the middle */
+	css_get(&s->memcg_params.memcg->css);
+
+	s->memcg_params.deact_fn = deact_fn;
+	call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
+}
+
 void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 {
 	int idx;