author     Tejun Heo <tj@kernel.org>                        2013-11-22 17:14:39 -0500
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2013-12-04 13:57:20 -0500
commit     a6647e9e4bdd231a008a12302d008a6cd81596bf
tree       b7285e73ff0ebeeb377a39b3a3038c100d5030f5 /kernel
parent     52915b499962c637289f1ac04cc0858e690bbfe4
cgroup: use a dedicated workqueue for cgroup destruction
commit e5fca243abae1445afbfceebda5f08462ef869d3 upstream.

Since be44562613851 ("cgroup: remove synchronize_rcu() from
cgroup_diput()"), the cgroup destruction path makes use of workqueue.  css
freeing is performed from a work item from that point on, and a later
commit, ea15f8ccdb430 ("cgroup: split cgroup destruction into two steps"),
moves css offlining to workqueue too.

As cgroup destruction isn't depended upon for memory reclaim, the
destruction work items were put on system_wq; unfortunately, some
controllers may block in the destruction path for a considerable duration
while holding cgroup_mutex.  As a large part of the destruction path is
synchronized through cgroup_mutex, this has the potential to fill up
system_wq's max_active of 256 when combined with a high rate of cgroup
removals.

Also, it turns out that memcg's css destruction path ends up queueing and
waiting for work items on system_wq through work_on_cpu().  If such an
operation happens while system_wq is fully occupied by cgroup destruction
work items, work_on_cpu() can't make forward progress because system_wq is
full, and the other destruction work items on system_wq can't make forward
progress because the work item waiting for work_on_cpu() is holding
cgroup_mutex, leading to deadlock.

This can be fixed by queueing destruction work items on a separate
workqueue.  This patch creates a dedicated workqueue, cgroup_destroy_wq,
for this purpose.  As these work items shouldn't have inter-dependencies
and are mostly serialized by cgroup_mutex anyway, a high concurrency level
buys nothing, and the workqueue's @max_active is set to 1 so that
destruction work items are executed one at a time on each CPU.

Hugh Dickins: Because cgroup_init() is run before init_workqueues(),
cgroup_destroy_wq can't be allocated from cgroup_init().  Do it from a
separate core_initcall().  In the future, we probably want to reorder so
that workqueue init happens before cgroup_init().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Hugh Dickins <hughd@google.com>
Reported-by: Shawn Bohrer <shawn.bohrer@gmail.com>
Link: http://lkml.kernel.org/r/20131111220626.GA7509@sbohrermbp13-local.rgmadvisors.com
Link: http://lkml.kernel.org/g/alpine.LNX.2.00.1310301606080.2333@eggly.anvils
Cc: stable@vger.kernel.org # v3.9+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
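[Editor's note: the following is a minimal, self-contained sketch of the
workqueue pattern the patch adopts, not the kernel's actual cgroup code.
The names destroy_sketch_wq, destroy_fn, and wq_sketch_* are illustrative
only; the real declarations appear in the diff below.]

/*
 * Sketch: allocate a dedicated workqueue with @max_active = 1 and queue
 * long-running teardown work on it instead of on system_wq, so a burst
 * of such work cannot saturate system_wq's shared max_active slots.
 */
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *destroy_sketch_wq;
static struct work_struct destroy_sketch_work;

static void destroy_fn(struct work_struct *work)
{
	/* A potentially long teardown would run here, off system_wq. */
	pr_info("destruction work ran on its own workqueue\n");
}

static int __init wq_sketch_init(void)
{
	/* Third argument is @max_active: 1 => one item at a time per CPU. */
	destroy_sketch_wq = alloc_workqueue("destroy_sketch", 0, 1);
	if (!destroy_sketch_wq)
		return -ENOMEM;

	INIT_WORK(&destroy_sketch_work, destroy_fn);
	/* queue_work() on the dedicated queue, not schedule_work(). */
	queue_work(destroy_sketch_wq, &destroy_sketch_work);
	return 0;
}

static void __exit wq_sketch_exit(void)
{
	destroy_workqueue(destroy_sketch_wq);	/* flushes pending work */
}

module_init(wq_sketch_init);
module_exit(wq_sketch_exit);
MODULE_LICENSE("GPL");

With @max_active set to 1, items on the dedicated queue execute one at a
time on each CPU; since the destruction path is mostly serialized by
cgroup_mutex anyway, the lost concurrency costs nothing, while system_wq
and work_on_cpu() can no longer be starved by destruction work.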
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c | 28
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6b26faf1740..d0def7fc2848 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -92,6 +92,14 @@ static DEFINE_MUTEX(cgroup_mutex);
 static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
@@ -873,7 +881,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	queue_work(cgroup_destroy_wq, &cgrp->free_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -4686,6 +4694,22 @@ out:
 	return err;
 }
 
+static int __init cgroup_wq_init(void)
+{
+	/*
+	 * There isn't much point in executing destruction path in
+	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
+	 * Use 1 for @max_active.
+	 *
+	 * We would prefer to do this in cgroup_init() above, but that
+	 * is called before init_workqueues(): so leave this until after.
+	 */
+	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+	BUG_ON(!cgroup_destroy_wq);
+	return 0;
+}
+core_initcall(cgroup_wq_init);
+
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -4996,7 +5020,7 @@ void __css_put(struct cgroup_subsys_state *css)
 
 	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
 	if (v == 0)
-		schedule_work(&css->dput_work);
+		queue_work(cgroup_destroy_wq, &css->dput_work);
 }
 EXPORT_SYMBOL_GPL(__css_put);
 