author		Tejun Heo <tj@kernel.org>	2013-11-22 17:14:39 -0500
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2013-12-04 13:57:20 -0500
commit		a6647e9e4bdd231a008a12302d008a6cd81596bf (patch)
tree		b7285e73ff0ebeeb377a39b3a3038c100d5030f5 /kernel
parent		52915b499962c637289f1ac04cc0858e690bbfe4 (diff)
cgroup: use a dedicated workqueue for cgroup destruction
commit e5fca243abae1445afbfceebda5f08462ef869d3 upstream.
Since be44562613851 ("cgroup: remove synchronize_rcu() from
cgroup_diput()"), the cgroup destruction path makes use of a workqueue:
css freeing has been performed from a work item from that point on, and
a later commit, ea15f8ccdb430 ("cgroup: split cgroup destruction into
two steps"), moved css offlining to a workqueue too.
As cgroup destruction isn't depended upon for memory reclaim, the
destruction work items were put on system_wq; unfortunately, some
controllers may block in the destruction path for a considerable
duration while holding cgroup_mutex. As a large part of the destruction
path is serialized through cgroup_mutex, this, combined with a high
rate of cgroup removals, can fill up system_wq's max_active of 256.
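
For illustration (not part of the patch), a minimal out-of-tree sketch
of this saturation pattern; the demo_* names are hypothetical:

	#include <linux/mutex.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>

	static DEFINE_MUTEX(demo_mutex);	/* stands in for cgroup_mutex */

	/* Each destruction-style work item serializes on the shared mutex. */
	static void demo_destroy_fn(struct work_struct *work)
	{
		mutex_lock(&demo_mutex);
		/* ... long, possibly blocking teardown ... */
		mutex_unlock(&demo_mutex);
		kfree(work);
	}

	/*
	 * A burst of removals pins system_wq worker slots: each in-flight
	 * item occupies one of the 256 per-CPU max_active slots while it
	 * sleeps on demo_mutex.
	 */
	static void demo_burst(int n)
	{
		while (n--) {
			struct work_struct *w = kmalloc(sizeof(*w), GFP_KERNEL);

			if (!w)
				break;
			INIT_WORK(w, demo_destroy_fn);
			schedule_work(w);	/* lands on system_wq */
		}
	}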
Also, it turns out that memcg's css destruction path ends up queueing
and waiting for work items on system_wq through work_on_cpu(). If such
an operation happens while system_wq is fully occupied by cgroup
destruction work items, work_on_cpu() can't make forward progress
because system_wq is full, and the other destruction work items on
system_wq can't make forward progress because the work item waiting
for work_on_cpu() is holding cgroup_mutex, leading to deadlock.
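
A minimal sketch of the resulting cycle, with hypothetical demo_* names
standing in for the memcg path (at the time, work_on_cpu() queued its
helper work item on system_wq and flushed it):

	#include <linux/mutex.h>
	#include <linux/workqueue.h>

	static DEFINE_MUTEX(demo_mutex);	/* stands in for cgroup_mutex */

	static long demo_on_cpu_fn(void *unused)
	{
		return 0;	/* never starts while system_wq is saturated */
	}

	/* Runs as one of many destruction items already on system_wq. */
	static void demo_memcg_destroy_fn(struct work_struct *work)
	{
		mutex_lock(&demo_mutex);
		/*
		 * work_on_cpu() queues a helper work item on system_wq and
		 * waits for it.  With every system_wq slot held by
		 * destruction items blocked on demo_mutex, the helper never
		 * runs and demo_mutex is never released: deadlock.
		 */
		work_on_cpu(0, demo_on_cpu_fn, NULL);
		mutex_unlock(&demo_mutex);
	}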
This can be fixed by queueing destruction work items on a separate
workqueue. This patch creates a dedicated workqueue,
cgroup_destroy_wq, for that purpose. As these work items shouldn't
have inter-dependencies and are mostly serialized by cgroup_mutex
anyway, a high concurrency level doesn't buy anything, so the
workqueue's @max_active is set to 1 and destruction work items are
executed one by one on each CPU.
Hugh Dickins: because cgroup_init() is run before init_workqueues(),
cgroup_destroy_wq can't be allocated from cgroup_init(). Do it from a
separate core_initcall() instead. In the future, we probably want to
reorder things so that workqueue init happens before cgroup_init().
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Hugh Dickins <hughd@google.com>
Reported-by: Shawn Bohrer <shawn.bohrer@gmail.com>
Link: http://lkml.kernel.org/r/20131111220626.GA7509@sbohrermbp13-local.rgmadvisors.com
Link: http://lkml.kernel.org/g/alpine.LNX.2.00.1310301606080.2333@eggly.anvils
Cc: stable@vger.kernel.org # v3.9+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/cgroup.c	28
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6b26faf1740..d0def7fc2848 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -92,6 +92,14 @@ static DEFINE_MUTEX(cgroup_mutex);
 static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
@@ -873,7 +881,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	queue_work(cgroup_destroy_wq, &cgrp->free_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -4686,6 +4694,22 @@ out:
 	return err;
 }
 
+static int __init cgroup_wq_init(void)
+{
+	/*
+	 * There isn't much point in executing destruction path in
+	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
+	 * Use 1 for @max_active.
+	 *
+	 * We would prefer to do this in cgroup_init() above, but that
+	 * is called before init_workqueues(): so leave this until after.
+	 */
+	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+	BUG_ON(!cgroup_destroy_wq);
+	return 0;
+}
+core_initcall(cgroup_wq_init);
+
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -4996,7 +5020,7 @@ void __css_put(struct cgroup_subsys_state *css)
 
 	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
 	if (v == 0)
-		schedule_work(&css->dput_work);
+		queue_work(cgroup_destroy_wq, &css->dput_work);
 }
 EXPORT_SYMBOL_GPL(__css_put);
 
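
The dedicated-queue pattern the diff applies can be exercised in a toy
module; a minimal sketch with hypothetical demo_* names
(alloc_workqueue() and queue_work() are the calls the patch itself uses):

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *demo_wq;

	static void demo_fn(struct work_struct *work)
	{
		/* teardown work would go here */
	}
	static DECLARE_WORK(demo_work, demo_fn);

	static int __init demo_init(void)
	{
		/*
		 * @max_active == 1: items execute one by one on each CPU,
		 * mirroring cgroup_destroy_wq above.
		 */
		demo_wq = alloc_workqueue("demo_destroy", 0, 1);
		if (!demo_wq)
			return -ENOMEM;
		queue_work(demo_wq, &demo_work);	/* not schedule_work() */
		return 0;
	}

	static void __exit demo_exit(void)
	{
		destroy_workqueue(demo_wq);	/* drains pending work */
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");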