diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 85 |
1 files changed, 63 insertions, 22 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4c62513fe19f..bc1dcabe9217 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex); | |||
90 | static DEFINE_MUTEX(cgroup_root_mutex); | 90 | static DEFINE_MUTEX(cgroup_root_mutex); |
91 | 91 | ||
92 | /* | 92 | /* |
93 | * cgroup destruction makes heavy use of work items and there can be a lot | ||
94 | * of concurrent destructions. Use a separate workqueue so that cgroup | ||
95 | * destruction work items don't end up filling up max_active of system_wq | ||
96 | * which may lead to deadlock. | ||
97 | */ | ||
98 | static struct workqueue_struct *cgroup_destroy_wq; | ||
99 | |||
100 | /* | ||
93 | * Generate an array of cgroup subsystem pointers. At boot time, this is | 101 | * Generate an array of cgroup subsystem pointers. At boot time, this is |
94 | * populated with the built in subsystems, and modular subsystems are | 102 | * populated with the built in subsystems, and modular subsystems are |
95 | * registered after that. The mutable section of this array is protected by | 103 | * registered after that. The mutable section of this array is protected by |
@@ -191,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); | |||
191 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 199 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
192 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 200 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
193 | bool is_add); | 201 | bool is_add); |
202 | static int cgroup_file_release(struct inode *inode, struct file *file); | ||
194 | 203 | ||
195 | /** | 204 | /** |
196 | * cgroup_css - obtain a cgroup's css for the specified subsystem | 205 | * cgroup_css - obtain a cgroup's css for the specified subsystem |
@@ -871,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head) | |||
871 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); | 880 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); |
872 | 881 | ||
873 | INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); | 882 | INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); |
874 | schedule_work(&cgrp->destroy_work); | 883 | queue_work(cgroup_destroy_wq, &cgrp->destroy_work); |
875 | } | 884 | } |
876 | 885 | ||
877 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 886 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
@@ -881,6 +890,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
881 | struct cgroup *cgrp = dentry->d_fsdata; | 890 | struct cgroup *cgrp = dentry->d_fsdata; |
882 | 891 | ||
883 | BUG_ON(!(cgroup_is_dead(cgrp))); | 892 | BUG_ON(!(cgroup_is_dead(cgrp))); |
893 | |||
894 | /* | ||
895 | * XXX: cgrp->id is only used to look up css's. As cgroup | ||
896 | * and css's lifetimes will be decoupled, it should be made | ||
897 | * per-subsystem and moved to css->id so that lookups are | ||
898 | * successful until the target css is released. | ||
899 | */ | ||
900 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
901 | cgrp->id = -1; | ||
902 | |||
884 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); | 903 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); |
885 | } else { | 904 | } else { |
886 | struct cfent *cfe = __d_cfe(dentry); | 905 | struct cfent *cfe = __d_cfe(dentry); |
@@ -2421,7 +2440,7 @@ static const struct file_operations cgroup_seqfile_operations = { | |||
2421 | .read = seq_read, | 2440 | .read = seq_read, |
2422 | .write = cgroup_file_write, | 2441 | .write = cgroup_file_write, |
2423 | .llseek = seq_lseek, | 2442 | .llseek = seq_lseek, |
2424 | .release = single_release, | 2443 | .release = cgroup_file_release, |
2425 | }; | 2444 | }; |
2426 | 2445 | ||
2427 | static int cgroup_file_open(struct inode *inode, struct file *file) | 2446 | static int cgroup_file_open(struct inode *inode, struct file *file) |
@@ -2482,6 +2501,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file) | |||
2482 | ret = cft->release(inode, file); | 2501 | ret = cft->release(inode, file); |
2483 | if (css->ss) | 2502 | if (css->ss) |
2484 | css_put(css); | 2503 | css_put(css); |
2504 | if (file->f_op == &cgroup_seqfile_operations) | ||
2505 | single_release(inode, file); | ||
2485 | return ret; | 2506 | return ret; |
2486 | } | 2507 | } |
2487 | 2508 | ||
@@ -4249,7 +4270,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) | |||
4249 | * css_put(). dput() requires process context which we don't have. | 4270 | * css_put(). dput() requires process context which we don't have. |
4250 | */ | 4271 | */ |
4251 | INIT_WORK(&css->destroy_work, css_free_work_fn); | 4272 | INIT_WORK(&css->destroy_work, css_free_work_fn); |
4252 | schedule_work(&css->destroy_work); | 4273 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
4253 | } | 4274 | } |
4254 | 4275 | ||
4255 | static void css_release(struct percpu_ref *ref) | 4276 | static void css_release(struct percpu_ref *ref) |
@@ -4257,6 +4278,7 @@ static void css_release(struct percpu_ref *ref) | |||
4257 | struct cgroup_subsys_state *css = | 4278 | struct cgroup_subsys_state *css = |
4258 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4279 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4259 | 4280 | ||
4281 | rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); | ||
4260 | call_rcu(&css->rcu_head, css_free_rcu_fn); | 4282 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
4261 | } | 4283 | } |
4262 | 4284 | ||
@@ -4415,14 +4437,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4415 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4437 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4416 | root->number_of_cgroups++; | 4438 | root->number_of_cgroups++; |
4417 | 4439 | ||
4418 | /* each css holds a ref to the cgroup's dentry and the parent css */ | ||
4419 | for_each_root_subsys(root, ss) { | ||
4420 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4421 | |||
4422 | dget(dentry); | ||
4423 | css_get(css->parent); | ||
4424 | } | ||
4425 | |||
4426 | /* hold a ref to the parent's dentry */ | 4440 | /* hold a ref to the parent's dentry */ |
4427 | dget(parent->dentry); | 4441 | dget(parent->dentry); |
4428 | 4442 | ||
@@ -4434,6 +4448,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4434 | if (err) | 4448 | if (err) |
4435 | goto err_destroy; | 4449 | goto err_destroy; |
4436 | 4450 | ||
4451 | /* each css holds a ref to the cgroup's dentry and parent css */ | ||
4452 | dget(dentry); | ||
4453 | css_get(css->parent); | ||
4454 | |||
4455 | /* mark it consumed for error path */ | ||
4456 | css_ar[ss->subsys_id] = NULL; | ||
4457 | |||
4437 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | 4458 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
4438 | parent->parent) { | 4459 | parent->parent) { |
4439 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | 4460 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", |
@@ -4480,6 +4501,14 @@ err_free_cgrp: | |||
4480 | return err; | 4501 | return err; |
4481 | 4502 | ||
4482 | err_destroy: | 4503 | err_destroy: |
4504 | for_each_root_subsys(root, ss) { | ||
4505 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4506 | |||
4507 | if (css) { | ||
4508 | percpu_ref_cancel_init(&css->refcnt); | ||
4509 | ss->css_free(css); | ||
4510 | } | ||
4511 | } | ||
4483 | cgroup_destroy_locked(cgrp); | 4512 | cgroup_destroy_locked(cgrp); |
4484 | mutex_unlock(&cgroup_mutex); | 4513 | mutex_unlock(&cgroup_mutex); |
4485 | mutex_unlock(&dentry->d_inode->i_mutex); | 4514 | mutex_unlock(&dentry->d_inode->i_mutex); |
@@ -4539,7 +4568,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) | |||
4539 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4568 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4540 | 4569 | ||
4541 | INIT_WORK(&css->destroy_work, css_killed_work_fn); | 4570 | INIT_WORK(&css->destroy_work, css_killed_work_fn); |
4542 | schedule_work(&css->destroy_work); | 4571 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
4543 | } | 4572 | } |
4544 | 4573 | ||
4545 | /** | 4574 | /** |
@@ -4641,8 +4670,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4641 | * will be invoked to perform the rest of destruction once the | 4670 | * will be invoked to perform the rest of destruction once the |
4642 | * percpu refs of all css's are confirmed to be killed. | 4671 | * percpu refs of all css's are confirmed to be killed. |
4643 | */ | 4672 | */ |
4644 | for_each_root_subsys(cgrp->root, ss) | 4673 | for_each_root_subsys(cgrp->root, ss) { |
4645 | kill_css(cgroup_css(cgrp, ss)); | 4674 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); |
4675 | |||
4676 | if (css) | ||
4677 | kill_css(css); | ||
4678 | } | ||
4646 | 4679 | ||
4647 | /* | 4680 | /* |
4648 | * Mark @cgrp dead. This prevents further task migration and child | 4681 | * Mark @cgrp dead. This prevents further task migration and child |
@@ -4711,14 +4744,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) | |||
4711 | /* delete this cgroup from parent->children */ | 4744 | /* delete this cgroup from parent->children */ |
4712 | list_del_rcu(&cgrp->sibling); | 4745 | list_del_rcu(&cgrp->sibling); |
4713 | 4746 | ||
4714 | /* | ||
4715 | * We should remove the cgroup object from idr before its grace | ||
4716 | * period starts, so we won't be looking up a cgroup while the | ||
4717 | * cgroup is being freed. | ||
4718 | */ | ||
4719 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
4720 | cgrp->id = -1; | ||
4721 | |||
4722 | dput(d); | 4747 | dput(d); |
4723 | 4748 | ||
4724 | set_bit(CGRP_RELEASABLE, &parent->flags); | 4749 | set_bit(CGRP_RELEASABLE, &parent->flags); |
@@ -5063,6 +5088,22 @@ out: | |||
5063 | return err; | 5088 | return err; |
5064 | } | 5089 | } |
5065 | 5090 | ||
5091 | static int __init cgroup_wq_init(void) | ||
5092 | { | ||
5093 | /* | ||
5094 | * There isn't much point in executing the destruction path in | ||
5095 | * parallel. A good chunk is serialized with cgroup_mutex anyway. | ||
5096 | * Use 1 for @max_active. | ||
5097 | * | ||
5098 | * We would prefer to do this in cgroup_init() above, but that | ||
5099 | * is called before init_workqueues(), so leave this until after. | ||
5100 | */ | ||
5101 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); | ||
5102 | BUG_ON(!cgroup_destroy_wq); | ||
5103 | return 0; | ||
5104 | } | ||
5105 | core_initcall(cgroup_wq_init); | ||
5106 | |||
5066 | /* | 5107 | /* |
5067 | * proc_cgroup_show() | 5108 | * proc_cgroup_show() |
5068 | * - Print task's cgroup paths into seq_file, one line for each hierarchy | 5109 | * - Print task's cgroup paths into seq_file, one line for each hierarchy |