aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2012-11-05 12:16:59 -0500
committerTejun Heo <tj@kernel.org>2012-11-05 12:16:59 -0500
commitb25ed609d0eecf077db607e88ea70bae83b6adb2 (patch)
treeba6d9f2b6d1f0a389acb5a52cd4498c0f5ee6060
parent1a90dd508b0b00e382fd61a46f55dc889ac21b39 (diff)
cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir()
CGRP_WAIT_ON_RMDIR is another kludge which was added to make cgroup destruction rollback somewhat working. cgroup_rmdir() used to drain CSS references and CGRP_WAIT_ON_RMDIR and the associated waitqueue and helpers were used to allow the task performing rmdir to wait for the next relevant event. Unfortunately, the wait is visible to controllers too and the mechanism got exposed to memcg by 887032670d ("cgroup avoid permanent sleep at rmdir"). Now that the draining and retries are gone, CGRP_WAIT_ON_RMDIR is unnecessary. Remove it and all the mechanisms supporting it. Note that memcontrol.c changes are essentially revert of 887032670d ("cgroup avoid permanent sleep at rmdir"). Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Li Zefan <lizefan@huawei.com> Cc: Balbir Singh <bsingharora@gmail.com>
-rw-r--r--include/linux/cgroup.h21
-rw-r--r--kernel/cgroup.c51
-rw-r--r--mm/memcontrol.c24
3 files changed, 1 insertions, 95 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a3098046250b..47868a86ba2b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -145,10 +145,6 @@ enum {
145 /* Control Group requires release notifications to userspace */ 145 /* Control Group requires release notifications to userspace */
146 CGRP_NOTIFY_ON_RELEASE, 146 CGRP_NOTIFY_ON_RELEASE,
147 /* 147 /*
148 * A thread in rmdir() is waiting for this cgroup.
149 */
150 CGRP_WAIT_ON_RMDIR,
151 /*
152 * Clone cgroup values when creating a new child cgroup 148 * Clone cgroup values when creating a new child cgroup
153 */ 149 */
154 CGRP_CLONE_CHILDREN, 150 CGRP_CLONE_CHILDREN,
@@ -412,23 +408,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
412int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 408int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
413 409
414/* 410/*
415 * When the subsys has to access css and may add permanent refcnt to css,
416 * it should take care of racy conditions with rmdir(). Following set of
417 * functions, is for stop/restart rmdir if necessary.
418 * Because these will call css_get/put, "css" should be alive css.
419 *
420 * cgroup_exclude_rmdir();
421 * ...do some jobs which may access arbitrary empty cgroup
422 * cgroup_release_and_wakeup_rmdir();
423 *
424 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
425 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake it up.
426 */
427
428void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
429void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
430
431/*
432 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 411 * Control Group taskset, used to pass around set of tasks to cgroup_subsys
433 * methods. 412 * methods.
434 */ 413 */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66204a6f68f3..c5f6fb28dd0e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -966,33 +966,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
966} 966}
967 967
968/* 968/*
969 * A queue for waiters to do rmdir() cgroup. A task will sleep when
970 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
971 * reference to css->refcnt. In general, this refcnt is expected to go down
972 * to zero, soon.
973 *
974 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
975 */
976static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
977
978static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
979{
980 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
981 wake_up_all(&cgroup_rmdir_waitq);
982}
983
984void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
985{
986 css_get(css);
987}
988
989void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
990{
991 cgroup_wakeup_rmdir_waiter(css->cgroup);
992 css_put(css);
993}
994
995/*
996 * Call with cgroup_mutex held. Drops reference counts on modules, including 969 * Call with cgroup_mutex held. Drops reference counts on modules, including
997 * any duplicate ones that parse_cgroupfs_options took. If this function 970 * any duplicate ones that parse_cgroupfs_options took. If this function
998 * returns an error, no reference counts are touched. 971 * returns an error, no reference counts are touched.
@@ -1963,12 +1936,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1963 } 1936 }
1964 1937
1965 synchronize_rcu(); 1938 synchronize_rcu();
1966
1967 /*
1968 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1969 * is no longer empty.
1970 */
1971 cgroup_wakeup_rmdir_waiter(cgrp);
1972out: 1939out:
1973 if (retval) { 1940 if (retval) {
1974 for_each_subsys(root, ss) { 1941 for_each_subsys(root, ss) {
@@ -2138,7 +2105,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2138 * step 5: success! and cleanup 2105 * step 5: success! and cleanup
2139 */ 2106 */
2140 synchronize_rcu(); 2107 synchronize_rcu();
2141 cgroup_wakeup_rmdir_waiter(cgrp);
2142 retval = 0; 2108 retval = 0;
2143out_put_css_set_refs: 2109out_put_css_set_refs:
2144 if (retval) { 2110 if (retval) {
@@ -4058,26 +4024,13 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4058 struct cgroup_event *event, *tmp; 4024 struct cgroup_event *event, *tmp;
4059 struct cgroup_subsys *ss; 4025 struct cgroup_subsys *ss;
4060 4026
4061 /*
4062 * In general, subsystem has no css->refcnt after pre_destroy(). But
4063 * in racy cases, subsystem may have to get css->refcnt after
4064 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4065 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
4066 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4067 * and subsystem's reference count handling. Please see css_get/put
4068 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4069 */
4070 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4071
4072 /* the vfs holds both inode->i_mutex already */ 4027 /* the vfs holds both inode->i_mutex already */
4073 mutex_lock(&cgroup_mutex); 4028 mutex_lock(&cgroup_mutex);
4074 parent = cgrp->parent; 4029 parent = cgrp->parent;
4075 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 4030 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4076 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4077 mutex_unlock(&cgroup_mutex); 4031 mutex_unlock(&cgroup_mutex);
4078 return -EBUSY; 4032 return -EBUSY;
4079 } 4033 }
4080 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4081 4034
4082 /* 4035 /*
4083 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4036 * Block new css_tryget() by deactivating refcnt and mark @cgrp
@@ -4114,9 +4067,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4114 for_each_subsys(cgrp->root, ss) 4067 for_each_subsys(cgrp->root, ss)
4115 css_put(cgrp->subsys[ss->subsys_id]); 4068 css_put(cgrp->subsys[ss->subsys_id]);
4116 4069
4117 finish_wait(&cgroup_rmdir_waitq, &wait);
4118 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4119
4120 raw_spin_lock(&release_list_lock); 4070 raw_spin_lock(&release_list_lock);
4121 if (!list_empty(&cgrp->release_list)) 4071 if (!list_empty(&cgrp->release_list))
4122 list_del_init(&cgrp->release_list); 4072 list_del_init(&cgrp->release_list);
@@ -4864,7 +4814,6 @@ void __css_put(struct cgroup_subsys_state *css)
4864 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4814 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4865 check_for_release(cgrp); 4815 check_for_release(cgrp);
4866 } 4816 }
4867 cgroup_wakeup_rmdir_waiter(cgrp);
4868 break; 4817 break;
4869 case 0: 4818 case 0:
4870 schedule_work(&css->dput_work); 4819 schedule_work(&css->dput_work);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 37c356646544..930edfaa5187 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2681,13 +2681,6 @@ static int mem_cgroup_move_account(struct page *page,
2681 /* caller should have done css_get */ 2681 /* caller should have done css_get */
2682 pc->mem_cgroup = to; 2682 pc->mem_cgroup = to;
2683 mem_cgroup_charge_statistics(to, anon, nr_pages); 2683 mem_cgroup_charge_statistics(to, anon, nr_pages);
2684 /*
2685 * We charge against "to" which may not have any tasks. Then, "to"
2686 * can be under rmdir(). But in current implementation, caller of
2687 * this function is just force_empty() and move charge, so it's
2688 * guaranteed that "to" is never removed. So, we don't check rmdir
2689 * status here.
2690 */
2691 move_unlock_mem_cgroup(from, &flags); 2684 move_unlock_mem_cgroup(from, &flags);
2692 ret = 0; 2685 ret = 0;
2693unlock: 2686unlock:
@@ -2893,7 +2886,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2893 return; 2886 return;
2894 if (!memcg) 2887 if (!memcg)
2895 return; 2888 return;
2896 cgroup_exclude_rmdir(&memcg->css);
2897 2889
2898 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 2890 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2899 /* 2891 /*
@@ -2907,12 +2899,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2907 swp_entry_t ent = {.val = page_private(page)}; 2899 swp_entry_t ent = {.val = page_private(page)};
2908 mem_cgroup_uncharge_swap(ent); 2900 mem_cgroup_uncharge_swap(ent);
2909 } 2901 }
2910 /*
2911 * At swapin, we may charge account against cgroup which has no tasks.
2912 * So, rmdir()->pre_destroy() can be called while we do this charge.
2913 * In that case, we need to call pre_destroy() again. check it here.
2914 */
2915 cgroup_release_and_wakeup_rmdir(&memcg->css);
2916} 2902}
2917 2903
2918void mem_cgroup_commit_charge_swapin(struct page *page, 2904void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3360,8 +3346,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3360 3346
3361 if (!memcg) 3347 if (!memcg)
3362 return; 3348 return;
3363 /* blocks rmdir() */ 3349
3364 cgroup_exclude_rmdir(&memcg->css);
3365 if (!migration_ok) { 3350 if (!migration_ok) {
3366 used = oldpage; 3351 used = oldpage;
3367 unused = newpage; 3352 unused = newpage;
@@ -3395,13 +3380,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3395 */ 3380 */
3396 if (anon) 3381 if (anon)
3397 mem_cgroup_uncharge_page(used); 3382 mem_cgroup_uncharge_page(used);
3398 /*
3399 * At migration, we may charge account against cgroup which has no
3400 * tasks.
3401 * So, rmdir()->pre_destroy() can be called while we do this charge.
3402 * In that case, we need to call pre_destroy() again. check it here.
3403 */
3404 cgroup_release_and_wakeup_rmdir(&memcg->css);
3405} 3383}
3406 3384
3407/* 3385/*