aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2012-11-05 12:16:59 -0500
committerTejun Heo <tj@kernel.org>2012-11-05 12:16:59 -0500
commitb25ed609d0eecf077db607e88ea70bae83b6adb2 (patch)
treeba6d9f2b6d1f0a389acb5a52cd4498c0f5ee6060
parent1a90dd508b0b00e382fd61a46f55dc889ac21b39 (diff)
cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir()
CGRP_WAIT_ON_RMDIR is another kludge which was added to make cgroup destruction rollback somewhat working. cgroup_rmdir() used to drain CSS references and CGRP_WAIT_ON_RMDIR and the associated waitqueue and helpers were used to allow the task performing rmdir to wait for the next relevant event. Unfortunately, the wait is visible to controllers too and the mechanism got exposed to memcg by 887032670d ("cgroup avoid permanent sleep at rmdir"). Now that the draining and retries are gone, CGRP_WAIT_ON_RMDIR is unnecessary. Remove it and all the mechanisms supporting it. Note that memcontrol.c changes are essentially revert of 887032670d ("cgroup avoid permanent sleep at rmdir"). Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Li Zefan <lizefan@huawei.com> Cc: Balbir Singh <bsingharora@gmail.com>
-rw-r--r--include/linux/cgroup.h21
-rw-r--r--kernel/cgroup.c51
-rw-r--r--mm/memcontrol.c24
3 files changed, 1 insertions, 95 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a3098046250b..47868a86ba2b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -145,10 +145,6 @@ enum {
145 /* Control Group requires release notifications to userspace */ 145 /* Control Group requires release notifications to userspace */
146 CGRP_NOTIFY_ON_RELEASE, 146 CGRP_NOTIFY_ON_RELEASE,
147 /* 147 /*
148 * A thread in rmdir() is waiting for this cgroup.
149 */
150 CGRP_WAIT_ON_RMDIR,
151 /*
152 * Clone cgroup values when creating a new child cgroup 148 * Clone cgroup values when creating a new child cgroup
153 */ 149 */
154 CGRP_CLONE_CHILDREN, 150 CGRP_CLONE_CHILDREN,
@@ -412,23 +408,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
412int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 408int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
413 409
414/* 410/*
415 * When the subsys has to access css and may add permanent refcnt to css,
416 * it should take care of racy conditions with rmdir(). Following set of
417 * functions, is for stop/restart rmdir if necessary.
418 * Because these will call css_get/put, "css" should be alive css.
419 *
420 * cgroup_exclude_rmdir();
421 * ...do some jobs which may access arbitrary empty cgroup
422 * cgroup_release_and_wakeup_rmdir();
423 *
424 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
425 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake it up.
426 */
427
428void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
429void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
430
431/*
432 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 411 * Control Group taskset, used to pass around set of tasks to cgroup_subsys
433 * methods. 412 * methods.
434 */ 413 */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66204a6f68f3..c5f6fb28dd0e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -966,33 +966,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
966} 966}
967 967
968/* 968/*
969 * A queue for waiters to do rmdir() cgroup. A task will sleep when
970 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
971 * reference to css->refcnt. In general, this refcnt is expected to go down
972 * to zero, soon.
973 *
974 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
975 */
976static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
977
978static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
979{
980 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
981 wake_up_all(&cgroup_rmdir_waitq);
982}
983
984void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
985{
986 css_get(css);
987}
988
989void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
990{
991 cgroup_wakeup_rmdir_waiter(css->cgroup);
992 css_put(css);
993}
994
995/*
996 * Call with cgroup_mutex held. Drops reference counts on modules, including 969 * Call with cgroup_mutex held. Drops reference counts on modules, including
997 * any duplicate ones that parse_cgroupfs_options took. If this function 970 * any duplicate ones that parse_cgroupfs_options took. If this function
998 * returns an error, no reference counts are touched. 971 * returns an error, no reference counts are touched.
@@ -1963,12 +1936,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1963 } 1936 }
1964 1937
1965 synchronize_rcu(); 1938 synchronize_rcu();
1966
1967 /*
1968 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1969 * is no longer empty.
1970 */
1971 cgroup_wakeup_rmdir_waiter(cgrp);
1972out: 1939out:
1973 if (retval) { 1940 if (retval) {
1974 for_each_subsys(root, ss) { 1941 for_each_subsys(root, ss) {
@@ -2138,7 +2105,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2138 * step 5: success! and cleanup 2105 * step 5: success! and cleanup
2139 */ 2106 */
2140 synchronize_rcu(); 2107 synchronize_rcu();
2141 cgroup_wakeup_rmdir_waiter(cgrp);
2142 retval = 0; 2108 retval = 0;
2143out_put_css_set_refs: 2109out_put_css_set_refs:
2144 if (retval) { 2110 if (retval) {
@@ -4058,26 +4024,13 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4058 struct cgroup_event *event, *tmp; 4024 struct cgroup_event *event, *tmp;
4059 struct cgroup_subsys *ss; 4025 struct cgroup_subsys *ss;
4060 4026
4061 /*
4062 * In general, subsystem has no css->refcnt after pre_destroy(). But
4063 * in racy cases, subsystem may have to get css->refcnt after
4064 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4065 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
4066 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4067 * and subsystem's reference count handling. Please see css_get/put
4068 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4069 */
4070 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4071
4072 /* the vfs holds both inode->i_mutex already */ 4027 /* the vfs holds both inode->i_mutex already */
4073 mutex_lock(&cgroup_mutex); 4028 mutex_lock(&cgroup_mutex);
4074 parent = cgrp->parent; 4029 parent = cgrp->parent;
4075 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 4030 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4076 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4077 mutex_unlock(&cgroup_mutex); 4031 mutex_unlock(&cgroup_mutex);
4078 return -EBUSY; 4032 return -EBUSY;
4079 } 4033 }
4080 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4081 4034
4082 /* 4035 /*
4083 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4036 * Block new css_tryget() by deactivating refcnt and mark @cgrp
@@ -4114,9 +4067,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4114 for_each_subsys(cgrp->root, ss) 4067 for_each_subsys(cgrp->root, ss)
4115 css_put(cgrp->subsys[ss->subsys_id]); 4068 css_put(cgrp->subsys[ss->subsys_id]);
4116 4069
4117 finish_wait(&cgroup_rmdir_waitq, &wait);
4118 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4119
4120 raw_spin_lock(&release_list_lock); 4070 raw_spin_lock(&release_list_lock);
4121 if (!list_empty(&cgrp->release_list)) 4071 if (!list_empty(&cgrp->release_list))
4122 list_del_init(&cgrp->release_list); 4072 list_del_init(&cgrp->release_list);
@@ -4864,7 +4814,6 @@ void __css_put(struct cgroup_subsys_state *css)
4864 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4814 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4865 check_for_release(cgrp); 4815 check_for_release(cgrp);
4866 } 4816 }
4867 cgroup_wakeup_rmdir_waiter(cgrp);
4868 break; 4817 break;
4869 case 0: 4818 case 0:
4870 schedule_work(&css->dput_work); 4819 schedule_work(&css->dput_work);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 37c356646544..930edfaa5187 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2681,13 +2681,6 @@ static int mem_cgroup_move_account(struct page *page,
2681 /* caller should have done css_get */ 2681 /* caller should have done css_get */
2682 pc->mem_cgroup = to; 2682 pc->mem_cgroup = to;
2683 mem_cgroup_charge_statistics(to, anon, nr_pages); 2683 mem_cgroup_charge_statistics(to, anon, nr_pages);
2684 /*
2685 * We charge against "to" which may not have any tasks. Then, "to"
2686 * can be under rmdir(). But in current implementation, caller of
2687 * this function is just force_empty() and move charge, so it's
2688 * guaranteed that "to" is never removed. So, we don't check rmdir
2689 * status here.
2690 */
2691 move_unlock_mem_cgroup(from, &flags); 2684 move_unlock_mem_cgroup(from, &flags);
2692 ret = 0; 2685 ret = 0;
2693unlock: 2686unlock:
@@ -2893,7 +2886,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2893 return; 2886 return;
2894 if (!memcg) 2887 if (!memcg)
2895 return; 2888 return;
2896 cgroup_exclude_rmdir(&memcg->css);
2897 2889
2898 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 2890 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2899 /* 2891 /*
@@ -2907,12 +2899,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2907 swp_entry_t ent = {.val = page_private(page)}; 2899 swp_entry_t ent = {.val = page_private(page)};
2908 mem_cgroup_uncharge_swap(ent); 2900 mem_cgroup_uncharge_swap(ent);
2909 } 2901 }
2910 /*
2911 * At swapin, we may charge account against cgroup which has no tasks.
2912 * So, rmdir()->pre_destroy() can be called while we do this charge.
2913 * In that case, we need to call pre_destroy() again. check it here.
2914 */
2915 cgroup_release_and_wakeup_rmdir(&memcg->css);
2916} 2902}
2917 2903
2918void mem_cgroup_commit_charge_swapin(struct page *page, 2904void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3360,8 +3346,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3360 3346
3361 if (!memcg) 3347 if (!memcg)
3362 return; 3348 return;
3363 /* blocks rmdir() */ 3349
3364 cgroup_exclude_rmdir(&memcg->css);
3365 if (!migration_ok) { 3350 if (!migration_ok) {
3366 used = oldpage; 3351 used = oldpage;
3367 unused = newpage; 3352 unused = newpage;
@@ -3395,13 +3380,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3395 */ 3380 */
3396 if (anon) 3381 if (anon)
3397 mem_cgroup_uncharge_page(used); 3382 mem_cgroup_uncharge_page(used);
3398 /*
3399 * At migration, we may charge account against cgroup which has no
3400 * tasks.
3401 * So, rmdir()->pre_destroy() can be called while we do this charge.
3402 * In that case, we need to call pre_destroy() again. check it here.
3403 */
3404 cgroup_release_and_wakeup_rmdir(&memcg->css);
3405} 3383}
3406 3384
3407/* 3385/*