cgroup: fix frequent -EBUSY at rmdir

In the following situation, with the memory subsystem,

	/groupA use_hierarchy==1
		/01 some tasks
		/02 some tasks
		/03 some tasks
		/04 empty

when tasks under 01/02/03 hit the limit on /groupA, hierarchical reclaim is triggered and the kernel walks the tree under groupA. In this case, rmdir /groupA/04 frequently fails with -EBUSY because of temporary refcnts held by the kernel.

In general, a cgroup can be rmdir'd if it has no child groups and no tasks. Frequent rmdir() failures are not useful to users. (And in most cases the reason for -EBUSY is unknown to users.)

This patch tries to modify the above behavior by
 - retrying if a css refcnt is held by someone.
 - adding a return value to pre_destroy(), which allows a subsystem to say "we're really busy!"

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> 2009-04-02 19:57:26 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-04-02 22:04:54 -0400
commit: ec64f51545fffbc4cb968f0cea56341a4b07e85a (patch)
tree: 575d890a6759d81f3324fa2a22ca6ab14a41eefc /kernel
parent: 38460b48d06440de46b34cb778bd6c4855030754 (diff)
1 files changed, 67 insertions, 14 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d3c521137425..fc5e4a48582f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 * Call subsys's pre_destroy handler.
 * This is called before css refcnt check.
 */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
        struct cgroup_subsys *ss;
+        int ret = 0;
        for_each_subsys(cgrp->root, ss)
-                if (ss->pre_destroy)
+                if (ss->pre_destroy) {
-                        ss->pre_destroy(ss, cgrp);
+                        ret = ss->pre_destroy(ss, cgrp);
-        return;
+                        if (ret)
+                                break;
+                }
+        return ret;
 }
 static void free_cgroup_rcu(struct rcu_head *obj)
@@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
        remove_dir(dentry);
 }
+/*
+ * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
+ * reference to css->refcnt. In general, this refcnt is expected to goes down
+ * to zero, soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+{
+        if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+                wake_up_all(&cgroup_rmdir_waitq);
+}
 static int rebind_subsystems(struct cgroupfs_root *root,
                              unsigned long final_bits)
 {
@@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
        synchronize_rcu();
        put_css_set(cg);
+        /*
+         * wake up rmdir() waiter. the rmdir should fail since the cgroup
+         * is no longer empty.
+         */
+        cgroup_wakeup_rmdir_waiters(cgrp);
        return 0;
 }
@@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        struct cgroup *cgrp = dentry->d_fsdata;
        struct dentry *d;
        struct cgroup *parent;
+        DEFINE_WAIT(wait);
+        int ret;
        /* the vfs holds both inode->i_mutex already */
+again:
        mutex_lock(&cgroup_mutex);
        if (atomic_read(&cgrp->count) != 0) {
                mutex_unlock(&cgroup_mutex);
@@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
         * Call pre_destroy handlers of subsys. Notify subsystems
         * that rmdir() request comes.
         */
-        cgroup_call_pre_destroy(cgrp);
+        ret = cgroup_call_pre_destroy(cgrp);
+        if (ret)
+                return ret;
        mutex_lock(&cgroup_mutex);
        parent = cgrp->parent;
+        if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
-        if (atomic_read(&cgrp->count)
-            || !list_empty(&cgrp->children)
-            || !cgroup_clear_css_refs(cgrp)) {
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
        }
+        /*
+         * css_put/get is provided for subsys to grab refcnt to css. In typical
+         * case, subsystem has no reference after pre_destroy(). But, under
+         * hierarchy management, some *temporal* refcnt can be hold.
+         * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
+         * is really busy, it should return -EBUSY at pre_destroy(). wake_up
+         * is called when css_put() is called and refcnt goes down to 0.
+         */
+        set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+        prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+        if (!cgroup_clear_css_refs(cgrp)) {
+                mutex_unlock(&cgroup_mutex);
+                schedule();
+                finish_wait(&cgroup_rmdir_waitq, &wait);
+                clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+                if (signal_pending(current))
+                        return -EINTR;
+                goto again;
+        }
+        /* NO css_tryget() can success after here. */
+        finish_wait(&cgroup_rmdir_waitq, &wait);
+        clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
        spin_lock(&release_list_lock);
        set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css)
 {
        struct cgroup *cgrp = css->cgroup;
        rcu_read_lock();
-        if ((atomic_dec_return(&css->refcnt) == 1) &&
+        if (atomic_dec_return(&css->refcnt) == 1) {
-            notify_on_release(cgrp)) {
+                if (notify_on_release(cgrp)) {
-                set_bit(CGRP_RELEASABLE, &cgrp->flags);
+                        set_bit(CGRP_RELEASABLE, &cgrp->flags);
-                check_for_release(cgrp);
+                        check_for_release(cgrp);
+                }
+                cgroup_wakeup_rmdir_waiters(cgrp);
        }
        rcu_read_unlock();
 }
author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2009-04-02 19:57:26 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-04-02 22:04:54 -0400
commit	ec64f51545fffbc4cb968f0cea56341a4b07e85a (patch)
tree	575d890a6759d81f3324fa2a22ca6ab14a41eefc /kernel
parent	38460b48d06440de46b34cb778bd6c4855030754 (diff)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d3c521137425..fc5e4a48582f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
622	* Call subsys's pre_destroy handler.	622	* Call subsys's pre_destroy handler.
623	* This is called before css refcnt check.	623	* This is called before css refcnt check.
624	*/	624	*/
625	static void cgroup_call_pre_destroy(struct cgroup *cgrp)	625	static int cgroup_call_pre_destroy(struct cgroup *cgrp)
626	{	626	{
627	struct cgroup_subsys *ss;	627	struct cgroup_subsys *ss;
		628	int ret = 0;
		629
628	for_each_subsys(cgrp->root, ss)	630	for_each_subsys(cgrp->root, ss)
629	if (ss->pre_destroy)	631	if (ss->pre_destroy) {
630	ss->pre_destroy(ss, cgrp);	632	ret = ss->pre_destroy(ss, cgrp);
631	return;	633	if (ret)
		634	break;
		635	}
		636	return ret;
632	}	637	}
633		638
634	static void free_cgroup_rcu(struct rcu_head *obj)	639	static void free_cgroup_rcu(struct rcu_head *obj)
@@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
722	remove_dir(dentry);	727	remove_dir(dentry);
723	}	728	}
724		729
		730	/*
		731	* A queue for waiters to do rmdir() cgroup. A tasks will sleep when
		732	* cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
		733	* reference to css->refcnt. In general, this refcnt is expected to goes down
		734	* to zero, soon.
		735	*
		736	* CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
		737	*/
		738	DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
		739
		740	static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
		741	{
		742	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
		743	wake_up_all(&cgroup_rmdir_waitq);
		744	}
		745
725	static int rebind_subsystems(struct cgroupfs_root *root,	746	static int rebind_subsystems(struct cgroupfs_root *root,
726	unsigned long final_bits)	747	unsigned long final_bits)
727	{	748	{
@@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1317	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);	1338	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1318	synchronize_rcu();	1339	synchronize_rcu();
1319	put_css_set(cg);	1340	put_css_set(cg);
		1341
		1342	/*
		1343	* wake up rmdir() waiter. the rmdir should fail since the cgroup
		1344	* is no longer empty.
		1345	*/
		1346	cgroup_wakeup_rmdir_waiters(cgrp);
1320	return 0;	1347	return 0;
1321	}	1348	}
1322		1349
@@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2608	struct cgroup *cgrp = dentry->d_fsdata;	2635	struct cgroup *cgrp = dentry->d_fsdata;
2609	struct dentry *d;	2636	struct dentry *d;
2610	struct cgroup *parent;	2637	struct cgroup *parent;
		2638	DEFINE_WAIT(wait);
		2639	int ret;
2611		2640
2612	/* the vfs holds both inode->i_mutex already */	2641	/* the vfs holds both inode->i_mutex already */
2613		2642	again:
2614	mutex_lock(&cgroup_mutex);	2643	mutex_lock(&cgroup_mutex);
2615	if (atomic_read(&cgrp->count) != 0) {	2644	if (atomic_read(&cgrp->count) != 0) {
2616	mutex_unlock(&cgroup_mutex);	2645	mutex_unlock(&cgroup_mutex);
@@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2626	* Call pre_destroy handlers of subsys. Notify subsystems	2655	* Call pre_destroy handlers of subsys. Notify subsystems
2627	* that rmdir() request comes.	2656	* that rmdir() request comes.
2628	*/	2657	*/
2629	cgroup_call_pre_destroy(cgrp);	2658	ret = cgroup_call_pre_destroy(cgrp);
		2659	if (ret)
		2660	return ret;
2630		2661
2631	mutex_lock(&cgroup_mutex);	2662	mutex_lock(&cgroup_mutex);
2632	parent = cgrp->parent;	2663	parent = cgrp->parent;
2633		2664	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2634	if (atomic_read(&cgrp->count)
2635	|| !list_empty(&cgrp->children)
2636	|| !cgroup_clear_css_refs(cgrp)) {
2637	mutex_unlock(&cgroup_mutex);	2665	mutex_unlock(&cgroup_mutex);
2638	return -EBUSY;	2666	return -EBUSY;
2639	}	2667	}
		2668	/*
		2669	* css_put/get is provided for subsys to grab refcnt to css. In typical
		2670	* case, subsystem has no reference after pre_destroy(). But, under
		2671	* hierarchy management, some *temporal* refcnt can be hold.
		2672	* To avoid returning -EBUSY to a user, waitqueue is used. If subsys
		2673	* is really busy, it should return -EBUSY at pre_destroy(). wake_up
		2674	* is called when css_put() is called and refcnt goes down to 0.
		2675	*/
		2676	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		2677	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
		2678
		2679	if (!cgroup_clear_css_refs(cgrp)) {
		2680	mutex_unlock(&cgroup_mutex);
		2681	schedule();
		2682	finish_wait(&cgroup_rmdir_waitq, &wait);
		2683	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		2684	if (signal_pending(current))
		2685	return -EINTR;
		2686	goto again;
		2687	}
		2688	/* NO css_tryget() can success after here. */
		2689	finish_wait(&cgroup_rmdir_waitq, &wait);
		2690	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2640		2691
2641	spin_lock(&release_list_lock);	2692	spin_lock(&release_list_lock);
2642	set_bit(CGRP_REMOVED, &cgrp->flags);	2693	set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css)
3194	{	3245	{
3195	struct cgroup *cgrp = css->cgroup;	3246	struct cgroup *cgrp = css->cgroup;
3196	rcu_read_lock();	3247	rcu_read_lock();
3197	if ((atomic_dec_return(&css->refcnt) == 1) &&	3248	if (atomic_dec_return(&css->refcnt) == 1) {
3198	notify_on_release(cgrp)) {	3249	if (notify_on_release(cgrp)) {
3199	set_bit(CGRP_RELEASABLE, &cgrp->flags);	3250	set_bit(CGRP_RELEASABLE, &cgrp->flags);
3200	check_for_release(cgrp);	3251	check_for_release(cgrp);
		3252	}
		3253	cgroup_wakeup_rmdir_waiters(cgrp);
3201	}	3254	}
3202	rcu_read_unlock();	3255	rcu_read_unlock();
3203	}	3256	}