author		KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2009-04-02 19:57:26 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-04-02 22:04:54 -0400
commit		ec64f51545fffbc4cb968f0cea56341a4b07e85a (patch)
tree		575d890a6759d81f3324fa2a22ca6ab14a41eefc /kernel
parent		38460b48d06440de46b34cb778bd6c4855030754 (diff)
cgroup: fix frequent -EBUSY at rmdir
In the following situation, with the memory subsystem:

	/groupA use_hierarchy==1
		/01 some tasks
		/02 some tasks
		/03 some tasks
		/04 empty

When tasks under 01/02/03 hit the limit on /groupA, hierarchical reclaim is triggered and the kernel walks the tree under groupA. In this case, rmdir /groupA/04 frequently fails with -EBUSY because of temporary refcounts taken by the kernel.

In general, a cgroup can be rmdir'd if it has no child groups and no tasks. Frequent failures of rmdir() are not useful to users (and in most cases the reason for the -EBUSY is unknown to them).

This patch modifies the above behavior by:
	- retrying if someone holds a css refcount.
	- adding a return value to pre_destroy(), allowing a subsystem to say "we're really busy!"

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
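For illustration only, here is a minimal sketch of what a subsystem's pre_destroy() handler can do under the new contract. The subsystem name and the busy-check helper are hypothetical, not from this patch; the callback signature matches the in-tree pre_destroy() of this era. A nonzero return now propagates out of cgroup_rmdir(), so the subsystem can distinguish "really busy" from transient internal references:

	/*
	 * Hypothetical sketch: a subsystem's pre_destroy() under the new
	 * contract. mysubsys_group_busy() is an assumed helper, not a
	 * real kernel function.
	 */
	static int mysubsys_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cgrp)
	{
		if (mysubsys_group_busy(cgrp))
			return -EBUSY;	/* rmdir() fails immediately with this */

		return 0;	/* OK to proceed; transient css refs are waited out */
	}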
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/cgroup.c	81
1 file changed, 67 insertions(+), 14 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d3c521137425..fc5e4a48582f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  * Call subsys's pre_destroy handler.
  * This is called before css refcnt check.
  */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
+	int ret = 0;
+
 	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy)
-			ss->pre_destroy(ss, cgrp);
-	return;
+		if (ss->pre_destroy) {
+			ret = ss->pre_destroy(ss, cgrp);
+			if (ret)
+				break;
+		}
+	return ret;
 }
 
 static void free_cgroup_rcu(struct rcu_head *obj)
@@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 	remove_dir(dentry);
 }
 
+/*
+ * A queue for waiters doing rmdir() on a cgroup. A task will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && a subsys still has
+ * some reference to css->refcnt. In general, this refcnt is expected to go
+ * down to zero soon.
+ *
+ * The CGRP_WAIT_ON_RMDIR flag is modified under the cgroup's inode->i_mutex.
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+{
+	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+		wake_up_all(&cgroup_rmdir_waitq);
+}
+
 static int rebind_subsystems(struct cgroupfs_root *root,
 			      unsigned long final_bits)
 {
@@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
 	put_css_set(cg);
+
+	/*
+	 * Wake up rmdir() waiters: the rmdir should fail since the cgroup
+	 * is no longer empty.
+	 */
+	cgroup_wakeup_rmdir_waiters(cgrp);
 	return 0;
 }
 
@@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
+	DEFINE_WAIT(wait);
+	int ret;
 
 	/* the vfs holds both inode->i_mutex already */
-
+again:
 	mutex_lock(&cgroup_mutex);
 	if (atomic_read(&cgrp->count) != 0) {
 		mutex_unlock(&cgroup_mutex);
@@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	 * Call pre_destroy handlers of subsys. Notify subsystems
 	 * that rmdir() request comes.
 	 */
-	cgroup_call_pre_destroy(cgrp);
+	ret = cgroup_call_pre_destroy(cgrp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
-
-	if (atomic_read(&cgrp->count)
-	    || !list_empty(&cgrp->children)
-	    || !cgroup_clear_css_refs(cgrp)) {
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
+	/*
+	 * css_put/get lets a subsys grab a refcnt to a css. Typically the
+	 * subsystem has no reference left after pre_destroy(), but under
+	 * hierarchy management some *temporary* refcnts can still be held.
+	 * To avoid returning -EBUSY to the user, a waitqueue is used: if a
+	 * subsys is really busy it should return -EBUSY from pre_destroy();
+	 * wake_up is called when css_put() drops the refcnt to 0.
+	 */
+	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+
+	if (!cgroup_clear_css_refs(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		schedule();
+		finish_wait(&cgroup_rmdir_waitq, &wait);
+		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+		if (signal_pending(current))
+			return -EINTR;
+		goto again;
+	}
+	/* No css_tryget() can succeed after this point. */
+	finish_wait(&cgroup_rmdir_waitq, &wait);
+	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
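The wait/retry protocol in the hunk above can be hard to see through the diff. The following is a self-contained userspace analogy using only POSIX threads, not kernel code: a "remover" sleeps until a temporary reference count drops to zero instead of failing with EBUSY. The mutex, condition variable, and refcnt stand in for cgroup_mutex, cgroup_rmdir_waitq, and css->refcnt respectively; the broadcast plays the role of cgroup_wakeup_rmdir_waiters(). Illustrative only.

	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t refs_gone = PTHREAD_COND_INITIALIZER; /* ~cgroup_rmdir_waitq */
	static int refcnt = 1;                                      /* ~css->refcnt */

	/* Plays the role of a subsystem holding a temporary reference. */
	static void *temp_ref_holder(void *unused)
	{
		sleep(1);			/* hold the reference briefly */
		pthread_mutex_lock(&lock);
		if (--refcnt == 0)		/* ~css_put() reaching zero */
			pthread_cond_broadcast(&refs_gone); /* ~wakeup of rmdir waiters */
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, temp_ref_holder, NULL);

		/* Plays the role of cgroup_rmdir(): wait instead of -EBUSY. */
		pthread_mutex_lock(&lock);
		while (refcnt != 0)		/* ~the "goto again" recheck */
			pthread_cond_wait(&refs_gone, &lock);
		pthread_mutex_unlock(&lock);

		puts("references gone; removal can proceed");
		pthread_join(t, NULL);
		return 0;
	}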
@@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
 	rcu_read_lock();
-	if ((atomic_dec_return(&css->refcnt) == 1) &&
-	    notify_on_release(cgrp)) {
-		set_bit(CGRP_RELEASABLE, &cgrp->flags);
-		check_for_release(cgrp);
+	if (atomic_dec_return(&css->refcnt) == 1) {
+		if (notify_on_release(cgrp)) {
+			set_bit(CGRP_RELEASABLE, &cgrp->flags);
+			check_for_release(cgrp);
+		}
+		cgroup_wakeup_rmdir_waiters(cgrp);
 	}
 	rcu_read_unlock();
 }
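From userspace, the visible change is in rmdir(2)'s failure modes. A hedged illustration follows; the mount point and group path are examples, not anything mandated by the patch:

	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Example path; depends on where the cgroup fs is mounted. */
		const char *path = "/cgroups/groupA/04";

		if (rmdir(path) == 0) {
			puts("removed");
		} else if (errno == EBUSY) {
			/*
			 * After this patch, -EBUSY means the group really has
			 * tasks or children, or a subsystem's pre_destroy()
			 * refused -- not a transient kernel-internal refcount.
			 */
			perror("rmdir");
		} else if (errno == EINTR) {
			/*
			 * New with this patch: the in-kernel wait is
			 * interruptible, so a signal can abort the retry loop.
			 */
			perror("rmdir");
		} else {
			perror("rmdir");
		}
		return 0;
	}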