aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2009-04-02 19:57:26 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-04-02 22:04:54 -0400
commitec64f51545fffbc4cb968f0cea56341a4b07e85a (patch)
tree575d890a6759d81f3324fa2a22ca6ab14a41eefc
parent38460b48d06440de46b34cb778bd6c4855030754 (diff)
cgroup: fix frequent -EBUSY at rmdir
In the following situation, with the memory subsystem, /groupA use_hierarchy==1 /01 some tasks /02 some tasks /03 some tasks /04 empty When tasks under 01/02/03 hit the limit on /groupA, hierarchical reclaim is triggered and the kernel walks the tree under groupA. In this case, rmdir /groupA/04 frequently fails with -EBUSY because of a temporary refcnt held by the kernel. In general, a cgroup can be rmdir'd if it has no child groups and no tasks. Frequent failures of rmdir() are not useful to users. (And the reason for -EBUSY is unknown to users in most cases.) This patch tries to modify the above behavior by - retrying if css_refcnt is held by someone. - adding a "return value" to pre_destroy(), which allows a subsystem to say "we're really busy!" Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/cgroups/cgroups.txt6
-rw-r--r--include/linux/cgroup.h6
-rw-r--r--kernel/cgroup.c81
-rw-r--r--mm/memcontrol.c5
4 files changed, 79 insertions, 19 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 93feb8444489..cdc46a501b85 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a
476newly-created cgroup if an error occurs after this subsystem's 476newly-created cgroup if an error occurs after this subsystem's
477create() method has been called for the new cgroup). 477create() method has been called for the new cgroup).
478 478
479void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); 479int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
480 480
481Called before checking the reference count on each subsystem. This may 481Called before checking the reference count on each subsystem. This may
482be useful for subsystems which have some extra references even if 482be useful for subsystems which have some extra references even if
483there are not tasks in the cgroup. 483there are not tasks in the cgroup. If pre_destroy() returns an error code,
484rmdir() will fail with it. Because of this behavior, pre_destroy() can be
485called multiple times against a cgroup.
484 486
485int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 487int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
486 struct task_struct *task) 488 struct task_struct *task)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9a23bb098205..7d824b80b3d7 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -135,6 +135,10 @@ enum {
135 CGRP_RELEASABLE, 135 CGRP_RELEASABLE,
136 /* Control Group requires release notifications to userspace */ 136 /* Control Group requires release notifications to userspace */
137 CGRP_NOTIFY_ON_RELEASE, 137 CGRP_NOTIFY_ON_RELEASE,
138 /*
139 * A thread in rmdir() is waiting for this cgroup.
140 */
141 CGRP_WAIT_ON_RMDIR,
138}; 142};
139 143
140struct cgroup { 144struct cgroup {
@@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
360struct cgroup_subsys { 364struct cgroup_subsys {
361 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, 365 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
362 struct cgroup *cgrp); 366 struct cgroup *cgrp);
363 void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 367 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
364 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 368 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
365 int (*can_attach)(struct cgroup_subsys *ss, 369 int (*can_attach)(struct cgroup_subsys *ss,
366 struct cgroup *cgrp, struct task_struct *tsk); 370 struct cgroup *cgrp, struct task_struct *tsk);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d3c521137425..fc5e4a48582f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
622 * Call subsys's pre_destroy handler. 622 * Call subsys's pre_destroy handler.
623 * This is called before css refcnt check. 623 * This is called before css refcnt check.
624 */ 624 */
625static void cgroup_call_pre_destroy(struct cgroup *cgrp) 625static int cgroup_call_pre_destroy(struct cgroup *cgrp)
626{ 626{
627 struct cgroup_subsys *ss; 627 struct cgroup_subsys *ss;
628 int ret = 0;
629
628 for_each_subsys(cgrp->root, ss) 630 for_each_subsys(cgrp->root, ss)
629 if (ss->pre_destroy) 631 if (ss->pre_destroy) {
630 ss->pre_destroy(ss, cgrp); 632 ret = ss->pre_destroy(ss, cgrp);
631 return; 633 if (ret)
634 break;
635 }
636 return ret;
632} 637}
633 638
634static void free_cgroup_rcu(struct rcu_head *obj) 639static void free_cgroup_rcu(struct rcu_head *obj)
@@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
722 remove_dir(dentry); 727 remove_dir(dentry);
723} 728}
724 729
730/*
731 * A queue for waiters doing rmdir() on a cgroup. A task will sleep when
732 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
733 * reference to css->refcnt. In general, this refcnt is expected to go down
734 * to zero soon.
735 *
736 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
737 */
738DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
739
740static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
741{
742 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
743 wake_up_all(&cgroup_rmdir_waitq);
744}
745
725static int rebind_subsystems(struct cgroupfs_root *root, 746static int rebind_subsystems(struct cgroupfs_root *root,
726 unsigned long final_bits) 747 unsigned long final_bits)
727{ 748{
@@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1317 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1338 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1318 synchronize_rcu(); 1339 synchronize_rcu();
1319 put_css_set(cg); 1340 put_css_set(cg);
1341
1342 /*
1343 * Wake up the rmdir() waiter. The rmdir should fail since the cgroup
1344 * is no longer empty.
1345 */
1346 cgroup_wakeup_rmdir_waiters(cgrp);
1320 return 0; 1347 return 0;
1321} 1348}
1322 1349
@@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2608 struct cgroup *cgrp = dentry->d_fsdata; 2635 struct cgroup *cgrp = dentry->d_fsdata;
2609 struct dentry *d; 2636 struct dentry *d;
2610 struct cgroup *parent; 2637 struct cgroup *parent;
2638 DEFINE_WAIT(wait);
2639 int ret;
2611 2640
2612 /* the vfs holds both inode->i_mutex already */ 2641 /* the vfs holds both inode->i_mutex already */
2613 2642again:
2614 mutex_lock(&cgroup_mutex); 2643 mutex_lock(&cgroup_mutex);
2615 if (atomic_read(&cgrp->count) != 0) { 2644 if (atomic_read(&cgrp->count) != 0) {
2616 mutex_unlock(&cgroup_mutex); 2645 mutex_unlock(&cgroup_mutex);
@@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2626 * Call pre_destroy handlers of subsys. Notify subsystems 2655 * Call pre_destroy handlers of subsys. Notify subsystems
2627 * that rmdir() request comes. 2656 * that rmdir() request comes.
2628 */ 2657 */
2629 cgroup_call_pre_destroy(cgrp); 2658 ret = cgroup_call_pre_destroy(cgrp);
2659 if (ret)
2660 return ret;
2630 2661
2631 mutex_lock(&cgroup_mutex); 2662 mutex_lock(&cgroup_mutex);
2632 parent = cgrp->parent; 2663 parent = cgrp->parent;
2633 2664 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2634 if (atomic_read(&cgrp->count)
2635 || !list_empty(&cgrp->children)
2636 || !cgroup_clear_css_refs(cgrp)) {
2637 mutex_unlock(&cgroup_mutex); 2665 mutex_unlock(&cgroup_mutex);
2638 return -EBUSY; 2666 return -EBUSY;
2639 } 2667 }
2668 /*
2669 * css_put/get is provided for subsys to grab refcnt to css. In the typical
2670 * case, a subsystem has no reference after pre_destroy(). But, under
2671 * hierarchy management, some *temporary* refcnt can be held.
2672 * To avoid returning -EBUSY to a user, a waitqueue is used. If the subsys
2673 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2674 * is called when css_put() is called and refcnt goes down to 0.
2676 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2677 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2678
2679 if (!cgroup_clear_css_refs(cgrp)) {
2680 mutex_unlock(&cgroup_mutex);
2681 schedule();
2682 finish_wait(&cgroup_rmdir_waitq, &wait);
2683 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2684 if (signal_pending(current))
2685 return -EINTR;
2686 goto again;
2687 }
2688 /* No css_tryget() can succeed after this point. */
2689 finish_wait(&cgroup_rmdir_waitq, &wait);
2690 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2640 2691
2641 spin_lock(&release_list_lock); 2692 spin_lock(&release_list_lock);
2642 set_bit(CGRP_REMOVED, &cgrp->flags); 2693 set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css)
3194{ 3245{
3195 struct cgroup *cgrp = css->cgroup; 3246 struct cgroup *cgrp = css->cgroup;
3196 rcu_read_lock(); 3247 rcu_read_lock();
3197 if ((atomic_dec_return(&css->refcnt) == 1) && 3248 if (atomic_dec_return(&css->refcnt) == 1) {
3198 notify_on_release(cgrp)) { 3249 if (notify_on_release(cgrp)) {
3199 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3250 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3200 check_for_release(cgrp); 3251 check_for_release(cgrp);
3252 }
3253 cgroup_wakeup_rmdir_waiters(cgrp);
3201 } 3254 }
3202 rcu_read_unlock(); 3255 rcu_read_unlock();
3203} 3256}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e4be9cb2a6a..8ffec674c5ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2272,11 +2272,12 @@ free_out:
2272 return ERR_PTR(-ENOMEM); 2272 return ERR_PTR(-ENOMEM);
2273} 2273}
2274 2274
2275static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 2275static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2276 struct cgroup *cont) 2276 struct cgroup *cont)
2277{ 2277{
2278 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2278 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2279 mem_cgroup_force_empty(mem, false); 2279
2280 return mem_cgroup_force_empty(mem, false);
2280} 2281}
2281 2282
2282static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2283static void mem_cgroup_destroy(struct cgroup_subsys *ss,