diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-07-29 18:04:06 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-07-29 22:10:35 -0400 |
commit | 887032670d47366a8c8f25396ea7c14b7b2cc620 (patch) | |
tree | e5f9ece5ab9239648e8d7051ccb9a217d92553d7 /kernel | |
parent | f0d83679a8d471dc8b646919f70595d6fe8c9606 (diff) |
cgroup avoid permanent sleep at rmdir
After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix
frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg)
doesn't return -EBUSY by temporary ref counts. That commit expects all
refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can
wait permanently. This patch tries to fix that and change followings.
- set CGRP_WAIT_ON_RMDIR flag before pre_destroy().
- clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case.
if there are sleeping ones, wakes them up.
- rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set.
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by: Paul Menage <menage@google.com>
Acked-by: Balbir Sigh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 55 |
1 files changed, 38 insertions, 17 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250dac05680f..b6eadfe30e7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -735,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
735 | * reference to css->refcnt. In general, this refcnt is expected to goes down | 735 | * reference to css->refcnt. In general, this refcnt is expected to goes down |
736 | * to zero, soon. | 736 | * to zero, soon. |
737 | * | 737 | * |
738 | * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; | 738 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; |
739 | */ | 739 | */ |
740 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | 740 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); |
741 | 741 | ||
742 | static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) | 742 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) |
743 | { | 743 | { |
744 | if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | 744 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) |
745 | wake_up_all(&cgroup_rmdir_waitq); | 745 | wake_up_all(&cgroup_rmdir_waitq); |
746 | } | 746 | } |
747 | 747 | ||
748 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
749 | { | ||
750 | css_get(css); | ||
751 | } | ||
752 | |||
753 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
754 | { | ||
755 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
756 | css_put(css); | ||
757 | } | ||
758 | |||
759 | |||
748 | static int rebind_subsystems(struct cgroupfs_root *root, | 760 | static int rebind_subsystems(struct cgroupfs_root *root, |
749 | unsigned long final_bits) | 761 | unsigned long final_bits) |
750 | { | 762 | { |
@@ -1359,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1359 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | 1371 | * wake up rmdir() waiter. the rmdir should fail since the cgroup |
1360 | * is no longer empty. | 1372 | * is no longer empty. |
1361 | */ | 1373 | */ |
1362 | cgroup_wakeup_rmdir_waiters(cgrp); | 1374 | cgroup_wakeup_rmdir_waiter(cgrp); |
1363 | return 0; | 1375 | return 0; |
1364 | } | 1376 | } |
1365 | 1377 | ||
@@ -2744,33 +2756,42 @@ again: | |||
2744 | mutex_unlock(&cgroup_mutex); | 2756 | mutex_unlock(&cgroup_mutex); |
2745 | 2757 | ||
2746 | /* | 2758 | /* |
2759 | * In general, subsystem has no css->refcnt after pre_destroy(). But | ||
2760 | * in racy cases, subsystem may have to get css->refcnt after | ||
2761 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | ||
2762 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | ||
2763 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
2764 | * and subsystem's reference count handling. Please see css_get/put | ||
2765 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
2766 | */ | ||
2767 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2768 | |||
2769 | /* | ||
2747 | * Call pre_destroy handlers of subsys. Notify subsystems | 2770 | * Call pre_destroy handlers of subsys. Notify subsystems |
2748 | * that rmdir() request comes. | 2771 | * that rmdir() request comes. |
2749 | */ | 2772 | */ |
2750 | ret = cgroup_call_pre_destroy(cgrp); | 2773 | ret = cgroup_call_pre_destroy(cgrp); |
2751 | if (ret) | 2774 | if (ret) { |
2775 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2752 | return ret; | 2776 | return ret; |
2777 | } | ||
2753 | 2778 | ||
2754 | mutex_lock(&cgroup_mutex); | 2779 | mutex_lock(&cgroup_mutex); |
2755 | parent = cgrp->parent; | 2780 | parent = cgrp->parent; |
2756 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | 2781 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { |
2782 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2757 | mutex_unlock(&cgroup_mutex); | 2783 | mutex_unlock(&cgroup_mutex); |
2758 | return -EBUSY; | 2784 | return -EBUSY; |
2759 | } | 2785 | } |
2760 | /* | ||
2761 | * css_put/get is provided for subsys to grab refcnt to css. In typical | ||
2762 | * case, subsystem has no reference after pre_destroy(). But, under | ||
2763 | * hierarchy management, some *temporal* refcnt can be hold. | ||
2764 | * To avoid returning -EBUSY to a user, waitqueue is used. If subsys | ||
2765 | * is really busy, it should return -EBUSY at pre_destroy(). wake_up | ||
2766 | * is called when css_put() is called and refcnt goes down to 0. | ||
2767 | */ | ||
2768 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2769 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | 2786 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); |
2770 | |||
2771 | if (!cgroup_clear_css_refs(cgrp)) { | 2787 | if (!cgroup_clear_css_refs(cgrp)) { |
2772 | mutex_unlock(&cgroup_mutex); | 2788 | mutex_unlock(&cgroup_mutex); |
2773 | schedule(); | 2789 | /* |
2790 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
2791 | * prepare_to_wait(), we need to check this flag. | ||
2792 | */ | ||
2793 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
2794 | schedule(); | ||
2774 | finish_wait(&cgroup_rmdir_waitq, &wait); | 2795 | finish_wait(&cgroup_rmdir_waitq, &wait); |
2775 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 2796 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
2776 | if (signal_pending(current)) | 2797 | if (signal_pending(current)) |
@@ -3342,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css) | |||
3342 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 3363 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
3343 | check_for_release(cgrp); | 3364 | check_for_release(cgrp); |
3344 | } | 3365 | } |
3345 | cgroup_wakeup_rmdir_waiters(cgrp); | 3366 | cgroup_wakeup_rmdir_waiter(cgrp); |
3346 | } | 3367 | } |
3347 | rcu_read_unlock(); | 3368 | rcu_read_unlock(); |
3348 | } | 3369 | } |