diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-07-29 18:04:06 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-07-29 22:10:35 -0400 |
commit | 887032670d47366a8c8f25396ea7c14b7b2cc620 (patch) | |
tree | e5f9ece5ab9239648e8d7051ccb9a217d92553d7 | |
parent | f0d83679a8d471dc8b646919f70595d6fe8c9606 (diff) |
cgroup avoid permanent sleep at rmdir
After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix
frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg)
doesn't return -EBUSY by temporary ref counts. That commit expects all
refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can
wait permanently. This patch tries to fix that and change followings.
- set CGRP_WAIT_ON_RMDIR flag before pre_destroy().
- clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case.
if there are sleeping ones, wakes them up.
- rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set.
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by: Paul Menage <menage@google.com>
Acked-by: Balbir Sigh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/cgroup.h | 17 | ||||
-rw-r--r-- | kernel/cgroup.c | 55 | ||||
-rw-r--r-- | mm/memcontrol.c | 23 |
3 files changed, 75 insertions, 20 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 20411d2876f8..90bba9e62286 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -363,6 +363,23 @@ int cgroup_task_count(const struct cgroup *cgrp); | |||
363 | int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); | 363 | int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); |
364 | 364 | ||
365 | /* | 365 | /* |
366 | * When the subsys has to access css and may add permanent refcnt to css, | ||
367 | * it should take care of racy conditions with rmdir(). Following set of | ||
368 | * functions, is for stop/restart rmdir if necessary. | ||
369 | * Because these will call css_get/put, "css" should be alive css. | ||
370 | * | ||
371 | * cgroup_exclude_rmdir(); | ||
372 | * ...do some jobs which may access arbitrary empty cgroup | ||
373 | * cgroup_release_and_wakeup_rmdir(); | ||
374 | * | ||
375 | * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, | ||
376 | * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. | ||
377 | */ | ||
378 | |||
379 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); | ||
380 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); | ||
381 | |||
382 | /* | ||
366 | * Control Group subsystem type. | 383 | * Control Group subsystem type. |
367 | * See Documentation/cgroups/cgroups.txt for details | 384 | * See Documentation/cgroups/cgroups.txt for details |
368 | */ | 385 | */ |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250dac05680f..b6eadfe30e7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -735,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
735 | * reference to css->refcnt. In general, this refcnt is expected to goes down | 735 | * reference to css->refcnt. In general, this refcnt is expected to goes down |
736 | * to zero, soon. | 736 | * to zero, soon. |
737 | * | 737 | * |
738 | * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; | 738 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; |
739 | */ | 739 | */ |
740 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | 740 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); |
741 | 741 | ||
742 | static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) | 742 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) |
743 | { | 743 | { |
744 | if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | 744 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) |
745 | wake_up_all(&cgroup_rmdir_waitq); | 745 | wake_up_all(&cgroup_rmdir_waitq); |
746 | } | 746 | } |
747 | 747 | ||
748 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
749 | { | ||
750 | css_get(css); | ||
751 | } | ||
752 | |||
753 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
754 | { | ||
755 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
756 | css_put(css); | ||
757 | } | ||
758 | |||
759 | |||
748 | static int rebind_subsystems(struct cgroupfs_root *root, | 760 | static int rebind_subsystems(struct cgroupfs_root *root, |
749 | unsigned long final_bits) | 761 | unsigned long final_bits) |
750 | { | 762 | { |
@@ -1359,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1359 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | 1371 | * wake up rmdir() waiter. the rmdir should fail since the cgroup |
1360 | * is no longer empty. | 1372 | * is no longer empty. |
1361 | */ | 1373 | */ |
1362 | cgroup_wakeup_rmdir_waiters(cgrp); | 1374 | cgroup_wakeup_rmdir_waiter(cgrp); |
1363 | return 0; | 1375 | return 0; |
1364 | } | 1376 | } |
1365 | 1377 | ||
@@ -2744,33 +2756,42 @@ again: | |||
2744 | mutex_unlock(&cgroup_mutex); | 2756 | mutex_unlock(&cgroup_mutex); |
2745 | 2757 | ||
2746 | /* | 2758 | /* |
2759 | * In general, subsystem has no css->refcnt after pre_destroy(). But | ||
2760 | * in racy cases, subsystem may have to get css->refcnt after | ||
2761 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | ||
2762 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | ||
2763 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
2764 | * and subsystem's reference count handling. Please see css_get/put | ||
2765 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
2766 | */ | ||
2767 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2768 | |||
2769 | /* | ||
2747 | * Call pre_destroy handlers of subsys. Notify subsystems | 2770 | * Call pre_destroy handlers of subsys. Notify subsystems |
2748 | * that rmdir() request comes. | 2771 | * that rmdir() request comes. |
2749 | */ | 2772 | */ |
2750 | ret = cgroup_call_pre_destroy(cgrp); | 2773 | ret = cgroup_call_pre_destroy(cgrp); |
2751 | if (ret) | 2774 | if (ret) { |
2775 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2752 | return ret; | 2776 | return ret; |
2777 | } | ||
2753 | 2778 | ||
2754 | mutex_lock(&cgroup_mutex); | 2779 | mutex_lock(&cgroup_mutex); |
2755 | parent = cgrp->parent; | 2780 | parent = cgrp->parent; |
2756 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | 2781 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { |
2782 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2757 | mutex_unlock(&cgroup_mutex); | 2783 | mutex_unlock(&cgroup_mutex); |
2758 | return -EBUSY; | 2784 | return -EBUSY; |
2759 | } | 2785 | } |
2760 | /* | ||
2761 | * css_put/get is provided for subsys to grab refcnt to css. In typical | ||
2762 | * case, subsystem has no reference after pre_destroy(). But, under | ||
2763 | * hierarchy management, some *temporal* refcnt can be hold. | ||
2764 | * To avoid returning -EBUSY to a user, waitqueue is used. If subsys | ||
2765 | * is really busy, it should return -EBUSY at pre_destroy(). wake_up | ||
2766 | * is called when css_put() is called and refcnt goes down to 0. | ||
2767 | */ | ||
2768 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
2769 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | 2786 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); |
2770 | |||
2771 | if (!cgroup_clear_css_refs(cgrp)) { | 2787 | if (!cgroup_clear_css_refs(cgrp)) { |
2772 | mutex_unlock(&cgroup_mutex); | 2788 | mutex_unlock(&cgroup_mutex); |
2773 | schedule(); | 2789 | /* |
2790 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
2791 | * prepare_to_wait(), we need to check this flag. | ||
2792 | */ | ||
2793 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
2794 | schedule(); | ||
2774 | finish_wait(&cgroup_rmdir_waitq, &wait); | 2795 | finish_wait(&cgroup_rmdir_waitq, &wait); |
2775 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 2796 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
2776 | if (signal_pending(current)) | 2797 | if (signal_pending(current)) |
@@ -3342,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css) | |||
3342 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 3363 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
3343 | check_for_release(cgrp); | 3364 | check_for_release(cgrp); |
3344 | } | 3365 | } |
3345 | cgroup_wakeup_rmdir_waiters(cgrp); | 3366 | cgroup_wakeup_rmdir_waiter(cgrp); |
3346 | } | 3367 | } |
3347 | rcu_read_unlock(); | 3368 | rcu_read_unlock(); |
3348 | } | 3369 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e717964cb5a0..fd4529d86de5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1207 | ret = 0; | 1207 | ret = 0; |
1208 | out: | 1208 | out: |
1209 | unlock_page_cgroup(pc); | 1209 | unlock_page_cgroup(pc); |
1210 | /* | ||
1211 | * We charges against "to" which may not have any tasks. Then, "to" | ||
1212 | * can be under rmdir(). But in current implementation, caller of | ||
1213 | * this function is just force_empty() and it's garanteed that | ||
1214 | * "to" is never removed. So, we don't check rmdir status here. | ||
1215 | */ | ||
1210 | return ret; | 1216 | return ret; |
1211 | } | 1217 | } |
1212 | 1218 | ||
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
1428 | return; | 1434 | return; |
1429 | if (!ptr) | 1435 | if (!ptr) |
1430 | return; | 1436 | return; |
1437 | cgroup_exclude_rmdir(&ptr->css); | ||
1431 | pc = lookup_page_cgroup(page); | 1438 | pc = lookup_page_cgroup(page); |
1432 | mem_cgroup_lru_del_before_commit_swapcache(page); | 1439 | mem_cgroup_lru_del_before_commit_swapcache(page); |
1433 | __mem_cgroup_commit_charge(ptr, pc, ctype); | 1440 | __mem_cgroup_commit_charge(ptr, pc, ctype); |
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
1457 | } | 1464 | } |
1458 | rcu_read_unlock(); | 1465 | rcu_read_unlock(); |
1459 | } | 1466 | } |
1460 | /* add this page(page_cgroup) to the LRU we want. */ | 1467 | /* |
1461 | 1468 | * At swapin, we may charge account against cgroup which has no tasks. | |
1469 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
1470 | * In that case, we need to call pre_destroy() again. check it here. | ||
1471 | */ | ||
1472 | cgroup_release_and_wakeup_rmdir(&ptr->css); | ||
1462 | } | 1473 | } |
1463 | 1474 | ||
1464 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | 1475 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) |
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
1664 | 1675 | ||
1665 | if (!mem) | 1676 | if (!mem) |
1666 | return; | 1677 | return; |
1667 | 1678 | cgroup_exclude_rmdir(&mem->css); | |
1668 | /* at migration success, oldpage->mapping is NULL. */ | 1679 | /* at migration success, oldpage->mapping is NULL. */ |
1669 | if (oldpage->mapping) { | 1680 | if (oldpage->mapping) { |
1670 | target = oldpage; | 1681 | target = oldpage; |
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
1704 | */ | 1715 | */ |
1705 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 1716 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
1706 | mem_cgroup_uncharge_page(target); | 1717 | mem_cgroup_uncharge_page(target); |
1718 | /* | ||
1719 | * At migration, we may charge account against cgroup which has no tasks | ||
1720 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
1721 | * In that case, we need to call pre_destroy() again. check it here. | ||
1722 | */ | ||
1723 | cgroup_release_and_wakeup_rmdir(&mem->css); | ||
1707 | } | 1724 | } |
1708 | 1725 | ||
1709 | /* | 1726 | /* |