aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2009-07-29 18:04:06 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-07-29 22:10:35 -0400
commit887032670d47366a8c8f25396ea7c14b7b2cc620 (patch)
treee5f9ece5ab9239648e8d7051ccb9a217d92553d7
parentf0d83679a8d471dc8b646919f70595d6fe8c9606 (diff)
cgroup avoid permanent sleep at rmdir
After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg) doesn't return -EBUSY by temporary ref counts. That commit expects all refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can wait permanently. This patch tries to fix that and change followings. - set CGRP_WAIT_ON_RMDIR flag before pre_destroy(). - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case. if there are sleeping ones, wakes them up. - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set. Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Reviewed-by: Paul Menage <menage@google.com> Acked-by: Balbir Sigh <balbir@linux.vnet.ibm.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/cgroup.h17
-rw-r--r--kernel/cgroup.c55
-rw-r--r--mm/memcontrol.c23
3 files changed, 75 insertions, 20 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 20411d2876f8..90bba9e62286 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -363,6 +363,23 @@ int cgroup_task_count(const struct cgroup *cgrp);
363int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 363int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
364 364
365/* 365/*
366 * When the subsys has to access css and may add permanent refcnt to css,
367 * it should take care of racy conditions with rmdir(). Following set of
368 * functions, is for stop/restart rmdir if necessary.
369 * Because these will call css_get/put, "css" should be alive css.
370 *
371 * cgroup_exclude_rmdir();
372 * ...do some jobs which may access arbitrary empty cgroup
373 * cgroup_release_and_wakeup_rmdir();
374 *
375 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
376 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
377 */
378
379void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
380void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
381
382/*
366 * Control Group subsystem type. 383 * Control Group subsystem type.
367 * See Documentation/cgroups/cgroups.txt for details 384 * See Documentation/cgroups/cgroups.txt for details
368 */ 385 */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 250dac05680f..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -735,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
735 * reference to css->refcnt. In general, this refcnt is expected to goes down 735 * reference to css->refcnt. In general, this refcnt is expected to goes down
736 * to zero, soon. 736 * to zero, soon.
737 * 737 *
738 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 738 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
739 */ 739 */
740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
741 741
742static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 742static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
743{ 743{
744 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 744 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
745 wake_up_all(&cgroup_rmdir_waitq); 745 wake_up_all(&cgroup_rmdir_waitq);
746} 746}
747 747
748void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
749{
750 css_get(css);
751}
752
753void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
754{
755 cgroup_wakeup_rmdir_waiter(css->cgroup);
756 css_put(css);
757}
758
759
748static int rebind_subsystems(struct cgroupfs_root *root, 760static int rebind_subsystems(struct cgroupfs_root *root,
749 unsigned long final_bits) 761 unsigned long final_bits)
750{ 762{
@@ -1359,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1359 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1371 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1360 * is no longer empty. 1372 * is no longer empty.
1361 */ 1373 */
1362 cgroup_wakeup_rmdir_waiters(cgrp); 1374 cgroup_wakeup_rmdir_waiter(cgrp);
1363 return 0; 1375 return 0;
1364} 1376}
1365 1377
@@ -2744,33 +2756,42 @@ again:
2744 mutex_unlock(&cgroup_mutex); 2756 mutex_unlock(&cgroup_mutex);
2745 2757
2746 /* 2758 /*
2759 * In general, subsystem has no css->refcnt after pre_destroy(). But
2760 * in racy cases, subsystem may have to get css->refcnt after
2761 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
2762 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
2763 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
2764 * and subsystem's reference count handling. Please see css_get/put
2765 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
2766 */
2767 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2768
2769 /*
2747 * Call pre_destroy handlers of subsys. Notify subsystems 2770 * Call pre_destroy handlers of subsys. Notify subsystems
2748 * that rmdir() request comes. 2771 * that rmdir() request comes.
2749 */ 2772 */
2750 ret = cgroup_call_pre_destroy(cgrp); 2773 ret = cgroup_call_pre_destroy(cgrp);
2751 if (ret) 2774 if (ret) {
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2752 return ret; 2776 return ret;
2777 }
2753 2778
2754 mutex_lock(&cgroup_mutex); 2779 mutex_lock(&cgroup_mutex);
2755 parent = cgrp->parent; 2780 parent = cgrp->parent;
2756 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2781 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2782 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2757 mutex_unlock(&cgroup_mutex); 2783 mutex_unlock(&cgroup_mutex);
2758 return -EBUSY; 2784 return -EBUSY;
2759 } 2785 }
2760 /*
2761 * css_put/get is provided for subsys to grab refcnt to css. In typical
2762 * case, subsystem has no reference after pre_destroy(). But, under
2763 * hierarchy management, some *temporal* refcnt can be hold.
2764 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2765 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2766 * is called when css_put() is called and refcnt goes down to 0.
2767 */
2768 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2769 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2786 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2770
2771 if (!cgroup_clear_css_refs(cgrp)) { 2787 if (!cgroup_clear_css_refs(cgrp)) {
2772 mutex_unlock(&cgroup_mutex); 2788 mutex_unlock(&cgroup_mutex);
2773 schedule(); 2789 /*
2790 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2791 * prepare_to_wait(), we need to check this flag.
2792 */
2793 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2794 schedule();
2774 finish_wait(&cgroup_rmdir_waitq, &wait); 2795 finish_wait(&cgroup_rmdir_waitq, &wait);
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2796 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2776 if (signal_pending(current)) 2797 if (signal_pending(current))
@@ -3342,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
3342 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3363 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3343 check_for_release(cgrp); 3364 check_for_release(cgrp);
3344 } 3365 }
3345 cgroup_wakeup_rmdir_waiters(cgrp); 3366 cgroup_wakeup_rmdir_waiter(cgrp);
3346 } 3367 }
3347 rcu_read_unlock(); 3368 rcu_read_unlock();
3348} 3369}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e717964cb5a0..fd4529d86de5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1207 ret = 0; 1207 ret = 0;
1208out: 1208out:
1209 unlock_page_cgroup(pc); 1209 unlock_page_cgroup(pc);
1210 /*
1211 * We charges against "to" which may not have any tasks. Then, "to"
1212 * can be under rmdir(). But in current implementation, caller of
1213 * this function is just force_empty() and it's garanteed that
1214 * "to" is never removed. So, we don't check rmdir status here.
1215 */
1210 return ret; 1216 return ret;
1211} 1217}
1212 1218
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1428 return; 1434 return;
1429 if (!ptr) 1435 if (!ptr)
1430 return; 1436 return;
1437 cgroup_exclude_rmdir(&ptr->css);
1431 pc = lookup_page_cgroup(page); 1438 pc = lookup_page_cgroup(page);
1432 mem_cgroup_lru_del_before_commit_swapcache(page); 1439 mem_cgroup_lru_del_before_commit_swapcache(page);
1433 __mem_cgroup_commit_charge(ptr, pc, ctype); 1440 __mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1457 } 1464 }
1458 rcu_read_unlock(); 1465 rcu_read_unlock();
1459 } 1466 }
1460 /* add this page(page_cgroup) to the LRU we want. */ 1467 /*
1461 1468 * At swapin, we may charge account against cgroup which has no tasks.
1469 * So, rmdir()->pre_destroy() can be called while we do this charge.
1470 * In that case, we need to call pre_destroy() again. check it here.
1471 */
1472 cgroup_release_and_wakeup_rmdir(&ptr->css);
1462} 1473}
1463 1474
1464void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1475void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1664 1675
1665 if (!mem) 1676 if (!mem)
1666 return; 1677 return;
1667 1678 cgroup_exclude_rmdir(&mem->css);
1668 /* at migration success, oldpage->mapping is NULL. */ 1679 /* at migration success, oldpage->mapping is NULL. */
1669 if (oldpage->mapping) { 1680 if (oldpage->mapping) {
1670 target = oldpage; 1681 target = oldpage;
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1704 */ 1715 */
1705 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 1716 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1706 mem_cgroup_uncharge_page(target); 1717 mem_cgroup_uncharge_page(target);
1718 /*
1719 * At migration, we may charge account against cgroup which has no tasks
1720 * So, rmdir()->pre_destroy() can be called while we do this charge.
1721 * In that case, we need to call pre_destroy() again. check it here.
1722 */
1723 cgroup_release_and_wakeup_rmdir(&mem->css);
1707} 1724}
1708 1725
1709/* 1726/*