aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Tejun Heo <tj@kernel.org> 2012-04-01 15:09:56 -0400
committer: Tejun Heo <tj@kernel.org> 2012-04-01 15:09:56 -0400
commit: 48ddbe194623ae089cc0576e60363f2d2e85662a (patch)
tree: bf9f9fc29e28b6440c64727f5e0a57a9ccd8ec5d
parent: 28b4c27b8e6bb6d7ff2875281a8484f8898a87ef (diff)
cgroup: make css->refcnt clearing on cgroup removal optional
Currently, cgroup removal tries to drain all css references. If there are active css references, the removal logic waits and retries ->pre_destroy() until either all refs drop to zero or removal is cancelled. These semantics are unusual, add non-trivial complexity to cgroup core and IMHO are fundamentally misguided in that they couple internal implementation details (references to internal data structures) with an externally visible operation (rmdir). To userland, this is a behavior peculiarity which is unnecessary and difficult to anticipate (css refs are otherwise invisible from userland), and, to policy implementations, this is an unnecessary restriction (e.g. blkcg wants to hold css refs for caching purposes but can't as that becomes visible as an rmdir hang). Unfortunately, memcg currently depends on ->pre_destroy() retries and cgroup removal vetoing and can't be immediately switched to the new behavior. This patch introduces the new behavior of not waiting for css refs to drain and maintains the old behavior for subsystems which have __DEPRECATED_clear_css_refs set. Once memcg is updated, we can drop the code paths for the old behavior as proposed in the following patch. Note that the following patch is incorrect in that the dput work item is in cgroup and may lose some of the dputs when multiple css's are released back-to-back, and __css_put() triggers check_for_release() when refcnt reaches 0 instead of 1; however, it shows what part can be removed. http://thread.gmane.org/gmane.linux.kernel.containers/22559/focus=75251 Note that, in the not-too-distant future, cgroup core will start emitting warning messages for subsystems which require the old behavior, so please get moving. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizf@cn.fujitsu.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Balbir Singh <bsingharora@gmail.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
-rw-r--r--include/linux/cgroup.h17
-rw-r--r--kernel/cgroup.c71
-rw-r--r--mm/memcontrol.c1
3 files changed, 80 insertions, 9 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index be81fafae11f..565c8034e6c8 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -16,6 +16,7 @@
16#include <linux/prio_heap.h> 16#include <linux/prio_heap.h>
17#include <linux/rwsem.h> 17#include <linux/rwsem.h>
18#include <linux/idr.h> 18#include <linux/idr.h>
19#include <linux/workqueue.h>
19 20
20#ifdef CONFIG_CGROUPS 21#ifdef CONFIG_CGROUPS
21 22
@@ -76,12 +77,16 @@ struct cgroup_subsys_state {
76 unsigned long flags; 77 unsigned long flags;
77 /* ID for this css, if possible */ 78 /* ID for this css, if possible */
78 struct css_id __rcu *id; 79 struct css_id __rcu *id;
80
81 /* Used to put @cgroup->dentry on the last css_put() */
82 struct work_struct dput_work;
79}; 83};
80 84
81/* bits in struct cgroup_subsys_state flags field */ 85/* bits in struct cgroup_subsys_state flags field */
82enum { 86enum {
83 CSS_ROOT, /* This CSS is the root of the subsystem */ 87 CSS_ROOT, /* This CSS is the root of the subsystem */
84 CSS_REMOVED, /* This CSS is dead */ 88 CSS_REMOVED, /* This CSS is dead */
89 CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */
85}; 90};
86 91
87/* Caller must verify that the css is not for root cgroup */ 92/* Caller must verify that the css is not for root cgroup */
@@ -480,6 +485,18 @@ struct cgroup_subsys {
480 * (not available in early_init time.) 485 * (not available in early_init time.)
481 */ 486 */
482 bool use_id; 487 bool use_id;
488
489 /*
490 * If %true, cgroup removal will try to clear css refs by retrying
491 * ss->pre_destroy() until there's no css ref left. This behavior
492 * is strictly for backward compatibility and will be removed as
493 * soon as the current user (memcg) is updated.
494 *
495 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
496 * wait for css refs to drop to zero before proceeding.
497 */
498 bool __DEPRECATED_clear_css_refs;
499
483#define MAX_CGROUP_TYPE_NAMELEN 32 500#define MAX_CGROUP_TYPE_NAMELEN 32
484 const char *name; 501 const char *name;
485 502
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2eade5186604..2905977e0f33 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -854,12 +854,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
854 struct cgroup_subsys *ss; 854 struct cgroup_subsys *ss;
855 int ret = 0; 855 int ret = 0;
856 856
857 for_each_subsys(cgrp->root, ss) 857 for_each_subsys(cgrp->root, ss) {
858 if (ss->pre_destroy) { 858 if (!ss->pre_destroy)
859 ret = ss->pre_destroy(cgrp); 859 continue;
860 if (ret) 860
861 break; 861 ret = ss->pre_destroy(cgrp);
862 if (ret) {
863 /* ->pre_destroy() failure is being deprecated */
864 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
865 break;
862 } 866 }
867 }
863 868
864 return ret; 869 return ret;
865} 870}
@@ -3859,6 +3864,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3859 return 0; 3864 return 0;
3860} 3865}
3861 3866
3867static void css_dput_fn(struct work_struct *work)
3868{
3869 struct cgroup_subsys_state *css =
3870 container_of(work, struct cgroup_subsys_state, dput_work);
3871
3872 dput(css->cgroup->dentry);
3873}
3874
3862static void init_cgroup_css(struct cgroup_subsys_state *css, 3875static void init_cgroup_css(struct cgroup_subsys_state *css,
3863 struct cgroup_subsys *ss, 3876 struct cgroup_subsys *ss,
3864 struct cgroup *cgrp) 3877 struct cgroup *cgrp)
@@ -3871,6 +3884,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3871 set_bit(CSS_ROOT, &css->flags); 3884 set_bit(CSS_ROOT, &css->flags);
3872 BUG_ON(cgrp->subsys[ss->subsys_id]); 3885 BUG_ON(cgrp->subsys[ss->subsys_id]);
3873 cgrp->subsys[ss->subsys_id] = css; 3886 cgrp->subsys[ss->subsys_id] = css;
3887
3888 /*
3889 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
3890 * which is put on the last css_put(). dput() requires process
3891 * context, which css_put() may be called without. @css->dput_work
3892 * will be used to invoke dput() asynchronously from css_put().
3893 */
3894 INIT_WORK(&css->dput_work, css_dput_fn);
3895 if (ss->__DEPRECATED_clear_css_refs)
3896 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3874} 3897}
3875 3898
3876static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3899static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3973,6 +3996,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3973 if (err < 0) 3996 if (err < 0)
3974 goto err_remove; 3997 goto err_remove;
3975 3998
3999 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4000 for_each_subsys(root, ss)
4001 if (!ss->__DEPRECATED_clear_css_refs)
4002 dget(dentry);
4003
3976 /* The cgroup directory was pre-locked for us */ 4004 /* The cgroup directory was pre-locked for us */
3977 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4005 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3978 4006
@@ -4062,8 +4090,24 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4062 * Atomically mark all (or else none) of the cgroup's CSS objects as 4090 * Atomically mark all (or else none) of the cgroup's CSS objects as
4063 * CSS_REMOVED. Return true on success, or false if the cgroup has 4091 * CSS_REMOVED. Return true on success, or false if the cgroup has
4064 * busy subsystems. Call with cgroup_mutex held 4092 * busy subsystems. Call with cgroup_mutex held
4093 *
4094 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4095 * not, cgroup removal behaves differently.
4096 *
4097 * If clear is set, css refcnt for the subsystem should be zero before
4098 * cgroup removal can be committed. This is implemented by
4099 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4100 * called multiple times until all css refcnts reach zero and is allowed to
4101 * veto removal on any invocation. This behavior is deprecated and will be
4102 * removed as soon as the existing user (memcg) is updated.
4103 *
4104 * If clear is not set, each css holds an extra reference to the cgroup's
4105 * dentry and cgroup removal proceeds regardless of css refs.
4106 * ->pre_destroy() will be called at least once and is not allowed to fail.
4107 * On the last put of each css, whenever that may be, the extra dentry ref
4108 * is put so that dentry destruction happens only after all css's are
4109 * released.
4065 */ 4110 */
4066
4067static int cgroup_clear_css_refs(struct cgroup *cgrp) 4111static int cgroup_clear_css_refs(struct cgroup *cgrp)
4068{ 4112{
4069 struct cgroup_subsys *ss; 4113 struct cgroup_subsys *ss;
@@ -4074,14 +4118,17 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
4074 4118
4075 /* 4119 /*
4076 * Block new css_tryget() by deactivating refcnt. If all refcnts 4120 * Block new css_tryget() by deactivating refcnt. If all refcnts
4077 * were 1 at the moment of deactivation, we succeeded. 4121 * for subsystems w/ clear_css_refs set were 1 at the moment of
4122 * deactivation, we succeeded.
4078 */ 4123 */
4079 for_each_subsys(cgrp->root, ss) { 4124 for_each_subsys(cgrp->root, ss) {
4080 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4125 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4081 4126
4082 WARN_ON(atomic_read(&css->refcnt) < 0); 4127 WARN_ON(atomic_read(&css->refcnt) < 0);
4083 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4128 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4084 failed |= css_refcnt(css) != 1; 4129
4130 if (ss->__DEPRECATED_clear_css_refs)
4131 failed |= css_refcnt(css) != 1;
4085 } 4132 }
4086 4133
4087 /* 4134 /*
@@ -4917,12 +4964,18 @@ void __css_put(struct cgroup_subsys_state *css)
4917 4964
4918 rcu_read_lock(); 4965 rcu_read_lock();
4919 atomic_dec(&css->refcnt); 4966 atomic_dec(&css->refcnt);
4920 if (css_refcnt(css) == 1) { 4967 switch (css_refcnt(css)) {
4968 case 1:
4921 if (notify_on_release(cgrp)) { 4969 if (notify_on_release(cgrp)) {
4922 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4970 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4923 check_for_release(cgrp); 4971 check_for_release(cgrp);
4924 } 4972 }
4925 cgroup_wakeup_rmdir_waiter(cgrp); 4973 cgroup_wakeup_rmdir_waiter(cgrp);
4974 break;
4975 case 0:
4976 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
4977 schedule_work(&css->dput_work);
4978 break;
4926 } 4979 }
4927 rcu_read_unlock(); 4980 rcu_read_unlock();
4928} 4981}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bef114258bbd..d28359cd6b55 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5635,6 +5635,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
5635 .base_cftypes = mem_cgroup_files, 5635 .base_cftypes = mem_cgroup_files,
5636 .early_init = 0, 5636 .early_init = 0,
5637 .use_id = 1, 5637 .use_id = 1,
5638 .__DEPRECATED_clear_css_refs = true,
5638}; 5639};
5639 5640
5640#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5641#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP