aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- kernel/cgroup.c 256
1 files changed, 61 insertions, 195 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b7a0171067ea..e3045ad4267a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -171,8 +171,8 @@ struct css_id {
171 * The css to which this ID points. This pointer is set to valid value 171 * The css to which this ID points. This pointer is set to valid value
172 * after cgroup is populated. If cgroup is removed, this will be NULL. 172 * after cgroup is populated. If cgroup is removed, this will be NULL.
173 * This pointer is expected to be RCU-safe because destroy() 173 * This pointer is expected to be RCU-safe because destroy()
174 * is called after synchronize_rcu(). But for safe use, css_is_removed() 174 * is called after synchronize_rcu(). But for safe use, css_tryget()
175 * css_tryget() should be used for avoiding race. 175 * should be used for avoiding race.
176 */ 176 */
177 struct cgroup_subsys_state __rcu *css; 177 struct cgroup_subsys_state __rcu *css;
178 /* 178 /*
@@ -854,30 +854,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
854 return inode; 854 return inode;
855} 855}
856 856
857/*
858 * Call subsys's pre_destroy handler.
859 * This is called before css refcnt check.
860 */
861static int cgroup_call_pre_destroy(struct cgroup *cgrp)
862{
863 struct cgroup_subsys *ss;
864 int ret = 0;
865
866 for_each_subsys(cgrp->root, ss) {
867 if (!ss->pre_destroy)
868 continue;
869
870 ret = ss->pre_destroy(cgrp);
871 if (ret) {
872 /* ->pre_destroy() failure is being deprecated */
873 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
874 break;
875 }
876 }
877
878 return ret;
879}
880
881static void cgroup_diput(struct dentry *dentry, struct inode *inode) 857static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882{ 858{
883 /* is dentry a directory ? if so, kfree() associated cgroup */ 859 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -1015,33 +991,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
1015} 991}
1016 992
1017/* 993/*
1018 * A queue for waiters to do rmdir() cgroup. A task will sleep when
1019 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
1020 * reference to css->refcnt. In general, this refcnt is expected to go down
1021 * to zero, soon.
1022 *
1023 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
1024 */
1025static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1026
1027static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1028{
1029 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1030 wake_up_all(&cgroup_rmdir_waitq);
1031}
1032
1033void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1034{
1035 css_get(css);
1036}
1037
1038void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1039{
1040 cgroup_wakeup_rmdir_waiter(css->cgroup);
1041 css_put(css);
1042}
1043
1044/*
1045 * Call with cgroup_mutex held. Drops reference counts on modules, including 994 * Call with cgroup_mutex held. Drops reference counts on modules, including
1046 * any duplicate ones that parse_cgroupfs_options took. If this function 995 * any duplicate ones that parse_cgroupfs_options took. If this function
1047 * returns an error, no reference counts are touched. 996 * returns an error, no reference counts are touched.
@@ -2026,12 +1975,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
2026 } 1975 }
2027 1976
2028 synchronize_rcu(); 1977 synchronize_rcu();
2029
2030 /*
2031 * wake up rmdir() waiter. the rmdir should fail since the cgroup
2032 * is no longer empty.
2033 */
2034 cgroup_wakeup_rmdir_waiter(cgrp);
2035out: 1978out:
2036 if (retval) { 1979 if (retval) {
2037 for_each_subsys(root, ss) { 1980 for_each_subsys(root, ss) {
@@ -2201,7 +2144,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2201 * step 5: success! and cleanup 2144 * step 5: success! and cleanup
2202 */ 2145 */
2203 synchronize_rcu(); 2146 synchronize_rcu();
2204 cgroup_wakeup_rmdir_waiter(cgrp);
2205 retval = 0; 2147 retval = 0;
2206out_put_css_set_refs: 2148out_put_css_set_refs:
2207 if (retval) { 2149 if (retval) {
@@ -4023,14 +3965,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4023 cgrp->subsys[ss->subsys_id] = css; 3965 cgrp->subsys[ss->subsys_id] = css;
4024 3966
4025 /* 3967 /*
4026 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry 3968 * css holds an extra ref to @cgrp->dentry which is put on the last
4027 * which is put on the last css_put(). dput() requires process 3969 * css_put(). dput() requires process context, which css_put() may
4028 * context, which css_put() may be called without. @css->dput_work 3970 * be called without. @css->dput_work will be used to invoke
4029 * will be used to invoke dput() asynchronously from css_put(). 3971 * dput() asynchronously from css_put().
4030 */ 3972 */
4031 INIT_WORK(&css->dput_work, css_dput_fn); 3973 INIT_WORK(&css->dput_work, css_dput_fn);
4032 if (ss->__DEPRECATED_clear_css_refs)
4033 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
4034} 3974}
4035 3975
4036/* 3976/*
@@ -4054,6 +3994,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4054 if (!cgrp) 3994 if (!cgrp)
4055 return -ENOMEM; 3995 return -ENOMEM;
4056 3996
3997 /*
3998 * Only live parents can have children. Note that the liveliness
3999 * check isn't strictly necessary because cgroup_mkdir() and
4000 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4001 * anyway so that locking is contained inside cgroup proper and we
4002 * don't get nasty surprises if we ever grow another caller.
4003 */
4004 if (!cgroup_lock_live_group(parent)) {
4005 err = -ENODEV;
4006 goto err_free;
4007 }
4008
4057 /* Grab a reference on the superblock so the hierarchy doesn't 4009 /* Grab a reference on the superblock so the hierarchy doesn't
4058 * get deleted on unmount if there are child cgroups. This 4010 * get deleted on unmount if there are child cgroups. This
4059 * can be done outside cgroup_mutex, since the sb can't 4011 * can be done outside cgroup_mutex, since the sb can't
@@ -4061,8 +4013,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4061 * fs */ 4013 * fs */
4062 atomic_inc(&sb->s_active); 4014 atomic_inc(&sb->s_active);
4063 4015
4064 mutex_lock(&cgroup_mutex);
4065
4066 init_cgroup_housekeeping(cgrp); 4016 init_cgroup_housekeeping(cgrp);
4067 4017
4068 cgrp->parent = parent; 4018 cgrp->parent = parent;
@@ -4110,10 +4060,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4110 if (err < 0) 4060 if (err < 0)
4111 goto err_remove; 4061 goto err_remove;
4112 4062
4113 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ 4063 /* each css holds a ref to the cgroup's dentry */
4114 for_each_subsys(root, ss) 4064 for_each_subsys(root, ss)
4115 if (!ss->__DEPRECATED_clear_css_refs) 4065 dget(dentry);
4116 dget(dentry);
4117 4066
4118 /* The cgroup directory was pre-locked for us */ 4067 /* The cgroup directory was pre-locked for us */
4119 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4068 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
@@ -4144,7 +4093,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4144 4093
4145 /* Release the reference count that we took on the superblock */ 4094 /* Release the reference count that we took on the superblock */
4146 deactivate_super(sb); 4095 deactivate_super(sb);
4147 4096err_free:
4148 kfree(cgrp); 4097 kfree(cgrp);
4149 return err; 4098 return err;
4150} 4099}
@@ -4198,71 +4147,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4198 return 0; 4147 return 0;
4199} 4148}
4200 4149
4201/*
4202 * Atomically mark all (or else none) of the cgroup's CSS objects as
4203 * CSS_REMOVED. Return true on success, or false if the cgroup has
4204 * busy subsystems. Call with cgroup_mutex held
4205 *
4206 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4207 * not, cgroup removal behaves differently.
4208 *
4209 * If clear is set, css refcnt for the subsystem should be zero before
4210 * cgroup removal can be committed. This is implemented by
4211 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4212 * called multiple times until all css refcnts reach zero and is allowed to
4213 * veto removal on any invocation. This behavior is deprecated and will be
4214 * removed as soon as the existing user (memcg) is updated.
4215 *
4216 * If clear is not set, each css holds an extra reference to the cgroup's
4217 * dentry and cgroup removal proceeds regardless of css refs.
4218 * ->pre_destroy() will be called at least once and is not allowed to fail.
4219 * On the last put of each css, whenever that may be, the extra dentry ref
4220 * is put so that dentry destruction happens only after all css's are
4221 * released.
4222 */
4223static int cgroup_clear_css_refs(struct cgroup *cgrp)
4224{
4225 struct cgroup_subsys *ss;
4226 unsigned long flags;
4227 bool failed = false;
4228
4229 local_irq_save(flags);
4230
4231 /*
4232 * Block new css_tryget() by deactivating refcnt. If all refcnts
4233 * for subsystems w/ clear_css_refs set were 1 at the moment of
4234 * deactivation, we succeeded.
4235 */
4236 for_each_subsys(cgrp->root, ss) {
4237 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4238
4239 WARN_ON(atomic_read(&css->refcnt) < 0);
4240 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4241
4242 if (ss->__DEPRECATED_clear_css_refs)
4243 failed |= css_refcnt(css) != 1;
4244 }
4245
4246 /*
4247 * If succeeded, set REMOVED and put all the base refs; otherwise,
4248 * restore refcnts to positive values. Either way, all in-progress
4249 * css_tryget() will be released.
4250 */
4251 for_each_subsys(cgrp->root, ss) {
4252 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4253
4254 if (!failed) {
4255 set_bit(CSS_REMOVED, &css->flags);
4256 css_put(css);
4257 } else {
4258 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4259 }
4260 }
4261
4262 local_irq_restore(flags);
4263 return !failed;
4264}
4265
4266static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4150static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4267{ 4151{
4268 struct cgroup *cgrp = dentry->d_fsdata; 4152 struct cgroup *cgrp = dentry->d_fsdata;
@@ -4270,70 +4154,52 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4270 struct cgroup *parent; 4154 struct cgroup *parent;
4271 DEFINE_WAIT(wait); 4155 DEFINE_WAIT(wait);
4272 struct cgroup_event *event, *tmp; 4156 struct cgroup_event *event, *tmp;
4273 int ret; 4157 struct cgroup_subsys *ss;
4274 4158
4275 /* the vfs holds both inode->i_mutex already */ 4159 /* the vfs holds both inode->i_mutex already */
4276again:
4277 mutex_lock(&cgroup_mutex); 4160 mutex_lock(&cgroup_mutex);
4278 if (atomic_read(&cgrp->count) != 0) { 4161 parent = cgrp->parent;
4279 mutex_unlock(&cgroup_mutex); 4162 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4280 return -EBUSY;
4281 }
4282 if (!list_empty(&cgrp->children)) {
4283 mutex_unlock(&cgroup_mutex); 4163 mutex_unlock(&cgroup_mutex);
4284 return -EBUSY; 4164 return -EBUSY;
4285 } 4165 }
4286 mutex_unlock(&cgroup_mutex);
4287 4166
4288 /* 4167 /*
4289 * In general, subsystem has no css->refcnt after pre_destroy(). But 4168 * Block new css_tryget() by deactivating refcnt and mark @cgrp
4290 * in racy cases, subsystem may have to get css->refcnt after 4169 * removed. This makes future css_tryget() and child creation
4291 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes 4170 * attempts fail thus maintaining the removal conditions verified
4292 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue 4171 * above.
4293 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4294 * and subsystem's reference count handling. Please see css_get/put
4295 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4296 */ 4172 */
4297 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4173 for_each_subsys(cgrp->root, ss) {
4174 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4298 4175
4299 /* 4176 WARN_ON(atomic_read(&css->refcnt) < 0);
4300 * Call pre_destroy handlers of subsys. Notify subsystems 4177 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4301 * that rmdir() request comes.
4302 */
4303 ret = cgroup_call_pre_destroy(cgrp);
4304 if (ret) {
4305 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4306 return ret;
4307 } 4178 }
4179 set_bit(CGRP_REMOVED, &cgrp->flags);
4308 4180
4181 /*
4182 * Tell subsystems to initiate destruction. pre_destroy() should be
4183 * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix
4184 * potential deadlock in pre_destroy") for details.
4185 */
4186 mutex_unlock(&cgroup_mutex);
4187 for_each_subsys(cgrp->root, ss)
4188 if (ss->pre_destroy)
4189 ss->pre_destroy(cgrp);
4309 mutex_lock(&cgroup_mutex); 4190 mutex_lock(&cgroup_mutex);
4310 parent = cgrp->parent; 4191
4311 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 4192 /*
4312 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4193 * Put all the base refs. Each css holds an extra reference to the
4313 mutex_unlock(&cgroup_mutex); 4194 * cgroup's dentry and cgroup removal proceeds regardless of css
4314 return -EBUSY; 4195 * refs. On the last put of each css, whenever that may be, the
4315 } 4196 * extra dentry ref is put so that dentry destruction happens only
4316 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 4197 * after all css's are released.
4317 if (!cgroup_clear_css_refs(cgrp)) { 4198 */
4318 mutex_unlock(&cgroup_mutex); 4199 for_each_subsys(cgrp->root, ss)
4319 /* 4200 css_put(cgrp->subsys[ss->subsys_id]);
4320 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4321 * prepare_to_wait(), we need to check this flag.
4322 */
4323 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4324 schedule();
4325 finish_wait(&cgroup_rmdir_waitq, &wait);
4326 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4327 if (signal_pending(current))
4328 return -EINTR;
4329 goto again;
4330 }
4331 /* No css_tryget() can succeed after here. */
4332 finish_wait(&cgroup_rmdir_waitq, &wait);
4333 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4334 4201
4335 raw_spin_lock(&release_list_lock); 4202 raw_spin_lock(&release_list_lock);
4336 set_bit(CGRP_REMOVED, &cgrp->flags);
4337 if (!list_empty(&cgrp->release_list)) 4203 if (!list_empty(&cgrp->release_list))
4338 list_del_init(&cgrp->release_list); 4204 list_del_init(&cgrp->release_list);
4339 raw_spin_unlock(&release_list_lock); 4205 raw_spin_unlock(&release_list_lock);
@@ -5041,15 +4907,17 @@ static void check_for_release(struct cgroup *cgrp)
5041/* Caller must verify that the css is not for root cgroup */ 4907/* Caller must verify that the css is not for root cgroup */
5042bool __css_tryget(struct cgroup_subsys_state *css) 4908bool __css_tryget(struct cgroup_subsys_state *css)
5043{ 4909{
5044 do { 4910 while (true) {
5045 int v = css_refcnt(css); 4911 int t, v;
5046 4912
5047 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 4913 v = css_refcnt(css);
4914 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
4915 if (likely(t == v))
5048 return true; 4916 return true;
4917 else if (t < 0)
4918 return false;
5049 cpu_relax(); 4919 cpu_relax();
5050 } while (!test_bit(CSS_REMOVED, &css->flags)); 4920 }
5051
5052 return false;
5053} 4921}
5054EXPORT_SYMBOL_GPL(__css_tryget); 4922EXPORT_SYMBOL_GPL(__css_tryget);
5055 4923
@@ -5068,11 +4936,9 @@ void __css_put(struct cgroup_subsys_state *css)
5068 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4936 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5069 check_for_release(cgrp); 4937 check_for_release(cgrp);
5070 } 4938 }
5071 cgroup_wakeup_rmdir_waiter(cgrp);
5072 break; 4939 break;
5073 case 0: 4940 case 0:
5074 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 4941 schedule_work(&css->dput_work);
5075 schedule_work(&css->dput_work);
5076 break; 4942 break;
5077 } 4943 }
5078 rcu_read_unlock(); 4944 rcu_read_unlock();