diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 256 |
1 files changed, 61 insertions, 195 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b7a0171067ea..e3045ad4267a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -171,8 +171,8 @@ struct css_id { | |||
171 | * The css to which this ID points. This pointer is set to valid value | 171 | * The css to which this ID points. This pointer is set to valid value |
172 | * after cgroup is populated. If cgroup is removed, this will be NULL. | 172 | * after cgroup is populated. If cgroup is removed, this will be NULL. |
173 | * This pointer is expected to be RCU-safe because destroy() | 173 | * This pointer is expected to be RCU-safe because destroy() |
174 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 174 | * is called after synchronize_rcu(). But for safe use, css_tryget() |
175 | * css_tryget() should be used for avoiding race. | 175 | * should be used for avoiding race. |
176 | */ | 176 | */ |
177 | struct cgroup_subsys_state __rcu *css; | 177 | struct cgroup_subsys_state __rcu *css; |
178 | /* | 178 | /* |
@@ -854,30 +854,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
854 | return inode; | 854 | return inode; |
855 | } | 855 | } |
856 | 856 | ||
857 | /* | ||
858 | * Call subsys's pre_destroy handler. | ||
859 | * This is called before css refcnt check. | ||
860 | */ | ||
861 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) | ||
862 | { | ||
863 | struct cgroup_subsys *ss; | ||
864 | int ret = 0; | ||
865 | |||
866 | for_each_subsys(cgrp->root, ss) { | ||
867 | if (!ss->pre_destroy) | ||
868 | continue; | ||
869 | |||
870 | ret = ss->pre_destroy(cgrp); | ||
871 | if (ret) { | ||
872 | /* ->pre_destroy() failure is being deprecated */ | ||
873 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
874 | break; | ||
875 | } | ||
876 | } | ||
877 | |||
878 | return ret; | ||
879 | } | ||
880 | |||
881 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 857 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
882 | { | 858 | { |
883 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 859 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -1015,33 +991,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
1015 | } | 991 | } |
1016 | 992 | ||
1017 | /* | 993 | /* |
1018 | * A queue for waiters to do rmdir() cgroup. A tasks will sleep when | ||
1019 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | ||
1020 | * reference to css->refcnt. In general, this refcnt is expected to goes down | ||
1021 | * to zero, soon. | ||
1022 | * | ||
1023 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | ||
1024 | */ | ||
1025 | static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | ||
1026 | |||
1027 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | ||
1028 | { | ||
1029 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | ||
1030 | wake_up_all(&cgroup_rmdir_waitq); | ||
1031 | } | ||
1032 | |||
1033 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
1034 | { | ||
1035 | css_get(css); | ||
1036 | } | ||
1037 | |||
1038 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
1039 | { | ||
1040 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
1041 | css_put(css); | ||
1042 | } | ||
1043 | |||
1044 | /* | ||
1045 | * Call with cgroup_mutex held. Drops reference counts on modules, including | 994 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
1046 | * any duplicate ones that parse_cgroupfs_options took. If this function | 995 | * any duplicate ones that parse_cgroupfs_options took. If this function |
1047 | * returns an error, no reference counts are touched. | 996 | * returns an error, no reference counts are touched. |
@@ -2026,12 +1975,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
2026 | } | 1975 | } |
2027 | 1976 | ||
2028 | synchronize_rcu(); | 1977 | synchronize_rcu(); |
2029 | |||
2030 | /* | ||
2031 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | ||
2032 | * is no longer empty. | ||
2033 | */ | ||
2034 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2035 | out: | 1978 | out: |
2036 | if (retval) { | 1979 | if (retval) { |
2037 | for_each_subsys(root, ss) { | 1980 | for_each_subsys(root, ss) { |
@@ -2201,7 +2144,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2201 | * step 5: success! and cleanup | 2144 | * step 5: success! and cleanup |
2202 | */ | 2145 | */ |
2203 | synchronize_rcu(); | 2146 | synchronize_rcu(); |
2204 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2205 | retval = 0; | 2147 | retval = 0; |
2206 | out_put_css_set_refs: | 2148 | out_put_css_set_refs: |
2207 | if (retval) { | 2149 | if (retval) { |
@@ -4023,14 +3965,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
4023 | cgrp->subsys[ss->subsys_id] = css; | 3965 | cgrp->subsys[ss->subsys_id] = css; |
4024 | 3966 | ||
4025 | /* | 3967 | /* |
4026 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | 3968 | * css holds an extra ref to @cgrp->dentry which is put on the last |
4027 | * which is put on the last css_put(). dput() requires process | 3969 | * css_put(). dput() requires process context, which css_put() may |
4028 | * context, which css_put() may be called without. @css->dput_work | 3970 | * be called without. @css->dput_work will be used to invoke |
4029 | * will be used to invoke dput() asynchronously from css_put(). | 3971 | * dput() asynchronously from css_put(). |
4030 | */ | 3972 | */ |
4031 | INIT_WORK(&css->dput_work, css_dput_fn); | 3973 | INIT_WORK(&css->dput_work, css_dput_fn); |
4032 | if (ss->__DEPRECATED_clear_css_refs) | ||
4033 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | ||
4034 | } | 3974 | } |
4035 | 3975 | ||
4036 | /* | 3976 | /* |
@@ -4054,6 +3994,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4054 | if (!cgrp) | 3994 | if (!cgrp) |
4055 | return -ENOMEM; | 3995 | return -ENOMEM; |
4056 | 3996 | ||
3997 | /* | ||
3998 | * Only live parents can have children. Note that the liveliness | ||
3999 | * check isn't strictly necessary because cgroup_mkdir() and | ||
4000 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
4001 | * anyway so that locking is contained inside cgroup proper and we | ||
4002 | * don't get nasty surprises if we ever grow another caller. | ||
4003 | */ | ||
4004 | if (!cgroup_lock_live_group(parent)) { | ||
4005 | err = -ENODEV; | ||
4006 | goto err_free; | ||
4007 | } | ||
4008 | |||
4057 | /* Grab a reference on the superblock so the hierarchy doesn't | 4009 | /* Grab a reference on the superblock so the hierarchy doesn't |
4058 | * get deleted on unmount if there are child cgroups. This | 4010 | * get deleted on unmount if there are child cgroups. This |
4059 | * can be done outside cgroup_mutex, since the sb can't | 4011 | * can be done outside cgroup_mutex, since the sb can't |
@@ -4061,8 +4013,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4061 | * fs */ | 4013 | * fs */ |
4062 | atomic_inc(&sb->s_active); | 4014 | atomic_inc(&sb->s_active); |
4063 | 4015 | ||
4064 | mutex_lock(&cgroup_mutex); | ||
4065 | |||
4066 | init_cgroup_housekeeping(cgrp); | 4016 | init_cgroup_housekeeping(cgrp); |
4067 | 4017 | ||
4068 | cgrp->parent = parent; | 4018 | cgrp->parent = parent; |
@@ -4110,10 +4060,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4110 | if (err < 0) | 4060 | if (err < 0) |
4111 | goto err_remove; | 4061 | goto err_remove; |
4112 | 4062 | ||
4113 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | 4063 | /* each css holds a ref to the cgroup's dentry */ |
4114 | for_each_subsys(root, ss) | 4064 | for_each_subsys(root, ss) |
4115 | if (!ss->__DEPRECATED_clear_css_refs) | 4065 | dget(dentry); |
4116 | dget(dentry); | ||
4117 | 4066 | ||
4118 | /* The cgroup directory was pre-locked for us */ | 4067 | /* The cgroup directory was pre-locked for us */ |
4119 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 4068 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
@@ -4144,7 +4093,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4144 | 4093 | ||
4145 | /* Release the reference count that we took on the superblock */ | 4094 | /* Release the reference count that we took on the superblock */ |
4146 | deactivate_super(sb); | 4095 | deactivate_super(sb); |
4147 | 4096 | err_free: | |
4148 | kfree(cgrp); | 4097 | kfree(cgrp); |
4149 | return err; | 4098 | return err; |
4150 | } | 4099 | } |
@@ -4198,71 +4147,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
4198 | return 0; | 4147 | return 0; |
4199 | } | 4148 | } |
4200 | 4149 | ||
4201 | /* | ||
4202 | * Atomically mark all (or else none) of the cgroup's CSS objects as | ||
4203 | * CSS_REMOVED. Return true on success, or false if the cgroup has | ||
4204 | * busy subsystems. Call with cgroup_mutex held | ||
4205 | * | ||
4206 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4207 | * not, cgroup removal behaves differently. | ||
4208 | * | ||
4209 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4210 | * cgroup removal can be committed. This is implemented by | ||
4211 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4212 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4213 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4214 | * removed as soon as the existing user (memcg) is updated. | ||
4215 | * | ||
4216 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4217 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4218 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4219 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4220 | * is put so that dentry destruction happens only after all css's are | ||
4221 | * released. | ||
4222 | */ | ||
4223 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | ||
4224 | { | ||
4225 | struct cgroup_subsys *ss; | ||
4226 | unsigned long flags; | ||
4227 | bool failed = false; | ||
4228 | |||
4229 | local_irq_save(flags); | ||
4230 | |||
4231 | /* | ||
4232 | * Block new css_tryget() by deactivating refcnt. If all refcnts | ||
4233 | * for subsystems w/ clear_css_refs set were 1 at the moment of | ||
4234 | * deactivation, we succeeded. | ||
4235 | */ | ||
4236 | for_each_subsys(cgrp->root, ss) { | ||
4237 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4238 | |||
4239 | WARN_ON(atomic_read(&css->refcnt) < 0); | ||
4240 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | ||
4241 | |||
4242 | if (ss->__DEPRECATED_clear_css_refs) | ||
4243 | failed |= css_refcnt(css) != 1; | ||
4244 | } | ||
4245 | |||
4246 | /* | ||
4247 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4248 | * restore refcnts to positive values. Either way, all in-progress | ||
4249 | * css_tryget() will be released. | ||
4250 | */ | ||
4251 | for_each_subsys(cgrp->root, ss) { | ||
4252 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4253 | |||
4254 | if (!failed) { | ||
4255 | set_bit(CSS_REMOVED, &css->flags); | ||
4256 | css_put(css); | ||
4257 | } else { | ||
4258 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
4259 | } | ||
4260 | } | ||
4261 | |||
4262 | local_irq_restore(flags); | ||
4263 | return !failed; | ||
4264 | } | ||
4265 | |||
4266 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 4150 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
4267 | { | 4151 | { |
4268 | struct cgroup *cgrp = dentry->d_fsdata; | 4152 | struct cgroup *cgrp = dentry->d_fsdata; |
@@ -4270,70 +4154,52 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
4270 | struct cgroup *parent; | 4154 | struct cgroup *parent; |
4271 | DEFINE_WAIT(wait); | 4155 | DEFINE_WAIT(wait); |
4272 | struct cgroup_event *event, *tmp; | 4156 | struct cgroup_event *event, *tmp; |
4273 | int ret; | 4157 | struct cgroup_subsys *ss; |
4274 | 4158 | ||
4275 | /* the vfs holds both inode->i_mutex already */ | 4159 | /* the vfs holds both inode->i_mutex already */ |
4276 | again: | ||
4277 | mutex_lock(&cgroup_mutex); | 4160 | mutex_lock(&cgroup_mutex); |
4278 | if (atomic_read(&cgrp->count) != 0) { | 4161 | parent = cgrp->parent; |
4279 | mutex_unlock(&cgroup_mutex); | 4162 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { |
4280 | return -EBUSY; | ||
4281 | } | ||
4282 | if (!list_empty(&cgrp->children)) { | ||
4283 | mutex_unlock(&cgroup_mutex); | 4163 | mutex_unlock(&cgroup_mutex); |
4284 | return -EBUSY; | 4164 | return -EBUSY; |
4285 | } | 4165 | } |
4286 | mutex_unlock(&cgroup_mutex); | ||
4287 | 4166 | ||
4288 | /* | 4167 | /* |
4289 | * In general, subsystem has no css->refcnt after pre_destroy(). But | 4168 | * Block new css_tryget() by deactivating refcnt and mark @cgrp |
4290 | * in racy cases, subsystem may have to get css->refcnt after | 4169 | * removed. This makes future css_tryget() and child creation |
4291 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | 4170 | * attempts fail thus maintaining the removal conditions verified |
4292 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | 4171 | * above. |
4293 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
4294 | * and subsystem's reference count handling. Please see css_get/put | ||
4295 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
4296 | */ | 4172 | */ |
4297 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4173 | for_each_subsys(cgrp->root, ss) { |
4174 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4298 | 4175 | ||
4299 | /* | 4176 | WARN_ON(atomic_read(&css->refcnt) < 0); |
4300 | * Call pre_destroy handlers of subsys. Notify subsystems | 4177 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
4301 | * that rmdir() request comes. | ||
4302 | */ | ||
4303 | ret = cgroup_call_pre_destroy(cgrp); | ||
4304 | if (ret) { | ||
4305 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4306 | return ret; | ||
4307 | } | 4178 | } |
4179 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4308 | 4180 | ||
4181 | /* | ||
4182 | * Tell subsystems to initiate destruction. pre_destroy() should be | ||
4183 | * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix | ||
4184 | * potential deadlock in pre_destroy") for details. | ||
4185 | */ | ||
4186 | mutex_unlock(&cgroup_mutex); | ||
4187 | for_each_subsys(cgrp->root, ss) | ||
4188 | if (ss->pre_destroy) | ||
4189 | ss->pre_destroy(cgrp); | ||
4309 | mutex_lock(&cgroup_mutex); | 4190 | mutex_lock(&cgroup_mutex); |
4310 | parent = cgrp->parent; | 4191 | |
4311 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | 4192 | /* |
4312 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4193 | * Put all the base refs. Each css holds an extra reference to the |
4313 | mutex_unlock(&cgroup_mutex); | 4194 | * cgroup's dentry and cgroup removal proceeds regardless of css |
4314 | return -EBUSY; | 4195 | * refs. On the last put of each css, whenever that may be, the |
4315 | } | 4196 | * extra dentry ref is put so that dentry destruction happens only |
4316 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | 4197 | * after all css's are released. |
4317 | if (!cgroup_clear_css_refs(cgrp)) { | 4198 | */ |
4318 | mutex_unlock(&cgroup_mutex); | 4199 | for_each_subsys(cgrp->root, ss) |
4319 | /* | 4200 | css_put(cgrp->subsys[ss->subsys_id]); |
4320 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
4321 | * prepare_to_wait(), we need to check this flag. | ||
4322 | */ | ||
4323 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
4324 | schedule(); | ||
4325 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4326 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4327 | if (signal_pending(current)) | ||
4328 | return -EINTR; | ||
4329 | goto again; | ||
4330 | } | ||
4331 | /* NO css_tryget() can success after here. */ | ||
4332 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4333 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4334 | 4201 | ||
4335 | raw_spin_lock(&release_list_lock); | 4202 | raw_spin_lock(&release_list_lock); |
4336 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4337 | if (!list_empty(&cgrp->release_list)) | 4203 | if (!list_empty(&cgrp->release_list)) |
4338 | list_del_init(&cgrp->release_list); | 4204 | list_del_init(&cgrp->release_list); |
4339 | raw_spin_unlock(&release_list_lock); | 4205 | raw_spin_unlock(&release_list_lock); |
@@ -5041,15 +4907,17 @@ static void check_for_release(struct cgroup *cgrp) | |||
5041 | /* Caller must verify that the css is not for root cgroup */ | 4907 | /* Caller must verify that the css is not for root cgroup */ |
5042 | bool __css_tryget(struct cgroup_subsys_state *css) | 4908 | bool __css_tryget(struct cgroup_subsys_state *css) |
5043 | { | 4909 | { |
5044 | do { | 4910 | while (true) { |
5045 | int v = css_refcnt(css); | 4911 | int t, v; |
5046 | 4912 | ||
5047 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | 4913 | v = css_refcnt(css); |
4914 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
4915 | if (likely(t == v)) | ||
5048 | return true; | 4916 | return true; |
4917 | else if (t < 0) | ||
4918 | return false; | ||
5049 | cpu_relax(); | 4919 | cpu_relax(); |
5050 | } while (!test_bit(CSS_REMOVED, &css->flags)); | 4920 | } |
5051 | |||
5052 | return false; | ||
5053 | } | 4921 | } |
5054 | EXPORT_SYMBOL_GPL(__css_tryget); | 4922 | EXPORT_SYMBOL_GPL(__css_tryget); |
5055 | 4923 | ||
@@ -5068,11 +4936,9 @@ void __css_put(struct cgroup_subsys_state *css) | |||
5068 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4936 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
5069 | check_for_release(cgrp); | 4937 | check_for_release(cgrp); |
5070 | } | 4938 | } |
5071 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
5072 | break; | 4939 | break; |
5073 | case 0: | 4940 | case 0: |
5074 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | 4941 | schedule_work(&css->dput_work); |
5075 | schedule_work(&css->dput_work); | ||
5076 | break; | 4942 | break; |
5077 | } | 4943 | } |
5078 | rcu_read_unlock(); | 4944 | rcu_read_unlock(); |