-rw-r--r-- block/blk-cgroup.c     |   3
-rw-r--r-- include/linux/cgroup.h |  41
-rw-r--r-- kernel/cgroup.c        | 256
-rw-r--r-- mm/hugetlb_cgroup.c    |  11
-rw-r--r-- mm/memcontrol.c        | 181
5 files changed, 156 insertions, 336 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index cafcd7431189..6ce36ff98a41 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -600,7 +600,7 @@ struct cftype blkcg_files[] = {
  *
  * This is the blkcg counterpart of ioc_release_fn().
  */
-static int blkcg_pre_destroy(struct cgroup *cgroup)
+static void blkcg_pre_destroy(struct cgroup *cgroup)
 {
 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 
@@ -622,7 +622,6 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
 	}
 
 	spin_unlock_irq(&blkcg->lock);
-	return 0;
 }
 
 static void blkcg_destroy(struct cgroup *cgroup)
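The conversion above is the pattern repeated across the whole patch: ->pre_destroy() loses its return value, so a controller can no longer veto cgroup removal and the callback becomes best-effort cleanup. A minimal, compilable sketch of what the signature change means for a subsystem (the struct and function names below are illustrative stand-ins, not the kernel's real definitions):

#include <stdio.h>

struct cgroup;	/* opaque stand-in for the kernel type */

/* After this patch: pre_destroy() returns void and cannot fail. */
struct subsys_ops {
	void (*pre_destroy)(struct cgroup *cgrp);	/* was: int (*)(struct cgroup *) */
	void (*destroy)(struct cgroup *cgrp);
};

static void demo_pre_destroy(struct cgroup *cgrp)
{
	(void)cgrp;
	/* best-effort draining only; no -EBUSY path exists anymore */
	puts("pre_destroy: called exactly once from rmdir, cannot veto");
}

int main(void)
{
	struct subsys_ops ops = { .pre_destroy = demo_pre_destroy };
	ops.pre_destroy(NULL);
	return 0;
}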
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4cd1d0fd2542..fe876a77031a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -81,8 +81,6 @@ struct cgroup_subsys_state {
 /* bits in struct cgroup_subsys_state flags field */
 enum {
 	CSS_ROOT, /* This CSS is the root of the subsystem */
-	CSS_REMOVED, /* This CSS is dead */
-	CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */
 };
 
 /* Caller must verify that the css is not for root cgroup */
@@ -105,11 +103,6 @@ static inline void css_get(struct cgroup_subsys_state *css)
 	__css_get(css, 1);
 }
 
-static inline bool css_is_removed(struct cgroup_subsys_state *css)
-{
-	return test_bit(CSS_REMOVED, &css->flags);
-}
-
 /*
  * Call css_tryget() to take a reference on a css if your existing
  * (known-valid) reference isn't already ref-counted. Returns false if
@@ -148,10 +141,6 @@ enum {
 	/* Control Group requires release notifications to userspace */
 	CGRP_NOTIFY_ON_RELEASE,
 	/*
-	 * A thread in rmdir() is wating for this cgroup.
-	 */
-	CGRP_WAIT_ON_RMDIR,
-	/*
 	 * Clone cgroup values when creating a new child cgroup
 	 */
 	CGRP_CLONE_CHILDREN,
@@ -421,23 +410,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
 
 /*
- * When the subsys has to access css and may add permanent refcnt to css,
- * it should take care of racy conditions with rmdir(). Following set of
- * functions, is for stop/restart rmdir if necessary.
- * Because these will call css_get/put, "css" should be alive css.
- *
- *  cgroup_exclude_rmdir();
- *   ...do some jobs which may access arbitrary empty cgroup
- *  cgroup_release_and_wakeup_rmdir();
- *
- * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
- * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
- */
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
-
-/*
  * Control Group taskset, used to pass around set of tasks to cgroup_subsys
  * methods.
  */
@@ -466,7 +438,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
 
 struct cgroup_subsys {
 	struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
-	int (*pre_destroy)(struct cgroup *cgrp);
+	void (*pre_destroy)(struct cgroup *cgrp);
 	void (*destroy)(struct cgroup *cgrp);
 	int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
@@ -488,17 +460,6 @@ struct cgroup_subsys {
 	bool use_id;
 
 	/*
-	 * If %true, cgroup removal will try to clear css refs by retrying
-	 * ss->pre_destroy() until there's no css ref left. This behavior
-	 * is strictly for backward compatibility and will be removed as
-	 * soon as the current user (memcg) is updated.
-	 *
-	 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
-	 * wait for css refs to drop to zero before proceeding.
-	 */
-	bool __DEPRECATED_clear_css_refs;
-
-	/*
 	 * If %false, this subsystem is properly hierarchical -
 	 * configuration, resource accounting and restriction on a parent
 	 * cgroup cover those of its children. If %true, hierarchy support
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b7a0171067ea..e3045ad4267a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -171,8 +171,8 @@ struct css_id {
 	 * The css to which this ID points. This pointer is set to valid value
 	 * after cgroup is populated. If cgroup is removed, this will be NULL.
 	 * This pointer is expected to be RCU-safe because destroy()
-	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
-	 * css_tryget() should be used for avoiding race.
+	 * is called after synchronize_rcu(). But for safe use, css_tryget()
+	 * should be used for avoiding race.
 	 */
 	struct cgroup_subsys_state __rcu *css;
 	/*
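The comment above captures the lookup discipline that replaces css_is_removed(): dereference under RCU, then confirm liveness with css_tryget() before keeping a reference. A sketch of the intended pattern (the lookup_css_id() helper is hypothetical; only rcu_read_lock(), rcu_dereference() and css_tryget() are real kernel APIs):

/* Hypothetical caller, not part of this patch. */
struct cgroup_subsys_state *lookup_css_id(struct css_id *id)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = rcu_dereference(id->css);		/* may be NULL after removal */
	if (css && !css_tryget(css))		/* refcnt deactivated: group is dying */
		css = NULL;
	rcu_read_unlock();
	return css;				/* caller owns a ref, css_put() when done */
}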
@@ -854,30 +854,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
-	struct cgroup_subsys *ss;
-	int ret = 0;
-
-	for_each_subsys(cgrp->root, ss) {
-		if (!ss->pre_destroy)
-			continue;
-
-		ret = ss->pre_destroy(cgrp);
-		if (ret) {
-			/* ->pre_destroy() failure is being deprecated */
-			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
-			break;
-		}
-	}
-
-	return ret;
-}
-
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -1015,33 +991,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 }
 
 /*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
-	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
-		wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
-	css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
-	cgroup_wakeup_rmdir_waiter(css->cgroup);
-	css_put(css);
-}
-
-/*
  * Call with cgroup_mutex held. Drops reference counts on modules, including
  * any duplicate ones that parse_cgroupfs_options took. If this function
  * returns an error, no reference counts are touched.
@@ -2026,12 +1975,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	}
 
 	synchronize_rcu();
-
-	/*
-	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
-	 * is no longer empty.
-	 */
-	cgroup_wakeup_rmdir_waiter(cgrp);
 out:
 	if (retval) {
 		for_each_subsys(root, ss) {
@@ -2201,7 +2144,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * step 5: success! and cleanup
 	 */
 	synchronize_rcu();
-	cgroup_wakeup_rmdir_waiter(cgrp);
 	retval = 0;
 out_put_css_set_refs:
 	if (retval) {
@@ -4023,14 +3965,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	cgrp->subsys[ss->subsys_id] = css;
 
 	/*
-	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
-	 * which is put on the last css_put(). dput() requires process
-	 * context, which css_put() may be called without. @css->dput_work
-	 * will be used to invoke dput() asynchronously from css_put().
+	 * css holds an extra ref to @cgrp->dentry which is put on the last
+	 * css_put(). dput() requires process context, which css_put() may
+	 * be called without. @css->dput_work will be used to invoke
+	 * dput() asynchronously from css_put().
 	 */
 	INIT_WORK(&css->dput_work, css_dput_fn);
-	if (ss->__DEPRECATED_clear_css_refs)
-		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
 /*
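The dput_work mechanism referenced above is the standard workqueue idiom for deferring a sleeping operation out of atomic context: css_put() may run with locks held or in interrupt context, while dput() can sleep, so the final dput() is bounced to process context. A condensed sketch of that idiom (css_dput_fn here mirrors the kernel's helper; the surrounding code is abbreviated):

static void css_dput_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, dput_work);

	dput(css->cgroup->dentry);	/* safe: workqueues run in process context */
}

/* at css init time */
INIT_WORK(&css->dput_work, css_dput_fn);

/* on the final css_put(), possibly from atomic context */
schedule_work(&css->dput_work);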
@@ -4054,6 +3994,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (!cgrp)
 		return -ENOMEM;
 
+	/*
+	 * Only live parents can have children.  Note that the liveliness
+	 * check isn't strictly necessary because cgroup_mkdir() and
+	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+	 * anyway so that locking is contained inside cgroup proper and we
+	 * don't get nasty surprises if we ever grow another caller.
+	 */
+	if (!cgroup_lock_live_group(parent)) {
+		err = -ENODEV;
+		goto err_free;
+	}
+
 	/* Grab a reference on the superblock so the hierarchy doesn't
 	 * get deleted on unmount if there are child cgroups.  This
 	 * can be done outside cgroup_mutex, since the sb can't
@@ -4061,8 +4013,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	 * fs */
 	atomic_inc(&sb->s_active);
 
-	mutex_lock(&cgroup_mutex);
-
 	init_cgroup_housekeeping(cgrp);
 
 	cgrp->parent = parent;
@@ -4110,10 +4060,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (err < 0)
 		goto err_remove;
 
-	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+	/* each css holds a ref to the cgroup's dentry */
 	for_each_subsys(root, ss)
-		if (!ss->__DEPRECATED_clear_css_refs)
-			dget(dentry);
+		dget(dentry);
 
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
@@ -4144,7 +4093,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	/* Release the reference count that we took on the superblock */
 	deactivate_super(sb);
-
+err_free:
 	kfree(cgrp);
 	return err;
 }
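The cgroup_lock_live_group() guard added to cgroup_create() is the classic lock-then-recheck idiom: the removed flag may flip between the caller's first look and lock acquisition, so the test has to be repeated under the lock. A self-contained userspace analogue (all names invented for illustration; build with -pthread):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct group {
	pthread_mutex_t lock;
	bool removed;		/* set by the "rmdir" side under lock */
};

/* Returns 0 with the lock held if the group is still live, -ENODEV otherwise. */
static int lock_live_group(struct group *g)
{
	pthread_mutex_lock(&g->lock);
	if (g->removed) {			/* recheck under the lock */
		pthread_mutex_unlock(&g->lock);
		return -ENODEV;
	}
	return 0;				/* caller unlocks when done */
}

int main(void)
{
	struct group g = { PTHREAD_MUTEX_INITIALIZER, false };

	if (lock_live_group(&g) == 0) {
		/* ... create a child here; removal is excluded ... */
		pthread_mutex_unlock(&g.lock);
	}
	return 0;
}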
@@ -4198,71 +4147,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	return 0;
 }
 
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
-{
-	struct cgroup_subsys *ss;
-	unsigned long flags;
-	bool failed = false;
-
-	local_irq_save(flags);
-
-	/*
-	 * Block new css_tryget() by deactivating refcnt. If all refcnts
-	 * for subsystems w/ clear_css_refs set were 1 at the moment of
-	 * deactivation, we succeeded.
-	 */
-	for_each_subsys(cgrp->root, ss) {
-		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
-		WARN_ON(atomic_read(&css->refcnt) < 0);
-		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
-		if (ss->__DEPRECATED_clear_css_refs)
-			failed |= css_refcnt(css) != 1;
-	}
-
-	/*
-	 * If succeeded, set REMOVED and put all the base refs; otherwise,
-	 * restore refcnts to positive values. Either way, all in-progress
-	 * css_tryget() will be released.
-	 */
-	for_each_subsys(cgrp->root, ss) {
-		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
-		if (!failed) {
-			set_bit(CSS_REMOVED, &css->flags);
-			css_put(css);
-		} else {
-			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
-		}
-	}
-
-	local_irq_restore(flags);
-	return !failed;
-}
-
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
 	struct cgroup *cgrp = dentry->d_fsdata;
@@ -4270,70 +4154,52 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *parent;
 	DEFINE_WAIT(wait);
 	struct cgroup_event *event, *tmp;
-	int ret;
+	struct cgroup_subsys *ss;
 
 	/* the vfs holds both inode->i_mutex already */
-again:
 	mutex_lock(&cgroup_mutex);
-	if (atomic_read(&cgrp->count) != 0) {
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	if (!list_empty(&cgrp->children)) {
+	parent = cgrp->parent;
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
-	mutex_unlock(&cgroup_mutex);
 
 	/*
-	 * In general, subsystem has no css->refcnt after pre_destroy(). But
-	 * in racy cases, subsystem may have to get css->refcnt after
-	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
-	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
-	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
-	 * and subsystem's reference count handling. Please see css_get/put
-	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
+	 * removed.  This makes future css_tryget() and child creation
+	 * attempts fail thus maintaining the removal conditions verified
+	 * above.
 	 */
-	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	for_each_subsys(cgrp->root, ss) {
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
-	/*
-	 * Call pre_destroy handlers of subsys. Notify subsystems
-	 * that rmdir() request comes.
-	 */
-	ret = cgroup_call_pre_destroy(cgrp);
-	if (ret) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		return ret;
+		WARN_ON(atomic_read(&css->refcnt) < 0);
+		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
 	}
+	set_bit(CGRP_REMOVED, &cgrp->flags);
 
+	/*
+	 * Tell subsystems to initate destruction.  pre_destroy() should be
+	 * called with cgroup_mutex unlocked.  See 3fa59dfbc3 ("cgroup: fix
+	 * potential deadlock in pre_destroy") for details.
+	 */
+	mutex_unlock(&cgroup_mutex);
+	for_each_subsys(cgrp->root, ss)
+		if (ss->pre_destroy)
+			ss->pre_destroy(cgrp);
 	mutex_lock(&cgroup_mutex);
-	parent = cgrp->parent;
-	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-	if (!cgroup_clear_css_refs(cgrp)) {
-		mutex_unlock(&cgroup_mutex);
-		/*
-		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
-		 * prepare_to_wait(), we need to check this flag.
-		 */
-		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
-			schedule();
-		finish_wait(&cgroup_rmdir_waitq, &wait);
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		if (signal_pending(current))
-			return -EINTR;
-		goto again;
-	}
-	/* NO css_tryget() can success after here. */
-	finish_wait(&cgroup_rmdir_waitq, &wait);
-	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+	/*
+	 * Put all the base refs.  Each css holds an extra reference to the
+	 * cgroup's dentry and cgroup removal proceeds regardless of css
+	 * refs.  On the last put of each css, whenever that may be, the
+	 * extra dentry ref is put so that dentry destruction happens only
+	 * after all css's are released.
+	 */
+	for_each_subsys(cgrp->root, ss)
+		css_put(cgrp->subsys[ss->subsys_id]);
 
 	raw_spin_lock(&release_list_lock);
-	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
@@ -5041,15 +4907,17 @@ static void check_for_release(struct cgroup *cgrp)
 /* Caller must verify that the css is not for root cgroup */
 bool __css_tryget(struct cgroup_subsys_state *css)
 {
-	do {
-		int v = css_refcnt(css);
-
-		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+	while (true) {
+		int t, v;
+
+		v = css_refcnt(css);
+		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+		if (likely(t == v))
 			return true;
+		else if (t < 0)
+			return false;
 		cpu_relax();
-	} while (!test_bit(CSS_REMOVED, &css->flags));
-
-	return false;
+	}
 }
 EXPORT_SYMBOL_GPL(__css_tryget);
 
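With CSS_REMOVED gone, __css_tryget() learns about removal purely from the counter: rmdir adds the large negative CSS_DEACT_BIAS to the refcount, so a tryget racing with removal observes a negative value and fails. A self-contained C11 model of the scheme (the bias value and helper below are illustrative, not the kernel's exact definitions):

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the kernel's CSS_DEACT_BIAS (which is INT_MIN). */
#define DEACT_BIAS	(INT_MIN / 2)

static atomic_int refcnt = 1;	/* base reference held by the cgroup */

static bool tryget(void)
{
	while (true) {
		int v = atomic_load(&refcnt);

		if (v < 0)
			return false;	/* deactivated: removal in progress */
		int t = v;
		if (atomic_compare_exchange_weak(&refcnt, &t, v + 1))
			return true;	/* won the race, ref taken */
		if (t < 0)
			return false;	/* lost the race to deactivation */
		/* benign contention: retry */
	}
}

int main(void)
{
	printf("before deactivation: tryget = %d\n", tryget());	/* 1 */
	atomic_fetch_add(&refcnt, DEACT_BIAS);	/* rmdir: block new trygets */
	printf("after deactivation:  tryget = %d\n", tryget());	/* 0 */
	return 0;
}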
@@ -5068,11 +4936,9 @@ void __css_put(struct cgroup_subsys_state *css)
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
-		cgroup_wakeup_rmdir_waiter(cgrp);
 		break;
 	case 0:
-		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
-			schedule_work(&css->dput_work);
+		schedule_work(&css->dput_work);
 		break;
 	}
 	rcu_read_unlock();
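Taken together, the kernel/cgroup.c changes leave removal with one fixed, non-failing sequence. A comment-only outline of the new cgroup_rmdir() flow, summarizing the hunks above:

/*
 * Outline of the new cgroup_rmdir() flow (a summary, not extra API):
 *
 * 1. Take cgroup_mutex; fail with -EBUSY if the cgroup still has
 *    tasks or children.
 * 2. Deactivate each css refcnt (atomic_add(CSS_DEACT_BIAS, ...)) and
 *    set CGRP_REMOVED: css_tryget() and child creation now fail.
 * 3. Drop cgroup_mutex and invoke each ->pre_destroy() exactly once;
 *    it can neither fail nor veto removal.
 * 4. Retake cgroup_mutex and css_put() the base references; the dentry
 *    ref held by each css keeps the directory alive until the last put.
 */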
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..0d3a1a317731 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -155,18 +155,13 @@ out:
  * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
  * the parent cgroup.
  */
-static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
+static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
 {
 	struct hstate *h;
 	struct page *page;
-	int ret = 0, idx = 0;
+	int idx = 0;
 
 	do {
-		if (cgroup_task_count(cgroup) ||
-		    !list_empty(&cgroup->children)) {
-			ret = -EBUSY;
-			goto out;
-		}
 		for_each_hstate(h) {
 			spin_lock(&hugetlb_lock);
 			list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
 		}
 		cond_resched();
 	} while (hugetlb_cgroup_have_usage(cgroup));
-out:
-	return ret;
 }
 
 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
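Because hugetlb_cgroup_pre_destroy() can no longer bail out with -EBUSY, it now just loops until every charge has been reparented, yielding between passes. The shape of that loop, reduced to a runnable toy (the counter below stands in for hugetlb_cgroup_have_usage()):

#include <sched.h>
#include <stdio.h>

static int usage = 5;	/* pretend charges outstanding on the group */

static void move_one_charge_to_parent(void)
{
	if (usage > 0)
		usage--;
}

/* pre_destroy-style drain: it cannot fail, so retry until empty. */
static void drain_group(void)
{
	do {
		move_one_charge_to_parent();
		sched_yield();	/* analogue of cond_resched() between passes */
	} while (usage > 0);
}

int main(void)
{
	drain_group();
	printf("usage after drain: %d\n", usage);	/* 0 */
	return 0;
}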
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7acf43bf04a2..08adaaae6fcc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2337,7 +2337,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 again:
 	if (*ptr) { /* css should be a valid one */
 		memcg = *ptr;
-		VM_BUG_ON(css_is_removed(&memcg->css));
 		if (mem_cgroup_is_root(memcg))
 			goto done;
 		if (nr_pages == 1 && consume_stock(memcg))
@@ -2477,9 +2476,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
 
 /*
  * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller must check css_is_removed() or some if
- * it's concern. (dropping refcnt from swap can be called against removed
- * memcg.)
+ * rcu_read_lock(). The caller is responsible for calling css_tryget if
+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
+ * called against removed memcg.)
  */
 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 {
@@ -2676,13 +2675,6 @@ static int mem_cgroup_move_account(struct page *page,
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, anon, nr_pages);
-	/*
-	 * We charges against "to" which may not have any tasks. Then, "to"
-	 * can be under rmdir(). But in current implementation, caller of
-	 * this function is just force_empty() and move charge, so it's
-	 * guaranteed that "to" is never removed. So, we don't check rmdir
-	 * status here.
-	 */
 	move_unlock_mem_cgroup(from, &flags);
 	ret = 0;
 unlock:
@@ -2696,10 +2688,27 @@ out:
 	return ret;
 }
 
-/*
- * move charges to its parent.
+/**
+ * mem_cgroup_move_parent - moves page to the parent group
+ * @page: the page to move
+ * @pc: page_cgroup of the page
+ * @child: page's cgroup
+ *
+ * move charges to its parent or the root cgroup if the group has no
+ * parent (aka use_hierarchy==0).
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
+ * mem_cgroup_move_account fails) the failure is always temporary and
+ * it signals a race with a page removal/uncharge or migration. In the
+ * first case the page is on the way out and it will vanish from the LRU
+ * on the next attempt and the call should be retried later.
+ * Isolation from the LRU fails only if page has been isolated from
+ * the LRU since we looked at it and that usually means either global
+ * reclaim or migration going on. The page will either get back to the
+ * LRU or vanish.
+ * Finaly mem_cgroup_move_account fails only if the page got uncharged
+ * (!PageCgroupUsed) or moved to a different group. The page will
+ * disappear in the next attempt.
  */
-
 static int mem_cgroup_move_parent(struct page *page,
 				  struct page_cgroup *pc,
 				  struct mem_cgroup *child)
@@ -2709,9 +2718,7 @@ static int mem_cgroup_move_parent(struct page *page,
 	unsigned long uninitialized_var(flags);
 	int ret;
 
-	/* Is ROOT ? */
-	if (mem_cgroup_is_root(child))
-		return -EINVAL;
+	VM_BUG_ON(mem_cgroup_is_root(child));
 
 	ret = -EBUSY;
 	if (!get_page_unless_zero(page))
@@ -2728,8 +2735,10 @@ static int mem_cgroup_move_parent(struct page *page,
 	if (!parent)
 		parent = root_mem_cgroup;
 
-	if (nr_pages > 1)
+	if (nr_pages > 1) {
+		VM_BUG_ON(!PageTransHuge(page));
 		flags = compound_lock_irqsave(page);
+	}
 
 	ret = mem_cgroup_move_account(page, nr_pages,
 				pc, child, parent);
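Every failure mode listed in the new kerneldoc is transient, so the caller can keep retrying the same page until it either moves or leaves the LRU. A paraphrase of the drain loop this patch leaves in mem_cgroup_force_empty_list() (a hypothetical standalone helper, not kernel API):

/* Paraphrased caller loop: every mem_cgroup_move_parent() failure is
 * temporary, so the page is simply retried until it moves or vanishes. */
static void drain_lru_list(struct mem_cgroup *memcg, struct list_head *list)
{
	do {
		struct page *page = lru_to_page(list);
		struct page_cgroup *pc = lookup_page_cgroup(page);

		if (mem_cgroup_move_parent(page, pc, memcg))
			cond_resched();	/* let the race resolve, then retry */
	} while (!list_empty(list));
}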
@@ -2871,7 +2880,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 		return;
 	if (!memcg)
 		return;
-	cgroup_exclude_rmdir(&memcg->css);
 
 	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
 	/*
@@ -2885,12 +2893,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 		swp_entry_t ent = {.val = page_private(page)};
 		mem_cgroup_uncharge_swap(ent);
 	}
-	/*
-	 * At swapin, we may charge account against cgroup which has no tasks.
-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
-	 * In that case, we need to call pre_destroy() again. check it here.
-	 */
-	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 
 void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3338,8 +3340,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 
 	if (!memcg)
 		return;
-	/* blocks rmdir() */
-	cgroup_exclude_rmdir(&memcg->css);
+
 	if (!migration_ok) {
 		used = oldpage;
 		unused = newpage;
@@ -3373,13 +3374,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	 */
 	if (anon)
 		mem_cgroup_uncharge_page(used);
-	/*
-	 * At migration, we may charge account against cgroup which has no
-	 * tasks.
-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
-	 * In that case, we need to call pre_destroy() again. check it here.
-	 */
-	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 
 /*
@@ -3679,17 +3673,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	return nr_reclaimed;
 }
 
-/*
+/**
+ * mem_cgroup_force_empty_list - clears LRU of a group
+ * @memcg: group to clear
+ * @node: NUMA node
+ * @zid: zone id
+ * @lru: lru to to clear
+ *
  * Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - it just removes the page_cgroups.
- * Returns true if some page_cgroups were not freed, indicating that the caller
- * must retry this operation.
+ * reclaim the pages page themselves - pages are moved to the parent (or root)
+ * group.
  */
-static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
 	struct mem_cgroup_per_zone *mz;
-	unsigned long flags, loop;
+	unsigned long flags;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
@@ -3698,11 +3697,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 	mz = mem_cgroup_zoneinfo(memcg, node, zid);
 	list = &mz->lruvec.lists[lru];
 
-	loop = mz->lru_size[lru];
-	/* give some margin against EBUSY etc...*/
-	loop += 256;
 	busy = NULL;
-	while (loop--) {
+	do {
 		struct page_cgroup *pc;
 		struct page *page;
 
@@ -3728,76 +3724,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 			cond_resched();
 		} else
 			busy = NULL;
-	}
-	return !list_empty(list);
+	} while (!list_empty(list));
 }
 
 /*
- * make mem_cgroup's charge to be 0 if there is no task.
+ * make mem_cgroup's charge to be 0 if there is no task by moving
+ * all the charges and pages to the parent.
  * This enables deleting this mem_cgroup.
+ *
+ * Caller is responsible for holding css reference on the memcg.
  */
-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 {
-	int ret;
-	int node, zid, shrink;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct cgroup *cgrp = memcg->css.cgroup;
-
-	css_get(&memcg->css);
+	int node, zid;
 
-	shrink = 0;
-	/* should free all ? */
-	if (free_all)
-		goto try_to_free;
-move_account:
 	do {
-		ret = -EBUSY;
-		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
-			goto out;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
 		drain_all_stock_sync(memcg);
-		ret = 0;
 		mem_cgroup_start_move(memcg);
 		for_each_node_state(node, N_HIGH_MEMORY) {
-			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
+			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				enum lru_list lru;
 				for_each_lru(lru) {
-					ret = mem_cgroup_force_empty_list(memcg,
+					mem_cgroup_force_empty_list(memcg,
 							node, zid, lru);
-					if (ret)
-						break;
 				}
 			}
-			if (ret)
-				break;
 		}
 		mem_cgroup_end_move(memcg);
 		memcg_oom_recover(memcg);
 		cond_resched();
-		/* "ret" should also be checked to ensure all lists are empty. */
-	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
-out:
-	css_put(&memcg->css);
-	return ret;
 
-try_to_free:
+	/*
+	 * This is a safety check because mem_cgroup_force_empty_list
+	 * could have raced with mem_cgroup_replace_page_cache callers
+	 * so the lru seemed empty but the page could have been added
+	 * right after the check. RES_USAGE should be safe as we always
+	 * charge before adding to the LRU.
+	 */
+	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible and moves
+ * the rest to the parent.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct cgroup *cgrp = memcg->css.cgroup;
+
 	/* returns EBUSY if there is a task or if we come here twice. */
-	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
-		ret = -EBUSY;
-		goto out;
-	}
+	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+		return -EBUSY;
+
 	/* we call try-to-free pages for make this cgroup empty */
 	lru_add_drain_all();
 	/* try to free all pages in this cgroup */
-	shrink = 1;
 	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
 		int progress;
 
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			goto out;
-		}
+		if (signal_pending(current))
+			return -EINTR;
+
 		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
 						false);
 		if (!progress) {
@@ -3808,13 +3800,23 @@ try_to_free:
 
 	}
 	lru_add_drain();
-	/* try move_account...there may be some *locked* pages. */
-	goto move_account;
+	mem_cgroup_reparent_charges(memcg);
+
+	return 0;
 }
 
 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 {
-	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	int ret;
+
+	if (mem_cgroup_is_root(memcg))
+		return -EINVAL;
+	css_get(&memcg->css);
+	ret = mem_cgroup_force_empty(memcg);
+	css_put(&memcg->css);
+
+	return ret;
 }
 
 
@@ -5001,11 +5003,11 @@ free_out:
 	return ERR_PTR(error);
 }
 
-static int mem_cgroup_pre_destroy(struct cgroup *cont)
+static void mem_cgroup_pre_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	return mem_cgroup_force_empty(memcg, false);
+	mem_cgroup_reparent_charges(memcg);
 }
 
 static void mem_cgroup_destroy(struct cgroup *cont)
@@ -5607,7 +5609,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
-	.__DEPRECATED_clear_css_refs = true,
 };
 
 #ifdef CONFIG_MEMCG_SWAP