diff options
-rw-r--r-- | block/blk-cgroup.c | 3 | ||||
-rw-r--r-- | include/linux/cgroup.h | 41 | ||||
-rw-r--r-- | kernel/cgroup.c | 256 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 11 | ||||
-rw-r--r-- | mm/memcontrol.c | 181 |
5 files changed, 156 insertions, 336 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index cafcd7431189..6ce36ff98a41 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c | |||
@@ -600,7 +600,7 @@ struct cftype blkcg_files[] = { | |||
600 | * | 600 | * |
601 | * This is the blkcg counterpart of ioc_release_fn(). | 601 | * This is the blkcg counterpart of ioc_release_fn(). |
602 | */ | 602 | */ |
603 | static int blkcg_pre_destroy(struct cgroup *cgroup) | 603 | static void blkcg_pre_destroy(struct cgroup *cgroup) |
604 | { | 604 | { |
605 | struct blkcg *blkcg = cgroup_to_blkcg(cgroup); | 605 | struct blkcg *blkcg = cgroup_to_blkcg(cgroup); |
606 | 606 | ||
@@ -622,7 +622,6 @@ static int blkcg_pre_destroy(struct cgroup *cgroup) | |||
622 | } | 622 | } |
623 | 623 | ||
624 | spin_unlock_irq(&blkcg->lock); | 624 | spin_unlock_irq(&blkcg->lock); |
625 | return 0; | ||
626 | } | 625 | } |
627 | 626 | ||
628 | static void blkcg_destroy(struct cgroup *cgroup) | 627 | static void blkcg_destroy(struct cgroup *cgroup) |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4cd1d0fd2542..fe876a77031a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -81,8 +81,6 @@ struct cgroup_subsys_state { | |||
81 | /* bits in struct cgroup_subsys_state flags field */ | 81 | /* bits in struct cgroup_subsys_state flags field */ |
82 | enum { | 82 | enum { |
83 | CSS_ROOT, /* This CSS is the root of the subsystem */ | 83 | CSS_ROOT, /* This CSS is the root of the subsystem */ |
84 | CSS_REMOVED, /* This CSS is dead */ | ||
85 | CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */ | ||
86 | }; | 84 | }; |
87 | 85 | ||
88 | /* Caller must verify that the css is not for root cgroup */ | 86 | /* Caller must verify that the css is not for root cgroup */ |
@@ -105,11 +103,6 @@ static inline void css_get(struct cgroup_subsys_state *css) | |||
105 | __css_get(css, 1); | 103 | __css_get(css, 1); |
106 | } | 104 | } |
107 | 105 | ||
108 | static inline bool css_is_removed(struct cgroup_subsys_state *css) | ||
109 | { | ||
110 | return test_bit(CSS_REMOVED, &css->flags); | ||
111 | } | ||
112 | |||
113 | /* | 106 | /* |
114 | * Call css_tryget() to take a reference on a css if your existing | 107 | * Call css_tryget() to take a reference on a css if your existing |
115 | * (known-valid) reference isn't already ref-counted. Returns false if | 108 | * (known-valid) reference isn't already ref-counted. Returns false if |
@@ -148,10 +141,6 @@ enum { | |||
148 | /* Control Group requires release notifications to userspace */ | 141 | /* Control Group requires release notifications to userspace */ |
149 | CGRP_NOTIFY_ON_RELEASE, | 142 | CGRP_NOTIFY_ON_RELEASE, |
150 | /* | 143 | /* |
151 | * A thread in rmdir() is wating for this cgroup. | ||
152 | */ | ||
153 | CGRP_WAIT_ON_RMDIR, | ||
154 | /* | ||
155 | * Clone cgroup values when creating a new child cgroup | 144 | * Clone cgroup values when creating a new child cgroup |
156 | */ | 145 | */ |
157 | CGRP_CLONE_CHILDREN, | 146 | CGRP_CLONE_CHILDREN, |
@@ -421,23 +410,6 @@ int cgroup_task_count(const struct cgroup *cgrp); | |||
421 | int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); | 410 | int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); |
422 | 411 | ||
423 | /* | 412 | /* |
424 | * When the subsys has to access css and may add permanent refcnt to css, | ||
425 | * it should take care of racy conditions with rmdir(). Following set of | ||
426 | * functions, is for stop/restart rmdir if necessary. | ||
427 | * Because these will call css_get/put, "css" should be alive css. | ||
428 | * | ||
429 | * cgroup_exclude_rmdir(); | ||
430 | * ...do some jobs which may access arbitrary empty cgroup | ||
431 | * cgroup_release_and_wakeup_rmdir(); | ||
432 | * | ||
433 | * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, | ||
434 | * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. | ||
435 | */ | ||
436 | |||
437 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); | ||
438 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); | ||
439 | |||
440 | /* | ||
441 | * Control Group taskset, used to pass around set of tasks to cgroup_subsys | 413 | * Control Group taskset, used to pass around set of tasks to cgroup_subsys |
442 | * methods. | 414 | * methods. |
443 | */ | 415 | */ |
@@ -466,7 +438,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); | |||
466 | 438 | ||
467 | struct cgroup_subsys { | 439 | struct cgroup_subsys { |
468 | struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); | 440 | struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); |
469 | int (*pre_destroy)(struct cgroup *cgrp); | 441 | void (*pre_destroy)(struct cgroup *cgrp); |
470 | void (*destroy)(struct cgroup *cgrp); | 442 | void (*destroy)(struct cgroup *cgrp); |
471 | int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); | 443 | int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); |
472 | void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); | 444 | void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); |
@@ -488,17 +460,6 @@ struct cgroup_subsys { | |||
488 | bool use_id; | 460 | bool use_id; |
489 | 461 | ||
490 | /* | 462 | /* |
491 | * If %true, cgroup removal will try to clear css refs by retrying | ||
492 | * ss->pre_destroy() until there's no css ref left. This behavior | ||
493 | * is strictly for backward compatibility and will be removed as | ||
494 | * soon as the current user (memcg) is updated. | ||
495 | * | ||
496 | * If %false, ss->pre_destroy() can't fail and cgroup removal won't | ||
497 | * wait for css refs to drop to zero before proceeding. | ||
498 | */ | ||
499 | bool __DEPRECATED_clear_css_refs; | ||
500 | |||
501 | /* | ||
502 | * If %false, this subsystem is properly hierarchical - | 463 | * If %false, this subsystem is properly hierarchical - |
503 | * configuration, resource accounting and restriction on a parent | 464 | * configuration, resource accounting and restriction on a parent |
504 | * cgroup cover those of its children. If %true, hierarchy support | 465 | * cgroup cover those of its children. If %true, hierarchy support |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b7a0171067ea..e3045ad4267a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -171,8 +171,8 @@ struct css_id { | |||
171 | * The css to which this ID points. This pointer is set to valid value | 171 | * The css to which this ID points. This pointer is set to valid value |
172 | * after cgroup is populated. If cgroup is removed, this will be NULL. | 172 | * after cgroup is populated. If cgroup is removed, this will be NULL. |
173 | * This pointer is expected to be RCU-safe because destroy() | 173 | * This pointer is expected to be RCU-safe because destroy() |
174 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 174 | * is called after synchronize_rcu(). But for safe use, css_tryget() |
175 | * css_tryget() should be used for avoiding race. | 175 | * should be used for avoiding race. |
176 | */ | 176 | */ |
177 | struct cgroup_subsys_state __rcu *css; | 177 | struct cgroup_subsys_state __rcu *css; |
178 | /* | 178 | /* |
@@ -854,30 +854,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
854 | return inode; | 854 | return inode; |
855 | } | 855 | } |
856 | 856 | ||
857 | /* | ||
858 | * Call subsys's pre_destroy handler. | ||
859 | * This is called before css refcnt check. | ||
860 | */ | ||
861 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) | ||
862 | { | ||
863 | struct cgroup_subsys *ss; | ||
864 | int ret = 0; | ||
865 | |||
866 | for_each_subsys(cgrp->root, ss) { | ||
867 | if (!ss->pre_destroy) | ||
868 | continue; | ||
869 | |||
870 | ret = ss->pre_destroy(cgrp); | ||
871 | if (ret) { | ||
872 | /* ->pre_destroy() failure is being deprecated */ | ||
873 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
874 | break; | ||
875 | } | ||
876 | } | ||
877 | |||
878 | return ret; | ||
879 | } | ||
880 | |||
881 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 857 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
882 | { | 858 | { |
883 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 859 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -1015,33 +991,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
1015 | } | 991 | } |
1016 | 992 | ||
1017 | /* | 993 | /* |
1018 | * A queue for waiters to do rmdir() cgroup. A tasks will sleep when | ||
1019 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | ||
1020 | * reference to css->refcnt. In general, this refcnt is expected to goes down | ||
1021 | * to zero, soon. | ||
1022 | * | ||
1023 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | ||
1024 | */ | ||
1025 | static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | ||
1026 | |||
1027 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | ||
1028 | { | ||
1029 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | ||
1030 | wake_up_all(&cgroup_rmdir_waitq); | ||
1031 | } | ||
1032 | |||
1033 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
1034 | { | ||
1035 | css_get(css); | ||
1036 | } | ||
1037 | |||
1038 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
1039 | { | ||
1040 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
1041 | css_put(css); | ||
1042 | } | ||
1043 | |||
1044 | /* | ||
1045 | * Call with cgroup_mutex held. Drops reference counts on modules, including | 994 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
1046 | * any duplicate ones that parse_cgroupfs_options took. If this function | 995 | * any duplicate ones that parse_cgroupfs_options took. If this function |
1047 | * returns an error, no reference counts are touched. | 996 | * returns an error, no reference counts are touched. |
@@ -2026,12 +1975,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
2026 | } | 1975 | } |
2027 | 1976 | ||
2028 | synchronize_rcu(); | 1977 | synchronize_rcu(); |
2029 | |||
2030 | /* | ||
2031 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | ||
2032 | * is no longer empty. | ||
2033 | */ | ||
2034 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2035 | out: | 1978 | out: |
2036 | if (retval) { | 1979 | if (retval) { |
2037 | for_each_subsys(root, ss) { | 1980 | for_each_subsys(root, ss) { |
@@ -2201,7 +2144,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2201 | * step 5: success! and cleanup | 2144 | * step 5: success! and cleanup |
2202 | */ | 2145 | */ |
2203 | synchronize_rcu(); | 2146 | synchronize_rcu(); |
2204 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2205 | retval = 0; | 2147 | retval = 0; |
2206 | out_put_css_set_refs: | 2148 | out_put_css_set_refs: |
2207 | if (retval) { | 2149 | if (retval) { |
@@ -4023,14 +3965,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
4023 | cgrp->subsys[ss->subsys_id] = css; | 3965 | cgrp->subsys[ss->subsys_id] = css; |
4024 | 3966 | ||
4025 | /* | 3967 | /* |
4026 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | 3968 | * css holds an extra ref to @cgrp->dentry which is put on the last |
4027 | * which is put on the last css_put(). dput() requires process | 3969 | * css_put(). dput() requires process context, which css_put() may |
4028 | * context, which css_put() may be called without. @css->dput_work | 3970 | * be called without. @css->dput_work will be used to invoke |
4029 | * will be used to invoke dput() asynchronously from css_put(). | 3971 | * dput() asynchronously from css_put(). |
4030 | */ | 3972 | */ |
4031 | INIT_WORK(&css->dput_work, css_dput_fn); | 3973 | INIT_WORK(&css->dput_work, css_dput_fn); |
4032 | if (ss->__DEPRECATED_clear_css_refs) | ||
4033 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | ||
4034 | } | 3974 | } |
4035 | 3975 | ||
4036 | /* | 3976 | /* |
@@ -4054,6 +3994,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4054 | if (!cgrp) | 3994 | if (!cgrp) |
4055 | return -ENOMEM; | 3995 | return -ENOMEM; |
4056 | 3996 | ||
3997 | /* | ||
3998 | * Only live parents can have children. Note that the liveliness | ||
3999 | * check isn't strictly necessary because cgroup_mkdir() and | ||
4000 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
4001 | * anyway so that locking is contained inside cgroup proper and we | ||
4002 | * don't get nasty surprises if we ever grow another caller. | ||
4003 | */ | ||
4004 | if (!cgroup_lock_live_group(parent)) { | ||
4005 | err = -ENODEV; | ||
4006 | goto err_free; | ||
4007 | } | ||
4008 | |||
4057 | /* Grab a reference on the superblock so the hierarchy doesn't | 4009 | /* Grab a reference on the superblock so the hierarchy doesn't |
4058 | * get deleted on unmount if there are child cgroups. This | 4010 | * get deleted on unmount if there are child cgroups. This |
4059 | * can be done outside cgroup_mutex, since the sb can't | 4011 | * can be done outside cgroup_mutex, since the sb can't |
@@ -4061,8 +4013,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4061 | * fs */ | 4013 | * fs */ |
4062 | atomic_inc(&sb->s_active); | 4014 | atomic_inc(&sb->s_active); |
4063 | 4015 | ||
4064 | mutex_lock(&cgroup_mutex); | ||
4065 | |||
4066 | init_cgroup_housekeeping(cgrp); | 4016 | init_cgroup_housekeeping(cgrp); |
4067 | 4017 | ||
4068 | cgrp->parent = parent; | 4018 | cgrp->parent = parent; |
@@ -4110,10 +4060,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4110 | if (err < 0) | 4060 | if (err < 0) |
4111 | goto err_remove; | 4061 | goto err_remove; |
4112 | 4062 | ||
4113 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | 4063 | /* each css holds a ref to the cgroup's dentry */ |
4114 | for_each_subsys(root, ss) | 4064 | for_each_subsys(root, ss) |
4115 | if (!ss->__DEPRECATED_clear_css_refs) | 4065 | dget(dentry); |
4116 | dget(dentry); | ||
4117 | 4066 | ||
4118 | /* The cgroup directory was pre-locked for us */ | 4067 | /* The cgroup directory was pre-locked for us */ |
4119 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 4068 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
@@ -4144,7 +4093,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4144 | 4093 | ||
4145 | /* Release the reference count that we took on the superblock */ | 4094 | /* Release the reference count that we took on the superblock */ |
4146 | deactivate_super(sb); | 4095 | deactivate_super(sb); |
4147 | 4096 | err_free: | |
4148 | kfree(cgrp); | 4097 | kfree(cgrp); |
4149 | return err; | 4098 | return err; |
4150 | } | 4099 | } |
@@ -4198,71 +4147,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
4198 | return 0; | 4147 | return 0; |
4199 | } | 4148 | } |
4200 | 4149 | ||
4201 | /* | ||
4202 | * Atomically mark all (or else none) of the cgroup's CSS objects as | ||
4203 | * CSS_REMOVED. Return true on success, or false if the cgroup has | ||
4204 | * busy subsystems. Call with cgroup_mutex held | ||
4205 | * | ||
4206 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4207 | * not, cgroup removal behaves differently. | ||
4208 | * | ||
4209 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4210 | * cgroup removal can be committed. This is implemented by | ||
4211 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4212 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4213 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4214 | * removed as soon as the existing user (memcg) is updated. | ||
4215 | * | ||
4216 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4217 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4218 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4219 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4220 | * is put so that dentry destruction happens only after all css's are | ||
4221 | * released. | ||
4222 | */ | ||
4223 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | ||
4224 | { | ||
4225 | struct cgroup_subsys *ss; | ||
4226 | unsigned long flags; | ||
4227 | bool failed = false; | ||
4228 | |||
4229 | local_irq_save(flags); | ||
4230 | |||
4231 | /* | ||
4232 | * Block new css_tryget() by deactivating refcnt. If all refcnts | ||
4233 | * for subsystems w/ clear_css_refs set were 1 at the moment of | ||
4234 | * deactivation, we succeeded. | ||
4235 | */ | ||
4236 | for_each_subsys(cgrp->root, ss) { | ||
4237 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4238 | |||
4239 | WARN_ON(atomic_read(&css->refcnt) < 0); | ||
4240 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | ||
4241 | |||
4242 | if (ss->__DEPRECATED_clear_css_refs) | ||
4243 | failed |= css_refcnt(css) != 1; | ||
4244 | } | ||
4245 | |||
4246 | /* | ||
4247 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4248 | * restore refcnts to positive values. Either way, all in-progress | ||
4249 | * css_tryget() will be released. | ||
4250 | */ | ||
4251 | for_each_subsys(cgrp->root, ss) { | ||
4252 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4253 | |||
4254 | if (!failed) { | ||
4255 | set_bit(CSS_REMOVED, &css->flags); | ||
4256 | css_put(css); | ||
4257 | } else { | ||
4258 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
4259 | } | ||
4260 | } | ||
4261 | |||
4262 | local_irq_restore(flags); | ||
4263 | return !failed; | ||
4264 | } | ||
4265 | |||
4266 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 4150 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
4267 | { | 4151 | { |
4268 | struct cgroup *cgrp = dentry->d_fsdata; | 4152 | struct cgroup *cgrp = dentry->d_fsdata; |
@@ -4270,70 +4154,52 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
4270 | struct cgroup *parent; | 4154 | struct cgroup *parent; |
4271 | DEFINE_WAIT(wait); | 4155 | DEFINE_WAIT(wait); |
4272 | struct cgroup_event *event, *tmp; | 4156 | struct cgroup_event *event, *tmp; |
4273 | int ret; | 4157 | struct cgroup_subsys *ss; |
4274 | 4158 | ||
4275 | /* the vfs holds both inode->i_mutex already */ | 4159 | /* the vfs holds both inode->i_mutex already */ |
4276 | again: | ||
4277 | mutex_lock(&cgroup_mutex); | 4160 | mutex_lock(&cgroup_mutex); |
4278 | if (atomic_read(&cgrp->count) != 0) { | 4161 | parent = cgrp->parent; |
4279 | mutex_unlock(&cgroup_mutex); | 4162 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { |
4280 | return -EBUSY; | ||
4281 | } | ||
4282 | if (!list_empty(&cgrp->children)) { | ||
4283 | mutex_unlock(&cgroup_mutex); | 4163 | mutex_unlock(&cgroup_mutex); |
4284 | return -EBUSY; | 4164 | return -EBUSY; |
4285 | } | 4165 | } |
4286 | mutex_unlock(&cgroup_mutex); | ||
4287 | 4166 | ||
4288 | /* | 4167 | /* |
4289 | * In general, subsystem has no css->refcnt after pre_destroy(). But | 4168 | * Block new css_tryget() by deactivating refcnt and mark @cgrp |
4290 | * in racy cases, subsystem may have to get css->refcnt after | 4169 | * removed. This makes future css_tryget() and child creation |
4291 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | 4170 | * attempts fail thus maintaining the removal conditions verified |
4292 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | 4171 | * above. |
4293 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
4294 | * and subsystem's reference count handling. Please see css_get/put | ||
4295 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
4296 | */ | 4172 | */ |
4297 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4173 | for_each_subsys(cgrp->root, ss) { |
4174 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4298 | 4175 | ||
4299 | /* | 4176 | WARN_ON(atomic_read(&css->refcnt) < 0); |
4300 | * Call pre_destroy handlers of subsys. Notify subsystems | 4177 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
4301 | * that rmdir() request comes. | ||
4302 | */ | ||
4303 | ret = cgroup_call_pre_destroy(cgrp); | ||
4304 | if (ret) { | ||
4305 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4306 | return ret; | ||
4307 | } | 4178 | } |
4179 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4308 | 4180 | ||
4181 | /* | ||
4182 | * Tell subsystems to initiate destruction. pre_destroy() should be | ||
4183 | * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix | ||
4184 | * potential deadlock in pre_destroy") for details. | ||
4185 | */ | ||
4186 | mutex_unlock(&cgroup_mutex); | ||
4187 | for_each_subsys(cgrp->root, ss) | ||
4188 | if (ss->pre_destroy) | ||
4189 | ss->pre_destroy(cgrp); | ||
4309 | mutex_lock(&cgroup_mutex); | 4190 | mutex_lock(&cgroup_mutex); |
4310 | parent = cgrp->parent; | 4191 | |
4311 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | 4192 | /* |
4312 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4193 | * Put all the base refs. Each css holds an extra reference to the |
4313 | mutex_unlock(&cgroup_mutex); | 4194 | * cgroup's dentry and cgroup removal proceeds regardless of css |
4314 | return -EBUSY; | 4195 | * refs. On the last put of each css, whenever that may be, the |
4315 | } | 4196 | * extra dentry ref is put so that dentry destruction happens only |
4316 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | 4197 | * after all css's are released. |
4317 | if (!cgroup_clear_css_refs(cgrp)) { | 4198 | */ |
4318 | mutex_unlock(&cgroup_mutex); | 4199 | for_each_subsys(cgrp->root, ss) |
4319 | /* | 4200 | css_put(cgrp->subsys[ss->subsys_id]); |
4320 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
4321 | * prepare_to_wait(), we need to check this flag. | ||
4322 | */ | ||
4323 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
4324 | schedule(); | ||
4325 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4326 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4327 | if (signal_pending(current)) | ||
4328 | return -EINTR; | ||
4329 | goto again; | ||
4330 | } | ||
4331 | /* NO css_tryget() can success after here. */ | ||
4332 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4333 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4334 | 4201 | ||
4335 | raw_spin_lock(&release_list_lock); | 4202 | raw_spin_lock(&release_list_lock); |
4336 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4337 | if (!list_empty(&cgrp->release_list)) | 4203 | if (!list_empty(&cgrp->release_list)) |
4338 | list_del_init(&cgrp->release_list); | 4204 | list_del_init(&cgrp->release_list); |
4339 | raw_spin_unlock(&release_list_lock); | 4205 | raw_spin_unlock(&release_list_lock); |
@@ -5041,15 +4907,17 @@ static void check_for_release(struct cgroup *cgrp) | |||
5041 | /* Caller must verify that the css is not for root cgroup */ | 4907 | /* Caller must verify that the css is not for root cgroup */ |
5042 | bool __css_tryget(struct cgroup_subsys_state *css) | 4908 | bool __css_tryget(struct cgroup_subsys_state *css) |
5043 | { | 4909 | { |
5044 | do { | 4910 | while (true) { |
5045 | int v = css_refcnt(css); | 4911 | int t, v; |
5046 | 4912 | ||
5047 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | 4913 | v = css_refcnt(css); |
4914 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
4915 | if (likely(t == v)) | ||
5048 | return true; | 4916 | return true; |
4917 | else if (t < 0) | ||
4918 | return false; | ||
5049 | cpu_relax(); | 4919 | cpu_relax(); |
5050 | } while (!test_bit(CSS_REMOVED, &css->flags)); | 4920 | } |
5051 | |||
5052 | return false; | ||
5053 | } | 4921 | } |
5054 | EXPORT_SYMBOL_GPL(__css_tryget); | 4922 | EXPORT_SYMBOL_GPL(__css_tryget); |
5055 | 4923 | ||
@@ -5068,11 +4936,9 @@ void __css_put(struct cgroup_subsys_state *css) | |||
5068 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4936 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
5069 | check_for_release(cgrp); | 4937 | check_for_release(cgrp); |
5070 | } | 4938 | } |
5071 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
5072 | break; | 4939 | break; |
5073 | case 0: | 4940 | case 0: |
5074 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | 4941 | schedule_work(&css->dput_work); |
5075 | schedule_work(&css->dput_work); | ||
5076 | break; | 4942 | break; |
5077 | } | 4943 | } |
5078 | rcu_read_unlock(); | 4944 | rcu_read_unlock(); |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a3f358fb8a0c..0d3a1a317731 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -155,18 +155,13 @@ out: | |||
155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | 155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to |
156 | * the parent cgroup. | 156 | * the parent cgroup. |
157 | */ | 157 | */ |
158 | static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | 158 | static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) |
159 | { | 159 | { |
160 | struct hstate *h; | 160 | struct hstate *h; |
161 | struct page *page; | 161 | struct page *page; |
162 | int ret = 0, idx = 0; | 162 | int idx = 0; |
163 | 163 | ||
164 | do { | 164 | do { |
165 | if (cgroup_task_count(cgroup) || | ||
166 | !list_empty(&cgroup->children)) { | ||
167 | ret = -EBUSY; | ||
168 | goto out; | ||
169 | } | ||
170 | for_each_hstate(h) { | 165 | for_each_hstate(h) { |
171 | spin_lock(&hugetlb_lock); | 166 | spin_lock(&hugetlb_lock); |
172 | list_for_each_entry(page, &h->hugepage_activelist, lru) | 167 | list_for_each_entry(page, &h->hugepage_activelist, lru) |
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | |||
177 | } | 172 | } |
178 | cond_resched(); | 173 | cond_resched(); |
179 | } while (hugetlb_cgroup_have_usage(cgroup)); | 174 | } while (hugetlb_cgroup_have_usage(cgroup)); |
180 | out: | ||
181 | return ret; | ||
182 | } | 175 | } |
183 | 176 | ||
184 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | 177 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7acf43bf04a2..08adaaae6fcc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -2337,7 +2337,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2337 | again: | 2337 | again: |
2338 | if (*ptr) { /* css should be a valid one */ | 2338 | if (*ptr) { /* css should be a valid one */ |
2339 | memcg = *ptr; | 2339 | memcg = *ptr; |
2340 | VM_BUG_ON(css_is_removed(&memcg->css)); | ||
2341 | if (mem_cgroup_is_root(memcg)) | 2340 | if (mem_cgroup_is_root(memcg)) |
2342 | goto done; | 2341 | goto done; |
2343 | if (nr_pages == 1 && consume_stock(memcg)) | 2342 | if (nr_pages == 1 && consume_stock(memcg)) |
@@ -2477,9 +2476,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | |||
2477 | 2476 | ||
2478 | /* | 2477 | /* |
2479 | * A helper function to get mem_cgroup from ID. must be called under | 2478 | * A helper function to get mem_cgroup from ID. must be called under |
2480 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 2479 | * rcu_read_lock(). The caller is responsible for calling css_tryget if |
2481 | * it's concern. (dropping refcnt from swap can be called against removed | 2480 | * the mem_cgroup is used for charging. (dropping refcnt from swap can be |
2482 | * memcg.) | 2481 | * called against removed memcg.) |
2483 | */ | 2482 | */ |
2484 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | 2483 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) |
2485 | { | 2484 | { |
@@ -2676,13 +2675,6 @@ static int mem_cgroup_move_account(struct page *page, | |||
2676 | /* caller should have done css_get */ | 2675 | /* caller should have done css_get */ |
2677 | pc->mem_cgroup = to; | 2676 | pc->mem_cgroup = to; |
2678 | mem_cgroup_charge_statistics(to, anon, nr_pages); | 2677 | mem_cgroup_charge_statistics(to, anon, nr_pages); |
2679 | /* | ||
2680 | * We charges against "to" which may not have any tasks. Then, "to" | ||
2681 | * can be under rmdir(). But in current implementation, caller of | ||
2682 | * this function is just force_empty() and move charge, so it's | ||
2683 | * guaranteed that "to" is never removed. So, we don't check rmdir | ||
2684 | * status here. | ||
2685 | */ | ||
2686 | move_unlock_mem_cgroup(from, &flags); | 2678 | move_unlock_mem_cgroup(from, &flags); |
2687 | ret = 0; | 2679 | ret = 0; |
2688 | unlock: | 2680 | unlock: |
@@ -2696,10 +2688,27 @@ out: | |||
2696 | return ret; | 2688 | return ret; |
2697 | } | 2689 | } |
2698 | 2690 | ||
2699 | /* | 2691 | /** |
2700 | * move charges to its parent. | 2692 | * mem_cgroup_move_parent - moves page to the parent group |
2693 | * @page: the page to move | ||
2694 | * @pc: page_cgroup of the page | ||
2695 | * @child: page's cgroup | ||
2696 | * | ||
2697 | * move charges to its parent or the root cgroup if the group has no | ||
2698 | * parent (aka use_hierarchy==0). | ||
2699 | * Although this might fail (get_page_unless_zero, isolate_lru_page or | ||
2700 | * mem_cgroup_move_account fails) the failure is always temporary and | ||
2701 | * it signals a race with a page removal/uncharge or migration. In the | ||
2702 | * first case the page is on the way out and it will vanish from the LRU | ||
2703 | * on the next attempt and the call should be retried later. | ||
2704 | * Isolation from the LRU fails only if page has been isolated from | ||
2705 | * the LRU since we looked at it and that usually means either global | ||
2706 | * reclaim or migration going on. The page will either get back to the | ||
2707 | * LRU or vanish. | ||
2708 | * Finally mem_cgroup_move_account fails only if the page got uncharged | ||
2709 | * (!PageCgroupUsed) or moved to a different group. The page will | ||
2710 | * disappear in the next attempt. | ||
2701 | */ | 2711 | */ |
2702 | |||
2703 | static int mem_cgroup_move_parent(struct page *page, | 2712 | static int mem_cgroup_move_parent(struct page *page, |
2704 | struct page_cgroup *pc, | 2713 | struct page_cgroup *pc, |
2705 | struct mem_cgroup *child) | 2714 | struct mem_cgroup *child) |
@@ -2709,9 +2718,7 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2709 | unsigned long uninitialized_var(flags); | 2718 | unsigned long uninitialized_var(flags); |
2710 | int ret; | 2719 | int ret; |
2711 | 2720 | ||
2712 | /* Is ROOT ? */ | 2721 | VM_BUG_ON(mem_cgroup_is_root(child)); |
2713 | if (mem_cgroup_is_root(child)) | ||
2714 | return -EINVAL; | ||
2715 | 2722 | ||
2716 | ret = -EBUSY; | 2723 | ret = -EBUSY; |
2717 | if (!get_page_unless_zero(page)) | 2724 | if (!get_page_unless_zero(page)) |
@@ -2728,8 +2735,10 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2728 | if (!parent) | 2735 | if (!parent) |
2729 | parent = root_mem_cgroup; | 2736 | parent = root_mem_cgroup; |
2730 | 2737 | ||
2731 | if (nr_pages > 1) | 2738 | if (nr_pages > 1) { |
2739 | VM_BUG_ON(!PageTransHuge(page)); | ||
2732 | flags = compound_lock_irqsave(page); | 2740 | flags = compound_lock_irqsave(page); |
2741 | } | ||
2733 | 2742 | ||
2734 | ret = mem_cgroup_move_account(page, nr_pages, | 2743 | ret = mem_cgroup_move_account(page, nr_pages, |
2735 | pc, child, parent); | 2744 | pc, child, parent); |
@@ -2871,7 +2880,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2871 | return; | 2880 | return; |
2872 | if (!memcg) | 2881 | if (!memcg) |
2873 | return; | 2882 | return; |
2874 | cgroup_exclude_rmdir(&memcg->css); | ||
2875 | 2883 | ||
2876 | __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); | 2884 | __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); |
2877 | /* | 2885 | /* |
@@ -2885,12 +2893,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2885 | swp_entry_t ent = {.val = page_private(page)}; | 2893 | swp_entry_t ent = {.val = page_private(page)}; |
2886 | mem_cgroup_uncharge_swap(ent); | 2894 | mem_cgroup_uncharge_swap(ent); |
2887 | } | 2895 | } |
2888 | /* | ||
2889 | * At swapin, we may charge account against cgroup which has no tasks. | ||
2890 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
2891 | * In that case, we need to call pre_destroy() again. check it here. | ||
2892 | */ | ||
2893 | cgroup_release_and_wakeup_rmdir(&memcg->css); | ||
2894 | } | 2896 | } |
2895 | 2897 | ||
2896 | void mem_cgroup_commit_charge_swapin(struct page *page, | 2898 | void mem_cgroup_commit_charge_swapin(struct page *page, |
@@ -3338,8 +3340,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3338 | 3340 | ||
3339 | if (!memcg) | 3341 | if (!memcg) |
3340 | return; | 3342 | return; |
3341 | /* blocks rmdir() */ | 3343 | |
3342 | cgroup_exclude_rmdir(&memcg->css); | ||
3343 | if (!migration_ok) { | 3344 | if (!migration_ok) { |
3344 | used = oldpage; | 3345 | used = oldpage; |
3345 | unused = newpage; | 3346 | unused = newpage; |
@@ -3373,13 +3374,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3373 | */ | 3374 | */ |
3374 | if (anon) | 3375 | if (anon) |
3375 | mem_cgroup_uncharge_page(used); | 3376 | mem_cgroup_uncharge_page(used); |
3376 | /* | ||
3377 | * At migration, we may charge account against cgroup which has no | ||
3378 | * tasks. | ||
3379 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
3380 | * In that case, we need to call pre_destroy() again. check it here. | ||
3381 | */ | ||
3382 | cgroup_release_and_wakeup_rmdir(&memcg->css); | ||
3383 | } | 3377 | } |
3384 | 3378 | ||
3385 | /* | 3379 | /* |
@@ -3679,17 +3673,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3679 | return nr_reclaimed; | 3673 | return nr_reclaimed; |
3680 | } | 3674 | } |
3681 | 3675 | ||
3682 | /* | 3676 | /** |
3677 | * mem_cgroup_force_empty_list - clears LRU of a group | ||
3678 | * @memcg: group to clear | ||
3679 | * @node: NUMA node | ||
3680 | * @zid: zone id | ||
3681 | * @lru: lru to clear | ||
3682 | * | ||
3683 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't | 3683 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3684 | * reclaim the pages page themselves - it just removes the page_cgroups. | 3684 | * reclaim the pages themselves - pages are moved to the parent (or root) |
3685 | * Returns true if some page_cgroups were not freed, indicating that the caller | 3685 | * group. |
3686 | * must retry this operation. | ||
3687 | */ | 3686 | */ |
3688 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3687 | static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3689 | int node, int zid, enum lru_list lru) | 3688 | int node, int zid, enum lru_list lru) |
3690 | { | 3689 | { |
3691 | struct mem_cgroup_per_zone *mz; | 3690 | struct mem_cgroup_per_zone *mz; |
3692 | unsigned long flags, loop; | 3691 | unsigned long flags; |
3693 | struct list_head *list; | 3692 | struct list_head *list; |
3694 | struct page *busy; | 3693 | struct page *busy; |
3695 | struct zone *zone; | 3694 | struct zone *zone; |
@@ -3698,11 +3697,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3698 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3697 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
3699 | list = &mz->lruvec.lists[lru]; | 3698 | list = &mz->lruvec.lists[lru]; |
3700 | 3699 | ||
3701 | loop = mz->lru_size[lru]; | ||
3702 | /* give some margin against EBUSY etc...*/ | ||
3703 | loop += 256; | ||
3704 | busy = NULL; | 3700 | busy = NULL; |
3705 | while (loop--) { | 3701 | do { |
3706 | struct page_cgroup *pc; | 3702 | struct page_cgroup *pc; |
3707 | struct page *page; | 3703 | struct page *page; |
3708 | 3704 | ||
@@ -3728,76 +3724,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3728 | cond_resched(); | 3724 | cond_resched(); |
3729 | } else | 3725 | } else |
3730 | busy = NULL; | 3726 | busy = NULL; |
3731 | } | 3727 | } while (!list_empty(list)); |
3732 | return !list_empty(list); | ||
3733 | } | 3728 | } |
3734 | 3729 | ||
3735 | /* | 3730 | /* |
3736 | * make mem_cgroup's charge to be 0 if there is no task. | 3731 | * make mem_cgroup's charge to be 0 if there is no task by moving |
3732 | * all the charges and pages to the parent. | ||
3737 | * This enables deleting this mem_cgroup. | 3733 | * This enables deleting this mem_cgroup. |
3734 | * | ||
3735 | * Caller is responsible for holding css reference on the memcg. | ||
3738 | */ | 3736 | */ |
3739 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) | 3737 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) |
3740 | { | 3738 | { |
3741 | int ret; | 3739 | int node, zid; |
3742 | int node, zid, shrink; | ||
3743 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
3744 | struct cgroup *cgrp = memcg->css.cgroup; | ||
3745 | |||
3746 | css_get(&memcg->css); | ||
3747 | 3740 | ||
3748 | shrink = 0; | ||
3749 | /* should free all ? */ | ||
3750 | if (free_all) | ||
3751 | goto try_to_free; | ||
3752 | move_account: | ||
3753 | do { | 3741 | do { |
3754 | ret = -EBUSY; | ||
3755 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | ||
3756 | goto out; | ||
3757 | /* This is for making all *used* pages to be on LRU. */ | 3742 | /* This is for making all *used* pages to be on LRU. */ |
3758 | lru_add_drain_all(); | 3743 | lru_add_drain_all(); |
3759 | drain_all_stock_sync(memcg); | 3744 | drain_all_stock_sync(memcg); |
3760 | ret = 0; | ||
3761 | mem_cgroup_start_move(memcg); | 3745 | mem_cgroup_start_move(memcg); |
3762 | for_each_node_state(node, N_HIGH_MEMORY) { | 3746 | for_each_node_state(node, N_HIGH_MEMORY) { |
3763 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3747 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
3764 | enum lru_list lru; | 3748 | enum lru_list lru; |
3765 | for_each_lru(lru) { | 3749 | for_each_lru(lru) { |
3766 | ret = mem_cgroup_force_empty_list(memcg, | 3750 | mem_cgroup_force_empty_list(memcg, |
3767 | node, zid, lru); | 3751 | node, zid, lru); |
3768 | if (ret) | ||
3769 | break; | ||
3770 | } | 3752 | } |
3771 | } | 3753 | } |
3772 | if (ret) | ||
3773 | break; | ||
3774 | } | 3754 | } |
3775 | mem_cgroup_end_move(memcg); | 3755 | mem_cgroup_end_move(memcg); |
3776 | memcg_oom_recover(memcg); | 3756 | memcg_oom_recover(memcg); |
3777 | cond_resched(); | 3757 | cond_resched(); |
3778 | /* "ret" should also be checked to ensure all lists are empty. */ | ||
3779 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); | ||
3780 | out: | ||
3781 | css_put(&memcg->css); | ||
3782 | return ret; | ||
3783 | 3758 | ||
3784 | try_to_free: | 3759 | /* |
3760 | * This is a safety check because mem_cgroup_force_empty_list | ||
3761 | * could have raced with mem_cgroup_replace_page_cache callers | ||
3762 | * so the lru seemed empty but the page could have been added | ||
3763 | * right after the check. RES_USAGE should be safe as we always | ||
3764 | * charge before adding to the LRU. | ||
3765 | */ | ||
3766 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); | ||
3767 | } | ||
3768 | |||
3769 | /* | ||
3770 | * Reclaims as many pages from the given memcg as possible and moves | ||
3771 | * the rest to the parent. | ||
3772 | * | ||
3773 | * Caller is responsible for holding css reference for memcg. | ||
3774 | */ | ||
3775 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | ||
3776 | { | ||
3777 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
3778 | struct cgroup *cgrp = memcg->css.cgroup; | ||
3779 | |||
3785 | /* returns EBUSY if there is a task or if we come here twice. */ | 3780 | /* returns EBUSY if there is a task or if we come here twice. */ |
3786 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { | 3781 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3787 | ret = -EBUSY; | 3782 | return -EBUSY; |
3788 | goto out; | 3783 | |
3789 | } | ||
3790 | /* we call try-to-free pages for make this cgroup empty */ | 3784 | /* we call try-to-free pages for make this cgroup empty */ |
3791 | lru_add_drain_all(); | 3785 | lru_add_drain_all(); |
3792 | /* try to free all pages in this cgroup */ | 3786 | /* try to free all pages in this cgroup */ |
3793 | shrink = 1; | ||
3794 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { | 3787 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { |
3795 | int progress; | 3788 | int progress; |
3796 | 3789 | ||
3797 | if (signal_pending(current)) { | 3790 | if (signal_pending(current)) |
3798 | ret = -EINTR; | 3791 | return -EINTR; |
3799 | goto out; | 3792 | |
3800 | } | ||
3801 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, | 3793 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, |
3802 | false); | 3794 | false); |
3803 | if (!progress) { | 3795 | if (!progress) { |
@@ -3808,13 +3800,23 @@ try_to_free: | |||
3808 | 3800 | ||
3809 | } | 3801 | } |
3810 | lru_add_drain(); | 3802 | lru_add_drain(); |
3811 | /* try move_account...there may be some *locked* pages. */ | 3803 | mem_cgroup_reparent_charges(memcg); |
3812 | goto move_account; | 3804 | |
3805 | return 0; | ||
3813 | } | 3806 | } |
3814 | 3807 | ||
3815 | static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 3808 | static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
3816 | { | 3809 | { |
3817 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); | 3810 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3811 | int ret; | ||
3812 | |||
3813 | if (mem_cgroup_is_root(memcg)) | ||
3814 | return -EINVAL; | ||
3815 | css_get(&memcg->css); | ||
3816 | ret = mem_cgroup_force_empty(memcg); | ||
3817 | css_put(&memcg->css); | ||
3818 | |||
3819 | return ret; | ||
3818 | } | 3820 | } |
3819 | 3821 | ||
3820 | 3822 | ||
@@ -5001,11 +5003,11 @@ free_out: | |||
5001 | return ERR_PTR(error); | 5003 | return ERR_PTR(error); |
5002 | } | 5004 | } |
5003 | 5005 | ||
5004 | static int mem_cgroup_pre_destroy(struct cgroup *cont) | 5006 | static void mem_cgroup_pre_destroy(struct cgroup *cont) |
5005 | { | 5007 | { |
5006 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5008 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5007 | 5009 | ||
5008 | return mem_cgroup_force_empty(memcg, false); | 5010 | mem_cgroup_reparent_charges(memcg); |
5009 | } | 5011 | } |
5010 | 5012 | ||
5011 | static void mem_cgroup_destroy(struct cgroup *cont) | 5013 | static void mem_cgroup_destroy(struct cgroup *cont) |
@@ -5607,7 +5609,6 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
5607 | .base_cftypes = mem_cgroup_files, | 5609 | .base_cftypes = mem_cgroup_files, |
5608 | .early_init = 0, | 5610 | .early_init = 0, |
5609 | .use_id = 1, | 5611 | .use_id = 1, |
5610 | .__DEPRECATED_clear_css_refs = true, | ||
5611 | }; | 5612 | }; |
5612 | 5613 | ||
5613 | #ifdef CONFIG_MEMCG_SWAP | 5614 | #ifdef CONFIG_MEMCG_SWAP |