diff options
-rw-r--r-- | include/linux/cgroup.h | 23 | ||||
-rw-r--r-- | kernel/cgroup.c | 165 |
2 files changed, 112 insertions, 76 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e345d8b90046..b7bd4beae294 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/workqueue.h> | 20 | #include <linux/workqueue.h> |
21 | #include <linux/xattr.h> | 21 | #include <linux/xattr.h> |
22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
23 | #include <linux/percpu-refcount.h> | ||
23 | 24 | ||
24 | #ifdef CONFIG_CGROUPS | 25 | #ifdef CONFIG_CGROUPS |
25 | 26 | ||
@@ -72,13 +73,8 @@ struct cgroup_subsys_state { | |||
72 | */ | 73 | */ |
73 | struct cgroup *cgroup; | 74 | struct cgroup *cgroup; |
74 | 75 | ||
75 | /* | 76 | /* reference count - access via css_[try]get() and css_put() */ |
76 | * State maintained by the cgroup system to allow subsystems | 77 | struct percpu_ref refcnt; |
77 | * to be "busy". Should be accessed via css_get(), | ||
78 | * css_tryget() and css_put(). | ||
79 | */ | ||
80 | |||
81 | atomic_t refcnt; | ||
82 | 78 | ||
83 | unsigned long flags; | 79 | unsigned long flags; |
84 | /* ID for this css, if possible */ | 80 | /* ID for this css, if possible */ |
@@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css) | |||
104 | { | 100 | { |
105 | /* We don't need to reference count the root state */ | 101 | /* We don't need to reference count the root state */ |
106 | if (!(css->flags & CSS_ROOT)) | 102 | if (!(css->flags & CSS_ROOT)) |
107 | atomic_inc(&css->refcnt); | 103 | percpu_ref_get(&css->refcnt); |
108 | } | 104 | } |
109 | 105 | ||
110 | extern bool __css_tryget(struct cgroup_subsys_state *css); | ||
111 | |||
112 | /** | 106 | /** |
113 | * css_tryget - try to obtain a reference on the specified css | 107 | * css_tryget - try to obtain a reference on the specified css |
114 | * @css: target css | 108 | * @css: target css |
@@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) | |||
123 | { | 117 | { |
124 | if (css->flags & CSS_ROOT) | 118 | if (css->flags & CSS_ROOT) |
125 | return true; | 119 | return true; |
126 | return __css_tryget(css); | 120 | return percpu_ref_tryget(&css->refcnt); |
127 | } | 121 | } |
128 | 122 | ||
129 | extern void __css_put(struct cgroup_subsys_state *css); | ||
130 | |||
131 | /** | 123 | /** |
132 | * css_put - put a css reference | 124 | * css_put - put a css reference |
133 | * @css: target css | 125 | * @css: target css |
@@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css); | |||
137 | static inline void css_put(struct cgroup_subsys_state *css) | 129 | static inline void css_put(struct cgroup_subsys_state *css) |
138 | { | 130 | { |
139 | if (!(css->flags & CSS_ROOT)) | 131 | if (!(css->flags & CSS_ROOT)) |
140 | __css_put(css); | 132 | percpu_ref_put(&css->refcnt); |
141 | } | 133 | } |
142 | 134 | ||
143 | /* bits in struct cgroup flags field */ | 135 | /* bits in struct cgroup flags field */ |
@@ -231,9 +223,10 @@ struct cgroup { | |||
231 | struct list_head pidlists; | 223 | struct list_head pidlists; |
232 | struct mutex pidlist_mutex; | 224 | struct mutex pidlist_mutex; |
233 | 225 | ||
234 | /* For RCU-protected deletion */ | 226 | /* For css percpu_ref killing and RCU-protected deletion */ |
235 | struct rcu_head rcu_head; | 227 | struct rcu_head rcu_head; |
236 | struct work_struct destroy_work; | 228 | struct work_struct destroy_work; |
229 | atomic_t css_kill_cnt; | ||
237 | 230 | ||
238 | /* List of events which userspace want to receive */ | 231 | /* List of events which userspace want to receive */ |
239 | struct list_head event_list; | 232 | struct list_head event_list; |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ebbfc043153f..2e9da7bf25cb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -63,9 +63,6 @@ | |||
63 | 63 | ||
64 | #include <linux/atomic.h> | 64 | #include <linux/atomic.h> |
65 | 65 | ||
66 | /* css deactivation bias, makes css->refcnt negative to deny new trygets */ | ||
67 | #define CSS_DEACT_BIAS INT_MIN | ||
68 | |||
69 | /* | 66 | /* |
70 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 67 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
71 | * hierarchy must be performed while holding it. | 68 | * hierarchy must be performed while holding it. |
@@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); | |||
213 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 210 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
214 | struct cftype cfts[], bool is_add); | 211 | struct cftype cfts[], bool is_add); |
215 | 212 | ||
216 | static int css_unbias_refcnt(int refcnt) | ||
217 | { | ||
218 | return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; | ||
219 | } | ||
220 | |||
221 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | ||
222 | static int css_refcnt(struct cgroup_subsys_state *css) | ||
223 | { | ||
224 | int v = atomic_read(&css->refcnt); | ||
225 | |||
226 | return css_unbias_refcnt(v); | ||
227 | } | ||
228 | |||
229 | /* convenient tests for these bits */ | 213 | /* convenient tests for these bits */ |
230 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | 214 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) |
231 | { | 215 | { |
@@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work) | |||
4139 | deactivate_super(sb); | 4123 | deactivate_super(sb); |
4140 | } | 4124 | } |
4141 | 4125 | ||
4126 | static void css_release(struct percpu_ref *ref) | ||
4127 | { | ||
4128 | struct cgroup_subsys_state *css = | ||
4129 | container_of(ref, struct cgroup_subsys_state, refcnt); | ||
4130 | |||
4131 | schedule_work(&css->dput_work); | ||
4132 | } | ||
4133 | |||
4142 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 4134 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
4143 | struct cgroup_subsys *ss, | 4135 | struct cgroup_subsys *ss, |
4144 | struct cgroup *cgrp) | 4136 | struct cgroup *cgrp) |
4145 | { | 4137 | { |
4146 | css->cgroup = cgrp; | 4138 | css->cgroup = cgrp; |
4147 | atomic_set(&css->refcnt, 1); | ||
4148 | css->flags = 0; | 4139 | css->flags = 0; |
4149 | css->id = NULL; | 4140 | css->id = NULL; |
4150 | if (cgrp == dummytop) | 4141 | if (cgrp == dummytop) |
@@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4266 | err = PTR_ERR(css); | 4257 | err = PTR_ERR(css); |
4267 | goto err_free_all; | 4258 | goto err_free_all; |
4268 | } | 4259 | } |
4260 | |||
4261 | err = percpu_ref_init(&css->refcnt, css_release); | ||
4262 | if (err) | ||
4263 | goto err_free_all; | ||
4264 | |||
4269 | init_cgroup_css(css, ss, cgrp); | 4265 | init_cgroup_css(css, ss, cgrp); |
4266 | |||
4270 | if (ss->use_id) { | 4267 | if (ss->use_id) { |
4271 | err = alloc_css_id(ss, parent, cgrp); | 4268 | err = alloc_css_id(ss, parent, cgrp); |
4272 | if (err) | 4269 | if (err) |
@@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4331 | 4328 | ||
4332 | err_free_all: | 4329 | err_free_all: |
4333 | for_each_subsys(root, ss) { | 4330 | for_each_subsys(root, ss) { |
4334 | if (cgrp->subsys[ss->subsys_id]) | 4331 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4332 | |||
4333 | if (css) { | ||
4334 | percpu_ref_cancel_init(&css->refcnt); | ||
4335 | ss->css_free(cgrp); | 4335 | ss->css_free(cgrp); |
4336 | } | ||
4336 | } | 4337 | } |
4337 | mutex_unlock(&cgroup_mutex); | 4338 | mutex_unlock(&cgroup_mutex); |
4338 | /* Release the reference count that we took on the superblock */ | 4339 | /* Release the reference count that we took on the superblock */ |
@@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
4360 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4361 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
4361 | } | 4362 | } |
4362 | 4363 | ||
4364 | static void cgroup_css_killed(struct cgroup *cgrp) | ||
4365 | { | ||
4366 | if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) | ||
4367 | return; | ||
4368 | |||
4369 | /* percpu ref's of all css's are killed, kick off the next step */ | ||
4370 | INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); | ||
4371 | schedule_work(&cgrp->destroy_work); | ||
4372 | } | ||
4373 | |||
4374 | static void css_ref_killed_fn(struct percpu_ref *ref) | ||
4375 | { | ||
4376 | struct cgroup_subsys_state *css = | ||
4377 | container_of(ref, struct cgroup_subsys_state, refcnt); | ||
4378 | |||
4379 | cgroup_css_killed(css->cgroup); | ||
4380 | } | ||
4381 | |||
4382 | /** | ||
4383 | * cgroup_destroy_locked - the first stage of cgroup destruction | ||
4384 | * @cgrp: cgroup to be destroyed | ||
4385 | * | ||
4386 | * css's make use of percpu refcnts whose killing latency shouldn't be | ||
4387 | * exposed to userland and are RCU protected. Also, cgroup core needs to | ||
4388 | * guarantee that css_tryget() won't succeed by the time ->css_offline() is | ||
4389 | * invoked. To satisfy all the requirements, destruction is implemented in | ||
4390 | * the following two steps. | ||
4391 | * | ||
4392 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all | ||
4393 | * userland visible parts and start killing the percpu refcnts of | ||
4394 | * css's. Set up so that the next stage will be kicked off once all | ||
4395 | * the percpu refcnts are confirmed to be killed. | ||
4396 | * | ||
4397 | * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the | ||
4398 | * rest of destruction. Once all cgroup references are gone, the | ||
4399 | * cgroup is RCU-freed. | ||
4400 | * | ||
4401 | * This function implements s1. After this step, @cgrp is gone as far as | ||
4402 | * the userland is concerned and a new cgroup with the same name may be | ||
4403 | * created. As cgroup doesn't care about the names internally, this | ||
4404 | * doesn't cause any problem. | ||
4405 | */ | ||
4363 | static int cgroup_destroy_locked(struct cgroup *cgrp) | 4406 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
4364 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4407 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4365 | { | 4408 | { |
@@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4382 | return -EBUSY; | 4425 | return -EBUSY; |
4383 | 4426 | ||
4384 | /* | 4427 | /* |
4385 | * Block new css_tryget() by deactivating refcnt and mark @cgrp | 4428 | * Block new css_tryget() by killing css refcnts. cgroup core |
4386 | * removed. This makes future css_tryget() attempts fail which we | 4429 | * guarantees that, by the time ->css_offline() is invoked, no new |
4387 | * guarantee to ->css_offline() callbacks. | 4430 | * css reference will be given out via css_tryget(). We can't |
4431 | * simply call percpu_ref_kill() and proceed to offlining css's | ||
4432 | * because percpu_ref_kill() doesn't guarantee that the ref is seen | ||
4433 | * as killed on all CPUs on return. | ||
4434 | * | ||
4435 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4436 | * css is confirmed to be seen as killed on all CPUs. The | ||
4437 | * notification callback keeps track of the number of css's to be | ||
4438 | * killed and schedules cgroup_offline_fn() to perform the rest of | ||
4439 | * destruction once the percpu refs of all css's are confirmed to | ||
4440 | * be killed. | ||
4388 | */ | 4441 | */ |
4442 | atomic_set(&cgrp->css_kill_cnt, 1); | ||
4389 | for_each_subsys(cgrp->root, ss) { | 4443 | for_each_subsys(cgrp->root, ss) { |
4390 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4444 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4391 | 4445 | ||
4392 | WARN_ON(atomic_read(&css->refcnt) < 0); | 4446 | /* |
4393 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | 4447 | * Killing would put the base ref, but we need to keep it |
4448 | * alive until after ->css_offline. | ||
4449 | */ | ||
4450 | percpu_ref_get(&css->refcnt); | ||
4451 | |||
4452 | atomic_inc(&cgrp->css_kill_cnt); | ||
4453 | percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); | ||
4394 | } | 4454 | } |
4455 | cgroup_css_killed(cgrp); | ||
4395 | 4456 | ||
4396 | /* | 4457 | /* |
4397 | * Mark @cgrp dead. This prevents further task migration and child | 4458 | * Mark @cgrp dead. This prevents further task migration and child |
@@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4427 | } | 4488 | } |
4428 | spin_unlock(&cgrp->event_list_lock); | 4489 | spin_unlock(&cgrp->event_list_lock); |
4429 | 4490 | ||
4430 | INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); | ||
4431 | schedule_work(&cgrp->destroy_work); | ||
4432 | |||
4433 | return 0; | 4491 | return 0; |
4434 | }; | 4492 | }; |
4435 | 4493 | ||
4494 | /** | ||
4495 | * cgroup_offline_fn - the second step of cgroup destruction | ||
4496 | * @work: cgroup->destroy_work | ||
4497 | * | ||
4498 | * This function is invoked from a work item for a cgroup which is being | ||
4499 | * destroyed after the percpu refcnts of all css's are guaranteed to be | ||
4500 | * seen as killed on all CPUs, and performs the rest of destruction. This | ||
4501 | * is the second step of destruction described in the comment above | ||
4502 | * cgroup_destroy_locked(). | ||
4503 | */ | ||
4436 | static void cgroup_offline_fn(struct work_struct *work) | 4504 | static void cgroup_offline_fn(struct work_struct *work) |
4437 | { | 4505 | { |
4438 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | 4506 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); |
@@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work) | |||
4442 | 4510 | ||
4443 | mutex_lock(&cgroup_mutex); | 4511 | mutex_lock(&cgroup_mutex); |
4444 | 4512 | ||
4445 | /* tell subsystems to initate destruction */ | 4513 | /* |
4514 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
4515 | * initiate destruction. | ||
4516 | */ | ||
4446 | for_each_subsys(cgrp->root, ss) | 4517 | for_each_subsys(cgrp->root, ss) |
4447 | offline_css(ss, cgrp); | 4518 | offline_css(ss, cgrp); |
4448 | 4519 | ||
4449 | /* | 4520 | /* |
4450 | * Put all the base refs. Each css holds an extra reference to the | 4521 | * Put the css refs from cgroup_destroy_locked(). Each css holds |
4451 | * cgroup's dentry and cgroup removal proceeds regardless of css | 4522 | * an extra reference to the cgroup's dentry and cgroup removal |
4452 | * refs. On the last put of each css, whenever that may be, the | 4523 | * proceeds regardless of css refs. On the last put of each css, |
4453 | * extra dentry ref is put so that dentry destruction happens only | 4524 | * whenever that may be, the extra dentry ref is put so that dentry |
4454 | * after all css's are released. | 4525 | * destruction happens only after all css's are released. |
4455 | */ | 4526 | */ |
4456 | for_each_subsys(cgrp->root, ss) | 4527 | for_each_subsys(cgrp->root, ss) |
4457 | css_put(cgrp->subsys[ss->subsys_id]); | 4528 | css_put(cgrp->subsys[ss->subsys_id]); |
@@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp) | |||
5100 | } | 5171 | } |
5101 | } | 5172 | } |
5102 | 5173 | ||
5103 | /* Caller must verify that the css is not for root cgroup */ | ||
5104 | bool __css_tryget(struct cgroup_subsys_state *css) | ||
5105 | { | ||
5106 | while (true) { | ||
5107 | int t, v; | ||
5108 | |||
5109 | v = css_refcnt(css); | ||
5110 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
5111 | if (likely(t == v)) | ||
5112 | return true; | ||
5113 | else if (t < 0) | ||
5114 | return false; | ||
5115 | cpu_relax(); | ||
5116 | } | ||
5117 | } | ||
5118 | EXPORT_SYMBOL_GPL(__css_tryget); | ||
5119 | |||
5120 | /* Caller must verify that the css is not for root cgroup */ | ||
5121 | void __css_put(struct cgroup_subsys_state *css) | ||
5122 | { | ||
5123 | int v; | ||
5124 | |||
5125 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); | ||
5126 | if (v == 0) | ||
5127 | schedule_work(&css->dput_work); | ||
5128 | } | ||
5129 | EXPORT_SYMBOL_GPL(__css_put); | ||
5130 | |||
5131 | /* | 5174 | /* |
5132 | * Notify userspace when a cgroup is released, by running the | 5175 | * Notify userspace when a cgroup is released, by running the |
5133 | * configured release agent with the name of the cgroup (path | 5176 | * configured release agent with the name of the cgroup (path |
@@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
5245 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 5288 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
5246 | * it's unchanged until freed. | 5289 | * it's unchanged until freed. |
5247 | */ | 5290 | */ |
5248 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); | 5291 | cssid = rcu_dereference_raw(css->id); |
5249 | 5292 | ||
5250 | if (cssid) | 5293 | if (cssid) |
5251 | return cssid->id; | 5294 | return cssid->id; |