about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--include/linux/cgroup.h23
-rw-r--r--kernel/cgroup.c165
2 files changed, 112 insertions, 76 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e345d8b90046..b7bd4beae294 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
20#include <linux/workqueue.h> 20#include <linux/workqueue.h>
21#include <linux/xattr.h> 21#include <linux/xattr.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/percpu-refcount.h>
23 24
24#ifdef CONFIG_CGROUPS 25#ifdef CONFIG_CGROUPS
25 26
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
72 */ 73 */
73 struct cgroup *cgroup; 74 struct cgroup *cgroup;
74 75
75 /* 76 /* reference count - access via css_[try]get() and css_put() */
76 * State maintained by the cgroup system to allow subsystems 77 struct percpu_ref refcnt;
77 * to be "busy". Should be accessed via css_get(),
78 * css_tryget() and css_put().
79 */
80
81 atomic_t refcnt;
82 78
83 unsigned long flags; 79 unsigned long flags;
84 /* ID for this css, if possible */ 80 /* ID for this css, if possible */
@@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css)
104{ 100{
105 /* We don't need to reference count the root state */ 101 /* We don't need to reference count the root state */
106 if (!(css->flags & CSS_ROOT)) 102 if (!(css->flags & CSS_ROOT))
107 atomic_inc(&css->refcnt); 103 percpu_ref_get(&css->refcnt);
108} 104}
109 105
110extern bool __css_tryget(struct cgroup_subsys_state *css);
111
112/** 106/**
113 * css_tryget - try to obtain a reference on the specified css 107 * css_tryget - try to obtain a reference on the specified css
114 * @css: target css 108 * @css: target css
@@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
123{ 117{
124 if (css->flags & CSS_ROOT) 118 if (css->flags & CSS_ROOT)
125 return true; 119 return true;
126 return __css_tryget(css); 120 return percpu_ref_tryget(&css->refcnt);
127} 121}
128 122
129extern void __css_put(struct cgroup_subsys_state *css);
130
131/** 123/**
132 * css_put - put a css reference 124 * css_put - put a css reference
133 * @css: target css 125 * @css: target css
@@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css);
137static inline void css_put(struct cgroup_subsys_state *css) 129static inline void css_put(struct cgroup_subsys_state *css)
138{ 130{
139 if (!(css->flags & CSS_ROOT)) 131 if (!(css->flags & CSS_ROOT))
140 __css_put(css); 132 percpu_ref_put(&css->refcnt);
141} 133}
142 134
143/* bits in struct cgroup flags field */ 135/* bits in struct cgroup flags field */
@@ -231,9 +223,10 @@ struct cgroup {
231 struct list_head pidlists; 223 struct list_head pidlists;
232 struct mutex pidlist_mutex; 224 struct mutex pidlist_mutex;
233 225
234 /* For RCU-protected deletion */ 226 /* For css percpu_ref killing and RCU-protected deletion */
235 struct rcu_head rcu_head; 227 struct rcu_head rcu_head;
236 struct work_struct destroy_work; 228 struct work_struct destroy_work;
229 atomic_t css_kill_cnt;
237 230
238 /* List of events which userspace want to receive */ 231 /* List of events which userspace want to receive */
239 struct list_head event_list; 232 struct list_head event_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ebbfc043153f..2e9da7bf25cb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/* css deactivation bias, makes css->refcnt negative to deny new trygets */
67#define CSS_DEACT_BIAS INT_MIN
68
69/* 66/*
70 * cgroup_mutex is the master lock. Any modification to cgroup or its 67 * cgroup_mutex is the master lock. Any modification to cgroup or its
71 * hierarchy must be performed while holding it. 68 * hierarchy must be performed while holding it.
@@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
213static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 210static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
214 struct cftype cfts[], bool is_add); 211 struct cftype cfts[], bool is_add);
215 212
216static int css_unbias_refcnt(int refcnt)
217{
218 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
219}
220
221/* the current nr of refs, always >= 0 whether @css is deactivated or not */
222static int css_refcnt(struct cgroup_subsys_state *css)
223{
224 int v = atomic_read(&css->refcnt);
225
226 return css_unbias_refcnt(v);
227}
228
229/* convenient tests for these bits */ 213/* convenient tests for these bits */
230static inline bool cgroup_is_dead(const struct cgroup *cgrp) 214static inline bool cgroup_is_dead(const struct cgroup *cgrp)
231{ 215{
@@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work)
4139 deactivate_super(sb); 4123 deactivate_super(sb);
4140} 4124}
4141 4125
4126static void css_release(struct percpu_ref *ref)
4127{
4128 struct cgroup_subsys_state *css =
4129 container_of(ref, struct cgroup_subsys_state, refcnt);
4130
4131 schedule_work(&css->dput_work);
4132}
4133
4142static void init_cgroup_css(struct cgroup_subsys_state *css, 4134static void init_cgroup_css(struct cgroup_subsys_state *css,
4143 struct cgroup_subsys *ss, 4135 struct cgroup_subsys *ss,
4144 struct cgroup *cgrp) 4136 struct cgroup *cgrp)
4145{ 4137{
4146 css->cgroup = cgrp; 4138 css->cgroup = cgrp;
4147 atomic_set(&css->refcnt, 1);
4148 css->flags = 0; 4139 css->flags = 0;
4149 css->id = NULL; 4140 css->id = NULL;
4150 if (cgrp == dummytop) 4141 if (cgrp == dummytop)
@@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4266 err = PTR_ERR(css); 4257 err = PTR_ERR(css);
4267 goto err_free_all; 4258 goto err_free_all;
4268 } 4259 }
4260
4261 err = percpu_ref_init(&css->refcnt, css_release);
4262 if (err)
4263 goto err_free_all;
4264
4269 init_cgroup_css(css, ss, cgrp); 4265 init_cgroup_css(css, ss, cgrp);
4266
4270 if (ss->use_id) { 4267 if (ss->use_id) {
4271 err = alloc_css_id(ss, parent, cgrp); 4268 err = alloc_css_id(ss, parent, cgrp);
4272 if (err) 4269 if (err)
@@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4331 4328
4332err_free_all: 4329err_free_all:
4333 for_each_subsys(root, ss) { 4330 for_each_subsys(root, ss) {
4334 if (cgrp->subsys[ss->subsys_id]) 4331 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4332
4333 if (css) {
4334 percpu_ref_cancel_init(&css->refcnt);
4335 ss->css_free(cgrp); 4335 ss->css_free(cgrp);
4336 }
4336 } 4337 }
4337 mutex_unlock(&cgroup_mutex); 4338 mutex_unlock(&cgroup_mutex);
4338 /* Release the reference count that we took on the superblock */ 4339 /* Release the reference count that we took on the superblock */
@@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4360 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4361 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4361} 4362}
4362 4363
4364static void cgroup_css_killed(struct cgroup *cgrp)
4365{
4366 if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
4367 return;
4368
4369 /* percpu ref's of all css's are killed, kick off the next step */
4370 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
4371 schedule_work(&cgrp->destroy_work);
4372}
4373
4374static void css_ref_killed_fn(struct percpu_ref *ref)
4375{
4376 struct cgroup_subsys_state *css =
4377 container_of(ref, struct cgroup_subsys_state, refcnt);
4378
4379 cgroup_css_killed(css->cgroup);
4380}
4381
4382/**
4383 * cgroup_destroy_locked - the first stage of cgroup destruction
4384 * @cgrp: cgroup to be destroyed
4385 *
4386 * css's make use of percpu refcnts whose killing latency shouldn't be
4387 * exposed to userland and are RCU protected. Also, cgroup core needs to
4388 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4389 * invoked. To satisfy all the requirements, destruction is implemented in
4390 * the following two steps.
4391 *
4392 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4393 * userland visible parts and start killing the percpu refcnts of
4394 * css's. Set up so that the next stage will be kicked off once all
4395 * the percpu refcnts are confirmed to be killed.
4396 *
4397 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4398 * rest of destruction. Once all cgroup references are gone, the
4399 * cgroup is RCU-freed.
4400 *
4401 * This function implements s1. After this step, @cgrp is gone as far as
4402 * the userland is concerned and a new cgroup with the same name may be
4403 * created. As cgroup doesn't care about the names internally, this
4404 * doesn't cause any problem.
4405 */
4363static int cgroup_destroy_locked(struct cgroup *cgrp) 4406static int cgroup_destroy_locked(struct cgroup *cgrp)
4364 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4407 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4365{ 4408{
@@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4382 return -EBUSY; 4425 return -EBUSY;
4383 4426
4384 /* 4427 /*
4385 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4428 * Block new css_tryget() by killing css refcnts. cgroup core
4386 * removed. This makes future css_tryget() attempts fail which we 4429 * guarantees that, by the time ->css_offline() is invoked, no new
4387 * guarantee to ->css_offline() callbacks. 4430 * css reference will be given out via css_tryget(). We can't
4431 * simply call percpu_ref_kill() and proceed to offlining css's
4432 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4433 * as killed on all CPUs on return.
4434 *
4435 * Use percpu_ref_kill_and_confirm() to get notifications as each
4436 * css is confirmed to be seen as killed on all CPUs. The
4437 * notification callback keeps track of the number of css's to be
4438 * killed and schedules cgroup_offline_fn() to perform the rest of
4439 * destruction once the percpu refs of all css's are confirmed to
4440 * be killed.
4388 */ 4441 */
4442 atomic_set(&cgrp->css_kill_cnt, 1);
4389 for_each_subsys(cgrp->root, ss) { 4443 for_each_subsys(cgrp->root, ss) {
4390 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4444 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4391 4445
4392 WARN_ON(atomic_read(&css->refcnt) < 0); 4446 /*
4393 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4447 * Killing would put the base ref, but we need to keep it
4448 * alive until after ->css_offline.
4449 */
4450 percpu_ref_get(&css->refcnt);
4451
4452 atomic_inc(&cgrp->css_kill_cnt);
4453 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4394 } 4454 }
4455 cgroup_css_killed(cgrp);
4395 4456
4396 /* 4457 /*
4397 * Mark @cgrp dead. This prevents further task migration and child 4458 * Mark @cgrp dead. This prevents further task migration and child
@@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4427 } 4488 }
4428 spin_unlock(&cgrp->event_list_lock); 4489 spin_unlock(&cgrp->event_list_lock);
4429 4490
4430 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
4431 schedule_work(&cgrp->destroy_work);
4432
4433 return 0; 4491 return 0;
4434}; 4492};
4435 4493
4494/**
4495 * cgroup_offline_fn - the second step of cgroup destruction
4496 * @work: cgroup->destroy_work
4497 *
4498 * This function is invoked from a work item for a cgroup which is being
4499 * destroyed after the percpu refcnts of all css's are guaranteed to be
4500 * seen as killed on all CPUs, and performs the rest of destruction. This
4501 * is the second step of destruction described in the comment above
4502 * cgroup_destroy_locked().
4503 */
4436static void cgroup_offline_fn(struct work_struct *work) 4504static void cgroup_offline_fn(struct work_struct *work)
4437{ 4505{
4438 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 4506 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work)
4442 4510
4443 mutex_lock(&cgroup_mutex); 4511 mutex_lock(&cgroup_mutex);
4444 4512
4445 /* tell subsystems to initiate destruction */ 4513 /*
4514 * css_tryget() is guaranteed to fail now. Tell subsystems to
4515 * initate destruction.
4516 */
4446 for_each_subsys(cgrp->root, ss) 4517 for_each_subsys(cgrp->root, ss)
4447 offline_css(ss, cgrp); 4518 offline_css(ss, cgrp);
4448 4519
4449 /* 4520 /*
4450 * Put all the base refs. Each css holds an extra reference to the 4521 * Put the css refs from cgroup_destroy_locked(). Each css holds
4451 * cgroup's dentry and cgroup removal proceeds regardless of css 4522 * an extra reference to the cgroup's dentry and cgroup removal
4452 * refs. On the last put of each css, whenever that may be, the 4523 * proceeds regardless of css refs. On the last put of each css,
4453 * extra dentry ref is put so that dentry destruction happens only 4524 * whenever that may be, the extra dentry ref is put so that dentry
4454 * after all css's are released. 4525 * destruction happens only after all css's are released.
4455 */ 4526 */
4456 for_each_subsys(cgrp->root, ss) 4527 for_each_subsys(cgrp->root, ss)
4457 css_put(cgrp->subsys[ss->subsys_id]); 4528 css_put(cgrp->subsys[ss->subsys_id]);
@@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp)
5100 } 5171 }
5101} 5172}
5102 5173
5103/* Caller must verify that the css is not for root cgroup */
5104bool __css_tryget(struct cgroup_subsys_state *css)
5105{
5106 while (true) {
5107 int t, v;
5108
5109 v = css_refcnt(css);
5110 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5111 if (likely(t == v))
5112 return true;
5113 else if (t < 0)
5114 return false;
5115 cpu_relax();
5116 }
5117}
5118EXPORT_SYMBOL_GPL(__css_tryget);
5119
5120/* Caller must verify that the css is not for root cgroup */
5121void __css_put(struct cgroup_subsys_state *css)
5122{
5123 int v;
5124
5125 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
5126 if (v == 0)
5127 schedule_work(&css->dput_work);
5128}
5129EXPORT_SYMBOL_GPL(__css_put);
5130
5131/* 5174/*
5132 * Notify userspace when a cgroup is released, by running the 5175 * Notify userspace when a cgroup is released, by running the
5133 * configured release agent with the name of the cgroup (path 5176 * configured release agent with the name of the cgroup (path
@@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5245 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5288 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5246 * it's unchanged until freed. 5289 * it's unchanged until freed.
5247 */ 5290 */
5248 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 5291 cssid = rcu_dereference_raw(css->id);
5249 5292
5250 if (cssid) 5293 if (cssid)
5251 return cssid->id; 5294 return cssid->id;