author     Tejun Heo <tj@kernel.org>    2013-06-13 22:39:16 -0400
committer  Tejun Heo <tj@kernel.org>    2013-06-13 22:43:12 -0400
commit     d3daf28da16a30af95bfb303189a634a87606725 (patch)
tree       d73530e2b759b2f180daec0814124e71e8bb149b
parent     2b0e53a7c8a6972755c0f0152d7fad2289fdc5eb (diff)
cgroup: use percpu refcnt for cgroup_subsys_states
A css (cgroup_subsys_state) is how each cgroup is represented to a controller. As such, it can be used in hot paths across the various subsystems different controllers are associated with.

One of the common operations is reference counting, which up until now has been implemented using a global atomic counter and can have a significant adverse impact on scalability. For example, a css refcnt can be gotten and put multiple times by blkcg for each IO request. For high-throughput configurations which try to do as much per-cpu as possible, the frequent global refcnting can be very expensive.

In general, given the various and hugely diverse paths css's end up being used from, we need to make refcnting cheap and highly scalable. In its usage, css refcnting isn't very different from module refcnting.

This patch converts css refcnting to use the recently added percpu_ref. css_get/tryget/put() map directly to the matching percpu_ref operations, and the deactivation logic is no longer necessary as percpu_ref already supports refcnt killing.

The only complication is that, as the refcnt is per-cpu, percpu_ref_kill() by itself doesn't ensure that further tryget operations will fail, which we need to guarantee before invoking ->css_offline()'s. This is resolved by collecting kill confirmations using percpu_ref_kill_and_confirm() and initiating the offline phase of destruction only after all css refcnts are confirmed to be seen as killed on all CPUs. The previous patches already split destruction into two phases, so percpu_ref_kill_and_confirm() can be hooked up easily.

This patch also removes css_refcnt(), which was used for the RCU dereference sanity check in css_id(). While we could add a percpu refcnt API to ask the same question, css_id() itself is scheduled to be removed fairly soon, so let's not bother. Just drop the sanity check and use rcu_dereference_raw() instead.

v2: - init_cgroup_css() was calling percpu_ref_init() without checking the return value. This caused two problems: the obvious lack of error handling, and percpu_ref_init() being called from cgroup_init_subsys() before the allocators are up, which triggers warnings but doesn't cause actual problems as the refcnt isn't used for roots anyway. Fix both by moving percpu_ref_init() to cgroup_create().

    - The base references were put too early by percpu_ref_kill_and_confirm(), and cgroup_offline_fn() put the refs one extra time. This wasn't noticeable because css's go through another RCU grace period before being freed. Update cgroup_destroy_locked() to grab an extra reference before killing the refcnts. This problem was noticed by Kent.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Kent Overstreet <koverstreet@google.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Alasdair G. Kergon" <agk@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Glauber Costa <glommer@gmail.com>
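For readers unfamiliar with percpu_ref, the sketch below shows the lifecycle the message describes: init with a release callback, get/tryget/put in hot paths, and kill_and_confirm at teardown. It is a minimal illustration against the percpu_ref API as of this series, not code from the patch; struct my_obj and the my_obj_* functions are hypothetical names.

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct my_obj {
	struct percpu_ref ref;
};

static void my_obj_release(struct percpu_ref *ref)
{
	/* the last reference is gone; safe to free */
	kfree(container_of(ref, struct my_obj, ref));
}

static void my_obj_confirm_kill(struct percpu_ref *ref)
{
	/*
	 * All CPUs now see the ref as killed, so tryget can no longer
	 * succeed.  Teardown that depends on that guarantee may start.
	 */
}

static struct my_obj *my_obj_create(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;
	if (percpu_ref_init(&obj->ref, my_obj_release)) {
		kfree(obj);
		return NULL;
	}
	return obj;			/* holds the base reference */
}

static void my_obj_destroy(struct my_obj *obj)
{
	/*
	 * Switch the ref to atomic mode and drop the base reference.
	 * my_obj_confirm_kill() runs once the kill is visible on all
	 * CPUs; my_obj_release() runs when the count hits zero.
	 */
	percpu_ref_kill_and_confirm(&obj->ref, my_obj_confirm_kill);
}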
-rw-r--r--  include/linux/cgroup.h |  23
-rw-r--r--  kernel/cgroup.c        | 165
2 files changed, 112 insertions(+), 76 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e345d8b90046..b7bd4beae294 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
 #include <linux/fs.h>
+#include <linux/percpu-refcount.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
 	 */
 	struct cgroup *cgroup;
 
-	/*
-	 * State maintained by the cgroup system to allow subsystems
-	 * to be "busy". Should be accessed via css_get(),
-	 * css_tryget() and css_put().
-	 */
-
-	atomic_t refcnt;
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
 
 	unsigned long flags;
 	/* ID for this css, if possible */
@@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		atomic_inc(&css->refcnt);
+		percpu_ref_get(&css->refcnt);
 }
 
-extern bool __css_tryget(struct cgroup_subsys_state *css);
-
 /**
  * css_tryget - try to obtain a reference on the specified css
  * @css: target css
@@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
 		return true;
-	return __css_tryget(css);
+	return percpu_ref_tryget(&css->refcnt);
 }
 
-extern void __css_put(struct cgroup_subsys_state *css);
-
 /**
  * css_put - put a css reference
  * @css: target css
@@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-		__css_put(css);
+		percpu_ref_put(&css->refcnt);
 }
 
 /* bits in struct cgroup flags field */
@@ -231,9 +223,10 @@ struct cgroup {
 	struct list_head pidlists;
 	struct mutex pidlist_mutex;
 
-	/* For RCU-protected deletion */
+	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
+	atomic_t css_kill_cnt;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
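The header changes above turn css_get/css_tryget/css_put into thin wrappers over percpu_ref operations. As a rough illustration of the hot path this optimizes (hypothetical controller code, not from the patch), a per-IO charge now touches only a per-cpu counter in the common case:

static void charge_io(struct cgroup_subsys_state *css)
{
	if (!css_tryget(css))		/* percpu_ref_tryget() underneath */
		return;			/* css is on its way out */

	/* ... account the IO against this css ... */

	css_put(css);			/* percpu_ref_put() underneath */
}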
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ebbfc043153f..2e9da7bf25cb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS		INT_MIN
-
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
@@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work)
 	deactivate_super(sb);
 }
 
+static void css_release(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	schedule_work(&css->dput_work);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys *ss,
 			    struct cgroup *cgrp)
 {
 	css->cgroup = cgrp;
-	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
 	css->id = NULL;
 	if (cgrp == dummytop)
@@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		err = PTR_ERR(css);
 		goto err_free_all;
 	}
+
+	err = percpu_ref_init(&css->refcnt, css_release);
+	if (err)
+		goto err_free_all;
+
 	init_cgroup_css(css, ss, cgrp);
+
 	if (ss->use_id) {
 		err = alloc_css_id(ss, parent, cgrp);
 		if (err)
@@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 err_free_all:
 	for_each_subsys(root, ss) {
-		if (cgrp->subsys[ss->subsys_id])
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+		if (css) {
+			percpu_ref_cancel_init(&css->refcnt);
 			ss->css_free(cgrp);
+		}
 	}
 	mutex_unlock(&cgroup_mutex);
 	/* Release the reference count that we took on the superblock */
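The error path above pairs percpu_ref_init() with percpu_ref_cancel_init(), which is the right teardown for a ref that was initialized but never became visible to other users. A generic sketch of that pairing (hypothetical names, reusing the my_obj types from the earlier sketch):

static int setup_object(struct my_obj *obj)
{
	int err;

	err = percpu_ref_init(&obj->ref, my_obj_release);
	if (err)
		return err;

	err = do_more_setup(obj);	/* hypothetical later step */
	if (err) {
		/* ref never escaped: cancel the init, don't kill/put */
		percpu_ref_cancel_init(&obj->ref);
		return err;
	}
	return 0;
}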
@@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static void cgroup_css_killed(struct cgroup *cgrp)
+{
+	if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
+		return;
+
+	/* percpu ref's of all css's are killed, kick off the next step */
+	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+	schedule_work(&cgrp->destroy_work);
+}
+
+static void css_ref_killed_fn(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	cgroup_css_killed(css->cgroup);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected.  Also, cgroup core needs to
+ * guarantee that css_tryget() won't succeed by the time ->css_offline() is
+ * invoked.  To satisfy all the requirements, destruction is implemented in
+ * the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
+ *     userland visible parts and start killing the percpu refcnts of
+ *     css's.  Set up so that the next stage will be kicked off once all
+ *     the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ *     rest of destruction.  Once all cgroup references are gone, the
+ *     cgroup is RCU-freed.
+ *
+ * This function implements s1.  After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created.  As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
@@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 		return -EBUSY;
 
 	/*
-	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
-	 * removed.  This makes future css_tryget() attempts fail which we
-	 * guarantee to ->css_offline() callbacks.
+	 * Block new css_tryget() by killing css refcnts.  cgroup core
+	 * guarantees that, by the time ->css_offline() is invoked, no new
+	 * css reference will be given out via css_tryget().  We can't
+	 * simply call percpu_ref_kill() and proceed to offlining css's
+	 * because percpu_ref_kill() doesn't guarantee that the ref is seen
+	 * as killed on all CPUs on return.
+	 *
+	 * Use percpu_ref_kill_and_confirm() to get notifications as each
+	 * css is confirmed to be seen as killed on all CPUs.  The
+	 * notification callback keeps track of the number of css's to be
+	 * killed and schedules cgroup_offline_fn() to perform the rest of
+	 * destruction once the percpu refs of all css's are confirmed to
+	 * be killed.
 	 */
+	atomic_set(&cgrp->css_kill_cnt, 1);
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
-		WARN_ON(atomic_read(&css->refcnt) < 0);
-		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+		/*
+		 * Killing would put the base ref, but we need to keep it
+		 * alive until after ->css_offline.
+		 */
+		percpu_ref_get(&css->refcnt);
+
+		atomic_inc(&cgrp->css_kill_cnt);
+		percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
 	}
+	cgroup_css_killed(cgrp);
 
 	/*
 	 * Mark @cgrp dead.  This prevents further task migration and child
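The css_kill_cnt handling above is an instance of the usual completion-count idiom: seed the counter to one so the final stage can't fire while confirmations are still being queued, take one count per pending confirmation, and drop the seed when done. Reduced to its core (hypothetical names, not patch code):

static atomic_t pending;

static void next_stage(void);		/* runs exactly once at the end */

static void put_pending(void)
{
	if (atomic_dec_and_test(&pending))
		next_stage();
}

static void on_confirm(void)		/* one call per async confirmation */
{
	put_pending();
}

static void start_teardown(int nr_items)
{
	int i;

	atomic_set(&pending, 1);	/* initiator's bias */
	for (i = 0; i < nr_items; i++) {
		atomic_inc(&pending);
		/* kick off async work whose completion calls on_confirm() */
	}
	put_pending();			/* drop the bias */
}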
@@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	}
 	spin_unlock(&cgrp->event_list_lock);
 
-	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
-	schedule_work(&cgrp->destroy_work);
-
 	return 0;
 };
 
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_free_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction.  This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
 static void cgroup_offline_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work)
 
 	mutex_lock(&cgroup_mutex);
 
-	/* tell subsystems to initate destruction */
+	/*
+	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
+	 * initate destruction.
+	 */
 	for_each_subsys(cgrp->root, ss)
 		offline_css(ss, cgrp);
 
 	/*
-	 * Put all the base refs.  Each css holds an extra reference to the
-	 * cgroup's dentry and cgroup removal proceeds regardless of css
-	 * refs.  On the last put of each css, whenever that may be, the
-	 * extra dentry ref is put so that dentry destruction happens only
-	 * after all css's are released.
+	 * Put the css refs from cgroup_destroy_locked().  Each css holds
+	 * an extra reference to the cgroup's dentry and cgroup removal
+	 * proceeds regardless of css refs.  On the last put of each css,
+	 * whenever that may be, the extra dentry ref is put so that dentry
+	 * destruction happens only after all css's are released.
 	 */
 	for_each_subsys(cgrp->root, ss)
 		css_put(cgrp->subsys[ss->subsys_id]);
@@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp)
 	}
 }
 
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-	while (true) {
-		int t, v;
-
-		v = css_refcnt(css);
-		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-		if (likely(t == v))
-			return true;
-		else if (t < 0)
-			return false;
-		cpu_relax();
-	}
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-	int v;
-
-	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-	if (v == 0)
-		schedule_work(&css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path
@@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
+	cssid = rcu_dereference_raw(css->id);
 
 	if (cssid)
 		return cssid->id;