diff options
author | Tejun Heo <tj@kernel.org> | 2014-05-07 21:31:17 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2014-05-13 11:26:31 -0400 |
commit | e5ced8ebb10c20a3b349bd798b69ccabd3b25d21 (patch) | |
tree | 8af5430880909fb3346eb5c64270b34e34052973 /kernel/cgroup_freezer.c | |
parent | 5024ae29cd285ce9e736776414da645d3a91828c (diff) |
cgroup_freezer: replace freezer->lock with freezer_mutex
After 96d365e0b86e ("cgroup: make css_set_lock a rwsem and rename it
to css_set_rwsem"), css task iterators requires sleepable context as
it may block on css_set_rwsem. I missed that cgroup_freezer was
iterating tasks under IRQ-safe spinlock freezer->lock. This leads to
errors like the following on freezer state reads and transitions.
BUG: sleeping function called from invalid context at /work
/os/work/kernel/locking/rwsem.c:20
in_atomic(): 0, irqs_disabled(): 0, pid: 462, name: bash
5 locks held by bash/462:
#0: (sb_writers#7){.+.+.+}, at: [<ffffffff811f0843>] vfs_write+0x1a3/0x1c0
#1: (&of->mutex){+.+.+.}, at: [<ffffffff8126d78b>] kernfs_fop_write+0xbb/0x170
#2: (s_active#70){.+.+.+}, at: [<ffffffff8126d793>] kernfs_fop_write+0xc3/0x170
#3: (freezer_mutex){+.+...}, at: [<ffffffff81135981>] freezer_write+0x61/0x1e0
#4: (rcu_read_lock){......}, at: [<ffffffff81135973>] freezer_write+0x53/0x1e0
Preemption disabled at:[<ffffffff81104404>] console_unlock+0x1e4/0x460
CPU: 3 PID: 462 Comm: bash Not tainted 3.15.0-rc1-work+ #10
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
ffff88000916a6d0 ffff88000e0a3da0 ffffffff81cf8c96 0000000000000000
ffff88000e0a3dc8 ffffffff810cf4f2 ffffffff82388040 ffff880013aaf740
0000000000000002 ffff88000e0a3de8 ffffffff81d05974 0000000000000246
Call Trace:
[<ffffffff81cf8c96>] dump_stack+0x4e/0x7a
[<ffffffff810cf4f2>] __might_sleep+0x162/0x260
[<ffffffff81d05974>] down_read+0x24/0x60
[<ffffffff81133e87>] css_task_iter_start+0x27/0x70
[<ffffffff8113584d>] freezer_apply_state+0x5d/0x130
[<ffffffff81135a16>] freezer_write+0xf6/0x1e0
[<ffffffff8112eb88>] cgroup_file_write+0xd8/0x230
[<ffffffff8126d7b7>] kernfs_fop_write+0xe7/0x170
[<ffffffff811f0756>] vfs_write+0xb6/0x1c0
[<ffffffff811f121d>] SyS_write+0x4d/0xc0
[<ffffffff81d08292>] system_call_fastpath+0x16/0x1b
freezer->lock used to be used in hot paths but that time is long gone
and there's no reason for the lock to be IRQ-safe spinlock or even
per-cgroup. In fact, given the fact that a cgroup may contain large
number of tasks, it's not a good idea to iterate over them while
holding IRQ-safe spinlock.
Let's simplify locking by replacing per-cgroup freezer->lock with
global freezer_mutex. This also makes the comments explaining the
intricacies of policy inheritance and the locking around it as the
states are protected by a common mutex.
The conversion is mostly straight-forward. The followings are worth
mentioning.
* freezer_css_online() no longer needs double locking.
* freezer_attach() now performs propagation simply while holding
freezer_mutex. update_if_frozen() race no longer exists and the
comment is removed.
* freezer_fork() now tests whether the task is in root cgroup using
the new task_css_is_root() without doing rcu_read_lock/unlock(). If
not, it grabs freezer_mutex and performs the operation.
* freezer_read() and freezer_change_state() grab freezer_mutex across
the whole operation and pin the css while iterating so that each
descendant processing happens in sleepable context.
Fixes: 96d365e0b86e ("cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem")
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Diffstat (limited to 'kernel/cgroup_freezer.c')
-rw-r--r-- | kernel/cgroup_freezer.c | 112 |
1 files changed, 46 insertions, 66 deletions
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2bc4a2256444..12ead0b766ee 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
24 | #include <linux/mutex.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is | 27 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is |
@@ -42,9 +43,10 @@ enum freezer_state_flags { | |||
42 | struct freezer { | 43 | struct freezer { |
43 | struct cgroup_subsys_state css; | 44 | struct cgroup_subsys_state css; |
44 | unsigned int state; | 45 | unsigned int state; |
45 | spinlock_t lock; | ||
46 | }; | 46 | }; |
47 | 47 | ||
48 | static DEFINE_MUTEX(freezer_mutex); | ||
49 | |||
48 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) | 50 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) |
49 | { | 51 | { |
50 | return css ? container_of(css, struct freezer, css) : NULL; | 52 | return css ? container_of(css, struct freezer, css) : NULL; |
@@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) | |||
93 | if (!freezer) | 95 | if (!freezer) |
94 | return ERR_PTR(-ENOMEM); | 96 | return ERR_PTR(-ENOMEM); |
95 | 97 | ||
96 | spin_lock_init(&freezer->lock); | ||
97 | return &freezer->css; | 98 | return &freezer->css; |
98 | } | 99 | } |
99 | 100 | ||
@@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
110 | struct freezer *freezer = css_freezer(css); | 111 | struct freezer *freezer = css_freezer(css); |
111 | struct freezer *parent = parent_freezer(freezer); | 112 | struct freezer *parent = parent_freezer(freezer); |
112 | 113 | ||
113 | /* | 114 | mutex_lock(&freezer_mutex); |
114 | * The following double locking and freezing state inheritance | ||
115 | * guarantee that @cgroup can never escape ancestors' freezing | ||
116 | * states. See css_for_each_descendant_pre() for details. | ||
117 | */ | ||
118 | if (parent) | ||
119 | spin_lock_irq(&parent->lock); | ||
120 | spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||
121 | 115 | ||
122 | freezer->state |= CGROUP_FREEZER_ONLINE; | 116 | freezer->state |= CGROUP_FREEZER_ONLINE; |
123 | 117 | ||
@@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
126 | atomic_inc(&system_freezing_cnt); | 120 | atomic_inc(&system_freezing_cnt); |
127 | } | 121 | } |
128 | 122 | ||
129 | spin_unlock(&freezer->lock); | 123 | mutex_unlock(&freezer_mutex); |
130 | if (parent) | ||
131 | spin_unlock_irq(&parent->lock); | ||
132 | |||
133 | return 0; | 124 | return 0; |
134 | } | 125 | } |
135 | 126 | ||
@@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) | |||
144 | { | 135 | { |
145 | struct freezer *freezer = css_freezer(css); | 136 | struct freezer *freezer = css_freezer(css); |
146 | 137 | ||
147 | spin_lock_irq(&freezer->lock); | 138 | mutex_lock(&freezer_mutex); |
148 | 139 | ||
149 | if (freezer->state & CGROUP_FREEZING) | 140 | if (freezer->state & CGROUP_FREEZING) |
150 | atomic_dec(&system_freezing_cnt); | 141 | atomic_dec(&system_freezing_cnt); |
151 | 142 | ||
152 | freezer->state = 0; | 143 | freezer->state = 0; |
153 | 144 | ||
154 | spin_unlock_irq(&freezer->lock); | 145 | mutex_unlock(&freezer_mutex); |
155 | } | 146 | } |
156 | 147 | ||
157 | static void freezer_css_free(struct cgroup_subsys_state *css) | 148 | static void freezer_css_free(struct cgroup_subsys_state *css) |
@@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
175 | struct task_struct *task; | 166 | struct task_struct *task; |
176 | bool clear_frozen = false; | 167 | bool clear_frozen = false; |
177 | 168 | ||
178 | spin_lock_irq(&freezer->lock); | 169 | mutex_lock(&freezer_mutex); |
179 | 170 | ||
180 | /* | 171 | /* |
181 | * Make the new tasks conform to the current state of @new_css. | 172 | * Make the new tasks conform to the current state of @new_css. |
@@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
197 | } | 188 | } |
198 | } | 189 | } |
199 | 190 | ||
200 | spin_unlock_irq(&freezer->lock); | 191 | /* propagate FROZEN clearing upwards */ |
201 | |||
202 | /* | ||
203 | * Propagate FROZEN clearing upwards. We may race with | ||
204 | * update_if_frozen(), but as long as both work bottom-up, either | ||
205 | * update_if_frozen() sees child's FROZEN cleared or we clear the | ||
206 | * parent's FROZEN later. No parent w/ !FROZEN children can be | ||
207 | * left FROZEN. | ||
208 | */ | ||
209 | while (clear_frozen && (freezer = parent_freezer(freezer))) { | 192 | while (clear_frozen && (freezer = parent_freezer(freezer))) { |
210 | spin_lock_irq(&freezer->lock); | ||
211 | freezer->state &= ~CGROUP_FROZEN; | 193 | freezer->state &= ~CGROUP_FROZEN; |
212 | clear_frozen = freezer->state & CGROUP_FREEZING; | 194 | clear_frozen = freezer->state & CGROUP_FREEZING; |
213 | spin_unlock_irq(&freezer->lock); | ||
214 | } | 195 | } |
196 | |||
197 | mutex_unlock(&freezer_mutex); | ||
215 | } | 198 | } |
216 | 199 | ||
217 | /** | 200 | /** |
@@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task) | |||
228 | { | 211 | { |
229 | struct freezer *freezer; | 212 | struct freezer *freezer; |
230 | 213 | ||
231 | rcu_read_lock(); | ||
232 | freezer = task_freezer(task); | ||
233 | |||
234 | /* | 214 | /* |
235 | * The root cgroup is non-freezable, so we can skip locking the | 215 | * The root cgroup is non-freezable, so we can skip locking the |
236 | * freezer. This is safe regardless of race with task migration. | 216 | * freezer. This is safe regardless of race with task migration. |
@@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task) | |||
238 | * to do. If we lost and root is the new cgroup, noop is still the | 218 | * to do. If we lost and root is the new cgroup, noop is still the |
239 | * right thing to do. | 219 | * right thing to do. |
240 | */ | 220 | */ |
241 | if (!parent_freezer(freezer)) | 221 | if (task_css_is_root(task, freezer_cgrp_id)) |
242 | goto out; | 222 | return; |
243 | 223 | ||
244 | /* | 224 | mutex_lock(&freezer_mutex); |
245 | * Grab @freezer->lock and freeze @task after verifying @task still | 225 | rcu_read_lock(); |
246 | * belongs to @freezer and it's freezing. The former is for the | 226 | |
247 | * case where we have raced against task migration and lost and | 227 | freezer = task_freezer(task); |
248 | * @task is already in a different cgroup which may not be frozen. | 228 | if (freezer->state & CGROUP_FREEZING) |
249 | * This isn't strictly necessary as freeze_task() is allowed to be | ||
250 | * called spuriously but let's do it anyway for, if nothing else, | ||
251 | * documentation. | ||
252 | */ | ||
253 | spin_lock_irq(&freezer->lock); | ||
254 | if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) | ||
255 | freeze_task(task); | 229 | freeze_task(task); |
256 | spin_unlock_irq(&freezer->lock); | 230 | |
257 | out: | ||
258 | rcu_read_unlock(); | 231 | rcu_read_unlock(); |
232 | mutex_unlock(&freezer_mutex); | ||
259 | } | 233 | } |
260 | 234 | ||
261 | /** | 235 | /** |
@@ -281,22 +255,22 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
281 | struct css_task_iter it; | 255 | struct css_task_iter it; |
282 | struct task_struct *task; | 256 | struct task_struct *task; |
283 | 257 | ||
284 | WARN_ON_ONCE(!rcu_read_lock_held()); | 258 | lockdep_assert_held(&freezer_mutex); |
285 | |||
286 | spin_lock_irq(&freezer->lock); | ||
287 | 259 | ||
288 | if (!(freezer->state & CGROUP_FREEZING) || | 260 | if (!(freezer->state & CGROUP_FREEZING) || |
289 | (freezer->state & CGROUP_FROZEN)) | 261 | (freezer->state & CGROUP_FROZEN)) |
290 | goto out_unlock; | 262 | return; |
291 | 263 | ||
292 | /* are all (live) children frozen? */ | 264 | /* are all (live) children frozen? */ |
265 | rcu_read_lock(); | ||
293 | css_for_each_child(pos, css) { | 266 | css_for_each_child(pos, css) { |
294 | struct freezer *child = css_freezer(pos); | 267 | struct freezer *child = css_freezer(pos); |
295 | 268 | ||
296 | if ((child->state & CGROUP_FREEZER_ONLINE) && | 269 | if ((child->state & CGROUP_FREEZER_ONLINE) && |
297 | !(child->state & CGROUP_FROZEN)) | 270 | !(child->state & CGROUP_FROZEN)) |
298 | goto out_unlock; | 271 | return; |
299 | } | 272 | } |
273 | rcu_read_unlock(); | ||
300 | 274 | ||
301 | /* are all tasks frozen? */ | 275 | /* are all tasks frozen? */ |
302 | css_task_iter_start(css, &it); | 276 | css_task_iter_start(css, &it); |
@@ -317,21 +291,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
317 | freezer->state |= CGROUP_FROZEN; | 291 | freezer->state |= CGROUP_FROZEN; |
318 | out_iter_end: | 292 | out_iter_end: |
319 | css_task_iter_end(&it); | 293 | css_task_iter_end(&it); |
320 | out_unlock: | ||
321 | spin_unlock_irq(&freezer->lock); | ||
322 | } | 294 | } |
323 | 295 | ||
324 | static int freezer_read(struct seq_file *m, void *v) | 296 | static int freezer_read(struct seq_file *m, void *v) |
325 | { | 297 | { |
326 | struct cgroup_subsys_state *css = seq_css(m), *pos; | 298 | struct cgroup_subsys_state *css = seq_css(m), *pos; |
327 | 299 | ||
300 | mutex_lock(&freezer_mutex); | ||
328 | rcu_read_lock(); | 301 | rcu_read_lock(); |
329 | 302 | ||
330 | /* update states bottom-up */ | 303 | /* update states bottom-up */ |
331 | css_for_each_descendant_post(pos, css) | 304 | css_for_each_descendant_post(pos, css) { |
305 | if (!css_tryget(pos)) | ||
306 | continue; | ||
307 | rcu_read_unlock(); | ||
308 | |||
332 | update_if_frozen(pos); | 309 | update_if_frozen(pos); |
333 | 310 | ||
311 | rcu_read_lock(); | ||
312 | css_put(pos); | ||
313 | } | ||
314 | |||
334 | rcu_read_unlock(); | 315 | rcu_read_unlock(); |
316 | mutex_unlock(&freezer_mutex); | ||
335 | 317 | ||
336 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); | 318 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); |
337 | seq_putc(m, '\n'); | 319 | seq_putc(m, '\n'); |
@@ -373,7 +355,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
373 | unsigned int state) | 355 | unsigned int state) |
374 | { | 356 | { |
375 | /* also synchronizes against task migration, see freezer_attach() */ | 357 | /* also synchronizes against task migration, see freezer_attach() */ |
376 | lockdep_assert_held(&freezer->lock); | 358 | lockdep_assert_held(&freezer_mutex); |
377 | 359 | ||
378 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) | 360 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) |
379 | return; | 361 | return; |
@@ -414,31 +396,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
414 | * descendant will try to inherit its parent's FREEZING state as | 396 | * descendant will try to inherit its parent's FREEZING state as |
415 | * CGROUP_FREEZING_PARENT. | 397 | * CGROUP_FREEZING_PARENT. |
416 | */ | 398 | */ |
399 | mutex_lock(&freezer_mutex); | ||
417 | rcu_read_lock(); | 400 | rcu_read_lock(); |
418 | css_for_each_descendant_pre(pos, &freezer->css) { | 401 | css_for_each_descendant_pre(pos, &freezer->css) { |
419 | struct freezer *pos_f = css_freezer(pos); | 402 | struct freezer *pos_f = css_freezer(pos); |
420 | struct freezer *parent = parent_freezer(pos_f); | 403 | struct freezer *parent = parent_freezer(pos_f); |
421 | 404 | ||
422 | spin_lock_irq(&pos_f->lock); | 405 | if (!css_tryget(pos)) |
406 | continue; | ||
407 | rcu_read_unlock(); | ||
423 | 408 | ||
424 | if (pos_f == freezer) { | 409 | if (pos_f == freezer) |
425 | freezer_apply_state(pos_f, freeze, | 410 | freezer_apply_state(pos_f, freeze, |
426 | CGROUP_FREEZING_SELF); | 411 | CGROUP_FREEZING_SELF); |
427 | } else { | 412 | else |
428 | /* | ||
429 | * Our update to @parent->state is already visible | ||
430 | * which is all we need. No need to lock @parent. | ||
431 | * For more info on synchronization, see | ||
432 | * freezer_post_create(). | ||
433 | */ | ||
434 | freezer_apply_state(pos_f, | 413 | freezer_apply_state(pos_f, |
435 | parent->state & CGROUP_FREEZING, | 414 | parent->state & CGROUP_FREEZING, |
436 | CGROUP_FREEZING_PARENT); | 415 | CGROUP_FREEZING_PARENT); |
437 | } | ||
438 | 416 | ||
439 | spin_unlock_irq(&pos_f->lock); | 417 | rcu_read_lock(); |
418 | css_put(pos); | ||
440 | } | 419 | } |
441 | rcu_read_unlock(); | 420 | rcu_read_unlock(); |
421 | mutex_unlock(&freezer_mutex); | ||
442 | } | 422 | } |
443 | 423 | ||
444 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, | 424 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, |