diff options
| author | Tejun Heo <tj@kernel.org> | 2014-05-07 21:31:17 -0400 |
|---|---|---|
| committer | Tejun Heo <tj@kernel.org> | 2014-05-13 11:26:31 -0400 |
| commit | e5ced8ebb10c20a3b349bd798b69ccabd3b25d21 (patch) | |
| tree | 8af5430880909fb3346eb5c64270b34e34052973 | |
| parent | 5024ae29cd285ce9e736776414da645d3a91828c (diff) | |
cgroup_freezer: replace freezer->lock with freezer_mutex
After 96d365e0b86e ("cgroup: make css_set_lock a rwsem and rename it
to css_set_rwsem"), css task iterators require a sleepable context as
they may block on css_set_rwsem. I missed that cgroup_freezer was
iterating tasks under IRQ-safe spinlock freezer->lock. This leads to
errors like the following on freezer state reads and transitions.
BUG: sleeping function called from invalid context at /work
/os/work/kernel/locking/rwsem.c:20
in_atomic(): 0, irqs_disabled(): 0, pid: 462, name: bash
5 locks held by bash/462:
#0: (sb_writers#7){.+.+.+}, at: [<ffffffff811f0843>] vfs_write+0x1a3/0x1c0
#1: (&of->mutex){+.+.+.}, at: [<ffffffff8126d78b>] kernfs_fop_write+0xbb/0x170
#2: (s_active#70){.+.+.+}, at: [<ffffffff8126d793>] kernfs_fop_write+0xc3/0x170
#3: (freezer_mutex){+.+...}, at: [<ffffffff81135981>] freezer_write+0x61/0x1e0
#4: (rcu_read_lock){......}, at: [<ffffffff81135973>] freezer_write+0x53/0x1e0
Preemption disabled at:[<ffffffff81104404>] console_unlock+0x1e4/0x460
CPU: 3 PID: 462 Comm: bash Not tainted 3.15.0-rc1-work+ #10
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
ffff88000916a6d0 ffff88000e0a3da0 ffffffff81cf8c96 0000000000000000
ffff88000e0a3dc8 ffffffff810cf4f2 ffffffff82388040 ffff880013aaf740
0000000000000002 ffff88000e0a3de8 ffffffff81d05974 0000000000000246
Call Trace:
[<ffffffff81cf8c96>] dump_stack+0x4e/0x7a
[<ffffffff810cf4f2>] __might_sleep+0x162/0x260
[<ffffffff81d05974>] down_read+0x24/0x60
[<ffffffff81133e87>] css_task_iter_start+0x27/0x70
[<ffffffff8113584d>] freezer_apply_state+0x5d/0x130
[<ffffffff81135a16>] freezer_write+0xf6/0x1e0
[<ffffffff8112eb88>] cgroup_file_write+0xd8/0x230
[<ffffffff8126d7b7>] kernfs_fop_write+0xe7/0x170
[<ffffffff811f0756>] vfs_write+0xb6/0x1c0
[<ffffffff811f121d>] SyS_write+0x4d/0xc0
[<ffffffff81d08292>] system_call_fastpath+0x16/0x1b
freezer->lock used to be used in hot paths but that time is long gone
and there's no reason for the lock to be IRQ-safe spinlock or even
per-cgroup. In fact, given the fact that a cgroup may contain large
number of tasks, it's not a good idea to iterate over them while
holding IRQ-safe spinlock.
Let's simplify locking by replacing per-cgroup freezer->lock with
global freezer_mutex. This also makes the comments explaining the
intricacies of policy inheritance and the locking around it
unnecessary, as the states are now protected by a common mutex.
The conversion is mostly straight-forward. The following points are
worth mentioning.
* freezer_css_online() no longer needs double locking.
* freezer_attach() now performs propagation simply while holding
freezer_mutex. update_if_frozen() race no longer exists and the
comment is removed.
* freezer_fork() now tests whether the task is in root cgroup using
the new task_css_is_root() without doing rcu_read_lock/unlock(). If
not, it grabs freezer_mutex and performs the operation.
* freezer_read() and freezer_change_state() grab freezer_mutex across
the whole operation and pin the css while iterating so that each
descendant processing happens in sleepable context.
Fixes: 96d365e0b86e ("cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem")
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
| -rw-r--r-- | kernel/cgroup_freezer.c | 112 |
1 files changed, 46 insertions, 66 deletions
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2bc4a2256444..12ead0b766ee 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
| 22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
| 23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
| 24 | #include <linux/mutex.h> | ||
| 24 | 25 | ||
| 25 | /* | 26 | /* |
| 26 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is | 27 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is |
| @@ -42,9 +43,10 @@ enum freezer_state_flags { | |||
| 42 | struct freezer { | 43 | struct freezer { |
| 43 | struct cgroup_subsys_state css; | 44 | struct cgroup_subsys_state css; |
| 44 | unsigned int state; | 45 | unsigned int state; |
| 45 | spinlock_t lock; | ||
| 46 | }; | 46 | }; |
| 47 | 47 | ||
| 48 | static DEFINE_MUTEX(freezer_mutex); | ||
| 49 | |||
| 48 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) | 50 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) |
| 49 | { | 51 | { |
| 50 | return css ? container_of(css, struct freezer, css) : NULL; | 52 | return css ? container_of(css, struct freezer, css) : NULL; |
| @@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 93 | if (!freezer) | 95 | if (!freezer) |
| 94 | return ERR_PTR(-ENOMEM); | 96 | return ERR_PTR(-ENOMEM); |
| 95 | 97 | ||
| 96 | spin_lock_init(&freezer->lock); | ||
| 97 | return &freezer->css; | 98 | return &freezer->css; |
| 98 | } | 99 | } |
| 99 | 100 | ||
| @@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
| 110 | struct freezer *freezer = css_freezer(css); | 111 | struct freezer *freezer = css_freezer(css); |
| 111 | struct freezer *parent = parent_freezer(freezer); | 112 | struct freezer *parent = parent_freezer(freezer); |
| 112 | 113 | ||
| 113 | /* | 114 | mutex_lock(&freezer_mutex); |
| 114 | * The following double locking and freezing state inheritance | ||
| 115 | * guarantee that @cgroup can never escape ancestors' freezing | ||
| 116 | * states. See css_for_each_descendant_pre() for details. | ||
| 117 | */ | ||
| 118 | if (parent) | ||
| 119 | spin_lock_irq(&parent->lock); | ||
| 120 | spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||
| 121 | 115 | ||
| 122 | freezer->state |= CGROUP_FREEZER_ONLINE; | 116 | freezer->state |= CGROUP_FREEZER_ONLINE; |
| 123 | 117 | ||
| @@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
| 126 | atomic_inc(&system_freezing_cnt); | 120 | atomic_inc(&system_freezing_cnt); |
| 127 | } | 121 | } |
| 128 | 122 | ||
| 129 | spin_unlock(&freezer->lock); | 123 | mutex_unlock(&freezer_mutex); |
| 130 | if (parent) | ||
| 131 | spin_unlock_irq(&parent->lock); | ||
| 132 | |||
| 133 | return 0; | 124 | return 0; |
| 134 | } | 125 | } |
| 135 | 126 | ||
| @@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) | |||
| 144 | { | 135 | { |
| 145 | struct freezer *freezer = css_freezer(css); | 136 | struct freezer *freezer = css_freezer(css); |
| 146 | 137 | ||
| 147 | spin_lock_irq(&freezer->lock); | 138 | mutex_lock(&freezer_mutex); |
| 148 | 139 | ||
| 149 | if (freezer->state & CGROUP_FREEZING) | 140 | if (freezer->state & CGROUP_FREEZING) |
| 150 | atomic_dec(&system_freezing_cnt); | 141 | atomic_dec(&system_freezing_cnt); |
| 151 | 142 | ||
| 152 | freezer->state = 0; | 143 | freezer->state = 0; |
| 153 | 144 | ||
| 154 | spin_unlock_irq(&freezer->lock); | 145 | mutex_unlock(&freezer_mutex); |
| 155 | } | 146 | } |
| 156 | 147 | ||
| 157 | static void freezer_css_free(struct cgroup_subsys_state *css) | 148 | static void freezer_css_free(struct cgroup_subsys_state *css) |
| @@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
| 175 | struct task_struct *task; | 166 | struct task_struct *task; |
| 176 | bool clear_frozen = false; | 167 | bool clear_frozen = false; |
| 177 | 168 | ||
| 178 | spin_lock_irq(&freezer->lock); | 169 | mutex_lock(&freezer_mutex); |
| 179 | 170 | ||
| 180 | /* | 171 | /* |
| 181 | * Make the new tasks conform to the current state of @new_css. | 172 | * Make the new tasks conform to the current state of @new_css. |
| @@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
| 197 | } | 188 | } |
| 198 | } | 189 | } |
| 199 | 190 | ||
| 200 | spin_unlock_irq(&freezer->lock); | 191 | /* propagate FROZEN clearing upwards */ |
| 201 | |||
| 202 | /* | ||
| 203 | * Propagate FROZEN clearing upwards. We may race with | ||
| 204 | * update_if_frozen(), but as long as both work bottom-up, either | ||
| 205 | * update_if_frozen() sees child's FROZEN cleared or we clear the | ||
| 206 | * parent's FROZEN later. No parent w/ !FROZEN children can be | ||
| 207 | * left FROZEN. | ||
| 208 | */ | ||
| 209 | while (clear_frozen && (freezer = parent_freezer(freezer))) { | 192 | while (clear_frozen && (freezer = parent_freezer(freezer))) { |
| 210 | spin_lock_irq(&freezer->lock); | ||
| 211 | freezer->state &= ~CGROUP_FROZEN; | 193 | freezer->state &= ~CGROUP_FROZEN; |
| 212 | clear_frozen = freezer->state & CGROUP_FREEZING; | 194 | clear_frozen = freezer->state & CGROUP_FREEZING; |
| 213 | spin_unlock_irq(&freezer->lock); | ||
| 214 | } | 195 | } |
| 196 | |||
| 197 | mutex_unlock(&freezer_mutex); | ||
| 215 | } | 198 | } |
| 216 | 199 | ||
| 217 | /** | 200 | /** |
| @@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task) | |||
| 228 | { | 211 | { |
| 229 | struct freezer *freezer; | 212 | struct freezer *freezer; |
| 230 | 213 | ||
| 231 | rcu_read_lock(); | ||
| 232 | freezer = task_freezer(task); | ||
| 233 | |||
| 234 | /* | 214 | /* |
| 235 | * The root cgroup is non-freezable, so we can skip locking the | 215 | * The root cgroup is non-freezable, so we can skip locking the |
| 236 | * freezer. This is safe regardless of race with task migration. | 216 | * freezer. This is safe regardless of race with task migration. |
| @@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task) | |||
| 238 | * to do. If we lost and root is the new cgroup, noop is still the | 218 | * to do. If we lost and root is the new cgroup, noop is still the |
| 239 | * right thing to do. | 219 | * right thing to do. |
| 240 | */ | 220 | */ |
| 241 | if (!parent_freezer(freezer)) | 221 | if (task_css_is_root(task, freezer_cgrp_id)) |
| 242 | goto out; | 222 | return; |
| 243 | 223 | ||
| 244 | /* | 224 | mutex_lock(&freezer_mutex); |
| 245 | * Grab @freezer->lock and freeze @task after verifying @task still | 225 | rcu_read_lock(); |
| 246 | * belongs to @freezer and it's freezing. The former is for the | 226 | |
| 247 | * case where we have raced against task migration and lost and | 227 | freezer = task_freezer(task); |
| 248 | * @task is already in a different cgroup which may not be frozen. | 228 | if (freezer->state & CGROUP_FREEZING) |
| 249 | * This isn't strictly necessary as freeze_task() is allowed to be | ||
| 250 | * called spuriously but let's do it anyway for, if nothing else, | ||
| 251 | * documentation. | ||
| 252 | */ | ||
| 253 | spin_lock_irq(&freezer->lock); | ||
| 254 | if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) | ||
| 255 | freeze_task(task); | 229 | freeze_task(task); |
| 256 | spin_unlock_irq(&freezer->lock); | 230 | |
| 257 | out: | ||
| 258 | rcu_read_unlock(); | 231 | rcu_read_unlock(); |
| 232 | mutex_unlock(&freezer_mutex); | ||
| 259 | } | 233 | } |
| 260 | 234 | ||
| 261 | /** | 235 | /** |
| @@ -281,22 +255,22 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
| 281 | struct css_task_iter it; | 255 | struct css_task_iter it; |
| 282 | struct task_struct *task; | 256 | struct task_struct *task; |
| 283 | 257 | ||
| 284 | WARN_ON_ONCE(!rcu_read_lock_held()); | 258 | lockdep_assert_held(&freezer_mutex); |
| 285 | |||
| 286 | spin_lock_irq(&freezer->lock); | ||
| 287 | 259 | ||
| 288 | if (!(freezer->state & CGROUP_FREEZING) || | 260 | if (!(freezer->state & CGROUP_FREEZING) || |
| 289 | (freezer->state & CGROUP_FROZEN)) | 261 | (freezer->state & CGROUP_FROZEN)) |
| 290 | goto out_unlock; | 262 | return; |
| 291 | 263 | ||
| 292 | /* are all (live) children frozen? */ | 264 | /* are all (live) children frozen? */ |
| 265 | rcu_read_lock(); | ||
| 293 | css_for_each_child(pos, css) { | 266 | css_for_each_child(pos, css) { |
| 294 | struct freezer *child = css_freezer(pos); | 267 | struct freezer *child = css_freezer(pos); |
| 295 | 268 | ||
| 296 | if ((child->state & CGROUP_FREEZER_ONLINE) && | 269 | if ((child->state & CGROUP_FREEZER_ONLINE) && |
| 297 | !(child->state & CGROUP_FROZEN)) | 270 | !(child->state & CGROUP_FROZEN)) |
| 298 | goto out_unlock; | 271 | return; |
| 299 | } | 272 | } |
| 273 | rcu_read_unlock(); | ||
| 300 | 274 | ||
| 301 | /* are all tasks frozen? */ | 275 | /* are all tasks frozen? */ |
| 302 | css_task_iter_start(css, &it); | 276 | css_task_iter_start(css, &it); |
| @@ -317,21 +291,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
| 317 | freezer->state |= CGROUP_FROZEN; | 291 | freezer->state |= CGROUP_FROZEN; |
| 318 | out_iter_end: | 292 | out_iter_end: |
| 319 | css_task_iter_end(&it); | 293 | css_task_iter_end(&it); |
| 320 | out_unlock: | ||
| 321 | spin_unlock_irq(&freezer->lock); | ||
| 322 | } | 294 | } |
| 323 | 295 | ||
| 324 | static int freezer_read(struct seq_file *m, void *v) | 296 | static int freezer_read(struct seq_file *m, void *v) |
| 325 | { | 297 | { |
| 326 | struct cgroup_subsys_state *css = seq_css(m), *pos; | 298 | struct cgroup_subsys_state *css = seq_css(m), *pos; |
| 327 | 299 | ||
| 300 | mutex_lock(&freezer_mutex); | ||
| 328 | rcu_read_lock(); | 301 | rcu_read_lock(); |
| 329 | 302 | ||
| 330 | /* update states bottom-up */ | 303 | /* update states bottom-up */ |
| 331 | css_for_each_descendant_post(pos, css) | 304 | css_for_each_descendant_post(pos, css) { |
| 305 | if (!css_tryget(pos)) | ||
| 306 | continue; | ||
| 307 | rcu_read_unlock(); | ||
| 308 | |||
| 332 | update_if_frozen(pos); | 309 | update_if_frozen(pos); |
| 333 | 310 | ||
| 311 | rcu_read_lock(); | ||
| 312 | css_put(pos); | ||
| 313 | } | ||
| 314 | |||
| 334 | rcu_read_unlock(); | 315 | rcu_read_unlock(); |
| 316 | mutex_unlock(&freezer_mutex); | ||
| 335 | 317 | ||
| 336 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); | 318 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); |
| 337 | seq_putc(m, '\n'); | 319 | seq_putc(m, '\n'); |
| @@ -373,7 +355,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
| 373 | unsigned int state) | 355 | unsigned int state) |
| 374 | { | 356 | { |
| 375 | /* also synchronizes against task migration, see freezer_attach() */ | 357 | /* also synchronizes against task migration, see freezer_attach() */ |
| 376 | lockdep_assert_held(&freezer->lock); | 358 | lockdep_assert_held(&freezer_mutex); |
| 377 | 359 | ||
| 378 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) | 360 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) |
| 379 | return; | 361 | return; |
| @@ -414,31 +396,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
| 414 | * descendant will try to inherit its parent's FREEZING state as | 396 | * descendant will try to inherit its parent's FREEZING state as |
| 415 | * CGROUP_FREEZING_PARENT. | 397 | * CGROUP_FREEZING_PARENT. |
| 416 | */ | 398 | */ |
| 399 | mutex_lock(&freezer_mutex); | ||
| 417 | rcu_read_lock(); | 400 | rcu_read_lock(); |
| 418 | css_for_each_descendant_pre(pos, &freezer->css) { | 401 | css_for_each_descendant_pre(pos, &freezer->css) { |
| 419 | struct freezer *pos_f = css_freezer(pos); | 402 | struct freezer *pos_f = css_freezer(pos); |
| 420 | struct freezer *parent = parent_freezer(pos_f); | 403 | struct freezer *parent = parent_freezer(pos_f); |
| 421 | 404 | ||
| 422 | spin_lock_irq(&pos_f->lock); | 405 | if (!css_tryget(pos)) |
| 406 | continue; | ||
| 407 | rcu_read_unlock(); | ||
| 423 | 408 | ||
| 424 | if (pos_f == freezer) { | 409 | if (pos_f == freezer) |
| 425 | freezer_apply_state(pos_f, freeze, | 410 | freezer_apply_state(pos_f, freeze, |
| 426 | CGROUP_FREEZING_SELF); | 411 | CGROUP_FREEZING_SELF); |
| 427 | } else { | 412 | else |
| 428 | /* | ||
| 429 | * Our update to @parent->state is already visible | ||
| 430 | * which is all we need. No need to lock @parent. | ||
| 431 | * For more info on synchronization, see | ||
| 432 | * freezer_post_create(). | ||
| 433 | */ | ||
| 434 | freezer_apply_state(pos_f, | 413 | freezer_apply_state(pos_f, |
| 435 | parent->state & CGROUP_FREEZING, | 414 | parent->state & CGROUP_FREEZING, |
| 436 | CGROUP_FREEZING_PARENT); | 415 | CGROUP_FREEZING_PARENT); |
| 437 | } | ||
| 438 | 416 | ||
| 439 | spin_unlock_irq(&pos_f->lock); | 417 | rcu_read_lock(); |
| 418 | css_put(pos); | ||
| 440 | } | 419 | } |
| 441 | rcu_read_unlock(); | 420 | rcu_read_unlock(); |
| 421 | mutex_unlock(&freezer_mutex); | ||
| 442 | } | 422 | } |
| 443 | 423 | ||
| 444 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, | 424 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, |
