Diffstat (limited to 'kernel/cgroup.c')

 kernel/cgroup.c (-rw-r--r--) | 1536 +++++++++++++++++++++++++++-----------------
 1 file changed, 880 insertions(+), 656 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb979..e5583d10a325 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS		INT_MIN
-
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  */
 #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
+static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
 /*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
  */
-static struct cgroupfs_root rootnode;
+static struct cgroupfs_root cgroup_dummy_root;
+
+/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
+static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
 
 /*
  * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
 
 /* The list of hierarchy roots */
 
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
 
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
-
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Hierarchy ID allocation and mapping.  It follows the same exclusion
+ * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
+ * writes, either for reads.
+ */
+static DEFINE_IDR(cgroup_hierarchy_idr);
 
 static struct cgroup_name root_cgroup_name = { .name = "/" };
 
+/*
+ * Assign a monotonically increasing serial number to cgroups.  It
+ * guarantees cgroups with bigger numbers are newer than those with smaller
+ * numbers.  Also, as cgroups are always appended to the parent's
+ * ->children list, it guarantees that sibling cgroups are always sorted in
+ * the ascending serial number order on the list.  Protected by
+ * cgroup_mutex.
+ */
+static u64 cgroup_serial_nr_next = 1;
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
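The serial number introduced above is consumed when a cgroup is created. As a rough sketch of the intended use (not part of this diff, and assuming the ->serial_nr field that the rest of this series adds to struct cgroup):

	/* illustrative sketch only */
	static void example_assign_serial_nr(struct cgroup *cgrp)
	{
		/* the counter is protected by cgroup_mutex */
		lockdep_assert_held(&cgroup_mutex);
		cgrp->serial_nr = cgroup_serial_nr_next++;
	}

Because children are appended to the parent's ->children list under the same mutex, siblings end up sorted by ascending serial number for free.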
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-	return test_bit(CGRP_REMOVED, &cgrp->flags);
+	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
 /**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_subsys - iterate all loaded cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ *
+ * Should be called under cgroup_mutex.
  */
-#define for_each_subsys(_root, _ss) \
-	list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_subsys(ss, i)						\
+	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)			\
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       !((ss) = cgroup_subsys[i]); })) { }		\
+		else
+
+/**
+ * for_each_builtin_subsys - iterate all built-in cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
+ *
+ * Built-in subsystems are always present and iteration itself doesn't
+ * require any synchronization.
+ */
+#define for_each_builtin_subsys(ss, i)					\
+	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[i]) || true); (i)++)
+
+/* iterate each subsystem attached to a hierarchy */
+#define for_each_root_subsys(root, ss)					\
+	list_for_each_entry((ss), &(root)->subsys_list, sibling)
 
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-	list_for_each_entry(_root, &roots, root_list)
+/* iterate across the active hierarchies */
+#define for_each_active_root(root)					\
+	list_for_each_entry((root), &cgroup_roots, root_list)
 
 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 {
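The new for_each_subsys() walks the cgroup_subsys[] array directly, skipping NULL slots left by not-yet-loaded modular subsystems and asserting cgroup_mutex as it goes, while for_each_root_subsys() keeps the old list-based iteration over one hierarchy. A minimal usage sketch (illustrative only, not part of this diff):

	/* illustrative sketch only */
	static void example_list_subsystems(void)
	{
		struct cgroup_subsys *ss;
		int i;

		mutex_lock(&cgroup_mutex);
		for_each_subsys(ss, i)		/* NULL slots are skipped */
			pr_info("subsys %d: %s\n", i, ss->name);
		mutex_unlock(&cgroup_mutex);
	}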
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_removed(cgrp)) {
+	if (cgroup_is_dead(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return false;
 	}
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
-	/*
-	 * List running through cg_cgroup_links associated with a
-	 * cgroup, anchored on cgroup->css_sets
-	 */
-	struct list_head cgrp_link_list;
-	struct cgroup *cgrp;
-	/*
-	 * List running through cg_cgroup_links pointing at a
-	 * single css_set object, anchored on css_set->cg_links
-	 */
-	struct list_head cg_link_list;
-	struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup		*cgrp;
+	struct css_set		*cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head	cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head	cgrp_link;
 };
 
 /* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
  */
 
 static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static struct cgrp_cset_link init_cgrp_cset_link;
 
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);
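With cgrp_cset_link, each association is reachable from both objects: cgrp->cset_links chains the links through ->cset_link, and cset->cgrp_links chains the same links through ->cgrp_link. A sketch of walking both directions under css_set_lock (illustrative only; the visit_* helpers are hypothetical):

	/* illustrative sketch only */
	static void example_walk_links(struct cgroup *cgrp, struct css_set *cset)
	{
		struct cgrp_cset_link *link;

		read_lock(&css_set_lock);

		/* every css_set that includes @cgrp */
		list_for_each_entry(link, &cgrp->cset_links, cset_link)
			visit_cset(link->cset);		/* hypothetical helper */

		/* every cgroup that @cset belongs to */
		list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
			visit_cgrp(link->cgrp);		/* hypothetical helper */

		read_unlock(&css_set_lock);
	}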
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
-	int i;
 	unsigned long key = 0UL;
+	struct cgroup_subsys *ss;
+	int i;
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+	for_each_subsys(ss, i)
 		key += (unsigned long)css[i];
 	key = (key >> 16) ^ key;
 
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void __put_css_set(struct css_set *cset, int taskexit)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
+
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
 	 * rwlock
 	 */
-	if (atomic_add_unless(&cg->refcount, -1, 1))
+	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 	write_lock(&css_set_lock);
-	if (!atomic_dec_and_test(&cg->refcount)) {
+	if (!atomic_dec_and_test(&cset->refcount)) {
 		write_unlock(&css_set_lock);
 		return;
 	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
-	hash_del(&cg->hlist);
+	hash_del(&cset->hlist);
 	css_set_count--;
 
-	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-				 cg_link_list) {
+	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *cgrp = link->cgrp;
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
 
-		/*
-		 * We may not be holding cgroup_mutex, and if cgrp->count is
-		 * dropped to 0 the cgroup can be destroyed at any time, hence
-		 * rcu_read_lock is used to keep it alive.
-		 */
-		rcu_read_lock();
-		if (atomic_dec_and_test(&cgrp->count) &&
-		    notify_on_release(cgrp)) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
+
+		/* @cgrp can't go away while we're holding css_set_lock */
+		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
-		rcu_read_unlock();
 
 		kfree(link);
 	}
 
 	write_unlock(&css_set_lock);
-	kfree_rcu(cg, rcu_head);
+	kfree_rcu(cset, rcu_head);
 }
 
 /*
  * refcounted get/put for css_set objects
  */
-static inline void get_css_set(struct css_set *cg)
+static inline void get_css_set(struct css_set *cset)
 {
-	atomic_inc(&cg->refcount);
+	atomic_inc(&cset->refcount);
 }
 
-static inline void put_css_set(struct css_set *cg)
+static inline void put_css_set(struct css_set *cset)
 {
-	__put_css_set(cg, 0);
+	__put_css_set(cset, 0);
 }
 
-static inline void put_css_set_taskexit(struct css_set *cg)
+static inline void put_css_set_taskexit(struct css_set *cset)
 {
-	__put_css_set(cg, 1);
+	__put_css_set(cset, 1);
 }
 
-/*
+/**
  * compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
 * Returns true if "cg" matches "old_cg" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
-static bool compare_css_sets(struct css_set *cg,
-			     struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
+			     struct css_set *old_cset,
 			     struct cgroup *new_cgrp,
 			     struct cgroup_subsys_state *template[])
 {
 	struct list_head *l1, *l2;
 
-	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
 		/* Not all subsystems matched */
 		return false;
 	}
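The comment in __put_css_set() describes an open-coded dec-and-lock for an rwlock: drop the reference lock-free unless it might be the last one, then retest under the write lock so readers holding css_set_lock never observe the count reaching zero under them. The same pattern in isolation (a generic sketch, not a kernel API):

	/* illustrative sketch only */
	static bool example_dec_and_write_lock(atomic_t *cnt, rwlock_t *lock)
	{
		if (atomic_add_unless(cnt, -1, 1))
			return false;		/* not the last ref; lock not taken */

		write_lock(lock);
		if (!atomic_dec_and_test(cnt)) {
			write_unlock(lock);	/* raced with a concurrent getter */
			return false;
		}
		return true;			/* last ref dropped; lock still held */
	}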
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
 	 * candidates.
 	 */
 
-	l1 = &cg->cg_links;
-	l2 = &old_cg->cg_links;
+	l1 = &cset->cgrp_links;
+	l2 = &old_cset->cgrp_links;
 	while (1) {
-		struct cg_cgroup_link *cgl1, *cgl2;
-		struct cgroup *cg1, *cg2;
+		struct cgrp_cset_link *link1, *link2;
+		struct cgroup *cgrp1, *cgrp2;
 
 		l1 = l1->next;
 		l2 = l2->next;
 		/* See if we reached the end - both lists are equal length. */
-		if (l1 == &cg->cg_links) {
-			BUG_ON(l2 != &old_cg->cg_links);
+		if (l1 == &cset->cgrp_links) {
+			BUG_ON(l2 != &old_cset->cgrp_links);
 			break;
 		} else {
-			BUG_ON(l2 == &old_cg->cg_links);
+			BUG_ON(l2 == &old_cset->cgrp_links);
 		}
 		/* Locate the cgroups associated with these links. */
-		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
-		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
-		cg1 = cgl1->cgrp;
-		cg2 = cgl2->cgrp;
+		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+		cgrp1 = link1->cgrp;
+		cgrp2 = link2->cgrp;
 		/* Hierarchies should be linked in the same order. */
-		BUG_ON(cg1->root != cg2->root);
+		BUG_ON(cgrp1->root != cgrp2->root);
 
 		/*
 		 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,
 		 * hierarchy, then this css_set should point to the
 		 * same cgroup as the old css_set.
 		 */
-		if (cg1->root == new_cgrp->root) {
-			if (cg1 != new_cgrp)
+		if (cgrp1->root == new_cgrp->root) {
+			if (cgrp1 != new_cgrp)
 				return false;
 		} else {
-			if (cg1 != cg2)
+			if (cgrp1 != cgrp2)
 				return false;
 		}
 	}
 	return true;
 }
 
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
  */
-static struct css_set *find_existing_css_set(
-	struct css_set *oldcg,
-	struct cgroup *cgrp,
-	struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+					struct cgroup *cgrp,
+					struct cgroup_subsys_state *template[])
 {
-	int i;
 	struct cgroupfs_root *root = cgrp->root;
-	struct css_set *cg;
+	struct cgroup_subsys *ss;
+	struct css_set *cset;
 	unsigned long key;
+	int i;
 
 	/*
 	 * Build the set of subsystem state objects that we want to see in the
 	 * new css_set. while subsystems can change globally, the entries here
 	 * won't change, so no need for locking.
 	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		if (root->subsys_mask & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
 		} else {
 			/* Subsystem is not in this hierarchy, so we
 			 * don't want to change the subsystem state */
-			template[i] = oldcg->subsys[i];
+			template[i] = old_cset->subsys[i];
 		}
 	}
 
 	key = css_set_hash(template);
-	hash_for_each_possible(css_set_table, cg, hlist, key) {
-		if (!compare_css_sets(cg, oldcg, cgrp, template))
+	hash_for_each_possible(css_set_table, cset, hlist, key) {
+		if (!compare_css_sets(cset, old_cset, cgrp, template))
 			continue;
 
 		/* This css_set matches what we need */
-		return cg;
+		return cset;
 	}
 
 	/* No existing cgroup group matched */
 	return NULL;
 }
 
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
 
-	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-		list_del(&link->cgrp_link_list);
+	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+		list_del(&link->cset_link);
 		kfree(link);
 	}
 }
 
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link.  Returns 0 on success or -errno.
  */
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 	int i;
-	INIT_LIST_HEAD(tmp);
+
+	INIT_LIST_HEAD(tmp_links);
+
 	for (i = 0; i < count; i++) {
-		link = kmalloc(sizeof(*link), GFP_KERNEL);
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			free_cg_links(tmp);
+			free_cgrp_cset_links(tmp_links);
 			return -ENOMEM;
 		}
-		list_add(&link->cgrp_link_list, tmp);
+		list_add(&link->cset_link, tmp_links);
 	}
 	return 0;
 }
 
 /**
  * link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
  * @cgrp: the destination cgroup
  */
-static void link_css_set(struct list_head *tmp_cg_links,
-			 struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+			 struct cgroup *cgrp)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
-	BUG_ON(list_empty(tmp_cg_links));
-	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
-				cgrp_link_list);
-	link->cg = cg;
+	BUG_ON(list_empty(tmp_links));
+	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+	link->cset = cset;
 	link->cgrp = cgrp;
-	atomic_inc(&cgrp->count);
-	list_move(&link->cgrp_link_list, &cgrp->css_sets);
+	list_move(&link->cset_link, &cgrp->cset_links);
 	/*
 	 * Always add links to the tail of the list so that the list
 	 * is sorted by order of hierarchy creation
 	 */
-	list_add_tail(&link->cg_link_list, &cg->cg_links);
+	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 }
 
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
  */
-static struct css_set *find_css_set(
-	struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+				    struct cgroup *cgrp)
 {
-	struct css_set *res;
-	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	struct list_head tmp_cg_links;
-
-	struct cg_cgroup_link *link;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+	struct css_set *cset;
+	struct list_head tmp_links;
+	struct cgrp_cset_link *link;
 	unsigned long key;
 
+	lockdep_assert_held(&cgroup_mutex);
+
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
 	read_lock(&css_set_lock);
-	res = find_existing_css_set(oldcg, cgrp, template);
-	if (res)
-		get_css_set(res);
+	cset = find_existing_css_set(old_cset, cgrp, template);
+	if (cset)
+		get_css_set(cset);
 	read_unlock(&css_set_lock);
 
-	if (res)
-		return res;
+	if (cset)
+		return cset;
 
-	res = kmalloc(sizeof(*res), GFP_KERNEL);
-	if (!res)
+	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+	if (!cset)
 		return NULL;
 
-	/* Allocate all the cg_cgroup_link objects that we'll need */
-	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
-		kfree(res);
+	/* Allocate all the cgrp_cset_link objects that we'll need */
+	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+		kfree(cset);
 		return NULL;
 	}
 
-	atomic_set(&res->refcount, 1);
-	INIT_LIST_HEAD(&res->cg_links);
-	INIT_LIST_HEAD(&res->tasks);
-	INIT_HLIST_NODE(&res->hlist);
+	atomic_set(&cset->refcount, 1);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->tasks);
+	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
-	memcpy(res->subsys, template, sizeof(res->subsys));
+	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
 	write_lock(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
-	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
+
 		if (c->root == cgrp->root)
 			c = cgrp;
-		link_css_set(&tmp_cg_links, res, c);
+		link_css_set(&tmp_links, cset, c);
 	}
 
-	BUG_ON(!list_empty(&tmp_cg_links));
+	BUG_ON(!list_empty(&tmp_links));
 
 	css_set_count++;
 
 	/* Add this cgroup group to the hash table */
-	key = css_set_hash(res->subsys);
-	hash_add(css_set_table, &res->hlist, key);
+	key = css_set_hash(cset->subsys);
+	hash_add(css_set_table, &cset->hlist, key);
 
 	write_unlock(&css_set_lock);
 
-	return res;
+	return cset;
 }
 
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroupfs_root *root)
 {
-	struct css_set *css;
+	struct css_set *cset;
 	struct cgroup *res = NULL;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 	 * task can't change groups, so the only thing that can happen
 	 * is that it exits and its css is set back to init_css_set.
 	 */
-	css = task->cgroups;
-	if (css == &init_css_set) {
+	cset = task_css_set(task);
+	if (cset == &init_css_set) {
 		res = &root->top_cgroup;
 	} else {
-		struct cg_cgroup_link *link;
-		list_for_each_entry(link, &css->cg_links, cg_link_list) {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 			struct cgroup *c = link->cgrp;
+
 			if (c->root == root) {
 				res = c;
 				break;
@@ -828,14 +845,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
 	/*
 	 * Release the subsystem state objects.
 	 */
-	for_each_subsys(cgrp->root, ss)
+	for_each_root_subsys(cgrp->root, ss)
 		ss->css_free(cgrp);
 
 	cgrp->root->number_of_cgroups--;
@@ -873,7 +890,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	schedule_work(&cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +900,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cgrp = dentry->d_fsdata;
 
-		BUG_ON(!(cgroup_is_removed(cgrp)));
+		BUG_ON(!(cgroup_is_dead(cgrp)));
 		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +968,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
 
-	for_each_subsys(cgrp->root, ss) {
+	for_each_root_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
@@ -988,30 +1006,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  * returns an error, no reference counts are touched.
  */
 static int rebind_subsystems(struct cgroupfs_root *root,
-			     unsigned long final_subsys_mask)
+			     unsigned long added_mask, unsigned removed_mask)
 {
-	unsigned long added_mask, removed_mask;
 	struct cgroup *cgrp = &root->top_cgroup;
+	struct cgroup_subsys *ss;
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
-	removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
-	added_mask = final_subsys_mask & ~root->actual_subsys_mask;
 	/* Check that any added subsystems are currently free */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
-		struct cgroup_subsys *ss = subsys[i];
+
 		if (!(bit & added_mask))
 			continue;
-		/*
-		 * Nobody should tell us to do a subsys that doesn't exist:
-		 * parse_cgroupfs_options should catch that case and refcounts
-		 * ensure that subsystems won't disappear once selected.
-		 */
-		BUG_ON(ss == NULL);
-		if (ss->root != &rootnode) {
+
+		if (ss->root != &cgroup_dummy_root) {
 			/* Subsystem isn't free */
 			return -EBUSY;
 		}
@@ -1025,38 +1036,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		return -EBUSY;
 
 	/* Process each subsystem */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
+
 		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
-			BUG_ON(!dummytop->subsys[i]);
-			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-			cgrp->subsys[i] = dummytop->subsys[i];
+			BUG_ON(!cgroup_dummy_top->subsys[i]);
+			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+
+			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgrp);
+
 			/* refcount was already taken, and we're keeping it */
+			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
-			BUG_ON(ss == NULL);
-			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+
 			if (ss->bind)
-				ss->bind(dummytop);
-			dummytop->subsys[i]->cgroup = dummytop;
+				ss->bind(cgroup_dummy_top);
+			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
 			cgrp->subsys[i] = NULL;
-			subsys[i]->root = &rootnode;
-			list_move(&ss->sibling, &rootnode.subsys_list);
+			cgroup_subsys[i]->root = &cgroup_dummy_root;
+			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
+
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
-		} else if (bit & final_subsys_mask) {
+			root->subsys_mask &= ~bit;
+		} else if (bit & root->subsys_mask) {
 			/* Subsystem state should already exist */
-			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
 			/*
 			 * a refcount was taken, but we already had one, so
@@ -1071,7 +1085,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 		}
 	}
-	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
+
+	/*
+	 * Mark that @root has finished binding subsystems.
+	 * @root->subsys_mask now matches the bound subsystems.
+	 */
+	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
 	return 0;
 }
@@ -1082,7 +1101,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_root_mutex);
-	for_each_subsys(root, ss)
+	for_each_root_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1133,19 @@ struct cgroup_sb_opts {
 };
 
 /*
- * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
- * with cgroup_mutex held to protect the subsys[] array. This function takes
- * refcounts on subsystems to be used, unless it returns error, in which case
- * no refcounts are taken.
+ * Convert a hierarchy specifier into a bitmask of subsystems and
+ * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
+ * array. This function takes refcounts on subsystems to be used, unless it
+ * returns error, in which case no refcounts are taken.
  */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
-	int i;
 	bool module_pin_failed = false;
+	struct cgroup_subsys *ss;
+	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
@@ -1202,10 +1222,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss == NULL)
-				continue;
+		for_each_subsys(ss, i) {
 			if (strcmp(token, ss->name))
 				continue;
 			if (ss->disabled)
@@ -1228,16 +1245,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * otherwise if 'none', 'name=' and a subsystem name options
 	 * were not specified, let's default to 'all'
 	 */
-	if (all_ss || (!one_ss && !opts->none && !opts->name)) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss == NULL)
-				continue;
-			if (ss->disabled)
-				continue;
-			set_bit(i, &opts->subsys_mask);
-		}
-	}
+	if (all_ss || (!one_ss && !opts->none && !opts->name))
+		for_each_subsys(ss, i)
+			if (!ss->disabled)
+				set_bit(i, &opts->subsys_mask);
 
 	/* Consistency checks */
 
@@ -1281,12 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * take duplicate reference counts on a subsystem that's already used,
 	 * but rebind_subsystems handles this case.
 	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		unsigned long bit = 1UL << i;
-
-		if (!(bit & opts->subsys_mask))
+	for_each_subsys(ss, i) {
+		if (!(opts->subsys_mask & (1UL << i)))
 			continue;
-		if (!try_module_get(subsys[i]->module)) {
+		if (!try_module_get(cgroup_subsys[i]->module)) {
 			module_pin_failed = true;
 			break;
 		}
@@ -1303,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 		if (!(bit & opts->subsys_mask))
 			continue;
-		module_put(subsys[i]->module);
+		module_put(cgroup_subsys[i]->module);
 	}
 	return -ENOENT;
 }
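parse_cgroupfs_options() pins modules all-or-nothing: it tries to pin every requested subsystem and, on the first failure, unwinds only the references already taken before returning -ENOENT. The shape of that logic in one function (an illustrative sketch, not code from this diff):

	/* illustrative sketch only */
	static int example_pin_subsys_modules(unsigned long requested_mask)
	{
		struct cgroup_subsys *ss;
		int i;

		mutex_lock(&cgroup_mutex);
		for_each_subsys(ss, i) {
			if (!(requested_mask & (1UL << i)))
				continue;
			if (try_module_get(ss->module))
				continue;
			/* unwind the references taken so far */
			while (--i >= 0)
				if ((requested_mask & (1UL << i)) && cgroup_subsys[i])
					module_put(cgroup_subsys[i]->module);
			mutex_unlock(&cgroup_mutex);
			return -ENOENT;
		}
		mutex_unlock(&cgroup_mutex);
		return 0;
	}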
@@ -1313,14 +1322,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 {
+	struct cgroup_subsys *ss;
 	int i;
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		unsigned long bit = 1UL << i;
 
-		if (!(bit & subsys_mask))
-			continue;
-		module_put(subsys[i]->module);
-	}
+	mutex_lock(&cgroup_mutex);
+	for_each_subsys(ss, i)
+		if (subsys_mask & (1UL << i))
+			module_put(cgroup_subsys[i]->module);
+	mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1354,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 
@@ -1353,10 +1362,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
 	/* Don't allow flags or name to change at remount */
-	if (opts.flags != root->flags ||
+	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
 	    (opts.name && strcmp(opts.name, root->name))) {
+		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
 		ret = -EINVAL;
-		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
@@ -1367,11 +1378,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	 */
 	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 
-	ret = rebind_subsystems(root, opts.subsys_mask);
+	ret = rebind_subsystems(root, added_mask, removed_mask);
 	if (ret) {
 		/* rebind_subsystems failed, re-populate the removed files */
 		cgroup_populate_dir(cgrp, false, removed_mask);
-		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
@@ -1386,6 +1396,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+	if (ret)
+		drop_parsed_module_refcounts(opts.subsys_mask);
 	return ret;
 }
 
@@ -1401,11 +1413,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->files);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->allcg_node);
+	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
-	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1428,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
-	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
-	cgrp->name = &root_cgroup_name;
+	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
 	init_cgroup_housekeeping(cgrp);
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
 
-static bool init_root_id(struct cgroupfs_root *root)
+static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
 {
-	int ret = 0;
+	int id;
 
-	do {
-		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
-			return false;
-		spin_lock(&hierarchy_id_lock);
-		/* Try to allocate the next unused ID */
-		ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
-					&root->hierarchy_id);
-		if (ret == -ENOSPC)
-			/* Try again starting from 0 */
-			ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
-		if (!ret) {
-			next_hierarchy_id = root->hierarchy_id + 1;
-		} else if (ret != -EAGAIN) {
-			/* Can only get here if the 31-bit IDR is full ... */
-			BUG_ON(ret);
-		}
-		spin_unlock(&hierarchy_id_lock);
-	} while (ret);
-	return true;
+	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&cgroup_root_mutex);
+
+	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
+			      GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	root->hierarchy_id = id;
+	return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroupfs_root *root)
+{
+	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&cgroup_root_mutex);
+
+	if (root->hierarchy_id) {
+		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+		root->hierarchy_id = 0;
+	}
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
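cgroup_init_root_id() replaces the old ida_pre_get()/ida_get_new_above() retry loop with a single idr_alloc_cyclic() call, which also stores the root pointer so a hierarchy can later be looked up by ID; cyclic allocation preserves the old behaviour of handing out IDs just past the most recently assigned one before wrapping. In isolation (an illustrative sketch, not code from this diff):

	/* illustrative sketch only */
	static DEFINE_IDR(example_idr);

	static int example_alloc_id(void *ptr)
	{
		/* IDs from 2 up to INT_MAX (end == 0 means no limit); may sleep */
		return idr_alloc_cyclic(&example_idr, ptr, 2, 0, GFP_KERNEL);
	}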
@@ -1482,12 +1492,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	if (!init_root_id(root)) {
-		kfree(root);
-		return ERR_PTR(-ENOMEM);
-	}
 	init_cgroup_root(root);
 
+	/*
+	 * We need to set @root->subsys_mask now so that @root can be
+	 * matched by cgroup_test_super() before it finishes
+	 * initialization; otherwise, competing mounts with the same
+	 * options may try to bind the same subsystems instead of waiting
+	 * for the first one leading to unexpected mount errors.
+	 * SUBSYS_BOUND will be set once actual binding is complete.
+	 */
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
 	ida_init(&root->cgroup_ida);
@@ -1500,17 +1514,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	return root;
 }
 
-static void cgroup_drop_root(struct cgroupfs_root *root)
+static void cgroup_free_root(struct cgroupfs_root *root)
 {
-	if (!root)
-		return;
+	if (root) {
+		/* hierarchy ID should already have been released */
+		WARN_ON_ONCE(root->hierarchy_id);
 
-	BUG_ON(!root->hierarchy_id);
-	spin_lock(&hierarchy_id_lock);
-	ida_remove(&hierarchy_ida, root->hierarchy_id);
-	spin_unlock(&hierarchy_id_lock);
-	ida_destroy(&root->cgroup_ida);
-	kfree(root);
+		ida_destroy(&root->cgroup_ida);
+		kfree(root);
+	}
 }
 
 static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
-		cgroup_drop_root(opts.new_root);
+		cgroup_free_root(opts.new_root);
 		goto drop_modules;
 	}
 
@@ -1605,12 +1617,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1605 | BUG_ON(!root); | 1617 | BUG_ON(!root); |
1606 | if (root == opts.new_root) { | 1618 | if (root == opts.new_root) { |
1607 | /* We used the new root structure, so this is a new hierarchy */ | 1619 | /* We used the new root structure, so this is a new hierarchy */ |
1608 | struct list_head tmp_cg_links; | 1620 | struct list_head tmp_links; |
1609 | struct cgroup *root_cgrp = &root->top_cgroup; | 1621 | struct cgroup *root_cgrp = &root->top_cgroup; |
1610 | struct cgroupfs_root *existing_root; | 1622 | struct cgroupfs_root *existing_root; |
1611 | const struct cred *cred; | 1623 | const struct cred *cred; |
1612 | int i; | 1624 | int i; |
1613 | struct css_set *cg; | 1625 | struct css_set *cset; |
1614 | 1626 | ||
1615 | BUG_ON(sb->s_root != NULL); | 1627 | BUG_ON(sb->s_root != NULL); |
1616 | 1628 | ||
@@ -1637,13 +1649,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1637 | * that's us. The worst that can happen is that we | 1649 | * that's us. The worst that can happen is that we |
1638 | * have some link structures left over | 1650 | * have some link structures left over |
1639 | */ | 1651 | */ |
1640 | ret = allocate_cg_links(css_set_count, &tmp_cg_links); | 1652 | ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); |
1641 | if (ret) | 1653 | if (ret) |
1642 | goto unlock_drop; | 1654 | goto unlock_drop; |
1643 | 1655 | ||
1644 | ret = rebind_subsystems(root, root->subsys_mask); | 1656 | /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ |
1657 | ret = cgroup_init_root_id(root, 2, 0); | ||
1658 | if (ret) | ||
1659 | goto unlock_drop; | ||
1660 | |||
1661 | ret = rebind_subsystems(root, root->subsys_mask, 0); | ||
1645 | if (ret == -EBUSY) { | 1662 | if (ret == -EBUSY) { |
1646 | free_cg_links(&tmp_cg_links); | 1663 | free_cgrp_cset_links(&tmp_links); |
1647 | goto unlock_drop; | 1664 | goto unlock_drop; |
1648 | } | 1665 | } |
1649 | /* | 1666 | /* |
@@ -1655,8 +1672,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1655 | /* EBUSY should be the only error here */ | 1672 | /* EBUSY should be the only error here */ |
1656 | BUG_ON(ret); | 1673 | BUG_ON(ret); |
1657 | 1674 | ||
1658 | list_add(&root->root_list, &roots); | 1675 | list_add(&root->root_list, &cgroup_roots); |
1659 | root_count++; | 1676 | cgroup_root_count++; |
1660 | 1677 | ||
1661 | sb->s_root->d_fsdata = root_cgrp; | 1678 | sb->s_root->d_fsdata = root_cgrp; |
1662 | root->top_cgroup.dentry = sb->s_root; | 1679 | root->top_cgroup.dentry = sb->s_root; |
@@ -1664,11 +1681,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1664 | /* Link the top cgroup in this hierarchy into all | 1681 | /* Link the top cgroup in this hierarchy into all |
1665 | * the css_set objects */ | 1682 | * the css_set objects */ |
1666 | write_lock(&css_set_lock); | 1683 | write_lock(&css_set_lock); |
1667 | hash_for_each(css_set_table, i, cg, hlist) | 1684 | hash_for_each(css_set_table, i, cset, hlist) |
1668 | link_css_set(&tmp_cg_links, cg, root_cgrp); | 1685 | link_css_set(&tmp_links, cset, root_cgrp); |
1669 | write_unlock(&css_set_lock); | 1686 | write_unlock(&css_set_lock); |
1670 | 1687 | ||
1671 | free_cg_links(&tmp_cg_links); | 1688 | free_cgrp_cset_links(&tmp_links); |
1672 | 1689 | ||
1673 | BUG_ON(!list_empty(&root_cgrp->children)); | 1690 | BUG_ON(!list_empty(&root_cgrp->children)); |
1674 | BUG_ON(root->number_of_cgroups != 1); | 1691 | BUG_ON(root->number_of_cgroups != 1); |
@@ -1684,9 +1701,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1684 | * We re-used an existing hierarchy - the new root (if | 1701 | * We re-used an existing hierarchy - the new root (if |
1685 | * any) is not needed | 1702 | * any) is not needed |
1686 | */ | 1703 | */ |
1687 | cgroup_drop_root(opts.new_root); | 1704 | cgroup_free_root(opts.new_root); |
1688 | 1705 | ||
1689 | if (root->flags != opts.flags) { | 1706 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { |
1690 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { | 1707 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { |
1691 | pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); | 1708 | pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); |
1692 | ret = -EINVAL; | 1709 | ret = -EINVAL; |
@@ -1705,6 +1722,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1705 | return dget(sb->s_root); | 1722 | return dget(sb->s_root); |
1706 | 1723 | ||
1707 | unlock_drop: | 1724 | unlock_drop: |
1725 | cgroup_exit_root_id(root); | ||
1708 | mutex_unlock(&cgroup_root_mutex); | 1726 | mutex_unlock(&cgroup_root_mutex); |
1709 | mutex_unlock(&cgroup_mutex); | 1727 | mutex_unlock(&cgroup_mutex); |
1710 | mutex_unlock(&inode->i_mutex); | 1728 | mutex_unlock(&inode->i_mutex); |
@@ -1721,9 +1739,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1721 | static void cgroup_kill_sb(struct super_block *sb) { | 1739 | static void cgroup_kill_sb(struct super_block *sb) { |
1722 | struct cgroupfs_root *root = sb->s_fs_info; | 1740 | struct cgroupfs_root *root = sb->s_fs_info; |
1723 | struct cgroup *cgrp = &root->top_cgroup; | 1741 | struct cgroup *cgrp = &root->top_cgroup; |
1742 | struct cgrp_cset_link *link, *tmp_link; | ||
1724 | int ret; | 1743 | int ret; |
1725 | struct cg_cgroup_link *link; | ||
1726 | struct cg_cgroup_link *saved_link; | ||
1727 | 1744 | ||
1728 | BUG_ON(!root); | 1745 | BUG_ON(!root); |
1729 | 1746 | ||
@@ -1734,36 +1751,39 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1734 | mutex_lock(&cgroup_root_mutex); | 1751 | mutex_lock(&cgroup_root_mutex); |
1735 | 1752 | ||
1736 | /* Rebind all subsystems back to the default hierarchy */ | 1753 | /* Rebind all subsystems back to the default hierarchy */ |
1737 | ret = rebind_subsystems(root, 0); | 1754 | if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { |
1738 | /* Shouldn't be able to fail ... */ | 1755 | ret = rebind_subsystems(root, 0, root->subsys_mask); |
1739 | BUG_ON(ret); | 1756 | /* Shouldn't be able to fail ... */ |
1757 | BUG_ON(ret); | ||
1758 | } | ||
1740 | 1759 | ||
1741 | /* | 1760 | /* |
1742 | * Release all the links from css_sets to this hierarchy's | 1761 | * Release all the links from cset_links to this hierarchy's |
1743 | * root cgroup | 1762 | * root cgroup |
1744 | */ | 1763 | */ |
1745 | write_lock(&css_set_lock); | 1764 | write_lock(&css_set_lock); |
1746 | 1765 | ||
1747 | list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, | 1766 | list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { |
1748 | cgrp_link_list) { | 1767 | list_del(&link->cset_link); |
1749 | list_del(&link->cg_link_list); | 1768 | list_del(&link->cgrp_link); |
1750 | list_del(&link->cgrp_link_list); | ||
1751 | kfree(link); | 1769 | kfree(link); |
1752 | } | 1770 | } |
1753 | write_unlock(&css_set_lock); | 1771 | write_unlock(&css_set_lock); |
1754 | 1772 | ||
1755 | if (!list_empty(&root->root_list)) { | 1773 | if (!list_empty(&root->root_list)) { |
1756 | list_del(&root->root_list); | 1774 | list_del(&root->root_list); |
1757 | root_count--; | 1775 | cgroup_root_count--; |
1758 | } | 1776 | } |
1759 | 1777 | ||
1778 | cgroup_exit_root_id(root); | ||
1779 | |||
1760 | mutex_unlock(&cgroup_root_mutex); | 1780 | mutex_unlock(&cgroup_root_mutex); |
1761 | mutex_unlock(&cgroup_mutex); | 1781 | mutex_unlock(&cgroup_mutex); |
1762 | 1782 | ||
1763 | simple_xattrs_free(&cgrp->xattrs); | 1783 | simple_xattrs_free(&cgrp->xattrs); |
1764 | 1784 | ||
1765 | kill_litter_super(sb); | 1785 | kill_litter_super(sb); |
1766 | cgroup_drop_root(root); | 1786 | cgroup_free_root(root); |
1767 | } | 1787 | } |
1768 | 1788 | ||
1769 | static struct file_system_type cgroup_fs_type = { | 1789 | static struct file_system_type cgroup_fs_type = { |
@@ -1825,6 +1845,38 @@ out: | |||
1825 | } | 1845 | } |
1826 | EXPORT_SYMBOL_GPL(cgroup_path); | 1846 | EXPORT_SYMBOL_GPL(cgroup_path); |
1827 | 1847 | ||
1848 | /** | ||
1849 | * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy | ||
1850 | * @task: target task | ||
1851 | * @hierarchy_id: the hierarchy to look up @task's cgroup from | ||
1852 | * @buf: the buffer to write the path into | ||
1853 | * @buflen: the length of the buffer | ||
1854 | * | ||
1855 | * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and | ||
1856 | * copy its path into @buf. This function grabs cgroup_mutex and shouldn't | ||
1857 | * be used inside locks used by cgroup controller callbacks. | ||
1858 | */ | ||
1859 | int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, | ||
1860 | char *buf, size_t buflen) | ||
1861 | { | ||
1862 | struct cgroupfs_root *root; | ||
1863 | struct cgroup *cgrp = NULL; | ||
1864 | int ret = -ENOENT; | ||
1865 | |||
1866 | mutex_lock(&cgroup_mutex); | ||
1867 | |||
1868 | root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); | ||
1869 | if (root) { | ||
1870 | cgrp = task_cgroup_from_root(task, root); | ||
1871 | ret = cgroup_path(cgrp, buf, buflen); | ||
1872 | } | ||
1873 | |||
1874 | mutex_unlock(&cgroup_mutex); | ||
1875 | |||
1876 | return ret; | ||
1877 | } | ||
1878 | EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); | ||
1879 | |||
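A hypothetical caller, to show the intended usage (the hierarchy ID and buffer size are illustrative; since the function takes cgroup_mutex, it must not run under locks that cgroup controller callbacks acquire):

	char buf[64];
	int ret;

	/* path of current on hierarchy 1; -ENOENT if no such hierarchy */
	ret = task_cgroup_path_from_hierarchy(current, 1, buf, sizeof(buf));
	if (!ret)
		pr_info("cgroup path: %s\n", buf);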
1828 | /* | 1880 | /* |
1829 | * Control Group taskset | 1881 | * Control Group taskset |
1830 | */ | 1882 | */ |
@@ -1910,10 +1962,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); | |||
1910 | * | 1962 | * |
1911 | * Must be called with cgroup_mutex and threadgroup locked. | 1963 | * Must be called with cgroup_mutex and threadgroup locked. |
1912 | */ | 1964 | */ |
1913 | static void cgroup_task_migrate(struct cgroup *oldcgrp, | 1965 | static void cgroup_task_migrate(struct cgroup *old_cgrp, |
1914 | struct task_struct *tsk, struct css_set *newcg) | 1966 | struct task_struct *tsk, |
1967 | struct css_set *new_cset) | ||
1915 | { | 1968 | { |
1916 | struct css_set *oldcg; | 1969 | struct css_set *old_cset; |
1917 | 1970 | ||
1918 | /* | 1971 | /* |
1919 | * We are synchronized through threadgroup_lock() against PF_EXITING | 1972 | * We are synchronized through threadgroup_lock() against PF_EXITING |
@@ -1921,25 +1974,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp, | |||
1921 | * css_set to init_css_set and dropping the old one. | 1974 | * css_set to init_css_set and dropping the old one. |
1922 | */ | 1975 | */ |
1923 | WARN_ON_ONCE(tsk->flags & PF_EXITING); | 1976 | WARN_ON_ONCE(tsk->flags & PF_EXITING); |
1924 | oldcg = tsk->cgroups; | 1977 | old_cset = task_css_set(tsk); |
1925 | 1978 | ||
1926 | task_lock(tsk); | 1979 | task_lock(tsk); |
1927 | rcu_assign_pointer(tsk->cgroups, newcg); | 1980 | rcu_assign_pointer(tsk->cgroups, new_cset); |
1928 | task_unlock(tsk); | 1981 | task_unlock(tsk); |
1929 | 1982 | ||
1930 | /* Update the css_set linked lists if we're using them */ | 1983 | /* Update the css_set linked lists if we're using them */ |
1931 | write_lock(&css_set_lock); | 1984 | write_lock(&css_set_lock); |
1932 | if (!list_empty(&tsk->cg_list)) | 1985 | if (!list_empty(&tsk->cg_list)) |
1933 | list_move(&tsk->cg_list, &newcg->tasks); | 1986 | list_move(&tsk->cg_list, &new_cset->tasks); |
1934 | write_unlock(&css_set_lock); | 1987 | write_unlock(&css_set_lock); |
1935 | 1988 | ||
1936 | /* | 1989 | /* |
1937 | * We just gained a reference on oldcg by taking it from the task. As | 1990 | * We just gained a reference on old_cset by taking it from the |
1938 | * trading it for newcg is protected by cgroup_mutex, we're safe to drop | 1991 | * task. As trading it for new_cset is protected by cgroup_mutex, |
1939 | * it here; it will be freed under RCU. | 1992 | * we're safe to drop it here; it will be freed under RCU. |
1940 | */ | 1993 | */ |
1941 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1994 | set_bit(CGRP_RELEASABLE, &old_cgrp->flags); |
1942 | put_css_set(oldcg); | 1995 | put_css_set(old_cset); |
1943 | } | 1996 | } |
1944 | 1997 | ||
1945 | /** | 1998 | /** |
@@ -2029,7 +2082,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2029 | /* | 2082 | /* |
2030 | * step 1: check that we can legitimately attach to the cgroup. | 2083 | * step 1: check that we can legitimately attach to the cgroup. |
2031 | */ | 2084 | */ |
2032 | for_each_subsys(root, ss) { | 2085 | for_each_root_subsys(root, ss) { |
2033 | if (ss->can_attach) { | 2086 | if (ss->can_attach) { |
2034 | retval = ss->can_attach(cgrp, &tset); | 2087 | retval = ss->can_attach(cgrp, &tset); |
2035 | if (retval) { | 2088 | if (retval) { |
@@ -2044,8 +2097,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2044 | * we use find_css_set, which allocates a new one if necessary. | 2097 | * we use find_css_set, which allocates a new one if necessary. |
2045 | */ | 2098 | */ |
2046 | for (i = 0; i < group_size; i++) { | 2099 | for (i = 0; i < group_size; i++) { |
2100 | struct css_set *old_cset; | ||
2101 | |||
2047 | tc = flex_array_get(group, i); | 2102 | tc = flex_array_get(group, i); |
2048 | tc->cg = find_css_set(tc->task->cgroups, cgrp); | 2103 | old_cset = task_css_set(tc->task); |
2104 | tc->cg = find_css_set(old_cset, cgrp); | ||
2049 | if (!tc->cg) { | 2105 | if (!tc->cg) { |
2050 | retval = -ENOMEM; | 2106 | retval = -ENOMEM; |
2051 | goto out_put_css_set_refs; | 2107 | goto out_put_css_set_refs; |
@@ -2066,7 +2122,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2066 | /* | 2122 | /* |
2067 | * step 4: do subsystem attach callbacks. | 2123 | * step 4: do subsystem attach callbacks. |
2068 | */ | 2124 | */ |
2069 | for_each_subsys(root, ss) { | 2125 | for_each_root_subsys(root, ss) { |
2070 | if (ss->attach) | 2126 | if (ss->attach) |
2071 | ss->attach(cgrp, &tset); | 2127 | ss->attach(cgrp, &tset); |
2072 | } | 2128 | } |
@@ -2086,7 +2142,7 @@ out_put_css_set_refs: | |||
2086 | } | 2142 | } |
2087 | out_cancel_attach: | 2143 | out_cancel_attach: |
2088 | if (retval) { | 2144 | if (retval) { |
2089 | for_each_subsys(root, ss) { | 2145 | for_each_root_subsys(root, ss) { |
2090 | if (ss == failed_ss) | 2146 | if (ss == failed_ss) |
2091 | break; | 2147 | break; |
2092 | if (ss->cancel_attach) | 2148 | if (ss->cancel_attach) |
@@ -2323,7 +2379,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | |||
2323 | struct cftype *cft = __d_cft(file->f_dentry); | 2379 | struct cftype *cft = __d_cft(file->f_dentry); |
2324 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2380 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2325 | 2381 | ||
2326 | if (cgroup_is_removed(cgrp)) | 2382 | if (cgroup_is_dead(cgrp)) |
2327 | return -ENODEV; | 2383 | return -ENODEV; |
2328 | if (cft->write) | 2384 | if (cft->write) |
2329 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 2385 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); |
@@ -2368,7 +2424,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
2368 | struct cftype *cft = __d_cft(file->f_dentry); | 2424 | struct cftype *cft = __d_cft(file->f_dentry); |
2369 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2425 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2370 | 2426 | ||
2371 | if (cgroup_is_removed(cgrp)) | 2427 | if (cgroup_is_dead(cgrp)) |
2372 | return -ENODEV; | 2428 | return -ENODEV; |
2373 | 2429 | ||
2374 | if (cft->read) | 2430 | if (cft->read) |
@@ -2435,10 +2491,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file) | |||
2435 | cft = __d_cft(file->f_dentry); | 2491 | cft = __d_cft(file->f_dentry); |
2436 | 2492 | ||
2437 | if (cft->read_map || cft->read_seq_string) { | 2493 | if (cft->read_map || cft->read_seq_string) { |
2438 | struct cgroup_seqfile_state *state = | 2494 | struct cgroup_seqfile_state *state; |
2439 | kzalloc(sizeof(*state), GFP_USER); | 2495 | |
2496 | state = kzalloc(sizeof(*state), GFP_USER); | ||
2440 | if (!state) | 2497 | if (!state) |
2441 | return -ENOMEM; | 2498 | return -ENOMEM; |
2499 | |||
2442 | state->cft = cft; | 2500 | state->cft = cft; |
2443 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); | 2501 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); |
2444 | file->f_op = &cgroup_seqfile_operations; | 2502 | file->f_op = &cgroup_seqfile_operations; |
@@ -2486,6 +2544,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2486 | 2544 | ||
2487 | cgrp = __d_cgrp(old_dentry); | 2545 | cgrp = __d_cgrp(old_dentry); |
2488 | 2546 | ||
2547 | /* | ||
2548 | * This isn't a proper migration and its usefulness is very | ||
2549 | * limited. Disallow if sane_behavior. | ||
2550 | */ | ||
2551 | if (cgroup_sane_behavior(cgrp)) | ||
2552 | return -EPERM; | ||
2553 | |||
2489 | name = cgroup_alloc_name(new_dentry); | 2554 | name = cgroup_alloc_name(new_dentry); |
2490 | if (!name) | 2555 | if (!name) |
2491 | return -ENOMEM; | 2556 | return -ENOMEM; |
@@ -2496,7 +2561,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2496 | return ret; | 2561 | return ret; |
2497 | } | 2562 | } |
2498 | 2563 | ||
2499 | old_name = cgrp->name; | 2564 | old_name = rcu_dereference_protected(cgrp->name, true); |
2500 | rcu_assign_pointer(cgrp->name, name); | 2565 | rcu_assign_pointer(cgrp->name, name); |
2501 | 2566 | ||
2502 | kfree_rcu(old_name, rcu_head); | 2567 | kfree_rcu(old_name, rcu_head); |
@@ -2747,58 +2812,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2747 | return ret; | 2812 | return ret; |
2748 | } | 2813 | } |
2749 | 2814 | ||
2750 | static DEFINE_MUTEX(cgroup_cft_mutex); | ||
2751 | |||
2752 | static void cgroup_cfts_prepare(void) | 2815 | static void cgroup_cfts_prepare(void) |
2753 | __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) | 2816 | __acquires(&cgroup_mutex) |
2754 | { | 2817 | { |
2755 | /* | 2818 | /* |
2756 | * Thanks to the entanglement with vfs inode locking, we can't walk | 2819 | * Thanks to the entanglement with vfs inode locking, we can't walk |
2757 | * the existing cgroups under cgroup_mutex and create files. | 2820 | * the existing cgroups under cgroup_mutex and create files. |
2758 | * Instead, we increment reference on all cgroups and build list of | 2821 | * Instead, we use cgroup_for_each_descendant_pre() and drop RCU |
2759 | * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure | 2822 | * read lock before calling cgroup_addrm_files(). |
2760 | * exclusive access to the field. | ||
2761 | */ | 2823 | */ |
2762 | mutex_lock(&cgroup_cft_mutex); | ||
2763 | mutex_lock(&cgroup_mutex); | 2824 | mutex_lock(&cgroup_mutex); |
2764 | } | 2825 | } |
2765 | 2826 | ||
2766 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | 2827 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, |
2767 | struct cftype *cfts, bool is_add) | 2828 | struct cftype *cfts, bool is_add) |
2768 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | 2829 | __releases(&cgroup_mutex) |
2769 | { | 2830 | { |
2770 | LIST_HEAD(pending); | 2831 | LIST_HEAD(pending); |
2771 | struct cgroup *cgrp, *n; | 2832 | struct cgroup *cgrp, *root = &ss->root->top_cgroup; |
2833 | struct super_block *sb = ss->root->sb; | ||
2834 | struct dentry *prev = NULL; | ||
2835 | struct inode *inode; | ||
2836 | u64 update_before; | ||
2772 | 2837 | ||
2773 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | 2838 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ |
2774 | if (cfts && ss->root != &rootnode) { | 2839 | if (!cfts || ss->root == &cgroup_dummy_root || |
2775 | list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { | 2840 | !atomic_inc_not_zero(&sb->s_active)) { |
2776 | dget(cgrp->dentry); | 2841 | mutex_unlock(&cgroup_mutex); |
2777 | list_add_tail(&cgrp->cft_q_node, &pending); | 2842 | return; |
2778 | } | ||
2779 | } | 2843 | } |
2780 | 2844 | ||
2781 | mutex_unlock(&cgroup_mutex); | ||
2782 | |||
2783 | /* | 2845 | /* |
2784 | * All new cgroups will see @cfts update on @ss->cftsets. Add/rm | 2846 | * All cgroups which are created after we drop cgroup_mutex will |
2785 | * files for all cgroups which were created before. | 2847 | * have the updated set of files, so we only need to update the |
2848 | * cgroups created before the current @cgroup_serial_nr_next. | ||
2786 | */ | 2849 | */ |
2787 | list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { | 2850 | update_before = cgroup_serial_nr_next; |
2788 | struct inode *inode = cgrp->dentry->d_inode; | 2851 | |
2852 | mutex_unlock(&cgroup_mutex); | ||
2853 | |||
2854 | /* @root always needs to be updated */ | ||
2855 | inode = root->dentry->d_inode; | ||
2856 | mutex_lock(&inode->i_mutex); | ||
2857 | mutex_lock(&cgroup_mutex); | ||
2858 | cgroup_addrm_files(root, ss, cfts, is_add); | ||
2859 | mutex_unlock(&cgroup_mutex); | ||
2860 | mutex_unlock(&inode->i_mutex); | ||
2861 | |||
2862 | /* add/rm files for all cgroups created before */ | ||
2863 | rcu_read_lock(); | ||
2864 | cgroup_for_each_descendant_pre(cgrp, root) { | ||
2865 | if (cgroup_is_dead(cgrp)) | ||
2866 | continue; | ||
2867 | |||
2868 | inode = cgrp->dentry->d_inode; | ||
2869 | dget(cgrp->dentry); | ||
2870 | rcu_read_unlock(); | ||
2871 | |||
2872 | dput(prev); | ||
2873 | prev = cgrp->dentry; | ||
2789 | 2874 | ||
2790 | mutex_lock(&inode->i_mutex); | 2875 | mutex_lock(&inode->i_mutex); |
2791 | mutex_lock(&cgroup_mutex); | 2876 | mutex_lock(&cgroup_mutex); |
2792 | if (!cgroup_is_removed(cgrp)) | 2877 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) |
2793 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | 2878 | cgroup_addrm_files(cgrp, ss, cfts, is_add); |
2794 | mutex_unlock(&cgroup_mutex); | 2879 | mutex_unlock(&cgroup_mutex); |
2795 | mutex_unlock(&inode->i_mutex); | 2880 | mutex_unlock(&inode->i_mutex); |
2796 | 2881 | ||
2797 | list_del_init(&cgrp->cft_q_node); | 2882 | rcu_read_lock(); |
2798 | dput(cgrp->dentry); | ||
2799 | } | 2883 | } |
2800 | 2884 | rcu_read_unlock(); | |
2801 | mutex_unlock(&cgroup_cft_mutex); | 2885 | dput(prev); |
2886 | deactivate_super(sb); | ||
2802 | } | 2887 | } |
2803 | 2888 | ||
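Stripped to its skeleton, the commit path above alternates between the RCU walk and sleeping locks like this (a condensed restatement; error handling, the root update, and the prev-dentry bookkeeping are elided):

	rcu_read_lock();
	cgroup_for_each_descendant_pre(cgrp, root) {
		if (cgroup_is_dead(cgrp))
			continue;

		dget(cgrp->dentry);	/* pin @cgrp so RCU can be dropped */
		rcu_read_unlock();

		/* sleeping locks are legal here; under cgroup_mutex, recheck
		 * that @cgrp predates update_before and is still alive */

		rcu_read_lock();	/* resume the walk from the pinned node */
	}
	rcu_read_unlock();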
2804 | /** | 2889 | /** |
@@ -2853,7 +2938,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
2853 | 2938 | ||
2854 | list_for_each_entry(set, &ss->cftsets, node) { | 2939 | list_for_each_entry(set, &ss->cftsets, node) { |
2855 | if (set->cfts == cfts) { | 2940 | if (set->cfts == cfts) { |
2856 | list_del_init(&set->node); | 2941 | list_del(&set->node); |
2942 | kfree(set); | ||
2857 | cgroup_cfts_commit(ss, cfts, false); | 2943 | cgroup_cfts_commit(ss, cfts, false); |
2858 | return 0; | 2944 | return 0; |
2859 | } | 2945 | } |
@@ -2872,12 +2958,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
2872 | int cgroup_task_count(const struct cgroup *cgrp) | 2958 | int cgroup_task_count(const struct cgroup *cgrp) |
2873 | { | 2959 | { |
2874 | int count = 0; | 2960 | int count = 0; |
2875 | struct cg_cgroup_link *link; | 2961 | struct cgrp_cset_link *link; |
2876 | 2962 | ||
2877 | read_lock(&css_set_lock); | 2963 | read_lock(&css_set_lock); |
2878 | list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { | 2964 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
2879 | count += atomic_read(&link->cg->refcount); | 2965 | count += atomic_read(&link->cset->refcount); |
2880 | } | ||
2881 | read_unlock(&css_set_lock); | 2966 | read_unlock(&css_set_lock); |
2882 | return count; | 2967 | return count; |
2883 | } | 2968 | } |
@@ -2886,25 +2971,24 @@ int cgroup_task_count(const struct cgroup *cgrp) | |||
2886 | * Advance a list_head iterator. The iterator should be positioned at | 2971 | * Advance a list_head iterator. The iterator should be positioned at |
2887 | * the start of a css_set | 2972 | * the start of a css_set |
2888 | */ | 2973 | */ |
2889 | static void cgroup_advance_iter(struct cgroup *cgrp, | 2974 | static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) |
2890 | struct cgroup_iter *it) | ||
2891 | { | 2975 | { |
2892 | struct list_head *l = it->cg_link; | 2976 | struct list_head *l = it->cset_link; |
2893 | struct cg_cgroup_link *link; | 2977 | struct cgrp_cset_link *link; |
2894 | struct css_set *cg; | 2978 | struct css_set *cset; |
2895 | 2979 | ||
2896 | /* Advance to the next non-empty css_set */ | 2980 | /* Advance to the next non-empty css_set */ |
2897 | do { | 2981 | do { |
2898 | l = l->next; | 2982 | l = l->next; |
2899 | if (l == &cgrp->css_sets) { | 2983 | if (l == &cgrp->cset_links) { |
2900 | it->cg_link = NULL; | 2984 | it->cset_link = NULL; |
2901 | return; | 2985 | return; |
2902 | } | 2986 | } |
2903 | link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); | 2987 | link = list_entry(l, struct cgrp_cset_link, cset_link); |
2904 | cg = link->cg; | 2988 | cset = link->cset; |
2905 | } while (list_empty(&cg->tasks)); | 2989 | } while (list_empty(&cset->tasks)); |
2906 | it->cg_link = l; | 2990 | it->cset_link = l; |
2907 | it->task = cg->tasks.next; | 2991 | it->task = cset->tasks.next; |
2908 | } | 2992 | } |
2909 | 2993 | ||
2910 | /* | 2994 | /* |
@@ -2934,7 +3018,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
2934 | * entry won't be deleted though the process has exited. | 3018 | * entry won't be deleted though the process has exited. |
2935 | */ | 3019 | */ |
2936 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) | 3020 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) |
2937 | list_add(&p->cg_list, &p->cgroups->tasks); | 3021 | list_add(&p->cg_list, &task_css_set(p)->tasks); |
2938 | task_unlock(p); | 3022 | task_unlock(p); |
2939 | } while_each_thread(g, p); | 3023 | } while_each_thread(g, p); |
2940 | read_unlock(&tasklist_lock); | 3024 | read_unlock(&tasklist_lock); |
@@ -2942,12 +3026,67 @@ static void cgroup_enable_task_cg_lists(void) | |||
2942 | } | 3026 | } |
2943 | 3027 | ||
2944 | /** | 3028 | /** |
3029 | * cgroup_next_sibling - find the next sibling of a given cgroup | ||
3030 | * @pos: the current cgroup | ||
3031 | * | ||
3032 | * This function returns the next sibling of @pos and should be called | ||
3033 | * under RCU read lock. The only requirement is that @pos is accessible. | ||
3034 | * The next sibling is guaranteed to be returned regardless of @pos's | ||
3035 | * state. | ||
3036 | */ | ||
3037 | struct cgroup *cgroup_next_sibling(struct cgroup *pos) | ||
3038 | { | ||
3039 | struct cgroup *next; | ||
3040 | |||
3041 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
3042 | |||
3043 | /* | ||
3044 | * @pos could already have been removed. Once a cgroup is removed, | ||
3045 | * its ->sibling.next is no longer updated when its next sibling | ||
3046 | * changes. As CGRP_DEAD assertion is serialized and happens | ||
3047 | * before the cgroup is taken off the ->sibling list, if we see it | ||
3048 | * unasserted, it's guaranteed that the next sibling hasn't | ||
3049 | * finished its grace period even if it's already removed, and thus | ||
3050 | * safe to dereference from this RCU critical section. If | ||
3051 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | ||
3052 | * to be visible as %true here. | ||
3053 | */ | ||
3054 | if (likely(!cgroup_is_dead(pos))) { | ||
3055 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | ||
3056 | if (&next->sibling != &pos->parent->children) | ||
3057 | return next; | ||
3058 | return NULL; | ||
3059 | } | ||
3060 | |||
3061 | /* | ||
3062 | * Can't dereference the next pointer. Each cgroup is given a | ||
3063 | * monotonically increasing unique serial number and always | ||
3064 | * appended to the sibling list, so the next one can be found by | ||
3065 | * walking the parent's children until we see a cgroup with higher | ||
3066 | * serial number than @pos's. | ||
3067 | * | ||
3068 | * While this path can be slow, it's taken only when either the | ||
3069 | * current cgroup is removed or iteration and removal race. | ||
3070 | */ | ||
3071 | list_for_each_entry_rcu(next, &pos->parent->children, sibling) | ||
3072 | if (next->serial_nr > pos->serial_nr) | ||
3073 | return next; | ||
3074 | return NULL; | ||
3075 | } | ||
3076 | EXPORT_SYMBOL_GPL(cgroup_next_sibling); | ||
3077 | |||
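For illustration, a removal-tolerant walk of one level built on the new helper (a sketch; iterate_children() is hypothetical):

	static void iterate_children(struct cgroup *parent)
	{
		struct cgroup *pos;

		rcu_read_lock();
		pos = list_entry_rcu(parent->children.next,
				     struct cgroup, sibling);
		if (&pos->sibling == &parent->children)
			pos = NULL;	/* no children */
		while (pos) {
			/* ... inspect pos under RCU ... */
			pos = cgroup_next_sibling(pos);
		}
		rcu_read_unlock();
	}

Even if the cgroup at pos is removed between iterations, the serial number fallback finds the next live sibling instead of walking off a stale pointer.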
3078 | /** | ||
2945 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | 3079 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk |
2946 | * @pos: the current position (%NULL to initiate traversal) | 3080 | * @pos: the current position (%NULL to initiate traversal) |
2947 | * @cgroup: cgroup whose descendants to walk | 3081 | * @cgroup: cgroup whose descendants to walk |
2948 | * | 3082 | * |
2949 | * To be used by cgroup_for_each_descendant_pre(). Find the next | 3083 | * To be used by cgroup_for_each_descendant_pre(). Find the next |
2950 | * descendant to visit for pre-order traversal of @cgroup's descendants. | 3084 | * descendant to visit for pre-order traversal of @cgroup's descendants. |
3085 | * | ||
3086 | * While this function requires RCU read locking, it doesn't require the | ||
3087 | * whole traversal to be contained in a single RCU critical section. This | ||
3088 | * function will return the correct next descendant as long as both @pos | ||
3089 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | ||
2951 | */ | 3090 | */ |
2952 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | 3091 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, |
2953 | struct cgroup *cgroup) | 3092 | struct cgroup *cgroup) |
@@ -2967,11 +3106,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | |||
2967 | 3106 | ||
2968 | /* no child, visit my or the closest ancestor's next sibling */ | 3107 | /* no child, visit my or the closest ancestor's next sibling */ |
2969 | while (pos != cgroup) { | 3108 | while (pos != cgroup) { |
2970 | next = list_entry_rcu(pos->sibling.next, struct cgroup, | 3109 | next = cgroup_next_sibling(pos); |
2971 | sibling); | 3110 | if (next) |
2972 | if (&next->sibling != &pos->parent->children) | ||
2973 | return next; | 3111 | return next; |
2974 | |||
2975 | pos = pos->parent; | 3112 | pos = pos->parent; |
2976 | } | 3113 | } |
2977 | 3114 | ||
@@ -2986,6 +3123,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | |||
2986 | * Return the rightmost descendant of @pos. If there's no descendant, | 3123 | * Return the rightmost descendant of @pos. If there's no descendant, |
2987 | * @pos is returned. This can be used during pre-order traversal to skip | 3124 | * @pos is returned. This can be used during pre-order traversal to skip |
2988 | * subtree of @pos. | 3125 | * subtree of @pos. |
3126 | * | ||
3127 | * While this function requires RCU read locking, it doesn't require the | ||
3128 | * whole traversal to be contained in a single RCU critical section. This | ||
3129 | * function will return the correct rightmost descendant as long as @pos is | ||
3130 | * accessible. | ||
2989 | */ | 3131 | */ |
2990 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | 3132 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) |
2991 | { | 3133 | { |
@@ -3025,6 +3167,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | |||
3025 | * | 3167 | * |
3026 | * To be used by cgroup_for_each_descendant_post(). Find the next | 3168 | * To be used by cgroup_for_each_descendant_post(). Find the next |
3027 | * descendant to visit for post-order traversal of @cgroup's descendants. | 3169 | * descendant to visit for post-order traversal of @cgroup's descendants. |
3170 | * | ||
3171 | * While this function requires RCU read locking, it doesn't require the | ||
3172 | * whole traversal to be contained in a single RCU critical section. This | ||
3173 | * function will return the correct next descendant as long as both @pos | ||
3174 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | ||
3028 | */ | 3175 | */ |
3029 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | 3176 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, |
3030 | struct cgroup *cgroup) | 3177 | struct cgroup *cgroup) |
@@ -3040,8 +3187,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | |||
3040 | } | 3187 | } |
3041 | 3188 | ||
3042 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3189 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
3043 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3190 | next = cgroup_next_sibling(pos); |
3044 | if (&next->sibling != &pos->parent->children) | 3191 | if (next) |
3045 | return cgroup_leftmost_descendant(next); | 3192 | return cgroup_leftmost_descendant(next); |
3046 | 3193 | ||
3047 | /* no sibling left, visit parent */ | 3194 | /* no sibling left, visit parent */ |
@@ -3062,7 +3209,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | |||
3062 | cgroup_enable_task_cg_lists(); | 3209 | cgroup_enable_task_cg_lists(); |
3063 | 3210 | ||
3064 | read_lock(&css_set_lock); | 3211 | read_lock(&css_set_lock); |
3065 | it->cg_link = &cgrp->css_sets; | 3212 | it->cset_link = &cgrp->cset_links; |
3066 | cgroup_advance_iter(cgrp, it); | 3213 | cgroup_advance_iter(cgrp, it); |
3067 | } | 3214 | } |
3068 | 3215 | ||
@@ -3071,16 +3218,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
3071 | { | 3218 | { |
3072 | struct task_struct *res; | 3219 | struct task_struct *res; |
3073 | struct list_head *l = it->task; | 3220 | struct list_head *l = it->task; |
3074 | struct cg_cgroup_link *link; | 3221 | struct cgrp_cset_link *link; |
3075 | 3222 | ||
3076 | /* If the iterator cg is NULL, we have no tasks */ | 3223 | /* If the iterator cg is NULL, we have no tasks */ |
3077 | if (!it->cg_link) | 3224 | if (!it->cset_link) |
3078 | return NULL; | 3225 | return NULL; |
3079 | res = list_entry(l, struct task_struct, cg_list); | 3226 | res = list_entry(l, struct task_struct, cg_list); |
3080 | /* Advance iterator to find next entry */ | 3227 | /* Advance iterator to find next entry */ |
3081 | l = l->next; | 3228 | l = l->next; |
3082 | link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); | 3229 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); |
3083 | if (l == &link->cg->tasks) { | 3230 | if (l == &link->cset->tasks) { |
3084 | /* We reached the end of this task list - move on to | 3231 | /* We reached the end of this task list - move on to |
3085 | * the next cg_cgroup_link */ | 3232 | * the next cg_cgroup_link */ |
3086 | cgroup_advance_iter(cgrp, it); | 3233 | cgroup_advance_iter(cgrp, it); |
@@ -3411,7 +3558,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3411 | } | 3558 | } |
3412 | } | 3559 | } |
3413 | /* entry not found; create a new one */ | 3560 | /* entry not found; create a new one */ |
3414 | l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); | 3561 | l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); |
3415 | if (!l) { | 3562 | if (!l) { |
3416 | mutex_unlock(&cgrp->pidlist_mutex); | 3563 | mutex_unlock(&cgrp->pidlist_mutex); |
3417 | return l; | 3564 | return l; |
@@ -3420,8 +3567,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3420 | down_write(&l->mutex); | 3567 | down_write(&l->mutex); |
3421 | l->key.type = type; | 3568 | l->key.type = type; |
3422 | l->key.ns = get_pid_ns(ns); | 3569 | l->key.ns = get_pid_ns(ns); |
3423 | l->use_count = 0; /* don't increment here */ | ||
3424 | l->list = NULL; | ||
3425 | l->owner = cgrp; | 3570 | l->owner = cgrp; |
3426 | list_add(&l->links, &cgrp->pidlists); | 3571 | list_add(&l->links, &cgrp->pidlists); |
3427 | mutex_unlock(&cgrp->pidlist_mutex); | 3572 | mutex_unlock(&cgrp->pidlist_mutex); |
@@ -3727,6 +3872,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, | |||
3727 | } | 3872 | } |
3728 | 3873 | ||
3729 | /* | 3874 | /* |
3875 | * When dput() is called asynchronously, if umount has been done and | ||
3876 | * then deactivate_super() in cgroup_free_fn() kills the superblock, | ||
3877 | * there's a small window in which vfs will see the root dentry with a | ||
3878 | * non-zero refcnt and trigger BUG(). | ||
3879 | * | ||
3880 | * That's why we hold a reference before dput() and drop it right after. | ||
3881 | */ | ||
3882 | static void cgroup_dput(struct cgroup *cgrp) | ||
3883 | { | ||
3884 | struct super_block *sb = cgrp->root->sb; | ||
3885 | |||
3886 | atomic_inc(&sb->s_active); | ||
3887 | dput(cgrp->dentry); | ||
3888 | deactivate_super(sb); | ||
3889 | } | ||
3890 | |||
3891 | /* | ||
3730 | * Unregister event and free resources. | 3892 | * Unregister event and free resources. |
3731 | * | 3893 | * |
3732 | * Gets called from workqueue. | 3894 | * Gets called from workqueue. |
@@ -3746,7 +3908,7 @@ static void cgroup_event_remove(struct work_struct *work) | |||
3746 | 3908 | ||
3747 | eventfd_ctx_put(event->eventfd); | 3909 | eventfd_ctx_put(event->eventfd); |
3748 | kfree(event); | 3910 | kfree(event); |
3749 | dput(cgrp->dentry); | 3911 | cgroup_dput(cgrp); |
3750 | } | 3912 | } |
3751 | 3913 | ||
3752 | /* | 3914 | /* |
@@ -3933,33 +4095,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, | |||
3933 | return 0; | 4095 | return 0; |
3934 | } | 4096 | } |
3935 | 4097 | ||
3936 | /* | 4098 | static struct cftype cgroup_base_files[] = { |
3937 | * for the common functions, 'private' gives the type of file | ||
3938 | */ | ||
3939 | /* for hysterical raisins, we can't put this on the older files */ | ||
3940 | #define CGROUP_FILE_GENERIC_PREFIX "cgroup." | ||
3941 | static struct cftype files[] = { | ||
3942 | { | ||
3943 | .name = "tasks", | ||
3944 | .open = cgroup_tasks_open, | ||
3945 | .write_u64 = cgroup_tasks_write, | ||
3946 | .release = cgroup_pidlist_release, | ||
3947 | .mode = S_IRUGO | S_IWUSR, | ||
3948 | }, | ||
3949 | { | 4099 | { |
3950 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", | 4100 | .name = "cgroup.procs", |
3951 | .open = cgroup_procs_open, | 4101 | .open = cgroup_procs_open, |
3952 | .write_u64 = cgroup_procs_write, | 4102 | .write_u64 = cgroup_procs_write, |
3953 | .release = cgroup_pidlist_release, | 4103 | .release = cgroup_pidlist_release, |
3954 | .mode = S_IRUGO | S_IWUSR, | 4104 | .mode = S_IRUGO | S_IWUSR, |
3955 | }, | 4105 | }, |
3956 | { | 4106 | { |
3957 | .name = "notify_on_release", | 4107 | .name = "cgroup.event_control", |
3958 | .read_u64 = cgroup_read_notify_on_release, | ||
3959 | .write_u64 = cgroup_write_notify_on_release, | ||
3960 | }, | ||
3961 | { | ||
3962 | .name = CGROUP_FILE_GENERIC_PREFIX "event_control", | ||
3963 | .write_string = cgroup_write_event_control, | 4108 | .write_string = cgroup_write_event_control, |
3964 | .mode = S_IWUGO, | 4109 | .mode = S_IWUGO, |
3965 | }, | 4110 | }, |
@@ -3974,9 +4119,29 @@ static struct cftype files[] = { | |||
3974 | .flags = CFTYPE_ONLY_ON_ROOT, | 4119 | .flags = CFTYPE_ONLY_ON_ROOT, |
3975 | .read_seq_string = cgroup_sane_behavior_show, | 4120 | .read_seq_string = cgroup_sane_behavior_show, |
3976 | }, | 4121 | }, |
4122 | |||
4123 | /* | ||
4124 | * Historical crazy stuff. These don't have "cgroup." prefix and | ||
4125 | * don't exist if sane_behavior. If you're depending on these, be | ||
4126 | * prepared to be burned. | ||
4127 | */ | ||
4128 | { | ||
4129 | .name = "tasks", | ||
4130 | .flags = CFTYPE_INSANE, /* use "procs" instead */ | ||
4131 | .open = cgroup_tasks_open, | ||
4132 | .write_u64 = cgroup_tasks_write, | ||
4133 | .release = cgroup_pidlist_release, | ||
4134 | .mode = S_IRUGO | S_IWUSR, | ||
4135 | }, | ||
4136 | { | ||
4137 | .name = "notify_on_release", | ||
4138 | .flags = CFTYPE_INSANE, | ||
4139 | .read_u64 = cgroup_read_notify_on_release, | ||
4140 | .write_u64 = cgroup_write_notify_on_release, | ||
4141 | }, | ||
3977 | { | 4142 | { |
3978 | .name = "release_agent", | 4143 | .name = "release_agent", |
3979 | .flags = CFTYPE_ONLY_ON_ROOT, | 4144 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, |
3980 | .read_seq_string = cgroup_release_agent_show, | 4145 | .read_seq_string = cgroup_release_agent_show, |
3981 | .write_string = cgroup_release_agent_write, | 4146 | .write_string = cgroup_release_agent_write, |
3982 | .max_write_len = PATH_MAX, | 4147 | .max_write_len = PATH_MAX, |
@@ -3997,13 +4162,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | |||
3997 | struct cgroup_subsys *ss; | 4162 | struct cgroup_subsys *ss; |
3998 | 4163 | ||
3999 | if (base_files) { | 4164 | if (base_files) { |
4000 | err = cgroup_addrm_files(cgrp, NULL, files, true); | 4165 | err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); |
4001 | if (err < 0) | 4166 | if (err < 0) |
4002 | return err; | 4167 | return err; |
4003 | } | 4168 | } |
4004 | 4169 | ||
4005 | /* process cftsets of each subsystem */ | 4170 | /* process cftsets of each subsystem */ |
4006 | for_each_subsys(cgrp->root, ss) { | 4171 | for_each_root_subsys(cgrp->root, ss) { |
4007 | struct cftype_set *set; | 4172 | struct cftype_set *set; |
4008 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 4173 | if (!test_bit(ss->subsys_id, &subsys_mask)) |
4009 | continue; | 4174 | continue; |
@@ -4013,15 +4178,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | |||
4013 | } | 4178 | } |
4014 | 4179 | ||
4015 | /* This cgroup is ready now */ | 4180 | /* This cgroup is ready now */ |
4016 | for_each_subsys(cgrp->root, ss) { | 4181 | for_each_root_subsys(cgrp->root, ss) { |
4017 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4182 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4183 | struct css_id *id = rcu_dereference_protected(css->id, true); | ||
4184 | |||
4018 | /* | 4185 | /* |
4019 | * Update id->css pointer and make this css visible from | 4186 | * Update id->css pointer and make this css visible from |
4020 | * CSS ID functions. This pointer will be dereferenced | 4187 | * CSS ID functions. This pointer will be dereferenced
4021 | * from RCU-read-side without locks. | 4188 | * from RCU-read-side without locks. |
4022 | */ | 4189 | */ |
4023 | if (css->id) | 4190 | if (id) |
4024 | rcu_assign_pointer(css->id->css, css); | 4191 | rcu_assign_pointer(id->css, css); |
4025 | } | 4192 | } |
4026 | 4193 | ||
4027 | return 0; | 4194 | return 0; |
@@ -4031,12 +4198,16 @@ static void css_dput_fn(struct work_struct *work) | |||
4031 | { | 4198 | { |
4032 | struct cgroup_subsys_state *css = | 4199 | struct cgroup_subsys_state *css = |
4033 | container_of(work, struct cgroup_subsys_state, dput_work); | 4200 | container_of(work, struct cgroup_subsys_state, dput_work); |
4034 | struct dentry *dentry = css->cgroup->dentry; | ||
4035 | struct super_block *sb = dentry->d_sb; | ||
4036 | 4201 | ||
4037 | atomic_inc(&sb->s_active); | 4202 | cgroup_dput(css->cgroup); |
4038 | dput(dentry); | 4203 | } |
4039 | deactivate_super(sb); | 4204 | |
4205 | static void css_release(struct percpu_ref *ref) | ||
4206 | { | ||
4207 | struct cgroup_subsys_state *css = | ||
4208 | container_of(ref, struct cgroup_subsys_state, refcnt); | ||
4209 | |||
4210 | schedule_work(&css->dput_work); | ||
4040 | } | 4211 | } |
4041 | 4212 | ||
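css_release() runs from the percpu-ref machinery in atomic context, so it only schedules dput_work instead of calling cgroup_dput() directly. The same deferral pattern in isolation (struct obj is hypothetical):

	struct obj {
		struct percpu_ref	refcnt;
		struct work_struct	release_work;	/* does the heavy dput */
	};

	static void obj_release(struct percpu_ref *ref)
	{
		struct obj *o = container_of(ref, struct obj, refcnt);

		schedule_work(&o->release_work);	/* punt to process context */
	}

	/* paired with percpu_ref_init(&o->refcnt, obj_release) at creation */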
4042 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 4213 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
@@ -4044,10 +4215,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
4044 | struct cgroup *cgrp) | 4215 | struct cgroup *cgrp) |
4045 | { | 4216 | { |
4046 | css->cgroup = cgrp; | 4217 | css->cgroup = cgrp; |
4047 | atomic_set(&css->refcnt, 1); | ||
4048 | css->flags = 0; | 4218 | css->flags = 0; |
4049 | css->id = NULL; | 4219 | css->id = NULL; |
4050 | if (cgrp == dummytop) | 4220 | if (cgrp == cgroup_dummy_top) |
4051 | css->flags |= CSS_ROOT; | 4221 | css->flags |= CSS_ROOT; |
4052 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 4222 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
4053 | cgrp->subsys[ss->subsys_id] = css; | 4223 | cgrp->subsys[ss->subsys_id] = css; |
@@ -4157,7 +4327,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4157 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) | 4327 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) |
4158 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4328 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
4159 | 4329 | ||
4160 | for_each_subsys(root, ss) { | 4330 | for_each_root_subsys(root, ss) { |
4161 | struct cgroup_subsys_state *css; | 4331 | struct cgroup_subsys_state *css; |
4162 | 4332 | ||
4163 | css = ss->css_alloc(cgrp); | 4333 | css = ss->css_alloc(cgrp); |
@@ -4165,7 +4335,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4165 | err = PTR_ERR(css); | 4335 | err = PTR_ERR(css); |
4166 | goto err_free_all; | 4336 | goto err_free_all; |
4167 | } | 4337 | } |
4338 | |||
4339 | err = percpu_ref_init(&css->refcnt, css_release); | ||
4340 | if (err) | ||
4341 | goto err_free_all; | ||
4342 | |||
4168 | init_cgroup_css(css, ss, cgrp); | 4343 | init_cgroup_css(css, ss, cgrp); |
4344 | |||
4169 | if (ss->use_id) { | 4345 | if (ss->use_id) { |
4170 | err = alloc_css_id(ss, parent, cgrp); | 4346 | err = alloc_css_id(ss, parent, cgrp); |
4171 | if (err) | 4347 | if (err) |
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4183 | goto err_free_all; | 4359 | goto err_free_all; |
4184 | lockdep_assert_held(&dentry->d_inode->i_mutex); | 4360 | lockdep_assert_held(&dentry->d_inode->i_mutex); |
4185 | 4361 | ||
4362 | cgrp->serial_nr = cgroup_serial_nr_next++; | ||
4363 | |||
4186 | /* allocation complete, commit to creation */ | 4364 | /* allocation complete, commit to creation */ |
4187 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4188 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4365 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4189 | root->number_of_cgroups++; | 4366 | root->number_of_cgroups++; |
4190 | 4367 | ||
4191 | /* each css holds a ref to the cgroup's dentry */ | 4368 | /* each css holds a ref to the cgroup's dentry */ |
4192 | for_each_subsys(root, ss) | 4369 | for_each_root_subsys(root, ss) |
4193 | dget(dentry); | 4370 | dget(dentry); |
4194 | 4371 | ||
4195 | /* hold a ref to the parent's dentry */ | 4372 | /* hold a ref to the parent's dentry */ |
4196 | dget(parent->dentry); | 4373 | dget(parent->dentry); |
4197 | 4374 | ||
4198 | /* creation succeeded, notify subsystems */ | 4375 | /* creation succeeded, notify subsystems */ |
4199 | for_each_subsys(root, ss) { | 4376 | for_each_root_subsys(root, ss) { |
4200 | err = online_css(ss, cgrp); | 4377 | err = online_css(ss, cgrp); |
4201 | if (err) | 4378 | if (err) |
4202 | goto err_destroy; | 4379 | goto err_destroy; |
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4221 | return 0; | 4398 | return 0; |
4222 | 4399 | ||
4223 | err_free_all: | 4400 | err_free_all: |
4224 | for_each_subsys(root, ss) { | 4401 | for_each_root_subsys(root, ss) { |
4225 | if (cgrp->subsys[ss->subsys_id]) | 4402 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4403 | |||
4404 | if (css) { | ||
4405 | percpu_ref_cancel_init(&css->refcnt); | ||
4226 | ss->css_free(cgrp); | 4406 | ss->css_free(cgrp); |
4407 | } | ||
4227 | } | 4408 | } |
4228 | mutex_unlock(&cgroup_mutex); | 4409 | mutex_unlock(&cgroup_mutex); |
4229 | /* Release the reference count that we took on the superblock */ | 4410 | /* Release the reference count that we took on the superblock */ |
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
4251 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4432 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
4252 | } | 4433 | } |
4253 | 4434 | ||
4435 | static void cgroup_css_killed(struct cgroup *cgrp) | ||
4436 | { | ||
4437 | if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) | ||
4438 | return; | ||
4439 | |||
4440 | /* percpu ref's of all css's are killed, kick off the next step */ | ||
4441 | INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); | ||
4442 | schedule_work(&cgrp->destroy_work); | ||
4443 | } | ||
4444 | |||
4445 | static void css_ref_killed_fn(struct percpu_ref *ref) | ||
4446 | { | ||
4447 | struct cgroup_subsys_state *css = | ||
4448 | container_of(ref, struct cgroup_subsys_state, refcnt); | ||
4449 | |||
4450 | cgroup_css_killed(css->cgroup); | ||
4451 | } | ||
4452 | |||
4453 | /** | ||
4454 | * cgroup_destroy_locked - the first stage of cgroup destruction | ||
4455 | * @cgrp: cgroup to be destroyed | ||
4456 | * | ||
4457 | * css's make use of percpu refcnts whose killing latency shouldn't be | ||
4458 | * exposed to userland and which are RCU protected. Also, cgroup core needs to | ||
4459 | * guarantee that css_tryget() won't succeed by the time ->css_offline() is | ||
4460 | * invoked. To satisfy all the requirements, destruction is implemented in | ||
4461 | * the following two steps. | ||
4462 | * | ||
4463 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all | ||
4464 | * userland visible parts and start killing the percpu refcnts of | ||
4465 | * css's. Set up so that the next stage will be kicked off once all | ||
4466 | * the percpu refcnts are confirmed to be killed. | ||
4467 | * | ||
4468 | * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the | ||
4469 | * rest of destruction. Once all cgroup references are gone, the | ||
4470 | * cgroup is RCU-freed. | ||
4471 | * | ||
4472 | * This function implements s1. After this step, @cgrp is gone as far as | ||
4473 | * the userland is concerned and a new cgroup with the same name may be | ||
4474 | * created. As cgroup doesn't care about the names internally, this | ||
4475 | * doesn't cause any problem. | ||
4476 | */ | ||
4254 | static int cgroup_destroy_locked(struct cgroup *cgrp) | 4477 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
4255 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4478 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4256 | { | 4479 | { |
4257 | struct dentry *d = cgrp->dentry; | 4480 | struct dentry *d = cgrp->dentry; |
4258 | struct cgroup *parent = cgrp->parent; | ||
4259 | struct cgroup_event *event, *tmp; | 4481 | struct cgroup_event *event, *tmp; |
4260 | struct cgroup_subsys *ss; | 4482 | struct cgroup_subsys *ss; |
4483 | bool empty; | ||
4261 | 4484 | ||
4262 | lockdep_assert_held(&d->d_inode->i_mutex); | 4485 | lockdep_assert_held(&d->d_inode->i_mutex); |
4263 | lockdep_assert_held(&cgroup_mutex); | 4486 | lockdep_assert_held(&cgroup_mutex); |
4264 | 4487 | ||
4265 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) | 4488 | /* |
4489 | * css_set_lock synchronizes access to ->cset_links and prevents | ||
4490 | * @cgrp from being removed while __put_css_set() is in progress. | ||
4491 | */ | ||
4492 | read_lock(&css_set_lock); | ||
4493 | empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); | ||
4494 | read_unlock(&css_set_lock); | ||
4495 | if (!empty) | ||
4266 | return -EBUSY; | 4496 | return -EBUSY; |
4267 | 4497 | ||
4268 | /* | 4498 | /* |
4269 | * Block new css_tryget() by deactivating refcnt and mark @cgrp | 4499 | * Block new css_tryget() by killing css refcnts. cgroup core |
4270 | * removed. This makes future css_tryget() and child creation | 4500 | * guarantees that, by the time ->css_offline() is invoked, no new |
4271 | * attempts fail thus maintaining the removal conditions verified | 4501 | * css reference will be given out via css_tryget(). We can't |
4272 | * above. | 4502 | * simply call percpu_ref_kill() and proceed to offlining css's |
4503 | * because percpu_ref_kill() doesn't guarantee that the ref is seen | ||
4504 | * as killed on all CPUs on return. | ||
4505 | * | ||
4506 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4507 | * css is confirmed to be seen as killed on all CPUs. The | ||
4508 | * notification callback keeps track of the number of css's to be | ||
4509 | * killed and schedules cgroup_offline_fn() to perform the rest of | ||
4510 | * destruction once the percpu refs of all css's are confirmed to | ||
4511 | * be killed. | ||
4273 | */ | 4512 | */ |
4274 | for_each_subsys(cgrp->root, ss) { | 4513 | atomic_set(&cgrp->css_kill_cnt, 1); |
4514 | for_each_root_subsys(cgrp->root, ss) { | ||
4275 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4515 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4276 | 4516 | ||
4277 | WARN_ON(atomic_read(&css->refcnt) < 0); | 4517 | /* |
4278 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | 4518 | * Killing would put the base ref, but we need to keep it |
4279 | } | 4519 | * alive until after ->css_offline. |
4280 | set_bit(CGRP_REMOVED, &cgrp->flags); | 4520 | */ |
4521 | percpu_ref_get(&css->refcnt); | ||
4281 | 4522 | ||
4282 | /* tell subsystems to initiate destruction */ | 4523 | atomic_inc(&cgrp->css_kill_cnt);
4283 | for_each_subsys(cgrp->root, ss) | 4524 | percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); |
4284 | offline_css(ss, cgrp); | 4525 | } |
4526 | cgroup_css_killed(cgrp); | ||
4285 | 4527 | ||
4286 | /* | 4528 | /* |
4287 | * Put all the base refs. Each css holds an extra reference to the | 4529 | * Mark @cgrp dead. This prevents further task migration and child |
4288 | * cgroup's dentry and cgroup removal proceeds regardless of css | 4530 | * creation by disabling cgroup_lock_live_group(). Note that |
4289 | * refs. On the last put of each css, whenever that may be, the | 4531 | * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to |
4290 | * extra dentry ref is put so that dentry destruction happens only | 4532 | * resume iteration after dropping RCU read lock. See |
4291 | * after all css's are released. | 4533 | * cgroup_next_sibling() for details. |
4292 | */ | 4534 | */ |
4293 | for_each_subsys(cgrp->root, ss) | 4535 | set_bit(CGRP_DEAD, &cgrp->flags); |
4294 | css_put(cgrp->subsys[ss->subsys_id]); | ||
4295 | 4536 | ||
4537 | /* CGRP_DEAD is set, remove from ->release_list for the last time */ | ||
4296 | raw_spin_lock(&release_list_lock); | 4538 | raw_spin_lock(&release_list_lock); |
4297 | if (!list_empty(&cgrp->release_list)) | 4539 | if (!list_empty(&cgrp->release_list)) |
4298 | list_del_init(&cgrp->release_list); | 4540 | list_del_init(&cgrp->release_list); |
4299 | raw_spin_unlock(&release_list_lock); | 4541 | raw_spin_unlock(&release_list_lock); |
4300 | 4542 | ||
4301 | /* delete this cgroup from parent->children */ | 4543 | /* |
4302 | list_del_rcu(&cgrp->sibling); | 4544 | * Remove @cgrp directory. The removal puts the base ref but we |
4303 | list_del_init(&cgrp->allcg_node); | 4545 | * aren't quite done with @cgrp yet, so hold onto it. |
4304 | 4546 | */ | |
4305 | dget(d); | 4547 | dget(d); |
4306 | cgroup_d_remove_dir(d); | 4548 | cgroup_d_remove_dir(d); |
4307 | dput(d); | ||
4308 | |||
4309 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
4310 | check_for_release(parent); | ||
4311 | 4549 | ||
4312 | /* | 4550 | /* |
4313 | * Unregister events and notify userspace. | 4551 | * Unregister events and notify userspace. |
@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4322 | spin_unlock(&cgrp->event_list_lock); | 4560 | spin_unlock(&cgrp->event_list_lock); |
4323 | 4561 | ||
4324 | return 0; | 4562 | return 0; |
4563 | } | ||
4564 | |||
4565 | /** | ||
4566 | * cgroup_offline_fn - the second step of cgroup destruction | ||
4567 | * @work: cgroup->destroy_work | ||
4568 | * | ||
4569 | * This function is invoked from a work item for a cgroup which is being | ||
4570 | * destroyed after the percpu refcnts of all css's are guaranteed to be | ||
4571 | * seen as killed on all CPUs, and performs the rest of destruction. This | ||
4572 | * is the second step of destruction described in the comment above | ||
4573 | * cgroup_destroy_locked(). | ||
4574 | */ | ||
4575 | static void cgroup_offline_fn(struct work_struct *work) | ||
4576 | { | ||
4577 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | ||
4578 | struct cgroup *parent = cgrp->parent; | ||
4579 | struct dentry *d = cgrp->dentry; | ||
4580 | struct cgroup_subsys *ss; | ||
4581 | |||
4582 | mutex_lock(&cgroup_mutex); | ||
4583 | |||
4584 | /* | ||
4585 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
4586 | * initiate destruction. | ||
4587 | */ | ||
4588 | for_each_root_subsys(cgrp->root, ss) | ||
4589 | offline_css(ss, cgrp); | ||
4590 | |||
4591 | /* | ||
4592 | * Put the css refs from cgroup_destroy_locked(). Each css holds | ||
4593 | * an extra reference to the cgroup's dentry and cgroup removal | ||
4594 | * proceeds regardless of css refs. On the last put of each css, | ||
4595 | * whenever that may be, the extra dentry ref is put so that dentry | ||
4596 | * destruction happens only after all css's are released. | ||
4597 | */ | ||
4598 | for_each_root_subsys(cgrp->root, ss) | ||
4599 | css_put(cgrp->subsys[ss->subsys_id]); | ||
4600 | |||
4601 | /* delete this cgroup from parent->children */ | ||
4602 | list_del_rcu(&cgrp->sibling); | ||
4603 | |||
4604 | dput(d); | ||
4605 | |||
4606 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
4607 | check_for_release(parent); | ||
4608 | |||
4609 | mutex_unlock(&cgroup_mutex); | ||
4325 | } | 4610 | } |
4326 | 4611 | ||
4327 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 4612 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
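The split above is the percpu_ref kill/confirm pattern: cgroup_destroy_locked() kills every css ref, and cgroup_offline_fn() runs only after the kills are confirmed on all CPUs. A minimal sketch of the pattern, assuming the 3.10-era percpu-refcount API; my_obj, my_confirm() and my_release() are illustrative names, not part of this patch:

#include <linux/atomic.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct my_obj {
	struct percpu_ref ref;
	atomic_t kill_cnt;		/* mirrors cgrp->css_kill_cnt */
	struct work_struct offline_work;
};

/* runs on the final put, whenever that happens */
static void my_release(struct percpu_ref *ref)
{
	/* safe to free the enclosing object here */
}

/* runs once all CPUs are guaranteed to see the ref as killed */
static void my_confirm(struct percpu_ref *ref)
{
	struct my_obj *obj = container_of(ref, struct my_obj, ref);

	/* tryget can no longer succeed; schedule the offline step */
	if (atomic_dec_and_test(&obj->kill_cnt))
		schedule_work(&obj->offline_work);
}

static void my_destroy(struct my_obj *obj)
{
	/* killing puts the base ref; hold one until offline runs */
	percpu_ref_get(&obj->ref);
	atomic_inc(&obj->kill_cnt);
	percpu_ref_kill_and_confirm(&obj->ref, my_confirm);
}

cgroup does this once per css; the cgroup_css_killed() call after the loop presumably drops a bias on css_kill_cnt so the offline work is scheduled exactly once, after every css has been confirmed killed.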
@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4361 | cgroup_init_cftsets(ss); | 4646 | cgroup_init_cftsets(ss); |
4362 | 4647 | ||
4363 | /* Create the top cgroup state for this subsystem */ | 4648 | /* Create the top cgroup state for this subsystem */ |
4364 | list_add(&ss->sibling, &rootnode.subsys_list); | 4649 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); |
4365 | ss->root = &rootnode; | 4650 | ss->root = &cgroup_dummy_root; |
4366 | css = ss->css_alloc(dummytop); | 4651 | css = ss->css_alloc(cgroup_dummy_top); |
4367 | /* We don't handle early failures gracefully */ | 4652 | /* We don't handle early failures gracefully */ |
4368 | BUG_ON(IS_ERR(css)); | 4653 | BUG_ON(IS_ERR(css)); |
4369 | init_cgroup_css(css, ss, dummytop); | 4654 | init_cgroup_css(css, ss, cgroup_dummy_top); |
4370 | 4655 | ||
4371 | /* Update the init_css_set to contain a subsys | 4656 | /* Update the init_css_set to contain a subsys |
4372 | * pointer to this state - since the subsystem is | 4657 | * pointer to this state - since the subsystem is |
@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4381 | * need to invoke fork callbacks here. */ | 4666 | * need to invoke fork callbacks here. */ |
4382 | BUG_ON(!list_empty(&init_task.tasks)); | 4667 | BUG_ON(!list_empty(&init_task.tasks)); |
4383 | 4668 | ||
4384 | BUG_ON(online_css(ss, dummytop)); | 4669 | BUG_ON(online_css(ss, cgroup_dummy_top)); |
4385 | 4670 | ||
4386 | mutex_unlock(&cgroup_mutex); | 4671 | mutex_unlock(&cgroup_mutex); |
4387 | 4672 | ||
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4404 | struct cgroup_subsys_state *css; | 4689 | struct cgroup_subsys_state *css; |
4405 | int i, ret; | 4690 | int i, ret; |
4406 | struct hlist_node *tmp; | 4691 | struct hlist_node *tmp; |
4407 | struct css_set *cg; | 4692 | struct css_set *cset; |
4408 | unsigned long key; | 4693 | unsigned long key; |
4409 | 4694 | ||
4410 | /* check name and function validity */ | 4695 | /* check name and function validity */ |
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4427 | */ | 4712 | */ |
4428 | if (ss->module == NULL) { | 4713 | if (ss->module == NULL) { |
4429 | /* a sanity check */ | 4714 | /* a sanity check */ |
4430 | BUG_ON(subsys[ss->subsys_id] != ss); | 4715 | BUG_ON(cgroup_subsys[ss->subsys_id] != ss); |
4431 | return 0; | 4716 | return 0; |
4432 | } | 4717 | } |
4433 | 4718 | ||
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4435 | cgroup_init_cftsets(ss); | 4720 | cgroup_init_cftsets(ss); |
4436 | 4721 | ||
4437 | mutex_lock(&cgroup_mutex); | 4722 | mutex_lock(&cgroup_mutex); |
4438 | subsys[ss->subsys_id] = ss; | 4723 | cgroup_subsys[ss->subsys_id] = ss; |
4439 | 4724 | ||
4440 | /* | 4725 | /* |
4441 | * no ss->css_alloc seems to need anything important in the ss | 4726 | * no ss->css_alloc seems to need anything important in the ss |
4442 | * struct, so this can happen first (i.e. before the rootnode | 4727 | * struct, so this can happen first (i.e. before the dummy root |
4443 | * attachment). | 4728 | * attachment). |
4444 | */ | 4729 | */ |
4445 | css = ss->css_alloc(dummytop); | 4730 | css = ss->css_alloc(cgroup_dummy_top); |
4446 | if (IS_ERR(css)) { | 4731 | if (IS_ERR(css)) { |
4447 | /* failure case - need to deassign the subsys[] slot. */ | 4732 | /* failure case - need to deassign the cgroup_subsys[] slot. */ |
4448 | subsys[ss->subsys_id] = NULL; | 4733 | cgroup_subsys[ss->subsys_id] = NULL; |
4449 | mutex_unlock(&cgroup_mutex); | 4734 | mutex_unlock(&cgroup_mutex); |
4450 | return PTR_ERR(css); | 4735 | return PTR_ERR(css); |
4451 | } | 4736 | } |
4452 | 4737 | ||
4453 | list_add(&ss->sibling, &rootnode.subsys_list); | 4738 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); |
4454 | ss->root = &rootnode; | 4739 | ss->root = &cgroup_dummy_root; |
4455 | 4740 | ||
4456 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4741 | /* our new subsystem will be attached to the dummy hierarchy. */ |
4457 | init_cgroup_css(css, ss, dummytop); | 4742 | init_cgroup_css(css, ss, cgroup_dummy_top); |
4458 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4743 | /* init_idr must be after init_cgroup_css because it sets css->id. */ |
4459 | if (ss->use_id) { | 4744 | if (ss->use_id) { |
4460 | ret = cgroup_init_idr(ss, css); | 4745 | ret = cgroup_init_idr(ss, css); |
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4471 | * this is all done under the css_set_lock. | 4756 | * this is all done under the css_set_lock. |
4472 | */ | 4757 | */ |
4473 | write_lock(&css_set_lock); | 4758 | write_lock(&css_set_lock); |
4474 | hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { | 4759 | hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { |
4475 | /* skip entries that we already rehashed */ | 4760 | /* skip entries that we already rehashed */ |
4476 | if (cg->subsys[ss->subsys_id]) | 4761 | if (cset->subsys[ss->subsys_id]) |
4477 | continue; | 4762 | continue; |
4478 | /* remove existing entry */ | 4763 | /* remove existing entry */ |
4479 | hash_del(&cg->hlist); | 4764 | hash_del(&cset->hlist); |
4480 | /* set new value */ | 4765 | /* set new value */ |
4481 | cg->subsys[ss->subsys_id] = css; | 4766 | cset->subsys[ss->subsys_id] = css; |
4482 | /* recompute hash and restore entry */ | 4767 | /* recompute hash and restore entry */ |
4483 | key = css_set_hash(cg->subsys); | 4768 | key = css_set_hash(cset->subsys); |
4484 | hash_add(css_set_table, &cg->hlist, key); | 4769 | hash_add(css_set_table, &cset->hlist, key); |
4485 | } | 4770 | } |
4486 | write_unlock(&css_set_lock); | 4771 | write_unlock(&css_set_lock); |
4487 | 4772 | ||
4488 | ret = online_css(ss, dummytop); | 4773 | ret = online_css(ss, cgroup_dummy_top); |
4489 | if (ret) | 4774 | if (ret) |
4490 | goto err_unload; | 4775 | goto err_unload; |
4491 | 4776 | ||
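The rehash loop above is the standard linux/hashtable.h idiom: an entry whose key-generating fields change must be unhashed, mutated, and re-added under the recomputed key, and hash_for_each_safe() is what makes deleting and re-adding mid-walk safe. Condensed sketch (rehash_cset() is an illustrative helper, not in the patch; css_set_table and css_set_hash() are the file's own):

#include <linux/hashtable.h>

static void rehash_cset(struct css_set *cset, int subsys_id,
			struct cgroup_subsys_state *css)
{
	unsigned long key;

	hash_del(&cset->hlist);			/* unhash under the old key */
	cset->subsys[subsys_id] = css;		/* mutate the key material  */
	key = css_set_hash(cset->subsys);	/* recompute                */
	hash_add(css_set_table, &cset->hlist, key);
}

The same unhash/mutate/rehash sequence reappears in cgroup_unload_subsys() below, with the slot set back to NULL.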
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); | |||
4511 | */ | 4796 | */ |
4512 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | 4797 | void cgroup_unload_subsys(struct cgroup_subsys *ss) |
4513 | { | 4798 | { |
4514 | struct cg_cgroup_link *link; | 4799 | struct cgrp_cset_link *link; |
4515 | 4800 | ||
4516 | BUG_ON(ss->module == NULL); | 4801 | BUG_ON(ss->module == NULL); |
4517 | 4802 | ||
@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4520 | * try_module_get in parse_cgroupfs_options should ensure that it | 4805 | * try_module_get in parse_cgroupfs_options should ensure that it |
4521 | * doesn't start being used while we're killing it off. | 4806 | * doesn't start being used while we're killing it off. |
4522 | */ | 4807 | */ |
4523 | BUG_ON(ss->root != &rootnode); | 4808 | BUG_ON(ss->root != &cgroup_dummy_root); |
4524 | 4809 | ||
4525 | mutex_lock(&cgroup_mutex); | 4810 | mutex_lock(&cgroup_mutex); |
4526 | 4811 | ||
4527 | offline_css(ss, dummytop); | 4812 | offline_css(ss, cgroup_dummy_top); |
4528 | 4813 | ||
4529 | if (ss->use_id) | 4814 | if (ss->use_id) |
4530 | idr_destroy(&ss->idr); | 4815 | idr_destroy(&ss->idr); |
4531 | 4816 | ||
4532 | /* deassign the subsys_id */ | 4817 | /* deassign the subsys_id */ |
4533 | subsys[ss->subsys_id] = NULL; | 4818 | cgroup_subsys[ss->subsys_id] = NULL; |
4534 | 4819 | ||
4535 | /* remove subsystem from rootnode's list of subsystems */ | 4820 | /* remove subsystem from the dummy root's list of subsystems */ |
4536 | list_del_init(&ss->sibling); | 4821 | list_del_init(&ss->sibling); |
4537 | 4822 | ||
4538 | /* | 4823 | /* |
4539 | * disentangle the css from all css_sets attached to the dummytop. as | 4824 | * disentangle the css from all css_sets attached to the dummy |
4540 | * in loading, we need to pay our respects to the hashtable gods. | 4825 | * top. as in loading, we need to pay our respects to the hashtable |
4826 | * gods. | ||
4541 | */ | 4827 | */ |
4542 | write_lock(&css_set_lock); | 4828 | write_lock(&css_set_lock); |
4543 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | 4829 | list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { |
4544 | struct css_set *cg = link->cg; | 4830 | struct css_set *cset = link->cset; |
4545 | unsigned long key; | 4831 | unsigned long key; |
4546 | 4832 | ||
4547 | hash_del(&cg->hlist); | 4833 | hash_del(&cset->hlist); |
4548 | cg->subsys[ss->subsys_id] = NULL; | 4834 | cset->subsys[ss->subsys_id] = NULL; |
4549 | key = css_set_hash(cg->subsys); | 4835 | key = css_set_hash(cset->subsys); |
4550 | hash_add(css_set_table, &cg->hlist, key); | 4836 | hash_add(css_set_table, &cset->hlist, key); |
4551 | } | 4837 | } |
4552 | write_unlock(&css_set_lock); | 4838 | write_unlock(&css_set_lock); |
4553 | 4839 | ||
4554 | /* | 4840 | /* |
4555 | * remove subsystem's css from the dummytop and free it - need to | 4841 | * remove subsystem's css from the cgroup_dummy_top and free it - |
4556 | * free before marking as null because ss->css_free needs the | 4842 | * need to free before marking as null because ss->css_free needs |
4557 | * cgrp->subsys pointer to find its state. note that this also | 4843 | * the cgrp->subsys pointer to find its state. note that this |
4558 | * takes care of freeing the css_id. | 4844 | * also takes care of freeing the css_id. |
4559 | */ | 4845 | */ |
4560 | ss->css_free(dummytop); | 4846 | ss->css_free(cgroup_dummy_top); |
4561 | dummytop->subsys[ss->subsys_id] = NULL; | 4847 | cgroup_dummy_top->subsys[ss->subsys_id] = NULL; |
4562 | 4848 | ||
4563 | mutex_unlock(&cgroup_mutex); | 4849 | mutex_unlock(&cgroup_mutex); |
4564 | } | 4850 | } |
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys); | |||
4572 | */ | 4858 | */ |
4573 | int __init cgroup_init_early(void) | 4859 | int __init cgroup_init_early(void) |
4574 | { | 4860 | { |
4861 | struct cgroup_subsys *ss; | ||
4575 | int i; | 4862 | int i; |
4863 | |||
4576 | atomic_set(&init_css_set.refcount, 1); | 4864 | atomic_set(&init_css_set.refcount, 1); |
4577 | INIT_LIST_HEAD(&init_css_set.cg_links); | 4865 | INIT_LIST_HEAD(&init_css_set.cgrp_links); |
4578 | INIT_LIST_HEAD(&init_css_set.tasks); | 4866 | INIT_LIST_HEAD(&init_css_set.tasks); |
4579 | INIT_HLIST_NODE(&init_css_set.hlist); | 4867 | INIT_HLIST_NODE(&init_css_set.hlist); |
4580 | css_set_count = 1; | 4868 | css_set_count = 1; |
4581 | init_cgroup_root(&rootnode); | 4869 | init_cgroup_root(&cgroup_dummy_root); |
4582 | root_count = 1; | 4870 | cgroup_root_count = 1; |
4583 | init_task.cgroups = &init_css_set; | 4871 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); |
4584 | 4872 | ||
4585 | init_css_set_link.cg = &init_css_set; | 4873 | init_cgrp_cset_link.cset = &init_css_set; |
4586 | init_css_set_link.cgrp = dummytop; | 4874 | init_cgrp_cset_link.cgrp = cgroup_dummy_top; |
4587 | list_add(&init_css_set_link.cgrp_link_list, | 4875 | list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); |
4588 | &rootnode.top_cgroup.css_sets); | 4876 | list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); |
4589 | list_add(&init_css_set_link.cg_link_list, | ||
4590 | &init_css_set.cg_links); | ||
4591 | |||
4592 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4593 | struct cgroup_subsys *ss = subsys[i]; | ||
4594 | |||
4595 | /* at bootup time, we don't worry about modular subsystems */ | ||
4596 | if (!ss || ss->module) | ||
4597 | continue; | ||
4598 | 4877 | ||
4878 | /* at bootup time, we don't worry about modular subsystems */ | ||
4879 | for_each_builtin_subsys(ss, i) { | ||
4599 | BUG_ON(!ss->name); | 4880 | BUG_ON(!ss->name); |
4600 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4881 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
4601 | BUG_ON(!ss->css_alloc); | 4882 | BUG_ON(!ss->css_alloc); |
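for_each_builtin_subsys() itself is added earlier in this patch and isn't visible in these hunks. A plausible sketch, assuming the built-in subsystems occupy the first CGROUP_BUILTIN_SUBSYS_COUNT slots of cgroup_subsys[] and are never NULL there:

#define for_each_builtin_subsys(ss, i)					\
	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[i]) || true); (i)++)

The "|| true" forces the assignment expression to evaluate true, so the walk terminates on the count alone; it is what lets the callers above drop their open-coded "if (!ss || ss->module) continue;" checks.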
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void) | |||
4620 | */ | 4901 | */ |
4621 | int __init cgroup_init(void) | 4902 | int __init cgroup_init(void) |
4622 | { | 4903 | { |
4623 | int err; | 4904 | struct cgroup_subsys *ss; |
4624 | int i; | ||
4625 | unsigned long key; | 4905 | unsigned long key; |
4906 | int i, err; | ||
4626 | 4907 | ||
4627 | err = bdi_init(&cgroup_backing_dev_info); | 4908 | err = bdi_init(&cgroup_backing_dev_info); |
4628 | if (err) | 4909 | if (err) |
4629 | return err; | 4910 | return err; |
4630 | 4911 | ||
4631 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4912 | for_each_builtin_subsys(ss, i) { |
4632 | struct cgroup_subsys *ss = subsys[i]; | ||
4633 | |||
4634 | /* at bootup time, we don't worry about modular subsystems */ | ||
4635 | if (!ss || ss->module) | ||
4636 | continue; | ||
4637 | if (!ss->early_init) | 4913 | if (!ss->early_init) |
4638 | cgroup_init_subsys(ss); | 4914 | cgroup_init_subsys(ss); |
4639 | if (ss->use_id) | 4915 | if (ss->use_id) |
4640 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); | 4916 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); |
4641 | } | 4917 | } |
4642 | 4918 | ||
4919 | /* allocate id for the dummy hierarchy */ | ||
4920 | mutex_lock(&cgroup_mutex); | ||
4921 | mutex_lock(&cgroup_root_mutex); | ||
4922 | |||
4643 | /* Add init_css_set to the hash table */ | 4923 | /* Add init_css_set to the hash table */ |
4644 | key = css_set_hash(init_css_set.subsys); | 4924 | key = css_set_hash(init_css_set.subsys); |
4645 | hash_add(css_set_table, &init_css_set.hlist, key); | 4925 | hash_add(css_set_table, &init_css_set.hlist, key); |
4646 | BUG_ON(!init_root_id(&rootnode)); | 4926 | |
4927 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); | ||
4928 | |||
4929 | mutex_unlock(&cgroup_root_mutex); | ||
4930 | mutex_unlock(&cgroup_mutex); | ||
4647 | 4931 | ||
4648 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4932 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
4649 | if (!cgroup_kobj) { | 4933 | if (!cgroup_kobj) { |
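cgroup_init_root_id() is also introduced earlier in the patch; judging from this call site (start 0, end 1, so the dummy root gets hierarchy ID 0) and the cgroup_hierarchy_idr declared at the top of the file, it is presumably a thin wrapper around idr_alloc(). A hedged sketch:

static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&cgroup_root_mutex);

	/* idr_alloc(): start is inclusive, end is exclusive, 0 = no cap */
	id = idr_alloc(&cgroup_hierarchy_idr, root, start, end, GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

which also explains why cgroup_init() now takes both cgroup_mutex and cgroup_root_mutex around the call.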
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
4708 | int count = 0; | 4992 | int count = 0; |
4709 | 4993 | ||
4710 | seq_printf(m, "%d:", root->hierarchy_id); | 4994 | seq_printf(m, "%d:", root->hierarchy_id); |
4711 | for_each_subsys(root, ss) | 4995 | for_each_root_subsys(root, ss) |
4712 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 4996 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
4713 | if (strlen(root->name)) | 4997 | if (strlen(root->name)) |
4714 | seq_printf(m, "%sname=%s", count ? "," : "", | 4998 | seq_printf(m, "%sname=%s", count ? "," : "", |
@@ -4734,6 +5018,7 @@ out: | |||
4734 | /* Display information about each subsystem and each hierarchy */ | 5018 | /* Display information about each subsystem and each hierarchy */ |
4735 | static int proc_cgroupstats_show(struct seq_file *m, void *v) | 5019 | static int proc_cgroupstats_show(struct seq_file *m, void *v) |
4736 | { | 5020 | { |
5021 | struct cgroup_subsys *ss; | ||
4737 | int i; | 5022 | int i; |
4738 | 5023 | ||
4739 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); | 5024 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); |
@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
4743 | * subsys/hierarchy state. | 5028 | * subsys/hierarchy state. |
4744 | */ | 5029 | */ |
4745 | mutex_lock(&cgroup_mutex); | 5030 | mutex_lock(&cgroup_mutex); |
4746 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 5031 | |
4747 | struct cgroup_subsys *ss = subsys[i]; | 5032 | for_each_subsys(ss, i) |
4748 | if (ss == NULL) | ||
4749 | continue; | ||
4750 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 5033 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
4751 | ss->name, ss->root->hierarchy_id, | 5034 | ss->name, ss->root->hierarchy_id, |
4752 | ss->root->number_of_cgroups, !ss->disabled); | 5035 | ss->root->number_of_cgroups, !ss->disabled); |
4753 | } | 5036 | |
4754 | mutex_unlock(&cgroup_mutex); | 5037 | mutex_unlock(&cgroup_mutex); |
4755 | return 0; | 5038 | return 0; |
4756 | } | 5039 | } |
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = { | |||
4786 | void cgroup_fork(struct task_struct *child) | 5069 | void cgroup_fork(struct task_struct *child) |
4787 | { | 5070 | { |
4788 | task_lock(current); | 5071 | task_lock(current); |
5072 | get_css_set(task_css_set(current)); | ||
4789 | child->cgroups = current->cgroups; | 5073 | child->cgroups = current->cgroups; |
4790 | get_css_set(child->cgroups); | ||
4791 | task_unlock(current); | 5074 | task_unlock(current); |
4792 | INIT_LIST_HEAD(&child->cg_list); | 5075 | INIT_LIST_HEAD(&child->cg_list); |
4793 | } | 5076 | } |
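task_css_set() is another helper from this series; presumably it centralizes the RCU-checked load of task->cgroups that used to be open-coded at each site. A sketch, assuming the field is now __rcu-annotated and stable under task_lock() (i.e. task->alloc_lock) or cgroup_mutex:

static inline struct css_set *task_css_set(struct task_struct *task)
{
	return rcu_dereference_check(task->cgroups,
				     lockdep_is_held(&task->alloc_lock) ||
				     lockdep_is_held(&cgroup_mutex));
}

Taking the ref via get_css_set(task_css_set(current)) before copying the pointer, as cgroup_fork() now does, keeps the annotation-checked access in one place.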
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child) | |||
4804 | */ | 5087 | */ |
4805 | void cgroup_post_fork(struct task_struct *child) | 5088 | void cgroup_post_fork(struct task_struct *child) |
4806 | { | 5089 | { |
5090 | struct cgroup_subsys *ss; | ||
4807 | int i; | 5091 | int i; |
4808 | 5092 | ||
4809 | /* | 5093 | /* |
@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
4821 | write_lock(&css_set_lock); | 5105 | write_lock(&css_set_lock); |
4822 | task_lock(child); | 5106 | task_lock(child); |
4823 | if (list_empty(&child->cg_list)) | 5107 | if (list_empty(&child->cg_list)) |
4824 | list_add(&child->cg_list, &child->cgroups->tasks); | 5108 | list_add(&child->cg_list, &task_css_set(child)->tasks); |
4825 | task_unlock(child); | 5109 | task_unlock(child); |
4826 | write_unlock(&css_set_lock); | 5110 | write_unlock(&css_set_lock); |
4827 | } | 5111 | } |
@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child) | |||
4840 | * of the array can be freed at module unload, so we | 5124 | * of the array can be freed at module unload, so we |
4841 | * can't touch that. | 5125 | * can't touch that. |
4842 | */ | 5126 | */ |
4843 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 5127 | for_each_builtin_subsys(ss, i) |
4844 | struct cgroup_subsys *ss = subsys[i]; | ||
4845 | |||
4846 | if (ss->fork) | 5128 | if (ss->fork) |
4847 | ss->fork(child); | 5129 | ss->fork(child); |
4848 | } | ||
4849 | } | 5130 | } |
4850 | } | 5131 | } |
4851 | 5132 | ||
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
4886 | */ | 5167 | */ |
4887 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 5168 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
4888 | { | 5169 | { |
4889 | struct css_set *cg; | 5170 | struct cgroup_subsys *ss; |
5171 | struct css_set *cset; | ||
4890 | int i; | 5172 | int i; |
4891 | 5173 | ||
4892 | /* | 5174 | /* |
@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4903 | 5185 | ||
4904 | /* Reassign the task to the init_css_set. */ | 5186 | /* Reassign the task to the init_css_set. */ |
4905 | task_lock(tsk); | 5187 | task_lock(tsk); |
4906 | cg = tsk->cgroups; | 5188 | cset = task_css_set(tsk); |
4907 | tsk->cgroups = &init_css_set; | 5189 | RCU_INIT_POINTER(tsk->cgroups, &init_css_set); |
4908 | 5190 | ||
4909 | if (run_callbacks && need_forkexit_callback) { | 5191 | if (run_callbacks && need_forkexit_callback) { |
4910 | /* | 5192 | /* |
4911 | * fork/exit callbacks are supported only for builtin | 5193 | * fork/exit callbacks are supported only for builtin |
4912 | * subsystems, see cgroup_post_fork() for details. | 5194 | * subsystems, see cgroup_post_fork() for details. |
4913 | */ | 5195 | */ |
4914 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 5196 | for_each_builtin_subsys(ss, i) { |
4915 | struct cgroup_subsys *ss = subsys[i]; | ||
4916 | |||
4917 | if (ss->exit) { | 5197 | if (ss->exit) { |
4918 | struct cgroup *old_cgrp = | 5198 | struct cgroup *old_cgrp = cset->subsys[i]->cgroup; |
4919 | rcu_dereference_raw(cg->subsys[i])->cgroup; | ||
4920 | struct cgroup *cgrp = task_cgroup(tsk, i); | 5199 | struct cgroup *cgrp = task_cgroup(tsk, i); |
5200 | |||
4921 | ss->exit(cgrp, old_cgrp, tsk); | 5201 | ss->exit(cgrp, old_cgrp, tsk); |
4922 | } | 5202 | } |
4923 | } | 5203 | } |
4924 | } | 5204 | } |
4925 | task_unlock(tsk); | 5205 | task_unlock(tsk); |
4926 | 5206 | ||
4927 | put_css_set_taskexit(cg); | 5207 | put_css_set_taskexit(cset); |
4928 | } | 5208 | } |
4929 | 5209 | ||
4930 | static void check_for_release(struct cgroup *cgrp) | 5210 | static void check_for_release(struct cgroup *cgrp) |
4931 | { | 5211 | { |
4932 | /* All of these checks rely on RCU to keep the cgroup | ||
4933 | * structure alive */ | ||
4934 | if (cgroup_is_releasable(cgrp) && | 5212 | if (cgroup_is_releasable(cgrp) && |
4935 | !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { | 5213 | list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { |
4936 | /* | 5214 | /* |
4937 | * Control Group is currently removable. If it's not | 5215 |
4938 | * already queued for a userspace notification, queue | 5216 | * already queued for a userspace notification, queue |
@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp) | |||
4941 | int need_schedule_work = 0; | 5219 | int need_schedule_work = 0; |
4942 | 5220 | ||
4943 | raw_spin_lock(&release_list_lock); | 5221 | raw_spin_lock(&release_list_lock); |
4944 | if (!cgroup_is_removed(cgrp) && | 5222 | if (!cgroup_is_dead(cgrp) && |
4945 | list_empty(&cgrp->release_list)) { | 5223 | list_empty(&cgrp->release_list)) { |
4946 | list_add(&cgrp->release_list, &release_list); | 5224 | list_add(&cgrp->release_list, &release_list); |
4947 | need_schedule_work = 1; | 5225 | need_schedule_work = 1; |
@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp) | |||
4952 | } | 5230 | } |
4953 | } | 5231 | } |
4954 | 5232 | ||
4955 | /* Caller must verify that the css is not for root cgroup */ | ||
4956 | bool __css_tryget(struct cgroup_subsys_state *css) | ||
4957 | { | ||
4958 | while (true) { | ||
4959 | int t, v; | ||
4960 | |||
4961 | v = css_refcnt(css); | ||
4962 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
4963 | if (likely(t == v)) | ||
4964 | return true; | ||
4965 | else if (t < 0) | ||
4966 | return false; | ||
4967 | cpu_relax(); | ||
4968 | } | ||
4969 | } | ||
4970 | EXPORT_SYMBOL_GPL(__css_tryget); | ||
4971 | |||
4972 | /* Caller must verify that the css is not for root cgroup */ | ||
4973 | void __css_put(struct cgroup_subsys_state *css) | ||
4974 | { | ||
4975 | int v; | ||
4976 | |||
4977 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); | ||
4978 | if (v == 0) | ||
4979 | schedule_work(&css->dput_work); | ||
4980 | } | ||
4981 | EXPORT_SYMBOL_GPL(__css_put); | ||
4982 | |||
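The deleted __css_tryget()/__css_put() implemented by hand, with a CSS_DEACT_BIAS'd atomic_t and a cmpxchg loop, exactly what percpu_ref provides natively: a tryget that fails once the ref has been killed, and a put whose final drop invokes a release callback. After the conversion the fast paths reduce to roughly (sketch only; per the deleted comments, the real wrappers also special-case the root cgroup's css):

static inline bool css_tryget_sketch(struct cgroup_subsys_state *css)
{
	return percpu_ref_tryget(&css->refcnt);	/* fails after kill */
}

static inline void css_put_sketch(struct cgroup_subsys_state *css)
{
	percpu_ref_put(&css->refcnt);	/* release callback on last put */
}

The explicit schedule_work(&css->dput_work) disappears because the ref's release callback now owns the final-put bookkeeping.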
4983 | /* | 5233 | /* |
4984 | * Notify userspace when a cgroup is released, by running the | 5234 | * Notify userspace when a cgroup is released, by running the |
4985 | * configured release agent with the name of the cgroup (path | 5235 | * configured release agent with the name of the cgroup (path |
@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work) | |||
5054 | 5304 | ||
5055 | static int __init cgroup_disable(char *str) | 5305 | static int __init cgroup_disable(char *str) |
5056 | { | 5306 | { |
5057 | int i; | 5307 | struct cgroup_subsys *ss; |
5058 | char *token; | 5308 | char *token; |
5309 | int i; | ||
5059 | 5310 | ||
5060 | while ((token = strsep(&str, ",")) != NULL) { | 5311 | while ((token = strsep(&str, ",")) != NULL) { |
5061 | if (!*token) | 5312 | if (!*token) |
5062 | continue; | 5313 | continue; |
5063 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
5064 | struct cgroup_subsys *ss = subsys[i]; | ||
5065 | |||
5066 | /* | ||
5067 | * cgroup_disable, being at boot time, can't | ||
5068 | * know about module subsystems, so we don't | ||
5069 | * worry about them. | ||
5070 | */ | ||
5071 | if (!ss || ss->module) | ||
5072 | continue; | ||
5073 | 5314 | ||
5315 | /* | ||
5316 | * cgroup_disable, being at boot time, can't know about | ||
5317 | * module subsystems, so we don't worry about them. | ||
5318 | */ | ||
5319 | for_each_builtin_subsys(ss, i) { | ||
5074 | if (!strcmp(token, ss->name)) { | 5320 | if (!strcmp(token, ss->name)) { |
5075 | ss->disabled = 1; | 5321 | ss->disabled = 1; |
5076 | printk(KERN_INFO "Disabling %s control group" | 5322 | printk(KERN_INFO "Disabling %s control group" |
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable); | |||
5087 | * Functions for CSS ID. | 5333 |
5088 | */ | 5334 | */ |
5089 | 5335 | ||
5090 | /* | 5336 | /* to get ID other than 0, this should be called when !cgroup_is_dead() */ |
5091 | *To get ID other than 0, this should be called when !cgroup_is_removed(). | ||
5092 | */ | ||
5093 | unsigned short css_id(struct cgroup_subsys_state *css) | 5337 | unsigned short css_id(struct cgroup_subsys_state *css) |
5094 | { | 5338 | { |
5095 | struct css_id *cssid; | 5339 | struct css_id *cssid; |
@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
5099 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 5343 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
5100 | * it's unchanged until freed. | 5344 | * it's unchanged until freed. |
5101 | */ | 5345 | */ |
5102 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); | 5346 | cssid = rcu_dereference_raw(css->id); |
5103 | 5347 | ||
5104 | if (cssid) | 5348 | if (cssid) |
5105 | return cssid->id; | 5349 | return cssid->id; |
@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
5107 | } | 5351 | } |
5108 | EXPORT_SYMBOL_GPL(css_id); | 5352 | EXPORT_SYMBOL_GPL(css_id); |
5109 | 5353 | ||
5110 | unsigned short css_depth(struct cgroup_subsys_state *css) | ||
5111 | { | ||
5112 | struct css_id *cssid; | ||
5113 | |||
5114 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); | ||
5115 | |||
5116 | if (cssid) | ||
5117 | return cssid->depth; | ||
5118 | return 0; | ||
5119 | } | ||
5120 | EXPORT_SYMBOL_GPL(css_depth); | ||
5121 | |||
5122 | /** | 5354 | /** |
5123 | * css_is_ancestor - test "root" css is an ancestor of "child" | 5355 | * css_is_ancestor - test "root" css is an ancestor of "child" |
5124 | * @child: the css to be tested. | 5356 | * @child: the css to be tested. |
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, | |||
5153 | 5385 | ||
5154 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | 5386 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) |
5155 | { | 5387 | { |
5156 | struct css_id *id = css->id; | 5388 | struct css_id *id = rcu_dereference_protected(css->id, true); |
5389 | |||
5157 | /* When this is called before css_id initialization, id can be NULL */ | 5390 | /* When this is called before css_id initialization, id can be NULL */ |
5158 | if (!id) | 5391 | if (!id) |
5159 | return; | 5392 | return; |
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | |||
5219 | return PTR_ERR(newid); | 5452 | return PTR_ERR(newid); |
5220 | 5453 | ||
5221 | newid->stack[0] = newid->id; | 5454 | newid->stack[0] = newid->id; |
5222 | newid->css = rootcss; | 5455 | RCU_INIT_POINTER(newid->css, rootcss); |
5223 | rootcss->id = newid; | 5456 | RCU_INIT_POINTER(rootcss->id, newid); |
5224 | return 0; | 5457 | return 0; |
5225 | } | 5458 | } |
5226 | 5459 | ||
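RCU_INIT_POINTER() rather than rcu_assign_pointer() is right here because cgroup_init_idr() runs before any reader can reach rootcss->id, so the publication barrier that rcu_assign_pointer() implies buys nothing; likewise rcu_dereference_protected(..., true) at the free/alloc sites asserts that the caller already excludes readers. The rule of thumb as a sketch (foo/gp are illustrative):

#include <linux/rcupdate.h>

struct foo { int x; };
struct foo __rcu *gp;

static void publish_before_readers(struct foo *p)
{
	/* no reader can reach gp yet: a plain store suffices */
	RCU_INIT_POINTER(gp, p);
}

static void publish_with_readers(struct foo *p)
{
	/* readers may be live: order p's initialization before publish */
	rcu_assign_pointer(gp, p);
}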
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | |||
5234 | subsys_id = ss->subsys_id; | 5467 | subsys_id = ss->subsys_id; |
5235 | parent_css = parent->subsys[subsys_id]; | 5468 | parent_css = parent->subsys[subsys_id]; |
5236 | child_css = child->subsys[subsys_id]; | 5469 | child_css = child->subsys[subsys_id]; |
5237 | parent_id = parent_css->id; | 5470 | parent_id = rcu_dereference_protected(parent_css->id, true); |
5238 | depth = parent_id->depth + 1; | 5471 | depth = parent_id->depth + 1; |
5239 | 5472 | ||
5240 | child_id = get_new_cssid(ss, depth); | 5473 | child_id = get_new_cssid(ss, depth); |
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5299 | } | 5532 | } |
5300 | 5533 | ||
5301 | #ifdef CONFIG_CGROUP_DEBUG | 5534 | #ifdef CONFIG_CGROUP_DEBUG |
5302 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) | 5535 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) |
5303 | { | 5536 | { |
5304 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5537 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5305 | 5538 | ||
@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) | |||
5309 | return css; | 5542 | return css; |
5310 | } | 5543 | } |
5311 | 5544 | ||
5312 | static void debug_css_free(struct cgroup *cont) | 5545 | static void debug_css_free(struct cgroup *cgrp) |
5313 | { | ||
5314 | kfree(cont->subsys[debug_subsys_id]); | ||
5315 | } | ||
5316 | |||
5317 | static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) | ||
5318 | { | 5546 | { |
5319 | return atomic_read(&cont->count); | 5547 | kfree(cgrp->subsys[debug_subsys_id]); |
5320 | } | 5548 | } |
5321 | 5549 | ||
5322 | static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) | 5550 | static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) |
5323 | { | 5551 | { |
5324 | return cgroup_task_count(cont); | 5552 | return cgroup_task_count(cgrp); |
5325 | } | 5553 | } |
5326 | 5554 | ||
5327 | static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) | 5555 | static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) |
5328 | { | 5556 | { |
5329 | return (u64)(unsigned long)current->cgroups; | 5557 | return (u64)(unsigned long)current->cgroups; |
5330 | } | 5558 | } |
5331 | 5559 | ||
5332 | static u64 current_css_set_refcount_read(struct cgroup *cont, | 5560 | static u64 current_css_set_refcount_read(struct cgroup *cgrp, |
5333 | struct cftype *cft) | 5561 | struct cftype *cft) |
5334 | { | 5562 | { |
5335 | u64 count; | 5563 | u64 count; |
5336 | 5564 | ||
5337 | rcu_read_lock(); | 5565 | rcu_read_lock(); |
5338 | count = atomic_read(¤t->cgroups->refcount); | 5566 | count = atomic_read(&task_css_set(current)->refcount); |
5339 | rcu_read_unlock(); | 5567 | rcu_read_unlock(); |
5340 | return count; | 5568 | return count; |
5341 | } | 5569 | } |
5342 | 5570 | ||
5343 | static int current_css_set_cg_links_read(struct cgroup *cont, | 5571 | static int current_css_set_cg_links_read(struct cgroup *cgrp, |
5344 | struct cftype *cft, | 5572 | struct cftype *cft, |
5345 | struct seq_file *seq) | 5573 | struct seq_file *seq) |
5346 | { | 5574 | { |
5347 | struct cg_cgroup_link *link; | 5575 | struct cgrp_cset_link *link; |
5348 | struct css_set *cg; | 5576 | struct css_set *cset; |
5349 | 5577 | ||
5350 | read_lock(&css_set_lock); | 5578 | read_lock(&css_set_lock); |
5351 | rcu_read_lock(); | 5579 | rcu_read_lock(); |
5352 | cg = rcu_dereference(current->cgroups); | 5580 | cset = rcu_dereference(current->cgroups); |
5353 | list_for_each_entry(link, &cg->cg_links, cg_link_list) { | 5581 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { |
5354 | struct cgroup *c = link->cgrp; | 5582 | struct cgroup *c = link->cgrp; |
5355 | const char *name; | 5583 | const char *name; |
5356 | 5584 | ||
@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont, | |||
5367 | } | 5595 | } |
5368 | 5596 | ||
5369 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5597 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
5370 | static int cgroup_css_links_read(struct cgroup *cont, | 5598 | static int cgroup_css_links_read(struct cgroup *cgrp, |
5371 | struct cftype *cft, | 5599 | struct cftype *cft, |
5372 | struct seq_file *seq) | 5600 | struct seq_file *seq) |
5373 | { | 5601 | { |
5374 | struct cg_cgroup_link *link; | 5602 | struct cgrp_cset_link *link; |
5375 | 5603 | ||
5376 | read_lock(&css_set_lock); | 5604 | read_lock(&css_set_lock); |
5377 | list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { | 5605 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { |
5378 | struct css_set *cg = link->cg; | 5606 | struct css_set *cset = link->cset; |
5379 | struct task_struct *task; | 5607 | struct task_struct *task; |
5380 | int count = 0; | 5608 | int count = 0; |
5381 | seq_printf(seq, "css_set %p\n", cg); | 5609 | seq_printf(seq, "css_set %p\n", cset); |
5382 | list_for_each_entry(task, &cg->tasks, cg_list) { | 5610 | list_for_each_entry(task, &cset->tasks, cg_list) { |
5383 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) { | 5611 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) { |
5384 | seq_puts(seq, " ...\n"); | 5612 | seq_puts(seq, " ...\n"); |
5385 | break; | 5613 | break; |
@@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) | |||
5400 | 5628 | ||
5401 | static struct cftype debug_files[] = { | 5629 | static struct cftype debug_files[] = { |
5402 | { | 5630 | { |
5403 | .name = "cgroup_refcount", | ||
5404 | .read_u64 = cgroup_refcount_read, | ||
5405 | }, | ||
5406 | { | ||
5407 | .name = "taskcount", | 5631 | .name = "taskcount", |
5408 | .read_u64 = debug_taskcount_read, | 5632 | .read_u64 = debug_taskcount_read, |
5409 | }, | 5633 | }, |