Diffstat (limited to 'kernel')

 kernel/cgroup.c | 1536 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 880 insertions(+), 656 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb979..e5583d10a325 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS		INT_MIN
-
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  */
 #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
+static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
 /*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
  */
-static struct cgroupfs_root rootnode;
+static struct cgroupfs_root cgroup_dummy_root;
+
+/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
+static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
 
 /*
  * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
 
 /* The list of hierarchy roots */
 
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
 
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
-
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Hierarchy ID allocation and mapping.  It follows the same exclusion
+ * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
+ * writes, either for reads.
+ */
+static DEFINE_IDR(cgroup_hierarchy_idr);
 
 static struct cgroup_name root_cgroup_name = { .name = "/" };
 
+/*
+ * Assign a monotonically increasing serial number to cgroups.  It
+ * guarantees cgroups with bigger numbers are newer than those with smaller
+ * numbers.  Also, as cgroups are always appended to the parent's
+ * ->children list, it guarantees that sibling cgroups are always sorted in
+ * the ascending serial number order on the list.  Protected by
+ * cgroup_mutex.
+ */
+static u64 cgroup_serial_nr_next = 1;
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
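For illustration, the ordering guarantee described in the new comment follows from two steps at cgroup creation. A minimal sketch (hypothetical helper, not part of this patch; it assumes a ->serial_nr field on struct cgroup):

    /* caller holds cgroup_mutex, so the post-increment below is safe */
    static void example_assign_serial(struct cgroup *cgrp, struct cgroup *parent)
    {
            cgrp->serial_nr = cgroup_serial_nr_next++;

            /* appending keeps parent->children sorted by ascending serial_nr */
            list_add_tail(&cgrp->sibling, &parent->children);
    }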
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-	return test_bit(CGRP_REMOVED, &cgrp->flags);
+	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
 /**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_subsys - iterate all loaded cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ *
+ * Should be called under cgroup_mutex.
  */
-#define for_each_subsys(_root, _ss) \
-list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_subsys(ss, i)						\
+	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)			\
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       !((ss) = cgroup_subsys[i]); })) { }		\
+		else
+
+/**
+ * for_each_builtin_subsys - iterate all built-in cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
+ *
+ * Built-in subsystems are always present and iteration itself doesn't
+ * require any synchronization.
+ */
+#define for_each_builtin_subsys(ss, i)					\
+	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[i]) || true); (i)++)
+
+/* iterate each subsystem attached to a hierarchy */
+#define for_each_root_subsys(root, ss)					\
+	list_for_each_entry((ss), &(root)->subsys_list, sibling)
 
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-list_for_each_entry(_root, &roots, root_list)
+/* iterate across the active hierarchies */
+#define for_each_active_root(root)					\
+	list_for_each_entry((root), &cgroup_roots, root_list)
 
 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 {
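As a usage sketch (illustrative only, not part of this patch), the three iterators divide up as follows: for_each_subsys() walks every loaded subsystem and must run under cgroup_mutex, for_each_builtin_subsys() walks only compiled-in ones and needs no locking, and for_each_root_subsys() walks the subsystems bound to one hierarchy:

    static void example_list_subsystems(struct cgroupfs_root *root)
    {
            struct cgroup_subsys *ss;
            int i;

            mutex_lock(&cgroup_mutex);
            for_each_subsys(ss, i)                  /* all loaded subsystems */
                    pr_info("loaded: %s\n", ss->name);
            mutex_unlock(&cgroup_mutex);

            for_each_builtin_subsys(ss, i)          /* compiled-in, lock-free */
                    pr_info("builtin: %s\n", ss->name);

            for_each_root_subsys(root, ss)          /* bound to @root only */
                    pr_info("bound: %s\n", ss->name);
    }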
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_removed(cgrp)) {
+	if (cgroup_is_dead(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return false;
 	}
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
-	/*
-	 * List running through cg_cgroup_links associated with a
-	 * cgroup, anchored on cgroup->css_sets
-	 */
-	struct list_head cgrp_link_list;
-	struct cgroup *cgrp;
-	/*
-	 * List running through cg_cgroup_links pointing at a
-	 * single css_set object, anchored on css_set->cg_links
-	 */
-	struct list_head cg_link_list;
-	struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup *cgrp;
+	struct css_set *cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head cgrp_link;
 };
 
 /* The default css_set - used by init and its children prior to any
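To make the M:N layout concrete, both directions can be walked through the same link objects. A sketch (illustrative; use_css_set() and use_cgroup() are hypothetical placeholders, and css_set_lock is assumed held):

    static void example_walk_links(struct cgroup *cgrp, struct css_set *cset)
    {
            struct cgrp_cset_link *link;

            /* cgroup -> css_sets: links are anchored at cgrp->cset_links */
            list_for_each_entry(link, &cgrp->cset_links, cset_link)
                    use_css_set(link->cset);

            /* css_set -> cgroups: the same links, via cset->cgrp_links */
            list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
                    use_cgroup(link->cgrp);
    }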
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
  */
 
 static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static struct cgrp_cset_link init_cgrp_cset_link;
 
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
-	int i;
 	unsigned long key = 0UL;
+	struct cgroup_subsys *ss;
+	int i;
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+	for_each_subsys(ss, i)
 		key += (unsigned long)css[i];
 	key = (key >> 16) ^ key;
 
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void __put_css_set(struct css_set *cset, int taskexit)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
+
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
 	 * rwlock
 	 */
-	if (atomic_add_unless(&cg->refcount, -1, 1))
+	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 	write_lock(&css_set_lock);
-	if (!atomic_dec_and_test(&cg->refcount)) {
+	if (!atomic_dec_and_test(&cset->refcount)) {
 		write_unlock(&css_set_lock);
 		return;
 	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
-	hash_del(&cg->hlist);
+	hash_del(&cset->hlist);
 	css_set_count--;
 
-	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-				 cg_link_list) {
+	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *cgrp = link->cgrp;
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
 
-		/*
-		 * We may not be holding cgroup_mutex, and if cgrp->count is
-		 * dropped to 0 the cgroup can be destroyed at any time, hence
-		 * rcu_read_lock is used to keep it alive.
-		 */
-		rcu_read_lock();
-		if (atomic_dec_and_test(&cgrp->count) &&
-		    notify_on_release(cgrp)) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
+
+		/* @cgrp can't go away while we're holding css_set_lock */
+		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
-		rcu_read_unlock();
 
 		kfree(link);
 	}
 
 	write_unlock(&css_set_lock);
-	kfree_rcu(cg, rcu_head);
+	kfree_rcu(cset, rcu_head);
 }
 
 /*
  * refcounted get/put for css_set objects
  */
-static inline void get_css_set(struct css_set *cg)
+static inline void get_css_set(struct css_set *cset)
 {
-	atomic_inc(&cg->refcount);
+	atomic_inc(&cset->refcount);
 }
 
-static inline void put_css_set(struct css_set *cg)
+static inline void put_css_set(struct css_set *cset)
 {
-	__put_css_set(cg, 0);
+	__put_css_set(cset, 0);
 }
 
-static inline void put_css_set_taskexit(struct css_set *cg)
+static inline void put_css_set_taskexit(struct css_set *cset)
 {
-	__put_css_set(cg, 1);
+	__put_css_set(cset, 1);
 }
 
-/*
+/**
  * compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
  * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cg" matches "old_cg" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
-static bool compare_css_sets(struct css_set *cg,
-			     struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
+			     struct css_set *old_cset,
 			     struct cgroup *new_cgrp,
 			     struct cgroup_subsys_state *template[])
 {
 	struct list_head *l1, *l2;
 
-	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
 		/* Not all subsystems matched */
 		return false;
 	}
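The fast path at the top of __put_css_set() is the atomic_dec_and_lock() idiom adapted to an rwlock. Stripped to its skeleton (generic sketch; struct foo, foo_lock and foo_free() are hypothetical):

    static void example_put(struct foo *f)
    {
            /* drop a reference as long as it cannot reach zero */
            if (atomic_add_unless(&f->refcount, -1, 1))
                    return;

            /* it may hit zero: take the lock and re-check under it */
            write_lock(&foo_lock);
            if (!atomic_dec_and_test(&f->refcount)) {
                    write_unlock(&foo_lock);
                    return;
            }
            /* now zero and locked: unlink here, then free after unlocking */
            write_unlock(&foo_lock);
            foo_free(f);
    }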
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
 	 * candidates.
 	 */
 
-	l1 = &cg->cg_links;
-	l2 = &old_cg->cg_links;
+	l1 = &cset->cgrp_links;
+	l2 = &old_cset->cgrp_links;
 	while (1) {
-		struct cg_cgroup_link *cgl1, *cgl2;
-		struct cgroup *cg1, *cg2;
+		struct cgrp_cset_link *link1, *link2;
+		struct cgroup *cgrp1, *cgrp2;
 
 		l1 = l1->next;
 		l2 = l2->next;
 		/* See if we reached the end - both lists are equal length. */
-		if (l1 == &cg->cg_links) {
-			BUG_ON(l2 != &old_cg->cg_links);
+		if (l1 == &cset->cgrp_links) {
+			BUG_ON(l2 != &old_cset->cgrp_links);
 			break;
 		} else {
-			BUG_ON(l2 == &old_cg->cg_links);
+			BUG_ON(l2 == &old_cset->cgrp_links);
 		}
 		/* Locate the cgroups associated with these links. */
-		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
-		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
-		cg1 = cgl1->cgrp;
-		cg2 = cgl2->cgrp;
+		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+		cgrp1 = link1->cgrp;
+		cgrp2 = link2->cgrp;
 		/* Hierarchies should be linked in the same order. */
-		BUG_ON(cg1->root != cg2->root);
+		BUG_ON(cgrp1->root != cgrp2->root);
 
 		/*
 		 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@
 		 * hierarchy, then this css_set should point to the
 		 * same cgroup as the old css_set.
 		 */
-		if (cg1->root == new_cgrp->root) {
-			if (cg1 != new_cgrp)
+		if (cgrp1->root == new_cgrp->root) {
+			if (cgrp1 != new_cgrp)
 				return false;
 		} else {
-			if (cg1 != cg2)
+			if (cgrp1 != cgrp2)
 				return false;
 		}
 	}
 	return true;
 }
 
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
  */
-static struct css_set *find_existing_css_set(
-	struct css_set *oldcg,
-	struct cgroup *cgrp,
-	struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+					struct cgroup *cgrp,
+					struct cgroup_subsys_state *template[])
 {
-	int i;
 	struct cgroupfs_root *root = cgrp->root;
-	struct css_set *cg;
+	struct cgroup_subsys *ss;
+	struct css_set *cset;
 	unsigned long key;
+	int i;
 
 	/*
 	 * Build the set of subsystem state objects that we want to see in the
 	 * new css_set. while subsystems can change globally, the entries here
 	 * won't change, so no need for locking.
 	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		if (root->subsys_mask & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
 		} else {
 			/* Subsystem is not in this hierarchy, so we
 			 * don't want to change the subsystem state */
-			template[i] = oldcg->subsys[i];
+			template[i] = old_cset->subsys[i];
 		}
 	}
 
 	key = css_set_hash(template);
-	hash_for_each_possible(css_set_table, cg, hlist, key) {
-		if (!compare_css_sets(cg, oldcg, cgrp, template))
+	hash_for_each_possible(css_set_table, cset, hlist, key) {
+		if (!compare_css_sets(cset, old_cset, cgrp, template))
 			continue;
 
 		/* This css_set matches what we need */
-		return cg;
+		return cset;
 	}
 
 	/* No existing cgroup group matched */
 	return NULL;
 }
 
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
 
-	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-		list_del(&link->cgrp_link_list);
+	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+		list_del(&link->cset_link);
 		kfree(link);
 	}
 }
 
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link.  Returns 0 on success or -errno.
  */
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 	int i;
-	INIT_LIST_HEAD(tmp);
+
+	INIT_LIST_HEAD(tmp_links);
+
 	for (i = 0; i < count; i++) {
-		link = kmalloc(sizeof(*link), GFP_KERNEL);
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			free_cg_links(tmp);
+			free_cgrp_cset_links(tmp_links);
 			return -ENOMEM;
 		}
-		list_add(&link->cgrp_link_list, tmp);
+		list_add(&link->cset_link, tmp_links);
 	}
 	return 0;
 }
 
 /**
  * link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
-static void link_css_set(struct list_head *tmp_cg_links,
-			 struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+			 struct cgroup *cgrp)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
-	BUG_ON(list_empty(tmp_cg_links));
-	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
-				cgrp_link_list);
-	link->cg = cg;
+	BUG_ON(list_empty(tmp_links));
+	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+	link->cset = cset;
 	link->cgrp = cgrp;
-	atomic_inc(&cgrp->count);
-	list_move(&link->cgrp_link_list, &cgrp->css_sets);
+	list_move(&link->cset_link, &cgrp->cset_links);
 	/*
 	 * Always add links to the tail of the list so that the list
 	 * is sorted by order of hierarchy creation
 	 */
-	list_add_tail(&link->cg_link_list, &cg->cg_links);
+	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 }
 
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
 */
-static struct css_set *find_css_set(
-	struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+				    struct cgroup *cgrp)
 {
-	struct css_set *res;
-	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	struct list_head tmp_cg_links;
-
-	struct cg_cgroup_link *link;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+	struct css_set *cset;
+	struct list_head tmp_links;
+	struct cgrp_cset_link *link;
 	unsigned long key;
 
+	lockdep_assert_held(&cgroup_mutex);
+
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
 	read_lock(&css_set_lock);
-	res = find_existing_css_set(oldcg, cgrp, template);
-	if (res)
-		get_css_set(res);
+	cset = find_existing_css_set(old_cset, cgrp, template);
+	if (cset)
+		get_css_set(cset);
 	read_unlock(&css_set_lock);
 
-	if (res)
-		return res;
+	if (cset)
+		return cset;
 
-	res = kmalloc(sizeof(*res), GFP_KERNEL);
-	if (!res)
+	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+	if (!cset)
 		return NULL;
 
-	/* Allocate all the cg_cgroup_link objects that we'll need */
-	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
-		kfree(res);
+	/* Allocate all the cgrp_cset_link objects that we'll need */
+	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+		kfree(cset);
 		return NULL;
 	}
 
-	atomic_set(&res->refcount, 1);
-	INIT_LIST_HEAD(&res->cg_links);
-	INIT_LIST_HEAD(&res->tasks);
-	INIT_HLIST_NODE(&res->hlist);
+	atomic_set(&cset->refcount, 1);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->tasks);
+	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
-	memcpy(res->subsys, template, sizeof(res->subsys));
+	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
 	write_lock(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
-	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
+
 		if (c->root == cgrp->root)
 			c = cgrp;
-		link_css_set(&tmp_cg_links, res, c);
+		link_css_set(&tmp_links, cset, c);
 	}
 
-	BUG_ON(!list_empty(&tmp_cg_links));
+	BUG_ON(!list_empty(&tmp_links));
 
 	css_set_count++;
 
 	/* Add this cgroup group to the hash table */
-	key = css_set_hash(res->subsys);
-	hash_add(css_set_table, &res->hlist, key);
+	key = css_set_hash(cset->subsys);
+	hash_add(css_set_table, &cset->hlist, key);
 
 	write_unlock(&css_set_lock);
 
-	return res;
+	return cset;
 }
 
 /*
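allocate_cgrp_cset_links(), link_css_set() and free_cgrp_cset_links() form a preallocate/commit/cleanup triple: links are allocated outside css_set_lock, list_move()d into place under it, and any leftovers freed afterwards. A condensed caller sketch (illustrative; it mirrors what find_css_set() above does):

    static int example_link_everywhere(struct css_set *cset)
    {
            struct cgroupfs_root *root;
            LIST_HEAD(tmp_links);

            if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0)
                    return -ENOMEM;

            write_lock(&css_set_lock);
            for_each_active_root(root)
                    link_css_set(&tmp_links, cset, &root->top_cgroup);
            write_unlock(&css_set_lock);

            free_cgrp_cset_links(&tmp_links);       /* drop any unused links */
            return 0;
    }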
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroupfs_root *root)
 {
-	struct css_set *css;
+	struct css_set *cset;
 	struct cgroup *res = NULL;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 	 * task can't change groups, so the only thing that can happen
 	 * is that it exits and its css is set back to init_css_set.
 	 */
-	css = task->cgroups;
-	if (css == &init_css_set) {
+	cset = task_css_set(task);
+	if (cset == &init_css_set) {
 		res = &root->top_cgroup;
 	} else {
-		struct cg_cgroup_link *link;
-		list_for_each_entry(link, &css->cg_links, cg_link_list) {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 			struct cgroup *c = link->cgrp;
+
 			if (c->root == root) {
 				res = c;
 				break;
@@ -828,14 +845,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
 	/*
 	 * Release the subsystem state objects.
 	 */
-	for_each_subsys(cgrp->root, ss)
+	for_each_root_subsys(cgrp->root, ss)
 		ss->css_free(cgrp);
 
 	cgrp->root->number_of_cgroups--;
@@ -873,7 +890,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	schedule_work(&cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +900,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cgrp = dentry->d_fsdata;
 
-		BUG_ON(!(cgroup_is_removed(cgrp)));
+		BUG_ON(!(cgroup_is_dead(cgrp)));
 		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +968,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
 
-	for_each_subsys(cgrp->root, ss) {
+	for_each_root_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
@@ -988,30 +1006,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  * returns an error, no reference counts are touched.
  */
 static int rebind_subsystems(struct cgroupfs_root *root,
-			     unsigned long final_subsys_mask)
+			     unsigned long added_mask, unsigned removed_mask)
 {
-	unsigned long added_mask, removed_mask;
 	struct cgroup *cgrp = &root->top_cgroup;
+	struct cgroup_subsys *ss;
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
-	removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
-	added_mask = final_subsys_mask & ~root->actual_subsys_mask;
 	/* Check that any added subsystems are currently free */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
-		struct cgroup_subsys *ss = subsys[i];
+
 		if (!(bit & added_mask))
 			continue;
-		/*
-		 * Nobody should tell us to do a subsys that doesn't exist:
-		 * parse_cgroupfs_options should catch that case and refcounts
-		 * ensure that subsystems won't disappear once selected.
-		 */
-		BUG_ON(ss == NULL);
-		if (ss->root != &rootnode) {
+
+		if (ss->root != &cgroup_dummy_root) {
 			/* Subsystem isn't free */
 			return -EBUSY;
 		}
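With the new signature, deriving the two masks moves to the callers. The arithmetic is plain set difference (illustrative helper, not part of the patch):

    static void example_compute_masks(unsigned long cur_mask,
                                      unsigned long new_mask,
                                      unsigned long *added_mask,
                                      unsigned long *removed_mask)
    {
            *added_mask = new_mask & ~cur_mask;     /* bits to be bound */
            *removed_mask = cur_mask & ~new_mask;   /* bits to be unbound */
    }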
@@ -1025,38 +1036,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		return -EBUSY;
 
 	/* Process each subsystem */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
+
 		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
-			BUG_ON(!dummytop->subsys[i]);
-			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-			cgrp->subsys[i] = dummytop->subsys[i];
+			BUG_ON(!cgroup_dummy_top->subsys[i]);
+			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+
+			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgrp);
+
 			/* refcount was already taken, and we're keeping it */
+			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
-			BUG_ON(ss == NULL);
-			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+
 			if (ss->bind)
-				ss->bind(dummytop);
-			dummytop->subsys[i]->cgroup = dummytop;
+				ss->bind(cgroup_dummy_top);
+			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
 			cgrp->subsys[i] = NULL;
-			subsys[i]->root = &rootnode;
-			list_move(&ss->sibling, &rootnode.subsys_list);
+			cgroup_subsys[i]->root = &cgroup_dummy_root;
+			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
+
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
-		} else if (bit & final_subsys_mask) {
+			root->subsys_mask &= ~bit;
+		} else if (bit & root->subsys_mask) {
 			/* Subsystem state should already exist */
-			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
 			/*
 			 * a refcount was taken, but we already had one, so
@@ -1071,7 +1085,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 		}
 	}
-	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
+
+	/*
+	 * Mark that @root has finished binding subsystems.  @root->subsys_mask
+	 * now matches the bound subsystems.
+	 */
+	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
 	return 0;
 }
@@ -1082,7 +1101,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_root_mutex);
-	for_each_subsys(root, ss)
+	for_each_root_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1133,19 @@ struct cgroup_sb_opts {
 };
 
 /*
- * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
- * with cgroup_mutex held to protect the subsys[] array. This function takes
- * refcounts on subsystems to be used, unless it returns error, in which case
- * no refcounts are taken.
+ * Convert a hierarchy specifier into a bitmask of subsystems and
+ * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
+ * array. This function takes refcounts on subsystems to be used, unless it
+ * returns error, in which case no refcounts are taken.
 */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
-	int i;
 	bool module_pin_failed = false;
+	struct cgroup_subsys *ss;
+	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
@@ -1202,10 +1222,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss == NULL)
-				continue;
+		for_each_subsys(ss, i) {
 			if (strcmp(token, ss->name))
 				continue;
 			if (ss->disabled)
@@ -1228,16 +1245,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * otherwise if 'none', 'name=' and a subsystem name options
 	 * were not specified, let's default to 'all'
 	 */
-	if (all_ss || (!one_ss && !opts->none && !opts->name)) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss == NULL)
-				continue;
-			if (ss->disabled)
-				continue;
-			set_bit(i, &opts->subsys_mask);
-		}
-	}
+	if (all_ss || (!one_ss && !opts->none && !opts->name))
+		for_each_subsys(ss, i)
+			if (!ss->disabled)
+				set_bit(i, &opts->subsys_mask);
 
 	/* Consistency checks */
 
@@ -1281,12 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * take duplicate reference counts on a subsystem that's already used,
 	 * but rebind_subsystems handles this case.
 	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		unsigned long bit = 1UL << i;
-
-		if (!(bit & opts->subsys_mask))
+	for_each_subsys(ss, i) {
+		if (!(opts->subsys_mask & (1UL << i)))
 			continue;
-		if (!try_module_get(subsys[i]->module)) {
+		if (!try_module_get(cgroup_subsys[i]->module)) {
 			module_pin_failed = true;
 			break;
 		}
@@ -1303,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 		if (!(bit & opts->subsys_mask))
 			continue;
-		module_put(subsys[i]->module);
+		module_put(cgroup_subsys[i]->module);
 	}
 	return -ENOENT;
 	}
@@ -1313,14 +1322,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 {
+	struct cgroup_subsys *ss;
 	int i;
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		unsigned long bit = 1UL << i;
 
-		if (!(bit & subsys_mask))
-			continue;
-		module_put(subsys[i]->module);
-	}
+	mutex_lock(&cgroup_mutex);
+	for_each_subsys(ss, i)
+		if (subsys_mask & (1UL << i))
+			module_put(cgroup_subsys[i]->module);
+	mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
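The pin/unpin pair around parse_cgroupfs_options() follows the usual try_module_get() rollback discipline. A sketch of the idea (illustrative only; the mask trick releases just the subsystems pinned before the failure point):

    static int example_pin_subsystems(unsigned long subsys_mask)
    {
            struct cgroup_subsys *ss;
            int i;

            mutex_lock(&cgroup_mutex);
            for_each_subsys(ss, i) {
                    if (!(subsys_mask & (1UL << i)))
                            continue;
                    if (!try_module_get(ss->module)) {
                            mutex_unlock(&cgroup_mutex);
                            /* release only the bits below the failed index */
                            drop_parsed_module_refcounts(subsys_mask &
                                                         ((1UL << i) - 1));
                            return -ENOENT;
                    }
            }
            mutex_unlock(&cgroup_mutex);
            return 0;
    }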
@@ -1345,7 +1354,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 
@@ -1353,10 +1362,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
 	/* Don't allow flags or name to change at remount */
-	if (opts.flags != root->flags ||
+	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
 	    (opts.name && strcmp(opts.name, root->name))) {
+		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
 		ret = -EINVAL;
-		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
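The XOR test above compares only user-visible option bits, so internal flags such as CGRP_ROOT_SUBSYS_BOUND no longer cause a spurious mismatch. Equivalently (illustrative helper):

    static bool example_options_differ(unsigned long a, unsigned long b)
    {
            /* XOR leaves only the differing bits; mask away internal state */
            return (a ^ b) & CGRP_ROOT_OPTION_MASK;
    }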
@@ -1367,11 +1378,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	 */
 	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 
-	ret = rebind_subsystems(root, opts.subsys_mask);
+	ret = rebind_subsystems(root, added_mask, removed_mask);
 	if (ret) {
 		/* rebind_subsystems failed, re-populate the removed files */
 		cgroup_populate_dir(cgrp, false, removed_mask);
-		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
@@ -1386,6 +1396,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+	if (ret)
+		drop_parsed_module_refcounts(opts.subsys_mask);
 	return ret;
 }
 
@@ -1401,11 +1413,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->files);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->allcg_node);
+	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
-	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1428,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
-	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
-	cgrp->name = &root_cgroup_name;
+	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
 	init_cgroup_housekeeping(cgrp);
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
 
-static bool init_root_id(struct cgroupfs_root *root)
+static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
 {
-	int ret = 0;
+	int id;
 
-	do {
-		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
-			return false;
-		spin_lock(&hierarchy_id_lock);
-		/* Try to allocate the next unused ID */
-		ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
-					&root->hierarchy_id);
-		if (ret == -ENOSPC)
-			/* Try again starting from 0 */
-			ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
-		if (!ret) {
-			next_hierarchy_id = root->hierarchy_id + 1;
-		} else if (ret != -EAGAIN) {
-			/* Can only get here if the 31-bit IDR is full ... */
-			BUG_ON(ret);
-		}
-		spin_unlock(&hierarchy_id_lock);
-	} while (ret);
-	return true;
+	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&cgroup_root_mutex);
+
+	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
+			      GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	root->hierarchy_id = id;
+	return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroupfs_root *root)
+{
+	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&cgroup_root_mutex);
+
+	if (root->hierarchy_id) {
+		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+		root->hierarchy_id = 0;
+	}
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
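idr_alloc_cyclic() collapses the old ida + next_hierarchy_id + spinlock dance into a single call that hands out increasing IDs before wrapping. A minimal sketch (illustrative; serialization remains the caller's job, as the lockdep asserts above insist):

    static DEFINE_IDR(example_idr);

    static int example_alloc_id(void *ptr)
    {
            /* start at 2; end == 0 means "no upper bound" */
            return idr_alloc_cyclic(&example_idr, ptr, 2, 0, GFP_KERNEL);
    }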
@@ -1482,12 +1492,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	if (!init_root_id(root)) {
-		kfree(root);
-		return ERR_PTR(-ENOMEM);
-	}
 	init_cgroup_root(root);
 
+	/*
+	 * We need to set @root->subsys_mask now so that @root can be
+	 * matched by cgroup_test_super() before it finishes
+	 * initialization; otherwise, competing mounts with the same
+	 * options may try to bind the same subsystems instead of waiting
+	 * for the first one leading to unexpected mount errors.
+	 * SUBSYS_BOUND will be set once actual binding is complete.
+	 */
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
 	ida_init(&root->cgroup_ida);
@@ -1500,17 +1514,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	return root;
 }
 
-static void cgroup_drop_root(struct cgroupfs_root *root)
+static void cgroup_free_root(struct cgroupfs_root *root)
 {
-	if (!root)
-		return;
+	if (root) {
+		/* hierarchy ID should already have been released */
+		WARN_ON_ONCE(root->hierarchy_id);
 
-	BUG_ON(!root->hierarchy_id);
-	spin_lock(&hierarchy_id_lock);
-	ida_remove(&hierarchy_ida, root->hierarchy_id);
-	spin_unlock(&hierarchy_id_lock);
-	ida_destroy(&root->cgroup_ida);
-	kfree(root);
+		ida_destroy(&root->cgroup_ida);
+		kfree(root);
+	}
 }
 
 static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
-		cgroup_drop_root(opts.new_root);
+		cgroup_free_root(opts.new_root);
 		goto drop_modules;
 	}
 
@@ -1605,12 +1617,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	BUG_ON(!root);
 	if (root == opts.new_root) {
 		/* We used the new root structure, so this is a new hierarchy */
-		struct list_head tmp_cg_links;
+		struct list_head tmp_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
 		struct cgroupfs_root *existing_root;
 		const struct cred *cred;
 		int i;
-		struct css_set *cg;
+		struct css_set *cset;
 
 		BUG_ON(sb->s_root != NULL);
 
@@ -1637,13 +1649,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * that's us. The worst that can happen is that we
 		 * have some link structures left over
 		 */
-		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
+		ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
 		if (ret)
 			goto unlock_drop;
 
-		ret = rebind_subsystems(root, root->subsys_mask);
+		/* ID 0 is reserved for dummy root, 1 for unified hierarchy */
+		ret = cgroup_init_root_id(root, 2, 0);
+		if (ret)
+			goto unlock_drop;
+
+		ret = rebind_subsystems(root, root->subsys_mask, 0);
 		if (ret == -EBUSY) {
-			free_cg_links(&tmp_cg_links);
+			free_cgrp_cset_links(&tmp_links);
 			goto unlock_drop;
 		}
 		/*
@@ -1655,8 +1672,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		/* EBUSY should be the only error here */
 		BUG_ON(ret);
 
-		list_add(&root->root_list, &roots);
-		root_count++;
+		list_add(&root->root_list, &cgroup_roots);
+		cgroup_root_count++;
 
 		sb->s_root->d_fsdata = root_cgrp;
 		root->top_cgroup.dentry = sb->s_root;
@@ -1664,11 +1681,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
-		hash_for_each(css_set_table, i, cg, hlist)
-			link_css_set(&tmp_cg_links, cg, root_cgrp);
+		hash_for_each(css_set_table, i, cset, hlist)
+			link_css_set(&tmp_links, cset, root_cgrp);
 		write_unlock(&css_set_lock);
 
-		free_cg_links(&tmp_cg_links);
+		free_cgrp_cset_links(&tmp_links);
 
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
@@ -1684,9 +1701,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * We re-used an existing hierarchy - the new root (if
 		 * any) is not needed
 		 */
-		cgroup_drop_root(opts.new_root);
+		cgroup_free_root(opts.new_root);
 
-		if (root->flags != opts.flags) {
+		if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
 			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
 				pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
 				ret = -EINVAL;
1705 return dget(sb->s_root); 1722 return dget(sb->s_root);
1706 1723
1707 unlock_drop: 1724 unlock_drop:
1725 cgroup_exit_root_id(root);
1708 mutex_unlock(&cgroup_root_mutex); 1726 mutex_unlock(&cgroup_root_mutex);
1709 mutex_unlock(&cgroup_mutex); 1727 mutex_unlock(&cgroup_mutex);
1710 mutex_unlock(&inode->i_mutex); 1728 mutex_unlock(&inode->i_mutex);
@@ -1721,9 +1739,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1721static void cgroup_kill_sb(struct super_block *sb) { 1739static void cgroup_kill_sb(struct super_block *sb) {
1722 struct cgroupfs_root *root = sb->s_fs_info; 1740 struct cgroupfs_root *root = sb->s_fs_info;
1723 struct cgroup *cgrp = &root->top_cgroup; 1741 struct cgroup *cgrp = &root->top_cgroup;
1742 struct cgrp_cset_link *link, *tmp_link;
1724 int ret; 1743 int ret;
1725 struct cg_cgroup_link *link;
1726 struct cg_cgroup_link *saved_link;
1727 1744
1728 BUG_ON(!root); 1745 BUG_ON(!root);
1729 1746
@@ -1734,36 +1751,39 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	mutex_lock(&cgroup_root_mutex);
 
 	/* Rebind all subsystems back to the default hierarchy */
-	ret = rebind_subsystems(root, 0);
-	/* Shouldn't be able to fail ... */
-	BUG_ON(ret);
+	if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
+		ret = rebind_subsystems(root, 0, root->subsys_mask);
+		/* Shouldn't be able to fail ... */
+		BUG_ON(ret);
+	}
 
 	/*
-	 * Release all the links from css_sets to this hierarchy's
+	 * Release all the links from cset_links to this hierarchy's
 	 * root cgroup
 	 */
 	write_lock(&css_set_lock);
 
-	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
-				 cgrp_link_list) {
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
+	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
 		kfree(link);
 	}
 	write_unlock(&css_set_lock);
 
 	if (!list_empty(&root->root_list)) {
 		list_del(&root->root_list);
-		root_count--;
+		cgroup_root_count--;
 	}
 
+	cgroup_exit_root_id(root);
+
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 
 	simple_xattrs_free(&cgrp->xattrs);
 
 	kill_litter_super(sb);
-	cgroup_drop_root(root);
+	cgroup_free_root(root);
 }
 
 static struct file_system_type cgroup_fs_type = {
@@ -1825,6 +1845,38 @@ out:
1825} 1845}
1826EXPORT_SYMBOL_GPL(cgroup_path); 1846EXPORT_SYMBOL_GPL(cgroup_path);
1827 1847
1848/**
1849 * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy
1850 * @task: target task
1851 * @hierarchy_id: the hierarchy to look up @task's cgroup from
1852 * @buf: the buffer to write the path into
1853 * @buflen: the length of the buffer
1854 *
1855 * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and
1856 * copy its path into @buf. This function grabs cgroup_mutex and shouldn't
1857 * be used inside locks used by cgroup controller callbacks.
1858 */
1859int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
1860 char *buf, size_t buflen)
1861{
1862 struct cgroupfs_root *root;
1863 struct cgroup *cgrp = NULL;
1864 int ret = -ENOENT;
1865
1866 mutex_lock(&cgroup_mutex);
1867
1868 root = idr_find(&cgroup_hierarchy_idr, hierarchy_id);
1869 if (root) {
1870 cgrp = task_cgroup_from_root(task, root);
1871 ret = cgroup_path(cgrp, buf, buflen);
1872 }
1873
1874 mutex_unlock(&cgroup_mutex);
1875
1876 return ret;
1877}
1878EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
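A minimal sketch of a caller, assuming only the helper above plus core kernel symbols (current, PATH_MAX, pr_info); the hierarchy ID 1 is illustrative:

	char buf[PATH_MAX];
	int ret;

	/* resolve current's cgroup path on hierarchy 1, if such a hierarchy exists */
	ret = task_cgroup_path_from_hierarchy(current, 1, buf, sizeof(buf));
	if (ret == 0)
		pr_info("cgroup path: %s\n", buf);
	else if (ret == -ENOENT)
		pr_info("no hierarchy with id 1\n");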
1879
1828/* 1880/*
1829 * Control Group taskset 1881 * Control Group taskset
1830 */ 1882 */
@@ -1910,10 +1962,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1910 * 1962 *
1911 * Must be called with cgroup_mutex and threadgroup locked. 1963 * Must be called with cgroup_mutex and threadgroup locked.
1912 */ 1964 */
1913static void cgroup_task_migrate(struct cgroup *oldcgrp, 1965static void cgroup_task_migrate(struct cgroup *old_cgrp,
1914 struct task_struct *tsk, struct css_set *newcg) 1966 struct task_struct *tsk,
1967 struct css_set *new_cset)
1915{ 1968{
1916 struct css_set *oldcg; 1969 struct css_set *old_cset;
1917 1970
1918 /* 1971 /*
1919 * We are synchronized through threadgroup_lock() against PF_EXITING 1972 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1921,25 +1974,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
1921 * css_set to init_css_set and dropping the old one. 1974 * css_set to init_css_set and dropping the old one.
1922 */ 1975 */
1923 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1976 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1924 oldcg = tsk->cgroups; 1977 old_cset = task_css_set(tsk);
1925 1978
1926 task_lock(tsk); 1979 task_lock(tsk);
1927 rcu_assign_pointer(tsk->cgroups, newcg); 1980 rcu_assign_pointer(tsk->cgroups, new_cset);
1928 task_unlock(tsk); 1981 task_unlock(tsk);
1929 1982
1930 /* Update the css_set linked lists if we're using them */ 1983 /* Update the css_set linked lists if we're using them */
1931 write_lock(&css_set_lock); 1984 write_lock(&css_set_lock);
1932 if (!list_empty(&tsk->cg_list)) 1985 if (!list_empty(&tsk->cg_list))
1933 list_move(&tsk->cg_list, &newcg->tasks); 1986 list_move(&tsk->cg_list, &new_cset->tasks);
1934 write_unlock(&css_set_lock); 1987 write_unlock(&css_set_lock);
1935 1988
1936 /* 1989 /*
1937 * We just gained a reference on oldcg by taking it from the task. As 1990 * We just gained a reference on old_cset by taking it from the
1938 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1991 * task. As trading it for new_cset is protected by cgroup_mutex,
1939 * it here; it will be freed under RCU. 1992 * we're safe to drop it here; it will be freed under RCU.
1940 */ 1993 */
1941 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1994 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1942 put_css_set(oldcg); 1995 put_css_set(old_cset);
1943} 1996}
1944 1997
1945/** 1998/**
@@ -2029,7 +2082,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2029 /* 2082 /*
2030 * step 1: check that we can legitimately attach to the cgroup. 2083 * step 1: check that we can legitimately attach to the cgroup.
2031 */ 2084 */
2032 for_each_subsys(root, ss) { 2085 for_each_root_subsys(root, ss) {
2033 if (ss->can_attach) { 2086 if (ss->can_attach) {
2034 retval = ss->can_attach(cgrp, &tset); 2087 retval = ss->can_attach(cgrp, &tset);
2035 if (retval) { 2088 if (retval) {
@@ -2044,8 +2097,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2044 * we use find_css_set, which allocates a new one if necessary. 2097 * we use find_css_set, which allocates a new one if necessary.
2045 */ 2098 */
2046 for (i = 0; i < group_size; i++) { 2099 for (i = 0; i < group_size; i++) {
2100 struct css_set *old_cset;
2101
2047 tc = flex_array_get(group, i); 2102 tc = flex_array_get(group, i);
2048 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2103 old_cset = task_css_set(tc->task);
2104 tc->cg = find_css_set(old_cset, cgrp);
2049 if (!tc->cg) { 2105 if (!tc->cg) {
2050 retval = -ENOMEM; 2106 retval = -ENOMEM;
2051 goto out_put_css_set_refs; 2107 goto out_put_css_set_refs;
@@ -2066,7 +2122,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2066 /* 2122 /*
2067 * step 4: do subsystem attach callbacks. 2123 * step 4: do subsystem attach callbacks.
2068 */ 2124 */
2069 for_each_subsys(root, ss) { 2125 for_each_root_subsys(root, ss) {
2070 if (ss->attach) 2126 if (ss->attach)
2071 ss->attach(cgrp, &tset); 2127 ss->attach(cgrp, &tset);
2072 } 2128 }
@@ -2086,7 +2142,7 @@ out_put_css_set_refs:
2086 } 2142 }
2087out_cancel_attach: 2143out_cancel_attach:
2088 if (retval) { 2144 if (retval) {
2089 for_each_subsys(root, ss) { 2145 for_each_root_subsys(root, ss) {
2090 if (ss == failed_ss) 2146 if (ss == failed_ss)
2091 break; 2147 break;
2092 if (ss->cancel_attach) 2148 if (ss->cancel_attach)
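The loops above complete a prepare/commit/abort contract: every subsystem whose ->can_attach() returned 0 is promised either an ->attach() or a ->cancel_attach(). A sketch of how a hypothetical controller might implement the pair, using the callback signatures from this file (the kernel-thread policy is invented for illustration):

	static int demo_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
	{
		struct task_struct *task;

		/* veto the whole migration if any task is a kernel thread */
		cgroup_taskset_for_each(task, NULL, tset)
			if (task->flags & PF_KTHREAD)
				return -EINVAL;
		return 0;	/* success promises attach() or cancel_attach() */
	}

	static void demo_cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
	{
		/* roll back whatever demo_can_attach() reserved */
	}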
@@ -2323,7 +2379,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2323 struct cftype *cft = __d_cft(file->f_dentry); 2379 struct cftype *cft = __d_cft(file->f_dentry);
2324 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2380 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2325 2381
2326 if (cgroup_is_removed(cgrp)) 2382 if (cgroup_is_dead(cgrp))
2327 return -ENODEV; 2383 return -ENODEV;
2328 if (cft->write) 2384 if (cft->write)
2329 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2385 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2368,7 +2424,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2368 struct cftype *cft = __d_cft(file->f_dentry); 2424 struct cftype *cft = __d_cft(file->f_dentry);
2369 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2425 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2370 2426
2371 if (cgroup_is_removed(cgrp)) 2427 if (cgroup_is_dead(cgrp))
2372 return -ENODEV; 2428 return -ENODEV;
2373 2429
2374 if (cft->read) 2430 if (cft->read)
@@ -2435,10 +2491,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2435 cft = __d_cft(file->f_dentry); 2491 cft = __d_cft(file->f_dentry);
2436 2492
2437 if (cft->read_map || cft->read_seq_string) { 2493 if (cft->read_map || cft->read_seq_string) {
2438 struct cgroup_seqfile_state *state = 2494 struct cgroup_seqfile_state *state;
2439 kzalloc(sizeof(*state), GFP_USER); 2495
2496 state = kzalloc(sizeof(*state), GFP_USER);
2440 if (!state) 2497 if (!state)
2441 return -ENOMEM; 2498 return -ENOMEM;
2499
2442 state->cft = cft; 2500 state->cft = cft;
2443 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2501 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2444 file->f_op = &cgroup_seqfile_operations; 2502 file->f_op = &cgroup_seqfile_operations;
@@ -2486,6 +2544,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2486 2544
2487 cgrp = __d_cgrp(old_dentry); 2545 cgrp = __d_cgrp(old_dentry);
2488 2546
2547 /*
2548 * This isn't a proper migration and its usefulness is very
2549 * limited. Disallow if sane_behavior.
2550 */
2551 if (cgroup_sane_behavior(cgrp))
2552 return -EPERM;
2553
2489 name = cgroup_alloc_name(new_dentry); 2554 name = cgroup_alloc_name(new_dentry);
2490 if (!name) 2555 if (!name)
2491 return -ENOMEM; 2556 return -ENOMEM;
@@ -2496,7 +2561,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2496 return ret; 2561 return ret;
2497 } 2562 }
2498 2563
2499 old_name = cgrp->name; 2564 old_name = rcu_dereference_protected(cgrp->name, true);
2500 rcu_assign_pointer(cgrp->name, name); 2565 rcu_assign_pointer(cgrp->name, name);
2501 2566
2502 kfree_rcu(old_name, rcu_head); 2567 kfree_rcu(old_name, rcu_head);
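The rename above is the stock RCU replace-and-retire idiom: fetch the old pointer with rcu_dereference_protected() under the writer-side lock, publish the replacement with rcu_assign_pointer(), and let kfree_rcu() defer the free past a grace period. In generic form (struct demo_name, demo_lock and obj are hypothetical):

	struct demo_name {
		struct rcu_head rcu_head;
		char val[64];
	};
	struct demo_name *old, *new;

	/* writer side, demo_lock held */
	old = rcu_dereference_protected(obj->name, lockdep_is_held(&demo_lock));
	rcu_assign_pointer(obj->name, new);
	kfree_rcu(old, rcu_head);	/* freed only after current readers finish */

	/* reader side */
	rcu_read_lock();
	pr_info("%s\n", rcu_dereference(obj->name)->val);
	rcu_read_unlock();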
@@ -2747,58 +2812,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 return ret; 2812 return ret;
2748} 2813}
2749 2814
2750static DEFINE_MUTEX(cgroup_cft_mutex);
2751
2752static void cgroup_cfts_prepare(void) 2815static void cgroup_cfts_prepare(void)
2753 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) 2816 __acquires(&cgroup_mutex)
2754{ 2817{
2755 /* 2818 /*
2756 * Thanks to the entanglement with vfs inode locking, we can't walk 2819 * Thanks to the entanglement with vfs inode locking, we can't walk
2757 * the existing cgroups under cgroup_mutex and create files. 2820 * the existing cgroups under cgroup_mutex and create files.
2758 * Instead, we increment reference on all cgroups and build list of 2821 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
2759 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2822 * read lock before calling cgroup_addrm_files().
2760 * exclusive access to the field.
2761 */ 2823 */
2762 mutex_lock(&cgroup_cft_mutex);
2763 mutex_lock(&cgroup_mutex); 2824 mutex_lock(&cgroup_mutex);
2764} 2825}
2765 2826
2766static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2827static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2767 struct cftype *cfts, bool is_add) 2828 struct cftype *cfts, bool is_add)
2768 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2829 __releases(&cgroup_mutex)
2769{ 2830{
2770 LIST_HEAD(pending); 2831 LIST_HEAD(pending);
2771 struct cgroup *cgrp, *n; 2832 struct cgroup *cgrp, *root = &ss->root->top_cgroup;
2833 struct super_block *sb = ss->root->sb;
2834 struct dentry *prev = NULL;
2835 struct inode *inode;
2836 u64 update_before;
2772 2837
2773 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2838 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2774 if (cfts && ss->root != &rootnode) { 2839 if (!cfts || ss->root == &cgroup_dummy_root ||
2775 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { 2840 !atomic_inc_not_zero(&sb->s_active)) {
2776 dget(cgrp->dentry); 2841 mutex_unlock(&cgroup_mutex);
2777 list_add_tail(&cgrp->cft_q_node, &pending); 2842 return;
2778 }
2779 } 2843 }
2780 2844
2781 mutex_unlock(&cgroup_mutex);
2782
2783 /* 2845 /*
2784 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm 2846 * All cgroups which are created after we drop cgroup_mutex will
2785 * files for all cgroups which were created before. 2847 * have the updated set of files, so we only need to update the
2848 * cgroups created before the current @cgroup_serial_nr_next.
2786 */ 2849 */
2787 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { 2850 update_before = cgroup_serial_nr_next;
2788 struct inode *inode = cgrp->dentry->d_inode; 2851
2852 mutex_unlock(&cgroup_mutex);
2853
2854 /* @root always needs to be updated */
2855 inode = root->dentry->d_inode;
2856 mutex_lock(&inode->i_mutex);
2857 mutex_lock(&cgroup_mutex);
2858 cgroup_addrm_files(root, ss, cfts, is_add);
2859 mutex_unlock(&cgroup_mutex);
2860 mutex_unlock(&inode->i_mutex);
2861
2862 /* add/rm files for all cgroups created before */
2863 rcu_read_lock();
2864 cgroup_for_each_descendant_pre(cgrp, root) {
2865 if (cgroup_is_dead(cgrp))
2866 continue;
2867
2868 inode = cgrp->dentry->d_inode;
2869 dget(cgrp->dentry);
2870 rcu_read_unlock();
2871
2872 dput(prev);
2873 prev = cgrp->dentry;
2789 2874
2790 mutex_lock(&inode->i_mutex); 2875 mutex_lock(&inode->i_mutex);
2791 mutex_lock(&cgroup_mutex); 2876 mutex_lock(&cgroup_mutex);
2792 if (!cgroup_is_removed(cgrp)) 2877 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2793 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2878 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2794 mutex_unlock(&cgroup_mutex); 2879 mutex_unlock(&cgroup_mutex);
2795 mutex_unlock(&inode->i_mutex); 2880 mutex_unlock(&inode->i_mutex);
2796 2881
2797 list_del_init(&cgrp->cft_q_node); 2882 rcu_read_lock();
2798 dput(cgrp->dentry);
2799 } 2883 }
2800 2884 rcu_read_unlock();
2801 mutex_unlock(&cgroup_cft_mutex); 2885 dput(prev);
2886 deactivate_super(sb);
2802} 2887}
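Condensed, the commit path above rests on two invariants established earlier in this file: serial numbers are handed out under cgroup_mutex in increasing order, and cgroups created after the update already receive the new cftsets. The cutoff logic then reduces to:

	mutex_lock(&cgroup_mutex);
	update_before = cgroup_serial_nr_next;	/* snapshot the cutoff */
	mutex_unlock(&cgroup_mutex);

	/*
	 * Anything created from here on has serial_nr >= update_before and
	 * already owns the new files; the walk only retrofits older,
	 * still-live cgroups.
	 */
	if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
		cgroup_addrm_files(cgrp, ss, cfts, is_add);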
2803 2888
2804/** 2889/**
@@ -2853,7 +2938,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2853 2938
2854 list_for_each_entry(set, &ss->cftsets, node) { 2939 list_for_each_entry(set, &ss->cftsets, node) {
2855 if (set->cfts == cfts) { 2940 if (set->cfts == cfts) {
2856 list_del_init(&set->node); 2941 list_del(&set->node);
2942 kfree(set);
2857 cgroup_cfts_commit(ss, cfts, false); 2943 cgroup_cfts_commit(ss, cfts, false);
2858 return 0; 2944 return 0;
2859 } 2945 }
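cgroup_rm_cftypes() is the inverse of cgroup_add_cftypes(); together they let a controller add files to every existing and future cgroup after boot. A sketch of the registration side, with a hypothetical cftype array and handler (note the empty terminating entry):

	static struct cftype demo_files[] = {
		{
			.name = "demo.value",
			.read_u64 = demo_read_u64,	/* hypothetical handler */
		},
		{ }	/* terminator */
	};

	ret = cgroup_add_cftypes(&demo_subsys, demo_files);	/* files appear on all cgroups */
	/* ... later, e.g. on module unload ... */
	ret = cgroup_rm_cftypes(&demo_subsys, demo_files);	/* and are removed again */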
@@ -2872,12 +2958,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2872int cgroup_task_count(const struct cgroup *cgrp) 2958int cgroup_task_count(const struct cgroup *cgrp)
2873{ 2959{
2874 int count = 0; 2960 int count = 0;
2875 struct cg_cgroup_link *link; 2961 struct cgrp_cset_link *link;
2876 2962
2877 read_lock(&css_set_lock); 2963 read_lock(&css_set_lock);
2878 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2964 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2879 count += atomic_read(&link->cg->refcount); 2965 count += atomic_read(&link->cset->refcount);
2880 }
2881 read_unlock(&css_set_lock); 2966 read_unlock(&css_set_lock);
2882 return count; 2967 return count;
2883} 2968}
@@ -2886,25 +2971,24 @@ int cgroup_task_count(const struct cgroup *cgrp)
2886 * Advance a list_head iterator. The iterator should be positioned at 2971 * Advance a list_head iterator. The iterator should be positioned at
2887 * the start of a css_set 2972 * the start of a css_set
2888 */ 2973 */
2889static void cgroup_advance_iter(struct cgroup *cgrp, 2974static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
2890 struct cgroup_iter *it)
2891{ 2975{
2892 struct list_head *l = it->cg_link; 2976 struct list_head *l = it->cset_link;
2893 struct cg_cgroup_link *link; 2977 struct cgrp_cset_link *link;
2894 struct css_set *cg; 2978 struct css_set *cset;
2895 2979
2896 /* Advance to the next non-empty css_set */ 2980 /* Advance to the next non-empty css_set */
2897 do { 2981 do {
2898 l = l->next; 2982 l = l->next;
2899 if (l == &cgrp->css_sets) { 2983 if (l == &cgrp->cset_links) {
2900 it->cg_link = NULL; 2984 it->cset_link = NULL;
2901 return; 2985 return;
2902 } 2986 }
2903 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2987 link = list_entry(l, struct cgrp_cset_link, cset_link);
2904 cg = link->cg; 2988 cset = link->cset;
2905 } while (list_empty(&cg->tasks)); 2989 } while (list_empty(&cset->tasks));
2906 it->cg_link = l; 2990 it->cset_link = l;
2907 it->task = cg->tasks.next; 2991 it->task = cset->tasks.next;
2908} 2992}
2909 2993
2910/* 2994/*
@@ -2934,7 +3018,7 @@ static void cgroup_enable_task_cg_lists(void)
2934 * entry won't be deleted though the process has exited. 3018 * entry won't be deleted though the process has exited.
2935 */ 3019 */
2936 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 3020 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2937 list_add(&p->cg_list, &p->cgroups->tasks); 3021 list_add(&p->cg_list, &task_css_set(p)->tasks);
2938 task_unlock(p); 3022 task_unlock(p);
2939 } while_each_thread(g, p); 3023 } while_each_thread(g, p);
2940 read_unlock(&tasklist_lock); 3024 read_unlock(&tasklist_lock);
@@ -2942,12 +3026,67 @@ static void cgroup_enable_task_cg_lists(void)
2942} 3026}
2943 3027
2944/** 3028/**
3029 * cgroup_next_sibling - find the next sibling of a given cgroup
3030 * @pos: the current cgroup
3031 *
3032 * This function returns the next sibling of @pos and should be called
3033 * under RCU read lock. The only requirement is that @pos is accessible.
3034 * The next sibling is guaranteed to be returned regardless of @pos's
3035 * state.
3036 */
3037struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3038{
3039 struct cgroup *next;
3040
3041 WARN_ON_ONCE(!rcu_read_lock_held());
3042
3043 /*
3044 * @pos could already have been removed. Once a cgroup is removed,
3045 * its ->sibling.next is no longer updated when its next sibling
3046 * changes. As CGRP_DEAD assertion is serialized and happens
3047 * before the cgroup is taken off the ->sibling list, if we see it
3048 * unasserted, it's guaranteed that the next sibling hasn't
3049 * finished its grace period even if it's already removed, and thus
3050 * safe to dereference from this RCU critical section. If
3051 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3052 * to be visible as %true here.
3053 */
3054 if (likely(!cgroup_is_dead(pos))) {
3055 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3056 if (&next->sibling != &pos->parent->children)
3057 return next;
3058 return NULL;
3059 }
3060
3061 /*
3062 * Can't dereference the next pointer. Each cgroup is given a
3063 * monotonically increasing unique serial number and always
3064 * appended to the sibling list, so the next one can be found by
3065 * walking the parent's children until we see a cgroup with higher
3066 * serial number than @pos's.
3067 *
3068 * While this path can be slow, it's taken only when either the
3069 * current cgroup is removed or iteration and removal race.
3070 */
3071 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3072 if (next->serial_nr > pos->serial_nr)
3073 return next;
3074 return NULL;
3075}
3076EXPORT_SYMBOL_GPL(cgroup_next_sibling);
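Because the slow path above needs nothing from @pos except its serial number, a traversal may drop the RCU lock between steps and still resume correctly even if @pos has been removed in the meantime. Typical sibling-walk usage (visit() stands in for the caller's work):

	rcu_read_lock();
	for (pos = list_first_or_null_rcu(&parent->children, struct cgroup, sibling);
	     pos; pos = cgroup_next_sibling(pos))
		visit(pos);	/* hypothetical per-cgroup work */
	rcu_read_unlock();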
3077
3078/**
2945 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3079 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2946 * @pos: the current position (%NULL to initiate traversal) 3080 * @pos: the current position (%NULL to initiate traversal)
2947 * @cgroup: cgroup whose descendants to walk 3081 * @cgroup: cgroup whose descendants to walk
2948 * 3082 *
2949 * To be used by cgroup_for_each_descendant_pre(). Find the next 3083 * To be used by cgroup_for_each_descendant_pre(). Find the next
2950 * descendant to visit for pre-order traversal of @cgroup's descendants. 3084 * descendant to visit for pre-order traversal of @cgroup's descendants.
3085 *
3086 * While this function requires RCU read locking, it doesn't require the
3087 * whole traversal to be contained in a single RCU critical section. This
3088 * function will return the correct next descendant as long as both @pos
3089 * and @cgroup are accessible and @pos is a descendant of @cgroup.
2951 */ 3090 */
2952struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3091struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2953 struct cgroup *cgroup) 3092 struct cgroup *cgroup)
@@ -2967,11 +3106,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2967 3106
2968 /* no child, visit my or the closest ancestor's next sibling */ 3107 /* no child, visit my or the closest ancestor's next sibling */
2969 while (pos != cgroup) { 3108 while (pos != cgroup) {
2970 next = list_entry_rcu(pos->sibling.next, struct cgroup, 3109 next = cgroup_next_sibling(pos);
2971 sibling); 3110 if (next)
2972 if (&next->sibling != &pos->parent->children)
2973 return next; 3111 return next;
2974
2975 pos = pos->parent; 3112 pos = pos->parent;
2976 } 3113 }
2977 3114
@@ -2986,6 +3123,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
2986 * Return the rightmost descendant of @pos. If there's no descendant, 3123 * Return the rightmost descendant of @pos. If there's no descendant,
2987 * @pos is returned. This can be used during pre-order traversal to skip 3124 * @pos is returned. This can be used during pre-order traversal to skip
2988 * subtree of @pos. 3125 * subtree of @pos.
3126 *
3127 * While this function requires RCU read locking, it doesn't require the
3128 * whole traversal to be contained in a single RCU critical section. This
3129 * function will return the correct rightmost descendant as long as @pos is
3130 * accessible.
2989 */ 3131 */
2990struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3132struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
2991{ 3133{
@@ -3025,6 +3167,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3025 * 3167 *
3026 * To be used by cgroup_for_each_descendant_post(). Find the next 3168 * To be used by cgroup_for_each_descendant_post(). Find the next
3027 * descendant to visit for post-order traversal of @cgroup's descendants. 3169 * descendant to visit for post-order traversal of @cgroup's descendants.
3170 *
3171 * While this function requires RCU read locking, it doesn't require the
3172 * whole traversal to be contained in a single RCU critical section. This
3173 * function will return the correct next descendant as long as both @pos
3174 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3028 */ 3175 */
3029struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3176struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3030 struct cgroup *cgroup) 3177 struct cgroup *cgroup)
@@ -3040,8 +3187,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3040 } 3187 }
3041 3188
3042 /* if there's an unvisited sibling, visit its leftmost descendant */ 3189 /* if there's an unvisited sibling, visit its leftmost descendant */
3043 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3190 next = cgroup_next_sibling(pos);
3044 if (&next->sibling != &pos->parent->children) 3191 if (next)
3045 return cgroup_leftmost_descendant(next); 3192 return cgroup_leftmost_descendant(next);
3046 3193
3047 /* no sibling left, visit parent */ 3194 /* no sibling left, visit parent */
@@ -3062,7 +3209,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3062 cgroup_enable_task_cg_lists(); 3209 cgroup_enable_task_cg_lists();
3063 3210
3064 read_lock(&css_set_lock); 3211 read_lock(&css_set_lock);
3065 it->cg_link = &cgrp->css_sets; 3212 it->cset_link = &cgrp->cset_links;
3066 cgroup_advance_iter(cgrp, it); 3213 cgroup_advance_iter(cgrp, it);
3067} 3214}
3068 3215
@@ -3071,16 +3218,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3071{ 3218{
3072 struct task_struct *res; 3219 struct task_struct *res;
3073 struct list_head *l = it->task; 3220 struct list_head *l = it->task;
3074 struct cg_cgroup_link *link; 3221 struct cgrp_cset_link *link;
3075 3222
3076 /* If the iterator cg is NULL, we have no tasks */ 3223 /* If the iterator cset_link is NULL, we have no tasks */
3077 if (!it->cg_link) 3224 if (!it->cset_link)
3078 return NULL; 3225 return NULL;
3079 res = list_entry(l, struct task_struct, cg_list); 3226 res = list_entry(l, struct task_struct, cg_list);
3080 /* Advance iterator to find next entry */ 3227 /* Advance iterator to find next entry */
3081 l = l->next; 3228 l = l->next;
3082 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 3229 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3083 if (l == &link->cg->tasks) { 3230 if (l == &link->cset->tasks) {
3084 /* We reached the end of this task list - move on to 3231 /* We reached the end of this task list - move on to
3085 * the next cg_cgroup_link */ 3232 * the next cgrp_cset_link */
3086 cgroup_advance_iter(cgrp, it); 3233 cgroup_advance_iter(cgrp, it);
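cgroup_iter_start/next/end form the standard task-walk triad; css_set_lock is read-held across the whole walk, so the loop body must not sleep. A sketch of a typical caller:

	struct cgroup_iter it;
	struct task_struct *task;

	cgroup_iter_start(cgrp, &it);	/* read-locks css_set_lock */
	while ((task = cgroup_iter_next(cgrp, &it)))
		pr_info("pid %d\n", task_pid_nr(task));	/* no sleeping here */
	cgroup_iter_end(cgrp, &it);	/* drops css_set_lock */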
@@ -3411,7 +3558,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3411 } 3558 }
3412 } 3559 }
3413 /* entry not found; create a new one */ 3560 /* entry not found; create a new one */
3414 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3561 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3415 if (!l) { 3562 if (!l) {
3416 mutex_unlock(&cgrp->pidlist_mutex); 3563 mutex_unlock(&cgrp->pidlist_mutex);
3417 return l; 3564 return l;
@@ -3420,8 +3567,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3420 down_write(&l->mutex); 3567 down_write(&l->mutex);
3421 l->key.type = type; 3568 l->key.type = type;
3422 l->key.ns = get_pid_ns(ns); 3569 l->key.ns = get_pid_ns(ns);
3423 l->use_count = 0; /* don't increment here */
3424 l->list = NULL;
3425 l->owner = cgrp; 3570 l->owner = cgrp;
3426 list_add(&l->links, &cgrp->pidlists); 3571 list_add(&l->links, &cgrp->pidlists);
3427 mutex_unlock(&cgrp->pidlist_mutex); 3572 mutex_unlock(&cgrp->pidlist_mutex);
@@ -3727,6 +3872,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3727} 3872}
3728 3873
3729/* 3874/*
3875 * When dput() is called asynchronously, if umount has been done and
3876 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3877 * there's a small window in which the vfs will see the root dentry with non-zero
3878 * refcnt and trigger BUG().
3879 *
3880 * That's why we hold a reference before dput() and drop it right after.
3881 */
3882static void cgroup_dput(struct cgroup *cgrp)
3883{
3884 struct super_block *sb = cgrp->root->sb;
3885
3886 atomic_inc(&sb->s_active);
3887 dput(cgrp->dentry);
3888 deactivate_super(sb);
3889}
3890
3891/*
3730 * Unregister event and free resources. 3892 * Unregister event and free resources.
3731 * 3893 *
3732 * Gets called from workqueue. 3894 * Gets called from workqueue.
@@ -3746,7 +3908,7 @@ static void cgroup_event_remove(struct work_struct *work)
3746 3908
3747 eventfd_ctx_put(event->eventfd); 3909 eventfd_ctx_put(event->eventfd);
3748 kfree(event); 3910 kfree(event);
3749 dput(cgrp->dentry); 3911 cgroup_dput(cgrp);
3750} 3912}
3751 3913
3752/* 3914/*
@@ -3933,33 +4095,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3933 return 0; 4095 return 0;
3934} 4096}
3935 4097
3936/* 4098static struct cftype cgroup_base_files[] = {
3937 * for the common functions, 'private' gives the type of file
3938 */
3939/* for hysterical raisins, we can't put this on the older files */
3940#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3941static struct cftype files[] = {
3942 {
3943 .name = "tasks",
3944 .open = cgroup_tasks_open,
3945 .write_u64 = cgroup_tasks_write,
3946 .release = cgroup_pidlist_release,
3947 .mode = S_IRUGO | S_IWUSR,
3948 },
3949 { 4099 {
3950 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 4100 .name = "cgroup.procs",
3951 .open = cgroup_procs_open, 4101 .open = cgroup_procs_open,
3952 .write_u64 = cgroup_procs_write, 4102 .write_u64 = cgroup_procs_write,
3953 .release = cgroup_pidlist_release, 4103 .release = cgroup_pidlist_release,
3954 .mode = S_IRUGO | S_IWUSR, 4104 .mode = S_IRUGO | S_IWUSR,
3955 }, 4105 },
3956 { 4106 {
3957 .name = "notify_on_release", 4107 .name = "cgroup.event_control",
3958 .read_u64 = cgroup_read_notify_on_release,
3959 .write_u64 = cgroup_write_notify_on_release,
3960 },
3961 {
3962 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3963 .write_string = cgroup_write_event_control, 4108 .write_string = cgroup_write_event_control,
3964 .mode = S_IWUGO, 4109 .mode = S_IWUGO,
3965 }, 4110 },
@@ -3974,9 +4119,29 @@ static struct cftype files[] = {
3974 .flags = CFTYPE_ONLY_ON_ROOT, 4119 .flags = CFTYPE_ONLY_ON_ROOT,
3975 .read_seq_string = cgroup_sane_behavior_show, 4120 .read_seq_string = cgroup_sane_behavior_show,
3976 }, 4121 },
4122
4123 /*
4124 * Historical crazy stuff. These don't have "cgroup." prefix and
4125 * don't exist if sane_behavior. If you're depending on these, be
4126 * prepared to be burned.
4127 */
4128 {
4129 .name = "tasks",
4130 .flags = CFTYPE_INSANE, /* use "procs" instead */
4131 .open = cgroup_tasks_open,
4132 .write_u64 = cgroup_tasks_write,
4133 .release = cgroup_pidlist_release,
4134 .mode = S_IRUGO | S_IWUSR,
4135 },
4136 {
4137 .name = "notify_on_release",
4138 .flags = CFTYPE_INSANE,
4139 .read_u64 = cgroup_read_notify_on_release,
4140 .write_u64 = cgroup_write_notify_on_release,
4141 },
3977 { 4142 {
3978 .name = "release_agent", 4143 .name = "release_agent",
3979 .flags = CFTYPE_ONLY_ON_ROOT, 4144 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3980 .read_seq_string = cgroup_release_agent_show, 4145 .read_seq_string = cgroup_release_agent_show,
3981 .write_string = cgroup_release_agent_write, 4146 .write_string = cgroup_release_agent_write,
3982 .max_write_len = PATH_MAX, 4147 .max_write_len = PATH_MAX,
@@ -3997,13 +4162,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3997 struct cgroup_subsys *ss; 4162 struct cgroup_subsys *ss;
3998 4163
3999 if (base_files) { 4164 if (base_files) {
4000 err = cgroup_addrm_files(cgrp, NULL, files, true); 4165 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4001 if (err < 0) 4166 if (err < 0)
4002 return err; 4167 return err;
4003 } 4168 }
4004 4169
4005 /* process cftsets of each subsystem */ 4170 /* process cftsets of each subsystem */
4006 for_each_subsys(cgrp->root, ss) { 4171 for_each_root_subsys(cgrp->root, ss) {
4007 struct cftype_set *set; 4172 struct cftype_set *set;
4008 if (!test_bit(ss->subsys_id, &subsys_mask)) 4173 if (!test_bit(ss->subsys_id, &subsys_mask))
4009 continue; 4174 continue;
@@ -4013,15 +4178,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4013 } 4178 }
4014 4179
4015 /* This cgroup is ready now */ 4180 /* This cgroup is ready now */
4016 for_each_subsys(cgrp->root, ss) { 4181 for_each_root_subsys(cgrp->root, ss) {
4017 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4182 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4183 struct css_id *id = rcu_dereference_protected(css->id, true);
4184
4018 /* 4185 /*
4019 * Update id->css pointer and make this css visible from 4186 * Update id->css pointer and make this css visible from
4020 * CSS ID functions. This pointer will be dereferenced 4187 * CSS ID functions. This pointer will be dereferenced
4021 * from RCU-read-side without locks. 4188 * from RCU-read-side without locks.
4022 */ 4189 */
4023 if (css->id) 4190 if (id)
4024 rcu_assign_pointer(css->id->css, css); 4191 rcu_assign_pointer(id->css, css);
4025 } 4192 }
4026 4193
4027 return 0; 4194 return 0;
@@ -4031,12 +4198,16 @@ static void css_dput_fn(struct work_struct *work)
4031{ 4198{
4032 struct cgroup_subsys_state *css = 4199 struct cgroup_subsys_state *css =
4033 container_of(work, struct cgroup_subsys_state, dput_work); 4200 container_of(work, struct cgroup_subsys_state, dput_work);
4034 struct dentry *dentry = css->cgroup->dentry;
4035 struct super_block *sb = dentry->d_sb;
4036 4201
4037 atomic_inc(&sb->s_active); 4202 cgroup_dput(css->cgroup);
4038 dput(dentry); 4203}
4039 deactivate_super(sb); 4204
4205static void css_release(struct percpu_ref *ref)
4206{
4207 struct cgroup_subsys_state *css =
4208 container_of(ref, struct cgroup_subsys_state, refcnt);
4209
4210 schedule_work(&css->dput_work);
4040} 4211}
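The indirection exists because the final percpu_ref_put() may happen in atomic context, where the release callback runs, while dput() can sleep; css_release() therefore only schedules css->dput_work and the sleeping happens in the worker. In outline (the INIT_WORK is done when the css is set up):

	INIT_WORK(&css->dput_work, css_dput_fn);	/* at css init time */
	err = percpu_ref_init(&css->refcnt, css_release);
	/* final percpu_ref_put() -> css_release() -> worker runs cgroup_dput() */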
4041 4212
4042static void init_cgroup_css(struct cgroup_subsys_state *css, 4213static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4044,10 +4215,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4044 struct cgroup *cgrp) 4215 struct cgroup *cgrp)
4045{ 4216{
4046 css->cgroup = cgrp; 4217 css->cgroup = cgrp;
4047 atomic_set(&css->refcnt, 1);
4048 css->flags = 0; 4218 css->flags = 0;
4049 css->id = NULL; 4219 css->id = NULL;
4050 if (cgrp == dummytop) 4220 if (cgrp == cgroup_dummy_top)
4051 css->flags |= CSS_ROOT; 4221 css->flags |= CSS_ROOT;
4052 BUG_ON(cgrp->subsys[ss->subsys_id]); 4222 BUG_ON(cgrp->subsys[ss->subsys_id]);
4053 cgrp->subsys[ss->subsys_id] = css; 4223 cgrp->subsys[ss->subsys_id] = css;
@@ -4157,7 +4327,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4157 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4327 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4158 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4328 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4159 4329
4160 for_each_subsys(root, ss) { 4330 for_each_root_subsys(root, ss) {
4161 struct cgroup_subsys_state *css; 4331 struct cgroup_subsys_state *css;
4162 4332
4163 css = ss->css_alloc(cgrp); 4333 css = ss->css_alloc(cgrp);
@@ -4165,7 +4335,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4165 err = PTR_ERR(css); 4335 err = PTR_ERR(css);
4166 goto err_free_all; 4336 goto err_free_all;
4167 } 4337 }
4338
4339 err = percpu_ref_init(&css->refcnt, css_release);
4340 if (err)
4341 goto err_free_all;
4342
4168 init_cgroup_css(css, ss, cgrp); 4343 init_cgroup_css(css, ss, cgrp);
4344
4169 if (ss->use_id) { 4345 if (ss->use_id) {
4170 err = alloc_css_id(ss, parent, cgrp); 4346 err = alloc_css_id(ss, parent, cgrp);
4171 if (err) 4347 if (err)
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4183 goto err_free_all; 4359 goto err_free_all;
4184 lockdep_assert_held(&dentry->d_inode->i_mutex); 4360 lockdep_assert_held(&dentry->d_inode->i_mutex);
4185 4361
4362 cgrp->serial_nr = cgroup_serial_nr_next++;
4363
4186 /* allocation complete, commit to creation */ 4364 /* allocation complete, commit to creation */
4187 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4188 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4189 root->number_of_cgroups++; 4366 root->number_of_cgroups++;
4190 4367
4191 /* each css holds a ref to the cgroup's dentry */ 4368 /* each css holds a ref to the cgroup's dentry */
4192 for_each_subsys(root, ss) 4369 for_each_root_subsys(root, ss)
4193 dget(dentry); 4370 dget(dentry);
4194 4371
4195 /* hold a ref to the parent's dentry */ 4372 /* hold a ref to the parent's dentry */
4196 dget(parent->dentry); 4373 dget(parent->dentry);
4197 4374
4198 /* creation succeeded, notify subsystems */ 4375 /* creation succeeded, notify subsystems */
4199 for_each_subsys(root, ss) { 4376 for_each_root_subsys(root, ss) {
4200 err = online_css(ss, cgrp); 4377 err = online_css(ss, cgrp);
4201 if (err) 4378 if (err)
4202 goto err_destroy; 4379 goto err_destroy;
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 return 0; 4398 return 0;
4222 4399
4223err_free_all: 4400err_free_all:
4224 for_each_subsys(root, ss) { 4401 for_each_root_subsys(root, ss) {
4225 if (cgrp->subsys[ss->subsys_id]) 4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4403
4404 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt);
4226 ss->css_free(cgrp); 4406 ss->css_free(cgrp);
4407 }
4227 } 4408 }
4228 mutex_unlock(&cgroup_mutex); 4409 mutex_unlock(&cgroup_mutex);
4229 /* Release the reference count that we took on the superblock */ 4410 /* Release the reference count that we took on the superblock */
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4251 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4252} 4433}
4253 4434
4435static void cgroup_css_killed(struct cgroup *cgrp)
4436{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
4438 return;
4439
4440 /* percpu ref's of all css's are killed, kick off the next step */
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
4442 schedule_work(&cgrp->destroy_work);
4443}
4444
4445static void css_ref_killed_fn(struct percpu_ref *ref)
4446{
4447 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt);
4449
4450 cgroup_css_killed(css->cgroup);
4451}
4452
4453/**
4454 * cgroup_destroy_locked - the first step of cgroup destruction
4455 * @cgrp: cgroup to be destroyed
4456 *
4457 * css's make use of percpu refcnts, which are RCU protected and whose
4458 * killing latency shouldn't be exposed to userland. Also, cgroup core needs to
4459 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4460 * invoked. To satisfy all the requirements, destruction is implemented in
4461 * the following two steps.
4462 *
4463 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4464 * userland visible parts and start killing the percpu refcnts of
4465 * css's. Set up so that the next stage will be kicked off once all
4466 * the percpu refcnts are confirmed to be killed.
4467 *
4468 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4469 * rest of destruction. Once all cgroup references are gone, the
4470 * cgroup is RCU-freed.
4471 *
4472 * This function implements s1. After this step, @cgrp is gone as far as
4473 * the userland is concerned and a new cgroup with the same name may be
4474 * created. As cgroup doesn't care about the names internally, this
4475 * doesn't cause any problem.
4476 */
4254static int cgroup_destroy_locked(struct cgroup *cgrp) 4477static int cgroup_destroy_locked(struct cgroup *cgrp)
4255 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4478 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4256{ 4479{
4257 struct dentry *d = cgrp->dentry; 4480 struct dentry *d = cgrp->dentry;
4258 struct cgroup *parent = cgrp->parent;
4259 struct cgroup_event *event, *tmp; 4481 struct cgroup_event *event, *tmp;
4260 struct cgroup_subsys *ss; 4482 struct cgroup_subsys *ss;
4483 bool empty;
4261 4484
4262 lockdep_assert_held(&d->d_inode->i_mutex); 4485 lockdep_assert_held(&d->d_inode->i_mutex);
4263 lockdep_assert_held(&cgroup_mutex); 4486 lockdep_assert_held(&cgroup_mutex);
4264 4487
4265 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 4488 /*
4489 * css_set_lock synchronizes access to ->cset_links and prevents
4490 * @cgrp from being removed while __put_css_set() is in progress.
4491 */
4492 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
4494 read_unlock(&css_set_lock);
4495 if (!empty)
4266 return -EBUSY; 4496 return -EBUSY;
4267 4497
4268 /* 4498 /*
4269 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4499 * Block new css_tryget() by killing css refcnts. cgroup core
4270 * removed. This makes future css_tryget() and child creation 4500 * guarantees that, by the time ->css_offline() is invoked, no new
4271 * attempts fail thus maintaining the removal conditions verified 4501 * css reference will be given out via css_tryget(). We can't
4272 * above. 4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4273 */ 4512 */
4274 for_each_subsys(cgrp->root, ss) { 4513 atomic_set(&cgrp->css_kill_cnt, 1);
4514 for_each_root_subsys(cgrp->root, ss) {
4275 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4276 4516
4277 WARN_ON(atomic_read(&css->refcnt) < 0); 4517 /*
4278 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4518 * Killing would put the base ref, but we need to keep it
4279 } 4519 * alive until after ->css_offline.
4280 set_bit(CGRP_REMOVED, &cgrp->flags); 4520 */
4521 percpu_ref_get(&css->refcnt);
4281 4522
4282 /* tell subsystems to initiate destruction */ 4523 atomic_inc(&cgrp->css_kill_cnt);
4283 for_each_subsys(cgrp->root, ss) 4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4284 offline_css(ss, cgrp); 4525 }
4526 cgroup_css_killed(cgrp);
4285 4527
4286 /* 4528 /*
4287 * Put all the base refs. Each css holds an extra reference to the 4529 * Mark @cgrp dead. This prevents further task migration and child
4288 * cgroup's dentry and cgroup removal proceeds regardless of css 4530 * creation by disabling cgroup_lock_live_group(). Note that
4289 * refs. On the last put of each css, whenever that may be, the 4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
4290 * extra dentry ref is put so that dentry destruction happens only 4532 * resume iteration after dropping RCU read lock. See
4291 * after all css's are released. 4533 * cgroup_next_sibling() for details.
4292 */ 4534 */
4293 for_each_subsys(cgrp->root, ss) 4535 set_bit(CGRP_DEAD, &cgrp->flags);
4294 css_put(cgrp->subsys[ss->subsys_id]);
4295 4536
4537 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4296 raw_spin_lock(&release_list_lock); 4538 raw_spin_lock(&release_list_lock);
4297 if (!list_empty(&cgrp->release_list)) 4539 if (!list_empty(&cgrp->release_list))
4298 list_del_init(&cgrp->release_list); 4540 list_del_init(&cgrp->release_list);
4299 raw_spin_unlock(&release_list_lock); 4541 raw_spin_unlock(&release_list_lock);
4300 4542
4301 /* delete this cgroup from parent->children */ 4543 /*
4302 list_del_rcu(&cgrp->sibling); 4544 * Remove @cgrp directory. The removal puts the base ref but we
4303 list_del_init(&cgrp->allcg_node); 4545 * aren't quite done with @cgrp yet, so hold onto it.
4304 4546 */
4305 dget(d); 4547 dget(d);
4306 cgroup_d_remove_dir(d); 4548 cgroup_d_remove_dir(d);
4307 dput(d);
4308
4309 set_bit(CGRP_RELEASABLE, &parent->flags);
4310 check_for_release(parent);
4311 4549
4312 /* 4550 /*
4313 * Unregister events and notify userspace. 4551 * Unregister events and notify userspace.
@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4322 spin_unlock(&cgrp->event_list_lock); 4560 spin_unlock(&cgrp->event_list_lock);
4323 4561
4324 return 0; 4562 return 0;
4563};
4564
4565/**
4566 * cgroup_offline_fn - the second step of cgroup destruction
4567 * @work: cgroup->destroy_work
4568 *
4569 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be
4571 * seen as killed on all CPUs, and performs the rest of destruction. This
4572 * is the second step of destruction described in the comment above
4573 * cgroup_destroy_locked().
4574 */
4575static void cgroup_offline_fn(struct work_struct *work)
4576{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581
4582 mutex_lock(&cgroup_mutex);
4583
4584 /*
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to
4586 * initiate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590
4591 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds
4593 * an extra reference to the cgroup's dentry and cgroup removal
4594 * proceeds regardless of css refs. On the last put of each css,
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */
4598 for_each_root_subsys(cgrp->root, ss)
4599 css_put(cgrp->subsys[ss->subsys_id]);
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603
4604 dput(d);
4605
4606 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4325} 4610}
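The css_kill_cnt handling in step s1 is the usual bias-by-one completion pattern: seed the counter with 1 so confirmations racing the setup loop cannot fire early, add one per pending confirm-kill, then drop the bias. Whoever brings the count to zero schedules cgroup_offline_fn() above:

	atomic_set(&cgrp->css_kill_cnt, 1);		/* bias blocks early completion */
	for_each_root_subsys(cgrp->root, ss) {
		atomic_inc(&cgrp->css_kill_cnt);	/* one per confirmation */
		percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
	}
	cgroup_css_killed(cgrp);			/* drop the bias */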
4326 4611
4327static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4361 cgroup_init_cftsets(ss); 4646 cgroup_init_cftsets(ss);
4362 4647
4363 /* Create the top cgroup state for this subsystem */ 4648 /* Create the top cgroup state for this subsystem */
4364 list_add(&ss->sibling, &rootnode.subsys_list); 4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4365 ss->root = &rootnode; 4650 ss->root = &cgroup_dummy_root;
4366 css = ss->css_alloc(dummytop); 4651 css = ss->css_alloc(cgroup_dummy_top);
4367 /* We don't handle early failures gracefully */ 4652 /* We don't handle early failures gracefully */
4368 BUG_ON(IS_ERR(css)); 4653 BUG_ON(IS_ERR(css));
4369 init_cgroup_css(css, ss, dummytop); 4654 init_cgroup_css(css, ss, cgroup_dummy_top);
4370 4655
4371 /* Update the init_css_set to contain a subsys 4656 /* Update the init_css_set to contain a subsys
4372 * pointer to this state - since the subsystem is 4657 * pointer to this state - since the subsystem is
@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4381 * need to invoke fork callbacks here. */ 4666 * need to invoke fork callbacks here. */
4382 BUG_ON(!list_empty(&init_task.tasks)); 4667 BUG_ON(!list_empty(&init_task.tasks));
4383 4668
4384 BUG_ON(online_css(ss, dummytop)); 4669 BUG_ON(online_css(ss, cgroup_dummy_top));
4385 4670
4386 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4387 4672
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4404 struct cgroup_subsys_state *css; 4689 struct cgroup_subsys_state *css;
4405 int i, ret; 4690 int i, ret;
4406 struct hlist_node *tmp; 4691 struct hlist_node *tmp;
4407 struct css_set *cg; 4692 struct css_set *cset;
4408 unsigned long key; 4693 unsigned long key;
4409 4694
4410 /* check name and function validity */ 4695 /* check name and function validity */
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4427 */ 4712 */
4428 if (ss->module == NULL) { 4713 if (ss->module == NULL) {
4429 /* a sanity check */ 4714 /* a sanity check */
4430 BUG_ON(subsys[ss->subsys_id] != ss); 4715 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4431 return 0; 4716 return 0;
4432 } 4717 }
4433 4718
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4435 cgroup_init_cftsets(ss); 4720 cgroup_init_cftsets(ss);
4436 4721
4437 mutex_lock(&cgroup_mutex); 4722 mutex_lock(&cgroup_mutex);
4438 subsys[ss->subsys_id] = ss; 4723 cgroup_subsys[ss->subsys_id] = ss;
4439 4724
4440 /* 4725 /*
4441 * no ss->css_alloc seems to need anything important in the ss 4726 * no ss->css_alloc seems to need anything important in the ss
4442 * struct, so this can happen first (i.e. before the rootnode 4727 * struct, so this can happen first (i.e. before the dummy root
4443 * attachment). 4728 * attachment).
4444 */ 4729 */
4445 css = ss->css_alloc(dummytop); 4730 css = ss->css_alloc(cgroup_dummy_top);
4446 if (IS_ERR(css)) { 4731 if (IS_ERR(css)) {
4447 /* failure case - need to deassign the subsys[] slot. */ 4732 /* failure case - need to deassign the cgroup_subsys[] slot. */
4448 subsys[ss->subsys_id] = NULL; 4733 cgroup_subsys[ss->subsys_id] = NULL;
4449 mutex_unlock(&cgroup_mutex); 4734 mutex_unlock(&cgroup_mutex);
4450 return PTR_ERR(css); 4735 return PTR_ERR(css);
4451 } 4736 }
4452 4737
4453 list_add(&ss->sibling, &rootnode.subsys_list); 4738 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4454 ss->root = &rootnode; 4739 ss->root = &cgroup_dummy_root;
4455 4740
4456 /* our new subsystem will be attached to the dummy hierarchy. */ 4741 /* our new subsystem will be attached to the dummy hierarchy. */
4457 init_cgroup_css(css, ss, dummytop); 4742 init_cgroup_css(css, ss, cgroup_dummy_top);
4458 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4743 /* init_idr must be after init_cgroup_css because it sets css->id. */
4459 if (ss->use_id) { 4744 if (ss->use_id) {
4460 ret = cgroup_init_idr(ss, css); 4745 ret = cgroup_init_idr(ss, css);
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4471 * this is all done under the css_set_lock. 4756 * this is all done under the css_set_lock.
4472 */ 4757 */
4473 write_lock(&css_set_lock); 4758 write_lock(&css_set_lock);
4474 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { 4759 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4475 /* skip entries that we already rehashed */ 4760 /* skip entries that we already rehashed */
4476 if (cg->subsys[ss->subsys_id]) 4761 if (cset->subsys[ss->subsys_id])
4477 continue; 4762 continue;
4478 /* remove existing entry */ 4763 /* remove existing entry */
4479 hash_del(&cg->hlist); 4764 hash_del(&cset->hlist);
4480 /* set new value */ 4765 /* set new value */
4481 cg->subsys[ss->subsys_id] = css; 4766 cset->subsys[ss->subsys_id] = css;
4482 /* recompute hash and restore entry */ 4767 /* recompute hash and restore entry */
4483 key = css_set_hash(cg->subsys); 4768 key = css_set_hash(cset->subsys);
4484 hash_add(css_set_table, &cg->hlist, key); 4769 hash_add(css_set_table, &cset->hlist, key);
4485 } 4770 }
4486 write_unlock(&css_set_lock); 4771 write_unlock(&css_set_lock);
4487 4772
4488 ret = online_css(ss, dummytop); 4773 ret = online_css(ss, cgroup_dummy_top);
4489 if (ret) 4774 if (ret)
4490 goto err_unload; 4775 goto err_unload;
4491 4776
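Both the load- and unload-side rehash loops follow the same two rules: an entry must leave the table before its key material changes, and hash_for_each_safe() plus an "already done" marker (here, whether cset->subsys[ss->subsys_id] is set) is needed because a re-inserted entry can land in a bucket the walk has not reached yet. A self-contained model with a hypothetical entry type:

	struct demo_entry {
		int key_field;
		bool rehashed;
		struct hlist_node node;
	};
	DEFINE_HASHTABLE(demo_table, 4);

	struct demo_entry *e;
	struct hlist_node *tmp;
	int bkt;

	hash_for_each_safe(demo_table, bkt, tmp, e, node) {
		if (e->rehashed)		/* may revisit re-inserted entries */
			continue;
		hash_del(&e->node);		/* unlink under the old key */
		e->key_field++;			/* mutate the key material */
		e->rehashed = true;
		hash_add(demo_table, &e->node, e->key_field);
	}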
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4511 */ 4796 */
4512void cgroup_unload_subsys(struct cgroup_subsys *ss) 4797void cgroup_unload_subsys(struct cgroup_subsys *ss)
4513{ 4798{
4514 struct cg_cgroup_link *link; 4799 struct cgrp_cset_link *link;
4515 4800
4516 BUG_ON(ss->module == NULL); 4801 BUG_ON(ss->module == NULL);
4517 4802
@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4520 * try_module_get in parse_cgroupfs_options should ensure that it 4805 * try_module_get in parse_cgroupfs_options should ensure that it
4521 * doesn't start being used while we're killing it off. 4806 * doesn't start being used while we're killing it off.
4522 */ 4807 */
4523 BUG_ON(ss->root != &rootnode); 4808 BUG_ON(ss->root != &cgroup_dummy_root);
4524 4809
4525 mutex_lock(&cgroup_mutex); 4810 mutex_lock(&cgroup_mutex);
4526 4811
4527 offline_css(ss, dummytop); 4812 offline_css(ss, cgroup_dummy_top);
4528 4813
4529 if (ss->use_id) 4814 if (ss->use_id)
4530 idr_destroy(&ss->idr); 4815 idr_destroy(&ss->idr);
4531 4816
4532 /* deassign the subsys_id */ 4817 /* deassign the subsys_id */
4533 subsys[ss->subsys_id] = NULL; 4818 cgroup_subsys[ss->subsys_id] = NULL;
4534 4819
4535 /* remove subsystem from rootnode's list of subsystems */ 4820 /* remove subsystem from the dummy root's list of subsystems */
4536 list_del_init(&ss->sibling); 4821 list_del_init(&ss->sibling);
4537 4822
4538 /* 4823 /*
4539 * disentangle the css from all css_sets attached to the dummytop. as 4824 * disentangle the css from all css_sets attached to the dummy
4540 * in loading, we need to pay our respects to the hashtable gods. 4825 * top. as in loading, we need to pay our respects to the hashtable
4826 * gods.
4541 */ 4827 */
4542 write_lock(&css_set_lock); 4828 write_lock(&css_set_lock);
4543 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4829 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4544 struct css_set *cg = link->cg; 4830 struct css_set *cset = link->cset;
4545 unsigned long key; 4831 unsigned long key;
4546 4832
4547 hash_del(&cg->hlist); 4833 hash_del(&cset->hlist);
4548 cg->subsys[ss->subsys_id] = NULL; 4834 cset->subsys[ss->subsys_id] = NULL;
4549 key = css_set_hash(cg->subsys); 4835 key = css_set_hash(cset->subsys);
4550 hash_add(css_set_table, &cg->hlist, key); 4836 hash_add(css_set_table, &cset->hlist, key);
4551 } 4837 }
4552 write_unlock(&css_set_lock); 4838 write_unlock(&css_set_lock);
4553 4839
4554 /* 4840 /*
4555 * remove subsystem's css from the dummytop and free it - need to 4841 * remove subsystem's css from the cgroup_dummy_top and free it -
4556 * free before marking as null because ss->css_free needs the 4842 * need to free before marking as null because ss->css_free needs
4557 * cgrp->subsys pointer to find their state. note that this also 4843 * the cgrp->subsys pointer to find their state. note that this
4558 * takes care of freeing the css_id. 4844 * also takes care of freeing the css_id.
4559 */ 4845 */
4560 ss->css_free(dummytop); 4846 ss->css_free(cgroup_dummy_top);
4561 dummytop->subsys[ss->subsys_id] = NULL; 4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
4562 4848
4563 mutex_unlock(&cgroup_mutex); 4849 mutex_unlock(&cgroup_mutex);
4564} 4850}
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4572 */ 4858 */
4573int __init cgroup_init_early(void) 4859int __init cgroup_init_early(void)
4574{ 4860{
4861 struct cgroup_subsys *ss;
4575 int i; 4862 int i;
4863
4576 atomic_set(&init_css_set.refcount, 1); 4864 atomic_set(&init_css_set.refcount, 1);
4577 INIT_LIST_HEAD(&init_css_set.cg_links); 4865 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4578 INIT_LIST_HEAD(&init_css_set.tasks); 4866 INIT_LIST_HEAD(&init_css_set.tasks);
4579 INIT_HLIST_NODE(&init_css_set.hlist); 4867 INIT_HLIST_NODE(&init_css_set.hlist);
4580 css_set_count = 1; 4868 css_set_count = 1;
4581 init_cgroup_root(&rootnode); 4869 init_cgroup_root(&cgroup_dummy_root);
4582 root_count = 1; 4870 cgroup_root_count = 1;
4583 init_task.cgroups = &init_css_set; 4871 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4584 4872
4585 init_css_set_link.cg = &init_css_set; 4873 init_cgrp_cset_link.cset = &init_css_set;
4586 init_css_set_link.cgrp = dummytop; 4874 init_cgrp_cset_link.cgrp = cgroup_dummy_top;
4587 list_add(&init_css_set_link.cgrp_link_list, 4875 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
4588 &rootnode.top_cgroup.css_sets); 4876 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
4589 list_add(&init_css_set_link.cg_link_list,
4590 &init_css_set.cg_links);
4591
4592 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4593 struct cgroup_subsys *ss = subsys[i];
4594
4595 /* at bootup time, we don't worry about modular subsystems */
4596 if (!ss || ss->module)
4597 continue;
4598 4877
4878 /* at bootup time, we don't worry about modular subsystems */
4879 for_each_builtin_subsys(ss, i) {
4599 BUG_ON(!ss->name); 4880 BUG_ON(!ss->name);
4600 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4881 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4601 BUG_ON(!ss->css_alloc); 4882 BUG_ON(!ss->css_alloc);
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void)
4620 */ 4901 */
4621int __init cgroup_init(void) 4902int __init cgroup_init(void)
4622{ 4903{
4623 int err; 4904 struct cgroup_subsys *ss;
4624 int i;
4625 unsigned long key; 4905 unsigned long key;
4906 int i, err;
4626 4907
4627 err = bdi_init(&cgroup_backing_dev_info); 4908 err = bdi_init(&cgroup_backing_dev_info);
4628 if (err) 4909 if (err)
4629 return err; 4910 return err;
4630 4911
4631 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4912 for_each_builtin_subsys(ss, i) {
4632 struct cgroup_subsys *ss = subsys[i];
4633
4634 /* at bootup time, we don't worry about modular subsystems */
4635 if (!ss || ss->module)
4636 continue;
4637 if (!ss->early_init) 4913 if (!ss->early_init)
4638 cgroup_init_subsys(ss); 4914 cgroup_init_subsys(ss);
4639 if (ss->use_id) 4915 if (ss->use_id)
4640 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); 4916 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4641 } 4917 }
4642 4918
4919 /* allocate id for the dummy hierarchy */
4920 mutex_lock(&cgroup_mutex);
4921 mutex_lock(&cgroup_root_mutex);
4922
4643 /* Add init_css_set to the hash table */ 4923 /* Add init_css_set to the hash table */
4644 key = css_set_hash(init_css_set.subsys); 4924 key = css_set_hash(init_css_set.subsys);
4645 hash_add(css_set_table, &init_css_set.hlist, key); 4925 hash_add(css_set_table, &init_css_set.hlist, key);
4646 BUG_ON(!init_root_id(&rootnode)); 4926
4927 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4928
4929 mutex_unlock(&cgroup_root_mutex);
4930 mutex_unlock(&cgroup_mutex);
4647 4931
4648 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4932 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4649 if (!cgroup_kobj) { 4933 if (!cgroup_kobj) {
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4708 int count = 0; 4992 int count = 0;
4709 4993
4710 seq_printf(m, "%d:", root->hierarchy_id); 4994 seq_printf(m, "%d:", root->hierarchy_id);
4711 for_each_subsys(root, ss) 4995 for_each_root_subsys(root, ss)
4712 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4996 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4713 if (strlen(root->name)) 4997 if (strlen(root->name))
4714 seq_printf(m, "%sname=%s", count ? "," : "", 4998 seq_printf(m, "%sname=%s", count ? "," : "",
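Each line proc_cgroup_show() emits has the shape hierarchy-id:controller-list[,name=...]:path. Illustrative output only; the controllers and paths depend on the running configuration:

	3:cpu,cpuacct:/user
	2:memory:/
	1:name=systemd:/user.slice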
@@ -4734,6 +5018,7 @@ out:
4734/* Display information about each subsystem and each hierarchy */ 5018/* Display information about each subsystem and each hierarchy */
4735static int proc_cgroupstats_show(struct seq_file *m, void *v) 5019static int proc_cgroupstats_show(struct seq_file *m, void *v)
4736{ 5020{
5021 struct cgroup_subsys *ss;
4737 int i; 5022 int i;
4738 5023
4739 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 5024 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4743 * subsys/hierarchy state. 5028 * subsys/hierarchy state.
4744 */ 5029 */
4745 mutex_lock(&cgroup_mutex); 5030 mutex_lock(&cgroup_mutex);
4746 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 5031
4747 struct cgroup_subsys *ss = subsys[i]; 5032 for_each_subsys(ss, i)
4748 if (ss == NULL)
4749 continue;
4750 seq_printf(m, "%s\t%d\t%d\t%d\n", 5033 seq_printf(m, "%s\t%d\t%d\t%d\n",
4751 ss->name, ss->root->hierarchy_id, 5034 ss->name, ss->root->hierarchy_id,
4752 ss->root->number_of_cgroups, !ss->disabled); 5035 ss->root->number_of_cgroups, !ss->disabled);
4753 } 5036
4754 mutex_unlock(&cgroup_mutex); 5037 mutex_unlock(&cgroup_mutex);
4755 return 0; 5038 return 0;
4756} 5039}
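This function backs /proc/cgroups. With the header printed above, a typical dump looks like the following (values illustrative):

	#subsys_name	hierarchy	num_cgroups	enabled
	cpuset	2	4	1
	cpu	3	1	1
	memory	4	28	1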
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = {
4786void cgroup_fork(struct task_struct *child) 5069void cgroup_fork(struct task_struct *child)
4787{ 5070{
4788 task_lock(current); 5071 task_lock(current);
5072 get_css_set(task_css_set(current));
4789 child->cgroups = current->cgroups; 5073 child->cgroups = current->cgroups;
4790 get_css_set(child->cgroups);
4791 task_unlock(current); 5074 task_unlock(current);
4792 INIT_LIST_HEAD(&child->cg_list); 5075 INIT_LIST_HEAD(&child->cg_list);
4793} 5076}
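The reference is now taken through task_css_set() before the pointer copy. A sketch of that accessor, assuming it wraps the now RCU-annotated task->cgroups and treats a held task_lock() (i.e. alloc_lock) as sufficient protection:

	#define task_css_set_check(task, __c)				\
		rcu_dereference_check((task)->cgroups,			\
			lockdep_is_held(&(task)->alloc_lock) || (__c))

	/* obtain the task's css_set; caller must pin it somehow */
	#define task_css_set(task)	task_css_set_check((task), false)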
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child)
4804 */ 5087 */
4805void cgroup_post_fork(struct task_struct *child) 5088void cgroup_post_fork(struct task_struct *child)
4806{ 5089{
5090 struct cgroup_subsys *ss;
4807 int i; 5091 int i;
4808 5092
4809 /* 5093 /*
@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child)
4821 write_lock(&css_set_lock); 5105 write_lock(&css_set_lock);
4822 task_lock(child); 5106 task_lock(child);
4823 if (list_empty(&child->cg_list)) 5107 if (list_empty(&child->cg_list))
4824 list_add(&child->cg_list, &child->cgroups->tasks); 5108 list_add(&child->cg_list, &task_css_set(child)->tasks);
4825 task_unlock(child); 5109 task_unlock(child);
4826 write_unlock(&css_set_lock); 5110 write_unlock(&css_set_lock);
4827 } 5111 }
@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child)
4840 * of the array can be freed at module unload, so we 5124 * of the array can be freed at module unload, so we
4841 * can't touch that. 5125 * can't touch that.
4842 */ 5126 */
4843 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5127 for_each_builtin_subsys(ss, i)
4844 struct cgroup_subsys *ss = subsys[i];
4845
4846 if (ss->fork) 5128 if (ss->fork)
4847 ss->fork(child); 5129 ss->fork(child);
4848 }
4849 } 5130 }
4850} 5131}
4851 5132
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child)
4886 */ 5167 */
4887void cgroup_exit(struct task_struct *tsk, int run_callbacks) 5168void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4888{ 5169{
4889 struct css_set *cg; 5170 struct cgroup_subsys *ss;
5171 struct css_set *cset;
4890 int i; 5172 int i;
4891 5173
4892 /* 5174 /*
@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4903 5185
4904 /* Reassign the task to the init_css_set. */ 5186 /* Reassign the task to the init_css_set. */
4905 task_lock(tsk); 5187 task_lock(tsk);
4906 cg = tsk->cgroups; 5188 cset = task_css_set(tsk);
4907 tsk->cgroups = &init_css_set; 5189 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4908 5190
4909 if (run_callbacks && need_forkexit_callback) { 5191 if (run_callbacks && need_forkexit_callback) {
4910 /* 5192 /*
4911 * fork/exit callbacks are supported only for builtin 5193 * fork/exit callbacks are supported only for builtin
4912 * subsystems, see cgroup_post_fork() for details. 5194 * subsystems, see cgroup_post_fork() for details.
4913 */ 5195 */
4914 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5196 for_each_builtin_subsys(ss, i) {
4915 struct cgroup_subsys *ss = subsys[i];
4916
4917 if (ss->exit) { 5197 if (ss->exit) {
4918 struct cgroup *old_cgrp = 5198 struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
4919 rcu_dereference_raw(cg->subsys[i])->cgroup;
4920 struct cgroup *cgrp = task_cgroup(tsk, i); 5199 struct cgroup *cgrp = task_cgroup(tsk, i);
5200
4921 ss->exit(cgrp, old_cgrp, tsk); 5201 ss->exit(cgrp, old_cgrp, tsk);
4922 } 5202 }
4923 } 5203 }
4924 } 5204 }
4925 task_unlock(tsk); 5205 task_unlock(tsk);
4926 5206
4927 put_css_set_taskexit(cg); 5207 put_css_set_taskexit(cset);
4928} 5208}
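Two details worth noting. RCU_INIT_POINTER() suffices here, with no rcu_assign_pointer() barrier, because init_css_set is statically allocated and fully visible long before any reader could follow the pointer. And put_css_set_taskexit() is presumably just the exit-flavoured put, along these lines:

	/* sketch: same as put_css_set(), but tells the release
	 * machinery that the task is exiting */
	static inline void put_css_set_taskexit(struct css_set *cset)
	{
		__put_css_set(cset, 1);
	}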
4929 5209
4930static void check_for_release(struct cgroup *cgrp) 5210static void check_for_release(struct cgroup *cgrp)
4931{ 5211{
4932 /* All of these checks rely on RCU to keep the cgroup
4933 * structure alive */
4934 if (cgroup_is_releasable(cgrp) && 5212 if (cgroup_is_releasable(cgrp) &&
4935 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { 5213 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4936 /* 5214 /*
4937 * Control Group is currently removable. If it's not 5215
4938 * already queued for a userspace notification, queue 5216 * already queued for a userspace notification, queue
@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp)
4941 int need_schedule_work = 0; 5219 int need_schedule_work = 0;
4942 5220
4943 raw_spin_lock(&release_list_lock); 5221 raw_spin_lock(&release_list_lock);
4944 if (!cgroup_is_removed(cgrp) && 5222 if (!cgroup_is_dead(cgrp) &&
4945 list_empty(&cgrp->release_list)) { 5223 list_empty(&cgrp->release_list)) {
4946 list_add(&cgrp->release_list, &release_list); 5224 list_add(&cgrp->release_list, &release_list);
4947 need_schedule_work = 1; 5225 need_schedule_work = 1;
@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp)
4952 } 5230 }
4953} 5231}
4954 5232
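check_for_release() no longer consults an atomic count: a cgroup is busy exactly as long as css_sets are linked to it, so list_empty(&cgrp->cset_links) is the emptiness test. The link object this series renames everything around plausibly looks like:

	/*
	 * A cgrp_cset_link bridges a cgroup and a css_set attached to
	 * it: cset_link threads it on cgrp->cset_links, cgrp_link on
	 * cset->cgrp_links.
	 */
	struct cgrp_cset_link {
		struct cgroup		*cgrp;
		struct css_set		*cset;
		struct list_head	cset_link;	/* on cgrp->cset_links */
		struct list_head	cgrp_link;	/* on cset->cgrp_links */
	};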
4955/* Caller must verify that the css is not for root cgroup */
4956bool __css_tryget(struct cgroup_subsys_state *css)
4957{
4958 while (true) {
4959 int t, v;
4960
4961 v = css_refcnt(css);
4962 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
4963 if (likely(t == v))
4964 return true;
4965 else if (t < 0)
4966 return false;
4967 cpu_relax();
4968 }
4969}
4970EXPORT_SYMBOL_GPL(__css_tryget);
4971
4972/* Caller must verify that the css is not for root cgroup */
4973void __css_put(struct cgroup_subsys_state *css)
4974{
4975 int v;
4976
4977 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4978 if (v == 0)
4979 schedule_work(&css->dput_work);
4980}
4981EXPORT_SYMBOL_GPL(__css_put);
4982
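With the CSS_DEACT_BIAS scheme gone, nothing in this file open-codes the refcount cmpxchg loop anymore. The kernel subsequently moved css refcounting to percpu_ref; purely as a sketch of that direction, assuming css->refcnt becomes a struct percpu_ref:

	#include <linux/percpu-refcount.h>

	/* sketch: cheap per-CPU get that fails once the css starts dying */
	static inline bool css_tryget(struct cgroup_subsys_state *css)
	{
		return percpu_ref_tryget(&css->refcnt);
	}

	static inline void css_put(struct cgroup_subsys_state *css)
	{
		percpu_ref_put(&css->refcnt);
	}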
4983/* 5233/*
4984 * Notify userspace when a cgroup is released, by running the 5234 * Notify userspace when a cgroup is released, by running the
4985 * configured release agent with the name of the cgroup (path 5235 * configured release agent with the name of the cgroup (path
@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work)
5054 5304
5055static int __init cgroup_disable(char *str) 5305static int __init cgroup_disable(char *str)
5056{ 5306{
5057 int i; 5307 struct cgroup_subsys *ss;
5058 char *token; 5308 char *token;
5309 int i;
5059 5310
5060 while ((token = strsep(&str, ",")) != NULL) { 5311 while ((token = strsep(&str, ",")) != NULL) {
5061 if (!*token) 5312 if (!*token)
5062 continue; 5313 continue;
5063 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5064 struct cgroup_subsys *ss = subsys[i];
5065
5066 /*
5067 * cgroup_disable, being at boot time, can't
5068 * know about module subsystems, so we don't
5069 * worry about them.
5070 */
5071 if (!ss || ss->module)
5072 continue;
5073 5314
5315 /*
5316 * cgroup_disable, being at boot time, can't know about
5317 * module subsystems, so we don't worry about them.
5318 */
5319 for_each_builtin_subsys(ss, i) {
5074 if (!strcmp(token, ss->name)) { 5320 if (!strcmp(token, ss->name)) {
5075 ss->disabled = 1; 5321 ss->disabled = 1;
5076 printk(KERN_INFO "Disabling %s control group" 5322 printk(KERN_INFO "Disabling %s control group"
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable);
5087 * Functions for CSS ID. 5333
5088 */ 5334 */
5089 5335
5090/* 5336/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5091 *To get ID other than 0, this should be called when !cgroup_is_removed().
5092 */
5093unsigned short css_id(struct cgroup_subsys_state *css) 5337unsigned short css_id(struct cgroup_subsys_state *css)
5094{ 5338{
5095 struct css_id *cssid; 5339 struct css_id *cssid;
@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5099 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5343 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5100 * it's unchanged until freed. 5344 * it's unchanged until freed.
5101 */ 5345 */
5102 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 5346 cssid = rcu_dereference_raw(css->id);
5103 5347
5104 if (cssid) 5348 if (cssid)
5105 return cssid->id; 5349 return cssid->id;
@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5107} 5351}
5108EXPORT_SYMBOL_GPL(css_id); 5352EXPORT_SYMBOL_GPL(css_id);
5109 5353
5110unsigned short css_depth(struct cgroup_subsys_state *css)
5111{
5112 struct css_id *cssid;
5113
5114 cssid = rcu_dereference_check(css->id, css_refcnt(css));
5115
5116 if (cssid)
5117 return cssid->depth;
5118 return 0;
5119}
5120EXPORT_SYMBOL_GPL(css_depth);
5121
5122/** 5354/**
5123 * css_is_ancestor - test "root" css is an ancestor of "child" 5355 * css_is_ancestor - test "root" css is an ancestor of "child"
5124 * @child: the css to be tested. 5356 * @child: the css to be tested.
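The ancestry test (body outside this hunk) leans on each css_id carrying the IDs of all its ancestors. Reconstructed from how alloc_css_id() fills the structure further down, the layout and the check are roughly:

	struct css_id {
		struct cgroup_subsys_state __rcu *css;
		unsigned short	id;
		unsigned short	depth;		/* root sits at depth 0 */
		struct rcu_head	rcu_head;
		unsigned short	stack[0];	/* ancestor ids, self at [depth] */
	};

	/* hypothetical helper distilled from css_is_ancestor(): "root"
	 * is an ancestor iff its id appears at its own depth in the
	 * child's stack */
	static bool css_id_is_ancestor(struct css_id *child_id,
				       struct css_id *root_id)
	{
		if (child_id->depth < root_id->depth)
			return false;
		return child_id->stack[root_id->depth] == root_id->id;
	}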
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
5153 5385
5154void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 5386void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5155{ 5387{
5156 struct css_id *id = css->id; 5388 struct css_id *id = rcu_dereference_protected(css->id, true);
5389
5157 /* When this is called before css_id initialization, id can be NULL */ 5390 /* When this is called before css_id initialization, id can be NULL */
5158 if (!id) 5391 if (!id)
5159 return; 5392 return;
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5219 return PTR_ERR(newid); 5452 return PTR_ERR(newid);
5220 5453
5221 newid->stack[0] = newid->id; 5454 newid->stack[0] = newid->id;
5222 newid->css = rootcss; 5455 RCU_INIT_POINTER(newid->css, rootcss);
5223 rootcss->id = newid; 5456 RCU_INIT_POINTER(rootcss->id, newid);
5224 return 0; 5457 return 0;
5225} 5458}
5226 5459
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
5234 subsys_id = ss->subsys_id; 5467 subsys_id = ss->subsys_id;
5235 parent_css = parent->subsys[subsys_id]; 5468 parent_css = parent->subsys[subsys_id];
5236 child_css = child->subsys[subsys_id]; 5469 child_css = child->subsys[subsys_id];
5237 parent_id = parent_css->id; 5470 parent_id = rcu_dereference_protected(parent_css->id, true);
5238 depth = parent_id->depth + 1; 5471 depth = parent_id->depth + 1;
5239 5472
5240 child_id = get_new_cssid(ss, depth); 5473 child_id = get_new_cssid(ss, depth);
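The tail of alloc_css_id() (not shown in this hunk) is what makes the stack-based check above work; it presumably copies the parent's ancestor chain and appends the child's own ID:

	for (i = 0; i < depth; i++)
		child_id->stack[i] = parent_id->stack[i];
	child_id->stack[depth] = child_id->id;
	rcu_assign_pointer(child_css->id, child_id);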
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5299} 5532}
5300 5533
5301#ifdef CONFIG_CGROUP_DEBUG 5534#ifdef CONFIG_CGROUP_DEBUG
5302static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5303{ 5536{
5304 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5305 5538
@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5309 return css; 5542 return css;
5310} 5543}
5311 5544
5312static void debug_css_free(struct cgroup *cont) 5545static void debug_css_free(struct cgroup *cgrp)
5313{
5314 kfree(cont->subsys[debug_subsys_id]);
5315}
5316
5317static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
5318{ 5546{
5319 return atomic_read(&cont->count); 5547 kfree(cgrp->subsys[debug_subsys_id]);
5320} 5548}
5321 5549
5322static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) 5550static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
5323{ 5551{
5324 return cgroup_task_count(cont); 5552 return cgroup_task_count(cgrp);
5325} 5553}
5326 5554
5327static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 5555static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
5328{ 5556{
5329 return (u64)(unsigned long)current->cgroups; 5557 return (u64)(unsigned long)current->cgroups;
5330} 5558}
5331 5559
5332static u64 current_css_set_refcount_read(struct cgroup *cont, 5560static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5333 struct cftype *cft) 5561 struct cftype *cft)
5334{ 5562{
5335 u64 count; 5563 u64 count;
5336 5564
5337 rcu_read_lock(); 5565 rcu_read_lock();
5338 count = atomic_read(&current->cgroups->refcount); 5566 count = atomic_read(&task_css_set(current)->refcount);
5339 rcu_read_unlock(); 5567 rcu_read_unlock();
5340 return count; 5568 return count;
5341} 5569}
5342 5570
5343static int current_css_set_cg_links_read(struct cgroup *cont, 5571static int current_css_set_cg_links_read(struct cgroup *cgrp,
5344 struct cftype *cft, 5572 struct cftype *cft,
5345 struct seq_file *seq) 5573 struct seq_file *seq)
5346{ 5574{
5347 struct cg_cgroup_link *link; 5575 struct cgrp_cset_link *link;
5348 struct css_set *cg; 5576 struct css_set *cset;
5349 5577
5350 read_lock(&css_set_lock); 5578 read_lock(&css_set_lock);
5351 rcu_read_lock(); 5579 rcu_read_lock();
5352 cg = rcu_dereference(current->cgroups); 5580 cset = rcu_dereference(current->cgroups);
5353 list_for_each_entry(link, &cg->cg_links, cg_link_list) { 5581 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5354 struct cgroup *c = link->cgrp; 5582 struct cgroup *c = link->cgrp;
5355 const char *name; 5583 const char *name;
5356 5584
@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
5367} 5595}
5368 5596
5369#define MAX_TASKS_SHOWN_PER_CSS 25 5597#define MAX_TASKS_SHOWN_PER_CSS 25
5370static int cgroup_css_links_read(struct cgroup *cont, 5598static int cgroup_css_links_read(struct cgroup *cgrp,
5371 struct cftype *cft, 5599 struct cftype *cft,
5372 struct seq_file *seq) 5600 struct seq_file *seq)
5373{ 5601{
5374 struct cg_cgroup_link *link; 5602 struct cgrp_cset_link *link;
5375 5603
5376 read_lock(&css_set_lock); 5604 read_lock(&css_set_lock);
5377 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
5378 struct css_set *cg = link->cg; 5606 struct css_set *cset = link->cset;
5379 struct task_struct *task; 5607 struct task_struct *task;
5380 int count = 0; 5608 int count = 0;
5381 seq_printf(seq, "css_set %p\n", cg); 5609 seq_printf(seq, "css_set %p\n", cset);
5382 list_for_each_entry(task, &cg->tasks, cg_list) { 5610 list_for_each_entry(task, &cset->tasks, cg_list) {
5383 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5611 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5384 seq_puts(seq, " ...\n"); 5612 seq_puts(seq, " ...\n");
5385 break; 5613 break;
@@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
5400 5628
5401static struct cftype debug_files[] = { 5629static struct cftype debug_files[] = {
5402 { 5630 {
5403 .name = "cgroup_refcount",
5404 .read_u64 = cgroup_refcount_read,
5405 },
5406 {
5407 .name = "taskcount", 5631 .name = "taskcount",
5408 .read_u64 = debug_taskcount_read, 5632 .read_u64 = debug_taskcount_read,
5409 }, 5633 },
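The dropped cgroup_refcount debug file follows cgrp->count itself out of the tree; the remaining files (taskcount, current_css_set and friends) still surface as debug.* entries on any hierarchy mounted with the debug controller, e.g. via mount -t cgroup -o debug.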