author:    Linus Torvalds <torvalds@linux-foundation.org>  2013-07-02 22:54:47 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org>  2013-07-02 22:54:47 -0400
commit:    b028161fbba178ccd35aa69051c04d7673fe9d80 (patch)
tree:      a164c39cdf5e1b954584dcdc51401e1521beae27 /kernel
parent:    f317ff9eed763e99bd226a447f93d42509434f43 (diff)
parent:    c7ba8287cd11f2fc9e2feee9e1fac34b7293658f (diff)
Merge branch 'for-3.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup changes from Tejun Heo:
 "This pull request contains the following changes.

  - cgroup_subsys_state (css) reference counting has been converted to
    percpu-ref. css is what each resource controller embeds into its own
    control structure and performs reference counting against. It may be
    used in hot paths of various subsystems and is similar to module
    refcnt in that aspect. For example, block-cgroup's css refcnting was
    showing up a lot in Mikulas's device-mapper scalability work, and
    this should alleviate it.

  - The cgroup subtree iterator has been updated so that the RCU read
    lock can be released after grabbing a reference. This allows
    simplifying users that need to block, which previously had to build
    an iteration list under the RCU read lock and then traverse it
    outside. This pull request contains the simplification of cgroup
    core and device-cgroup; a separate pull request will update cpuset.

  - Fixes for various bugs including corner race conditions and RCU
    usage bugs.

  - A lot of cleanups and some preparatory work for the planned unified
    hierarchy support."

* 'for-3.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (48 commits)
  cgroup: CGRP_ROOT_SUBSYS_BOUND should also be ignored when mounting an existing hierarchy
  cgroup: CGRP_ROOT_SUBSYS_BOUND should be ignored when comparing mount options
  cgroup: fix deadlock on cgroup_mutex via drop_parsed_module_refcounts()
  cgroup: always use RCU accessors for protected accesses
  cgroup: fix RCU accesses around task->cgroups
  cgroup: fix RCU accesses to task->cgroups
  cgroup: grab cgroup_mutex in drop_parsed_module_refcounts()
  cgroup: fix cgroupfs_root early destruction path
  cgroup: reserve ID 0 for dummy_root and 1 for unified hierarchy
  cgroup: implement for_each_[builtin_]subsys()
  cgroup: move init_css_set initialization inside cgroup_mutex
  cgroup: s/for_each_subsys()/for_each_root_subsys()/
  cgroup: clean up find_css_set() and friends
  cgroup: remove cgroup->actual_subsys_mask
  cgroup: prefix global variables with "cgroup_"
  cgroup: convert CFTYPE_* flags to enums
  cgroup: rename cont to cgrp
  cgroup: clean up cgroup_serial_nr_cursor
  cgroup: convert cgroup_cft_commit() to use cgroup_for_each_descendant_pre()
  cgroup: make serial_nr_cursor available throughout cgroup.c
  ...
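[Editorial note] The percpu-ref conversion in the first bullet replaces a shared atomic counter with per-CPU counters that collapse to atomic mode when the object is killed. A minimal sketch of that lifecycle against the generic percpu-refcount API merged around the same release; the `my_css` structure and function names are illustrative, not the actual cgroup code in this diff:

	#include <linux/percpu-refcount.h>
	#include <linux/slab.h>

	/* hypothetical controller state; stands in for cgroup_subsys_state */
	struct my_css {
		struct percpu_ref refcnt;
	};

	static void my_css_release(struct percpu_ref *ref)
	{
		struct my_css *css = container_of(ref, struct my_css, refcnt);

		kfree(css);	/* last reference is gone */
	}

	static struct my_css *my_css_alloc(void)
	{
		struct my_css *css = kzalloc(sizeof(*css), GFP_KERNEL);

		if (!css)
			return NULL;
		/* starts with one reference; release runs when it hits zero */
		if (percpu_ref_init(&css->refcnt, my_css_release)) {
			kfree(css);
			return NULL;
		}
		return css;
	}

	/* hot path: cheap percpu get/put instead of a contended atomic */
	static bool my_css_tryget(struct my_css *css)
	{
		return percpu_ref_tryget(&css->refcnt);
	}

	static void my_css_put(struct my_css *css)
	{
		percpu_ref_put(&css->refcnt);
	}

	/* offline path: switch to atomic mode and drop the base reference */
	static void my_css_kill(struct my_css *css)
	{
		percpu_ref_kill(&css->refcnt);
	}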
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c  1536
1 file changed, 880 insertions(+), 656 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb979..e5583d10a325 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS INT_MIN
-
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  */
 #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
+static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
 /*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
  */
-static struct cgroupfs_root rootnode;
+static struct cgroupfs_root cgroup_dummy_root;
+
+/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
+static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
 
 /*
  * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
 
 /* The list of hierarchy roots */
 
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
 
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
-
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Hierarchy ID allocation and mapping. It follows the same exclusion
+ * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
+ * writes, either for reads.
+ */
+static DEFINE_IDR(cgroup_hierarchy_idr);
 
 static struct cgroup_name root_cgroup_name = { .name = "/" };
 
+/*
+ * Assign a monotonically increasing serial number to cgroups. It
+ * guarantees cgroups with bigger numbers are newer than those with smaller
+ * numbers. Also, as cgroups are always appended to the parent's
+ * ->children list, it guarantees that sibling cgroups are always sorted in
+ * the ascending serial number order on the list. Protected by
+ * cgroup_mutex.
+ */
+static u64 cgroup_serial_nr_next = 1;
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-	return test_bit(CGRP_REMOVED, &cgrp->flags);
+	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
 /**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_subsys - iterate all loaded cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ *
+ * Should be called under cgroup_mutex.
  */
-#define for_each_subsys(_root, _ss) \
-	list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_subsys(ss, i)						\
+	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)			\
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       !((ss) = cgroup_subsys[i]); })) { }		\
+		else
+
+/**
+ * for_each_builtin_subsys - iterate all built-in cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
+ *
+ * Bulit-in subsystems are always present and iteration itself doesn't
+ * require any synchronization.
+ */
+#define for_each_builtin_subsys(ss, i)					\
+	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[i]) || true); (i)++)
+
+/* iterate each subsystem attached to a hierarchy */
+#define for_each_root_subsys(root, ss)					\
+	list_for_each_entry((ss), &(root)->subsys_list, sibling)
 
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-	list_for_each_entry(_root, &roots, root_list)
+/* iterate across the active hierarchies */
+#define for_each_active_root(root)					\
+	list_for_each_entry((root), &cgroup_roots, root_list)
 
 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 {
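[Editorial note] A usage sketch for the iterators introduced in the hunk above; the function is hypothetical and not part of the patch. for_each_subsys() asserts cgroup_mutex via lockdep, so the caller is assumed to hold it; for_each_root_subsys() walks only the subsystems bound to one hierarchy:

	/* illustrative only; caller holds cgroup_mutex throughout */
	static void list_subsystems(struct cgroupfs_root *root)
	{
		struct cgroup_subsys *ss;
		int i;

		/* every loaded subsystem, built-in or modular */
		for_each_subsys(ss, i)
			pr_info("subsys %d: %s\n", i, ss->name);

		/* only the subsystems attached to @root */
		for_each_root_subsys(root, ss)
			pr_info("bound to this hierarchy: %s\n", ss->name);
	}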
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_removed(cgrp)) {
+	if (cgroup_is_dead(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return false;
 	}
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
-	/*
-	 * List running through cg_cgroup_links associated with a
-	 * cgroup, anchored on cgroup->css_sets
-	 */
-	struct list_head cgrp_link_list;
-	struct cgroup *cgrp;
-	/*
-	 * List running through cg_cgroup_links pointing at a
-	 * single css_set object, anchored on css_set->cg_links
-	 */
-	struct list_head cg_link_list;
-	struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup *cgrp;
+	struct css_set *cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head cgrp_link;
 };
 
 /* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
  */
 
 static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static struct cgrp_cset_link init_cgrp_cset_link;
 
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
-	int i;
 	unsigned long key = 0UL;
+	struct cgroup_subsys *ss;
+	int i;
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+	for_each_subsys(ss, i)
 		key += (unsigned long)css[i];
 	key = (key >> 16) ^ key;
 
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void __put_css_set(struct css_set *cset, int taskexit)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
+
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
 	 * rwlock
 	 */
-	if (atomic_add_unless(&cg->refcount, -1, 1))
+	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 	write_lock(&css_set_lock);
-	if (!atomic_dec_and_test(&cg->refcount)) {
+	if (!atomic_dec_and_test(&cset->refcount)) {
 		write_unlock(&css_set_lock);
 		return;
 	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
-	hash_del(&cg->hlist);
+	hash_del(&cset->hlist);
 	css_set_count--;
 
-	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-				 cg_link_list) {
+	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *cgrp = link->cgrp;
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
 
-		/*
-		 * We may not be holding cgroup_mutex, and if cgrp->count is
-		 * dropped to 0 the cgroup can be destroyed at any time, hence
-		 * rcu_read_lock is used to keep it alive.
-		 */
-		rcu_read_lock();
-		if (atomic_dec_and_test(&cgrp->count) &&
-		    notify_on_release(cgrp)) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
+
+		/* @cgrp can't go away while we're holding css_set_lock */
+		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
-		rcu_read_unlock();
 
 		kfree(link);
 	}
 
 	write_unlock(&css_set_lock);
-	kfree_rcu(cg, rcu_head);
+	kfree_rcu(cset, rcu_head);
 }
 
 /*
  * refcounted get/put for css_set objects
  */
-static inline void get_css_set(struct css_set *cg)
+static inline void get_css_set(struct css_set *cset)
 {
-	atomic_inc(&cg->refcount);
+	atomic_inc(&cset->refcount);
 }
 
-static inline void put_css_set(struct css_set *cg)
+static inline void put_css_set(struct css_set *cset)
 {
-	__put_css_set(cg, 0);
+	__put_css_set(cset, 0);
 }
 
-static inline void put_css_set_taskexit(struct css_set *cg)
+static inline void put_css_set_taskexit(struct css_set *cset)
 {
-	__put_css_set(cg, 1);
+	__put_css_set(cset, 1);
 }
 
-/*
+/**
 * compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cg" matches "old_cg" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
-static bool compare_css_sets(struct css_set *cg,
-			     struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
+			     struct css_set *old_cset,
 			     struct cgroup *new_cgrp,
 			     struct cgroup_subsys_state *template[])
 {
 	struct list_head *l1, *l2;
 
-	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
 		/* Not all subsystems matched */
 		return false;
 	}
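[Editorial note] __put_css_set() above uses the classic "atomic_dec_and_lock(), but for an rwlock" idiom the comment mentions. A standalone sketch of the pattern with illustrative names, not code from this patch:

	/* fast path drops a reference as long as it can't reach zero */
	static atomic_t obj_refcount = ATOMIC_INIT(1);
	static DEFINE_RWLOCK(obj_lock);

	static void obj_put(void)
	{
		if (atomic_add_unless(&obj_refcount, -1, 1))
			return;

		/* slow path: the final put must happen under the write lock */
		write_lock(&obj_lock);
		if (!atomic_dec_and_test(&obj_refcount)) {
			/* somebody re-got a reference in the meantime */
			write_unlock(&obj_lock);
			return;
		}
		/* count is zero and no reader can still observe the object */
		/* ... unlink it here, then free after unlocking ... */
		write_unlock(&obj_lock);
	}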
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
 	 * candidates.
 	 */
 
-	l1 = &cg->cg_links;
-	l2 = &old_cg->cg_links;
+	l1 = &cset->cgrp_links;
+	l2 = &old_cset->cgrp_links;
 	while (1) {
-		struct cg_cgroup_link *cgl1, *cgl2;
-		struct cgroup *cg1, *cg2;
+		struct cgrp_cset_link *link1, *link2;
+		struct cgroup *cgrp1, *cgrp2;
 
 		l1 = l1->next;
 		l2 = l2->next;
 		/* See if we reached the end - both lists are equal length. */
-		if (l1 == &cg->cg_links) {
-			BUG_ON(l2 != &old_cg->cg_links);
+		if (l1 == &cset->cgrp_links) {
+			BUG_ON(l2 != &old_cset->cgrp_links);
 			break;
 		} else {
-			BUG_ON(l2 == &old_cg->cg_links);
+			BUG_ON(l2 == &old_cset->cgrp_links);
 		}
 		/* Locate the cgroups associated with these links. */
-		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
-		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
-		cg1 = cgl1->cgrp;
-		cg2 = cgl2->cgrp;
+		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+		cgrp1 = link1->cgrp;
+		cgrp2 = link2->cgrp;
 		/* Hierarchies should be linked in the same order. */
-		BUG_ON(cg1->root != cg2->root);
+		BUG_ON(cgrp1->root != cgrp2->root);
 
 		/*
 		 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,
 		 * hierarchy, then this css_set should point to the
 		 * same cgroup as the old css_set.
 		 */
-		if (cg1->root == new_cgrp->root) {
-			if (cg1 != new_cgrp)
+		if (cgrp1->root == new_cgrp->root) {
+			if (cgrp1 != new_cgrp)
 				return false;
 		} else {
-			if (cg1 != cg2)
+			if (cgrp1 != cgrp2)
 				return false;
 		}
 	}
 	return true;
 }
 
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
  */
-static struct css_set *find_existing_css_set(
-	struct css_set *oldcg,
-	struct cgroup *cgrp,
-	struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+					struct cgroup *cgrp,
+					struct cgroup_subsys_state *template[])
 {
-	int i;
 	struct cgroupfs_root *root = cgrp->root;
-	struct css_set *cg;
+	struct cgroup_subsys *ss;
+	struct css_set *cset;
 	unsigned long key;
+	int i;
 
 	/*
 	 * Build the set of subsystem state objects that we want to see in the
 	 * new css_set. while subsystems can change globally, the entries here
 	 * won't change, so no need for locking.
 	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		if (root->subsys_mask & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
 		} else {
 			/* Subsystem is not in this hierarchy, so we
 			 * don't want to change the subsystem state */
-			template[i] = oldcg->subsys[i];
+			template[i] = old_cset->subsys[i];
 		}
 	}
 
 	key = css_set_hash(template);
-	hash_for_each_possible(css_set_table, cg, hlist, key) {
-		if (!compare_css_sets(cg, oldcg, cgrp, template))
+	hash_for_each_possible(css_set_table, cset, hlist, key) {
+		if (!compare_css_sets(cset, old_cset, cgrp, template))
 			continue;
 
 		/* This css_set matches what we need */
-		return cg;
+		return cset;
 	}
 
 	/* No existing cgroup group matched */
 	return NULL;
 }
 
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
 
-	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-		list_del(&link->cgrp_link_list);
+	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+		list_del(&link->cset_link);
 		kfree(link);
 	}
 }
 
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
 */
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 	int i;
-	INIT_LIST_HEAD(tmp);
+
+	INIT_LIST_HEAD(tmp_links);
+
 	for (i = 0; i < count; i++) {
-		link = kmalloc(sizeof(*link), GFP_KERNEL);
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			free_cg_links(tmp);
+			free_cgrp_cset_links(tmp_links);
 			return -ENOMEM;
 		}
-		list_add(&link->cgrp_link_list, tmp);
+		list_add(&link->cset_link, tmp_links);
 	}
 	return 0;
 }
 
 /**
 * link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
-static void link_css_set(struct list_head *tmp_cg_links,
-			 struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+			 struct cgroup *cgrp)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
-	BUG_ON(list_empty(tmp_cg_links));
-	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
-				cgrp_link_list);
-	link->cg = cg;
+	BUG_ON(list_empty(tmp_links));
+	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+	link->cset = cset;
 	link->cgrp = cgrp;
-	atomic_inc(&cgrp->count);
-	list_move(&link->cgrp_link_list, &cgrp->css_sets);
+	list_move(&link->cset_link, &cgrp->cset_links);
 	/*
 	 * Always add links to the tail of the list so that the list
 	 * is sorted by order of hierarchy creation
 	 */
-	list_add_tail(&link->cg_link_list, &cg->cg_links);
+	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 }
 
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
 */
-static struct css_set *find_css_set(
-	struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+				    struct cgroup *cgrp)
 {
-	struct css_set *res;
-	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	struct list_head tmp_cg_links;
-
-	struct cg_cgroup_link *link;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+	struct css_set *cset;
+	struct list_head tmp_links;
+	struct cgrp_cset_link *link;
 	unsigned long key;
 
+	lockdep_assert_held(&cgroup_mutex);
+
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
 	read_lock(&css_set_lock);
-	res = find_existing_css_set(oldcg, cgrp, template);
-	if (res)
-		get_css_set(res);
+	cset = find_existing_css_set(old_cset, cgrp, template);
+	if (cset)
+		get_css_set(cset);
 	read_unlock(&css_set_lock);
 
-	if (res)
-		return res;
+	if (cset)
+		return cset;
 
-	res = kmalloc(sizeof(*res), GFP_KERNEL);
-	if (!res)
+	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+	if (!cset)
 		return NULL;
 
-	/* Allocate all the cg_cgroup_link objects that we'll need */
-	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
-		kfree(res);
+	/* Allocate all the cgrp_cset_link objects that we'll need */
+	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+		kfree(cset);
 		return NULL;
 	}
 
-	atomic_set(&res->refcount, 1);
-	INIT_LIST_HEAD(&res->cg_links);
-	INIT_LIST_HEAD(&res->tasks);
-	INIT_HLIST_NODE(&res->hlist);
+	atomic_set(&cset->refcount, 1);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->tasks);
+	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
-	memcpy(res->subsys, template, sizeof(res->subsys));
+	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
 	write_lock(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
-	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
+
 		if (c->root == cgrp->root)
 			c = cgrp;
-		link_css_set(&tmp_cg_links, res, c);
+		link_css_set(&tmp_links, cset, c);
 	}
 
-	BUG_ON(!list_empty(&tmp_cg_links));
+	BUG_ON(!list_empty(&tmp_links));
 
 	css_set_count++;
 
 	/* Add this cgroup group to the hash table */
-	key = css_set_hash(res->subsys);
-	hash_add(css_set_table, &res->hlist, key);
+	key = css_set_hash(cset->subsys);
+	hash_add(css_set_table, &cset->hlist, key);
 
 	write_unlock(&css_set_lock);
 
-	return res;
+	return cset;
 }
 
 /*
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroupfs_root *root)
 {
-	struct css_set *css;
+	struct css_set *cset;
 	struct cgroup *res = NULL;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 	 * task can't change groups, so the only thing that can happen
 	 * is that it exits and its css is set back to init_css_set.
 	 */
-	css = task->cgroups;
-	if (css == &init_css_set) {
+	cset = task_css_set(task);
+	if (cset == &init_css_set) {
 		res = &root->top_cgroup;
 	} else {
-		struct cg_cgroup_link *link;
-		list_for_each_entry(link, &css->cg_links, cg_link_list) {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 			struct cgroup *c = link->cgrp;
+
 			if (c->root == root) {
 				res = c;
 				break;
@@ -828,14 +845,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
 	/*
 	 * Release the subsystem state objects.
 	 */
-	for_each_subsys(cgrp->root, ss)
+	for_each_root_subsys(cgrp->root, ss)
 		ss->css_free(cgrp);
 
 	cgrp->root->number_of_cgroups--;
@@ -873,7 +890,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	schedule_work(&cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +900,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cgrp = dentry->d_fsdata;
 
-		BUG_ON(!(cgroup_is_removed(cgrp)));
+		BUG_ON(!(cgroup_is_dead(cgrp)));
 		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +968,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
 
-	for_each_subsys(cgrp->root, ss) {
+	for_each_root_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
@@ -988,30 +1006,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 * returns an error, no reference counts are touched.
 */
 static int rebind_subsystems(struct cgroupfs_root *root,
-			     unsigned long final_subsys_mask)
+			     unsigned long added_mask, unsigned removed_mask)
 {
-	unsigned long added_mask, removed_mask;
 	struct cgroup *cgrp = &root->top_cgroup;
+	struct cgroup_subsys *ss;
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
-	removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
-	added_mask = final_subsys_mask & ~root->actual_subsys_mask;
 	/* Check that any added subsystems are currently free */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
-		struct cgroup_subsys *ss = subsys[i];
+
 		if (!(bit & added_mask))
 			continue;
-		/*
-		 * Nobody should tell us to do a subsys that doesn't exist:
-		 * parse_cgroupfs_options should catch that case and refcounts
-		 * ensure that subsystems won't disappear once selected.
-		 */
-		BUG_ON(ss == NULL);
-		if (ss->root != &rootnode) {
+
+		if (ss->root != &cgroup_dummy_root) {
 			/* Subsystem isn't free */
 			return -EBUSY;
 		}
@@ -1025,38 +1036,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		return -EBUSY;
 
 	/* Process each subsystem */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
+
 		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
-			BUG_ON(!dummytop->subsys[i]);
-			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-			cgrp->subsys[i] = dummytop->subsys[i];
+			BUG_ON(!cgroup_dummy_top->subsys[i]);
+			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+
+			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgrp);
+
 			/* refcount was already taken, and we're keeping it */
+			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
-			BUG_ON(ss == NULL);
-			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+
 			if (ss->bind)
-				ss->bind(dummytop);
-			dummytop->subsys[i]->cgroup = dummytop;
+				ss->bind(cgroup_dummy_top);
+			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
 			cgrp->subsys[i] = NULL;
-			subsys[i]->root = &rootnode;
-			list_move(&ss->sibling, &rootnode.subsys_list);
+			cgroup_subsys[i]->root = &cgroup_dummy_root;
+			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
+
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
-		} else if (bit & final_subsys_mask) {
+			root->subsys_mask &= ~bit;
+		} else if (bit & root->subsys_mask) {
 			/* Subsystem state should already exist */
-			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
 			/*
 			 * a refcount was taken, but we already had one, so
@@ -1071,7 +1085,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 		}
 	}
-	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
+
+	/*
+	 * Mark @root has finished binding subsystems. @root->subsys_mask
+	 * now matches the bound subsystems.
+	 */
+	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
 	return 0;
 }
@@ -1082,7 +1101,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_root_mutex);
-	for_each_subsys(root, ss)
+	for_each_root_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1133,19 @@ struct cgroup_sb_opts {
 };
 
 /*
- * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
- * with cgroup_mutex held to protect the subsys[] array. This function takes
- * refcounts on subsystems to be used, unless it returns error, in which case
- * no refcounts are taken.
+ * Convert a hierarchy specifier into a bitmask of subsystems and
+ * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
+ * array. This function takes refcounts on subsystems to be used, unless it
+ * returns error, in which case no refcounts are taken.
 */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
-	int i;
 	bool module_pin_failed = false;
+	struct cgroup_subsys *ss;
+	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
@@ -1202,10 +1222,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss == NULL)
-				continue;
+		for_each_subsys(ss, i) {
 			if (strcmp(token, ss->name))
 				continue;
 			if (ss->disabled)
@@ -1228,16 +1245,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * otherwise if 'none', 'name=' and a subsystem name options
 	 * were not specified, let's default to 'all'
 	 */
-	if (all_ss || (!one_ss && !opts->none && !opts->name)) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss == NULL)
-				continue;
-			if (ss->disabled)
-				continue;
-			set_bit(i, &opts->subsys_mask);
-		}
-	}
+	if (all_ss || (!one_ss && !opts->none && !opts->name))
+		for_each_subsys(ss, i)
+			if (!ss->disabled)
+				set_bit(i, &opts->subsys_mask);
 
 	/* Consistency checks */
 
@@ -1281,12 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * take duplicate reference counts on a subsystem that's already used,
 	 * but rebind_subsystems handles this case.
 	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		unsigned long bit = 1UL << i;
-
-		if (!(bit & opts->subsys_mask))
+	for_each_subsys(ss, i) {
+		if (!(opts->subsys_mask & (1UL << i)))
 			continue;
-		if (!try_module_get(subsys[i]->module)) {
+		if (!try_module_get(cgroup_subsys[i]->module)) {
 			module_pin_failed = true;
 			break;
 		}
@@ -1303,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 		if (!(bit & opts->subsys_mask))
 			continue;
-		module_put(subsys[i]->module);
+		module_put(cgroup_subsys[i]->module);
 	}
 	return -ENOENT;
 }
@@ -1313,14 +1322,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 {
+	struct cgroup_subsys *ss;
 	int i;
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		unsigned long bit = 1UL << i;
 
-		if (!(bit & subsys_mask))
-			continue;
-		module_put(subsys[i]->module);
-	}
+	mutex_lock(&cgroup_mutex);
+	for_each_subsys(ss, i)
+		if (subsys_mask & (1UL << i))
+			module_put(cgroup_subsys[i]->module);
+	mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1354,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 
@@ -1353,10 +1362,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
 	/* Don't allow flags or name to change at remount */
-	if (opts.flags != root->flags ||
+	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
 	    (opts.name && strcmp(opts.name, root->name))) {
+		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
 		ret = -EINVAL;
-		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
@@ -1367,11 +1378,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	 */
 	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 
-	ret = rebind_subsystems(root, opts.subsys_mask);
+	ret = rebind_subsystems(root, added_mask, removed_mask);
 	if (ret) {
 		/* rebind_subsystems failed, re-populate the removed files */
 		cgroup_populate_dir(cgrp, false, removed_mask);
-		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
@@ -1386,6 +1396,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+	if (ret)
+		drop_parsed_module_refcounts(opts.subsys_mask);
 	return ret;
 }
 
@@ -1401,11 +1413,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->files);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->allcg_node);
+	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
-	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1428,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
-	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
-	cgrp->name = &root_cgroup_name;
+	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
 	init_cgroup_housekeeping(cgrp);
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
 
-static bool init_root_id(struct cgroupfs_root *root)
+static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
 {
-	int ret = 0;
+	int id;
 
-	do {
-		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
-			return false;
-		spin_lock(&hierarchy_id_lock);
-		/* Try to allocate the next unused ID */
-		ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
-					&root->hierarchy_id);
-		if (ret == -ENOSPC)
-			/* Try again starting from 0 */
-			ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
-		if (!ret) {
-			next_hierarchy_id = root->hierarchy_id + 1;
-		} else if (ret != -EAGAIN) {
-			/* Can only get here if the 31-bit IDR is full ... */
-			BUG_ON(ret);
-		}
-		spin_unlock(&hierarchy_id_lock);
-	} while (ret);
-	return true;
+	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&cgroup_root_mutex);
+
+	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
+			      GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	root->hierarchy_id = id;
+	return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroupfs_root *root)
+{
+	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&cgroup_root_mutex);
+
+	if (root->hierarchy_id) {
+		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+		root->hierarchy_id = 0;
+	}
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
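[Editorial note] The hunk above replaces the IDA-plus-spinlock dance with idr_alloc_cyclic(), which hands out the next ID after the last one allocated and wraps around on exhaustion. A self-contained sketch of the same pattern, assuming a standalone IDR protected by its own mutex rather than the cgroup locks; all names here are hypothetical:

	#include <linux/idr.h>
	#include <linux/mutex.h>

	static DEFINE_IDR(my_idr);
	static DEFINE_MUTEX(my_idr_lock);

	/* allocate an ID in [start, end); end <= 0 means "no upper bound" */
	static int my_obj_get_id(void *obj, int start, int end)
	{
		int id;

		mutex_lock(&my_idr_lock);
		id = idr_alloc_cyclic(&my_idr, obj, start, end, GFP_KERNEL);
		mutex_unlock(&my_idr_lock);

		return id;	/* >= start on success, -errno on failure */
	}

	static void my_obj_put_id(int id)
	{
		mutex_lock(&my_idr_lock);
		idr_remove(&my_idr, id);
		mutex_unlock(&my_idr_lock);
	}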
@@ -1482,12 +1492,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1482 if (!root) 1492 if (!root)
1483 return ERR_PTR(-ENOMEM); 1493 return ERR_PTR(-ENOMEM);
1484 1494
1485 if (!init_root_id(root)) {
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 init_cgroup_root(root); 1495 init_cgroup_root(root);
1490 1496
1497 /*
1498 * We need to set @root->subsys_mask now so that @root can be
1499 * matched by cgroup_test_super() before it finishes
1500 * initialization; otherwise, competing mounts with the same
1501 * options may try to bind the same subsystems instead of waiting
1502 * for the first one leading to unexpected mount errors.
1503 * SUBSYS_BOUND will be set once actual binding is complete.
1504 */
1491 root->subsys_mask = opts->subsys_mask; 1505 root->subsys_mask = opts->subsys_mask;
1492 root->flags = opts->flags; 1506 root->flags = opts->flags;
1493 ida_init(&root->cgroup_ida); 1507 ida_init(&root->cgroup_ida);
@@ -1500,17 +1514,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1500 return root; 1514 return root;
1501} 1515}
1502 1516
1503static void cgroup_drop_root(struct cgroupfs_root *root) 1517static void cgroup_free_root(struct cgroupfs_root *root)
1504{ 1518{
1505 if (!root) 1519 if (root) {
1506 return; 1520 /* hierarhcy ID shoulid already have been released */
1521 WARN_ON_ONCE(root->hierarchy_id);
1507 1522
1508 BUG_ON(!root->hierarchy_id); 1523 ida_destroy(&root->cgroup_ida);
1509 spin_lock(&hierarchy_id_lock); 1524 kfree(root);
1510 ida_remove(&hierarchy_ida, root->hierarchy_id); 1525 }
1511 spin_unlock(&hierarchy_id_lock);
1512 ida_destroy(&root->cgroup_ida);
1513 kfree(root);
1514} 1526}
1515 1527
1516static int cgroup_set_super(struct super_block *sb, void *data) 1528static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1609 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1598 if (IS_ERR(sb)) { 1610 if (IS_ERR(sb)) {
1599 ret = PTR_ERR(sb); 1611 ret = PTR_ERR(sb);
1600 cgroup_drop_root(opts.new_root); 1612 cgroup_free_root(opts.new_root);
1601 goto drop_modules; 1613 goto drop_modules;
1602 } 1614 }
1603 1615
@@ -1605,12 +1617,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1605 BUG_ON(!root); 1617 BUG_ON(!root);
1606 if (root == opts.new_root) { 1618 if (root == opts.new_root) {
1607 /* We used the new root structure, so this is a new hierarchy */ 1619 /* We used the new root structure, so this is a new hierarchy */
1608 struct list_head tmp_cg_links; 1620 struct list_head tmp_links;
1609 struct cgroup *root_cgrp = &root->top_cgroup; 1621 struct cgroup *root_cgrp = &root->top_cgroup;
1610 struct cgroupfs_root *existing_root; 1622 struct cgroupfs_root *existing_root;
1611 const struct cred *cred; 1623 const struct cred *cred;
1612 int i; 1624 int i;
1613 struct css_set *cg; 1625 struct css_set *cset;
1614 1626
1615 BUG_ON(sb->s_root != NULL); 1627 BUG_ON(sb->s_root != NULL);
1616 1628
@@ -1637,13 +1649,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1637 * that's us. The worst that can happen is that we 1649 * that's us. The worst that can happen is that we
1638 * have some link structures left over 1650 * have some link structures left over
1639 */ 1651 */
1640 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1652 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1641 if (ret) 1653 if (ret)
1642 goto unlock_drop; 1654 goto unlock_drop;
1643 1655
1644 ret = rebind_subsystems(root, root->subsys_mask); 1656 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1657 ret = cgroup_init_root_id(root, 2, 0);
1658 if (ret)
1659 goto unlock_drop;
1660
1661 ret = rebind_subsystems(root, root->subsys_mask, 0);
1645 if (ret == -EBUSY) { 1662 if (ret == -EBUSY) {
1646 free_cg_links(&tmp_cg_links); 1663 free_cgrp_cset_links(&tmp_links);
1647 goto unlock_drop; 1664 goto unlock_drop;
1648 } 1665 }
1649 /* 1666 /*
@@ -1655,8 +1672,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1655 /* EBUSY should be the only error here */ 1672 /* EBUSY should be the only error here */
1656 BUG_ON(ret); 1673 BUG_ON(ret);
1657 1674
1658 list_add(&root->root_list, &roots); 1675 list_add(&root->root_list, &cgroup_roots);
1659 root_count++; 1676 cgroup_root_count++;
1660 1677
1661 sb->s_root->d_fsdata = root_cgrp; 1678 sb->s_root->d_fsdata = root_cgrp;
1662 root->top_cgroup.dentry = sb->s_root; 1679 root->top_cgroup.dentry = sb->s_root;
@@ -1664,11 +1681,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1664 /* Link the top cgroup in this hierarchy into all 1681 /* Link the top cgroup in this hierarchy into all
1665 * the css_set objects */ 1682 * the css_set objects */
1666 write_lock(&css_set_lock); 1683 write_lock(&css_set_lock);
1667 hash_for_each(css_set_table, i, cg, hlist) 1684 hash_for_each(css_set_table, i, cset, hlist)
1668 link_css_set(&tmp_cg_links, cg, root_cgrp); 1685 link_css_set(&tmp_links, cset, root_cgrp);
1669 write_unlock(&css_set_lock); 1686 write_unlock(&css_set_lock);
1670 1687
1671 free_cg_links(&tmp_cg_links); 1688 free_cgrp_cset_links(&tmp_links);
1672 1689
1673 BUG_ON(!list_empty(&root_cgrp->children)); 1690 BUG_ON(!list_empty(&root_cgrp->children));
1674 BUG_ON(root->number_of_cgroups != 1); 1691 BUG_ON(root->number_of_cgroups != 1);
@@ -1684,9 +1701,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1684 * We re-used an existing hierarchy - the new root (if 1701 * We re-used an existing hierarchy - the new root (if
1685 * any) is not needed 1702 * any) is not needed
1686 */ 1703 */
1687 cgroup_drop_root(opts.new_root); 1704 cgroup_free_root(opts.new_root);
1688 1705
1689 if (root->flags != opts.flags) { 1706 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1690 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1707 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1708 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL; 1709 ret = -EINVAL;
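
Note the sharpened comparison: instead of demanding that the flag words be identical, the code xors them and masks with CGRP_ROOT_OPTION_MASK, so internal state bits such as CGRP_ROOT_SUBSYS_BOUND can no longer cause a spurious mount-option mismatch. A self-contained sketch of the idiom, with invented flag values:

#include <stdio.h>

#define OPT_NOPREFIX		(1 << 0)	/* userland-visible option */
#define OPT_XATTR		(1 << 1)	/* userland-visible option */
#define INTERNAL_SUBSYS_BOUND	(1 << 16)	/* internal state bit */

#define OPTION_MASK		(OPT_NOPREFIX | OPT_XATTR)

int main(void)
{
	unsigned long root_flags = OPT_NOPREFIX | INTERNAL_SUBSYS_BOUND;
	unsigned long opts_flags = OPT_NOPREFIX;

	/* naive comparison trips over the internal bit */
	printf("flags differ:   %d\n", root_flags != opts_flags);
	/* masked xor reports only differing *option* bits */
	printf("options differ: %lu\n",
	       (root_flags ^ opts_flags) & OPTION_MASK);
	return 0;
}
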
@@ -1705,6 +1722,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1705 return dget(sb->s_root); 1722 return dget(sb->s_root);
1706 1723
1707 unlock_drop: 1724 unlock_drop:
1725 cgroup_exit_root_id(root);
1708 mutex_unlock(&cgroup_root_mutex); 1726 mutex_unlock(&cgroup_root_mutex);
1709 mutex_unlock(&cgroup_mutex); 1727 mutex_unlock(&cgroup_mutex);
1710 mutex_unlock(&inode->i_mutex); 1728 mutex_unlock(&inode->i_mutex);
@@ -1721,9 +1739,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1721static void cgroup_kill_sb(struct super_block *sb) { 1739static void cgroup_kill_sb(struct super_block *sb) {
1722 struct cgroupfs_root *root = sb->s_fs_info; 1740 struct cgroupfs_root *root = sb->s_fs_info;
1723 struct cgroup *cgrp = &root->top_cgroup; 1741 struct cgroup *cgrp = &root->top_cgroup;
1742 struct cgrp_cset_link *link, *tmp_link;
1724 int ret; 1743 int ret;
1725 struct cg_cgroup_link *link;
1726 struct cg_cgroup_link *saved_link;
1727 1744
1728 BUG_ON(!root); 1745 BUG_ON(!root);
1729 1746
@@ -1734,36 +1751,39 @@ static void cgroup_kill_sb(struct super_block *sb) {
1734 mutex_lock(&cgroup_root_mutex); 1751 mutex_lock(&cgroup_root_mutex);
1735 1752
1736 /* Rebind all subsystems back to the default hierarchy */ 1753 /* Rebind all subsystems back to the default hierarchy */
1737 ret = rebind_subsystems(root, 0); 1754 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
1738 /* Shouldn't be able to fail ... */ 1755 ret = rebind_subsystems(root, 0, root->subsys_mask);
1739 BUG_ON(ret); 1756 /* Shouldn't be able to fail ... */
1757 BUG_ON(ret);
1758 }
1740 1759
1741 /* 1760 /*
1742 * Release all the links from css_sets to this hierarchy's 1761 * Release all the links from cset_links to this hierarchy's
1743 * root cgroup 1762 * root cgroup
1744 */ 1763 */
1745 write_lock(&css_set_lock); 1764 write_lock(&css_set_lock);
1746 1765
1747 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, 1766 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1748 cgrp_link_list) { 1767 list_del(&link->cset_link);
1749 list_del(&link->cg_link_list); 1768 list_del(&link->cgrp_link);
1750 list_del(&link->cgrp_link_list);
1751 kfree(link); 1769 kfree(link);
1752 } 1770 }
1753 write_unlock(&css_set_lock); 1771 write_unlock(&css_set_lock);
1754 1772
1755 if (!list_empty(&root->root_list)) { 1773 if (!list_empty(&root->root_list)) {
1756 list_del(&root->root_list); 1774 list_del(&root->root_list);
1757 root_count--; 1775 cgroup_root_count--;
1758 } 1776 }
1759 1777
1778 cgroup_exit_root_id(root);
1779
1760 mutex_unlock(&cgroup_root_mutex); 1780 mutex_unlock(&cgroup_root_mutex);
1761 mutex_unlock(&cgroup_mutex); 1781 mutex_unlock(&cgroup_mutex);
1762 1782
1763 simple_xattrs_free(&cgrp->xattrs); 1783 simple_xattrs_free(&cgrp->xattrs);
1764 1784
1765 kill_litter_super(sb); 1785 kill_litter_super(sb);
1766 cgroup_drop_root(root); 1786 cgroup_free_root(root);
1767} 1787}
1768 1788
1769static struct file_system_type cgroup_fs_type = { 1789static struct file_system_type cgroup_fs_type = {
@@ -1825,6 +1845,38 @@ out:
1825} 1845}
1826EXPORT_SYMBOL_GPL(cgroup_path); 1846EXPORT_SYMBOL_GPL(cgroup_path);
1827 1847
1848/**
1849 * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy
1850 * @task: target task
1851 * @hierarchy_id: the hierarchy to look up @task's cgroup from
1852 * @buf: the buffer to write the path into
1853 * @buflen: the length of the buffer
1854 *
1855 * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and
1856 * copy its path into @buf. This function grabs cgroup_mutex and shouldn't
1857 * be used inside locks used by cgroup controller callbacks.
1858 */
1859int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
1860 char *buf, size_t buflen)
1861{
1862 struct cgroupfs_root *root;
1863 struct cgroup *cgrp = NULL;
1864 int ret = -ENOENT;
1865
1866 mutex_lock(&cgroup_mutex);
1867
1868 root = idr_find(&cgroup_hierarchy_idr, hierarchy_id);
1869 if (root) {
1870 cgrp = task_cgroup_from_root(task, root);
1871 ret = cgroup_path(cgrp, buf, buflen);
1872 }
1873
1874 mutex_unlock(&cgroup_mutex);
1875
1876 return ret;
1877}
1878EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
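
The new task_cgroup_path_from_hierarchy() resolves a task's cgroup on one hierarchy under cgroup_mutex and copies its path out. Userland can reach the same mapping through /proc/<pid>/cgroup; the sketch below is a rough counterpart, assuming the usual "hierarchy-id:controllers:path" line format:

#include <stdio.h>
#include <string.h>

/* Rough userland counterpart: cgroup path of @pid on @hierarchy_id,
 * parsed out of /proc/<pid>/cgroup. */
static int task_cgroup_path(int pid, int hierarchy_id,
			    char *buf, size_t buflen)
{
	char name[64], line[4096];
	FILE *f;
	int id, ret = -1;

	snprintf(name, sizeof(name), "/proc/%d/cgroup", pid);
	f = fopen(name, "r");
	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		char *path = strchr(line, ':');

		if (sscanf(line, "%d:", &id) != 1 || !path)
			continue;
		path = strchr(path + 1, ':');	/* skip controller list */
		if (id == hierarchy_id && path) {
			snprintf(buf, buflen, "%s", path + 1);
			buf[strcspn(buf, "\n")] = '\0';
			ret = 0;
			break;
		}
	}
	fclose(f);
	return ret;
}

int main(void)
{
	char buf[4096];

	if (!task_cgroup_path(1, 1, buf, sizeof(buf)))
		printf("pid 1, hierarchy 1: %s\n", buf);
	return 0;
}
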
1879
1828/* 1880/*
1829 * Control Group taskset 1881 * Control Group taskset
1830 */ 1882 */
@@ -1910,10 +1962,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1910 * 1962 *
1911 * Must be called with cgroup_mutex and threadgroup locked. 1963 * Must be called with cgroup_mutex and threadgroup locked.
1912 */ 1964 */
1913static void cgroup_task_migrate(struct cgroup *oldcgrp, 1965static void cgroup_task_migrate(struct cgroup *old_cgrp,
1914 struct task_struct *tsk, struct css_set *newcg) 1966 struct task_struct *tsk,
1967 struct css_set *new_cset)
1915{ 1968{
1916 struct css_set *oldcg; 1969 struct css_set *old_cset;
1917 1970
1918 /* 1971 /*
1919 * We are synchronized through threadgroup_lock() against PF_EXITING 1972 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1921,25 +1974,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
1921 * css_set to init_css_set and dropping the old one. 1974 * css_set to init_css_set and dropping the old one.
1922 */ 1975 */
1923 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1976 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1924 oldcg = tsk->cgroups; 1977 old_cset = task_css_set(tsk);
1925 1978
1926 task_lock(tsk); 1979 task_lock(tsk);
1927 rcu_assign_pointer(tsk->cgroups, newcg); 1980 rcu_assign_pointer(tsk->cgroups, new_cset);
1928 task_unlock(tsk); 1981 task_unlock(tsk);
1929 1982
1930 /* Update the css_set linked lists if we're using them */ 1983 /* Update the css_set linked lists if we're using them */
1931 write_lock(&css_set_lock); 1984 write_lock(&css_set_lock);
1932 if (!list_empty(&tsk->cg_list)) 1985 if (!list_empty(&tsk->cg_list))
1933 list_move(&tsk->cg_list, &newcg->tasks); 1986 list_move(&tsk->cg_list, &new_cset->tasks);
1934 write_unlock(&css_set_lock); 1987 write_unlock(&css_set_lock);
1935 1988
1936 /* 1989 /*
1937 * We just gained a reference on oldcg by taking it from the task. As 1990 * We just gained a reference on old_cset by taking it from the
1938 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1991 * task. As trading it for new_cset is protected by cgroup_mutex,
1939 * it here; it will be freed under RCU. 1992 * we're safe to drop it here; it will be freed under RCU.
1940 */ 1993 */
1941 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1994 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1942 put_css_set(oldcg); 1995 put_css_set(old_cset);
1943} 1996}
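
cgroup_task_migrate() publishes the new css_set with rcu_assign_pointer() and then drops the reference it inherited from the task, as the comment above spells out. A user-space analogue of that "swap the pointer, then release what you now own" hand-over, with C11 atomics standing in for the kernel primitives:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cset {
	int refcount;
	const char *name;
};

static struct cset *cset_new(const char *name)
{
	struct cset *cset = malloc(sizeof(*cset));

	if (!cset)
		abort();
	cset->refcount = 1;
	cset->name = name;
	return cset;
}

static void cset_put(struct cset *cset)
{
	if (--cset->refcount == 0) {
		printf("freeing %s\n", cset->name);
		free(cset);
	}
}

int main(void)
{
	_Atomic(struct cset *) task_cset = cset_new("old_cset");
	struct cset *old;

	/* publish the new object; we inherit the old reference */
	old = atomic_exchange(&task_cset, cset_new("new_cset"));
	cset_put(old);			/* safe to drop: we own it now */

	cset_put(atomic_load(&task_cset));
	return 0;
}
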
1944 1997
1945/** 1998/**
@@ -2029,7 +2082,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2029 /* 2082 /*
2030 * step 1: check that we can legitimately attach to the cgroup. 2083 * step 1: check that we can legitimately attach to the cgroup.
2031 */ 2084 */
2032 for_each_subsys(root, ss) { 2085 for_each_root_subsys(root, ss) {
2033 if (ss->can_attach) { 2086 if (ss->can_attach) {
2034 retval = ss->can_attach(cgrp, &tset); 2087 retval = ss->can_attach(cgrp, &tset);
2035 if (retval) { 2088 if (retval) {
@@ -2044,8 +2097,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2044 * we use find_css_set, which allocates a new one if necessary. 2097 * we use find_css_set, which allocates a new one if necessary.
2045 */ 2098 */
2046 for (i = 0; i < group_size; i++) { 2099 for (i = 0; i < group_size; i++) {
2100 struct css_set *old_cset;
2101
2047 tc = flex_array_get(group, i); 2102 tc = flex_array_get(group, i);
2048 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2103 old_cset = task_css_set(tc->task);
2104 tc->cg = find_css_set(old_cset, cgrp);
2049 if (!tc->cg) { 2105 if (!tc->cg) {
2050 retval = -ENOMEM; 2106 retval = -ENOMEM;
2051 goto out_put_css_set_refs; 2107 goto out_put_css_set_refs;
@@ -2066,7 +2122,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2066 /* 2122 /*
2067 * step 4: do subsystem attach callbacks. 2123 * step 4: do subsystem attach callbacks.
2068 */ 2124 */
2069 for_each_subsys(root, ss) { 2125 for_each_root_subsys(root, ss) {
2070 if (ss->attach) 2126 if (ss->attach)
2071 ss->attach(cgrp, &tset); 2127 ss->attach(cgrp, &tset);
2072 } 2128 }
@@ -2086,7 +2142,7 @@ out_put_css_set_refs:
2086 } 2142 }
2087out_cancel_attach: 2143out_cancel_attach:
2088 if (retval) { 2144 if (retval) {
2089 for_each_subsys(root, ss) { 2145 for_each_root_subsys(root, ss) {
2090 if (ss == failed_ss) 2146 if (ss == failed_ss)
2091 break; 2147 break;
2092 if (ss->cancel_attach) 2148 if (ss->cancel_attach)
@@ -2323,7 +2379,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2323 struct cftype *cft = __d_cft(file->f_dentry); 2379 struct cftype *cft = __d_cft(file->f_dentry);
2324 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2380 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2325 2381
2326 if (cgroup_is_removed(cgrp)) 2382 if (cgroup_is_dead(cgrp))
2327 return -ENODEV; 2383 return -ENODEV;
2328 if (cft->write) 2384 if (cft->write)
2329 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2385 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2368,7 +2424,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2368 struct cftype *cft = __d_cft(file->f_dentry); 2424 struct cftype *cft = __d_cft(file->f_dentry);
2369 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2425 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2370 2426
2371 if (cgroup_is_removed(cgrp)) 2427 if (cgroup_is_dead(cgrp))
2372 return -ENODEV; 2428 return -ENODEV;
2373 2429
2374 if (cft->read) 2430 if (cft->read)
@@ -2435,10 +2491,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2435 cft = __d_cft(file->f_dentry); 2491 cft = __d_cft(file->f_dentry);
2436 2492
2437 if (cft->read_map || cft->read_seq_string) { 2493 if (cft->read_map || cft->read_seq_string) {
2438 struct cgroup_seqfile_state *state = 2494 struct cgroup_seqfile_state *state;
2439 kzalloc(sizeof(*state), GFP_USER); 2495
2496 state = kzalloc(sizeof(*state), GFP_USER);
2440 if (!state) 2497 if (!state)
2441 return -ENOMEM; 2498 return -ENOMEM;
2499
2442 state->cft = cft; 2500 state->cft = cft;
2443 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2501 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2444 file->f_op = &cgroup_seqfile_operations; 2502 file->f_op = &cgroup_seqfile_operations;
@@ -2486,6 +2544,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2486 2544
2487 cgrp = __d_cgrp(old_dentry); 2545 cgrp = __d_cgrp(old_dentry);
2488 2546
2547 /*
2548 * This isn't a proper migration and its usefulness is very
2549 * limited. Disallow if sane_behavior.
2550 */
2551 if (cgroup_sane_behavior(cgrp))
2552 return -EPERM;
2553
2489 name = cgroup_alloc_name(new_dentry); 2554 name = cgroup_alloc_name(new_dentry);
2490 if (!name) 2555 if (!name)
2491 return -ENOMEM; 2556 return -ENOMEM;
@@ -2496,7 +2561,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2496 return ret; 2561 return ret;
2497 } 2562 }
2498 2563
2499 old_name = cgrp->name; 2564 old_name = rcu_dereference_protected(cgrp->name, true);
2500 rcu_assign_pointer(cgrp->name, name); 2565 rcu_assign_pointer(cgrp->name, name);
2501 2566
2502 kfree_rcu(old_name, rcu_head); 2567 kfree_rcu(old_name, rcu_head);
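
With the sane_behavior guard added above, rename(2) on a cgroup directory now fails with -EPERM on such hierarchies. A quick userland check; the paths are examples and assume a sane_behavior hierarchy is mounted there:

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* example paths; assumes a sane_behavior hierarchy at this mount */
	if (rename("/sys/fs/cgroup/test/a", "/sys/fs/cgroup/test/b"))
		printf("rename: %s\n", strerror(errno));  /* EPERM expected */
	return 0;
}
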
@@ -2747,58 +2812,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 return ret; 2812 return ret;
2748} 2813}
2749 2814
2750static DEFINE_MUTEX(cgroup_cft_mutex);
2751
2752static void cgroup_cfts_prepare(void) 2815static void cgroup_cfts_prepare(void)
2753 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) 2816 __acquires(&cgroup_mutex)
2754{ 2817{
2755 /* 2818 /*
2756 * Thanks to the entanglement with vfs inode locking, we can't walk 2819 * Thanks to the entanglement with vfs inode locking, we can't walk
2757 * the existing cgroups under cgroup_mutex and create files. 2820 * the existing cgroups under cgroup_mutex and create files.
2758 * Instead, we increment reference on all cgroups and build list of 2821 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
2759 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2822 * read lock before calling cgroup_addrm_files().
2760 * exclusive access to the field.
2761 */ 2823 */
2762 mutex_lock(&cgroup_cft_mutex);
2763 mutex_lock(&cgroup_mutex); 2824 mutex_lock(&cgroup_mutex);
2764} 2825}
2765 2826
2766static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2827static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2767 struct cftype *cfts, bool is_add) 2828 struct cftype *cfts, bool is_add)
2768 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2829 __releases(&cgroup_mutex)
2769{ 2830{
2770 LIST_HEAD(pending); 2831 LIST_HEAD(pending);
2771 struct cgroup *cgrp, *n; 2832 struct cgroup *cgrp, *root = &ss->root->top_cgroup;
2833 struct super_block *sb = ss->root->sb;
2834 struct dentry *prev = NULL;
2835 struct inode *inode;
2836 u64 update_before;
2772 2837
2773 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2838 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2774 if (cfts && ss->root != &rootnode) { 2839 if (!cfts || ss->root == &cgroup_dummy_root ||
2775 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { 2840 !atomic_inc_not_zero(&sb->s_active)) {
2776 dget(cgrp->dentry); 2841 mutex_unlock(&cgroup_mutex);
2777 list_add_tail(&cgrp->cft_q_node, &pending); 2842 return;
2778 }
2779 } 2843 }
2780 2844
2781 mutex_unlock(&cgroup_mutex);
2782
2783 /* 2845 /*
2784 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm 2846 * All cgroups which are created after we drop cgroup_mutex will
2785 * files for all cgroups which were created before. 2847 * have the updated set of files, so we only need to update the
2848 * cgroups created before the current @cgroup_serial_nr_next.
2786 */ 2849 */
2787 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { 2850 update_before = cgroup_serial_nr_next;
2788 struct inode *inode = cgrp->dentry->d_inode; 2851
2852 mutex_unlock(&cgroup_mutex);
2853
2854 /* @root always needs to be updated */
2855 inode = root->dentry->d_inode;
2856 mutex_lock(&inode->i_mutex);
2857 mutex_lock(&cgroup_mutex);
2858 cgroup_addrm_files(root, ss, cfts, is_add);
2859 mutex_unlock(&cgroup_mutex);
2860 mutex_unlock(&inode->i_mutex);
2861
2862 /* add/rm files for all cgroups created before */
2863 rcu_read_lock();
2864 cgroup_for_each_descendant_pre(cgrp, root) {
2865 if (cgroup_is_dead(cgrp))
2866 continue;
2867
2868 inode = cgrp->dentry->d_inode;
2869 dget(cgrp->dentry);
2870 rcu_read_unlock();
2871
2872 dput(prev);
2873 prev = cgrp->dentry;
2789 2874
2790 mutex_lock(&inode->i_mutex); 2875 mutex_lock(&inode->i_mutex);
2791 mutex_lock(&cgroup_mutex); 2876 mutex_lock(&cgroup_mutex);
2792 if (!cgroup_is_removed(cgrp)) 2877 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2793 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2878 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2794 mutex_unlock(&cgroup_mutex); 2879 mutex_unlock(&cgroup_mutex);
2795 mutex_unlock(&inode->i_mutex); 2880 mutex_unlock(&inode->i_mutex);
2796 2881
2797 list_del_init(&cgrp->cft_q_node); 2882 rcu_read_lock();
2798 dput(cgrp->dentry);
2799 } 2883 }
2800 2884 rcu_read_unlock();
2801 mutex_unlock(&cgroup_cft_mutex); 2885 dput(prev);
2886 deactivate_super(sb);
2802} 2887}
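
The committed rework hinges on the update_before snapshot: cgroup_serial_nr_next is read before cgroup_mutex is dropped, and since every cgroup created afterwards is born with the updated file set, only cgroups with a smaller serial number need patching. The cutoff logic in miniature, with toy structures:

#include <stdio.h>

static unsigned long long serial_nr_next;

struct item {
	unsigned long long serial_nr;
	int has_new_file;
};

static void item_create(struct item *it, int born_updated)
{
	it->serial_nr = serial_nr_next++;
	it->has_new_file = born_updated;
}

int main(void)
{
	struct item items[4];
	unsigned long long update_before;
	int i;

	item_create(&items[0], 0);
	item_create(&items[1], 0);

	update_before = serial_nr_next;		/* snapshot, then "unlock" */

	item_create(&items[2], 1);		/* born with the new file */
	item_create(&items[3], 1);

	for (i = 0; i < 4; i++)
		if (items[i].serial_nr < update_before)
			items[i].has_new_file = 1;	/* patch old ones */

	for (i = 0; i < 4; i++)
		printf("item %d: updated=%d\n", i, items[i].has_new_file);
	return 0;
}
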
2803 2888
2804/** 2889/**
@@ -2853,7 +2938,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2853 2938
2854 list_for_each_entry(set, &ss->cftsets, node) { 2939 list_for_each_entry(set, &ss->cftsets, node) {
2855 if (set->cfts == cfts) { 2940 if (set->cfts == cfts) {
2856 list_del_init(&set->node); 2941 list_del(&set->node);
2942 kfree(set);
2857 cgroup_cfts_commit(ss, cfts, false); 2943 cgroup_cfts_commit(ss, cfts, false);
2858 return 0; 2944 return 0;
2859 } 2945 }
@@ -2872,12 +2958,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2872int cgroup_task_count(const struct cgroup *cgrp) 2958int cgroup_task_count(const struct cgroup *cgrp)
2873{ 2959{
2874 int count = 0; 2960 int count = 0;
2875 struct cg_cgroup_link *link; 2961 struct cgrp_cset_link *link;
2876 2962
2877 read_lock(&css_set_lock); 2963 read_lock(&css_set_lock);
2878 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2964 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2879 count += atomic_read(&link->cg->refcount); 2965 count += atomic_read(&link->cset->refcount);
2880 }
2881 read_unlock(&css_set_lock); 2966 read_unlock(&css_set_lock);
2882 return count; 2967 return count;
2883} 2968}
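
cgroup_task_count() sums the refcounts of the css_sets linked to the cgroup rather than walking tasks. Userland can approximate the same figure by counting lines in the hierarchy's "tasks" file, for example (mount path assumed):

#include <stdio.h>

int main(void)
{
	char line[64];
	long count = 0;
	FILE *f = fopen("/sys/fs/cgroup/cpu/tasks", "r");	/* example */

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		count++;
	fclose(f);
	printf("%ld tasks\n", count);
	return 0;
}
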
@@ -2886,25 +2971,24 @@ int cgroup_task_count(const struct cgroup *cgrp)
2886 * Advance a list_head iterator. The iterator should be positioned at 2971 * Advance a list_head iterator. The iterator should be positioned at
2887 * the start of a css_set 2972 * the start of a css_set
2888 */ 2973 */
2889static void cgroup_advance_iter(struct cgroup *cgrp, 2974static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
2890 struct cgroup_iter *it)
2891{ 2975{
2892 struct list_head *l = it->cg_link; 2976 struct list_head *l = it->cset_link;
2893 struct cg_cgroup_link *link; 2977 struct cgrp_cset_link *link;
2894 struct css_set *cg; 2978 struct css_set *cset;
2895 2979
2896 /* Advance to the next non-empty css_set */ 2980 /* Advance to the next non-empty css_set */
2897 do { 2981 do {
2898 l = l->next; 2982 l = l->next;
2899 if (l == &cgrp->css_sets) { 2983 if (l == &cgrp->cset_links) {
2900 it->cg_link = NULL; 2984 it->cset_link = NULL;
2901 return; 2985 return;
2902 } 2986 }
2903 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2987 link = list_entry(l, struct cgrp_cset_link, cset_link);
2904 cg = link->cg; 2988 cset = link->cset;
2905 } while (list_empty(&cg->tasks)); 2989 } while (list_empty(&cset->tasks));
2906 it->cg_link = l; 2990 it->cset_link = l;
2907 it->task = cg->tasks.next; 2991 it->task = cset->tasks.next;
2908} 2992}
2909 2993
2910/* 2994/*
@@ -2934,7 +3018,7 @@ static void cgroup_enable_task_cg_lists(void)
2934 * entry won't be deleted though the process has exited. 3018 * entry won't be deleted though the process has exited.
2935 */ 3019 */
2936 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 3020 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2937 list_add(&p->cg_list, &p->cgroups->tasks); 3021 list_add(&p->cg_list, &task_css_set(p)->tasks);
2938 task_unlock(p); 3022 task_unlock(p);
2939 } while_each_thread(g, p); 3023 } while_each_thread(g, p);
2940 read_unlock(&tasklist_lock); 3024 read_unlock(&tasklist_lock);
@@ -2942,12 +3026,67 @@ static void cgroup_enable_task_cg_lists(void)
2942} 3026}
2943 3027
2944/** 3028/**
3029 * cgroup_next_sibling - find the next sibling of a given cgroup
3030 * @pos: the current cgroup
3031 *
3032 * This function returns the next sibling of @pos and should be called
3033 * under RCU read lock. The only requirement is that @pos is accessible.
3034 * The next sibling is guaranteed to be returned regardless of @pos's
3035 * state.
3036 */
3037struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3038{
3039 struct cgroup *next;
3040
3041 WARN_ON_ONCE(!rcu_read_lock_held());
3042
3043 /*
3044 * @pos could already have been removed. Once a cgroup is removed,
3045 * its ->sibling.next is no longer updated when its next sibling
3046 * changes. As CGRP_DEAD assertion is serialized and happens
3047 * before the cgroup is taken off the ->sibling list, if we see it
3048 * unasserted, it's guaranteed that the next sibling hasn't
3049 * finished its grace period even if it's already removed, and thus
3050 * safe to dereference from this RCU critical section. If
3051 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3052 * to be visible as %true here.
3053 */
3054 if (likely(!cgroup_is_dead(pos))) {
3055 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3056 if (&next->sibling != &pos->parent->children)
3057 return next;
3058 return NULL;
3059 }
3060
3061 /*
3062 * Can't dereference the next pointer. Each cgroup is given a
3063 * monotonically increasing unique serial number and always
3064 * appended to the sibling list, so the next one can be found by
3065 * walking the parent's children until we see a cgroup with higher
3066 * serial number than @pos's.
3067 *
3068 * While this path can be slow, it's taken only when either the
3069 * current cgroup is removed or iteration and removal race.
3070 */
3071 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3072 if (next->serial_nr > pos->serial_nr)
3073 return next;
3074 return NULL;
3075}
3076EXPORT_SYMBOL_GPL(cgroup_next_sibling);
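
The slow path above works because serial numbers are handed out monotonically and new siblings are only ever appended: once @pos is gone, the first live child whose serial exceeds @pos's is exactly the node the walk should resume at. A minimal model of that resume step:

#include <stdio.h>

struct node {
	unsigned long long serial_nr;
	int dead;
};

/* Resume a sibling walk after @cur_serial: the first live node with a
 * higher serial number is the correct next sibling, because nodes are
 * only ever appended in increasing serial order. */
static struct node *next_sibling(struct node *children, int n,
				 unsigned long long cur_serial)
{
	int i;

	for (i = 0; i < n; i++)
		if (!children[i].dead && children[i].serial_nr > cur_serial)
			return &children[i];
	return NULL;
}

int main(void)
{
	struct node kids[] = { {10, 0}, {11, 0}, {12, 0}, {13, 0} };
	struct node *pos;

	kids[1].dead = 1;	/* serial 11 removed while we looked away */
	pos = next_sibling(kids, 4, 10);
	if (pos)
		printf("resumed at serial %llu\n", pos->serial_nr);
	return 0;
}
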
3077
3078/**
2945 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3079 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2946 * @pos: the current position (%NULL to initiate traversal) 3080 * @pos: the current position (%NULL to initiate traversal)
2947 * @cgroup: cgroup whose descendants to walk 3081 * @cgroup: cgroup whose descendants to walk
2948 * 3082 *
2949 * To be used by cgroup_for_each_descendant_pre(). Find the next 3083 * To be used by cgroup_for_each_descendant_pre(). Find the next
2950 * descendant to visit for pre-order traversal of @cgroup's descendants. 3084 * descendant to visit for pre-order traversal of @cgroup's descendants.
3085 *
3086 * While this function requires RCU read locking, it doesn't require the
3087 * whole traversal to be contained in a single RCU critical section. This
3088 * function will return the correct next descendant as long as both @pos
3089 * and @cgroup are accessible and @pos is a descendant of @cgroup.
2951 */ 3090 */
2952struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3091struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2953 struct cgroup *cgroup) 3092 struct cgroup *cgroup)
@@ -2967,11 +3106,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2967 3106
2968 /* no child, visit my or the closest ancestor's next sibling */ 3107 /* no child, visit my or the closest ancestor's next sibling */
2969 while (pos != cgroup) { 3108 while (pos != cgroup) {
2970 next = list_entry_rcu(pos->sibling.next, struct cgroup, 3109 next = cgroup_next_sibling(pos);
2971 sibling); 3110 if (next)
2972 if (&next->sibling != &pos->parent->children)
2973 return next; 3111 return next;
2974
2975 pos = pos->parent; 3112 pos = pos->parent;
2976 } 3113 }
2977 3114
@@ -2986,6 +3123,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
2986 * Return the rightmost descendant of @pos. If there's no descendant, 3123 * Return the rightmost descendant of @pos. If there's no descendant,
2987 * @pos is returned. This can be used during pre-order traversal to skip 3124 * @pos is returned. This can be used during pre-order traversal to skip
2988 * subtree of @pos. 3125 * subtree of @pos.
3126 *
3127 * While this function requires RCU read locking, it doesn't require the
3128 * whole traversal to be contained in a single RCU critical section. This
3129 * function will return the correct rightmost descendant as long as @pos is
3130 * accessible.
2989 */ 3131 */
2990struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3132struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
2991{ 3133{
@@ -3025,6 +3167,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3025 * 3167 *
3026 * To be used by cgroup_for_each_descendant_post(). Find the next 3168 * To be used by cgroup_for_each_descendant_post(). Find the next
3027 * descendant to visit for post-order traversal of @cgroup's descendants. 3169 * descendant to visit for post-order traversal of @cgroup's descendants.
3170 *
3171 * While this function requires RCU read locking, it doesn't require the
3172 * whole traversal to be contained in a single RCU critical section. This
3173 * function will return the correct next descendant as long as both @pos
3174 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3028 */ 3175 */
3029struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3176struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3030 struct cgroup *cgroup) 3177 struct cgroup *cgroup)
@@ -3040,8 +3187,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3040 } 3187 }
3041 3188
3042 /* if there's an unvisited sibling, visit its leftmost descendant */ 3189 /* if there's an unvisited sibling, visit its leftmost descendant */
3043 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3190 next = cgroup_next_sibling(pos);
3044 if (&next->sibling != &pos->parent->children) 3191 if (next)
3045 return cgroup_leftmost_descendant(next); 3192 return cgroup_leftmost_descendant(next);
3046 3193
3047 /* no sibling left, visit parent */ 3194 /* no sibling left, visit parent */
@@ -3062,7 +3209,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3062 cgroup_enable_task_cg_lists(); 3209 cgroup_enable_task_cg_lists();
3063 3210
3064 read_lock(&css_set_lock); 3211 read_lock(&css_set_lock);
3065 it->cg_link = &cgrp->css_sets; 3212 it->cset_link = &cgrp->cset_links;
3066 cgroup_advance_iter(cgrp, it); 3213 cgroup_advance_iter(cgrp, it);
3067} 3214}
3068 3215
@@ -3071,16 +3218,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3071{ 3218{
3072 struct task_struct *res; 3219 struct task_struct *res;
3073 struct list_head *l = it->task; 3220 struct list_head *l = it->task;
3074 struct cg_cgroup_link *link; 3221 struct cgrp_cset_link *link;
3075 3222
3076 /* If the iterator cg is NULL, we have no tasks */ 3223 /* If the iterator cg is NULL, we have no tasks */
3077 if (!it->cg_link) 3224 if (!it->cset_link)
3078 return NULL; 3225 return NULL;
3079 res = list_entry(l, struct task_struct, cg_list); 3226 res = list_entry(l, struct task_struct, cg_list);
3080 /* Advance iterator to find next entry */ 3227 /* Advance iterator to find next entry */
3081 l = l->next; 3228 l = l->next;
3082 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 3229 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3083 if (l == &link->cg->tasks) { 3230 if (l == &link->cset->tasks) {
3084 /* We reached the end of this task list - move on to 3231 /* We reached the end of this task list - move on to
3085 * the next cg_cgroup_link */ 3232 * the next cg_cgroup_link */
3086 cgroup_advance_iter(cgrp, it); 3233 cgroup_advance_iter(cgrp, it);
@@ -3411,7 +3558,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3411 } 3558 }
3412 } 3559 }
3413 /* entry not found; create a new one */ 3560 /* entry not found; create a new one */
3414 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3561 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3415 if (!l) { 3562 if (!l) {
3416 mutex_unlock(&cgrp->pidlist_mutex); 3563 mutex_unlock(&cgrp->pidlist_mutex);
3417 return l; 3564 return l;
@@ -3420,8 +3567,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3420 down_write(&l->mutex); 3567 down_write(&l->mutex);
3421 l->key.type = type; 3568 l->key.type = type;
3422 l->key.ns = get_pid_ns(ns); 3569 l->key.ns = get_pid_ns(ns);
3423 l->use_count = 0; /* don't increment here */
3424 l->list = NULL;
3425 l->owner = cgrp; 3570 l->owner = cgrp;
3426 list_add(&l->links, &cgrp->pidlists); 3571 list_add(&l->links, &cgrp->pidlists);
3427 mutex_unlock(&cgrp->pidlist_mutex); 3572 mutex_unlock(&cgrp->pidlist_mutex);
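
Switching to kzalloc() is what allows the explicit use_count and list initializations to be deleted just below: the allocation already comes back zeroed. calloc() plays the same role in a userland sketch:

#include <stdio.h>
#include <stdlib.h>

struct pidlist {
	int use_count;
	void *list;
};

int main(void)
{
	/* calloc, like kzalloc, returns zeroed memory: no need to store
	 * 0/NULL into the fields by hand afterwards */
	struct pidlist *l = calloc(1, sizeof(*l));

	if (!l)
		return 1;
	printf("use_count=%d list=%p\n", l->use_count, l->list);
	free(l);
	return 0;
}
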
@@ -3727,6 +3872,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3727} 3872}
3728 3873
3729/* 3874/*
3875 * When dput() is called asynchronously, if umount has been done and
3876 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3877 * there's a small window that vfs will see the root dentry with non-zero
3878 * refcnt and trigger BUG().
3879 *
3880 * That's why we hold a reference before dput() and drop it right after.
3881 */
3882static void cgroup_dput(struct cgroup *cgrp)
3883{
3884 struct super_block *sb = cgrp->root->sb;
3885
3886 atomic_inc(&sb->s_active);
3887 dput(cgrp->dentry);
3888 deactivate_super(sb);
3889}
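
cgroup_dput() pins the superblock's active count around dput() so that, if this happens to be the final reference, deactivate_super() runs right here instead of in whatever context happened to call dput() last. The general shape of the trick, with plain counters standing in for the VFS refcounts:

#include <stdio.h>

struct sb  { int active; };
struct obj { int refs; struct sb *sb; };

static void sb_put(struct sb *sb)
{
	if (--sb->active == 0)
		printf("superblock torn down here\n");
}

static void obj_put(struct obj *obj)
{
	if (--obj->refs == 0)
		sb_put(obj->sb);	/* last ref may kill the superblock */
}

static void safe_put(struct obj *obj)
{
	struct sb *sb = obj->sb;

	sb->active++;		/* pin: obj_put() can't be the final drop */
	obj_put(obj);
	sb_put(sb);		/* teardown, if due, happens right here */
}

int main(void)
{
	struct sb sb = { .active = 1 };
	struct obj obj = { .refs = 1, .sb = &sb };

	safe_put(&obj);
	return 0;
}
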
3890
3891/*
3730 * Unregister event and free resources. 3892 * Unregister event and free resources.
3731 * 3893 *
3732 * Gets called from workqueue. 3894 * Gets called from workqueue.
@@ -3746,7 +3908,7 @@ static void cgroup_event_remove(struct work_struct *work)
3746 3908
3747 eventfd_ctx_put(event->eventfd); 3909 eventfd_ctx_put(event->eventfd);
3748 kfree(event); 3910 kfree(event);
3749 dput(cgrp->dentry); 3911 cgroup_dput(cgrp);
3750} 3912}
3751 3913
3752/* 3914/*
@@ -3933,33 +4095,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3933 return 0; 4095 return 0;
3934} 4096}
3935 4097
3936/* 4098static struct cftype cgroup_base_files[] = {
3937 * for the common functions, 'private' gives the type of file
3938 */
3939/* for hysterical raisins, we can't put this on the older files */
3940#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3941static struct cftype files[] = {
3942 {
3943 .name = "tasks",
3944 .open = cgroup_tasks_open,
3945 .write_u64 = cgroup_tasks_write,
3946 .release = cgroup_pidlist_release,
3947 .mode = S_IRUGO | S_IWUSR,
3948 },
3949 { 4099 {
3950 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 4100 .name = "cgroup.procs",
3951 .open = cgroup_procs_open, 4101 .open = cgroup_procs_open,
3952 .write_u64 = cgroup_procs_write, 4102 .write_u64 = cgroup_procs_write,
3953 .release = cgroup_pidlist_release, 4103 .release = cgroup_pidlist_release,
3954 .mode = S_IRUGO | S_IWUSR, 4104 .mode = S_IRUGO | S_IWUSR,
3955 }, 4105 },
3956 { 4106 {
3957 .name = "notify_on_release", 4107 .name = "cgroup.event_control",
3958 .read_u64 = cgroup_read_notify_on_release,
3959 .write_u64 = cgroup_write_notify_on_release,
3960 },
3961 {
3962 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3963 .write_string = cgroup_write_event_control, 4108 .write_string = cgroup_write_event_control,
3964 .mode = S_IWUGO, 4109 .mode = S_IWUGO,
3965 }, 4110 },
@@ -3974,9 +4119,29 @@ static struct cftype files[] = {
3974 .flags = CFTYPE_ONLY_ON_ROOT, 4119 .flags = CFTYPE_ONLY_ON_ROOT,
3975 .read_seq_string = cgroup_sane_behavior_show, 4120 .read_seq_string = cgroup_sane_behavior_show,
3976 }, 4121 },
4122
4123 /*
4124 * Historical crazy stuff. These don't have "cgroup." prefix and
4125 * don't exist if sane_behavior. If you're depending on these, be
4126 * prepared to be burned.
4127 */
4128 {
4129 .name = "tasks",
4130 .flags = CFTYPE_INSANE, /* use "procs" instead */
4131 .open = cgroup_tasks_open,
4132 .write_u64 = cgroup_tasks_write,
4133 .release = cgroup_pidlist_release,
4134 .mode = S_IRUGO | S_IWUSR,
4135 },
4136 {
4137 .name = "notify_on_release",
4138 .flags = CFTYPE_INSANE,
4139 .read_u64 = cgroup_read_notify_on_release,
4140 .write_u64 = cgroup_write_notify_on_release,
4141 },
3977 { 4142 {
3978 .name = "release_agent", 4143 .name = "release_agent",
3979 .flags = CFTYPE_ONLY_ON_ROOT, 4144 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3980 .read_seq_string = cgroup_release_agent_show, 4145 .read_seq_string = cgroup_release_agent_show,
3981 .write_string = cgroup_release_agent_write, 4146 .write_string = cgroup_release_agent_write,
3982 .max_write_len = PATH_MAX, 4147 .max_write_len = PATH_MAX,
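
Under sane_behavior the unprefixed legacy files above simply do not exist; "cgroup.procs" is the supported interface, and writing a PID to it migrates the whole thread group rather than a single thread as "tasks" does. For instance (the cgroup path is an example):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* example path; moves this whole thread group into the cgroup */
	FILE *f = fopen("/sys/fs/cgroup/cpu/test/cgroup.procs", "w");

	if (!f)
		return 1;
	fprintf(f, "%d\n", getpid());
	return fclose(f) ? 1 : 0;
}
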
@@ -3997,13 +4162,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3997 struct cgroup_subsys *ss; 4162 struct cgroup_subsys *ss;
3998 4163
3999 if (base_files) { 4164 if (base_files) {
4000 err = cgroup_addrm_files(cgrp, NULL, files, true); 4165 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4001 if (err < 0) 4166 if (err < 0)
4002 return err; 4167 return err;
4003 } 4168 }
4004 4169
4005 /* process cftsets of each subsystem */ 4170 /* process cftsets of each subsystem */
4006 for_each_subsys(cgrp->root, ss) { 4171 for_each_root_subsys(cgrp->root, ss) {
4007 struct cftype_set *set; 4172 struct cftype_set *set;
4008 if (!test_bit(ss->subsys_id, &subsys_mask)) 4173 if (!test_bit(ss->subsys_id, &subsys_mask))
4009 continue; 4174 continue;
@@ -4013,15 +4178,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4013 } 4178 }
4014 4179
4015 /* This cgroup is ready now */ 4180 /* This cgroup is ready now */
4016 for_each_subsys(cgrp->root, ss) { 4181 for_each_root_subsys(cgrp->root, ss) {
4017 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4182 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4183 struct css_id *id = rcu_dereference_protected(css->id, true);
4184
4018 /* 4185 /*
4019 * Update id->css pointer and make this css visible from 4186 * Update id->css pointer and make this css visible from
4020 * CSS ID functions. This pointer will be dereferenced 4187 * CSS ID functions. This pointer will be dereferenced
4021 * from RCU-read-side without locks. 4188 * from RCU-read-side without locks.
4022 */ 4189 */
4023 if (css->id) 4190 if (id)
4024 rcu_assign_pointer(css->id->css, css); 4191 rcu_assign_pointer(id->css, css);
4025 } 4192 }
4026 4193
4027 return 0; 4194 return 0;
@@ -4031,12 +4198,16 @@ static void css_dput_fn(struct work_struct *work)
4031{ 4198{
4032 struct cgroup_subsys_state *css = 4199 struct cgroup_subsys_state *css =
4033 container_of(work, struct cgroup_subsys_state, dput_work); 4200 container_of(work, struct cgroup_subsys_state, dput_work);
4034 struct dentry *dentry = css->cgroup->dentry;
4035 struct super_block *sb = dentry->d_sb;
4036 4201
4037 atomic_inc(&sb->s_active); 4202 cgroup_dput(css->cgroup);
4038 dput(dentry); 4203}
4039 deactivate_super(sb); 4204
4205static void css_release(struct percpu_ref *ref)
4206{
4207 struct cgroup_subsys_state *css =
4208 container_of(ref, struct cgroup_subsys_state, refcnt);
4209
4210 schedule_work(&css->dput_work);
4040} 4211}
4041 4212
4042static void init_cgroup_css(struct cgroup_subsys_state *css, 4213static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4044,10 +4215,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4044 struct cgroup *cgrp) 4215 struct cgroup *cgrp)
4045{ 4216{
4046 css->cgroup = cgrp; 4217 css->cgroup = cgrp;
4047 atomic_set(&css->refcnt, 1);
4048 css->flags = 0; 4218 css->flags = 0;
4049 css->id = NULL; 4219 css->id = NULL;
4050 if (cgrp == dummytop) 4220 if (cgrp == cgroup_dummy_top)
4051 css->flags |= CSS_ROOT; 4221 css->flags |= CSS_ROOT;
4052 BUG_ON(cgrp->subsys[ss->subsys_id]); 4222 BUG_ON(cgrp->subsys[ss->subsys_id]);
4053 cgrp->subsys[ss->subsys_id] = css; 4223 cgrp->subsys[ss->subsys_id] = css;
@@ -4157,7 +4327,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4157 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4327 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4158 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4328 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4159 4329
4160 for_each_subsys(root, ss) { 4330 for_each_root_subsys(root, ss) {
4161 struct cgroup_subsys_state *css; 4331 struct cgroup_subsys_state *css;
4162 4332
4163 css = ss->css_alloc(cgrp); 4333 css = ss->css_alloc(cgrp);
@@ -4165,7 +4335,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4165 err = PTR_ERR(css); 4335 err = PTR_ERR(css);
4166 goto err_free_all; 4336 goto err_free_all;
4167 } 4337 }
4338
4339 err = percpu_ref_init(&css->refcnt, css_release);
4340 if (err)
4341 goto err_free_all;
4342
4168 init_cgroup_css(css, ss, cgrp); 4343 init_cgroup_css(css, ss, cgrp);
4344
4169 if (ss->use_id) { 4345 if (ss->use_id) {
4170 err = alloc_css_id(ss, parent, cgrp); 4346 err = alloc_css_id(ss, parent, cgrp);
4171 if (err) 4347 if (err)
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4183 goto err_free_all; 4359 goto err_free_all;
4184 lockdep_assert_held(&dentry->d_inode->i_mutex); 4360 lockdep_assert_held(&dentry->d_inode->i_mutex);
4185 4361
4362 cgrp->serial_nr = cgroup_serial_nr_next++;
4363
4186 /* allocation complete, commit to creation */ 4364 /* allocation complete, commit to creation */
4187 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4188 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4189 root->number_of_cgroups++; 4366 root->number_of_cgroups++;
4190 4367
4191 /* each css holds a ref to the cgroup's dentry */ 4368 /* each css holds a ref to the cgroup's dentry */
4192 for_each_subsys(root, ss) 4369 for_each_root_subsys(root, ss)
4193 dget(dentry); 4370 dget(dentry);
4194 4371
4195 /* hold a ref to the parent's dentry */ 4372 /* hold a ref to the parent's dentry */
4196 dget(parent->dentry); 4373 dget(parent->dentry);
4197 4374
4198 /* creation succeeded, notify subsystems */ 4375 /* creation succeeded, notify subsystems */
4199 for_each_subsys(root, ss) { 4376 for_each_root_subsys(root, ss) {
4200 err = online_css(ss, cgrp); 4377 err = online_css(ss, cgrp);
4201 if (err) 4378 if (err)
4202 goto err_destroy; 4379 goto err_destroy;
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 return 0; 4398 return 0;
4222 4399
4223err_free_all: 4400err_free_all:
4224 for_each_subsys(root, ss) { 4401 for_each_root_subsys(root, ss) {
4225 if (cgrp->subsys[ss->subsys_id]) 4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4403
4404 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt);
4226 ss->css_free(cgrp); 4406 ss->css_free(cgrp);
4407 }
4227 } 4408 }
4228 mutex_unlock(&cgroup_mutex); 4409 mutex_unlock(&cgroup_mutex);
4229 /* Release the reference count that we took on the superblock */ 4410 /* Release the reference count that we took on the superblock */
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4251 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4252} 4433}
4253 4434
4435static void cgroup_css_killed(struct cgroup *cgrp)
4436{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
4438 return;
4439
4440 /* percpu ref's of all css's are killed, kick off the next step */
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
4442 schedule_work(&cgrp->destroy_work);
4443}
4444
4445static void css_ref_killed_fn(struct percpu_ref *ref)
4446{
4447 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt);
4449
4450 cgroup_css_killed(css->cgroup);
4451}
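
css_kill_cnt is biased to 1 by the destroyer before any percpu_ref_kill_and_confirm() confirmation can run, so cgroup_offline_fn() cannot be scheduled until the initiator drops that bias, however quickly the per-css confirmations arrive. The countdown in isolation:

#include <stdio.h>

static int kill_cnt = 1;		/* the destroyer's bias */

static void css_killed(void)
{
	if (--kill_cnt == 0)
		printf("all confirmed dead, scheduling offline work\n");
}

int main(void)
{
	int css;

	for (css = 0; css < 3; css++) {
		kill_cnt++;	/* one per css whose ref is being killed */
		css_killed();	/* confirmation may fire immediately */
	}
	css_killed();		/* initiator drops the bias: fires once */
	return 0;
}
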
4452
4453/**
4454 * cgroup_destroy_locked - the first stage of cgroup destruction
4455 * @cgrp: cgroup to be destroyed
4456 *
4457 * css's make use of percpu refcnts whose killing latency shouldn't be
4458 * exposed to userland and are RCU protected. Also, cgroup core needs to
4459 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4460 * invoked. To satisfy all the requirements, destruction is implemented in
4461 * the following two steps.
4462 *
4463 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4464 * userland visible parts and start killing the percpu refcnts of
4465 * css's. Set up so that the next stage will be kicked off once all
4466 * the percpu refcnts are confirmed to be killed.
4467 *
4468 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4469 * rest of destruction. Once all cgroup references are gone, the
4470 * cgroup is RCU-freed.
4471 *
4472 * This function implements s1. After this step, @cgrp is gone as far as
4473 * the userland is concerned and a new cgroup with the same name may be
4474 * created. As cgroup doesn't care about the names internally, this
4475 * doesn't cause any problem.
4476 */
4254static int cgroup_destroy_locked(struct cgroup *cgrp) 4477static int cgroup_destroy_locked(struct cgroup *cgrp)
4255 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4478 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4256{ 4479{
4257 struct dentry *d = cgrp->dentry; 4480 struct dentry *d = cgrp->dentry;
4258 struct cgroup *parent = cgrp->parent;
4259 struct cgroup_event *event, *tmp; 4481 struct cgroup_event *event, *tmp;
4260 struct cgroup_subsys *ss; 4482 struct cgroup_subsys *ss;
4483 bool empty;
4261 4484
4262 lockdep_assert_held(&d->d_inode->i_mutex); 4485 lockdep_assert_held(&d->d_inode->i_mutex);
4263 lockdep_assert_held(&cgroup_mutex); 4486 lockdep_assert_held(&cgroup_mutex);
4264 4487
4265 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 4488 /*
4489 * css_set_lock synchronizes access to ->cset_links and prevents
4490 * @cgrp from being removed while __put_css_set() is in progress.
4491 */
4492 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
4494 read_unlock(&css_set_lock);
4495 if (!empty)
4266 return -EBUSY; 4496 return -EBUSY;
4267 4497
4268 /* 4498 /*
4269 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4499 * Block new css_tryget() by killing css refcnts. cgroup core
4270 * removed. This makes future css_tryget() and child creation 4500 * guarantees that, by the time ->css_offline() is invoked, no new
4271 * attempts fail thus maintaining the removal conditions verified 4501 * css reference will be given out via css_tryget(). We can't
4272 * above. 4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4273 */ 4512 */
4274 for_each_subsys(cgrp->root, ss) { 4513 atomic_set(&cgrp->css_kill_cnt, 1);
4514 for_each_root_subsys(cgrp->root, ss) {
4275 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4276 4516
4277 WARN_ON(atomic_read(&css->refcnt) < 0); 4517 /*
4278 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4518 * Killing would put the base ref, but we need to keep it
4279 } 4519 * alive until after ->css_offline.
4280 set_bit(CGRP_REMOVED, &cgrp->flags); 4520 */
4521 percpu_ref_get(&css->refcnt);
4281 4522
4282 /* tell subsystems to initate destruction */ 4523 atomic_inc(&cgrp->css_kill_cnt);
4283 for_each_subsys(cgrp->root, ss) 4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4284 offline_css(ss, cgrp); 4525 }
4526 cgroup_css_killed(cgrp);
4285 4527
4286 /* 4528 /*
4287 * Put all the base refs. Each css holds an extra reference to the 4529 * Mark @cgrp dead. This prevents further task migration and child
4288 * cgroup's dentry and cgroup removal proceeds regardless of css 4530 * creation by disabling cgroup_lock_live_group(). Note that
4289 * refs. On the last put of each css, whenever that may be, the 4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
4290 * extra dentry ref is put so that dentry destruction happens only 4532 * resume iteration after dropping RCU read lock. See
4291 * after all css's are released. 4533 * cgroup_next_sibling() for details.
4292 */ 4534 */
4293 for_each_subsys(cgrp->root, ss) 4535 set_bit(CGRP_DEAD, &cgrp->flags);
4294 css_put(cgrp->subsys[ss->subsys_id]);
4295 4536
4537 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4296 raw_spin_lock(&release_list_lock); 4538 raw_spin_lock(&release_list_lock);
4297 if (!list_empty(&cgrp->release_list)) 4539 if (!list_empty(&cgrp->release_list))
4298 list_del_init(&cgrp->release_list); 4540 list_del_init(&cgrp->release_list);
4299 raw_spin_unlock(&release_list_lock); 4541 raw_spin_unlock(&release_list_lock);
4300 4542
4301 /* delete this cgroup from parent->children */ 4543 /*
4302 list_del_rcu(&cgrp->sibling); 4544 * Remove @cgrp directory. The removal puts the base ref but we
4303 list_del_init(&cgrp->allcg_node); 4545 * aren't quite done with @cgrp yet, so hold onto it.
4304 4546 */
4305 dget(d); 4547 dget(d);
4306 cgroup_d_remove_dir(d); 4548 cgroup_d_remove_dir(d);
4307 dput(d);
4308
4309 set_bit(CGRP_RELEASABLE, &parent->flags);
4310 check_for_release(parent);
4311 4549
4312 /* 4550 /*
4313 * Unregister events and notify userspace. 4551 * Unregister events and notify userspace.
@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4322 spin_unlock(&cgrp->event_list_lock); 4560 spin_unlock(&cgrp->event_list_lock);
4323 4561
4324 return 0; 4562 return 0;
4563};
4564
4565/**
4566 * cgroup_offline_fn - the second step of cgroup destruction
4567 * @work: cgroup->destroy_work
4568 *
4569 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be
4571 * seen as killed on all CPUs, and performs the rest of destruction. This
4572 * is the second step of destruction described in the comment above
4573 * cgroup_destroy_locked().
4574 */
4575static void cgroup_offline_fn(struct work_struct *work)
4576{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581
4582 mutex_lock(&cgroup_mutex);
4583
4584 /*
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to
4586 * initate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590
4591 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds
4593 * an extra reference to the cgroup's dentry and cgroup removal
4594 * proceeds regardless of css refs. On the last put of each css,
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */
4598 for_each_root_subsys(cgrp->root, ss)
4599 css_put(cgrp->subsys[ss->subsys_id]);
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603
4604 dput(d);
4605
4606 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4325} 4610}
4326 4611
4327static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4361 cgroup_init_cftsets(ss); 4646 cgroup_init_cftsets(ss);
4362 4647
4363 /* Create the top cgroup state for this subsystem */ 4648 /* Create the top cgroup state for this subsystem */
4364 list_add(&ss->sibling, &rootnode.subsys_list); 4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4365 ss->root = &rootnode; 4650 ss->root = &cgroup_dummy_root;
4366 css = ss->css_alloc(dummytop); 4651 css = ss->css_alloc(cgroup_dummy_top);
4367 /* We don't handle early failures gracefully */ 4652 /* We don't handle early failures gracefully */
4368 BUG_ON(IS_ERR(css)); 4653 BUG_ON(IS_ERR(css));
4369 init_cgroup_css(css, ss, dummytop); 4654 init_cgroup_css(css, ss, cgroup_dummy_top);
4370 4655
4371 /* Update the init_css_set to contain a subsys 4656 /* Update the init_css_set to contain a subsys
4372 * pointer to this state - since the subsystem is 4657 * pointer to this state - since the subsystem is
@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4381 * need to invoke fork callbacks here. */ 4666 * need to invoke fork callbacks here. */
4382 BUG_ON(!list_empty(&init_task.tasks)); 4667 BUG_ON(!list_empty(&init_task.tasks));
4383 4668
4384 BUG_ON(online_css(ss, dummytop)); 4669 BUG_ON(online_css(ss, cgroup_dummy_top));
4385 4670
4386 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4387 4672
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4404 struct cgroup_subsys_state *css; 4689 struct cgroup_subsys_state *css;
4405 int i, ret; 4690 int i, ret;
4406 struct hlist_node *tmp; 4691 struct hlist_node *tmp;
4407 struct css_set *cg; 4692 struct css_set *cset;
4408 unsigned long key; 4693 unsigned long key;
4409 4694
4410 /* check name and function validity */ 4695 /* check name and function validity */
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4427 */ 4712 */
4428 if (ss->module == NULL) { 4713 if (ss->module == NULL) {
4429 /* a sanity check */ 4714 /* a sanity check */
4430 BUG_ON(subsys[ss->subsys_id] != ss); 4715 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4431 return 0; 4716 return 0;
4432 } 4717 }
4433 4718
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4435 cgroup_init_cftsets(ss); 4720 cgroup_init_cftsets(ss);
4436 4721
4437 mutex_lock(&cgroup_mutex); 4722 mutex_lock(&cgroup_mutex);
4438 subsys[ss->subsys_id] = ss; 4723 cgroup_subsys[ss->subsys_id] = ss;
4439 4724
4440 /* 4725 /*
4441 * no ss->css_alloc seems to need anything important in the ss 4726 * no ss->css_alloc seems to need anything important in the ss
4442 * struct, so this can happen first (i.e. before the rootnode 4727 * struct, so this can happen first (i.e. before the dummy root
4443 * attachment). 4728 * attachment).
4444 */ 4729 */
4445 css = ss->css_alloc(dummytop); 4730 css = ss->css_alloc(cgroup_dummy_top);
4446 if (IS_ERR(css)) { 4731 if (IS_ERR(css)) {
4447 /* failure case - need to deassign the subsys[] slot. */ 4732 /* failure case - need to deassign the cgroup_subsys[] slot. */
4448 subsys[ss->subsys_id] = NULL; 4733 cgroup_subsys[ss->subsys_id] = NULL;
4449 mutex_unlock(&cgroup_mutex); 4734 mutex_unlock(&cgroup_mutex);
4450 return PTR_ERR(css); 4735 return PTR_ERR(css);
4451 } 4736 }
4452 4737
4453 list_add(&ss->sibling, &rootnode.subsys_list); 4738 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4454 ss->root = &rootnode; 4739 ss->root = &cgroup_dummy_root;
4455 4740
4456 /* our new subsystem will be attached to the dummy hierarchy. */ 4741 /* our new subsystem will be attached to the dummy hierarchy. */
4457 init_cgroup_css(css, ss, dummytop); 4742 init_cgroup_css(css, ss, cgroup_dummy_top);
4458 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4743 /* init_idr must be after init_cgroup_css because it sets css->id. */
4459 if (ss->use_id) { 4744 if (ss->use_id) {
4460 ret = cgroup_init_idr(ss, css); 4745 ret = cgroup_init_idr(ss, css);
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4471 * this is all done under the css_set_lock. 4756 * this is all done under the css_set_lock.
4472 */ 4757 */
4473 write_lock(&css_set_lock); 4758 write_lock(&css_set_lock);
4474 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { 4759 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4475 /* skip entries that we already rehashed */ 4760 /* skip entries that we already rehashed */
4476 if (cg->subsys[ss->subsys_id]) 4761 if (cset->subsys[ss->subsys_id])
4477 continue; 4762 continue;
4478 /* remove existing entry */ 4763 /* remove existing entry */
4479 hash_del(&cg->hlist); 4764 hash_del(&cset->hlist);
4480 /* set new value */ 4765 /* set new value */
4481 cg->subsys[ss->subsys_id] = css; 4766 cset->subsys[ss->subsys_id] = css;
4482 /* recompute hash and restore entry */ 4767 /* recompute hash and restore entry */
4483 key = css_set_hash(cg->subsys); 4768 key = css_set_hash(cset->subsys);
4484 hash_add(css_set_table, &cg->hlist, key); 4769 hash_add(css_set_table, &cset->hlist, key);
4485 } 4770 }
4486 write_unlock(&css_set_lock); 4771 write_unlock(&css_set_lock);
4487 4772
4488 ret = online_css(ss, dummytop); 4773 ret = online_css(ss, cgroup_dummy_top);
4489 if (ret) 4774 if (ret)
4490 goto err_unload; 4775 goto err_unload;
4491 4776
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4511 */ 4796 */
4512void cgroup_unload_subsys(struct cgroup_subsys *ss) 4797void cgroup_unload_subsys(struct cgroup_subsys *ss)
4513{ 4798{
4514 struct cg_cgroup_link *link; 4799 struct cgrp_cset_link *link;
4515 4800
4516 BUG_ON(ss->module == NULL); 4801 BUG_ON(ss->module == NULL);
4517 4802
@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4520 * try_module_get in parse_cgroupfs_options should ensure that it 4805 * try_module_get in parse_cgroupfs_options should ensure that it
4521 * doesn't start being used while we're killing it off. 4806 * doesn't start being used while we're killing it off.
4522 */ 4807 */
4523 BUG_ON(ss->root != &rootnode); 4808 BUG_ON(ss->root != &cgroup_dummy_root);
4524 4809
4525 mutex_lock(&cgroup_mutex); 4810 mutex_lock(&cgroup_mutex);
4526 4811
4527 offline_css(ss, dummytop); 4812 offline_css(ss, cgroup_dummy_top);
4528 4813
4529 if (ss->use_id) 4814 if (ss->use_id)
4530 idr_destroy(&ss->idr); 4815 idr_destroy(&ss->idr);
4531 4816
4532 /* deassign the subsys_id */ 4817 /* deassign the subsys_id */
4533 subsys[ss->subsys_id] = NULL; 4818 cgroup_subsys[ss->subsys_id] = NULL;
4534 4819
4535 /* remove subsystem from rootnode's list of subsystems */ 4820 /* remove subsystem from the dummy root's list of subsystems */
4536 list_del_init(&ss->sibling); 4821 list_del_init(&ss->sibling);
4537 4822
4538 /* 4823 /*
4539 * disentangle the css from all css_sets attached to the dummytop. as 4824 * disentangle the css from all css_sets attached to the dummy
4540 * in loading, we need to pay our respects to the hashtable gods. 4825 * top. as in loading, we need to pay our respects to the hashtable
4826 * gods.
4541 */ 4827 */
4542 write_lock(&css_set_lock); 4828 write_lock(&css_set_lock);
4543 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4829 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4544 struct css_set *cg = link->cg; 4830 struct css_set *cset = link->cset;
4545 unsigned long key; 4831 unsigned long key;
4546 4832
4547 hash_del(&cg->hlist); 4833 hash_del(&cset->hlist);
4548 cg->subsys[ss->subsys_id] = NULL; 4834 cset->subsys[ss->subsys_id] = NULL;
4549 key = css_set_hash(cg->subsys); 4835 key = css_set_hash(cset->subsys);
4550 hash_add(css_set_table, &cg->hlist, key); 4836 hash_add(css_set_table, &cset->hlist, key);
4551 } 4837 }
4552 write_unlock(&css_set_lock); 4838 write_unlock(&css_set_lock);
4553 4839
4554 /* 4840 /*
4555 * remove subsystem's css from the dummytop and free it - need to 4841 * remove subsystem's css from the cgroup_dummy_top and free it -
4556 * free before marking as null because ss->css_free needs the 4842 * need to free before marking as null because ss->css_free needs
4557 * cgrp->subsys pointer to find their state. note that this also 4843 * the cgrp->subsys pointer to find their state. note that this
4558 * takes care of freeing the css_id. 4844 * also takes care of freeing the css_id.
4559 */ 4845 */
4560 ss->css_free(dummytop); 4846 ss->css_free(cgroup_dummy_top);
4561 dummytop->subsys[ss->subsys_id] = NULL; 4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
4562 4848
4563 mutex_unlock(&cgroup_mutex); 4849 mutex_unlock(&cgroup_mutex);
4564} 4850}
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
  */
 int __init cgroup_init_early(void)
 {
+	struct cgroup_subsys *ss;
 	int i;
+
 	atomic_set(&init_css_set.refcount, 1);
-	INIT_LIST_HEAD(&init_css_set.cg_links);
+	INIT_LIST_HEAD(&init_css_set.cgrp_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
 	INIT_HLIST_NODE(&init_css_set.hlist);
 	css_set_count = 1;
-	init_cgroup_root(&rootnode);
-	root_count = 1;
-	init_task.cgroups = &init_css_set;
+	init_cgroup_root(&cgroup_dummy_root);
+	cgroup_root_count = 1;
+	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
 
-	init_css_set_link.cg = &init_css_set;
-	init_css_set_link.cgrp = dummytop;
-	list_add(&init_css_set_link.cgrp_link_list,
-		 &rootnode.top_cgroup.css_sets);
-	list_add(&init_css_set_link.cg_link_list,
-		 &init_css_set.cg_links);
-
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-
-		/* at bootup time, we don't worry about modular subsystems */
-		if (!ss || ss->module)
-			continue;
+	init_cgrp_cset_link.cset = &init_css_set;
+	init_cgrp_cset_link.cgrp = cgroup_dummy_top;
+	list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
+	list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
 
+	/* at bootup time, we don't worry about modular subsystems */
+	for_each_builtin_subsys(ss, i) {
 		BUG_ON(!ss->name);
 		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
 		BUG_ON(!ss->css_alloc);
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void)
  */
 int __init cgroup_init(void)
 {
-	int err;
-	int i;
+	struct cgroup_subsys *ss;
 	unsigned long key;
+	int i, err;
 
 	err = bdi_init(&cgroup_backing_dev_info);
 	if (err)
 		return err;
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-
-		/* at bootup time, we don't worry about modular subsystems */
-		if (!ss || ss->module)
-			continue;
+	for_each_builtin_subsys(ss, i) {
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
 		if (ss->use_id)
 			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
 	}
 
+	/* allocate id for the dummy hierarchy */
+	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
+
 	/* Add init_css_set to the hash table */
 	key = css_set_hash(init_css_set.subsys);
 	hash_add(css_set_table, &init_css_set.hlist, key);
-	BUG_ON(!init_root_id(&rootnode));
+
+	BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
+
+	mutex_unlock(&cgroup_root_mutex);
+	mutex_unlock(&cgroup_mutex);
 
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
 	if (!cgroup_kobj) {
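cgroup_init_root_id(&cgroup_dummy_root, 0, 1) pins the dummy hierarchy to ID 0, per the "reserve ID 0 for dummy_root and 1 for unified hierarchy" patch in this series. Assuming the helper wraps idr_alloc() (a sketch, not the exact implementation):

	/* idr_alloc() allocates in [start, end); start == 0, end == 1
	 * means the only ID that can come back is 0 */
	id = idr_alloc(&cgroup_hierarchy_idr, root, start, end, GFP_KERNEL);
	if (id < 0)
		return id;
	root->hierarchy_id = id;
	return 0;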
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 	int count = 0;
 
 	seq_printf(m, "%d:", root->hierarchy_id);
-	for_each_subsys(root, ss)
+	for_each_root_subsys(root, ss)
 		seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
 	if (strlen(root->name))
 		seq_printf(m, "%sname=%s", count ? "," : "",
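For reference, the seq_printf() calls above build one line per hierarchy in /proc/<pid>/cgroup, of the familiar form (illustrative values):

	4:cpu,cpuacct:/system/sshd
	2:cpuset:/
	1:name=systemd:/user/1000.user/1.session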
@@ -4734,6 +5018,7 @@ out:
 /* Display information about each subsystem and each hierarchy */
 static int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
+	struct cgroup_subsys *ss;
 	int i;
 
 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	 * subsys/hierarchy state.
 	 */
 	mutex_lock(&cgroup_mutex);
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-		if (ss == NULL)
-			continue;
+
+	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->name, ss->root->hierarchy_id,
 			   ss->root->number_of_cgroups, !ss->disabled);
-	}
+
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
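The /proc/cgroups format itself is unchanged; with for_each_subsys() the NULL slots of unloaded modular controllers simply never reach seq_printf(). Illustrative output (values are examples only):

	#subsys_name	hierarchy	num_cgroups	enabled
	cpuset	2	4	1
	cpu	3	12	1
	memory	0	1	0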
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = {
 void cgroup_fork(struct task_struct *child)
 {
 	task_lock(current);
+	get_css_set(task_css_set(current));
 	child->cgroups = current->cgroups;
-	get_css_set(child->cgroups);
 	task_unlock(current);
 	INIT_LIST_HEAD(&child->cg_list);
 }
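cgroup_fork() now takes the reference via task_css_set() before publishing the pointer into the child; task_lock(current) guarantees the css_set cannot change between the two statements, and the accessor keeps lockdep and sparse happy now that task->cgroups is __rcu-annotated. A sketch of the accessor (the in-tree helper from the "fix RCU accesses" patches carries more held-lock conditions):

static inline struct css_set *task_css_set(struct task_struct *task)
{
	/* legal under rcu_read_lock(), task_lock(task) or cgroup_mutex */
	return rcu_dereference_check(task->cgroups,
				     lockdep_is_held(&cgroup_mutex));
}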
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child)
  */
 void cgroup_post_fork(struct task_struct *child)
 {
+	struct cgroup_subsys *ss;
 	int i;
 
 	/*
@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child)
 		write_lock(&css_set_lock);
 		task_lock(child);
 		if (list_empty(&child->cg_list))
-			list_add(&child->cg_list, &child->cgroups->tasks);
+			list_add(&child->cg_list, &task_css_set(child)->tasks);
 		task_unlock(child);
 		write_unlock(&css_set_lock);
 	}
@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child)
 		 * of the array can be freed at module unload, so we
 		 * can't touch that.
 		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-
+		for_each_builtin_subsys(ss, i)
 			if (ss->fork)
 				ss->fork(child);
-		}
 	}
 }
 
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child)
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-	struct css_set *cg;
+	struct cgroup_subsys *ss;
+	struct css_set *cset;
 	int i;
 
 	/*
@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 
 	/* Reassign the task to the init_css_set. */
 	task_lock(tsk);
-	cg = tsk->cgroups;
-	tsk->cgroups = &init_css_set;
+	cset = task_css_set(tsk);
+	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
 	if (run_callbacks && need_forkexit_callback) {
 		/*
 		 * fork/exit callbacks are supported only for builtin
 		 * subsystems, see cgroup_post_fork() for details.
 		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-
+		for_each_builtin_subsys(ss, i) {
 			if (ss->exit) {
-				struct cgroup *old_cgrp =
-					rcu_dereference_raw(cg->subsys[i])->cgroup;
+				struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
 				struct cgroup *cgrp = task_cgroup(tsk, i);
+
 				ss->exit(cgrp, old_cgrp, tsk);
 			}
 		}
 	}
 	task_unlock(tsk);
 
-	put_css_set_taskexit(cg);
+	put_css_set_taskexit(cset);
 }
 
 static void check_for_release(struct cgroup *cgrp)
 {
-	/* All of these checks rely on RCU to keep the cgroup
-	 * structure alive */
 	if (cgroup_is_releasable(cgrp) &&
-	    !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+	    list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
 		/*
 		 * Control Group is currently removeable. If it's not
 		 * already queued for a userspace notification, queue
@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp)
 		int need_schedule_work = 0;
 
 		raw_spin_lock(&release_list_lock);
-		if (!cgroup_is_removed(cgrp) &&
+		if (!cgroup_is_dead(cgrp) &&
 		    list_empty(&cgrp->release_list)) {
 			list_add(&cgrp->release_list, &release_list);
 			need_schedule_work = 1;
@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp)
 	}
 }
 
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-	while (true) {
-		int t, v;
-
-		v = css_refcnt(css);
-		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-		if (likely(t == v))
-			return true;
-		else if (t < 0)
-			return false;
-		cpu_relax();
-	}
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-	int v;
-
-	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-	if (v == 0)
-		schedule_work(&css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path
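The deleted __css_tryget()/__css_put() were the slow paths of the hand-rolled atomic_t scheme, in which a negative refcount (note the t < 0 test above) meant "deactivated, refuse new trygets". After the percpu-ref conversion earlier in this series they reduce to thin wrappers; plausibly (a sketch against the percpu-refcount API):

#include <linux/percpu-refcount.h>

static inline bool css_tryget(struct cgroup_subsys_state *css)
{
	/* fails once percpu_ref_kill() has been called for this css */
	return percpu_ref_tryget(&css->refcnt);
}

static inline void css_put(struct cgroup_subsys_state *css)
{
	/* the ref's release callback runs when the count hits zero */
	percpu_ref_put(&css->refcnt);
}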
@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work)
 
 static int __init cgroup_disable(char *str)
 {
-	int i;
+	struct cgroup_subsys *ss;
 	char *token;
+	int i;
 
 	while ((token = strsep(&str, ",")) != NULL) {
 		if (!*token)
 			continue;
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-
-			/*
-			 * cgroup_disable, being at boot time, can't
-			 * know about module subsystems, so we don't
-			 * worry about them.
-			 */
-			if (!ss || ss->module)
-				continue;
 
+		/*
+		 * cgroup_disable, being at boot time, can't know about
+		 * module subsystems, so we don't worry about them.
+		 */
+		for_each_builtin_subsys(ss, i) {
 			if (!strcmp(token, ss->name)) {
 				ss->disabled = 1;
 				printk(KERN_INFO "Disabling %s control group"
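Usage is unchanged: cgroup_disable= stays a boot-time parameter and, as the comment above notes, can only name builtin controllers, e.g. a kernel command line carrying:

	cgroup_disable=memory,perf_event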
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable);
  * Functons for CSS ID.
  */
 
-/*
- *To get ID other than 0, this should be called when !cgroup_is_removed().
- */
+/* to get ID other than 0, this should be called when !cgroup_is_dead() */
 unsigned short css_id(struct cgroup_subsys_state *css)
 {
 	struct css_id *cssid;
@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
+	cssid = rcu_dereference_raw(css->id);
 
 	if (cssid)
 		return cssid->id;
@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5107} 5351}
5108EXPORT_SYMBOL_GPL(css_id); 5352EXPORT_SYMBOL_GPL(css_id);
5109 5353
5110unsigned short css_depth(struct cgroup_subsys_state *css)
5111{
5112 struct css_id *cssid;
5113
5114 cssid = rcu_dereference_check(css->id, css_refcnt(css));
5115
5116 if (cssid)
5117 return cssid->depth;
5118 return 0;
5119}
5120EXPORT_SYMBOL_GPL(css_depth);
5121
5122/** 5354/**
5123 * css_is_ancestor - test "root" css is an ancestor of "child" 5355 * css_is_ancestor - test "root" css is an ancestor of "child"
5124 * @child: the css to be tested. 5356 * @child: the css to be tested.
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 {
-	struct css_id *id = css->id;
+	struct css_id *id = rcu_dereference_protected(css->id, true);
+
 	/* When this is called before css_id initialization, id can be NULL */
 	if (!id)
 		return;
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 		return PTR_ERR(newid);
 
 	newid->stack[0] = newid->id;
-	newid->css = rootcss;
-	rootcss->id = newid;
+	RCU_INIT_POINTER(newid->css, rootcss);
+	RCU_INIT_POINTER(rootcss->id, newid);
 	return 0;
 }
 
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
 	subsys_id = ss->subsys_id;
 	parent_css = parent->subsys[subsys_id];
 	child_css = child->subsys[subsys_id];
-	parent_id = parent_css->id;
+	parent_id = rcu_dereference_protected(parent_css->id, true);
 	depth = parent_id->depth + 1;
 
 	child_id = get_new_cssid(ss, depth);
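The css_id hunks in this region all follow one pattern: css->id and css_id->css become __rcu pointers, publication sites use RCU_INIT_POINTER() (no barrier needed while the object is still private), and update-side readers use rcu_dereference_protected(..., true) to document that no concurrent updater can exist. The pattern in isolation, with hypothetical names:

#include <linux/rcupdate.h>

struct css_id;

struct css_holder {
	struct css_id __rcu *id;	/* now __rcu-annotated */
};

static void publish_id(struct css_holder *h, struct css_id *newid)
{
	/* no barrier needed: 'h' is not yet visible to any reader */
	RCU_INIT_POINTER(h->id, newid);
}

static struct css_id *read_id_update_side(struct css_holder *h)
{
	/* 'true': caller excludes all concurrent updaters, so no
	 * rcu_read_lock() is required for this dereference */
	return rcu_dereference_protected(h->id, true);
}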
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5299} 5532}
5300 5533
5301#ifdef CONFIG_CGROUP_DEBUG 5534#ifdef CONFIG_CGROUP_DEBUG
5302static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5303{ 5536{
5304 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5305 5538
@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
 	return css;
 }
 
-static void debug_css_free(struct cgroup *cont)
-{
-	kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+static void debug_css_free(struct cgroup *cgrp)
 {
-	return atomic_read(&cont->count);
+	kfree(cgrp->subsys[debug_subsys_id]);
 }
 
-static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
 {
-	return cgroup_task_count(cont);
+	return cgroup_task_count(cgrp);
 }
 
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
 {
 	return (u64)(unsigned long)current->cgroups;
 }
 
-static u64 current_css_set_refcount_read(struct cgroup *cont,
+static u64 current_css_set_refcount_read(struct cgroup *cgrp,
 					 struct cftype *cft)
 {
 	u64 count;
 
 	rcu_read_lock();
-	count = atomic_read(&current->cgroups->refcount);
+	count = atomic_read(&task_css_set(current)->refcount);
 	rcu_read_unlock();
 	return count;
 }
 
-static int current_css_set_cg_links_read(struct cgroup *cont,
+static int current_css_set_cg_links_read(struct cgroup *cgrp,
 					 struct cftype *cft,
 					 struct seq_file *seq)
 {
-	struct cg_cgroup_link *link;
-	struct css_set *cg;
+	struct cgrp_cset_link *link;
+	struct css_set *cset;
 
 	read_lock(&css_set_lock);
 	rcu_read_lock();
-	cg = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+	cset = rcu_dereference(current->cgroups);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
 		const char *name;
 
@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
5367} 5595}
5368 5596
5369#define MAX_TASKS_SHOWN_PER_CSS 25 5597#define MAX_TASKS_SHOWN_PER_CSS 25
5370static int cgroup_css_links_read(struct cgroup *cont, 5598static int cgroup_css_links_read(struct cgroup *cgrp,
5371 struct cftype *cft, 5599 struct cftype *cft,
5372 struct seq_file *seq) 5600 struct seq_file *seq)
5373{ 5601{
5374 struct cg_cgroup_link *link; 5602 struct cgrp_cset_link *link;
5375 5603
5376 read_lock(&css_set_lock); 5604 read_lock(&css_set_lock);
5377 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
5378 struct css_set *cg = link->cg; 5606 struct css_set *cset = link->cset;
5379 struct task_struct *task; 5607 struct task_struct *task;
5380 int count = 0; 5608 int count = 0;
5381 seq_printf(seq, "css_set %p\n", cg); 5609 seq_printf(seq, "css_set %p\n", cset);
5382 list_for_each_entry(task, &cg->tasks, cg_list) { 5610 list_for_each_entry(task, &cset->tasks, cg_list) {
5383 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5611 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5384 seq_puts(seq, " ...\n"); 5612 seq_puts(seq, " ...\n");
5385 break; 5613 break;
@@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
 
 static struct cftype debug_files[] = {
 	{
-		.name = "cgroup_refcount",
-		.read_u64 = cgroup_refcount_read,
-	},
-	{
 		.name = "taskcount",
 		.read_u64 = debug_taskcount_read,
 	},