path: root/kernel/cgroup.c
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c  1831
1 file changed, 1185 insertions, 646 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..7868fc3c0bc5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
29#include <linux/cgroup.h> 31#include <linux/cgroup.h>
30#include <linux/cred.h> 32#include <linux/cred.h>
31#include <linux/ctype.h> 33#include <linux/ctype.h>
@@ -33,6 +35,7 @@
33#include <linux/init_task.h> 35#include <linux/init_task.h>
34#include <linux/kernel.h> 36#include <linux/kernel.h>
35#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/magic.h>
36#include <linux/mm.h> 39#include <linux/mm.h>
37#include <linux/mutex.h> 40#include <linux/mutex.h>
38#include <linux/mount.h> 41#include <linux/mount.h>
@@ -69,15 +72,6 @@
69 MAX_CFTYPE_NAME + 2) 72 MAX_CFTYPE_NAME + 2)
70 73
71/* 74/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
80/*
81 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
82 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
83 * 77 *
@@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem);
98#endif 92#endif
99 93
100/* 94/*
95 * Protects cgroup_idr and css_idr so that IDs can be released without
96 * grabbing cgroup_mutex.
97 */
98static DEFINE_SPINLOCK(cgroup_idr_lock);
99
100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */ 103 */
104static DEFINE_SPINLOCK(release_agent_path_lock); 104static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106#define cgroup_assert_mutexes_or_rcu_locked() \ 106#define cgroup_assert_mutex_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
109 lockdep_is_held(&cgroup_mutex), \ 108 lockdep_is_held(&cgroup_mutex), \
110 "cgroup_[tree_]mutex or RCU read lock required"); 109 "cgroup_mutex or RCU read lock required");
111 110
112/* 111/*
113 * cgroup destruction makes heavy use of work items and there can be a lot 112 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root;
150 */ 149 */
151static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
152 151
152/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158
153/* The list of hierarchy roots */ 159/* The list of hierarchy roots */
154 160
155static LIST_HEAD(cgroup_roots); 161static LIST_HEAD(cgroup_roots);
@@ -159,14 +165,13 @@ static int cgroup_root_count;
159static DEFINE_IDR(cgroup_hierarchy_idr); 165static DEFINE_IDR(cgroup_hierarchy_idr);
160 166
161/* 167/*
162 * Assign a monotonically increasing serial number to cgroups. It 168 * Assign a monotonically increasing serial number to csses. It guarantees
163 * guarantees cgroups with bigger numbers are newer than those with smaller 169 * cgroups with bigger numbers are newer than those with smaller numbers.
164 * numbers. Also, as cgroups are always appended to the parent's 170 * Also, as csses are always appended to the parent's ->children list, it
165 * ->children list, it guarantees that sibling cgroups are always sorted in 171 * guarantees that sibling csses are always sorted in the ascending serial
166 * the ascending serial number order on the list. Protected by 172 * number order on the list. Protected by cgroup_mutex.
167 * cgroup_mutex.
168 */ 173 */
169static u64 cgroup_serial_nr_next = 1; 174static u64 css_serial_nr_next = 1;
170 175
171/* This flag indicates whether tasks in the fork and exit paths should 176/* This flag indicates whether tasks in the fork and exit paths should
172 * check for fork/exit handlers to call. This avoids us having to do 177 * check for fork/exit handlers to call. This avoids us having to do
@@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[];
179 184
180static void cgroup_put(struct cgroup *cgrp); 185static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root, 186static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask); 187 unsigned int ss_mask);
183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
184static int cgroup_destroy_locked(struct cgroup *cgrp); 188static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
190static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css);
185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
186 bool is_add); 193 bool is_add);
187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 194static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
188 195
196/* IDR wrappers which synchronize using cgroup_idr_lock */
197static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
198 gfp_t gfp_mask)
199{
200 int ret;
201
202 idr_preload(gfp_mask);
203 spin_lock_bh(&cgroup_idr_lock);
204 ret = idr_alloc(idr, ptr, start, end, gfp_mask);
205 spin_unlock_bh(&cgroup_idr_lock);
206 idr_preload_end();
207 return ret;
208}
209
210static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
211{
212 void *ret;
213
214 spin_lock_bh(&cgroup_idr_lock);
215 ret = idr_replace(idr, ptr, id);
216 spin_unlock_bh(&cgroup_idr_lock);
217 return ret;
218}
219
220static void cgroup_idr_remove(struct idr *idr, int id)
221{
222 spin_lock_bh(&cgroup_idr_lock);
223 idr_remove(idr, id);
224 spin_unlock_bh(&cgroup_idr_lock);
225}
226
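
A hedged aside on how the wrappers above are meant to be used (the idr, ID range and GFP flag below are illustrative, mirroring the GFP_NOWAIT callers elsewhere in this diff): allocation and removal only take cgroup_idr_lock, so an ID can be dropped from a release or RCU callback that does not hold cgroup_mutex.

static int example_use_cgroup_idr(struct idr *idr, void *ptr)
{
	int id;

	/* allocate an ID >= 1; may fail with -ENOMEM or -ENOSPC */
	id = cgroup_idr_alloc(idr, ptr, 1, 0, GFP_NOWAIT);
	if (id < 0)
		return id;

	/* ... use the ID ...; later, e.g. from a release callback: */
	cgroup_idr_remove(idr, id);
	return 0;
}
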
227static struct cgroup *cgroup_parent(struct cgroup *cgrp)
228{
229 struct cgroup_subsys_state *parent_css = cgrp->self.parent;
230
231 if (parent_css)
232 return container_of(parent_css, struct cgroup, self);
233 return NULL;
234}
235
189/** 236/**
190 * cgroup_css - obtain a cgroup's css for the specified subsystem 237 * cgroup_css - obtain a cgroup's css for the specified subsystem
191 * @cgrp: the cgroup of interest 238 * @cgrp: the cgroup of interest
192 * @ss: the subsystem of interest (%NULL returns the dummy_css) 239 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
193 * 240 *
194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 241 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
195 * function must be called either under cgroup_mutex or rcu_read_lock() and 242 * function must be called either under cgroup_mutex or rcu_read_lock() and
@@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
202{ 249{
203 if (ss) 250 if (ss)
204 return rcu_dereference_check(cgrp->subsys[ss->id], 251 return rcu_dereference_check(cgrp->subsys[ss->id],
205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex)); 252 lockdep_is_held(&cgroup_mutex));
207 else 253 else
208 return &cgrp->dummy_css; 254 return &cgrp->self;
255}
256
257/**
258 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
259 * @cgrp: the cgroup of interest
260 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
261 *
 262 * Similar to cgroup_css() but returns the effective css, which is defined
263 * as the matching css of the nearest ancestor including self which has @ss
264 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
265 * function is guaranteed to return non-NULL css.
266 */
267static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
268 struct cgroup_subsys *ss)
269{
270 lockdep_assert_held(&cgroup_mutex);
271
272 if (!ss)
273 return &cgrp->self;
274
275 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
276 return NULL;
277
278 while (cgroup_parent(cgrp) &&
279 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
280 cgrp = cgroup_parent(cgrp);
281
282 return cgroup_css(cgrp, ss);
209} 283}
210 284
211/* convenient tests for these bits */ 285/* convenient tests for these bits */
212static inline bool cgroup_is_dead(const struct cgroup *cgrp) 286static inline bool cgroup_is_dead(const struct cgroup *cgrp)
213{ 287{
214 return test_bit(CGRP_DEAD, &cgrp->flags); 288 return !(cgrp->self.flags & CSS_ONLINE);
215} 289}
216 290
217struct cgroup_subsys_state *seq_css(struct seq_file *seq) 291struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
218{ 292{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv; 293 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq); 294 struct cftype *cft = of_cft(of);
222 295
223 /* 296 /*
224 * This is open and unprotected implementation of cgroup_css(). 297 * This is open and unprotected implementation of cgroup_css().
@@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)
231 if (cft->ss) 304 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 305 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else 306 else
234 return &cgrp->dummy_css; 307 return &cgrp->self;
235} 308}
236EXPORT_SYMBOL_GPL(seq_css); 309EXPORT_SYMBOL_GPL(of_css);
237 310
238/** 311/**
239 * cgroup_is_descendant - test ancestry 312 * cgroup_is_descendant - test ancestry
@@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
249 while (cgrp) { 322 while (cgrp) {
250 if (cgrp == ancestor) 323 if (cgrp == ancestor)
251 return true; 324 return true;
252 cgrp = cgrp->parent; 325 cgrp = cgroup_parent(cgrp);
253 } 326 }
254 return false; 327 return false;
255} 328}
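
To make the effective-css walk in cgroup_e_css() above concrete, here is a minimal userspace model; struct node, effective_css_node and the names are illustrative only, not kernel code. A cgroup whose parent has not enabled @ss for its children borrows the css of the nearest ancestor that has (the root always carries a css for every subsystem bound to the hierarchy).

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned int child_subsys_mask;	/* subsystems enabled for children */
	const char *name;
};

static struct node *effective_css_node(struct node *n, int ssid)
{
	/* walk up until the parent has enabled @ssid for its children */
	while (n->parent && !(n->parent->child_subsys_mask & (1 << ssid)))
		n = n->parent;
	return n;	/* the cgroup whose css is effective for @n */
}

int main(void)
{
	/* root enables subsys 0 for its children, A does not */
	struct node root = { NULL, 1 << 0, "root" };
	struct node a = { &root, 0, "A" };
	struct node b = { &a, 0, "B" };

	/* B has no css of its own for subsys 0, so it falls back to A's */
	printf("effective css of B for subsys 0 lives on %s\n",
	       effective_css_node(&b, 0)->name);	/* prints "A" */
	return 0;
}
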
@@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp)
273 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 346 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
274 * @cgrp: the target cgroup to iterate css's of 347 * @cgrp: the target cgroup to iterate css's of
275 * 348 *
276 * Should be called under cgroup_mutex. 349 * Should be called under cgroup_[tree_]mutex.
277 */ 350 */
278#define for_each_css(css, ssid, cgrp) \ 351#define for_each_css(css, ssid, cgrp) \
279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 352 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
280 if (!((css) = rcu_dereference_check( \ 353 if (!((css) = rcu_dereference_check( \
281 (cgrp)->subsys[(ssid)], \ 354 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
283 lockdep_is_held(&cgroup_mutex)))) { } \ 355 lockdep_is_held(&cgroup_mutex)))) { } \
284 else 356 else
285 357
286/** 358/**
359 * for_each_e_css - iterate all effective css's of a cgroup
360 * @css: the iteration cursor
361 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
362 * @cgrp: the target cgroup to iterate css's of
363 *
364 * Should be called under cgroup_[tree_]mutex.
365 */
366#define for_each_e_css(css, ssid, cgrp) \
367 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
368 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
369 ; \
370 else
371
372/**
287 * for_each_subsys - iterate all enabled cgroup subsystems 373 * for_each_subsys - iterate all enabled cgroup subsystems
288 * @ss: the iteration cursor 374 * @ss: the iteration cursor
289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 375 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp)
296#define for_each_root(root) \ 382#define for_each_root(root) \
297 list_for_each_entry((root), &cgroup_roots, root_list) 383 list_for_each_entry((root), &cgroup_roots, root_list)
298 384
299/** 385/* iterate over child cgrps, lock should be held throughout iteration */
300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 386#define cgroup_for_each_live_child(child, cgrp) \
301 * @cgrp: the cgroup to be checked for liveness 387 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
302 * 388 if (({ lockdep_assert_held(&cgroup_mutex); \
303 * On success, returns true; the mutex should be later unlocked. On 389 cgroup_is_dead(child); })) \
304 * failure returns false with no lock held. 390 ; \
305 */ 391 else
306static bool cgroup_lock_live_group(struct cgroup *cgrp)
307{
308 mutex_lock(&cgroup_mutex);
309 if (cgroup_is_dead(cgrp)) {
310 mutex_unlock(&cgroup_mutex);
311 return false;
312 }
313 return true;
314}
315 392
316/* the list of cgroups eligible for automatic release. Protected by 393/* the list of cgroups eligible for automatic release. Protected by
317 * release_list_lock */ 394 * release_list_lock */
@@ -348,7 +425,7 @@ struct cgrp_cset_link {
348 * reference-counted, to improve performance when child cgroups 425 * reference-counted, to improve performance when child cgroups
349 * haven't been created. 426 * haven't been created.
350 */ 427 */
351static struct css_set init_css_set = { 428struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1), 429 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 430 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 431 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
@@ -359,6 +436,43 @@ static struct css_set init_css_set = {
359 436
360static int css_set_count = 1; /* 1 for init_css_set */ 437static int css_set_count = 1; /* 1 for init_css_set */
361 438
439/**
440 * cgroup_update_populated - updated populated count of a cgroup
441 * @cgrp: the target cgroup
442 * @populated: inc or dec populated count
443 *
444 * @cgrp is either getting the first task (css_set) or losing the last.
445 * Update @cgrp->populated_cnt accordingly. The count is propagated
446 * towards root so that a given cgroup's populated_cnt is zero iff the
447 * cgroup and all its descendants are empty.
448 *
449 * @cgrp's interface file "cgroup.populated" is zero if
450 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
451 * changes from or to zero, userland is notified that the content of the
452 * interface file has changed. This can be used to detect when @cgrp and
453 * its descendants become populated or empty.
454 */
455static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
456{
457 lockdep_assert_held(&css_set_rwsem);
458
459 do {
460 bool trigger;
461
462 if (populated)
463 trigger = !cgrp->populated_cnt++;
464 else
465 trigger = !--cgrp->populated_cnt;
466
467 if (!trigger)
468 break;
469
470 if (cgrp->populated_kn)
471 kernfs_notify(cgrp->populated_kn);
472 cgrp = cgroup_parent(cgrp);
473 } while (cgrp);
474}
475
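
The propagation above is easiest to see in a small userspace model; struct node and update_populated below are illustrative, not kernel code. The count changes on every ancestor, but the notification fires only where the count crosses zero in either direction.

#include <stdio.h>
#include <stdbool.h>

struct node {
	struct node *parent;
	int populated_cnt;
	const char *name;
};

static void update_populated(struct node *n, bool populated)
{
	do {
		bool trigger;

		if (populated)
			trigger = !n->populated_cnt++;
		else
			trigger = !--n->populated_cnt;

		if (!trigger)
			break;

		/* stands in for kernfs_notify() on "cgroup.populated" */
		printf("notify: %s %s\n", n->name,
		       populated ? "populated" : "empty");
		n = n->parent;
	} while (n);
}

int main(void)
{
	struct node root = { NULL, 0, "root" }, a = { &root, 0, "A" };

	update_populated(&a, true);	/* notifies A, then root */
	update_populated(&a, true);	/* second task: no notification */
	update_populated(&a, false);
	update_populated(&a, false);	/* last task gone: notifies A, root */
	return 0;
}
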
362/* 476/*
363 * hash table for cgroup groups. This improves the performance to find 477 * hash table for cgroup groups. This improves the performance to find
364 * an existing css_set. This hash doesn't (currently) take into 478 * an existing css_set. This hash doesn't (currently) take into
@@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383static void put_css_set_locked(struct css_set *cset, bool taskexit) 497static void put_css_set_locked(struct css_set *cset, bool taskexit)
384{ 498{
385 struct cgrp_cset_link *link, *tmp_link; 499 struct cgrp_cset_link *link, *tmp_link;
500 struct cgroup_subsys *ss;
501 int ssid;
386 502
387 lockdep_assert_held(&css_set_rwsem); 503 lockdep_assert_held(&css_set_rwsem);
388 504
@@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 return; 506 return;
391 507
392 /* This css_set is dead. unlink it and release cgroup refcounts */ 508 /* This css_set is dead. unlink it and release cgroup refcounts */
509 for_each_subsys(ss, ssid)
510 list_del(&cset->e_cset_node[ssid]);
393 hash_del(&cset->hlist); 511 hash_del(&cset->hlist);
394 css_set_count--; 512 css_set_count--;
395 513
@@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
400 list_del(&link->cgrp_link); 518 list_del(&link->cgrp_link);
401 519
402 /* @cgrp can't go away while we're holding css_set_rwsem */ 520 /* @cgrp can't go away while we're holding css_set_rwsem */
403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 521 if (list_empty(&cgrp->cset_links)) {
404 if (taskexit) 522 cgroup_update_populated(cgrp, false);
405 set_bit(CGRP_RELEASABLE, &cgrp->flags); 523 if (notify_on_release(cgrp)) {
406 check_for_release(cgrp); 524 if (taskexit)
525 set_bit(CGRP_RELEASABLE, &cgrp->flags);
526 check_for_release(cgrp);
527 }
407 } 528 }
408 529
409 kfree(link); 530 kfree(link);
@@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset,
452{ 573{
453 struct list_head *l1, *l2; 574 struct list_head *l1, *l2;
454 575
455 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { 576 /*
456 /* Not all subsystems matched */ 577 * On the default hierarchy, there can be csets which are
578 * associated with the same set of cgroups but different csses.
579 * Let's first ensure that csses match.
580 */
581 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
457 return false; 582 return false;
458 }
459 583
460 /* 584 /*
461 * Compare cgroup pointers in order to distinguish between 585 * Compare cgroup pointers in order to distinguish between
462 * different cgroups in heirarchies with no subsystems. We 586 * different cgroups in hierarchies. As different cgroups may
463 * could get by with just this check alone (and skip the 587 * share the same effective css, this comparison is always
464 * memcmp above) but on most setups the memcmp check will 588 * necessary.
465 * avoid the need for this more expensive check on almost all
466 * candidates.
467 */ 589 */
468
469 l1 = &cset->cgrp_links; 590 l1 = &cset->cgrp_links;
470 l2 = &old_cset->cgrp_links; 591 l2 = &old_cset->cgrp_links;
471 while (1) { 592 while (1) {
@@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
529 * won't change, so no need for locking. 650 * won't change, so no need for locking.
530 */ 651 */
531 for_each_subsys(ss, i) { 652 for_each_subsys(ss, i) {
532 if (root->cgrp.subsys_mask & (1UL << i)) { 653 if (root->subsys_mask & (1UL << i)) {
533 /* Subsystem is in this hierarchy. So we want 654 /*
534 * the subsystem state from the new 655 * @ss is in this hierarchy, so we want the
535 * cgroup */ 656 * effective css from @cgrp.
536 template[i] = cgroup_css(cgrp, ss); 657 */
658 template[i] = cgroup_e_css(cgrp, ss);
537 } else { 659 } else {
538 /* Subsystem is not in this hierarchy, so we 660 /*
539 * don't want to change the subsystem state */ 661 * @ss is not in this hierarchy, so we don't want
662 * to change the css.
663 */
540 template[i] = old_cset->subsys[i]; 664 template[i] = old_cset->subsys[i];
541 } 665 }
542 } 666 }
@@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
602 struct cgrp_cset_link *link; 726 struct cgrp_cset_link *link;
603 727
604 BUG_ON(list_empty(tmp_links)); 728 BUG_ON(list_empty(tmp_links));
729
730 if (cgroup_on_dfl(cgrp))
731 cset->dfl_cgrp = cgrp;
732
605 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 733 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
606 link->cset = cset; 734 link->cset = cset;
607 link->cgrp = cgrp; 735 link->cgrp = cgrp;
736
737 if (list_empty(&cgrp->cset_links))
738 cgroup_update_populated(cgrp, true);
608 list_move(&link->cset_link, &cgrp->cset_links); 739 list_move(&link->cset_link, &cgrp->cset_links);
740
609 /* 741 /*
610 * Always add links to the tail of the list so that the list 742 * Always add links to the tail of the list so that the list
611 * is sorted by order of hierarchy creation 743 * is sorted by order of hierarchy creation
@@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
628 struct css_set *cset; 760 struct css_set *cset;
629 struct list_head tmp_links; 761 struct list_head tmp_links;
630 struct cgrp_cset_link *link; 762 struct cgrp_cset_link *link;
763 struct cgroup_subsys *ss;
631 unsigned long key; 764 unsigned long key;
765 int ssid;
632 766
633 lockdep_assert_held(&cgroup_mutex); 767 lockdep_assert_held(&cgroup_mutex);
634 768
@@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
679 813
680 css_set_count++; 814 css_set_count++;
681 815
682 /* Add this cgroup group to the hash table */ 816 /* Add @cset to the hash table */
683 key = css_set_hash(cset->subsys); 817 key = css_set_hash(cset->subsys);
684 hash_add(css_set_table, &cset->hlist, key); 818 hash_add(css_set_table, &cset->hlist, key);
685 819
820 for_each_subsys(ss, ssid)
821 list_add_tail(&cset->e_cset_node[ssid],
822 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
823
686 up_write(&css_set_rwsem); 824 up_write(&css_set_rwsem);
687 825
688 return cset; 826 return cset;
@@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
735 struct cgroup *cgrp = &root->cgrp; 873 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link; 874 struct cgrp_cset_link *link, *tmp_link;
737 875
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex); 876 mutex_lock(&cgroup_mutex);
740 877
741 BUG_ON(atomic_read(&root->nr_cgrps)); 878 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children)); 879 BUG_ON(!list_empty(&cgrp->self.children));
743 880
744 /* Rebind all subsystems back to the default hierarchy */ 881 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); 882 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
746 883
747 /* 884 /*
748 * Release all the links from cset_links to this hierarchy's 885 * Release all the links from cset_links to this hierarchy's
@@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
765 cgroup_exit_root_id(root); 902 cgroup_exit_root_id(root);
766 903
767 mutex_unlock(&cgroup_mutex); 904 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769 905
770 kernfs_destroy_root(root->kf_root); 906 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root); 907 cgroup_free_root(root);
@@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
848 * update of a tasks cgroup pointer by cgroup_attach_task() 984 * update of a tasks cgroup pointer by cgroup_attach_task()
849 */ 985 */
850 986
851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 987static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
852static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 988static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
853static const struct file_operations proc_cgroupstats_operations; 989static const struct file_operations proc_cgroupstats_operations;
854 990
@@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
883 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1019 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
884 mode |= S_IRUGO; 1020 mode |= S_IRUGO;
885 1021
886 if (cft->write_u64 || cft->write_s64 || cft->write_string || 1022 if (cft->write_u64 || cft->write_s64 || cft->write)
887 cft->trigger)
888 mode |= S_IWUSR; 1023 mode |= S_IWUSR;
889 1024
890 return mode; 1025 return mode;
891} 1026}
892 1027
893static void cgroup_free_fn(struct work_struct *work) 1028static void cgroup_get(struct cgroup *cgrp)
894{ 1029{
895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 1030 WARN_ON_ONCE(cgroup_is_dead(cgrp));
896 1031 css_get(&cgrp->self);
897 atomic_dec(&cgrp->root->nr_cgrps);
898 cgroup_pidlist_destroy_all(cgrp);
899
900 if (cgrp->parent) {
901 /*
902 * We get a ref to the parent, and put the ref when this
903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
916} 1032}
917 1033
918static void cgroup_free_rcu(struct rcu_head *head) 1034static void cgroup_put(struct cgroup *cgrp)
919{ 1035{
920 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 1036 css_put(&cgrp->self);
921
922 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
924} 1037}
925 1038
926static void cgroup_get(struct cgroup *cgrp) 1039/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced
1042 *
1043 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1044 * the method finishes if locking succeeded. Note that once this function
1045 * returns the cgroup returned by cgroup_kn_lock_live() may become
1046 * inaccessible any time. If the caller intends to continue to access the
1047 * cgroup, it should pin it before invoking this function.
1048 */
1049static void cgroup_kn_unlock(struct kernfs_node *kn)
927{ 1050{
928 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 1051 struct cgroup *cgrp;
929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 1052
930 atomic_inc(&cgrp->refcnt); 1053 if (kernfs_type(kn) == KERNFS_DIR)
1054 cgrp = kn->priv;
1055 else
1056 cgrp = kn->parent->priv;
1057
1058 mutex_unlock(&cgroup_mutex);
1059
1060 kernfs_unbreak_active_protection(kn);
1061 cgroup_put(cgrp);
931} 1062}
932 1063
933static void cgroup_put(struct cgroup *cgrp) 1064/**
1065 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1066 * @kn: the kernfs_node being serviced
1067 *
1068 * This helper is to be used by a cgroup kernfs method currently servicing
1069 * @kn. It breaks the active protection, performs cgroup locking and
1070 * verifies that the associated cgroup is alive. Returns the cgroup if
1071 * alive; otherwise, %NULL. A successful return should be undone by a
1072 * matching cgroup_kn_unlock() invocation.
1073 *
1074 * Any cgroup kernfs method implementation which requires locking the
1075 * associated cgroup should use this helper. It avoids nesting cgroup
1076 * locking under kernfs active protection and allows all kernfs operations
1077 * including self-removal.
1078 */
1079static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
934{ 1080{
935 if (!atomic_dec_and_test(&cgrp->refcnt)) 1081 struct cgroup *cgrp;
936 return; 1082
937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 1083 if (kernfs_type(kn) == KERNFS_DIR)
938 return; 1084 cgrp = kn->priv;
1085 else
1086 cgrp = kn->parent->priv;
939 1087
940 /* 1088 /*
941 * XXX: cgrp->id is only used to look up css's. As cgroup and 1089 * We're gonna grab cgroup_mutex which nests outside kernfs
942 * css's lifetimes will be decoupled, it should be made 1090 * active_ref. cgroup liveliness check alone provides enough
943 * per-subsystem and moved to css->id so that lookups are 1091 * protection against removal. Ensure @cgrp stays accessible and
944 * successful until the target css is released. 1092 * break the active_ref protection.
945 */ 1093 */
1094 cgroup_get(cgrp);
1095 kernfs_break_active_protection(kn);
1096
946 mutex_lock(&cgroup_mutex); 1097 mutex_lock(&cgroup_mutex);
947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
950 1098
951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 1099 if (!cgroup_is_dead(cgrp))
1100 return cgrp;
1101
1102 cgroup_kn_unlock(kn);
1103 return NULL;
952} 1104}
953 1105
954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1106static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
955{ 1107{
956 char name[CGROUP_FILE_NAME_MAX]; 1108 char name[CGROUP_FILE_NAME_MAX];
957 1109
958 lockdep_assert_held(&cgroup_tree_mutex); 1110 lockdep_assert_held(&cgroup_mutex);
959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1111 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
960} 1112}
961 1113
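
A hedged sketch of the calling convention the two helpers above establish; example_write and its body are hypothetical, but the shape matches the handlers converted later in this diff (e.g. cgroup_release_agent_write()).

static ssize_t example_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	int ret = 0;

	/* breaks kernfs active protection and takes cgroup_mutex */
	cgrp = cgroup_kn_lock_live(of->kn);
	if (!cgrp)
		return -ENODEV;	/* cgroup already dead */

	/* ... operate on @cgrp while it is guaranteed to stay alive ... */

	cgroup_kn_unlock(of->kn);	/* @cgrp may go away after this */
	return ret ?: nbytes;
}
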
@@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
964 * @cgrp: target cgroup 1116 * @cgrp: target cgroup
965 * @subsys_mask: mask of the subsystem ids whose files should be removed 1117 * @subsys_mask: mask of the subsystem ids whose files should be removed
966 */ 1118 */
967static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1119static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
968{ 1120{
969 struct cgroup_subsys *ss; 1121 struct cgroup_subsys *ss;
970 int i; 1122 int i;
@@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
972 for_each_subsys(ss, i) { 1124 for_each_subsys(ss, i) {
973 struct cftype *cfts; 1125 struct cftype *cfts;
974 1126
975 if (!test_bit(i, &subsys_mask)) 1127 if (!(subsys_mask & (1 << i)))
976 continue; 1128 continue;
977 list_for_each_entry(cfts, &ss->cfts, node) 1129 list_for_each_entry(cfts, &ss->cfts, node)
978 cgroup_addrm_files(cgrp, cfts, false); 1130 cgroup_addrm_files(cgrp, cfts, false);
979 } 1131 }
980} 1132}
981 1133
982static int rebind_subsystems(struct cgroup_root *dst_root, 1134static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
983 unsigned long ss_mask)
984{ 1135{
985 struct cgroup_subsys *ss; 1136 struct cgroup_subsys *ss;
986 int ssid, ret; 1137 unsigned int tmp_ss_mask;
1138 int ssid, i, ret;
987 1139
988 lockdep_assert_held(&cgroup_tree_mutex);
989 lockdep_assert_held(&cgroup_mutex); 1140 lockdep_assert_held(&cgroup_mutex);
990 1141
991 for_each_subsys(ss, ssid) { 1142 for_each_subsys(ss, ssid) {
992 if (!(ss_mask & (1 << ssid))) 1143 if (!(ss_mask & (1 << ssid)))
993 continue; 1144 continue;
994 1145
995 /* if @ss is on the dummy_root, we can always move it */ 1146 /* if @ss has non-root csses attached to it, can't move */
996 if (ss->root == &cgrp_dfl_root) 1147 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
997 continue;
998
999 /* if @ss has non-root cgroups attached to it, can't move */
1000 if (!list_empty(&ss->root->cgrp.children))
1001 return -EBUSY; 1148 return -EBUSY;
1002 1149
1003 /* can't move between two non-dummy roots either */ 1150 /* can't move between two non-dummy roots either */
1004 if (dst_root != &cgrp_dfl_root) 1151 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1005 return -EBUSY; 1152 return -EBUSY;
1006 } 1153 }
1007 1154
1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1155 /* skip creating root files on dfl_root for inhibited subsystems */
1156 tmp_ss_mask = ss_mask;
1157 if (dst_root == &cgrp_dfl_root)
1158 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1159
1160 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
1009 if (ret) { 1161 if (ret) {
1010 if (dst_root != &cgrp_dfl_root) 1162 if (dst_root != &cgrp_dfl_root)
1011 return ret; 1163 return ret;
@@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1017 * Just warn about it and continue. 1169 * Just warn about it and continue.
1018 */ 1170 */
1019 if (cgrp_dfl_root_visible) { 1171 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1172 pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1021 ret, ss_mask); 1173 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1174 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1023 } 1175 }
1024 } 1176 }
1025 1177
@@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1027 * Nothing can fail from this point on. Remove files for the 1179 * Nothing can fail from this point on. Remove files for the
1028 * removed subsystems and rebind each subsystem. 1180 * removed subsystems and rebind each subsystem.
1029 */ 1181 */
1030 mutex_unlock(&cgroup_mutex);
1031 for_each_subsys(ss, ssid) 1182 for_each_subsys(ss, ssid)
1032 if (ss_mask & (1 << ssid)) 1183 if (ss_mask & (1 << ssid))
1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1184 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1034 mutex_lock(&cgroup_mutex);
1035 1185
1036 for_each_subsys(ss, ssid) { 1186 for_each_subsys(ss, ssid) {
1037 struct cgroup_root *src_root; 1187 struct cgroup_root *src_root;
1038 struct cgroup_subsys_state *css; 1188 struct cgroup_subsys_state *css;
1189 struct css_set *cset;
1039 1190
1040 if (!(ss_mask & (1 << ssid))) 1191 if (!(ss_mask & (1 << ssid)))
1041 continue; 1192 continue;
@@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1050 ss->root = dst_root; 1201 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp; 1202 css->cgroup = &dst_root->cgrp;
1052 1203
1053 src_root->cgrp.subsys_mask &= ~(1 << ssid); 1204 down_write(&css_set_rwsem);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid; 1205 hash_for_each(css_set_table, i, cset, hlist)
1206 list_move_tail(&cset->e_cset_node[ss->id],
1207 &dst_root->cgrp.e_csets[ss->id]);
1208 up_write(&css_set_rwsem);
1209
1210 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1212
1213 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root)
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1055 1217
1056 if (ss->bind) 1218 if (ss->bind)
1057 ss->bind(css); 1219 ss->bind(css);
@@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq,
1069 int ssid; 1231 int ssid;
1070 1232
1071 for_each_subsys(ss, ssid) 1233 for_each_subsys(ss, ssid)
1072 if (root->cgrp.subsys_mask & (1 << ssid)) 1234 if (root->subsys_mask & (1 << ssid))
1073 seq_printf(seq, ",%s", ss->name); 1235 seq_printf(seq, ",%s", ss->name);
1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1075 seq_puts(seq, ",sane_behavior"); 1237 seq_puts(seq, ",sane_behavior");
@@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq,
1091} 1253}
1092 1254
1093struct cgroup_sb_opts { 1255struct cgroup_sb_opts {
1094 unsigned long subsys_mask; 1256 unsigned int subsys_mask;
1095 unsigned long flags; 1257 unsigned int flags;
1096 char *release_agent; 1258 char *release_agent;
1097 bool cpuset_clone_children; 1259 bool cpuset_clone_children;
1098 char *name; 1260 char *name;
@@ -1100,24 +1262,16 @@ struct cgroup_sb_opts {
1100 bool none; 1262 bool none;
1101}; 1263};
1102 1264
1103/*
1104 * Convert a hierarchy specifier into a bitmask of subsystems and
1105 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1106 * array. This function takes refcounts on subsystems to be used, unless it
1107 * returns error, in which case no refcounts are taken.
1108 */
1109static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1265static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1110{ 1266{
1111 char *token, *o = data; 1267 char *token, *o = data;
1112 bool all_ss = false, one_ss = false; 1268 bool all_ss = false, one_ss = false;
1113 unsigned long mask = (unsigned long)-1; 1269 unsigned int mask = -1U;
1114 struct cgroup_subsys *ss; 1270 struct cgroup_subsys *ss;
1115 int i; 1271 int i;
1116 1272
1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1118
1119#ifdef CONFIG_CPUSETS 1273#ifdef CONFIG_CPUSETS
1120 mask = ~(1UL << cpuset_cgrp_id); 1274 mask = ~(1U << cpuset_cgrp_id);
1121#endif 1275#endif
1122 1276
1123 memset(opts, 0, sizeof(*opts)); 1277 memset(opts, 0, sizeof(*opts));
@@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1198 /* Mutually exclusive option 'all' + subsystem name */ 1352 /* Mutually exclusive option 'all' + subsystem name */
1199 if (all_ss) 1353 if (all_ss)
1200 return -EINVAL; 1354 return -EINVAL;
1201 set_bit(i, &opts->subsys_mask); 1355 opts->subsys_mask |= (1 << i);
1202 one_ss = true; 1356 one_ss = true;
1203 1357
1204 break; 1358 break;
@@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1210 /* Consistency checks */ 1364 /* Consistency checks */
1211 1365
1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1214 1368
1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1216 opts->cpuset_clone_children || opts->release_agent || 1370 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) { 1371 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1219 return -EINVAL; 1373 return -EINVAL;
1220 } 1374 }
1221 } else { 1375 } else {
@@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i) 1382 for_each_subsys(ss, i)
1229 if (!ss->disabled) 1383 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask); 1384 opts->subsys_mask |= (1 << i);
1231 1385
1232 /* 1386 /*
1233 * We either have to specify by name or by subsystems. (So 1387 * We either have to specify by name or by subsystems. (So
@@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1258 int ret = 0; 1412 int ret = 0;
1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1413 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1260 struct cgroup_sb_opts opts; 1414 struct cgroup_sb_opts opts;
1261 unsigned long added_mask, removed_mask; 1415 unsigned int added_mask, removed_mask;
1262 1416
1263 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1264 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1418 pr_err("sane_behavior: remount is not allowed\n");
1265 return -EINVAL; 1419 return -EINVAL;
1266 } 1420 }
1267 1421
1268 mutex_lock(&cgroup_tree_mutex);
1269 mutex_lock(&cgroup_mutex); 1422 mutex_lock(&cgroup_mutex);
1270 1423
1271 /* See what subsystems are wanted */ 1424 /* See what subsystems are wanted */
@@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1273 if (ret) 1426 if (ret)
1274 goto out_unlock; 1427 goto out_unlock;
1275 1428
1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) 1429 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1430 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1278 task_tgid_nr(current), current->comm); 1431 task_tgid_nr(current), current->comm);
1279 1432
1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; 1433 added_mask = opts.subsys_mask & ~root->subsys_mask;
1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; 1434 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1282 1435
1283 /* Don't allow flags or name to change at remount */ 1436 /* Don't allow flags or name to change at remount */
1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1285 (opts.name && strcmp(opts.name, root->name))) { 1438 (opts.name && strcmp(opts.name, root->name))) {
1286 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1287 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1288 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1289 ret = -EINVAL; 1442 ret = -EINVAL;
@@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1291 } 1444 }
1292 1445
1293 /* remounting is not allowed for populated hierarchies */ 1446 /* remounting is not allowed for populated hierarchies */
1294 if (!list_empty(&root->cgrp.children)) { 1447 if (!list_empty(&root->cgrp.self.children)) {
1295 ret = -EBUSY; 1448 ret = -EBUSY;
1296 goto out_unlock; 1449 goto out_unlock;
1297 } 1450 }
@@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1311 kfree(opts.release_agent); 1464 kfree(opts.release_agent);
1312 kfree(opts.name); 1465 kfree(opts.name);
1313 mutex_unlock(&cgroup_mutex); 1466 mutex_unlock(&cgroup_mutex);
1314 mutex_unlock(&cgroup_tree_mutex);
1315 return ret; 1467 return ret;
1316} 1468}
1317 1469
@@ -1369,14 +1521,22 @@ out_unlock:
1369 1521
1370static void init_cgroup_housekeeping(struct cgroup *cgrp) 1522static void init_cgroup_housekeeping(struct cgroup *cgrp)
1371{ 1523{
1372 atomic_set(&cgrp->refcnt, 1); 1524 struct cgroup_subsys *ss;
1373 INIT_LIST_HEAD(&cgrp->sibling); 1525 int ssid;
1374 INIT_LIST_HEAD(&cgrp->children); 1526
1527 INIT_LIST_HEAD(&cgrp->self.sibling);
1528 INIT_LIST_HEAD(&cgrp->self.children);
1375 INIT_LIST_HEAD(&cgrp->cset_links); 1529 INIT_LIST_HEAD(&cgrp->cset_links);
1376 INIT_LIST_HEAD(&cgrp->release_list); 1530 INIT_LIST_HEAD(&cgrp->release_list);
1377 INIT_LIST_HEAD(&cgrp->pidlists); 1531 INIT_LIST_HEAD(&cgrp->pidlists);
1378 mutex_init(&cgrp->pidlist_mutex); 1532 mutex_init(&cgrp->pidlist_mutex);
1379 cgrp->dummy_css.cgroup = cgrp; 1533 cgrp->self.cgroup = cgrp;
1534 cgrp->self.flags |= CSS_ONLINE;
1535
1536 for_each_subsys(ss, ssid)
1537 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1538
1539 init_waitqueue_head(&cgrp->offline_waitq);
1380} 1540}
1381 1541
1382static void init_cgroup_root(struct cgroup_root *root, 1542static void init_cgroup_root(struct cgroup_root *root,
@@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root,
1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1559 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1400} 1560}
1401 1561
1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1562static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1403{ 1563{
1404 LIST_HEAD(tmp_links); 1564 LIST_HEAD(tmp_links);
1405 struct cgroup *root_cgrp = &root->cgrp; 1565 struct cgroup *root_cgrp = &root->cgrp;
1406 struct css_set *cset; 1566 struct css_set *cset;
1407 int i, ret; 1567 int i, ret;
1408 1568
1409 lockdep_assert_held(&cgroup_tree_mutex);
1410 lockdep_assert_held(&cgroup_mutex); 1569 lockdep_assert_held(&cgroup_mutex);
1411 1570
1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1571 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1413 if (ret < 0) 1572 if (ret < 0)
1414 goto out; 1573 goto out;
1415 root_cgrp->id = ret; 1574 root_cgrp->id = ret;
1416 1575
1576 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
1577 if (ret)
1578 goto out;
1579
1417 /* 1580 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here, 1581 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding 1582 * but that's OK - it can only be increased by someone holding
@@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1422 */ 1585 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1586 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret) 1587 if (ret)
1425 goto out; 1588 goto cancel_ref;
1426 1589
1427 ret = cgroup_init_root_id(root); 1590 ret = cgroup_init_root_id(root);
1428 if (ret) 1591 if (ret)
1429 goto out; 1592 goto cancel_ref;
1430 1593
1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1594 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1432 KERNFS_ROOT_CREATE_DEACTIVATED, 1595 KERNFS_ROOT_CREATE_DEACTIVATED,
@@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1462 link_css_set(&tmp_links, cset, root_cgrp); 1625 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem); 1626 up_write(&css_set_rwsem);
1464 1627
1465 BUG_ON(!list_empty(&root_cgrp->children)); 1628 BUG_ON(!list_empty(&root_cgrp->self.children));
1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1629 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1467 1630
1468 kernfs_activate(root_cgrp->kn); 1631 kernfs_activate(root_cgrp->kn);
@@ -1474,6 +1637,8 @@ destroy_root:
1474 root->kf_root = NULL; 1637 root->kf_root = NULL;
1475exit_root_id: 1638exit_root_id:
1476 cgroup_exit_root_id(root); 1639 cgroup_exit_root_id(root);
1640cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt);
1477out: 1642out:
1478 free_cgrp_cset_links(&tmp_links); 1643 free_cgrp_cset_links(&tmp_links);
1479 return ret; 1644 return ret;
@@ -1495,8 +1660,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1495 */ 1660 */
1496 if (!use_task_css_set_links) 1661 if (!use_task_css_set_links)
1497 cgroup_enable_task_cg_lists(); 1662 cgroup_enable_task_cg_lists();
1498retry: 1663
1499 mutex_lock(&cgroup_tree_mutex);
1500 mutex_lock(&cgroup_mutex); 1664 mutex_lock(&cgroup_mutex);
1501 1665
1502 /* First find the desired set of subsystems */ 1666 /* First find the desired set of subsystems */
@@ -1535,7 +1699,7 @@ retry:
1535 * subsystems) then they must match. 1699 * subsystems) then they must match.
1536 */ 1700 */
1537 if ((opts.subsys_mask || opts.none) && 1701 if ((opts.subsys_mask || opts.none) &&
1538 (opts.subsys_mask != root->cgrp.subsys_mask)) { 1702 (opts.subsys_mask != root->subsys_mask)) {
1539 if (!name_match) 1703 if (!name_match)
1540 continue; 1704 continue;
1541 ret = -EBUSY; 1705 ret = -EBUSY;
@@ -1544,28 +1708,27 @@ retry:
1544 1708
1545 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1709 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1546 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1710 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1547 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1711 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1548 ret = -EINVAL; 1712 ret = -EINVAL;
1549 goto out_unlock; 1713 goto out_unlock;
1550 } else { 1714 } else {
1551 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1715 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1552 } 1716 }
1553 } 1717 }
1554 1718
1555 /* 1719 /*
1556 * A root's lifetime is governed by its root cgroup. Zero 1720 * A root's lifetime is governed by its root cgroup.
1557 * ref indicate that the root is being destroyed. Wait for 1721 * tryget_live failure indicate that the root is being
1558 * destruction to complete so that the subsystems are free. 1722 * destroyed. Wait for destruction to complete so that the
1559 * We can use wait_queue for the wait but this path is 1723 * subsystems are free. We can use wait_queue for the wait
1560 * super cold. Let's just sleep for a bit and retry. 1724 * but this path is super cold. Let's just sleep for a bit
1725 * and retry.
1561 */ 1726 */
1562 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1727 if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1563 mutex_unlock(&cgroup_mutex); 1728 mutex_unlock(&cgroup_mutex);
1564 mutex_unlock(&cgroup_tree_mutex);
1565 kfree(opts.release_agent);
1566 kfree(opts.name);
1567 msleep(10); 1729 msleep(10);
1568 goto retry; 1730 ret = restart_syscall();
1731 goto out_free;
1569 } 1732 }
1570 1733
1571 ret = 0; 1734 ret = 0;
@@ -1596,15 +1759,15 @@ retry:
1596 1759
1597out_unlock: 1760out_unlock:
1598 mutex_unlock(&cgroup_mutex); 1761 mutex_unlock(&cgroup_mutex);
1599 mutex_unlock(&cgroup_tree_mutex); 1762out_free:
1600
1601 kfree(opts.release_agent); 1763 kfree(opts.release_agent);
1602 kfree(opts.name); 1764 kfree(opts.name);
1603 1765
1604 if (ret) 1766 if (ret)
1605 return ERR_PTR(ret); 1767 return ERR_PTR(ret);
1606 1768
1607 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); 1769 dentry = kernfs_mount(fs_type, flags, root->kf_root,
1770 CGROUP_SUPER_MAGIC, &new_sb);
1608 if (IS_ERR(dentry) || !new_sb) 1771 if (IS_ERR(dentry) || !new_sb)
1609 cgroup_put(&root->cgrp); 1772 cgroup_put(&root->cgrp);
1610 return dentry; 1773 return dentry;
@@ -1615,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb)
1615 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1778 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1616 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1779 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1617 1780
1618 cgroup_put(&root->cgrp); 1781 /*
1782 * If @root doesn't have any mounts or children, start killing it.
1783 * This prevents new mounts by disabling percpu_ref_tryget_live().
1784 * cgroup_mount() may wait for @root's release.
1785 *
1786 * And don't kill the default root.
1787 */
1788 if (css_has_online_children(&root->cgrp.self) ||
1789 root == &cgrp_dfl_root)
1790 cgroup_put(&root->cgrp);
1791 else
1792 percpu_ref_kill(&root->cgrp.self.refcnt);
1793
1619 kernfs_kill_sb(sb); 1794 kernfs_kill_sb(sb);
1620} 1795}
1621 1796
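
The mount/kill_sb interplay above hinges on percpu_ref semantics: after percpu_ref_kill(), percpu_ref_tryget_live() fails so new mounts back off and retry, while existing references keep the root alive until the last put. A toy single-threaded model of just that rule (toy_ref and friends are made up, not the kernel's percpu_ref):

#include <stdbool.h>
#include <stdio.h>

struct toy_ref {
	long count;
	bool dead;
};

static bool toy_tryget_live(struct toy_ref *r)
{
	if (r->dead)
		return false;	/* cgroup_mount() would msleep() and retry */
	r->count++;
	return true;
}

static void toy_kill(struct toy_ref *r) { r->dead = true; }

static void toy_put(struct toy_ref *r)
{
	if (--r->count == 0)
		printf("release: root can now be destroyed\n");
}

int main(void)
{
	struct toy_ref root = { .count = 1, .dead = false };

	toy_kill(&root);			/* cgroup_kill_sb() path */
	printf("tryget_live after kill: %d\n", toy_tryget_live(&root));
	toy_put(&root);				/* last ref -> release */
	return 0;
}
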
@@ -1737,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1737 1912
1738/** 1913/**
1739 * cgroup_task_migrate - move a task from one cgroup to another. 1914 * cgroup_task_migrate - move a task from one cgroup to another.
1740 * @old_cgrp; the cgroup @tsk is being migrated from 1915 * @old_cgrp: the cgroup @tsk is being migrated from
1741 * @tsk: the task being migrated 1916 * @tsk: the task being migrated
1742 * @new_cset: the new css_set @tsk is being attached to 1917 * @new_cset: the new css_set @tsk is being attached to
1743 * 1918 *
@@ -1829,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1829 2004
1830 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2005 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1831 2006
1832 /* nothing to do if this cset already belongs to the cgroup */
1833 if (src_cgrp == dst_cgrp)
1834 return;
1835
1836 if (!list_empty(&src_cset->mg_preload_node)) 2007 if (!list_empty(&src_cset->mg_preload_node))
1837 return; 2008 return;
1838 2009
@@ -1847,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1847 2018
1848/** 2019/**
1849 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 2020 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1850 * @dst_cgrp: the destination cgroup 2021 * @dst_cgrp: the destination cgroup (may be %NULL)
1851 * @preloaded_csets: list of preloaded source css_sets 2022 * @preloaded_csets: list of preloaded source css_sets
1852 * 2023 *
1853 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 2024 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1854 * have been preloaded to @preloaded_csets. This function looks up and 2025 * have been preloaded to @preloaded_csets. This function looks up and
1855 * pins all destination css_sets, links each to its source, and put them on 2026 * pins all destination css_sets, links each to its source, and append them
1856 * @preloaded_csets. 2027 * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
2028 * source css_set is assumed to be its cgroup on the default hierarchy.
1857 * 2029 *
1858 * This function must be called after cgroup_migrate_add_src() has been 2030 * This function must be called after cgroup_migrate_add_src() has been
1859 * called on each migration source css_set. After migration is performed 2031 * called on each migration source css_set. After migration is performed
@@ -1864,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1864 struct list_head *preloaded_csets) 2036 struct list_head *preloaded_csets)
1865{ 2037{
1866 LIST_HEAD(csets); 2038 LIST_HEAD(csets);
1867 struct css_set *src_cset; 2039 struct css_set *src_cset, *tmp_cset;
1868 2040
1869 lockdep_assert_held(&cgroup_mutex); 2041 lockdep_assert_held(&cgroup_mutex);
1870 2042
2043 /*
2044 * Except for the root, child_subsys_mask must be zero for a cgroup
2045 * with tasks so that child cgroups don't compete against tasks.
2046 */
2047 if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2048 dst_cgrp->child_subsys_mask)
2049 return -EBUSY;
2050
1871 /* look up the dst cset for each src cset and link it to src */ 2051 /* look up the dst cset for each src cset and link it to src */
1872 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 2052 list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
1873 struct css_set *dst_cset; 2053 struct css_set *dst_cset;
1874 2054
1875 dst_cset = find_css_set(src_cset, dst_cgrp); 2055 dst_cset = find_css_set(src_cset,
2056 dst_cgrp ?: src_cset->dfl_cgrp);
1876 if (!dst_cset) 2057 if (!dst_cset)
1877 goto err; 2058 goto err;
1878 2059
1879 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 2060 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2061
2062 /*
2063 * If src cset equals dst, it's noop. Drop the src.
2064 * cgroup_migrate() will skip the cset too. Note that we
2065 * can't handle src == dst as some nodes are used by both.
2066 */
2067 if (src_cset == dst_cset) {
2068 src_cset->mg_src_cgrp = NULL;
2069 list_del_init(&src_cset->mg_preload_node);
2070 put_css_set(src_cset, false);
2071 put_css_set(dst_cset, false);
2072 continue;
2073 }
2074
1880 src_cset->mg_dst_cset = dst_cset; 2075 src_cset->mg_dst_cset = dst_cset;
1881 2076
1882 if (list_empty(&dst_cset->mg_preload_node)) 2077 if (list_empty(&dst_cset->mg_preload_node))
@@ -1885,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1885 put_css_set(dst_cset, false); 2080 put_css_set(dst_cset, false);
1886 } 2081 }
1887 2082
1888 list_splice(&csets, preloaded_csets); 2083 list_splice_tail(&csets, preloaded_csets);
1889 return 0; 2084 return 0;
1890err: 2085err:
1891 cgroup_migrate_finish(&csets); 2086 cgroup_migrate_finish(&csets);
@@ -1966,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1966 return 0; 2161 return 0;
1967 2162
1968 /* check that we can legitimately attach to the cgroup */ 2163 /* check that we can legitimately attach to the cgroup */
1969 for_each_css(css, i, cgrp) { 2164 for_each_e_css(css, i, cgrp) {
1970 if (css->ss->can_attach) { 2165 if (css->ss->can_attach) {
1971 ret = css->ss->can_attach(css, &tset); 2166 ret = css->ss->can_attach(css, &tset);
1972 if (ret) { 2167 if (ret) {
@@ -1996,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1996 */ 2191 */
1997 tset.csets = &tset.dst_csets; 2192 tset.csets = &tset.dst_csets;
1998 2193
1999 for_each_css(css, i, cgrp) 2194 for_each_e_css(css, i, cgrp)
2000 if (css->ss->attach) 2195 if (css->ss->attach)
2001 css->ss->attach(css, &tset); 2196 css->ss->attach(css, &tset);
2002 2197
@@ -2004,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2004 goto out_release_tset; 2199 goto out_release_tset;
2005 2200
2006out_cancel_attach: 2201out_cancel_attach:
2007 for_each_css(css, i, cgrp) { 2202 for_each_e_css(css, i, cgrp) {
2008 if (css == failed_css) 2203 if (css == failed_css)
2009 break; 2204 break;
2010 if (css->ss->cancel_attach) 2205 if (css->ss->cancel_attach)
@@ -2063,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2063 * function to attach either it or all tasks in its threadgroup. Will lock 2258 * function to attach either it or all tasks in its threadgroup. Will lock
2064 * cgroup_mutex and threadgroup. 2259 * cgroup_mutex and threadgroup.
2065 */ 2260 */
2066static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2261static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2262 size_t nbytes, loff_t off, bool threadgroup)
2067{ 2263{
2068 struct task_struct *tsk; 2264 struct task_struct *tsk;
2069 const struct cred *cred = current_cred(), *tcred; 2265 const struct cred *cred = current_cred(), *tcred;
2266 struct cgroup *cgrp;
2267 pid_t pid;
2070 int ret; 2268 int ret;
2071 2269
2072 if (!cgroup_lock_live_group(cgrp)) 2270 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2271 return -EINVAL;
2272
2273 cgrp = cgroup_kn_lock_live(of->kn);
2274 if (!cgrp)
2073 return -ENODEV; 2275 return -ENODEV;
2074 2276
2075retry_find_task: 2277retry_find_task:
@@ -2135,8 +2337,8 @@ retry_find_task:
2135 2337
2136 put_task_struct(tsk); 2338 put_task_struct(tsk);
2137out_unlock_cgroup: 2339out_unlock_cgroup:
2138 mutex_unlock(&cgroup_mutex); 2340 cgroup_kn_unlock(of->kn);
2139 return ret; 2341 return ret ?: nbytes;
2140} 2342}
2141 2343
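__cgroup_procs_write() is the kernel side of a plain PID write to cgroup.procs (whole-threadgroup migration) or tasks (single task). A minimal userspace sketch follows; the mount path is an example and not taken from this patch.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Move the calling process into an example cgroup by writing its PID to
 * cgroup.procs; writing the same PID to "tasks" would move only that
 * single thread. */
int main(void)
{
	const char *path = "/sys/fs/cgroup/example/cgroup.procs";	/* example path */
	char buf[32];
	int fd, n;

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = snprintf(buf, sizeof(buf), "%d\n", getpid());
	if (write(fd, buf, n) != n)
		perror("write");	/* e.g. EINVAL for a malformed PID, ENODEV if the cgroup is gone */
	close(fd);
	return 0;
}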
2142/** 2344/**
@@ -2170,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2170} 2372}
2171EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2373EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2172 2374
2173static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2375static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2174 struct cftype *cft, u64 pid) 2376 char *buf, size_t nbytes, loff_t off)
2175{ 2377{
2176 return attach_task_by_pid(css->cgroup, pid, false); 2378 return __cgroup_procs_write(of, buf, nbytes, off, false);
2177} 2379}
2178 2380
2179static int cgroup_procs_write(struct cgroup_subsys_state *css, 2381static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2180 struct cftype *cft, u64 tgid) 2382 char *buf, size_t nbytes, loff_t off)
2181{ 2383{
2182 return attach_task_by_pid(css->cgroup, tgid, true); 2384 return __cgroup_procs_write(of, buf, nbytes, off, true);
2183} 2385}
2184 2386
2185static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2387static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2186 struct cftype *cft, char *buffer) 2388 char *buf, size_t nbytes, loff_t off)
2187{ 2389{
2188 struct cgroup_root *root = css->cgroup->root; 2390 struct cgroup *cgrp;
2189 2391
2190 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2392 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2191 if (!cgroup_lock_live_group(css->cgroup)) 2393
2394 cgrp = cgroup_kn_lock_live(of->kn);
2395 if (!cgrp)
2192 return -ENODEV; 2396 return -ENODEV;
2193 spin_lock(&release_agent_path_lock); 2397 spin_lock(&release_agent_path_lock);
2194 strlcpy(root->release_agent_path, buffer, 2398 strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2195 sizeof(root->release_agent_path)); 2399 sizeof(cgrp->root->release_agent_path));
2196 spin_unlock(&release_agent_path_lock); 2400 spin_unlock(&release_agent_path_lock);
2197 mutex_unlock(&cgroup_mutex); 2401 cgroup_kn_unlock(of->kn);
2198 return 0; 2402 return nbytes;
2199} 2403}
2200 2404
2201static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2405static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2202{ 2406{
2203 struct cgroup *cgrp = seq_css(seq)->cgroup; 2407 struct cgroup *cgrp = seq_css(seq)->cgroup;
2204 2408
2205 if (!cgroup_lock_live_group(cgrp)) 2409 spin_lock(&release_agent_path_lock);
2206 return -ENODEV;
2207 seq_puts(seq, cgrp->root->release_agent_path); 2410 seq_puts(seq, cgrp->root->release_agent_path);
2411 spin_unlock(&release_agent_path_lock);
2208 seq_putc(seq, '\n'); 2412 seq_putc(seq, '\n');
2209 mutex_unlock(&cgroup_mutex);
2210 return 0; 2413 return 0;
2211} 2414}
2212 2415
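cgroup_release_agent_show() now serializes on release_agent_path_lock instead of cgroup_mutex. For reference, release_agent is the traditional hierarchy-root knob naming a helper to exec when a notify_on_release cgroup empties; a hedged userspace sketch, with an example path that is not from this patch:

#include <stdio.h>

int main(void)
{
	/* example mount point of a traditional (non-default) hierarchy root */
	FILE *f = fopen("/sys/fs/cgroup/example/release_agent", "w");

	if (!f)
		return 1;
	/* the trailing newline is stripped by the kernel (strstrip) */
	fprintf(f, "/usr/local/sbin/my-release-agent\n");
	return fclose(f) ? 1 : 0;
}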
@@ -2218,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2218 return 0; 2421 return 0;
2219} 2422}
2220 2423
2424static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2425{
2426 struct cgroup_subsys *ss;
2427 bool printed = false;
2428 int ssid;
2429
2430 for_each_subsys(ss, ssid) {
2431 if (ss_mask & (1 << ssid)) {
2432 if (printed)
2433 seq_putc(seq, ' ');
2434 seq_printf(seq, "%s", ss->name);
2435 printed = true;
2436 }
2437 }
2438 if (printed)
2439 seq_putc(seq, '\n');
2440}
2441
2442/* show controllers which are currently attached to the default hierarchy */
2443static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2444{
2445 struct cgroup *cgrp = seq_css(seq)->cgroup;
2446
2447 cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
2448 ~cgrp_dfl_root_inhibit_ss_mask);
2449 return 0;
2450}
2451
2452/* show controllers which are enabled from the parent */
2453static int cgroup_controllers_show(struct seq_file *seq, void *v)
2454{
2455 struct cgroup *cgrp = seq_css(seq)->cgroup;
2456
2457 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
2458 return 0;
2459}
2460
2461/* show controllers which are enabled for a given cgroup's children */
2462static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2463{
2464 struct cgroup *cgrp = seq_css(seq)->cgroup;
2465
2466 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2467 return 0;
2468}
2469
2470/**
2471 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2472 * @cgrp: root of the subtree to update csses for
2473 *
2474 * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2475 * css associations need to be updated accordingly. This function looks up
2476 * all css_sets which are attached to the subtree, creates the matching
2477 * updated css_sets and migrates the tasks to the new ones.
2478 */
2479static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2480{
2481 LIST_HEAD(preloaded_csets);
2482 struct cgroup_subsys_state *css;
2483 struct css_set *src_cset;
2484 int ret;
2485
2486 lockdep_assert_held(&cgroup_mutex);
2487
2488 /* look up all csses currently attached to @cgrp's subtree */
2489 down_read(&css_set_rwsem);
2490 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2491 struct cgrp_cset_link *link;
2492
2493 /* self is not affected by child_subsys_mask change */
2494 if (css->cgroup == cgrp)
2495 continue;
2496
2497 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2498 cgroup_migrate_add_src(link->cset, cgrp,
2499 &preloaded_csets);
2500 }
2501 up_read(&css_set_rwsem);
2502
2503 /* NULL dst indicates self on default hierarchy */
2504 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2505 if (ret)
2506 goto out_finish;
2507
2508 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2509 struct task_struct *last_task = NULL, *task;
2510
2511 /* src_csets precede dst_csets, break on the first dst_cset */
2512 if (!src_cset->mg_src_cgrp)
2513 break;
2514
2515 /*
2516 * All tasks in src_cset need to be migrated to the
2517 * matching dst_cset. Empty it process by process. We
2518 * walk tasks but migrate processes. The leader might even
2519 * belong to a different cset but such src_cset would also
2520 * be among the target src_csets because the default
2521 * hierarchy enforces per-process membership.
2522 */
2523 while (true) {
2524 down_read(&css_set_rwsem);
2525 task = list_first_entry_or_null(&src_cset->tasks,
2526 struct task_struct, cg_list);
2527 if (task) {
2528 task = task->group_leader;
2529 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2530 get_task_struct(task);
2531 }
2532 up_read(&css_set_rwsem);
2533
2534 if (!task)
2535 break;
2536
2537 /* guard against possible infinite loop */
2538 if (WARN(last_task == task,
2539 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2540 goto out_finish;
2541 last_task = task;
2542
2543 threadgroup_lock(task);
2544 /* raced against de_thread() from another thread? */
2545 if (!thread_group_leader(task)) {
2546 threadgroup_unlock(task);
2547 put_task_struct(task);
2548 continue;
2549 }
2550
2551 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2552
2553 threadgroup_unlock(task);
2554 put_task_struct(task);
2555
2556 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2557 goto out_finish;
2558 }
2559 }
2560
2561out_finish:
2562 cgroup_migrate_finish(&preloaded_csets);
2563 return ret;
2564}
2565
2566/* change the enabled child controllers for a cgroup in the default hierarchy */
2567static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2568 char *buf, size_t nbytes,
2569 loff_t off)
2570{
2571 unsigned int enable = 0, disable = 0;
2572 struct cgroup *cgrp, *child;
2573 struct cgroup_subsys *ss;
2574 char *tok;
2575 int ssid, ret;
2576
2577 /*
2578 * Parse input - space separated list of subsystem names prefixed
2579 * with either + or -.
2580 */
2581 buf = strstrip(buf);
2582 while ((tok = strsep(&buf, " "))) {
2583 if (tok[0] == '\0')
2584 continue;
2585 for_each_subsys(ss, ssid) {
2586 if (ss->disabled || strcmp(tok + 1, ss->name) ||
2587 ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2588 continue;
2589
2590 if (*tok == '+') {
2591 enable |= 1 << ssid;
2592 disable &= ~(1 << ssid);
2593 } else if (*tok == '-') {
2594 disable |= 1 << ssid;
2595 enable &= ~(1 << ssid);
2596 } else {
2597 return -EINVAL;
2598 }
2599 break;
2600 }
2601 if (ssid == CGROUP_SUBSYS_COUNT)
2602 return -EINVAL;
2603 }
2604
2605 cgrp = cgroup_kn_lock_live(of->kn);
2606 if (!cgrp)
2607 return -ENODEV;
2608
2609 for_each_subsys(ss, ssid) {
2610 if (enable & (1 << ssid)) {
2611 if (cgrp->child_subsys_mask & (1 << ssid)) {
2612 enable &= ~(1 << ssid);
2613 continue;
2614 }
2615
2616 /*
2617 * Because css offlining is asynchronous, userland
2618 * might try to re-enable the same controller while
2619 * the previous instance is still around. In such
2620 * cases, wait till it's gone using offline_waitq.
2621 */
2622 cgroup_for_each_live_child(child, cgrp) {
2623 DEFINE_WAIT(wait);
2624
2625 if (!cgroup_css(child, ss))
2626 continue;
2627
2628 cgroup_get(child);
2629 prepare_to_wait(&child->offline_waitq, &wait,
2630 TASK_UNINTERRUPTIBLE);
2631 cgroup_kn_unlock(of->kn);
2632 schedule();
2633 finish_wait(&child->offline_waitq, &wait);
2634 cgroup_put(child);
2635
2636 return restart_syscall();
2637 }
2638
2639 /* unavailable or not enabled on the parent? */
2640 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2641 (cgroup_parent(cgrp) &&
2642 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2643 ret = -ENOENT;
2644 goto out_unlock;
2645 }
2646 } else if (disable & (1 << ssid)) {
2647 if (!(cgrp->child_subsys_mask & (1 << ssid))) {
2648 disable &= ~(1 << ssid);
2649 continue;
2650 }
2651
2652 /* a child has it enabled? */
2653 cgroup_for_each_live_child(child, cgrp) {
2654 if (child->child_subsys_mask & (1 << ssid)) {
2655 ret = -EBUSY;
2656 goto out_unlock;
2657 }
2658 }
2659 }
2660 }
2661
2662 if (!enable && !disable) {
2663 ret = 0;
2664 goto out_unlock;
2665 }
2666
2667 /*
2668 * Except for the root, child_subsys_mask must be zero for a cgroup
2669 * with tasks so that child cgroups don't compete against tasks.
2670 */
2671 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
2672 ret = -EBUSY;
2673 goto out_unlock;
2674 }
2675
2676 /*
2677 * Create csses for enables and update child_subsys_mask. This
2678 * changes cgroup_e_css() results which in turn makes the
2679 * subsequent cgroup_update_dfl_csses() associate all tasks in the
2680 * subtree to the updated csses.
2681 */
2682 for_each_subsys(ss, ssid) {
2683 if (!(enable & (1 << ssid)))
2684 continue;
2685
2686 cgroup_for_each_live_child(child, cgrp) {
2687 ret = create_css(child, ss);
2688 if (ret)
2689 goto err_undo_css;
2690 }
2691 }
2692
2693 cgrp->child_subsys_mask |= enable;
2694 cgrp->child_subsys_mask &= ~disable;
2695
2696 ret = cgroup_update_dfl_csses(cgrp);
2697 if (ret)
2698 goto err_undo_css;
2699
2700 /* all tasks are now migrated away from the old csses, kill them */
2701 for_each_subsys(ss, ssid) {
2702 if (!(disable & (1 << ssid)))
2703 continue;
2704
2705 cgroup_for_each_live_child(child, cgrp)
2706 kill_css(cgroup_css(child, ss));
2707 }
2708
2709 kernfs_activate(cgrp->kn);
2710 ret = 0;
2711out_unlock:
2712 cgroup_kn_unlock(of->kn);
2713 return ret ?: nbytes;
2714
2715err_undo_css:
2716 cgrp->child_subsys_mask &= ~enable;
2717 cgrp->child_subsys_mask |= disable;
2718
2719 for_each_subsys(ss, ssid) {
2720 if (!(enable & (1 << ssid)))
2721 continue;
2722
2723 cgroup_for_each_live_child(child, cgrp) {
2724 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2725 if (css)
2726 kill_css(css);
2727 }
2728 }
2729 goto out_unlock;
2730}
2731
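cgroup_subtree_control_write() parses a space-separated list of "+name"/"-name" tokens; for a given controller the last token wins, enabling creates the controller's css in each child, and disabling kills the child csses once all tasks have been migrated to the updated css_sets. A userspace sketch of the resulting interface on the default hierarchy; the mount path and controller names are examples only, not taken from this patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *ctl = "/sys/fs/cgroup/example/cgroup.subtree_control";	/* example path */
	const char *avail = "/sys/fs/cgroup/example/cgroup.controllers";
	char buf[256];
	ssize_t n;
	int fd;

	/* enable memory and disable cpu for this cgroup's children */
	fd = open(ctl, O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "+memory -cpu", strlen("+memory -cpu")) < 0)
		perror("write");	/* EBUSY if tasks are attached, ENOENT if unavailable */
	close(fd);

	/* list the controllers this cgroup may enable for its children */
	fd = open(avail, O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);	/* space-separated controller names */
	}
	close(fd);
	return 0;
}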
2732static int cgroup_populated_show(struct seq_file *seq, void *v)
2733{
2734 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2735 return 0;
2736}
2737
2221static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2738static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2222 size_t nbytes, loff_t off) 2739 size_t nbytes, loff_t off)
2223{ 2740{
@@ -2226,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2226 struct cgroup_subsys_state *css; 2743 struct cgroup_subsys_state *css;
2227 int ret; 2744 int ret;
2228 2745
2746 if (cft->write)
2747 return cft->write(of, buf, nbytes, off);
2748
2229 /* 2749 /*
2230 * kernfs guarantees that a file isn't deleted with operations in 2750 * kernfs guarantees that a file isn't deleted with operations in
2231 * flight, which means that the matching css is and stays alive and 2751 * flight, which means that the matching css is and stays alive and
@@ -2236,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2236 css = cgroup_css(cgrp, cft->ss); 2756 css = cgroup_css(cgrp, cft->ss);
2237 rcu_read_unlock(); 2757 rcu_read_unlock();
2238 2758
2239 if (cft->write_string) { 2759 if (cft->write_u64) {
2240 ret = cft->write_string(css, cft, strstrip(buf));
2241 } else if (cft->write_u64) {
2242 unsigned long long v; 2760 unsigned long long v;
2243 ret = kstrtoull(buf, 0, &v); 2761 ret = kstrtoull(buf, 0, &v);
2244 if (!ret) 2762 if (!ret)
@@ -2248,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2248 ret = kstrtoll(buf, 0, &v); 2766 ret = kstrtoll(buf, 0, &v);
2249 if (!ret) 2767 if (!ret)
2250 ret = cft->write_s64(css, cft, v); 2768 ret = cft->write_s64(css, cft, v);
2251 } else if (cft->trigger) {
2252 ret = cft->trigger(css, (unsigned int)cft->private);
2253 } else { 2769 } else {
2254 ret = -EINVAL; 2770 ret = -EINVAL;
2255 } 2771 }
@@ -2326,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2326 return -EPERM; 2842 return -EPERM;
2327 2843
2328 /* 2844 /*
2329 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2845 * We're gonna grab cgroup_mutex which nests outside kernfs
2330 * active_ref. kernfs_rename() doesn't require active_ref 2846 * active_ref. kernfs_rename() doesn't require active_ref
2331 * protection. Break them before grabbing cgroup_tree_mutex. 2847 * protection. Break them before grabbing cgroup_mutex.
2332 */ 2848 */
2333 kernfs_break_active_protection(new_parent); 2849 kernfs_break_active_protection(new_parent);
2334 kernfs_break_active_protection(kn); 2850 kernfs_break_active_protection(kn);
2335 2851
2336 mutex_lock(&cgroup_tree_mutex);
2337 mutex_lock(&cgroup_mutex); 2852 mutex_lock(&cgroup_mutex);
2338 2853
2339 ret = kernfs_rename(kn, new_parent, new_name_str); 2854 ret = kernfs_rename(kn, new_parent, new_name_str);
2340 2855
2341 mutex_unlock(&cgroup_mutex); 2856 mutex_unlock(&cgroup_mutex);
2342 mutex_unlock(&cgroup_tree_mutex);
2343 2857
2344 kernfs_unbreak_active_protection(kn); 2858 kernfs_unbreak_active_protection(kn);
2345 kernfs_unbreak_active_protection(new_parent); 2859 kernfs_unbreak_active_protection(new_parent);
@@ -2377,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2377 return PTR_ERR(kn); 2891 return PTR_ERR(kn);
2378 2892
2379 ret = cgroup_kn_set_ugid(kn); 2893 ret = cgroup_kn_set_ugid(kn);
2380 if (ret) 2894 if (ret) {
2381 kernfs_remove(kn); 2895 kernfs_remove(kn);
2382 return ret; 2896 return ret;
2897 }
2898
2899 if (cft->seq_show == cgroup_populated_show)
2900 cgrp->populated_kn = kn;
2901 return 0;
2383} 2902}
2384 2903
2385/** 2904/**
@@ -2399,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2399 struct cftype *cft; 2918 struct cftype *cft;
2400 int ret; 2919 int ret;
2401 2920
2402 lockdep_assert_held(&cgroup_tree_mutex); 2921 lockdep_assert_held(&cgroup_mutex);
2403 2922
2404 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2923 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2405 /* does cft->flags tell us to skip this file on @cgrp? */ 2924 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2407,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2407 continue; 2926 continue;
2408 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2927 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2409 continue; 2928 continue;
2410 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2929 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2411 continue; 2930 continue;
2412 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2931 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
2413 continue; 2932 continue;
2414 2933
2415 if (is_add) { 2934 if (is_add) {
2416 ret = cgroup_add_file(cgrp, cft); 2935 ret = cgroup_add_file(cgrp, cft);
2417 if (ret) { 2936 if (ret) {
2418 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2937 pr_warn("%s: failed to add %s, err=%d\n",
2419 cft->name, ret); 2938 __func__, cft->name, ret);
2420 return ret; 2939 return ret;
2421 } 2940 }
2422 } else { 2941 } else {
@@ -2434,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2434 struct cgroup_subsys_state *css; 2953 struct cgroup_subsys_state *css;
2435 int ret = 0; 2954 int ret = 0;
2436 2955
2437 lockdep_assert_held(&cgroup_tree_mutex); 2956 lockdep_assert_held(&cgroup_mutex);
2438
2439 /* don't bother if @ss isn't attached */
2440 if (ss->root == &cgrp_dfl_root)
2441 return 0;
2442 2957
2443 /* add/rm files for all cgroups created before */ 2958 /* add/rm files for all cgroups created before */
2444 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2959 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2506,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2506 3021
2507static int cgroup_rm_cftypes_locked(struct cftype *cfts) 3022static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2508{ 3023{
2509 lockdep_assert_held(&cgroup_tree_mutex); 3024 lockdep_assert_held(&cgroup_mutex);
2510 3025
2511 if (!cfts || !cfts[0].ss) 3026 if (!cfts || !cfts[0].ss)
2512 return -ENOENT; 3027 return -ENOENT;
@@ -2532,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2532{ 3047{
2533 int ret; 3048 int ret;
2534 3049
2535 mutex_lock(&cgroup_tree_mutex); 3050 mutex_lock(&cgroup_mutex);
2536 ret = cgroup_rm_cftypes_locked(cfts); 3051 ret = cgroup_rm_cftypes_locked(cfts);
2537 mutex_unlock(&cgroup_tree_mutex); 3052 mutex_unlock(&cgroup_mutex);
2538 return ret; 3053 return ret;
2539} 3054}
2540 3055
@@ -2556,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2556{ 3071{
2557 int ret; 3072 int ret;
2558 3073
3074 if (ss->disabled)
3075 return 0;
3076
2559 if (!cfts || cfts[0].name[0] == '\0') 3077 if (!cfts || cfts[0].name[0] == '\0')
2560 return 0; 3078 return 0;
2561 3079
@@ -2563,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2563 if (ret) 3081 if (ret)
2564 return ret; 3082 return ret;
2565 3083
2566 mutex_lock(&cgroup_tree_mutex); 3084 mutex_lock(&cgroup_mutex);
2567 3085
2568 list_add_tail(&cfts->node, &ss->cfts); 3086 list_add_tail(&cfts->node, &ss->cfts);
2569 ret = cgroup_apply_cftypes(cfts, true); 3087 ret = cgroup_apply_cftypes(cfts, true);
2570 if (ret) 3088 if (ret)
2571 cgroup_rm_cftypes_locked(cfts); 3089 cgroup_rm_cftypes_locked(cfts);
2572 3090
2573 mutex_unlock(&cgroup_tree_mutex); 3091 mutex_unlock(&cgroup_mutex);
2574 return ret; 3092 return ret;
2575} 3093}
2576 3094
@@ -2594,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp)
2594 3112
2595/** 3113/**
2596 * css_next_child - find the next child of a given css 3114 * css_next_child - find the next child of a given css
2597 * @pos_css: the current position (%NULL to initiate traversal) 3115 * @pos: the current position (%NULL to initiate traversal)
2598 * @parent_css: css whose children to walk 3116 * @parent: css whose children to walk
2599 * 3117 *
2600 * This function returns the next child of @parent_css and should be called 3118 * This function returns the next child of @parent and should be called
2601 * under either cgroup_mutex or RCU read lock. The only requirement is 3119 * under either cgroup_mutex or RCU read lock. The only requirement is
2602 * that @parent_css and @pos_css are accessible. The next sibling is 3120 * that @parent and @pos are accessible. The next sibling is guaranteed to
2603 * guaranteed to be returned regardless of their states. 3121 * be returned regardless of their states.
3122 *
3123 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3124 * css which finished ->css_online() is guaranteed to be visible in the
3125 * future iterations and will stay visible until the last reference is put.
3126 * A css which hasn't finished ->css_online() or already finished
3127 * ->css_offline() may show up during traversal. It's each subsystem's
3128 * responsibility to synchronize against on/offlining.
2604 */ 3129 */
2605struct cgroup_subsys_state * 3130struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
2606css_next_child(struct cgroup_subsys_state *pos_css, 3131 struct cgroup_subsys_state *parent)
2607 struct cgroup_subsys_state *parent_css)
2608{ 3132{
2609 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 3133 struct cgroup_subsys_state *next;
2610 struct cgroup *cgrp = parent_css->cgroup;
2611 struct cgroup *next;
2612 3134
2613 cgroup_assert_mutexes_or_rcu_locked(); 3135 cgroup_assert_mutex_or_rcu_locked();
2614 3136
2615 /* 3137 /*
2616 * @pos could already have been removed. Once a cgroup is removed, 3138 * @pos could already have been unlinked from the sibling list.
2617 * its ->sibling.next is no longer updated when its next sibling 3139 * Once a cgroup is removed, its ->sibling.next is no longer
2618 * changes. As CGRP_DEAD assertion is serialized and happens 3140 * updated when its next sibling changes. CSS_RELEASED is set when
2619 * before the cgroup is taken off the ->sibling list, if we see it 3141 * @pos is taken off list, at which time its next pointer is valid,
2620 * unasserted, it's guaranteed that the next sibling hasn't 3142 * and, as releases are serialized, the one pointed to by the next
2621 * finished its grace period even if it's already removed, and thus 3143 * pointer is guaranteed to not have started release yet. This
2622 * safe to dereference from this RCU critical section. If 3144 * implies that if we observe !CSS_RELEASED on @pos in this RCU
2623 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3145 * critical section, the one pointed to by its next pointer is
2624 * to be visible as %true here. 3146 * guaranteed to not have finished its RCU grace period even if we
3147 * have dropped rcu_read_lock() inbetween iterations.
2625 * 3148 *
2626 * If @pos is dead, its next pointer can't be dereferenced; 3149 * If @pos has CSS_RELEASED set, its next pointer can't be
2627 * however, as each cgroup is given a monotonically increasing 3150 * dereferenced; however, as each css is given a monotonically
2628 * unique serial number and always appended to the sibling list, 3151 * increasing unique serial number and always appended to the
2629 * the next one can be found by walking the parent's children until 3152 * sibling list, the next one can be found by walking the parent's
2630 * we see a cgroup with higher serial number than @pos's. While 3153 * children until the first css with higher serial number than
2631 * this path can be slower, it's taken only when either the current 3154 * @pos's. While this path can be slower, it happens iff iteration
2632 * cgroup is removed or iteration and removal race. 3155 * races against release and the race window is very small.
2633 */ 3156 */
2634 if (!pos) { 3157 if (!pos) {
2635 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 3158 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
2636 } else if (likely(!cgroup_is_dead(pos))) { 3159 } else if (likely(!(pos->flags & CSS_RELEASED))) {
2637 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3160 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
2638 } else { 3161 } else {
2639 list_for_each_entry_rcu(next, &cgrp->children, sibling) 3162 list_for_each_entry_rcu(next, &parent->children, sibling)
2640 if (next->serial_nr > pos->serial_nr) 3163 if (next->serial_nr > pos->serial_nr)
2641 break; 3164 break;
2642 } 3165 }
2643 3166
2644 if (&next->sibling == &cgrp->children) 3167 /*
2645 return NULL; 3168 * @next, if not pointing to the head, can be dereferenced and is
2646 3169 * the next sibling.
2647 return cgroup_css(next, parent_css->ss); 3170 */
3171 if (&next->sibling != &parent->children)
3172 return next;
3173 return NULL;
2648} 3174}
2649 3175
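css_next_child() is usually consumed through the css_for_each_child() wrapper. Below is a kernel-context sketch of the visibility rules spelled out above; the function name example_walk_children is illustrative, while the iteration and refcount APIs are the ones used elsewhere in this file.

#include <linux/cgroup.h>

/*
 * Illustrative only: walk @parent's children under RCU and pin each child
 * before using it.  Csses that haven't finished ->css_online() or have
 * already started dying may show up; css_tryget_online() fails once a
 * css's base ref has been killed, and any further on/offline
 * synchronization is the caller's responsibility (see the comment above).
 */
static void example_walk_children(struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *child;

	rcu_read_lock();
	css_for_each_child(child, parent) {
		if (!css_tryget_online(child))
			continue;	/* already being destroyed */
		/* ... use @child ... */
		css_put(child);
	}
	rcu_read_unlock();
}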
2650/** 3176/**
@@ -2660,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2660 * doesn't require the whole traversal to be contained in a single critical 3186 * doesn't require the whole traversal to be contained in a single critical
2661 * section. This function will return the correct next descendant as long 3187 * section. This function will return the correct next descendant as long
2662 * as both @pos and @root are accessible and @pos is a descendant of @root. 3188 * as both @pos and @root are accessible and @pos is a descendant of @root.
3189 *
3190 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3191 * css which finished ->css_online() is guaranteed to be visible in the
3192 * future iterations and will stay visible until the last reference is put.
3193 * A css which hasn't finished ->css_online() or already finished
3194 * ->css_offline() may show up during traversal. It's each subsystem's
3195 * responsibility to synchronize against on/offlining.
2663 */ 3196 */
2664struct cgroup_subsys_state * 3197struct cgroup_subsys_state *
2665css_next_descendant_pre(struct cgroup_subsys_state *pos, 3198css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -2667,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2667{ 3200{
2668 struct cgroup_subsys_state *next; 3201 struct cgroup_subsys_state *next;
2669 3202
2670 cgroup_assert_mutexes_or_rcu_locked(); 3203 cgroup_assert_mutex_or_rcu_locked();
2671 3204
2672 /* if first iteration, visit @root */ 3205 /* if first iteration, visit @root */
2673 if (!pos) 3206 if (!pos)
@@ -2680,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2680 3213
2681 /* no child, visit my or the closest ancestor's next sibling */ 3214 /* no child, visit my or the closest ancestor's next sibling */
2682 while (pos != root) { 3215 while (pos != root) {
2683 next = css_next_child(pos, css_parent(pos)); 3216 next = css_next_child(pos, pos->parent);
2684 if (next) 3217 if (next)
2685 return next; 3218 return next;
2686 pos = css_parent(pos); 3219 pos = pos->parent;
2687 } 3220 }
2688 3221
2689 return NULL; 3222 return NULL;
@@ -2707,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
2707{ 3240{
2708 struct cgroup_subsys_state *last, *tmp; 3241 struct cgroup_subsys_state *last, *tmp;
2709 3242
2710 cgroup_assert_mutexes_or_rcu_locked(); 3243 cgroup_assert_mutex_or_rcu_locked();
2711 3244
2712 do { 3245 do {
2713 last = pos; 3246 last = pos;
@@ -2747,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
2747 * section. This function will return the correct next descendant as long 3280 * section. This function will return the correct next descendant as long
2748 * as both @pos and @cgroup are accessible and @pos is a descendant of 3281 * as both @pos and @cgroup are accessible and @pos is a descendant of
2749 * @cgroup. 3282 * @cgroup.
3283 *
3284 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3285 * css which finished ->css_online() is guaranteed to be visible in the
3286 * future iterations and will stay visible until the last reference is put.
3287 * A css which hasn't finished ->css_online() or already finished
3288 * ->css_offline() may show up during traversal. It's each subsystem's
3289 * responsibility to synchronize against on/offlining.
2750 */ 3290 */
2751struct cgroup_subsys_state * 3291struct cgroup_subsys_state *
2752css_next_descendant_post(struct cgroup_subsys_state *pos, 3292css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -2754,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2754{ 3294{
2755 struct cgroup_subsys_state *next; 3295 struct cgroup_subsys_state *next;
2756 3296
2757 cgroup_assert_mutexes_or_rcu_locked(); 3297 cgroup_assert_mutex_or_rcu_locked();
2758 3298
2759 /* if first iteration, visit leftmost descendant which may be @root */ 3299 /* if first iteration, visit leftmost descendant which may be @root */
2760 if (!pos) 3300 if (!pos)
@@ -2765,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2765 return NULL; 3305 return NULL;
2766 3306
2767 /* if there's an unvisited sibling, visit its leftmost descendant */ 3307 /* if there's an unvisited sibling, visit its leftmost descendant */
2768 next = css_next_child(pos, css_parent(pos)); 3308 next = css_next_child(pos, pos->parent);
2769 if (next) 3309 if (next)
2770 return css_leftmost_descendant(next); 3310 return css_leftmost_descendant(next);
2771 3311
2772 /* no sibling left, visit parent */ 3312 /* no sibling left, visit parent */
2773 return css_parent(pos); 3313 return pos->parent;
3314}
3315
3316/**
3317 * css_has_online_children - does a css have online children
3318 * @css: the target css
3319 *
3320 * Returns %true if @css has any online children; otherwise, %false. This
3321 * function can be called from any context but the caller is responsible
3322 * for synchronizing against on/offlining as necessary.
3323 */
3324bool css_has_online_children(struct cgroup_subsys_state *css)
3325{
3326 struct cgroup_subsys_state *child;
3327 bool ret = false;
3328
3329 rcu_read_lock();
3330 css_for_each_child(child, css) {
3331 if (child->flags & CSS_ONLINE) {
3332 ret = true;
3333 break;
3334 }
3335 }
3336 rcu_read_unlock();
3337 return ret;
2774} 3338}
2775 3339
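A typical use of css_has_online_children(), shown here only as an illustration and not taken from this patch, is refusing an operation while a cgroup's self css still has live children:

	if (css_has_online_children(&cgrp->self))
		return -EBUSY;	/* some child cgroup is still online */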
2776/** 3340/**
@@ -2781,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2781 */ 3345 */
2782static void css_advance_task_iter(struct css_task_iter *it) 3346static void css_advance_task_iter(struct css_task_iter *it)
2783{ 3347{
2784 struct list_head *l = it->cset_link; 3348 struct list_head *l = it->cset_pos;
2785 struct cgrp_cset_link *link; 3349 struct cgrp_cset_link *link;
2786 struct css_set *cset; 3350 struct css_set *cset;
2787 3351
2788 /* Advance to the next non-empty css_set */ 3352 /* Advance to the next non-empty css_set */
2789 do { 3353 do {
2790 l = l->next; 3354 l = l->next;
2791 if (l == &it->origin_css->cgroup->cset_links) { 3355 if (l == it->cset_head) {
2792 it->cset_link = NULL; 3356 it->cset_pos = NULL;
2793 return; 3357 return;
2794 } 3358 }
2795 link = list_entry(l, struct cgrp_cset_link, cset_link); 3359
2796 cset = link->cset; 3360 if (it->ss) {
3361 cset = container_of(l, struct css_set,
3362 e_cset_node[it->ss->id]);
3363 } else {
3364 link = list_entry(l, struct cgrp_cset_link, cset_link);
3365 cset = link->cset;
3366 }
2797 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3367 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2798 3368
2799 it->cset_link = l; 3369 it->cset_pos = l;
2800 3370
2801 if (!list_empty(&cset->tasks)) 3371 if (!list_empty(&cset->tasks))
2802 it->task = cset->tasks.next; 3372 it->task_pos = cset->tasks.next;
2803 else 3373 else
2804 it->task = cset->mg_tasks.next; 3374 it->task_pos = cset->mg_tasks.next;
3375
3376 it->tasks_head = &cset->tasks;
3377 it->mg_tasks_head = &cset->mg_tasks;
2805} 3378}
2806 3379
2807/** 3380/**
@@ -2827,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2827 3400
2828 down_read(&css_set_rwsem); 3401 down_read(&css_set_rwsem);
2829 3402
2830 it->origin_css = css; 3403 it->ss = css->ss;
2831 it->cset_link = &css->cgroup->cset_links; 3404
3405 if (it->ss)
3406 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3407 else
3408 it->cset_pos = &css->cgroup->cset_links;
3409
3410 it->cset_head = it->cset_pos;
2832 3411
2833 css_advance_task_iter(it); 3412 css_advance_task_iter(it);
2834} 3413}
@@ -2844,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2844struct task_struct *css_task_iter_next(struct css_task_iter *it) 3423struct task_struct *css_task_iter_next(struct css_task_iter *it)
2845{ 3424{
2846 struct task_struct *res; 3425 struct task_struct *res;
2847 struct list_head *l = it->task; 3426 struct list_head *l = it->task_pos;
2848 struct cgrp_cset_link *link = list_entry(it->cset_link,
2849 struct cgrp_cset_link, cset_link);
2850 3427
2851 /* If the iterator cg is NULL, we have no tasks */ 3428 /* If the iterator cg is NULL, we have no tasks */
2852 if (!it->cset_link) 3429 if (!it->cset_pos)
2853 return NULL; 3430 return NULL;
2854 res = list_entry(l, struct task_struct, cg_list); 3431 res = list_entry(l, struct task_struct, cg_list);
2855 3432
@@ -2860,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
2860 */ 3437 */
2861 l = l->next; 3438 l = l->next;
2862 3439
2863 if (l == &link->cset->tasks) 3440 if (l == it->tasks_head)
2864 l = link->cset->mg_tasks.next; 3441 l = it->mg_tasks_head->next;
2865 3442
2866 if (l == &link->cset->mg_tasks) 3443 if (l == it->mg_tasks_head)
2867 css_advance_task_iter(it); 3444 css_advance_task_iter(it);
2868 else 3445 else
2869 it->task = l; 3446 it->task_pos = l;
2870 3447
2871 return res; 3448 return res;
2872} 3449}
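The reworked iterator keeps its position in it->cset_pos/it->task_pos and is always bracketed by css_task_iter_start() and css_task_iter_end(), with css_set_rwsem held for read in between. A kernel-context sketch of the usual loop follows; the same pattern appears in cgroup_transfer_tasks() and pidlist_array_load() below, and example_count_tasks is an illustrative name.

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Illustrative only: count every task attached to @css. */
static unsigned int example_count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	unsigned int count = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		count++;
	css_task_iter_end(&it);

	return count;
}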
@@ -2919,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2919 * ->can_attach() fails. 3496 * ->can_attach() fails.
2920 */ 3497 */
2921 do { 3498 do {
2922 css_task_iter_start(&from->dummy_css, &it); 3499 css_task_iter_start(&from->self, &it);
2923 task = css_task_iter_next(&it); 3500 task = css_task_iter_next(&it);
2924 if (task) 3501 if (task)
2925 get_task_struct(task); 3502 get_task_struct(task);
@@ -3184,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3184 if (!array) 3761 if (!array)
3185 return -ENOMEM; 3762 return -ENOMEM;
3186 /* now, populate the array */ 3763 /* now, populate the array */
3187 css_task_iter_start(&cgrp->dummy_css, &it); 3764 css_task_iter_start(&cgrp->self, &it);
3188 while ((tsk = css_task_iter_next(&it))) { 3765 while ((tsk = css_task_iter_next(&it))) {
3189 if (unlikely(n == length)) 3766 if (unlikely(n == length))
3190 break; 3767 break;
@@ -3246,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3246 3823
3247 /* 3824 /*
3248 * We aren't being called from kernfs and there's no guarantee on 3825 * We aren't being called from kernfs and there's no guarantee on
3249 * @kn->priv's validity. For this and css_tryget_from_dir(), 3826 * @kn->priv's validity. For this and css_tryget_online_from_dir(),
3250 * @kn->priv is RCU safe. Let's do the RCU dancing. 3827 * @kn->priv is RCU safe. Let's do the RCU dancing.
3251 */ 3828 */
3252 rcu_read_lock(); 3829 rcu_read_lock();
@@ -3258,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3258 } 3835 }
3259 rcu_read_unlock(); 3836 rcu_read_unlock();
3260 3837
3261 css_task_iter_start(&cgrp->dummy_css, &it); 3838 css_task_iter_start(&cgrp->self, &it);
3262 while ((tsk = css_task_iter_next(&it))) { 3839 while ((tsk = css_task_iter_next(&it))) {
3263 switch (tsk->state) { 3840 switch (tsk->state) {
3264 case TASK_RUNNING: 3841 case TASK_RUNNING:
@@ -3388,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
3388 return seq_printf(s, "%d\n", *(int *)v); 3965 return seq_printf(s, "%d\n", *(int *)v);
3389} 3966}
3390 3967
3391/*
3392 * seq_operations functions for iterating on pidlists through seq_file -
3393 * independent of whether it's tasks or procs
3394 */
3395static const struct seq_operations cgroup_pidlist_seq_operations = {
3396 .start = cgroup_pidlist_start,
3397 .stop = cgroup_pidlist_stop,
3398 .next = cgroup_pidlist_next,
3399 .show = cgroup_pidlist_show,
3400};
3401
3402static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3968static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3403 struct cftype *cft) 3969 struct cftype *cft)
3404{ 3970{
@@ -3440,7 +4006,7 @@ static struct cftype cgroup_base_files[] = {
3440 .seq_stop = cgroup_pidlist_stop, 4006 .seq_stop = cgroup_pidlist_stop,
3441 .seq_show = cgroup_pidlist_show, 4007 .seq_show = cgroup_pidlist_show,
3442 .private = CGROUP_FILE_PROCS, 4008 .private = CGROUP_FILE_PROCS,
3443 .write_u64 = cgroup_procs_write, 4009 .write = cgroup_procs_write,
3444 .mode = S_IRUGO | S_IWUSR, 4010 .mode = S_IRUGO | S_IWUSR,
3445 }, 4011 },
3446 { 4012 {
@@ -3454,6 +4020,27 @@ static struct cftype cgroup_base_files[] = {
3454 .flags = CFTYPE_ONLY_ON_ROOT, 4020 .flags = CFTYPE_ONLY_ON_ROOT,
3455 .seq_show = cgroup_sane_behavior_show, 4021 .seq_show = cgroup_sane_behavior_show,
3456 }, 4022 },
4023 {
4024 .name = "cgroup.controllers",
4025 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
4026 .seq_show = cgroup_root_controllers_show,
4027 },
4028 {
4029 .name = "cgroup.controllers",
4030 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4031 .seq_show = cgroup_controllers_show,
4032 },
4033 {
4034 .name = "cgroup.subtree_control",
4035 .flags = CFTYPE_ONLY_ON_DFL,
4036 .seq_show = cgroup_subtree_control_show,
4037 .write = cgroup_subtree_control_write,
4038 },
4039 {
4040 .name = "cgroup.populated",
4041 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4042 .seq_show = cgroup_populated_show,
4043 },
3457 4044
3458 /* 4045 /*
3459 * Historical crazy stuff. These don't have "cgroup." prefix and 4046 * Historical crazy stuff. These don't have "cgroup." prefix and
@@ -3468,7 +4055,7 @@ static struct cftype cgroup_base_files[] = {
3468 .seq_stop = cgroup_pidlist_stop, 4055 .seq_stop = cgroup_pidlist_stop,
3469 .seq_show = cgroup_pidlist_show, 4056 .seq_show = cgroup_pidlist_show,
3470 .private = CGROUP_FILE_TASKS, 4057 .private = CGROUP_FILE_TASKS,
3471 .write_u64 = cgroup_tasks_write, 4058 .write = cgroup_tasks_write,
3472 .mode = S_IRUGO | S_IWUSR, 4059 .mode = S_IRUGO | S_IWUSR,
3473 }, 4060 },
3474 { 4061 {
@@ -3481,7 +4068,7 @@ static struct cftype cgroup_base_files[] = {
3481 .name = "release_agent", 4068 .name = "release_agent",
3482 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4069 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3483 .seq_show = cgroup_release_agent_show, 4070 .seq_show = cgroup_release_agent_show,
3484 .write_string = cgroup_release_agent_write, 4071 .write = cgroup_release_agent_write,
3485 .max_write_len = PATH_MAX - 1, 4072 .max_write_len = PATH_MAX - 1,
3486 }, 4073 },
3487 { } /* terminate */ 4074 { } /* terminate */
@@ -3494,7 +4081,7 @@ static struct cftype cgroup_base_files[] = {
3494 * 4081 *
3495 * On failure, no file is added. 4082 * On failure, no file is added.
3496 */ 4083 */
3497static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 4084static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
3498{ 4085{
3499 struct cgroup_subsys *ss; 4086 struct cgroup_subsys *ss;
3500 int i, ret = 0; 4087 int i, ret = 0;
@@ -3503,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3503 for_each_subsys(ss, i) { 4090 for_each_subsys(ss, i) {
3504 struct cftype *cfts; 4091 struct cftype *cfts;
3505 4092
3506 if (!test_bit(i, &subsys_mask)) 4093 if (!(subsys_mask & (1 << i)))
3507 continue; 4094 continue;
3508 4095
3509 list_for_each_entry(cfts, &ss->cfts, node) { 4096 list_for_each_entry(cfts, &ss->cfts, node) {
@@ -3525,9 +4112,9 @@ err:
3525 * Implemented in kill_css(). 4112 * Implemented in kill_css().
3526 * 4113 *
3527 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 4114 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3528 * and thus css_tryget() is guaranteed to fail, the css can be offlined 4115 * and thus css_tryget_online() is guaranteed to fail, the css can be
3529 * by invoking offline_css(). After offlining, the base ref is put. 4116 * offlined by invoking offline_css(). After offlining, the base ref is
3530 * Implemented in css_killed_work_fn(). 4117 * put. Implemented in css_killed_work_fn().
3531 * 4118 *
3532 * 3. When the percpu_ref reaches zero, the only possible remaining 4119 * 3. When the percpu_ref reaches zero, the only possible remaining
3533 * accessors are inside RCU read sections. css_release() schedules the 4120 * accessors are inside RCU read sections. css_release() schedules the
@@ -3546,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work)
3546 container_of(work, struct cgroup_subsys_state, destroy_work); 4133 container_of(work, struct cgroup_subsys_state, destroy_work);
3547 struct cgroup *cgrp = css->cgroup; 4134 struct cgroup *cgrp = css->cgroup;
3548 4135
3549 if (css->parent) 4136 if (css->ss) {
3550 css_put(css->parent); 4137 /* css free path */
4138 if (css->parent)
4139 css_put(css->parent);
3551 4140
3552 css->ss->css_free(css); 4141 css->ss->css_free(css);
3553 cgroup_put(cgrp); 4142 cgroup_put(cgrp);
4143 } else {
4144 /* cgroup free path */
4145 atomic_dec(&cgrp->root->nr_cgrps);
4146 cgroup_pidlist_destroy_all(cgrp);
4147
4148 if (cgroup_parent(cgrp)) {
4149 /*
4150 * We get a ref to the parent, and put the ref when
4151 * this cgroup is being freed, so it's guaranteed
4152 * that the parent won't be destroyed before its
4153 * children.
4154 */
4155 cgroup_put(cgroup_parent(cgrp));
4156 kernfs_put(cgrp->kn);
4157 kfree(cgrp);
4158 } else {
4159 /*
4160 * This is the root cgroup's refcnt reaching zero,
4161 * which indicates that the root should be
4162 * released.
4163 */
4164 cgroup_destroy_root(cgrp->root);
4165 }
4166 }
3554} 4167}
3555 4168
3556static void css_free_rcu_fn(struct rcu_head *rcu_head) 4169static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -3562,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
3562 queue_work(cgroup_destroy_wq, &css->destroy_work); 4175 queue_work(cgroup_destroy_wq, &css->destroy_work);
3563} 4176}
3564 4177
4178static void css_release_work_fn(struct work_struct *work)
4179{
4180 struct cgroup_subsys_state *css =
4181 container_of(work, struct cgroup_subsys_state, destroy_work);
4182 struct cgroup_subsys *ss = css->ss;
4183 struct cgroup *cgrp = css->cgroup;
4184
4185 mutex_lock(&cgroup_mutex);
4186
4187 css->flags |= CSS_RELEASED;
4188 list_del_rcu(&css->sibling);
4189
4190 if (ss) {
4191 /* css release path */
4192 cgroup_idr_remove(&ss->css_idr, css->id);
4193 } else {
4194 /* cgroup release path */
4195 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4196 cgrp->id = -1;
4197 }
4198
4199 mutex_unlock(&cgroup_mutex);
4200
4201 call_rcu(&css->rcu_head, css_free_rcu_fn);
4202}
4203
3565static void css_release(struct percpu_ref *ref) 4204static void css_release(struct percpu_ref *ref)
3566{ 4205{
3567 struct cgroup_subsys_state *css = 4206 struct cgroup_subsys_state *css =
3568 container_of(ref, struct cgroup_subsys_state, refcnt); 4207 container_of(ref, struct cgroup_subsys_state, refcnt);
3569 4208
3570 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 4209 INIT_WORK(&css->destroy_work, css_release_work_fn);
3571 call_rcu(&css->rcu_head, css_free_rcu_fn); 4210 queue_work(cgroup_destroy_wq, &css->destroy_work);
3572} 4211}
3573 4212
3574static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 4213static void init_and_link_css(struct cgroup_subsys_state *css,
3575 struct cgroup *cgrp) 4214 struct cgroup_subsys *ss, struct cgroup *cgrp)
3576{ 4215{
4216 lockdep_assert_held(&cgroup_mutex);
4217
4218 cgroup_get(cgrp);
4219
4220 memset(css, 0, sizeof(*css));
3577 css->cgroup = cgrp; 4221 css->cgroup = cgrp;
3578 css->ss = ss; 4222 css->ss = ss;
3579 css->flags = 0; 4223 INIT_LIST_HEAD(&css->sibling);
4224 INIT_LIST_HEAD(&css->children);
4225 css->serial_nr = css_serial_nr_next++;
3580 4226
3581 if (cgrp->parent) 4227 if (cgroup_parent(cgrp)) {
3582 css->parent = cgroup_css(cgrp->parent, ss); 4228 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
3583 else 4229 css_get(css->parent);
3584 css->flags |= CSS_ROOT; 4230 }
3585 4231
3586 BUG_ON(cgroup_css(cgrp, ss)); 4232 BUG_ON(cgroup_css(cgrp, ss));
3587} 4233}
@@ -3592,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css)
3592 struct cgroup_subsys *ss = css->ss; 4238 struct cgroup_subsys *ss = css->ss;
3593 int ret = 0; 4239 int ret = 0;
3594 4240
3595 lockdep_assert_held(&cgroup_tree_mutex);
3596 lockdep_assert_held(&cgroup_mutex); 4241 lockdep_assert_held(&cgroup_mutex);
3597 4242
3598 if (ss->css_online) 4243 if (ss->css_online)
3599 ret = ss->css_online(css); 4244 ret = ss->css_online(css);
3600 if (!ret) { 4245 if (!ret) {
3601 css->flags |= CSS_ONLINE; 4246 css->flags |= CSS_ONLINE;
3602 css->cgroup->nr_css++;
3603 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 4247 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3604 } 4248 }
3605 return ret; 4249 return ret;
@@ -3610,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css)
3610{ 4254{
3611 struct cgroup_subsys *ss = css->ss; 4255 struct cgroup_subsys *ss = css->ss;
3612 4256
3613 lockdep_assert_held(&cgroup_tree_mutex);
3614 lockdep_assert_held(&cgroup_mutex); 4257 lockdep_assert_held(&cgroup_mutex);
3615 4258
3616 if (!(css->flags & CSS_ONLINE)) 4259 if (!(css->flags & CSS_ONLINE))
@@ -3620,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css)
3620 ss->css_offline(css); 4263 ss->css_offline(css);
3621 4264
3622 css->flags &= ~CSS_ONLINE; 4265 css->flags &= ~CSS_ONLINE;
3623 css->cgroup->nr_css--; 4266 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3624 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); 4267
4268 wake_up_all(&css->cgroup->offline_waitq);
3625} 4269}
3626 4270
3627/** 4271/**
@@ -3635,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css)
3635 */ 4279 */
3636static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4280static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3637{ 4281{
3638 struct cgroup *parent = cgrp->parent; 4282 struct cgroup *parent = cgroup_parent(cgrp);
4283 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
3639 struct cgroup_subsys_state *css; 4284 struct cgroup_subsys_state *css;
3640 int err; 4285 int err;
3641 4286
3642 lockdep_assert_held(&cgroup_mutex); 4287 lockdep_assert_held(&cgroup_mutex);
3643 4288
3644 css = ss->css_alloc(cgroup_css(parent, ss)); 4289 css = ss->css_alloc(parent_css);
3645 if (IS_ERR(css)) 4290 if (IS_ERR(css))
3646 return PTR_ERR(css); 4291 return PTR_ERR(css);
3647 4292
4293 init_and_link_css(css, ss, cgrp);
4294
3648 err = percpu_ref_init(&css->refcnt, css_release); 4295 err = percpu_ref_init(&css->refcnt, css_release);
3649 if (err) 4296 if (err)
3650 goto err_free_css; 4297 goto err_free_css;
3651 4298
3652 init_css(css, ss, cgrp); 4299 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4300 if (err < 0)
4301 goto err_free_percpu_ref;
4302 css->id = err;
3653 4303
3654 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4304 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3655 if (err) 4305 if (err)
3656 goto err_free_percpu_ref; 4306 goto err_free_id;
4307
4308 /* @css is ready to be brought online now, make it visible */
4309 list_add_tail_rcu(&css->sibling, &parent_css->children);
4310 cgroup_idr_replace(&ss->css_idr, css, css->id);
3657 4311
3658 err = online_css(css); 4312 err = online_css(css);
3659 if (err) 4313 if (err)
3660 goto err_clear_dir; 4314 goto err_list_del;
3661
3662 cgroup_get(cgrp);
3663 css_get(css->parent);
3664
3665 cgrp->subsys_mask |= 1 << ss->id;
3666 4315
3667 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4316 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3668 parent->parent) { 4317 cgroup_parent(parent)) {
3669 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4318 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3670 current->comm, current->pid, ss->name); 4319 current->comm, current->pid, ss->name);
3671 if (!strcmp(ss->name, "memory")) 4320 if (!strcmp(ss->name, "memory"))
3672 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4321 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
3673 ss->warned_broken_hierarchy = true; 4322 ss->warned_broken_hierarchy = true;
3674 } 4323 }
3675 4324
3676 return 0; 4325 return 0;
3677 4326
3678err_clear_dir: 4327err_list_del:
4328 list_del_rcu(&css->sibling);
3679 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4329 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4330err_free_id:
4331 cgroup_idr_remove(&ss->css_idr, css->id);
3680err_free_percpu_ref: 4332err_free_percpu_ref:
3681 percpu_ref_cancel_init(&css->refcnt); 4333 percpu_ref_cancel_init(&css->refcnt);
3682err_free_css: 4334err_free_css:
3683 ss->css_free(css); 4335 call_rcu(&css->rcu_head, css_free_rcu_fn);
3684 return err; 4336 return err;
3685} 4337}
3686 4338
3687/** 4339static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3688 * cgroup_create - create a cgroup 4340 umode_t mode)
3689 * @parent: cgroup that will be parent of the new cgroup
3690 * @name: name of the new cgroup
3691 * @mode: mode to set on new cgroup
3692 */
3693static long cgroup_create(struct cgroup *parent, const char *name,
3694 umode_t mode)
3695{ 4341{
3696 struct cgroup *cgrp; 4342 struct cgroup *parent, *cgrp;
3697 struct cgroup_root *root = parent->root; 4343 struct cgroup_root *root;
3698 int ssid, err;
3699 struct cgroup_subsys *ss; 4344 struct cgroup_subsys *ss;
3700 struct kernfs_node *kn; 4345 struct kernfs_node *kn;
4346 int ssid, ret;
3701 4347
3702 /* 4348 parent = cgroup_kn_lock_live(parent_kn);
3703 * XXX: The default hierarchy isn't fully implemented yet. Block 4349 if (!parent)
3704 * !root cgroup creation on it for now. 4350 return -ENODEV;
3705 */ 4351 root = parent->root;
3706 if (root == &cgrp_dfl_root)
3707 return -EINVAL;
3708 4352
3709 /* allocate the cgroup and its ID, 0 is reserved for the root */ 4353 /* allocate the cgroup and its ID, 0 is reserved for the root */
3710 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4354 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3711 if (!cgrp) 4355 if (!cgrp) {
3712 return -ENOMEM; 4356 ret = -ENOMEM;
3713 4357 goto out_unlock;
3714 mutex_lock(&cgroup_tree_mutex);
3715
3716 /*
3717 * Only live parents can have children. Note that the liveliness
3718 * check isn't strictly necessary because cgroup_mkdir() and
3719 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3720 * anyway so that locking is contained inside cgroup proper and we
3721 * don't get nasty surprises if we ever grow another caller.
3722 */
3723 if (!cgroup_lock_live_group(parent)) {
3724 err = -ENODEV;
3725 goto err_unlock_tree;
3726 } 4358 }
3727 4359
4360 ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
4361 if (ret)
4362 goto out_free_cgrp;
4363
3728 /* 4364 /*
3729 * Temporarily set the pointer to NULL, so idr_find() won't return 4365 * Temporarily set the pointer to NULL, so idr_find() won't return
3730 * a half-baked cgroup. 4366 * a half-baked cgroup.
3731 */ 4367 */
3732 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 4368 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
3733 if (cgrp->id < 0) { 4369 if (cgrp->id < 0) {
3734 err = -ENOMEM; 4370 ret = -ENOMEM;
3735 goto err_unlock; 4371 goto out_cancel_ref;
3736 } 4372 }
3737 4373
3738 init_cgroup_housekeeping(cgrp); 4374 init_cgroup_housekeeping(cgrp);
3739 4375
3740 cgrp->parent = parent; 4376 cgrp->self.parent = &parent->self;
3741 cgrp->dummy_css.parent = &parent->dummy_css; 4377 cgrp->root = root;
3742 cgrp->root = parent->root;
3743 4378
3744 if (notify_on_release(parent)) 4379 if (notify_on_release(parent))
3745 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4380 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -3750,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3750 /* create the directory */ 4385 /* create the directory */
3751 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 4386 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3752 if (IS_ERR(kn)) { 4387 if (IS_ERR(kn)) {
3753 err = PTR_ERR(kn); 4388 ret = PTR_ERR(kn);
3754 goto err_free_id; 4389 goto out_free_id;
3755 } 4390 }
3756 cgrp->kn = kn; 4391 cgrp->kn = kn;
3757 4392
@@ -3761,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3761 */ 4396 */
3762 kernfs_get(kn); 4397 kernfs_get(kn);
3763 4398
3764 cgrp->serial_nr = cgroup_serial_nr_next++; 4399 cgrp->self.serial_nr = css_serial_nr_next++;
3765 4400
3766 /* allocation complete, commit to creation */ 4401 /* allocation complete, commit to creation */
3767 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4402 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
3768 atomic_inc(&root->nr_cgrps); 4403 atomic_inc(&root->nr_cgrps);
3769 cgroup_get(parent); 4404 cgroup_get(parent);
3770 4405
@@ -3772,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3772 * @cgrp is now fully operational. If something fails after this 4407 * @cgrp is now fully operational. If something fails after this
3773 * point, it'll be released via the normal destruction path. 4408 * point, it'll be released via the normal destruction path.
3774 */ 4409 */
3775 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4410 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3776 4411
3777 err = cgroup_kn_set_ugid(kn); 4412 ret = cgroup_kn_set_ugid(kn);
3778 if (err) 4413 if (ret)
3779 goto err_destroy; 4414 goto out_destroy;
3780 4415
3781 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4416 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3782 if (err) 4417 if (ret)
3783 goto err_destroy; 4418 goto out_destroy;
3784 4419
3785 /* let's create and online css's */ 4420 /* let's create and online css's */
3786 for_each_subsys(ss, ssid) { 4421 for_each_subsys(ss, ssid) {
3787 if (root->cgrp.subsys_mask & (1 << ssid)) { 4422 if (parent->child_subsys_mask & (1 << ssid)) {
3788 err = create_css(cgrp, ss); 4423 ret = create_css(cgrp, ss);
3789 if (err) 4424 if (ret)
3790 goto err_destroy; 4425 goto out_destroy;
3791 } 4426 }
3792 } 4427 }
3793 4428
3794 kernfs_activate(kn); 4429 /*
4430 * On the default hierarchy, a child doesn't automatically inherit
4431 * child_subsys_mask from the parent. Each is configured manually.
4432 */
4433 if (!cgroup_on_dfl(cgrp))
4434 cgrp->child_subsys_mask = parent->child_subsys_mask;
3795 4435
3796 mutex_unlock(&cgroup_mutex); 4436 kernfs_activate(kn);
3797 mutex_unlock(&cgroup_tree_mutex);
3798 4437
3799 return 0; 4438 ret = 0;
4439 goto out_unlock;
3800 4440
3801err_free_id: 4441out_free_id:
3802 idr_remove(&root->cgroup_idr, cgrp->id); 4442 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
3803err_unlock: 4443out_cancel_ref:
3804 mutex_unlock(&cgroup_mutex); 4444 percpu_ref_cancel_init(&cgrp->self.refcnt);
3805err_unlock_tree: 4445out_free_cgrp:
3806 mutex_unlock(&cgroup_tree_mutex);
3807 kfree(cgrp); 4446 kfree(cgrp);
3808 return err; 4447out_unlock:
4448 cgroup_kn_unlock(parent_kn);
4449 return ret;
3809 4450
3810err_destroy: 4451out_destroy:
3811 cgroup_destroy_locked(cgrp); 4452 cgroup_destroy_locked(cgrp);
3812 mutex_unlock(&cgroup_mutex); 4453 goto out_unlock;
3813 mutex_unlock(&cgroup_tree_mutex);
3814 return err;
3815}
3816
3817static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3818 umode_t mode)
3819{
3820 struct cgroup *parent = parent_kn->priv;
3821 int ret;
3822
3823 /*
3824 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3825 * kernfs active_ref and cgroup_create() already synchronizes
3826 * properly against removal through cgroup_lock_live_group().
3827 * Break it before calling cgroup_create().
3828 */
3829 cgroup_get(parent);
3830 kernfs_break_active_protection(parent_kn);
3831
3832 ret = cgroup_create(parent, name, mode);
3833
3834 kernfs_unbreak_active_protection(parent_kn);
3835 cgroup_put(parent);
3836 return ret;
3837} 4454}
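cgroup_mkdir() above is the kernfs mkdir callback, so from userspace a child cgroup is created simply by calling mkdir(2) inside a mounted cgroup hierarchy. The sketch below illustrates that; the mount point and the cgroup name are assumptions for illustration and differ between systems.

/* Create a child cgroup via mkdir(2); kernel-side this lands in cgroup_mkdir(). */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/unified/example";	/* assumed mount point */

	if (mkdir(path, 0755) && errno != EEXIST) {
		fprintf(stderr, "mkdir %s: %s\n", path, strerror(errno));
		return 1;
	}
	printf("created cgroup %s\n", path);
	return 0;
}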
3838 4455
3839/* 4456/*
3840 * This is called when the refcnt of a css is confirmed to be killed. 4457 * This is called when the refcnt of a css is confirmed to be killed.
3841 * css_tryget() is now guaranteed to fail. 4458 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
 4459 * initiate destruction and put the css ref from kill_css().
3842 */ 4460 */
3843static void css_killed_work_fn(struct work_struct *work) 4461static void css_killed_work_fn(struct work_struct *work)
3844{ 4462{
3845 struct cgroup_subsys_state *css = 4463 struct cgroup_subsys_state *css =
3846 container_of(work, struct cgroup_subsys_state, destroy_work); 4464 container_of(work, struct cgroup_subsys_state, destroy_work);
3847 struct cgroup *cgrp = css->cgroup;
3848 4465
3849 mutex_lock(&cgroup_tree_mutex);
3850 mutex_lock(&cgroup_mutex); 4466 mutex_lock(&cgroup_mutex);
3851
3852 /*
3853 * css_tryget() is guaranteed to fail now. Tell subsystems to
3854 * initate destruction.
3855 */
3856 offline_css(css); 4467 offline_css(css);
3857
3858 /*
3859 * If @cgrp is marked dead, it's waiting for refs of all css's to
3860 * be disabled before proceeding to the second phase of cgroup
3861 * destruction. If we are the last one, kick it off.
3862 */
3863 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3864 cgroup_destroy_css_killed(cgrp);
3865
3866 mutex_unlock(&cgroup_mutex); 4468 mutex_unlock(&cgroup_mutex);
3867 mutex_unlock(&cgroup_tree_mutex);
3868 4469
3869 /*
3870 * Put the css refs from kill_css(). Each css holds an extra
3871 * reference to the cgroup's dentry and cgroup removal proceeds
3872 * regardless of css refs. On the last put of each css, whenever
3873 * that may be, the extra dentry ref is put so that dentry
3874 * destruction happens only after all css's are released.
3875 */
3876 css_put(css); 4470 css_put(css);
3877} 4471}
3878 4472
@@ -3886,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
3886 queue_work(cgroup_destroy_wq, &css->destroy_work); 4480 queue_work(cgroup_destroy_wq, &css->destroy_work);
3887} 4481}
3888 4482
3889static void __kill_css(struct cgroup_subsys_state *css) 4483/**
4484 * kill_css - destroy a css
4485 * @css: css to destroy
4486 *
4487 * This function initiates destruction of @css by removing cgroup interface
4488 * files and putting its base reference. ->css_offline() will be invoked
4489 * asynchronously once css_tryget_online() is guaranteed to fail and when
4490 * the reference count reaches zero, @css will be released.
4491 */
4492static void kill_css(struct cgroup_subsys_state *css)
3890{ 4493{
3891 lockdep_assert_held(&cgroup_tree_mutex); 4494 lockdep_assert_held(&cgroup_mutex);
3892 4495
3893 /* 4496 /*
3894 * This must happen before css is disassociated with its cgroup. 4497 * This must happen before css is disassociated with its cgroup.
@@ -3905,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css)
3905 /* 4508 /*
3906 * cgroup core guarantees that, by the time ->css_offline() is 4509 * cgroup core guarantees that, by the time ->css_offline() is
3907 * invoked, no new css reference will be given out via 4510 * invoked, no new css reference will be given out via
3908 * css_tryget(). We can't simply call percpu_ref_kill() and 4511 * css_tryget_online(). We can't simply call percpu_ref_kill() and
3909 * proceed to offlining css's because percpu_ref_kill() doesn't 4512 * proceed to offlining css's because percpu_ref_kill() doesn't
3910 * guarantee that the ref is seen as killed on all CPUs on return. 4513 * guarantee that the ref is seen as killed on all CPUs on return.
3911 * 4514 *
@@ -3916,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css)
3916} 4519}
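kill_css() relies on killing the percpu ref and waiting for the confirmation callback (css_killed_ref_fn() above, which queues css_killed_work_fn()) so that css_tryget_online() is guaranteed to fail before ->css_offline() runs, and the final put then releases the css. The sketch below is a much simplified userspace analogue of that life cycle built on one C11 atomic counter with a DEAD bit; it deliberately omits the per-CPU confirmation that percpu_ref needs and only illustrates "get fails after kill, release fires on the last put".

/*
 * Simplified analogue of the kill/tryget/put pattern: a plain atomic counter
 * with a DEAD bit instead of the kernel's percpu_ref.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define REF_DEAD	(1u << 31)

struct ref {
	atomic_uint	 cnt;				/* low bits: count, top bit: dead */
	void		(*release)(struct ref *ref);
};

static void ref_init(struct ref *ref, void (*release)(struct ref *ref))
{
	atomic_init(&ref->cnt, 1);			/* base reference */
	ref->release = release;
}

static bool ref_tryget(struct ref *ref)
{
	unsigned int v = atomic_load(&ref->cnt);

	do {
		if (v & REF_DEAD)			/* killed: no new refs */
			return false;
	} while (!atomic_compare_exchange_weak(&ref->cnt, &v, v + 1));
	return true;
}

static void ref_put(struct ref *ref)
{
	/* when only the DEAD bit remains after this put, run release */
	if (atomic_fetch_sub(&ref->cnt, 1) == (REF_DEAD | 1))
		ref->release(ref);
}

static void ref_kill(struct ref *ref)
{
	atomic_fetch_or(&ref->cnt, REF_DEAD);		/* tryget now fails */
	ref_put(ref);					/* drop the base reference */
}

static void release_fn(struct ref *ref)
{
	(void)ref;
	puts("released");
}

int main(void)
{
	struct ref ref;

	ref_init(&ref, release_fn);
	ref_tryget(&ref);				/* an extra user reference */
	ref_kill(&ref);					/* like kill_css(): no new gets */
	printf("tryget after kill: %d\n", ref_tryget(&ref));	/* prints 0 */
	ref_put(&ref);					/* last put -> release_fn() */
	return 0;
}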
3917 4520
3918/** 4521/**
3919 * kill_css - destroy a css
3920 * @css: css to destroy
3921 *
3922 * This function initiates destruction of @css by removing cgroup interface
3923 * files and putting its base reference. ->css_offline() will be invoked
3924 * asynchronously once css_tryget() is guaranteed to fail and when the
3925 * reference count reaches zero, @css will be released.
3926 */
3927static void kill_css(struct cgroup_subsys_state *css)
3928{
3929 struct cgroup *cgrp = css->cgroup;
3930
3931 lockdep_assert_held(&cgroup_tree_mutex);
3932
3933 /* if already killed, noop */
3934 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3935 cgrp->subsys_mask &= ~(1 << css->ss->id);
3936 __kill_css(css);
3937 }
3938}
3939
3940/**
3941 * cgroup_destroy_locked - the first stage of cgroup destruction 4522 * cgroup_destroy_locked - the first stage of cgroup destruction
3942 * @cgrp: cgroup to be destroyed 4523 * @cgrp: cgroup to be destroyed
3943 * 4524 *
3944 * css's make use of percpu refcnts whose killing latency shouldn't be 4525 * css's make use of percpu refcnts whose killing latency shouldn't be
3945 * exposed to userland and are RCU protected. Also, cgroup core needs to 4526 * exposed to userland and are RCU protected. Also, cgroup core needs to
3946 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 4527 * guarantee that css_tryget_online() won't succeed by the time
3947 * invoked. To satisfy all the requirements, destruction is implemented in 4528 * ->css_offline() is invoked. To satisfy all the requirements,
3948 * the following two steps. 4529 * destruction is implemented in the following two steps.
3949 * 4530 *
3950 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4531 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
3951 * userland visible parts and start killing the percpu refcnts of 4532 * userland visible parts and start killing the percpu refcnts of
@@ -3964,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css)
3964static int cgroup_destroy_locked(struct cgroup *cgrp) 4545static int cgroup_destroy_locked(struct cgroup *cgrp)
3965 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4546 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
3966{ 4547{
3967 struct cgroup *child;
3968 struct cgroup_subsys_state *css; 4548 struct cgroup_subsys_state *css;
3969 bool empty; 4549 bool empty;
3970 int ssid; 4550 int ssid;
3971 4551
3972 lockdep_assert_held(&cgroup_tree_mutex);
3973 lockdep_assert_held(&cgroup_mutex); 4552 lockdep_assert_held(&cgroup_mutex);
3974 4553
3975 /* 4554 /*
@@ -3983,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
3983 return -EBUSY; 4562 return -EBUSY;
3984 4563
3985 /* 4564 /*
3986 * Make sure there's no live children. We can't test ->children 4565 * Make sure there are no live children. We can't test emptiness of
3987 * emptiness as dead children linger on it while being destroyed; 4566 * ->self.children as dead children linger on it while being
3988 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4567 * drained; otherwise, "rmdir parent/child parent" may fail.
3989 */ 4568 */
3990 empty = true; 4569 if (css_has_online_children(&cgrp->self))
3991 rcu_read_lock();
3992 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3993 empty = cgroup_is_dead(child);
3994 if (!empty)
3995 break;
3996 }
3997 rcu_read_unlock();
3998 if (!empty)
3999 return -EBUSY; 4570 return -EBUSY;
4000 4571
4001 /* 4572 /*
4002 * Mark @cgrp dead. This prevents further task migration and child 4573 * Mark @cgrp dead. This prevents further task migration and child
4003 * creation by disabling cgroup_lock_live_group(). Note that 4574 * creation by disabling cgroup_lock_live_group().
4004 * CGRP_DEAD assertion is depended upon by css_next_child() to
4005 * resume iteration after dropping RCU read lock. See
4006 * css_next_child() for details.
4007 */ 4575 */
4008 set_bit(CGRP_DEAD, &cgrp->flags); 4576 cgrp->self.flags &= ~CSS_ONLINE;
4009 4577
4010 /* 4578 /* initiate massacre of all css's */
4011 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4012 * will be invoked to perform the rest of destruction once the
4013 * percpu refs of all css's are confirmed to be killed. This
4014 * involves removing the subsystem's files, drop cgroup_mutex.
4015 */
4016 mutex_unlock(&cgroup_mutex);
4017 for_each_css(css, ssid, cgrp) 4579 for_each_css(css, ssid, cgrp)
4018 kill_css(css); 4580 kill_css(css);
4019 mutex_lock(&cgroup_mutex);
4020 4581
4021 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4582 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4022 raw_spin_lock(&release_list_lock); 4583 raw_spin_lock(&release_list_lock);
4023 if (!list_empty(&cgrp->release_list)) 4584 if (!list_empty(&cgrp->release_list))
4024 list_del_init(&cgrp->release_list); 4585 list_del_init(&cgrp->release_list);
4025 raw_spin_unlock(&release_list_lock); 4586 raw_spin_unlock(&release_list_lock);
4026 4587
4027 /* 4588 /*
4028 * If @cgrp has css's attached, the second stage of cgroup 4589 * Remove @cgrp directory along with the base files. @cgrp has an
4029 * destruction is kicked off from css_killed_work_fn() after the 4590 * extra ref on its kn.
4030 * refs of all attached css's are killed. If @cgrp doesn't have
4031 * any css, we kick it off here.
4032 */ 4591 */
4033 if (!cgrp->nr_css) 4592 kernfs_remove(cgrp->kn);
4034 cgroup_destroy_css_killed(cgrp);
4035
4036 /* remove @cgrp directory along with the base files */
4037 mutex_unlock(&cgroup_mutex);
4038 4593
4039 /* 4594 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4040 * There are two control paths which try to determine cgroup from 4595 check_for_release(cgroup_parent(cgrp));
4041 * dentry without going through kernfs - cgroupstats_build() and
4042 * css_tryget_from_dir(). Those are supported by RCU protecting
4043 * clearing of cgrp->kn->priv backpointer, which should happen
4044 * after all files under it have been removed.
4045 */
4046 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4047 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4048 4596
4049 mutex_lock(&cgroup_mutex); 4597 /* put the base reference */
4598 percpu_ref_kill(&cgrp->self.refcnt);
4050 4599
4051 return 0; 4600 return 0;
4052}; 4601};
4053 4602
4054/**
4055 * cgroup_destroy_css_killed - the second step of cgroup destruction
4056 * @work: cgroup->destroy_free_work
4057 *
4058 * This function is invoked from a work item for a cgroup which is being
4059 * destroyed after all css's are offlined and performs the rest of
4060 * destruction. This is the second step of destruction described in the
4061 * comment above cgroup_destroy_locked().
4062 */
4063static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4064{
4065 struct cgroup *parent = cgrp->parent;
4066
4067 lockdep_assert_held(&cgroup_tree_mutex);
4068 lockdep_assert_held(&cgroup_mutex);
4069
4070 /* delete this cgroup from parent->children */
4071 list_del_rcu(&cgrp->sibling);
4072
4073 cgroup_put(cgrp);
4074
4075 set_bit(CGRP_RELEASABLE, &parent->flags);
4076 check_for_release(parent);
4077}
4078
4079static int cgroup_rmdir(struct kernfs_node *kn) 4603static int cgroup_rmdir(struct kernfs_node *kn)
4080{ 4604{
4081 struct cgroup *cgrp = kn->priv; 4605 struct cgroup *cgrp;
4082 int ret = 0; 4606 int ret = 0;
4083 4607
4084 /* 4608 cgrp = cgroup_kn_lock_live(kn);
4085 * This is self-destruction but @kn can't be removed while this 4609 if (!cgrp)
4086 * callback is in progress. Let's break active protection. Once 4610 return 0;
4087 * the protection is broken, @cgrp can be destroyed at any point. 4611 cgroup_get(cgrp); /* for @kn->priv clearing */
4088 * Pin it so that it stays accessible.
4089 */
4090 cgroup_get(cgrp);
4091 kernfs_break_active_protection(kn);
4092 4612
4093 mutex_lock(&cgroup_tree_mutex); 4613 ret = cgroup_destroy_locked(cgrp);
4094 mutex_lock(&cgroup_mutex); 4614
4615 cgroup_kn_unlock(kn);
4095 4616
4096 /* 4617 /*
4097 * @cgrp might already have been destroyed while we're trying to 4618 * There are two control paths which try to determine cgroup from
4098 * grab the mutexes. 4619 * dentry without going through kernfs - cgroupstats_build() and
4620 * css_tryget_online_from_dir(). Those are supported by RCU
4621 * protecting clearing of cgrp->kn->priv backpointer, which should
4622 * happen after all files under it have been removed.
4099 */ 4623 */
4100 if (!cgroup_is_dead(cgrp)) 4624 if (!ret)
4101 ret = cgroup_destroy_locked(cgrp); 4625 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4102
4103 mutex_unlock(&cgroup_mutex);
4104 mutex_unlock(&cgroup_tree_mutex);
4105 4626
4106 kernfs_unbreak_active_protection(kn);
4107 cgroup_put(cgrp); 4627 cgroup_put(cgrp);
4108 return ret; 4628 return ret;
4109} 4629}
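cgroup_rmdir() above is what rmdir(2) on a cgroup directory ends up invoking, and cgroup_destroy_locked() refuses with -EBUSY while the group still has tasks attached or online children. A minimal userspace sketch follows; the path is an assumption for illustration.

/* Remove an empty cgroup with rmdir(2); EBUSY means tasks or children remain. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/unified/example";	/* assumed path */

	if (rmdir(path)) {
		if (errno == EBUSY)
			fprintf(stderr, "%s still has tasks or live children\n", path);
		else
			fprintf(stderr, "rmdir %s: %s\n", path, strerror(errno));
		return 1;
	}
	return 0;
}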
@@ -4116,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4116 .rename = cgroup_rename, 4636 .rename = cgroup_rename,
4117}; 4637};
4118 4638
4119static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4639static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4120{ 4640{
4121 struct cgroup_subsys_state *css; 4641 struct cgroup_subsys_state *css;
4122 4642
4123 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4643 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4124 4644
4125 mutex_lock(&cgroup_tree_mutex);
4126 mutex_lock(&cgroup_mutex); 4645 mutex_lock(&cgroup_mutex);
4127 4646
4647 idr_init(&ss->css_idr);
4128 INIT_LIST_HEAD(&ss->cfts); 4648 INIT_LIST_HEAD(&ss->cfts);
4129 4649
4130 /* Create the root cgroup state for this subsystem */ 4650 /* Create the root cgroup state for this subsystem */
@@ -4132,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4132 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4652 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4133 /* We don't handle early failures gracefully */ 4653 /* We don't handle early failures gracefully */
4134 BUG_ON(IS_ERR(css)); 4654 BUG_ON(IS_ERR(css));
4135 init_css(css, ss, &cgrp_dfl_root.cgrp); 4655 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4656
4657 /*
4658 * Root csses are never destroyed and we can't initialize
4659 * percpu_ref during early init. Disable refcnting.
4660 */
4661 css->flags |= CSS_NO_REF;
4662
4663 if (early) {
4664 /* allocation can't be done safely during early init */
4665 css->id = 1;
4666 } else {
4667 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4668 BUG_ON(css->id < 0);
4669 }
4136 4670
4137 /* Update the init_css_set to contain a subsys 4671 /* Update the init_css_set to contain a subsys
4138 * pointer to this state - since the subsystem is 4672 * pointer to this state - since the subsystem is
@@ -4149,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4149 4683
4150 BUG_ON(online_css(css)); 4684 BUG_ON(online_css(css));
4151 4685
4152 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4153
4154 mutex_unlock(&cgroup_mutex); 4686 mutex_unlock(&cgroup_mutex);
4155 mutex_unlock(&cgroup_tree_mutex);
4156} 4687}
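cgroup_init_subsys() cannot allocate from ss->css_idr during early init, so early-boot root csses get the fixed id 1 and are registered in the idr later from cgroup_init(). As a rough illustration of what the idr provides here, the sketch below hands out the lowest unused integer ID starting at 1 and maps it back to a pointer; it is a fixed-size, single-threaded stand-in for illustration only, not the kernel idr API.

/* Fixed-size stand-in for "lowest free integer ID >= 1, mapped to a pointer". */
#include <stddef.h>
#include <stdio.h>

#define MAX_IDS	64

static void *id_table[MAX_IDS];		/* slot 0 intentionally unused */

static int id_alloc(void *ptr)
{
	for (int id = 1; id < MAX_IDS; id++) {
		if (!id_table[id]) {
			id_table[id] = ptr;
			return id;
		}
	}
	return -1;			/* table full */
}

static void *id_find(int id)
{
	return (id > 0 && id < MAX_IDS) ? id_table[id] : NULL;
}

static void id_remove(int id)
{
	if (id > 0 && id < MAX_IDS)
		id_table[id] = NULL;
}

int main(void)
{
	int root = id_alloc("root css");	/* gets 1, like an early-init css */
	int child = id_alloc("child css");	/* gets 2 */

	printf("%d -> %s, %d -> %s\n",
	       root, (char *)id_find(root), child, (char *)id_find(child));
	id_remove(child);
	return 0;
}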
4157 4688
4158/** 4689/**
@@ -4169,6 +4700,8 @@ int __init cgroup_init_early(void)
4169 int i; 4700 int i;
4170 4701
4171 init_cgroup_root(&cgrp_dfl_root, &opts); 4702 init_cgroup_root(&cgrp_dfl_root, &opts);
4703 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4704
4172 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4705 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4173 4706
4174 for_each_subsys(ss, i) { 4707 for_each_subsys(ss, i) {
@@ -4183,7 +4716,7 @@ int __init cgroup_init_early(void)
4183 ss->name = cgroup_subsys_name[i]; 4716 ss->name = cgroup_subsys_name[i];
4184 4717
4185 if (ss->early_init) 4718 if (ss->early_init)
4186 cgroup_init_subsys(ss); 4719 cgroup_init_subsys(ss, true);
4187 } 4720 }
4188 return 0; 4721 return 0;
4189} 4722}
@@ -4202,7 +4735,6 @@ int __init cgroup_init(void)
4202 4735
4203 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4736 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4204 4737
4205 mutex_lock(&cgroup_tree_mutex);
4206 mutex_lock(&cgroup_mutex); 4738 mutex_lock(&cgroup_mutex);
4207 4739
4208 /* Add init_css_set to the hash table */ 4740 /* Add init_css_set to the hash table */
@@ -4212,18 +4744,31 @@ int __init cgroup_init(void)
4212 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4744 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4213 4745
4214 mutex_unlock(&cgroup_mutex); 4746 mutex_unlock(&cgroup_mutex);
4215 mutex_unlock(&cgroup_tree_mutex);
4216 4747
4217 for_each_subsys(ss, ssid) { 4748 for_each_subsys(ss, ssid) {
4218 if (!ss->early_init) 4749 if (ss->early_init) {
4219 cgroup_init_subsys(ss); 4750 struct cgroup_subsys_state *css =
4751 init_css_set.subsys[ss->id];
4752
4753 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4754 GFP_KERNEL);
4755 BUG_ON(css->id < 0);
4756 } else {
4757 cgroup_init_subsys(ss, false);
4758 }
4759
4760 list_add_tail(&init_css_set.e_cset_node[ssid],
4761 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4220 4762
4221 /* 4763 /*
4222 * cftype registration needs kmalloc and can't be done 4764 * Setting dfl_root subsys_mask needs to consider the
4223 * during early_init. Register base cftypes separately. 4765 * disabled flag and cftype registration needs kmalloc,
4766 * both of which aren't available during early_init.
4224 */ 4767 */
4225 if (ss->base_cftypes) 4768 if (!ss->disabled) {
4769 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4226 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4770 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4771 }
4227 } 4772 }
4228 4773
4229 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4774 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4306,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4306 4851
4307 seq_printf(m, "%d:", root->hierarchy_id); 4852 seq_printf(m, "%d:", root->hierarchy_id);
4308 for_each_subsys(ss, ssid) 4853 for_each_subsys(ss, ssid)
4309 if (root->cgrp.subsys_mask & (1 << ssid)) 4854 if (root->subsys_mask & (1 << ssid))
4310 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4855 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4311 if (strlen(root->name)) 4856 if (strlen(root->name))
4312 seq_printf(m, "%sname=%s", count ? "," : "", 4857 seq_printf(m, "%sname=%s", count ? "," : "",
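proc_cgroup_show() above emits one line per hierarchy in /proc/<pid>/cgroup, in the form hierarchy-id:controllers:path (with an optional name= entry for named hierarchies). A small reader for the current process:

/* Dump this process's cgroup memberships as formatted by proc_cgroup_show(). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/cgroup", "r");
	char line[4096];

	if (!f) {
		perror("/proc/self/cgroup");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);		/* e.g. "3:cpu,cpuacct:/" */
	fclose(f);
	return 0;
}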
@@ -4501,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk)
4501 5046
4502static void check_for_release(struct cgroup *cgrp) 5047static void check_for_release(struct cgroup *cgrp)
4503{ 5048{
4504 if (cgroup_is_releasable(cgrp) && 5049 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
4505 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 5050 !css_has_online_children(&cgrp->self)) {
4506 /* 5051 /*
4507 * Control Group is currently removable. If it's not 5052 * Control Group is currently removable. If it's not
4508 * already queued for a userspace notification, queue 5053 * already queued for a userspace notification, queue
@@ -4619,7 +5164,7 @@ static int __init cgroup_disable(char *str)
4619__setup("cgroup_disable=", cgroup_disable); 5164__setup("cgroup_disable=", cgroup_disable);
4620 5165
4621/** 5166/**
4622 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 5167 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
4623 * @dentry: directory dentry of interest 5168 * @dentry: directory dentry of interest
4624 * @ss: subsystem of interest 5169 * @ss: subsystem of interest
4625 * 5170 *
@@ -4627,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable);
4627 * to get the corresponding css and return it. If such css doesn't exist 5172 * to get the corresponding css and return it. If such css doesn't exist
4628 * or can't be pinned, an ERR_PTR value is returned. 5173 * or can't be pinned, an ERR_PTR value is returned.
4629 */ 5174 */
4630struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 5175struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
4631 struct cgroup_subsys *ss) 5176 struct cgroup_subsys *ss)
4632{ 5177{
4633 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 5178 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4634 struct cgroup_subsys_state *css = NULL; 5179 struct cgroup_subsys_state *css = NULL;
@@ -4644,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4644 /* 5189 /*
4645 * This path doesn't originate from kernfs and @kn could already 5190 * This path doesn't originate from kernfs and @kn could already
4646 * have been or be removed at any point. @kn->priv is RCU 5191 * have been or be removed at any point. @kn->priv is RCU
4647 * protected for this access. See destroy_locked() for details. 5192 * protected for this access. See cgroup_rmdir() for details.
4648 */ 5193 */
4649 cgrp = rcu_dereference(kn->priv); 5194 cgrp = rcu_dereference(kn->priv);
4650 if (cgrp) 5195 if (cgrp)
4651 css = cgroup_css(cgrp, ss); 5196 css = cgroup_css(cgrp, ss);
4652 5197
4653 if (!css || !css_tryget(css)) 5198 if (!css || !css_tryget_online(css))
4654 css = ERR_PTR(-ENOENT); 5199 css = ERR_PTR(-ENOENT);
4655 5200
4656 rcu_read_unlock(); 5201 rcu_read_unlock();
@@ -4667,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4667 */ 5212 */
4668struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 5213struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4669{ 5214{
4670 struct cgroup *cgrp; 5215 WARN_ON_ONCE(!rcu_read_lock_held());
4671 5216 return idr_find(&ss->css_idr, id);
4672 cgroup_assert_mutexes_or_rcu_locked();
4673
4674 cgrp = idr_find(&ss->root->cgroup_idr, id);
4675 if (cgrp)
4676 return cgroup_css(cgrp, ss);
4677 return NULL;
4678} 5217}
4679 5218
4680#ifdef CONFIG_CGROUP_DEBUG 5219#ifdef CONFIG_CGROUP_DEBUG