Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c          | 3711
-rw-r--r--  kernel/cgroup_freezer.c  |   40
-rw-r--r--  kernel/cpuset.c          |  262
-rw-r--r--  kernel/events/core.c     |   25
-rw-r--r--  kernel/exit.c            |    2
-rw-r--r--  kernel/fork.c            |    5
-rw-r--r--  kernel/sched/core.c      |   10
-rw-r--r--  kernel/sched/cpuacct.c   |    6
-rw-r--r--  kernel/sched/debug.c     |    3
9 files changed, 1676 insertions(+), 2388 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0c753ddd223b..fede3d3f28ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,23 +40,20 @@
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
-#include <linux/backing-dev.h>
 #include <linux/slab.h>
-#include <linux/magic.h>
 #include <linux/spinlock.h>
+#include <linux/rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
-#include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hashtable.h>
-#include <linux/namei.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
+#include <linux/delay.h>
 
 #include <linux/atomic.h>
 
@@ -68,43 +65,49 @@
  */
 #define CGROUP_PIDLIST_DESTROY_DELAY	HZ
 
+#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
+					 MAX_CFTYPE_NAME + 2)
+
+/*
+ * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
+ * creation/removal and hierarchy changing operations including cgroup
+ * creation, removal, css association and controller rebinding.  This outer
+ * lock is needed mainly to resolve the circular dependency between kernfs
+ * active ref and cgroup_mutex.  cgroup_tree_mutex nests above both.
+ */
+static DEFINE_MUTEX(cgroup_tree_mutex);
+
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
- * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
- * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
- * release_agent_path and so on.  Modifying requires both cgroup_mutex and
- * cgroup_root_mutex.  Readers can acquire either of the two.  This is to
- * break the following locking order cycle.
- *
- * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
- * B. namespace_sem -> cgroup_mutex
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
  *
- * B happens only through cgroup_show_options() and using cgroup_root_mutex
- * breaks it.
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for lockdep */
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
 #else
 static DEFINE_MUTEX(cgroup_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
 #endif
 
-static DEFINE_MUTEX(cgroup_root_mutex);
+/*
+ * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
+ * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
 
-#define cgroup_assert_mutex_or_rcu_locked()				\
+#define cgroup_assert_mutexes_or_rcu_locked()				\
 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
+			   lockdep_is_held(&cgroup_tree_mutex) ||	\
 			   lockdep_is_held(&cgroup_mutex),		\
-			   "cgroup_mutex or RCU read lock required");
-
-#ifdef CONFIG_LOCKDEP
-#define cgroup_assert_mutex_or_root_locked()				\
-	WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) &&	\
-				     !lockdep_is_held(&cgroup_root_mutex)))
-#else
-#define cgroup_assert_mutex_or_root_locked()	do { } while (0)
-#endif
+			   "cgroup_[tree_]mutex or RCU read lock required");
 
 /*
  * cgroup destruction makes heavy use of work items and there can be a lot
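The hunk above replaces the old css_set_lock rwlock with css_set_rwsem and introduces cgroup_tree_mutex above cgroup_mutex. As a minimal sketch of the reader-side pattern these primitives imply (the helper name here is hypothetical, not part of the patch):

	/* sketch: pinning a task's css_set under css_set_rwsem */
	static struct css_set *peek_task_cset(struct task_struct *task)
	{
		struct css_set *cset;

		down_read(&css_set_rwsem);	/* task->cgroups can't be switched here */
		cset = task_css_set(task);
		get_css_set(cset);		/* pin before dropping the lock */
		up_read(&css_set_rwsem);
		return cset;
	}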
@@ -120,42 +123,41 @@ static struct workqueue_struct *cgroup_destroy_wq;
  */
 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 
-/*
- * Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated with the built in subsystems, and modular subsystems are
- * registered after that. The mutable section of this array is protected by
- * cgroup_mutex.
- */
-#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
 #include <linux/cgroup_subsys.h>
 };
+#undef SUBSYS
 
 /*
- * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-static struct cgroupfs_root cgroup_dummy_root;
+struct cgroup_root cgrp_dfl_root;
 
-/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
-static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
+/*
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time.  This is for backward compatibility.
+ */
+static bool cgrp_dfl_root_visible;
 
 /* The list of hierarchy roots */
 
 static LIST_HEAD(cgroup_roots);
 static int cgroup_root_count;
 
-/*
- * Hierarchy ID allocation and mapping.  It follows the same exclusion
- * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
- * writes, either for reads.
- */
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
 static DEFINE_IDR(cgroup_hierarchy_idr);
 
-static struct cgroup_name root_cgroup_name = { .name = "/" };
-
 /*
  * Assign a monotonically increasing serial number to cgroups.  It
  * guarantees cgroups with bigger numbers are newer than those with smaller
@@ -175,11 +177,13 @@ static int need_forkexit_callback __read_mostly;
 
 static struct cftype cgroup_base_files[];
 
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroup_root *dst_root,
+			     unsigned long ss_mask);
 static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
-static int cgroup_file_release(struct inode *inode, struct file *file);
 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /**
@@ -197,8 +201,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 					      struct cgroup_subsys *ss)
 {
 	if (ss)
-		return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
-					     lockdep_is_held(&cgroup_mutex));
+		return rcu_dereference_check(cgrp->subsys[ss->id],
+					     lockdep_is_held(&cgroup_tree_mutex) ||
+					     lockdep_is_held(&cgroup_mutex));
 	else
 		return &cgrp->dummy_css;
 }
@@ -209,6 +214,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
+struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+{
+	struct kernfs_open_file *of = seq->private;
+	struct cgroup *cgrp = of->kn->parent->priv;
+	struct cftype *cft = seq_cft(seq);
+
+	/*
+	 * This is open and unprotected implementation of cgroup_css().
+	 * seq_css() is only called from a kernfs file operation which has
+	 * an active reference on the file.  Because all the subsystem
+	 * files are drained before a css is disassociated with a cgroup,
+	 * the matching css from the cgroup's subsys table is guaranteed to
+	 * be and stay valid until the enclosing operation is complete.
+	 */
+	if (cft->ss)
+		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+	else
+		return &cgrp->dummy_css;
+}
+EXPORT_SYMBOL_GPL(seq_css);
+
 /**
  * cgroup_is_descendant - test ancestry
  * @cgrp: the cgroup to be tested
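seq_css() above gives kernfs-backed control files a way to recover their css without taking cgroup_mutex. A sketch of a seq_file handler built on it (the names are illustrative, not from this patch):

	/* sketch: a cftype seq_show handler using seq_css() */
	static int example_seq_show(struct seq_file *seq, void *v)
	{
		struct cgroup_subsys_state *css = seq_css(seq);

		/* the kernfs active ref keeps @css valid for the whole call */
		seq_printf(seq, "%d\n", css->cgroup->id);
		return 0;
	}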
@@ -227,7 +253,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 	}
 	return false;
 }
-EXPORT_SYMBOL_GPL(cgroup_is_descendant);
 
 static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
@@ -254,54 +279,23 @@ static int notify_on_release(const struct cgroup *cgrp)
 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
 		if (!((css) = rcu_dereference_check(			\
 				(cgrp)->subsys[(ssid)],			\
+				lockdep_is_held(&cgroup_tree_mutex) ||	\
 				lockdep_is_held(&cgroup_mutex)))) { }	\
 		else
 
 /**
- * for_each_subsys - iterate all loaded cgroup subsystems
+ * for_each_subsys - iterate all enabled cgroup subsystems
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- *
- * Iterates through all loaded subsystems.  Should be called under
- * cgroup_mutex or cgroup_root_mutex.
  */
 #define for_each_subsys(ss, ssid)					\
-	for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; });	\
-	     (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)			\
-		if (!((ss) = cgroup_subsys[(ssid)])) { }		\
-		else
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 
-/**
- * for_each_builtin_subsys - iterate all built-in cgroup subsystems
- * @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
- *
- * Bulit-in subsystems are always present and iteration itself doesn't
- * require any synchronization.
- */
-#define for_each_builtin_subsys(ss, i)					\
-	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
-	     (((ss) = cgroup_subsys[i]) || true); (i)++)
-
-/* iterate across the active hierarchies */
-#define for_each_active_root(root)					\
+/* iterate across the hierarchies */
+#define for_each_root(root)						\
 	list_for_each_entry((root), &cgroup_roots, root_list)
 
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-static inline struct cfent *__d_cfe(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
-	return __d_cfe(dentry)->type;
-}
-
 /**
  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
  * @cgrp: the cgroup to be checked for liveness
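With cgroup_subsys[] fully populated at build time, for_each_subsys() needs neither a NULL check nor a locking assertion. A typical caller now looks like this (illustrative only, not part of the patch):

	/* sketch: build a mask of all subsystems that aren't disabled */
	static unsigned long enabled_subsys_mask(void)
	{
		struct cgroup_subsys *ss;
		unsigned long mask = 0;
		int ssid;

		for_each_subsys(ss, ssid)
			if (!ss->disabled)
				mask |= 1UL << ssid;
		return mask;
	}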
@@ -347,23 +341,23 @@ struct cgrp_cset_link {
 	struct list_head cgrp_link;
 };
 
-/* The default css_set - used by init and its children prior to any
+/*
+ * The default css_set - used by init and its children prior to any
  * hierarchies being mounted. It contains a pointer to the root state
  * for each subsystem. Also used to anchor the list of css_sets. Not
  * reference-counted, to improve performance when child cgroups
  * haven't been created.
  */
+static struct css_set init_css_set = {
+	.refcount		= ATOMIC_INIT(1),
+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
+	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
+	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
+	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+};
 
-static struct css_set init_css_set;
-static struct cgrp_cset_link init_cgrp_cset_link;
-
-/*
- * css_set_lock protects the list of css_set objects, and the chain of
- * tasks off each css_set.  Nests outside task->alloc_lock due to
- * css_task_iter_start().
- */
-static DEFINE_RWLOCK(css_set_lock);
-static int css_set_count;
+static int css_set_count	= 1;	/* 1 for init_css_set */
 
 /*
  * hash table for cgroup groups. This improves the performance to find
@@ -386,30 +380,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }
 
-/*
- * We don't maintain the lists running through each css_set to its task
- * until after the first call to css_task_iter_start().  This reduces the
- * fork()/exit() overhead for people who have cgroups compiled into their
- * kernel but not actually in use.
- */
-static int use_task_css_set_links __read_mostly;
-
-static void __put_css_set(struct css_set *cset, int taskexit)
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
 {
 	struct cgrp_cset_link *link, *tmp_link;
 
-	/*
-	 * Ensure that the refcount doesn't hit zero while any readers
-	 * can see it. Similar to atomic_dec_and_lock(), but for an
-	 * rwlock
-	 */
-	if (atomic_add_unless(&cset->refcount, -1, 1))
-		return;
-	write_lock(&css_set_lock);
-	if (!atomic_dec_and_test(&cset->refcount)) {
-		write_unlock(&css_set_lock);
+	lockdep_assert_held(&css_set_rwsem);
+
+	if (!atomic_dec_and_test(&cset->refcount))
 		return;
-	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
 	hash_del(&cset->hlist);
@@ -421,7 +399,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
 
-		/* @cgrp can't go away while we're holding css_set_lock */
+		/* @cgrp can't go away while we're holding css_set_rwsem */
 		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -431,10 +409,24 @@ static void __put_css_set(struct css_set *cset, int taskexit)
 		kfree(link);
 	}
 
-	write_unlock(&css_set_lock);
 	kfree_rcu(cset, rcu_head);
 }
 
+static void put_css_set(struct css_set *cset, bool taskexit)
+{
+	/*
+	 * Ensure that the refcount doesn't hit zero while any readers
+	 * can see it. Similar to atomic_dec_and_lock(), but for an
+	 * rwlock
+	 */
+	if (atomic_add_unless(&cset->refcount, -1, 1))
+		return;
+
+	down_write(&css_set_rwsem);
+	put_css_set_locked(cset, taskexit);
+	up_write(&css_set_rwsem);
+}
+
 /*
  * refcounted get/put for css_set objects
  */
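put_css_set() keeps the old lockless fast path: atomic_add_unless() only drops the reference when it cannot reach zero, so css_set_rwsem is taken solely for the final put. Callers already holding the rwsem for write use put_css_set_locked() directly; roughly (hypothetical helper, for illustration):

	/* sketch: choosing between the locked and unlocked put */
	static void drop_cset_ref(struct css_set *cset, bool have_rwsem)
	{
		if (have_rwsem)
			put_css_set_locked(cset, false);	/* caller holds css_set_rwsem */
		else
			put_css_set(cset, false);		/* locks only on the final put */
	}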
@@ -443,16 +435,6 @@ static inline void get_css_set(struct css_set *cset)
 	atomic_inc(&cset->refcount);
 }
 
-static inline void put_css_set(struct css_set *cset)
-{
-	__put_css_set(cset, 0);
-}
-
-static inline void put_css_set_taskexit(struct css_set *cset)
-{
-	__put_css_set(cset, 1);
-}
-
 /**
  * compare_css_sets - helper function for find_existing_css_set().
  * @cset: candidate css_set being tested
@@ -535,7 +517,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 					struct cgroup *cgrp,
 					struct cgroup_subsys_state *template[])
 {
-	struct cgroupfs_root *root = cgrp->root;
+	struct cgroup_root *root = cgrp->root;
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
 	unsigned long key;
@@ -547,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 	 * won't change, so no need for locking.
 	 */
 	for_each_subsys(ss, i) {
-		if (root->subsys_mask & (1UL << i)) {
+		if (root->cgrp.subsys_mask & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
 			 * cgroup */
@@ -652,11 +634,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
-	read_lock(&css_set_lock);
+	down_read(&css_set_rwsem);
 	cset = find_existing_css_set(old_cset, cgrp, template);
 	if (cset)
 		get_css_set(cset);
-	read_unlock(&css_set_lock);
+	up_read(&css_set_rwsem);
 
 	if (cset)
 		return cset;
@@ -674,13 +656,16 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	atomic_set(&cset->refcount, 1);
 	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
+	INIT_LIST_HEAD(&cset->mg_tasks);
+	INIT_LIST_HEAD(&cset->mg_preload_node);
+	INIT_LIST_HEAD(&cset->mg_node);
 	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
 	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
-	write_lock(&css_set_lock);
+	down_write(&css_set_rwsem);
 	/* Add reference counts and links from the new css_set. */
 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
@@ -698,31 +683,105 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	key = css_set_hash(cset->subsys);
 	hash_add(css_set_table, &cset->hlist, key);
 
-	write_unlock(&css_set_lock);
+	up_write(&css_set_rwsem);
 
 	return cset;
 }
 
-/*
- * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex held.
- */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
-					    struct cgroupfs_root *root)
+static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
-	struct css_set *cset;
-	struct cgroup *res = NULL;
+	struct cgroup *root_cgrp = kf_root->kn->priv;
+
+	return root_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroup_root *root)
+{
+	int id;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	root->hierarchy_id = id;
+	return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroup_root *root)
+{
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (root->hierarchy_id) {
+		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+		root->hierarchy_id = 0;
+	}
+}
+
+static void cgroup_free_root(struct cgroup_root *root)
+{
+	if (root) {
+		/* hierarhcy ID shoulid already have been released */
+		WARN_ON_ONCE(root->hierarchy_id);
+
+		idr_destroy(&root->cgroup_idr);
+		kfree(root);
+	}
+}
+
+static void cgroup_destroy_root(struct cgroup_root *root)
+{
+	struct cgroup *cgrp = &root->cgrp;
+	struct cgrp_cset_link *link, *tmp_link;
+
+	mutex_lock(&cgroup_tree_mutex);
+	mutex_lock(&cgroup_mutex);
+
+	BUG_ON(atomic_read(&root->nr_cgrps));
+	BUG_ON(!list_empty(&cgrp->children));
+
+	/* Rebind all subsystems back to the default hierarchy */
+	rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
 
-	BUG_ON(!mutex_is_locked(&cgroup_mutex));
-	read_lock(&css_set_lock);
 	/*
-	 * No need to lock the task - since we hold cgroup_mutex the
-	 * task can't change groups, so the only thing that can happen
-	 * is that it exits and its css is set back to init_css_set.
+	 * Release all the links from cset_links to this hierarchy's
+	 * root cgroup
 	 */
-	cset = task_css_set(task);
+	down_write(&css_set_rwsem);
+
+	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
+		kfree(link);
+	}
+	up_write(&css_set_rwsem);
+
+	if (!list_empty(&root->root_list)) {
+		list_del(&root->root_list);
+		cgroup_root_count--;
+	}
+
+	cgroup_exit_root_id(root);
+
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
+
+	kernfs_destroy_root(root->kf_root);
+	cgroup_free_root(root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+					    struct cgroup_root *root)
+{
+	struct cgroup *res = NULL;
+
+	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&css_set_rwsem);
+
 	if (cset == &init_css_set) {
-		res = &root->top_cgroup;
+		res = &root->cgrp;
 	} else {
 		struct cgrp_cset_link *link;
 
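cset_cgroup_from_root() requires both cgroup_mutex and css_set_rwsem, as its lockdep assertions state. A caller resolving the current task's cgroup on one hierarchy would look roughly like this (hypothetical wrapper; the patch's own task_cgroup_from_root(), later in the diff, assumes the locks are already held):

	/* sketch: resolve current's cgroup on @root with the required locks */
	static struct cgroup *current_cgroup_on(struct cgroup_root *root)
	{
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		down_read(&css_set_rwsem);
		cgrp = cset_cgroup_from_root(task_css_set(current), root);
		up_read(&css_set_rwsem);
		mutex_unlock(&cgroup_mutex);
		return cgrp;
	}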
@@ -735,16 +794,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 			}
 		}
 	}
-	read_unlock(&css_set_lock);
+
 	BUG_ON(!res);
 	return res;
 }
 
 /*
- * There is one global cgroup mutex. We also require taking
- * task_lock() when dereferencing a task's cgroup subsys pointers.
- * See "The task_lock() exception", at the end of this comment.
- *
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex and css_set_rwsem held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+					    struct cgroup_root *root)
+{
+	/*
+	 * No need to lock the task - since we hold cgroup_mutex the
+	 * task can't change groups, so the only thing that can happen
+	 * is that it exits and its css is set back to init_css_set.
+	 */
+	return cset_cgroup_from_root(task_css_set(task), root);
+}
+
+/*
  * A task must hold cgroup_mutex to modify cgroups.
  *
  * Any task can increment and decrement the count field without lock.
@@ -770,98 +840,79 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * A cgroup can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cgroups is empty.  Since all
  * tasks in the system use _some_ cgroup, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * least one task in the system (init, pid == 1), therefore, root cgroup
  * always has either children cgroups and/or using tasks.  So we don't
- * need a special hack to ensure that top_cgroup cannot be deleted.
- *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one task's cgroup pointer with
- * another.  It does so using cgroup_mutex, however there are
- * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
- * mutex.  Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task's cgroup pointer we use
- * task_lock(), which acts on a spinlock (task->alloc_lock) already in
- * the task_struct routinely used for such matters.
+ * need a special hack to ensure that root cgroup cannot be deleted.
  *
  * P.S.  One more locking exception.  RCU is used to guard the
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
- * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
- * -> cgroup_mkdir.
- */
-
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
-static const struct inode_operations cgroup_dir_inode_operations;
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 
-static struct backing_dev_info cgroup_backing_dev_info = {
-	.name		= "cgroup",
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+			      char *buf)
 {
-	struct inode *inode = new_inode(sb);
-
-	if (inode) {
-		inode->i_ino = get_next_ino();
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = current_fsgid();
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
-	}
-	return inode;
+	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+			 cft->ss->name, cft->name);
+	else
+		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+	return buf;
 }
 
-static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static umode_t cgroup_file_mode(const struct cftype *cft)
 {
-	struct cgroup_name *name;
+	umode_t mode = 0;
 
-	name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
-	if (!name)
-		return NULL;
-	strcpy(name->name, dentry->d_name.name);
-	return name;
+	if (cft->mode)
+		return cft->mode;
+
+	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+		mode |= S_IRUGO;
+
+	if (cft->write_u64 || cft->write_s64 || cft->write_string ||
+	    cft->trigger)
+		mode |= S_IWUSR;
+
+	return mode;
 }
 
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 
-	mutex_lock(&cgroup_mutex);
-	cgrp->root->number_of_cgroups--;
-	mutex_unlock(&cgroup_mutex);
-
-	/*
-	 * We get a ref to the parent's dentry, and put the ref when
-	 * this cgroup is being freed, so it's guaranteed that the
-	 * parent won't be destroyed before its children.
-	 */
-	dput(cgrp->parent->dentry);
-
-	/*
-	 * Drop the active superblock reference that we took when we
-	 * created the cgroup. This will free cgrp->root, if we are
-	 * holding the last reference to @sb.
-	 */
-	deactivate_super(cgrp->root->sb);
+	atomic_dec(&cgrp->root->nr_cgrps);
 
 	cgroup_pidlist_destroy_all(cgrp);
 
-	simple_xattrs_free(&cgrp->xattrs);
-
-	kfree(rcu_dereference_raw(cgrp->name));
-	kfree(cgrp);
+	if (cgrp->parent) {
+		/*
+		 * We get a ref to the parent, and put the ref when this
+		 * cgroup is being freed, so it's guaranteed that the
+		 * parent won't be destroyed before its children.
+		 */
+		cgroup_put(cgrp->parent);
+		kernfs_put(cgrp->kn);
+		kfree(cgrp);
+	} else {
+		/*
+		 * This is root cgroup's refcnt reaching zero, which
+		 * indicates that the root should be released.
+		 */
+		cgroup_destroy_root(cgrp->root);
+	}
 }
 
 static void cgroup_free_rcu(struct rcu_head *head)
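cgroup_file_mode() above deduces a file mode from which handlers a cftype supplies. Illustratively (hypothetical names, not part of the patch):

	/* sketch: the deduced modes are 0444 for "usage" and 0644 for "limit" */
	static struct cftype example_files[] = {
		{
			.name = "usage",	/* seq_show only -> S_IRUGO */
			.seq_show = example_seq_show,
		},
		{
			.name = "limit",	/* read + write -> S_IRUGO | S_IWUSR */
			.read_u64 = example_read_u64,
			.write_u64 = example_write_u64,
		},
		{ }	/* terminator */
	};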
@@ -872,73 +923,40 @@ static void cgroup_free_rcu(struct rcu_head *head)
 	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 }
 
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
-{
-	/* is dentry a directory ? if so, kfree() associated cgroup */
-	if (S_ISDIR(inode->i_mode)) {
-		struct cgroup *cgrp = dentry->d_fsdata;
-
-		BUG_ON(!(cgroup_is_dead(cgrp)));
-
-		/*
-		 * XXX: cgrp->id is only used to look up css's.  As cgroup
-		 * and css's lifetimes will be decoupled, it should be made
-		 * per-subsystem and moved to css->id so that lookups are
-		 * successful until the target css is released.
-		 */
-		mutex_lock(&cgroup_mutex);
-		idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
-		mutex_unlock(&cgroup_mutex);
-		cgrp->id = -1;
-
-		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
-	} else {
-		struct cfent *cfe = __d_cfe(dentry);
-		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
-
-		WARN_ONCE(!list_empty(&cfe->node) &&
-			  cgrp != &cgrp->root->top_cgroup,
-			  "cfe still linked for %s\n", cfe->type->name);
-		simple_xattrs_free(&cfe->xattrs);
-		kfree(cfe);
-	}
-	iput(inode);
-}
-
-static void remove_dir(struct dentry *d)
+static void cgroup_get(struct cgroup *cgrp)
 {
-	struct dentry *parent = dget(d->d_parent);
-
-	d_delete(d);
-	simple_rmdir(parent->d_inode, d);
-	dput(parent);
+	WARN_ON_ONCE(cgroup_is_dead(cgrp));
+	WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
+	atomic_inc(&cgrp->refcnt);
 }
 
-static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_put(struct cgroup *cgrp)
 {
-	struct cfent *cfe;
-
-	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
-	lockdep_assert_held(&cgroup_mutex);
+	if (!atomic_dec_and_test(&cgrp->refcnt))
+		return;
+	if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
+		return;
 
 	/*
-	 * If we're doing cleanup due to failure of cgroup_create(),
-	 * the corresponding @cfe may not exist.
+	 * XXX: cgrp->id is only used to look up css's.  As cgroup and
+	 * css's lifetimes will be decoupled, it should be made
+	 * per-subsystem and moved to css->id so that lookups are
+	 * successful until the target css is released.
 	 */
-	list_for_each_entry(cfe, &cgrp->files, node) {
-		struct dentry *d = cfe->dentry;
+	mutex_lock(&cgroup_mutex);
+	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+	mutex_unlock(&cgroup_mutex);
+	cgrp->id = -1;
 
-		if (cft && cfe->type != cft)
-			continue;
+	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
+}
 
-		dget(d);
-		d_delete(d);
-		simple_unlink(cgrp->dentry->d_inode, d);
-		list_del_init(&cfe->node);
-		dput(d);
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+	char name[CGROUP_FILE_NAME_MAX];
 
-		break;
-	}
+	lockdep_assert_held(&cgroup_tree_mutex);
+	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
 }
 
 /**
@@ -952,144 +970,106 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 	int i;
 
 	for_each_subsys(ss, i) {
-		struct cftype_set *set;
+		struct cftype *cfts;
 
 		if (!test_bit(i, &subsys_mask))
 			continue;
-		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_addrm_files(cgrp, set->cfts, false);
+		list_for_each_entry(cfts, &ss->cfts, node)
+			cgroup_addrm_files(cgrp, cfts, false);
 	}
 }
 
-/*
- * NOTE : the dentry must have been dget()'ed
- */
-static void cgroup_d_remove_dir(struct dentry *dentry)
-{
-	struct dentry *parent;
-
-	parent = dentry->d_parent;
-	spin_lock(&parent->d_lock);
-	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	list_del_init(&dentry->d_u.d_child);
-	spin_unlock(&dentry->d_lock);
-	spin_unlock(&parent->d_lock);
-	remove_dir(dentry);
-}
-
-/*
- * Call with cgroup_mutex held. Drops reference counts on modules, including
- * any duplicate ones that parse_cgroupfs_options took. If this function
- * returns an error, no reference counts are touched.
- */
-static int rebind_subsystems(struct cgroupfs_root *root,
-			     unsigned long added_mask, unsigned removed_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root,
+			     unsigned long ss_mask)
 {
-	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_subsys *ss;
-	unsigned long pinned = 0;
-	int i, ret;
+	int ssid, ret;
 
-	BUG_ON(!mutex_is_locked(&cgroup_mutex));
-	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
+	lockdep_assert_held(&cgroup_tree_mutex);
+	lockdep_assert_held(&cgroup_mutex);
 
-	/* Check that any added subsystems are currently free */
-	for_each_subsys(ss, i) {
-		if (!(added_mask & (1 << i)))
+	for_each_subsys(ss, ssid) {
+		if (!(ss_mask & (1 << ssid)))
 			continue;
 
-		/* is the subsystem mounted elsewhere? */
-		if (ss->root != &cgroup_dummy_root) {
-			ret = -EBUSY;
-			goto out_put;
-		}
+		/* if @ss is on the dummy_root, we can always move it */
+		if (ss->root == &cgrp_dfl_root)
+			continue;
 
-		/* pin the module */
-		if (!try_module_get(ss->module)) {
-			ret = -ENOENT;
-			goto out_put;
-		}
-		pinned |= 1 << i;
-	}
+		/* if @ss has non-root cgroups attached to it, can't move */
+		if (!list_empty(&ss->root->cgrp.children))
+			return -EBUSY;
 
-	/* subsys could be missing if unloaded between parsing and here */
-	if (added_mask != pinned) {
-		ret = -ENOENT;
-		goto out_put;
+		/* can't move between two non-dummy roots either */
+		if (dst_root != &cgrp_dfl_root)
+			return -EBUSY;
 	}
 
-	ret = cgroup_populate_dir(cgrp, added_mask);
-	if (ret)
-		goto out_put;
+	ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
+	if (ret) {
+		if (dst_root != &cgrp_dfl_root)
+			return ret;
+
+		/*
+		 * Rebinding back to the default root is not allowed to
+		 * fail.  Using both default and non-default roots should
+		 * be rare.  Moving subsystems back and forth even more so.
+		 * Just warn about it and continue.
+		 */
+		if (cgrp_dfl_root_visible) {
+			pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
+				   ret, ss_mask);
+			pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
+		}
+	}
 
 	/*
 	 * Nothing can fail from this point on.  Remove files for the
 	 * removed subsystems and rebind each subsystem.
 	 */
-	cgroup_clear_dir(cgrp, removed_mask);
-
-	for_each_subsys(ss, i) {
-		unsigned long bit = 1UL << i;
-
-		if (bit & added_mask) {
-			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(cgroup_css(cgrp, ss));
-			BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
-			BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
+	mutex_unlock(&cgroup_mutex);
+	for_each_subsys(ss, ssid)
+		if (ss_mask & (1 << ssid))
+			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+	mutex_lock(&cgroup_mutex);
 
-			rcu_assign_pointer(cgrp->subsys[i],
-					   cgroup_css(cgroup_dummy_top, ss));
-			cgroup_css(cgrp, ss)->cgroup = cgrp;
-
-			ss->root = root;
-			if (ss->bind)
-				ss->bind(cgroup_css(cgrp, ss));
+	for_each_subsys(ss, ssid) {
+		struct cgroup_root *src_root;
+		struct cgroup_subsys_state *css;
 
-			/* refcount was already taken, and we're keeping it */
-			root->subsys_mask |= bit;
-		} else if (bit & removed_mask) {
-			/* We're removing this subsystem */
-			BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
-			BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
+		if (!(ss_mask & (1 << ssid)))
+			continue;
 
-			if (ss->bind)
-				ss->bind(cgroup_css(cgroup_dummy_top, ss));
+		src_root = ss->root;
+		css = cgroup_css(&src_root->cgrp, ss);
 
-			cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
-			RCU_INIT_POINTER(cgrp->subsys[i], NULL);
+		WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
 
-			cgroup_subsys[i]->root = &cgroup_dummy_root;
+		RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
+		rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+		ss->root = dst_root;
+		css->cgroup = &dst_root->cgrp;
 
-			/* subsystem is now free - drop reference on module */
-			module_put(ss->module);
-			root->subsys_mask &= ~bit;
-		}
+		src_root->cgrp.subsys_mask &= ~(1 << ssid);
+		dst_root->cgrp.subsys_mask |= 1 << ssid;
+
+		if (ss->bind)
+			ss->bind(css);
 	}
 
-	/*
-	 * Mark @root has finished binding subsystems.  @root->subsys_mask
-	 * now matches the bound subsystems.
-	 */
-	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
-
+	kernfs_activate(dst_root->cgrp.kn);
 	return 0;
-
-out_put:
-	for_each_subsys(ss, i)
-		if (pinned & (1 << i))
-			module_put(ss->module);
-	return ret;
 }
 
-static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
+static int cgroup_show_options(struct seq_file *seq,
+			       struct kernfs_root *kf_root)
 {
-	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(ss, ssid)
-		if (root->subsys_mask & (1 << ssid))
+		if (root->cgrp.subsys_mask & (1 << ssid))
 			seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
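rebind_subsystems() now takes a destination root and a bitmask indexed by the generated *_cgrp_id constants. A caller moving one controller back to the default hierarchy would look roughly like this (sketch; per the lockdep assertions, both mutexes must already be held):

	/* sketch: rebind the cpuset controller to the default root
	 * (caller holds cgroup_tree_mutex and cgroup_mutex) */
	int ret = rebind_subsystems(&cgrp_dfl_root, 1UL << cpuset_cgrp_id);
	if (ret)
		pr_warning("cgroup: rebind failed (%d)\n", ret);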
@@ -1097,13 +1077,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
 		seq_puts(seq, ",xattr");
+
+	spin_lock(&release_agent_path_lock);
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
-	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
+	spin_unlock(&release_agent_path_lock);
+
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
-	mutex_unlock(&cgroup_root_mutex);
 	return 0;
 }
 
@@ -1115,9 +1098,6 @@ struct cgroup_sb_opts {
 	char *name;
 	/* User explicitly requested empty subsystem */
 	bool none;
-
-	struct cgroupfs_root *new_root;
-
 };
 
 /*
@@ -1137,7 +1117,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
 #ifdef CONFIG_CPUSETS
-	mask = ~(1UL << cpuset_subsys_id);
+	mask = ~(1UL << cpuset_cgrp_id);
 #endif
 
 	memset(opts, 0, sizeof(*opts));
@@ -1227,30 +1207,34 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			return -ENOENT;
 	}
 
-	/*
-	 * If the 'all' option was specified select all the subsystems,
-	 * otherwise if 'none', 'name=' and a subsystem name options
-	 * were not specified, let's default to 'all'
-	 */
-	if (all_ss || (!one_ss && !opts->none && !opts->name))
-		for_each_subsys(ss, i)
-			if (!ss->disabled)
-				set_bit(i, &opts->subsys_mask);
-
 	/* Consistency checks */
 
 	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
 		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
 
-		if (opts->flags & CGRP_ROOT_NOPREFIX) {
-			pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
+		if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+		    opts->cpuset_clone_children || opts->release_agent ||
+		    opts->name) {
+			pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
 			return -EINVAL;
 		}
+	} else {
+		/*
+		 * If the 'all' option was specified select all the
+		 * subsystems, otherwise if 'none', 'name=' and a subsystem
+		 * name options were not specified, let's default to 'all'
+		 */
+		if (all_ss || (!one_ss && !opts->none && !opts->name))
+			for_each_subsys(ss, i)
+				if (!ss->disabled)
+					set_bit(i, &opts->subsys_mask);
 
-		if (opts->cpuset_clone_children) {
-			pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+		/*
+		 * We either have to specify by name or by subsystems. (So
+		 * all empty hierarchies must have a name).
+		 */
+		if (!opts->subsys_mask && !opts->name)
 			return -EINVAL;
-		}
 	}
 
 	/*
@@ -1266,21 +1250,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	if (opts->subsys_mask && opts->none)
 		return -EINVAL;
 
-	/*
-	 * We either have to specify by name or by subsystems. (So all
-	 * empty hierarchies must have a name).
-	 */
-	if (!opts->subsys_mask && !opts->name)
-		return -EINVAL;
-
 	return 0;
 }
 
-static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
 	int ret = 0;
-	struct cgroupfs_root *root = sb->s_fs_info;
-	struct cgroup *cgrp = &root->top_cgroup;
+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 	struct cgroup_sb_opts opts;
 	unsigned long added_mask, removed_mask;
 
@@ -1289,21 +1265,20 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		return -EINVAL;
 	}
 
-	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
+	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
-	mutex_lock(&cgroup_root_mutex);
 
 	/* See what subsystems are wanted */
 	ret = parse_cgroupfs_options(data, &opts);
 	if (ret)
 		goto out_unlock;
 
-	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+	if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 
-	added_mask = opts.subsys_mask & ~root->subsys_mask;
-	removed_mask = root->subsys_mask & ~opts.subsys_mask;
+	added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
+	removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
 
 	/* Don't allow flags or name to change at remount */
 	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
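Note the fixed nesting order the remount path establishes; every write path in this patch follows the same discipline (sketch):

	/* sketch: lock nesting order used throughout after this patch */
	mutex_lock(&cgroup_tree_mutex);	/* outer: cftypes, file creation/removal, tree shape */
	mutex_lock(&cgroup_mutex);	/* inner: master cgroup lock */
	/* ... modify hierarchies ... */
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);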
@@ -1316,422 +1291,331 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1316 } 1291 }
1317 1292
1318 /* remounting is not allowed for populated hierarchies */ 1293 /* remounting is not allowed for populated hierarchies */
1319 if (root->number_of_cgroups > 1) { 1294 if (!list_empty(&root->cgrp.children)) {
1320 ret = -EBUSY; 1295 ret = -EBUSY;
1321 goto out_unlock; 1296 goto out_unlock;
1322 } 1297 }
1323 1298
1324 ret = rebind_subsystems(root, added_mask, removed_mask); 1299 ret = rebind_subsystems(root, added_mask);
1325 if (ret) 1300 if (ret)
1326 goto out_unlock; 1301 goto out_unlock;
1327 1302
1328 if (opts.release_agent) 1303 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1304
1305 if (opts.release_agent) {
1306 spin_lock(&release_agent_path_lock);
1329 strcpy(root->release_agent_path, opts.release_agent); 1307 strcpy(root->release_agent_path, opts.release_agent);
1308 spin_unlock(&release_agent_path_lock);
1309 }
1330 out_unlock: 1310 out_unlock:
1331 kfree(opts.release_agent); 1311 kfree(opts.release_agent);
1332 kfree(opts.name); 1312 kfree(opts.name);
1333 mutex_unlock(&cgroup_root_mutex);
1334 mutex_unlock(&cgroup_mutex); 1313 mutex_unlock(&cgroup_mutex);
1335 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1314 mutex_unlock(&cgroup_tree_mutex);
1336 return ret; 1315 return ret;
1337} 1316}
1338 1317
1339static const struct super_operations cgroup_ops = { 1318/*
1340 .statfs = simple_statfs, 1319 * To reduce the fork() overhead for systems that are not actually using
1341 .drop_inode = generic_delete_inode, 1320 * their cgroups capability, we don't maintain the lists running through
1342 .show_options = cgroup_show_options, 1321 * each css_set to its tasks until we see the list actually used - in other
1343 .remount_fs = cgroup_remount, 1322 * words after the first mount.
1344}; 1323 */
1324static bool use_task_css_set_links __read_mostly;
1325
1326static void cgroup_enable_task_cg_lists(void)
1327{
1328 struct task_struct *p, *g;
1329
1330 down_write(&css_set_rwsem);
1331
1332 if (use_task_css_set_links)
1333 goto out_unlock;
1334
1335 use_task_css_set_links = true;
1336
1337 /*
1338 * We need tasklist_lock because RCU is not safe against
1339 * while_each_thread(). Besides, a forking task that has passed
1340 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1341 * is not guaranteed to have its child immediately visible in the
1342 * tasklist if we walk through it with RCU.
1343 */
1344 read_lock(&tasklist_lock);
1345 do_each_thread(g, p) {
1346 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1347 task_css_set(p) != &init_css_set);
1348
1349 /*
1350 * We should check if the process is exiting, otherwise
1351 * it will race with cgroup_exit() in that the list
1352 * entry won't be deleted though the process has exited.
1353 * Do it while holding siglock so that we don't end up
1354 * racing against cgroup_exit().
1355 */
1356 spin_lock_irq(&p->sighand->siglock);
1357 if (!(p->flags & PF_EXITING)) {
1358 struct css_set *cset = task_css_set(p);
1359
1360 list_add(&p->cg_list, &cset->tasks);
1361 get_css_set(cset);
1362 }
1363 spin_unlock_irq(&p->sighand->siglock);
1364 } while_each_thread(g, p);
1365 read_unlock(&tasklist_lock);
1366out_unlock:
1367 up_write(&css_set_rwsem);
1368}
1345 1369
1346static void init_cgroup_housekeeping(struct cgroup *cgrp) 1370static void init_cgroup_housekeeping(struct cgroup *cgrp)
1347{ 1371{
1372 atomic_set(&cgrp->refcnt, 1);
1348 INIT_LIST_HEAD(&cgrp->sibling); 1373 INIT_LIST_HEAD(&cgrp->sibling);
1349 INIT_LIST_HEAD(&cgrp->children); 1374 INIT_LIST_HEAD(&cgrp->children);
1350 INIT_LIST_HEAD(&cgrp->files);
1351 INIT_LIST_HEAD(&cgrp->cset_links); 1375 INIT_LIST_HEAD(&cgrp->cset_links);
1352 INIT_LIST_HEAD(&cgrp->release_list); 1376 INIT_LIST_HEAD(&cgrp->release_list);
1353 INIT_LIST_HEAD(&cgrp->pidlists); 1377 INIT_LIST_HEAD(&cgrp->pidlists);
1354 mutex_init(&cgrp->pidlist_mutex); 1378 mutex_init(&cgrp->pidlist_mutex);
1355 cgrp->dummy_css.cgroup = cgrp; 1379 cgrp->dummy_css.cgroup = cgrp;
1356 simple_xattrs_init(&cgrp->xattrs);
1357} 1380}
1358 1381
1359static void init_cgroup_root(struct cgroupfs_root *root) 1382static void init_cgroup_root(struct cgroup_root *root,
1383 struct cgroup_sb_opts *opts)
1360{ 1384{
1361 struct cgroup *cgrp = &root->top_cgroup; 1385 struct cgroup *cgrp = &root->cgrp;
1362 1386
1363 INIT_LIST_HEAD(&root->root_list); 1387 INIT_LIST_HEAD(&root->root_list);
1364 root->number_of_cgroups = 1; 1388 atomic_set(&root->nr_cgrps, 1);
1365 cgrp->root = root; 1389 cgrp->root = root;
1366 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1367 init_cgroup_housekeeping(cgrp); 1390 init_cgroup_housekeeping(cgrp);
1368 idr_init(&root->cgroup_idr); 1391 idr_init(&root->cgroup_idr);
1369}
1370
1371static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1372{
1373 int id;
1374
1375 lockdep_assert_held(&cgroup_mutex);
1376 lockdep_assert_held(&cgroup_root_mutex);
1377
1378 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1379 GFP_KERNEL);
1380 if (id < 0)
1381 return id;
1382
1383 root->hierarchy_id = id;
1384 return 0;
1385}
1386
1387static void cgroup_exit_root_id(struct cgroupfs_root *root)
1388{
1389 lockdep_assert_held(&cgroup_mutex);
1390 lockdep_assert_held(&cgroup_root_mutex);
1391
1392 if (root->hierarchy_id) {
1393 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1394 root->hierarchy_id = 0;
1395 }
1396}
1397
1398static int cgroup_test_super(struct super_block *sb, void *data)
1399{
1400 struct cgroup_sb_opts *opts = data;
1401 struct cgroupfs_root *root = sb->s_fs_info;
1402
1403 /* If we asked for a name then it must match */
1404 if (opts->name && strcmp(opts->name, root->name))
1405 return 0;
1406
1407 /*
1408 * If we asked for subsystems (or explicitly for no
1409 * subsystems) then they must match
1410 */
1411 if ((opts->subsys_mask || opts->none)
1412 && (opts->subsys_mask != root->subsys_mask))
1413 return 0;
1414
1415 return 1;
1416}
1417
1418static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1419{
1420 struct cgroupfs_root *root;
1421 1392
1422 if (!opts->subsys_mask && !opts->none)
1423 return NULL;
1424
1425 root = kzalloc(sizeof(*root), GFP_KERNEL);
1426 if (!root)
1427 return ERR_PTR(-ENOMEM);
1428
1429 init_cgroup_root(root);
1430
1431 /*
1432 * We need to set @root->subsys_mask now so that @root can be
1433 * matched by cgroup_test_super() before it finishes
1434 * initialization; otherwise, competing mounts with the same
1435 * options may try to bind the same subsystems instead of waiting
1436 * for the first one leading to unexpected mount errors.
1437 * SUBSYS_BOUND will be set once actual binding is complete.
1438 */
1439 root->subsys_mask = opts->subsys_mask;
1440 root->flags = opts->flags; 1393 root->flags = opts->flags;
1441 if (opts->release_agent) 1394 if (opts->release_agent)
1442 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1443 if (opts->name) 1396 if (opts->name)
1444 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1445 if (opts->cpuset_clone_children) 1398 if (opts->cpuset_clone_children)
1446 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1447 return root;
1448} 1400}
1449 1401
1450static void cgroup_free_root(struct cgroupfs_root *root) 1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1451{ 1403{
1452 if (root) { 1404 LIST_HEAD(tmp_links);
1453 /* hierarchy ID should already have been released */ 1405 struct cgroup *root_cgrp = &root->cgrp;
1454 WARN_ON_ONCE(root->hierarchy_id); 1406 struct css_set *cset;
1455 1407 int i, ret;
1456 idr_destroy(&root->cgroup_idr);
1457 kfree(root);
1458 }
1459}
1460 1408
1461static int cgroup_set_super(struct super_block *sb, void *data) 1409 lockdep_assert_held(&cgroup_tree_mutex);
1462{ 1410 lockdep_assert_held(&cgroup_mutex);
1463 int ret;
1464 struct cgroup_sb_opts *opts = data;
1465 1411
1466 /* If we don't have a new root, we can't set up a new sb */ 1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1467 if (!opts->new_root) 1413 if (ret < 0)
1468 return -EINVAL; 1414 goto out;
1415 root_cgrp->id = ret;
1469 1416
1470 BUG_ON(!opts->subsys_mask && !opts->none); 1417 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding
1420 * cgroup_lock, and that's us. The worst that can happen is that we
1421 * have some link structures left over
1422 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret)
1425 goto out;
1471 1426
1472 ret = set_anon_super(sb, NULL); 1427 ret = cgroup_init_root_id(root);
1473 if (ret) 1428 if (ret)
1474 return ret; 1429 goto out;
1475 1430
1476 sb->s_fs_info = opts->new_root; 1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1477 opts->new_root->sb = sb; 1432 KERNFS_ROOT_CREATE_DEACTIVATED,
1433 root_cgrp);
1434 if (IS_ERR(root->kf_root)) {
1435 ret = PTR_ERR(root->kf_root);
1436 goto exit_root_id;
1437 }
1438 root_cgrp->kn = root->kf_root->kn;
1478 1439
1479 sb->s_blocksize = PAGE_CACHE_SIZE; 1440 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1480 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1441 if (ret)
1481 sb->s_magic = CGROUP_SUPER_MAGIC; 1442 goto destroy_root;
1482 sb->s_op = &cgroup_ops;
1483 1443
1484 return 0; 1444 ret = rebind_subsystems(root, ss_mask);
1485} 1445 if (ret)
1446 goto destroy_root;
1486 1447
1487static int cgroup_get_rootdir(struct super_block *sb) 1448 /*
1488{ 1449 * There must be no failure case after here, since rebinding takes
1489 static const struct dentry_operations cgroup_dops = { 1450 * care of subsystems' refcounts, which are explicitly dropped in
1490 .d_iput = cgroup_diput, 1451 * the failure exit path.
1491 .d_delete = always_delete_dentry, 1452 */
1492 }; 1453 list_add(&root->root_list, &cgroup_roots);
1454 cgroup_root_count++;
1493 1455
1494 struct inode *inode = 1456 /*
1495 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1457 * Link the root cgroup in this hierarchy into all the css_set
1458 * objects.
1459 */
1460 down_write(&css_set_rwsem);
1461 hash_for_each(css_set_table, i, cset, hlist)
1462 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem);
1496 1464
1497 if (!inode) 1465 BUG_ON(!list_empty(&root_cgrp->children));
1498 return -ENOMEM; 1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1499 1467
1500 inode->i_fop = &simple_dir_operations; 1468 kernfs_activate(root_cgrp->kn);
1501 inode->i_op = &cgroup_dir_inode_operations; 1469 ret = 0;
1502 /* directories start off with i_nlink == 2 (for "." entry) */ 1470 goto out;
1503 inc_nlink(inode); 1471
1504 sb->s_root = d_make_root(inode); 1472destroy_root:
1505 if (!sb->s_root) 1473 kernfs_destroy_root(root->kf_root);
1506 return -ENOMEM; 1474 root->kf_root = NULL;
1507 /* for everything else we want ->d_op set */ 1475exit_root_id:
1508 sb->s_d_op = &cgroup_dops; 1476 cgroup_exit_root_id(root);
1509 return 0; 1477out:
1478 free_cgrp_cset_links(&tmp_links);
1479 return ret;
1510} 1480}
1511 1481
1512static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1482static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1513 int flags, const char *unused_dev_name, 1483 int flags, const char *unused_dev_name,
1514 void *data) 1484 void *data)
1515{ 1485{
1486 struct cgroup_root *root;
1516 struct cgroup_sb_opts opts; 1487 struct cgroup_sb_opts opts;
1517 struct cgroupfs_root *root; 1488 struct dentry *dentry;
1518 int ret = 0; 1489 int ret;
1519 struct super_block *sb;
1520 struct cgroupfs_root *new_root;
1521 struct list_head tmp_links;
1522 struct inode *inode;
1523 const struct cred *cred;
1524 1490
1525 /* First find the desired set of subsystems */ 1491 /*
1492 * The first time anyone tries to mount a cgroup, enable the list
1493 * linking each css_set to its tasks and fix up all existing tasks.
1494 */
1495 if (!use_task_css_set_links)
1496 cgroup_enable_task_cg_lists();
1497retry:
1498 mutex_lock(&cgroup_tree_mutex);
1526 mutex_lock(&cgroup_mutex); 1499 mutex_lock(&cgroup_mutex);
1500
1501 /* First find the desired set of subsystems */
1527 ret = parse_cgroupfs_options(data, &opts); 1502 ret = parse_cgroupfs_options(data, &opts);
1528 mutex_unlock(&cgroup_mutex);
1529 if (ret) 1503 if (ret)
1530 goto out_err; 1504 goto out_unlock;
1531
1532 /*
1533 * Allocate a new cgroup root. We may not need it if we're
1534 * reusing an existing hierarchy.
1535 */
1536 new_root = cgroup_root_from_opts(&opts);
1537 if (IS_ERR(new_root)) {
1538 ret = PTR_ERR(new_root);
1539 goto out_err;
1540 }
1541 opts.new_root = new_root;
1542 1505
1543 /* Locate an existing or new sb for this hierarchy */ 1506 /* look for a matching existing root */
1544 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1507 if (!opts.subsys_mask && !opts.none && !opts.name) {
1545 if (IS_ERR(sb)) { 1508 cgrp_dfl_root_visible = true;
1546 ret = PTR_ERR(sb); 1509 root = &cgrp_dfl_root;
1547 cgroup_free_root(opts.new_root); 1510 cgroup_get(&root->cgrp);
1548 goto out_err; 1511 ret = 0;
1512 goto out_unlock;
1549 } 1513 }
1550 1514
1551 root = sb->s_fs_info; 1515 for_each_root(root) {
1552 BUG_ON(!root); 1516 bool name_match = false;
1553 if (root == opts.new_root) {
1554 /* We used the new root structure, so this is a new hierarchy */
1555 struct cgroup *root_cgrp = &root->top_cgroup;
1556 struct cgroupfs_root *existing_root;
1557 int i;
1558 struct css_set *cset;
1559
1560 BUG_ON(sb->s_root != NULL);
1561
1562 ret = cgroup_get_rootdir(sb);
1563 if (ret)
1564 goto drop_new_super;
1565 inode = sb->s_root->d_inode;
1566
1567 mutex_lock(&inode->i_mutex);
1568 mutex_lock(&cgroup_mutex);
1569 mutex_lock(&cgroup_root_mutex);
1570
1571 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1572 if (ret < 0)
1573 goto unlock_drop;
1574 root_cgrp->id = ret;
1575
1576 /* Check for name clashes with existing mounts */
1577 ret = -EBUSY;
1578 if (strlen(root->name))
1579 for_each_active_root(existing_root)
1580 if (!strcmp(existing_root->name, root->name))
1581 goto unlock_drop;
1582
1583 /*
1584 * We're accessing css_set_count without locking
1585 * css_set_lock here, but that's OK - it can only be
1586 * increased by someone holding cgroup_lock, and
1587 * that's us. The worst that can happen is that we
1588 * have some link structures left over
1589 */
1590 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1591 if (ret)
1592 goto unlock_drop;
1593 1517
1594 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ 1518 if (root == &cgrp_dfl_root)
1595 ret = cgroup_init_root_id(root, 2, 0); 1519 continue;
1596 if (ret)
1597 goto unlock_drop;
1598
1599 sb->s_root->d_fsdata = root_cgrp;
1600 root_cgrp->dentry = sb->s_root;
1601
1602 /*
1603 * We're inside get_sb() and will call lookup_one_len() to
1604 * create the root files, which doesn't work if SELinux is
1605 * in use. The following cred dancing somehow works around
1606 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1607 * populating new cgroupfs mount") for more details.
1608 */
1609 cred = override_creds(&init_cred);
1610
1611 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1612 if (ret)
1613 goto rm_base_files;
1614
1615 ret = rebind_subsystems(root, root->subsys_mask, 0);
1616 if (ret)
1617 goto rm_base_files;
1618
1619 revert_creds(cred);
1620 1520
1621 /* 1521 /*
1622 * There must be no failure case after here, since rebinding 1522 * If we asked for a name then it must match. Also, if
1623 * takes care of subsystems' refcounts, which are explicitly 1523 * name matches but subsys_mask doesn't, we should fail.
1624 * dropped in the failure exit path. 1524 * Remember whether name matched.
1625 */ 1525 */
1526 if (opts.name) {
1527 if (strcmp(opts.name, root->name))
1528 continue;
1529 name_match = true;
1530 }
1626 1531
1627 list_add(&root->root_list, &cgroup_roots);
1628 cgroup_root_count++;
1629
1630 /* Link the top cgroup in this hierarchy into all
1631 * the css_set objects */
1632 write_lock(&css_set_lock);
1633 hash_for_each(css_set_table, i, cset, hlist)
1634 link_css_set(&tmp_links, cset, root_cgrp);
1635 write_unlock(&css_set_lock);
1636
1637 free_cgrp_cset_links(&tmp_links);
1638
1639 BUG_ON(!list_empty(&root_cgrp->children));
1640 BUG_ON(root->number_of_cgroups != 1);
1641
1642 mutex_unlock(&cgroup_root_mutex);
1643 mutex_unlock(&cgroup_mutex);
1644 mutex_unlock(&inode->i_mutex);
1645 } else {
1646 /* 1532 /*
1647 * We re-used an existing hierarchy - the new root (if 1533 * If we asked for subsystems (or explicitly for no
1648 * any) is not needed 1534 * subsystems) then they must match.
1649 */ 1535 */
1650 cgroup_free_root(opts.new_root); 1536 if ((opts.subsys_mask || opts.none) &&
1537 (opts.subsys_mask != root->cgrp.subsys_mask)) {
1538 if (!name_match)
1539 continue;
1540 ret = -EBUSY;
1541 goto out_unlock;
1542 }
1651 1543
1652 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1544 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1653 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1545 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1654 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1546 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1655 ret = -EINVAL; 1547 ret = -EINVAL;
1656 goto drop_new_super; 1548 goto out_unlock;
1657 } else { 1549 } else {
1658 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1550 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1659 } 1551 }
1660 } 1552 }
1661 }
1662
1663 kfree(opts.release_agent);
1664 kfree(opts.name);
1665 return dget(sb->s_root);
1666
1667 rm_base_files:
1668 free_cgrp_cset_links(&tmp_links);
1669 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1670 revert_creds(cred);
1671 unlock_drop:
1672 cgroup_exit_root_id(root);
1673 mutex_unlock(&cgroup_root_mutex);
1674 mutex_unlock(&cgroup_mutex);
1675 mutex_unlock(&inode->i_mutex);
1676 drop_new_super:
1677 deactivate_locked_super(sb);
1678 out_err:
1679 kfree(opts.release_agent);
1680 kfree(opts.name);
1681 return ERR_PTR(ret);
1682}
1683
1684static void cgroup_kill_sb(struct super_block *sb)
1685{
1686 struct cgroupfs_root *root = sb->s_fs_info;
1687 struct cgroup *cgrp = &root->top_cgroup;
1688 struct cgrp_cset_link *link, *tmp_link;
1689 int ret;
1690
1691 BUG_ON(!root);
1692
1693 BUG_ON(root->number_of_cgroups != 1);
1694 BUG_ON(!list_empty(&cgrp->children));
1695 1553
1696 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1554 /*
1697 mutex_lock(&cgroup_mutex); 1555 * A root's lifetime is governed by its root cgroup. Zero
1698 mutex_lock(&cgroup_root_mutex); 1556 * ref indicates that the root is being destroyed. Wait for
1557 * destruction to complete so that the subsystems are free.
1558 * We can use a wait_queue for the wait but this path is
1559 * super cold. Let's just sleep for a bit and retry.
1560 */
1561 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1562 mutex_unlock(&cgroup_mutex);
1563 mutex_unlock(&cgroup_tree_mutex);
1564 kfree(opts.release_agent);
1565 kfree(opts.name);
1566 msleep(10);
1567 goto retry;
1568 }
1699 1569
1700 /* Rebind all subsystems back to the default hierarchy */ 1570 ret = 0;
1701 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { 1571 goto out_unlock;
1702 ret = rebind_subsystems(root, 0, root->subsys_mask);
1703 /* Shouldn't be able to fail ... */
1704 BUG_ON(ret);
1705 } 1572 }
1706 1573
1707 /* 1574 /*
1708 * Release all the links from cset_links to this hierarchy's 1575 * No such thing, create a new one. name= matching without subsys
1709 * root cgroup 1576 * specification is allowed for already existing hierarchies but we
1577 * can't create new one without subsys specification.
1710 */ 1578 */
1711 write_lock(&css_set_lock); 1579 if (!opts.subsys_mask && !opts.none) {
1712 1580 ret = -EINVAL;
1713 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1581 goto out_unlock;
1714 list_del(&link->cset_link);
1715 list_del(&link->cgrp_link);
1716 kfree(link);
1717 } 1582 }
1718 write_unlock(&css_set_lock);
1719 1583
1720 if (!list_empty(&root->root_list)) { 1584 root = kzalloc(sizeof(*root), GFP_KERNEL);
1721 list_del(&root->root_list); 1585 if (!root) {
1722 cgroup_root_count--; 1586 ret = -ENOMEM;
1587 goto out_unlock;
1723 } 1588 }
1724 1589
1725 cgroup_exit_root_id(root); 1590 init_cgroup_root(root, &opts);
1726 1591
1727 mutex_unlock(&cgroup_root_mutex); 1592 ret = cgroup_setup_root(root, opts.subsys_mask);
1593 if (ret)
1594 cgroup_free_root(root);
1595
1596out_unlock:
1728 mutex_unlock(&cgroup_mutex); 1597 mutex_unlock(&cgroup_mutex);
1729 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1598 mutex_unlock(&cgroup_tree_mutex);
1730 1599
1731 simple_xattrs_free(&cgrp->xattrs); 1600 kfree(opts.release_agent);
1601 kfree(opts.name);
1732 1602
1733 kill_litter_super(sb); 1603 if (ret)
1734 cgroup_free_root(root); 1604 return ERR_PTR(ret);
1605
1606 dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
1607 if (IS_ERR(dentry))
1608 cgroup_put(&root->cgrp);
1609 return dentry;
1610}
1611
1612static void cgroup_kill_sb(struct super_block *sb)
1613{
1614 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1615 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1616
1617 cgroup_put(&root->cgrp);
1618 kernfs_kill_sb(sb);
1735} 1619}
1736 1620
1737static struct file_system_type cgroup_fs_type = { 1621static struct file_system_type cgroup_fs_type = {
@@ -1743,57 +1627,6 @@ static struct file_system_type cgroup_fs_type = {
1743static struct kobject *cgroup_kobj; 1627static struct kobject *cgroup_kobj;
1744 1628
1745/** 1629/**
1746 * cgroup_path - generate the path of a cgroup
1747 * @cgrp: the cgroup in question
1748 * @buf: the buffer to write the path into
1749 * @buflen: the length of the buffer
1750 *
1751 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1752 *
1753 * We can't generate cgroup path using dentry->d_name, as accessing
1754 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1755 * inode's i_mutex, while on the other hand cgroup_path() can be called
1756 * with some irq-safe spinlocks held.
1757 */
1758int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1759{
1760 int ret = -ENAMETOOLONG;
1761 char *start;
1762
1763 if (!cgrp->parent) {
1764 if (strlcpy(buf, "/", buflen) >= buflen)
1765 return -ENAMETOOLONG;
1766 return 0;
1767 }
1768
1769 start = buf + buflen - 1;
1770 *start = '\0';
1771
1772 rcu_read_lock();
1773 do {
1774 const char *name = cgroup_name(cgrp);
1775 int len;
1776
1777 len = strlen(name);
1778 if ((start -= len) < buf)
1779 goto out;
1780 memcpy(start, name, len);
1781
1782 if (--start < buf)
1783 goto out;
1784 *start = '/';
1785
1786 cgrp = cgrp->parent;
1787 } while (cgrp->parent);
1788 ret = 0;
1789 memmove(buf, start, buf + buflen - start);
1790out:
1791 rcu_read_unlock();
1792 return ret;
1793}
1794EXPORT_SYMBOL_GPL(cgroup_path);
1795
1796/**
1797 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1630 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1798 * @task: target task 1631 * @task: target task
1799 * @buf: the buffer to write the path into 1632 * @buf: the buffer to write the path into
@@ -1804,49 +1637,55 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1804 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1637 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1805 * cgroup controller callbacks. 1638 * cgroup controller callbacks.
1806 * 1639 *
1807 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. 1640 * Return value is the same as kernfs_path().
1808 */ 1641 */
1809int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1642char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1810{ 1643{
1811 struct cgroupfs_root *root; 1644 struct cgroup_root *root;
1812 struct cgroup *cgrp; 1645 struct cgroup *cgrp;
1813 int hierarchy_id = 1, ret = 0; 1646 int hierarchy_id = 1;
1814 1647 char *path = NULL;
1815 if (buflen < 2)
1816 return -ENAMETOOLONG;
1817 1648
1818 mutex_lock(&cgroup_mutex); 1649 mutex_lock(&cgroup_mutex);
1650 down_read(&css_set_rwsem);
1819 1651
1820 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1652 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1821 1653
1822 if (root) { 1654 if (root) {
1823 cgrp = task_cgroup_from_root(task, root); 1655 cgrp = task_cgroup_from_root(task, root);
1824 ret = cgroup_path(cgrp, buf, buflen); 1656 path = cgroup_path(cgrp, buf, buflen);
1825 } else { 1657 } else {
1826 /* if no hierarchy exists, everyone is in "/" */ 1658 /* if no hierarchy exists, everyone is in "/" */
1827 memcpy(buf, "/", 2); 1659 if (strlcpy(buf, "/", buflen) < buflen)
1660 path = buf;
1828 } 1661 }
1829 1662
1663 up_read(&css_set_rwsem);
1830 mutex_unlock(&cgroup_mutex); 1664 mutex_unlock(&cgroup_mutex);
1831 return ret; 1665 return path;
1832} 1666}
1833EXPORT_SYMBOL_GPL(task_cgroup_path); 1667EXPORT_SYMBOL_GPL(task_cgroup_path);
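
With the kernfs_path()-style interface above, callers now test the returned
pointer instead of an -ENAMETOOLONG errno. A minimal caller sketch (the task
pointer and buffer are illustrative, not part of the patch):

	char buf[PATH_MAX];
	char *path;

	/* returns a pointer into buf on success, NULL if buf is too small */
	path = task_cgroup_path(tsk, buf, sizeof(buf));
	if (path)
		pr_info("task is in %s\n", path);
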
1834 1668
1835/* 1669/* used to track tasks and other necessary states during migration */
1836 * Control Group taskset
1837 */
1838struct task_and_cgroup {
1839 struct task_struct *task;
1840 struct cgroup *cgrp;
1841 struct css_set *cset;
1842};
1843
1844struct cgroup_taskset { 1670struct cgroup_taskset {
1845 struct task_and_cgroup single; 1671 /* the src and dst cset list running through cset->mg_node */
1846 struct flex_array *tc_array; 1672 struct list_head src_csets;
1847 int tc_array_len; 1673 struct list_head dst_csets;
1848 int idx; 1674
1849 struct cgroup *cur_cgrp; 1675 /*
1676 * Fields for cgroup_taskset_*() iteration.
1677 *
1678 * Before migration is committed, the target migration tasks are on
1679 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1680 * the csets on ->dst_csets. ->csets points to either ->src_csets
1681 * or ->dst_csets depending on whether migration is committed.
1682 *
1683 * ->cur_cset and ->cur_task point to the current task position
1684 * during iteration.
1685 */
1686 struct list_head *csets;
1687 struct css_set *cur_cset;
1688 struct task_struct *cur_task;
1850}; 1689};
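
The consumer side of this structure is not visible in the hunk; a hedged
sketch of how a controller's ->attach() method presumably walks the set with
the first/next iterators (callback name and body are illustrative):

	static void example_attach(struct cgroup_subsys_state *css,
				   struct cgroup_taskset *tset)
	{
		struct task_struct *task;

		for (task = cgroup_taskset_first(tset); task;
		     task = cgroup_taskset_next(tset)) {
			/* per-task attach work for this controller */
		}
	}
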
1851 1690
1852/** 1691/**
@@ -1857,15 +1696,11 @@ struct cgroup_taskset {
1857 */ 1696 */
1858struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1697struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1859{ 1698{
1860 if (tset->tc_array) { 1699 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1861 tset->idx = 0; 1700 tset->cur_task = NULL;
1862 return cgroup_taskset_next(tset); 1701
1863 } else { 1702 return cgroup_taskset_next(tset);
1864 tset->cur_cgrp = tset->single.cgrp;
1865 return tset->single.task;
1866 }
1867} 1703}
1868EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1869 1704
1870/** 1705/**
1871 * cgroup_taskset_next - iterate to the next task in taskset 1706 * cgroup_taskset_next - iterate to the next task in taskset
@@ -1876,48 +1711,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1876 */ 1711 */
1877struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1712struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1878{ 1713{
1879 struct task_and_cgroup *tc; 1714 struct css_set *cset = tset->cur_cset;
1715 struct task_struct *task = tset->cur_task;
1880 1716
1881 if (!tset->tc_array || tset->idx >= tset->tc_array_len) 1717 while (&cset->mg_node != tset->csets) {
1882 return NULL; 1718 if (!task)
1719 task = list_first_entry(&cset->mg_tasks,
1720 struct task_struct, cg_list);
1721 else
1722 task = list_next_entry(task, cg_list);
1883 1723
1884 tc = flex_array_get(tset->tc_array, tset->idx++); 1724 if (&task->cg_list != &cset->mg_tasks) {
1885 tset->cur_cgrp = tc->cgrp; 1725 tset->cur_cset = cset;
1886 return tc->task; 1726 tset->cur_task = task;
1887} 1727 return task;
1888EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1728 }
1889 1729
1890/** 1730 cset = list_next_entry(cset, mg_node);
1891 * cgroup_taskset_cur_css - return the matching css for the current task 1731 task = NULL;
1892 * @tset: taskset of interest 1732 }
1893 * @subsys_id: the ID of the target subsystem
1894 *
1895 * Return the css for the current (last returned) task of @tset for
1896 * subsystem specified by @subsys_id. This function must be preceded by
1897 * either cgroup_taskset_first() or cgroup_taskset_next().
1898 */
1899struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1900 int subsys_id)
1901{
1902 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1903}
1904EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1905 1733
1906/** 1734 return NULL;
1907 * cgroup_taskset_size - return the number of tasks in taskset
1908 * @tset: taskset of interest
1909 */
1910int cgroup_taskset_size(struct cgroup_taskset *tset)
1911{
1912 return tset->tc_array ? tset->tc_array_len : 1;
1913} 1735}
1914EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1915 1736
1916 1737/**
1917/*
1918 * cgroup_task_migrate - move a task from one cgroup to another. 1738 * cgroup_task_migrate - move a task from one cgroup to another.
1739 * @old_cgrp: the cgroup @tsk is being migrated from
1740 * @tsk: the task being migrated
1741 * @new_cset: the new css_set @tsk is being attached to
1919 * 1742 *
1920 * Must be called with cgroup_mutex and threadgroup locked. 1743 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1921 */ 1744 */
1922static void cgroup_task_migrate(struct cgroup *old_cgrp, 1745static void cgroup_task_migrate(struct cgroup *old_cgrp,
1923 struct task_struct *tsk, 1746 struct task_struct *tsk,
@@ -1925,6 +1748,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1925{ 1748{
1926 struct css_set *old_cset; 1749 struct css_set *old_cset;
1927 1750
1751 lockdep_assert_held(&cgroup_mutex);
1752 lockdep_assert_held(&css_set_rwsem);
1753
1928 /* 1754 /*
1929 * We are synchronized through threadgroup_lock() against PF_EXITING 1755 * We are synchronized through threadgroup_lock() against PF_EXITING
1930 * setting such that we can't race against cgroup_exit() changing the 1756 * setting such that we can't race against cgroup_exit() changing the
@@ -1933,15 +1759,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1933 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1759 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1934 old_cset = task_css_set(tsk); 1760 old_cset = task_css_set(tsk);
1935 1761
1936 task_lock(tsk); 1762 get_css_set(new_cset);
1937 rcu_assign_pointer(tsk->cgroups, new_cset); 1763 rcu_assign_pointer(tsk->cgroups, new_cset);
1938 task_unlock(tsk);
1939 1764
1940 /* Update the css_set linked lists if we're using them */ 1765 /*
1941 write_lock(&css_set_lock); 1766 * Use move_tail so that cgroup_taskset_first() still returns the
1942 if (!list_empty(&tsk->cg_list)) 1767 * leader after migration. This works because cgroup_migrate()
1943 list_move(&tsk->cg_list, &new_cset->tasks); 1768 * ensures that the dst_cset of the leader is the first on the
1944 write_unlock(&css_set_lock); 1769 * tset's dst_csets list.
1770 */
1771 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1945 1772
1946 /* 1773 /*
1947 * We just gained a reference on old_cset by taking it from the 1774 * We just gained a reference on old_cset by taking it from the
@@ -1949,100 +1776,199 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1949 * we're safe to drop it here; it will be freed under RCU. 1776 * we're safe to drop it here; it will be freed under RCU.
1950 */ 1777 */
1951 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1778 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1952 put_css_set(old_cset); 1779 put_css_set_locked(old_cset, false);
1953} 1780}
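
For clarity on the _locked suffix used above, a hedged contrast sketch
(assuming put_css_set() acquires css_set_rwsem internally while the _locked
variant requires the caller to hold it, as this function does):

	put_css_set(cset, false);		/* takes css_set_rwsem itself */

	down_write(&css_set_rwsem);
	put_css_set_locked(cset, false);	/* caller already holds the rwsem */
	up_write(&css_set_rwsem);
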
1954 1781
1955/** 1782/**
1956 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 1783 * cgroup_migrate_finish - cleanup after attach
1957 * @cgrp: the cgroup to attach to 1784 * @preloaded_csets: list of preloaded css_sets
1958 * @tsk: the task or the leader of the threadgroup to be attached
1959 * @threadgroup: attach the whole threadgroup?
1960 * 1785 *
1961 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1786 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1962 * task_lock of @tsk or each thread in the threadgroup individually in turn. 1787 * those functions for details.
1963 */ 1788 */
1964static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, 1789static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1965 bool threadgroup)
1966{ 1790{
1967 int retval, i, group_size; 1791 struct css_set *cset, *tmp_cset;
1968 struct cgroupfs_root *root = cgrp->root;
1969 struct cgroup_subsys_state *css, *failed_css = NULL;
1970 /* threadgroup list cursor and array */
1971 struct task_struct *leader = tsk;
1972 struct task_and_cgroup *tc;
1973 struct flex_array *group;
1974 struct cgroup_taskset tset = { };
1975 1792
1976 /* 1793 lockdep_assert_held(&cgroup_mutex);
1977 * step 0: in order to do expensive, possibly blocking operations for 1794
1978 * every thread, we cannot iterate the thread group list, since it needs 1795 down_write(&css_set_rwsem);
1979 * rcu or tasklist locked. instead, build an array of all threads in the 1796 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1980 * group - group_rwsem prevents new threads from appearing, and if 1797 cset->mg_src_cgrp = NULL;
1981 * threads exit, this will just be an over-estimate. 1798 cset->mg_dst_cset = NULL;
1982 */ 1799 list_del_init(&cset->mg_preload_node);
1983 if (threadgroup) 1800 put_css_set_locked(cset, false);
1984 group_size = get_nr_threads(tsk); 1801 }
1985 else 1802 up_write(&css_set_rwsem);
1986 group_size = 1; 1803}
1987 /* flex_array supports very large thread-groups better than kmalloc. */ 1804
1988 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1805/**
1989 if (!group) 1806 * cgroup_migrate_add_src - add a migration source css_set
1990 return -ENOMEM; 1807 * @src_cset: the source css_set to add
1991 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1808 * @dst_cgrp: the destination cgroup
1992 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); 1809 * @preloaded_csets: list of preloaded css_sets
1993 if (retval) 1810 *
1994 goto out_free_group_list; 1811 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1812 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1813 * up by cgroup_migrate_finish().
1814 *
1815 * This function may be called without holding threadgroup_lock even if the
1816 * target is a process. Threads may be created and destroyed but as long
1817 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1818 * the preloaded css_sets are guaranteed to cover all migrations.
1819 */
1820static void cgroup_migrate_add_src(struct css_set *src_cset,
1821 struct cgroup *dst_cgrp,
1822 struct list_head *preloaded_csets)
1823{
1824 struct cgroup *src_cgrp;
1825
1826 lockdep_assert_held(&cgroup_mutex);
1827 lockdep_assert_held(&css_set_rwsem);
1828
1829 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1830
1831 /* nothing to do if this cset already belongs to the cgroup */
1832 if (src_cgrp == dst_cgrp)
1833 return;
1834
1835 if (!list_empty(&src_cset->mg_preload_node))
1836 return;
1837
1838 WARN_ON(src_cset->mg_src_cgrp);
1839 WARN_ON(!list_empty(&src_cset->mg_tasks));
1840 WARN_ON(!list_empty(&src_cset->mg_node));
1841
1842 src_cset->mg_src_cgrp = src_cgrp;
1843 get_css_set(src_cset);
1844 list_add(&src_cset->mg_preload_node, preloaded_csets);
1845}
1846
1847/**
1848 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1849 * @dst_cgrp: the destination cgroup
1850 * @preloaded_csets: list of preloaded source css_sets
1851 *
1852 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1853 * have been preloaded to @preloaded_csets. This function looks up and
1854 * pins all destination css_sets, links each to its source, and puts them on
1855 * @preloaded_csets.
1856 *
1857 * This function must be called after cgroup_migrate_add_src() has been
1858 * called on each migration source css_set. After migration is performed
1859 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1860 * @preloaded_csets.
1861 */
1862static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1863 struct list_head *preloaded_csets)
1864{
1865 LIST_HEAD(csets);
1866 struct css_set *src_cset;
1867
1868 lockdep_assert_held(&cgroup_mutex);
1869
1870 /* look up the dst cset for each src cset and link it to src */
1871 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1872 struct css_set *dst_cset;
1873
1874 dst_cset = find_css_set(src_cset, dst_cgrp);
1875 if (!dst_cset)
1876 goto err;
1877
1878 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1879 src_cset->mg_dst_cset = dst_cset;
1880
1881 if (list_empty(&dst_cset->mg_preload_node))
1882 list_add(&dst_cset->mg_preload_node, &csets);
1883 else
1884 put_css_set(dst_cset, false);
1885 }
1886
1887 list_splice(&csets, preloaded_csets);
1888 return 0;
1889err:
1890 cgroup_migrate_finish(&csets);
1891 return -ENOMEM;
1892}
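
Taken together, add_src/prepare_dst/migrate/finish form a four-step protocol.
A condensed, hedged sketch for a single task, with locking as asserted by the
helpers (cgroup_attach_task() below is the in-tree instance of this pattern):

	LIST_HEAD(preloaded_csets);
	int ret;

	/* 1. pin and record the source cset (css_set_rwsem held) */
	down_read(&css_set_rwsem);
	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &preloaded_csets);
	up_read(&css_set_rwsem);

	/* 2. look up and pin the matching destination csets */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);

	/* 3. commit; past ->can_attach() this cannot fail */
	if (!ret)
		ret = cgroup_migrate(dst_cgrp, task, false);

	/* 4. drop the preloaded references either way */
	cgroup_migrate_finish(&preloaded_csets);
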
1893
1894/**
1895 * cgroup_migrate - migrate a process or task to a cgroup
1896 * @cgrp: the destination cgroup
1897 * @leader: the leader of the process or the task to migrate
1898 * @threadgroup: whether @leader points to the whole process or a single task
1899 *
1900 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1901 * process, the caller must be holding threadgroup_lock of @leader. The
1902 * caller is also responsible for invoking cgroup_migrate_add_src() and
1903 * cgroup_migrate_prepare_dst() on the targets before invoking this
1904 * function and following up with cgroup_migrate_finish().
1905 *
1906 * As long as a controller's ->can_attach() doesn't fail, this function is
1907 * guaranteed to succeed. This means that, excluding ->can_attach()
1908 * failure, when migrating multiple targets, the success or failure can be
1909 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
1910 * actually starting the migration.
1911 */
1912static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1913 bool threadgroup)
1914{
1915 struct cgroup_taskset tset = {
1916 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1917 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1918 .csets = &tset.src_csets,
1919 };
1920 struct cgroup_subsys_state *css, *failed_css = NULL;
1921 struct css_set *cset, *tmp_cset;
1922 struct task_struct *task, *tmp_task;
1923 int i, ret;
1995 1924
1996 i = 0;
1997 /* 1925 /*
1998 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1926 * Prevent freeing of tasks while we take a snapshot. Tasks that are
1999 * already PF_EXITING could be freed from underneath us unless we 1927 * already PF_EXITING could be freed from underneath us unless we
2000 * take an rcu_read_lock. 1928 * take an rcu_read_lock.
2001 */ 1929 */
1930 down_write(&css_set_rwsem);
2002 rcu_read_lock(); 1931 rcu_read_lock();
1932 task = leader;
2003 do { 1933 do {
2004 struct task_and_cgroup ent; 1934 /* @task either already exited or can't exit until the end */
1935 if (task->flags & PF_EXITING)
1936 goto next;
2005 1937
2006 /* @tsk either already exited or can't exit until the end */ 1938 /* leave @task alone if post_fork() hasn't linked it yet */
2007 if (tsk->flags & PF_EXITING) 1939 if (list_empty(&task->cg_list))
2008 goto next; 1940 goto next;
2009 1941
2010 /* as per above, nr_threads may decrease, but not increase. */ 1942 cset = task_css_set(task);
2011 BUG_ON(i >= group_size); 1943 if (!cset->mg_src_cgrp)
2012 ent.task = tsk;
2013 ent.cgrp = task_cgroup_from_root(tsk, root);
2014 /* nothing to do if this task is already in the cgroup */
2015 if (ent.cgrp == cgrp)
2016 goto next; 1944 goto next;
1945
2017 /* 1946 /*
2018 * saying GFP_ATOMIC has no effect here because we did prealloc 1947 * cgroup_taskset_first() must always return the leader.
2019 * earlier, but it's good form to communicate our expectations. 1948 * Take care to avoid disturbing the ordering.
2020 */ 1949 */
2021 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 1950 list_move_tail(&task->cg_list, &cset->mg_tasks);
2022 BUG_ON(retval != 0); 1951 if (list_empty(&cset->mg_node))
2023 i++; 1952 list_add_tail(&cset->mg_node, &tset.src_csets);
1953 if (list_empty(&cset->mg_dst_cset->mg_node))
1954 list_move_tail(&cset->mg_dst_cset->mg_node,
1955 &tset.dst_csets);
2024 next: 1956 next:
2025 if (!threadgroup) 1957 if (!threadgroup)
2026 break; 1958 break;
2027 } while_each_thread(leader, tsk); 1959 } while_each_thread(leader, task);
2028 rcu_read_unlock(); 1960 rcu_read_unlock();
2029 /* remember the number of threads in the array for later. */ 1961 up_write(&css_set_rwsem);
2030 group_size = i;
2031 tset.tc_array = group;
2032 tset.tc_array_len = group_size;
2033 1962
2034 /* methods shouldn't be called if no task is actually migrating */ 1963 /* methods shouldn't be called if no task is actually migrating */
2035 retval = 0; 1964 if (list_empty(&tset.src_csets))
2036 if (!group_size) 1965 return 0;
2037 goto out_free_group_list;
2038 1966
2039 /* 1967 /* check that we can legitimately attach to the cgroup */
2040 * step 1: check that we can legitimately attach to the cgroup.
2041 */
2042 for_each_css(css, i, cgrp) { 1968 for_each_css(css, i, cgrp) {
2043 if (css->ss->can_attach) { 1969 if (css->ss->can_attach) {
2044 retval = css->ss->can_attach(css, &tset); 1970 ret = css->ss->can_attach(css, &tset);
2045 if (retval) { 1971 if (ret) {
2046 failed_css = css; 1972 failed_css = css;
2047 goto out_cancel_attach; 1973 goto out_cancel_attach;
2048 } 1974 }
@@ -2050,70 +1976,91 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 } 1976 }
2051 1977
2052 /* 1978 /*
2053 * step 2: make sure css_sets exist for all threads to be migrated. 1979 * Now that we're guaranteed success, proceed to move all tasks to
2054 * we use find_css_set, which allocates a new one if necessary. 1980 * the new cgroup. There are no failure cases after here, so this
1981 * is the commit point.
2055 */ 1982 */
2056 for (i = 0; i < group_size; i++) { 1983 down_write(&css_set_rwsem);
2057 struct css_set *old_cset; 1984 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2058 1985 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2059 tc = flex_array_get(group, i); 1986 cgroup_task_migrate(cset->mg_src_cgrp, task,
2060 old_cset = task_css_set(tc->task); 1987 cset->mg_dst_cset);
2061 tc->cset = find_css_set(old_cset, cgrp);
2062 if (!tc->cset) {
2063 retval = -ENOMEM;
2064 goto out_put_css_set_refs;
2065 }
2066 } 1988 }
1989 up_write(&css_set_rwsem);
2067 1990
2068 /* 1991 /*
2069 * step 3: now that we're guaranteed success wrt the css_sets, 1992 * Migration is committed, all target tasks are now on dst_csets.
2070 * proceed to move all tasks to the new cgroup. There are no 1993 * Nothing is sensitive to fork() after this point. Notify
2071 * failure cases after here, so this is the commit point. 1994 * controllers that migration is complete.
2072 */ 1995 */
2073 for (i = 0; i < group_size; i++) { 1996 tset.csets = &tset.dst_csets;
2074 tc = flex_array_get(group, i);
2075 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2076 }
2077 /* nothing is sensitive to fork() after this point. */
2078 1997
2079 /*
2080 * step 4: do subsystem attach callbacks.
2081 */
2082 for_each_css(css, i, cgrp) 1998 for_each_css(css, i, cgrp)
2083 if (css->ss->attach) 1999 if (css->ss->attach)
2084 css->ss->attach(css, &tset); 2000 css->ss->attach(css, &tset);
2085 2001
2086 /* 2002 ret = 0;
2087 * step 5: success! and cleanup 2003 goto out_release_tset;
2088 */ 2004
2089 retval = 0;
2090out_put_css_set_refs:
2091 if (retval) {
2092 for (i = 0; i < group_size; i++) {
2093 tc = flex_array_get(group, i);
2094 if (!tc->cset)
2095 break;
2096 put_css_set(tc->cset);
2097 }
2098 }
2099out_cancel_attach: 2005out_cancel_attach:
2100 if (retval) { 2006 for_each_css(css, i, cgrp) {
2101 for_each_css(css, i, cgrp) { 2007 if (css == failed_css)
2102 if (css == failed_css) 2008 break;
2103 break; 2009 if (css->ss->cancel_attach)
2104 if (css->ss->cancel_attach) 2010 css->ss->cancel_attach(css, &tset);
2105 css->ss->cancel_attach(css, &tset);
2106 }
2107 } 2011 }
2108out_free_group_list: 2012out_release_tset:
2109 flex_array_free(group); 2013 down_write(&css_set_rwsem);
2110 return retval; 2014 list_splice_init(&tset.dst_csets, &tset.src_csets);
2015 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2016 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2017 list_del_init(&cset->mg_node);
2018 }
2019 up_write(&css_set_rwsem);
2020 return ret;
2021}
2022
2023/**
2024 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2025 * @dst_cgrp: the cgroup to attach to
2026 * @leader: the task or the leader of the threadgroup to be attached
2027 * @threadgroup: attach the whole threadgroup?
2028 *
2029 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2030 */
2031static int cgroup_attach_task(struct cgroup *dst_cgrp,
2032 struct task_struct *leader, bool threadgroup)
2033{
2034 LIST_HEAD(preloaded_csets);
2035 struct task_struct *task;
2036 int ret;
2037
2038 /* look up all src csets */
2039 down_read(&css_set_rwsem);
2040 rcu_read_lock();
2041 task = leader;
2042 do {
2043 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2044 &preloaded_csets);
2045 if (!threadgroup)
2046 break;
2047 } while_each_thread(leader, task);
2048 rcu_read_unlock();
2049 up_read(&css_set_rwsem);
2050
2051 /* prepare dst csets and commit */
2052 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2053 if (!ret)
2054 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2055
2056 cgroup_migrate_finish(&preloaded_csets);
2057 return ret;
2111} 2058}
2112 2059
2113/* 2060/*
2114 * Find the task_struct of the task to attach by vpid and pass it along to the 2061 * Find the task_struct of the task to attach by vpid and pass it along to the
2115 * function to attach either it or all tasks in its threadgroup. Will lock 2062 * function to attach either it or all tasks in its threadgroup. Will lock
2116 * cgroup_mutex and threadgroup; may take task_lock of task. 2063 * cgroup_mutex and threadgroup.
2117 */ 2064 */
2118static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2065static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2119{ 2066{
@@ -2198,12 +2145,19 @@ out_unlock_cgroup:
2198 */ 2145 */
2199int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2146int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2200{ 2147{
2201 struct cgroupfs_root *root; 2148 struct cgroup_root *root;
2202 int retval = 0; 2149 int retval = 0;
2203 2150
2204 mutex_lock(&cgroup_mutex); 2151 mutex_lock(&cgroup_mutex);
2205 for_each_active_root(root) { 2152 for_each_root(root) {
2206 struct cgroup *from_cgrp = task_cgroup_from_root(from, root); 2153 struct cgroup *from_cgrp;
2154
2155 if (root == &cgrp_dfl_root)
2156 continue;
2157
2158 down_read(&css_set_rwsem);
2159 from_cgrp = task_cgroup_from_root(from, root);
2160 up_read(&css_set_rwsem);
2207 2161
2208 retval = cgroup_attach_task(from_cgrp, tsk, false); 2162 retval = cgroup_attach_task(from_cgrp, tsk, false);
2209 if (retval) 2163 if (retval)
@@ -2228,16 +2182,17 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css,
2228} 2182}
2229 2183
2230static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2184static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2231 struct cftype *cft, const char *buffer) 2185 struct cftype *cft, char *buffer)
2232{ 2186{
2233 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); 2187 struct cgroup_root *root = css->cgroup->root;
2234 if (strlen(buffer) >= PATH_MAX) 2188
2235 return -EINVAL; 2189 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2236 if (!cgroup_lock_live_group(css->cgroup)) 2190 if (!cgroup_lock_live_group(css->cgroup))
2237 return -ENODEV; 2191 return -ENODEV;
2238 mutex_lock(&cgroup_root_mutex); 2192 spin_lock(&release_agent_path_lock);
2239 strcpy(css->cgroup->root->release_agent_path, buffer); 2193 strlcpy(root->release_agent_path, buffer,
2240 mutex_unlock(&cgroup_root_mutex); 2194 sizeof(root->release_agent_path));
2195 spin_unlock(&release_agent_path_lock);
2241 mutex_unlock(&cgroup_mutex); 2196 mutex_unlock(&cgroup_mutex);
2242 return 0; 2197 return 0;
2243} 2198}
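
The new spinlock implies a matching convention on the read side; a hedged
sketch of how a consumer of release_agent_path would now copy it out (the
local buffer is illustrative):

	char agent[PATH_MAX];

	spin_lock(&release_agent_path_lock);
	strlcpy(agent, root->release_agent_path, sizeof(agent));
	spin_unlock(&release_agent_path_lock);
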
@@ -2262,32 +2217,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2262 return 0; 2217 return 0;
2263} 2218}
2264 2219
2265/* A buffer size big enough for numbers or short strings */ 2220static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2266#define CGROUP_LOCAL_BUFFER_SIZE 64 2221 size_t nbytes, loff_t off)
2267
2268static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2269 size_t nbytes, loff_t *ppos)
2270{ 2222{
2271 struct cfent *cfe = __d_cfe(file->f_dentry); 2223 struct cgroup *cgrp = of->kn->parent->priv;
2272 struct cftype *cft = __d_cft(file->f_dentry); 2224 struct cftype *cft = of->kn->priv;
2273 struct cgroup_subsys_state *css = cfe->css; 2225 struct cgroup_subsys_state *css;
2274 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2275 char *buf;
2276 int ret; 2226 int ret;
2277 2227
2278 if (nbytes >= max_bytes) 2228 /*
2279 return -E2BIG; 2229 * kernfs guarantees that a file isn't deleted with operations in
2280 2230 * flight, which means that the matching css is and stays alive and
2281 buf = kmalloc(nbytes + 1, GFP_KERNEL); 2231 * doesn't need to be pinned. The RCU locking is not necessary
2282 if (!buf) 2232 * either. It's just for the convenience of using cgroup_css().
2283 return -ENOMEM; 2233 */
2284 2234 rcu_read_lock();
2285 if (copy_from_user(buf, userbuf, nbytes)) { 2235 css = cgroup_css(cgrp, cft->ss);
2286 ret = -EFAULT; 2236 rcu_read_unlock();
2287 goto out_free;
2288 }
2289
2290 buf[nbytes] = '\0';
2291 2237
2292 if (cft->write_string) { 2238 if (cft->write_string) {
2293 ret = cft->write_string(css, cft, strstrip(buf)); 2239 ret = cft->write_string(css, cft, strstrip(buf));
@@ -2306,53 +2252,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2306 } else { 2252 } else {
2307 ret = -EINVAL; 2253 ret = -EINVAL;
2308 } 2254 }
2309out_free: 2255
2310 kfree(buf);
2311 return ret ?: nbytes; 2256 return ret ?: nbytes;
2312} 2257}
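
A hedged sketch of a cftype this dispatcher would serve (the handler and file
name are illustrative, not from the patch). kernfs hands the handler a
NUL-terminated kernel buffer, already length-checked against
atomic_write_len, so no copy_from_user() is needed here:

	static int example_write_string(struct cgroup_subsys_state *css,
					struct cftype *cft, char *buf)
	{
		/* buf was strstrip()ed by cgroup_file_write() above */
		return 0;
	}

	static struct cftype example_files[] = {
		{
			.name = "example",
			.write_string = example_write_string,
		},
		{ }	/* terminate */
	};
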
2313 2258
2314/*
2315 * seqfile ops/methods for returning structured data. Currently just
2316 * supports string->u64 maps, but can be extended in future.
2317 */
2318
2319static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2259static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2320{ 2260{
2321 struct cftype *cft = seq_cft(seq); 2261 return seq_cft(seq)->seq_start(seq, ppos);
2322
2323 if (cft->seq_start) {
2324 return cft->seq_start(seq, ppos);
2325 } else {
2326 /*
2327 * The same behavior and code as single_open(). Returns
2328 * !NULL if pos is at the beginning; otherwise, NULL.
2329 */
2330 return NULL + !*ppos;
2331 }
2332} 2262}
2333 2263
2334static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2264static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2335{ 2265{
2336 struct cftype *cft = seq_cft(seq); 2266 return seq_cft(seq)->seq_next(seq, v, ppos);
2337
2338 if (cft->seq_next) {
2339 return cft->seq_next(seq, v, ppos);
2340 } else {
2341 /*
2342 * The same behavior and code as single_open(), always
2343 * terminate after the initial read.
2344 */
2345 ++*ppos;
2346 return NULL;
2347 }
2348} 2267}
2349 2268
2350static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2269static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2351{ 2270{
2352 struct cftype *cft = seq_cft(seq); 2271 seq_cft(seq)->seq_stop(seq, v);
2353
2354 if (cft->seq_stop)
2355 cft->seq_stop(seq, v);
2356} 2272}
2357 2273
2358static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2274static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2372,96 +2288,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2372 return 0; 2288 return 0;
2373} 2289}
2374 2290
2375static struct seq_operations cgroup_seq_operations = { 2291static struct kernfs_ops cgroup_kf_single_ops = {
2376 .start = cgroup_seqfile_start, 2292 .atomic_write_len = PAGE_SIZE,
2377 .next = cgroup_seqfile_next, 2293 .write = cgroup_file_write,
2378 .stop = cgroup_seqfile_stop, 2294 .seq_show = cgroup_seqfile_show,
2379 .show = cgroup_seqfile_show,
2380}; 2295};
2381 2296
2382static int cgroup_file_open(struct inode *inode, struct file *file) 2297static struct kernfs_ops cgroup_kf_ops = {
2383{ 2298 .atomic_write_len = PAGE_SIZE,
2384 struct cfent *cfe = __d_cfe(file->f_dentry); 2299 .write = cgroup_file_write,
2385 struct cftype *cft = __d_cft(file->f_dentry); 2300 .seq_start = cgroup_seqfile_start,
2386 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2301 .seq_next = cgroup_seqfile_next,
2387 struct cgroup_subsys_state *css; 2302 .seq_stop = cgroup_seqfile_stop,
2388 struct cgroup_open_file *of; 2303 .seq_show = cgroup_seqfile_show,
2389 int err; 2304};
2390
2391 err = generic_file_open(inode, file);
2392 if (err)
2393 return err;
2394
2395 /*
2396 * If the file belongs to a subsystem, pin the css. Will be
2397 * unpinned either on open failure or release. This ensures that
2398 * @css stays alive for all file operations.
2399 */
2400 rcu_read_lock();
2401 css = cgroup_css(cgrp, cft->ss);
2402 if (cft->ss && !css_tryget(css))
2403 css = NULL;
2404 rcu_read_unlock();
2405
2406 if (!css)
2407 return -ENODEV;
2408
2409 /*
2410 * @cfe->css is used by read/write/close to determine the
2411 * associated css. @file->private_data would be a better place but
2412 * that's already used by seqfile. Multiple accessors may use it
2413 * simultaneously which is okay as the association never changes.
2414 */
2415 WARN_ON_ONCE(cfe->css && cfe->css != css);
2416 cfe->css = css;
2417
2418 of = __seq_open_private(file, &cgroup_seq_operations,
2419 sizeof(struct cgroup_open_file));
2420 if (of) {
2421 of->cfe = cfe;
2422 return 0;
2423 }
2424
2425 if (css->ss)
2426 css_put(css);
2427 return -ENOMEM;
2428}
2429
2430static int cgroup_file_release(struct inode *inode, struct file *file)
2431{
2432 struct cfent *cfe = __d_cfe(file->f_dentry);
2433 struct cgroup_subsys_state *css = cfe->css;
2434
2435 if (css->ss)
2436 css_put(css);
2437 return seq_release_private(inode, file);
2438}
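
How a cftype gets bound to one of the two kernfs op tables above is outside
this hunk; a hedged sketch of the presumed selection at cftype registration
time:

	/*
	 * cftypes that supply their own seq iterators need the full table;
	 * plain ->seq_show files get the single_open()-style one.
	 */
	if (cft->seq_start)
		cft->kf_ops = &cgroup_kf_ops;
	else
		cft->kf_ops = &cgroup_kf_single_ops;
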
2439 2305
2440/* 2306/*
2441 * cgroup_rename - Only allow simple rename of directories in place. 2307 * cgroup_rename - Only allow simple rename of directories in place.
2442 */ 2308 */
2443static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2309static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2444 struct inode *new_dir, struct dentry *new_dentry) 2310 const char *new_name_str)
2445{ 2311{
2312 struct cgroup *cgrp = kn->priv;
2446 int ret; 2313 int ret;
2447 struct cgroup_name *name, *old_name;
2448 struct cgroup *cgrp;
2449
2450 /*
2451 * It's convenient to use parent dir's i_mutex to protect
2452 * cgrp->name.
2453 */
2454 lockdep_assert_held(&old_dir->i_mutex);
2455 2314
2456 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2315 if (kernfs_type(kn) != KERNFS_DIR)
2457 return -ENOTDIR; 2316 return -ENOTDIR;
2458 if (new_dentry->d_inode) 2317 if (kn->parent != new_parent)
2459 return -EEXIST;
2460 if (old_dir != new_dir)
2461 return -EIO; 2318 return -EIO;
2462 2319
2463 cgrp = __d_cgrp(old_dentry);
2464
2465 /* 2320 /*
2466 * This isn't a proper migration and its usefulness is very 2321 * This isn't a proper migration and its usefulness is very
2467 * limited. Disallow if sane_behavior. 2322 * limited. Disallow if sane_behavior.
@@ -2469,218 +2324,40 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2469 if (cgroup_sane_behavior(cgrp)) 2324 if (cgroup_sane_behavior(cgrp))
2470 return -EPERM; 2325 return -EPERM;
2471 2326
2472 name = cgroup_alloc_name(new_dentry); 2327 /*
2473 if (!name) 2328 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2474 return -ENOMEM; 2329 * active_ref. kernfs_rename() doesn't require active_ref
2475 2330 * protection. Break them before grabbing cgroup_tree_mutex.
2476 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2331 */
2477 if (ret) { 2332 kernfs_break_active_protection(new_parent);
2478 kfree(name); 2333 kernfs_break_active_protection(kn);
2479 return ret;
2480 }
2481
2482 old_name = rcu_dereference_protected(cgrp->name, true);
2483 rcu_assign_pointer(cgrp->name, name);
2484
2485 kfree_rcu(old_name, rcu_head);
2486 return 0;
2487}
2488
2489static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2490{
2491 if (S_ISDIR(dentry->d_inode->i_mode))
2492 return &__d_cgrp(dentry)->xattrs;
2493 else
2494 return &__d_cfe(dentry)->xattrs;
2495}
2496
2497static inline int xattr_enabled(struct dentry *dentry)
2498{
2499 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2500 return root->flags & CGRP_ROOT_XATTR;
2501}
2502
2503static bool is_valid_xattr(const char *name)
2504{
2505 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2506 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2507 return true;
2508 return false;
2509}
2510
2511static int cgroup_setxattr(struct dentry *dentry, const char *name,
2512 const void *val, size_t size, int flags)
2513{
2514 if (!xattr_enabled(dentry))
2515 return -EOPNOTSUPP;
2516 if (!is_valid_xattr(name))
2517 return -EINVAL;
2518 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2519}
2520
2521static int cgroup_removexattr(struct dentry *dentry, const char *name)
2522{
2523 if (!xattr_enabled(dentry))
2524 return -EOPNOTSUPP;
2525 if (!is_valid_xattr(name))
2526 return -EINVAL;
2527 return simple_xattr_remove(__d_xattrs(dentry), name);
2528}
2529
2530static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2531 void *buf, size_t size)
2532{
2533 if (!xattr_enabled(dentry))
2534 return -EOPNOTSUPP;
2535 if (!is_valid_xattr(name))
2536 return -EINVAL;
2537 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2538}
2539
2540static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2541{
2542 if (!xattr_enabled(dentry))
2543 return -EOPNOTSUPP;
2544 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2545}
2546
2547static const struct file_operations cgroup_file_operations = {
2548 .read = seq_read,
2549 .write = cgroup_file_write,
2550 .llseek = generic_file_llseek,
2551 .open = cgroup_file_open,
2552 .release = cgroup_file_release,
2553};
2554
2555static const struct inode_operations cgroup_file_inode_operations = {
2556 .setxattr = cgroup_setxattr,
2557 .getxattr = cgroup_getxattr,
2558 .listxattr = cgroup_listxattr,
2559 .removexattr = cgroup_removexattr,
2560};
2561
2562static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = simple_lookup,
2564 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename,
2567 .setxattr = cgroup_setxattr,
2568 .getxattr = cgroup_getxattr,
2569 .listxattr = cgroup_listxattr,
2570 .removexattr = cgroup_removexattr,
2571};
2572
2573static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2574 struct super_block *sb)
2575{
2576 struct inode *inode;
2577
2578 if (!dentry)
2579 return -ENOENT;
2580 if (dentry->d_inode)
2581 return -EEXIST;
2582
2583 inode = cgroup_new_inode(mode, sb);
2584 if (!inode)
2585 return -ENOMEM;
2586
2587 if (S_ISDIR(mode)) {
2588 inode->i_op = &cgroup_dir_inode_operations;
2589 inode->i_fop = &simple_dir_operations;
2590
2591 /* start off with i_nlink == 2 (for "." entry) */
2592 inc_nlink(inode);
2593 inc_nlink(dentry->d_parent->d_inode);
2594
2595 /*
2596 * Control reaches here with cgroup_mutex held.
2597 * @inode->i_mutex should nest outside cgroup_mutex but we
2598 * want to populate it immediately without releasing
2599 * cgroup_mutex. As @inode isn't visible to anyone else
2600 * yet, trylock will always succeed without affecting
2601 * lockdep checks.
2602 */
2603 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2604 } else if (S_ISREG(mode)) {
2605 inode->i_size = 0;
2606 inode->i_fop = &cgroup_file_operations;
2607 inode->i_op = &cgroup_file_inode_operations;
2608 }
2609 d_instantiate(dentry, inode);
2610 dget(dentry); /* Extra count - pin the dentry in core */
2611 return 0;
2612}
2613
2614/**
2615 * cgroup_file_mode - deduce file mode of a control file
2616 * @cft: the control file in question
2617 *
2618 * returns cft->mode if ->mode is not 0
2619 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2620 * returns S_IRUGO if it has only a read handler
2621 * returns S_IWUSR if it has only a write handler
2622 */
2623static umode_t cgroup_file_mode(const struct cftype *cft)
2624{
2625 umode_t mode = 0;
2626 2334
2627 if (cft->mode) 2335 mutex_lock(&cgroup_tree_mutex);
2628 return cft->mode; 2336 mutex_lock(&cgroup_mutex);
2629 2337
2630 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 2338 ret = kernfs_rename(kn, new_parent, new_name_str);
2631 mode |= S_IRUGO;
2632 2339
2633 if (cft->write_u64 || cft->write_s64 || cft->write_string || 2340 mutex_unlock(&cgroup_mutex);
2634 cft->trigger) 2341 mutex_unlock(&cgroup_tree_mutex);
2635 mode |= S_IWUSR;
2636 2342
2637 return mode; 2343 kernfs_unbreak_active_protection(kn);
2344 kernfs_unbreak_active_protection(new_parent);
2345 return ret;
2638} 2346}
2639 2347
2640static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2348static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2641{ 2349{
2642 struct dentry *dir = cgrp->dentry; 2350 char name[CGROUP_FILE_NAME_MAX];
2643 struct cgroup *parent = __d_cgrp(dir); 2351 struct kernfs_node *kn;
2644 struct dentry *dentry; 2352 struct lock_class_key *key = NULL;
2645 struct cfent *cfe;
2646 int error;
2647 umode_t mode;
2648 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2649
2650 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2651 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2652 strcpy(name, cft->ss->name);
2653 strcat(name, ".");
2654 }
2655 strcat(name, cft->name);
2656
2657 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2658
2659 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2660 if (!cfe)
2661 return -ENOMEM;
2662 2353
2663 dentry = lookup_one_len(name, dir, strlen(name)); 2354#ifdef CONFIG_DEBUG_LOCK_ALLOC
2664 if (IS_ERR(dentry)) { 2355 key = &cft->lockdep_key;
2665 error = PTR_ERR(dentry); 2356#endif
2666 goto out; 2357 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2667 } 2358 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2668 2359 NULL, false, key);
2669 cfe->type = (void *)cft; 2360 return PTR_ERR_OR_ZERO(kn);
2670 cfe->dentry = dentry;
2671 dentry->d_fsdata = cfe;
2672 simple_xattrs_init(&cfe->xattrs);
2673
2674 mode = cgroup_file_mode(cft);
2675 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2676 if (!error) {
2677 list_add_tail(&cfe->node, &parent->files);
2678 cfe = NULL;
2679 }
2680 dput(dentry);
2681out:
2682 kfree(cfe);
2683 return error;
2684} 2361}
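
cgroup_file_name(), called above, is outside this hunk; it presumably folds
the old strcpy/strcat prefixing into one helper. A hedged reconstruction
using the CGROUP_FILE_NAME_MAX bound defined earlier:

	static char *cgroup_file_name(struct cgroup *cgrp,
				      const struct cftype *cft, char *buf)
	{
		if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
		    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
			snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
				 cft->ss->name, cft->name);
		else
			strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
		return buf;
	}
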
2685 2362
2686/** 2363/**
@@ -2700,11 +2377,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2700 struct cftype *cft; 2377 struct cftype *cft;
2701 int ret; 2378 int ret;
2702 2379
2703 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 2380 lockdep_assert_held(&cgroup_tree_mutex);
2704 lockdep_assert_held(&cgroup_mutex);
2705 2381
2706 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2382 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2707 /* does cft->flags tell us to skip this file on @cgrp? */ 2383 /* does cft->flags tell us to skip this file on @cgrp? */
2384 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2385 continue;
2708 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2386 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2709 continue; 2387 continue;
2710 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2388 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
@@ -2726,44 +2404,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2726 return 0; 2404 return 0;
2727} 2405}
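As an illustration of the flag filtering at the top of cgroup_addrm_files(): a cftype created only on the root cgroup and skipped entirely on sane-behavior hierarchies could look like the sketch below, mirroring the release_agent entry later in this file (the name and handler are assumptions):

	static struct cftype legacy_root_cft = {
		.name		= "legacy_knob",	/* hypothetical */
		.flags		= CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
		.seq_show	= legacy_knob_show,	/* hypothetical */
	};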
2728 2406
2729static void cgroup_cfts_prepare(void) 2407static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2730 __acquires(&cgroup_mutex)
2731{
2732 /*
2733 * Thanks to the entanglement with vfs inode locking, we can't walk
2734 * the existing cgroups under cgroup_mutex and create files.
2735 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2736 * lock before calling cgroup_addrm_files().
2737 */
2738 mutex_lock(&cgroup_mutex);
2739}
2740
2741static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2742 __releases(&cgroup_mutex)
2743{ 2408{
2744 LIST_HEAD(pending); 2409 LIST_HEAD(pending);
2745 struct cgroup_subsys *ss = cfts[0].ss; 2410 struct cgroup_subsys *ss = cfts[0].ss;
2746 struct cgroup *root = &ss->root->top_cgroup; 2411 struct cgroup *root = &ss->root->cgrp;
2747 struct super_block *sb = ss->root->sb;
2748 struct dentry *prev = NULL;
2749 struct inode *inode;
2750 struct cgroup_subsys_state *css; 2412 struct cgroup_subsys_state *css;
2751 u64 update_before;
2752 int ret = 0; 2413 int ret = 0;
2753 2414
2754 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2415 lockdep_assert_held(&cgroup_tree_mutex);
2755 if (!cfts || ss->root == &cgroup_dummy_root ||
2756 !atomic_inc_not_zero(&sb->s_active)) {
2757 mutex_unlock(&cgroup_mutex);
2758 return 0;
2759 }
2760 2416
2761 /* 2417 /* don't bother if @ss isn't attached */
2762 * All cgroups which are created after we drop cgroup_mutex will 2418 if (ss->root == &cgrp_dfl_root)
2763 * have the updated set of files, so we only need to update the 2419 return 0;
2764 * cgroups created before the current @cgroup_serial_nr_next.
2765 */
2766 update_before = cgroup_serial_nr_next;
2767 2420
2768 /* add/rm files for all cgroups created before */ 2421 /* add/rm files for all cgroups created before */
2769 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2422 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,62 +2425,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2772 if (cgroup_is_dead(cgrp)) 2425 if (cgroup_is_dead(cgrp))
2773 continue; 2426 continue;
2774 2427
2775 inode = cgrp->dentry->d_inode; 2428 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2776 dget(cgrp->dentry);
2777 dput(prev);
2778 prev = cgrp->dentry;
2779
2780 mutex_unlock(&cgroup_mutex);
2781 mutex_lock(&inode->i_mutex);
2782 mutex_lock(&cgroup_mutex);
2783 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2784 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2785 mutex_unlock(&inode->i_mutex);
2786 if (ret) 2429 if (ret)
2787 break; 2430 break;
2788 } 2431 }
2789 mutex_unlock(&cgroup_mutex); 2432
2790 dput(prev); 2433 if (is_add && !ret)
2791 deactivate_super(sb); 2434 kernfs_activate(root->kn);
2792 return ret; 2435 return ret;
2793} 2436}
2794 2437
2795/** 2438static void cgroup_exit_cftypes(struct cftype *cfts)
2796 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2797 * @ss: target cgroup subsystem
2798 * @cfts: zero-length name terminated array of cftypes
2799 *
2800 * Register @cfts to @ss. Files described by @cfts are created for all
2801 * existing cgroups to which @ss is attached and all future cgroups will
2802 * have them too. This function can be called anytime whether @ss is
2803 * attached or not.
2804 *
2805 * Returns 0 on successful registration, -errno on failure. Note that this
2806 * function currently returns 0 as long as @cfts registration is successful
2807 * even if some file creation attempts on existing cgroups fail.
2808 */
2809int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2810{ 2439{
2811 struct cftype_set *set;
2812 struct cftype *cft; 2440 struct cftype *cft;
2813 int ret;
2814 2441
2815 set = kzalloc(sizeof(*set), GFP_KERNEL); 2442 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2816 if (!set) 2443 /* free copy for custom atomic_write_len, see init_cftypes() */
2817 return -ENOMEM; 2444 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2445 kfree(cft->kf_ops);
2446 cft->kf_ops = NULL;
2447 cft->ss = NULL;
2448 }
2449}
2818 2450
2819 for (cft = cfts; cft->name[0] != '\0'; cft++) 2451static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2452{
2453 struct cftype *cft;
2454
2455 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2456 struct kernfs_ops *kf_ops;
2457
2458 WARN_ON(cft->ss || cft->kf_ops);
2459
2460 if (cft->seq_start)
2461 kf_ops = &cgroup_kf_ops;
2462 else
2463 kf_ops = &cgroup_kf_single_ops;
2464
2465 /*
2466 * Ugh... if @cft wants a custom max_write_len, we need to
2467 * make a copy of kf_ops to set its atomic_write_len.
2468 */
2469 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2470 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2471 if (!kf_ops) {
2472 cgroup_exit_cftypes(cfts);
2473 return -ENOMEM;
2474 }
2475 kf_ops->atomic_write_len = cft->max_write_len;
2476 }
2477
2478 cft->kf_ops = kf_ops;
2820 cft->ss = ss; 2479 cft->ss = ss;
2480 }
2821 2481
2822 cgroup_cfts_prepare(); 2482 return 0;
2823 set->cfts = cfts; 2483}
2824 list_add_tail(&set->node, &ss->cftsets); 2484
2825 ret = cgroup_cfts_commit(cfts, true); 2485static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2826 if (ret) 2486{
2827 cgroup_rm_cftypes(cfts); 2487 lockdep_assert_held(&cgroup_tree_mutex);
2828 return ret; 2488
2489 if (!cfts || !cfts[0].ss)
2490 return -ENOENT;
2491
2492 list_del(&cfts->node);
2493 cgroup_apply_cftypes(cfts, false);
2494 cgroup_exit_cftypes(cfts);
2495 return 0;
2829} 2496}
2830EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
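The kf_ops duplication in cgroup_init_cftypes() above is triggered by any cftype whose ->max_write_len is neither 0 nor PAGE_SIZE. The release_agent file later in this patch is such a case; a minimal sketch with assumed handlers:

	static struct cftype long_write_cft = {
		.name		= "release_agent",
		.seq_show	= demo_show,		/* assumed handler */
		.write_string	= demo_write,		/* assumed handler */
		.max_write_len	= PATH_MAX - 1,		/* != PAGE_SIZE: kf_ops copied */
	};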
2831 2497
2832/** 2498/**
2833 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2499 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
@@ -2842,24 +2508,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2842 */ 2508 */
2843int cgroup_rm_cftypes(struct cftype *cfts) 2509int cgroup_rm_cftypes(struct cftype *cfts)
2844{ 2510{
2845 struct cftype_set *set; 2511 int ret;
2846 2512
2847 if (!cfts || !cfts[0].ss) 2513 mutex_lock(&cgroup_tree_mutex);
2848 return -ENOENT; 2514 ret = cgroup_rm_cftypes_locked(cfts);
2515 mutex_unlock(&cgroup_tree_mutex);
2516 return ret;
2517}
2849 2518
2850 cgroup_cfts_prepare(); 2519/**
2520 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2521 * @ss: target cgroup subsystem
2522 * @cfts: zero-length name terminated array of cftypes
2523 *
2524 * Register @cfts to @ss. Files described by @cfts are created for all
2525 * existing cgroups to which @ss is attached and all future cgroups will
2526 * have them too. This function can be called anytime whether @ss is
2527 * attached or not.
2528 *
2529 * Returns 0 on successful registration, -errno on failure. Note that this
2530 * function currently returns 0 as long as @cfts registration is successful
2531 * even if some file creation attempts on existing cgroups fail.
2532 */
2533int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2534{
2535 int ret;
2851 2536
2852 list_for_each_entry(set, &cfts[0].ss->cftsets, node) { 2537 if (!cfts || cfts[0].name[0] == '\0')
2853 if (set->cfts == cfts) { 2538 return 0;
2854 list_del(&set->node); 2539
2855 kfree(set); 2540 ret = cgroup_init_cftypes(ss, cfts);
2856 cgroup_cfts_commit(cfts, false); 2541 if (ret)
2857 return 0; 2542 return ret;
2858 } 2543
2859 } 2544 mutex_lock(&cgroup_tree_mutex);
2860 2545
2861 cgroup_cfts_commit(NULL, false); 2546 list_add_tail(&cfts->node, &ss->cfts);
2862 return -ENOENT; 2547 ret = cgroup_apply_cftypes(cfts, true);
2548 if (ret)
2549 cgroup_rm_cftypes_locked(cfts);
2550
2551 mutex_unlock(&cgroup_tree_mutex);
2552 return ret;
2863} 2553}
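A registration sketch for the reworked cgroup_add_cftypes(); the controller, file names and handlers are hypothetical, but the zero-named terminating entry is what both cgroup_init_cftypes() and cgroup_addrm_files() iterate up to:

	static struct cftype demo_files[] = {
		{
			.name		= "usage",
			.read_u64	= demo_usage_read,	/* assumed */
		},
		{
			.name		= "limit",
			.read_u64	= demo_limit_read,	/* assumed */
			.write_u64	= demo_limit_write,	/* assumed */
		},
		{ }	/* terminate */
	};

	/* files appear on all existing and future cgroups of the
	 * hierarchy @demo_subsys is attached to */
	static int __init demo_init(void)
	{
		return cgroup_add_cftypes(&demo_subsys, demo_files);
	}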
2864 2554
2865/** 2555/**
@@ -2868,57 +2558,18 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2868 * 2558 *
2869 * Return the number of tasks in the cgroup. 2559 * Return the number of tasks in the cgroup.
2870 */ 2560 */
2871int cgroup_task_count(const struct cgroup *cgrp) 2561static int cgroup_task_count(const struct cgroup *cgrp)
2872{ 2562{
2873 int count = 0; 2563 int count = 0;
2874 struct cgrp_cset_link *link; 2564 struct cgrp_cset_link *link;
2875 2565
2876 read_lock(&css_set_lock); 2566 down_read(&css_set_rwsem);
2877 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2567 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2878 count += atomic_read(&link->cset->refcount); 2568 count += atomic_read(&link->cset->refcount);
2879 read_unlock(&css_set_lock); 2569 up_read(&css_set_rwsem);
2880 return count; 2570 return count;
2881} 2571}
2882 2572
2883/*
2884 * To reduce the fork() overhead for systems that are not actually using
2885 * their cgroups capability, we don't maintain the lists running through
2886 * each css_set to its tasks until we see the list actually used - in other
2887 * words after the first call to css_task_iter_start().
2888 */
2889static void cgroup_enable_task_cg_lists(void)
2890{
2891 struct task_struct *p, *g;
2892 write_lock(&css_set_lock);
2893 use_task_css_set_links = 1;
2894 /*
2895 * We need tasklist_lock because RCU is not safe against
2896 * while_each_thread(). Besides, a forking task that has passed
2897 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2898 * is not guaranteed to have its child immediately visible in the
2899 * tasklist if we walk through it with RCU.
2900 */
2901 read_lock(&tasklist_lock);
2902 do_each_thread(g, p) {
2903 task_lock(p);
2904 /*
2905 * We should check if the process is exiting, otherwise
2906 * it will race with cgroup_exit() in that the list
2907 * entry won't be deleted though the process has exited.
2908 * Do it while holding siglock so that we don't end up
2909 * racing against cgroup_exit().
2910 */
2911 spin_lock_irq(&p->sighand->siglock);
2912 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2913 list_add(&p->cg_list, &task_css_set(p)->tasks);
2914 spin_unlock_irq(&p->sighand->siglock);
2915
2916 task_unlock(p);
2917 } while_each_thread(g, p);
2918 read_unlock(&tasklist_lock);
2919 write_unlock(&css_set_lock);
2920}
2921
2922/** 2573/**
2923 * css_next_child - find the next child of a given css 2574 * css_next_child - find the next child of a given css
2924 * @pos_css: the current position (%NULL to initiate traversal) 2575 * @pos_css: the current position (%NULL to initiate traversal)
@@ -2937,7 +2588,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2937 struct cgroup *cgrp = parent_css->cgroup; 2588 struct cgroup *cgrp = parent_css->cgroup;
2938 struct cgroup *next; 2589 struct cgroup *next;
2939 2590
2940 cgroup_assert_mutex_or_rcu_locked(); 2591 cgroup_assert_mutexes_or_rcu_locked();
2941 2592
2942 /* 2593 /*
2943 * @pos could already have been removed. Once a cgroup is removed, 2594 * @pos could already have been removed. Once a cgroup is removed,
@@ -2973,7 +2624,6 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2973 2624
2974 return cgroup_css(next, parent_css->ss); 2625 return cgroup_css(next, parent_css->ss);
2975} 2626}
2976EXPORT_SYMBOL_GPL(css_next_child);
2977 2627
2978/** 2628/**
2979 * css_next_descendant_pre - find the next descendant for pre-order walk 2629 * css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2995,7 +2645,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2995{ 2645{
2996 struct cgroup_subsys_state *next; 2646 struct cgroup_subsys_state *next;
2997 2647
2998 cgroup_assert_mutex_or_rcu_locked(); 2648 cgroup_assert_mutexes_or_rcu_locked();
2999 2649
3000 /* if first iteration, visit @root */ 2650 /* if first iteration, visit @root */
3001 if (!pos) 2651 if (!pos)
@@ -3016,7 +2666,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3016 2666
3017 return NULL; 2667 return NULL;
3018} 2668}
3019EXPORT_SYMBOL_GPL(css_next_descendant_pre);
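Caller-side usage of the walk is unchanged; per cgroup_assert_mutexes_or_rcu_locked(), the iteration must run under cgroup_mutex, cgroup_tree_mutex or an RCU read lock. A sketch, with @root_css standing in for any live css:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		/* @root_css itself is visited first, then each descendant */
	}
	rcu_read_unlock();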
3020 2669
3021/** 2670/**
3022 * css_rightmost_descendant - return the rightmost descendant of a css 2671 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -3036,7 +2685,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3036{ 2685{
3037 struct cgroup_subsys_state *last, *tmp; 2686 struct cgroup_subsys_state *last, *tmp;
3038 2687
3039 cgroup_assert_mutex_or_rcu_locked(); 2688 cgroup_assert_mutexes_or_rcu_locked();
3040 2689
3041 do { 2690 do {
3042 last = pos; 2691 last = pos;
@@ -3048,7 +2697,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3048 2697
3049 return last; 2698 return last;
3050} 2699}
3051EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3052 2700
3053static struct cgroup_subsys_state * 2701static struct cgroup_subsys_state *
3054css_leftmost_descendant(struct cgroup_subsys_state *pos) 2702css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3084,7 +2732,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3084{ 2732{
3085 struct cgroup_subsys_state *next; 2733 struct cgroup_subsys_state *next;
3086 2734
3087 cgroup_assert_mutex_or_rcu_locked(); 2735 cgroup_assert_mutexes_or_rcu_locked();
3088 2736
3089 /* if first iteration, visit leftmost descendant which may be @root */ 2737 /* if first iteration, visit leftmost descendant which may be @root */
3090 if (!pos) 2738 if (!pos)
@@ -3102,7 +2750,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3102 /* no sibling left, visit parent */ 2750 /* no sibling left, visit parent */
3103 return css_parent(pos); 2751 return css_parent(pos);
3104} 2752}
3105EXPORT_SYMBOL_GPL(css_next_descendant_post);
3106 2753
3107/** 2754/**
3108 * css_advance_task_iter - advance a task iterator to the next css_set 2755 * css_advance_task_iter - advance a task iterator to the next css_set
@@ -3125,9 +2772,14 @@ static void css_advance_task_iter(struct css_task_iter *it)
3125 } 2772 }
3126 link = list_entry(l, struct cgrp_cset_link, cset_link); 2773 link = list_entry(l, struct cgrp_cset_link, cset_link);
3127 cset = link->cset; 2774 cset = link->cset;
3128 } while (list_empty(&cset->tasks)); 2775 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2776
3129 it->cset_link = l; 2777 it->cset_link = l;
3130 it->task = cset->tasks.next; 2778
2779 if (!list_empty(&cset->tasks))
2780 it->task = cset->tasks.next;
2781 else
2782 it->task = cset->mg_tasks.next;
3131} 2783}
3132 2784
3133/** 2785/**
@@ -3146,17 +2798,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
3146 */ 2798 */
3147void css_task_iter_start(struct cgroup_subsys_state *css, 2799void css_task_iter_start(struct cgroup_subsys_state *css,
3148 struct css_task_iter *it) 2800 struct css_task_iter *it)
3149 __acquires(css_set_lock) 2801 __acquires(css_set_rwsem)
3150{ 2802{
3151 /* 2803 /* no one should try to iterate before mounting cgroups */
3152 * The first time anyone tries to iterate across a css, we need to 2804 WARN_ON_ONCE(!use_task_css_set_links);
3153 * enable the list linking each css_set to its tasks, and fix up
3154 * all existing tasks.
3155 */
3156 if (!use_task_css_set_links)
3157 cgroup_enable_task_cg_lists();
3158 2805
3159 read_lock(&css_set_lock); 2806 down_read(&css_set_rwsem);
3160 2807
3161 it->origin_css = css; 2808 it->origin_css = css;
3162 it->cset_link = &css->cgroup->cset_links; 2809 it->cset_link = &css->cgroup->cset_links;
@@ -3176,24 +2823,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3176{ 2823{
3177 struct task_struct *res; 2824 struct task_struct *res;
3178 struct list_head *l = it->task; 2825 struct list_head *l = it->task;
3179 struct cgrp_cset_link *link; 2826 struct cgrp_cset_link *link = list_entry(it->cset_link,
2827 struct cgrp_cset_link, cset_link);
3180 2828
3181 /* If the iterator cg is NULL, we have no tasks */ 2829 /* If the iterator cg is NULL, we have no tasks */
3182 if (!it->cset_link) 2830 if (!it->cset_link)
3183 return NULL; 2831 return NULL;
3184 res = list_entry(l, struct task_struct, cg_list); 2832 res = list_entry(l, struct task_struct, cg_list);
3185 /* Advance iterator to find next entry */ 2833
2834 /*
2835 * Advance iterator to find next entry. cset->tasks is consumed
2836 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
2837 * next cset.
2838 */
3186 l = l->next; 2839 l = l->next;
3187 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 2840
3188 if (l == &link->cset->tasks) { 2841 if (l == &link->cset->tasks)
3189 /* 2842 l = link->cset->mg_tasks.next;
3190 * We reached the end of this task list - move on to the 2843
3191 * next cgrp_cset_link. 2844 if (l == &link->cset->mg_tasks)
3192 */
3193 css_advance_task_iter(it); 2845 css_advance_task_iter(it);
3194 } else { 2846 else
3195 it->task = l; 2847 it->task = l;
3196 } 2848
3197 return res; 2849 return res;
3198} 2850}
3199 2851
@@ -3204,191 +2856,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3204 * Finish task iteration started by css_task_iter_start(). 2856 * Finish task iteration started by css_task_iter_start().
3205 */ 2857 */
3206void css_task_iter_end(struct css_task_iter *it) 2858void css_task_iter_end(struct css_task_iter *it)
3207 __releases(css_set_lock) 2859 __releases(css_set_rwsem)
3208{
3209 read_unlock(&css_set_lock);
3210}
3211
3212static inline int started_after_time(struct task_struct *t1,
3213 struct timespec *time,
3214 struct task_struct *t2)
3215{
3216 int start_diff = timespec_compare(&t1->start_time, time);
3217 if (start_diff > 0) {
3218 return 1;
3219 } else if (start_diff < 0) {
3220 return 0;
3221 } else {
3222 /*
3223 * Arbitrarily, if two processes started at the same
3224 * time, we'll say that the lower pointer value
3225 * started first. Note that t2 may have exited by now
3226 * so this may not be a valid pointer any longer, but
3227 * that's fine - it still serves to distinguish
3228 * between two tasks started (effectively) simultaneously.
3229 */
3230 return t1 > t2;
3231 }
3232}
3233
3234/*
3235 * This function is a callback from heap_insert() and is used to order
3236 * the heap.
3237 * In this case we order the heap in descending task start time.
3238 */
3239static inline int started_after(void *p1, void *p2)
3240{ 2860{
3241 struct task_struct *t1 = p1; 2861 up_read(&css_set_rwsem);
3242 struct task_struct *t2 = p2;
3243 return started_after_time(t1, &t2->start_time, t2);
3244} 2862}
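For callers the start/next/end protocol is unchanged, but it now read-holds css_set_rwsem across the whole walk, blocking migration writers in the meantime, so the loop body should stay short (sketch):

	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it))) {
		/* examine @task; css_set_rwsem is read-held here */
	}
	css_task_iter_end(&it);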
3245 2863
3246/** 2864/**
3247 * css_scan_tasks - iterate through all the tasks in a css 2865 * cgroup_transfer_tasks - move tasks from one cgroup to another
3248 * @css: the css to iterate tasks of 2866 * @to: cgroup to which the tasks will be moved
3249 * @test: optional test callback 2867 * @from: cgroup in which the tasks currently reside
3250 * @process: process callback
3251 * @data: data passed to @test and @process
3252 * @heap: optional pre-allocated heap used for task iteration
3253 *
3254 * Iterate through all the tasks in @css, calling @test for each, and if it
3255 * returns %true, call @process for it also.
3256 *
3257 * @test may be NULL, meaning always true (select all tasks), which
3258 * effectively duplicates css_task_iter_{start,next,end}() but does not
3259 * lock css_set_lock for the call to @process.
3260 *
3261 * It is guaranteed that @process will act on every task that is a member
3262 * of @css for the duration of this call. This function may or may not
3263 * call @process for tasks that exit or move to a different css during the
3264 * call, or are forked or move into the css during the call.
3265 *
3266 * Note that @test may be called with locks held, and may in some
3267 * situations be called multiple times for the same task, so it should be
3268 * cheap.
3269 * 2868 *
3270 * If @heap is non-NULL, a heap has been pre-allocated and will be used for 2869 * Locking rules between cgroup_post_fork() and the migration path
3271 * heap operations (and its "gt" member will be overwritten), else a 2870 * guarantee that, if a task forks while being migrated, its new child
3272 * temporary heap will be used (allocation of which may cause this function 2871 * ends up either visible in the source cgroup after the
3273 * to fail). 2872 * parent's migration is complete or put into the target cgroup. No task
2873 * can slip out of migration through forking.
3274 */ 2874 */
3275int css_scan_tasks(struct cgroup_subsys_state *css, 2875int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3276 bool (*test)(struct task_struct *, void *),
3277 void (*process)(struct task_struct *, void *),
3278 void *data, struct ptr_heap *heap)
3279{ 2876{
3280 int retval, i; 2877 LIST_HEAD(preloaded_csets);
2878 struct cgrp_cset_link *link;
3281 struct css_task_iter it; 2879 struct css_task_iter it;
3282 struct task_struct *p, *dropped; 2880 struct task_struct *task;
3283 /* Never dereference latest_task, since it's not refcounted */ 2881 int ret;
3284 struct task_struct *latest_task = NULL;
3285 struct ptr_heap tmp_heap;
3286 struct timespec latest_time = { 0, 0 };
3287
3288 if (heap) {
3289 /* The caller supplied our heap and pre-allocated its memory */
3290 heap->gt = &started_after;
3291 } else {
3292 /* We need to allocate our own heap memory */
3293 heap = &tmp_heap;
3294 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3295 if (retval)
3296 /* cannot allocate the heap */
3297 return retval;
3298 }
3299 2882
3300 again: 2883 mutex_lock(&cgroup_mutex);
3301 /*
3302 * Scan tasks in the css, using the @test callback to determine
3303 * which are of interest, and invoking @process callback on the
3304 * ones which need an update. Since we don't want to hold any
3305 * locks during the task updates, gather tasks to be processed in a
3306 * heap structure. The heap is sorted by descending task start
3307 * time. If the statically-sized heap fills up, we overflow tasks
3308 * that started later, and in future iterations only consider tasks
3309 * that started after the latest task in the previous pass. This
3310 * guarantees forward progress and that we don't miss any tasks.
3311 */
3312 heap->size = 0;
3313 css_task_iter_start(css, &it);
3314 while ((p = css_task_iter_next(&it))) {
3315 /*
3316 * Only affect tasks that qualify per the caller's callback,
3317 * if he provided one
3318 */
3319 if (test && !test(p, data))
3320 continue;
3321 /*
3322 * Only process tasks that started after the last task
3323 * we processed
3324 */
3325 if (!started_after_time(p, &latest_time, latest_task))
3326 continue;
3327 dropped = heap_insert(heap, p);
3328 if (dropped == NULL) {
3329 /*
3330 * The new task was inserted; the heap wasn't
3331 * previously full
3332 */
3333 get_task_struct(p);
3334 } else if (dropped != p) {
3335 /*
3336 * The new task was inserted, and pushed out a
3337 * different task
3338 */
3339 get_task_struct(p);
3340 put_task_struct(dropped);
3341 }
3342 /*
3343 * Else the new task was newer than anything already in
3344 * the heap and wasn't inserted
3345 */
3346 }
3347 css_task_iter_end(&it);
3348 2884
3349 if (heap->size) { 2885 /* all tasks in @from are being moved, all csets are source */
3350 for (i = 0; i < heap->size; i++) { 2886 down_read(&css_set_rwsem);
3351 struct task_struct *q = heap->ptrs[i]; 2887 list_for_each_entry(link, &from->cset_links, cset_link)
3352 if (i == 0) { 2888 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3353 latest_time = q->start_time; 2889 up_read(&css_set_rwsem);
3354 latest_task = q;
3355 }
3356 /* Process the task per the caller's callback */
3357 process(q, data);
3358 put_task_struct(q);
3359 }
3360 /*
3361 * If we had to process any tasks at all, scan again
3362 * in case some of them were in the middle of forking
3363 * children that didn't get processed.
3364 * Not the most efficient way to do it, but it avoids
3365 * having to take callback_mutex in the fork path
3366 */
3367 goto again;
3368 }
3369 if (heap == &tmp_heap)
3370 heap_free(&tmp_heap);
3371 return 0;
3372}
3373 2890
3374static void cgroup_transfer_one_task(struct task_struct *task, void *data) 2891 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3375{ 2892 if (ret)
3376 struct cgroup *new_cgroup = data; 2893 goto out_err;
3377 2894
3378 mutex_lock(&cgroup_mutex); 2895 /*
3379 cgroup_attach_task(new_cgroup, task, false); 2896 * Migrate tasks one-by-one until @from is empty. This fails iff
2897 * ->can_attach() fails.
2898 */
2899 do {
2900 css_task_iter_start(&from->dummy_css, &it);
2901 task = css_task_iter_next(&it);
2902 if (task)
2903 get_task_struct(task);
2904 css_task_iter_end(&it);
2905
2906 if (task) {
2907 ret = cgroup_migrate(to, task, false);
2908 put_task_struct(task);
2909 }
2910 } while (task && !ret);
2911out_err:
2912 cgroup_migrate_finish(&preloaded_csets);
3380 mutex_unlock(&cgroup_mutex); 2913 mutex_unlock(&cgroup_mutex);
3381} 2914 return ret;
3382
3383/**
3384 * cgroup_transfer_tasks - move tasks from one cgroup to another
3385 * @to: cgroup to which the tasks will be moved
3386 * @from: cgroup in which the tasks currently reside
3387 */
3388int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3389{
3390 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3391 to, NULL);
3392} 2915}
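An illustrative caller, modelled on the cpuset hotplug path where the tasks of an emptied cpuset are handed to an ancestor (the variable names are assumptions):

	/* move every task out of @empty_cgrp into @parent_cgrp */
	ret = cgroup_transfer_tasks(parent_cgrp, empty_cgrp);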
3393 2916
3394/* 2917/*
@@ -3687,21 +3210,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3687 */ 3210 */
3688int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3211int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3689{ 3212{
3690 int ret = -EINVAL; 3213 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3691 struct cgroup *cgrp; 3214 struct cgroup *cgrp;
3692 struct css_task_iter it; 3215 struct css_task_iter it;
3693 struct task_struct *tsk; 3216 struct task_struct *tsk;
3694 3217
3218 /* it should be a kernfs_node belonging to cgroupfs and be a directory */
3219 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3220 kernfs_type(kn) != KERNFS_DIR)
3221 return -EINVAL;
3222
3223 mutex_lock(&cgroup_mutex);
3224
3695 /* 3225 /*
3696 * Validate dentry by checking the superblock operations, 3226 * We aren't being called from kernfs and there's no guarantee on
3697 * and make sure it's a directory. 3227 * @kn->priv's validity. For this and css_tryget_from_dir(),
3228 * @kn->priv is RCU safe. Let's do the RCU dancing.
3698 */ 3229 */
3699 if (dentry->d_sb->s_op != &cgroup_ops || 3230 rcu_read_lock();
3700 !S_ISDIR(dentry->d_inode->i_mode)) 3231 cgrp = rcu_dereference(kn->priv);
3701 goto err; 3232 if (!cgrp || cgroup_is_dead(cgrp)) {
3702 3233 rcu_read_unlock();
3703 ret = 0; 3234 mutex_unlock(&cgroup_mutex);
3704 cgrp = dentry->d_fsdata; 3235 return -ENOENT;
3236 }
3237 rcu_read_unlock();
3705 3238
3706 css_task_iter_start(&cgrp->dummy_css, &it); 3239 css_task_iter_start(&cgrp->dummy_css, &it);
3707 while ((tsk = css_task_iter_next(&it))) { 3240 while ((tsk = css_task_iter_next(&it))) {
@@ -3726,8 +3259,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3726 } 3259 }
3727 css_task_iter_end(&it); 3260 css_task_iter_end(&it);
3728 3261
3729err: 3262 mutex_unlock(&cgroup_mutex);
3730 return ret; 3263 return 0;
3731} 3264}
3732 3265
3733 3266
@@ -3745,7 +3278,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3745 * after a seek to the start). Use a binary-search to find the 3278 * after a seek to the start). Use a binary-search to find the
3746 * next pid to display, if any 3279 * next pid to display, if any
3747 */ 3280 */
3748 struct cgroup_open_file *of = s->private; 3281 struct kernfs_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup; 3282 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l; 3283 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private; 3284 enum cgroup_filetype type = seq_cft(s)->private;
@@ -3800,7 +3333,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3800 3333
3801static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3334static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3802{ 3335{
3803 struct cgroup_open_file *of = s->private; 3336 struct kernfs_open_file *of = s->private;
3804 struct cgroup_pidlist *l = of->priv; 3337 struct cgroup_pidlist *l = of->priv;
3805 3338
3806 if (l) 3339 if (l)
@@ -3811,7 +3344,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3811 3344
3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3345static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3813{ 3346{
3814 struct cgroup_open_file *of = s->private; 3347 struct kernfs_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv; 3348 struct cgroup_pidlist *l = of->priv;
3816 pid_t *p = v; 3349 pid_t *p = v;
3817 pid_t *end = l->list + l->length; 3350 pid_t *end = l->list + l->length;
@@ -3861,23 +3394,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 return 0; 3394 return 0;
3862} 3395}
3863 3396
3864/*
3865 * When dput() is called asynchronously, if umount has been done and
3866 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3867 * there's a small window that vfs will see the root dentry with non-zero
3868 * refcnt and trigger BUG().
3869 *
3870 * That's why we hold a reference before dput() and drop it right after.
3871 */
3872static void cgroup_dput(struct cgroup *cgrp)
3873{
3874 struct super_block *sb = cgrp->root->sb;
3875
3876 atomic_inc(&sb->s_active);
3877 dput(cgrp->dentry);
3878 deactivate_super(sb);
3879}
3880
3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3397static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3882 struct cftype *cft) 3398 struct cftype *cft)
3883{ 3399{
@@ -3944,7 +3460,7 @@ static struct cftype cgroup_base_files[] = {
3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3460 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3945 .seq_show = cgroup_release_agent_show, 3461 .seq_show = cgroup_release_agent_show,
3946 .write_string = cgroup_release_agent_write, 3462 .write_string = cgroup_release_agent_write,
3947 .max_write_len = PATH_MAX, 3463 .max_write_len = PATH_MAX - 1,
3948 }, 3464 },
3949 { } /* terminate */ 3465 { } /* terminate */
3950}; 3466};
@@ -3963,13 +3479,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3963 3479
3964 /* process cftsets of each subsystem */ 3480 /* process cftsets of each subsystem */
3965 for_each_subsys(ss, i) { 3481 for_each_subsys(ss, i) {
3966 struct cftype_set *set; 3482 struct cftype *cfts;
3967 3483
3968 if (!test_bit(i, &subsys_mask)) 3484 if (!test_bit(i, &subsys_mask))
3969 continue; 3485 continue;
3970 3486
3971 list_for_each_entry(set, &ss->cftsets, node) { 3487 list_for_each_entry(cfts, &ss->cfts, node) {
3972 ret = cgroup_addrm_files(cgrp, set->cfts, true); 3488 ret = cgroup_addrm_files(cgrp, cfts, true);
3973 if (ret < 0) 3489 if (ret < 0)
3974 goto err; 3490 goto err;
3975 } 3491 }
@@ -4012,7 +3528,7 @@ static void css_free_work_fn(struct work_struct *work)
4012 css_put(css->parent); 3528 css_put(css->parent);
4013 3529
4014 css->ss->css_free(css); 3530 css->ss->css_free(css);
4015 cgroup_dput(cgrp); 3531 cgroup_put(cgrp);
4016} 3532}
4017 3533
4018static void css_free_rcu_fn(struct rcu_head *rcu_head) 3534static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,10 +3536,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4020 struct cgroup_subsys_state *css = 3536 struct cgroup_subsys_state *css =
4021 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3537 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4022 3538
4023 /*
4024 * css holds an extra ref to @cgrp->dentry which is put on the last
4025 * css_put(). dput() requires process context which we don't have.
4026 */
4027 INIT_WORK(&css->destroy_work, css_free_work_fn); 3539 INIT_WORK(&css->destroy_work, css_free_work_fn);
4028 queue_work(cgroup_destroy_wq, &css->destroy_work); 3540 queue_work(cgroup_destroy_wq, &css->destroy_work);
4029} 3541}
@@ -4033,7 +3545,7 @@ static void css_release(struct percpu_ref *ref)
4033 struct cgroup_subsys_state *css = 3545 struct cgroup_subsys_state *css =
4034 container_of(ref, struct cgroup_subsys_state, refcnt); 3546 container_of(ref, struct cgroup_subsys_state, refcnt);
4035 3547
4036 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); 3548 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
4037 call_rcu(&css->rcu_head, css_free_rcu_fn); 3549 call_rcu(&css->rcu_head, css_free_rcu_fn);
4038} 3550}
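Taken together, the three functions above give the css teardown pipeline after this change (a summary, not new code):

	/*
	 * percpu ref of a css hits zero
	 *   -> css_release()        clears cgrp->subsys[ss->id], fires RCU cb
	 *   -> css_free_rcu_fn()    softirq context, punts to cgroup_destroy_wq
	 *   -> css_free_work_fn()   process context: css_put(parent),
	 *                           ss->css_free(css), cgroup_put(cgrp)
	 */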
4039 3551
@@ -4058,6 +3570,7 @@ static int online_css(struct cgroup_subsys_state *css)
4058 struct cgroup_subsys *ss = css->ss; 3570 struct cgroup_subsys *ss = css->ss;
4059 int ret = 0; 3571 int ret = 0;
4060 3572
3573 lockdep_assert_held(&cgroup_tree_mutex);
4061 lockdep_assert_held(&cgroup_mutex); 3574 lockdep_assert_held(&cgroup_mutex);
4062 3575
4063 if (ss->css_online) 3576 if (ss->css_online)
@@ -4065,7 +3578,7 @@ static int online_css(struct cgroup_subsys_state *css)
4065 if (!ret) { 3578 if (!ret) {
4066 css->flags |= CSS_ONLINE; 3579 css->flags |= CSS_ONLINE;
4067 css->cgroup->nr_css++; 3580 css->cgroup->nr_css++;
4068 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); 3581 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4069 } 3582 }
4070 return ret; 3583 return ret;
4071} 3584}
@@ -4075,6 +3588,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4075{ 3588{
4076 struct cgroup_subsys *ss = css->ss; 3589 struct cgroup_subsys *ss = css->ss;
4077 3590
3591 lockdep_assert_held(&cgroup_tree_mutex);
4078 lockdep_assert_held(&cgroup_mutex); 3592 lockdep_assert_held(&cgroup_mutex);
4079 3593
4080 if (!(css->flags & CSS_ONLINE)) 3594 if (!(css->flags & CSS_ONLINE))
@@ -4085,7 +3599,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4085 3599
4086 css->flags &= ~CSS_ONLINE; 3600 css->flags &= ~CSS_ONLINE;
4087 css->cgroup->nr_css--; 3601 css->cgroup->nr_css--;
4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 3602 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
4089} 3603}
4090 3604
4091/** 3605/**
@@ -4103,7 +3617,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4103 struct cgroup_subsys_state *css; 3617 struct cgroup_subsys_state *css;
4104 int err; 3618 int err;
4105 3619
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex); 3620 lockdep_assert_held(&cgroup_mutex);
4108 3621
4109 css = ss->css_alloc(cgroup_css(parent, ss)); 3622 css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4116,7 +3629,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4116 3629
4117 init_css(css, ss, cgrp); 3630 init_css(css, ss, cgrp);
4118 3631
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); 3632 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4120 if (err) 3633 if (err)
4121 goto err_free_percpu_ref; 3634 goto err_free_percpu_ref;
4122 3635
@@ -4124,9 +3637,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4124 if (err) 3637 if (err)
4125 goto err_clear_dir; 3638 goto err_clear_dir;
4126 3639
4127 dget(cgrp->dentry); 3640 cgroup_get(cgrp);
4128 css_get(css->parent); 3641 css_get(css->parent);
4129 3642
3643 cgrp->subsys_mask |= 1 << ss->id;
3644
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3645 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) { 3646 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3647 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4139,7 +3654,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4139 return 0; 3654 return 0;
4140 3655
4141err_clear_dir: 3656err_clear_dir:
4142 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3657 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4143err_free_percpu_ref: 3658err_free_percpu_ref:
4144 percpu_ref_cancel_init(&css->refcnt); 3659 percpu_ref_cancel_init(&css->refcnt);
4145err_free_css: 3660err_free_css:
@@ -4147,35 +3662,34 @@ err_free_css:
4147 return err; 3662 return err;
4148} 3663}
4149 3664
4150/* 3665/**
4151 * cgroup_create - create a cgroup 3666 * cgroup_create - create a cgroup
4152 * @parent: cgroup that will be parent of the new cgroup 3667 * @parent: cgroup that will be parent of the new cgroup
4153 * @dentry: dentry of the new cgroup 3668 * @name: name of the new cgroup
4154 * @mode: mode to set on new inode 3669 * @mode: mode to set on new cgroup
4155 *
4156 * Must be called with the mutex on the parent inode held
4157 */ 3670 */
4158static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3671static long cgroup_create(struct cgroup *parent, const char *name,
4159 umode_t mode) 3672 umode_t mode)
4160{ 3673{
4161 struct cgroup *cgrp; 3674 struct cgroup *cgrp;
4162 struct cgroup_name *name; 3675 struct cgroup_root *root = parent->root;
4163 struct cgroupfs_root *root = parent->root;
4164 int ssid, err; 3676 int ssid, err;
4165 struct cgroup_subsys *ss; 3677 struct cgroup_subsys *ss;
4166 struct super_block *sb = root->sb; 3678 struct kernfs_node *kn;
3679
3680 /*
3681 * XXX: The default hierarchy isn't fully implemented yet. Block
3682 * !root cgroup creation on it for now.
3683 */
3684 if (root == &cgrp_dfl_root)
3685 return -EINVAL;
4167 3686
4168 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3687 /* allocate the cgroup and its ID, 0 is reserved for the root */
4169 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3688 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4170 if (!cgrp) 3689 if (!cgrp)
4171 return -ENOMEM; 3690 return -ENOMEM;
4172 3691
4173 name = cgroup_alloc_name(dentry); 3692 mutex_lock(&cgroup_tree_mutex);
4174 if (!name) {
4175 err = -ENOMEM;
4176 goto err_free_cgrp;
4177 }
4178 rcu_assign_pointer(cgrp->name, name);
4179 3693
4180 /* 3694 /*
4181 * Only live parents can have children. Note that the liveliness 3695 * Only live parents can have children. Note that the liveliness
@@ -4186,7 +3700,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4186 */ 3700 */
4187 if (!cgroup_lock_live_group(parent)) { 3701 if (!cgroup_lock_live_group(parent)) {
4188 err = -ENODEV; 3702 err = -ENODEV;
4189 goto err_free_name; 3703 goto err_unlock_tree;
4190 } 3704 }
4191 3705
4192 /* 3706 /*
@@ -4199,18 +3713,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4199 goto err_unlock; 3713 goto err_unlock;
4200 } 3714 }
4201 3715
4202 /* Grab a reference on the superblock so the hierarchy doesn't
4203 * get deleted on unmount if there are child cgroups. This
4204 * can be done outside cgroup_mutex, since the sb can't
4205 * disappear while someone has an open control file on the
4206 * fs */
4207 atomic_inc(&sb->s_active);
4208
4209 init_cgroup_housekeeping(cgrp); 3716 init_cgroup_housekeeping(cgrp);
4210 3717
4211 dentry->d_fsdata = cgrp;
4212 cgrp->dentry = dentry;
4213
4214 cgrp->parent = parent; 3718 cgrp->parent = parent;
4215 cgrp->dummy_css.parent = &parent->dummy_css; 3719 cgrp->dummy_css.parent = &parent->dummy_css;
4216 cgrp->root = parent->root; 3720 cgrp->root = parent->root;
@@ -4221,24 +3725,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3725 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4222 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3726 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4223 3727
3728 /* create the directory */
3729 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3730 if (IS_ERR(kn)) {
3731 err = PTR_ERR(kn);
3732 goto err_free_id;
3733 }
3734 cgrp->kn = kn;
3735
4224 /* 3736 /*
4225 * Create directory. cgroup_create_file() returns with the new 3737 * This extra ref will be put in cgroup_free_fn() and guarantees
4226 * directory locked on success so that it can be populated without 3738 * that @cgrp->kn is always accessible.
4227 * dropping cgroup_mutex.
4228 */ 3739 */
4229 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 3740 kernfs_get(kn);
4230 if (err < 0)
4231 goto err_free_id;
4232 lockdep_assert_held(&dentry->d_inode->i_mutex);
4233 3741
4234 cgrp->serial_nr = cgroup_serial_nr_next++; 3742 cgrp->serial_nr = cgroup_serial_nr_next++;
4235 3743
4236 /* allocation complete, commit to creation */ 3744 /* allocation complete, commit to creation */
4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3745 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4238 root->number_of_cgroups++; 3746 atomic_inc(&root->nr_cgrps);
4239 3747 cgroup_get(parent);
4240 /* hold a ref to the parent's dentry */
4241 dget(parent->dentry);
4242 3748
4243 /* 3749 /*
4244 * @cgrp is now fully operational. If something fails after this 3750 * @cgrp is now fully operational. If something fails after this
@@ -4252,43 +3758,56 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4252 3758
4253 /* let's create and online css's */ 3759 /* let's create and online css's */
4254 for_each_subsys(ss, ssid) { 3760 for_each_subsys(ss, ssid) {
4255 if (root->subsys_mask & (1 << ssid)) { 3761 if (root->cgrp.subsys_mask & (1 << ssid)) {
4256 err = create_css(cgrp, ss); 3762 err = create_css(cgrp, ss);
4257 if (err) 3763 if (err)
4258 goto err_destroy; 3764 goto err_destroy;
4259 } 3765 }
4260 } 3766 }
4261 3767
3768 kernfs_activate(kn);
3769
4262 mutex_unlock(&cgroup_mutex); 3770 mutex_unlock(&cgroup_mutex);
4263 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3771 mutex_unlock(&cgroup_tree_mutex);
4264 3772
4265 return 0; 3773 return 0;
4266 3774
4267err_free_id: 3775err_free_id:
4268 idr_remove(&root->cgroup_idr, cgrp->id); 3776 idr_remove(&root->cgroup_idr, cgrp->id);
4269 /* Release the reference count that we took on the superblock */
4270 deactivate_super(sb);
4271err_unlock: 3777err_unlock:
4272 mutex_unlock(&cgroup_mutex); 3778 mutex_unlock(&cgroup_mutex);
4273err_free_name: 3779err_unlock_tree:
4274 kfree(rcu_dereference_raw(cgrp->name)); 3780 mutex_unlock(&cgroup_tree_mutex);
4275err_free_cgrp:
4276 kfree(cgrp); 3781 kfree(cgrp);
4277 return err; 3782 return err;
4278 3783
4279err_destroy: 3784err_destroy:
4280 cgroup_destroy_locked(cgrp); 3785 cgroup_destroy_locked(cgrp);
4281 mutex_unlock(&cgroup_mutex); 3786 mutex_unlock(&cgroup_mutex);
4282 mutex_unlock(&dentry->d_inode->i_mutex); 3787 mutex_unlock(&cgroup_tree_mutex);
4283 return err; 3788 return err;
4284} 3789}
4285 3790
4286static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 3791static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3792 umode_t mode)
4287{ 3793{
4288 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3794 struct cgroup *parent = parent_kn->priv;
3795 int ret;
3796
3797 /*
3798 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3799 * kernfs active_ref and cgroup_create() already synchronizes
3800 * properly against removal through cgroup_lock_live_group().
3801 * Break it before calling cgroup_create().
3802 */
3803 cgroup_get(parent);
3804 kernfs_break_active_protection(parent_kn);
3805
3806 ret = cgroup_create(parent, name, mode);
4289 3807
4290 /* the vfs holds inode->i_mutex already */ 3808 kernfs_unbreak_active_protection(parent_kn);
4291 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3809 cgroup_put(parent);
3810 return ret;
4292} 3811}
4293 3812
4294/* 3813/*
@@ -4301,6 +3820,7 @@ static void css_killed_work_fn(struct work_struct *work)
4301 container_of(work, struct cgroup_subsys_state, destroy_work); 3820 container_of(work, struct cgroup_subsys_state, destroy_work);
4302 struct cgroup *cgrp = css->cgroup; 3821 struct cgroup *cgrp = css->cgroup;
4303 3822
3823 mutex_lock(&cgroup_tree_mutex);
4304 mutex_lock(&cgroup_mutex); 3824 mutex_lock(&cgroup_mutex);
4305 3825
4306 /* 3826 /*
@@ -4318,6 +3838,7 @@ static void css_killed_work_fn(struct work_struct *work)
4318 cgroup_destroy_css_killed(cgrp); 3838 cgroup_destroy_css_killed(cgrp);
4319 3839
4320 mutex_unlock(&cgroup_mutex); 3840 mutex_unlock(&cgroup_mutex);
3841 mutex_unlock(&cgroup_tree_mutex);
4321 3842
4322 /* 3843 /*
4323 * Put the css refs from kill_css(). Each css holds an extra 3844 * Put the css refs from kill_css(). Each css holds an extra
@@ -4339,18 +3860,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4339 queue_work(cgroup_destroy_wq, &css->destroy_work); 3860 queue_work(cgroup_destroy_wq, &css->destroy_work);
4340} 3861}
4341 3862
4342/** 3863static void __kill_css(struct cgroup_subsys_state *css)
4343 * kill_css - destroy a css
4344 * @css: css to destroy
4345 *
4346 * This function initiates destruction of @css by removing cgroup interface
4347 * files and putting its base reference. ->css_offline() will be invoked
4348 * asynchronously once css_tryget() is guaranteed to fail and when the
4349 * reference count reaches zero, @css will be released.
4350 */
4351static void kill_css(struct cgroup_subsys_state *css)
4352{ 3864{
4353 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3865 lockdep_assert_held(&cgroup_tree_mutex);
3866
3867 /*
3868 * This must happen before css is disassociated with its cgroup.
3869 * See seq_css() for details.
3870 */
3871 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4354 3872
4355 /* 3873 /*
4356 * Killing would put the base ref, but we need to keep it alive 3874 * Killing would put the base ref, but we need to keep it alive
@@ -4372,6 +3890,28 @@ static void kill_css(struct cgroup_subsys_state *css)
4372} 3890}
4373 3891
4374/** 3892/**
3893 * kill_css - destroy a css
3894 * @css: css to destroy
3895 *
3896 * This function initiates destruction of @css by removing cgroup interface
3897 * files and putting its base reference. ->css_offline() will be invoked
3898 * asynchronously once css_tryget() is guaranteed to fail and when the
3899 * reference count reaches zero, @css will be released.
3900 */
3901static void kill_css(struct cgroup_subsys_state *css)
3902{
3903 struct cgroup *cgrp = css->cgroup;
3904
3905 lockdep_assert_held(&cgroup_tree_mutex);
3906
3907 /* if already killed, noop */
3908 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3909 cgrp->subsys_mask &= ~(1 << css->ss->id);
3910 __kill_css(css);
3911 }
3912}
3913
3914/**
4375 * cgroup_destroy_locked - the first stage of cgroup destruction 3915 * cgroup_destroy_locked - the first stage of cgroup destruction
4376 * @cgrp: cgroup to be destroyed 3916 * @cgrp: cgroup to be destroyed
4377 * 3917 *
@@ -4398,22 +3938,21 @@ static void kill_css(struct cgroup_subsys_state *css)
4398static int cgroup_destroy_locked(struct cgroup *cgrp) 3938static int cgroup_destroy_locked(struct cgroup *cgrp)
4399 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3939 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4400{ 3940{
4401 struct dentry *d = cgrp->dentry;
4402 struct cgroup_subsys_state *css;
4403 struct cgroup *child; 3941 struct cgroup *child;
3942 struct cgroup_subsys_state *css;
4404 bool empty; 3943 bool empty;
4405 int ssid; 3944 int ssid;
4406 3945
4407 lockdep_assert_held(&d->d_inode->i_mutex); 3946 lockdep_assert_held(&cgroup_tree_mutex);
4408 lockdep_assert_held(&cgroup_mutex); 3947 lockdep_assert_held(&cgroup_mutex);
4409 3948
4410 /* 3949 /*
4411 * css_set_lock synchronizes access to ->cset_links and prevents 3950 * css_set_rwsem synchronizes access to ->cset_links and prevents
4412 * @cgrp from being removed while __put_css_set() is in progress. 3951 * @cgrp from being removed while put_css_set() is in progress.
4413 */ 3952 */
4414 read_lock(&css_set_lock); 3953 down_read(&css_set_rwsem);
4415 empty = list_empty(&cgrp->cset_links); 3954 empty = list_empty(&cgrp->cset_links);
4416 read_unlock(&css_set_lock); 3955 up_read(&css_set_rwsem);
4417 if (!empty) 3956 if (!empty)
4418 return -EBUSY; 3957 return -EBUSY;
4419 3958
@@ -4434,14 +3973,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4434 return -EBUSY; 3973 return -EBUSY;
4435 3974
4436 /* 3975 /*
4437 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4438 * will be invoked to perform the rest of destruction once the
4439 * percpu refs of all css's are confirmed to be killed.
4440 */
4441 for_each_css(css, ssid, cgrp)
4442 kill_css(css);
4443
4444 /*
4445 * Mark @cgrp dead. This prevents further task migration and child 3976 * Mark @cgrp dead. This prevents further task migration and child
4446 * creation by disabling cgroup_lock_live_group(). Note that 3977 * creation by disabling cgroup_lock_live_group(). Note that
4447 * CGRP_DEAD assertion is depended upon by css_next_child() to 3978 * CGRP_DEAD assertion is depended upon by css_next_child() to
@@ -4450,6 +3981,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4450 */ 3981 */
4451 set_bit(CGRP_DEAD, &cgrp->flags); 3982 set_bit(CGRP_DEAD, &cgrp->flags);
4452 3983
3984 /*
3985 * Initiate massacre of all css's. cgroup_destroy_css_killed()
3986 * will be invoked to perform the rest of destruction once the
3987 * percpu refs of all css's are confirmed to be killed. This
3988 * involves removing the subsystem's files, so drop cgroup_mutex.
3989 */
3990 mutex_unlock(&cgroup_mutex);
3991 for_each_css(css, ssid, cgrp)
3992 kill_css(css);
3993 mutex_lock(&cgroup_mutex);
3994
4453 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 3995 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4454 raw_spin_lock(&release_list_lock); 3996 raw_spin_lock(&release_list_lock);
4455 if (!list_empty(&cgrp->release_list)) 3997 if (!list_empty(&cgrp->release_list))
@@ -4465,14 +4007,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4465 if (!cgrp->nr_css) 4007 if (!cgrp->nr_css)
4466 cgroup_destroy_css_killed(cgrp); 4008 cgroup_destroy_css_killed(cgrp);
4467 4009
4010 /* remove @cgrp directory along with the base files */
4011 mutex_unlock(&cgroup_mutex);
4012
4468 /* 4013 /*
4469 * Clear the base files and remove @cgrp directory. The removal 4014 * There are two control paths which try to determine cgroup from
4470 * puts the base ref but we aren't quite done with @cgrp yet, so 4015 * dentry without going through kernfs - cgroupstats_build() and
4471 * hold onto it. 4016 * css_tryget_from_dir(). Those are supported by RCU protecting
4017 * clearing of cgrp->kn->priv backpointer, which should happen
4018 * after all files under it have been removed.
4472 */ 4019 */
4473 cgroup_addrm_files(cgrp, cgroup_base_files, false); 4020 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4474 dget(d); 4021 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4475 cgroup_d_remove_dir(d); 4022
4023 mutex_lock(&cgroup_mutex);
4476 4024
4477 return 0; 4025 return 0;
4478}; 4026};
@@ -4489,72 +4037,82 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4489static void cgroup_destroy_css_killed(struct cgroup *cgrp) 4037static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4490{ 4038{
4491 struct cgroup *parent = cgrp->parent; 4039 struct cgroup *parent = cgrp->parent;
4492 struct dentry *d = cgrp->dentry;
4493 4040
4041 lockdep_assert_held(&cgroup_tree_mutex);
4494 lockdep_assert_held(&cgroup_mutex); 4042 lockdep_assert_held(&cgroup_mutex);
4495 4043
4496 /* delete this cgroup from parent->children */ 4044 /* delete this cgroup from parent->children */
4497 list_del_rcu(&cgrp->sibling); 4045 list_del_rcu(&cgrp->sibling);
4498 4046
4499 dput(d); 4047 cgroup_put(cgrp);
4500 4048
4501 set_bit(CGRP_RELEASABLE, &parent->flags); 4049 set_bit(CGRP_RELEASABLE, &parent->flags);
4502 check_for_release(parent); 4050 check_for_release(parent);
4503} 4051}
4504 4052
4505static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4053static int cgroup_rmdir(struct kernfs_node *kn)
4506{ 4054{
4507 int ret; 4055 struct cgroup *cgrp = kn->priv;
4508 4056 int ret = 0;
4509 mutex_lock(&cgroup_mutex);
4510 ret = cgroup_destroy_locked(dentry->d_fsdata);
4511 mutex_unlock(&cgroup_mutex);
4512 4057
4513 return ret; 4058 /*
4514} 4059 * This is self-destruction but @kn can't be removed while this
4060 * callback is in progress. Let's break active protection. Once
4061 * the protection is broken, @cgrp can be destroyed at any point.
4062 * Pin it so that it stays accessible.
4063 */
4064 cgroup_get(cgrp);
4065 kernfs_break_active_protection(kn);
4515 4066
4516static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4067 mutex_lock(&cgroup_tree_mutex);
4517{ 4068 mutex_lock(&cgroup_mutex);
4518 INIT_LIST_HEAD(&ss->cftsets);
4519 4069
4520 /* 4070 /*
4521 * base_cftset is embedded in subsys itself, no need to worry about 4071 * @cgrp might already have been destroyed while we're trying to
4522 * deregistration. 4072 * grab the mutexes.
4523 */ 4073 */
4524 if (ss->base_cftypes) { 4074 if (!cgroup_is_dead(cgrp))
4525 struct cftype *cft; 4075 ret = cgroup_destroy_locked(cgrp);
4526 4076
4527 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) 4077 mutex_unlock(&cgroup_mutex);
4528 cft->ss = ss; 4078 mutex_unlock(&cgroup_tree_mutex);
4529 4079
4530 ss->base_cftset.cfts = ss->base_cftypes; 4080 kernfs_unbreak_active_protection(kn);
4531 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4081 cgroup_put(cgrp);
4532 } 4082 return ret;
4533} 4083}
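For orientation, the rmdir-driven destruction sequence assembled above, summarized under the same percpu-ref kill/confirm scheme described in the kill_css() comment:

	/*
	 * cgroup_rmdir()
	 *   -> cgroup_destroy_locked()     emptiness checks, set CGRP_DEAD,
	 *                                  kill_css() each css, kernfs_remove(),
	 *                                  clear kn->priv for RCU-side lookups
	 *   -> css_killed_ref_fn() / css_killed_work_fn()
	 *                                  per css, once its ref is confirmed dead
	 *   -> cgroup_destroy_css_killed() after the last css goes away:
	 *                                  unlink from parent, cgroup_put(),
	 *                                  check_for_release(parent)
	 */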
4534 4084
4085static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4086 .remount_fs = cgroup_remount,
4087 .show_options = cgroup_show_options,
4088 .mkdir = cgroup_mkdir,
4089 .rmdir = cgroup_rmdir,
4090 .rename = cgroup_rename,
4091};
4092
4535static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4093static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4536{ 4094{
4537 struct cgroup_subsys_state *css; 4095 struct cgroup_subsys_state *css;
4538 4096
4539 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4097 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4540 4098
4099 mutex_lock(&cgroup_tree_mutex);
4541 mutex_lock(&cgroup_mutex); 4100 mutex_lock(&cgroup_mutex);
4542 4101
4543 /* init base cftset */ 4102 INIT_LIST_HEAD(&ss->cfts);
4544 cgroup_init_cftsets(ss);
4545 4103
4546 /* Create the top cgroup state for this subsystem */ 4104 /* Create the root cgroup state for this subsystem */
4547 ss->root = &cgroup_dummy_root; 4105 ss->root = &cgrp_dfl_root;
4548 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4106 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4549 /* We don't handle early failures gracefully */ 4107 /* We don't handle early failures gracefully */
4550 BUG_ON(IS_ERR(css)); 4108 BUG_ON(IS_ERR(css));
4551 init_css(css, ss, cgroup_dummy_top); 4109 init_css(css, ss, &cgrp_dfl_root.cgrp);
4552 4110
4553 /* Update the init_css_set to contain a subsys 4111 /* Update the init_css_set to contain a subsys
4554 * pointer to this state - since the subsystem is 4112 * pointer to this state - since the subsystem is
4555 * newly registered, all tasks and hence the 4113 * newly registered, all tasks and hence the
4556 * init_css_set is in the subsystem's top cgroup. */ 4114 * init_css_set is in the subsystem's root cgroup. */
4557 init_css_set.subsys[ss->subsys_id] = css; 4115 init_css_set.subsys[ss->id] = css;
4558 4116
4559 need_forkexit_callback |= ss->fork || ss->exit; 4117 need_forkexit_callback |= ss->fork || ss->exit;
4560 4118
@@ -4565,185 +4123,11 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4565 4123
4566 BUG_ON(online_css(css)); 4124 BUG_ON(online_css(css));
4567 4125
4568 mutex_unlock(&cgroup_mutex); 4126 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4569
4570 /* this function shouldn't be used with modular subsystems, since they
4571 * need to register a subsys_id, among other things */
4572 BUG_ON(ss->module);
4573}
4574
4575/**
4576 * cgroup_load_subsys: load and register a modular subsystem at runtime
4577 * @ss: the subsystem to load
4578 *
4579 * This function should be called in a modular subsystem's initcall. If the
4580 * subsystem is built as a module, it will be assigned a new subsys_id and set
4581 * up for use. If the subsystem is built-in anyway, work is delegated to the
4582 * simpler cgroup_init_subsys.
4583 */
4584int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4585{
4586 struct cgroup_subsys_state *css;
4587 int i, ret;
4588 struct hlist_node *tmp;
4589 struct css_set *cset;
4590 unsigned long key;
4591
4592 /* check name and function validity */
4593 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4594 ss->css_alloc == NULL || ss->css_free == NULL)
4595 return -EINVAL;
4596
4597 /*
4598 * we don't support callbacks in modular subsystems. this check is
4599 * before the ss->module check for consistency; a subsystem that could
4600 * be a module should still have no callbacks even if the user isn't
4601 * compiling it as one.
4602 */
4603 if (ss->fork || ss->exit)
4604 return -EINVAL;
4605
4606 /*
4607 * an optionally modular subsystem is built-in: we want to do nothing,
4608 * since cgroup_init_subsys will have already taken care of it.
4609 */
4610 if (ss->module == NULL) {
4611 /* a sanity check */
4612 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4613 return 0;
4614 }
4615
4616 /* init base cftset */
4617 cgroup_init_cftsets(ss);
4618
4619 mutex_lock(&cgroup_mutex);
4620 mutex_lock(&cgroup_root_mutex);
4621 cgroup_subsys[ss->subsys_id] = ss;
4622
4623 /*
4624 * no ss->css_alloc seems to need anything important in the ss
4625 * struct, so this can happen first (i.e. before the dummy root
4626 * attachment).
4627 */
4628 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4629 if (IS_ERR(css)) {
4630 /* failure case - need to deassign the cgroup_subsys[] slot. */
4631 cgroup_subsys[ss->subsys_id] = NULL;
4632 mutex_unlock(&cgroup_root_mutex);
4633 mutex_unlock(&cgroup_mutex);
4634 return PTR_ERR(css);
4635 }
4636
4637 ss->root = &cgroup_dummy_root;
4638
4639 /* our new subsystem will be attached to the dummy hierarchy. */
4640 init_css(css, ss, cgroup_dummy_top);
4641
4642 /*
4643 * Now we need to entangle the css into the existing css_sets. unlike
4644 * in cgroup_init_subsys, there are now multiple css_sets, so each one
4645 * will need a new pointer to it; done by iterating the css_set_table.
4646 * furthermore, modifying the existing css_sets will corrupt the hash
4647 * table state, so each changed css_set will need its hash recomputed.
4648 * this is all done under the css_set_lock.
4649 */
4650 write_lock(&css_set_lock);
4651 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4652 /* skip entries that we already rehashed */
4653 if (cset->subsys[ss->subsys_id])
4654 continue;
4655 /* remove existing entry */
4656 hash_del(&cset->hlist);
4657 /* set new value */
4658 cset->subsys[ss->subsys_id] = css;
4659 /* recompute hash and restore entry */
4660 key = css_set_hash(cset->subsys);
4661 hash_add(css_set_table, &cset->hlist, key);
4662 }
4663 write_unlock(&css_set_lock);
4664
4665 ret = online_css(css);
4666 if (ret) {
4667 ss->css_free(css);
4668 goto err_unload;
4669 }
4670
4671 /* success! */
4672 mutex_unlock(&cgroup_root_mutex);
4673 mutex_unlock(&cgroup_mutex);
4674 return 0;
4675
4676err_unload:
4677 mutex_unlock(&cgroup_root_mutex);
4678 mutex_unlock(&cgroup_mutex);
4679 /* @ss can't be mounted here as try_module_get() would fail */
4680 cgroup_unload_subsys(ss);
4681 return ret;
4682}
4683EXPORT_SYMBOL_GPL(cgroup_load_subsys);
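
The removed module-loading code above also documents a general hashtable discipline: an entry hashed over mutable fields must be unhashed before those fields change and re-added under the new key, all under the table's lock. A self-contained sketch of the pattern with a hypothetical item type (struct item and item_change_key are illustrative only):

	#include <linux/hashtable.h>
	#include <linux/jhash.h>

	struct item {				/* hypothetical */
		struct hlist_node node;
		int key[4];		/* the fields the hash is computed over */
	};

	static DEFINE_HASHTABLE(table, 7);

	static u32 item_hash(struct item *it)
	{
		return jhash(it->key, sizeof(it->key), 0);
	}

	/* caller holds whatever lock protects @table (css_set_lock above) */
	static void item_change_key(struct item *it, int slot, int newval)
	{
		hash_del(&it->node);	/* unhash before mutating the key... */
		it->key[slot] = newval;
		hash_add(table, &it->node, item_hash(it));  /* ...rehash after */
	}
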
4684
4685/**
4686 * cgroup_unload_subsys: unload a modular subsystem
4687 * @ss: the subsystem to unload
4688 *
4689 * This function should be called in a modular subsystem's exitcall. When this
4690 * function is invoked, the refcount on the subsystem's module will be 0, so
4691 * the subsystem will not be attached to any hierarchy.
4692 */
4693void cgroup_unload_subsys(struct cgroup_subsys *ss)
4694{
4695 struct cgrp_cset_link *link;
4696 struct cgroup_subsys_state *css;
4697
4698 BUG_ON(ss->module == NULL);
4699
4700 /*
4701 * we shouldn't be called if the subsystem is in use, and the use of
4702 * try_module_get() in rebind_subsystems() should ensure that it
4703 * doesn't start being used while we're killing it off.
4704 */
4705 BUG_ON(ss->root != &cgroup_dummy_root);
4706
4707 mutex_lock(&cgroup_mutex);
4708 mutex_lock(&cgroup_root_mutex);
4709
4710 css = cgroup_css(cgroup_dummy_top, ss);
4711 if (css)
4712 offline_css(css);
4713 4127
4714 /* deassign the subsys_id */
4715 cgroup_subsys[ss->subsys_id] = NULL;
4716
4717 /*
4718 * disentangle the css from all css_sets attached to the dummy
4719 * top. as in loading, we need to pay our respects to the hashtable
4720 * gods.
4721 */
4722 write_lock(&css_set_lock);
4723 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4724 struct css_set *cset = link->cset;
4725 unsigned long key;
4726
4727 hash_del(&cset->hlist);
4728 cset->subsys[ss->subsys_id] = NULL;
4729 key = css_set_hash(cset->subsys);
4730 hash_add(css_set_table, &cset->hlist, key);
4731 }
4732 write_unlock(&css_set_lock);
4733
4734 /*
4735 * remove subsystem's css from the cgroup_dummy_top and free it -
4736 * need to free before marking as null because ss->css_free needs
4737 * the cgrp->subsys pointer to find their state.
4738 */
4739 if (css)
4740 ss->css_free(css);
4741 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4742
4743 mutex_unlock(&cgroup_root_mutex);
4744 mutex_unlock(&cgroup_mutex); 4128 mutex_unlock(&cgroup_mutex);
4129 mutex_unlock(&cgroup_tree_mutex);
4745} 4130}
4746EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4747 4131
4748/** 4132/**
4749 * cgroup_init_early - cgroup initialization at system boot 4133 * cgroup_init_early - cgroup initialization at system boot
@@ -4753,34 +4137,24 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4753 */ 4137 */
4754int __init cgroup_init_early(void) 4138int __init cgroup_init_early(void)
4755{ 4139{
4140 static struct cgroup_sb_opts __initdata opts =
4141 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4756 struct cgroup_subsys *ss; 4142 struct cgroup_subsys *ss;
4757 int i; 4143 int i;
4758 4144
4759 atomic_set(&init_css_set.refcount, 1); 4145 init_cgroup_root(&cgrp_dfl_root, &opts);
4760 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4761 INIT_LIST_HEAD(&init_css_set.tasks);
4762 INIT_HLIST_NODE(&init_css_set.hlist);
4763 css_set_count = 1;
4764 init_cgroup_root(&cgroup_dummy_root);
4765 cgroup_root_count = 1;
4766 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4146 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4767 4147
4768 init_cgrp_cset_link.cset = &init_css_set; 4148 for_each_subsys(ss, i) {
4769 init_cgrp_cset_link.cgrp = cgroup_dummy_top; 4149 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4770 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); 4150 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4771 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); 4151 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4772 4152 ss->id, ss->name);
4773 /* at bootup time, we don't worry about modular subsystems */ 4153 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4774 for_each_builtin_subsys(ss, i) { 4154 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4775 BUG_ON(!ss->name); 4155
4776 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4156 ss->id = i;
4777 BUG_ON(!ss->css_alloc); 4157 ss->name = cgroup_subsys_name[i];
4778 BUG_ON(!ss->css_free);
4779 if (ss->subsys_id != i) {
4780 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4781 ss->name, ss->subsys_id);
4782 BUG();
4783 }
4784 4158
4785 if (ss->early_init) 4159 if (ss->early_init)
4786 cgroup_init_subsys(ss); 4160 cgroup_init_subsys(ss);
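
The cgroup_subsys_name[] array consulted above is generated from the kernel's single subsystem list via an x-macro over linux/cgroup_subsys.h, so ids and names cannot drift apart. A generic sketch of the technique with a hypothetical colors.h list file:

	/* colors.h - the single list of entries (hypothetical file) */
	COLOR(red)
	COLOR(green)
	COLOR(blue)

	/* consumer: expand the list twice, once for ids, once for names */
	#define COLOR(x) color_ ## x ## _id,
	enum {
	#include "colors.h"
		COLOR_COUNT,
	};
	#undef COLOR

	#define COLOR(x) [color_ ## x ## _id] = #x,
	static const char *color_name[] = {
	#include "colors.h"
	};
	#undef COLOR
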
@@ -4798,53 +4172,46 @@ int __init cgroup_init(void)
4798{ 4172{
4799 struct cgroup_subsys *ss; 4173 struct cgroup_subsys *ss;
4800 unsigned long key; 4174 unsigned long key;
4801 int i, err; 4175 int ssid, err;
4802 4176
4803 err = bdi_init(&cgroup_backing_dev_info); 4177 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4804 if (err)
4805 return err;
4806 4178
4807 for_each_builtin_subsys(ss, i) { 4179 mutex_lock(&cgroup_tree_mutex);
4808 if (!ss->early_init)
4809 cgroup_init_subsys(ss);
4810 }
4811
4812 /* allocate id for the dummy hierarchy */
4813 mutex_lock(&cgroup_mutex); 4180 mutex_lock(&cgroup_mutex);
4814 mutex_lock(&cgroup_root_mutex);
4815 4181
4816 /* Add init_css_set to the hash table */ 4182 /* Add init_css_set to the hash table */
4817 key = css_set_hash(init_css_set.subsys); 4183 key = css_set_hash(init_css_set.subsys);
4818 hash_add(css_set_table, &init_css_set.hlist, key); 4184 hash_add(css_set_table, &init_css_set.hlist, key);
4819 4185
4820 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 4186 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4821 4187
4822 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
4823 0, 1, GFP_KERNEL);
4824 BUG_ON(err < 0);
4825
4826 mutex_unlock(&cgroup_root_mutex);
4827 mutex_unlock(&cgroup_mutex); 4188 mutex_unlock(&cgroup_mutex);
4189 mutex_unlock(&cgroup_tree_mutex);
4828 4190
4829 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4191 for_each_subsys(ss, ssid) {
4830 if (!cgroup_kobj) { 4192 if (!ss->early_init)
4831 err = -ENOMEM; 4193 cgroup_init_subsys(ss);
4832 goto out; 4194
4195 /*
4196 * cftype registration needs kmalloc and can't be done
4197 * during early_init. Register base cftypes separately.
4198 */
4199 if (ss->base_cftypes)
4200 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4833 } 4201 }
4834 4202
4203 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4204 if (!cgroup_kobj)
4205 return -ENOMEM;
4206
4835 err = register_filesystem(&cgroup_fs_type); 4207 err = register_filesystem(&cgroup_fs_type);
4836 if (err < 0) { 4208 if (err < 0) {
4837 kobject_put(cgroup_kobj); 4209 kobject_put(cgroup_kobj);
4838 goto out; 4210 return err;
4839 } 4211 }
4840 4212
4841 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4213 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4842 4214 return 0;
4843out:
4844 if (err)
4845 bdi_destroy(&cgroup_backing_dev_info);
4846
4847 return err;
4848} 4215}
4849 4216
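
cgroup_init() above now registers each subsystem's base_cftypes with cgroup_add_cftypes() after early init, since cftype registration needs kmalloc. A sketch of what such a registration looks like from a controller's side, using hypothetical myctl names (a real controller must also appear in the kernel's subsystem list):

	#include <linux/cgroup.h>
	#include <linux/slab.h>
	#include <linux/err.h>

	static struct cgroup_subsys_state *
	myctl_css_alloc(struct cgroup_subsys_state *parent_css)
	{
		struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

		return css ?: ERR_PTR(-ENOMEM);
	}

	static void myctl_css_free(struct cgroup_subsys_state *css)
	{
		kfree(css);
	}

	static u64 myctl_stat_read(struct cgroup_subsys_state *css,
				   struct cftype *cft)
	{
		return 0;		/* report some per-cgroup statistic */
	}

	static struct cftype myctl_files[] = {
		{
			.name = "stat",
			.read_u64 = myctl_stat_read,
		},
		{ }	/* terminate - registration stops at the empty entry */
	};

	struct cgroup_subsys myctl_cgrp_subsys = {
		.css_alloc	= myctl_css_alloc,
		.css_free	= myctl_css_free,
		.base_cftypes	= myctl_files,
	};
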
4850static int __init cgroup_wq_init(void) 4217static int __init cgroup_wq_init(void)
@@ -4876,12 +4243,6 @@ core_initcall(cgroup_wq_init);
4876 * proc_cgroup_show() 4243 * proc_cgroup_show()
4877 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4244 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4878 * - Used for /proc/<pid>/cgroup. 4245 * - Used for /proc/<pid>/cgroup.
4879 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4880 * doesn't really matter if tsk->cgroup changes after we read it,
4881 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4882 * anyway. No need to check that tsk->cgroup != NULL, thanks to
4883 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
4884 * cgroup to top_cgroup.
4885 */ 4246 */
4886 4247
4887/* TODO: Use a proper seq_file iterator */ 4248/* TODO: Use a proper seq_file iterator */
@@ -4889,12 +4250,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4889{ 4250{
4890 struct pid *pid; 4251 struct pid *pid;
4891 struct task_struct *tsk; 4252 struct task_struct *tsk;
4892 char *buf; 4253 char *buf, *path;
4893 int retval; 4254 int retval;
4894 struct cgroupfs_root *root; 4255 struct cgroup_root *root;
4895 4256
4896 retval = -ENOMEM; 4257 retval = -ENOMEM;
4897 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4258 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4898 if (!buf) 4259 if (!buf)
4899 goto out; 4260 goto out;
4900 4261
@@ -4907,29 +4268,36 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4907 retval = 0; 4268 retval = 0;
4908 4269
4909 mutex_lock(&cgroup_mutex); 4270 mutex_lock(&cgroup_mutex);
4271 down_read(&css_set_rwsem);
4910 4272
4911 for_each_active_root(root) { 4273 for_each_root(root) {
4912 struct cgroup_subsys *ss; 4274 struct cgroup_subsys *ss;
4913 struct cgroup *cgrp; 4275 struct cgroup *cgrp;
4914 int ssid, count = 0; 4276 int ssid, count = 0;
4915 4277
4278 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4279 continue;
4280
4916 seq_printf(m, "%d:", root->hierarchy_id); 4281 seq_printf(m, "%d:", root->hierarchy_id);
4917 for_each_subsys(ss, ssid) 4282 for_each_subsys(ss, ssid)
4918 if (root->subsys_mask & (1 << ssid)) 4283 if (root->cgrp.subsys_mask & (1 << ssid))
4919 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4284 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4920 if (strlen(root->name)) 4285 if (strlen(root->name))
4921 seq_printf(m, "%sname=%s", count ? "," : "", 4286 seq_printf(m, "%sname=%s", count ? "," : "",
4922 root->name); 4287 root->name);
4923 seq_putc(m, ':'); 4288 seq_putc(m, ':');
4924 cgrp = task_cgroup_from_root(tsk, root); 4289 cgrp = task_cgroup_from_root(tsk, root);
4925 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 4290 path = cgroup_path(cgrp, buf, PATH_MAX);
4926 if (retval < 0) 4291 if (!path) {
4292 retval = -ENAMETOOLONG;
4927 goto out_unlock; 4293 goto out_unlock;
4928 seq_puts(m, buf); 4294 }
4295 seq_puts(m, path);
4929 seq_putc(m, '\n'); 4296 seq_putc(m, '\n');
4930 } 4297 }
4931 4298
4932out_unlock: 4299out_unlock:
4300 up_read(&css_set_rwsem);
4933 mutex_unlock(&cgroup_mutex); 4301 mutex_unlock(&cgroup_mutex);
4934 put_task_struct(tsk); 4302 put_task_struct(tsk);
4935out_free: 4303out_free:
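
proc_cgroup_show() above switches to the kernfs-era cgroup_path() convention: the caller passes a PATH_MAX buffer and receives a pointer into it, or NULL when the path does not fit. A minimal sketch of a caller following the same convention (print_cgroup_path is hypothetical):

	#include <linux/cgroup.h>
	#include <linux/slab.h>

	static int print_cgroup_path(struct cgroup *cgrp)
	{
		char *buf, *path;

		buf = kmalloc(PATH_MAX, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		/* returns a pointer into @buf, or NULL if PATH_MAX is too small */
		path = cgroup_path(cgrp, buf, PATH_MAX);
		if (!path) {
			kfree(buf);
			return -ENAMETOOLONG;
		}

		pr_info("cgroup path: %s\n", path);
		kfree(buf);
		return 0;
	}
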
@@ -4955,7 +4323,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4955 for_each_subsys(ss, i) 4323 for_each_subsys(ss, i)
4956 seq_printf(m, "%s\t%d\t%d\t%d\n", 4324 seq_printf(m, "%s\t%d\t%d\t%d\n",
4957 ss->name, ss->root->hierarchy_id, 4325 ss->name, ss->root->hierarchy_id,
4958 ss->root->number_of_cgroups, !ss->disabled); 4326 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4959 4327
4960 mutex_unlock(&cgroup_mutex); 4328 mutex_unlock(&cgroup_mutex);
4961 return 0; 4329 return 0;
@@ -4974,27 +4342,16 @@ static const struct file_operations proc_cgroupstats_operations = {
4974}; 4342};
4975 4343
4976/** 4344/**
4977 * cgroup_fork - attach newly forked task to its parents cgroup. 4345 * cgroup_fork - initialize cgroup related fields during copy_process()
4978 * @child: pointer to task_struct of the child (forked) process. 4346 * @child: pointer to task_struct of the child (forked) process.
4979 * 4347 *
4980 * Description: A task inherits its parent's cgroup at fork(). 4348 * A task is associated with the init_css_set until cgroup_post_fork()
4981 * 4349 * attaches it to the parent's css_set. Empty cg_list indicates that
4982 * A pointer to the shared css_set was automatically copied in 4350 * @child isn't holding a reference to its css_set.
4983 * fork.c by dup_task_struct(). However, we ignore that copy, since
4984 * it was not made under the protection of RCU or cgroup_mutex, so
4985 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
4986 * have already changed current->cgroups, allowing the previously
4987 * referenced cgroup group to be removed and freed.
4988 *
4989 * At the point that cgroup_fork() is called, 'current' is the parent
4990 * task, and the passed argument 'child' points to the child task.
4991 */ 4351 */
4992void cgroup_fork(struct task_struct *child) 4352void cgroup_fork(struct task_struct *child)
4993{ 4353{
4994 task_lock(current); 4354 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4995 get_css_set(task_css_set(current));
4996 child->cgroups = current->cgroups;
4997 task_unlock(current);
4998 INIT_LIST_HEAD(&child->cg_list); 4355 INIT_LIST_HEAD(&child->cg_list);
4999} 4356}
5000 4357
@@ -5014,23 +4371,37 @@ void cgroup_post_fork(struct task_struct *child)
5014 int i; 4371 int i;
5015 4372
5016 /* 4373 /*
5017 * use_task_css_set_links is set to 1 before we walk the tasklist 4374 * This may race against cgroup_enable_task_cg_lists(). As that
5018 * under the tasklist_lock and we read it here after we added the child 4375 * function sets use_task_css_set_links before grabbing
5019 * to the tasklist under the tasklist_lock as well. If the child wasn't 4376 * tasklist_lock and we just went through tasklist_lock to add
5020 * yet in the tasklist when we walked through it from 4377 * @child, it's guaranteed that either we see the set
5021 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value 4378 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5022 * should be visible now due to the paired locking and barriers implied 4379 * @child during its iteration.
5023 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock 4380 *
5024 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock 4381 * If we won the race, @child is associated with %current's
5025 * lock on fork. 4382 * css_set. Grabbing css_set_rwsem guarantees both that the
4383 * association is stable, and, on completion of the parent's
4384 * migration, @child is visible in the source of migration or
4385 * already in the destination cgroup. This guarantee is necessary
4386 * when implementing operations which need to migrate all tasks of
4387 * a cgroup to another.
4388 *
4389 * Note that if we lose to cgroup_enable_task_cg_links(), @child
4390 * will remain in init_css_set. This is safe because all tasks are
4391 * in the init_css_set before cg_links is enabled and there's no
4392 * operation which transfers all tasks out of init_css_set.
5026 */ 4393 */
5027 if (use_task_css_set_links) { 4394 if (use_task_css_set_links) {
5028 write_lock(&css_set_lock); 4395 struct css_set *cset;
5029 task_lock(child); 4396
5030 if (list_empty(&child->cg_list)) 4397 down_write(&css_set_rwsem);
5031 list_add(&child->cg_list, &task_css_set(child)->tasks); 4398 cset = task_css_set(current);
5032 task_unlock(child); 4399 if (list_empty(&child->cg_list)) {
5033 write_unlock(&css_set_lock); 4400 rcu_assign_pointer(child->cgroups, cset);
4401 list_add(&child->cg_list, &cset->tasks);
4402 get_css_set(cset);
4403 }
4404 up_write(&css_set_rwsem);
5034 } 4405 }
5035 4406
5036 /* 4407 /*
@@ -5039,15 +4410,7 @@ void cgroup_post_fork(struct task_struct *child)
5039 * and addition to css_set. 4410 * and addition to css_set.
5040 */ 4411 */
5041 if (need_forkexit_callback) { 4412 if (need_forkexit_callback) {
5042 /* 4413 for_each_subsys(ss, i)
5043 * fork/exit callbacks are supported only for builtin
5044 * subsystems, and the builtin section of the subsys
5045 * array is immutable, so we don't need to lock the
5046 * subsys array here. On the other hand, modular section
5047 * of the array can be freed at module unload, so we
5048 * can't touch that.
5049 */
5050 for_each_builtin_subsys(ss, i)
5051 if (ss->fork) 4414 if (ss->fork)
5052 ss->fork(child); 4415 ss->fork(child);
5053 } 4416 }
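
For reference, a simplified sketch of the other side of the race the comment above describes: the enabling path publishes use_task_css_set_links before taking tasklist_lock, so a task forked and added under that lock cannot be missed by both sides. This is a sketch only, not the exact kernel implementation; the per-task linking is elided:

	static void enable_task_cg_lists_sketch(void)
	{
		struct task_struct *g, *p;

		use_task_css_set_links = true;	/* published before the lock */

		read_lock(&tasklist_lock);
		do_each_thread(g, p) {
			/* link @p to its css_set; elided */
		} while_each_thread(g, p);
		read_unlock(&tasklist_lock);
	}
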
@@ -5056,7 +4419,6 @@ void cgroup_post_fork(struct task_struct *child)
5056/** 4419/**
5057 * cgroup_exit - detach cgroup from exiting task 4420 * cgroup_exit - detach cgroup from exiting task
5058 * @tsk: pointer to task_struct of exiting process 4421 * @tsk: pointer to task_struct of exiting process
5059 * @run_callback: run exit callbacks?
5060 * 4422 *
5061 * Description: Detach cgroup from @tsk and release it. 4423 * Description: Detach cgroup from @tsk and release it.
5062 * 4424 *
@@ -5066,57 +4428,38 @@ void cgroup_post_fork(struct task_struct *child)
5066 * use notify_on_release cgroups where very high task exit scaling 4428 * use notify_on_release cgroups where very high task exit scaling
5067 * is required on large systems. 4429 * is required on large systems.
5068 * 4430 *
5069 * the_top_cgroup_hack: 4431 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We
5070 * 4432 * call cgroup_exit() while the task is still competent to handle
5071 * Set the exiting tasks cgroup to the root cgroup (top_cgroup). 4433 * notify_on_release(), then leave the task attached to the root cgroup in
5072 * 4434 * each hierarchy for the remainder of its exit. No need to bother with
5073 * We call cgroup_exit() while the task is still competent to 4435 * init_css_set refcnting. init_css_set never goes away and we can't race
5074 * handle notify_on_release(), then leave the task attached to the 4436 * with migration path - PF_EXITING is visible to migration path.
5075 * root cgroup in each hierarchy for the remainder of its exit.
5076 *
5077 * To do this properly, we would increment the reference count on
5078 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
5079 * code we would add a second cgroup function call, to drop that
5080 * reference. This would just create an unnecessary hot spot on
5081 * the top_cgroup reference count, to no avail.
5082 *
5083 * Normally, holding a reference to a cgroup without bumping its
5084 * count is unsafe. The cgroup could go away, or someone could
5085 * attach us to a different cgroup, decrementing the count on
5086 * the first cgroup that we never incremented. But in this case,
5087 * top_cgroup isn't going away, and either task has PF_EXITING set,
5088 * which wards off any cgroup_attach_task() attempts, or task is a failed
5089 * fork, never visible to cgroup_attach_task.
5090 */ 4437 */
5091void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4438void cgroup_exit(struct task_struct *tsk)
5092{ 4439{
5093 struct cgroup_subsys *ss; 4440 struct cgroup_subsys *ss;
5094 struct css_set *cset; 4441 struct css_set *cset;
4442 bool put_cset = false;
5095 int i; 4443 int i;
5096 4444
5097 /* 4445 /*
5098 * Unlink from the css_set task list if necessary. 4446 * Unlink @tsk from its css_set. As migration path can't race
5099 * Optimistically check cg_list before taking 4447 * with us, we can check cg_list without grabbing css_set_rwsem.
5100 * css_set_lock
5101 */ 4448 */
5102 if (!list_empty(&tsk->cg_list)) { 4449 if (!list_empty(&tsk->cg_list)) {
5103 write_lock(&css_set_lock); 4450 down_write(&css_set_rwsem);
5104 if (!list_empty(&tsk->cg_list)) 4451 list_del_init(&tsk->cg_list);
5105 list_del_init(&tsk->cg_list); 4452 up_write(&css_set_rwsem);
5106 write_unlock(&css_set_lock); 4453 put_cset = true;
5107 } 4454 }
5108 4455
5109 /* Reassign the task to the init_css_set. */ 4456 /* Reassign the task to the init_css_set. */
5110 task_lock(tsk);
5111 cset = task_css_set(tsk); 4457 cset = task_css_set(tsk);
5112 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 4458 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5113 4459
5114 if (run_callbacks && need_forkexit_callback) { 4460 if (need_forkexit_callback) {
5115 /* 4461 /* see cgroup_post_fork() for details */
5116 * fork/exit callbacks are supported only for builtin 4462 for_each_subsys(ss, i) {
5117 * subsystems, see cgroup_post_fork() for details.
5118 */
5119 for_each_builtin_subsys(ss, i) {
5120 if (ss->exit) { 4463 if (ss->exit) {
5121 struct cgroup_subsys_state *old_css = cset->subsys[i]; 4464 struct cgroup_subsys_state *old_css = cset->subsys[i];
5122 struct cgroup_subsys_state *css = task_css(tsk, i); 4465 struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5125,9 +4468,9 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5125 } 4468 }
5126 } 4469 }
5127 } 4470 }
5128 task_unlock(tsk);
5129 4471
5130 put_css_set_taskexit(cset); 4472 if (put_cset)
4473 put_css_set(cset, true);
5131} 4474}
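
cgroup_exit() above leans on init_css_set being statically allocated and immortal, so exiting tasks can be re-pointed at it without taking a reference. A minimal sketch of that idiom with hypothetical my_state/worker types:

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct my_state {			/* hypothetical */
		atomic_t refcount;
	};

	static struct my_state default_state = {
		.refcount = ATOMIC_INIT(1),	/* permanently elevated, never freed */
	};

	struct worker {				/* hypothetical */
		struct my_state *state;
	};

	static void put_state(struct my_state *s)
	{
		if (atomic_dec_and_test(&s->refcount))
			kfree(s);
	}

	static void worker_exit_state(struct worker *w)
	{
		struct my_state *old = w->state;

		/* no get: &default_state is static and never goes away */
		w->state = &default_state;
		if (old != &default_state)
			put_state(old);		/* drop the reference we held */
	}
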
5132 4475
5133static void check_for_release(struct cgroup *cgrp) 4476static void check_for_release(struct cgroup *cgrp)
@@ -5184,16 +4527,17 @@ static void cgroup_release_agent(struct work_struct *work)
5184 while (!list_empty(&release_list)) { 4527 while (!list_empty(&release_list)) {
5185 char *argv[3], *envp[3]; 4528 char *argv[3], *envp[3];
5186 int i; 4529 int i;
5187 char *pathbuf = NULL, *agentbuf = NULL; 4530 char *pathbuf = NULL, *agentbuf = NULL, *path;
5188 struct cgroup *cgrp = list_entry(release_list.next, 4531 struct cgroup *cgrp = list_entry(release_list.next,
5189 struct cgroup, 4532 struct cgroup,
5190 release_list); 4533 release_list);
5191 list_del_init(&cgrp->release_list); 4534 list_del_init(&cgrp->release_list);
5192 raw_spin_unlock(&release_list_lock); 4535 raw_spin_unlock(&release_list_lock);
5193 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4536 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5194 if (!pathbuf) 4537 if (!pathbuf)
5195 goto continue_free; 4538 goto continue_free;
5196 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) 4539 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4540 if (!path)
5197 goto continue_free; 4541 goto continue_free;
5198 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4542 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5199 if (!agentbuf) 4543 if (!agentbuf)
@@ -5201,7 +4545,7 @@ static void cgroup_release_agent(struct work_struct *work)
5201 4545
5202 i = 0; 4546 i = 0;
5203 argv[i++] = agentbuf; 4547 argv[i++] = agentbuf;
5204 argv[i++] = pathbuf; 4548 argv[i++] = path;
5205 argv[i] = NULL; 4549 argv[i] = NULL;
5206 4550
5207 i = 0; 4551 i = 0;
@@ -5235,11 +4579,7 @@ static int __init cgroup_disable(char *str)
5235 if (!*token) 4579 if (!*token)
5236 continue; 4580 continue;
5237 4581
5238 /* 4582 for_each_subsys(ss, i) {
5239 * cgroup_disable, being at boot time, can't know about
5240 * module subsystems, so we don't worry about them.
5241 */
5242 for_each_builtin_subsys(ss, i) {
5243 if (!strcmp(token, ss->name)) { 4583 if (!strcmp(token, ss->name)) {
5244 ss->disabled = 1; 4584 ss->disabled = 1;
5245 printk(KERN_INFO "Disabling %s control group" 4585 printk(KERN_INFO "Disabling %s control group"
@@ -5253,28 +4593,42 @@ static int __init cgroup_disable(char *str)
5253__setup("cgroup_disable=", cgroup_disable); 4593__setup("cgroup_disable=", cgroup_disable);
5254 4594
5255/** 4595/**
5256 * css_from_dir - get corresponding css from the dentry of a cgroup dir 4596 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
5257 * @dentry: directory dentry of interest 4597 * @dentry: directory dentry of interest
5258 * @ss: subsystem of interest 4598 * @ss: subsystem of interest
5259 * 4599 *
5260 * Must be called under cgroup_mutex or RCU read lock. The caller is 4600 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5261 * responsible for pinning the returned css if it needs to be accessed 4601 * to get the corresponding css and return it. If such a css doesn't exist
5262 * outside the critical section. 4602 * or can't be pinned, an ERR_PTR value is returned.
5263 */ 4603 */
5264struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 4604struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
5265 struct cgroup_subsys *ss) 4605 struct cgroup_subsys *ss)
5266{ 4606{
4607 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4608 struct cgroup_subsys_state *css = NULL;
5267 struct cgroup *cgrp; 4609 struct cgroup *cgrp;
5268 4610
5269 cgroup_assert_mutex_or_rcu_locked();
5270
5271 /* is @dentry a cgroup dir? */ 4611 /* is @dentry a cgroup dir? */
5272 if (!dentry->d_inode || 4612 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5273 dentry->d_inode->i_op != &cgroup_dir_inode_operations) 4613 kernfs_type(kn) != KERNFS_DIR)
5274 return ERR_PTR(-EBADF); 4614 return ERR_PTR(-EBADF);
5275 4615
5276 cgrp = __d_cgrp(dentry); 4616 rcu_read_lock();
5277 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); 4617
4618 /*
4619 * This path doesn't originate from kernfs and @kn could already
4620 * have been or be removed at any point. @kn->priv is RCU
4621 * protected for this access. See destroy_locked() for details.
4622 */
4623 cgrp = rcu_dereference(kn->priv);
4624 if (cgrp)
4625 css = cgroup_css(cgrp, ss);
4626
4627 if (!css || !css_tryget(css))
4628 css = ERR_PTR(-ENOENT);
4629
4630 rcu_read_unlock();
4631 return css;
5278} 4632}
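
A hypothetical out-of-kernfs caller of css_tryget_from_dir(), e.g. one resolving an open cgroup directory fd to a pinned css (myctl_cgrp_subsys and myctl_attach_fd are illustrative, not from this patch):

	#include <linux/file.h>
	#include <linux/cgroup.h>

	static int myctl_attach_fd(int fd)
	{
		struct fd f = fdget(fd);
		struct cgroup_subsys_state *css;
		int ret = 0;

		if (!f.file)
			return -EBADF;

		/* pins the css on success; -EBADF / -ENOENT as ERR_PTR on failure */
		css = css_tryget_from_dir(f.file->f_path.dentry,
					  &myctl_cgrp_subsys);
		if (IS_ERR(css)) {
			ret = PTR_ERR(css);
			goto out;
		}

		/* ... use css ... */
		css_put(css);
	out:
		fdput(f);
		return ret;
	}
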
5279 4633
5280/** 4634/**
@@ -5289,7 +4643,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5289{ 4643{
5290 struct cgroup *cgrp; 4644 struct cgroup *cgrp;
5291 4645
5292 cgroup_assert_mutex_or_rcu_locked(); 4646 cgroup_assert_mutexes_or_rcu_locked();
5293 4647
5294 cgrp = idr_find(&ss->root->cgroup_idr, id); 4648 cgrp = idr_find(&ss->root->cgroup_idr, id);
5295 if (cgrp) 4649 if (cgrp)
@@ -5341,23 +4695,25 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5341{ 4695{
5342 struct cgrp_cset_link *link; 4696 struct cgrp_cset_link *link;
5343 struct css_set *cset; 4697 struct css_set *cset;
4698 char *name_buf;
5344 4699
5345 read_lock(&css_set_lock); 4700 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4701 if (!name_buf)
4702 return -ENOMEM;
4703
4704 down_read(&css_set_rwsem);
5346 rcu_read_lock(); 4705 rcu_read_lock();
5347 cset = rcu_dereference(current->cgroups); 4706 cset = rcu_dereference(current->cgroups);
5348 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 4707 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5349 struct cgroup *c = link->cgrp; 4708 struct cgroup *c = link->cgrp;
5350 const char *name;
5351 4709
5352 if (c->dentry) 4710 cgroup_name(c, name_buf, NAME_MAX + 1);
5353 name = c->dentry->d_name.name;
5354 else
5355 name = "?";
5356 seq_printf(seq, "Root %d group %s\n", 4711 seq_printf(seq, "Root %d group %s\n",
5357 c->root->hierarchy_id, name); 4712 c->root->hierarchy_id, name_buf);
5358 } 4713 }
5359 rcu_read_unlock(); 4714 rcu_read_unlock();
5360 read_unlock(&css_set_lock); 4715 up_read(&css_set_rwsem);
4716 kfree(name_buf);
5361 return 0; 4717 return 0;
5362} 4718}
5363 4719
@@ -5367,23 +4723,30 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5367 struct cgroup_subsys_state *css = seq_css(seq); 4723 struct cgroup_subsys_state *css = seq_css(seq);
5368 struct cgrp_cset_link *link; 4724 struct cgrp_cset_link *link;
5369 4725
5370 read_lock(&css_set_lock); 4726 down_read(&css_set_rwsem);
5371 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 4727 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5372 struct css_set *cset = link->cset; 4728 struct css_set *cset = link->cset;
5373 struct task_struct *task; 4729 struct task_struct *task;
5374 int count = 0; 4730 int count = 0;
4731
5375 seq_printf(seq, "css_set %p\n", cset); 4732 seq_printf(seq, "css_set %p\n", cset);
4733
5376 list_for_each_entry(task, &cset->tasks, cg_list) { 4734 list_for_each_entry(task, &cset->tasks, cg_list) {
5377 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 4735 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5378 seq_puts(seq, " ...\n"); 4736 goto overflow;
5379 break; 4737 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5380 } else { 4738 }
5381 seq_printf(seq, " task %d\n", 4739
5382 task_pid_vnr(task)); 4740 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5383 } 4741 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4742 goto overflow;
4743 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5384 } 4744 }
4745 continue;
4746 overflow:
4747 seq_puts(seq, " ...\n");
5385 } 4748 }
5386 read_unlock(&css_set_lock); 4749 up_read(&css_set_rwsem);
5387 return 0; 4750 return 0;
5388} 4751}
5389 4752
@@ -5426,11 +4789,9 @@ static struct cftype debug_files[] = {
5426 { } /* terminate */ 4789 { } /* terminate */
5427}; 4790};
5428 4791
5429struct cgroup_subsys debug_subsys = { 4792struct cgroup_subsys debug_cgrp_subsys = {
5430 .name = "debug",
5431 .css_alloc = debug_css_alloc, 4793 .css_alloc = debug_css_alloc,
5432 .css_free = debug_css_free, 4794 .css_free = debug_css_free,
5433 .subsys_id = debug_subsys_id,
5434 .base_cftypes = debug_files, 4795 .base_cftypes = debug_files,
5435}; 4796};
5436#endif /* CONFIG_CGROUP_DEBUG */ 4797#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 6c3154e477f6..2bc4a2256444 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
52 52
53static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
54{ 54{
55 return css_freezer(task_css(task, freezer_subsys_id)); 55 return css_freezer(task_css(task, freezer_cgrp_id));
56} 56}
57 57
58static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
@@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state)
84 return "THAWED"; 84 return "THAWED";
85}; 85};
86 86
87struct cgroup_subsys freezer_subsys;
88
89static struct cgroup_subsys_state * 87static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css) 88freezer_css_alloc(struct cgroup_subsys_state *parent_css)
91{ 89{
@@ -189,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
189 * current state before executing the following - !frozen tasks may 187 * current state before executing the following - !frozen tasks may
190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 188 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
191 */ 189 */
192 cgroup_taskset_for_each(task, new_css, tset) { 190 cgroup_taskset_for_each(task, tset) {
193 if (!(freezer->state & CGROUP_FREEZING)) { 191 if (!(freezer->state & CGROUP_FREEZING)) {
194 __thaw_task(task); 192 __thaw_task(task);
195 } else { 193 } else {
@@ -216,6 +214,16 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
216 } 214 }
217} 215}
218 216
217/**
218 * freezer_fork - cgroup post fork callback
219 * @task: a task which has just been forked
220 *
221 * @task has just been created and should conform to the current state of
222 * the cgroup_freezer it belongs to. This function may race against
223 * freezer_attach(). Losing to freezer_attach() means that we don't have
224 * to do anything as freezer_attach() will put @task into the appropriate
225 * state.
226 */
219static void freezer_fork(struct task_struct *task) 227static void freezer_fork(struct task_struct *task)
220{ 228{
221 struct freezer *freezer; 229 struct freezer *freezer;
@@ -224,14 +232,26 @@ static void freezer_fork(struct task_struct *task)
224 freezer = task_freezer(task); 232 freezer = task_freezer(task);
225 233
226 /* 234 /*
227 * The root cgroup is non-freezable, so we can skip the 235 * The root cgroup is non-freezable, so we can skip locking the
228 * following check. 236 * freezer. This is safe regardless of race with task migration.
237 * If we didn't race or won, skipping is obviously the right thing
238 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do.
229 */ 240 */
230 if (!parent_freezer(freezer)) 241 if (!parent_freezer(freezer))
231 goto out; 242 goto out;
232 243
244 /*
245 * Grab @freezer->lock and freeze @task after verifying @task still
246 * belongs to @freezer and it's freezing. The former is for the
247 * case where we have raced against task migration and lost and
248 * @task is already in a different cgroup which may not be frozen.
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
233 spin_lock_irq(&freezer->lock); 253 spin_lock_irq(&freezer->lock);
234 if (freezer->state & CGROUP_FREEZING) 254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
235 freeze_task(task); 255 freeze_task(task);
236 spin_unlock_irq(&freezer->lock); 256 spin_unlock_irq(&freezer->lock);
237out: 257out:
@@ -422,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
422} 442}
423 443
424static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
425 const char *buffer) 445 char *buffer)
426{ 446{
427 bool freeze; 447 bool freeze;
428 448
@@ -473,13 +493,11 @@ static struct cftype files[] = {
473 { } /* terminate */ 493 { } /* terminate */
474}; 494};
475 495
476struct cgroup_subsys freezer_subsys = { 496struct cgroup_subsys freezer_cgrp_subsys = {
477 .name = "freezer",
478 .css_alloc = freezer_css_alloc, 497 .css_alloc = freezer_css_alloc,
479 .css_online = freezer_css_online, 498 .css_online = freezer_css_online,
480 .css_offline = freezer_css_offline, 499 .css_offline = freezer_css_offline,
481 .css_free = freezer_css_free, 500 .css_free = freezer_css_free,
482 .subsys_id = freezer_subsys_id,
483 .attach = freezer_attach, 501 .attach = freezer_attach,
484 .fork = freezer_fork, 502 .fork = freezer_fork,
485 .base_cftypes = files, 503 .base_cftypes = files,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e6b1b66afe52..e2dbb60004d4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
120static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
121{ 121{
122 return css_cs(task_css(task, cpuset_subsys_id)); 122 return css_cs(task_css(task, cpuset_cgrp_id));
123} 123}
124 124
125static inline struct cpuset *parent_cs(struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
@@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
467 * be changed to have empty cpus_allowed or mems_allowed. 467 * be changed to have empty cpus_allowed or mems_allowed.
468 */ 468 */
469 ret = -ENOSPC; 469 ret = -ENOSPC;
470 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { 470 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
471 if (!cpumask_empty(cur->cpus_allowed) && 471 if (!cpumask_empty(cur->cpus_allowed) &&
472 cpumask_empty(trial->cpus_allowed)) 472 cpumask_empty(trial->cpus_allowed))
473 goto out; 473 goto out;
@@ -829,55 +829,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
829} 829}
830 830
831/** 831/**
832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
833 * @tsk: task to test
834 * @data: cpuset to @tsk belongs to
835 *
836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
837 * mask needs to be changed.
838 *
839 * We don't need to re-check for the cgroup/cpuset membership, since we're
840 * holding cpuset_mutex at this point.
841 */
842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
843{
844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
846
847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
848}
849
850/**
851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 832 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 833 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
854 *
855 * Called with cpuset_mutex held
856 * 834 *
857 * The css_scan_tasks() function will scan all the tasks in a cgroup, 835 * Iterate through each task of @cs updating its cpus_allowed to the
858 * calling callback functions for each. 836 * effective cpuset's. As this function is called with cpuset_mutex held,
859 * 837 * cpuset membership stays stable.
860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
861 * if @heap != NULL.
862 */ 838 */
863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 839static void update_tasks_cpumask(struct cpuset *cs)
864{ 840{
865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); 841 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
842 struct css_task_iter it;
843 struct task_struct *task;
844
845 css_task_iter_start(&cs->css, &it);
846 while ((task = css_task_iter_next(&it)))
847 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
848 css_task_iter_end(&it);
866} 849}
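
The css_task_iter_start()/css_task_iter_next()/css_task_iter_end() triple replaces css_scan_tasks() throughout this patch and needs no heap allocation. Its general shape, wrapped in a hypothetical helper for illustration (the caller must hold whatever keeps cgroup membership stable - cpuset_mutex here):

	static void for_each_cs_task(struct cpuset *cs,
				     void (*fn)(struct task_struct *task, void *arg),
				     void *arg)
	{
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&cs->css, &it);
		while ((task = css_task_iter_next(&it)))
			fn(task, arg);
		css_task_iter_end(&it);
	}
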
867 850
868/* 851/*
869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 852 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
870 * @root_cs: the root cpuset of the hierarchy 853 * @root_cs: the root cpuset of the hierarchy
871 * @update_root: update root cpuset or not? 854 * @update_root: update root cpuset or not?
872 * @heap: the heap used by css_scan_tasks()
873 * 855 *
874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 856 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
875 * which take on cpumask of @root_cs. 857 * which take on cpumask of @root_cs.
876 * 858 *
877 * Called with cpuset_mutex held 859 * Called with cpuset_mutex held
878 */ 860 */
879static void update_tasks_cpumask_hier(struct cpuset *root_cs, 861static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
880 bool update_root, struct ptr_heap *heap)
881{ 862{
882 struct cpuset *cp; 863 struct cpuset *cp;
883 struct cgroup_subsys_state *pos_css; 864 struct cgroup_subsys_state *pos_css;
@@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
898 continue; 879 continue;
899 rcu_read_unlock(); 880 rcu_read_unlock();
900 881
901 update_tasks_cpumask(cp, heap); 882 update_tasks_cpumask(cp);
902 883
903 rcu_read_lock(); 884 rcu_read_lock();
904 css_put(&cp->css); 885 css_put(&cp->css);
@@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf) 896 const char *buf)
916{ 897{
917 struct ptr_heap heap;
918 int retval; 898 int retval;
919 int is_load_balanced; 899 int is_load_balanced;
920 900
@@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
947 if (retval < 0) 927 if (retval < 0)
948 return retval; 928 return retval;
949 929
950 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
951 if (retval)
952 return retval;
953
954 is_load_balanced = is_sched_load_balance(trialcs); 930 is_load_balanced = is_sched_load_balance(trialcs);
955 931
956 mutex_lock(&callback_mutex); 932 mutex_lock(&callback_mutex);
957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 933 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
958 mutex_unlock(&callback_mutex); 934 mutex_unlock(&callback_mutex);
959 935
960 update_tasks_cpumask_hier(cs, true, &heap); 936 update_tasks_cpumask_hier(cs, true);
961
962 heap_free(&heap);
963 937
964 if (is_load_balanced) 938 if (is_load_balanced)
965 rebuild_sched_domains_locked(); 939 rebuild_sched_domains_locked();
@@ -1048,53 +1022,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1048 task_unlock(tsk); 1022 task_unlock(tsk);
1049} 1023}
1050 1024
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1056/*
1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1060 */
1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1062{
1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1065 struct mm_struct *mm;
1066 int migrate;
1067
1068 cpuset_change_task_nodemask(p, arg->newmems);
1069
1070 mm = get_task_mm(p);
1071 if (!mm)
1072 return;
1073
1074 migrate = is_memory_migrate(cs);
1075
1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1077 if (migrate)
1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1079 mmput(mm);
1080}
1081
1082static void *cpuset_being_rebound; 1025static void *cpuset_being_rebound;
1083 1026
1084/** 1027/**
1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1028 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1029 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1088 * 1030 *
1089 * Called with cpuset_mutex held. No return value. It's guaranteed that 1031 * Iterate through each task of @cs updating its mems_allowed to the
1090 * css_scan_tasks() always returns 0 if @heap != NULL. 1032 * effective cpuset's. As this function is called with cpuset_mutex held,
1033 * cpuset membership stays stable.
1091 */ 1034 */
1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1035static void update_tasks_nodemask(struct cpuset *cs)
1093{ 1036{
1094 static nodemask_t newmems; /* protected by cpuset_mutex */ 1037 static nodemask_t newmems; /* protected by cpuset_mutex */
1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1038 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs, 1039 struct css_task_iter it;
1097 .newmems = &newmems }; 1040 struct task_struct *task;
1098 1041
1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1042 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1100 1043
@@ -1110,7 +1053,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1053 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1111 * is idempotent. Also migrate pages in each mm to new nodes. 1054 * is idempotent. Also migrate pages in each mm to new nodes.
1112 */ 1055 */
1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); 1056 css_task_iter_start(&cs->css, &it);
1057 while ((task = css_task_iter_next(&it))) {
1058 struct mm_struct *mm;
1059 bool migrate;
1060
1061 cpuset_change_task_nodemask(task, &newmems);
1062
1063 mm = get_task_mm(task);
1064 if (!mm)
1065 continue;
1066
1067 migrate = is_memory_migrate(cs);
1068
1069 mpol_rebind_mm(mm, &cs->mems_allowed);
1070 if (migrate)
1071 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1072 mmput(mm);
1073 }
1074 css_task_iter_end(&it);
1114 1075
1115 /* 1076 /*
1116 * All the tasks' nodemasks have been updated, update 1077 * All the tasks' nodemasks have been updated, update
@@ -1126,15 +1087,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1087 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1127 * @cs: the root cpuset of the hierarchy 1088 * @cs: the root cpuset of the hierarchy
1128 * @update_root: update the root cpuset or not? 1089 * @update_root: update the root cpuset or not?
1129 * @heap: the heap used by css_scan_tasks()
1130 * 1090 *
1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1091 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1132 * which take on nodemask of @root_cs. 1092 * which take on nodemask of @root_cs.
1133 * 1093 *
1134 * Called with cpuset_mutex held 1094 * Called with cpuset_mutex held
1135 */ 1095 */
1136static void update_tasks_nodemask_hier(struct cpuset *root_cs, 1096static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1137 bool update_root, struct ptr_heap *heap)
1138{ 1097{
1139 struct cpuset *cp; 1098 struct cpuset *cp;
1140 struct cgroup_subsys_state *pos_css; 1099 struct cgroup_subsys_state *pos_css;
@@ -1155,7 +1114,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1155 continue; 1114 continue;
1156 rcu_read_unlock(); 1115 rcu_read_unlock();
1157 1116
1158 update_tasks_nodemask(cp, heap); 1117 update_tasks_nodemask(cp);
1159 1118
1160 rcu_read_lock(); 1119 rcu_read_lock();
1161 css_put(&cp->css); 1120 css_put(&cp->css);
@@ -1180,7 +1139,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1180 const char *buf) 1139 const char *buf)
1181{ 1140{
1182 int retval; 1141 int retval;
1183 struct ptr_heap heap;
1184 1142
1185 /* 1143 /*
1186 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1144 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
@@ -1219,17 +1177,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1219 if (retval < 0) 1177 if (retval < 0)
1220 goto done; 1178 goto done;
1221 1179
1222 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1223 if (retval < 0)
1224 goto done;
1225
1226 mutex_lock(&callback_mutex); 1180 mutex_lock(&callback_mutex);
1227 cs->mems_allowed = trialcs->mems_allowed; 1181 cs->mems_allowed = trialcs->mems_allowed;
1228 mutex_unlock(&callback_mutex); 1182 mutex_unlock(&callback_mutex);
1229 1183
1230 update_tasks_nodemask_hier(cs, true, &heap); 1184 update_tasks_nodemask_hier(cs, true);
1231
1232 heap_free(&heap);
1233done: 1185done:
1234 return retval; 1186 return retval;
1235} 1187}
@@ -1257,38 +1209,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1257} 1209}
1258 1210
1259/** 1211/**
1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1261 * @tsk: task to be updated
1262 * @data: cpuset to @tsk belongs to
1263 *
1264 * Called by css_scan_tasks() for each task in a cgroup.
1265 *
1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1267 * holding cpuset_mutex at this point.
1268 */
1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1270{
1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1274}
1275
1276/**
1277 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1212 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1278 * @cs: the cpuset in which each task's spread flags needs to be changed 1213 * @cs: the cpuset in which each task's spread flags needs to be changed
1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1280 *
1281 * Called with cpuset_mutex held
1282 * 1214 *
1283 * The css_scan_tasks() function will scan all the tasks in a cgroup, 1215 * Iterate through each task of @cs updating its spread flags. As this
1284 * calling callback functions for each. 1216 * function is called with cpuset_mutex held, cpuset membership stays
1285 * 1217 * stable.
1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1287 * if @heap != NULL.
1288 */ 1218 */
1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1219static void update_tasks_flags(struct cpuset *cs)
1290{ 1220{
1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); 1221 struct css_task_iter it;
1222 struct task_struct *task;
1223
1224 css_task_iter_start(&cs->css, &it);
1225 while ((task = css_task_iter_next(&it)))
1226 cpuset_update_task_spread_flag(cs, task);
1227 css_task_iter_end(&it);
1292} 1228}
1293 1229
1294/* 1230/*
@@ -1306,7 +1242,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1306 struct cpuset *trialcs; 1242 struct cpuset *trialcs;
1307 int balance_flag_changed; 1243 int balance_flag_changed;
1308 int spread_flag_changed; 1244 int spread_flag_changed;
1309 struct ptr_heap heap;
1310 int err; 1245 int err;
1311 1246
1312 trialcs = alloc_trial_cpuset(cs); 1247 trialcs = alloc_trial_cpuset(cs);
@@ -1322,10 +1257,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1322 if (err < 0) 1257 if (err < 0)
1323 goto out; 1258 goto out;
1324 1259
1325 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1326 if (err < 0)
1327 goto out;
1328
1329 balance_flag_changed = (is_sched_load_balance(cs) != 1260 balance_flag_changed = (is_sched_load_balance(cs) !=
1330 is_sched_load_balance(trialcs)); 1261 is_sched_load_balance(trialcs));
1331 1262
@@ -1340,8 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1340 rebuild_sched_domains_locked(); 1271 rebuild_sched_domains_locked();
1341 1272
1342 if (spread_flag_changed) 1273 if (spread_flag_changed)
1343 update_tasks_flags(cs, &heap); 1274 update_tasks_flags(cs);
1344 heap_free(&heap);
1345out: 1275out:
1346 free_trial_cpuset(trialcs); 1276 free_trial_cpuset(trialcs);
1347 return err; 1277 return err;
@@ -1445,6 +1375,8 @@ static int fmeter_getrate(struct fmeter *fmp)
1445 return val; 1375 return val;
1446} 1376}
1447 1377
1378static struct cpuset *cpuset_attach_old_cs;
1379
1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1380/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1449static int cpuset_can_attach(struct cgroup_subsys_state *css, 1381static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset) 1382 struct cgroup_taskset *tset)
@@ -1453,6 +1385,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1453 struct task_struct *task; 1385 struct task_struct *task;
1454 int ret; 1386 int ret;
1455 1387
1388 /* used later by cpuset_attach() */
1389 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
1390
1456 mutex_lock(&cpuset_mutex); 1391 mutex_lock(&cpuset_mutex);
1457 1392
1458 /* 1393 /*
@@ -1464,7 +1399,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1399 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1465 goto out_unlock; 1400 goto out_unlock;
1466 1401
1467 cgroup_taskset_for_each(task, css, tset) { 1402 cgroup_taskset_for_each(task, tset) {
1468 /* 1403 /*
1469 * Kthreads which disallow setaffinity shouldn't be moved 1404 * Kthreads which disallow setaffinity shouldn't be moved
1470 * to a new cpuset; we don't want to change their cpu 1405 * to a new cpuset; we don't want to change their cpu
@@ -1516,10 +1451,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1516 struct mm_struct *mm; 1451 struct mm_struct *mm;
1517 struct task_struct *task; 1452 struct task_struct *task;
1518 struct task_struct *leader = cgroup_taskset_first(tset); 1453 struct task_struct *leader = cgroup_taskset_first(tset);
1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1520 cpuset_subsys_id);
1521 struct cpuset *cs = css_cs(css); 1454 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss); 1455 struct cpuset *oldcs = cpuset_attach_old_cs;
1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1456 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1457 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1525 1458
@@ -1533,7 +1466,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1533 1466
1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1467 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1535 1468
1536 cgroup_taskset_for_each(task, css, tset) { 1469 cgroup_taskset_for_each(task, tset) {
1537 /* 1470 /*
1538 * can_attach beforehand should guarantee that this doesn't 1471 * can_attach beforehand should guarantee that this doesn't
1539 * fail. TODO: have a better way to handle failure here 1472 * fail. TODO: have a better way to handle failure here
@@ -1673,7 +1606,7 @@ out_unlock:
1673 * Common handling for a write to a "cpus" or "mems" file. 1606 * Common handling for a write to a "cpus" or "mems" file.
1674 */ 1607 */
1675static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1608static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1676 struct cftype *cft, const char *buf) 1609 struct cftype *cft, char *buf)
1677{ 1610{
1678 struct cpuset *cs = css_cs(css); 1611 struct cpuset *cs = css_cs(css);
1679 struct cpuset *trialcs; 1612 struct cpuset *trialcs;
@@ -2020,8 +1953,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2020 kfree(cs); 1953 kfree(cs);
2021} 1954}
2022 1955
2023struct cgroup_subsys cpuset_subsys = { 1956struct cgroup_subsys cpuset_cgrp_subsys = {
2024 .name = "cpuset",
2025 .css_alloc = cpuset_css_alloc, 1957 .css_alloc = cpuset_css_alloc,
2026 .css_online = cpuset_css_online, 1958 .css_online = cpuset_css_online,
2027 .css_offline = cpuset_css_offline, 1959 .css_offline = cpuset_css_offline,
@@ -2029,7 +1961,6 @@ struct cgroup_subsys cpuset_subsys = {
2029 .can_attach = cpuset_can_attach, 1961 .can_attach = cpuset_can_attach,
2030 .cancel_attach = cpuset_cancel_attach, 1962 .cancel_attach = cpuset_cancel_attach,
2031 .attach = cpuset_attach, 1963 .attach = cpuset_attach,
2032 .subsys_id = cpuset_subsys_id,
2033 .base_cftypes = files, 1964 .base_cftypes = files,
2034 .early_init = 1, 1965 .early_init = 1,
2035}; 1966};
@@ -2086,10 +2017,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2086 parent = parent_cs(parent); 2017 parent = parent_cs(parent);
2087 2018
2088 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2089 rcu_read_lock(); 2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset ");
2090 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", 2021 pr_cont_cgroup_name(cs->css.cgroup);
2091 cgroup_name(cs->css.cgroup)); 2022 pr_cont("\n");
2092 rcu_read_unlock();
2093 } 2023 }
2094} 2024}
2095 2025
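The explicit rcu_read_lock()/cgroup_name() pair can go because
pr_cont_cgroup_name() does its own locking; with cgroup names now living
in kernfs, the helper is plausibly just a thin wrapper (sketch, assuming
a kernfs-backed cgroup with its node at cgrp->kn):

	static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
	{
		pr_cont_kernfs_name(cgrp->kn);	/* kernfs locks internally */
	}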
@@ -2137,7 +2067,7 @@ retry:
2137 */ 2067 */
2138 if ((sane && cpumask_empty(cs->cpus_allowed)) || 2068 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2139 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) 2069 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2140 update_tasks_cpumask(cs, NULL); 2070 update_tasks_cpumask(cs);
2141 2071
2142 mutex_lock(&callback_mutex); 2072 mutex_lock(&callback_mutex);
2143 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2073 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
@@ -2151,7 +2081,7 @@ retry:
2151 */ 2081 */
2152 if ((sane && nodes_empty(cs->mems_allowed)) || 2082 if ((sane && nodes_empty(cs->mems_allowed)) ||
2153 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) 2083 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2154 update_tasks_nodemask(cs, NULL); 2084 update_tasks_nodemask(cs);
2155 2085
2156 is_empty = cpumask_empty(cs->cpus_allowed) || 2086 is_empty = cpumask_empty(cs->cpus_allowed) ||
2157 nodes_empty(cs->mems_allowed); 2087 nodes_empty(cs->mems_allowed);
@@ -2213,7 +2143,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2213 mutex_lock(&callback_mutex); 2143 mutex_lock(&callback_mutex);
2214 top_cpuset.mems_allowed = new_mems; 2144 top_cpuset.mems_allowed = new_mems;
2215 mutex_unlock(&callback_mutex); 2145 mutex_unlock(&callback_mutex);
2216 update_tasks_nodemask(&top_cpuset, NULL); 2146 update_tasks_nodemask(&top_cpuset);
2217 } 2147 }
2218 2148
2219 mutex_unlock(&cpuset_mutex); 2149 mutex_unlock(&cpuset_mutex);
@@ -2305,10 +2235,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2305 struct cpuset *cpus_cs; 2235 struct cpuset *cpus_cs;
2306 2236
2307 mutex_lock(&callback_mutex); 2237 mutex_lock(&callback_mutex);
2308 task_lock(tsk); 2238 rcu_read_lock();
2309 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2239 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2310 guarantee_online_cpus(cpus_cs, pmask); 2240 guarantee_online_cpus(cpus_cs, pmask);
2311 task_unlock(tsk); 2241 rcu_read_unlock();
2312 mutex_unlock(&callback_mutex); 2242 mutex_unlock(&callback_mutex);
2313} 2243}
2314 2244
@@ -2361,10 +2291,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2361 nodemask_t mask; 2291 nodemask_t mask;
2362 2292
2363 mutex_lock(&callback_mutex); 2293 mutex_lock(&callback_mutex);
2364 task_lock(tsk); 2294 rcu_read_lock();
2365 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2295 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2366 guarantee_online_mems(mems_cs, &mask); 2296 guarantee_online_mems(mems_cs, &mask);
2367 task_unlock(tsk); 2297 rcu_read_unlock();
2368 mutex_unlock(&callback_mutex); 2298 mutex_unlock(&callback_mutex);
2369 2299
2370 return mask; 2300 return mask;
@@ -2480,10 +2410,10 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2480 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2410 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2481 mutex_lock(&callback_mutex); 2411 mutex_lock(&callback_mutex);
2482 2412
2483 task_lock(current); 2413 rcu_read_lock();
2484 cs = nearest_hardwall_ancestor(task_cs(current)); 2414 cs = nearest_hardwall_ancestor(task_cs(current));
2485 allowed = node_isset(node, cs->mems_allowed); 2415 allowed = node_isset(node, cs->mems_allowed);
2486 task_unlock(current); 2416 rcu_read_unlock();
2487 2417
2488 mutex_unlock(&callback_mutex); 2418 mutex_unlock(&callback_mutex);
2489 return allowed; 2419 return allowed;
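A second substitution repeats across the hunks above and below: every
task_lock(tsk)/task_unlock(tsk) pair that guarded a task_cs() dereference
becomes rcu_read_lock()/rcu_read_unlock(), since readers of task->cgroups
are now protected by RCU rather than by the task lock. The read-side
idiom, as a minimal sketch:

	struct cpuset *cs;

	rcu_read_lock();
	cs = task_cs(current);	/* pointer valid only inside this section */
	/* ... read cs->mems_allowed etc.; take a css ref before sleeping ... */
	rcu_read_unlock();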
@@ -2609,27 +2539,27 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2609 * @task: pointer to task_struct of some task. 2539 * @task: pointer to task_struct of some task.
2610 * 2540 *
2611 * Description: Prints @task's name, cpuset name, and cached copy of its 2541 * Description: Prints @task's name, cpuset name, and cached copy of its
2612 * mems_allowed to the kernel log. Must hold task_lock(task) to allow 2542 * mems_allowed to the kernel log.
2613 * dereferencing task_cs(task).
2614 */ 2543 */
2615void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2544void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2616{ 2545{
2617 /* Statically allocated to prevent using excess stack. */ 2546 /* Statically allocated to prevent using excess stack. */
2618 static char cpuset_nodelist[CPUSET_NODELIST_LEN]; 2547 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2619 static DEFINE_SPINLOCK(cpuset_buffer_lock); 2548 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2549 struct cgroup *cgrp;
2620 2550
2621 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2622
2623 rcu_read_lock();
2624 spin_lock(&cpuset_buffer_lock); 2551 spin_lock(&cpuset_buffer_lock);
2552 rcu_read_lock();
2625 2553
2554 cgrp = task_cs(tsk)->css.cgroup;
2626 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2627 tsk->mems_allowed); 2556 tsk->mems_allowed);
2628 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2557 printk(KERN_INFO "%s cpuset=", tsk->comm);
2629 tsk->comm, cgroup_name(cgrp), cpuset_nodelist); 2558 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2630 2560
2631 spin_unlock(&cpuset_buffer_lock);
2632 rcu_read_unlock(); 2561 rcu_read_unlock();
2562 spin_unlock(&cpuset_buffer_lock);
2633} 2563}
2634 2564
2635/* 2565/*
@@ -2660,9 +2590,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
2660 2590
2661void __cpuset_memory_pressure_bump(void) 2591void __cpuset_memory_pressure_bump(void)
2662{ 2592{
2663 task_lock(current); 2593 rcu_read_lock();
2664 fmeter_markevent(&task_cs(current)->fmeter); 2594 fmeter_markevent(&task_cs(current)->fmeter);
2665 task_unlock(current); 2595 rcu_read_unlock();
2666} 2596}
2667 2597
2668#ifdef CONFIG_PROC_PID_CPUSET 2598#ifdef CONFIG_PROC_PID_CPUSET
@@ -2679,12 +2609,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2679{ 2609{
2680 struct pid *pid; 2610 struct pid *pid;
2681 struct task_struct *tsk; 2611 struct task_struct *tsk;
2682 char *buf; 2612 char *buf, *p;
2683 struct cgroup_subsys_state *css; 2613 struct cgroup_subsys_state *css;
2684 int retval; 2614 int retval;
2685 2615
2686 retval = -ENOMEM; 2616 retval = -ENOMEM;
2687 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2617 buf = kmalloc(PATH_MAX, GFP_KERNEL);
2688 if (!buf) 2618 if (!buf)
2689 goto out; 2619 goto out;
2690 2620
@@ -2694,14 +2624,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2694 if (!tsk) 2624 if (!tsk)
2695 goto out_free; 2625 goto out_free;
2696 2626
2627 retval = -ENAMETOOLONG;
2697 rcu_read_lock(); 2628 rcu_read_lock();
2698 css = task_css(tsk, cpuset_subsys_id); 2629 css = task_css(tsk, cpuset_cgrp_id);
2699 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2630 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2700 rcu_read_unlock(); 2631 rcu_read_unlock();
2701 if (retval < 0) 2632 if (!p)
2702 goto out_put_task; 2633 goto out_put_task;
2703 seq_puts(m, buf); 2634 seq_puts(m, p);
2704 seq_putc(m, '\n'); 2635 seq_putc(m, '\n');
2636 retval = 0;
2705out_put_task: 2637out_put_task:
2706 put_task_struct(tsk); 2638 put_task_struct(tsk);
2707out_free: 2639out_free:
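The proc_cpuset_show() hunk depends on cgroup_path()'s changed contract:
it now returns a pointer to the start of the path inside @buf (the path
is apparently assembled from the tail of the buffer, so the result need
not equal @buf) or NULL when the path does not fit, instead of an int
length/error code. Callers size the buffer with PATH_MAX and print from
the returned pointer, roughly (cgrp and m stand in for the surrounding
proc context):

	char *buf, *p;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	p = cgroup_path(cgrp, buf, PATH_MAX);	/* NULL: didn't fit */
	if (p)
		seq_puts(m, p);	/* print from p, don't assume p == buf */
	kfree(buf);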
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 661951ab8ae7..f83a71a3e46d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -361,7 +361,7 @@ struct perf_cgroup {
361static inline struct perf_cgroup * 361static inline struct perf_cgroup *
362perf_cgroup_from_task(struct task_struct *task) 362perf_cgroup_from_task(struct task_struct *task)
363{ 363{
364 return container_of(task_css(task, perf_subsys_id), 364 return container_of(task_css(task, perf_event_cgrp_id),
365 struct perf_cgroup, css); 365 struct perf_cgroup, css);
366} 366}
367 367
@@ -389,11 +389,6 @@ perf_cgroup_match(struct perf_event *event)
389 event->cgrp->css.cgroup); 389 event->cgrp->css.cgroup);
390} 390}
391 391
392static inline bool perf_tryget_cgroup(struct perf_event *event)
393{
394 return css_tryget(&event->cgrp->css);
395}
396
397static inline void perf_put_cgroup(struct perf_event *event) 392static inline void perf_put_cgroup(struct perf_event *event)
398{ 393{
399 css_put(&event->cgrp->css); 394 css_put(&event->cgrp->css);
@@ -612,9 +607,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
612 if (!f.file) 607 if (!f.file)
613 return -EBADF; 608 return -EBADF;
614 609
615 rcu_read_lock(); 610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
616
617 css = css_from_dir(f.file->f_dentry, &perf_subsys);
618 if (IS_ERR(css)) { 611 if (IS_ERR(css)) {
619 ret = PTR_ERR(css); 612 ret = PTR_ERR(css);
620 goto out; 613 goto out;
@@ -623,13 +616,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
623 cgrp = container_of(css, struct perf_cgroup, css); 616 cgrp = container_of(css, struct perf_cgroup, css);
624 event->cgrp = cgrp; 617 event->cgrp = cgrp;
625 618
626 /* must be done before we fput() the file */
627 if (!perf_tryget_cgroup(event)) {
628 event->cgrp = NULL;
629 ret = -ENOENT;
630 goto out;
631 }
632
633 /* 619 /*
634 * all events in a group must monitor 620 * all events in a group must monitor
635 * the same cgroup because a task belongs 621 * the same cgroup because a task belongs
@@ -640,7 +626,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
640 ret = -EINVAL; 626 ret = -EINVAL;
641 } 627 }
642out: 628out:
643 rcu_read_unlock();
644 fdput(f); 629 fdput(f);
645 return ret; 630 return ret;
646} 631}
@@ -8053,7 +8038,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8053{ 8038{
8054 struct task_struct *task; 8039 struct task_struct *task;
8055 8040
8056 cgroup_taskset_for_each(task, css, tset) 8041 cgroup_taskset_for_each(task, tset)
8057 task_function_call(task, __perf_cgroup_move, task); 8042 task_function_call(task, __perf_cgroup_move, task);
8058} 8043}
8059 8044
@@ -8072,9 +8057,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8072 task_function_call(task, __perf_cgroup_move, task); 8057 task_function_call(task, __perf_cgroup_move, task);
8073} 8058}
8074 8059
8075struct cgroup_subsys perf_subsys = { 8060struct cgroup_subsys perf_event_cgrp_subsys = {
8076 .name = "perf_event",
8077 .subsys_id = perf_subsys_id,
8078 .css_alloc = perf_cgroup_css_alloc, 8061 .css_alloc = perf_cgroup_css_alloc,
8079 .css_free = perf_cgroup_css_free, 8062 .css_free = perf_cgroup_css_free,
8080 .exit = perf_cgroup_exit, 8063 .exit = perf_cgroup_exit,
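The perf_cgroup_connect() simplification leans on css_tryget_from_dir():
a single call resolves a cgroupfs dentry to the named subsystem's css and
takes a reference before returning, folding in the rcu_read_lock() /
css_from_dir() / css_tryget() sequence it replaces. Caller-side shape,
sketched with f as the struct fd from the surrounding code:

	struct cgroup_subsys_state *css;

	css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
	if (IS_ERR(css))
		return PTR_ERR(css);	/* error: no reference was taken */

	/* ... css is pinned; usable without holding RCU ... */
	css_put(css);			/* success path owns exactly one ref */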
diff --git a/kernel/exit.c b/kernel/exit.c
index 1e77fc645317..6480d1c85d7a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -797,7 +797,7 @@ void do_exit(long code)
797 */ 797 */
798 perf_event_exit_task(tsk); 798 perf_event_exit_task(tsk);
799 799
800 cgroup_exit(tsk, 1); 800 cgroup_exit(tsk);
801 801
802 if (group_dead) 802 if (group_dead)
803 disassociate_ctty(1); 803 disassociate_ctty(1);
diff --git a/kernel/fork.c b/kernel/fork.c
index 332688e5e7b4..abc45890f0a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1272,7 +1272,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1272 if (IS_ERR(p->mempolicy)) { 1272 if (IS_ERR(p->mempolicy)) {
1273 retval = PTR_ERR(p->mempolicy); 1273 retval = PTR_ERR(p->mempolicy);
1274 p->mempolicy = NULL; 1274 p->mempolicy = NULL;
1275 goto bad_fork_cleanup_cgroup; 1275 goto bad_fork_cleanup_threadgroup_lock;
1276 } 1276 }
1277 mpol_fix_fork_child_flag(p); 1277 mpol_fix_fork_child_flag(p);
1278#endif 1278#endif
@@ -1525,11 +1525,10 @@ bad_fork_cleanup_policy:
1525 perf_event_free_task(p); 1525 perf_event_free_task(p);
1526#ifdef CONFIG_NUMA 1526#ifdef CONFIG_NUMA
1527 mpol_put(p->mempolicy); 1527 mpol_put(p->mempolicy);
1528bad_fork_cleanup_cgroup: 1528bad_fork_cleanup_threadgroup_lock:
1529#endif 1529#endif
1530 if (clone_flags & CLONE_THREAD) 1530 if (clone_flags & CLONE_THREAD)
1531 threadgroup_change_end(current); 1531 threadgroup_change_end(current);
1532 cgroup_exit(p, 0);
1533 delayacct_tsk_free(p); 1532 delayacct_tsk_free(p);
1534 module_put(task_thread_info(p)->exec_domain->module); 1533 module_put(task_thread_info(p)->exec_domain->module);
1535bad_fork_cleanup_count: 1534bad_fork_cleanup_count:
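The two-argument cgroup_exit() existed only so the fork failure path
could skip the subsystem exit callbacks; with that call removed from
copy_process() above, every remaining caller passed 1 and the flag can
go:

	/* before */ void cgroup_exit(struct task_struct *tsk, int run_callbacks);
	/* after  */ void cgroup_exit(struct task_struct *tsk);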
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9cae286824bb..1d1b87b36778 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7230,7 +7230,7 @@ void sched_move_task(struct task_struct *tsk)
7230 if (unlikely(running)) 7230 if (unlikely(running))
7231 tsk->sched_class->put_prev_task(rq, tsk); 7231 tsk->sched_class->put_prev_task(rq, tsk);
7232 7232
7233 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, 7233 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7234 lockdep_is_held(&tsk->sighand->siglock)), 7234 lockdep_is_held(&tsk->sighand->siglock)),
7235 struct task_group, css); 7235 struct task_group, css);
7236 tg = autogroup_task_group(tsk, tg); 7236 tg = autogroup_task_group(tsk, tg);
@@ -7657,7 +7657,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7657{ 7657{
7658 struct task_struct *task; 7658 struct task_struct *task;
7659 7659
7660 cgroup_taskset_for_each(task, css, tset) { 7660 cgroup_taskset_for_each(task, tset) {
7661#ifdef CONFIG_RT_GROUP_SCHED 7661#ifdef CONFIG_RT_GROUP_SCHED
7662 if (!sched_rt_can_attach(css_tg(css), task)) 7662 if (!sched_rt_can_attach(css_tg(css), task))
7663 return -EINVAL; 7663 return -EINVAL;
@@ -7675,7 +7675,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7675{ 7675{
7676 struct task_struct *task; 7676 struct task_struct *task;
7677 7677
7678 cgroup_taskset_for_each(task, css, tset) 7678 cgroup_taskset_for_each(task, tset)
7679 sched_move_task(task); 7679 sched_move_task(task);
7680} 7680}
7681 7681
@@ -8014,8 +8014,7 @@ static struct cftype cpu_files[] = {
8014 { } /* terminate */ 8014 { } /* terminate */
8015}; 8015};
8016 8016
8017struct cgroup_subsys cpu_cgroup_subsys = { 8017struct cgroup_subsys cpu_cgrp_subsys = {
8018 .name = "cpu",
8019 .css_alloc = cpu_cgroup_css_alloc, 8018 .css_alloc = cpu_cgroup_css_alloc,
8020 .css_free = cpu_cgroup_css_free, 8019 .css_free = cpu_cgroup_css_free,
8021 .css_online = cpu_cgroup_css_online, 8020 .css_online = cpu_cgroup_css_online,
@@ -8023,7 +8022,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8023 .can_attach = cpu_cgroup_can_attach, 8022 .can_attach = cpu_cgroup_can_attach,
8024 .attach = cpu_cgroup_attach, 8023 .attach = cpu_cgroup_attach,
8025 .exit = cpu_cgroup_exit, 8024 .exit = cpu_cgroup_exit,
8026 .subsys_id = cpu_cgroup_subsys_id,
8027 .base_cftypes = cpu_files, 8025 .base_cftypes = cpu_files,
8028 .early_init = 1, 8026 .early_init = 1,
8029}; 8027};
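sched_move_task() keeps the stronger annotation: task_css_check() lets
the caller name an extra lockdep condition under which task->cgroups is
known stable -- here the task's siglock -- where plain task_css() would
expect rcu_read_lock(). The two forms side by side, as a sketch:

	struct task_group *tg;

	/* a non-RCU guarantee, spelled out for lockdep */
	tg = container_of(task_css_check(tsk, cpu_cgrp_id,
					 lockdep_is_held(&tsk->sighand->siglock)),
			  struct task_group, css);

	/* the common RCU-protected form */
	rcu_read_lock();
	tg = container_of(task_css(tsk, cpu_cgrp_id), struct task_group, css);
	rcu_read_unlock();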
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
41/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
42static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
43{ 43{
44 return css_ca(task_css(tsk, cpuacct_subsys_id)); 44 return css_ca(task_css(tsk, cpuacct_cgrp_id));
45} 45}
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275 rcu_read_unlock(); 275 rcu_read_unlock();
276} 276}
277 277
278struct cgroup_subsys cpuacct_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .name = "cpuacct",
280 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
281 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
282 .subsys_id = cpuacct_subsys_id,
283 .base_cftypes = files, 281 .base_cftypes = files,
284 .early_init = 1, 282 .early_init = 1,
285}; 283};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f3344c31632a..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
111 if (autogroup_path(tg, group_path, PATH_MAX)) 111 if (autogroup_path(tg, group_path, PATH_MAX))
112 return group_path; 112 return group_path;
113 113
114 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 114 return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
115 return group_path;
116} 115}
117#endif 116#endif
118 117
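The task_group_path() one-liner is the same cgroup_path() contract change
again: the helper now forwards cgroup_path()'s returned pointer instead
of handing back group_path and assuming the path starts at its first
byte. Callers simply consume whatever pointer comes back, e.g. (sketch):

	SEQ_printf(m, " %s", task_group_path(task_group(p)));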