author     Linus Torvalds <torvalds@linux-foundation.org>  2014-04-03 16:05:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-04-03 16:05:42 -0400
commit     32d01dc7be4e725ab85ce1d74e8f4adc02ad68dd (patch)
tree       213fe7d76b315413fe551332423fb2f6dfae59b9 /kernel/cgroup.c
parent     68114e5eb862ad0a7a261b91497281b026102715 (diff)
parent     1ec41830e087cda1f62dda4182c2b62811eb0ffc (diff)
Merge branch 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
 "A lot of updates for cgroup:

  - The biggest one is cgroup's conversion to kernfs. cgroup took after
    the long-abandoned, vfs-entangled sysfs implementation and made it
    even more convoluted over time. cgroup's internal objects were fused
    with vfs objects, which also brought in vfs locking and object
    lifetime rules. Naturally, there are places where the vfs rules don't
    fit, and nasty hacks (such as credential switching, or a lock dance
    interleaving inode mutex and cgroup_mutex with an object serial
    number comparison thrown in to decide whether the operation is
    actually necessary) had to be employed. After the conversion to
    kernfs, internal object lifetime and locking rules are mostly
    isolated from vfs interactions, allowing several nasty hacks to be
    shed and an overall simplification. This will also allow
    implementation of operations which may affect multiple cgroups,
    which wasn't possible before as it would have required nesting
    i_mutexes.

  - Various simplifications, including dropping of module support,
    easier cgroup name/path handling, simplified cgroup file type
    handling and task_cg_lists optimization.

  - Preparatory changes for the planned unified hierarchy, which is
    still a patchset away from being actually operational. The dummy
    hierarchy is updated to serve as the default unified hierarchy.
    Controllers which aren't claimed by other hierarchies are associated
    with it, which BTW was what the dummy hierarchy was for anyway.

  - Various fixes from Li and others.

  This pull request also includes some patches to add the missing slab.h
  include to various subsystems. This was triggered by the removal of
  the xattr.h include from cgroup.h: cgroup.h is indirectly included by
  a lot of files, which brought in xattr.h, which brought in slab.h.

  There are several merge commits - one to pull in the kernfs updates
  necessary for converting cgroup (already upstream through
  driver-core), the others for interfering changes in the fixes branch"

* 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (74 commits)
  cgroup: remove useless argument from cgroup_exit()
  cgroup: fix spurious lockdep warning in cgroup_exit()
  cgroup: Use RCU_INIT_POINTER(x, NULL) in cgroup.c
  cgroup: break kernfs active_ref protection in cgroup directory operations
  cgroup: fix cgroup_taskset walking order
  cgroup: implement CFTYPE_ONLY_ON_DFL
  cgroup: make cgrp_dfl_root mountable
  cgroup: drop const from @buffer of cftype->write_string()
  cgroup: rename cgroup_dummy_root and related names
  cgroup: move ->subsys_mask from cgroupfs_root to cgroup
  cgroup: treat cgroup_dummy_root as an equivalent hierarchy during rebinding
  cgroup: remove NULL checks from [pr_cont_]cgroup_{name|path}()
  cgroup: use cgroup_setup_root() to initialize cgroup_dummy_root
  cgroup: reorganize cgroup bootstrapping
  cgroup: relocate setting of CGRP_DEAD
  cpuset: use rcu_read_lock() to protect task_cs()
  cgroup_freezer: document freezer_fork() subtleties
  cgroup: update cgroup_transfer_tasks() to either succeed or fail
  cgroup: drop task_lock() protection around task->cgroups
  cgroup: update how a newly forked task gets associated with css_set
  ...
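The locking rework described above introduces cgroup_tree_mutex nesting above cgroup_mutex, replaces the css_set_lock rwlock with css_set_rwsem for task->cgroups and the css_set lists, and moves release_agent_path under its own spinlock. The userspace sketch below only illustrates that nesting discipline; the identifiers mirror the kernel's names, but the primitives are POSIX pthreads, not the kernel implementation, and nothing here is taken from the patch itself.

/*
 * Illustration only: models the lock ordering described in the pull
 * request (cgroup_tree_mutex -> cgroup_mutex -> css_set_rwsem) with
 * POSIX primitives.  Build with: cc -pthread lockorder.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cgroup_tree_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cgroup_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t css_set_rwsem = PTHREAD_RWLOCK_INITIALIZER;

/*
 * A hierarchy-changing operation takes the outer tree lock first, then
 * the master lock, and only then writes css_set state.
 */
static void hierarchy_change(void)
{
	pthread_mutex_lock(&cgroup_tree_mutex);
	pthread_mutex_lock(&cgroup_mutex);

	pthread_rwlock_wrlock(&css_set_rwsem);
	/* ... modify css_set <-> cgroup links here ... */
	pthread_rwlock_unlock(&css_set_rwsem);

	pthread_mutex_unlock(&cgroup_mutex);
	pthread_mutex_unlock(&cgroup_tree_mutex);
}

/*
 * A reader that only walks task->cgroups-style state takes just the
 * read side of css_set_rwsem.
 */
static void read_task_cgroups(void)
{
	pthread_rwlock_rdlock(&css_set_rwsem);
	/* ... walk the css_set chain here ... */
	pthread_rwlock_unlock(&css_set_rwsem);
}

int main(void)
{
	hierarchy_change();
	read_task_cgroups();
	puts("locks exercised in the documented order");
	return 0;
}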
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c | 3711
1 file changed, 1536 insertions(+), 2175 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0c753ddd223b..fede3d3f28ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,23 +40,20 @@
40#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h>
44#include <linux/slab.h> 43#include <linux/slab.h>
45#include <linux/magic.h>
46#include <linux/spinlock.h> 44#include <linux/spinlock.h>
45#include <linux/rwsem.h>
47#include <linux/string.h> 46#include <linux/string.h>
48#include <linux/sort.h> 47#include <linux/sort.h>
49#include <linux/kmod.h> 48#include <linux/kmod.h>
50#include <linux/module.h>
51#include <linux/delayacct.h> 49#include <linux/delayacct.h>
52#include <linux/cgroupstats.h> 50#include <linux/cgroupstats.h>
53#include <linux/hashtable.h> 51#include <linux/hashtable.h>
54#include <linux/namei.h>
55#include <linux/pid_namespace.h> 52#include <linux/pid_namespace.h>
56#include <linux/idr.h> 53#include <linux/idr.h>
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 54#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/flex_array.h> /* used in cgroup_attach_task */
59#include <linux/kthread.h> 55#include <linux/kthread.h>
56#include <linux/delay.h>
60 57
61#include <linux/atomic.h> 58#include <linux/atomic.h>
62 59
@@ -68,43 +65,49 @@
68 */ 65 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ 66#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70 67
68#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
69 MAX_CFTYPE_NAME + 2)
70
71/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
71/* 80/*
72 * cgroup_mutex is the master lock. Any modification to cgroup or its 81 * cgroup_mutex is the master lock. Any modification to cgroup or its
73 * hierarchy must be performed while holding it. 82 * hierarchy must be performed while holding it.
74 * 83 *
75 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify 84 * css_set_rwsem protects task->cgroups pointer, the list of css_set
76 * cgroupfs_root of any cgroup hierarchy - subsys list, flags, 85 * objects, and the chain of tasks off each css_set.
77 * release_agent_path and so on. Modifying requires both cgroup_mutex and
78 * cgroup_root_mutex. Readers can acquire either of the two. This is to
79 * break the following locking order cycle.
80 *
81 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
82 * B. namespace_sem -> cgroup_mutex
83 * 86 *
84 * B happens only through cgroup_show_options() and using cgroup_root_mutex 87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
85 * breaks it. 88 * cgroup.h can use them for lockdep annotations.
86 */ 89 */
87#ifdef CONFIG_PROVE_RCU 90#ifdef CONFIG_PROVE_RCU
88DEFINE_MUTEX(cgroup_mutex); 91DEFINE_MUTEX(cgroup_mutex);
89EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ 92DECLARE_RWSEM(css_set_rwsem);
93EXPORT_SYMBOL_GPL(cgroup_mutex);
94EXPORT_SYMBOL_GPL(css_set_rwsem);
90#else 95#else
91static DEFINE_MUTEX(cgroup_mutex); 96static DEFINE_MUTEX(cgroup_mutex);
97static DECLARE_RWSEM(css_set_rwsem);
92#endif 98#endif
93 99
94static DEFINE_MUTEX(cgroup_root_mutex); 100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */
104static DEFINE_SPINLOCK(release_agent_path_lock);
95 105
96#define cgroup_assert_mutex_or_rcu_locked() \ 106#define cgroup_assert_mutexes_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
98 lockdep_is_held(&cgroup_mutex), \ 109 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required"); 110 "cgroup_[tree_]mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108 111
109/* 112/*
110 * cgroup destruction makes heavy use of work items and there can be a lot 113 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -120,42 +123,41 @@ static struct workqueue_struct *cgroup_destroy_wq;
120 */ 123 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq; 124static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122 125
123/* 126/* generate an array of cgroup subsystem pointers */
124 * Generate an array of cgroup subsystem pointers. At boot time, this is 127#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
125 * populated with the built in subsystems, and modular subsystems are 128static struct cgroup_subsys *cgroup_subsys[] = {
126 * registered after that. The mutable section of this array is protected by 129#include <linux/cgroup_subsys.h>
127 * cgroup_mutex. 130};
128 */ 131#undef SUBSYS
129#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 132
130#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 133/* array of cgroup subsystem names */
131static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { 134#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
135static const char *cgroup_subsys_name[] = {
132#include <linux/cgroup_subsys.h> 136#include <linux/cgroup_subsys.h>
133}; 137};
138#undef SUBSYS
134 139
135/* 140/*
136 * The dummy hierarchy, reserved for the subsystems that are otherwise 141 * The default hierarchy, reserved for the subsystems that are otherwise
137 * unattached - it never has more than a single cgroup, and all tasks are 142 * unattached - it never has more than a single cgroup, and all tasks are
138 * part of that cgroup. 143 * part of that cgroup.
139 */ 144 */
140static struct cgroupfs_root cgroup_dummy_root; 145struct cgroup_root cgrp_dfl_root;
141 146
142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 147/*
143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 148 * The default hierarchy always exists but is hidden until mounted for the
149 * first time. This is for backward compatibility.
150 */
151static bool cgrp_dfl_root_visible;
144 152
145/* The list of hierarchy roots */ 153/* The list of hierarchy roots */
146 154
147static LIST_HEAD(cgroup_roots); 155static LIST_HEAD(cgroup_roots);
148static int cgroup_root_count; 156static int cgroup_root_count;
149 157
150/* 158/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
151 * Hierarchy ID allocation and mapping. It follows the same exclusion
152 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
153 * writes, either for reads.
154 */
155static DEFINE_IDR(cgroup_hierarchy_idr); 159static DEFINE_IDR(cgroup_hierarchy_idr);
156 160
157static struct cgroup_name root_cgroup_name = { .name = "/" };
158
159/* 161/*
160 * Assign a monotonically increasing serial number to cgroups. It 162 * Assign a monotonically increasing serial number to cgroups. It
161 * guarantees cgroups with bigger numbers are newer than those with smaller 163 * guarantees cgroups with bigger numbers are newer than those with smaller
@@ -175,11 +177,13 @@ static int need_forkexit_callback __read_mostly;
175 177
176static struct cftype cgroup_base_files[]; 178static struct cftype cgroup_base_files[];
177 179
180static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask);
178static void cgroup_destroy_css_killed(struct cgroup *cgrp); 183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
179static int cgroup_destroy_locked(struct cgroup *cgrp); 184static int cgroup_destroy_locked(struct cgroup *cgrp);
180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
181 bool is_add); 186 bool is_add);
182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
184 188
185/** 189/**
@@ -197,8 +201,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
197 struct cgroup_subsys *ss) 201 struct cgroup_subsys *ss)
198{ 202{
199 if (ss) 203 if (ss)
200 return rcu_dereference_check(cgrp->subsys[ss->subsys_id], 204 return rcu_dereference_check(cgrp->subsys[ss->id],
201 lockdep_is_held(&cgroup_mutex)); 205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex));
202 else 207 else
203 return &cgrp->dummy_css; 208 return &cgrp->dummy_css;
204} 209}
@@ -209,6 +214,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
209 return test_bit(CGRP_DEAD, &cgrp->flags); 214 return test_bit(CGRP_DEAD, &cgrp->flags);
210} 215}
211 216
217struct cgroup_subsys_state *seq_css(struct seq_file *seq)
218{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq);
222
223 /*
224 * This is open and unprotected implementation of cgroup_css().
225 * seq_css() is only called from a kernfs file operation which has
226 * an active reference on the file. Because all the subsystem
227 * files are drained before a css is disassociated with a cgroup,
228 * the matching css from the cgroup's subsys table is guaranteed to
229 * be and stay valid until the enclosing operation is complete.
230 */
231 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else
234 return &cgrp->dummy_css;
235}
236EXPORT_SYMBOL_GPL(seq_css);
237
212/** 238/**
213 * cgroup_is_descendant - test ancestry 239 * cgroup_is_descendant - test ancestry
214 * @cgrp: the cgroup to be tested 240 * @cgrp: the cgroup to be tested
@@ -227,7 +253,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
227 } 253 }
228 return false; 254 return false;
229} 255}
230EXPORT_SYMBOL_GPL(cgroup_is_descendant);
231 256
232static int cgroup_is_releasable(const struct cgroup *cgrp) 257static int cgroup_is_releasable(const struct cgroup *cgrp)
233{ 258{
@@ -254,54 +279,23 @@ static int notify_on_release(const struct cgroup *cgrp)
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \ 280 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \ 281 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
257 lockdep_is_held(&cgroup_mutex)))) { } \ 283 lockdep_is_held(&cgroup_mutex)))) { } \
258 else 284 else
259 285
260/** 286/**
261 * for_each_subsys - iterate all loaded cgroup subsystems 287 * for_each_subsys - iterate all enabled cgroup subsystems
262 * @ss: the iteration cursor 288 * @ss: the iteration cursor
263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
264 *
265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
267 */ 290 */
268#define for_each_subsys(ss, ssid) \ 291#define for_each_subsys(ss, ssid) \
269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ 292 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 293 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
272 else
273 294
274/** 295/* iterate across the hierarchies */
275 * for_each_builtin_subsys - iterate all built-in cgroup subsystems 296#define for_each_root(root) \
276 * @ss: the iteration cursor
277 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
278 *
279 * Bulit-in subsystems are always present and iteration itself doesn't
280 * require any synchronization.
281 */
282#define for_each_builtin_subsys(ss, i) \
283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
284 (((ss) = cgroup_subsys[i]) || true); (i)++)
285
286/* iterate across the active hierarchies */
287#define for_each_active_root(root) \
288 list_for_each_entry((root), &cgroup_roots, root_list) 297 list_for_each_entry((root), &cgroup_roots, root_list)
289 298
290static inline struct cgroup *__d_cgrp(struct dentry *dentry)
291{
292 return dentry->d_fsdata;
293}
294
295static inline struct cfent *__d_cfe(struct dentry *dentry)
296{
297 return dentry->d_fsdata;
298}
299
300static inline struct cftype *__d_cft(struct dentry *dentry)
301{
302 return __d_cfe(dentry)->type;
303}
304
305/** 299/**
306 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
307 * @cgrp: the cgroup to be checked for liveness 301 * @cgrp: the cgroup to be checked for liveness
@@ -347,23 +341,23 @@ struct cgrp_cset_link {
347 struct list_head cgrp_link; 341 struct list_head cgrp_link;
348}; 342};
349 343
350/* The default css_set - used by init and its children prior to any 344/*
345 * The default css_set - used by init and its children prior to any
351 * hierarchies being mounted. It contains a pointer to the root state 346 * hierarchies being mounted. It contains a pointer to the root state
352 * for each subsystem. Also used to anchor the list of css_sets. Not 347 * for each subsystem. Also used to anchor the list of css_sets. Not
353 * reference-counted, to improve performance when child cgroups 348 * reference-counted, to improve performance when child cgroups
354 * haven't been created. 349 * haven't been created.
355 */ 350 */
351static struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
355 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
356 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
357 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
358};
356 359
357static struct css_set init_css_set; 360static int css_set_count = 1; /* 1 for init_css_set */
358static struct cgrp_cset_link init_cgrp_cset_link;
359
360/*
361 * css_set_lock protects the list of css_set objects, and the chain of
362 * tasks off each css_set. Nests outside task->alloc_lock due to
363 * css_task_iter_start().
364 */
365static DEFINE_RWLOCK(css_set_lock);
366static int css_set_count;
367 361
368/* 362/*
369 * hash table for cgroup groups. This improves the performance to find 363 * hash table for cgroup groups. This improves the performance to find
@@ -386,30 +380,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
386 return key; 380 return key;
387} 381}
388 382
389/* 383static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 * We don't maintain the lists running through each css_set to its task
391 * until after the first call to css_task_iter_start(). This reduces the
392 * fork()/exit() overhead for people who have cgroups compiled into their
393 * kernel but not actually in use.
394 */
395static int use_task_css_set_links __read_mostly;
396
397static void __put_css_set(struct css_set *cset, int taskexit)
398{ 384{
399 struct cgrp_cset_link *link, *tmp_link; 385 struct cgrp_cset_link *link, *tmp_link;
400 386
401 /* 387 lockdep_assert_held(&css_set_rwsem);
402 * Ensure that the refcount doesn't hit zero while any readers 388
403 * can see it. Similar to atomic_dec_and_lock(), but for an 389 if (!atomic_dec_and_test(&cset->refcount))
404 * rwlock
405 */
406 if (atomic_add_unless(&cset->refcount, -1, 1))
407 return;
408 write_lock(&css_set_lock);
409 if (!atomic_dec_and_test(&cset->refcount)) {
410 write_unlock(&css_set_lock);
411 return; 390 return;
412 }
413 391
414 /* This css_set is dead. unlink it and release cgroup refcounts */ 392 /* This css_set is dead. unlink it and release cgroup refcounts */
415 hash_del(&cset->hlist); 393 hash_del(&cset->hlist);
@@ -421,7 +399,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
421 list_del(&link->cset_link); 399 list_del(&link->cset_link);
422 list_del(&link->cgrp_link); 400 list_del(&link->cgrp_link);
423 401
424 /* @cgrp can't go away while we're holding css_set_lock */ 402 /* @cgrp can't go away while we're holding css_set_rwsem */
425 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
426 if (taskexit) 404 if (taskexit)
427 set_bit(CGRP_RELEASABLE, &cgrp->flags); 405 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -431,10 +409,24 @@ static void __put_css_set(struct css_set *cset, int taskexit)
431 kfree(link); 409 kfree(link);
432 } 410 }
433 411
434 write_unlock(&css_set_lock);
435 kfree_rcu(cset, rcu_head); 412 kfree_rcu(cset, rcu_head);
436} 413}
437 414
415static void put_css_set(struct css_set *cset, bool taskexit)
416{
417 /*
418 * Ensure that the refcount doesn't hit zero while any readers
419 * can see it. Similar to atomic_dec_and_lock(), but for an
420 * rwlock
421 */
422 if (atomic_add_unless(&cset->refcount, -1, 1))
423 return;
424
425 down_write(&css_set_rwsem);
426 put_css_set_locked(cset, taskexit);
427 up_write(&css_set_rwsem);
428}
429
438/* 430/*
439 * refcounted get/put for css_set objects 431 * refcounted get/put for css_set objects
440 */ 432 */
@@ -443,16 +435,6 @@ static inline void get_css_set(struct css_set *cset)
443 atomic_inc(&cset->refcount); 435 atomic_inc(&cset->refcount);
444} 436}
445 437
446static inline void put_css_set(struct css_set *cset)
447{
448 __put_css_set(cset, 0);
449}
450
451static inline void put_css_set_taskexit(struct css_set *cset)
452{
453 __put_css_set(cset, 1);
454}
455
456/** 438/**
457 * compare_css_sets - helper function for find_existing_css_set(). 439 * compare_css_sets - helper function for find_existing_css_set().
458 * @cset: candidate css_set being tested 440 * @cset: candidate css_set being tested
@@ -535,7 +517,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
535 struct cgroup *cgrp, 517 struct cgroup *cgrp,
536 struct cgroup_subsys_state *template[]) 518 struct cgroup_subsys_state *template[])
537{ 519{
538 struct cgroupfs_root *root = cgrp->root; 520 struct cgroup_root *root = cgrp->root;
539 struct cgroup_subsys *ss; 521 struct cgroup_subsys *ss;
540 struct css_set *cset; 522 struct css_set *cset;
541 unsigned long key; 523 unsigned long key;
@@ -547,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
547 * won't change, so no need for locking. 529 * won't change, so no need for locking.
548 */ 530 */
549 for_each_subsys(ss, i) { 531 for_each_subsys(ss, i) {
550 if (root->subsys_mask & (1UL << i)) { 532 if (root->cgrp.subsys_mask & (1UL << i)) {
551 /* Subsystem is in this hierarchy. So we want 533 /* Subsystem is in this hierarchy. So we want
552 * the subsystem state from the new 534 * the subsystem state from the new
553 * cgroup */ 535 * cgroup */
@@ -652,11 +634,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
652 634
653 /* First see if we already have a cgroup group that matches 635 /* First see if we already have a cgroup group that matches
654 * the desired set */ 636 * the desired set */
655 read_lock(&css_set_lock); 637 down_read(&css_set_rwsem);
656 cset = find_existing_css_set(old_cset, cgrp, template); 638 cset = find_existing_css_set(old_cset, cgrp, template);
657 if (cset) 639 if (cset)
658 get_css_set(cset); 640 get_css_set(cset);
659 read_unlock(&css_set_lock); 641 up_read(&css_set_rwsem);
660 642
661 if (cset) 643 if (cset)
662 return cset; 644 return cset;
@@ -674,13 +656,16 @@ static struct css_set *find_css_set(struct css_set *old_cset,
674 atomic_set(&cset->refcount, 1); 656 atomic_set(&cset->refcount, 1);
675 INIT_LIST_HEAD(&cset->cgrp_links); 657 INIT_LIST_HEAD(&cset->cgrp_links);
676 INIT_LIST_HEAD(&cset->tasks); 658 INIT_LIST_HEAD(&cset->tasks);
659 INIT_LIST_HEAD(&cset->mg_tasks);
660 INIT_LIST_HEAD(&cset->mg_preload_node);
661 INIT_LIST_HEAD(&cset->mg_node);
677 INIT_HLIST_NODE(&cset->hlist); 662 INIT_HLIST_NODE(&cset->hlist);
678 663
679 /* Copy the set of subsystem state objects generated in 664 /* Copy the set of subsystem state objects generated in
680 * find_existing_css_set() */ 665 * find_existing_css_set() */
681 memcpy(cset->subsys, template, sizeof(cset->subsys)); 666 memcpy(cset->subsys, template, sizeof(cset->subsys));
682 667
683 write_lock(&css_set_lock); 668 down_write(&css_set_rwsem);
684 /* Add reference counts and links from the new css_set. */ 669 /* Add reference counts and links from the new css_set. */
685 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 670 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
686 struct cgroup *c = link->cgrp; 671 struct cgroup *c = link->cgrp;
@@ -698,31 +683,105 @@ static struct css_set *find_css_set(struct css_set *old_cset,
698 key = css_set_hash(cset->subsys); 683 key = css_set_hash(cset->subsys);
699 hash_add(css_set_table, &cset->hlist, key); 684 hash_add(css_set_table, &cset->hlist, key);
700 685
701 write_unlock(&css_set_lock); 686 up_write(&css_set_rwsem);
702 687
703 return cset; 688 return cset;
704} 689}
705 690
706/* 691static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
707 * Return the cgroup for "task" from the given hierarchy. Must be
708 * called with cgroup_mutex held.
709 */
710static struct cgroup *task_cgroup_from_root(struct task_struct *task,
711 struct cgroupfs_root *root)
712{ 692{
713 struct css_set *cset; 693 struct cgroup *root_cgrp = kf_root->kn->priv;
714 struct cgroup *res = NULL; 694
695 return root_cgrp->root;
696}
697
698static int cgroup_init_root_id(struct cgroup_root *root)
699{
700 int id;
701
702 lockdep_assert_held(&cgroup_mutex);
703
704 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
705 if (id < 0)
706 return id;
707
708 root->hierarchy_id = id;
709 return 0;
710}
711
712static void cgroup_exit_root_id(struct cgroup_root *root)
713{
714 lockdep_assert_held(&cgroup_mutex);
715
716 if (root->hierarchy_id) {
717 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
718 root->hierarchy_id = 0;
719 }
720}
721
722static void cgroup_free_root(struct cgroup_root *root)
723{
724 if (root) {
725 /* hierarhcy ID shoulid already have been released */
726 WARN_ON_ONCE(root->hierarchy_id);
727
728 idr_destroy(&root->cgroup_idr);
729 kfree(root);
730 }
731}
732
733static void cgroup_destroy_root(struct cgroup_root *root)
734{
735 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link;
737
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex);
740
741 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children));
743
744 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
715 746
716 BUG_ON(!mutex_is_locked(&cgroup_mutex));
717 read_lock(&css_set_lock);
718 /* 747 /*
719 * No need to lock the task - since we hold cgroup_mutex the 748 * Release all the links from cset_links to this hierarchy's
720 * task can't change groups, so the only thing that can happen 749 * root cgroup
721 * is that it exits and its css is set back to init_css_set.
722 */ 750 */
723 cset = task_css_set(task); 751 down_write(&css_set_rwsem);
752
753 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
754 list_del(&link->cset_link);
755 list_del(&link->cgrp_link);
756 kfree(link);
757 }
758 up_write(&css_set_rwsem);
759
760 if (!list_empty(&root->root_list)) {
761 list_del(&root->root_list);
762 cgroup_root_count--;
763 }
764
765 cgroup_exit_root_id(root);
766
767 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769
770 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root);
772}
773
774/* look up cgroup associated with given css_set on the specified hierarchy */
775static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
776 struct cgroup_root *root)
777{
778 struct cgroup *res = NULL;
779
780 lockdep_assert_held(&cgroup_mutex);
781 lockdep_assert_held(&css_set_rwsem);
782
724 if (cset == &init_css_set) { 783 if (cset == &init_css_set) {
725 res = &root->top_cgroup; 784 res = &root->cgrp;
726 } else { 785 } else {
727 struct cgrp_cset_link *link; 786 struct cgrp_cset_link *link;
728 787
@@ -735,16 +794,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
735 } 794 }
736 } 795 }
737 } 796 }
738 read_unlock(&css_set_lock); 797
739 BUG_ON(!res); 798 BUG_ON(!res);
740 return res; 799 return res;
741} 800}
742 801
743/* 802/*
744 * There is one global cgroup mutex. We also require taking 803 * Return the cgroup for "task" from the given hierarchy. Must be
745 * task_lock() when dereferencing a task's cgroup subsys pointers. 804 * called with cgroup_mutex and css_set_rwsem held.
746 * See "The task_lock() exception", at the end of this comment. 805 */
747 * 806static struct cgroup *task_cgroup_from_root(struct task_struct *task,
807 struct cgroup_root *root)
808{
809 /*
810 * No need to lock the task - since we hold cgroup_mutex the
811 * task can't change groups, so the only thing that can happen
812 * is that it exits and its css is set back to init_css_set.
813 */
814 return cset_cgroup_from_root(task_css_set(task), root);
815}
816
817/*
748 * A task must hold cgroup_mutex to modify cgroups. 818 * A task must hold cgroup_mutex to modify cgroups.
749 * 819 *
750 * Any task can increment and decrement the count field without lock. 820 * Any task can increment and decrement the count field without lock.
@@ -770,98 +840,79 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
770 * A cgroup can only be deleted if both its 'count' of using tasks 840 * A cgroup can only be deleted if both its 'count' of using tasks
771 * is zero, and its list of 'children' cgroups is empty. Since all 841 * is zero, and its list of 'children' cgroups is empty. Since all
772 * tasks in the system use _some_ cgroup, and since there is always at 842 * tasks in the system use _some_ cgroup, and since there is always at
773 * least one task in the system (init, pid == 1), therefore, top_cgroup 843 * least one task in the system (init, pid == 1), therefore, root cgroup
774 * always has either children cgroups and/or using tasks. So we don't 844 * always has either children cgroups and/or using tasks. So we don't
775 * need a special hack to ensure that top_cgroup cannot be deleted. 845 * need a special hack to ensure that root cgroup cannot be deleted.
776 *
777 * The task_lock() exception
778 *
779 * The need for this exception arises from the action of
780 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
781 * another. It does so using cgroup_mutex, however there are
782 * several performance critical places that need to reference
783 * task->cgroup without the expense of grabbing a system global
784 * mutex. Therefore except as noted below, when dereferencing or, as
785 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
786 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
787 * the task_struct routinely used for such matters.
788 * 846 *
789 * P.S. One more locking exception. RCU is used to guard the 847 * P.S. One more locking exception. RCU is used to guard the
790 * update of a tasks cgroup pointer by cgroup_attach_task() 848 * update of a tasks cgroup pointer by cgroup_attach_task()
791 */ 849 */
792 850
793/*
794 * A couple of forward declarations required, due to cyclic reference loop:
795 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
796 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
797 * -> cgroup_mkdir.
798 */
799
800static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
801static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
802static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
803static const struct inode_operations cgroup_dir_inode_operations; 852static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
804static const struct file_operations proc_cgroupstats_operations; 853static const struct file_operations proc_cgroupstats_operations;
805 854
806static struct backing_dev_info cgroup_backing_dev_info = { 855static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
807 .name = "cgroup", 856 char *buf)
808 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
809};
810
811static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
812{ 857{
813 struct inode *inode = new_inode(sb); 858 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
814 859 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
815 if (inode) { 860 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
816 inode->i_ino = get_next_ino(); 861 cft->ss->name, cft->name);
817 inode->i_mode = mode; 862 else
818 inode->i_uid = current_fsuid(); 863 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
819 inode->i_gid = current_fsgid(); 864 return buf;
820 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
821 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
822 }
823 return inode;
824} 865}
825 866
826static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) 867/**
868 * cgroup_file_mode - deduce file mode of a control file
869 * @cft: the control file in question
870 *
871 * returns cft->mode if ->mode is not 0
872 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
873 * returns S_IRUGO if it has only a read handler
874 * returns S_IWUSR if it has only a write hander
875 */
876static umode_t cgroup_file_mode(const struct cftype *cft)
827{ 877{
828 struct cgroup_name *name; 878 umode_t mode = 0;
829 879
830 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); 880 if (cft->mode)
831 if (!name) 881 return cft->mode;
832 return NULL; 882
833 strcpy(name->name, dentry->d_name.name); 883 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
834 return name; 884 mode |= S_IRUGO;
885
886 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
887 cft->trigger)
888 mode |= S_IWUSR;
889
890 return mode;
835} 891}
836 892
837static void cgroup_free_fn(struct work_struct *work) 893static void cgroup_free_fn(struct work_struct *work)
838{ 894{
839 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
840 896
841 mutex_lock(&cgroup_mutex); 897 atomic_dec(&cgrp->root->nr_cgrps);
842 cgrp->root->number_of_cgroups--;
843 mutex_unlock(&cgroup_mutex);
844
845 /*
846 * We get a ref to the parent's dentry, and put the ref when
847 * this cgroup is being freed, so it's guaranteed that the
848 * parent won't be destroyed before its children.
849 */
850 dput(cgrp->parent->dentry);
851
852 /*
853 * Drop the active superblock reference that we took when we
854 * created the cgroup. This will free cgrp->root, if we are
855 * holding the last reference to @sb.
856 */
857 deactivate_super(cgrp->root->sb);
858
859 cgroup_pidlist_destroy_all(cgrp); 898 cgroup_pidlist_destroy_all(cgrp);
860 899
861 simple_xattrs_free(&cgrp->xattrs); 900 if (cgrp->parent) {
862 901 /*
863 kfree(rcu_dereference_raw(cgrp->name)); 902 * We get a ref to the parent, and put the ref when this
864 kfree(cgrp); 903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
865} 916}
866 917
867static void cgroup_free_rcu(struct rcu_head *head) 918static void cgroup_free_rcu(struct rcu_head *head)
@@ -872,73 +923,40 @@ static void cgroup_free_rcu(struct rcu_head *head)
872 queue_work(cgroup_destroy_wq, &cgrp->destroy_work); 923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
873} 924}
874 925
875static void cgroup_diput(struct dentry *dentry, struct inode *inode) 926static void cgroup_get(struct cgroup *cgrp)
876{
877 /* is dentry a directory ? if so, kfree() associated cgroup */
878 if (S_ISDIR(inode->i_mode)) {
879 struct cgroup *cgrp = dentry->d_fsdata;
880
881 BUG_ON(!(cgroup_is_dead(cgrp)));
882
883 /*
884 * XXX: cgrp->id is only used to look up css's. As cgroup
885 * and css's lifetimes will be decoupled, it should be made
886 * per-subsystem and moved to css->id so that lookups are
887 * successful until the target css is released.
888 */
889 mutex_lock(&cgroup_mutex);
890 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
891 mutex_unlock(&cgroup_mutex);
892 cgrp->id = -1;
893
894 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
895 } else {
896 struct cfent *cfe = __d_cfe(dentry);
897 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
898
899 WARN_ONCE(!list_empty(&cfe->node) &&
900 cgrp != &cgrp->root->top_cgroup,
901 "cfe still linked for %s\n", cfe->type->name);
902 simple_xattrs_free(&cfe->xattrs);
903 kfree(cfe);
904 }
905 iput(inode);
906}
907
908static void remove_dir(struct dentry *d)
909{ 927{
910 struct dentry *parent = dget(d->d_parent); 928 WARN_ON_ONCE(cgroup_is_dead(cgrp));
911 929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
912 d_delete(d); 930 atomic_inc(&cgrp->refcnt);
913 simple_rmdir(parent->d_inode, d);
914 dput(parent);
915} 931}
916 932
917static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 933static void cgroup_put(struct cgroup *cgrp)
918{ 934{
919 struct cfent *cfe; 935 if (!atomic_dec_and_test(&cgrp->refcnt))
920 936 return;
921 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
922 lockdep_assert_held(&cgroup_mutex); 938 return;
923 939
924 /* 940 /*
925 * If we're doing cleanup due to failure of cgroup_create(), 941 * XXX: cgrp->id is only used to look up css's. As cgroup and
926 * the corresponding @cfe may not exist. 942 * css's lifetimes will be decoupled, it should be made
943 * per-subsystem and moved to css->id so that lookups are
944 * successful until the target css is released.
927 */ 945 */
928 list_for_each_entry(cfe, &cgrp->files, node) { 946 mutex_lock(&cgroup_mutex);
929 struct dentry *d = cfe->dentry; 947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
930 950
931 if (cft && cfe->type != cft) 951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
932 continue; 952}
933 953
934 dget(d); 954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
935 d_delete(d); 955{
936 simple_unlink(cgrp->dentry->d_inode, d); 956 char name[CGROUP_FILE_NAME_MAX];
937 list_del_init(&cfe->node);
938 dput(d);
939 957
940 break; 958 lockdep_assert_held(&cgroup_tree_mutex);
941 } 959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
942} 960}
943 961
944/** 962/**
@@ -952,144 +970,106 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
952 int i; 970 int i;
953 971
954 for_each_subsys(ss, i) { 972 for_each_subsys(ss, i) {
955 struct cftype_set *set; 973 struct cftype *cfts;
956 974
957 if (!test_bit(i, &subsys_mask)) 975 if (!test_bit(i, &subsys_mask))
958 continue; 976 continue;
959 list_for_each_entry(set, &ss->cftsets, node) 977 list_for_each_entry(cfts, &ss->cfts, node)
960 cgroup_addrm_files(cgrp, set->cfts, false); 978 cgroup_addrm_files(cgrp, cfts, false);
961 } 979 }
962} 980}
963 981
964/* 982static int rebind_subsystems(struct cgroup_root *dst_root,
965 * NOTE : the dentry must have been dget()'ed 983 unsigned long ss_mask)
966 */
967static void cgroup_d_remove_dir(struct dentry *dentry)
968{
969 struct dentry *parent;
970
971 parent = dentry->d_parent;
972 spin_lock(&parent->d_lock);
973 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
974 list_del_init(&dentry->d_u.d_child);
975 spin_unlock(&dentry->d_lock);
976 spin_unlock(&parent->d_lock);
977 remove_dir(dentry);
978}
979
980/*
981 * Call with cgroup_mutex held. Drops reference counts on modules, including
982 * any duplicate ones that parse_cgroupfs_options took. If this function
983 * returns an error, no reference counts are touched.
984 */
985static int rebind_subsystems(struct cgroupfs_root *root,
986 unsigned long added_mask, unsigned removed_mask)
987{ 984{
988 struct cgroup *cgrp = &root->top_cgroup;
989 struct cgroup_subsys *ss; 985 struct cgroup_subsys *ss;
990 unsigned long pinned = 0; 986 int ssid, ret;
991 int i, ret;
992 987
993 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 988 lockdep_assert_held(&cgroup_tree_mutex);
994 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 989 lockdep_assert_held(&cgroup_mutex);
995 990
996 /* Check that any added subsystems are currently free */ 991 for_each_subsys(ss, ssid) {
997 for_each_subsys(ss, i) { 992 if (!(ss_mask & (1 << ssid)))
998 if (!(added_mask & (1 << i)))
999 continue; 993 continue;
1000 994
1001 /* is the subsystem mounted elsewhere? */ 995 /* if @ss is on the dummy_root, we can always move it */
1002 if (ss->root != &cgroup_dummy_root) { 996 if (ss->root == &cgrp_dfl_root)
1003 ret = -EBUSY; 997 continue;
1004 goto out_put;
1005 }
1006 998
1007 /* pin the module */ 999 /* if @ss has non-root cgroups attached to it, can't move */
1008 if (!try_module_get(ss->module)) { 1000 if (!list_empty(&ss->root->cgrp.children))
1009 ret = -ENOENT; 1001 return -EBUSY;
1010 goto out_put;
1011 }
1012 pinned |= 1 << i;
1013 }
1014 1002
1015 /* subsys could be missing if unloaded between parsing and here */ 1003 /* can't move between two non-dummy roots either */
1016 if (added_mask != pinned) { 1004 if (dst_root != &cgrp_dfl_root)
1017 ret = -ENOENT; 1005 return -EBUSY;
1018 goto out_put;
1019 } 1006 }
1020 1007
1021 ret = cgroup_populate_dir(cgrp, added_mask); 1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1022 if (ret) 1009 if (ret) {
1023 goto out_put; 1010 if (dst_root != &cgrp_dfl_root)
1011 return ret;
1012
1013 /*
1014 * Rebinding back to the default root is not allowed to
1015 * fail. Using both default and non-default roots should
1016 * be rare. Moving subsystems back and forth even more so.
1017 * Just warn about it and continue.
1018 */
1019 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
1021 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
1023 }
1024 }
1024 1025
1025 /* 1026 /*
1026 * Nothing can fail from this point on. Remove files for the 1027 * Nothing can fail from this point on. Remove files for the
1027 * removed subsystems and rebind each subsystem. 1028 * removed subsystems and rebind each subsystem.
1028 */ 1029 */
1029 cgroup_clear_dir(cgrp, removed_mask); 1030 mutex_unlock(&cgroup_mutex);
1030 1031 for_each_subsys(ss, ssid)
1031 for_each_subsys(ss, i) { 1032 if (ss_mask & (1 << ssid))
1032 unsigned long bit = 1UL << i; 1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1033 1034 mutex_lock(&cgroup_mutex);
1034 if (bit & added_mask) {
1035 /* We're binding this subsystem to this hierarchy */
1036 BUG_ON(cgroup_css(cgrp, ss));
1037 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1038 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1039 1035
1040 rcu_assign_pointer(cgrp->subsys[i], 1036 for_each_subsys(ss, ssid) {
1041 cgroup_css(cgroup_dummy_top, ss)); 1037 struct cgroup_root *src_root;
1042 cgroup_css(cgrp, ss)->cgroup = cgrp; 1038 struct cgroup_subsys_state *css;
1043 1039
1044 ss->root = root; 1040 if (!(ss_mask & (1 << ssid)))
1045 if (ss->bind) 1041 continue;
1046 ss->bind(cgroup_css(cgrp, ss));
1047 1042
1048 /* refcount was already taken, and we're keeping it */ 1043 src_root = ss->root;
1049 root->subsys_mask |= bit; 1044 css = cgroup_css(&src_root->cgrp, ss);
1050 } else if (bit & removed_mask) {
1051 /* We're removing this subsystem */
1052 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1053 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1054 1045
1055 if (ss->bind) 1046 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1056 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1057 1047
1058 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; 1048 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1059 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1049 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1050 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp;
1060 1052
1061 cgroup_subsys[i]->root = &cgroup_dummy_root; 1053 src_root->cgrp.subsys_mask &= ~(1 << ssid);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid;
1062 1055
1063 /* subsystem is now free - drop reference on module */ 1056 if (ss->bind)
1064 module_put(ss->module); 1057 ss->bind(css);
1065 root->subsys_mask &= ~bit;
1066 }
1067 } 1058 }
1068 1059
1069 /* 1060 kernfs_activate(dst_root->cgrp.kn);
1070 * Mark @root has finished binding subsystems. @root->subsys_mask
1071 * now matches the bound subsystems.
1072 */
1073 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1074
1075 return 0; 1061 return 0;
1076
1077out_put:
1078 for_each_subsys(ss, i)
1079 if (pinned & (1 << i))
1080 module_put(ss->module);
1081 return ret;
1082} 1062}
1083 1063
1084static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1064static int cgroup_show_options(struct seq_file *seq,
1065 struct kernfs_root *kf_root)
1085{ 1066{
1086 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1067 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1087 struct cgroup_subsys *ss; 1068 struct cgroup_subsys *ss;
1088 int ssid; 1069 int ssid;
1089 1070
1090 mutex_lock(&cgroup_root_mutex);
1091 for_each_subsys(ss, ssid) 1071 for_each_subsys(ss, ssid)
1092 if (root->subsys_mask & (1 << ssid)) 1072 if (root->cgrp.subsys_mask & (1 << ssid))
1093 seq_printf(seq, ",%s", ss->name); 1073 seq_printf(seq, ",%s", ss->name);
1094 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1095 seq_puts(seq, ",sane_behavior"); 1075 seq_puts(seq, ",sane_behavior");
@@ -1097,13 +1077,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1097 seq_puts(seq, ",noprefix"); 1077 seq_puts(seq, ",noprefix");
1098 if (root->flags & CGRP_ROOT_XATTR) 1078 if (root->flags & CGRP_ROOT_XATTR)
1099 seq_puts(seq, ",xattr"); 1079 seq_puts(seq, ",xattr");
1080
1081 spin_lock(&release_agent_path_lock);
1100 if (strlen(root->release_agent_path)) 1082 if (strlen(root->release_agent_path))
1101 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1083 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1102 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) 1084 spin_unlock(&release_agent_path_lock);
1085
1086 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1103 seq_puts(seq, ",clone_children"); 1087 seq_puts(seq, ",clone_children");
1104 if (strlen(root->name)) 1088 if (strlen(root->name))
1105 seq_printf(seq, ",name=%s", root->name); 1089 seq_printf(seq, ",name=%s", root->name);
1106 mutex_unlock(&cgroup_root_mutex);
1107 return 0; 1090 return 0;
1108} 1091}
1109 1092
@@ -1115,9 +1098,6 @@ struct cgroup_sb_opts {
1115 char *name; 1098 char *name;
1116 /* User explicitly requested empty subsystem */ 1099 /* User explicitly requested empty subsystem */
1117 bool none; 1100 bool none;
1118
1119 struct cgroupfs_root *new_root;
1120
1121}; 1101};
1122 1102
1123/* 1103/*
@@ -1137,7 +1117,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1137 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1138 1118
1139#ifdef CONFIG_CPUSETS 1119#ifdef CONFIG_CPUSETS
1140 mask = ~(1UL << cpuset_subsys_id); 1120 mask = ~(1UL << cpuset_cgrp_id);
1141#endif 1121#endif
1142 1122
1143 memset(opts, 0, sizeof(*opts)); 1123 memset(opts, 0, sizeof(*opts));
@@ -1227,30 +1207,34 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 return -ENOENT; 1207 return -ENOENT;
1228 } 1208 }
1229 1209
1230 /*
1231 * If the 'all' option was specified select all the subsystems,
1232 * otherwise if 'none', 'name=' and a subsystem name options
1233 * were not specified, let's default to 'all'
1234 */
1235 if (all_ss || (!one_ss && !opts->none && !opts->name))
1236 for_each_subsys(ss, i)
1237 if (!ss->disabled)
1238 set_bit(i, &opts->subsys_mask);
1239
1240 /* Consistency checks */ 1210 /* Consistency checks */
1241 1211
1242 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1243 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1244 1214
1245 if (opts->flags & CGRP_ROOT_NOPREFIX) { 1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1246 pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); 1216 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1247 return -EINVAL; 1219 return -EINVAL;
1248 } 1220 }
1221 } else {
1222 /*
1223 * If the 'all' option was specified select all the
1224 * subsystems, otherwise if 'none', 'name=' and a subsystem
1225 * name options were not specified, let's default to 'all'
1226 */
1227 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i)
1229 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask);
1249 1231
1250 if (opts->cpuset_clone_children) { 1232 /*
1251 pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); 1233 * We either have to specify by name or by subsystems. (So
1234 * all empty hierarchies must have a name).
1235 */
1236 if (!opts->subsys_mask && !opts->name)
1252 return -EINVAL; 1237 return -EINVAL;
1253 }
1254 } 1238 }
1255 1239
1256 /* 1240 /*
@@ -1266,21 +1250,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1266 if (opts->subsys_mask && opts->none) 1250 if (opts->subsys_mask && opts->none)
1267 return -EINVAL; 1251 return -EINVAL;
1268 1252
1269 /*
1270 * We either have to specify by name or by subsystems. (So all
1271 * empty hierarchies must have a name).
1272 */
1273 if (!opts->subsys_mask && !opts->name)
1274 return -EINVAL;
1275
1276 return 0; 1253 return 0;
1277} 1254}
1278 1255
1279static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1256static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1280{ 1257{
1281 int ret = 0; 1258 int ret = 0;
1282 struct cgroupfs_root *root = sb->s_fs_info; 1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1283 struct cgroup *cgrp = &root->top_cgroup;
1284 struct cgroup_sb_opts opts; 1260 struct cgroup_sb_opts opts;
1285 unsigned long added_mask, removed_mask; 1261 unsigned long added_mask, removed_mask;
1286 1262
@@ -1289,21 +1265,20 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1289 return -EINVAL; 1265 return -EINVAL;
1290 } 1266 }
1291 1267
1292 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgroup_tree_mutex);
1293 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1294 mutex_lock(&cgroup_root_mutex);
1295 1270
1296 /* See what subsystems are wanted */ 1271 /* See what subsystems are wanted */
1297 ret = parse_cgroupfs_options(data, &opts); 1272 ret = parse_cgroupfs_options(data, &opts);
1298 if (ret) 1273 if (ret)
1299 goto out_unlock; 1274 goto out_unlock;
1300 1275
1301 if (opts.subsys_mask != root->subsys_mask || opts.release_agent) 1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
1302 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1303 task_tgid_nr(current), current->comm); 1278 task_tgid_nr(current), current->comm);
1304 1279
1305 added_mask = opts.subsys_mask & ~root->subsys_mask; 1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
1306 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
1307 1282
1308 /* Don't allow flags or name to change at remount */ 1283 /* Don't allow flags or name to change at remount */
1309 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
@@ -1316,422 +1291,331 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1316 } 1291 }
1317 1292
1318 /* remounting is not allowed for populated hierarchies */ 1293 /* remounting is not allowed for populated hierarchies */
1319 if (root->number_of_cgroups > 1) { 1294 if (!list_empty(&root->cgrp.children)) {
1320 ret = -EBUSY; 1295 ret = -EBUSY;
1321 goto out_unlock; 1296 goto out_unlock;
1322 } 1297 }
1323 1298
1324 ret = rebind_subsystems(root, added_mask, removed_mask); 1299 ret = rebind_subsystems(root, added_mask);
1325 if (ret) 1300 if (ret)
1326 goto out_unlock; 1301 goto out_unlock;
1327 1302
1328 if (opts.release_agent) 1303 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1304
1305 if (opts.release_agent) {
1306 spin_lock(&release_agent_path_lock);
1329 strcpy(root->release_agent_path, opts.release_agent); 1307 strcpy(root->release_agent_path, opts.release_agent);
1308 spin_unlock(&release_agent_path_lock);
1309 }
1330 out_unlock: 1310 out_unlock:
1331 kfree(opts.release_agent); 1311 kfree(opts.release_agent);
1332 kfree(opts.name); 1312 kfree(opts.name);
1333 mutex_unlock(&cgroup_root_mutex);
1334 mutex_unlock(&cgroup_mutex); 1313 mutex_unlock(&cgroup_mutex);
1335 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1314 mutex_unlock(&cgroup_tree_mutex);
1336 return ret; 1315 return ret;
1337} 1316}
1338 1317
1339static const struct super_operations cgroup_ops = { 1318/*
1340 .statfs = simple_statfs, 1319 * To reduce the fork() overhead for systems that are not actually using
1341 .drop_inode = generic_delete_inode, 1320 * their cgroups capability, we don't maintain the lists running through
1342 .show_options = cgroup_show_options, 1321 * each css_set to its tasks until we see the list actually used - in other
1343 .remount_fs = cgroup_remount, 1322 * words after the first mount.
1344}; 1323 */
1324static bool use_task_css_set_links __read_mostly;
1325
1326static void cgroup_enable_task_cg_lists(void)
1327{
1328 struct task_struct *p, *g;
1329
1330 down_write(&css_set_rwsem);
1331
1332 if (use_task_css_set_links)
1333 goto out_unlock;
1334
1335 use_task_css_set_links = true;
1336
1337 /*
1338 * We need tasklist_lock because RCU is not safe against
1339 * while_each_thread(). Besides, a forking task that has passed
1340 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1341 * is not guaranteed to have its child immediately visible in the
1342 * tasklist if we walk through it with RCU.
1343 */
1344 read_lock(&tasklist_lock);
1345 do_each_thread(g, p) {
1346 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1347 task_css_set(p) != &init_css_set);
1348
1349 /*
1350 * We should check if the process is exiting, otherwise
1351 * it will race with cgroup_exit() in that the list
1352 * entry won't be deleted though the process has exited.
1353 * Do it while holding siglock so that we don't end up
1354 * racing against cgroup_exit().
1355 */
1356 spin_lock_irq(&p->sighand->siglock);
1357 if (!(p->flags & PF_EXITING)) {
1358 struct css_set *cset = task_css_set(p);
1359
1360 list_add(&p->cg_list, &cset->tasks);
1361 get_css_set(cset);
1362 }
1363 spin_unlock_irq(&p->sighand->siglock);
1364 } while_each_thread(g, p);
1365 read_unlock(&tasklist_lock);
1366out_unlock:
1367 up_write(&css_set_rwsem);
1368}
1345 1369
1346static void init_cgroup_housekeeping(struct cgroup *cgrp) 1370static void init_cgroup_housekeeping(struct cgroup *cgrp)
1347{ 1371{
1372 atomic_set(&cgrp->refcnt, 1);
1348 INIT_LIST_HEAD(&cgrp->sibling); 1373 INIT_LIST_HEAD(&cgrp->sibling);
1349 INIT_LIST_HEAD(&cgrp->children); 1374 INIT_LIST_HEAD(&cgrp->children);
1350 INIT_LIST_HEAD(&cgrp->files);
1351 INIT_LIST_HEAD(&cgrp->cset_links); 1375 INIT_LIST_HEAD(&cgrp->cset_links);
1352 INIT_LIST_HEAD(&cgrp->release_list); 1376 INIT_LIST_HEAD(&cgrp->release_list);
1353 INIT_LIST_HEAD(&cgrp->pidlists); 1377 INIT_LIST_HEAD(&cgrp->pidlists);
1354 mutex_init(&cgrp->pidlist_mutex); 1378 mutex_init(&cgrp->pidlist_mutex);
1355 cgrp->dummy_css.cgroup = cgrp; 1379 cgrp->dummy_css.cgroup = cgrp;
1356 simple_xattrs_init(&cgrp->xattrs);
1357} 1380}
1358 1381
1359static void init_cgroup_root(struct cgroupfs_root *root) 1382static void init_cgroup_root(struct cgroup_root *root,
1383 struct cgroup_sb_opts *opts)
1360{ 1384{
1361 struct cgroup *cgrp = &root->top_cgroup; 1385 struct cgroup *cgrp = &root->cgrp;
1362 1386
1363 INIT_LIST_HEAD(&root->root_list); 1387 INIT_LIST_HEAD(&root->root_list);
1364 root->number_of_cgroups = 1; 1388 atomic_set(&root->nr_cgrps, 1);
1365 cgrp->root = root; 1389 cgrp->root = root;
1366 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1367 init_cgroup_housekeeping(cgrp); 1390 init_cgroup_housekeeping(cgrp);
1368 idr_init(&root->cgroup_idr); 1391 idr_init(&root->cgroup_idr);
1369}
1370
1371static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1372{
1373 int id;
1374
1375 lockdep_assert_held(&cgroup_mutex);
1376 lockdep_assert_held(&cgroup_root_mutex);
1377
1378 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1379 GFP_KERNEL);
1380 if (id < 0)
1381 return id;
1382
1383 root->hierarchy_id = id;
1384 return 0;
1385}
1386
1387static void cgroup_exit_root_id(struct cgroupfs_root *root)
1388{
1389 lockdep_assert_held(&cgroup_mutex);
1390 lockdep_assert_held(&cgroup_root_mutex);
1391
1392 if (root->hierarchy_id) {
1393 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1394 root->hierarchy_id = 0;
1395 }
1396}
1397
1398static int cgroup_test_super(struct super_block *sb, void *data)
1399{
1400 struct cgroup_sb_opts *opts = data;
1401 struct cgroupfs_root *root = sb->s_fs_info;
1402
1403 /* If we asked for a name then it must match */
1404 if (opts->name && strcmp(opts->name, root->name))
1405 return 0;
1406
1407 /*
1408 * If we asked for subsystems (or explicitly for no
1409 * subsystems) then they must match
1410 */
1411 if ((opts->subsys_mask || opts->none)
1412 && (opts->subsys_mask != root->subsys_mask))
1413 return 0;
1414
1415 return 1;
1416}
1417
1418static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1419{
1420 struct cgroupfs_root *root;
1421 1392
1422 if (!opts->subsys_mask && !opts->none)
1423 return NULL;
1424
1425 root = kzalloc(sizeof(*root), GFP_KERNEL);
1426 if (!root)
1427 return ERR_PTR(-ENOMEM);
1428
1429 init_cgroup_root(root);
1430
1431 /*
1432 * We need to set @root->subsys_mask now so that @root can be
1433 * matched by cgroup_test_super() before it finishes
1434 * initialization; otherwise, competing mounts with the same
1435 * options may try to bind the same subsystems instead of waiting
1436 * for the first one leading to unexpected mount errors.
1437 * SUBSYS_BOUND will be set once actual binding is complete.
1438 */
1439 root->subsys_mask = opts->subsys_mask;
1440 root->flags = opts->flags; 1393 root->flags = opts->flags;
1441 if (opts->release_agent) 1394 if (opts->release_agent)
1442 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1443 if (opts->name) 1396 if (opts->name)
1444 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1445 if (opts->cpuset_clone_children) 1398 if (opts->cpuset_clone_children)
1446 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1447 return root;
1448} 1400}
1449 1401
1450static void cgroup_free_root(struct cgroupfs_root *root) 1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1451{ 1403{
1452 if (root) { 1404 LIST_HEAD(tmp_links);
1453 /* hierarchy ID should already have been released */ 1405 struct cgroup *root_cgrp = &root->cgrp;
1454 WARN_ON_ONCE(root->hierarchy_id); 1406 struct css_set *cset;
1455 1407 int i, ret;
1456 idr_destroy(&root->cgroup_idr);
1457 kfree(root);
1458 }
1459}
1460 1408
1461static int cgroup_set_super(struct super_block *sb, void *data) 1409 lockdep_assert_held(&cgroup_tree_mutex);
1462{ 1410 lockdep_assert_held(&cgroup_mutex);
1463 int ret;
1464 struct cgroup_sb_opts *opts = data;
1465 1411
1466 /* If we don't have a new root, we can't set up a new sb */ 1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1467 if (!opts->new_root) 1413 if (ret < 0)
1468 return -EINVAL; 1414 goto out;
1415 root_cgrp->id = ret;
1469 1416
1470 BUG_ON(!opts->subsys_mask && !opts->none); 1417 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding
1420 * cgroup_lock, and that's us. The worst that can happen is that we
1421 * have some link structures left over
1422 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret)
1425 goto out;
1471 1426
1472 ret = set_anon_super(sb, NULL); 1427 ret = cgroup_init_root_id(root);
1473 if (ret) 1428 if (ret)
1474 return ret; 1429 goto out;
1475 1430
1476 sb->s_fs_info = opts->new_root; 1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1477 opts->new_root->sb = sb; 1432 KERNFS_ROOT_CREATE_DEACTIVATED,
1433 root_cgrp);
1434 if (IS_ERR(root->kf_root)) {
1435 ret = PTR_ERR(root->kf_root);
1436 goto exit_root_id;
1437 }
1438 root_cgrp->kn = root->kf_root->kn;
1478 1439
1479 sb->s_blocksize = PAGE_CACHE_SIZE; 1440 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1480 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1441 if (ret)
1481 sb->s_magic = CGROUP_SUPER_MAGIC; 1442 goto destroy_root;
1482 sb->s_op = &cgroup_ops;
1483 1443
1484 return 0; 1444 ret = rebind_subsystems(root, ss_mask);
1485} 1445 if (ret)
1446 goto destroy_root;
1486 1447
1487static int cgroup_get_rootdir(struct super_block *sb) 1448 /*
1488{ 1449 * There must be no failure case after here, since rebinding takes
1489 static const struct dentry_operations cgroup_dops = { 1450 * care of subsystems' refcounts, which are explicitly dropped in
1490 .d_iput = cgroup_diput, 1451 * the failure exit path.
1491 .d_delete = always_delete_dentry, 1452 */
1492 }; 1453 list_add(&root->root_list, &cgroup_roots);
1454 cgroup_root_count++;
1493 1455
1494 struct inode *inode = 1456 /*
1495 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1457 * Link the root cgroup in this hierarchy into all the css_set
1458 * objects.
1459 */
1460 down_write(&css_set_rwsem);
1461 hash_for_each(css_set_table, i, cset, hlist)
1462 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem);
1496 1464
1497 if (!inode) 1465 BUG_ON(!list_empty(&root_cgrp->children));
1498 return -ENOMEM; 1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1499 1467
1500 inode->i_fop = &simple_dir_operations; 1468 kernfs_activate(root_cgrp->kn);
1501 inode->i_op = &cgroup_dir_inode_operations; 1469 ret = 0;
1502 /* directories start off with i_nlink == 2 (for "." entry) */ 1470 goto out;
1503 inc_nlink(inode); 1471
1504 sb->s_root = d_make_root(inode); 1472destroy_root:
1505 if (!sb->s_root) 1473 kernfs_destroy_root(root->kf_root);
1506 return -ENOMEM; 1474 root->kf_root = NULL;
1507 /* for everything else we want ->d_op set */ 1475exit_root_id:
1508 sb->s_d_op = &cgroup_dops; 1476 cgroup_exit_root_id(root);
1509 return 0; 1477out:
1478 free_cgrp_cset_links(&tmp_links);
1479 return ret;
1510} 1480}
1511 1481
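cgroup_setup_root() expects an already initialised root and both cgroup_tree_mutex and cgroup_mutex held. A minimal caller sketch, mirroring the new-hierarchy path in cgroup_mount() that follows (the make_root() wrapper is hypothetical):

    static struct cgroup_root *make_root(struct cgroup_sb_opts *opts)
    {
            struct cgroup_root *root;
            int ret;

            root = kzalloc(sizeof(*root), GFP_KERNEL);
            if (!root)
                    return ERR_PTR(-ENOMEM);

            init_cgroup_root(root, opts);

            /* binds the requested subsystems and links all existing css_sets */
            ret = cgroup_setup_root(root, opts->subsys_mask);
            if (ret) {
                    cgroup_free_root(root);
                    return ERR_PTR(ret);
            }
            return root;
    }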
1512static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1482static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1513 int flags, const char *unused_dev_name, 1483 int flags, const char *unused_dev_name,
1514 void *data) 1484 void *data)
1515{ 1485{
1486 struct cgroup_root *root;
1516 struct cgroup_sb_opts opts; 1487 struct cgroup_sb_opts opts;
1517 struct cgroupfs_root *root; 1488 struct dentry *dentry;
1518 int ret = 0; 1489 int ret;
1519 struct super_block *sb;
1520 struct cgroupfs_root *new_root;
1521 struct list_head tmp_links;
1522 struct inode *inode;
1523 const struct cred *cred;
1524 1490
1525 /* First find the desired set of subsystems */ 1491 /*
1492 * The first time anyone tries to mount a cgroup, enable the list
1493 * linking each css_set to its tasks and fix up all existing tasks.
1494 */
1495 if (!use_task_css_set_links)
1496 cgroup_enable_task_cg_lists();
1497retry:
1498 mutex_lock(&cgroup_tree_mutex);
1526 mutex_lock(&cgroup_mutex); 1499 mutex_lock(&cgroup_mutex);
1500
1501 /* First find the desired set of subsystems */
1527 ret = parse_cgroupfs_options(data, &opts); 1502 ret = parse_cgroupfs_options(data, &opts);
1528 mutex_unlock(&cgroup_mutex);
1529 if (ret) 1503 if (ret)
1530 goto out_err; 1504 goto out_unlock;
1531
1532 /*
1533 * Allocate a new cgroup root. We may not need it if we're
1534 * reusing an existing hierarchy.
1535 */
1536 new_root = cgroup_root_from_opts(&opts);
1537 if (IS_ERR(new_root)) {
1538 ret = PTR_ERR(new_root);
1539 goto out_err;
1540 }
1541 opts.new_root = new_root;
1542 1505
1543 /* Locate an existing or new sb for this hierarchy */ 1506 /* look for a matching existing root */
1544 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1507 if (!opts.subsys_mask && !opts.none && !opts.name) {
1545 if (IS_ERR(sb)) { 1508 cgrp_dfl_root_visible = true;
1546 ret = PTR_ERR(sb); 1509 root = &cgrp_dfl_root;
1547 cgroup_free_root(opts.new_root); 1510 cgroup_get(&root->cgrp);
1548 goto out_err; 1511 ret = 0;
1512 goto out_unlock;
1549 } 1513 }
1550 1514
1551 root = sb->s_fs_info; 1515 for_each_root(root) {
1552 BUG_ON(!root); 1516 bool name_match = false;
1553 if (root == opts.new_root) {
1554 /* We used the new root structure, so this is a new hierarchy */
1555 struct cgroup *root_cgrp = &root->top_cgroup;
1556 struct cgroupfs_root *existing_root;
1557 int i;
1558 struct css_set *cset;
1559
1560 BUG_ON(sb->s_root != NULL);
1561
1562 ret = cgroup_get_rootdir(sb);
1563 if (ret)
1564 goto drop_new_super;
1565 inode = sb->s_root->d_inode;
1566
1567 mutex_lock(&inode->i_mutex);
1568 mutex_lock(&cgroup_mutex);
1569 mutex_lock(&cgroup_root_mutex);
1570
1571 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1572 if (ret < 0)
1573 goto unlock_drop;
1574 root_cgrp->id = ret;
1575
1576 /* Check for name clashes with existing mounts */
1577 ret = -EBUSY;
1578 if (strlen(root->name))
1579 for_each_active_root(existing_root)
1580 if (!strcmp(existing_root->name, root->name))
1581 goto unlock_drop;
1582
1583 /*
1584 * We're accessing css_set_count without locking
1585 * css_set_lock here, but that's OK - it can only be
1586 * increased by someone holding cgroup_lock, and
1587 * that's us. The worst that can happen is that we
1588 * have some link structures left over
1589 */
1590 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1591 if (ret)
1592 goto unlock_drop;
1593 1517
1594 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ 1518 if (root == &cgrp_dfl_root)
1595 ret = cgroup_init_root_id(root, 2, 0); 1519 continue;
1596 if (ret)
1597 goto unlock_drop;
1598
1599 sb->s_root->d_fsdata = root_cgrp;
1600 root_cgrp->dentry = sb->s_root;
1601
1602 /*
1603 * We're inside get_sb() and will call lookup_one_len() to
1604 * create the root files, which doesn't work if SELinux is
1605 * in use. The following cred dancing somehow works around
1606 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1607 * populating new cgroupfs mount") for more details.
1608 */
1609 cred = override_creds(&init_cred);
1610
1611 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1612 if (ret)
1613 goto rm_base_files;
1614
1615 ret = rebind_subsystems(root, root->subsys_mask, 0);
1616 if (ret)
1617 goto rm_base_files;
1618
1619 revert_creds(cred);
1620 1520
1621 /* 1521 /*
1622 * There must be no failure case after here, since rebinding 1522 * If we asked for a name then it must match. Also, if
1623 * takes care of subsystems' refcounts, which are explicitly 1523 * name matches but subsys_mask doesn't, we should fail.
1624 * dropped in the failure exit path. 1524 * Remember whether name matched.
1625 */ 1525 */
1526 if (opts.name) {
1527 if (strcmp(opts.name, root->name))
1528 continue;
1529 name_match = true;
1530 }
1626 1531
1627 list_add(&root->root_list, &cgroup_roots);
1628 cgroup_root_count++;
1629
1630 /* Link the top cgroup in this hierarchy into all
1631 * the css_set objects */
1632 write_lock(&css_set_lock);
1633 hash_for_each(css_set_table, i, cset, hlist)
1634 link_css_set(&tmp_links, cset, root_cgrp);
1635 write_unlock(&css_set_lock);
1636
1637 free_cgrp_cset_links(&tmp_links);
1638
1639 BUG_ON(!list_empty(&root_cgrp->children));
1640 BUG_ON(root->number_of_cgroups != 1);
1641
1642 mutex_unlock(&cgroup_root_mutex);
1643 mutex_unlock(&cgroup_mutex);
1644 mutex_unlock(&inode->i_mutex);
1645 } else {
1646 /* 1532 /*
1647 * We re-used an existing hierarchy - the new root (if 1533 * If we asked for subsystems (or explicitly for no
1648 * any) is not needed 1534 * subsystems) then they must match.
1649 */ 1535 */
1650 cgroup_free_root(opts.new_root); 1536 if ((opts.subsys_mask || opts.none) &&
1537 (opts.subsys_mask != root->cgrp.subsys_mask)) {
1538 if (!name_match)
1539 continue;
1540 ret = -EBUSY;
1541 goto out_unlock;
1542 }
1651 1543
1652 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1544 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1653 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1545 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1654 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1546 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1655 ret = -EINVAL; 1547 ret = -EINVAL;
1656 goto drop_new_super; 1548 goto out_unlock;
1657 } else { 1549 } else {
1658 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1550 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1659 } 1551 }
1660 } 1552 }
1661 }
1662
1663 kfree(opts.release_agent);
1664 kfree(opts.name);
1665 return dget(sb->s_root);
1666
1667 rm_base_files:
1668 free_cgrp_cset_links(&tmp_links);
1669 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1670 revert_creds(cred);
1671 unlock_drop:
1672 cgroup_exit_root_id(root);
1673 mutex_unlock(&cgroup_root_mutex);
1674 mutex_unlock(&cgroup_mutex);
1675 mutex_unlock(&inode->i_mutex);
1676 drop_new_super:
1677 deactivate_locked_super(sb);
1678 out_err:
1679 kfree(opts.release_agent);
1680 kfree(opts.name);
1681 return ERR_PTR(ret);
1682}
1683
1684static void cgroup_kill_sb(struct super_block *sb)
1685{
1686 struct cgroupfs_root *root = sb->s_fs_info;
1687 struct cgroup *cgrp = &root->top_cgroup;
1688 struct cgrp_cset_link *link, *tmp_link;
1689 int ret;
1690
1691 BUG_ON(!root);
1692
1693 BUG_ON(root->number_of_cgroups != 1);
1694 BUG_ON(!list_empty(&cgrp->children));
1695 1553
1696 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1554 /*
1697 mutex_lock(&cgroup_mutex); 1555 * A root's lifetime is governed by its root cgroup. Zero
1698 mutex_lock(&cgroup_root_mutex); 1556 * ref indicates that the root is being destroyed. Wait for
1557 * destruction to complete so that the subsystems are free.
1558 * We can use wait_queue for the wait but this path is
1559 * super cold. Let's just sleep for a bit and retry.
1560 */
1561 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1562 mutex_unlock(&cgroup_mutex);
1563 mutex_unlock(&cgroup_tree_mutex);
1564 kfree(opts.release_agent);
1565 kfree(opts.name);
1566 msleep(10);
1567 goto retry;
1568 }
1699 1569
1700 /* Rebind all subsystems back to the default hierarchy */ 1570 ret = 0;
1701 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { 1571 goto out_unlock;
1702 ret = rebind_subsystems(root, 0, root->subsys_mask);
1703 /* Shouldn't be able to fail ... */
1704 BUG_ON(ret);
1705 } 1572 }
1706 1573
1707 /* 1574 /*
1708 * Release all the links from cset_links to this hierarchy's 1575 * No such thing, create a new one. name= matching without subsys
1709 * root cgroup 1576 * specification is allowed for already existing hierarchies but we
1577 * can't create a new one without subsys specification.
1710 */ 1578 */
1711 write_lock(&css_set_lock); 1579 if (!opts.subsys_mask && !opts.none) {
1712 1580 ret = -EINVAL;
1713 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1581 goto out_unlock;
1714 list_del(&link->cset_link);
1715 list_del(&link->cgrp_link);
1716 kfree(link);
1717 } 1582 }
1718 write_unlock(&css_set_lock);
1719 1583
1720 if (!list_empty(&root->root_list)) { 1584 root = kzalloc(sizeof(*root), GFP_KERNEL);
1721 list_del(&root->root_list); 1585 if (!root) {
1722 cgroup_root_count--; 1586 ret = -ENOMEM;
1587 goto out_unlock;
1723 } 1588 }
1724 1589
1725 cgroup_exit_root_id(root); 1590 init_cgroup_root(root, &opts);
1726 1591
1727 mutex_unlock(&cgroup_root_mutex); 1592 ret = cgroup_setup_root(root, opts.subsys_mask);
1593 if (ret)
1594 cgroup_free_root(root);
1595
1596out_unlock:
1728 mutex_unlock(&cgroup_mutex); 1597 mutex_unlock(&cgroup_mutex);
1729 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1598 mutex_unlock(&cgroup_tree_mutex);
1730 1599
1731 simple_xattrs_free(&cgrp->xattrs); 1600 kfree(opts.release_agent);
1601 kfree(opts.name);
1732 1602
1733 kill_litter_super(sb); 1603 if (ret)
1734 cgroup_free_root(root); 1604 return ERR_PTR(ret);
1605
1606 dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
1607 if (IS_ERR(dentry))
1608 cgroup_put(&root->cgrp);
1609 return dentry;
1610}
1611
1612static void cgroup_kill_sb(struct super_block *sb)
1613{
1614 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1615 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1616
1617 cgroup_put(&root->cgrp);
1618 kernfs_kill_sb(sb);
1735} 1619}
1736 1620
1737static struct file_system_type cgroup_fs_type = { 1621static struct file_system_type cgroup_fs_type = {
@@ -1743,57 +1627,6 @@ static struct file_system_type cgroup_fs_type = {
1743static struct kobject *cgroup_kobj; 1627static struct kobject *cgroup_kobj;
1744 1628
1745/** 1629/**
1746 * cgroup_path - generate the path of a cgroup
1747 * @cgrp: the cgroup in question
1748 * @buf: the buffer to write the path into
1749 * @buflen: the length of the buffer
1750 *
1751 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1752 *
1753 * We can't generate cgroup path using dentry->d_name, as accessing
1754 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1755 * inode's i_mutex, while on the other hand cgroup_path() can be called
1756 * with some irq-safe spinlocks held.
1757 */
1758int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1759{
1760 int ret = -ENAMETOOLONG;
1761 char *start;
1762
1763 if (!cgrp->parent) {
1764 if (strlcpy(buf, "/", buflen) >= buflen)
1765 return -ENAMETOOLONG;
1766 return 0;
1767 }
1768
1769 start = buf + buflen - 1;
1770 *start = '\0';
1771
1772 rcu_read_lock();
1773 do {
1774 const char *name = cgroup_name(cgrp);
1775 int len;
1776
1777 len = strlen(name);
1778 if ((start -= len) < buf)
1779 goto out;
1780 memcpy(start, name, len);
1781
1782 if (--start < buf)
1783 goto out;
1784 *start = '/';
1785
1786 cgrp = cgrp->parent;
1787 } while (cgrp->parent);
1788 ret = 0;
1789 memmove(buf, start, buf + buflen - start);
1790out:
1791 rcu_read_unlock();
1792 return ret;
1793}
1794EXPORT_SYMBOL_GPL(cgroup_path);
1795
1796/**
1797 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1630 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1798 * @task: target task 1631 * @task: target task
1799 * @buf: the buffer to write the path into 1632 * @buf: the buffer to write the path into
@@ -1804,49 +1637,55 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1804 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1637 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1805 * cgroup controller callbacks. 1638 * cgroup controller callbacks.
1806 * 1639 *
1807 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. 1640 * Return value is the same as kernfs_path().
1808 */ 1641 */
1809int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1642char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1810{ 1643{
1811 struct cgroupfs_root *root; 1644 struct cgroup_root *root;
1812 struct cgroup *cgrp; 1645 struct cgroup *cgrp;
1813 int hierarchy_id = 1, ret = 0; 1646 int hierarchy_id = 1;
1814 1647 char *path = NULL;
1815 if (buflen < 2)
1816 return -ENAMETOOLONG;
1817 1648
1818 mutex_lock(&cgroup_mutex); 1649 mutex_lock(&cgroup_mutex);
1650 down_read(&css_set_rwsem);
1819 1651
1820 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1652 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1821 1653
1822 if (root) { 1654 if (root) {
1823 cgrp = task_cgroup_from_root(task, root); 1655 cgrp = task_cgroup_from_root(task, root);
1824 ret = cgroup_path(cgrp, buf, buflen); 1656 path = cgroup_path(cgrp, buf, buflen);
1825 } else { 1657 } else {
1826 /* if no hierarchy exists, everyone is in "/" */ 1658 /* if no hierarchy exists, everyone is in "/" */
1827 memcpy(buf, "/", 2); 1659 if (strlcpy(buf, "/", buflen) < buflen)
1660 path = buf;
1828 } 1661 }
1829 1662
1663 up_read(&css_set_rwsem);
1830 mutex_unlock(&cgroup_mutex); 1664 mutex_unlock(&cgroup_mutex);
1831 return ret; 1665 return path;
1832} 1666}
1833EXPORT_SYMBOL_GPL(task_cgroup_path); 1667EXPORT_SYMBOL_GPL(task_cgroup_path);
1834 1668
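Note the changed contract: task_cgroup_path() now returns the path pointer (or NULL) in the style of kernfs_path() instead of 0/-ENAMETOOLONG. A hypothetical caller:

    static void report_task_cgroup(struct task_struct *task)
    {
            char buf[PATH_MAX];
            char *path;

            path = task_cgroup_path(task, buf, sizeof(buf));
            if (path)
                    pr_info("pid %d cgroup: %s\n", task_pid_nr(task), path);
    }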
1835/* 1669/* used to track tasks and other necessary states during migration */
1836 * Control Group taskset
1837 */
1838struct task_and_cgroup {
1839 struct task_struct *task;
1840 struct cgroup *cgrp;
1841 struct css_set *cset;
1842};
1843
1844struct cgroup_taskset { 1670struct cgroup_taskset {
1845 struct task_and_cgroup single; 1671 /* the src and dst cset list running through cset->mg_node */
1846 struct flex_array *tc_array; 1672 struct list_head src_csets;
1847 int tc_array_len; 1673 struct list_head dst_csets;
1848 int idx; 1674
1849 struct cgroup *cur_cgrp; 1675 /*
1676 * Fields for cgroup_taskset_*() iteration.
1677 *
1678 * Before migration is committed, the target migration tasks are on
1679 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1680 * the csets on ->dst_csets. ->csets points to either ->src_csets
1681 * or ->dst_csets depending on whether migration is committed.
1682 *
1683 * ->cur_cset and ->cur_task point to the current task position
1684 * during iteration.
1685 */
1686 struct list_head *csets;
1687 struct css_set *cur_cset;
1688 struct task_struct *cur_task;
1850}; 1689};
1851 1690
1852/** 1691/**
@@ -1857,15 +1696,11 @@ struct cgroup_taskset {
1857 */ 1696 */
1858struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1697struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1859{ 1698{
1860 if (tset->tc_array) { 1699 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1861 tset->idx = 0; 1700 tset->cur_task = NULL;
1862 return cgroup_taskset_next(tset); 1701
1863 } else { 1702 return cgroup_taskset_next(tset);
1864 tset->cur_cgrp = tset->single.cgrp;
1865 return tset->single.task;
1866 }
1867} 1703}
1868EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1869 1704
1870/** 1705/**
1871 * cgroup_taskset_next - iterate to the next task in taskset 1706 * cgroup_taskset_next - iterate to the next task in taskset
@@ -1876,48 +1711,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1876 */ 1711 */
1877struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1712struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1878{ 1713{
1879 struct task_and_cgroup *tc; 1714 struct css_set *cset = tset->cur_cset;
1715 struct task_struct *task = tset->cur_task;
1880 1716
1881 if (!tset->tc_array || tset->idx >= tset->tc_array_len) 1717 while (&cset->mg_node != tset->csets) {
1882 return NULL; 1718 if (!task)
1719 task = list_first_entry(&cset->mg_tasks,
1720 struct task_struct, cg_list);
1721 else
1722 task = list_next_entry(task, cg_list);
1883 1723
1884 tc = flex_array_get(tset->tc_array, tset->idx++); 1724 if (&task->cg_list != &cset->mg_tasks) {
1885 tset->cur_cgrp = tc->cgrp; 1725 tset->cur_cset = cset;
1886 return tc->task; 1726 tset->cur_task = task;
1887} 1727 return task;
1888EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1728 }
1889 1729
1890/** 1730 cset = list_next_entry(cset, mg_node);
1891 * cgroup_taskset_cur_css - return the matching css for the current task 1731 task = NULL;
1892 * @tset: taskset of interest 1732 }
1893 * @subsys_id: the ID of the target subsystem
1894 *
1895 * Return the css for the current (last returned) task of @tset for
1896 * subsystem specified by @subsys_id. This function must be preceded by
1897 * either cgroup_taskset_first() or cgroup_taskset_next().
1898 */
1899struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1900 int subsys_id)
1901{
1902 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1903}
1904EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1905 1733
1906/** 1734 return NULL;
1907 * cgroup_taskset_size - return the number of tasks in taskset
1908 * @tset: taskset of interest
1909 */
1910int cgroup_taskset_size(struct cgroup_taskset *tset)
1911{
1912 return tset->tc_array ? tset->tc_array_len : 1;
1913} 1735}
1914EXPORT_SYMBOL_GPL(cgroup_taskset_size);
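Controller callbacks consume the new taskset through the same cgroup_taskset_first()/cgroup_taskset_next() pair as before; only the backing storage changed from a flex_array to the csets' mg_tasks lists. A hypothetical ->can_attach() implementation (the PF_KTHREAD check is just an example policy):

    static int example_can_attach(struct cgroup_subsys_state *css,
                                  struct cgroup_taskset *tset)
    {
            struct task_struct *task;

            for (task = cgroup_taskset_first(tset); task;
                 task = cgroup_taskset_next(tset)) {
                    /* e.g. refuse to pull kernel threads into this cgroup */
                    if (task->flags & PF_KTHREAD)
                            return -EINVAL;
            }
            return 0;
    }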
1915 1736
1916 1737/**
1917/*
1918 * cgroup_task_migrate - move a task from one cgroup to another. 1738 * cgroup_task_migrate - move a task from one cgroup to another.
1739 * @old_cgrp: the cgroup @tsk is being migrated from
1740 * @tsk: the task being migrated
1741 * @new_cset: the new css_set @tsk is being attached to
1919 * 1742 *
1920 * Must be called with cgroup_mutex and threadgroup locked. 1743 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1921 */ 1744 */
1922static void cgroup_task_migrate(struct cgroup *old_cgrp, 1745static void cgroup_task_migrate(struct cgroup *old_cgrp,
1923 struct task_struct *tsk, 1746 struct task_struct *tsk,
@@ -1925,6 +1748,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1925{ 1748{
1926 struct css_set *old_cset; 1749 struct css_set *old_cset;
1927 1750
1751 lockdep_assert_held(&cgroup_mutex);
1752 lockdep_assert_held(&css_set_rwsem);
1753
1928 /* 1754 /*
1929 * We are synchronized through threadgroup_lock() against PF_EXITING 1755 * We are synchronized through threadgroup_lock() against PF_EXITING
1930 * setting such that we can't race against cgroup_exit() changing the 1756 * setting such that we can't race against cgroup_exit() changing the
@@ -1933,15 +1759,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1933 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1759 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1934 old_cset = task_css_set(tsk); 1760 old_cset = task_css_set(tsk);
1935 1761
1936 task_lock(tsk); 1762 get_css_set(new_cset);
1937 rcu_assign_pointer(tsk->cgroups, new_cset); 1763 rcu_assign_pointer(tsk->cgroups, new_cset);
1938 task_unlock(tsk);
1939 1764
1940 /* Update the css_set linked lists if we're using them */ 1765 /*
1941 write_lock(&css_set_lock); 1766 * Use move_tail so that cgroup_taskset_first() still returns the
1942 if (!list_empty(&tsk->cg_list)) 1767 * leader after migration. This works because cgroup_migrate()
1943 list_move(&tsk->cg_list, &new_cset->tasks); 1768 * ensures that the dst_cset of the leader is the first on the
1944 write_unlock(&css_set_lock); 1769 * tset's dst_csets list.
1770 */
1771 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1945 1772
1946 /* 1773 /*
1947 * We just gained a reference on old_cset by taking it from the 1774 * We just gained a reference on old_cset by taking it from the
@@ -1949,100 +1776,199 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1949 * we're safe to drop it here; it will be freed under RCU. 1776 * we're safe to drop it here; it will be freed under RCU.
1950 */ 1777 */
1951 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1778 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1952 put_css_set(old_cset); 1779 put_css_set_locked(old_cset, false);
1953} 1780}
1954 1781
1955/** 1782/**
1956 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 1783 * cgroup_migrate_finish - cleanup after attach
1957 * @cgrp: the cgroup to attach to 1784 * @preloaded_csets: list of preloaded css_sets
1958 * @tsk: the task or the leader of the threadgroup to be attached
1959 * @threadgroup: attach the whole threadgroup?
1960 * 1785 *
1961 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1786 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1962 * task_lock of @tsk or each thread in the threadgroup individually in turn. 1787 * those functions for details.
1963 */ 1788 */
1964static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, 1789static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1965 bool threadgroup)
1966{ 1790{
1967 int retval, i, group_size; 1791 struct css_set *cset, *tmp_cset;
1968 struct cgroupfs_root *root = cgrp->root;
1969 struct cgroup_subsys_state *css, *failed_css = NULL;
1970 /* threadgroup list cursor and array */
1971 struct task_struct *leader = tsk;
1972 struct task_and_cgroup *tc;
1973 struct flex_array *group;
1974 struct cgroup_taskset tset = { };
1975 1792
1976 /* 1793 lockdep_assert_held(&cgroup_mutex);
1977 * step 0: in order to do expensive, possibly blocking operations for 1794
1978 * every thread, we cannot iterate the thread group list, since it needs 1795 down_write(&css_set_rwsem);
1979 * rcu or tasklist locked. instead, build an array of all threads in the 1796 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1980 * group - group_rwsem prevents new threads from appearing, and if 1797 cset->mg_src_cgrp = NULL;
1981 * threads exit, this will just be an over-estimate. 1798 cset->mg_dst_cset = NULL;
1982 */ 1799 list_del_init(&cset->mg_preload_node);
1983 if (threadgroup) 1800 put_css_set_locked(cset, false);
1984 group_size = get_nr_threads(tsk); 1801 }
1985 else 1802 up_write(&css_set_rwsem);
1986 group_size = 1; 1803}
1987 /* flex_array supports very large thread-groups better than kmalloc. */ 1804
1988 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1805/**
1989 if (!group) 1806 * cgroup_migrate_add_src - add a migration source css_set
1990 return -ENOMEM; 1807 * @src_cset: the source css_set to add
1991 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1808 * @dst_cgrp: the destination cgroup
1992 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); 1809 * @preloaded_csets: list of preloaded css_sets
1993 if (retval) 1810 *
1994 goto out_free_group_list; 1811 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1812 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1813 * up by cgroup_migrate_finish().
1814 *
1815 * This function may be called without holding threadgroup_lock even if the
1816 * target is a process. Threads may be created and destroyed but as long
1817 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1818 * the preloaded css_sets are guaranteed to cover all migrations.
1819 */
1820static void cgroup_migrate_add_src(struct css_set *src_cset,
1821 struct cgroup *dst_cgrp,
1822 struct list_head *preloaded_csets)
1823{
1824 struct cgroup *src_cgrp;
1825
1826 lockdep_assert_held(&cgroup_mutex);
1827 lockdep_assert_held(&css_set_rwsem);
1828
1829 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1830
1831 /* nothing to do if this cset already belongs to the cgroup */
1832 if (src_cgrp == dst_cgrp)
1833 return;
1834
1835 if (!list_empty(&src_cset->mg_preload_node))
1836 return;
1837
1838 WARN_ON(src_cset->mg_src_cgrp);
1839 WARN_ON(!list_empty(&src_cset->mg_tasks));
1840 WARN_ON(!list_empty(&src_cset->mg_node));
1841
1842 src_cset->mg_src_cgrp = src_cgrp;
1843 get_css_set(src_cset);
1844 list_add(&src_cset->mg_preload_node, preloaded_csets);
1845}
1846
1847/**
1848 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1849 * @dst_cgrp: the destination cgroup
1850 * @preloaded_csets: list of preloaded source css_sets
1851 *
1852 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1853 * have been preloaded to @preloaded_csets. This function looks up and
1854 * pins all destination css_sets, links each to its source, and puts them on
1855 * @preloaded_csets.
1856 *
1857 * This function must be called after cgroup_migrate_add_src() has been
1858 * called on each migration source css_set. After migration is performed
1859 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1860 * @preloaded_csets.
1861 */
1862static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1863 struct list_head *preloaded_csets)
1864{
1865 LIST_HEAD(csets);
1866 struct css_set *src_cset;
1867
1868 lockdep_assert_held(&cgroup_mutex);
1869
1870 /* look up the dst cset for each src cset and link it to src */
1871 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1872 struct css_set *dst_cset;
1873
1874 dst_cset = find_css_set(src_cset, dst_cgrp);
1875 if (!dst_cset)
1876 goto err;
1877
1878 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1879 src_cset->mg_dst_cset = dst_cset;
1880
1881 if (list_empty(&dst_cset->mg_preload_node))
1882 list_add(&dst_cset->mg_preload_node, &csets);
1883 else
1884 put_css_set(dst_cset, false);
1885 }
1886
1887 list_splice(&csets, preloaded_csets);
1888 return 0;
1889err:
1890 cgroup_migrate_finish(&csets);
1891 return -ENOMEM;
1892}
1893
1894/**
1895 * cgroup_migrate - migrate a process or task to a cgroup
1896 * @cgrp: the destination cgroup
1897 * @leader: the leader of the process or the task to migrate
1898 * @threadgroup: whether @leader points to the whole process or a single task
1899 *
1900 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1901 * process, the caller must be holding threadgroup_lock of @leader. The
1902 * caller is also responsible for invoking cgroup_migrate_add_src() and
1903 * cgroup_migrate_prepare_dst() on the targets before invoking this
1904 * function and following up with cgroup_migrate_finish().
1905 *
1906 * As long as a controller's ->can_attach() doesn't fail, this function is
1907 * guaranteed to succeed. This means that, excluding ->can_attach()
1908 * failure, when migrating multiple targets, the success or failure can be
1909 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
1910 * actually starting the migration.
1911 */
1912static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1913 bool threadgroup)
1914{
1915 struct cgroup_taskset tset = {
1916 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1917 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1918 .csets = &tset.src_csets,
1919 };
1920 struct cgroup_subsys_state *css, *failed_css = NULL;
1921 struct css_set *cset, *tmp_cset;
1922 struct task_struct *task, *tmp_task;
1923 int i, ret;
1995 1924
1996 i = 0;
1997 /* 1925 /*
1998 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1926 * Prevent freeing of tasks while we take a snapshot. Tasks that are
1999 * already PF_EXITING could be freed from underneath us unless we 1927 * already PF_EXITING could be freed from underneath us unless we
2000 * take an rcu_read_lock. 1928 * take an rcu_read_lock.
2001 */ 1929 */
1930 down_write(&css_set_rwsem);
2002 rcu_read_lock(); 1931 rcu_read_lock();
1932 task = leader;
2003 do { 1933 do {
2004 struct task_and_cgroup ent; 1934 /* @task either already exited or can't exit until the end */
1935 if (task->flags & PF_EXITING)
1936 goto next;
2005 1937
2006 /* @tsk either already exited or can't exit until the end */ 1938 /* leave @task alone if post_fork() hasn't linked it yet */
2007 if (tsk->flags & PF_EXITING) 1939 if (list_empty(&task->cg_list))
2008 goto next; 1940 goto next;
2009 1941
2010 /* as per above, nr_threads may decrease, but not increase. */ 1942 cset = task_css_set(task);
2011 BUG_ON(i >= group_size); 1943 if (!cset->mg_src_cgrp)
2012 ent.task = tsk;
2013 ent.cgrp = task_cgroup_from_root(tsk, root);
2014 /* nothing to do if this task is already in the cgroup */
2015 if (ent.cgrp == cgrp)
2016 goto next; 1944 goto next;
1945
2017 /* 1946 /*
2018 * saying GFP_ATOMIC has no effect here because we did prealloc 1947 * cgroup_taskset_first() must always return the leader.
2019 * earlier, but it's good form to communicate our expectations. 1948 * Take care to avoid disturbing the ordering.
2020 */ 1949 */
2021 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 1950 list_move_tail(&task->cg_list, &cset->mg_tasks);
2022 BUG_ON(retval != 0); 1951 if (list_empty(&cset->mg_node))
2023 i++; 1952 list_add_tail(&cset->mg_node, &tset.src_csets);
1953 if (list_empty(&cset->mg_dst_cset->mg_node))
1954 list_move_tail(&cset->mg_dst_cset->mg_node,
1955 &tset.dst_csets);
2024 next: 1956 next:
2025 if (!threadgroup) 1957 if (!threadgroup)
2026 break; 1958 break;
2027 } while_each_thread(leader, tsk); 1959 } while_each_thread(leader, task);
2028 rcu_read_unlock(); 1960 rcu_read_unlock();
2029 /* remember the number of threads in the array for later. */ 1961 up_write(&css_set_rwsem);
2030 group_size = i;
2031 tset.tc_array = group;
2032 tset.tc_array_len = group_size;
2033 1962
2034 /* methods shouldn't be called if no task is actually migrating */ 1963 /* methods shouldn't be called if no task is actually migrating */
2035 retval = 0; 1964 if (list_empty(&tset.src_csets))
2036 if (!group_size) 1965 return 0;
2037 goto out_free_group_list;
2038 1966
2039 /* 1967 /* check that we can legitimately attach to the cgroup */
2040 * step 1: check that we can legitimately attach to the cgroup.
2041 */
2042 for_each_css(css, i, cgrp) { 1968 for_each_css(css, i, cgrp) {
2043 if (css->ss->can_attach) { 1969 if (css->ss->can_attach) {
2044 retval = css->ss->can_attach(css, &tset); 1970 ret = css->ss->can_attach(css, &tset);
2045 if (retval) { 1971 if (ret) {
2046 failed_css = css; 1972 failed_css = css;
2047 goto out_cancel_attach; 1973 goto out_cancel_attach;
2048 } 1974 }
@@ -2050,70 +1976,91 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 } 1976 }
2051 1977
2052 /* 1978 /*
2053 * step 2: make sure css_sets exist for all threads to be migrated. 1979 * Now that we're guaranteed success, proceed to move all tasks to
2054 * we use find_css_set, which allocates a new one if necessary. 1980 * the new cgroup. There are no failure cases after here, so this
1981 * is the commit point.
2055 */ 1982 */
2056 for (i = 0; i < group_size; i++) { 1983 down_write(&css_set_rwsem);
2057 struct css_set *old_cset; 1984 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2058 1985 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2059 tc = flex_array_get(group, i); 1986 cgroup_task_migrate(cset->mg_src_cgrp, task,
2060 old_cset = task_css_set(tc->task); 1987 cset->mg_dst_cset);
2061 tc->cset = find_css_set(old_cset, cgrp);
2062 if (!tc->cset) {
2063 retval = -ENOMEM;
2064 goto out_put_css_set_refs;
2065 }
2066 } 1988 }
1989 up_write(&css_set_rwsem);
2067 1990
2068 /* 1991 /*
2069 * step 3: now that we're guaranteed success wrt the css_sets, 1992 * Migration is committed, all target tasks are now on dst_csets.
2070 * proceed to move all tasks to the new cgroup. There are no 1993 * Nothing is sensitive to fork() after this point. Notify
2071 * failure cases after here, so this is the commit point. 1994 * controllers that migration is complete.
2072 */ 1995 */
2073 for (i = 0; i < group_size; i++) { 1996 tset.csets = &tset.dst_csets;
2074 tc = flex_array_get(group, i);
2075 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2076 }
2077 /* nothing is sensitive to fork() after this point. */
2078 1997
2079 /*
2080 * step 4: do subsystem attach callbacks.
2081 */
2082 for_each_css(css, i, cgrp) 1998 for_each_css(css, i, cgrp)
2083 if (css->ss->attach) 1999 if (css->ss->attach)
2084 css->ss->attach(css, &tset); 2000 css->ss->attach(css, &tset);
2085 2001
2086 /* 2002 ret = 0;
2087 * step 5: success! and cleanup 2003 goto out_release_tset;
2088 */ 2004
2089 retval = 0;
2090out_put_css_set_refs:
2091 if (retval) {
2092 for (i = 0; i < group_size; i++) {
2093 tc = flex_array_get(group, i);
2094 if (!tc->cset)
2095 break;
2096 put_css_set(tc->cset);
2097 }
2098 }
2099out_cancel_attach: 2005out_cancel_attach:
2100 if (retval) { 2006 for_each_css(css, i, cgrp) {
2101 for_each_css(css, i, cgrp) { 2007 if (css == failed_css)
2102 if (css == failed_css) 2008 break;
2103 break; 2009 if (css->ss->cancel_attach)
2104 if (css->ss->cancel_attach) 2010 css->ss->cancel_attach(css, &tset);
2105 css->ss->cancel_attach(css, &tset);
2106 }
2107 } 2011 }
2108out_free_group_list: 2012out_release_tset:
2109 flex_array_free(group); 2013 down_write(&css_set_rwsem);
2110 return retval; 2014 list_splice_init(&tset.dst_csets, &tset.src_csets);
2015 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2016 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2017 list_del_init(&cset->mg_node);
2018 }
2019 up_write(&css_set_rwsem);
2020 return ret;
2021}
2022
2023/**
2024 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2025 * @dst_cgrp: the cgroup to attach to
2026 * @leader: the task or the leader of the threadgroup to be attached
2027 * @threadgroup: attach the whole threadgroup?
2028 *
2029 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2030 */
2031static int cgroup_attach_task(struct cgroup *dst_cgrp,
2032 struct task_struct *leader, bool threadgroup)
2033{
2034 LIST_HEAD(preloaded_csets);
2035 struct task_struct *task;
2036 int ret;
2037
2038 /* look up all src csets */
2039 down_read(&css_set_rwsem);
2040 rcu_read_lock();
2041 task = leader;
2042 do {
2043 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2044 &preloaded_csets);
2045 if (!threadgroup)
2046 break;
2047 } while_each_thread(leader, task);
2048 rcu_read_unlock();
2049 up_read(&css_set_rwsem);
2050
2051 /* prepare dst csets and commit */
2052 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2053 if (!ret)
2054 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2055
2056 cgroup_migrate_finish(&preloaded_csets);
2057 return ret;
2111} 2058}
2112 2059
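Per the comment above, the caller is responsible for the locking. A sketch of attaching a whole threadgroup, assuming @leader and @dst_cgrp are already pinned (in the real callers cgroup_mutex is taken via cgroup_lock_live_group(), as attach_task_by_pid() below arranges):

    mutex_lock(&cgroup_mutex);
    threadgroup_lock(leader);
    ret = cgroup_attach_task(dst_cgrp, leader, true);
    threadgroup_unlock(leader);
    mutex_unlock(&cgroup_mutex);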
2113/* 2060/*
2114 * Find the task_struct of the task to attach by vpid and pass it along to the 2061 * Find the task_struct of the task to attach by vpid and pass it along to the
2115 * function to attach either it or all tasks in its threadgroup. Will lock 2062 * function to attach either it or all tasks in its threadgroup. Will lock
2116 * cgroup_mutex and threadgroup; may take task_lock of task. 2063 * cgroup_mutex and threadgroup.
2117 */ 2064 */
2118static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2065static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2119{ 2066{
@@ -2198,12 +2145,19 @@ out_unlock_cgroup:
2198 */ 2145 */
2199int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2146int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2200{ 2147{
2201 struct cgroupfs_root *root; 2148 struct cgroup_root *root;
2202 int retval = 0; 2149 int retval = 0;
2203 2150
2204 mutex_lock(&cgroup_mutex); 2151 mutex_lock(&cgroup_mutex);
2205 for_each_active_root(root) { 2152 for_each_root(root) {
2206 struct cgroup *from_cgrp = task_cgroup_from_root(from, root); 2153 struct cgroup *from_cgrp;
2154
2155 if (root == &cgrp_dfl_root)
2156 continue;
2157
2158 down_read(&css_set_rwsem);
2159 from_cgrp = task_cgroup_from_root(from, root);
2160 up_read(&css_set_rwsem);
2207 2161
2208 retval = cgroup_attach_task(from_cgrp, tsk, false); 2162 retval = cgroup_attach_task(from_cgrp, tsk, false);
2209 if (retval) 2163 if (retval)
@@ -2228,16 +2182,17 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css,
2228} 2182}
2229 2183
2230static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2184static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2231 struct cftype *cft, const char *buffer) 2185 struct cftype *cft, char *buffer)
2232{ 2186{
2233 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); 2187 struct cgroup_root *root = css->cgroup->root;
2234 if (strlen(buffer) >= PATH_MAX) 2188
2235 return -EINVAL; 2189 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2236 if (!cgroup_lock_live_group(css->cgroup)) 2190 if (!cgroup_lock_live_group(css->cgroup))
2237 return -ENODEV; 2191 return -ENODEV;
2238 mutex_lock(&cgroup_root_mutex); 2192 spin_lock(&release_agent_path_lock);
2239 strcpy(css->cgroup->root->release_agent_path, buffer); 2193 strlcpy(root->release_agent_path, buffer,
2240 mutex_unlock(&cgroup_root_mutex); 2194 sizeof(root->release_agent_path));
2195 spin_unlock(&release_agent_path_lock);
2241 mutex_unlock(&cgroup_mutex); 2196 mutex_unlock(&cgroup_mutex);
2242 return 0; 2197 return 0;
2243} 2198}
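With cgroup_root_mutex gone, release_agent_path is now protected by the dedicated release_agent_path_lock spinlock, and readers follow the same discipline. A hypothetical reader-side helper:

    static void copy_release_agent(struct cgroup_root *root,
                                   char *buf, size_t buflen)
    {
            spin_lock(&release_agent_path_lock);
            strlcpy(buf, root->release_agent_path, buflen);
            spin_unlock(&release_agent_path_lock);
    }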
@@ -2262,32 +2217,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2262 return 0; 2217 return 0;
2263} 2218}
2264 2219
2265/* A buffer size big enough for numbers or short strings */ 2220static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2266#define CGROUP_LOCAL_BUFFER_SIZE 64 2221 size_t nbytes, loff_t off)
2267
2268static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2269 size_t nbytes, loff_t *ppos)
2270{ 2222{
2271 struct cfent *cfe = __d_cfe(file->f_dentry); 2223 struct cgroup *cgrp = of->kn->parent->priv;
2272 struct cftype *cft = __d_cft(file->f_dentry); 2224 struct cftype *cft = of->kn->priv;
2273 struct cgroup_subsys_state *css = cfe->css; 2225 struct cgroup_subsys_state *css;
2274 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2275 char *buf;
2276 int ret; 2226 int ret;
2277 2227
2278 if (nbytes >= max_bytes) 2228 /*
2279 return -E2BIG; 2229 * kernfs guarantees that a file isn't deleted with operations in
2280 2230 * flight, which means that the matching css is and stays alive and
2281 buf = kmalloc(nbytes + 1, GFP_KERNEL); 2231 * doesn't need to be pinned. The RCU locking is not necessary
2282 if (!buf) 2232 * either. It's just for the convenience of using cgroup_css().
2283 return -ENOMEM; 2233 */
2284 2234 rcu_read_lock();
2285 if (copy_from_user(buf, userbuf, nbytes)) { 2235 css = cgroup_css(cgrp, cft->ss);
2286 ret = -EFAULT; 2236 rcu_read_unlock();
2287 goto out_free;
2288 }
2289
2290 buf[nbytes] = '\0';
2291 2237
2292 if (cft->write_string) { 2238 if (cft->write_string) {
2293 ret = cft->write_string(css, cft, strstrip(buf)); 2239 ret = cft->write_string(css, cft, strstrip(buf));
@@ -2306,53 +2252,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2306 } else { 2252 } else {
2307 ret = -EINVAL; 2253 ret = -EINVAL;
2308 } 2254 }
2309out_free: 2255
2310 kfree(buf);
2311 return ret ?: nbytes; 2256 return ret ?: nbytes;
2312} 2257}
2313 2258
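cgroup_file_write() is now a kernfs write op: kernfs copies and NUL-terminates the user buffer (bounded by atomic_write_len = PAGE_SIZE) and the handler sees it after strstrip(). A hypothetical cftype using the string handler (names are illustrative):

    static int example_write_string(struct cgroup_subsys_state *css,
                                    struct cftype *cft, char *buf)
    {
            /* buf arrives already stripped of leading/trailing whitespace */
            pr_info("%s: wrote '%s'\n", cft->name, buf);
            return 0;
    }

    static struct cftype example_files[] = {
            {
                    .name = "example",
                    .write_string = example_write_string,
            },
            { }     /* terminator - cgroup_addrm_files() stops at an empty name */
    };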
2314/*
2315 * seqfile ops/methods for returning structured data. Currently just
2316 * supports string->u64 maps, but can be extended in future.
2317 */
2318
2319static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2259static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2320{ 2260{
2321 struct cftype *cft = seq_cft(seq); 2261 return seq_cft(seq)->seq_start(seq, ppos);
2322
2323 if (cft->seq_start) {
2324 return cft->seq_start(seq, ppos);
2325 } else {
2326 /*
2327 * The same behavior and code as single_open(). Returns
2328 * !NULL if pos is at the beginning; otherwise, NULL.
2329 */
2330 return NULL + !*ppos;
2331 }
2332} 2262}
2333 2263
2334static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2264static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2335{ 2265{
2336 struct cftype *cft = seq_cft(seq); 2266 return seq_cft(seq)->seq_next(seq, v, ppos);
2337
2338 if (cft->seq_next) {
2339 return cft->seq_next(seq, v, ppos);
2340 } else {
2341 /*
2342 * The same behavior and code as single_open(), always
2343 * terminate after the initial read.
2344 */
2345 ++*ppos;
2346 return NULL;
2347 }
2348} 2267}
2349 2268
2350static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2269static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2351{ 2270{
2352 struct cftype *cft = seq_cft(seq); 2271 seq_cft(seq)->seq_stop(seq, v);
2353
2354 if (cft->seq_stop)
2355 cft->seq_stop(seq, v);
2356} 2272}
2357 2273
2358static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2274static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2372,96 +2288,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2372 return 0; 2288 return 0;
2373} 2289}
2374 2290
2375static struct seq_operations cgroup_seq_operations = { 2291static struct kernfs_ops cgroup_kf_single_ops = {
2376 .start = cgroup_seqfile_start, 2292 .atomic_write_len = PAGE_SIZE,
2377 .next = cgroup_seqfile_next, 2293 .write = cgroup_file_write,
2378 .stop = cgroup_seqfile_stop, 2294 .seq_show = cgroup_seqfile_show,
2379 .show = cgroup_seqfile_show,
2380}; 2295};
2381 2296
2382static int cgroup_file_open(struct inode *inode, struct file *file) 2297static struct kernfs_ops cgroup_kf_ops = {
2383{ 2298 .atomic_write_len = PAGE_SIZE,
2384 struct cfent *cfe = __d_cfe(file->f_dentry); 2299 .write = cgroup_file_write,
2385 struct cftype *cft = __d_cft(file->f_dentry); 2300 .seq_start = cgroup_seqfile_start,
2386 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2301 .seq_next = cgroup_seqfile_next,
2387 struct cgroup_subsys_state *css; 2302 .seq_stop = cgroup_seqfile_stop,
2388 struct cgroup_open_file *of; 2303 .seq_show = cgroup_seqfile_show,
2389 int err; 2304};
2390
2391 err = generic_file_open(inode, file);
2392 if (err)
2393 return err;
2394
2395 /*
2396 * If the file belongs to a subsystem, pin the css. Will be
2397 * unpinned either on open failure or release. This ensures that
2398 * @css stays alive for all file operations.
2399 */
2400 rcu_read_lock();
2401 css = cgroup_css(cgrp, cft->ss);
2402 if (cft->ss && !css_tryget(css))
2403 css = NULL;
2404 rcu_read_unlock();
2405
2406 if (!css)
2407 return -ENODEV;
2408
2409 /*
2410 * @cfe->css is used by read/write/close to determine the
2411 * associated css. @file->private_data would be a better place but
2412 * that's already used by seqfile. Multiple accessors may use it
2413 * simultaneously which is okay as the association never changes.
2414 */
2415 WARN_ON_ONCE(cfe->css && cfe->css != css);
2416 cfe->css = css;
2417
2418 of = __seq_open_private(file, &cgroup_seq_operations,
2419 sizeof(struct cgroup_open_file));
2420 if (of) {
2421 of->cfe = cfe;
2422 return 0;
2423 }
2424
2425 if (css->ss)
2426 css_put(css);
2427 return -ENOMEM;
2428}
2429
2430static int cgroup_file_release(struct inode *inode, struct file *file)
2431{
2432 struct cfent *cfe = __d_cfe(file->f_dentry);
2433 struct cgroup_subsys_state *css = cfe->css;
2434
2435 if (css->ss)
2436 css_put(css);
2437 return seq_release_private(inode, file);
2438}
2439 2305
2440/* 2306/*
2441 * cgroup_rename - Only allow simple rename of directories in place. 2307 * cgroup_rename - Only allow simple rename of directories in place.
2442 */ 2308 */
2443static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2309static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2444 struct inode *new_dir, struct dentry *new_dentry) 2310 const char *new_name_str)
2445{ 2311{
2312 struct cgroup *cgrp = kn->priv;
2446 int ret; 2313 int ret;
2447 struct cgroup_name *name, *old_name;
2448 struct cgroup *cgrp;
2449
2450 /*
2451 * It's convenient to use the parent dir's i_mutex to protect
2452 * cgrp->name.
2453 */
2454 lockdep_assert_held(&old_dir->i_mutex);
2455 2314
2456 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2315 if (kernfs_type(kn) != KERNFS_DIR)
2457 return -ENOTDIR; 2316 return -ENOTDIR;
2458 if (new_dentry->d_inode) 2317 if (kn->parent != new_parent)
2459 return -EEXIST;
2460 if (old_dir != new_dir)
2461 return -EIO; 2318 return -EIO;
2462 2319
2463 cgrp = __d_cgrp(old_dentry);
2464
2465 /* 2320 /*
2466 * This isn't a proper migration and its usefulness is very 2321 * This isn't a proper migration and its usefulness is very
2467 * limited. Disallow if sane_behavior. 2322 * limited. Disallow if sane_behavior.
@@ -2469,218 +2324,40 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2469 if (cgroup_sane_behavior(cgrp)) 2324 if (cgroup_sane_behavior(cgrp))
2470 return -EPERM; 2325 return -EPERM;
2471 2326
2472 name = cgroup_alloc_name(new_dentry); 2327 /*
2473 if (!name) 2328 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2474 return -ENOMEM; 2329 * active_ref. kernfs_rename() doesn't require active_ref
2475 2330 * protection. Break them before grabbing cgroup_tree_mutex.
2476 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2331 */
2477 if (ret) { 2332 kernfs_break_active_protection(new_parent);
2478 kfree(name); 2333 kernfs_break_active_protection(kn);
2479 return ret;
2480 }
2481
2482 old_name = rcu_dereference_protected(cgrp->name, true);
2483 rcu_assign_pointer(cgrp->name, name);
2484
2485 kfree_rcu(old_name, rcu_head);
2486 return 0;
2487}
2488
2489static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2490{
2491 if (S_ISDIR(dentry->d_inode->i_mode))
2492 return &__d_cgrp(dentry)->xattrs;
2493 else
2494 return &__d_cfe(dentry)->xattrs;
2495}
2496
2497static inline int xattr_enabled(struct dentry *dentry)
2498{
2499 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2500 return root->flags & CGRP_ROOT_XATTR;
2501}
2502
2503static bool is_valid_xattr(const char *name)
2504{
2505 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2506 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2507 return true;
2508 return false;
2509}
2510
2511static int cgroup_setxattr(struct dentry *dentry, const char *name,
2512 const void *val, size_t size, int flags)
2513{
2514 if (!xattr_enabled(dentry))
2515 return -EOPNOTSUPP;
2516 if (!is_valid_xattr(name))
2517 return -EINVAL;
2518 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2519}
2520
2521static int cgroup_removexattr(struct dentry *dentry, const char *name)
2522{
2523 if (!xattr_enabled(dentry))
2524 return -EOPNOTSUPP;
2525 if (!is_valid_xattr(name))
2526 return -EINVAL;
2527 return simple_xattr_remove(__d_xattrs(dentry), name);
2528}
2529
2530static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2531 void *buf, size_t size)
2532{
2533 if (!xattr_enabled(dentry))
2534 return -EOPNOTSUPP;
2535 if (!is_valid_xattr(name))
2536 return -EINVAL;
2537 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2538}
2539
2540static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2541{
2542 if (!xattr_enabled(dentry))
2543 return -EOPNOTSUPP;
2544 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2545}
2546
2547static const struct file_operations cgroup_file_operations = {
2548 .read = seq_read,
2549 .write = cgroup_file_write,
2550 .llseek = generic_file_llseek,
2551 .open = cgroup_file_open,
2552 .release = cgroup_file_release,
2553};
2554
2555static const struct inode_operations cgroup_file_inode_operations = {
2556 .setxattr = cgroup_setxattr,
2557 .getxattr = cgroup_getxattr,
2558 .listxattr = cgroup_listxattr,
2559 .removexattr = cgroup_removexattr,
2560};
2561
2562static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = simple_lookup,
2564 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename,
2567 .setxattr = cgroup_setxattr,
2568 .getxattr = cgroup_getxattr,
2569 .listxattr = cgroup_listxattr,
2570 .removexattr = cgroup_removexattr,
2571};
2572
2573static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2574 struct super_block *sb)
2575{
2576 struct inode *inode;
2577
2578 if (!dentry)
2579 return -ENOENT;
2580 if (dentry->d_inode)
2581 return -EEXIST;
2582
2583 inode = cgroup_new_inode(mode, sb);
2584 if (!inode)
2585 return -ENOMEM;
2586
2587 if (S_ISDIR(mode)) {
2588 inode->i_op = &cgroup_dir_inode_operations;
2589 inode->i_fop = &simple_dir_operations;
2590
2591 /* start off with i_nlink == 2 (for "." entry) */
2592 inc_nlink(inode);
2593 inc_nlink(dentry->d_parent->d_inode);
2594
2595 /*
2596 * Control reaches here with cgroup_mutex held.
2597 * @inode->i_mutex should nest outside cgroup_mutex but we
2598 * want to populate it immediately without releasing
2599 * cgroup_mutex. As @inode isn't visible to anyone else
2600 * yet, trylock will always succeed without affecting
2601 * lockdep checks.
2602 */
2603 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2604 } else if (S_ISREG(mode)) {
2605 inode->i_size = 0;
2606 inode->i_fop = &cgroup_file_operations;
2607 inode->i_op = &cgroup_file_inode_operations;
2608 }
2609 d_instantiate(dentry, inode);
2610 dget(dentry); /* Extra count - pin the dentry in core */
2611 return 0;
2612}
2613
2614/**
2615 * cgroup_file_mode - deduce file mode of a control file
2616 * @cft: the control file in question
2617 *
2618 * returns cft->mode if ->mode is not 0
2619 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2620 * returns S_IRUGO if it has only a read handler
2621 * returns S_IWUSR if it has only a write handler
2622 */
2623static umode_t cgroup_file_mode(const struct cftype *cft)
2624{
2625 umode_t mode = 0;
2626
2627 if (cft->mode)
2628 return cft->mode;
2629
2630 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
2631 mode |= S_IRUGO;
2632
2633 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
2634 cft->trigger)
2635 mode |= S_IWUSR;
2636
2637 return mode;
2638}
2639
2640static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2641{
2642 struct dentry *dir = cgrp->dentry;
2643 struct cgroup *parent = __d_cgrp(dir);
2644 struct dentry *dentry;
2645 struct cfent *cfe;
2646 int error;
2647 umode_t mode;
2648 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2649
2650 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2651 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2652 strcpy(name, cft->ss->name);
2653 strcat(name, ".");
2654 }
2655 strcat(name, cft->name);
2656
2657 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2658
2659 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2660 if (!cfe)
2661 return -ENOMEM;
2662
2663 dentry = lookup_one_len(name, dir, strlen(name));
2664 if (IS_ERR(dentry)) {
2665 error = PTR_ERR(dentry);
2666 goto out;
2667 }
2668
2669 cfe->type = (void *)cft;
2670 cfe->dentry = dentry;
2671 dentry->d_fsdata = cfe;
2672 simple_xattrs_init(&cfe->xattrs);
2673
2674 mode = cgroup_file_mode(cft);
2675 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2676 if (!error) {
2677 list_add_tail(&cfe->node, &parent->files);
2678 cfe = NULL;
2679 }
2680 dput(dentry);
2681out:
2682 kfree(cfe);
2683 return error;
2684}
2685
2334
2335 mutex_lock(&cgroup_tree_mutex);
2336 mutex_lock(&cgroup_mutex);
2337
2338 ret = kernfs_rename(kn, new_parent, new_name_str);
2339
2340 mutex_unlock(&cgroup_mutex);
2341 mutex_unlock(&cgroup_tree_mutex);
2342
2343 kernfs_unbreak_active_protection(kn);
2344 kernfs_unbreak_active_protection(new_parent);
2345 return ret;
2346}
2347
2348static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2349{
2350 char name[CGROUP_FILE_NAME_MAX];
2351 struct kernfs_node *kn;
2352 struct lock_class_key *key = NULL;
2353
2354#ifdef CONFIG_DEBUG_LOCK_ALLOC
2355 key = &cft->lockdep_key;
2356#endif
2357 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2358 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2359 NULL, false, key);
2360 return PTR_ERR_OR_ZERO(kn);
2361}
2362
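To make the rule spelled out in the cgroup_file_mode() comment above concrete, here is a small, self-contained userspace restatement of the mode deduction. It is illustrative only: the struct is a stand-in for the few cftype fields the rule consults, not the kernel's cftype, and the DEMO_* macros merely mirror S_IRUGO/S_IWUSR.

#include <stdio.h>

#define DEMO_IRUGO 0444		/* world-readable, like S_IRUGO */
#define DEMO_IWUSR 0200		/* owner-writable, like S_IWUSR */

/* Stand-in for the cftype fields the mode rule looks at. */
struct demo_cft {
	unsigned int mode;	/* explicit mode; 0 means "deduce it" */
	int has_read_handler;	/* read_u64/read_s64/seq_show present */
	int has_write_handler;	/* write_u64/write_s64/write_string/trigger */
};

static unsigned int demo_file_mode(const struct demo_cft *cft)
{
	unsigned int mode = 0;

	if (cft->mode)
		return cft->mode;	/* an explicit mode always wins */
	if (cft->has_read_handler)
		mode |= DEMO_IRUGO;
	if (cft->has_write_handler)
		mode |= DEMO_IWUSR;
	return mode;
}

int main(void)
{
	struct demo_cft rw = { .has_read_handler = 1, .has_write_handler = 1 };
	struct demo_cft ro = { .has_read_handler = 1 };

	printf("read-write control file: %04o\n", demo_file_mode(&rw)); /* 0644 */
	printf("read-only control file:  %04o\n", demo_file_mode(&ro)); /* 0444 */
	return 0;
}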
2686/** 2363/**
@@ -2700,11 +2377,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2700 struct cftype *cft; 2377 struct cftype *cft;
2701 int ret; 2378 int ret;
2702 2379
2703 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 2380 lockdep_assert_held(&cgroup_tree_mutex);
2704 lockdep_assert_held(&cgroup_mutex);
2705 2381
2706 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2382 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2707 /* does cft->flags tell us to skip this file on @cgrp? */ 2383 /* does cft->flags tell us to skip this file on @cgrp? */
2384 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2385 continue;
2708 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2386 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2709 continue; 2387 continue;
2710 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2388 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
@@ -2726,44 +2404,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2726 return 0; 2404 return 0;
2727} 2405}
2728 2406
2729static void cgroup_cfts_prepare(void) 2407static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2730 __acquires(&cgroup_mutex)
2731{
2732 /*
2733 * Thanks to the entanglement with vfs inode locking, we can't walk
2734 * the existing cgroups under cgroup_mutex and create files.
2735 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2736 * lock before calling cgroup_addrm_files().
2737 */
2738 mutex_lock(&cgroup_mutex);
2739}
2740
2741static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2742 __releases(&cgroup_mutex)
2743{ 2408{
2744 LIST_HEAD(pending); 2409 LIST_HEAD(pending);
2745 struct cgroup_subsys *ss = cfts[0].ss; 2410 struct cgroup_subsys *ss = cfts[0].ss;
2746 struct cgroup *root = &ss->root->top_cgroup; 2411 struct cgroup *root = &ss->root->cgrp;
2747 struct super_block *sb = ss->root->sb;
2748 struct dentry *prev = NULL;
2749 struct inode *inode;
2750 struct cgroup_subsys_state *css; 2412 struct cgroup_subsys_state *css;
2751 u64 update_before;
2752 int ret = 0; 2413 int ret = 0;
2753 2414
2754 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2415 lockdep_assert_held(&cgroup_tree_mutex);
2755 if (!cfts || ss->root == &cgroup_dummy_root ||
2756 !atomic_inc_not_zero(&sb->s_active)) {
2757 mutex_unlock(&cgroup_mutex);
2758 return 0;
2759 }
2760 2416
2761 /* 2417 /* don't bother if @ss isn't attached */
2762 * All cgroups which are created after we drop cgroup_mutex will 2418 if (ss->root == &cgrp_dfl_root)
2763 * have the updated set of files, so we only need to update the 2419 return 0;
2764 * cgroups created before the current @cgroup_serial_nr_next.
2765 */
2766 update_before = cgroup_serial_nr_next;
2767 2420
2768 /* add/rm files for all cgroups created before */ 2421 /* add/rm files for all cgroups created before */
2769 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2422 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,62 +2425,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2772 if (cgroup_is_dead(cgrp)) 2425 if (cgroup_is_dead(cgrp))
2773 continue; 2426 continue;
2774 2427
2775 inode = cgrp->dentry->d_inode; 2428 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2776 dget(cgrp->dentry);
2777 dput(prev);
2778 prev = cgrp->dentry;
2779
2780 mutex_unlock(&cgroup_mutex);
2781 mutex_lock(&inode->i_mutex);
2782 mutex_lock(&cgroup_mutex);
2783 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2784 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2785 mutex_unlock(&inode->i_mutex);
2786 if (ret) 2429 if (ret)
2787 break; 2430 break;
2788 } 2431 }
2789 mutex_unlock(&cgroup_mutex); 2432
2790 dput(prev); 2433 if (is_add && !ret)
2791 deactivate_super(sb); 2434 kernfs_activate(root->kn);
2792 return ret; 2435 return ret;
2793} 2436}
2794 2437
2795/** 2438static void cgroup_exit_cftypes(struct cftype *cfts)
2796 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2797 * @ss: target cgroup subsystem
2798 * @cfts: zero-length name terminated array of cftypes
2799 *
2800 * Register @cfts to @ss. Files described by @cfts are created for all
2801 * existing cgroups to which @ss is attached and all future cgroups will
2802 * have them too. This function can be called anytime whether @ss is
2803 * attached or not.
2804 *
2805 * Returns 0 on successful registration, -errno on failure. Note that this
2806 * function currently returns 0 as long as @cfts registration is successful
2807 * even if some file creation attempts on existing cgroups fail.
2808 */
2809int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2810{ 2439{
2811 struct cftype_set *set;
2812 struct cftype *cft; 2440 struct cftype *cft;
2813 int ret;
2814 2441
2815 set = kzalloc(sizeof(*set), GFP_KERNEL); 2442 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2816 if (!set) 2443 /* free copy for custom atomic_write_len, see init_cftypes() */
2817 return -ENOMEM; 2444 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2445 kfree(cft->kf_ops);
2446 cft->kf_ops = NULL;
2447 cft->ss = NULL;
2448 }
2449}
2818 2450
2819 for (cft = cfts; cft->name[0] != '\0'; cft++) 2451static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2452{
2453 struct cftype *cft;
2454
2455 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2456 struct kernfs_ops *kf_ops;
2457
2458 WARN_ON(cft->ss || cft->kf_ops);
2459
2460 if (cft->seq_start)
2461 kf_ops = &cgroup_kf_ops;
2462 else
2463 kf_ops = &cgroup_kf_single_ops;
2464
2465 /*
2466 * Ugh... if @cft wants a custom max_write_len, we need to
2467 * make a copy of kf_ops to set its atomic_write_len.
2468 */
2469 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2470 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2471 if (!kf_ops) {
2472 cgroup_exit_cftypes(cfts);
2473 return -ENOMEM;
2474 }
2475 kf_ops->atomic_write_len = cft->max_write_len;
2476 }
2477
2478 cft->kf_ops = kf_ops;
2820 cft->ss = ss; 2479 cft->ss = ss;
2480 }
2821 2481
2822 cgroup_cfts_prepare(); 2482 return 0;
2823 set->cfts = cfts; 2483}
2824 list_add_tail(&set->node, &ss->cftsets); 2484
2825 ret = cgroup_cfts_commit(cfts, true); 2485static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2826 if (ret) 2486{
2827 cgroup_rm_cftypes(cfts); 2487 lockdep_assert_held(&cgroup_tree_mutex);
2828 return ret; 2488
2489 if (!cfts || !cfts[0].ss)
2490 return -ENOENT;
2491
2492 list_del(&cfts->node);
2493 cgroup_apply_cftypes(cfts, false);
2494 cgroup_exit_cftypes(cfts);
2495 return 0;
2829} 2496}
2830EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2831 2497
2832/** 2498/**
2833 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2499 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
@@ -2842,24 +2508,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2842 */ 2508 */
2843int cgroup_rm_cftypes(struct cftype *cfts) 2509int cgroup_rm_cftypes(struct cftype *cfts)
2844{ 2510{
2845 struct cftype_set *set; 2511 int ret;
2846 2512
2847 if (!cfts || !cfts[0].ss) 2513 mutex_lock(&cgroup_tree_mutex);
2848 return -ENOENT; 2514 ret = cgroup_rm_cftypes_locked(cfts);
2515 mutex_unlock(&cgroup_tree_mutex);
2516 return ret;
2517}
2849 2518
2850 cgroup_cfts_prepare(); 2519/**
2520 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2521 * @ss: target cgroup subsystem
2522 * @cfts: zero-length name terminated array of cftypes
2523 *
2524 * Register @cfts to @ss. Files described by @cfts are created for all
2525 * existing cgroups to which @ss is attached and all future cgroups will
2526 * have them too. This function can be called anytime whether @ss is
2527 * attached or not.
2528 *
2529 * Returns 0 on successful registration, -errno on failure. Note that this
2530 * function currently returns 0 as long as @cfts registration is successful
2531 * even if some file creation attempts on existing cgroups fail.
2532 */
2533int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2534{
2535 int ret;
2851 2536
2852 list_for_each_entry(set, &cfts[0].ss->cftsets, node) { 2537 if (!cfts || cfts[0].name[0] == '\0')
2853 if (set->cfts == cfts) { 2538 return 0;
2854 list_del(&set->node); 2539
2855 kfree(set); 2540 ret = cgroup_init_cftypes(ss, cfts);
2856 cgroup_cfts_commit(cfts, false); 2541 if (ret)
2857 return 0; 2542 return ret;
2858 } 2543
2859 } 2544 mutex_lock(&cgroup_tree_mutex);
2860 2545
2861 cgroup_cfts_commit(NULL, false); 2546 list_add_tail(&cfts->node, &ss->cfts);
2862 return -ENOENT; 2547 ret = cgroup_apply_cftypes(cfts, true);
2548 if (ret)
2549 cgroup_rm_cftypes_locked(cfts);
2550
2551 mutex_unlock(&cgroup_tree_mutex);
2552 return ret;
2863} 2553}
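For orientation, the way a controller uses the rewritten cgroup_add_cftypes() is unchanged: declare a cftype array terminated by an entry with an empty name and register it once. The sketch below is hypothetical; the controller, the "weight" file, its handlers and the demo_cgrp_subsys symbol are made up for illustration and are not part of this patch. Only the field names and the registration call reflect the API shown above.

#include <linux/cgroup.h>
#include <linux/errno.h>
#include <linux/init.h>

/* Hypothetical read/write handlers for a made-up "demo.weight" file. */
static u64 demo_weight_read(struct cgroup_subsys_state *css,
			    struct cftype *cft)
{
	return 100;	/* a real controller would read state off @css */
}

static int demo_weight_write(struct cgroup_subsys_state *css,
			     struct cftype *cft, u64 val)
{
	return (val && val <= 1000) ? 0 : -ERANGE;
}

static struct cftype demo_files[] = {
	{
		.name = "weight",
		.read_u64 = demo_weight_read,
		.write_u64 = demo_weight_write,
	},
	{ }	/* terminate: registration stops at the empty name */
};

static int __init demo_register_files(void)
{
	/* demo_cgrp_subsys stands in for the registering controller */
	return cgroup_add_cftypes(&demo_cgrp_subsys, demo_files);
}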
2864 2554
2865/** 2555/**
@@ -2868,57 +2558,18 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2868 * 2558 *
2869 * Return the number of tasks in the cgroup. 2559 * Return the number of tasks in the cgroup.
2870 */ 2560 */
2871int cgroup_task_count(const struct cgroup *cgrp) 2561static int cgroup_task_count(const struct cgroup *cgrp)
2872{ 2562{
2873 int count = 0; 2563 int count = 0;
2874 struct cgrp_cset_link *link; 2564 struct cgrp_cset_link *link;
2875 2565
2876 read_lock(&css_set_lock); 2566 down_read(&css_set_rwsem);
2877 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2567 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2878 count += atomic_read(&link->cset->refcount); 2568 count += atomic_read(&link->cset->refcount);
2879 read_unlock(&css_set_lock); 2569 up_read(&css_set_rwsem);
2880 return count; 2570 return count;
2881} 2571}
2882 2572
2883/*
2884 * To reduce the fork() overhead for systems that are not actually using
2885 * their cgroups capability, we don't maintain the lists running through
2886 * each css_set to its tasks until we see the list actually used - in other
2887 * words after the first call to css_task_iter_start().
2888 */
2889static void cgroup_enable_task_cg_lists(void)
2890{
2891 struct task_struct *p, *g;
2892 write_lock(&css_set_lock);
2893 use_task_css_set_links = 1;
2894 /*
2895 * We need tasklist_lock because RCU is not safe against
2896 * while_each_thread(). Besides, a forking task that has passed
2897 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2898 * is not guaranteed to have its child immediately visible in the
2899 * tasklist if we walk through it with RCU.
2900 */
2901 read_lock(&tasklist_lock);
2902 do_each_thread(g, p) {
2903 task_lock(p);
2904 /*
2905 * We should check if the process is exiting, otherwise
2906 * it will race with cgroup_exit() in that the list
2907 * entry won't be deleted though the process has exited.
2908 * Do it while holding siglock so that we don't end up
2909 * racing against cgroup_exit().
2910 */
2911 spin_lock_irq(&p->sighand->siglock);
2912 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2913 list_add(&p->cg_list, &task_css_set(p)->tasks);
2914 spin_unlock_irq(&p->sighand->siglock);
2915
2916 task_unlock(p);
2917 } while_each_thread(g, p);
2918 read_unlock(&tasklist_lock);
2919 write_unlock(&css_set_lock);
2920}
2921
2922/** 2573/**
2923 * css_next_child - find the next child of a given css 2574 * css_next_child - find the next child of a given css
2924 * @pos_css: the current position (%NULL to initiate traversal) 2575 * @pos_css: the current position (%NULL to initiate traversal)
@@ -2937,7 +2588,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2937 struct cgroup *cgrp = parent_css->cgroup; 2588 struct cgroup *cgrp = parent_css->cgroup;
2938 struct cgroup *next; 2589 struct cgroup *next;
2939 2590
2940 cgroup_assert_mutex_or_rcu_locked(); 2591 cgroup_assert_mutexes_or_rcu_locked();
2941 2592
2942 /* 2593 /*
2943 * @pos could already have been removed. Once a cgroup is removed, 2594 * @pos could already have been removed. Once a cgroup is removed,
@@ -2973,7 +2624,6 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2973 2624
2974 return cgroup_css(next, parent_css->ss); 2625 return cgroup_css(next, parent_css->ss);
2975} 2626}
2976EXPORT_SYMBOL_GPL(css_next_child);
2977 2627
2978/** 2628/**
2979 * css_next_descendant_pre - find the next descendant for pre-order walk 2629 * css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2995,7 +2645,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2995{ 2645{
2996 struct cgroup_subsys_state *next; 2646 struct cgroup_subsys_state *next;
2997 2647
2998 cgroup_assert_mutex_or_rcu_locked(); 2648 cgroup_assert_mutexes_or_rcu_locked();
2999 2649
3000 /* if first iteration, visit @root */ 2650 /* if first iteration, visit @root */
3001 if (!pos) 2651 if (!pos)
@@ -3016,7 +2666,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3016 2666
3017 return NULL; 2667 return NULL;
3018} 2668}
3019EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3020 2669
3021/** 2670/**
3022 * css_rightmost_descendant - return the rightmost descendant of a css 2671 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -3036,7 +2685,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3036{ 2685{
3037 struct cgroup_subsys_state *last, *tmp; 2686 struct cgroup_subsys_state *last, *tmp;
3038 2687
3039 cgroup_assert_mutex_or_rcu_locked(); 2688 cgroup_assert_mutexes_or_rcu_locked();
3040 2689
3041 do { 2690 do {
3042 last = pos; 2691 last = pos;
@@ -3048,7 +2697,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3048 2697
3049 return last; 2698 return last;
3050} 2699}
3051EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3052 2700
3053static struct cgroup_subsys_state * 2701static struct cgroup_subsys_state *
3054css_leftmost_descendant(struct cgroup_subsys_state *pos) 2702css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3084,7 +2732,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3084{ 2732{
3085 struct cgroup_subsys_state *next; 2733 struct cgroup_subsys_state *next;
3086 2734
3087 cgroup_assert_mutex_or_rcu_locked(); 2735 cgroup_assert_mutexes_or_rcu_locked();
3088 2736
3089 /* if first iteration, visit leftmost descendant which may be @root */ 2737 /* if first iteration, visit leftmost descendant which may be @root */
3090 if (!pos) 2738 if (!pos)
@@ -3102,7 +2750,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3102 /* no sibling left, visit parent */ 2750 /* no sibling left, visit parent */
3103 return css_parent(pos); 2751 return css_parent(pos);
3104} 2752}
3105EXPORT_SYMBOL_GPL(css_next_descendant_post);
3106 2753
3107/** 2754/**
3108 * css_advance_task_iter - advance a task iterator to the next css_set 2755
@@ -3125,9 +2772,14 @@ static void css_advance_task_iter(struct css_task_iter *it)
3125 } 2772 }
3126 link = list_entry(l, struct cgrp_cset_link, cset_link); 2773 link = list_entry(l, struct cgrp_cset_link, cset_link);
3127 cset = link->cset; 2774 cset = link->cset;
3128 } while (list_empty(&cset->tasks)); 2775 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2776
3129 it->cset_link = l; 2777 it->cset_link = l;
3130 it->task = cset->tasks.next; 2778
2779 if (!list_empty(&cset->tasks))
2780 it->task = cset->tasks.next;
2781 else
2782 it->task = cset->mg_tasks.next;
3131} 2783}
3132 2784
3133/** 2785/**
@@ -3146,17 +2798,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
3146 */ 2798 */
3147void css_task_iter_start(struct cgroup_subsys_state *css, 2799void css_task_iter_start(struct cgroup_subsys_state *css,
3148 struct css_task_iter *it) 2800 struct css_task_iter *it)
3149 __acquires(css_set_lock) 2801 __acquires(css_set_rwsem)
3150{ 2802{
3151 /* 2803 /* no one should try to iterate before mounting cgroups */
3152 * The first time anyone tries to iterate across a css, we need to 2804 WARN_ON_ONCE(!use_task_css_set_links);
3153 * enable the list linking each css_set to its tasks, and fix up
3154 * all existing tasks.
3155 */
3156 if (!use_task_css_set_links)
3157 cgroup_enable_task_cg_lists();
3158 2805
3159 read_lock(&css_set_lock); 2806 down_read(&css_set_rwsem);
3160 2807
3161 it->origin_css = css; 2808 it->origin_css = css;
3162 it->cset_link = &css->cgroup->cset_links; 2809 it->cset_link = &css->cgroup->cset_links;
@@ -3176,24 +2823,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3176{ 2823{
3177 struct task_struct *res; 2824 struct task_struct *res;
3178 struct list_head *l = it->task; 2825 struct list_head *l = it->task;
3179 struct cgrp_cset_link *link; 2826 struct cgrp_cset_link *link = list_entry(it->cset_link,
2827 struct cgrp_cset_link, cset_link);
3180 2828
3181 /* If the iterator cg is NULL, we have no tasks */ 2829 /* If the iterator cg is NULL, we have no tasks */
3182 if (!it->cset_link) 2830 if (!it->cset_link)
3183 return NULL; 2831 return NULL;
3184 res = list_entry(l, struct task_struct, cg_list); 2832 res = list_entry(l, struct task_struct, cg_list);
3185 /* Advance iterator to find next entry */ 2833
2834 /*
2835 * Advance iterator to find next entry. cset->tasks is consumed
2836 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
2837 * next cset.
2838 */
3186 l = l->next; 2839 l = l->next;
3187 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 2840
3188 if (l == &link->cset->tasks) { 2841 if (l == &link->cset->tasks)
3189 /* 2842 l = link->cset->mg_tasks.next;
3190 * We reached the end of this task list - move on to the 2843
3191 * next cgrp_cset_link. 2844 if (l == &link->cset->mg_tasks)
3192 */
3193 css_advance_task_iter(it); 2845 css_advance_task_iter(it);
3194 } else { 2846 else
3195 it->task = l; 2847 it->task = l;
3196 } 2848
3197 return res; 2849 return res;
3198} 2850}
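The rewritten iterator is still consumed with the same start/next/end pattern; the identical loop appears later in this patch in cgroup_transfer_tasks() and cgroupstats_build(). A minimal sketch of a caller (kernel context; the wrapper function itself is hypothetical):

static int demo_count_tasks(struct cgroup *cgrp)
{
	struct css_task_iter it;
	struct task_struct *task;
	int count = 0;

	/* css_set_rwsem is read-held from start to end - do not sleep here */
	css_task_iter_start(&cgrp->dummy_css, &it);
	while ((task = css_task_iter_next(&it)))
		count++;
	css_task_iter_end(&it);

	return count;
}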
3199 2851
@@ -3204,191 +2856,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3204 * Finish task iteration started by css_task_iter_start(). 2856 * Finish task iteration started by css_task_iter_start().
3205 */ 2857 */
3206void css_task_iter_end(struct css_task_iter *it) 2858void css_task_iter_end(struct css_task_iter *it)
3207 __releases(css_set_lock) 2859 __releases(css_set_rwsem)
3208{
3209 read_unlock(&css_set_lock);
3210}
3211
3212static inline int started_after_time(struct task_struct *t1,
3213 struct timespec *time,
3214 struct task_struct *t2)
3215{
3216 int start_diff = timespec_compare(&t1->start_time, time);
3217 if (start_diff > 0) {
3218 return 1;
3219 } else if (start_diff < 0) {
3220 return 0;
3221 } else {
3222 /*
3223 * Arbitrarily, if two processes started at the same
3224 * time, we'll say that the lower pointer value
3225 * started first. Note that t2 may have exited by now
3226 * so this may not be a valid pointer any longer, but
3227 * that's fine - it still serves to distinguish
3228 * between two tasks started (effectively) simultaneously.
3229 */
3230 return t1 > t2;
3231 }
3232}
3233
3234/*
3235 * This function is a callback from heap_insert() and is used to order
3236 * the heap.
3237 * In this case we order the heap in descending task start time.
3238 */
3239static inline int started_after(void *p1, void *p2)
3240{ 2860{
3241 struct task_struct *t1 = p1; 2861 up_read(&css_set_rwsem);
3242 struct task_struct *t2 = p2;
3243 return started_after_time(t1, &t2->start_time, t2);
3244} 2862}
3245 2863
3246/** 2864/**
3247 * css_scan_tasks - iterate through all the tasks in a css 2865 * cgroup_transfer_tasks - move tasks from one cgroup to another
3248 * @css: the css to iterate tasks of 2866 * @to: cgroup to which the tasks will be moved
3249 * @test: optional test callback 2867 * @from: cgroup in which the tasks currently reside
3250 * @process: process callback
3251 * @data: data passed to @test and @process
3252 * @heap: optional pre-allocated heap used for task iteration
3253 *
3254 * Iterate through all the tasks in @css, calling @test for each, and if it
3255 * returns %true, call @process for it also.
3256 *
3257 * @test may be NULL, meaning always true (select all tasks), which
3258 * effectively duplicates css_task_iter_{start,next,end}() but does not
3259 * lock css_set_lock for the call to @process.
3260 *
3261 * It is guaranteed that @process will act on every task that is a member
3262 * of @css for the duration of this call. This function may or may not
3263 * call @process for tasks that exit or move to a different css during the
3264 * call, or are forked or move into the css during the call.
3265 *
3266 * Note that @test may be called with locks held, and may in some
3267 * situations be called multiple times for the same task, so it should be
3268 * cheap.
3269 * 2868 *
3270 * If @heap is non-NULL, a heap has been pre-allocated and will be used for 2869 * Locking rules between cgroup_post_fork() and the migration path
3271 * heap operations (and its "gt" member will be overwritten), else a 2870 * guarantee that, if a task is forking while being migrated, the new child
3272 * temporary heap will be used (allocation of which may cause this function 2871 * is guaranteed to be either visible in the source cgroup after the
3273 * to fail). 2872 * parent's migration is complete or put into the target cgroup. No task
2873 * can slip out of migration through forking.
3274 */ 2874 */
3275int css_scan_tasks(struct cgroup_subsys_state *css, 2875int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3276 bool (*test)(struct task_struct *, void *),
3277 void (*process)(struct task_struct *, void *),
3278 void *data, struct ptr_heap *heap)
3279{ 2876{
3280 int retval, i; 2877 LIST_HEAD(preloaded_csets);
2878 struct cgrp_cset_link *link;
3281 struct css_task_iter it; 2879 struct css_task_iter it;
3282 struct task_struct *p, *dropped; 2880 struct task_struct *task;
3283 /* Never dereference latest_task, since it's not refcounted */ 2881 int ret;
3284 struct task_struct *latest_task = NULL;
3285 struct ptr_heap tmp_heap;
3286 struct timespec latest_time = { 0, 0 };
3287
3288 if (heap) {
3289 /* The caller supplied our heap and pre-allocated its memory */
3290 heap->gt = &started_after;
3291 } else {
3292 /* We need to allocate our own heap memory */
3293 heap = &tmp_heap;
3294 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3295 if (retval)
3296 /* cannot allocate the heap */
3297 return retval;
3298 }
3299 2882
3300 again: 2883 mutex_lock(&cgroup_mutex);
3301 /*
3302 * Scan tasks in the css, using the @test callback to determine
3303 * which are of interest, and invoking @process callback on the
3304 * ones which need an update. Since we don't want to hold any
3305 * locks during the task updates, gather tasks to be processed in a
3306 * heap structure. The heap is sorted by descending task start
3307 * time. If the statically-sized heap fills up, we overflow tasks
3308 * that started later, and in future iterations only consider tasks
3309 * that started after the latest task in the previous pass. This
3310 * guarantees forward progress and that we don't miss any tasks.
3311 */
3312 heap->size = 0;
3313 css_task_iter_start(css, &it);
3314 while ((p = css_task_iter_next(&it))) {
3315 /*
3316 * Only affect tasks that qualify per the caller's callback,
3317 * if he provided one
3318 */
3319 if (test && !test(p, data))
3320 continue;
3321 /*
3322 * Only process tasks that started after the last task
3323 * we processed
3324 */
3325 if (!started_after_time(p, &latest_time, latest_task))
3326 continue;
3327 dropped = heap_insert(heap, p);
3328 if (dropped == NULL) {
3329 /*
3330 * The new task was inserted; the heap wasn't
3331 * previously full
3332 */
3333 get_task_struct(p);
3334 } else if (dropped != p) {
3335 /*
3336 * The new task was inserted, and pushed out a
3337 * different task
3338 */
3339 get_task_struct(p);
3340 put_task_struct(dropped);
3341 }
3342 /*
3343 * Else the new task was newer than anything already in
3344 * the heap and wasn't inserted
3345 */
3346 }
3347 css_task_iter_end(&it);
3348 2884
3349 if (heap->size) { 2885 /* all tasks in @from are being moved, all csets are source */
3350 for (i = 0; i < heap->size; i++) { 2886 down_read(&css_set_rwsem);
3351 struct task_struct *q = heap->ptrs[i]; 2887 list_for_each_entry(link, &from->cset_links, cset_link)
3352 if (i == 0) { 2888 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3353 latest_time = q->start_time; 2889 up_read(&css_set_rwsem);
3354 latest_task = q;
3355 }
3356 /* Process the task per the caller's callback */
3357 process(q, data);
3358 put_task_struct(q);
3359 }
3360 /*
3361 * If we had to process any tasks at all, scan again
3362 * in case some of them were in the middle of forking
3363 * children that didn't get processed.
3364 * Not the most efficient way to do it, but it avoids
3365 * having to take callback_mutex in the fork path
3366 */
3367 goto again;
3368 }
3369 if (heap == &tmp_heap)
3370 heap_free(&tmp_heap);
3371 return 0;
3372}
3373 2890
3374static void cgroup_transfer_one_task(struct task_struct *task, void *data) 2891 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3375{ 2892 if (ret)
3376 struct cgroup *new_cgroup = data; 2893 goto out_err;
3377 2894
3378 mutex_lock(&cgroup_mutex); 2895 /*
3379 cgroup_attach_task(new_cgroup, task, false); 2896 * Migrate tasks one-by-one until @from is empty. This fails iff
2897 * ->can_attach() fails.
2898 */
2899 do {
2900 css_task_iter_start(&from->dummy_css, &it);
2901 task = css_task_iter_next(&it);
2902 if (task)
2903 get_task_struct(task);
2904 css_task_iter_end(&it);
2905
2906 if (task) {
2907 ret = cgroup_migrate(to, task, false);
2908 put_task_struct(task);
2909 }
2910 } while (task && !ret);
2911out_err:
2912 cgroup_migrate_finish(&preloaded_csets);
3380 mutex_unlock(&cgroup_mutex); 2913 mutex_unlock(&cgroup_mutex);
3381} 2914 return ret;
3382
3383/**
3384 * cgroup_transfer_tasks - move tasks from one cgroup to another
3385 * @to: cgroup to which the tasks will be moved
3386 * @from: cgroup in which the tasks currently reside
3387 */
3388int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3389{
3390 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3391 to, NULL);
3392} 2915}
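Note the argument order of the rewritten helper: tasks flow out of @from and into @to, and the transfer either completes or stops at the first ->can_attach() failure. A caller such as cpuset's hotplug path uses it roughly as in the hypothetical sketch below; the function name and surrounding logic are made up, only the cgroup_transfer_tasks() call reflects the API above.

/* Hypothetical: evacuate every task from @doomed into @shelter. */
static void demo_evacuate_cgroup(struct cgroup *shelter, struct cgroup *doomed)
{
	int ret = cgroup_transfer_tasks(shelter, doomed);

	if (ret)
		pr_warn("cgroup: failed to transfer tasks (%d)\n", ret);
}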
3393 2916
3394/* 2917/*
@@ -3687,21 +3210,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3687 */ 3210 */
3688int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3211int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3689{ 3212{
3690 int ret = -EINVAL; 3213 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3691 struct cgroup *cgrp; 3214 struct cgroup *cgrp;
3692 struct css_task_iter it; 3215 struct css_task_iter it;
3693 struct task_struct *tsk; 3216 struct task_struct *tsk;
3694 3217
3218 /* it should be kernfs_node belonging to cgroupfs and is a directory */
3219 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3220 kernfs_type(kn) != KERNFS_DIR)
3221 return -EINVAL;
3222
3223 mutex_lock(&cgroup_mutex);
3224
3695 /* 3225 /*
3696 * Validate dentry by checking the superblock operations, 3226 * We aren't being called from kernfs and there's no guarantee on
3697 * and make sure it's a directory. 3227 * @kn->priv's validity. For this and css_tryget_from_dir(),
3228 * @kn->priv is RCU safe. Let's do the RCU dancing.
3698 */ 3229 */
3699 if (dentry->d_sb->s_op != &cgroup_ops || 3230 rcu_read_lock();
3700 !S_ISDIR(dentry->d_inode->i_mode)) 3231 cgrp = rcu_dereference(kn->priv);
3701 goto err; 3232 if (!cgrp || cgroup_is_dead(cgrp)) {
3702 3233 rcu_read_unlock();
3703 ret = 0; 3234 mutex_unlock(&cgroup_mutex);
3704 cgrp = dentry->d_fsdata; 3235 return -ENOENT;
3236 }
3237 rcu_read_unlock();
3705 3238
3706 css_task_iter_start(&cgrp->dummy_css, &it); 3239 css_task_iter_start(&cgrp->dummy_css, &it);
3707 while ((tsk = css_task_iter_next(&it))) { 3240 while ((tsk = css_task_iter_next(&it))) {
@@ -3726,8 +3259,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3726 } 3259 }
3727 css_task_iter_end(&it); 3260 css_task_iter_end(&it);
3728 3261
3729err: 3262 mutex_unlock(&cgroup_mutex);
3730 return ret; 3263 return 0;
3731} 3264}
3732 3265
3733 3266
@@ -3745,7 +3278,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3745 * after a seek to the start). Use a binary-search to find the 3278 * after a seek to the start). Use a binary-search to find the
3746 * next pid to display, if any 3279 * next pid to display, if any
3747 */ 3280 */
3748 struct cgroup_open_file *of = s->private; 3281 struct kernfs_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup; 3282 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l; 3283 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private; 3284 enum cgroup_filetype type = seq_cft(s)->private;
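The "binary-search to find the next pid to display" mentioned in the comment above is an ordinary lower-bound search over the sorted pid array held in the pidlist. Below is a standalone illustration of just that search, under the assumption of a plain sorted int array; the kernel's version additionally manages the pidlist reference and seq_file position itself.

#include <stddef.h>

/*
 * Return the index of the first pid >= want in a sorted array,
 * or len when every pid is smaller (nothing left to display).
 */
static size_t next_pid_index(const int *pids, size_t len, int want)
{
	size_t lo = 0, hi = len;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (pids[mid] < want)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}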
@@ -3800,7 +3333,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3800 3333
3801static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3334static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3802{ 3335{
3803 struct cgroup_open_file *of = s->private; 3336 struct kernfs_open_file *of = s->private;
3804 struct cgroup_pidlist *l = of->priv; 3337 struct cgroup_pidlist *l = of->priv;
3805 3338
3806 if (l) 3339 if (l)
@@ -3811,7 +3344,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3811 3344
3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3345static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3813{ 3346{
3814 struct cgroup_open_file *of = s->private; 3347 struct kernfs_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv; 3348 struct cgroup_pidlist *l = of->priv;
3816 pid_t *p = v; 3349 pid_t *p = v;
3817 pid_t *end = l->list + l->length; 3350 pid_t *end = l->list + l->length;
@@ -3861,23 +3394,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 return 0; 3394 return 0;
3862} 3395}
3863 3396
3864/*
3865 * When dput() is called asynchronously, if umount has been done and
3866 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3867 * there's a small window that vfs will see the root dentry with non-zero
3868 * refcnt and trigger BUG().
3869 *
3870 * That's why we hold a reference before dput() and drop it right after.
3871 */
3872static void cgroup_dput(struct cgroup *cgrp)
3873{
3874 struct super_block *sb = cgrp->root->sb;
3875
3876 atomic_inc(&sb->s_active);
3877 dput(cgrp->dentry);
3878 deactivate_super(sb);
3879}
3880
3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3397static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3882 struct cftype *cft) 3398 struct cftype *cft)
3883{ 3399{
@@ -3944,7 +3460,7 @@ static struct cftype cgroup_base_files[] = {
3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3460 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3945 .seq_show = cgroup_release_agent_show, 3461 .seq_show = cgroup_release_agent_show,
3946 .write_string = cgroup_release_agent_write, 3462 .write_string = cgroup_release_agent_write,
3947 .max_write_len = PATH_MAX, 3463 .max_write_len = PATH_MAX - 1,
3948 }, 3464 },
3949 { } /* terminate */ 3465 { } /* terminate */
3950}; 3466};
@@ -3963,13 +3479,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3963 3479
3964 /* process cftsets of each subsystem */ 3480 /* process cftsets of each subsystem */
3965 for_each_subsys(ss, i) { 3481 for_each_subsys(ss, i) {
3966 struct cftype_set *set; 3482 struct cftype *cfts;
3967 3483
3968 if (!test_bit(i, &subsys_mask)) 3484 if (!test_bit(i, &subsys_mask))
3969 continue; 3485 continue;
3970 3486
3971 list_for_each_entry(set, &ss->cftsets, node) { 3487 list_for_each_entry(cfts, &ss->cfts, node) {
3972 ret = cgroup_addrm_files(cgrp, set->cfts, true); 3488 ret = cgroup_addrm_files(cgrp, cfts, true);
3973 if (ret < 0) 3489 if (ret < 0)
3974 goto err; 3490 goto err;
3975 } 3491 }
@@ -4012,7 +3528,7 @@ static void css_free_work_fn(struct work_struct *work)
4012 css_put(css->parent); 3528 css_put(css->parent);
4013 3529
4014 css->ss->css_free(css); 3530 css->ss->css_free(css);
4015 cgroup_dput(cgrp); 3531 cgroup_put(cgrp);
4016} 3532}
4017 3533
4018static void css_free_rcu_fn(struct rcu_head *rcu_head) 3534static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,10 +3536,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4020 struct cgroup_subsys_state *css = 3536 struct cgroup_subsys_state *css =
4021 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3537 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4022 3538
4023 /*
4024 * css holds an extra ref to @cgrp->dentry which is put on the last
4025 * css_put(). dput() requires process context which we don't have.
4026 */
4027 INIT_WORK(&css->destroy_work, css_free_work_fn); 3539 INIT_WORK(&css->destroy_work, css_free_work_fn);
4028 queue_work(cgroup_destroy_wq, &css->destroy_work); 3540 queue_work(cgroup_destroy_wq, &css->destroy_work);
4029} 3541}
@@ -4033,7 +3545,7 @@ static void css_release(struct percpu_ref *ref)
4033 struct cgroup_subsys_state *css = 3545 struct cgroup_subsys_state *css =
4034 container_of(ref, struct cgroup_subsys_state, refcnt); 3546 container_of(ref, struct cgroup_subsys_state, refcnt);
4035 3547
4036 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); 3548 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
4037 call_rcu(&css->rcu_head, css_free_rcu_fn); 3549 call_rcu(&css->rcu_head, css_free_rcu_fn);
4038} 3550}
4039 3551
@@ -4058,6 +3570,7 @@ static int online_css(struct cgroup_subsys_state *css)
4058 struct cgroup_subsys *ss = css->ss; 3570 struct cgroup_subsys *ss = css->ss;
4059 int ret = 0; 3571 int ret = 0;
4060 3572
3573 lockdep_assert_held(&cgroup_tree_mutex);
4061 lockdep_assert_held(&cgroup_mutex); 3574 lockdep_assert_held(&cgroup_mutex);
4062 3575
4063 if (ss->css_online) 3576 if (ss->css_online)
@@ -4065,7 +3578,7 @@ static int online_css(struct cgroup_subsys_state *css)
4065 if (!ret) { 3578 if (!ret) {
4066 css->flags |= CSS_ONLINE; 3579 css->flags |= CSS_ONLINE;
4067 css->cgroup->nr_css++; 3580 css->cgroup->nr_css++;
4068 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); 3581 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4069 } 3582 }
4070 return ret; 3583 return ret;
4071} 3584}
@@ -4075,6 +3588,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4075{ 3588{
4076 struct cgroup_subsys *ss = css->ss; 3589 struct cgroup_subsys *ss = css->ss;
4077 3590
3591 lockdep_assert_held(&cgroup_tree_mutex);
4078 lockdep_assert_held(&cgroup_mutex); 3592 lockdep_assert_held(&cgroup_mutex);
4079 3593
4080 if (!(css->flags & CSS_ONLINE)) 3594 if (!(css->flags & CSS_ONLINE))
@@ -4085,7 +3599,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4085 3599
4086 css->flags &= ~CSS_ONLINE; 3600 css->flags &= ~CSS_ONLINE;
4087 css->cgroup->nr_css--; 3601 css->cgroup->nr_css--;
4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 3602 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
4089} 3603}
4090 3604
4091/** 3605/**
@@ -4103,7 +3617,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4103 struct cgroup_subsys_state *css; 3617 struct cgroup_subsys_state *css;
4104 int err; 3618 int err;
4105 3619
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex); 3620 lockdep_assert_held(&cgroup_mutex);
4108 3621
4109 css = ss->css_alloc(cgroup_css(parent, ss)); 3622 css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4116,7 +3629,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4116 3629
4117 init_css(css, ss, cgrp); 3630 init_css(css, ss, cgrp);
4118 3631
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); 3632 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4120 if (err) 3633 if (err)
4121 goto err_free_percpu_ref; 3634 goto err_free_percpu_ref;
4122 3635
@@ -4124,9 +3637,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4124 if (err) 3637 if (err)
4125 goto err_clear_dir; 3638 goto err_clear_dir;
4126 3639
4127 dget(cgrp->dentry); 3640 cgroup_get(cgrp);
4128 css_get(css->parent); 3641 css_get(css->parent);
4129 3642
3643 cgrp->subsys_mask |= 1 << ss->id;
3644
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3645 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) { 3646 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3647 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4139,7 +3654,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4139 return 0; 3654 return 0;
4140 3655
4141err_clear_dir: 3656err_clear_dir:
4142 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3657 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4143err_free_percpu_ref: 3658err_free_percpu_ref:
4144 percpu_ref_cancel_init(&css->refcnt); 3659 percpu_ref_cancel_init(&css->refcnt);
4145err_free_css: 3660err_free_css:
@@ -4147,35 +3662,34 @@ err_free_css:
4147 return err; 3662 return err;
4148} 3663}
4149 3664
4150/* 3665/**
4151 * cgroup_create - create a cgroup 3666 * cgroup_create - create a cgroup
4152 * @parent: cgroup that will be parent of the new cgroup 3667 * @parent: cgroup that will be parent of the new cgroup
4153 * @dentry: dentry of the new cgroup 3668 * @name: name of the new cgroup
4154 * @mode: mode to set on new inode 3669 * @mode: mode to set on new cgroup
4155 *
4156 * Must be called with the mutex on the parent inode held
4157 */ 3670 */
4158static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3671static long cgroup_create(struct cgroup *parent, const char *name,
4159 umode_t mode) 3672 umode_t mode)
4160{ 3673{
4161 struct cgroup *cgrp; 3674 struct cgroup *cgrp;
4162 struct cgroup_name *name; 3675 struct cgroup_root *root = parent->root;
4163 struct cgroupfs_root *root = parent->root;
4164 int ssid, err; 3676 int ssid, err;
4165 struct cgroup_subsys *ss; 3677 struct cgroup_subsys *ss;
4166 struct super_block *sb = root->sb; 3678 struct kernfs_node *kn;
3679
3680 /*
3681 * XXX: The default hierarchy isn't fully implemented yet. Block
3682 * !root cgroup creation on it for now.
3683 */
3684 if (root == &cgrp_dfl_root)
3685 return -EINVAL;
4167 3686
4168 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3687 /* allocate the cgroup and its ID, 0 is reserved for the root */
4169 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3688 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4170 if (!cgrp) 3689 if (!cgrp)
4171 return -ENOMEM; 3690 return -ENOMEM;
4172 3691
4173 name = cgroup_alloc_name(dentry); 3692 mutex_lock(&cgroup_tree_mutex);
4174 if (!name) {
4175 err = -ENOMEM;
4176 goto err_free_cgrp;
4177 }
4178 rcu_assign_pointer(cgrp->name, name);
4179 3693
4180 /* 3694 /*
4181 * Only live parents can have children. Note that the liveliness 3695 * Only live parents can have children. Note that the liveliness
@@ -4186,7 +3700,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4186 */ 3700 */
4187 if (!cgroup_lock_live_group(parent)) { 3701 if (!cgroup_lock_live_group(parent)) {
4188 err = -ENODEV; 3702 err = -ENODEV;
4189 goto err_free_name; 3703 goto err_unlock_tree;
4190 } 3704 }
4191 3705
4192 /* 3706 /*
@@ -4199,18 +3713,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4199 goto err_unlock; 3713 goto err_unlock;
4200 } 3714 }
4201 3715
4202 /* Grab a reference on the superblock so the hierarchy doesn't
4203 * get deleted on unmount if there are child cgroups. This
4204 * can be done outside cgroup_mutex, since the sb can't
4205 * disappear while someone has an open control file on the
4206 * fs */
4207 atomic_inc(&sb->s_active);
4208
4209 init_cgroup_housekeeping(cgrp); 3716 init_cgroup_housekeeping(cgrp);
4210 3717
4211 dentry->d_fsdata = cgrp;
4212 cgrp->dentry = dentry;
4213
4214 cgrp->parent = parent; 3718 cgrp->parent = parent;
4215 cgrp->dummy_css.parent = &parent->dummy_css; 3719 cgrp->dummy_css.parent = &parent->dummy_css;
4216 cgrp->root = parent->root; 3720 cgrp->root = parent->root;
@@ -4221,24 +3725,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3725 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4222 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3726 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4223 3727
3728 /* create the directory */
3729 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3730 if (IS_ERR(kn)) {
3731 err = PTR_ERR(kn);
3732 goto err_free_id;
3733 }
3734 cgrp->kn = kn;
3735
4224 /* 3736 /*
4225 * Create directory. cgroup_create_file() returns with the new 3737 * This extra ref will be put in cgroup_free_fn() and guarantees
4226 * directory locked on success so that it can be populated without 3738 * that @cgrp->kn is always accessible.
4227 * dropping cgroup_mutex.
4228 */ 3739 */
4229 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 3740 kernfs_get(kn);
4230 if (err < 0)
4231 goto err_free_id;
4232 lockdep_assert_held(&dentry->d_inode->i_mutex);
4233 3741
4234 cgrp->serial_nr = cgroup_serial_nr_next++; 3742 cgrp->serial_nr = cgroup_serial_nr_next++;
4235 3743
4236 /* allocation complete, commit to creation */ 3744 /* allocation complete, commit to creation */
4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3745 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4238 root->number_of_cgroups++; 3746 atomic_inc(&root->nr_cgrps);
4239 3747 cgroup_get(parent);
4240 /* hold a ref to the parent's dentry */
4241 dget(parent->dentry);
4242 3748
4243 /* 3749 /*
4244 * @cgrp is now fully operational. If something fails after this 3750 * @cgrp is now fully operational. If something fails after this
@@ -4252,43 +3758,56 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4252 3758
4253 /* let's create and online css's */ 3759 /* let's create and online css's */
4254 for_each_subsys(ss, ssid) { 3760 for_each_subsys(ss, ssid) {
4255 if (root->subsys_mask & (1 << ssid)) { 3761 if (root->cgrp.subsys_mask & (1 << ssid)) {
4256 err = create_css(cgrp, ss); 3762 err = create_css(cgrp, ss);
4257 if (err) 3763 if (err)
4258 goto err_destroy; 3764 goto err_destroy;
4259 } 3765 }
4260 } 3766 }
4261 3767
3768 kernfs_activate(kn);
3769
4262 mutex_unlock(&cgroup_mutex); 3770 mutex_unlock(&cgroup_mutex);
4263 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3771 mutex_unlock(&cgroup_tree_mutex);
4264 3772
4265 return 0; 3773 return 0;
4266 3774
4267err_free_id: 3775err_free_id:
4268 idr_remove(&root->cgroup_idr, cgrp->id); 3776 idr_remove(&root->cgroup_idr, cgrp->id);
4269 /* Release the reference count that we took on the superblock */
4270 deactivate_super(sb);
4271err_unlock: 3777err_unlock:
4272 mutex_unlock(&cgroup_mutex); 3778 mutex_unlock(&cgroup_mutex);
4273err_free_name: 3779err_unlock_tree:
4274 kfree(rcu_dereference_raw(cgrp->name)); 3780 mutex_unlock(&cgroup_tree_mutex);
4275err_free_cgrp:
4276 kfree(cgrp); 3781 kfree(cgrp);
4277 return err; 3782 return err;
4278 3783
4279err_destroy: 3784err_destroy:
4280 cgroup_destroy_locked(cgrp); 3785 cgroup_destroy_locked(cgrp);
4281 mutex_unlock(&cgroup_mutex); 3786 mutex_unlock(&cgroup_mutex);
4282 mutex_unlock(&dentry->d_inode->i_mutex); 3787 mutex_unlock(&cgroup_tree_mutex);
4283 return err; 3788 return err;
4284} 3789}
4285 3790
4286static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 3791static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3792 umode_t mode)
4287{ 3793{
4288 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3794 struct cgroup *parent = parent_kn->priv;
3795 int ret;
3796
3797 /*
3798 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3799 * kernfs active_ref and cgroup_create() already synchronizes
3800 * properly against removal through cgroup_lock_live_group().
3801 * Break it before calling cgroup_create().
3802 */
3803 cgroup_get(parent);
3804 kernfs_break_active_protection(parent_kn);
3805
3806 ret = cgroup_create(parent, name, mode);
4289 3807
4290 /* the vfs holds inode->i_mutex already */ 3808 kernfs_unbreak_active_protection(parent_kn);
4291 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3809 cgroup_put(parent);
3810 return ret;
4292} 3811}
4293 3812
4294/* 3813/*
@@ -4301,6 +3820,7 @@ static void css_killed_work_fn(struct work_struct *work)
4301 container_of(work, struct cgroup_subsys_state, destroy_work); 3820 container_of(work, struct cgroup_subsys_state, destroy_work);
4302 struct cgroup *cgrp = css->cgroup; 3821 struct cgroup *cgrp = css->cgroup;
4303 3822
3823 mutex_lock(&cgroup_tree_mutex);
4304 mutex_lock(&cgroup_mutex); 3824 mutex_lock(&cgroup_mutex);
4305 3825
4306 /* 3826 /*
@@ -4318,6 +3838,7 @@ static void css_killed_work_fn(struct work_struct *work)
4318 cgroup_destroy_css_killed(cgrp); 3838 cgroup_destroy_css_killed(cgrp);
4319 3839
4320 mutex_unlock(&cgroup_mutex); 3840 mutex_unlock(&cgroup_mutex);
3841 mutex_unlock(&cgroup_tree_mutex);
4321 3842
4322 /* 3843 /*
4323 * Put the css refs from kill_css(). Each css holds an extra 3844 * Put the css refs from kill_css(). Each css holds an extra
@@ -4339,18 +3860,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4339 queue_work(cgroup_destroy_wq, &css->destroy_work); 3860 queue_work(cgroup_destroy_wq, &css->destroy_work);
4340} 3861}
4341 3862
4342/** 3863static void __kill_css(struct cgroup_subsys_state *css)
4343 * kill_css - destroy a css
4344 * @css: css to destroy
4345 *
4346 * This function initiates destruction of @css by removing cgroup interface
4347 * files and putting its base reference. ->css_offline() will be invoked
4348 * asynchronously once css_tryget() is guaranteed to fail and when the
4349 * reference count reaches zero, @css will be released.
4350 */
4351static void kill_css(struct cgroup_subsys_state *css)
4352{ 3864{
4353 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3865 lockdep_assert_held(&cgroup_tree_mutex);
3866
3867 /*
3868 * This must happen before css is disassociated with its cgroup.
3869 * See seq_css() for details.
3870 */
3871 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4354 3872
4355 /* 3873 /*
4356 * Killing would put the base ref, but we need to keep it alive 3874 * Killing would put the base ref, but we need to keep it alive
@@ -4372,6 +3890,28 @@ static void kill_css(struct cgroup_subsys_state *css)
4372} 3890}
4373 3891
4374/** 3892/**
3893 * kill_css - destroy a css
3894 * @css: css to destroy
3895 *
3896 * This function initiates destruction of @css by removing cgroup interface
3897 * files and putting its base reference. ->css_offline() will be invoked
3898 * asynchronously once css_tryget() is guaranteed to fail and when the
3899 * reference count reaches zero, @css will be released.
3900 */
3901static void kill_css(struct cgroup_subsys_state *css)
3902{
3903 struct cgroup *cgrp = css->cgroup;
3904
3905 lockdep_assert_held(&cgroup_tree_mutex);
3906
3907 /* if already killed, noop */
3908 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3909 cgrp->subsys_mask &= ~(1 << css->ss->id);
3910 __kill_css(css);
3911 }
3912}
3913
3914/**
4375 * cgroup_destroy_locked - the first stage of cgroup destruction 3915 * cgroup_destroy_locked - the first stage of cgroup destruction
4376 * @cgrp: cgroup to be destroyed 3916 * @cgrp: cgroup to be destroyed
4377 * 3917 *
@@ -4398,22 +3938,21 @@ static void kill_css(struct cgroup_subsys_state *css)
4398static int cgroup_destroy_locked(struct cgroup *cgrp) 3938static int cgroup_destroy_locked(struct cgroup *cgrp)
4399 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3939 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4400{ 3940{
4401 struct dentry *d = cgrp->dentry;
4402 struct cgroup_subsys_state *css;
4403 struct cgroup *child; 3941 struct cgroup *child;
3942 struct cgroup_subsys_state *css;
4404 bool empty; 3943 bool empty;
4405 int ssid; 3944 int ssid;
4406 3945
4407 lockdep_assert_held(&d->d_inode->i_mutex); 3946 lockdep_assert_held(&cgroup_tree_mutex);
4408 lockdep_assert_held(&cgroup_mutex); 3947 lockdep_assert_held(&cgroup_mutex);
4409 3948
4410 /* 3949 /*
4411 * css_set_lock synchronizes access to ->cset_links and prevents 3950 * css_set_rwsem synchronizes access to ->cset_links and prevents
4412 * @cgrp from being removed while __put_css_set() is in progress. 3951 * @cgrp from being removed while put_css_set() is in progress.
4413 */ 3952 */
4414 read_lock(&css_set_lock); 3953 down_read(&css_set_rwsem);
4415 empty = list_empty(&cgrp->cset_links); 3954 empty = list_empty(&cgrp->cset_links);
4416 read_unlock(&css_set_lock); 3955 up_read(&css_set_rwsem);
4417 if (!empty) 3956 if (!empty)
4418 return -EBUSY; 3957 return -EBUSY;
4419 3958
@@ -4434,14 +3973,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4434 return -EBUSY; 3973 return -EBUSY;
4435 3974
4436 /* 3975 /*
4437 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4438 * will be invoked to perform the rest of destruction once the
4439 * percpu refs of all css's are confirmed to be killed.
4440 */
4441 for_each_css(css, ssid, cgrp)
4442 kill_css(css);
4443
4444 /*
4445 * Mark @cgrp dead. This prevents further task migration and child 3976 * Mark @cgrp dead. This prevents further task migration and child
4446 * creation by disabling cgroup_lock_live_group(). Note that 3977 * creation by disabling cgroup_lock_live_group(). Note that
4447 * CGRP_DEAD assertion is depended upon by css_next_child() to 3978 * CGRP_DEAD assertion is depended upon by css_next_child() to
@@ -4450,6 +3981,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4450 */ 3981 */
4451 set_bit(CGRP_DEAD, &cgrp->flags); 3982 set_bit(CGRP_DEAD, &cgrp->flags);
4452 3983
3984 /*
3985 * Initiate massacre of all css's. cgroup_destroy_css_killed()
3986 * will be invoked to perform the rest of destruction once the
3987 * percpu refs of all css's are confirmed to be killed. This
3988 * involves removing the subsystem's files, so drop cgroup_mutex.
3989 */
3990 mutex_unlock(&cgroup_mutex);
3991 for_each_css(css, ssid, cgrp)
3992 kill_css(css);
3993 mutex_lock(&cgroup_mutex);
3994
4453 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 3995 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4454 raw_spin_lock(&release_list_lock); 3996 raw_spin_lock(&release_list_lock);
4455 if (!list_empty(&cgrp->release_list)) 3997 if (!list_empty(&cgrp->release_list))
@@ -4465,14 +4007,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4465 if (!cgrp->nr_css) 4007 if (!cgrp->nr_css)
4466 cgroup_destroy_css_killed(cgrp); 4008 cgroup_destroy_css_killed(cgrp);
4467 4009
4010 /* remove @cgrp directory along with the base files */
4011 mutex_unlock(&cgroup_mutex);
4012
4468 /* 4013 /*
4469 * Clear the base files and remove @cgrp directory. The removal 4014 * There are two control paths which try to determine cgroup from
4470 * puts the base ref but we aren't quite done with @cgrp yet, so 4015 * dentry without going through kernfs - cgroupstats_build() and
4471 * hold onto it. 4016 * css_tryget_from_dir(). Those are supported by RCU protecting
4017 * clearing of cgrp->kn->priv backpointer, which should happen
4018 * after all files under it have been removed.
4472 */ 4019 */
4473 cgroup_addrm_files(cgrp, cgroup_base_files, false); 4020 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4474 dget(d); 4021 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4475 cgroup_d_remove_dir(d); 4022
4023 mutex_lock(&cgroup_mutex);
4476 4024
4477 return 0; 4025 return 0;
4478}; 4026};
@@ -4489,72 +4037,82 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
 {
 	struct cgroup *parent = cgrp->parent;
-	struct dentry *d = cgrp->dentry;
 
+	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
 	/* delete this cgroup from parent->children */
 	list_del_rcu(&cgrp->sibling);
 
-	dput(d);
+	cgroup_put(cgrp);
 
 	set_bit(CGRP_RELEASABLE, &parent->flags);
 	check_for_release(parent);
 }
 
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+static int cgroup_rmdir(struct kernfs_node *kn)
 {
-	int ret;
-
-	mutex_lock(&cgroup_mutex);
-	ret = cgroup_destroy_locked(dentry->d_fsdata);
-	mutex_unlock(&cgroup_mutex);
+	struct cgroup *cgrp = kn->priv;
+	int ret = 0;
 
-	return ret;
-}
+	/*
+	 * This is self-destruction but @kn can't be removed while this
+	 * callback is in progress. Let's break active protection. Once
+	 * the protection is broken, @cgrp can be destroyed at any point.
+	 * Pin it so that it stays accessible.
+	 */
+	cgroup_get(cgrp);
+	kernfs_break_active_protection(kn);
 
-static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
-{
-	INIT_LIST_HEAD(&ss->cftsets);
+	mutex_lock(&cgroup_tree_mutex);
+	mutex_lock(&cgroup_mutex);
 
 	/*
-	 * base_cftset is embedded in subsys itself, no need to worry about
-	 * deregistration.
+	 * @cgrp might already have been destroyed while we're trying to
+	 * grab the mutexes.
 	 */
-	if (ss->base_cftypes) {
-		struct cftype *cft;
-
-		for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
-			cft->ss = ss;
+	if (!cgroup_is_dead(cgrp))
+		ret = cgroup_destroy_locked(cgrp);
 
-		ss->base_cftset.cfts = ss->base_cftypes;
-		list_add_tail(&ss->base_cftset.node, &ss->cftsets);
-	}
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
+
+	kernfs_unbreak_active_protection(kn);
+	cgroup_put(cgrp);
+	return ret;
 }
 
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+	.remount_fs = cgroup_remount,
+	.show_options = cgroup_show_options,
+	.mkdir = cgroup_mkdir,
+	.rmdir = cgroup_rmdir,
+	.rename = cgroup_rename,
+};
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 
-	/* init base cftset */
-	cgroup_init_cftsets(ss);
+	INIT_LIST_HEAD(&ss->cfts);
 
-	/* Create the top cgroup state for this subsystem */
-	ss->root = &cgroup_dummy_root;
-	css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
+	/* Create the root cgroup state for this subsystem */
+	ss->root = &cgrp_dfl_root;
+	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
 	/* We don't handle early failures gracefully */
 	BUG_ON(IS_ERR(css));
-	init_css(css, ss, cgroup_dummy_top);
+	init_css(css, ss, &cgrp_dfl_root.cgrp);
 
 	/* Update the init_css_set to contain a subsys
 	 * pointer to this state - since the subsystem is
 	 * newly registered, all tasks and hence the
-	 * init_css_set is in the subsystem's top cgroup. */
-	init_css_set.subsys[ss->subsys_id] = css;
+	 * init_css_set is in the subsystem's root cgroup. */
+	init_css_set.subsys[ss->id] = css;
 
 	need_forkexit_callback |= ss->fork || ss->exit;
 
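The new cgroup_rmdir() above is a self-removal callback: it runs from inside the kernfs node it is about to delete, so it must pin the object, break kernfs active protection before taking the cgroup mutexes, and re-check that the object is still alive once the locks are held. A minimal sketch of that shape follows; my_obj, my_obj_get/put(), my_obj_is_dead(), my_destroy_locked() and my_tree_lock are hypothetical stand-ins, and only the kernfs_{break,unbreak}_active_protection() and kn->priv usage are taken from the patch itself.

/*
 * Sketch only: the self-removal ordering used by cgroup_rmdir() above.
 * Pin, break active protection, lock, re-check liveness, destroy,
 * unlock, unbreak, unpin.  All my_* names are hypothetical.
 */
static int my_rmdir(struct kernfs_node *kn)
{
	struct my_obj *obj = kn->priv;	/* backpointer installed at mkdir time */
	int ret = 0;

	my_obj_get(obj);			/* may die once protection is broken */
	kernfs_break_active_protection(kn);	/* we are a callback on @kn itself */

	mutex_lock(&my_tree_lock);
	if (!my_obj_is_dead(obj))		/* lost a race with another destroyer? */
		ret = my_destroy_locked(obj);	/* ends up calling kernfs_remove(kn) */
	mutex_unlock(&my_tree_lock);

	kernfs_unbreak_active_protection(kn);
	my_obj_put(obj);
	return ret;
}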
@@ -4565,185 +4123,11 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
 	BUG_ON(online_css(css));
 
-	mutex_unlock(&cgroup_mutex);
-
-	/* this function shouldn't be used with modular subsystems, since they
-	 * need to register a subsys_id, among other things */
-	BUG_ON(ss->module);
-}
-
-/**
- * cgroup_load_subsys: load and register a modular subsystem at runtime
- * @ss: the subsystem to load
- *
- * This function should be called in a modular subsystem's initcall. If the
- * subsystem is built as a module, it will be assigned a new subsys_id and set
- * up for use. If the subsystem is built-in anyway, work is delegated to the
- * simpler cgroup_init_subsys.
- */
-int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
-{
-	struct cgroup_subsys_state *css;
-	int i, ret;
-	struct hlist_node *tmp;
-	struct css_set *cset;
-	unsigned long key;
-
-	/* check name and function validity */
-	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
-	    ss->css_alloc == NULL || ss->css_free == NULL)
-		return -EINVAL;
-
-	/*
-	 * we don't support callbacks in modular subsystems. this check is
-	 * before the ss->module check for consistency; a subsystem that could
-	 * be a module should still have no callbacks even if the user isn't
-	 * compiling it as one.
-	 */
-	if (ss->fork || ss->exit)
-		return -EINVAL;
-
-	/*
-	 * an optionally modular subsystem is built-in: we want to do nothing,
-	 * since cgroup_init_subsys will have already taken care of it.
-	 */
-	if (ss->module == NULL) {
-		/* a sanity check */
-		BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
-		return 0;
-	}
-
-	/* init base cftset */
-	cgroup_init_cftsets(ss);
-
-	mutex_lock(&cgroup_mutex);
-	mutex_lock(&cgroup_root_mutex);
-	cgroup_subsys[ss->subsys_id] = ss;
-
-	/*
-	 * no ss->css_alloc seems to need anything important in the ss
-	 * struct, so this can happen first (i.e. before the dummy root
-	 * attachment).
-	 */
-	css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
-	if (IS_ERR(css)) {
-		/* failure case - need to deassign the cgroup_subsys[] slot. */
-		cgroup_subsys[ss->subsys_id] = NULL;
-		mutex_unlock(&cgroup_root_mutex);
-		mutex_unlock(&cgroup_mutex);
-		return PTR_ERR(css);
-	}
-
-	ss->root = &cgroup_dummy_root;
-
-	/* our new subsystem will be attached to the dummy hierarchy. */
-	init_css(css, ss, cgroup_dummy_top);
-
-	/*
-	 * Now we need to entangle the css into the existing css_sets. unlike
-	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
-	 * will need a new pointer to it; done by iterating the css_set_table.
-	 * furthermore, modifying the existing css_sets will corrupt the hash
-	 * table state, so each changed css_set will need its hash recomputed.
-	 * this is all done under the css_set_lock.
-	 */
-	write_lock(&css_set_lock);
-	hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
-		/* skip entries that we already rehashed */
-		if (cset->subsys[ss->subsys_id])
-			continue;
-		/* remove existing entry */
-		hash_del(&cset->hlist);
-		/* set new value */
-		cset->subsys[ss->subsys_id] = css;
-		/* recompute hash and restore entry */
-		key = css_set_hash(cset->subsys);
-		hash_add(css_set_table, &cset->hlist, key);
-	}
-	write_unlock(&css_set_lock);
-
-	ret = online_css(css);
-	if (ret) {
-		ss->css_free(css);
-		goto err_unload;
-	}
-
-	/* success! */
-	mutex_unlock(&cgroup_root_mutex);
-	mutex_unlock(&cgroup_mutex);
-	return 0;
-
-err_unload:
-	mutex_unlock(&cgroup_root_mutex);
-	mutex_unlock(&cgroup_mutex);
-	/* @ss can't be mounted here as try_module_get() would fail */
-	cgroup_unload_subsys(ss);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(cgroup_load_subsys);
-
-/**
- * cgroup_unload_subsys: unload a modular subsystem
- * @ss: the subsystem to unload
- *
- * This function should be called in a modular subsystem's exitcall. When this
- * function is invoked, the refcount on the subsystem's module will be 0, so
- * the subsystem will not be attached to any hierarchy.
- */
-void cgroup_unload_subsys(struct cgroup_subsys *ss)
-{
-	struct cgrp_cset_link *link;
-	struct cgroup_subsys_state *css;
-
-	BUG_ON(ss->module == NULL);
-
-	/*
-	 * we shouldn't be called if the subsystem is in use, and the use of
-	 * try_module_get() in rebind_subsystems() should ensure that it
-	 * doesn't start being used while we're killing it off.
-	 */
-	BUG_ON(ss->root != &cgroup_dummy_root);
-
-	mutex_lock(&cgroup_mutex);
-	mutex_lock(&cgroup_root_mutex);
-
-	css = cgroup_css(cgroup_dummy_top, ss);
-	if (css)
-		offline_css(css);
+	cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
 
-	/* deassign the subsys_id */
-	cgroup_subsys[ss->subsys_id] = NULL;
-
-	/*
-	 * disentangle the css from all css_sets attached to the dummy
-	 * top. as in loading, we need to pay our respects to the hashtable
-	 * gods.
-	 */
-	write_lock(&css_set_lock);
-	list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
-		struct css_set *cset = link->cset;
-		unsigned long key;
-
-		hash_del(&cset->hlist);
-		cset->subsys[ss->subsys_id] = NULL;
-		key = css_set_hash(cset->subsys);
-		hash_add(css_set_table, &cset->hlist, key);
-	}
-	write_unlock(&css_set_lock);
-
-	/*
-	 * remove subsystem's css from the cgroup_dummy_top and free it -
-	 * need to free before marking as null because ss->css_free needs
-	 * the cgrp->subsys pointer to find their state.
-	 */
-	if (css)
-		ss->css_free(css);
-	RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
-
-	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
 }
-EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
 
 /**
  * cgroup_init_early - cgroup initialization at system boot
@@ -4753,34 +4137,24 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
  */
 int __init cgroup_init_early(void)
 {
+	static struct cgroup_sb_opts __initdata opts =
+		{ .flags = CGRP_ROOT_SANE_BEHAVIOR };
 	struct cgroup_subsys *ss;
 	int i;
 
-	atomic_set(&init_css_set.refcount, 1);
-	INIT_LIST_HEAD(&init_css_set.cgrp_links);
-	INIT_LIST_HEAD(&init_css_set.tasks);
-	INIT_HLIST_NODE(&init_css_set.hlist);
-	css_set_count = 1;
-	init_cgroup_root(&cgroup_dummy_root);
-	cgroup_root_count = 1;
+	init_cgroup_root(&cgrp_dfl_root, &opts);
 	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
 
-	init_cgrp_cset_link.cset = &init_css_set;
-	init_cgrp_cset_link.cgrp = cgroup_dummy_top;
-	list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
-	list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
-
-	/* at bootup time, we don't worry about modular subsystems */
-	for_each_builtin_subsys(ss, i) {
-		BUG_ON(!ss->name);
-		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
-		BUG_ON(!ss->css_alloc);
-		BUG_ON(!ss->css_free);
-		if (ss->subsys_id != i) {
-			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
-			       ss->name, ss->subsys_id);
-			BUG();
-		}
+	for_each_subsys(ss, i) {
+		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+		     ss->id, ss->name);
+		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+
+		ss->id = i;
+		ss->name = cgroup_subsys_name[i];
 
 		if (ss->early_init)
 			cgroup_init_subsys(ss);
@@ -4798,53 +4172,46 @@ int __init cgroup_init(void)
 {
 	struct cgroup_subsys *ss;
 	unsigned long key;
-	int i, err;
+	int ssid, err;
 
-	err = bdi_init(&cgroup_backing_dev_info);
-	if (err)
-		return err;
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 
-	for_each_builtin_subsys(ss, i) {
-		if (!ss->early_init)
-			cgroup_init_subsys(ss);
-	}
-
-	/* allocate id for the dummy hierarchy */
+	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
-	mutex_lock(&cgroup_root_mutex);
 
 	/* Add init_css_set to the hash table */
 	key = css_set_hash(init_css_set.subsys);
 	hash_add(css_set_table, &init_css_set.hlist, key);
 
-	BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
+	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
 
-	err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
-			0, 1, GFP_KERNEL);
-	BUG_ON(err < 0);
-
-	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
 
-	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
-	if (!cgroup_kobj) {
-		err = -ENOMEM;
-		goto out;
+	for_each_subsys(ss, ssid) {
+		if (!ss->early_init)
+			cgroup_init_subsys(ss);
+
+		/*
+		 * cftype registration needs kmalloc and can't be done
+		 * during early_init. Register base cftypes separately.
+		 */
+		if (ss->base_cftypes)
+			WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
 	}
 
+	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
+	if (!cgroup_kobj)
+		return -ENOMEM;
+
 	err = register_filesystem(&cgroup_fs_type);
 	if (err < 0) {
 		kobject_put(cgroup_kobj);
-		goto out;
+		return err;
 	}
 
 	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
-
-out:
-	if (err)
-		bdi_destroy(&cgroup_backing_dev_info);
-
-	return err;
+	return 0;
 }
 
 static int __init cgroup_wq_init(void)
@@ -4876,12 +4243,6 @@ core_initcall(cgroup_wq_init);
  * proc_cgroup_show()
  * - Print task's cgroup paths into seq_file, one line for each hierarchy
  * - Used for /proc/<pid>/cgroup.
- * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
- *   doesn't really matter if tsk->cgroup changes after we read it,
- *   and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
- *   anyway. No need to check that tsk->cgroup != NULL, thanks to
- *   the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
- *   cgroup to top_cgroup.
  */
 
 /* TODO: Use a proper seq_file iterator */
@@ -4889,12 +4250,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 {
 	struct pid *pid;
 	struct task_struct *tsk;
-	char *buf;
+	char *buf, *path;
 	int retval;
-	struct cgroupfs_root *root;
+	struct cgroup_root *root;
 
 	retval = -ENOMEM;
-	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	buf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!buf)
 		goto out;
 
@@ -4907,29 +4268,36 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 	retval = 0;
 
 	mutex_lock(&cgroup_mutex);
+	down_read(&css_set_rwsem);
 
-	for_each_active_root(root) {
+	for_each_root(root) {
 		struct cgroup_subsys *ss;
 		struct cgroup *cgrp;
 		int ssid, count = 0;
 
+		if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+			continue;
+
 		seq_printf(m, "%d:", root->hierarchy_id);
 		for_each_subsys(ss, ssid)
-			if (root->subsys_mask & (1 << ssid))
+			if (root->cgrp.subsys_mask & (1 << ssid))
 				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
 		if (strlen(root->name))
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
 		seq_putc(m, ':');
 		cgrp = task_cgroup_from_root(tsk, root);
-		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
-		if (retval < 0)
+		path = cgroup_path(cgrp, buf, PATH_MAX);
+		if (!path) {
+			retval = -ENAMETOOLONG;
 			goto out_unlock;
-		seq_puts(m, buf);
+		}
+		seq_puts(m, path);
 		seq_putc(m, '\n');
 	}
 
 out_unlock:
+	up_read(&css_set_rwsem);
 	mutex_unlock(&cgroup_mutex);
 	put_task_struct(tsk);
 out_free:
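The proc_cgroup_show() hunk above (and cgroup_release_agent() further down) switches to a cgroup_path() that, judging from these call sites, fills a PATH_MAX buffer and returns a pointer to the generated path, or NULL when it does not fit. A minimal caller sketch under that assumption follows; my_print_cgroup_path() is a hypothetical helper, not part of the patch, and the -ENAMETOOLONG choice simply mirrors the hunk above.

/*
 * Sketch of the cgroup_path() calling convention implied by the call
 * sites in this patch: pointer return, NULL on overflow.
 */
static int my_print_cgroup_path(struct seq_file *m, struct cgroup *cgrp)
{
	char *buf, *path;
	int ret = 0;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	path = cgroup_path(cgrp, buf, PATH_MAX);	/* NULL if the path doesn't fit */
	if (!path) {
		ret = -ENAMETOOLONG;
		goto out_free;
	}

	seq_puts(m, path);
	seq_putc(m, '\n');
out_free:
	kfree(buf);
	return ret;
}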
@@ -4955,7 +4323,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->name, ss->root->hierarchy_id,
-			   ss->root->number_of_cgroups, !ss->disabled);
+			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
 
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -4974,27 +4342,16 @@ static const struct file_operations proc_cgroupstats_operations = {
 };
 
 /**
- * cgroup_fork - attach newly forked task to its parents cgroup.
+ * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
  *
- * Description: A task inherits its parent's cgroup at fork().
- *
- * A pointer to the shared css_set was automatically copied in
- * fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
- *
- * At the point that cgroup_fork() is called, 'current' is the parent
- * task, and the passed argument 'child' points to the child task.
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the parent's css_set. Empty cg_list indicates that
+ * @child isn't holding reference to its css_set.
  */
 void cgroup_fork(struct task_struct *child)
 {
-	task_lock(current);
-	get_css_set(task_css_set(current));
-	child->cgroups = current->cgroups;
-	task_unlock(current);
+	RCU_INIT_POINTER(child->cgroups, &init_css_set);
 	INIT_LIST_HEAD(&child->cg_list);
 }
 
@@ -5014,23 +4371,37 @@ void cgroup_post_fork(struct task_struct *child)
 	int i;
 
 	/*
-	 * use_task_css_set_links is set to 1 before we walk the tasklist
-	 * under the tasklist_lock and we read it here after we added the child
-	 * to the tasklist under the tasklist_lock as well. If the child wasn't
-	 * yet in the tasklist when we walked through it from
-	 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
-	 * should be visible now due to the paired locking and barriers implied
-	 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
-	 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
-	 * lock on fork.
+	 * This may race against cgroup_enable_task_cg_links(). As that
+	 * function sets use_task_css_set_links before grabbing
+	 * tasklist_lock and we just went through tasklist_lock to add
+	 * @child, it's guaranteed that either we see the set
+	 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
+	 * @child during its iteration.
+	 *
+	 * If we won the race, @child is associated with %current's
+	 * css_set. Grabbing css_set_rwsem guarantees both that the
+	 * association is stable, and, on completion of the parent's
+	 * migration, @child is visible in the source of migration or
+	 * already in the destination cgroup. This guarantee is necessary
+	 * when implementing operations which need to migrate all tasks of
+	 * a cgroup to another.
+	 *
+	 * Note that if we lose to cgroup_enable_task_cg_links(), @child
+	 * will remain in init_css_set. This is safe because all tasks are
+	 * in the init_css_set before cg_links is enabled and there's no
+	 * operation which transfers all tasks out of init_css_set.
 	 */
 	if (use_task_css_set_links) {
-		write_lock(&css_set_lock);
-		task_lock(child);
-		if (list_empty(&child->cg_list))
-			list_add(&child->cg_list, &task_css_set(child)->tasks);
-		task_unlock(child);
-		write_unlock(&css_set_lock);
+		struct css_set *cset;
+
+		down_write(&css_set_rwsem);
+		cset = task_css_set(current);
+		if (list_empty(&child->cg_list)) {
+			rcu_assign_pointer(child->cgroups, cset);
+			list_add(&child->cg_list, &cset->tasks);
+			get_css_set(cset);
+		}
+		up_write(&css_set_rwsem);
 	}
 
 	/*
@@ -5039,15 +4410,7 @@ void cgroup_post_fork(struct task_struct *child)
 	 * and addition to css_set.
 	 */
 	if (need_forkexit_callback) {
-		/*
-		 * fork/exit callbacks are supported only for builtin
-		 * subsystems, and the builtin section of the subsys
-		 * array is immutable, so we don't need to lock the
-		 * subsys array here. On the other hand, modular section
-		 * of the array can be freed at module unload, so we
-		 * can't touch that.
-		 */
-		for_each_builtin_subsys(ss, i)
+		for_each_subsys(ss, i)
 			if (ss->fork)
 				ss->fork(child);
 	}
@@ -5056,7 +4419,6 @@ void cgroup_post_fork(struct task_struct *child)
 /**
  * cgroup_exit - detach cgroup from exiting task
  * @tsk: pointer to task_struct of exiting process
- * @run_callback: run exit callbacks?
  *
  * Description: Detach cgroup from @tsk and release it.
  *
@@ -5066,57 +4428,38 @@ void cgroup_post_fork(struct task_struct *child)
  * use notify_on_release cgroups where very high task exit scaling
  * is required on large systems.
  *
- * the_top_cgroup_hack:
- *
- * Set the exiting tasks cgroup to the root cgroup (top_cgroup).
- *
- * We call cgroup_exit() while the task is still competent to
- * handle notify_on_release(), then leave the task attached to the
- * root cgroup in each hierarchy for the remainder of its exit.
- *
- * To do this properly, we would increment the reference count on
- * top_cgroup, and near the very end of the kernel/exit.c do_exit()
- * code we would add a second cgroup function call, to drop that
- * reference. This would just create an unnecessary hot spot on
- * the top_cgroup reference count, to no avail.
- *
- * Normally, holding a reference to a cgroup without bumping its
- * count is unsafe. The cgroup could go away, or someone could
- * attach us to a different cgroup, decrementing the count on
- * the first cgroup that we never incremented. But in this case,
- * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any cgroup_attach_task() attempts, or task is a failed
- * fork, never visible to cgroup_attach_task.
+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
+ * call cgroup_exit() while the task is still competent to handle
+ * notify_on_release(), then leave the task attached to the root cgroup in
+ * each hierarchy for the remainder of its exit. No need to bother with
+ * init_css_set refcnting. init_css_set never goes away and we can't race
+ * with migration path - PF_EXITING is visible to migration path.
  */
-void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+void cgroup_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
+	bool put_cset = false;
 	int i;
 
 	/*
-	 * Unlink from the css_set task list if necessary.
-	 * Optimistically check cg_list before taking
-	 * css_set_lock
+	 * Unlink from @tsk from its css_set. As migration path can't race
+	 * with us, we can check cg_list without grabbing css_set_rwsem.
 	 */
 	if (!list_empty(&tsk->cg_list)) {
-		write_lock(&css_set_lock);
-		if (!list_empty(&tsk->cg_list))
-			list_del_init(&tsk->cg_list);
-		write_unlock(&css_set_lock);
+		down_write(&css_set_rwsem);
+		list_del_init(&tsk->cg_list);
+		up_write(&css_set_rwsem);
+		put_cset = true;
 	}
 
 	/* Reassign the task to the init_css_set. */
-	task_lock(tsk);
 	cset = task_css_set(tsk);
 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
-	if (run_callbacks && need_forkexit_callback) {
-		/*
-		 * fork/exit callbacks are supported only for builtin
-		 * subsystems, see cgroup_post_fork() for details.
-		 */
-		for_each_builtin_subsys(ss, i) {
+	if (need_forkexit_callback) {
+		/* see cgroup_post_fork() for details */
+		for_each_subsys(ss, i) {
 			if (ss->exit) {
 				struct cgroup_subsys_state *old_css = cset->subsys[i];
 				struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5125,9 +4468,9 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 			}
 		}
 	}
-	task_unlock(tsk);
 
-	put_css_set_taskexit(cset);
+	if (put_cset)
+		put_css_set(cset, true);
 }
 
 static void check_for_release(struct cgroup *cgrp)
@@ -5184,16 +4527,17 @@ static void cgroup_release_agent(struct work_struct *work)
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
 		int i;
-		char *pathbuf = NULL, *agentbuf = NULL;
+		char *pathbuf = NULL, *agentbuf = NULL, *path;
 		struct cgroup *cgrp = list_entry(release_list.next,
 						 struct cgroup,
 						 release_list);
 		list_del_init(&cgrp->release_list);
 		raw_spin_unlock(&release_list_lock);
-		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
-		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+		path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+		if (!path)
 			goto continue_free;
 		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
 		if (!agentbuf)
@@ -5201,7 +4545,7 @@ static void cgroup_release_agent(struct work_struct *work)
 
 		i = 0;
 		argv[i++] = agentbuf;
-		argv[i++] = pathbuf;
+		argv[i++] = path;
 		argv[i] = NULL;
 
 		i = 0;
@@ -5235,11 +4579,7 @@ static int __init cgroup_disable(char *str)
 		if (!*token)
 			continue;
 
-		/*
-		 * cgroup_disable, being at boot time, can't know about
-		 * module subsystems, so we don't worry about them.
-		 */
-		for_each_builtin_subsys(ss, i) {
+		for_each_subsys(ss, i) {
 			if (!strcmp(token, ss->name)) {
 				ss->disabled = 1;
 				printk(KERN_INFO "Disabling %s control group"
@@ -5253,28 +4593,42 @@ static int __init cgroup_disable(char *str)
 __setup("cgroup_disable=", cgroup_disable);
 
 /**
- * css_from_dir - get corresponding css from the dentry of a cgroup dir
+ * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
  * @dentry: directory dentry of interest
  * @ss: subsystem of interest
  *
- * Must be called under cgroup_mutex or RCU read lock. The caller is
- * responsible for pinning the returned css if it needs to be accessed
- * outside the critical section.
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
  */
-struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
+struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
 						struct cgroup_subsys *ss)
 {
+	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+	struct cgroup_subsys_state *css = NULL;
 	struct cgroup *cgrp;
 
-	cgroup_assert_mutex_or_rcu_locked();
-
 	/* is @dentry a cgroup dir? */
-	if (!dentry->d_inode ||
-	    dentry->d_inode->i_op != &cgroup_dir_inode_operations)
+	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+	    kernfs_type(kn) != KERNFS_DIR)
 		return ERR_PTR(-EBADF);
 
-	cgrp = __d_cgrp(dentry);
-	return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
+	rcu_read_lock();
+
+	/*
+	 * This path doesn't originate from kernfs and @kn could already
+	 * have been or be removed at any point. @kn->priv is RCU
+	 * protected for this access. See destroy_locked() for details.
+	 */
+	cgrp = rcu_dereference(kn->priv);
+	if (cgrp)
+		css = cgroup_css(cgrp, ss);
+
+	if (!css || !css_tryget(css))
+		css = ERR_PTR(-ENOENT);
+
+	rcu_read_unlock();
+	return css;
 }
 
 /**
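css_tryget_from_dir() above is the reader half of the RCU-protected backpointer scheme set up earlier in this patch (kernfs_remove() on the directory, then RCU_INIT_POINTER() clearing cgrp->kn->priv): the writer clears the pointer only after the files are gone, and readers dereference it under rcu_read_lock() and take a tryget-style reference before leaving the critical section. A generic sketch of the reader side follows; my_obj and my_obj_tryget() are hypothetical stand-ins, only the ordering is taken from the hunk above.

/*
 * Reader side of the RCU-protected kn->priv backpointer pattern used by
 * css_tryget_from_dir() above: dereference under RCU, take a reference
 * before rcu_read_unlock(), treat NULL or tryget failure as -ENOENT.
 */
static struct my_obj *my_obj_from_kn(struct kernfs_node *kn)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = rcu_dereference(kn->priv);	/* NULL once the writer tore @kn down */
	if (obj && !my_obj_tryget(obj))		/* found, but already on its way out */
		obj = NULL;
	rcu_read_unlock();

	return obj ?: ERR_PTR(-ENOENT);
}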
@@ -5289,7 +4643,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	struct cgroup *cgrp;
 
-	cgroup_assert_mutex_or_rcu_locked();
+	cgroup_assert_mutexes_or_rcu_locked();
 
 	cgrp = idr_find(&ss->root->cgroup_idr, id);
 	if (cgrp)
@@ -5341,23 +4695,25 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
+	char *name_buf;
 
-	read_lock(&css_set_lock);
+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
+
+	down_read(&css_set_rwsem);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
-		const char *name;
 
-		if (c->dentry)
-			name = c->dentry->d_name.name;
-		else
-			name = "?";
+		cgroup_name(c, name_buf, NAME_MAX + 1);
 		seq_printf(seq, "Root %d group %s\n",
-			   c->root->hierarchy_id, name);
+			   c->root->hierarchy_id, name_buf);
 	}
 	rcu_read_unlock();
-	read_unlock(&css_set_lock);
+	up_read(&css_set_rwsem);
+	kfree(name_buf);
 	return 0;
 }
 
@@ -5367,23 +4723,30 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 
-	read_lock(&css_set_lock);
+	down_read(&css_set_rwsem);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
 		int count = 0;
+
 		seq_printf(seq, "css_set %p\n", cset);
+
 		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
-				seq_puts(seq, " ...\n");
-				break;
-			} else {
-				seq_printf(seq, " task %d\n",
-					   task_pid_vnr(task));
-			}
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, " task %d\n", task_pid_vnr(task));
+		}
+
+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, " task %d\n", task_pid_vnr(task));
 		}
+		continue;
+	overflow:
+		seq_puts(seq, " ...\n");
 	}
-	read_unlock(&css_set_lock);
+	up_read(&css_set_rwsem);
 	return 0;
 }
 
@@ -5426,11 +4789,9 @@ static struct cftype debug_files[] = {
 	{ }	/* terminate */
 };
 
-struct cgroup_subsys debug_subsys = {
-	.name = "debug",
+struct cgroup_subsys debug_cgrp_subsys = {
 	.css_alloc = debug_css_alloc,
 	.css_free = debug_css_free,
-	.subsys_id = debug_subsys_id,
 	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */