path: root/kernel/cgroup.c
author    Linus Torvalds <torvalds@linux-foundation.org>  2015-11-05 17:51:32 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-11-05 17:51:32 -0500
commit    69234acee54407962a20bedf90ef9c96326994b5 (patch)
tree      5e979b1a489d866691c2c65ac3f46b4f29feef68 /kernel/cgroup.c
parent    11eaaadb3ea376c6c194491c2e9bddd647f9d253 (diff)
parent    d57456753787ab158f906f1f8eb58d54a2ccd9f4 (diff)
Merge branch 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
 "The cgroup core saw several significant updates this cycle:

  - percpu_rwsem for threadgroup locking is reinstated. This was
    temporarily dropped due to down_write latency issues. Oleg's rework
    of percpu_rwsem which is scheduled to be merged in this merge window
    resolves the issue.

  - On the v2 hierarchy, when controllers are enabled and disabled, all
    operations are atomic and can fail and revert cleanly. This allows
    ->can_attach() failure which is necessary for cpu RT slices.

  - Tasks now stay associated with the original cgroups after exit until
    released. This allows tracking resources held by zombies (e.g. pids)
    and makes it easy to find out where zombies came from on the v2
    hierarchy. The pids controller was broken before these changes as
    zombies escaped the limits; unfortunately, updating this behavior
    required too many invasive changes and I don't think it's a good
    idea to backport them, so the pids controller on 4.3, the first
    version which included the pids controller, will stay broken at
    least until I'm sure about the cgroup core changes.

  - Optimization of a couple common tests using static_key"

* 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (38 commits)
  cgroup: fix race condition around termination check in css_task_iter_next()
  blkcg: don't create "io.stat" on the root cgroup
  cgroup: drop cgroup__DEVEL__legacy_files_on_dfl
  cgroup: replace error handling in cgroup_init() with WARN_ON()s
  cgroup: add cgroup_subsys->free() method and use it to fix pids controller
  cgroup: keep zombies associated with their original cgroups
  cgroup: make css_set_rwsem a spinlock and rename it to css_set_lock
  cgroup: don't hold css_set_rwsem across css task iteration
  cgroup: reorganize css_task_iter functions
  cgroup: factor out css_set_move_task()
  cgroup: keep css_set and task lists in chronological order
  cgroup: make cgroup_destroy_locked() test cgroup_is_populated()
  cgroup: make css_sets pin the associated cgroups
  cgroup: relocate cgroup_[try]get/put()
  cgroup: move check_for_release() invocation
  cgroup: replace cgroup_has_tasks() with cgroup_is_populated()
  cgroup: make cgroup->nr_populated count the number of populated css_sets
  cgroup: remove an unused parameter from cgroup_task_migrate()
  cgroup: fix too early usage of static_branch_disable()
  cgroup: make cgroup_update_dfl_csses() migrate all target processes atomically
  ...
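The static_key optimization called out above is the DEFINE_STATIC_KEY_TRUE()
machinery visible in the diff below. A minimal sketch of the pattern, using
only the <linux/jump_label.h> API; the example_* names are illustrative, not
the exact kernel/cgroup.c code:

	#include <linux/jump_label.h>

	/* One always-true key per subsystem; a boot parameter such as
	 * cgroup_disable= flips it off once, patching the branch out of
	 * the hot path for the rest of the system's lifetime. */
	DEFINE_STATIC_KEY_TRUE(example_cgrp_subsys_enabled_key);

	static bool example_subsys_enabled(void)
	{
		/* compiles to a patched jump/no-op, not a load-and-test */
		return static_branch_likely(&example_cgrp_subsys_enabled_key);
	}

	static void example_subsys_disable(void)
	{
		/* rewrites the branch sites at runtime */
		static_branch_disable(&example_cgrp_subsys_enabled_key);
	}

Because the enable/disable events are rare (boot-time or controller rebind)
while the tests run on hot paths like fork and exit, trading an expensive
runtime patch for a free per-call test is the right direction.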
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c | 1297
1 file changed, 725 insertions(+), 572 deletions(-)
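The reinstated percpu_rwsem appears below as cgroup_threadgroup_rwsem, taken
for writing in __cgroup_procs_write(). A minimal sketch of the locking split,
assuming the <linux/percpu-rwsem.h> API; the example_* helpers are
illustrative stand-ins for cgroup_threadgroup_change_begin()/end():

	#include <linux/percpu-rwsem.h>

	/* percpu_init_rwsem() must run on this before first use */
	static struct percpu_rw_semaphore example_threadgroup_rwsem;

	/* fork/exit side: a cheap per-cpu read lock keeps the threadgroup
	 * stable while any migration is in flight (this replaces the
	 * per-signal group_rwsem helpers removed in the diff) */
	static void example_change_begin(void)
	{
		percpu_down_read(&example_threadgroup_rwsem);
	}

	static void example_change_end(void)
	{
		percpu_up_read(&example_threadgroup_rwsem);
	}

	/* migration side: one exclusive writer section spans the whole
	 * operation, as __cgroup_procs_write() does below */
	static void example_migrate_threadgroup(void)
	{
		percpu_down_write(&example_threadgroup_rwsem);
		/* ... move every task in the threadgroup ... */
		percpu_up_write(&example_threadgroup_rwsem);
	}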
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2c9eae6ad970..b9d0cce3f9ce 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,7 +45,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -75,7 +75,7 @@
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
- * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * css_set_lock protects task->cgroups pointer, the list of css_set
  * objects, and the chain of tasks off each css_set.
  *
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
@@ -83,12 +83,12 @@
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-DECLARE_RWSEM(css_set_rwsem);
+DEFINE_SPINLOCK(css_set_lock);
 EXPORT_SYMBOL_GPL(cgroup_mutex);
-EXPORT_SYMBOL_GPL(css_set_rwsem);
+EXPORT_SYMBOL_GPL(css_set_lock);
 #else
 static DEFINE_MUTEX(cgroup_mutex);
-static DECLARE_RWSEM(css_set_rwsem);
+static DEFINE_SPINLOCK(css_set_lock);
 #endif
 
 /*
@@ -103,6 +103,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()			\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&		\
 			   !lockdep_is_held(&cgroup_mutex),	\
@@ -136,6 +138,27 @@ static const char *cgroup_subsys_name[] = {
 };
 #undef SUBSYS
 
+/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
+#define SUBSYS(_x)							\
+	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);		\
+	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);		\
+	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);		\
+	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
+static struct static_key_true *cgroup_subsys_enabled_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
+static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
@@ -150,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  */
 static bool cgrp_dfl_root_visible;
 
-/*
- * Set by the boot param of the same name and makes subsystems with NULL
- * ->dfl_files to use ->legacy_files on the default hierarchy.
- */
-static bool cgroup_legacy_files_on_dfl;
-
 /* some controllers are not supported in the default hierarchy */
 static unsigned long cgrp_dfl_root_inhibit_ss_mask;
 
@@ -183,6 +200,7 @@ static u64 css_serial_nr_next = 1;
  */
 static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
+static unsigned long have_free_callback __read_mostly;
 
 /* Ditto for the can_fork callback. */
 static unsigned long have_canfork_callback __read_mostly;
@@ -192,14 +210,87 @@ static struct cftype cgroup_legacy_base_files[];
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask);
+static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		      bool visible);
 static void css_release(struct percpu_ref *ref);
 static void kill_css(struct cgroup_subsys_state *css);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 
+/**
+ * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
+ * @ssid: subsys ID of interest
+ *
+ * cgroup_subsys_enabled() can only be used with literal subsys names which
+ * is fine for individual subsystems but unsuitable for cgroup core. This
+ * is a slower static_key_enabled() based test indexed by @ssid.
+ */
+static bool cgroup_ssid_enabled(int ssid)
+{
+	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
+}
+
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differently depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ *   and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed. Everything should be at process granularity. Use
+ *   "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted. pids will be unique unless they got
+ *   recycled in between reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed. Replacement
+ *   notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
+ *   and its descendants contain no task; otherwise, 1. The file also
+ *   generates kernfs notification which can be monitored through poll and
+ *   [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ *   take masks of ancestors with non-empty cpus/mems, instead of being
+ *   moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ *   masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ *   is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
+static bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+	return cgrp->root == &cgrp_dfl_root;
+}
+
 /* IDR wrappers which synchronize using cgroup_idr_lock */
 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 			    gfp_t gfp_mask)
@@ -332,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 	return !(cgrp->self.flags & CSS_ONLINE);
 }
 
+static void cgroup_get(struct cgroup *cgrp)
+{
+	WARN_ON_ONCE(cgroup_is_dead(cgrp));
+	css_get(&cgrp->self);
+}
+
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+	return css_tryget(&cgrp->self);
+}
+
+static void cgroup_put(struct cgroup *cgrp)
+{
+	css_put(&cgrp->self);
+}
+
 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 {
 	struct cgroup *cgrp = of->kn->parent->priv;
@@ -481,19 +588,31 @@ struct css_set init_css_set = {
 	.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
 	.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+	.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count = 1;	/* 1 for init_css_set */
 
 /**
+ * css_set_populated - does a css_set contain any tasks?
+ * @cset: target css_set
+ */
+static bool css_set_populated(struct css_set *cset)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
+}
+
+/**
  * cgroup_update_populated - update the populated count of a cgroup
  * @cgrp: the target cgroup
  * @populated: inc or dec populated count
  *
- * @cgrp is either getting the first task (css_set) or losing the last.
- * Update @cgrp->populated_cnt accordingly. The count is propagated
- * towards root so that a given cgroup's populated_cnt is zero iff the
- * cgroup and all its descendants are empty.
+ * One of the css_sets associated with @cgrp is either getting its first
+ * task or losing the last. Update @cgrp->populated_cnt accordingly. The
+ * count is propagated towards root so that a given cgroup's populated_cnt
+ * is zero iff the cgroup and all its descendants don't contain any tasks.
  *
  * @cgrp's interface file "cgroup.populated" is zero if
  * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
@@ -503,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */
  */
 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 {
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	do {
 		bool trigger;
@@ -516,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (!trigger)
 			break;
 
-		if (cgrp->populated_kn)
-			kernfs_notify(cgrp->populated_kn);
+		check_for_release(cgrp);
+		cgroup_file_notify(&cgrp->events_file);
+
 		cgrp = cgroup_parent(cgrp);
 	} while (cgrp);
 }
 
+/**
+ * css_set_update_populated - update populated state of a css_set
+ * @cset: target css_set
+ * @populated: whether @cset is populated or depopulated
+ *
+ * @cset is either getting the first task or losing the last. Update the
+ * ->populated_cnt of all associated cgroups accordingly.
+ */
+static void css_set_update_populated(struct css_set *cset, bool populated)
+{
+	struct cgrp_cset_link *link;
+
+	lockdep_assert_held(&css_set_lock);
+
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+		cgroup_update_populated(link->cgrp, populated);
+}
+
+/**
+ * css_set_move_task - move a task from one css_set to another
+ * @task: task being moved
+ * @from_cset: css_set @task currently belongs to (may be NULL)
+ * @to_cset: new css_set @task is being moved to (may be NULL)
+ * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ *
+ * Move @task from @from_cset to @to_cset. If @task didn't belong to any
+ * css_set, @from_cset can be NULL. If @task is being disassociated
+ * instead of moved, @to_cset can be NULL.
+ *
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
+ */
+static void css_set_move_task(struct task_struct *task,
+			      struct css_set *from_cset, struct css_set *to_cset,
+			      bool use_mg_tasks)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	if (from_cset) {
+		struct css_task_iter *it, *pos;
+
+		WARN_ON_ONCE(list_empty(&task->cg_list));
+
+		/*
+		 * @task is leaving, advance task iterators which are
+		 * pointing to it so that they can resume at the next
+		 * position. Advancing an iterator might remove it from
+		 * the list, use safe walk. See css_task_iter_advance*()
+		 * for details.
+		 */
+		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+					 iters_node)
+			if (it->task_pos == &task->cg_list)
+				css_task_iter_advance(it);
+
+		list_del_init(&task->cg_list);
+		if (!css_set_populated(from_cset))
+			css_set_update_populated(from_cset, false);
+	} else {
+		WARN_ON_ONCE(!list_empty(&task->cg_list));
+	}
+
+	if (to_cset) {
+		/*
+		 * We are synchronized through cgroup_threadgroup_rwsem
+		 * against PF_EXITING setting such that we can't race
+		 * against cgroup_exit() changing the css_set to
+		 * init_css_set and dropping the old one.
+		 */
+		WARN_ON_ONCE(task->flags & PF_EXITING);
+
+		if (!css_set_populated(to_cset))
+			css_set_update_populated(to_cset, true);
+		rcu_assign_pointer(task->cgroups, to_cset);
+		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+							     &to_cset->tasks);
+	}
+}
+
 /*
  * hash table for cgroup groups. This improves the performance to find
  * an existing css_set. This hash doesn't (currently) take into
@@ -549,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset)
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	if (!atomic_dec_and_test(&cset->refcount))
 		return;
@@ -561,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset)
 	css_set_count--;
 
 	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *cgrp = link->cgrp;
-
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
-
-		/* @cgrp can't go away while we're holding css_set_rwsem */
-		if (list_empty(&cgrp->cset_links)) {
-			cgroup_update_populated(cgrp, false);
-			check_for_release(cgrp);
-		}
-
+		if (cgroup_parent(link->cgrp))
+			cgroup_put(link->cgrp);
 		kfree(link);
 	}
 
@@ -588,9 +781,9 @@ static void put_css_set(struct css_set *cset)
 	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	put_css_set_locked(cset);
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 /*
@@ -779,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 	link->cset = cset;
 	link->cgrp = cgrp;
 
-	if (list_empty(&cgrp->cset_links))
-		cgroup_update_populated(cgrp, true);
-	list_move(&link->cset_link, &cgrp->cset_links);
-
 	/*
-	 * Always add links to the tail of the list so that the list
-	 * is sorted by order of hierarchy creation
+	 * Always add links to the tail of the lists so that the lists are
+	 * in chronological order.
 	 */
+	list_move_tail(&link->cset_link, &cgrp->cset_links);
 	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+
+	if (cgroup_parent(cgrp))
+		cgroup_get(cgrp);
 }
 
 /**
@@ -813,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	cset = find_existing_css_set(old_cset, cgrp, template);
 	if (cset)
 		get_css_set(cset);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	if (cset)
 		return cset;
@@ -838,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	INIT_LIST_HEAD(&cset->mg_tasks);
 	INIT_LIST_HEAD(&cset->mg_preload_node);
 	INIT_LIST_HEAD(&cset->mg_node);
+	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
 	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
@@ -866,53 +1060,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 			list_add_tail(&cset->e_cset_node[ssid],
 				      &cset->subsys[ssid]->cgroup->e_csets[ssid]);
 
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	return cset;
 }
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk)
-{
-	down_read(&tsk->signal->group_rwsem);
-}
-
-void cgroup_threadgroup_change_end(struct task_struct *tsk)
-{
-	up_read(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid. This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit explicitly call threadgroup_change_{begin|end}() for
- * synchronization. While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static void threadgroup_lock(struct task_struct *tsk)
-{
-	down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
-	up_write(&tsk->signal->group_rwsem);
-}
-
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -972,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	 * Release all the links from cset_links to this hierarchy's
 	 * root cgroup
 	 */
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
 		kfree(link);
 	}
-	up_write(&css_set_rwsem);
+
+	spin_unlock_bh(&css_set_lock);
 
 	if (!list_empty(&root->root_list)) {
 		list_del(&root->root_list);
@@ -1001,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 	struct cgroup *res = NULL;
 
 	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	if (cset == &init_css_set) {
 		res = &root->cgrp;
@@ -1024,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_rwsem held.
+ * called with cgroup_mutex and css_set_lock held.
  */
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroup_root *root)
@@ -1063,7 +1216,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -1086,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
  * cgroup_file_mode - deduce file mode of a control file
  * @cft: the control file in question
  *
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
+ * S_IRUGO for read, S_IWUSR for write.
  */
 static umode_t cgroup_file_mode(const struct cftype *cft)
 {
 	umode_t mode = 0;
 
-	if (cft->mode)
-		return cft->mode;
-
 	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
 		mode |= S_IRUGO;
 
-	if (cft->write_u64 || cft->write_s64 || cft->write)
-		mode |= S_IWUSR;
+	if (cft->write_u64 || cft->write_s64 || cft->write) {
+		if (cft->flags & CFTYPE_WORLD_WRITABLE)
+			mode |= S_IWUGO;
+		else
+			mode |= S_IWUSR;
+	}
 
 	return mode;
 }
 
-static void cgroup_get(struct cgroup *cgrp)
-{
-	WARN_ON_ONCE(cgroup_is_dead(cgrp));
-	css_get(&cgrp->self);
-}
-
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
-	return css_tryget(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
-	css_put(&cgrp->self);
-}
-
 /**
  * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
  * @cgrp: the target cgroup
@@ -1263,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 }
 
 /**
- * cgroup_clear_dir - remove subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be removed
+ * css_clear_dir - remove subsys files in a cgroup directory
+ * @css: target css
+ * @cgrp_override: specify if target cgroup is different from css->cgroup
  */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void css_clear_dir(struct cgroup_subsys_state *css,
+			  struct cgroup *cgrp_override)
 {
-	struct cgroup_subsys *ss;
-	int i;
+	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+	struct cftype *cfts;
 
-	for_each_subsys(ss, i) {
-		struct cftype *cfts;
+	list_for_each_entry(cfts, &css->ss->cfts, node)
+		cgroup_addrm_files(css, cgrp, cfts, false);
+}
 
-		if (!(subsys_mask & (1 << i)))
-			continue;
-		list_for_each_entry(cfts, &ss->cfts, node)
-			cgroup_addrm_files(cgrp, cfts, false);
+/**
+ * css_populate_dir - create subsys files in a cgroup directory
+ * @css: target css
+ * @cgrp_override: specify if target cgroup is different from css->cgroup
+ *
+ * On failure, no file is added.
+ */
+static int css_populate_dir(struct cgroup_subsys_state *css,
+			    struct cgroup *cgrp_override)
+{
+	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+	struct cftype *cfts, *failed_cfts;
+	int ret;
+
+	if (!css->ss) {
+		if (cgroup_on_dfl(cgrp))
+			cfts = cgroup_dfl_base_files;
+		else
+			cfts = cgroup_legacy_base_files;
+
+		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+	}
+
+	list_for_each_entry(cfts, &css->ss->cfts, node) {
+		ret = cgroup_addrm_files(css, cgrp, cfts, true);
+		if (ret < 0) {
+			failed_cfts = cfts;
+			goto err;
+		}
 	}
+	return 0;
+err:
+	list_for_each_entry(cfts, &css->ss->cfts, node) {
+		if (cfts == failed_cfts)
+			break;
+		cgroup_addrm_files(css, cgrp, cfts, false);
+	}
+	return ret;
 }
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask)
 {
+	struct cgroup *dcgrp = &dst_root->cgrp;
 	struct cgroup_subsys *ss;
 	unsigned long tmp_ss_mask;
 	int ssid, i, ret;
@@ -1306,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 	if (dst_root == &cgrp_dfl_root)
 		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
 
-	ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
-	if (ret) {
-		if (dst_root != &cgrp_dfl_root)
-			return ret;
+	for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+		struct cgroup *scgrp = &ss->root->cgrp;
+		int tssid;
+
+		ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
+		if (!ret)
+			continue;
 
 		/*
 		 * Rebinding back to the default root is not allowed to
@@ -1317,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 		 * be rare. Moving subsystems back and forth even more so.
 		 * Just warn about it and continue.
 		 */
-		if (cgrp_dfl_root_visible) {
-			pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
-				ret, ss_mask);
-			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+		if (dst_root == &cgrp_dfl_root) {
+			if (cgrp_dfl_root_visible) {
+				pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
+					ret, ss_mask);
+				pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+			}
+			continue;
+		}
+
+		for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
+			if (tssid == ssid)
+				break;
+			css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
 		}
+		return ret;
 	}
 
 	/*
 	 * Nothing can fail from this point on. Remove files for the
 	 * removed subsystems and rebind each subsystem.
 	 */
-	for_each_subsys_which(ss, ssid, &ss_mask)
-		cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
-
 	for_each_subsys_which(ss, ssid, &ss_mask) {
-		struct cgroup_root *src_root;
-		struct cgroup_subsys_state *css;
+		struct cgroup_root *src_root = ss->root;
+		struct cgroup *scgrp = &src_root->cgrp;
+		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
 		struct css_set *cset;
 
-		src_root = ss->root;
-		css = cgroup_css(&src_root->cgrp, ss);
+		WARN_ON(!css || cgroup_css(dcgrp, ss));
 
-		WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+		css_clear_dir(css, NULL);
 
-		RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
-		rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
+		rcu_assign_pointer(dcgrp->subsys[ssid], css);
 		ss->root = dst_root;
-		css->cgroup = &dst_root->cgrp;
+		css->cgroup = dcgrp;
 
-		down_write(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		hash_for_each(css_set_table, i, cset, hlist)
 			list_move_tail(&cset->e_cset_node[ss->id],
-				       &dst_root->cgrp.e_csets[ss->id]);
-		up_write(&css_set_rwsem);
+				       &dcgrp->e_csets[ss->id]);
+		spin_unlock_bh(&css_set_lock);
 
 		src_root->subsys_mask &= ~(1 << ssid);
-		src_root->cgrp.subtree_control &= ~(1 << ssid);
-		cgroup_refresh_child_subsys_mask(&src_root->cgrp);
+		scgrp->subtree_control &= ~(1 << ssid);
+		cgroup_refresh_child_subsys_mask(scgrp);
 
 		/* default hierarchy doesn't enable controllers by default */
 		dst_root->subsys_mask |= 1 << ssid;
-		if (dst_root != &cgrp_dfl_root) {
-			dst_root->cgrp.subtree_control |= 1 << ssid;
-			cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+		if (dst_root == &cgrp_dfl_root) {
+			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
+		} else {
+			dcgrp->subtree_control |= 1 << ssid;
+			cgroup_refresh_child_subsys_mask(dcgrp);
+			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
 		}
 
 		if (ss->bind)
 			ss->bind(css);
 	}
 
-	kernfs_activate(dst_root->cgrp.kn);
+	kernfs_activate(dcgrp->kn);
 	return 0;
 }
 
@@ -1497,7 +1680,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	for_each_subsys(ss, i) {
 		if (strcmp(token, ss->legacy_name))
 			continue;
-		if (ss->disabled)
+		if (!cgroup_ssid_enabled(i))
 			continue;
 
 		/* Mutually exclusive option 'all' + subsystem name */
@@ -1528,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 */
 	if (all_ss || (!one_ss && !opts->none && !opts->name))
 		for_each_subsys(ss, i)
-			if (!ss->disabled)
+			if (cgroup_ssid_enabled(i))
 				opts->subsys_mask |= (1 << i);
 
 	/*
@@ -1624,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	if (use_task_css_set_links)
 		goto out_unlock;
@@ -1654,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void)
 		if (!(p->flags & PF_EXITING)) {
 			struct css_set *cset = task_css_set(p);
 
-			list_add(&p->cg_list, &cset->tasks);
+			if (!css_set_populated(cset))
+				css_set_update_populated(cset, true);
+			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
 		spin_unlock_irq(&p->sighand->siglock);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 out_unlock:
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1671,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 
 	INIT_LIST_HEAD(&cgrp->self.sibling);
 	INIT_LIST_HEAD(&cgrp->self.children);
+	INIT_LIST_HEAD(&cgrp->self.files);
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
@@ -1708,7 +1894,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
-	struct cftype *base_files;
 	struct css_set *cset;
 	int i, ret;
 
@@ -1725,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 		goto out;
 
 	/*
-	 * We're accessing css_set_count without locking css_set_rwsem here,
+	 * We're accessing css_set_count without locking css_set_lock here,
 	 * but that's OK - it can only be increased by someone holding
 	 * cgroup_lock, and that's us. The worst that can happen is that we
 	 * have some link structures left over
@@ -1747,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	}
 	root_cgrp->kn = root->kf_root->kn;
 
-	if (root == &cgrp_dfl_root)
-		base_files = cgroup_dfl_base_files;
-	else
-		base_files = cgroup_legacy_base_files;
-
-	ret = cgroup_addrm_files(root_cgrp, base_files, true);
+	ret = css_populate_dir(&root_cgrp->self, NULL);
 	if (ret)
 		goto destroy_root;
 
@@ -1772,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	 * Link the root cgroup in this hierarchy into all the css_set
 	 * objects.
 	 */
-	down_write(&css_set_rwsem);
-	hash_for_each(css_set_table, i, cset, hlist)
+	spin_lock_bh(&css_set_lock);
+	hash_for_each(css_set_table, i, cset, hlist) {
 		link_css_set(&tmp_links, cset, root_cgrp);
-	up_write(&css_set_rwsem);
+		if (css_set_populated(cset))
+			cgroup_update_populated(root_cgrp, true);
+	}
+	spin_unlock_bh(&css_set_lock);
 
 	BUG_ON(!list_empty(&root_cgrp->self.children));
 	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2008,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 	char *path = NULL;
 
 	mutex_lock(&cgroup_mutex);
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
 
@@ -2021,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 		path = buf;
 	}
 
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	return path;
 }
@@ -2049,6 +2232,49 @@ struct cgroup_taskset {
 	struct task_struct	*cur_task;
 };
 
+#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
+	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
+	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
+	.csets			= &tset.src_csets,			\
+}
+
+/**
+ * cgroup_taskset_add - try to add a migration target task to a taskset
+ * @task: target task
+ * @tset: target taskset
+ *
+ * Add @task, which is a migration target, to @tset. This function becomes
+ * a noop if @task doesn't need to be migrated. @task's css_set should have
+ * been added as a migration source and @task->cg_list will be moved from
+ * the css_set's tasks list to mg_tasks one.
+ */
+static void cgroup_taskset_add(struct task_struct *task,
+			       struct cgroup_taskset *tset)
+{
+	struct css_set *cset;
+
+	lockdep_assert_held(&css_set_lock);
+
+	/* @task either already exited or can't exit until the end */
+	if (task->flags & PF_EXITING)
+		return;
+
+	/* leave @task alone if post_fork() hasn't linked it yet */
+	if (list_empty(&task->cg_list))
+		return;
+
+	cset = task_css_set(task);
+	if (!cset->mg_src_cgrp)
+		return;
+
+	list_move_tail(&task->cg_list, &cset->mg_tasks);
+	if (list_empty(&cset->mg_node))
+		list_add_tail(&cset->mg_node, &tset->src_csets);
+	if (list_empty(&cset->mg_dst_cset->mg_node))
+		list_move_tail(&cset->mg_dst_cset->mg_node,
+			       &tset->dst_csets);
+}
+
 /**
  * cgroup_taskset_first - reset taskset and return the first task
  * @tset: taskset of interest
@@ -2096,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 }
 
 /**
- * cgroup_task_migrate - move a task from one cgroup to another.
- * @old_cgrp: the cgroup @tsk is being migrated from
- * @tsk: the task being migrated
- * @new_cset: the new css_set @tsk is being attached to
+ * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * @tset: target taskset
+ * @dst_cgrp: destination cgroup
  *
- * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
+ * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
+ * ->can_attach callbacks fails and guarantees that either all or none of
+ * the tasks in @tset are migrated. @tset is consumed regardless of
+ * success.
  */
-static void cgroup_task_migrate(struct cgroup *old_cgrp,
-				struct task_struct *tsk,
-				struct css_set *new_cset)
+static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
+				  struct cgroup *dst_cgrp)
 {
-	struct css_set *old_cset;
-
-	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
-
-	/*
-	 * We are synchronized through threadgroup_lock() against PF_EXITING
-	 * setting such that we can't race against cgroup_exit() changing the
-	 * css_set to init_css_set and dropping the old one.
-	 */
-	WARN_ON_ONCE(tsk->flags & PF_EXITING);
-	old_cset = task_css_set(tsk);
+	struct cgroup_subsys_state *css, *failed_css = NULL;
+	struct task_struct *task, *tmp_task;
+	struct css_set *cset, *tmp_cset;
+	int i, ret;
 
-	get_css_set(new_cset);
-	rcu_assign_pointer(tsk->cgroups, new_cset);
+	/* methods shouldn't be called if no task is actually migrating */
+	if (list_empty(&tset->src_csets))
+		return 0;
 
-	/*
-	 * Use move_tail so that cgroup_taskset_first() still returns the
-	 * leader after migration. This works because cgroup_migrate()
-	 * ensures that the dst_cset of the leader is the first on the
-	 * tset's dst_csets list.
-	 */
-	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
+	/* check that we can legitimately attach to the cgroup */
+	for_each_e_css(css, i, dst_cgrp) {
+		if (css->ss->can_attach) {
+			ret = css->ss->can_attach(css, tset);
+			if (ret) {
+				failed_css = css;
+				goto out_cancel_attach;
+			}
+		}
+	}
 
 	/*
-	 * We just gained a reference on old_cset by taking it from the
-	 * task. As trading it for new_cset is protected by cgroup_mutex,
-	 * we're safe to drop it here; it will be freed under RCU.
+	 * Now that we're guaranteed success, proceed to move all tasks to
+	 * the new cgroup. There are no failure cases after here, so this
+	 * is the commit point.
 	 */
-	put_css_set_locked(old_cset);
+	spin_lock_bh(&css_set_lock);
+	list_for_each_entry(cset, &tset->src_csets, mg_node) {
+		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
+			struct css_set *from_cset = task_css_set(task);
+			struct css_set *to_cset = cset->mg_dst_cset;
+
+			get_css_set(to_cset);
+			css_set_move_task(task, from_cset, to_cset, true);
+			put_css_set_locked(from_cset);
+		}
+	}
+	spin_unlock_bh(&css_set_lock);
+
+	/*
+	 * Migration is committed, all target tasks are now on dst_csets.
+	 * Nothing is sensitive to fork() after this point. Notify
+	 * controllers that migration is complete.
+	 */
+	tset->csets = &tset->dst_csets;
+
+	for_each_e_css(css, i, dst_cgrp)
+		if (css->ss->attach)
+			css->ss->attach(css, tset);
+
+	ret = 0;
+	goto out_release_tset;
+
+out_cancel_attach:
+	for_each_e_css(css, i, dst_cgrp) {
+		if (css == failed_css)
+			break;
+		if (css->ss->cancel_attach)
+			css->ss->cancel_attach(css, tset);
+	}
+out_release_tset:
+	spin_lock_bh(&css_set_lock);
+	list_splice_init(&tset->dst_csets, &tset->src_csets);
+	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
+		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+		list_del_init(&cset->mg_node);
+	}
+	spin_unlock_bh(&css_set_lock);
+	return ret;
 }
@@ -2152,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cset = NULL;
 		list_del_init(&cset->mg_preload_node);
 		put_css_set_locked(cset);
 	}
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 /**
@@ -2172,10 +2437,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process. Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process. Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2184,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
 	struct cgroup *src_cgrp;
 
 	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
@@ -2273,12 +2539,12 @@ err:
 
 /**
  * cgroup_migrate - migrate a process or task to a cgroup
- * @cgrp: the destination cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
+ * @cgrp: the destination cgroup
  *
  * Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding threadgroup_lock of @leader. The
+ * process, the caller must be holding cgroup_threadgroup_rwsem. The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
@@ -2289,115 +2555,29 @@ err:
  * decided for all targets by invoking group_migrate_prepare_dst() before
  * actually starting migrating.
  */
-static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
-			  bool threadgroup)
+static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+			  struct cgroup *cgrp)
 {
-	struct cgroup_taskset tset = {
-		.src_csets = LIST_HEAD_INIT(tset.src_csets),
-		.dst_csets = LIST_HEAD_INIT(tset.dst_csets),
-		.csets = &tset.src_csets,
-	};
-	struct cgroup_subsys_state *css, *failed_css = NULL;
-	struct css_set *cset, *tmp_cset;
-	struct task_struct *task, *tmp_task;
-	int i, ret;
+	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+	struct task_struct *task;
 
 	/*
 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
 	 * already PF_EXITING could be freed from underneath us unless we
 	 * take an rcu_read_lock.
 	 */
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
-		/* @task either already exited or can't exit until the end */
-		if (task->flags & PF_EXITING)
-			goto next;
-
-		/* leave @task alone if post_fork() hasn't linked it yet */
-		if (list_empty(&task->cg_list))
-			goto next;
-
-		cset = task_css_set(task);
-		if (!cset->mg_src_cgrp)
-			goto next;
-
-		/*
-		 * cgroup_taskset_first() must always return the leader.
-		 * Take care to avoid disturbing the ordering.
-		 */
-		list_move_tail(&task->cg_list, &cset->mg_tasks);
-		if (list_empty(&cset->mg_node))
-			list_add_tail(&cset->mg_node, &tset.src_csets);
-		if (list_empty(&cset->mg_dst_cset->mg_node))
-			list_move_tail(&cset->mg_dst_cset->mg_node,
-				       &tset.dst_csets);
-	next:
+		cgroup_taskset_add(task, &tset);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	up_write(&css_set_rwsem);
-
-	/* methods shouldn't be called if no task is actually migrating */
-	if (list_empty(&tset.src_csets))
-		return 0;
-
-	/* check that we can legitimately attach to the cgroup */
-	for_each_e_css(css, i, cgrp) {
-		if (css->ss->can_attach) {
-			ret = css->ss->can_attach(css, &tset);
-			if (ret) {
-				failed_css = css;
-				goto out_cancel_attach;
-			}
-		}
-	}
-
-	/*
-	 * Now that we're guaranteed success, proceed to move all tasks to
-	 * the new cgroup. There are no failure cases after here, so this
-	 * is the commit point.
-	 */
-	down_write(&css_set_rwsem);
-	list_for_each_entry(cset, &tset.src_csets, mg_node) {
-		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
-			cgroup_task_migrate(cset->mg_src_cgrp, task,
-					    cset->mg_dst_cset);
-	}
-	up_write(&css_set_rwsem);
-
-	/*
-	 * Migration is committed, all target tasks are now on dst_csets.
-	 * Nothing is sensitive to fork() after this point. Notify
-	 * controllers that migration is complete.
-	 */
-	tset.csets = &tset.dst_csets;
-
-	for_each_e_css(css, i, cgrp)
-		if (css->ss->attach)
-			css->ss->attach(css, &tset);
-
-	ret = 0;
-	goto out_release_tset;
+	spin_unlock_bh(&css_set_lock);
 
-out_cancel_attach:
-	for_each_e_css(css, i, cgrp) {
-		if (css == failed_css)
-			break;
-		if (css->ss->cancel_attach)
-			css->ss->cancel_attach(css, &tset);
-	}
-out_release_tset:
-	down_write(&css_set_rwsem);
-	list_splice_init(&tset.dst_csets, &tset.src_csets);
-	list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
-		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
-		list_del_init(&cset->mg_node);
-	}
-	up_write(&css_set_rwsem);
-	return ret;
+	return cgroup_taskset_migrate(&tset, cgrp);
 }
 
 /**
@@ -2406,7 +2586,7 @@ out_release_tset:
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2416,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	int ret;
 
 	/* look up all src csets */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
@@ -2426,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	/* prepare dst csets and commit */
 	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
 	if (!ret)
-		ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+		ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
 
 	cgroup_migrate_finish(&preloaded_csets);
 	return ret;
@@ -2459,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 	struct cgroup *cgrp;
 	struct inode *inode;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	while (!cgroup_is_descendant(dst_cgrp, cgrp))
 		cgrp = cgroup_parent(cgrp);
 
 	ret = -ENOMEM;
-	inode = kernfs_get_inode(sb, cgrp->procs_kn);
+	inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
 	if (inode) {
 		ret = inode_permission(inode, MAY_WRITE);
 		iput(inode);
@@ -2498,14 +2678,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	if (!cgrp)
 		return -ENODEV;
 
-retry_find_task:
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	rcu_read_lock();
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
-			rcu_read_unlock();
 			ret = -ESRCH;
-			goto out_unlock_cgroup;
+			goto out_unlock_rcu;
 		}
 	} else {
 		tsk = current;
@@ -2521,37 +2700,23 @@ retry_find_task:
 	 */
 	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
-		rcu_read_unlock();
-		goto out_unlock_cgroup;
+		goto out_unlock_rcu;
 	}
 
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	threadgroup_lock(tsk);
-	if (threadgroup) {
-		if (!thread_group_leader(tsk)) {
-			/*
-			 * a race with de_thread from another thread's exec()
-			 * may strip us of our leadership, if this happens,
-			 * there is no choice but to throw this task away and
-			 * try again; this is
-			 * "double-double-toil-and-trouble-check locking".
-			 */
-			threadgroup_unlock(tsk);
-			put_task_struct(tsk);
-			goto retry_find_task;
-		}
-	}
-
 	ret = cgroup_procs_write_permission(tsk, cgrp, of);
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	threadgroup_unlock(tsk);
-
 	put_task_struct(tsk);
-out_unlock_cgroup:
+	goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+	rcu_read_unlock();
+out_unlock_threadgroup:
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cgroup_kn_unlock(of->kn);
 	return ret ?: nbytes;
 }
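With cgroup_threadgroup_rwsem held for writing across the whole operation, fork and exec cannot change the threadgroup underneath the attach, which is why the retry_find_task loop and its leadership re-check disappear above. A rough userspace analogue of the locking split, using a pthread rwlock in place of the kernel's percpu_rwsem (illustrative only; the real read side is per-CPU and nearly free):

#include <pthread.h>

static pthread_rwlock_t threadgroup_lock = PTHREAD_RWLOCK_INITIALIZER;

static void fork_path(void)
{
	pthread_rwlock_rdlock(&threadgroup_lock);	/* cheap, concurrent */
	/* ... link the new task into its group ... */
	pthread_rwlock_unlock(&threadgroup_lock);
}

static int migrate_path(void)
{
	int ret = 0;

	pthread_rwlock_wrlock(&threadgroup_lock);	/* excludes all forks */
	/* ... look up the leader and attach the whole group; the group
	 *     cannot change under us, so no leadership re-check ... */
	pthread_rwlock_unlock(&threadgroup_lock);
	return ret;
}
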
@@ -2573,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (root == &cgrp_dfl_root)
 			continue;
 
-		down_read(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		from_cgrp = task_cgroup_from_root(from, root);
-		up_read(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 
 		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
@@ -2690,14 +2855,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
 	LIST_HEAD(preloaded_csets);
+	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 	struct cgroup_subsys_state *css;
 	struct css_set *src_cset;
 	int ret;
 
 	lockdep_assert_held(&cgroup_mutex);
 
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
 	/* look up all csses currently attached to @cgrp's subtree */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
 		struct cgrp_cset_link *link;
 
@@ -2709,68 +2877,31 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 			cgroup_migrate_add_src(link->cset, cgrp,
 					       &preloaded_csets);
 	}
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
 	if (ret)
 		goto out_finish;
 
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
-		struct task_struct *last_task = NULL, *task;
+		struct task_struct *task, *ntask;
 
 		/* src_csets precede dst_csets, break on the first dst_cset */
 		if (!src_cset->mg_src_cgrp)
 			break;
 
-		/*
-		 * All tasks in src_cset need to be migrated to the
-		 * matching dst_cset.  Empty it process by process.  We
-		 * walk tasks but migrate processes.  The leader might even
-		 * belong to a different cset but such src_cset would also
-		 * be among the target src_csets because the default
-		 * hierarchy enforces per-process membership.
-		 */
-		while (true) {
-			down_read(&css_set_rwsem);
-			task = list_first_entry_or_null(&src_cset->tasks,
-						struct task_struct, cg_list);
-			if (task) {
-				task = task->group_leader;
-				WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
-				get_task_struct(task);
-			}
-			up_read(&css_set_rwsem);
-
-			if (!task)
-				break;
-
-			/* guard against possible infinite loop */
-			if (WARN(last_task == task,
-				 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
-				goto out_finish;
-			last_task = task;
-
-			threadgroup_lock(task);
-			/* raced against de_thread() from another thread? */
-			if (!thread_group_leader(task)) {
-				threadgroup_unlock(task);
-				put_task_struct(task);
-				continue;
-			}
-
-			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
-
-			threadgroup_unlock(task);
-			put_task_struct(task);
-
-			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
-				goto out_finish;
-		}
+		/* all tasks in src_csets need to be migrated */
+		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
+			cgroup_taskset_add(task, &tset);
 	}
+	spin_unlock_bh(&css_set_lock);
 
+	ret = cgroup_taskset_migrate(&tset, cgrp);
 out_finish:
 	cgroup_migrate_finish(&preloaded_csets);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
 
@@ -2797,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 		if (tok[0] == '\0')
 			continue;
 		for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
-			if (ss->disabled || strcmp(tok + 1, ss->name))
+			if (!cgroup_ssid_enabled(ssid) ||
+			    strcmp(tok + 1, ss->name))
 				continue;
 
 			if (*tok == '+') {
@@ -2921,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 				ret = create_css(child, ss,
 					cgrp->subtree_control & (1 << ssid));
 			else
-				ret = cgroup_populate_dir(child, 1 << ssid);
+				ret = css_populate_dir(cgroup_css(child, ss),
+						       NULL);
 			if (ret)
 				goto err_undo_css;
 		}
@@ -2954,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 			if (css_disable & (1 << ssid)) {
 				kill_css(css);
 			} else {
-				cgroup_clear_dir(child, 1 << ssid);
+				css_clear_dir(css, NULL);
 				if (ss->css_reset)
 					ss->css_reset(css);
 			}
@@ -3002,15 +3135,16 @@ err_undo_css:
 			if (css_enable & (1 << ssid))
 				kill_css(css);
 			else
-				cgroup_clear_dir(child, 1 << ssid);
+				css_clear_dir(css, NULL);
 		}
 	}
 	goto out_unlock;
 }
 
-static int cgroup_populated_show(struct seq_file *seq, void *v)
+static int cgroup_events_show(struct seq_file *seq, void *v)
 {
-	seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+	seq_printf(seq, "populated %d\n",
+		   cgroup_is_populated(seq_css(seq)->cgroup));
 	return 0;
 }
 
@@ -3153,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 	return kernfs_setattr(kn, &iattr);
 }
 
-static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+			   struct cftype *cft)
 {
 	char name[CGROUP_FILE_NAME_MAX];
 	struct kernfs_node *kn;
@@ -3175,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 		return ret;
 	}
 
-	if (cft->write == cgroup_procs_write)
-		cgrp->procs_kn = kn;
-	else if (cft->seq_show == cgroup_populated_show)
-		cgrp->populated_kn = kn;
+	if (cft->file_offset) {
+		struct cgroup_file *cfile = (void *)css + cft->file_offset;
+
+		kernfs_get(kn);
+		cfile->kn = kn;
+		list_add(&cfile->node, &css->files);
+	}
+
 	return 0;
 }
 
 /**
  * cgroup_addrm_files - add or remove files to a cgroup directory
- * @cgrp: the target cgroup
+ * @css: the target css
+ * @cgrp: the target cgroup (usually css->cgroup)
  * @cfts: array of cftypes to be added
  * @is_add: whether to add or remove
  *
  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
- * For removals, this function never fails.  If addition fails, this
- * function doesn't remove files already added.  The caller is responsible
- * for cleaning up.
+ * For removals, this function never fails.
  */
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add)
 {
-	struct cftype *cft;
+	struct cftype *cft, *cft_end = NULL;
 	int ret;
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+restart:
+	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
 			continue;
@@ -3213,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			continue;
 
 		if (is_add) {
-			ret = cgroup_add_file(cgrp, cft);
+			ret = cgroup_add_file(css, cgrp, cft);
 			if (ret) {
 				pr_warn("%s: failed to add %s, err=%d\n",
 					__func__, cft->name, ret);
-				return ret;
+				cft_end = cft;
+				is_add = false;
+				goto restart;
 			}
 		} else {
 			cgroup_rm_file(cgrp, cft);
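Note how cgroup_addrm_files() now cleans up after itself: on a failed add it records the failing entry in cft_end, flips is_add, and restarts the loop in removal mode, which is why the "caller is responsible for cleaning up" contract could be deleted from the comment above. The idiom in isolation (item/add_item/remove_item are hypothetical, sketch only):

struct item { const char *name; int added; };

static int add_item(struct item *it)     { it->added = 1; return 0; }
static void remove_item(struct item *it) { it->added = 0; }

static int addrm(struct item *items, int n, int is_add)
{
	int i, end = n, ret = 0;
restart:
	for (i = 0; i < end; i++) {
		if (is_add) {
			ret = add_item(&items[i]);
			if (ret) {
				end = i;	/* stop before the failure */
				is_add = 0;	/* and undo what was added */
				goto restart;
			}
		} else {
			remove_item(&items[i]);
		}
	}
	return ret;	/* 0, or the error that triggered the rollback */
}
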
@@ -3243,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 		if (cgroup_is_dead(cgrp))
 			continue;
 
-		ret = cgroup_addrm_files(cgrp, cfts, is_add);
+		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
 		if (ret)
 			break;
 	}
@@ -3355,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	int ret;
 
-	if (ss->disabled)
+	if (!cgroup_ssid_enabled(ss->id))
 		return 0;
 
 	if (!cfts || cfts[0].name[0] == '\0')
@@ -3405,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype *cft;
 
-	/*
-	 * If legacy_files_on_dfl, we want to show the legacy files on the
-	 * dfl hierarchy but iff the target subsystem hasn't been updated
-	 * for the dfl hierarchy yet.
-	 */
-	if (!cgroup_legacy_files_on_dfl ||
-	    ss->dfl_cftypes != ss->legacy_cftypes) {
-		for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
-			cft->flags |= __CFTYPE_NOT_ON_DFL;
-	}
-
+	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+		cft->flags |= __CFTYPE_NOT_ON_DFL;
 	return cgroup_add_cftypes(ss, cfts);
 }
 
@@ -3430,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 	int count = 0;
 	struct cgrp_cset_link *link;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		count += atomic_read(&link->cset->refcount);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	return count;
 }
 
@@ -3665,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
 }
 
 /**
- * css_advance_task_iter - advance a task iterator to the next css_set
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
  * @it: the iterator to advance
  *
  * Advance @it to the next css_set to walk.
  */
-static void css_advance_task_iter(struct css_task_iter *it)
+static void css_task_iter_advance_css_set(struct css_task_iter *it)
 {
 	struct list_head *l = it->cset_pos;
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 
+	lockdep_assert_held(&css_set_lock);
+
 	/* Advance to the next non-empty css_set */
 	do {
 		l = l->next;
 		if (l == it->cset_head) {
 			it->cset_pos = NULL;
+			it->task_pos = NULL;
 			return;
 		}
 
@@ -3691,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
 			link = list_entry(l, struct cgrp_cset_link, cset_link);
 			cset = link->cset;
 		}
-	} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+	} while (!css_set_populated(cset));
 
 	it->cset_pos = l;
 
@@ -3702,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it)
 
 	it->tasks_head = &cset->tasks;
 	it->mg_tasks_head = &cset->mg_tasks;
+
+	/*
+	 * We don't keep css_sets locked across iteration steps and thus
+	 * need to take steps to ensure that iteration can be resumed after
+	 * the lock is re-acquired.  Iteration is performed at two levels -
+	 * css_sets and tasks in them.
+	 *
+	 * Once created, a css_set never leaves its cgroup lists, so a
+	 * pinned css_set is guaranteed to stay put and we can resume
+	 * iteration afterwards.
+	 *
+	 * Tasks may leave @cset across iteration steps.  This is resolved
+	 * by registering each iterator with the css_set currently being
+	 * walked and making css_set_move_task() advance iterators whose
+	 * next task is leaving.
+	 */
+	if (it->cur_cset) {
+		list_del(&it->iters_node);
+		put_css_set_locked(it->cur_cset);
+	}
+	get_css_set(cset);
+	it->cur_cset = cset;
+	list_add(&it->iters_node, &cset->task_iters);
+}
+
+static void css_task_iter_advance(struct css_task_iter *it)
+{
+	struct list_head *l = it->task_pos;
+
+	lockdep_assert_held(&css_set_lock);
+	WARN_ON_ONCE(!l);
+
+	/*
+	 * Advance iterator to find next entry.  cset->tasks is consumed
+	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
+	 * next cset.
+	 */
+	l = l->next;
+
+	if (l == it->tasks_head)
+		l = it->mg_tasks_head->next;
+
+	if (l == it->mg_tasks_head)
+		css_task_iter_advance_css_set(it);
+	else
+		it->task_pos = l;
 }
 
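The comment block added above is the heart of the new iteration scheme: since css_set_lock is dropped between steps, each live iterator pins its current css_set and registers itself on the set's task_iters list so that css_set_move_task() can advance any iterator whose next task is about to leave. The same cursor-registration pattern in a self-contained sketch (plain singly linked list, no kernel types):

struct node   { struct node *next; };
struct cursor { struct node *pos; struct cursor *next_cursor; };

static struct cursor *active_cursors;	/* all registered cursors */

static void remove_node(struct node **head, struct node *victim)
{
	struct node **pp;
	struct cursor *c;

	/* advance any cursor parked on @victim before unlinking it */
	for (c = active_cursors; c; c = c->next_cursor)
		if (c->pos == victim)
			c->pos = victim->next;

	for (pp = head; *pp; pp = &(*pp)->next)
		if (*pp == victim) {
			*pp = victim->next;
			break;
		}
}
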
 /**
@@ -3713,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it)
  * css_task_iter_next() to walk through the tasks until the function
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes.  The caller can't sleep while iteration is in
- * progress.
  */
 void css_task_iter_start(struct cgroup_subsys_state *css,
 			 struct css_task_iter *it)
-	__acquires(css_set_rwsem)
 {
 	/* no one should try to iterate before mounting cgroups */
 	WARN_ON_ONCE(!use_task_css_set_links);
 
-	down_read(&css_set_rwsem);
+	memset(it, 0, sizeof(*it));
+
+	spin_lock_bh(&css_set_lock);
 
 	it->ss = css->ss;
 
@@ -3736,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	it->cset_head = it->cset_pos;
 
-	css_advance_task_iter(it);
+	css_task_iter_advance_css_set(it);
+
+	spin_unlock_bh(&css_set_lock);
 }
 
 /**
@@ -3749,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
-	struct task_struct *res;
-	struct list_head *l = it->task_pos;
-
-	/* If the iterator cg is NULL, we have no tasks */
-	if (!it->cset_pos)
-		return NULL;
-	res = list_entry(l, struct task_struct, cg_list);
-
-	/*
-	 * Advance iterator to find next entry.  cset->tasks is consumed
-	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
-	 * next cset.
-	 */
-	l = l->next;
-
-	if (l == it->tasks_head)
-		l = it->mg_tasks_head->next;
-
-	if (l == it->mg_tasks_head)
-		css_advance_task_iter(it);
-	else
-		it->task_pos = l;
-
-	return res;
+	if (it->cur_task) {
+		put_task_struct(it->cur_task);
+		it->cur_task = NULL;
+	}
+
+	spin_lock_bh(&css_set_lock);
+
+	if (it->task_pos) {
+		it->cur_task = list_entry(it->task_pos, struct task_struct,
+					  cg_list);
+		get_task_struct(it->cur_task);
+		css_task_iter_advance(it);
+	}
+
+	spin_unlock_bh(&css_set_lock);
+
+	return it->cur_task;
 }
 
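A practical consequence of the rewrite above: the lock is now confined to each css_task_iter_next() call and the returned task is pinned by the iterator, so callers may block between steps. A usage fragment for the three iterator entry points shown in this patch (assumes a struct cgroup_subsys_state *css in scope; process_one() is a hypothetical stand-in for caller work):

	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it))) {
		/* @task stays pinned until the next step or
		 * css_task_iter_end(); sleeping here is now permitted */
		process_one(task);
	}
	css_task_iter_end(&it);
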
 /**
@@ -3782,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
  * Finish task iteration started by css_task_iter_start().
  */
 void css_task_iter_end(struct css_task_iter *it)
-	__releases(css_set_rwsem)
 {
-	up_read(&css_set_rwsem);
+	if (it->cur_cset) {
+		spin_lock_bh(&css_set_lock);
+		list_del(&it->iters_node);
+		put_css_set_locked(it->cur_cset);
+		spin_unlock_bh(&css_set_lock);
+	}
+
+	if (it->cur_task)
+		put_task_struct(it->cur_task);
 }
 
 /**
@@ -3809,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	mutex_lock(&cgroup_mutex);
 
 	/* all tasks in @from are being moved, all csets are source */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &from->cset_links, cset_link)
 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
 	if (ret)
@@ -3830,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 		css_task_iter_end(&it);
 
 		if (task) {
-			ret = cgroup_migrate(to, task, false);
+			ret = cgroup_migrate(task, false, to);
 			put_task_struct(task);
 		}
 	} while (task && !ret);
@@ -4327,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
 static struct cftype cgroup_dfl_base_files[] = {
 	{
 		.name = "cgroup.procs",
+		.file_offset = offsetof(struct cgroup, procs_file),
 		.seq_start = cgroup_pidlist_start,
 		.seq_next = cgroup_pidlist_next,
 		.seq_stop = cgroup_pidlist_stop,
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_PROCS,
 		.write = cgroup_procs_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "cgroup.controllers",
@@ -4351,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = {
 		.write = cgroup_subtree_control_write,
 	},
 	{
-		.name = "cgroup.populated",
+		.name = "cgroup.events",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cgroup_populated_show,
+		.file_offset = offsetof(struct cgroup, events_file),
+		.seq_show = cgroup_events_show,
 	},
 	{ }	/* terminate */
 };
@@ -4368,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_PROCS,
 		.write = cgroup_procs_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "cgroup.clone_children",
@@ -4388,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_TASKS,
 		.write = cgroup_tasks_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
@@ -4405,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 	{ }	/* terminate */
 };
 
-/**
- * cgroup_populate_dir - create subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be added
- *
- * On failure, no file is added.
- */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
-{
-	struct cgroup_subsys *ss;
-	int i, ret = 0;
-
-	/* process cftsets of each subsystem */
-	for_each_subsys(ss, i) {
-		struct cftype *cfts;
-
-		if (!(subsys_mask & (1 << i)))
-			continue;
-
-		list_for_each_entry(cfts, &ss->cfts, node) {
-			ret = cgroup_addrm_files(cgrp, cfts, true);
-			if (ret < 0)
-				goto err;
-		}
-	}
-	return 0;
-err:
-	cgroup_clear_dir(cgrp, subsys_mask);
-	return ret;
-}
-
 /*
  * css destruction is four-stage process.
  *
@@ -4464,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work)
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 	struct cgroup_subsys *ss = css->ss;
 	struct cgroup *cgrp = css->cgroup;
+	struct cgroup_file *cfile;
 
 	percpu_ref_exit(&css->refcnt);
 
+	list_for_each_entry(cfile, &css->files, node)
+		kernfs_put(cfile->kn);
+
 	if (ss) {
 		/* css free path */
 		int id = css->id;
@@ -4571,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	css->ss = ss;
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
+	INIT_LIST_HEAD(&css->files);
 	css->serial_nr = css_serial_nr_next++;
 
 	if (cgroup_parent(cgrp)) {
@@ -4653,7 +4807,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 	css->id = err;
 
 	if (visible) {
-		err = cgroup_populate_dir(cgrp, 1 << ss->id);
+		err = css_populate_dir(css, NULL);
 		if (err)
 			goto err_free_id;
 	}
@@ -4679,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 
 err_list_del:
 	list_del_rcu(&css->sibling);
-	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+	css_clear_dir(css, NULL);
 err_free_id:
 	cgroup_idr_remove(&ss->css_idr, css->id);
 err_free_percpu_ref:
@@ -4696,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	struct cgroup_root *root;
 	struct cgroup_subsys *ss;
 	struct kernfs_node *kn;
-	struct cftype *base_files;
 	int ssid, ret;
 
 	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
@@ -4772,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	if (ret)
 		goto out_destroy;
 
-	if (cgroup_on_dfl(cgrp))
-		base_files = cgroup_dfl_base_files;
-	else
-		base_files = cgroup_legacy_base_files;
-
-	ret = cgroup_addrm_files(cgrp, base_files, true);
+	ret = css_populate_dir(&cgrp->self, NULL);
 	if (ret)
 		goto out_destroy;
 
@@ -4864,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css)
 	 * This must happen before css is disassociated with its cgroup.
 	 * See seq_css() for details.
 	 */
-	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+	css_clear_dir(css, NULL);
 
 	/*
 	 * Killing would put the base ref, but we need to keep it alive
@@ -4913,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct cgroup_subsys_state *css;
-	bool empty;
 	int ssid;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	/*
-	 * css_set_rwsem synchronizes access to ->cset_links and prevents
-	 * @cgrp from being removed while put_css_set() is in progress.
+	 * Only migration can raise populated from zero and we're already
+	 * holding cgroup_mutex.
 	 */
-	down_read(&css_set_rwsem);
-	empty = list_empty(&cgrp->cset_links);
-	up_read(&css_set_rwsem);
-	if (!empty)
+	if (cgroup_is_populated(cgrp))
 		return -EBUSY;
 
 	/*
@@ -5023,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
+	have_free_callback |= (bool)ss->free << ss->id;
 	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
@@ -5071,6 +5216,8 @@ int __init cgroup_init_early(void)
 	return 0;
 }
 
+static unsigned long cgroup_disable_mask __initdata;
+
 /**
  * cgroup_init - cgroup initialization
  *
@@ -5081,8 +5228,9 @@ int __init cgroup_init(void)
 {
 	struct cgroup_subsys *ss;
 	unsigned long key;
-	int ssid, err;
+	int ssid;
 
+	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
@@ -5116,14 +5264,15 @@ int __init cgroup_init(void)
 		 * disabled flag and cftype registration needs kmalloc,
 		 * both of which aren't available during early_init.
 		 */
-		if (ss->disabled)
+		if (cgroup_disable_mask & (1 << ssid)) {
+			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
+			printk(KERN_INFO "Disabling %s control group subsystem\n",
+			       ss->name);
 			continue;
+		}
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-		if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
-			ss->dfl_cftypes = ss->legacy_cftypes;
-
 		if (!ss->dfl_cftypes)
 			cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
 
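cgroup_ssid_enabled() sits on top of per-subsystem static keys, so the common "is this controller enabled?" test compiles down to a patchable jump rather than a memory load; boot-time cgroup_disable= handling simply flips the key once, as the hunk above shows. A sketch of the underlying jump-label pattern (the my_subsys_* names are invented; the static-branch API itself is the real kernel one used here):

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/types.h>

DEFINE_STATIC_KEY_TRUE(my_subsys_enabled);

static inline bool my_subsys_on(void)
{
	/* compiles to a no-op fall-through while the key is enabled */
	return static_branch_likely(&my_subsys_enabled);
}

static void __init my_subsys_boot_disable(void)
{
	/* patches every my_subsys_on() call site to take the off path */
	static_branch_disable(&my_subsys_enabled);
}
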
@@ -5138,17 +5287,10 @@ int __init cgroup_init(void)
 			ss->bind(init_css_set.subsys[ssid]);
 	}
 
-	err = sysfs_create_mount_point(fs_kobj, "cgroup");
-	if (err)
-		return err;
-
-	err = register_filesystem(&cgroup_fs_type);
-	if (err < 0) {
-		sysfs_remove_mount_point(fs_kobj, "cgroup");
-		return err;
-	}
-
-	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
+	WARN_ON(register_filesystem(&cgroup_fs_type));
+	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
+
 	return 0;
 }
 
@@ -5195,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		goto out;
 
 	mutex_lock(&cgroup_mutex);
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	for_each_root(root) {
 		struct cgroup_subsys *ss;
@@ -5215,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
 		seq_putc(m, ':');
+
 		cgrp = task_cgroup_from_root(tsk, root);
-		path = cgroup_path(cgrp, buf, PATH_MAX);
-		if (!path) {
-			retval = -ENAMETOOLONG;
-			goto out_unlock;
+
+		/*
+		 * On traditional hierarchies, all zombie tasks show up as
+		 * belonging to the root cgroup.  On the default hierarchy,
+		 * while a zombie doesn't show up in "cgroup.procs" and
+		 * thus can't be migrated, its /proc/PID/cgroup keeps
+		 * reporting the cgroup it belonged to before exiting.  If
+		 * the cgroup is removed before the zombie is reaped,
+		 * " (deleted)" is appended to the cgroup path.
+		 */
+		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+			path = cgroup_path(cgrp, buf, PATH_MAX);
+			if (!path) {
+				retval = -ENAMETOOLONG;
+				goto out_unlock;
+			}
+		} else {
+			path = "/";
 		}
+
 		seq_puts(m, path);
-		seq_putc(m, '\n');
+
+		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+			seq_puts(m, " (deleted)\n");
+		else
+			seq_putc(m, '\n');
 	}
 
 	retval = 0;
 out_unlock:
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	kfree(buf);
 out:
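The visible effect on /proc/PID/cgroup for a zombie: a legacy hierarchy keeps reporting the root cgroup ("/"), while the default hierarchy keeps the pre-exit path and appends " (deleted)" once the cgroup is removed. A purely hypothetical illustration (hierarchy IDs, controller fields, and paths invented; the exact first fields vary by kernel and configuration):

	# cat /proc/$ZOMBIE_PID/cgroup		(legacy hierarchy)
	3:memory:/
	# cat /proc/$ZOMBIE_PID/cgroup		(default hierarchy, cgroup removed)
	0::/batch/job7 (deleted)
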
@@ -5251,7 +5413,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->legacy_name, ss->root->hierarchy_id,
-			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+			   atomic_read(&ss->root->nr_cgrps),
+			   cgroup_ssid_enabled(i));
 
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -5372,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child,
 	 * @child during its iteration.
 	 *
 	 * If we won the race, @child is associated with %current's
-	 * css_set.  Grabbing css_set_rwsem guarantees both that the
+	 * css_set.  Grabbing css_set_lock guarantees both that the
 	 * association is stable, and, on completion of the parent's
 	 * migration, @child is visible in the source of migration or
 	 * already in the destination cgroup.  This guarantee is necessary
@@ -5387,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child,
 	if (use_task_css_set_links) {
 		struct css_set *cset;
 
-		down_write(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
-			rcu_assign_pointer(child->cgroups, cset);
-			list_add(&child->cg_list, &cset->tasks);
 			get_css_set(cset);
+			css_set_move_task(child, NULL, cset, false);
 		}
-		up_write(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 	}
 
 	/*
@@ -5429,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
-	bool put_cset = false;
 	int i;
 
 	/*
 	 * Unlink @tsk from its css_set.  As migration path can't race
-	 * with us, we can check cg_list without grabbing css_set_rwsem.
+	 * with us, we can check css_set and cg_list without synchronization.
 	 */
+	cset = task_css_set(tsk);
+
 	if (!list_empty(&tsk->cg_list)) {
-		down_write(&css_set_rwsem);
-		list_del_init(&tsk->cg_list);
-		up_write(&css_set_rwsem);
-		put_cset = true;
+		spin_lock_bh(&css_set_lock);
+		css_set_move_task(tsk, cset, NULL, false);
+		spin_unlock_bh(&css_set_lock);
+	} else {
+		get_css_set(cset);
 	}
 
-	/* Reassign the task to the init_css_set. */
-	cset = task_css_set(tsk);
-	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-
 	/* see cgroup_post_fork() for details */
-	for_each_subsys_which(ss, i, &have_exit_callback) {
-		struct cgroup_subsys_state *old_css = cset->subsys[i];
-		struct cgroup_subsys_state *css = task_css(tsk, i);
-
-		ss->exit(css, old_css, tsk);
-	}
-
-	if (put_cset)
-		put_css_set(cset);
+	for_each_subsys_which(ss, i, &have_exit_callback)
+		ss->exit(tsk);
+}
+
+void cgroup_free(struct task_struct *task)
+{
+	struct css_set *cset = task_css_set(task);
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	for_each_subsys_which(ss, ssid, &have_free_callback)
+		ss->free(task);
+
+	put_css_set(cset);
 }
 
 static void check_for_release(struct cgroup *cgrp)
 {
-	if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
 		schedule_work(&cgrp->release_agent_work);
 }
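With zombies keeping their css_set pinned, per-task teardown is now split in two: ->exit() runs from cgroup_exit() while the task is a zombie still visible in its old cgroup, and the new ->free() runs from cgroup_free() when the task_struct is finally released, which is the hook that lets a controller like pids drop its charge only at the true end of a task's life. A sketch of a controller filling both callbacks (my_subsys_* names invented; the signatures match the calls introduced above):

#include <linux/cgroup.h>
#include <linux/sched.h>

static void my_subsys_exit(struct task_struct *task)
{
	/* task is exiting but still associated with its old cgroup */
}

static void my_subsys_free(struct task_struct *task)
{
	/* final per-task cleanup, e.g. uncharging a pids-style count */
}

struct cgroup_subsys my_subsys_cgrp_subsys = {
	.exit	= my_subsys_exit,
	.free	= my_subsys_free,
};
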
@@ -5540,25 +5705,13 @@ static int __init cgroup_disable(char *str)
 			if (strcmp(token, ss->name) &&
 			    strcmp(token, ss->legacy_name))
 				continue;
-
-			ss->disabled = 1;
-			printk(KERN_INFO "Disabling %s control group subsystem\n",
-			       ss->name);
-			break;
+			cgroup_disable_mask |= 1 << i;
 		}
 	}
 	return 1;
 }
 __setup("cgroup_disable=", cgroup_disable);
 
-static int __init cgroup_set_legacy_files_on_dfl(char *str)
-{
-	printk("cgroup: using legacy files on the default hierarchy\n");
-	cgroup_legacy_files_on_dfl = true;
-	return 0;
-}
-__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
-
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -5662,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 	if (!name_buf)
 		return -ENOMEM;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5673,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 			   c->root->hierarchy_id, name_buf);
 	}
 	rcu_read_unlock();
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	kfree(name_buf);
 	return 0;
 }
@@ -5684,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
@@ -5707,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
  overflow:
 		seq_puts(seq, "  ...\n");
 	}
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	return 0;
 }
 
 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
-	return (!cgroup_has_tasks(css->cgroup) &&
+	return (!cgroup_is_populated(css->cgroup) &&
 		!css_has_online_children(&css->cgroup->self));
 }
 