author    Linus Torvalds <torvalds@linux-foundation.org>  2015-11-05 17:51:32 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-11-05 17:51:32 -0500
commit    69234acee54407962a20bedf90ef9c96326994b5 (patch)
tree      5e979b1a489d866691c2c65ac3f46b4f29feef68
parent    11eaaadb3ea376c6c194491c2e9bddd647f9d253 (diff)
parent    d57456753787ab158f906f1f8eb58d54a2ccd9f4 (diff)
Merge branch 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
 "The cgroup core saw several significant updates this cycle:

  - percpu_rwsem for threadgroup locking is reinstated.  This was
    temporarily dropped due to down_write latency issues.  Oleg's
    rework of percpu_rwsem which is scheduled to be merged in this
    merge window resolves the issue.

  - On the v2 hierarchy, when controllers are enabled and disabled, all
    operations are atomic and can fail and revert cleanly.  This allows
    ->can_attach() failure which is necessary for cpu RT slices.

  - Tasks now stay associated with the original cgroups after exit
    until released.  This allows tracking resources held by zombies
    (e.g. pids) and makes it easy to find out where zombies came from
    on the v2 hierarchy.  The pids controller was broken before these
    changes as zombies escaped the limits; unfortunately, updating this
    behavior required too many invasive changes and I don't think it's
    a good idea to backport them, so the pids controller on 4.3, the
    first version which included the pids controller, will stay broken
    at least until I'm sure about the cgroup core changes.

  - Optimization of a couple common tests using static_key"

* 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (38 commits)
  cgroup: fix race condition around termination check in css_task_iter_next()
  blkcg: don't create "io.stat" on the root cgroup
  cgroup: drop cgroup__DEVEL__legacy_files_on_dfl
  cgroup: replace error handling in cgroup_init() with WARN_ON()s
  cgroup: add cgroup_subsys->free() method and use it to fix pids controller
  cgroup: keep zombies associated with their original cgroups
  cgroup: make css_set_rwsem a spinlock and rename it to css_set_lock
  cgroup: don't hold css_set_rwsem across css task iteration
  cgroup: reorganize css_task_iter functions
  cgroup: factor out css_set_move_task()
  cgroup: keep css_set and task lists in chronological order
  cgroup: make cgroup_destroy_locked() test cgroup_is_populated()
  cgroup: make css_sets pin the associated cgroups
  cgroup: relocate cgroup_[try]get/put()
  cgroup: move check_for_release() invocation
  cgroup: replace cgroup_has_tasks() with cgroup_is_populated()
  cgroup: make cgroup->nr_populated count the number of populated css_sets
  cgroup: remove an unused parameter from cgroup_task_migrate()
  cgroup: fix too early usage of static_branch_disable()
  cgroup: make cgroup_update_dfl_csses() migrate all target processes atomically
  ...
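The zombie-related change above is visible from userspace: until a zombie is
reaped, /proc/PID/cgroup keeps reporting the cgroup it belonged to at exit,
with " (deleted)" appended if that cgroup has since been removed.  The
following is a minimal illustrative sketch (not part of the series; the
sleep-based synchronization is a deliberate simplification):

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	char path[64], line[256];
	pid_t pid = fork();
	FILE *f;

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0)
		_exit(0);		/* child exits immediately and becomes a zombie */

	sleep(1);			/* crude: give the child time to exit */

	/* the zombie's cgroup membership is still reported until it is reaped */
	snprintf(path, sizeof(path), "/proc/%d/cgroup", (int)pid);
	f = fopen(path, "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}

	waitpid(pid, NULL, 0);		/* reap the zombie */
	return 0;
}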
-rw-r--r--  Documentation/cgroups/cgroups.txt            |    4
-rw-r--r--  Documentation/cgroups/unified-hierarchy.txt  |   25
-rw-r--r--  block/blk-cgroup.c                           |    1
-rw-r--r--  block/blk-throttle.c                         |    2
-rw-r--r--  block/cfq-iosched.c                          |    4
-rw-r--r--  include/linux/backing-dev.h                  |    5
-rw-r--r--  include/linux/cgroup-defs.h                  |   76
-rw-r--r--  include/linux/cgroup.h                       |  129
-rw-r--r--  include/linux/hugetlb_cgroup.h               |    4
-rw-r--r--  include/linux/init_task.h                    |    8
-rw-r--r--  include/linux/jump_label.h                   |   18
-rw-r--r--  include/linux/memcontrol.h                   |    8
-rw-r--r--  include/linux/sched.h                        |   12
-rw-r--r--  kernel/cgroup.c                              | 1297
-rw-r--r--  kernel/cgroup_pids.c                         |    8
-rw-r--r--  kernel/cpuset.c                              |   72
-rw-r--r--  kernel/events/core.c                         |    8
-rw-r--r--  kernel/fork.c                                |    5
-rw-r--r--  kernel/sched/core.c                          |    8
-rw-r--r--  mm/memcontrol.c                              |   27
-rw-r--r--  mm/vmscan.c                                  |    2
21 files changed, 952 insertions, 771 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index f935fac1e73b..c6256ae9885b 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -637,6 +637,10 @@ void exit(struct task_struct *task)
 
 Called during task exit.
 
+void free(struct task_struct *task)
+
+Called when the task_struct is freed.
+
 void bind(struct cgroup *root)
 (cgroup_mutex held by caller)
 
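The new ->free() callback runs when the task_struct itself is released, which
may be long after exit if the task lingered as a zombie; this is what lets a
controller such as pids keep its charge until then.  Below is a hedged sketch
of how a hypothetical controller could use it (the mycg_* names, the subsystem
registration and the fork-time charging are assumptions, not the actual pids
implementation):

struct mycg {
	struct cgroup_subsys_state css;
	atomic_t nr_tasks;			/* charged at fork time (not shown) */
};

static struct mycg *task_mycg(struct task_struct *task)
{
	return container_of(task_css(task, mycg_cgrp_id), struct mycg, css);
}

static void mycg_exit(struct task_struct *task)
{
	/* the task may linger as a zombie - keep the charge for now */
}

static void mycg_free(struct task_struct *task)
{
	/* css association is still valid here; drop the charge */
	atomic_dec(&task_mycg(task)->nr_tasks);
}

struct cgroup_subsys mycg_cgrp_subsys = {
	.exit	= mycg_exit,
	.free	= mycg_free,
};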
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index e0975c2cf03d..0cd27a4e0055 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -107,12 +107,6 @@ root of unified hierarchy can be bound to other hierarchies. This
 allows mixing unified hierarchy with the traditional multiple
 hierarchies in a fully backward compatible way.
 
-For development purposes, the following boot parameter makes all
-controllers to appear on the unified hierarchy whether supported or
-not.
-
-	cgroup__DEVEL__legacy_files_on_dfl
-
 A controller can be moved across hierarchies only after the controller
 is no longer referenced in its current hierarchy.  Because per-cgroup
 controller states are destroyed asynchronously and controllers may
@@ -341,11 +335,11 @@ is riddled with issues.
   unnecessarily complicated and probably done this way because event
   delivery itself was expensive.
 
-Unified hierarchy implements an interface file "cgroup.populated"
-which can be used to monitor whether the cgroup's subhierarchy has
-tasks in it or not.  Its value is 0 if there is no task in the cgroup
-and its descendants; otherwise, 1.  poll and [id]notify events are
-triggered when the value changes.
+Unified hierarchy implements "populated" field in "cgroup.events"
+interface file which can be used to monitor whether the cgroup's
+subhierarchy has tasks in it or not.  Its value is 0 if there is no
+task in the cgroup and its descendants; otherwise, 1.  poll and
+[id]notify events are triggered when the value changes.
 
 This is significantly lighter and simpler and trivially allows
 delegating management of subhierarchy - subhierarchy monitoring can
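A userspace monitor can read "cgroup.events" and block in poll(2) until the
populated value flips, as the text above describes.  A minimal sketch follows;
the v2 mount point and cgroup name are assumptions to adjust for the actual
system:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/mygroup/cgroup.events";
	char buf[128];
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };
		ssize_t n;

		/* read the current "populated 0|1" state... */
		lseek(fd, 0, SEEK_SET);
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}

		/* ...then block until kernfs signals a change */
		if (poll(&pfd, 1, -1) < 0)
			break;
	}
	close(fd);
	return 0;
}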
@@ -374,6 +368,10 @@ supported and the interface files "release_agent" and
 
 - The "cgroup.clone_children" file is removed.
 
+- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
+  to before exiting. If the cgroup is removed before the zombie is
+  reaped, " (deleted)" is appeneded to the path.
+
 
 5-3. Controller File Conventions
 
@@ -435,6 +433,11 @@ may be specified in any order and not all pairs have to be specified.
   the first entry in the file. Specific entries can use "default" as
   its value to indicate inheritance of the default value.
 
+- For events which are not very high frequency, an interface file
+  "events" should be created which lists event key value pairs.
+  Whenever a notifiable event happens, file modified event should be
+  generated on the file.
+
 
 5-4. Per-Controller Changes
 
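Following the "events" convention above, a controller's seq_show simply
prints one key/value pair per line; the change-notification side is handled
by the cgroup_file machinery added in cgroup-defs.h further down.  A hedged
sketch with hypothetical names:

static int mycg_events_show(struct seq_file *sf, void *v)
{
	struct mycg *mcg = css_to_mycg(seq_css(sf));	/* hypothetical helper */

	/* one "key value" pair per line */
	seq_printf(sf, "max %lu\n", mcg->nr_limit_hits);
	seq_printf(sf, "oom %lu\n", mcg->nr_oom_events);
	return 0;
}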
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 55512dd62633..5bcdfc10c23a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -899,6 +899,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
899struct cftype blkcg_files[] = { 899struct cftype blkcg_files[] = {
900 { 900 {
901 .name = "stat", 901 .name = "stat",
902 .flags = CFTYPE_NOT_ON_ROOT,
902 .seq_show = blkcg_print_stat, 903 .seq_show = blkcg_print_stat,
903 }, 904 },
904 { } /* terminate */ 905 { } /* terminate */
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c75a2636dd40..2149a1ddbacf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -369,7 +369,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
369 * regardless of the position of the group in the hierarchy. 369 * regardless of the position of the group in the hierarchy.
370 */ 370 */
371 sq->parent_sq = &td->service_queue; 371 sq->parent_sq = &td->service_queue;
372 if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) 372 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
373 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 373 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
374 tg->td = td; 374 tg->td = td;
375} 375}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 04de88463a98..1f9093e901da 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1581,7 +1581,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1581static void cfq_cpd_init(struct blkcg_policy_data *cpd) 1581static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1582{ 1582{
1583 struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); 1583 struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1584 unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? 1584 unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
1585 CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; 1585 CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1586 1586
1587 if (cpd_to_blkcg(cpd) == &blkcg_root) 1587 if (cpd_to_blkcg(cpd) == &blkcg_root)
@@ -1599,7 +1599,7 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1599static void cfq_cpd_bind(struct blkcg_policy_data *cpd) 1599static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1600{ 1600{
1601 struct blkcg *blkcg = cpd_to_blkcg(cpd); 1601 struct blkcg *blkcg = cpd_to_blkcg(cpd);
1602 bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); 1602 bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
1603 unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; 1603 unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1604 1604
1605 if (blkcg == &blkcg_root) 1605 if (blkcg == &blkcg_root)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c85f74946a8b..c82794f20110 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,7 +13,6 @@
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/writeback.h> 15#include <linux/writeback.h>
16#include <linux/memcontrol.h>
17#include <linux/blk-cgroup.h> 16#include <linux/blk-cgroup.h>
18#include <linux/backing-dev-defs.h> 17#include <linux/backing-dev-defs.h>
19#include <linux/slab.h> 18#include <linux/slab.h>
@@ -267,8 +266,8 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
267{ 266{
268 struct backing_dev_info *bdi = inode_to_bdi(inode); 267 struct backing_dev_info *bdi = inode_to_bdi(inode);
269 268
270 return cgroup_on_dfl(mem_cgroup_root_css->cgroup) && 269 return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
271 cgroup_on_dfl(blkcg_root_css->cgroup) && 270 cgroup_subsys_on_dfl(io_cgrp_subsys) &&
272 bdi_cap_account_dirty(bdi) && 271 bdi_cap_account_dirty(bdi) &&
273 (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && 272 (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
274 (inode->i_sb->s_iflags & SB_I_CGROUPWB); 273 (inode->i_sb->s_iflags & SB_I_CGROUPWB);
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8492721b39be..60d44b26276d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -76,6 +76,7 @@ enum {
76 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ 76 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
77 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 77 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
78 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 78 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
79 CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */
79 80
80 /* internal flags, do not use outside cgroup core proper */ 81 /* internal flags, do not use outside cgroup core proper */
81 __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ 82 __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
@@ -83,6 +84,17 @@ enum {
83}; 84};
84 85
85/* 86/*
87 * cgroup_file is the handle for a file instance created in a cgroup which
88 * is used, for example, to generate file changed notifications. This can
89 * be obtained by setting cftype->file_offset.
90 */
91struct cgroup_file {
92 /* do not access any fields from outside cgroup core */
93 struct list_head node; /* anchored at css->files */
94 struct kernfs_node *kn;
95};
96
97/*
86 * Per-subsystem/per-cgroup state maintained by the system. This is the 98 * Per-subsystem/per-cgroup state maintained by the system. This is the
87 * fundamental structural building block that controllers deal with. 99 * fundamental structural building block that controllers deal with.
88 * 100 *
@@ -122,6 +134,9 @@ struct cgroup_subsys_state {
122 */ 134 */
123 u64 serial_nr; 135 u64 serial_nr;
124 136
137 /* all cgroup_files associated with this css */
138 struct list_head files;
139
125 /* percpu_ref killing and RCU release */ 140 /* percpu_ref killing and RCU release */
126 struct rcu_head rcu_head; 141 struct rcu_head rcu_head;
127 struct work_struct destroy_work; 142 struct work_struct destroy_work;
@@ -196,6 +211,9 @@ struct css_set {
196 */ 211 */
197 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; 212 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
198 213
214 /* all css_task_iters currently walking this cset */
215 struct list_head task_iters;
216
199 /* For RCU-protected deletion */ 217 /* For RCU-protected deletion */
200 struct rcu_head rcu_head; 218 struct rcu_head rcu_head;
201}; 219};
@@ -217,16 +235,16 @@ struct cgroup {
217 int id; 235 int id;
218 236
219 /* 237 /*
220 * If this cgroup contains any tasks, it contributes one to 238 * Each non-empty css_set associated with this cgroup contributes
221 * populated_cnt. All children with non-zero popuplated_cnt of 239 * one to populated_cnt. All children with non-zero popuplated_cnt
222 * their own contribute one. The count is zero iff there's no task 240 * of their own contribute one. The count is zero iff there's no
223 * in this cgroup or its subtree. 241 * task in this cgroup or its subtree.
224 */ 242 */
225 int populated_cnt; 243 int populated_cnt;
226 244
227 struct kernfs_node *kn; /* cgroup kernfs entry */ 245 struct kernfs_node *kn; /* cgroup kernfs entry */
228 struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */ 246 struct cgroup_file procs_file; /* handle for "cgroup.procs" */
229 struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ 247 struct cgroup_file events_file; /* handle for "cgroup.events" */
230 248
231 /* 249 /*
232 * The bitmask of subsystems enabled on the child cgroups. 250 * The bitmask of subsystems enabled on the child cgroups.
@@ -324,11 +342,6 @@ struct cftype {
324 */ 342 */
325 char name[MAX_CFTYPE_NAME]; 343 char name[MAX_CFTYPE_NAME];
326 unsigned long private; 344 unsigned long private;
327 /*
328 * If not 0, file mode is set to this value, otherwise it will
329 * be figured out automatically
330 */
331 umode_t mode;
332 345
333 /* 346 /*
334 * The maximum length of string, excluding trailing nul, that can 347 * The maximum length of string, excluding trailing nul, that can
@@ -340,6 +353,14 @@ struct cftype {
340 unsigned int flags; 353 unsigned int flags;
341 354
342 /* 355 /*
356 * If non-zero, should contain the offset from the start of css to
357 * a struct cgroup_file field. cgroup will record the handle of
358 * the created file into it. The recorded handle can be used as
359 * long as the containing css remains accessible.
360 */
361 unsigned int file_offset;
362
363 /*
343 * Fields used for internal bookkeeping. Initialized automatically 364 * Fields used for internal bookkeeping. Initialized automatically
344 * during registration. 365 * during registration.
345 */ 366 */
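Instead of each controller caching raw kernfs_node pointers (as the removed
procs_kn/populated_kn fields did), a controller now embeds a struct
cgroup_file, points ->file_offset at it, and raises notifications through the
recorded handle.  A hedged sketch with hypothetical names; the offset must be
relative to the css, so the css is placed first:

struct mycg {
	struct cgroup_subsys_state css;		/* first, so offsetof() below is also the css offset */
	unsigned long nr_events;
	struct cgroup_file events_file;		/* handle filled in by cgroup core */
};

static struct cftype mycg_files[] = {
	{
		.name		= "events",
		.flags		= CFTYPE_NOT_ON_ROOT,
		.file_offset	= offsetof(struct mycg, events_file),
		.seq_show	= mycg_events_show,	/* as sketched earlier */
	},
	{ }	/* terminate */
};

static void mycg_count_event(struct mycg *mcg)
{
	mcg->nr_events++;
	/* safe even if the file was not created, e.g. on the root cgroup */
	cgroup_file_notify(&mcg->events_file);
}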
@@ -414,12 +435,10 @@ struct cgroup_subsys {
414 int (*can_fork)(struct task_struct *task, void **priv_p); 435 int (*can_fork)(struct task_struct *task, void **priv_p);
415 void (*cancel_fork)(struct task_struct *task, void *priv); 436 void (*cancel_fork)(struct task_struct *task, void *priv);
416 void (*fork)(struct task_struct *task, void *priv); 437 void (*fork)(struct task_struct *task, void *priv);
417 void (*exit)(struct cgroup_subsys_state *css, 438 void (*exit)(struct task_struct *task);
418 struct cgroup_subsys_state *old_css, 439 void (*free)(struct task_struct *task);
419 struct task_struct *task);
420 void (*bind)(struct cgroup_subsys_state *root_css); 440 void (*bind)(struct cgroup_subsys_state *root_css);
421 441
422 int disabled;
423 int early_init; 442 int early_init;
424 443
425 /* 444 /*
@@ -473,8 +492,31 @@ struct cgroup_subsys {
473 unsigned int depends_on; 492 unsigned int depends_on;
474}; 493};
475 494
476void cgroup_threadgroup_change_begin(struct task_struct *tsk); 495extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
477void cgroup_threadgroup_change_end(struct task_struct *tsk); 496
497/**
498 * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
499 * @tsk: target task
500 *
501 * Called from threadgroup_change_begin() and allows cgroup operations to
502 * synchronize against threadgroup changes using a percpu_rw_semaphore.
503 */
504static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
505{
506 percpu_down_read(&cgroup_threadgroup_rwsem);
507}
508
509/**
510 * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
511 * @tsk: target task
512 *
513 * Called from threadgroup_change_end(). Counterpart of
514 * cgroup_threadcgroup_change_begin().
515 */
516static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
517{
518 percpu_up_read(&cgroup_threadgroup_rwsem);
519}
478 520
479#else /* CONFIG_CGROUPS */ 521#else /* CONFIG_CGROUPS */
480 522
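The inline helpers above are the read side taken around the fork and exit
paths; the migration side takes the same percpu_rwsem for writing so that no
task can enter or leave a threadgroup while its threads are being moved.  A
rough sketch of a writer, with a hypothetical per-thread helper that must not
sleep inside the RCU section:

static int mycg_move_process(struct task_struct *leader)
{
	struct task_struct *t;
	int ret = 0;

	/* exclude concurrent fork/exit (threadgroup changes) while we work */
	percpu_down_write(&cgroup_threadgroup_rwsem);

	rcu_read_lock();
	for_each_thread(leader, t) {
		ret = mycg_move_one(t);		/* hypothetical, non-sleeping */
		if (ret)
			break;
	}
	rcu_read_unlock();

	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
}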
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index eb7ca55f72ef..22e3754f89c5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -13,10 +13,10 @@
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/rculist.h> 14#include <linux/rculist.h>
15#include <linux/cgroupstats.h> 15#include <linux/cgroupstats.h>
16#include <linux/rwsem.h>
17#include <linux/fs.h> 16#include <linux/fs.h>
18#include <linux/seq_file.h> 17#include <linux/seq_file.h>
19#include <linux/kernfs.h> 18#include <linux/kernfs.h>
19#include <linux/jump_label.h>
20 20
21#include <linux/cgroup-defs.h> 21#include <linux/cgroup-defs.h>
22 22
@@ -41,6 +41,10 @@ struct css_task_iter {
41 struct list_head *task_pos; 41 struct list_head *task_pos;
42 struct list_head *tasks_head; 42 struct list_head *tasks_head;
43 struct list_head *mg_tasks_head; 43 struct list_head *mg_tasks_head;
44
45 struct css_set *cur_cset;
46 struct task_struct *cur_task;
47 struct list_head iters_node; /* css_set->task_iters */
44}; 48};
45 49
46extern struct cgroup_root cgrp_dfl_root; 50extern struct cgroup_root cgrp_dfl_root;
@@ -50,6 +54,26 @@ extern struct css_set init_css_set;
50#include <linux/cgroup_subsys.h> 54#include <linux/cgroup_subsys.h>
51#undef SUBSYS 55#undef SUBSYS
52 56
57#define SUBSYS(_x) \
58 extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \
59 extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
60#include <linux/cgroup_subsys.h>
61#undef SUBSYS
62
63/**
64 * cgroup_subsys_enabled - fast test on whether a subsys is enabled
65 * @ss: subsystem in question
66 */
67#define cgroup_subsys_enabled(ss) \
68 static_branch_likely(&ss ## _enabled_key)
69
70/**
71 * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
72 * @ss: subsystem in question
73 */
74#define cgroup_subsys_on_dfl(ss) \
75 static_branch_likely(&ss ## _on_dfl_key)
76
53bool css_has_online_children(struct cgroup_subsys_state *css); 77bool css_has_online_children(struct cgroup_subsys_state *css);
54struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); 78struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
55struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, 79struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
@@ -78,6 +102,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
78extern void cgroup_post_fork(struct task_struct *p, 102extern void cgroup_post_fork(struct task_struct *p,
79 void *old_ss_priv[CGROUP_CANFORK_COUNT]); 103 void *old_ss_priv[CGROUP_CANFORK_COUNT]);
80void cgroup_exit(struct task_struct *p); 104void cgroup_exit(struct task_struct *p);
105void cgroup_free(struct task_struct *p);
81 106
82int cgroup_init_early(void); 107int cgroup_init_early(void);
83int cgroup_init(void); 108int cgroup_init(void);
@@ -211,11 +236,33 @@ void css_task_iter_end(struct css_task_iter *it);
211 * cgroup_taskset_for_each - iterate cgroup_taskset 236 * cgroup_taskset_for_each - iterate cgroup_taskset
212 * @task: the loop cursor 237 * @task: the loop cursor
213 * @tset: taskset to iterate 238 * @tset: taskset to iterate
239 *
240 * @tset may contain multiple tasks and they may belong to multiple
241 * processes. When there are multiple tasks in @tset, if a task of a
242 * process is in @tset, all tasks of the process are in @tset. Also, all
243 * are guaranteed to share the same source and destination csses.
244 *
245 * Iteration is not in any specific order.
214 */ 246 */
215#define cgroup_taskset_for_each(task, tset) \ 247#define cgroup_taskset_for_each(task, tset) \
216 for ((task) = cgroup_taskset_first((tset)); (task); \ 248 for ((task) = cgroup_taskset_first((tset)); (task); \
217 (task) = cgroup_taskset_next((tset))) 249 (task) = cgroup_taskset_next((tset)))
218 250
251/**
252 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
253 * @leader: the loop cursor
254 * @tset: takset to iterate
255 *
256 * Iterate threadgroup leaders of @tset. For single-task migrations, @tset
257 * may not contain any.
258 */
259#define cgroup_taskset_for_each_leader(leader, tset) \
260 for ((leader) = cgroup_taskset_first((tset)); (leader); \
261 (leader) = cgroup_taskset_next((tset))) \
262 if ((leader) != (leader)->group_leader) \
263 ; \
264 else
265
219/* 266/*
220 * Inline functions. 267 * Inline functions.
221 */ 268 */
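A consumer of the new leader-only iterator can, for example, count how many
processes (rather than threads) are entering a cgroup, which a ->can_attach()
enforcing a per-process limit could use.  A hedged sketch:

/* count the processes (not threads) contained in a taskset */
static int mycg_count_procs(struct cgroup_taskset *tset)
{
	struct task_struct *leader;
	int nr = 0;

	cgroup_taskset_for_each_leader(leader, tset)
		nr++;

	return nr;
}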
@@ -320,11 +367,11 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
320 */ 367 */
321#ifdef CONFIG_PROVE_RCU 368#ifdef CONFIG_PROVE_RCU
322extern struct mutex cgroup_mutex; 369extern struct mutex cgroup_mutex;
323extern struct rw_semaphore css_set_rwsem; 370extern spinlock_t css_set_lock;
324#define task_css_set_check(task, __c) \ 371#define task_css_set_check(task, __c) \
325 rcu_dereference_check((task)->cgroups, \ 372 rcu_dereference_check((task)->cgroups, \
326 lockdep_is_held(&cgroup_mutex) || \ 373 lockdep_is_held(&cgroup_mutex) || \
327 lockdep_is_held(&css_set_rwsem) || \ 374 lockdep_is_held(&css_set_lock) || \
328 ((task)->flags & PF_EXITING) || (__c)) 375 ((task)->flags & PF_EXITING) || (__c))
329#else 376#else
330#define task_css_set_check(task, __c) \ 377#define task_css_set_check(task, __c) \
@@ -412,68 +459,10 @@ static inline struct cgroup *task_cgroup(struct task_struct *task,
412 return task_css(task, subsys_id)->cgroup; 459 return task_css(task, subsys_id)->cgroup;
413} 460}
414 461
415/**
416 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
417 * @cgrp: the cgroup of interest
418 *
419 * The default hierarchy is the v2 interface of cgroup and this function
420 * can be used to test whether a cgroup is on the default hierarchy for
421 * cases where a subsystem should behave differnetly depending on the
422 * interface version.
423 *
424 * The set of behaviors which change on the default hierarchy are still
425 * being determined and the mount option is prefixed with __DEVEL__.
426 *
427 * List of changed behaviors:
428 *
429 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
430 * and "name" are disallowed.
431 *
432 * - When mounting an existing superblock, mount options should match.
433 *
434 * - Remount is disallowed.
435 *
436 * - rename(2) is disallowed.
437 *
438 * - "tasks" is removed. Everything should be at process granularity. Use
439 * "cgroup.procs" instead.
440 *
441 * - "cgroup.procs" is not sorted. pids will be unique unless they got
442 * recycled inbetween reads.
443 *
444 * - "release_agent" and "notify_on_release" are removed. Replacement
445 * notification mechanism will be implemented.
446 *
447 * - "cgroup.clone_children" is removed.
448 *
449 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
450 * and its descendants contain no task; otherwise, 1. The file also
451 * generates kernfs notification which can be monitored through poll and
452 * [di]notify when the value of the file changes.
453 *
454 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
455 * take masks of ancestors with non-empty cpus/mems, instead of being
456 * moved to an ancestor.
457 *
458 * - cpuset: a task can be moved into an empty cpuset, and again it takes
459 * masks of ancestors.
460 *
461 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
462 * is not created.
463 *
464 * - blkcg: blk-throttle becomes properly hierarchical.
465 *
466 * - debug: disallowed on the default hierarchy.
467 */
468static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
469{
470 return cgrp->root == &cgrp_dfl_root;
471}
472
473/* no synchronization, the result can only be used as a hint */ 462/* no synchronization, the result can only be used as a hint */
474static inline bool cgroup_has_tasks(struct cgroup *cgrp) 463static inline bool cgroup_is_populated(struct cgroup *cgrp)
475{ 464{
476 return !list_empty(&cgrp->cset_links); 465 return cgrp->populated_cnt;
477} 466}
478 467
479/* returns ino associated with a cgroup */ 468/* returns ino associated with a cgroup */
@@ -527,6 +516,19 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
527 pr_cont_kernfs_path(cgrp->kn); 516 pr_cont_kernfs_path(cgrp->kn);
528} 517}
529 518
519/**
520 * cgroup_file_notify - generate a file modified event for a cgroup_file
521 * @cfile: target cgroup_file
522 *
523 * @cfile must have been obtained by setting cftype->file_offset.
524 */
525static inline void cgroup_file_notify(struct cgroup_file *cfile)
526{
527 /* might not have been created due to one of the CFTYPE selector flags */
528 if (cfile->kn)
529 kernfs_notify(cfile->kn);
530}
531
530#else /* !CONFIG_CGROUPS */ 532#else /* !CONFIG_CGROUPS */
531 533
532struct cgroup_subsys_state; 534struct cgroup_subsys_state;
@@ -546,6 +548,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
546static inline void cgroup_post_fork(struct task_struct *p, 548static inline void cgroup_post_fork(struct task_struct *p,
547 void *ss_priv[CGROUP_CANFORK_COUNT]) {} 549 void *ss_priv[CGROUP_CANFORK_COUNT]) {}
548static inline void cgroup_exit(struct task_struct *p) {} 550static inline void cgroup_exit(struct task_struct *p) {}
551static inline void cgroup_free(struct task_struct *p) {}
549 552
550static inline int cgroup_init_early(void) { return 0; } 553static inline int cgroup_init_early(void) { return 0; }
551static inline int cgroup_init(void) { return 0; } 554static inline int cgroup_init(void) { return 0; }
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index bcc853eccc85..7edd30515298 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -48,9 +48,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
48 48
49static inline bool hugetlb_cgroup_disabled(void) 49static inline bool hugetlb_cgroup_disabled(void)
50{ 50{
51 if (hugetlb_cgrp_subsys.disabled) 51 return !cgroup_subsys_enabled(hugetlb_cgrp_subsys);
52 return true;
53 return false;
54} 52}
55 53
56extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 54extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 810a34f60424..1c1ff7e4faa4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@
25extern struct files_struct init_files; 25extern struct files_struct init_files;
26extern struct fs_struct init_fs; 26extern struct fs_struct init_fs;
27 27
28#ifdef CONFIG_CGROUPS
29#define INIT_GROUP_RWSEM(sig) \
30 .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
31#else
32#define INIT_GROUP_RWSEM(sig)
33#endif
34
35#ifdef CONFIG_CPUSETS 28#ifdef CONFIG_CPUSETS
36#define INIT_CPUSET_SEQ(tsk) \ 29#define INIT_CPUSET_SEQ(tsk) \
37 .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), 30 .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -65,7 +58,6 @@ extern struct fs_struct init_fs;
65 INIT_PREV_CPUTIME(sig) \ 58 INIT_PREV_CPUTIME(sig) \
66 .cred_guard_mutex = \ 59 .cred_guard_mutex = \
67 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ 60 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
68 INIT_GROUP_RWSEM(sig) \
69} 61}
70 62
71extern struct nsproxy init_nsproxy; 63extern struct nsproxy init_nsproxy;
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index f1094238ab2a..8dde55974f18 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -214,11 +214,6 @@ static inline int jump_label_apply_nops(struct module *mod)
214#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE 214#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
215#define jump_label_enabled static_key_enabled 215#define jump_label_enabled static_key_enabled
216 216
217static inline bool static_key_enabled(struct static_key *key)
218{
219 return static_key_count(key) > 0;
220}
221
222static inline void static_key_enable(struct static_key *key) 217static inline void static_key_enable(struct static_key *key)
223{ 218{
224 int count = static_key_count(key); 219 int count = static_key_count(key);
@@ -265,6 +260,17 @@ struct static_key_false {
265#define DEFINE_STATIC_KEY_FALSE(name) \ 260#define DEFINE_STATIC_KEY_FALSE(name) \
266 struct static_key_false name = STATIC_KEY_FALSE_INIT 261 struct static_key_false name = STATIC_KEY_FALSE_INIT
267 262
263extern bool ____wrong_branch_error(void);
264
265#define static_key_enabled(x) \
266({ \
267 if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \
268 !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
269 !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
270 ____wrong_branch_error(); \
271 static_key_count((struct static_key *)x) > 0; \
272})
273
268#ifdef HAVE_JUMP_LABEL 274#ifdef HAVE_JUMP_LABEL
269 275
270/* 276/*
@@ -323,8 +329,6 @@ struct static_key_false {
323 * See jump_label_type() / jump_label_init_type(). 329 * See jump_label_type() / jump_label_init_type().
324 */ 330 */
325 331
326extern bool ____wrong_branch_error(void);
327
328#define static_branch_likely(x) \ 332#define static_branch_likely(x) \
329({ \ 333({ \
330 bool branch; \ 334 bool branch; \
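With the type-checked static_key_enabled() above, slow-path code can query
keys declared with DEFINE_STATIC_KEY_TRUE()/FALSE() directly, while hot paths
keep using the jump-label branches.  A small sketch with a hypothetical key:

DEFINE_STATIC_KEY_TRUE(my_feature_key);

/* slow path: plain count-based query, accepts all three key types */
static bool my_feature_enabled(void)
{
	return static_key_enabled(&my_feature_key);
}

/* hot path: patched jump label, no load or compare */
static __always_inline bool my_feature_fast(void)
{
	return static_branch_likely(&my_feature_key);
}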
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3e3318ddfc0e..27251ed428f7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -213,6 +213,9 @@ struct mem_cgroup {
213 /* OOM-Killer disable */ 213 /* OOM-Killer disable */
214 int oom_kill_disable; 214 int oom_kill_disable;
215 215
216 /* handle for "memory.events" */
217 struct cgroup_file events_file;
218
216 /* protect arrays of thresholds */ 219 /* protect arrays of thresholds */
217 struct mutex thresholds_lock; 220 struct mutex thresholds_lock;
218 221
@@ -285,6 +288,7 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
285 unsigned int nr) 288 unsigned int nr)
286{ 289{
287 this_cpu_add(memcg->stat->events[idx], nr); 290 this_cpu_add(memcg->stat->events[idx], nr);
291 cgroup_file_notify(&memcg->events_file);
288} 292}
289 293
290bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); 294bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
@@ -346,9 +350,7 @@ ino_t page_cgroup_ino(struct page *page);
346 350
347static inline bool mem_cgroup_disabled(void) 351static inline bool mem_cgroup_disabled(void)
348{ 352{
349 if (memory_cgrp_subsys.disabled) 353 return !cgroup_subsys_enabled(memory_cgrp_subsys);
350 return true;
351 return false;
352} 354}
353 355
354/* 356/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c115d617739d..4effb1025fbb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -771,18 +771,6 @@ struct signal_struct {
771 unsigned audit_tty_log_passwd; 771 unsigned audit_tty_log_passwd;
772 struct tty_audit_buf *tty_audit_buf; 772 struct tty_audit_buf *tty_audit_buf;
773#endif 773#endif
774#ifdef CONFIG_CGROUPS
775 /*
776 * group_rwsem prevents new tasks from entering the threadgroup and
777 * member tasks from exiting,a more specifically, setting of
778 * PF_EXITING. fork and exit paths are protected with this rwsem
779 * using threadgroup_change_begin/end(). Users which require
780 * threadgroup to remain stable should use threadgroup_[un]lock()
781 * which also takes care of exec path. Currently, cgroup is the
782 * only user.
783 */
784 struct rw_semaphore group_rwsem;
785#endif
786 774
787 oom_flags_t oom_flags; 775 oom_flags_t oom_flags;
788 short oom_score_adj; /* OOM kill score adjustment */ 776 short oom_score_adj; /* OOM kill score adjustment */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2c9eae6ad970..b9d0cce3f9ce 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,7 +45,7 @@
45#include <linux/sched.h> 45#include <linux/sched.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/spinlock.h> 47#include <linux/spinlock.h>
48#include <linux/rwsem.h> 48#include <linux/percpu-rwsem.h>
49#include <linux/string.h> 49#include <linux/string.h>
50#include <linux/sort.h> 50#include <linux/sort.h>
51#include <linux/kmod.h> 51#include <linux/kmod.h>
@@ -75,7 +75,7 @@
75 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
76 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
77 * 77 *
78 * css_set_rwsem protects task->cgroups pointer, the list of css_set 78 * css_set_lock protects task->cgroups pointer, the list of css_set
79 * objects, and the chain of tasks off each css_set. 79 * objects, and the chain of tasks off each css_set.
80 * 80 *
81 * These locks are exported if CONFIG_PROVE_RCU so that accessors in 81 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
@@ -83,12 +83,12 @@
83 */ 83 */
84#ifdef CONFIG_PROVE_RCU 84#ifdef CONFIG_PROVE_RCU
85DEFINE_MUTEX(cgroup_mutex); 85DEFINE_MUTEX(cgroup_mutex);
86DECLARE_RWSEM(css_set_rwsem); 86DEFINE_SPINLOCK(css_set_lock);
87EXPORT_SYMBOL_GPL(cgroup_mutex); 87EXPORT_SYMBOL_GPL(cgroup_mutex);
88EXPORT_SYMBOL_GPL(css_set_rwsem); 88EXPORT_SYMBOL_GPL(css_set_lock);
89#else 89#else
90static DEFINE_MUTEX(cgroup_mutex); 90static DEFINE_MUTEX(cgroup_mutex);
91static DECLARE_RWSEM(css_set_rwsem); 91static DEFINE_SPINLOCK(css_set_lock);
92#endif 92#endif
93 93
94/* 94/*
@@ -103,6 +103,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
103 */ 103 */
104static DEFINE_SPINLOCK(release_agent_path_lock); 104static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
107
106#define cgroup_assert_mutex_or_rcu_locked() \ 108#define cgroup_assert_mutex_or_rcu_locked() \
107 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ 109 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
108 !lockdep_is_held(&cgroup_mutex), \ 110 !lockdep_is_held(&cgroup_mutex), \
@@ -136,6 +138,27 @@ static const char *cgroup_subsys_name[] = {
136}; 138};
137#undef SUBSYS 139#undef SUBSYS
138 140
141/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
142#define SUBSYS(_x) \
143 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
144 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
145 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
146 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
147#include <linux/cgroup_subsys.h>
148#undef SUBSYS
149
150#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
151static struct static_key_true *cgroup_subsys_enabled_key[] = {
152#include <linux/cgroup_subsys.h>
153};
154#undef SUBSYS
155
156#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
157static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
158#include <linux/cgroup_subsys.h>
159};
160#undef SUBSYS
161
139/* 162/*
140 * The default hierarchy, reserved for the subsystems that are otherwise 163 * The default hierarchy, reserved for the subsystems that are otherwise
141 * unattached - it never has more than a single cgroup, and all tasks are 164 * unattached - it never has more than a single cgroup, and all tasks are
@@ -150,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
150 */ 173 */
151static bool cgrp_dfl_root_visible; 174static bool cgrp_dfl_root_visible;
152 175
153/*
154 * Set by the boot param of the same name and makes subsystems with NULL
155 * ->dfl_files to use ->legacy_files on the default hierarchy.
156 */
157static bool cgroup_legacy_files_on_dfl;
158
159/* some controllers are not supported in the default hierarchy */ 176/* some controllers are not supported in the default hierarchy */
160static unsigned long cgrp_dfl_root_inhibit_ss_mask; 177static unsigned long cgrp_dfl_root_inhibit_ss_mask;
161 178
@@ -183,6 +200,7 @@ static u64 css_serial_nr_next = 1;
183 */ 200 */
184static unsigned long have_fork_callback __read_mostly; 201static unsigned long have_fork_callback __read_mostly;
185static unsigned long have_exit_callback __read_mostly; 202static unsigned long have_exit_callback __read_mostly;
203static unsigned long have_free_callback __read_mostly;
186 204
187/* Ditto for the can_fork callback. */ 205/* Ditto for the can_fork callback. */
188static unsigned long have_canfork_callback __read_mostly; 206static unsigned long have_canfork_callback __read_mostly;
@@ -192,14 +210,87 @@ static struct cftype cgroup_legacy_base_files[];
192 210
193static int rebind_subsystems(struct cgroup_root *dst_root, 211static int rebind_subsystems(struct cgroup_root *dst_root,
194 unsigned long ss_mask); 212 unsigned long ss_mask);
213static void css_task_iter_advance(struct css_task_iter *it);
195static int cgroup_destroy_locked(struct cgroup *cgrp); 214static int cgroup_destroy_locked(struct cgroup *cgrp);
196static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, 215static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
197 bool visible); 216 bool visible);
198static void css_release(struct percpu_ref *ref); 217static void css_release(struct percpu_ref *ref);
199static void kill_css(struct cgroup_subsys_state *css); 218static void kill_css(struct cgroup_subsys_state *css);
200static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 219static int cgroup_addrm_files(struct cgroup_subsys_state *css,
220 struct cgroup *cgrp, struct cftype cfts[],
201 bool is_add); 221 bool is_add);
202 222
223/**
224 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
225 * @ssid: subsys ID of interest
226 *
227 * cgroup_subsys_enabled() can only be used with literal subsys names which
228 * is fine for individual subsystems but unsuitable for cgroup core. This
229 * is slower static_key_enabled() based test indexed by @ssid.
230 */
231static bool cgroup_ssid_enabled(int ssid)
232{
233 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
234}
235
236/**
237 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
238 * @cgrp: the cgroup of interest
239 *
240 * The default hierarchy is the v2 interface of cgroup and this function
241 * can be used to test whether a cgroup is on the default hierarchy for
242 * cases where a subsystem should behave differnetly depending on the
243 * interface version.
244 *
245 * The set of behaviors which change on the default hierarchy are still
246 * being determined and the mount option is prefixed with __DEVEL__.
247 *
248 * List of changed behaviors:
249 *
250 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
251 * and "name" are disallowed.
252 *
253 * - When mounting an existing superblock, mount options should match.
254 *
255 * - Remount is disallowed.
256 *
257 * - rename(2) is disallowed.
258 *
259 * - "tasks" is removed. Everything should be at process granularity. Use
260 * "cgroup.procs" instead.
261 *
262 * - "cgroup.procs" is not sorted. pids will be unique unless they got
263 * recycled inbetween reads.
264 *
265 * - "release_agent" and "notify_on_release" are removed. Replacement
266 * notification mechanism will be implemented.
267 *
268 * - "cgroup.clone_children" is removed.
269 *
270 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
271 * and its descendants contain no task; otherwise, 1. The file also
272 * generates kernfs notification which can be monitored through poll and
273 * [di]notify when the value of the file changes.
274 *
275 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
276 * take masks of ancestors with non-empty cpus/mems, instead of being
277 * moved to an ancestor.
278 *
279 * - cpuset: a task can be moved into an empty cpuset, and again it takes
280 * masks of ancestors.
281 *
282 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
283 * is not created.
284 *
285 * - blkcg: blk-throttle becomes properly hierarchical.
286 *
287 * - debug: disallowed on the default hierarchy.
288 */
289static bool cgroup_on_dfl(const struct cgroup *cgrp)
290{
291 return cgrp->root == &cgrp_dfl_root;
292}
293
203/* IDR wrappers which synchronize using cgroup_idr_lock */ 294/* IDR wrappers which synchronize using cgroup_idr_lock */
204static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, 295static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
205 gfp_t gfp_mask) 296 gfp_t gfp_mask)
@@ -332,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
332 return !(cgrp->self.flags & CSS_ONLINE); 423 return !(cgrp->self.flags & CSS_ONLINE);
333} 424}
334 425
426static void cgroup_get(struct cgroup *cgrp)
427{
428 WARN_ON_ONCE(cgroup_is_dead(cgrp));
429 css_get(&cgrp->self);
430}
431
432static bool cgroup_tryget(struct cgroup *cgrp)
433{
434 return css_tryget(&cgrp->self);
435}
436
437static void cgroup_put(struct cgroup *cgrp)
438{
439 css_put(&cgrp->self);
440}
441
335struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) 442struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
336{ 443{
337 struct cgroup *cgrp = of->kn->parent->priv; 444 struct cgroup *cgrp = of->kn->parent->priv;
@@ -481,19 +588,31 @@ struct css_set init_css_set = {
481 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 588 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
482 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 589 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
483 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 590 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
591 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
484}; 592};
485 593
486static int css_set_count = 1; /* 1 for init_css_set */ 594static int css_set_count = 1; /* 1 for init_css_set */
487 595
488/** 596/**
597 * css_set_populated - does a css_set contain any tasks?
598 * @cset: target css_set
599 */
600static bool css_set_populated(struct css_set *cset)
601{
602 lockdep_assert_held(&css_set_lock);
603
604 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
605}
606
607/**
489 * cgroup_update_populated - updated populated count of a cgroup 608 * cgroup_update_populated - updated populated count of a cgroup
490 * @cgrp: the target cgroup 609 * @cgrp: the target cgroup
491 * @populated: inc or dec populated count 610 * @populated: inc or dec populated count
492 * 611 *
493 * @cgrp is either getting the first task (css_set) or losing the last. 612 * One of the css_sets associated with @cgrp is either getting its first
494 * Update @cgrp->populated_cnt accordingly. The count is propagated 613 * task or losing the last. Update @cgrp->populated_cnt accordingly. The
495 * towards root so that a given cgroup's populated_cnt is zero iff the 614 * count is propagated towards root so that a given cgroup's populated_cnt
496 * cgroup and all its descendants are empty. 615 * is zero iff the cgroup and all its descendants don't contain any tasks.
497 * 616 *
498 * @cgrp's interface file "cgroup.populated" is zero if 617 * @cgrp's interface file "cgroup.populated" is zero if
499 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt 618 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
@@ -503,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */
503 */ 622 */
504static void cgroup_update_populated(struct cgroup *cgrp, bool populated) 623static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
505{ 624{
506 lockdep_assert_held(&css_set_rwsem); 625 lockdep_assert_held(&css_set_lock);
507 626
508 do { 627 do {
509 bool trigger; 628 bool trigger;
@@ -516,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
516 if (!trigger) 635 if (!trigger)
517 break; 636 break;
518 637
519 if (cgrp->populated_kn) 638 check_for_release(cgrp);
520 kernfs_notify(cgrp->populated_kn); 639 cgroup_file_notify(&cgrp->events_file);
640
521 cgrp = cgroup_parent(cgrp); 641 cgrp = cgroup_parent(cgrp);
522 } while (cgrp); 642 } while (cgrp);
523} 643}
524 644
645/**
646 * css_set_update_populated - update populated state of a css_set
647 * @cset: target css_set
648 * @populated: whether @cset is populated or depopulated
649 *
650 * @cset is either getting the first task or losing the last. Update the
651 * ->populated_cnt of all associated cgroups accordingly.
652 */
653static void css_set_update_populated(struct css_set *cset, bool populated)
654{
655 struct cgrp_cset_link *link;
656
657 lockdep_assert_held(&css_set_lock);
658
659 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
660 cgroup_update_populated(link->cgrp, populated);
661}
662
663/**
664 * css_set_move_task - move a task from one css_set to another
665 * @task: task being moved
666 * @from_cset: css_set @task currently belongs to (may be NULL)
667 * @to_cset: new css_set @task is being moved to (may be NULL)
668 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
669 *
670 * Move @task from @from_cset to @to_cset. If @task didn't belong to any
671 * css_set, @from_cset can be NULL. If @task is being disassociated
672 * instead of moved, @to_cset can be NULL.
673 *
674 * This function automatically handles populated_cnt updates and
675 * css_task_iter adjustments but the caller is responsible for managing
676 * @from_cset and @to_cset's reference counts.
677 */
678static void css_set_move_task(struct task_struct *task,
679 struct css_set *from_cset, struct css_set *to_cset,
680 bool use_mg_tasks)
681{
682 lockdep_assert_held(&css_set_lock);
683
684 if (from_cset) {
685 struct css_task_iter *it, *pos;
686
687 WARN_ON_ONCE(list_empty(&task->cg_list));
688
689 /*
690 * @task is leaving, advance task iterators which are
691 * pointing to it so that they can resume at the next
692 * position. Advancing an iterator might remove it from
693 * the list, use safe walk. See css_task_iter_advance*()
694 * for details.
695 */
696 list_for_each_entry_safe(it, pos, &from_cset->task_iters,
697 iters_node)
698 if (it->task_pos == &task->cg_list)
699 css_task_iter_advance(it);
700
701 list_del_init(&task->cg_list);
702 if (!css_set_populated(from_cset))
703 css_set_update_populated(from_cset, false);
704 } else {
705 WARN_ON_ONCE(!list_empty(&task->cg_list));
706 }
707
708 if (to_cset) {
709 /*
710 * We are synchronized through cgroup_threadgroup_rwsem
711 * against PF_EXITING setting such that we can't race
712 * against cgroup_exit() changing the css_set to
713 * init_css_set and dropping the old one.
714 */
715 WARN_ON_ONCE(task->flags & PF_EXITING);
716
717 if (!css_set_populated(to_cset))
718 css_set_update_populated(to_cset, true);
719 rcu_assign_pointer(task->cgroups, to_cset);
720 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
721 &to_cset->tasks);
722 }
723}
724
525/* 725/*
526 * hash table for cgroup groups. This improves the performance to find 726 * hash table for cgroup groups. This improves the performance to find
527 * an existing css_set. This hash doesn't (currently) take into 727 * an existing css_set. This hash doesn't (currently) take into
@@ -549,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset)
549 struct cgroup_subsys *ss; 749 struct cgroup_subsys *ss;
550 int ssid; 750 int ssid;
551 751
552 lockdep_assert_held(&css_set_rwsem); 752 lockdep_assert_held(&css_set_lock);
553 753
554 if (!atomic_dec_and_test(&cset->refcount)) 754 if (!atomic_dec_and_test(&cset->refcount))
555 return; 755 return;
@@ -561,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset)
561 css_set_count--; 761 css_set_count--;
562 762
563 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { 763 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
564 struct cgroup *cgrp = link->cgrp;
565
566 list_del(&link->cset_link); 764 list_del(&link->cset_link);
567 list_del(&link->cgrp_link); 765 list_del(&link->cgrp_link);
568 766 if (cgroup_parent(link->cgrp))
569 /* @cgrp can't go away while we're holding css_set_rwsem */ 767 cgroup_put(link->cgrp);
570 if (list_empty(&cgrp->cset_links)) {
571 cgroup_update_populated(cgrp, false);
572 check_for_release(cgrp);
573 }
574
575 kfree(link); 768 kfree(link);
576 } 769 }
577 770
@@ -588,9 +781,9 @@ static void put_css_set(struct css_set *cset)
588 if (atomic_add_unless(&cset->refcount, -1, 1)) 781 if (atomic_add_unless(&cset->refcount, -1, 1))
589 return; 782 return;
590 783
591 down_write(&css_set_rwsem); 784 spin_lock_bh(&css_set_lock);
592 put_css_set_locked(cset); 785 put_css_set_locked(cset);
593 up_write(&css_set_rwsem); 786 spin_unlock_bh(&css_set_lock);
594} 787}
595 788
596/* 789/*
@@ -779,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
779 link->cset = cset; 972 link->cset = cset;
780 link->cgrp = cgrp; 973 link->cgrp = cgrp;
781 974
782 if (list_empty(&cgrp->cset_links))
783 cgroup_update_populated(cgrp, true);
784 list_move(&link->cset_link, &cgrp->cset_links);
785
786 /* 975 /*
787 * Always add links to the tail of the list so that the list 976 * Always add links to the tail of the lists so that the lists are
788 * is sorted by order of hierarchy creation 977 * in choronological order.
789 */ 978 */
979 list_move_tail(&link->cset_link, &cgrp->cset_links);
790 list_add_tail(&link->cgrp_link, &cset->cgrp_links); 980 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
981
982 if (cgroup_parent(cgrp))
983 cgroup_get(cgrp);
791} 984}
792 985
793/** 986/**
@@ -813,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
813 1006
814 /* First see if we already have a cgroup group that matches 1007 /* First see if we already have a cgroup group that matches
815 * the desired set */ 1008 * the desired set */
816 down_read(&css_set_rwsem); 1009 spin_lock_bh(&css_set_lock);
817 cset = find_existing_css_set(old_cset, cgrp, template); 1010 cset = find_existing_css_set(old_cset, cgrp, template);
818 if (cset) 1011 if (cset)
819 get_css_set(cset); 1012 get_css_set(cset);
820 up_read(&css_set_rwsem); 1013 spin_unlock_bh(&css_set_lock);
821 1014
822 if (cset) 1015 if (cset)
823 return cset; 1016 return cset;
@@ -838,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
838 INIT_LIST_HEAD(&cset->mg_tasks); 1031 INIT_LIST_HEAD(&cset->mg_tasks);
839 INIT_LIST_HEAD(&cset->mg_preload_node); 1032 INIT_LIST_HEAD(&cset->mg_preload_node);
840 INIT_LIST_HEAD(&cset->mg_node); 1033 INIT_LIST_HEAD(&cset->mg_node);
1034 INIT_LIST_HEAD(&cset->task_iters);
841 INIT_HLIST_NODE(&cset->hlist); 1035 INIT_HLIST_NODE(&cset->hlist);
842 1036
843 /* Copy the set of subsystem state objects generated in 1037 /* Copy the set of subsystem state objects generated in
844 * find_existing_css_set() */ 1038 * find_existing_css_set() */
845 memcpy(cset->subsys, template, sizeof(cset->subsys)); 1039 memcpy(cset->subsys, template, sizeof(cset->subsys));
846 1040
847 down_write(&css_set_rwsem); 1041 spin_lock_bh(&css_set_lock);
848 /* Add reference counts and links from the new css_set. */ 1042 /* Add reference counts and links from the new css_set. */
849 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 1043 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
850 struct cgroup *c = link->cgrp; 1044 struct cgroup *c = link->cgrp;
@@ -866,53 +1060,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
866 list_add_tail(&cset->e_cset_node[ssid], 1060 list_add_tail(&cset->e_cset_node[ssid],
867 &cset->subsys[ssid]->cgroup->e_csets[ssid]); 1061 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
868 1062
869 up_write(&css_set_rwsem); 1063 spin_unlock_bh(&css_set_lock);
870 1064
871 return cset; 1065 return cset;
872} 1066}
873 1067
874void cgroup_threadgroup_change_begin(struct task_struct *tsk)
875{
876 down_read(&tsk->signal->group_rwsem);
877}
878
879void cgroup_threadgroup_change_end(struct task_struct *tsk)
880{
881 up_read(&tsk->signal->group_rwsem);
882}
883
884/**
885 * threadgroup_lock - lock threadgroup
886 * @tsk: member task of the threadgroup to lock
887 *
888 * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
889 * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
890 * change ->group_leader/pid. This is useful for cases where the threadgroup
891 * needs to stay stable across blockable operations.
892 *
893 * fork and exit explicitly call threadgroup_change_{begin|end}() for
894 * synchronization. While held, no new task will be added to threadgroup
895 * and no existing live task will have its PF_EXITING set.
896 *
897 * de_thread() does threadgroup_change_{begin|end}() when a non-leader
898 * sub-thread becomes a new leader.
899 */
900static void threadgroup_lock(struct task_struct *tsk)
901{
902 down_write(&tsk->signal->group_rwsem);
903}
904
905/**
906 * threadgroup_unlock - unlock threadgroup
907 * @tsk: member task of the threadgroup to unlock
908 *
909 * Reverse threadgroup_lock().
910 */
911static inline void threadgroup_unlock(struct task_struct *tsk)
912{
913 up_write(&tsk->signal->group_rwsem);
914}
915
916static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) 1068static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
917{ 1069{
918 struct cgroup *root_cgrp = kf_root->kn->priv; 1070 struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -972,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
972 * Release all the links from cset_links to this hierarchy's 1124 * Release all the links from cset_links to this hierarchy's
973 * root cgroup 1125 * root cgroup
974 */ 1126 */
975 down_write(&css_set_rwsem); 1127 spin_lock_bh(&css_set_lock);
976 1128
977 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1129 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
978 list_del(&link->cset_link); 1130 list_del(&link->cset_link);
979 list_del(&link->cgrp_link); 1131 list_del(&link->cgrp_link);
980 kfree(link); 1132 kfree(link);
981 } 1133 }
982 up_write(&css_set_rwsem); 1134
1135 spin_unlock_bh(&css_set_lock);
983 1136
984 if (!list_empty(&root->root_list)) { 1137 if (!list_empty(&root->root_list)) {
985 list_del(&root->root_list); 1138 list_del(&root->root_list);
@@ -1001,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1001 struct cgroup *res = NULL; 1154 struct cgroup *res = NULL;
1002 1155
1003 lockdep_assert_held(&cgroup_mutex); 1156 lockdep_assert_held(&cgroup_mutex);
1004 lockdep_assert_held(&css_set_rwsem); 1157 lockdep_assert_held(&css_set_lock);
1005 1158
1006 if (cset == &init_css_set) { 1159 if (cset == &init_css_set) {
1007 res = &root->cgrp; 1160 res = &root->cgrp;
@@ -1024,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1024 1177
1025/* 1178/*
1026 * Return the cgroup for "task" from the given hierarchy. Must be 1179 * Return the cgroup for "task" from the given hierarchy. Must be
1027 * called with cgroup_mutex and css_set_rwsem held. 1180 * called with cgroup_mutex and css_set_lock held.
1028 */ 1181 */
1029static struct cgroup *task_cgroup_from_root(struct task_struct *task, 1182static struct cgroup *task_cgroup_from_root(struct task_struct *task,
1030 struct cgroup_root *root) 1183 struct cgroup_root *root)
@@ -1063,7 +1216,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
1063 * update of a tasks cgroup pointer by cgroup_attach_task() 1216 * update of a tasks cgroup pointer by cgroup_attach_task()
1064 */ 1217 */
1065 1218
1066static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
1067static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 1219static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1068static const struct file_operations proc_cgroupstats_operations; 1220static const struct file_operations proc_cgroupstats_operations;
1069 1221
@@ -1086,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1086 * cgroup_file_mode - deduce file mode of a control file 1238 * cgroup_file_mode - deduce file mode of a control file
1087 * @cft: the control file in question 1239 * @cft: the control file in question
1088 * 1240 *
1089 * returns cft->mode if ->mode is not 0 1241 * S_IRUGO for read, S_IWUSR for write.
1090 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
1091 * returns S_IRUGO if it has only a read handler
1092 * returns S_IWUSR if it has only a write hander
1093 */ 1242 */
1094static umode_t cgroup_file_mode(const struct cftype *cft) 1243static umode_t cgroup_file_mode(const struct cftype *cft)
1095{ 1244{
1096 umode_t mode = 0; 1245 umode_t mode = 0;
1097 1246
1098 if (cft->mode)
1099 return cft->mode;
1100
1101 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1247 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1102 mode |= S_IRUGO; 1248 mode |= S_IRUGO;
1103 1249
1104 if (cft->write_u64 || cft->write_s64 || cft->write) 1250 if (cft->write_u64 || cft->write_s64 || cft->write) {
1105 mode |= S_IWUSR; 1251 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1252 mode |= S_IWUGO;
1253 else
1254 mode |= S_IWUSR;
1255 }
1106 1256
1107 return mode; 1257 return mode;
1108} 1258}
1109 1259
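With the per-cftype ->mode override gone, cgroup_file_mode() derives permissions purely from which handlers a control file provides, and widens the write bit to group/other only when CFTYPE_WORLD_WRITABLE is set. A self-contained sketch of that derivation; the struct and flag value are illustrative stand-ins, and S_IRUGO/S_IWUGO are spelled out locally because userspace headers lack the kernel shorthands:

    #include <stdio.h>
    #include <sys/stat.h>

    #define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)   /* kernel shorthand, spelled out */
    #define S_IWUGO (S_IWUSR | S_IWGRP | S_IWOTH)
    #define CFTYPE_WORLD_WRITABLE 0x1               /* illustrative flag value */

    struct cftype_sketch {
        const char *name;
        unsigned int flags;
        int has_read;    /* any of read_u64 / read_s64 / seq_show */
        int has_write;   /* any of write_u64 / write_s64 / write */
    };

    static mode_t cgroup_file_mode_sketch(const struct cftype_sketch *cft)
    {
        mode_t mode = 0;

        if (cft->has_read)
            mode |= S_IRUGO;

        if (cft->has_write) {
            if (cft->flags & CFTYPE_WORLD_WRITABLE)
                mode |= S_IWUGO;
            else
                mode |= S_IWUSR;
        }
        return mode;
    }

    int main(void)
    {
        struct cftype_sketch procs = { "cgroup.procs", 0, 1, 1 };
        struct cftype_sketch world = { "demo.writable", CFTYPE_WORLD_WRITABLE, 1, 1 };

        printf("%s -> %04o\n", procs.name, (unsigned)cgroup_file_mode_sketch(&procs));
        printf("%s -> %04o\n", world.name, (unsigned)cgroup_file_mode_sketch(&world));
        return 0;
    }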
1110static void cgroup_get(struct cgroup *cgrp)
1111{
1112 WARN_ON_ONCE(cgroup_is_dead(cgrp));
1113 css_get(&cgrp->self);
1114}
1115
1116static bool cgroup_tryget(struct cgroup *cgrp)
1117{
1118 return css_tryget(&cgrp->self);
1119}
1120
1121static void cgroup_put(struct cgroup *cgrp)
1122{
1123 css_put(&cgrp->self);
1124}
1125
1126/** 1260/**
1127 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask 1261 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1128 * @cgrp: the target cgroup 1262 * @cgrp: the target cgroup
@@ -1263,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1263} 1397}
1264 1398
1265/** 1399/**
1266 * cgroup_clear_dir - remove subsys files in a cgroup directory 1400 * css_clear_dir - remove subsys files in a cgroup directory
1267 * @cgrp: target cgroup 1401 * @css: target css
1268 * @subsys_mask: mask of the subsystem ids whose files should be removed 1402 * @cgrp_override: specify if target cgroup is different from css->cgroup
1269 */ 1403 */
1270static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1404static void css_clear_dir(struct cgroup_subsys_state *css,
1405 struct cgroup *cgrp_override)
1271{ 1406{
1272 struct cgroup_subsys *ss; 1407 struct cgroup *cgrp = cgrp_override ?: css->cgroup;
1273 int i; 1408 struct cftype *cfts;
1274 1409
1275 for_each_subsys(ss, i) { 1410 list_for_each_entry(cfts, &css->ss->cfts, node)
1276 struct cftype *cfts; 1411 cgroup_addrm_files(css, cgrp, cfts, false);
1412}
1277 1413
1278 if (!(subsys_mask & (1 << i))) 1414/**
1279 continue; 1415 * css_populate_dir - create subsys files in a cgroup directory
1280 list_for_each_entry(cfts, &ss->cfts, node) 1416 * @css: target css
1281 cgroup_addrm_files(cgrp, cfts, false); 1417 * @cgrp_override: specify if target cgroup is different from css->cgroup
1418 *
1419 * On failure, no file is added.
1420 */
1421static int css_populate_dir(struct cgroup_subsys_state *css,
1422 struct cgroup *cgrp_override)
1423{
1424 struct cgroup *cgrp = cgrp_override ?: css->cgroup;
1425 struct cftype *cfts, *failed_cfts;
1426 int ret;
1427
1428 if (!css->ss) {
1429 if (cgroup_on_dfl(cgrp))
1430 cfts = cgroup_dfl_base_files;
1431 else
1432 cfts = cgroup_legacy_base_files;
1433
1434 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1435 }
1436
1437 list_for_each_entry(cfts, &css->ss->cfts, node) {
1438 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1439 if (ret < 0) {
1440 failed_cfts = cfts;
1441 goto err;
1442 }
1282 } 1443 }
1444 return 0;
1445err:
1446 list_for_each_entry(cfts, &css->ss->cfts, node) {
1447 if (cfts == failed_cfts)
1448 break;
1449 cgroup_addrm_files(css, cgrp, cfts, false);
1450 }
1451 return ret;
1283} 1452}
1284 1453
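css_populate_dir() replaces the mask-based cgroup_populate_dir(): it handles one css, and when adding a cftype array fails it walks back over the arrays already added and removes them, so each call is all-or-nothing. A minimal sketch of that unwind shape over plain integers (all names illustrative):

    /* Sketch of the add-all-or-unwind pattern used by css_populate_dir(). */
    #include <stdio.h>

    static int add_group(int id)
    {
        if (id == 2)                   /* pretend the third group fails */
            return -1;
        printf("added group %d\n", id);
        return 0;
    }

    static void remove_group(int id)
    {
        printf("removed group %d\n", id);
    }

    static int populate(int ngroups)
    {
        int i, failed = -1;

        for (i = 0; i < ngroups; i++) {
            if (add_group(i) < 0) {
                failed = i;
                goto err;
            }
        }
        return 0;
    err:
        for (i = 0; i < failed; i++)   /* unwind only what was added */
            remove_group(i);
        return -1;
    }

    int main(void)
    {
        return populate(4) ? 1 : 0;
    }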
1285static int rebind_subsystems(struct cgroup_root *dst_root, 1454static int rebind_subsystems(struct cgroup_root *dst_root,
1286 unsigned long ss_mask) 1455 unsigned long ss_mask)
1287{ 1456{
1457 struct cgroup *dcgrp = &dst_root->cgrp;
1288 struct cgroup_subsys *ss; 1458 struct cgroup_subsys *ss;
1289 unsigned long tmp_ss_mask; 1459 unsigned long tmp_ss_mask;
1290 int ssid, i, ret; 1460 int ssid, i, ret;
@@ -1306,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1306 if (dst_root == &cgrp_dfl_root) 1476 if (dst_root == &cgrp_dfl_root)
1307 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; 1477 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1308 1478
1309 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); 1479 for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
1310 if (ret) { 1480 struct cgroup *scgrp = &ss->root->cgrp;
1311 if (dst_root != &cgrp_dfl_root) 1481 int tssid;
1312 return ret; 1482
1483 ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
1484 if (!ret)
1485 continue;
1313 1486
1314 /* 1487 /*
1315 * Rebinding back to the default root is not allowed to 1488 * Rebinding back to the default root is not allowed to
@@ -1317,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1317 * be rare. Moving subsystems back and forth even more so. 1490 * be rare. Moving subsystems back and forth even more so.
1318 * Just warn about it and continue. 1491 * Just warn about it and continue.
1319 */ 1492 */
1320 if (cgrp_dfl_root_visible) { 1493 if (dst_root == &cgrp_dfl_root) {
1321 pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", 1494 if (cgrp_dfl_root_visible) {
1322 ret, ss_mask); 1495 pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
1323 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); 1496 ret, ss_mask);
1497 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1498 }
1499 continue;
1500 }
1501
1502 for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
1503 if (tssid == ssid)
1504 break;
1505 css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
1324 } 1506 }
1507 return ret;
1325 } 1508 }
1326 1509
1327 /* 1510 /*
1328 * Nothing can fail from this point on. Remove files for the 1511 * Nothing can fail from this point on. Remove files for the
1329 * removed subsystems and rebind each subsystem. 1512 * removed subsystems and rebind each subsystem.
1330 */ 1513 */
1331 for_each_subsys_which(ss, ssid, &ss_mask)
1332 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1333
1334 for_each_subsys_which(ss, ssid, &ss_mask) { 1514 for_each_subsys_which(ss, ssid, &ss_mask) {
1335 struct cgroup_root *src_root; 1515 struct cgroup_root *src_root = ss->root;
1336 struct cgroup_subsys_state *css; 1516 struct cgroup *scgrp = &src_root->cgrp;
1517 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1337 struct css_set *cset; 1518 struct css_set *cset;
1338 1519
1339 src_root = ss->root; 1520 WARN_ON(!css || cgroup_css(dcgrp, ss));
1340 css = cgroup_css(&src_root->cgrp, ss);
1341 1521
1342 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); 1522 css_clear_dir(css, NULL);
1343 1523
1344 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); 1524 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1345 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); 1525 rcu_assign_pointer(dcgrp->subsys[ssid], css);
1346 ss->root = dst_root; 1526 ss->root = dst_root;
1347 css->cgroup = &dst_root->cgrp; 1527 css->cgroup = dcgrp;
1348 1528
1349 down_write(&css_set_rwsem); 1529 spin_lock_bh(&css_set_lock);
1350 hash_for_each(css_set_table, i, cset, hlist) 1530 hash_for_each(css_set_table, i, cset, hlist)
1351 list_move_tail(&cset->e_cset_node[ss->id], 1531 list_move_tail(&cset->e_cset_node[ss->id],
1352 &dst_root->cgrp.e_csets[ss->id]); 1532 &dcgrp->e_csets[ss->id]);
1353 up_write(&css_set_rwsem); 1533 spin_unlock_bh(&css_set_lock);
1354 1534
1355 src_root->subsys_mask &= ~(1 << ssid); 1535 src_root->subsys_mask &= ~(1 << ssid);
1356 src_root->cgrp.subtree_control &= ~(1 << ssid); 1536 scgrp->subtree_control &= ~(1 << ssid);
1357 cgroup_refresh_child_subsys_mask(&src_root->cgrp); 1537 cgroup_refresh_child_subsys_mask(scgrp);
1358 1538
1359 /* default hierarchy doesn't enable controllers by default */ 1539 /* default hierarchy doesn't enable controllers by default */
1360 dst_root->subsys_mask |= 1 << ssid; 1540 dst_root->subsys_mask |= 1 << ssid;
1361 if (dst_root != &cgrp_dfl_root) { 1541 if (dst_root == &cgrp_dfl_root) {
1362 dst_root->cgrp.subtree_control |= 1 << ssid; 1542 static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1363 cgroup_refresh_child_subsys_mask(&dst_root->cgrp); 1543 } else {
1544 dcgrp->subtree_control |= 1 << ssid;
1545 cgroup_refresh_child_subsys_mask(dcgrp);
1546 static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1364 } 1547 }
1365 1548
1366 if (ss->bind) 1549 if (ss->bind)
1367 ss->bind(css); 1550 ss->bind(css);
1368 } 1551 }
1369 1552
1370 kernfs_activate(dst_root->cgrp.kn); 1553 kernfs_activate(dcgrp->kn);
1371 return 0; 1554 return 0;
1372} 1555}
1373 1556
@@ -1497,7 +1680,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1497 for_each_subsys(ss, i) { 1680 for_each_subsys(ss, i) {
1498 if (strcmp(token, ss->legacy_name)) 1681 if (strcmp(token, ss->legacy_name))
1499 continue; 1682 continue;
1500 if (ss->disabled) 1683 if (!cgroup_ssid_enabled(i))
1501 continue; 1684 continue;
1502 1685
1503 /* Mutually exclusive option 'all' + subsystem name */ 1686 /* Mutually exclusive option 'all' + subsystem name */
@@ -1528,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1528 */ 1711 */
1529 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1712 if (all_ss || (!one_ss && !opts->none && !opts->name))
1530 for_each_subsys(ss, i) 1713 for_each_subsys(ss, i)
1531 if (!ss->disabled) 1714 if (cgroup_ssid_enabled(i))
1532 opts->subsys_mask |= (1 << i); 1715 opts->subsys_mask |= (1 << i);
1533 1716
1534 /* 1717 /*
@@ -1624,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void)
1624{ 1807{
1625 struct task_struct *p, *g; 1808 struct task_struct *p, *g;
1626 1809
1627 down_write(&css_set_rwsem); 1810 spin_lock_bh(&css_set_lock);
1628 1811
1629 if (use_task_css_set_links) 1812 if (use_task_css_set_links)
1630 goto out_unlock; 1813 goto out_unlock;
@@ -1654,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void)
1654 if (!(p->flags & PF_EXITING)) { 1837 if (!(p->flags & PF_EXITING)) {
1655 struct css_set *cset = task_css_set(p); 1838 struct css_set *cset = task_css_set(p);
1656 1839
1657 list_add(&p->cg_list, &cset->tasks); 1840 if (!css_set_populated(cset))
1841 css_set_update_populated(cset, true);
1842 list_add_tail(&p->cg_list, &cset->tasks);
1658 get_css_set(cset); 1843 get_css_set(cset);
1659 } 1844 }
1660 spin_unlock_irq(&p->sighand->siglock); 1845 spin_unlock_irq(&p->sighand->siglock);
1661 } while_each_thread(g, p); 1846 } while_each_thread(g, p);
1662 read_unlock(&tasklist_lock); 1847 read_unlock(&tasklist_lock);
1663out_unlock: 1848out_unlock:
1664 up_write(&css_set_rwsem); 1849 spin_unlock_bh(&css_set_lock);
1665} 1850}
1666 1851
1667static void init_cgroup_housekeeping(struct cgroup *cgrp) 1852static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1671,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1671 1856
1672 INIT_LIST_HEAD(&cgrp->self.sibling); 1857 INIT_LIST_HEAD(&cgrp->self.sibling);
1673 INIT_LIST_HEAD(&cgrp->self.children); 1858 INIT_LIST_HEAD(&cgrp->self.children);
1859 INIT_LIST_HEAD(&cgrp->self.files);
1674 INIT_LIST_HEAD(&cgrp->cset_links); 1860 INIT_LIST_HEAD(&cgrp->cset_links);
1675 INIT_LIST_HEAD(&cgrp->pidlists); 1861 INIT_LIST_HEAD(&cgrp->pidlists);
1676 mutex_init(&cgrp->pidlist_mutex); 1862 mutex_init(&cgrp->pidlist_mutex);
@@ -1708,7 +1894,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1708{ 1894{
1709 LIST_HEAD(tmp_links); 1895 LIST_HEAD(tmp_links);
1710 struct cgroup *root_cgrp = &root->cgrp; 1896 struct cgroup *root_cgrp = &root->cgrp;
1711 struct cftype *base_files;
1712 struct css_set *cset; 1897 struct css_set *cset;
1713 int i, ret; 1898 int i, ret;
1714 1899
@@ -1725,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1725 goto out; 1910 goto out;
1726 1911
1727 /* 1912 /*
1728 * We're accessing css_set_count without locking css_set_rwsem here, 1913 * We're accessing css_set_count without locking css_set_lock here,
1729 * but that's OK - it can only be increased by someone holding 1914 * but that's OK - it can only be increased by someone holding
1730 * cgroup_lock, and that's us. The worst that can happen is that we 1915 * cgroup_lock, and that's us. The worst that can happen is that we
1731 * have some link structures left over 1916 * have some link structures left over
@@ -1747,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1747 } 1932 }
1748 root_cgrp->kn = root->kf_root->kn; 1933 root_cgrp->kn = root->kf_root->kn;
1749 1934
1750 if (root == &cgrp_dfl_root) 1935 ret = css_populate_dir(&root_cgrp->self, NULL);
1751 base_files = cgroup_dfl_base_files;
1752 else
1753 base_files = cgroup_legacy_base_files;
1754
1755 ret = cgroup_addrm_files(root_cgrp, base_files, true);
1756 if (ret) 1936 if (ret)
1757 goto destroy_root; 1937 goto destroy_root;
1758 1938
@@ -1772,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1772 * Link the root cgroup in this hierarchy into all the css_set 1952 * Link the root cgroup in this hierarchy into all the css_set
1773 * objects. 1953 * objects.
1774 */ 1954 */
1775 down_write(&css_set_rwsem); 1955 spin_lock_bh(&css_set_lock);
1776 hash_for_each(css_set_table, i, cset, hlist) 1956 hash_for_each(css_set_table, i, cset, hlist) {
1777 link_css_set(&tmp_links, cset, root_cgrp); 1957 link_css_set(&tmp_links, cset, root_cgrp);
1778 up_write(&css_set_rwsem); 1958 if (css_set_populated(cset))
1959 cgroup_update_populated(root_cgrp, true);
1960 }
1961 spin_unlock_bh(&css_set_lock);
1779 1962
1780 BUG_ON(!list_empty(&root_cgrp->self.children)); 1963 BUG_ON(!list_empty(&root_cgrp->self.children));
1781 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1964 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2008,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2008 char *path = NULL; 2191 char *path = NULL;
2009 2192
2010 mutex_lock(&cgroup_mutex); 2193 mutex_lock(&cgroup_mutex);
2011 down_read(&css_set_rwsem); 2194 spin_lock_bh(&css_set_lock);
2012 2195
2013 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 2196 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2014 2197
@@ -2021,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2021 path = buf; 2204 path = buf;
2022 } 2205 }
2023 2206
2024 up_read(&css_set_rwsem); 2207 spin_unlock_bh(&css_set_lock);
2025 mutex_unlock(&cgroup_mutex); 2208 mutex_unlock(&cgroup_mutex);
2026 return path; 2209 return path;
2027} 2210}
@@ -2049,6 +2232,49 @@ struct cgroup_taskset {
2049 struct task_struct *cur_task; 2232 struct task_struct *cur_task;
2050}; 2233};
2051 2234
2235#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
2236 .src_csets = LIST_HEAD_INIT(tset.src_csets), \
2237 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
2238 .csets = &tset.src_csets, \
2239}
2240
2241/**
2242 * cgroup_taskset_add - try to add a migration target task to a taskset
2243 * @task: target task
2244 * @tset: target taskset
2245 *
2246 * Add @task, which is a migration target, to @tset. This function becomes
 2247 * a noop if @task doesn't need to be migrated. @task's css_set should have
2248 * been added as a migration source and @task->cg_list will be moved from
2249 * the css_set's tasks list to mg_tasks one.
2250 */
2251static void cgroup_taskset_add(struct task_struct *task,
2252 struct cgroup_taskset *tset)
2253{
2254 struct css_set *cset;
2255
2256 lockdep_assert_held(&css_set_lock);
2257
2258 /* @task either already exited or can't exit until the end */
2259 if (task->flags & PF_EXITING)
2260 return;
2261
2262 /* leave @task alone if post_fork() hasn't linked it yet */
2263 if (list_empty(&task->cg_list))
2264 return;
2265
2266 cset = task_css_set(task);
2267 if (!cset->mg_src_cgrp)
2268 return;
2269
2270 list_move_tail(&task->cg_list, &cset->mg_tasks);
2271 if (list_empty(&cset->mg_node))
2272 list_add_tail(&cset->mg_node, &tset->src_csets);
2273 if (list_empty(&cset->mg_dst_cset->mg_node))
2274 list_move_tail(&cset->mg_dst_cset->mg_node,
2275 &tset->dst_csets);
2276}
2277
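CGROUP_TASKSET_INIT() lets callers build an on-stack taskset whose embedded list heads already point at themselves, which is what an empty list_head requires. A standalone sketch of the same compound-literal initializer; the list_head and taskset types below are local stand-ins, not the kernel's:

    /* Minimal userspace sketch of the CGROUP_TASKSET_INIT-style initializer. */
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    struct taskset {
        struct list_head src_csets;
        struct list_head dst_csets;
        struct list_head *csets;       /* the list currently being consumed */
    };

    #define TASKSET_INIT(tset) (struct taskset){            \
        .src_csets = LIST_HEAD_INIT(tset.src_csets),        \
        .dst_csets = LIST_HEAD_INIT(tset.dst_csets),        \
        .csets     = &tset.src_csets,                       \
    }

    int main(void)
    {
        struct taskset tset = TASKSET_INIT(tset);

        /* an empty list head points back at itself */
        printf("src empty: %d\n", tset.src_csets.next == &tset.src_csets);
        printf("consuming src first: %d\n", tset.csets == &tset.src_csets);
        return 0;
    }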
2052/** 2278/**
2053 * cgroup_taskset_first - reset taskset and return the first task 2279 * cgroup_taskset_first - reset taskset and return the first task
2054 * @tset: taskset of interest 2280 * @tset: taskset of interest
@@ -2096,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
2096} 2322}
2097 2323
2098/** 2324/**
2099 * cgroup_task_migrate - move a task from one cgroup to another. 2325 * cgroup_taskset_migrate - migrate a taskset to a cgroup
 2100 * @old_cgrp: the cgroup @tsk is being migrated from 2326 * @tset: target taskset
2101 * @tsk: the task being migrated 2327 * @dst_cgrp: destination cgroup
2102 * @new_cset: the new css_set @tsk is being attached to
2103 * 2328 *
2104 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. 2329 * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
2330 * ->can_attach callbacks fails and guarantees that either all or none of
2331 * the tasks in @tset are migrated. @tset is consumed regardless of
2332 * success.
2105 */ 2333 */
2106static void cgroup_task_migrate(struct cgroup *old_cgrp, 2334static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2107 struct task_struct *tsk, 2335 struct cgroup *dst_cgrp)
2108 struct css_set *new_cset)
2109{ 2336{
2110 struct css_set *old_cset; 2337 struct cgroup_subsys_state *css, *failed_css = NULL;
2111 2338 struct task_struct *task, *tmp_task;
2112 lockdep_assert_held(&cgroup_mutex); 2339 struct css_set *cset, *tmp_cset;
2113 lockdep_assert_held(&css_set_rwsem); 2340 int i, ret;
2114 2341
2115 /* 2342 /* methods shouldn't be called if no task is actually migrating */
2116 * We are synchronized through threadgroup_lock() against PF_EXITING 2343 if (list_empty(&tset->src_csets))
2117 * setting such that we can't race against cgroup_exit() changing the 2344 return 0;
2118 * css_set to init_css_set and dropping the old one.
2119 */
2120 WARN_ON_ONCE(tsk->flags & PF_EXITING);
2121 old_cset = task_css_set(tsk);
2122 2345
2123 get_css_set(new_cset); 2346 /* check that we can legitimately attach to the cgroup */
2124 rcu_assign_pointer(tsk->cgroups, new_cset); 2347 for_each_e_css(css, i, dst_cgrp) {
2348 if (css->ss->can_attach) {
2349 ret = css->ss->can_attach(css, tset);
2350 if (ret) {
2351 failed_css = css;
2352 goto out_cancel_attach;
2353 }
2354 }
2355 }
2125 2356
2126 /* 2357 /*
2127 * Use move_tail so that cgroup_taskset_first() still returns the 2358 * Now that we're guaranteed success, proceed to move all tasks to
2128 * leader after migration. This works because cgroup_migrate() 2359 * the new cgroup. There are no failure cases after here, so this
2129 * ensures that the dst_cset of the leader is the first on the 2360 * is the commit point.
2130 * tset's dst_csets list.
2131 */ 2361 */
2132 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); 2362 spin_lock_bh(&css_set_lock);
2363 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2364 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2365 struct css_set *from_cset = task_css_set(task);
2366 struct css_set *to_cset = cset->mg_dst_cset;
2367
2368 get_css_set(to_cset);
2369 css_set_move_task(task, from_cset, to_cset, true);
2370 put_css_set_locked(from_cset);
2371 }
2372 }
2373 spin_unlock_bh(&css_set_lock);
2133 2374
2134 /* 2375 /*
2135 * We just gained a reference on old_cset by taking it from the 2376 * Migration is committed, all target tasks are now on dst_csets.
2136 * task. As trading it for new_cset is protected by cgroup_mutex, 2377 * Nothing is sensitive to fork() after this point. Notify
2137 * we're safe to drop it here; it will be freed under RCU. 2378 * controllers that migration is complete.
2138 */ 2379 */
2139 put_css_set_locked(old_cset); 2380 tset->csets = &tset->dst_csets;
2381
2382 for_each_e_css(css, i, dst_cgrp)
2383 if (css->ss->attach)
2384 css->ss->attach(css, tset);
2385
2386 ret = 0;
2387 goto out_release_tset;
2388
2389out_cancel_attach:
2390 for_each_e_css(css, i, dst_cgrp) {
2391 if (css == failed_css)
2392 break;
2393 if (css->ss->cancel_attach)
2394 css->ss->cancel_attach(css, tset);
2395 }
2396out_release_tset:
2397 spin_lock_bh(&css_set_lock);
2398 list_splice_init(&tset->dst_csets, &tset->src_csets);
2399 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2400 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2401 list_del_init(&cset->mg_node);
2402 }
2403 spin_unlock_bh(&css_set_lock);
2404 return ret;
2140} 2405}
2141 2406
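cgroup_taskset_migrate() is now the single place that implements the attach protocol: each controller's ->can_attach (where implemented) is consulted first, the task moves are committed only if all of them agree, and ->cancel_attach undoes the approvals already granted when one refuses, so the migration happens for every task in the set or for none. A minimal sketch of that prepare/commit/cancel flow (the controller table and callbacks are illustrative):

    /* Sketch of the can_attach / attach / cancel_attach protocol. */
    #include <stdio.h>

    struct controller {
        const char *name;
        int (*can_attach)(void);
        void (*attach)(void);
        void (*cancel_attach)(void);
    };

    static int migrate(struct controller *cs, int n)
    {
        int i, failed = -1;

        for (i = 0; i < n; i++) {              /* phase 1: everyone must agree */
            if (cs[i].can_attach && cs[i].can_attach()) {
                failed = i;
                goto cancel;
            }
        }

        for (i = 0; i < n; i++)                /* phase 2: commit, cannot fail */
            if (cs[i].attach)
                cs[i].attach();
        return 0;

    cancel:
        for (i = 0; i < failed; i++)           /* undo only the ones that said yes */
            if (cs[i].cancel_attach)
                cs[i].cancel_attach();
        return -1;
    }

    static int ok(void)      { return 0; }
    static int veto(void)    { return -1; }
    static void did(void)    { printf("attached\n"); }
    static void undid(void)  { printf("cancelled\n"); }

    int main(void)
    {
        struct controller cs[] = {
            { "cpu",  ok,   did, undid },
            { "pids", veto, did, undid },      /* this one refuses */
        };

        return migrate(cs, 2) ? 1 : 0;
    }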
2142/** 2407/**
@@ -2152,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2152 2417
2153 lockdep_assert_held(&cgroup_mutex); 2418 lockdep_assert_held(&cgroup_mutex);
2154 2419
2155 down_write(&css_set_rwsem); 2420 spin_lock_bh(&css_set_lock);
2156 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 2421 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2157 cset->mg_src_cgrp = NULL; 2422 cset->mg_src_cgrp = NULL;
2158 cset->mg_dst_cset = NULL; 2423 cset->mg_dst_cset = NULL;
2159 list_del_init(&cset->mg_preload_node); 2424 list_del_init(&cset->mg_preload_node);
2160 put_css_set_locked(cset); 2425 put_css_set_locked(cset);
2161 } 2426 }
2162 up_write(&css_set_rwsem); 2427 spin_unlock_bh(&css_set_lock);
2163} 2428}
2164 2429
2165/** 2430/**
@@ -2172,10 +2437,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2172 * @src_cset and add it to @preloaded_csets, which should later be cleaned 2437 * @src_cset and add it to @preloaded_csets, which should later be cleaned
2173 * up by cgroup_migrate_finish(). 2438 * up by cgroup_migrate_finish().
2174 * 2439 *
2175 * This function may be called without holding threadgroup_lock even if the 2440 * This function may be called without holding cgroup_threadgroup_rwsem
2176 * target is a process. Threads may be created and destroyed but as long 2441 * even if the target is a process. Threads may be created and destroyed
2177 * as cgroup_mutex is not dropped, no new css_set can be put into play and 2442 * but as long as cgroup_mutex is not dropped, no new css_set can be put
2178 * the preloaded css_sets are guaranteed to cover all migrations. 2443 * into play and the preloaded css_sets are guaranteed to cover all
2444 * migrations.
2179 */ 2445 */
2180static void cgroup_migrate_add_src(struct css_set *src_cset, 2446static void cgroup_migrate_add_src(struct css_set *src_cset,
2181 struct cgroup *dst_cgrp, 2447 struct cgroup *dst_cgrp,
@@ -2184,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
2184 struct cgroup *src_cgrp; 2450 struct cgroup *src_cgrp;
2185 2451
2186 lockdep_assert_held(&cgroup_mutex); 2452 lockdep_assert_held(&cgroup_mutex);
2187 lockdep_assert_held(&css_set_rwsem); 2453 lockdep_assert_held(&css_set_lock);
2188 2454
2189 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2455 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2190 2456
@@ -2273,12 +2539,12 @@ err:
2273 2539
2274/** 2540/**
2275 * cgroup_migrate - migrate a process or task to a cgroup 2541 * cgroup_migrate - migrate a process or task to a cgroup
2276 * @cgrp: the destination cgroup
2277 * @leader: the leader of the process or the task to migrate 2542 * @leader: the leader of the process or the task to migrate
2278 * @threadgroup: whether @leader points to the whole process or a single task 2543 * @threadgroup: whether @leader points to the whole process or a single task
2544 * @cgrp: the destination cgroup
2279 * 2545 *
2280 * Migrate a process or task denoted by @leader to @cgrp. If migrating a 2546 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
2281 * process, the caller must be holding threadgroup_lock of @leader. The 2547 * process, the caller must be holding cgroup_threadgroup_rwsem. The
2282 * caller is also responsible for invoking cgroup_migrate_add_src() and 2548 * caller is also responsible for invoking cgroup_migrate_add_src() and
2283 * cgroup_migrate_prepare_dst() on the targets before invoking this 2549 * cgroup_migrate_prepare_dst() on the targets before invoking this
2284 * function and following up with cgroup_migrate_finish(). 2550 * function and following up with cgroup_migrate_finish().
@@ -2289,115 +2555,29 @@ err:
2289 * decided for all targets by invoking group_migrate_prepare_dst() before 2555 * decided for all targets by invoking group_migrate_prepare_dst() before
2290 * actually starting migrating. 2556 * actually starting migrating.
2291 */ 2557 */
2292static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, 2558static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2293 bool threadgroup) 2559 struct cgroup *cgrp)
2294{ 2560{
2295 struct cgroup_taskset tset = { 2561 struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2296 .src_csets = LIST_HEAD_INIT(tset.src_csets), 2562 struct task_struct *task;
2297 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
2298 .csets = &tset.src_csets,
2299 };
2300 struct cgroup_subsys_state *css, *failed_css = NULL;
2301 struct css_set *cset, *tmp_cset;
2302 struct task_struct *task, *tmp_task;
2303 int i, ret;
2304 2563
2305 /* 2564 /*
2306 * Prevent freeing of tasks while we take a snapshot. Tasks that are 2565 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2307 * already PF_EXITING could be freed from underneath us unless we 2566 * already PF_EXITING could be freed from underneath us unless we
2308 * take an rcu_read_lock. 2567 * take an rcu_read_lock.
2309 */ 2568 */
2310 down_write(&css_set_rwsem); 2569 spin_lock_bh(&css_set_lock);
2311 rcu_read_lock(); 2570 rcu_read_lock();
2312 task = leader; 2571 task = leader;
2313 do { 2572 do {
2314 /* @task either already exited or can't exit until the end */ 2573 cgroup_taskset_add(task, &tset);
2315 if (task->flags & PF_EXITING)
2316 goto next;
2317
2318 /* leave @task alone if post_fork() hasn't linked it yet */
2319 if (list_empty(&task->cg_list))
2320 goto next;
2321
2322 cset = task_css_set(task);
2323 if (!cset->mg_src_cgrp)
2324 goto next;
2325
2326 /*
2327 * cgroup_taskset_first() must always return the leader.
2328 * Take care to avoid disturbing the ordering.
2329 */
2330 list_move_tail(&task->cg_list, &cset->mg_tasks);
2331 if (list_empty(&cset->mg_node))
2332 list_add_tail(&cset->mg_node, &tset.src_csets);
2333 if (list_empty(&cset->mg_dst_cset->mg_node))
2334 list_move_tail(&cset->mg_dst_cset->mg_node,
2335 &tset.dst_csets);
2336 next:
2337 if (!threadgroup) 2574 if (!threadgroup)
2338 break; 2575 break;
2339 } while_each_thread(leader, task); 2576 } while_each_thread(leader, task);
2340 rcu_read_unlock(); 2577 rcu_read_unlock();
2341 up_write(&css_set_rwsem); 2578 spin_unlock_bh(&css_set_lock);
2342
2343 /* methods shouldn't be called if no task is actually migrating */
2344 if (list_empty(&tset.src_csets))
2345 return 0;
2346
2347 /* check that we can legitimately attach to the cgroup */
2348 for_each_e_css(css, i, cgrp) {
2349 if (css->ss->can_attach) {
2350 ret = css->ss->can_attach(css, &tset);
2351 if (ret) {
2352 failed_css = css;
2353 goto out_cancel_attach;
2354 }
2355 }
2356 }
2357
2358 /*
2359 * Now that we're guaranteed success, proceed to move all tasks to
2360 * the new cgroup. There are no failure cases after here, so this
2361 * is the commit point.
2362 */
2363 down_write(&css_set_rwsem);
2364 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2365 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2366 cgroup_task_migrate(cset->mg_src_cgrp, task,
2367 cset->mg_dst_cset);
2368 }
2369 up_write(&css_set_rwsem);
2370
2371 /*
2372 * Migration is committed, all target tasks are now on dst_csets.
2373 * Nothing is sensitive to fork() after this point. Notify
2374 * controllers that migration is complete.
2375 */
2376 tset.csets = &tset.dst_csets;
2377
2378 for_each_e_css(css, i, cgrp)
2379 if (css->ss->attach)
2380 css->ss->attach(css, &tset);
2381
2382 ret = 0;
2383 goto out_release_tset;
2384 2579
2385out_cancel_attach: 2580 return cgroup_taskset_migrate(&tset, cgrp);
2386 for_each_e_css(css, i, cgrp) {
2387 if (css == failed_css)
2388 break;
2389 if (css->ss->cancel_attach)
2390 css->ss->cancel_attach(css, &tset);
2391 }
2392out_release_tset:
2393 down_write(&css_set_rwsem);
2394 list_splice_init(&tset.dst_csets, &tset.src_csets);
2395 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2396 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2397 list_del_init(&cset->mg_node);
2398 }
2399 up_write(&css_set_rwsem);
2400 return ret;
2401} 2581}
2402 2582
2403/** 2583/**
@@ -2406,7 +2586,7 @@ out_release_tset:
2406 * @leader: the task or the leader of the threadgroup to be attached 2586 * @leader: the task or the leader of the threadgroup to be attached
2407 * @threadgroup: attach the whole threadgroup? 2587 * @threadgroup: attach the whole threadgroup?
2408 * 2588 *
2409 * Call holding cgroup_mutex and threadgroup_lock of @leader. 2589 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2410 */ 2590 */
2411static int cgroup_attach_task(struct cgroup *dst_cgrp, 2591static int cgroup_attach_task(struct cgroup *dst_cgrp,
2412 struct task_struct *leader, bool threadgroup) 2592 struct task_struct *leader, bool threadgroup)
@@ -2416,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2416 int ret; 2596 int ret;
2417 2597
2418 /* look up all src csets */ 2598 /* look up all src csets */
2419 down_read(&css_set_rwsem); 2599 spin_lock_bh(&css_set_lock);
2420 rcu_read_lock(); 2600 rcu_read_lock();
2421 task = leader; 2601 task = leader;
2422 do { 2602 do {
@@ -2426,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2426 break; 2606 break;
2427 } while_each_thread(leader, task); 2607 } while_each_thread(leader, task);
2428 rcu_read_unlock(); 2608 rcu_read_unlock();
2429 up_read(&css_set_rwsem); 2609 spin_unlock_bh(&css_set_lock);
2430 2610
2431 /* prepare dst csets and commit */ 2611 /* prepare dst csets and commit */
2432 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); 2612 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2433 if (!ret) 2613 if (!ret)
2434 ret = cgroup_migrate(dst_cgrp, leader, threadgroup); 2614 ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
2435 2615
2436 cgroup_migrate_finish(&preloaded_csets); 2616 cgroup_migrate_finish(&preloaded_csets);
2437 return ret; 2617 return ret;
@@ -2459,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2459 struct cgroup *cgrp; 2639 struct cgroup *cgrp;
2460 struct inode *inode; 2640 struct inode *inode;
2461 2641
2462 down_read(&css_set_rwsem); 2642 spin_lock_bh(&css_set_lock);
2463 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); 2643 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2464 up_read(&css_set_rwsem); 2644 spin_unlock_bh(&css_set_lock);
2465 2645
2466 while (!cgroup_is_descendant(dst_cgrp, cgrp)) 2646 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2467 cgrp = cgroup_parent(cgrp); 2647 cgrp = cgroup_parent(cgrp);
2468 2648
2469 ret = -ENOMEM; 2649 ret = -ENOMEM;
2470 inode = kernfs_get_inode(sb, cgrp->procs_kn); 2650 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2471 if (inode) { 2651 if (inode) {
2472 ret = inode_permission(inode, MAY_WRITE); 2652 ret = inode_permission(inode, MAY_WRITE);
2473 iput(inode); 2653 iput(inode);
@@ -2498,14 +2678,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2498 if (!cgrp) 2678 if (!cgrp)
2499 return -ENODEV; 2679 return -ENODEV;
2500 2680
2501retry_find_task: 2681 percpu_down_write(&cgroup_threadgroup_rwsem);
2502 rcu_read_lock(); 2682 rcu_read_lock();
2503 if (pid) { 2683 if (pid) {
2504 tsk = find_task_by_vpid(pid); 2684 tsk = find_task_by_vpid(pid);
2505 if (!tsk) { 2685 if (!tsk) {
2506 rcu_read_unlock();
2507 ret = -ESRCH; 2686 ret = -ESRCH;
2508 goto out_unlock_cgroup; 2687 goto out_unlock_rcu;
2509 } 2688 }
2510 } else { 2689 } else {
2511 tsk = current; 2690 tsk = current;
@@ -2521,37 +2700,23 @@ retry_find_task:
2521 */ 2700 */
2522 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { 2701 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2523 ret = -EINVAL; 2702 ret = -EINVAL;
2524 rcu_read_unlock(); 2703 goto out_unlock_rcu;
2525 goto out_unlock_cgroup;
2526 } 2704 }
2527 2705
2528 get_task_struct(tsk); 2706 get_task_struct(tsk);
2529 rcu_read_unlock(); 2707 rcu_read_unlock();
2530 2708
2531 threadgroup_lock(tsk);
2532 if (threadgroup) {
2533 if (!thread_group_leader(tsk)) {
2534 /*
2535 * a race with de_thread from another thread's exec()
2536 * may strip us of our leadership, if this happens,
2537 * there is no choice but to throw this task away and
2538 * try again; this is
2539 * "double-double-toil-and-trouble-check locking".
2540 */
2541 threadgroup_unlock(tsk);
2542 put_task_struct(tsk);
2543 goto retry_find_task;
2544 }
2545 }
2546
2547 ret = cgroup_procs_write_permission(tsk, cgrp, of); 2709 ret = cgroup_procs_write_permission(tsk, cgrp, of);
2548 if (!ret) 2710 if (!ret)
2549 ret = cgroup_attach_task(cgrp, tsk, threadgroup); 2711 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2550 2712
2551 threadgroup_unlock(tsk);
2552
2553 put_task_struct(tsk); 2713 put_task_struct(tsk);
2554out_unlock_cgroup: 2714 goto out_unlock_threadgroup;
2715
2716out_unlock_rcu:
2717 rcu_read_unlock();
2718out_unlock_threadgroup:
2719 percpu_up_write(&cgroup_threadgroup_rwsem);
2555 cgroup_kn_unlock(of->kn); 2720 cgroup_kn_unlock(of->kn);
2556 return ret ?: nbytes; 2721 return ret ?: nbytes;
2557} 2722}
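With cgroup_threadgroup_rwsem taken for write up front, the old retry_find_task loop, which had to re-check thread_group_leader() against a racing de_thread(), disappears, and the error paths collapse into ordered unlock labels. A small sketch of that ordered goto-unwind shape, with the locks reduced to print statements (purely illustrative):

    #include <stdio.h>
    #include <errno.h>

    static void lock_group(void)   { printf("percpu_down_write\n"); }
    static void unlock_group(void) { printf("percpu_up_write\n"); }
    static void rcu_lock(void)     { printf("rcu_read_lock\n"); }
    static void rcu_unlock(void)   { printf("rcu_read_unlock\n"); }

    static int procs_write_sketch(int pid)
    {
        int ret;

        lock_group();
        rcu_lock();

        if (pid < 0) {                  /* task lookup failed */
            ret = -ESRCH;
            goto out_unlock_rcu;
        }

        rcu_unlock();                   /* drop RCU before the sleeping work */
        printf("attach pid %d\n", pid);
        ret = 0;
        goto out_unlock_group;

    out_unlock_rcu:
        rcu_unlock();
    out_unlock_group:
        unlock_group();
        return ret;
    }

    int main(void)
    {
        procs_write_sketch(42);
        procs_write_sketch(-1);
        return 0;
    }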
@@ -2573,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2573 if (root == &cgrp_dfl_root) 2738 if (root == &cgrp_dfl_root)
2574 continue; 2739 continue;
2575 2740
2576 down_read(&css_set_rwsem); 2741 spin_lock_bh(&css_set_lock);
2577 from_cgrp = task_cgroup_from_root(from, root); 2742 from_cgrp = task_cgroup_from_root(from, root);
2578 up_read(&css_set_rwsem); 2743 spin_unlock_bh(&css_set_lock);
2579 2744
2580 retval = cgroup_attach_task(from_cgrp, tsk, false); 2745 retval = cgroup_attach_task(from_cgrp, tsk, false);
2581 if (retval) 2746 if (retval)
@@ -2690,14 +2855,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2690static int cgroup_update_dfl_csses(struct cgroup *cgrp) 2855static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2691{ 2856{
2692 LIST_HEAD(preloaded_csets); 2857 LIST_HEAD(preloaded_csets);
2858 struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2693 struct cgroup_subsys_state *css; 2859 struct cgroup_subsys_state *css;
2694 struct css_set *src_cset; 2860 struct css_set *src_cset;
2695 int ret; 2861 int ret;
2696 2862
2697 lockdep_assert_held(&cgroup_mutex); 2863 lockdep_assert_held(&cgroup_mutex);
2698 2864
2865 percpu_down_write(&cgroup_threadgroup_rwsem);
2866
2699 /* look up all csses currently attached to @cgrp's subtree */ 2867 /* look up all csses currently attached to @cgrp's subtree */
2700 down_read(&css_set_rwsem); 2868 spin_lock_bh(&css_set_lock);
2701 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { 2869 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2702 struct cgrp_cset_link *link; 2870 struct cgrp_cset_link *link;
2703 2871
@@ -2709,68 +2877,31 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2709 cgroup_migrate_add_src(link->cset, cgrp, 2877 cgroup_migrate_add_src(link->cset, cgrp,
2710 &preloaded_csets); 2878 &preloaded_csets);
2711 } 2879 }
2712 up_read(&css_set_rwsem); 2880 spin_unlock_bh(&css_set_lock);
2713 2881
2714 /* NULL dst indicates self on default hierarchy */ 2882 /* NULL dst indicates self on default hierarchy */
2715 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); 2883 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2716 if (ret) 2884 if (ret)
2717 goto out_finish; 2885 goto out_finish;
2718 2886
2887 spin_lock_bh(&css_set_lock);
2719 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { 2888 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2720 struct task_struct *last_task = NULL, *task; 2889 struct task_struct *task, *ntask;
2721 2890
2722 /* src_csets precede dst_csets, break on the first dst_cset */ 2891 /* src_csets precede dst_csets, break on the first dst_cset */
2723 if (!src_cset->mg_src_cgrp) 2892 if (!src_cset->mg_src_cgrp)
2724 break; 2893 break;
2725 2894
2726 /* 2895 /* all tasks in src_csets need to be migrated */
2727 * All tasks in src_cset need to be migrated to the 2896 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2728 * matching dst_cset. Empty it process by process. We 2897 cgroup_taskset_add(task, &tset);
2729 * walk tasks but migrate processes. The leader might even
2730 * belong to a different cset but such src_cset would also
2731 * be among the target src_csets because the default
2732 * hierarchy enforces per-process membership.
2733 */
2734 while (true) {
2735 down_read(&css_set_rwsem);
2736 task = list_first_entry_or_null(&src_cset->tasks,
2737 struct task_struct, cg_list);
2738 if (task) {
2739 task = task->group_leader;
2740 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2741 get_task_struct(task);
2742 }
2743 up_read(&css_set_rwsem);
2744
2745 if (!task)
2746 break;
2747
2748 /* guard against possible infinite loop */
2749 if (WARN(last_task == task,
2750 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2751 goto out_finish;
2752 last_task = task;
2753
2754 threadgroup_lock(task);
2755 /* raced against de_thread() from another thread? */
2756 if (!thread_group_leader(task)) {
2757 threadgroup_unlock(task);
2758 put_task_struct(task);
2759 continue;
2760 }
2761
2762 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2763
2764 threadgroup_unlock(task);
2765 put_task_struct(task);
2766
2767 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2768 goto out_finish;
2769 }
2770 } 2898 }
2899 spin_unlock_bh(&css_set_lock);
2771 2900
2901 ret = cgroup_taskset_migrate(&tset, cgrp);
2772out_finish: 2902out_finish:
2773 cgroup_migrate_finish(&preloaded_csets); 2903 cgroup_migrate_finish(&preloaded_csets);
2904 percpu_up_write(&cgroup_threadgroup_rwsem);
2774 return ret; 2905 return ret;
2775} 2906}
2776 2907
@@ -2797,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2797 if (tok[0] == '\0') 2928 if (tok[0] == '\0')
2798 continue; 2929 continue;
2799 for_each_subsys_which(ss, ssid, &tmp_ss_mask) { 2930 for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
2800 if (ss->disabled || strcmp(tok + 1, ss->name)) 2931 if (!cgroup_ssid_enabled(ssid) ||
2932 strcmp(tok + 1, ss->name))
2801 continue; 2933 continue;
2802 2934
2803 if (*tok == '+') { 2935 if (*tok == '+') {
@@ -2921,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2921 ret = create_css(child, ss, 3053 ret = create_css(child, ss,
2922 cgrp->subtree_control & (1 << ssid)); 3054 cgrp->subtree_control & (1 << ssid));
2923 else 3055 else
2924 ret = cgroup_populate_dir(child, 1 << ssid); 3056 ret = css_populate_dir(cgroup_css(child, ss),
3057 NULL);
2925 if (ret) 3058 if (ret)
2926 goto err_undo_css; 3059 goto err_undo_css;
2927 } 3060 }
@@ -2954,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2954 if (css_disable & (1 << ssid)) { 3087 if (css_disable & (1 << ssid)) {
2955 kill_css(css); 3088 kill_css(css);
2956 } else { 3089 } else {
2957 cgroup_clear_dir(child, 1 << ssid); 3090 css_clear_dir(css, NULL);
2958 if (ss->css_reset) 3091 if (ss->css_reset)
2959 ss->css_reset(css); 3092 ss->css_reset(css);
2960 } 3093 }
@@ -3002,15 +3135,16 @@ err_undo_css:
3002 if (css_enable & (1 << ssid)) 3135 if (css_enable & (1 << ssid))
3003 kill_css(css); 3136 kill_css(css);
3004 else 3137 else
3005 cgroup_clear_dir(child, 1 << ssid); 3138 css_clear_dir(css, NULL);
3006 } 3139 }
3007 } 3140 }
3008 goto out_unlock; 3141 goto out_unlock;
3009} 3142}
3010 3143
3011static int cgroup_populated_show(struct seq_file *seq, void *v) 3144static int cgroup_events_show(struct seq_file *seq, void *v)
3012{ 3145{
3013 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); 3146 seq_printf(seq, "populated %d\n",
3147 cgroup_is_populated(seq_css(seq)->cgroup));
3014 return 0; 3148 return 0;
3015} 3149}
3016 3150
@@ -3153,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3153 return kernfs_setattr(kn, &iattr); 3287 return kernfs_setattr(kn, &iattr);
3154} 3288}
3155 3289
3156static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 3290static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3291 struct cftype *cft)
3157{ 3292{
3158 char name[CGROUP_FILE_NAME_MAX]; 3293 char name[CGROUP_FILE_NAME_MAX];
3159 struct kernfs_node *kn; 3294 struct kernfs_node *kn;
@@ -3175,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
3175 return ret; 3310 return ret;
3176 } 3311 }
3177 3312
3178 if (cft->write == cgroup_procs_write) 3313 if (cft->file_offset) {
3179 cgrp->procs_kn = kn; 3314 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3180 else if (cft->seq_show == cgroup_populated_show) 3315
3181 cgrp->populated_kn = kn; 3316 kernfs_get(kn);
3317 cfile->kn = kn;
3318 list_add(&cfile->node, &css->files);
3319 }
3320
3182 return 0; 3321 return 0;
3183} 3322}
3184 3323
3185/** 3324/**
3186 * cgroup_addrm_files - add or remove files to a cgroup directory 3325 * cgroup_addrm_files - add or remove files to a cgroup directory
3187 * @cgrp: the target cgroup 3326 * @css: the target css
3327 * @cgrp: the target cgroup (usually css->cgroup)
3188 * @cfts: array of cftypes to be added 3328 * @cfts: array of cftypes to be added
3189 * @is_add: whether to add or remove 3329 * @is_add: whether to add or remove
3190 * 3330 *
3191 * Depending on @is_add, add or remove files defined by @cfts on @cgrp. 3331 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3192 * For removals, this function never fails. If addition fails, this 3332 * For removals, this function never fails.
3193 * function doesn't remove files already added. The caller is responsible
3194 * for cleaning up.
3195 */ 3333 */
3196static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 3334static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3335 struct cgroup *cgrp, struct cftype cfts[],
3197 bool is_add) 3336 bool is_add)
3198{ 3337{
3199 struct cftype *cft; 3338 struct cftype *cft, *cft_end = NULL;
3200 int ret; 3339 int ret;
3201 3340
3202 lockdep_assert_held(&cgroup_mutex); 3341 lockdep_assert_held(&cgroup_mutex);
3203 3342
3204 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3343restart:
3344 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3205 /* does cft->flags tell us to skip this file on @cgrp? */ 3345 /* does cft->flags tell us to skip this file on @cgrp? */
3206 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3346 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3207 continue; 3347 continue;
@@ -3213,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
3213 continue; 3353 continue;
3214 3354
3215 if (is_add) { 3355 if (is_add) {
3216 ret = cgroup_add_file(cgrp, cft); 3356 ret = cgroup_add_file(css, cgrp, cft);
3217 if (ret) { 3357 if (ret) {
3218 pr_warn("%s: failed to add %s, err=%d\n", 3358 pr_warn("%s: failed to add %s, err=%d\n",
3219 __func__, cft->name, ret); 3359 __func__, cft->name, ret);
3220 return ret; 3360 cft_end = cft;
3361 is_add = false;
3362 goto restart;
3221 } 3363 }
3222 } else { 3364 } else {
3223 cgroup_rm_file(cgrp, cft); 3365 cgroup_rm_file(cgrp, cft);
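cgroup_addrm_files() no longer pushes partial-failure cleanup onto its callers: when adding a file fails, it records the failing cftype as the new end of the walk, flips is_add to false and restarts, so the files created before the failure are removed before the error is returned. A standalone sketch of that flip-and-restart idiom over a NULL-terminated name table (all names illustrative):

    /* Sketch of the flip-and-restart cleanup used by cgroup_addrm_files(). */
    #include <stdio.h>
    #include <string.h>

    static int add_file(const char *name)
    {
        if (!strcmp(name, "bad"))
            return -1;
        printf("add %s\n", name);
        return 0;
    }

    static void rm_file(const char *name)
    {
        printf("rm  %s\n", name);
    }

    static int addrm_files(const char **names, int is_add)
    {
        const char **p, **end = NULL;
        int ret = 0;

    restart:
        for (p = names; p != end && *p; p++) {
            if (is_add) {
                ret = add_file(*p);
                if (ret) {
                    end = p;       /* stop before the failed entry */
                    is_add = 0;    /* and remove what we added so far */
                    goto restart;
                }
            } else {
                rm_file(*p);
            }
        }
        return ret;
    }

    int main(void)
    {
        const char *names[] = { "a", "b", "bad", "c", NULL };

        return addrm_files(names, 1) ? 1 : 0;
    }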
@@ -3243,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3243 if (cgroup_is_dead(cgrp)) 3385 if (cgroup_is_dead(cgrp))
3244 continue; 3386 continue;
3245 3387
3246 ret = cgroup_addrm_files(cgrp, cfts, is_add); 3388 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3247 if (ret) 3389 if (ret)
3248 break; 3390 break;
3249 } 3391 }
@@ -3355,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3355{ 3497{
3356 int ret; 3498 int ret;
3357 3499
3358 if (ss->disabled) 3500 if (!cgroup_ssid_enabled(ss->id))
3359 return 0; 3501 return 0;
3360 3502
3361 if (!cfts || cfts[0].name[0] == '\0') 3503 if (!cfts || cfts[0].name[0] == '\0')
@@ -3405,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3405{ 3547{
3406 struct cftype *cft; 3548 struct cftype *cft;
3407 3549
3408 /* 3550 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3409 * If legacy_flies_on_dfl, we want to show the legacy files on the 3551 cft->flags |= __CFTYPE_NOT_ON_DFL;
3410 * dfl hierarchy but iff the target subsystem hasn't been updated
3411 * for the dfl hierarchy yet.
3412 */
3413 if (!cgroup_legacy_files_on_dfl ||
3414 ss->dfl_cftypes != ss->legacy_cftypes) {
3415 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3416 cft->flags |= __CFTYPE_NOT_ON_DFL;
3417 }
3418
3419 return cgroup_add_cftypes(ss, cfts); 3552 return cgroup_add_cftypes(ss, cfts);
3420} 3553}
3421 3554
@@ -3430,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
3430 int count = 0; 3563 int count = 0;
3431 struct cgrp_cset_link *link; 3564 struct cgrp_cset_link *link;
3432 3565
3433 down_read(&css_set_rwsem); 3566 spin_lock_bh(&css_set_lock);
3434 list_for_each_entry(link, &cgrp->cset_links, cset_link) 3567 list_for_each_entry(link, &cgrp->cset_links, cset_link)
3435 count += atomic_read(&link->cset->refcount); 3568 count += atomic_read(&link->cset->refcount);
3436 up_read(&css_set_rwsem); 3569 spin_unlock_bh(&css_set_lock);
3437 return count; 3570 return count;
3438} 3571}
3439 3572
@@ -3665,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
3665} 3798}
3666 3799
3667/** 3800/**
 3668 * css_advance_task_iter - advance a task itererator to the next css_set 3801 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
3669 * @it: the iterator to advance 3802 * @it: the iterator to advance
3670 * 3803 *
3671 * Advance @it to the next css_set to walk. 3804 * Advance @it to the next css_set to walk.
3672 */ 3805 */
3673static void css_advance_task_iter(struct css_task_iter *it) 3806static void css_task_iter_advance_css_set(struct css_task_iter *it)
3674{ 3807{
3675 struct list_head *l = it->cset_pos; 3808 struct list_head *l = it->cset_pos;
3676 struct cgrp_cset_link *link; 3809 struct cgrp_cset_link *link;
3677 struct css_set *cset; 3810 struct css_set *cset;
3678 3811
3812 lockdep_assert_held(&css_set_lock);
3813
3679 /* Advance to the next non-empty css_set */ 3814 /* Advance to the next non-empty css_set */
3680 do { 3815 do {
3681 l = l->next; 3816 l = l->next;
3682 if (l == it->cset_head) { 3817 if (l == it->cset_head) {
3683 it->cset_pos = NULL; 3818 it->cset_pos = NULL;
3819 it->task_pos = NULL;
3684 return; 3820 return;
3685 } 3821 }
3686 3822
@@ -3691,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
3691 link = list_entry(l, struct cgrp_cset_link, cset_link); 3827 link = list_entry(l, struct cgrp_cset_link, cset_link);
3692 cset = link->cset; 3828 cset = link->cset;
3693 } 3829 }
3694 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3830 } while (!css_set_populated(cset));
3695 3831
3696 it->cset_pos = l; 3832 it->cset_pos = l;
3697 3833
@@ -3702,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it)
3702 3838
3703 it->tasks_head = &cset->tasks; 3839 it->tasks_head = &cset->tasks;
3704 it->mg_tasks_head = &cset->mg_tasks; 3840 it->mg_tasks_head = &cset->mg_tasks;
3841
3842 /*
3843 * We don't keep css_sets locked across iteration steps and thus
3844 * need to take steps to ensure that iteration can be resumed after
3845 * the lock is re-acquired. Iteration is performed at two levels -
3846 * css_sets and tasks in them.
3847 *
3848 * Once created, a css_set never leaves its cgroup lists, so a
3849 * pinned css_set is guaranteed to stay put and we can resume
3850 * iteration afterwards.
3851 *
3852 * Tasks may leave @cset across iteration steps. This is resolved
3853 * by registering each iterator with the css_set currently being
3854 * walked and making css_set_move_task() advance iterators whose
3855 * next task is leaving.
3856 */
3857 if (it->cur_cset) {
3858 list_del(&it->iters_node);
3859 put_css_set_locked(it->cur_cset);
3860 }
3861 get_css_set(cset);
3862 it->cur_cset = cset;
3863 list_add(&it->iters_node, &cset->task_iters);
3864}
3865
3866static void css_task_iter_advance(struct css_task_iter *it)
3867{
3868 struct list_head *l = it->task_pos;
3869
3870 lockdep_assert_held(&css_set_lock);
3871 WARN_ON_ONCE(!l);
3872
3873 /*
3874 * Advance iterator to find next entry. cset->tasks is consumed
3875 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
3876 * next cset.
3877 */
3878 l = l->next;
3879
3880 if (l == it->tasks_head)
3881 l = it->mg_tasks_head->next;
3882
3883 if (l == it->mg_tasks_head)
3884 css_task_iter_advance_css_set(it);
3885 else
3886 it->task_pos = l;
3705} 3887}
3706 3888
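css_task_iter_advance() splits the walk into two levels: consume the current css_set's ->tasks list, then its ->mg_tasks list, then move to the next populated css_set. The css_set pinning and the per-css_set iterator registration exist because css_set_lock is now dropped between steps; the sketch below shows only the two-level advance itself, over plain arrays and without any locking (all types and names are illustrative):

    /* Sketch of the two-level advance: finish one group's lists, then move on. */
    #include <stdio.h>
    #include <stddef.h>

    struct group {
        const char *tasks[4];       /* NULL-terminated */
        const char *mg_tasks[4];    /* NULL-terminated */
    };

    struct iter {
        const struct group *g, *g_end;
        const char *const *pos;
        int in_mg;
    };

    static void advance_group(struct iter *it)
    {
        while (it->g != it->g_end) {
            if (it->g->tasks[0])    { it->pos = it->g->tasks;    it->in_mg = 0; return; }
            if (it->g->mg_tasks[0]) { it->pos = it->g->mg_tasks; it->in_mg = 1; return; }
            it->g++;                /* skip unpopulated groups */
        }
        it->pos = NULL;
    }

    static const char *iter_next(struct iter *it)
    {
        const char *cur;

        if (!it->pos)
            return NULL;
        cur = *it->pos++;

        if (!*it->pos) {            /* end of the current list */
            if (!it->in_mg && it->g->mg_tasks[0]) {
                it->pos = it->g->mg_tasks;
                it->in_mg = 1;
            } else {
                it->g++;
                advance_group(it);
            }
        }
        return cur;
    }

    int main(void)
    {
        struct group groups[] = {
            { { "t1", "t2", NULL }, { "m1", NULL } },
            { { NULL },             { NULL } },      /* empty, skipped */
            { { "t3", NULL },       { NULL } },
        };
        struct iter it = { groups, groups + 3, NULL, 0 };
        const char *t;

        advance_group(&it);
        while ((t = iter_next(&it)))
            printf("%s\n", t);
        return 0;
    }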
3707/** 3889/**
@@ -3713,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it)
3713 * css_task_iter_next() to walk through the tasks until the function 3895 * css_task_iter_next() to walk through the tasks until the function
3714 * returns NULL. On completion of iteration, css_task_iter_end() must be 3896 * returns NULL. On completion of iteration, css_task_iter_end() must be
3715 * called. 3897 * called.
3716 *
3717 * Note that this function acquires a lock which is released when the
3718 * iteration finishes. The caller can't sleep while iteration is in
3719 * progress.
3720 */ 3898 */
3721void css_task_iter_start(struct cgroup_subsys_state *css, 3899void css_task_iter_start(struct cgroup_subsys_state *css,
3722 struct css_task_iter *it) 3900 struct css_task_iter *it)
3723 __acquires(css_set_rwsem)
3724{ 3901{
3725 /* no one should try to iterate before mounting cgroups */ 3902 /* no one should try to iterate before mounting cgroups */
3726 WARN_ON_ONCE(!use_task_css_set_links); 3903 WARN_ON_ONCE(!use_task_css_set_links);
3727 3904
3728 down_read(&css_set_rwsem); 3905 memset(it, 0, sizeof(*it));
3906
3907 spin_lock_bh(&css_set_lock);
3729 3908
3730 it->ss = css->ss; 3909 it->ss = css->ss;
3731 3910
@@ -3736,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
3736 3915
3737 it->cset_head = it->cset_pos; 3916 it->cset_head = it->cset_pos;
3738 3917
3739 css_advance_task_iter(it); 3918 css_task_iter_advance_css_set(it);
3919
3920 spin_unlock_bh(&css_set_lock);
3740} 3921}
3741 3922
3742/** 3923/**
@@ -3749,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
3749 */ 3930 */
3750struct task_struct *css_task_iter_next(struct css_task_iter *it) 3931struct task_struct *css_task_iter_next(struct css_task_iter *it)
3751{ 3932{
3752 struct task_struct *res; 3933 if (it->cur_task) {
3753 struct list_head *l = it->task_pos; 3934 put_task_struct(it->cur_task);
3935 it->cur_task = NULL;
3936 }
3754 3937
3755 /* If the iterator cg is NULL, we have no tasks */ 3938 spin_lock_bh(&css_set_lock);
3756 if (!it->cset_pos)
3757 return NULL;
3758 res = list_entry(l, struct task_struct, cg_list);
3759 3939
3760 /* 3940 if (it->task_pos) {
3761 * Advance iterator to find next entry. cset->tasks is consumed 3941 it->cur_task = list_entry(it->task_pos, struct task_struct,
3762 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 3942 cg_list);
3763 * next cset. 3943 get_task_struct(it->cur_task);
3764 */ 3944 css_task_iter_advance(it);
3765 l = l->next; 3945 }
3766 3946
3767 if (l == it->tasks_head) 3947 spin_unlock_bh(&css_set_lock);
3768 l = it->mg_tasks_head->next;
3769 3948
3770 if (l == it->mg_tasks_head) 3949 return it->cur_task;
3771 css_advance_task_iter(it);
3772 else
3773 it->task_pos = l;
3774
3775 return res;
3776} 3950}
3777 3951
3778/** 3952/**
@@ -3782,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3782 * Finish task iteration started by css_task_iter_start(). 3956 * Finish task iteration started by css_task_iter_start().
3783 */ 3957 */
3784void css_task_iter_end(struct css_task_iter *it) 3958void css_task_iter_end(struct css_task_iter *it)
3785 __releases(css_set_rwsem)
3786{ 3959{
3787 up_read(&css_set_rwsem); 3960 if (it->cur_cset) {
3961 spin_lock_bh(&css_set_lock);
3962 list_del(&it->iters_node);
3963 put_css_set_locked(it->cur_cset);
3964 spin_unlock_bh(&css_set_lock);
3965 }
3966
3967 if (it->cur_task)
3968 put_task_struct(it->cur_task);
3788} 3969}
3789 3970
3790/** 3971/**
@@ -3809,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3809 mutex_lock(&cgroup_mutex); 3990 mutex_lock(&cgroup_mutex);
3810 3991
3811 /* all tasks in @from are being moved, all csets are source */ 3992 /* all tasks in @from are being moved, all csets are source */
3812 down_read(&css_set_rwsem); 3993 spin_lock_bh(&css_set_lock);
3813 list_for_each_entry(link, &from->cset_links, cset_link) 3994 list_for_each_entry(link, &from->cset_links, cset_link)
3814 cgroup_migrate_add_src(link->cset, to, &preloaded_csets); 3995 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3815 up_read(&css_set_rwsem); 3996 spin_unlock_bh(&css_set_lock);
3816 3997
3817 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); 3998 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3818 if (ret) 3999 if (ret)
@@ -3830,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3830 css_task_iter_end(&it); 4011 css_task_iter_end(&it);
3831 4012
3832 if (task) { 4013 if (task) {
3833 ret = cgroup_migrate(to, task, false); 4014 ret = cgroup_migrate(task, false, to);
3834 put_task_struct(task); 4015 put_task_struct(task);
3835 } 4016 }
3836 } while (task && !ret); 4017 } while (task && !ret);
@@ -4327,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4327static struct cftype cgroup_dfl_base_files[] = { 4508static struct cftype cgroup_dfl_base_files[] = {
4328 { 4509 {
4329 .name = "cgroup.procs", 4510 .name = "cgroup.procs",
4511 .file_offset = offsetof(struct cgroup, procs_file),
4330 .seq_start = cgroup_pidlist_start, 4512 .seq_start = cgroup_pidlist_start,
4331 .seq_next = cgroup_pidlist_next, 4513 .seq_next = cgroup_pidlist_next,
4332 .seq_stop = cgroup_pidlist_stop, 4514 .seq_stop = cgroup_pidlist_stop,
4333 .seq_show = cgroup_pidlist_show, 4515 .seq_show = cgroup_pidlist_show,
4334 .private = CGROUP_FILE_PROCS, 4516 .private = CGROUP_FILE_PROCS,
4335 .write = cgroup_procs_write, 4517 .write = cgroup_procs_write,
4336 .mode = S_IRUGO | S_IWUSR,
4337 }, 4518 },
4338 { 4519 {
4339 .name = "cgroup.controllers", 4520 .name = "cgroup.controllers",
@@ -4351,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = {
4351 .write = cgroup_subtree_control_write, 4532 .write = cgroup_subtree_control_write,
4352 }, 4533 },
4353 { 4534 {
4354 .name = "cgroup.populated", 4535 .name = "cgroup.events",
4355 .flags = CFTYPE_NOT_ON_ROOT, 4536 .flags = CFTYPE_NOT_ON_ROOT,
4356 .seq_show = cgroup_populated_show, 4537 .file_offset = offsetof(struct cgroup, events_file),
4538 .seq_show = cgroup_events_show,
4357 }, 4539 },
4358 { } /* terminate */ 4540 { } /* terminate */
4359}; 4541};
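For illustration only (struct foo_cgroup and foo_event_occurred() are hypothetical; this also assumes the cgroup_file_notify() helper introduced together with ->file_offset in this series): setting .file_offset makes cgroup core stash the created kernfs node in a struct cgroup_file embedded at that offset from the owning css, so the controller can later wake pollers of the file without looking it up by name.

	/* css is the first member so that offsetof(struct foo_cgroup, events_file)
	 * is measured from the css, matching how .file_offset is applied */
	struct foo_cgroup {
		struct cgroup_subsys_state css;
		struct cgroup_file events_file;
	};

	static void foo_event_occurred(struct foo_cgroup *foo)
	{
		cgroup_file_notify(&foo->events_file);	/* kicks poll()/notify waiters */
	}
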
@@ -4368,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4368 .seq_show = cgroup_pidlist_show, 4550 .seq_show = cgroup_pidlist_show,
4369 .private = CGROUP_FILE_PROCS, 4551 .private = CGROUP_FILE_PROCS,
4370 .write = cgroup_procs_write, 4552 .write = cgroup_procs_write,
4371 .mode = S_IRUGO | S_IWUSR,
4372 }, 4553 },
4373 { 4554 {
4374 .name = "cgroup.clone_children", 4555 .name = "cgroup.clone_children",
@@ -4388,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4388 .seq_show = cgroup_pidlist_show, 4569 .seq_show = cgroup_pidlist_show,
4389 .private = CGROUP_FILE_TASKS, 4570 .private = CGROUP_FILE_TASKS,
4390 .write = cgroup_tasks_write, 4571 .write = cgroup_tasks_write,
4391 .mode = S_IRUGO | S_IWUSR,
4392 }, 4572 },
4393 { 4573 {
4394 .name = "notify_on_release", 4574 .name = "notify_on_release",
@@ -4405,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4405 { } /* terminate */ 4585 { } /* terminate */
4406}; 4586};
4407 4587
4408/**
4409 * cgroup_populate_dir - create subsys files in a cgroup directory
4410 * @cgrp: target cgroup
4411 * @subsys_mask: mask of the subsystem ids whose files should be added
4412 *
4413 * On failure, no file is added.
4414 */
4415static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4416{
4417 struct cgroup_subsys *ss;
4418 int i, ret = 0;
4419
4420 /* process cftsets of each subsystem */
4421 for_each_subsys(ss, i) {
4422 struct cftype *cfts;
4423
4424 if (!(subsys_mask & (1 << i)))
4425 continue;
4426
4427 list_for_each_entry(cfts, &ss->cfts, node) {
4428 ret = cgroup_addrm_files(cgrp, cfts, true);
4429 if (ret < 0)
4430 goto err;
4431 }
4432 }
4433 return 0;
4434err:
4435 cgroup_clear_dir(cgrp, subsys_mask);
4436 return ret;
4437}
4438
4439/* 4588/*
4440	 * css destruction is a four-stage process. 4589
4441 * 4590 *
@@ -4464,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work)
4464 container_of(work, struct cgroup_subsys_state, destroy_work); 4613 container_of(work, struct cgroup_subsys_state, destroy_work);
4465 struct cgroup_subsys *ss = css->ss; 4614 struct cgroup_subsys *ss = css->ss;
4466 struct cgroup *cgrp = css->cgroup; 4615 struct cgroup *cgrp = css->cgroup;
4616 struct cgroup_file *cfile;
4467 4617
4468 percpu_ref_exit(&css->refcnt); 4618 percpu_ref_exit(&css->refcnt);
4469 4619
4620 list_for_each_entry(cfile, &css->files, node)
4621 kernfs_put(cfile->kn);
4622
4470 if (ss) { 4623 if (ss) {
4471 /* css free path */ 4624 /* css free path */
4472 int id = css->id; 4625 int id = css->id;
@@ -4571,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
4571 css->ss = ss; 4724 css->ss = ss;
4572 INIT_LIST_HEAD(&css->sibling); 4725 INIT_LIST_HEAD(&css->sibling);
4573 INIT_LIST_HEAD(&css->children); 4726 INIT_LIST_HEAD(&css->children);
4727 INIT_LIST_HEAD(&css->files);
4574 css->serial_nr = css_serial_nr_next++; 4728 css->serial_nr = css_serial_nr_next++;
4575 4729
4576 if (cgroup_parent(cgrp)) { 4730 if (cgroup_parent(cgrp)) {
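Assumed layout (recalled, not shown in this hunk) of the handle that ->file_offset points at and that css->files strings together; the css_free path above drops the kernfs reference each entry holds.

	struct cgroup_file {
		struct list_head node;		/* anchored at css->files */
		struct kernfs_node *kn;		/* put via kernfs_put() on css free */
	};
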
@@ -4653,7 +4807,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4653 css->id = err; 4807 css->id = err;
4654 4808
4655 if (visible) { 4809 if (visible) {
4656 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4810 err = css_populate_dir(css, NULL);
4657 if (err) 4811 if (err)
4658 goto err_free_id; 4812 goto err_free_id;
4659 } 4813 }
@@ -4679,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4679 4833
4680err_list_del: 4834err_list_del:
4681 list_del_rcu(&css->sibling); 4835 list_del_rcu(&css->sibling);
4682 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4836 css_clear_dir(css, NULL);
4683err_free_id: 4837err_free_id:
4684 cgroup_idr_remove(&ss->css_idr, css->id); 4838 cgroup_idr_remove(&ss->css_idr, css->id);
4685err_free_percpu_ref: 4839err_free_percpu_ref:
@@ -4696,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4696 struct cgroup_root *root; 4850 struct cgroup_root *root;
4697 struct cgroup_subsys *ss; 4851 struct cgroup_subsys *ss;
4698 struct kernfs_node *kn; 4852 struct kernfs_node *kn;
4699 struct cftype *base_files;
4700 int ssid, ret; 4853 int ssid, ret;
4701 4854
4702 /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. 4855 /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
@@ -4772,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4772 if (ret) 4925 if (ret)
4773 goto out_destroy; 4926 goto out_destroy;
4774 4927
4775 if (cgroup_on_dfl(cgrp)) 4928 ret = css_populate_dir(&cgrp->self, NULL);
4776 base_files = cgroup_dfl_base_files;
4777 else
4778 base_files = cgroup_legacy_base_files;
4779
4780 ret = cgroup_addrm_files(cgrp, base_files, true);
4781 if (ret) 4929 if (ret)
4782 goto out_destroy; 4930 goto out_destroy;
4783 4931
@@ -4864,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css)
4864 * This must happen before css is disassociated with its cgroup. 5012 * This must happen before css is disassociated with its cgroup.
4865 * See seq_css() for details. 5013 * See seq_css() for details.
4866 */ 5014 */
4867 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 5015 css_clear_dir(css, NULL);
4868 5016
4869 /* 5017 /*
4870 * Killing would put the base ref, but we need to keep it alive 5018 * Killing would put the base ref, but we need to keep it alive
@@ -4913,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4913 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 5061 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4914{ 5062{
4915 struct cgroup_subsys_state *css; 5063 struct cgroup_subsys_state *css;
4916 bool empty;
4917 int ssid; 5064 int ssid;
4918 5065
4919 lockdep_assert_held(&cgroup_mutex); 5066 lockdep_assert_held(&cgroup_mutex);
4920 5067
4921 /* 5068 /*
4922 * css_set_rwsem synchronizes access to ->cset_links and prevents 5069 * Only migration can raise populated from zero and we're already
4923 * @cgrp from being removed while put_css_set() is in progress. 5070 * holding cgroup_mutex.
4924 */ 5071 */
4925 down_read(&css_set_rwsem); 5072 if (cgroup_is_populated(cgrp))
4926 empty = list_empty(&cgrp->cset_links);
4927 up_read(&css_set_rwsem);
4928 if (!empty)
4929 return -EBUSY; 5073 return -EBUSY;
4930 5074
4931 /* 5075 /*
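For reference, a sketch of the populated test used above (recalled from the matching include/linux/cgroup.h change; the counter's exact name should be treated as illustrative). It is a plain counter of populated css_sets in the cgroup's subtree, and only task migration can raise it from zero, which is what the comment above relies on.

	static inline bool cgroup_is_populated(struct cgroup *cgrp)
	{
		return cgrp->populated_cnt;	/* counter name recalled, see above */
	}
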
@@ -5023,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5023 5167
5024 have_fork_callback |= (bool)ss->fork << ss->id; 5168 have_fork_callback |= (bool)ss->fork << ss->id;
5025 have_exit_callback |= (bool)ss->exit << ss->id; 5169 have_exit_callback |= (bool)ss->exit << ss->id;
5170 have_free_callback |= (bool)ss->free << ss->id;
5026 have_canfork_callback |= (bool)ss->can_fork << ss->id; 5171 have_canfork_callback |= (bool)ss->can_fork << ss->id;
5027 5172
5028 /* At system boot, before all subsystems have been 5173 /* At system boot, before all subsystems have been
@@ -5071,6 +5216,8 @@ int __init cgroup_init_early(void)
5071 return 0; 5216 return 0;
5072} 5217}
5073 5218
5219static unsigned long cgroup_disable_mask __initdata;
5220
5074/** 5221/**
5075 * cgroup_init - cgroup initialization 5222 * cgroup_init - cgroup initialization
5076 * 5223 *
@@ -5081,8 +5228,9 @@ int __init cgroup_init(void)
5081{ 5228{
5082 struct cgroup_subsys *ss; 5229 struct cgroup_subsys *ss;
5083 unsigned long key; 5230 unsigned long key;
5084 int ssid, err; 5231 int ssid;
5085 5232
5233 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5086 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); 5234 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
5087 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); 5235 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
5088 5236
@@ -5116,14 +5264,15 @@ int __init cgroup_init(void)
5116 * disabled flag and cftype registration needs kmalloc, 5264 * disabled flag and cftype registration needs kmalloc,
5117 * both of which aren't available during early_init. 5265 * both of which aren't available during early_init.
5118 */ 5266 */
5119 if (ss->disabled) 5267 if (cgroup_disable_mask & (1 << ssid)) {
5268 static_branch_disable(cgroup_subsys_enabled_key[ssid]);
5269 printk(KERN_INFO "Disabling %s control group subsystem\n",
5270 ss->name);
5120 continue; 5271 continue;
5272 }
5121 5273
5122 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 5274 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5123 5275
5124 if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
5125 ss->dfl_cftypes = ss->legacy_cftypes;
5126
5127 if (!ss->dfl_cftypes) 5276 if (!ss->dfl_cftypes)
5128 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; 5277 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
5129 5278
@@ -5138,17 +5287,10 @@ int __init cgroup_init(void)
5138 ss->bind(init_css_set.subsys[ssid]); 5287 ss->bind(init_css_set.subsys[ssid]);
5139 } 5288 }
5140 5289
5141 err = sysfs_create_mount_point(fs_kobj, "cgroup"); 5290 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5142 if (err) 5291 WARN_ON(register_filesystem(&cgroup_fs_type));
5143 return err; 5292 WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
5144 5293
5145 err = register_filesystem(&cgroup_fs_type);
5146 if (err < 0) {
5147 sysfs_remove_mount_point(fs_kobj, "cgroup");
5148 return err;
5149 }
5150
5151 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
5152 return 0; 5294 return 0;
5153} 5295}
5154 5296
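Paraphrased sketch (the helper lives elsewhere in kernel/cgroup.c; body reconstructed from memory) of what the conversion above amounts to: per-subsystem enable state is now a static key indexed by subsystem ID rather than an int ss->disabled flag, so runtime checks such as cgroup_ssid_enabled() compile down to a patched branch.

	static bool cgroup_ssid_enabled(int ssid)
	{
		return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
	}
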
@@ -5195,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5195 goto out; 5337 goto out;
5196 5338
5197 mutex_lock(&cgroup_mutex); 5339 mutex_lock(&cgroup_mutex);
5198 down_read(&css_set_rwsem); 5340 spin_lock_bh(&css_set_lock);
5199 5341
5200 for_each_root(root) { 5342 for_each_root(root) {
5201 struct cgroup_subsys *ss; 5343 struct cgroup_subsys *ss;
@@ -5215,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5215 seq_printf(m, "%sname=%s", count ? "," : "", 5357 seq_printf(m, "%sname=%s", count ? "," : "",
5216 root->name); 5358 root->name);
5217 seq_putc(m, ':'); 5359 seq_putc(m, ':');
5360
5218 cgrp = task_cgroup_from_root(tsk, root); 5361 cgrp = task_cgroup_from_root(tsk, root);
5219 path = cgroup_path(cgrp, buf, PATH_MAX); 5362
5220 if (!path) { 5363 /*
5221 retval = -ENAMETOOLONG; 5364 * On traditional hierarchies, all zombie tasks show up as
5222 goto out_unlock; 5365 * belonging to the root cgroup. On the default hierarchy,
5366 * while a zombie doesn't show up in "cgroup.procs" and
5367 * thus can't be migrated, its /proc/PID/cgroup keeps
5368 * reporting the cgroup it belonged to before exiting. If
5369 * the cgroup is removed before the zombie is reaped,
5370 * " (deleted)" is appended to the cgroup path.
5371 */
5372 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5373 path = cgroup_path(cgrp, buf, PATH_MAX);
5374 if (!path) {
5375 retval = -ENAMETOOLONG;
5376 goto out_unlock;
5377 }
5378 } else {
5379 path = "/";
5223 } 5380 }
5381
5224 seq_puts(m, path); 5382 seq_puts(m, path);
5225 seq_putc(m, '\n'); 5383
5384 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5385 seq_puts(m, " (deleted)\n");
5386 else
5387 seq_putc(m, '\n');
5226 } 5388 }
5227 5389
5228 retval = 0; 5390 retval = 0;
5229out_unlock: 5391out_unlock:
5230 up_read(&css_set_rwsem); 5392 spin_unlock_bh(&css_set_lock);
5231 mutex_unlock(&cgroup_mutex); 5393 mutex_unlock(&cgroup_mutex);
5232 kfree(buf); 5394 kfree(buf);
5233out: 5395out:
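Illustratively (hypothetical PID and cgroup path), once the zombie's cgroup has been removed on the default hierarchy, reading /proc/<pid>/cgroup keeps reporting the old path with the suffix added above, i.e. a line ending in:

	/workers/batch-1 (deleted)

whereas on legacy hierarchies the same zombie is simply reported at "/".
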
@@ -5251,7 +5413,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
5251 for_each_subsys(ss, i) 5413 for_each_subsys(ss, i)
5252 seq_printf(m, "%s\t%d\t%d\t%d\n", 5414 seq_printf(m, "%s\t%d\t%d\t%d\n",
5253 ss->legacy_name, ss->root->hierarchy_id, 5415 ss->legacy_name, ss->root->hierarchy_id,
5254 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 5416 atomic_read(&ss->root->nr_cgrps),
5417 cgroup_ssid_enabled(i));
5255 5418
5256 mutex_unlock(&cgroup_mutex); 5419 mutex_unlock(&cgroup_mutex);
5257 return 0; 5420 return 0;
@@ -5372,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child,
5372 * @child during its iteration. 5535 * @child during its iteration.
5373 * 5536 *
5374 * If we won the race, @child is associated with %current's 5537 * If we won the race, @child is associated with %current's
5375 * css_set. Grabbing css_set_rwsem guarantees both that the 5538 * css_set. Grabbing css_set_lock guarantees both that the
5376 * association is stable, and, on completion of the parent's 5539 * association is stable, and, on completion of the parent's
5377 * migration, @child is visible in the source of migration or 5540 * migration, @child is visible in the source of migration or
5378 * already in the destination cgroup. This guarantee is necessary 5541 * already in the destination cgroup. This guarantee is necessary
@@ -5387,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child,
5387 if (use_task_css_set_links) { 5550 if (use_task_css_set_links) {
5388 struct css_set *cset; 5551 struct css_set *cset;
5389 5552
5390 down_write(&css_set_rwsem); 5553 spin_lock_bh(&css_set_lock);
5391 cset = task_css_set(current); 5554 cset = task_css_set(current);
5392 if (list_empty(&child->cg_list)) { 5555 if (list_empty(&child->cg_list)) {
5393 rcu_assign_pointer(child->cgroups, cset);
5394 list_add(&child->cg_list, &cset->tasks);
5395 get_css_set(cset); 5556 get_css_set(cset);
5557 css_set_move_task(child, NULL, cset, false);
5396 } 5558 }
5397 up_write(&css_set_rwsem); 5559 spin_unlock_bh(&css_set_lock);
5398 } 5560 }
5399 5561
5400 /* 5562 /*
@@ -5429,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk)
5429{ 5591{
5430 struct cgroup_subsys *ss; 5592 struct cgroup_subsys *ss;
5431 struct css_set *cset; 5593 struct css_set *cset;
5432 bool put_cset = false;
5433 int i; 5594 int i;
5434 5595
5435 /* 5596 /*
5436	 * Unlink @tsk from its css_set. As migration path can't race 5597	 * Unlink @tsk from its css_set. As migration path can't race
5437 * with us, we can check cg_list without grabbing css_set_rwsem. 5598 * with us, we can check css_set and cg_list without synchronization.
5438 */ 5599 */
5600 cset = task_css_set(tsk);
5601
5439 if (!list_empty(&tsk->cg_list)) { 5602 if (!list_empty(&tsk->cg_list)) {
5440 down_write(&css_set_rwsem); 5603 spin_lock_bh(&css_set_lock);
5441 list_del_init(&tsk->cg_list); 5604 css_set_move_task(tsk, cset, NULL, false);
5442 up_write(&css_set_rwsem); 5605 spin_unlock_bh(&css_set_lock);
5443 put_cset = true; 5606 } else {
5607 get_css_set(cset);
5444 } 5608 }
5445 5609
5446 /* Reassign the task to the init_css_set. */
5447 cset = task_css_set(tsk);
5448 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5449
5450 /* see cgroup_post_fork() for details */ 5610 /* see cgroup_post_fork() for details */
5451 for_each_subsys_which(ss, i, &have_exit_callback) { 5611 for_each_subsys_which(ss, i, &have_exit_callback)
5452 struct cgroup_subsys_state *old_css = cset->subsys[i]; 5612 ss->exit(tsk);
5453 struct cgroup_subsys_state *css = task_css(tsk, i); 5613}
5454 5614
5455 ss->exit(css, old_css, tsk); 5615void cgroup_free(struct task_struct *task)
5456 } 5616{
5617 struct css_set *cset = task_css_set(task);
5618 struct cgroup_subsys *ss;
5619 int ssid;
5457 5620
5458 if (put_cset) 5621 for_each_subsys_which(ss, ssid, &have_free_callback)
5459 put_css_set(cset); 5622 ss->free(task);
5623
5624 put_css_set(cset);
5460} 5625}
5461 5626
5462static void check_for_release(struct cgroup *cgrp) 5627static void check_for_release(struct cgroup *cgrp)
5463{ 5628{
5464 if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && 5629 if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
5465 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) 5630 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5466 schedule_work(&cgrp->release_agent_work); 5631 schedule_work(&cgrp->release_agent_work);
5467} 5632}
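Hypothetical, abbreviated example (the "foo" controller does not exist) of the exit/free split these hunks establish: ->exit() now takes only the task and runs at task exit while the task is still attached to its original css_set, while the new ->free() runs from the final __put_task_struct() (see the kernel/fork.c hunk below) and is where per-task charges should be dropped so zombies stay accounted, as the pids controller does below.

	static void foo_exit(struct task_struct *task)
	{
		/* task is exiting but still visible in its old cgroup */
	}

	static void foo_free(struct task_struct *task)
	{
		/* last task_struct reference is gone; undo per-task charges here */
	}

	struct cgroup_subsys foo_cgrp_subsys = {
		.exit	= foo_exit,
		.free	= foo_free,
		/* css_alloc/css_free and cftypes omitted for brevity */
	};
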
@@ -5540,25 +5705,13 @@ static int __init cgroup_disable(char *str)
5540 if (strcmp(token, ss->name) && 5705 if (strcmp(token, ss->name) &&
5541 strcmp(token, ss->legacy_name)) 5706 strcmp(token, ss->legacy_name))
5542 continue; 5707 continue;
5543 5708 cgroup_disable_mask |= 1 << i;
5544 ss->disabled = 1;
5545 printk(KERN_INFO "Disabling %s control group subsystem\n",
5546 ss->name);
5547 break;
5548 } 5709 }
5549 } 5710 }
5550 return 1; 5711 return 1;
5551} 5712}
5552__setup("cgroup_disable=", cgroup_disable); 5713__setup("cgroup_disable=", cgroup_disable);
5553 5714
5554static int __init cgroup_set_legacy_files_on_dfl(char *str)
5555{
5556 printk("cgroup: using legacy files on the default hierarchy\n");
5557 cgroup_legacy_files_on_dfl = true;
5558 return 0;
5559}
5560__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
5561
5562/** 5715/**
5563 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 5716 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5564 * @dentry: directory dentry of interest 5717 * @dentry: directory dentry of interest
@@ -5662,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5662 if (!name_buf) 5815 if (!name_buf)
5663 return -ENOMEM; 5816 return -ENOMEM;
5664 5817
5665 down_read(&css_set_rwsem); 5818 spin_lock_bh(&css_set_lock);
5666 rcu_read_lock(); 5819 rcu_read_lock();
5667 cset = rcu_dereference(current->cgroups); 5820 cset = rcu_dereference(current->cgroups);
5668 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 5821 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5673,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5673 c->root->hierarchy_id, name_buf); 5826 c->root->hierarchy_id, name_buf);
5674 } 5827 }
5675 rcu_read_unlock(); 5828 rcu_read_unlock();
5676 up_read(&css_set_rwsem); 5829 spin_unlock_bh(&css_set_lock);
5677 kfree(name_buf); 5830 kfree(name_buf);
5678 return 0; 5831 return 0;
5679} 5832}
@@ -5684,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5684 struct cgroup_subsys_state *css = seq_css(seq); 5837 struct cgroup_subsys_state *css = seq_css(seq);
5685 struct cgrp_cset_link *link; 5838 struct cgrp_cset_link *link;
5686 5839
5687 down_read(&css_set_rwsem); 5840 spin_lock_bh(&css_set_lock);
5688 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 5841 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5689 struct css_set *cset = link->cset; 5842 struct css_set *cset = link->cset;
5690 struct task_struct *task; 5843 struct task_struct *task;
@@ -5707,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5707 overflow: 5860 overflow:
5708 seq_puts(seq, " ...\n"); 5861 seq_puts(seq, " ...\n");
5709 } 5862 }
5710 up_read(&css_set_rwsem); 5863 spin_unlock_bh(&css_set_lock);
5711 return 0; 5864 return 0;
5712} 5865}
5713 5866
5714static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 5867static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5715{ 5868{
5716 return (!cgroup_has_tasks(css->cgroup) && 5869 return (!cgroup_is_populated(css->cgroup) &&
5717 !css_has_online_children(&css->cgroup->self)); 5870 !css_has_online_children(&css->cgroup->self));
5718} 5871}
5719 5872
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 806cd7693ac8..cdd8df4e991c 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv)
266 css_put(old_css); 266 css_put(old_css);
267} 267}
268 268
269static void pids_exit(struct cgroup_subsys_state *css, 269static void pids_free(struct task_struct *task)
270 struct cgroup_subsys_state *old_css,
271 struct task_struct *task)
272{ 270{
273 struct pids_cgroup *pids = css_pids(old_css); 271 struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
274 272
275 pids_uncharge(pids, 1); 273 pids_uncharge(pids, 1);
276} 274}
@@ -349,7 +347,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
349 .can_fork = pids_can_fork, 347 .can_fork = pids_can_fork,
350 .cancel_fork = pids_cancel_fork, 348 .cancel_fork = pids_cancel_fork,
351 .fork = pids_fork, 349 .fork = pids_fork,
352 .exit = pids_exit, 350 .free = pids_free,
353 .legacy_cftypes = pids_files, 351 .legacy_cftypes = pids_files,
354 .dfl_cftypes = pids_files, 352 .dfl_cftypes = pids_files,
355}; 353};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f0acff0f66c9..d7ccb87a6714 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
473 473
474	 /* On legacy hierarchy, we must be a subset of our parent cpuset. */ 474	 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
475 ret = -EACCES; 475 ret = -EACCES;
476 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) 476 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
477 !is_cpuset_subset(trial, par))
477 goto out; 478 goto out;
478 479
479 /* 480 /*
@@ -497,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
497 * be changed to have empty cpus_allowed or mems_allowed. 498 * be changed to have empty cpus_allowed or mems_allowed.
498 */ 499 */
499 ret = -ENOSPC; 500 ret = -ENOSPC;
500 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { 501 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
501 if (!cpumask_empty(cur->cpus_allowed) && 502 if (!cpumask_empty(cur->cpus_allowed) &&
502 cpumask_empty(trial->cpus_allowed)) 503 cpumask_empty(trial->cpus_allowed))
503 goto out; 504 goto out;
@@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
879 * If it becomes empty, inherit the effective mask of the 880 * If it becomes empty, inherit the effective mask of the
880 * parent, which is guaranteed to have some CPUs. 881 * parent, which is guaranteed to have some CPUs.
881 */ 882 */
882 if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) 883 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
884 cpumask_empty(new_cpus))
883 cpumask_copy(new_cpus, parent->effective_cpus); 885 cpumask_copy(new_cpus, parent->effective_cpus);
884 886
885 /* Skip the whole subtree if the cpumask remains the same. */ 887 /* Skip the whole subtree if the cpumask remains the same. */
@@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
896 cpumask_copy(cp->effective_cpus, new_cpus); 898 cpumask_copy(cp->effective_cpus, new_cpus);
897 spin_unlock_irq(&callback_lock); 899 spin_unlock_irq(&callback_lock);
898 900
899 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 901 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
900 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 902 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
901 903
902 update_tasks_cpumask(cp); 904 update_tasks_cpumask(cp);
@@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1135 * If it becomes empty, inherit the effective mask of the 1137 * If it becomes empty, inherit the effective mask of the
1136 * parent, which is guaranteed to have some MEMs. 1138 * parent, which is guaranteed to have some MEMs.
1137 */ 1139 */
1138 if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) 1140 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1141 nodes_empty(*new_mems))
1139 *new_mems = parent->effective_mems; 1142 *new_mems = parent->effective_mems;
1140 1143
1141 /* Skip the whole subtree if the nodemask remains the same. */ 1144 /* Skip the whole subtree if the nodemask remains the same. */
@@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1152 cp->effective_mems = *new_mems; 1155 cp->effective_mems = *new_mems;
1153 spin_unlock_irq(&callback_lock); 1156 spin_unlock_irq(&callback_lock);
1154 1157
1155 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 1158 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1156 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1159 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1157 1160
1158 update_tasks_nodemask(cp); 1161 update_tasks_nodemask(cp);
@@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1440 1443
1441 /* allow moving tasks into an empty cpuset if on default hierarchy */ 1444 /* allow moving tasks into an empty cpuset if on default hierarchy */
1442 ret = -ENOSPC; 1445 ret = -ENOSPC;
1443 if (!cgroup_on_dfl(css->cgroup) && 1446 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1444 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1447 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1445 goto out_unlock; 1448 goto out_unlock;
1446 1449
@@ -1484,9 +1487,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1484{ 1487{
1485 /* static buf protected by cpuset_mutex */ 1488 /* static buf protected by cpuset_mutex */
1486 static nodemask_t cpuset_attach_nodemask_to; 1489 static nodemask_t cpuset_attach_nodemask_to;
1487 struct mm_struct *mm;
1488 struct task_struct *task; 1490 struct task_struct *task;
1489 struct task_struct *leader = cgroup_taskset_first(tset); 1491 struct task_struct *leader;
1490 struct cpuset *cs = css_cs(css); 1492 struct cpuset *cs = css_cs(css);
1491 struct cpuset *oldcs = cpuset_attach_old_cs; 1493 struct cpuset *oldcs = cpuset_attach_old_cs;
1492 1494
@@ -1512,26 +1514,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1512 } 1514 }
1513 1515
1514 /* 1516 /*
1515 * Change mm, possibly for multiple threads in a threadgroup. This is 1517 * Change mm for all threadgroup leaders. This is expensive and may
1516 * expensive and may sleep. 1518 * sleep and should be moved outside migration path proper.
1517 */ 1519 */
1518 cpuset_attach_nodemask_to = cs->effective_mems; 1520 cpuset_attach_nodemask_to = cs->effective_mems;
1519 mm = get_task_mm(leader); 1521 cgroup_taskset_for_each_leader(leader, tset) {
1520 if (mm) { 1522 struct mm_struct *mm = get_task_mm(leader);
1521 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1523
1522 1524 if (mm) {
1523 /* 1525 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1524 * old_mems_allowed is the same with mems_allowed here, except 1526
1525 * if this task is being moved automatically due to hotplug. 1527 /*
1526 * In that case @mems_allowed has been updated and is empty, 1528 * old_mems_allowed is the same with mems_allowed
1527 * so @old_mems_allowed is the right nodesets that we migrate 1529 * here, except if this task is being moved
1528 * mm from. 1530 * automatically due to hotplug. In that case
1529 */ 1531 * @mems_allowed has been updated and is empty, so
1530 if (is_memory_migrate(cs)) { 1532 * @old_mems_allowed is the right nodesets that we
1531 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 1533 * migrate mm from.
1532 &cpuset_attach_nodemask_to); 1534 */
1535 if (is_memory_migrate(cs)) {
1536 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1537 &cpuset_attach_nodemask_to);
1538 }
1539 mmput(mm);
1533 } 1540 }
1534 mmput(mm);
1535 } 1541 }
1536 1542
1537 cs->old_mems_allowed = cpuset_attach_nodemask_to; 1543 cs->old_mems_allowed = cpuset_attach_nodemask_to;
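Sketch of the iteration pattern the hunk above switches to (foo_attach() and its comments are illustrative): per-thread work keeps using cgroup_taskset_for_each(), while per-process work such as mm-wide rebinding now walks only threadgroup leaders via cgroup_taskset_for_each_leader().

	static void foo_attach(struct cgroup_subsys_state *css,
			       struct cgroup_taskset *tset)
	{
		struct task_struct *task, *leader;

		cgroup_taskset_for_each(task, tset) {
			/* per-thread updates, e.g. cpumask changes */
		}

		cgroup_taskset_for_each_leader(leader, tset) {
			/* per-process updates, e.g. mm/mempolicy rebinding */
		}
	}
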
@@ -1594,9 +1600,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1594 case FILE_MEMORY_PRESSURE_ENABLED: 1600 case FILE_MEMORY_PRESSURE_ENABLED:
1595 cpuset_memory_pressure_enabled = !!val; 1601 cpuset_memory_pressure_enabled = !!val;
1596 break; 1602 break;
1597 case FILE_MEMORY_PRESSURE:
1598 retval = -EACCES;
1599 break;
1600 case FILE_SPREAD_PAGE: 1603 case FILE_SPREAD_PAGE:
1601 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1604 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1602 break; 1605 break;
@@ -1863,9 +1866,6 @@ static struct cftype files[] = {
1863 { 1866 {
1864 .name = "memory_pressure", 1867 .name = "memory_pressure",
1865 .read_u64 = cpuset_read_u64, 1868 .read_u64 = cpuset_read_u64,
1866 .write_u64 = cpuset_write_u64,
1867 .private = FILE_MEMORY_PRESSURE,
1868 .mode = S_IRUGO,
1869 }, 1869 },
1870 1870
1871 { 1871 {
@@ -1952,7 +1952,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1952 cpuset_inc(); 1952 cpuset_inc();
1953 1953
1954 spin_lock_irq(&callback_lock); 1954 spin_lock_irq(&callback_lock);
1955 if (cgroup_on_dfl(cs->css.cgroup)) { 1955 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
1956 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1956 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1957 cs->effective_mems = parent->effective_mems; 1957 cs->effective_mems = parent->effective_mems;
1958 } 1958 }
@@ -2029,7 +2029,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2029 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2030 spin_lock_irq(&callback_lock); 2030 spin_lock_irq(&callback_lock);
2031 2031
2032 if (cgroup_on_dfl(root_css->cgroup)) { 2032 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2033 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2033 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2034 top_cpuset.mems_allowed = node_possible_map; 2034 top_cpuset.mems_allowed = node_possible_map;
2035 } else { 2035 } else {
@@ -2210,7 +2210,7 @@ retry:
2210 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); 2210 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2211 mems_updated = !nodes_equal(new_mems, cs->effective_mems); 2211 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2212 2212
2213 if (cgroup_on_dfl(cs->css.cgroup)) 2213 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2214 hotplug_update_tasks(cs, &new_cpus, &new_mems, 2214 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2215 cpus_updated, mems_updated); 2215 cpus_updated, mems_updated);
2216 else 2216 else
@@ -2241,7 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2241 static cpumask_t new_cpus; 2241 static cpumask_t new_cpus;
2242 static nodemask_t new_mems; 2242 static nodemask_t new_mems;
2243 bool cpus_updated, mems_updated; 2243 bool cpus_updated, mems_updated;
2244 bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); 2244 bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
2245 2245
2246 mutex_lock(&cpuset_mutex); 2246 mutex_lock(&cpuset_mutex);
2247 2247
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 39db20c6248e..1a734e0adfa7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9460,17 +9460,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9460 task_function_call(task, __perf_cgroup_move, task); 9460 task_function_call(task, __perf_cgroup_move, task);
9461} 9461}
9462 9462
9463static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9464 struct cgroup_subsys_state *old_css,
9465 struct task_struct *task)
9466{
9467 task_function_call(task, __perf_cgroup_move, task);
9468}
9469
9470struct cgroup_subsys perf_event_cgrp_subsys = { 9463struct cgroup_subsys perf_event_cgrp_subsys = {
9471 .css_alloc = perf_cgroup_css_alloc, 9464 .css_alloc = perf_cgroup_css_alloc,
9472 .css_free = perf_cgroup_css_free, 9465 .css_free = perf_cgroup_css_free,
9473 .exit = perf_cgroup_exit,
9474 .attach = perf_cgroup_attach, 9466 .attach = perf_cgroup_attach,
9475}; 9467};
9476#endif /* CONFIG_CGROUP_PERF */ 9468#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/fork.c b/kernel/fork.c
index 6ac894244d39..825ecc32454d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
251 WARN_ON(atomic_read(&tsk->usage)); 251 WARN_ON(atomic_read(&tsk->usage));
252 WARN_ON(tsk == current); 252 WARN_ON(tsk == current);
253 253
254 cgroup_free(tsk);
254 task_numa_free(tsk); 255 task_numa_free(tsk);
255 security_task_free(tsk); 256 security_task_free(tsk);
256 exit_creds(tsk); 257 exit_creds(tsk);
@@ -1149,10 +1150,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1149 tty_audit_fork(sig); 1150 tty_audit_fork(sig);
1150 sched_autogroup_fork(sig); 1151 sched_autogroup_fork(sig);
1151 1152
1152#ifdef CONFIG_CGROUPS
1153 init_rwsem(&sig->group_rwsem);
1154#endif
1155
1156 sig->oom_score_adj = current->signal->oom_score_adj; 1153 sig->oom_score_adj = current->signal->oom_score_adj;
1157 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1154 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1158 1155
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aa5973220ad2..4d568ac9319e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8244,13 +8244,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
8244 sched_move_task(task); 8244 sched_move_task(task);
8245} 8245}
8246 8246
8247static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
8248 struct cgroup_subsys_state *old_css,
8249 struct task_struct *task)
8250{
8251 sched_move_task(task);
8252}
8253
8254#ifdef CONFIG_FAIR_GROUP_SCHED 8247#ifdef CONFIG_FAIR_GROUP_SCHED
8255static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 8248static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8256 struct cftype *cftype, u64 shareval) 8249 struct cftype *cftype, u64 shareval)
@@ -8582,7 +8575,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8582 .fork = cpu_cgroup_fork, 8575 .fork = cpu_cgroup_fork,
8583 .can_attach = cpu_cgroup_can_attach, 8576 .can_attach = cpu_cgroup_can_attach,
8584 .attach = cpu_cgroup_attach, 8577 .attach = cpu_cgroup_attach,
8585 .exit = cpu_cgroup_exit,
8586 .legacy_cftypes = cpu_files, 8578 .legacy_cftypes = cpu_files,
8587 .early_init = 1, 8579 .early_init = 1,
8588}; 8580};
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c57c4423c688..b732edfddb76 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -434,7 +434,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
434 434
435 memcg = page->mem_cgroup; 435 memcg = page->mem_cgroup;
436 436
437 if (!memcg || !cgroup_on_dfl(memcg->css.cgroup)) 437 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
438 memcg = root_mem_cgroup; 438 memcg = root_mem_cgroup;
439 439
440 rcu_read_unlock(); 440 rcu_read_unlock();
@@ -2926,7 +2926,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
2926 * of course permitted. 2926 * of course permitted.
2927 */ 2927 */
2928 mutex_lock(&memcg_create_mutex); 2928 mutex_lock(&memcg_create_mutex);
2929 if (cgroup_has_tasks(memcg->css.cgroup) || 2929 if (cgroup_is_populated(memcg->css.cgroup) ||
2930 (memcg->use_hierarchy && memcg_has_children(memcg))) 2930 (memcg->use_hierarchy && memcg_has_children(memcg)))
2931 err = -EBUSY; 2931 err = -EBUSY;
2932 mutex_unlock(&memcg_create_mutex); 2932 mutex_unlock(&memcg_create_mutex);
@@ -4066,8 +4066,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
4066 { 4066 {
4067 .name = "cgroup.event_control", /* XXX: for compat */ 4067 .name = "cgroup.event_control", /* XXX: for compat */
4068 .write = memcg_write_event_control, 4068 .write = memcg_write_event_control,
4069 .flags = CFTYPE_NO_PREFIX, 4069 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4070 .mode = S_IWUGO,
4071 }, 4070 },
4072 { 4071 {
4073 .name = "swappiness", 4072 .name = "swappiness",
@@ -4834,7 +4833,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
4834{ 4833{
4835 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4834 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4836 struct mem_cgroup *from; 4835 struct mem_cgroup *from;
4837 struct task_struct *p; 4836 struct task_struct *leader, *p;
4838 struct mm_struct *mm; 4837 struct mm_struct *mm;
4839 unsigned long move_flags; 4838 unsigned long move_flags;
4840 int ret = 0; 4839 int ret = 0;
@@ -4848,7 +4847,20 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
4848 if (!move_flags) 4847 if (!move_flags)
4849 return 0; 4848 return 0;
4850 4849
4851 p = cgroup_taskset_first(tset); 4850 /*
4851 * Multi-process migrations only happen on the default hierarchy
4852 * where charge immigration is not used. Perform charge
4853 * immigration if @tset contains a leader and whine if there are
4854 * multiple.
4855 */
4856 p = NULL;
4857 cgroup_taskset_for_each_leader(leader, tset) {
4858 WARN_ON_ONCE(p);
4859 p = leader;
4860 }
4861 if (!p)
4862 return 0;
4863
4852 from = mem_cgroup_from_task(p); 4864 from = mem_cgroup_from_task(p);
4853 4865
4854 VM_BUG_ON(from == memcg); 4866 VM_BUG_ON(from == memcg);
@@ -5064,7 +5076,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5064 * guarantees that @root doesn't have any children, so turning it 5076 * guarantees that @root doesn't have any children, so turning it
5065 * on for the root memcg is enough. 5077 * on for the root memcg is enough.
5066 */ 5078 */
5067 if (cgroup_on_dfl(root_css->cgroup)) 5079 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5068 root_mem_cgroup->use_hierarchy = true; 5080 root_mem_cgroup->use_hierarchy = true;
5069 else 5081 else
5070 root_mem_cgroup->use_hierarchy = false; 5082 root_mem_cgroup->use_hierarchy = false;
@@ -5208,6 +5220,7 @@ static struct cftype memory_files[] = {
5208 { 5220 {
5209 .name = "events", 5221 .name = "events",
5210 .flags = CFTYPE_NOT_ON_ROOT, 5222 .flags = CFTYPE_NOT_ON_ROOT,
5223 .file_offset = offsetof(struct mem_cgroup, events_file),
5211 .seq_show = memory_events_show, 5224 .seq_show = memory_events_show,
5212 }, 5225 },
5213 { } /* terminate */ 5226 { } /* terminate */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7f63a9381f71..e7057af54b6e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
175 if (!memcg) 175 if (!memcg)
176 return true; 176 return true;
177#ifdef CONFIG_CGROUP_WRITEBACK 177#ifdef CONFIG_CGROUP_WRITEBACK
178 if (cgroup_on_dfl(memcg->css.cgroup)) 178 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
179 return true; 179 return true;
180#endif 180#endif
181 return false; 181 return false;