aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-09-07 01:25:25 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-07 01:25:25 -0400
commit608c1d3c17e9e0e87dae69b9bb78f0556006ee6e (patch)
treedcbd1f035a140f61e012d22bc6633b30b3ead29d
parent9954d4892a813155cf808b1c29df50886b5672cf (diff)
parentb8d1b8ee93df8ffbabbeadd65d39853cfad6d698 (diff)
Merge branch 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: "Several notable changes this cycle: - Thread mode was merged. This will be used for cgroup2 support for CPU and possibly other controllers. Unfortunately, CPU controller cgroup2 support didn't make this pull request but most contentions have been resolved and the support is likely to be merged before the next merge window. - cgroup.stat now shows the number of descendant cgroups. - cpuset now can enable the easier-to-configure v2 behavior on v1 hierarchy" * 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits) cpuset: Allow v2 behavior in v1 cgroup cgroup: Add mount flag to enable cpuset to use v2 behavior in v1 cgroup cgroup: remove unneeded checks cgroup: misc changes cgroup: short-circuit cset_cgroup_from_root() on the default hierarchy cgroup: re-use the parent pointer in cgroup_destroy_locked() cgroup: add cgroup.stat interface with basic hierarchy stats cgroup: implement hierarchy limits cgroup: keep track of number of descent cgroups cgroup: add comment to cgroup_enable_threaded() cgroup: remove unnecessary empty check when enabling threaded mode cgroup: update debug controller to print out thread mode information cgroup: implement cgroup v2 thread support cgroup: implement CSS_TASK_ITER_THREADED cgroup: introduce cgroup->dom_cgrp and threaded css_set handling cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS cgroup: reorganize cgroup.procs / task write path cgroup: replace css_set walking populated test with testing cgrp->nr_populated_csets cgroup: distinguish local and children populated states cgroup: remove now unused list_head @pending in cgroup_apply_cftypes() ...
-rw-r--r--Documentation/cgroup-v2.txt221
-rw-r--r--include/linux/cgroup-defs.h68
-rw-r--r--include/linux/cgroup.h39
-rw-r--r--kernel/cgroup/cgroup-internal.h12
-rw-r--r--kernel/cgroup/cgroup-v1.c75
-rw-r--r--kernel/cgroup/cgroup.c947
-rw-r--r--kernel/cgroup/cpuset.c39
-rw-r--r--kernel/cgroup/debug.c53
-rw-r--r--kernel/cgroup/freezer.c6
-rw-r--r--kernel/cgroup/pids.c1
-rw-r--r--kernel/events/core.c1
-rw-r--r--mm/memcontrol.c2
-rw-r--r--net/core/netclassid_cgroup.c2
13 files changed, 1194 insertions, 272 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index bde177103567..dc44785dc0fa 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -18,7 +18,9 @@ v1 is available under Documentation/cgroup-v1/.
18 1-2. What is cgroup? 18 1-2. What is cgroup?
19 2. Basic Operations 19 2. Basic Operations
20 2-1. Mounting 20 2-1. Mounting
21 2-2. Organizing Processes 21 2-2. Organizing Processes and Threads
22 2-2-1. Processes
23 2-2-2. Threads
22 2-3. [Un]populated Notification 24 2-3. [Un]populated Notification
23 2-4. Controlling Controllers 25 2-4. Controlling Controllers
24 2-4-1. Enabling and Disabling 26 2-4-1. Enabling and Disabling
@@ -167,8 +169,11 @@ cgroup v2 currently supports the following mount options.
167 Delegation section for details. 169 Delegation section for details.
168 170
169 171
170Organizing Processes 172Organizing Processes and Threads
171-------------------- 173--------------------------------
174
175Processes
176~~~~~~~~~
172 177
173Initially, only the root cgroup exists to which all processes belong. 178Initially, only the root cgroup exists to which all processes belong.
174A child cgroup can be created by creating a sub-directory:: 179A child cgroup can be created by creating a sub-directory::
@@ -219,6 +224,105 @@ is removed subsequently, " (deleted)" is appended to the path::
219 0::/test-cgroup/test-cgroup-nested (deleted) 224 0::/test-cgroup/test-cgroup-nested (deleted)
220 225
221 226
227Threads
228~~~~~~~
229
230cgroup v2 supports thread granularity for a subset of controllers to
231support use cases requiring hierarchical resource distribution across
232the threads of a group of processes. By default, all threads of a
233process belong to the same cgroup, which also serves as the resource
234domain to host resource consumptions which are not specific to a
235process or thread. The thread mode allows threads to be spread across
236a subtree while still maintaining the common resource domain for them.
237
238Controllers which support thread mode are called threaded controllers.
239The ones which don't are called domain controllers.
240
241Marking a cgroup threaded makes it join the resource domain of its
242parent as a threaded cgroup. The parent may be another threaded
243cgroup whose resource domain is further up in the hierarchy. The root
244of a threaded subtree, that is, the nearest ancestor which is not
245threaded, is called threaded domain or thread root interchangeably and
246serves as the resource domain for the entire subtree.
247
248Inside a threaded subtree, threads of a process can be put in
249different cgroups and are not subject to the no internal process
250constraint - threaded controllers can be enabled on non-leaf cgroups
251whether they have threads in them or not.
252
253As the threaded domain cgroup hosts all the domain resource
254consumptions of the subtree, it is considered to have internal
255resource consumptions whether there are processes in it or not and
256can't have populated child cgroups which aren't threaded. Because the
257root cgroup is not subject to the no internal process constraint, it can
258serve both as a threaded domain and a parent to domain cgroups.
259
260The current operation mode or type of the cgroup is shown in the
261"cgroup.type" file which indicates whether the cgroup is a normal
262domain, a domain which is serving as the domain of a threaded subtree,
263or a threaded cgroup.
264
265On creation, a cgroup is always a domain cgroup and can be made
266threaded by writing "threaded" to the "cgroup.type" file. The
267operation is single direction::
268
269 # echo threaded > cgroup.type
270
271Once threaded, the cgroup can't be made a domain again. To enable the
272thread mode, the following conditions must be met.
273
274- As the cgroup will join the parent's resource domain, the parent
275 must either be a valid (threaded) domain or a threaded cgroup.
276
277- When the parent is an unthreaded domain, it must not have any domain
278 controllers enabled or populated domain children. The root is
279 exempt from this requirement.
280
281Topology-wise, a cgroup can be in an invalid state. Please consider
282the following topology::
283
284 A (threaded domain) - B (threaded) - C (domain, just created)
285
286C is created as a domain but isn't connected to a parent which can
287host child domains. C can't be used until it is turned into a
288threaded cgroup. "cgroup.type" file will report "domain (invalid)" in
289these cases. Operations which fail due to invalid topology use
290EOPNOTSUPP as the errno.
291
292A domain cgroup is turned into a threaded domain when one of its child
293cgroups becomes threaded or threaded controllers are enabled in the
294"cgroup.subtree_control" file while there are processes in the cgroup.
295A threaded domain reverts to a normal domain when the conditions
296clear.
297
298When read, "cgroup.threads" contains the list of the thread IDs of all
299threads in the cgroup. Except that the operations are per-thread
300instead of per-process, "cgroup.threads" has the same format and
301behaves the same way as "cgroup.procs". While "cgroup.threads" can be
302written to in any cgroup, as it can only move threads inside the same
303threaded domain, its operations are confined inside each threaded
304subtree.
305
306The threaded domain cgroup serves as the resource domain for the whole
307subtree, and, while the threads can be scattered across the subtree,
308all the processes are considered to be in the threaded domain cgroup.
309"cgroup.procs" in a threaded domain cgroup contains the PIDs of all
310processes in the subtree and is not readable in the subtree proper.
311However, "cgroup.procs" can be written to from anywhere in the subtree
312to migrate all threads of the matching process to the cgroup.
313
314Only threaded controllers can be enabled in a threaded subtree. When
315a threaded controller is enabled inside a threaded subtree, it only
316accounts for and controls resource consumptions associated with the
317threads in the cgroup and its descendants. All consumptions which
318aren't tied to a specific thread belong to the threaded domain cgroup.
319
320Because a threaded subtree is exempt from no internal process
321constraint, a threaded controller must be able to handle competition
322between threads in a non-leaf cgroup and its child cgroups. Each
323threaded controller defines how such competitions are handled.
324
325
222[Un]populated Notification 326[Un]populated Notification
223-------------------------- 327--------------------------
224 328
@@ -302,15 +406,15 @@ disabled if one or more children have it enabled.
302No Internal Process Constraint 406No Internal Process Constraint
303~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 407~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
304 408
305Non-root cgroups can only distribute resources to their children when 409Non-root cgroups can distribute domain resources to their children
306they don't have any processes of their own. In other words, only 410only when they don't have any processes of their own. In other words,
307cgroups which don't contain any processes can have controllers enabled 411only domain cgroups which don't contain any processes can have domain
308in their "cgroup.subtree_control" files. 412controllers enabled in their "cgroup.subtree_control" files.
309 413
310This guarantees that, when a controller is looking at the part of the 414This guarantees that, when a domain controller is looking at the part
311hierarchy which has it enabled, processes are always only on the 415of the hierarchy which has it enabled, processes are always only on
312leaves. This rules out situations where child cgroups compete against 416the leaves. This rules out situations where child cgroups compete
313internal processes of the parent. 417against internal processes of the parent.
314 418
315The root cgroup is exempt from this restriction. Root contains 419The root cgroup is exempt from this restriction. Root contains
316processes and anonymous resource consumption which can't be associated 420processes and anonymous resource consumption which can't be associated
@@ -334,10 +438,10 @@ Model of Delegation
334~~~~~~~~~~~~~~~~~~~ 438~~~~~~~~~~~~~~~~~~~
335 439
336A cgroup can be delegated in two ways. First, to a less privileged 440A cgroup can be delegated in two ways. First, to a less privileged
337user by granting write access of the directory and its "cgroup.procs" 441user by granting write access of the directory and its "cgroup.procs",
338and "cgroup.subtree_control" files to the user. Second, if the 442"cgroup.threads" and "cgroup.subtree_control" files to the user.
339"nsdelegate" mount option is set, automatically to a cgroup namespace 443Second, if the "nsdelegate" mount option is set, automatically to a
340on namespace creation. 444cgroup namespace on namespace creation.
341 445
342Because the resource control interface files in a given directory 446Because the resource control interface files in a given directory
343control the distribution of the parent's resources, the delegatee 447control the distribution of the parent's resources, the delegatee
@@ -644,6 +748,29 @@ Core Interface Files
644 748
645All cgroup core files are prefixed with "cgroup." 749All cgroup core files are prefixed with "cgroup."
646 750
751 cgroup.type
752
753 A read-write single value file which exists on non-root
754 cgroups.
755
756 When read, it indicates the current type of the cgroup, which
757 can be one of the following values.
758
759 - "domain" : A normal valid domain cgroup.
760
761 - "domain threaded" : A threaded domain cgroup which is
762 serving as the root of a threaded subtree.
763
764 - "domain invalid" : A cgroup which is in an invalid state.
765 It can't be populated or have controllers enabled. It may
766 be allowed to become a threaded cgroup.
767
768 - "threaded" : A threaded cgroup which is a member of a
769 threaded subtree.
770
771 A cgroup can be turned into a threaded cgroup by writing
772 "threaded" to this file.
773
647 cgroup.procs 774 cgroup.procs
648 A read-write new-line separated values file which exists on 775 A read-write new-line separated values file which exists on
649 all cgroups. 776 all cgroups.
@@ -658,9 +785,6 @@ All cgroup core files are prefixed with "cgroup."
658 the PID to the cgroup. The writer should match all of the 785 the PID to the cgroup. The writer should match all of the
659 following conditions. 786 following conditions.
660 787
661 - Its euid is either root or must match either uid or suid of
662 the target process.
663
664 - It must have write access to the "cgroup.procs" file. 788 - It must have write access to the "cgroup.procs" file.
665 789
666 - It must have write access to the "cgroup.procs" file of the 790 - It must have write access to the "cgroup.procs" file of the
@@ -669,6 +793,35 @@ All cgroup core files are prefixed with "cgroup."
669 When delegating a sub-hierarchy, write access to this file 793 When delegating a sub-hierarchy, write access to this file
670 should be granted along with the containing directory. 794 should be granted along with the containing directory.
671 795
796 In a threaded cgroup, reading this file fails with EOPNOTSUPP
797 as all the processes belong to the thread root. Writing is
798 supported and moves every thread of the process to the cgroup.
799
800 cgroup.threads
801 A read-write new-line separated values file which exists on
802 all cgroups.
803
804 When read, it lists the TIDs of all threads which belong to
805 the cgroup one-per-line. The TIDs are not ordered and the
806 same TID may show up more than once if the thread got moved to
807 another cgroup and then back or the TID got recycled while
808 reading.
809
810 A TID can be written to migrate the thread associated with the
811 TID to the cgroup. The writer should match all of the
812 following conditions.
813
814 - It must have write access to the "cgroup.threads" file.
815
816 - The cgroup that the thread is currently in must be in the
817 same resource domain as the destination cgroup.
818
819 - It must have write access to the "cgroup.procs" file of the
820 common ancestor of the source and destination cgroups.
821
822 When delegating a sub-hierarchy, write access to this file
823 should be granted along with the containing directory.
824
672 cgroup.controllers 825 cgroup.controllers
673 A read-only space separated values file which exists on all 826 A read-only space separated values file which exists on all
674 cgroups. 827 cgroups.
@@ -701,6 +854,38 @@ All cgroup core files are prefixed with "cgroup."
701 1 if the cgroup or its descendants contains any live 854 1 if the cgroup or its descendants contains any live
702 processes; otherwise, 0. 855 processes; otherwise, 0.
703 856
857 cgroup.max.descendants
858 A read-write single value file. The default is "max".
859
860 Maximum allowed number of descendant cgroups.
861 If the actual number of descendants is equal or larger,
862 an attempt to create a new cgroup in the hierarchy will fail.
863
864 cgroup.max.depth
865 A read-write single value file. The default is "max".
866
867 Maximum allowed descendant depth below the current cgroup.
868 If the actual descent depth is equal or larger,
869 an attempt to create a new child cgroup will fail.
870
871 cgroup.stat
872 A read-only flat-keyed file with the following entries:
873
874 nr_descendants
875 Total number of visible descendant cgroups.
876
877 nr_dying_descendants
878 Total number of dying descendant cgroups. A cgroup becomes
879 dying after being deleted by a user. The cgroup will remain
880 in dying state for some undefined time (which can depend
881 on system load) before being completely destroyed.
882
883 A process can't enter a dying cgroup under any circumstances,
884 and a dying cgroup can't revive.
885
886 A dying cgroup can consume system resources not exceeding
887 limits, which were active at the moment of cgroup deletion.
888
704 889
705Controllers 890Controllers
706=========== 891===========
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 09f4c7df1478..ade4a78a54c2 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -74,6 +74,11 @@ enum {
74 * aren't writeable from inside the namespace. 74 * aren't writeable from inside the namespace.
75 */ 75 */
76 CGRP_ROOT_NS_DELEGATE = (1 << 3), 76 CGRP_ROOT_NS_DELEGATE = (1 << 3),
77
78 /*
79 * Enable cpuset controller in v1 cgroup to use v2 behavior.
80 */
81 CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
77}; 82};
78 83
79/* cftype->flags */ 84/* cftype->flags */
@@ -172,6 +177,14 @@ struct css_set {
172 /* reference count */ 177 /* reference count */
173 refcount_t refcount; 178 refcount_t refcount;
174 179
180 /*
181 * For a domain cgroup, the following points to self. If threaded,
182 * to the matching cset of the nearest domain ancestor. The
183 * dom_cset provides access to the domain cgroup and its csses to
184 * which domain level resource consumptions should be charged.
185 */
186 struct css_set *dom_cset;
187
175 /* the default cgroup associated with this css_set */ 188 /* the default cgroup associated with this css_set */
176 struct cgroup *dfl_cgrp; 189 struct cgroup *dfl_cgrp;
177 190
@@ -200,6 +213,10 @@ struct css_set {
200 */ 213 */
201 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; 214 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
202 215
216 /* all threaded csets whose ->dom_cset points to this cset */
217 struct list_head threaded_csets;
218 struct list_head threaded_csets_node;
219
203 /* 220 /*
204 * List running through all cgroup groups in the same hash 221 * List running through all cgroup groups in the same hash
205 * slot. Protected by css_set_lock 222 * slot. Protected by css_set_lock
@@ -261,13 +278,35 @@ struct cgroup {
261 */ 278 */
262 int level; 279 int level;
263 280
281 /* Maximum allowed descendant tree depth */
282 int max_depth;
283
284 /*
285 * Keep track of total numbers of visible and dying descendant cgroups.
286 * Dying cgroups are cgroups which were deleted by a user,
287 * but are still existing because someone else is holding a reference.
288 * max_descendants is a maximum allowed number of descendant cgroups.
289 */
290 int nr_descendants;
291 int nr_dying_descendants;
292 int max_descendants;
293
264 /* 294 /*
265 * Each non-empty css_set associated with this cgroup contributes 295 * Each non-empty css_set associated with this cgroup contributes
266 * one to populated_cnt. All children with non-zero popuplated_cnt 296 * one to nr_populated_csets. The counter is zero iff this cgroup
267 * of their own contribute one. The count is zero iff there's no 297 * doesn't have any tasks.
268 * task in this cgroup or its subtree. 298 *
299 * All children which have non-zero nr_populated_csets and/or
300 * nr_populated_children of their own contribute one to either
301 * nr_populated_domain_children or nr_populated_threaded_children
302 * depending on their type. Each counter is zero iff all cgroups
303 * of the type in the subtree proper don't have any tasks.
269 */ 304 */
270 int populated_cnt; 305 int nr_populated_csets;
306 int nr_populated_domain_children;
307 int nr_populated_threaded_children;
308
309 int nr_threaded_children; /* # of live threaded child cgroups */
271 310
272 struct kernfs_node *kn; /* cgroup kernfs entry */ 311 struct kernfs_node *kn; /* cgroup kernfs entry */
273 struct cgroup_file procs_file; /* handle for "cgroup.procs" */ 312 struct cgroup_file procs_file; /* handle for "cgroup.procs" */
@@ -306,6 +345,15 @@ struct cgroup {
306 struct list_head e_csets[CGROUP_SUBSYS_COUNT]; 345 struct list_head e_csets[CGROUP_SUBSYS_COUNT];
307 346
308 /* 347 /*
348 * If !threaded, self. If threaded, it points to the nearest
349 * domain ancestor. Inside a threaded subtree, cgroups are exempt
350 * from process granularity and no-internal-task constraint.
351 * Domain level resource consumptions which aren't tied to a
352 * specific task are charged to the dom_cgrp.
353 */
354 struct cgroup *dom_cgrp;
355
356 /*
309 * list of pidlists, up to two for each namespace (one for procs, one 357 * list of pidlists, up to two for each namespace (one for procs, one
310 * for tasks); created on demand. 358 * for tasks); created on demand.
311 */ 359 */
@@ -492,6 +540,18 @@ struct cgroup_subsys {
492 bool implicit_on_dfl:1; 540 bool implicit_on_dfl:1;
493 541
494 /* 542 /*
543 * If %true, the controller, supports threaded mode on the default
544 * hierarchy. In a threaded subtree, both process granularity and
545 * no-internal-process constraint are ignored and a threaded
546 * controllers should be able to handle that.
547 *
548 * Note that as an implicit controller is automatically enabled on
549 * all cgroups on the default hierarchy, it should also be
550 * threaded. implicit && !threaded is not supported.
551 */
552 bool threaded:1;
553
554 /*
495 * If %false, this subsystem is properly hierarchical - 555 * If %false, this subsystem is properly hierarchical -
496 * configuration, resource accounting and restriction on a parent 556 * configuration, resource accounting and restriction on a parent
497 * cgroup cover those of its children. If %true, hierarchy support 557 * cgroup cover those of its children. If %true, hierarchy support
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 710a005c6b7a..085056e562b1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -36,18 +36,28 @@
36#define CGROUP_WEIGHT_DFL 100 36#define CGROUP_WEIGHT_DFL 100
37#define CGROUP_WEIGHT_MAX 10000 37#define CGROUP_WEIGHT_MAX 10000
38 38
39/* walk only threadgroup leaders */
40#define CSS_TASK_ITER_PROCS (1U << 0)
41/* walk all threaded css_sets in the domain */
42#define CSS_TASK_ITER_THREADED (1U << 1)
43
39/* a css_task_iter should be treated as an opaque object */ 44/* a css_task_iter should be treated as an opaque object */
40struct css_task_iter { 45struct css_task_iter {
41 struct cgroup_subsys *ss; 46 struct cgroup_subsys *ss;
47 unsigned int flags;
42 48
43 struct list_head *cset_pos; 49 struct list_head *cset_pos;
44 struct list_head *cset_head; 50 struct list_head *cset_head;
45 51
52 struct list_head *tcset_pos;
53 struct list_head *tcset_head;
54
46 struct list_head *task_pos; 55 struct list_head *task_pos;
47 struct list_head *tasks_head; 56 struct list_head *tasks_head;
48 struct list_head *mg_tasks_head; 57 struct list_head *mg_tasks_head;
49 58
50 struct css_set *cur_cset; 59 struct css_set *cur_cset;
60 struct css_set *cur_dcset;
51 struct task_struct *cur_task; 61 struct task_struct *cur_task;
52 struct list_head iters_node; /* css_set->task_iters */ 62 struct list_head iters_node; /* css_set->task_iters */
53}; 63};
@@ -129,7 +139,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
129struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, 139struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
130 struct cgroup_subsys_state **dst_cssp); 140 struct cgroup_subsys_state **dst_cssp);
131 141
132void css_task_iter_start(struct cgroup_subsys_state *css, 142void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
133 struct css_task_iter *it); 143 struct css_task_iter *it);
134struct task_struct *css_task_iter_next(struct css_task_iter *it); 144struct task_struct *css_task_iter_next(struct css_task_iter *it);
135void css_task_iter_end(struct css_task_iter *it); 145void css_task_iter_end(struct css_task_iter *it);
@@ -388,6 +398,16 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
388 percpu_ref_put_many(&css->refcnt, n); 398 percpu_ref_put_many(&css->refcnt, n);
389} 399}
390 400
401static inline void cgroup_get(struct cgroup *cgrp)
402{
403 css_get(&cgrp->self);
404}
405
406static inline bool cgroup_tryget(struct cgroup *cgrp)
407{
408 return css_tryget(&cgrp->self);
409}
410
391static inline void cgroup_put(struct cgroup *cgrp) 411static inline void cgroup_put(struct cgroup *cgrp)
392{ 412{
393 css_put(&cgrp->self); 413 css_put(&cgrp->self);
@@ -500,6 +520,20 @@ static inline struct cgroup *task_cgroup(struct task_struct *task,
500 return task_css(task, subsys_id)->cgroup; 520 return task_css(task, subsys_id)->cgroup;
501} 521}
502 522
523static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
524{
525 return task_css_set(task)->dfl_cgrp;
526}
527
528static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
529{
530 struct cgroup_subsys_state *parent_css = cgrp->self.parent;
531
532 if (parent_css)
533 return container_of(parent_css, struct cgroup, self);
534 return NULL;
535}
536
503/** 537/**
504 * cgroup_is_descendant - test ancestry 538 * cgroup_is_descendant - test ancestry
505 * @cgrp: the cgroup to be tested 539 * @cgrp: the cgroup to be tested
@@ -537,7 +571,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
537/* no synchronization, the result can only be used as a hint */ 571/* no synchronization, the result can only be used as a hint */
538static inline bool cgroup_is_populated(struct cgroup *cgrp) 572static inline bool cgroup_is_populated(struct cgroup *cgrp)
539{ 573{
540 return cgrp->populated_cnt; 574 return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children +
575 cgrp->nr_populated_threaded_children;
541} 576}
542 577
543/* returns ino associated with a cgroup */ 578/* returns ino associated with a cgroup */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 8b4c3c2f2509..5151ff256c29 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -156,6 +156,8 @@ static inline void get_css_set(struct css_set *cset)
156 156
157bool cgroup_ssid_enabled(int ssid); 157bool cgroup_ssid_enabled(int ssid);
158bool cgroup_on_dfl(const struct cgroup *cgrp); 158bool cgroup_on_dfl(const struct cgroup *cgrp);
159bool cgroup_is_thread_root(struct cgroup *cgrp);
160bool cgroup_is_threaded(struct cgroup *cgrp);
159 161
160struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); 162struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
161struct cgroup *task_cgroup_from_root(struct task_struct *task, 163struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -173,7 +175,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
173 struct cgroup_root *root, unsigned long magic, 175 struct cgroup_root *root, unsigned long magic,
174 struct cgroup_namespace *ns); 176 struct cgroup_namespace *ns);
175 177
176bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); 178int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
177void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); 179void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
178void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, 180void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
179 struct cgroup_mgctx *mgctx); 181 struct cgroup_mgctx *mgctx);
@@ -183,10 +185,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
183 185
184int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, 186int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
185 bool threadgroup); 187 bool threadgroup);
186ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, 188struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
187 size_t nbytes, loff_t off, bool threadgroup); 189 __acquires(&cgroup_threadgroup_rwsem);
188ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, 190void cgroup_procs_write_finish(struct task_struct *task)
189 loff_t off); 191 __releases(&cgroup_threadgroup_rwsem);
190 192
191void cgroup_lock_and_drain_offline(struct cgroup *cgrp); 193void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
192 194
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 7bf4b1533f34..024085daab1a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
99 if (cgroup_on_dfl(to)) 99 if (cgroup_on_dfl(to))
100 return -EINVAL; 100 return -EINVAL;
101 101
102 if (!cgroup_may_migrate_to(to)) 102 ret = cgroup_migrate_vet_dst(to);
103 return -EBUSY; 103 if (ret)
104 return ret;
104 105
105 mutex_lock(&cgroup_mutex); 106 mutex_lock(&cgroup_mutex);
106 107
@@ -121,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
121 * ->can_attach() fails. 122 * ->can_attach() fails.
122 */ 123 */
123 do { 124 do {
124 css_task_iter_start(&from->self, &it); 125 css_task_iter_start(&from->self, 0, &it);
125 task = css_task_iter_next(&it); 126 task = css_task_iter_next(&it);
126 if (task) 127 if (task)
127 get_task_struct(task); 128 get_task_struct(task);
@@ -373,7 +374,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
373 if (!array) 374 if (!array)
374 return -ENOMEM; 375 return -ENOMEM;
375 /* now, populate the array */ 376 /* now, populate the array */
376 css_task_iter_start(&cgrp->self, &it); 377 css_task_iter_start(&cgrp->self, 0, &it);
377 while ((tsk = css_task_iter_next(&it))) { 378 while ((tsk = css_task_iter_next(&it))) {
378 if (unlikely(n == length)) 379 if (unlikely(n == length))
379 break; 380 break;
@@ -510,10 +511,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
510 return 0; 511 return 0;
511} 512}
512 513
513static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, 514static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
514 char *buf, size_t nbytes, loff_t off) 515 char *buf, size_t nbytes, loff_t off,
516 bool threadgroup)
515{ 517{
516 return __cgroup_procs_write(of, buf, nbytes, off, false); 518 struct cgroup *cgrp;
519 struct task_struct *task;
520 const struct cred *cred, *tcred;
521 ssize_t ret;
522
523 cgrp = cgroup_kn_lock_live(of->kn, false);
524 if (!cgrp)
525 return -ENODEV;
526
527 task = cgroup_procs_write_start(buf, threadgroup);
528 ret = PTR_ERR_OR_ZERO(task);
529 if (ret)
530 goto out_unlock;
531
532 /*
533 * Even if we're attaching all tasks in the thread group, we only
534 * need to check permissions on one of them.
535 */
536 cred = current_cred();
537 tcred = get_task_cred(task);
538 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
539 !uid_eq(cred->euid, tcred->uid) &&
540 !uid_eq(cred->euid, tcred->suid))
541 ret = -EACCES;
542 put_cred(tcred);
543 if (ret)
544 goto out_finish;
545
546 ret = cgroup_attach_task(cgrp, task, threadgroup);
547
548out_finish:
549 cgroup_procs_write_finish(task);
550out_unlock:
551 cgroup_kn_unlock(of->kn);
552
553 return ret ?: nbytes;
554}
555
556static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
557 char *buf, size_t nbytes, loff_t off)
558{
559 return __cgroup1_procs_write(of, buf, nbytes, off, true);
560}
561
562static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
563 char *buf, size_t nbytes, loff_t off)
564{
565 return __cgroup1_procs_write(of, buf, nbytes, off, false);
517} 566}
518 567
519static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, 568static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
@@ -592,7 +641,7 @@ struct cftype cgroup1_base_files[] = {
592 .seq_stop = cgroup_pidlist_stop, 641 .seq_stop = cgroup_pidlist_stop,
593 .seq_show = cgroup_pidlist_show, 642 .seq_show = cgroup_pidlist_show,
594 .private = CGROUP_FILE_PROCS, 643 .private = CGROUP_FILE_PROCS,
595 .write = cgroup_procs_write, 644 .write = cgroup1_procs_write,
596 }, 645 },
597 { 646 {
598 .name = "cgroup.clone_children", 647 .name = "cgroup.clone_children",
@@ -611,7 +660,7 @@ struct cftype cgroup1_base_files[] = {
611 .seq_stop = cgroup_pidlist_stop, 660 .seq_stop = cgroup_pidlist_stop,
612 .seq_show = cgroup_pidlist_show, 661 .seq_show = cgroup_pidlist_show,
613 .private = CGROUP_FILE_TASKS, 662 .private = CGROUP_FILE_TASKS,
614 .write = cgroup_tasks_write, 663 .write = cgroup1_tasks_write,
615 }, 664 },
616 { 665 {
617 .name = "notify_on_release", 666 .name = "notify_on_release",
@@ -701,7 +750,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
701 } 750 }
702 rcu_read_unlock(); 751 rcu_read_unlock();
703 752
704 css_task_iter_start(&cgrp->self, &it); 753 css_task_iter_start(&cgrp->self, 0, &it);
705 while ((tsk = css_task_iter_next(&it))) { 754 while ((tsk = css_task_iter_next(&it))) {
706 switch (tsk->state) { 755 switch (tsk->state) {
707 case TASK_RUNNING: 756 case TASK_RUNNING:
@@ -846,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
846 seq_puts(seq, ",noprefix"); 895 seq_puts(seq, ",noprefix");
847 if (root->flags & CGRP_ROOT_XATTR) 896 if (root->flags & CGRP_ROOT_XATTR)
848 seq_puts(seq, ",xattr"); 897 seq_puts(seq, ",xattr");
898 if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
899 seq_puts(seq, ",cpuset_v2_mode");
849 900
850 spin_lock(&release_agent_path_lock); 901 spin_lock(&release_agent_path_lock);
851 if (strlen(root->release_agent_path)) 902 if (strlen(root->release_agent_path))
@@ -900,6 +951,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
900 opts->cpuset_clone_children = true; 951 opts->cpuset_clone_children = true;
901 continue; 952 continue;
902 } 953 }
954 if (!strcmp(token, "cpuset_v2_mode")) {
955 opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
956 continue;
957 }
903 if (!strcmp(token, "xattr")) { 958 if (!strcmp(token, "xattr")) {
904 opts->flags |= CGRP_ROOT_XATTR; 959 opts->flags |= CGRP_ROOT_XATTR;
905 continue; 960 continue;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f64fc967a9ef..4f2196a00953 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
162/* some controllers are implicitly enabled on the default hierarchy */ 162/* some controllers are implicitly enabled on the default hierarchy */
163static u16 cgrp_dfl_implicit_ss_mask; 163static u16 cgrp_dfl_implicit_ss_mask;
164 164
165/* some controllers can be threaded on the default hierarchy */
166static u16 cgrp_dfl_threaded_ss_mask;
167
165/* The list of hierarchy roots */ 168/* The list of hierarchy roots */
166LIST_HEAD(cgroup_roots); 169LIST_HEAD(cgroup_roots);
167static int cgroup_root_count; 170static int cgroup_root_count;
@@ -316,13 +319,87 @@ static void cgroup_idr_remove(struct idr *idr, int id)
316 spin_unlock_bh(&cgroup_idr_lock); 319 spin_unlock_bh(&cgroup_idr_lock);
317} 320}
318 321
319static struct cgroup *cgroup_parent(struct cgroup *cgrp) 322static bool cgroup_has_tasks(struct cgroup *cgrp)
320{ 323{
321 struct cgroup_subsys_state *parent_css = cgrp->self.parent; 324 return cgrp->nr_populated_csets;
325}
322 326
323 if (parent_css) 327bool cgroup_is_threaded(struct cgroup *cgrp)
324 return container_of(parent_css, struct cgroup, self); 328{
325 return NULL; 329 return cgrp->dom_cgrp != cgrp;
330}
331
332/* can @cgrp host both domain and threaded children? */
333static bool cgroup_is_mixable(struct cgroup *cgrp)
334{
335 /*
336 * Root isn't under domain level resource control exempting it from
337 * the no-internal-process constraint, so it can serve as a thread
338 * root and a parent of resource domains at the same time.
339 */
340 return !cgroup_parent(cgrp);
341}
342
343/* can @cgrp become a thread root? should always be true for a thread root */
344static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
345{
346 /* mixables don't care */
347 if (cgroup_is_mixable(cgrp))
348 return true;
349
350 /* domain roots can't be nested under threaded */
351 if (cgroup_is_threaded(cgrp))
352 return false;
353
354 /* can only have either domain or threaded children */
355 if (cgrp->nr_populated_domain_children)
356 return false;
357
358 /* and no domain controllers can be enabled */
359 if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
360 return false;
361
362 return true;
363}
364
365/* is @cgrp root of a threaded subtree? */
366bool cgroup_is_thread_root(struct cgroup *cgrp)
367{
368 /* thread root should be a domain */
369 if (cgroup_is_threaded(cgrp))
370 return false;
371
372 /* a domain w/ threaded children is a thread root */
373 if (cgrp->nr_threaded_children)
374 return true;
375
376 /*
377 * A domain which has tasks and explicit threaded controllers
378 * enabled is a thread root.
379 */
380 if (cgroup_has_tasks(cgrp) &&
381 (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
382 return true;
383
384 return false;
385}
386
 387 /* a domain which isn't connected to the root w/o breakage can't be used */
388static bool cgroup_is_valid_domain(struct cgroup *cgrp)
389{
390 /* the cgroup itself can be a thread root */
391 if (cgroup_is_threaded(cgrp))
392 return false;
393
394 /* but the ancestors can't be unless mixable */
395 while ((cgrp = cgroup_parent(cgrp))) {
396 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
397 return false;
398 if (cgroup_is_threaded(cgrp))
399 return false;
400 }
401
402 return true;
326} 403}
327 404
328/* subsystems visibly enabled on a cgroup */ 405/* subsystems visibly enabled on a cgroup */
@@ -331,8 +408,14 @@ static u16 cgroup_control(struct cgroup *cgrp)
331 struct cgroup *parent = cgroup_parent(cgrp); 408 struct cgroup *parent = cgroup_parent(cgrp);
332 u16 root_ss_mask = cgrp->root->subsys_mask; 409 u16 root_ss_mask = cgrp->root->subsys_mask;
333 410
334 if (parent) 411 if (parent) {
335 return parent->subtree_control; 412 u16 ss_mask = parent->subtree_control;
413
414 /* threaded cgroups can only have threaded controllers */
415 if (cgroup_is_threaded(cgrp))
416 ss_mask &= cgrp_dfl_threaded_ss_mask;
417 return ss_mask;
418 }
336 419
337 if (cgroup_on_dfl(cgrp)) 420 if (cgroup_on_dfl(cgrp))
338 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | 421 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -345,8 +428,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
345{ 428{
346 struct cgroup *parent = cgroup_parent(cgrp); 429 struct cgroup *parent = cgroup_parent(cgrp);
347 430
348 if (parent) 431 if (parent) {
349 return parent->subtree_ss_mask; 432 u16 ss_mask = parent->subtree_ss_mask;
433
434 /* threaded cgroups can only have threaded controllers */
435 if (cgroup_is_threaded(cgrp))
436 ss_mask &= cgrp_dfl_threaded_ss_mask;
437 return ss_mask;
438 }
350 439
351 return cgrp->root->subsys_mask; 440 return cgrp->root->subsys_mask;
352} 441}
@@ -436,22 +525,12 @@ out_unlock:
436 return css; 525 return css;
437} 526}
438 527
439static void __maybe_unused cgroup_get(struct cgroup *cgrp)
440{
441 css_get(&cgrp->self);
442}
443
444static void cgroup_get_live(struct cgroup *cgrp) 528static void cgroup_get_live(struct cgroup *cgrp)
445{ 529{
446 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 530 WARN_ON_ONCE(cgroup_is_dead(cgrp));
447 css_get(&cgrp->self); 531 css_get(&cgrp->self);
448} 532}
449 533
450static bool cgroup_tryget(struct cgroup *cgrp)
451{
452 return css_tryget(&cgrp->self);
453}
454
455struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) 534struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
456{ 535{
457 struct cgroup *cgrp = of->kn->parent->priv; 536 struct cgroup *cgrp = of->kn->parent->priv;
@@ -560,9 +639,11 @@ EXPORT_SYMBOL_GPL(of_css);
560 */ 639 */
561struct css_set init_css_set = { 640struct css_set init_css_set = {
562 .refcount = REFCOUNT_INIT(1), 641 .refcount = REFCOUNT_INIT(1),
642 .dom_cset = &init_css_set,
563 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 643 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
564 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 644 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
565 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), 645 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
646 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
566 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 647 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
567 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 648 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
568 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 649 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
@@ -570,6 +651,11 @@ struct css_set init_css_set = {
570 651
571static int css_set_count = 1; /* 1 for init_css_set */ 652static int css_set_count = 1; /* 1 for init_css_set */
572 653
654static bool css_set_threaded(struct css_set *cset)
655{
656 return cset->dom_cset != cset;
657}
658
573/** 659/**
574 * css_set_populated - does a css_set contain any tasks? 660 * css_set_populated - does a css_set contain any tasks?
575 * @cset: target css_set 661 * @cset: target css_set
@@ -587,39 +673,48 @@ static bool css_set_populated(struct css_set *cset)
587} 673}
588 674
589/** 675/**
590 * cgroup_update_populated - updated populated count of a cgroup 676 * cgroup_update_populated - update the populated count of a cgroup
591 * @cgrp: the target cgroup 677 * @cgrp: the target cgroup
592 * @populated: inc or dec populated count 678 * @populated: inc or dec populated count
593 * 679 *
594 * One of the css_sets associated with @cgrp is either getting its first 680 * One of the css_sets associated with @cgrp is either getting its first
595 * task or losing the last. Update @cgrp->populated_cnt accordingly. The 681 * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
596 * count is propagated towards root so that a given cgroup's populated_cnt 682 * count is propagated towards root so that a given cgroup's
597 * is zero iff the cgroup and all its descendants don't contain any tasks. 683 * nr_populated_children is zero iff none of its descendants contain any
684 * tasks.
598 * 685 *
599 * @cgrp's interface file "cgroup.populated" is zero if 686 * @cgrp's interface file "cgroup.populated" is zero if both
600 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt 687 * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
601 * changes from or to zero, userland is notified that the content of the 688 * 1 otherwise. When the sum changes from or to zero, userland is notified
602 * interface file has changed. This can be used to detect when @cgrp and 689 * that the content of the interface file has changed. This can be used to
603 * its descendants become populated or empty. 690 * detect when @cgrp and its descendants become populated or empty.
604 */ 691 */
605static void cgroup_update_populated(struct cgroup *cgrp, bool populated) 692static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
606{ 693{
694 struct cgroup *child = NULL;
695 int adj = populated ? 1 : -1;
696
607 lockdep_assert_held(&css_set_lock); 697 lockdep_assert_held(&css_set_lock);
608 698
609 do { 699 do {
610 bool trigger; 700 bool was_populated = cgroup_is_populated(cgrp);
611 701
612 if (populated) 702 if (!child) {
613 trigger = !cgrp->populated_cnt++; 703 cgrp->nr_populated_csets += adj;
614 else 704 } else {
615 trigger = !--cgrp->populated_cnt; 705 if (cgroup_is_threaded(child))
706 cgrp->nr_populated_threaded_children += adj;
707 else
708 cgrp->nr_populated_domain_children += adj;
709 }
616 710
617 if (!trigger) 711 if (was_populated == cgroup_is_populated(cgrp))
618 break; 712 break;
619 713
620 cgroup1_check_for_release(cgrp); 714 cgroup1_check_for_release(cgrp);
621 cgroup_file_notify(&cgrp->events_file); 715 cgroup_file_notify(&cgrp->events_file);
622 716
717 child = cgrp;
623 cgrp = cgroup_parent(cgrp); 718 cgrp = cgroup_parent(cgrp);
624 } while (cgrp); 719 } while (cgrp);
625} 720}
@@ -630,7 +725,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
630 * @populated: whether @cset is populated or depopulated 725 * @populated: whether @cset is populated or depopulated
631 * 726 *
632 * @cset is either getting the first task or losing the last. Update the 727 * @cset is either getting the first task or losing the last. Update the
633 * ->populated_cnt of all associated cgroups accordingly. 728 * populated counters of all associated cgroups accordingly.
634 */ 729 */
635static void css_set_update_populated(struct css_set *cset, bool populated) 730static void css_set_update_populated(struct css_set *cset, bool populated)
636{ 731{
@@ -653,7 +748,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
653 * css_set, @from_cset can be NULL. If @task is being disassociated 748 * css_set, @from_cset can be NULL. If @task is being disassociated
654 * instead of moved, @to_cset can be NULL. 749 * instead of moved, @to_cset can be NULL.
655 * 750 *
656 * This function automatically handles populated_cnt updates and 751 * This function automatically handles populated counter updates and
657 * css_task_iter adjustments but the caller is responsible for managing 752 * css_task_iter adjustments but the caller is responsible for managing
658 * @from_cset and @to_cset's reference counts. 753 * @from_cset and @to_cset's reference counts.
659 */ 754 */
@@ -737,6 +832,8 @@ void put_css_set_locked(struct css_set *cset)
737 if (!refcount_dec_and_test(&cset->refcount)) 832 if (!refcount_dec_and_test(&cset->refcount))
738 return; 833 return;
739 834
835 WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
836
740 /* This css_set is dead. unlink it and release cgroup and css refs */ 837 /* This css_set is dead. unlink it and release cgroup and css refs */
741 for_each_subsys(ss, ssid) { 838 for_each_subsys(ss, ssid) {
742 list_del(&cset->e_cset_node[ssid]); 839 list_del(&cset->e_cset_node[ssid]);
@@ -753,6 +850,11 @@ void put_css_set_locked(struct css_set *cset)
753 kfree(link); 850 kfree(link);
754 } 851 }
755 852
853 if (css_set_threaded(cset)) {
854 list_del(&cset->threaded_csets_node);
855 put_css_set_locked(cset->dom_cset);
856 }
857
756 kfree_rcu(cset, rcu_head); 858 kfree_rcu(cset, rcu_head);
757} 859}
758 860
@@ -771,6 +873,7 @@ static bool compare_css_sets(struct css_set *cset,
771 struct cgroup *new_cgrp, 873 struct cgroup *new_cgrp,
772 struct cgroup_subsys_state *template[]) 874 struct cgroup_subsys_state *template[])
773{ 875{
876 struct cgroup *new_dfl_cgrp;
774 struct list_head *l1, *l2; 877 struct list_head *l1, *l2;
775 878
776 /* 879 /*
@@ -781,6 +884,16 @@ static bool compare_css_sets(struct css_set *cset,
781 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) 884 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
782 return false; 885 return false;
783 886
887
888 /* @cset's domain should match the default cgroup's */
889 if (cgroup_on_dfl(new_cgrp))
890 new_dfl_cgrp = new_cgrp;
891 else
892 new_dfl_cgrp = old_cset->dfl_cgrp;
893
894 if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
895 return false;
896
784 /* 897 /*
785 * Compare cgroup pointers in order to distinguish between 898 * Compare cgroup pointers in order to distinguish between
786 * different cgroups in hierarchies. As different cgroups may 899 * different cgroups in hierarchies. As different cgroups may
@@ -988,9 +1101,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
988 } 1101 }
989 1102
990 refcount_set(&cset->refcount, 1); 1103 refcount_set(&cset->refcount, 1);
1104 cset->dom_cset = cset;
991 INIT_LIST_HEAD(&cset->tasks); 1105 INIT_LIST_HEAD(&cset->tasks);
992 INIT_LIST_HEAD(&cset->mg_tasks); 1106 INIT_LIST_HEAD(&cset->mg_tasks);
993 INIT_LIST_HEAD(&cset->task_iters); 1107 INIT_LIST_HEAD(&cset->task_iters);
1108 INIT_LIST_HEAD(&cset->threaded_csets);
994 INIT_HLIST_NODE(&cset->hlist); 1109 INIT_HLIST_NODE(&cset->hlist);
995 INIT_LIST_HEAD(&cset->cgrp_links); 1110 INIT_LIST_HEAD(&cset->cgrp_links);
996 INIT_LIST_HEAD(&cset->mg_preload_node); 1111 INIT_LIST_HEAD(&cset->mg_preload_node);
@@ -1028,6 +1143,28 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1028 1143
1029 spin_unlock_irq(&css_set_lock); 1144 spin_unlock_irq(&css_set_lock);
1030 1145
1146 /*
1147 * If @cset should be threaded, look up the matching dom_cset and
1148 * link them up. We first fully initialize @cset then look for the
1149 * dom_cset. It's simpler this way and safe as @cset is guaranteed
1150 * to stay empty until we return.
1151 */
1152 if (cgroup_is_threaded(cset->dfl_cgrp)) {
1153 struct css_set *dcset;
1154
1155 dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
1156 if (!dcset) {
1157 put_css_set(cset);
1158 return NULL;
1159 }
1160
1161 spin_lock_irq(&css_set_lock);
1162 cset->dom_cset = dcset;
1163 list_add_tail(&cset->threaded_csets_node,
1164 &dcset->threaded_csets);
1165 spin_unlock_irq(&css_set_lock);
1166 }
1167
1031 return cset; 1168 return cset;
1032} 1169}
1033 1170
@@ -1155,6 +1292,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1155 1292
1156 if (cset == &init_css_set) { 1293 if (cset == &init_css_set) {
1157 res = &root->cgrp; 1294 res = &root->cgrp;
1295 } else if (root == &cgrp_dfl_root) {
1296 res = cset->dfl_cgrp;
1158 } else { 1297 } else {
1159 struct cgrp_cset_link *link; 1298 struct cgrp_cset_link *link;
1160 1299
@@ -1670,6 +1809,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1670 mutex_init(&cgrp->pidlist_mutex); 1809 mutex_init(&cgrp->pidlist_mutex);
1671 cgrp->self.cgroup = cgrp; 1810 cgrp->self.cgroup = cgrp;
1672 cgrp->self.flags |= CSS_ONLINE; 1811 cgrp->self.flags |= CSS_ONLINE;
1812 cgrp->dom_cgrp = cgrp;
1813 cgrp->max_descendants = INT_MAX;
1814 cgrp->max_depth = INT_MAX;
1673 1815
1674 for_each_subsys(ss, ssid) 1816 for_each_subsys(ss, ssid)
1675 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1817 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -2172,17 +2314,40 @@ out_release_tset:
2172} 2314}
2173 2315
2174/** 2316/**
2175 * cgroup_may_migrate_to - verify whether a cgroup can be migration destination 2317 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
2176 * @dst_cgrp: destination cgroup to test 2318 * @dst_cgrp: destination cgroup to test
2177 * 2319 *
2178 * On the default hierarchy, except for the root, subtree_control must be 2320 * On the default hierarchy, except for the mixable, (possible) thread root
2179 * zero for migration destination cgroups with tasks so that child cgroups 2321 * and threaded cgroups, subtree_control must be zero for migration
2180 * don't compete against tasks. 2322 * destination cgroups with tasks so that child cgroups don't compete
2323 * against tasks.
2181 */ 2324 */
2182bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) 2325int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2183{ 2326{
2184 return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || 2327 /* v1 doesn't have any restriction */
2185 !dst_cgrp->subtree_control; 2328 if (!cgroup_on_dfl(dst_cgrp))
2329 return 0;
2330
2331 /* verify @dst_cgrp can host resources */
2332 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2333 return -EOPNOTSUPP;
2334
2335 /* mixables don't care */
2336 if (cgroup_is_mixable(dst_cgrp))
2337 return 0;
2338
2339 /*
2340 * If @dst_cgrp is already or can become a thread root or is
2341 * threaded, it doesn't matter.
2342 */
2343 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2344 return 0;
2345
2346 /* apply no-internal-process constraint */
2347 if (dst_cgrp->subtree_control)
2348 return -EBUSY;
2349
2350 return 0;
2186} 2351}
2187 2352
2188/** 2353/**
@@ -2387,8 +2552,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2387 struct task_struct *task; 2552 struct task_struct *task;
2388 int ret; 2553 int ret;
2389 2554
2390 if (!cgroup_may_migrate_to(dst_cgrp)) 2555 ret = cgroup_migrate_vet_dst(dst_cgrp);
2391 return -EBUSY; 2556 if (ret)
2557 return ret;
2392 2558
2393 /* look up all src csets */ 2559 /* look up all src csets */
2394 spin_lock_irq(&css_set_lock); 2560 spin_lock_irq(&css_set_lock);
@@ -2415,96 +2581,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2415 return ret; 2581 return ret;
2416} 2582}
2417 2583
2418static int cgroup_procs_write_permission(struct task_struct *task, 2584struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
2419 struct cgroup *dst_cgrp, 2585 __acquires(&cgroup_threadgroup_rwsem)
2420 struct kernfs_open_file *of)
2421{
2422 struct super_block *sb = of->file->f_path.dentry->d_sb;
2423 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2424 struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
2425 struct cgroup *src_cgrp, *com_cgrp;
2426 struct inode *inode;
2427 int ret;
2428
2429 if (!cgroup_on_dfl(dst_cgrp)) {
2430 const struct cred *cred = current_cred();
2431 const struct cred *tcred = get_task_cred(task);
2432
2433 /*
2434 * even if we're attaching all tasks in the thread group,
2435 * we only need to check permissions on one of them.
2436 */
2437 if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
2438 uid_eq(cred->euid, tcred->uid) ||
2439 uid_eq(cred->euid, tcred->suid))
2440 ret = 0;
2441 else
2442 ret = -EACCES;
2443
2444 put_cred(tcred);
2445 return ret;
2446 }
2447
2448 /* find the source cgroup */
2449 spin_lock_irq(&css_set_lock);
2450 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2451 spin_unlock_irq(&css_set_lock);
2452
2453 /* and the common ancestor */
2454 com_cgrp = src_cgrp;
2455 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
2456 com_cgrp = cgroup_parent(com_cgrp);
2457
2458 /* %current should be authorized to migrate to the common ancestor */
2459 inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
2460 if (!inode)
2461 return -ENOMEM;
2462
2463 ret = inode_permission(inode, MAY_WRITE);
2464 iput(inode);
2465 if (ret)
2466 return ret;
2467
2468 /*
2469 * If namespaces are delegation boundaries, %current must be able
2470 * to see both source and destination cgroups from its namespace.
2471 */
2472 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
2473 (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
2474 !cgroup_is_descendant(dst_cgrp, root_cgrp)))
2475 return -ENOENT;
2476
2477 return 0;
2478}
2479
2480/*
2481 * Find the task_struct of the task to attach by vpid and pass it along to the
2482 * function to attach either it or all tasks in its threadgroup. Will lock
2483 * cgroup_mutex and threadgroup.
2484 */
2485ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2486 size_t nbytes, loff_t off, bool threadgroup)
2487{ 2586{
2488 struct task_struct *tsk; 2587 struct task_struct *tsk;
2489 struct cgroup_subsys *ss;
2490 struct cgroup *cgrp;
2491 pid_t pid; 2588 pid_t pid;
2492 int ssid, ret;
2493 2589
2494 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) 2590 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2495 return -EINVAL; 2591 return ERR_PTR(-EINVAL);
2496
2497 cgrp = cgroup_kn_lock_live(of->kn, false);
2498 if (!cgrp)
2499 return -ENODEV;
2500 2592
2501 percpu_down_write(&cgroup_threadgroup_rwsem); 2593 percpu_down_write(&cgroup_threadgroup_rwsem);
2594
2502 rcu_read_lock(); 2595 rcu_read_lock();
2503 if (pid) { 2596 if (pid) {
2504 tsk = find_task_by_vpid(pid); 2597 tsk = find_task_by_vpid(pid);
2505 if (!tsk) { 2598 if (!tsk) {
2506 ret = -ESRCH; 2599 tsk = ERR_PTR(-ESRCH);
2507 goto out_unlock_rcu; 2600 goto out_unlock_threadgroup;
2508 } 2601 }
2509 } else { 2602 } else {
2510 tsk = current; 2603 tsk = current;
@@ -2520,35 +2613,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2520 * cgroup with no rt_runtime allocated. Just say no. 2613 * cgroup with no rt_runtime allocated. Just say no.
2521 */ 2614 */
2522 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { 2615 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2523 ret = -EINVAL; 2616 tsk = ERR_PTR(-EINVAL);
2524 goto out_unlock_rcu; 2617 goto out_unlock_threadgroup;
2525 } 2618 }
2526 2619
2527 get_task_struct(tsk); 2620 get_task_struct(tsk);
2621 goto out_unlock_rcu;
2622
2623out_unlock_threadgroup:
2624 percpu_up_write(&cgroup_threadgroup_rwsem);
2625out_unlock_rcu:
2528 rcu_read_unlock(); 2626 rcu_read_unlock();
2627 return tsk;
2628}
2529 2629
2530 ret = cgroup_procs_write_permission(tsk, cgrp, of); 2630void cgroup_procs_write_finish(struct task_struct *task)
2531 if (!ret) 2631 __releases(&cgroup_threadgroup_rwsem)
2532 ret = cgroup_attach_task(cgrp, tsk, threadgroup); 2632{
2633 struct cgroup_subsys *ss;
2634 int ssid;
2533 2635
2534 put_task_struct(tsk); 2636 /* release reference from cgroup_procs_write_start() */
2535 goto out_unlock_threadgroup; 2637 put_task_struct(task);
2536 2638
2537out_unlock_rcu:
2538 rcu_read_unlock();
2539out_unlock_threadgroup:
2540 percpu_up_write(&cgroup_threadgroup_rwsem); 2639 percpu_up_write(&cgroup_threadgroup_rwsem);
2541 for_each_subsys(ss, ssid) 2640 for_each_subsys(ss, ssid)
2542 if (ss->post_attach) 2641 if (ss->post_attach)
2543 ss->post_attach(); 2642 ss->post_attach();
2544 cgroup_kn_unlock(of->kn);
2545 return ret ?: nbytes;
2546}
2547
2548ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
2549 loff_t off)
2550{
2551 return __cgroup_procs_write(of, buf, nbytes, off, true);
2552} 2643}
2553 2644
2554static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) 2645static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2891,6 +2982,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
2891 cgroup_apply_control_disable(cgrp); 2982 cgroup_apply_control_disable(cgrp);
2892} 2983}
2893 2984
2985static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
2986{
2987 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
2988
2989 /* if nothing is getting enabled, nothing to worry about */
2990 if (!enable)
2991 return 0;
2992
2993 /* can @cgrp host any resources? */
2994 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
2995 return -EOPNOTSUPP;
2996
2997 /* mixables don't care */
2998 if (cgroup_is_mixable(cgrp))
2999 return 0;
3000
3001 if (domain_enable) {
3002 /* can't enable domain controllers inside a thread subtree */
3003 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3004 return -EOPNOTSUPP;
3005 } else {
3006 /*
3007 * Threaded controllers can handle internal competitions
3008 * and are always allowed inside a (prospective) thread
3009 * subtree.
3010 */
3011 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3012 return 0;
3013 }
3014
3015 /*
3016 * Controllers can't be enabled for a cgroup with tasks to avoid
3017 * child cgroups competing against tasks.
3018 */
3019 if (cgroup_has_tasks(cgrp))
3020 return -EBUSY;
3021
3022 return 0;
3023}
3024
2894/* change the enabled child controllers for a cgroup in the default hierarchy */ 3025/* change the enabled child controllers for a cgroup in the default hierarchy */
2895static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, 3026static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2896 char *buf, size_t nbytes, 3027 char *buf, size_t nbytes,
@@ -2966,33 +3097,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2966 goto out_unlock; 3097 goto out_unlock;
2967 } 3098 }
2968 3099
2969 /* 3100 ret = cgroup_vet_subtree_control_enable(cgrp, enable);
2970 * Except for the root, subtree_control must be zero for a cgroup 3101 if (ret)
2971 * with tasks so that child cgroups don't compete against tasks. 3102 goto out_unlock;
2972 */
2973 if (enable && cgroup_parent(cgrp)) {
2974 struct cgrp_cset_link *link;
2975
2976 /*
2977 * Because namespaces pin csets too, @cgrp->cset_links
2978 * might not be empty even when @cgrp is empty. Walk and
2979 * verify each cset.
2980 */
2981 spin_lock_irq(&css_set_lock);
2982
2983 ret = 0;
2984 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
2985 if (css_set_populated(link->cset)) {
2986 ret = -EBUSY;
2987 break;
2988 }
2989 }
2990
2991 spin_unlock_irq(&css_set_lock);
2992
2993 if (ret)
2994 goto out_unlock;
2995 }
2996 3103
2997 /* save and update control masks and prepare csses */ 3104 /* save and update control masks and prepare csses */
2998 cgroup_save_control(cgrp); 3105 cgroup_save_control(cgrp);
@@ -3011,6 +3118,172 @@ out_unlock:
3011 return ret ?: nbytes; 3118 return ret ?: nbytes;
3012} 3119}
3013 3120
3121/**
3122 * cgroup_enable_threaded - make @cgrp threaded
3123 * @cgrp: the target cgroup
3124 *
3125 * Called when "threaded" is written to the cgroup.type interface file and
3126 * tries to make @cgrp threaded and join the parent's resource domain.
3127 * This function is never called on the root cgroup as cgroup.type doesn't
3128 * exist on it.
3129 */
3130static int cgroup_enable_threaded(struct cgroup *cgrp)
3131{
3132 struct cgroup *parent = cgroup_parent(cgrp);
3133 struct cgroup *dom_cgrp = parent->dom_cgrp;
3134 int ret;
3135
3136 lockdep_assert_held(&cgroup_mutex);
3137
3138 /* noop if already threaded */
3139 if (cgroup_is_threaded(cgrp))
3140 return 0;
3141
3142 /* we're joining the parent's domain, ensure its validity */
3143 if (!cgroup_is_valid_domain(dom_cgrp) ||
3144 !cgroup_can_be_thread_root(dom_cgrp))
3145 return -EOPNOTSUPP;
3146
3147 /*
3148 * The following shouldn't cause actual migrations and should
3149 * always succeed.
3150 */
3151 cgroup_save_control(cgrp);
3152
3153 cgrp->dom_cgrp = dom_cgrp;
3154 ret = cgroup_apply_control(cgrp);
3155 if (!ret)
3156 parent->nr_threaded_children++;
3157 else
3158 cgrp->dom_cgrp = cgrp;
3159
3160 cgroup_finalize_control(cgrp, ret);
3161 return ret;
3162}
3163
3164static int cgroup_type_show(struct seq_file *seq, void *v)
3165{
3166 struct cgroup *cgrp = seq_css(seq)->cgroup;
3167
3168 if (cgroup_is_threaded(cgrp))
3169 seq_puts(seq, "threaded\n");
3170 else if (!cgroup_is_valid_domain(cgrp))
3171 seq_puts(seq, "domain invalid\n");
3172 else if (cgroup_is_thread_root(cgrp))
3173 seq_puts(seq, "domain threaded\n");
3174 else
3175 seq_puts(seq, "domain\n");
3176
3177 return 0;
3178}
3179
3180static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3181 size_t nbytes, loff_t off)
3182{
3183 struct cgroup *cgrp;
3184 int ret;
3185
3186 /* only switching to threaded mode is supported */
3187 if (strcmp(strstrip(buf), "threaded"))
3188 return -EINVAL;
3189
3190 cgrp = cgroup_kn_lock_live(of->kn, false);
3191 if (!cgrp)
3192 return -ENOENT;
3193
3194 /* threaded can only be enabled */
3195 ret = cgroup_enable_threaded(cgrp);
3196
3197 cgroup_kn_unlock(of->kn);
3198 return ret ?: nbytes;
3199}
3200
3201static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3202{
3203 struct cgroup *cgrp = seq_css(seq)->cgroup;
3204 int descendants = READ_ONCE(cgrp->max_descendants);
3205
3206 if (descendants == INT_MAX)
3207 seq_puts(seq, "max\n");
3208 else
3209 seq_printf(seq, "%d\n", descendants);
3210
3211 return 0;
3212}
3213
3214static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3215 char *buf, size_t nbytes, loff_t off)
3216{
3217 struct cgroup *cgrp;
3218 int descendants;
3219 ssize_t ret;
3220
3221 buf = strstrip(buf);
3222 if (!strcmp(buf, "max")) {
3223 descendants = INT_MAX;
3224 } else {
3225 ret = kstrtoint(buf, 0, &descendants);
3226 if (ret)
3227 return ret;
3228 }
3229
3230 if (descendants < 0)
3231 return -ERANGE;
3232
3233 cgrp = cgroup_kn_lock_live(of->kn, false);
3234 if (!cgrp)
3235 return -ENOENT;
3236
3237 cgrp->max_descendants = descendants;
3238
3239 cgroup_kn_unlock(of->kn);
3240
3241 return nbytes;
3242}
3243
3244static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3245{
3246 struct cgroup *cgrp = seq_css(seq)->cgroup;
3247 int depth = READ_ONCE(cgrp->max_depth);
3248
3249 if (depth == INT_MAX)
3250 seq_puts(seq, "max\n");
3251 else
3252 seq_printf(seq, "%d\n", depth);
3253
3254 return 0;
3255}
3256
3257static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3258 char *buf, size_t nbytes, loff_t off)
3259{
3260 struct cgroup *cgrp;
3261 ssize_t ret;
3262 int depth;
3263
3264 buf = strstrip(buf);
3265 if (!strcmp(buf, "max")) {
3266 depth = INT_MAX;
3267 } else {
3268 ret = kstrtoint(buf, 0, &depth);
3269 if (ret)
3270 return ret;
3271 }
3272
3273 if (depth < 0)
3274 return -ERANGE;
3275
3276 cgrp = cgroup_kn_lock_live(of->kn, false);
3277 if (!cgrp)
3278 return -ENOENT;
3279
3280 cgrp->max_depth = depth;
3281
3282 cgroup_kn_unlock(of->kn);
3283
3284 return nbytes;
3285}
3286
3014static int cgroup_events_show(struct seq_file *seq, void *v) 3287static int cgroup_events_show(struct seq_file *seq, void *v)
3015{ 3288{
3016 seq_printf(seq, "populated %d\n", 3289 seq_printf(seq, "populated %d\n",
@@ -3018,6 +3291,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
3018 return 0; 3291 return 0;
3019} 3292}
3020 3293
3294static int cgroup_stat_show(struct seq_file *seq, void *v)
3295{
3296 struct cgroup *cgroup = seq_css(seq)->cgroup;
3297
3298 seq_printf(seq, "nr_descendants %d\n",
3299 cgroup->nr_descendants);
3300 seq_printf(seq, "nr_dying_descendants %d\n",
3301 cgroup->nr_dying_descendants);
3302
3303 return 0;
3304}
3305
3021static int cgroup_file_open(struct kernfs_open_file *of) 3306static int cgroup_file_open(struct kernfs_open_file *of)
3022{ 3307{
3023 struct cftype *cft = of->kn->priv; 3308 struct cftype *cft = of->kn->priv;
@@ -3234,7 +3519,6 @@ restart:
3234 3519
3235static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) 3520static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3236{ 3521{
3237 LIST_HEAD(pending);
3238 struct cgroup_subsys *ss = cfts[0].ss; 3522 struct cgroup_subsys *ss = cfts[0].ss;
3239 struct cgroup *root = &ss->root->cgrp; 3523 struct cgroup *root = &ss->root->cgrp;
3240 struct cgroup_subsys_state *css; 3524 struct cgroup_subsys_state *css;
@@ -3659,6 +3943,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
3659 return ret; 3943 return ret;
3660} 3944}
3661 3945
3946static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
3947{
3948 struct list_head *l;
3949 struct cgrp_cset_link *link;
3950 struct css_set *cset;
3951
3952 lockdep_assert_held(&css_set_lock);
3953
3954 /* find the next threaded cset */
3955 if (it->tcset_pos) {
3956 l = it->tcset_pos->next;
3957
3958 if (l != it->tcset_head) {
3959 it->tcset_pos = l;
3960 return container_of(l, struct css_set,
3961 threaded_csets_node);
3962 }
3963
3964 it->tcset_pos = NULL;
3965 }
3966
3967 /* find the next cset */
3968 l = it->cset_pos;
3969 l = l->next;
3970 if (l == it->cset_head) {
3971 it->cset_pos = NULL;
3972 return NULL;
3973 }
3974
3975 if (it->ss) {
3976 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
3977 } else {
3978 link = list_entry(l, struct cgrp_cset_link, cset_link);
3979 cset = link->cset;
3980 }
3981
3982 it->cset_pos = l;
3983
3984 /* initialize threaded css_set walking */
3985 if (it->flags & CSS_TASK_ITER_THREADED) {
3986 if (it->cur_dcset)
3987 put_css_set_locked(it->cur_dcset);
3988 it->cur_dcset = cset;
3989 get_css_set(cset);
3990
3991 it->tcset_head = &cset->threaded_csets;
3992 it->tcset_pos = &cset->threaded_csets;
3993 }
3994
3995 return cset;
3996}
3997
3662/** 3998/**
3663 * css_task_iter_advance_css_set - advance a task itererator to the next css_set 3999 * css_task_iter_advance_css_set - advance a task itererator to the next css_set
3664 * @it: the iterator to advance 4000 * @it: the iterator to advance
@@ -3667,32 +4003,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
3667 */ 4003 */
3668static void css_task_iter_advance_css_set(struct css_task_iter *it) 4004static void css_task_iter_advance_css_set(struct css_task_iter *it)
3669{ 4005{
3670 struct list_head *l = it->cset_pos;
3671 struct cgrp_cset_link *link;
3672 struct css_set *cset; 4006 struct css_set *cset;
3673 4007
3674 lockdep_assert_held(&css_set_lock); 4008 lockdep_assert_held(&css_set_lock);
3675 4009
3676 /* Advance to the next non-empty css_set */ 4010 /* Advance to the next non-empty css_set */
3677 do { 4011 do {
3678 l = l->next; 4012 cset = css_task_iter_next_css_set(it);
3679 if (l == it->cset_head) { 4013 if (!cset) {
3680 it->cset_pos = NULL;
3681 it->task_pos = NULL; 4014 it->task_pos = NULL;
3682 return; 4015 return;
3683 } 4016 }
3684
3685 if (it->ss) {
3686 cset = container_of(l, struct css_set,
3687 e_cset_node[it->ss->id]);
3688 } else {
3689 link = list_entry(l, struct cgrp_cset_link, cset_link);
3690 cset = link->cset;
3691 }
3692 } while (!css_set_populated(cset)); 4017 } while (!css_set_populated(cset));
3693 4018
3694 it->cset_pos = l;
3695
3696 if (!list_empty(&cset->tasks)) 4019 if (!list_empty(&cset->tasks))
3697 it->task_pos = cset->tasks.next; 4020 it->task_pos = cset->tasks.next;
3698 else 4021 else
@@ -3732,6 +4055,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
3732 lockdep_assert_held(&css_set_lock); 4055 lockdep_assert_held(&css_set_lock);
3733 WARN_ON_ONCE(!l); 4056 WARN_ON_ONCE(!l);
3734 4057
4058repeat:
3735 /* 4059 /*
3736 * Advance iterator to find next entry. cset->tasks is consumed 4060 * Advance iterator to find next entry. cset->tasks is consumed
3737 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 4061 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
@@ -3746,11 +4070,18 @@ static void css_task_iter_advance(struct css_task_iter *it)
3746 css_task_iter_advance_css_set(it); 4070 css_task_iter_advance_css_set(it);
3747 else 4071 else
3748 it->task_pos = l; 4072 it->task_pos = l;
4073
4074 /* if PROCS, skip over tasks which aren't group leaders */
4075 if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
4076 !thread_group_leader(list_entry(it->task_pos, struct task_struct,
4077 cg_list)))
4078 goto repeat;
3749} 4079}
3750 4080
3751/** 4081/**
3752 * css_task_iter_start - initiate task iteration 4082 * css_task_iter_start - initiate task iteration
3753 * @css: the css to walk tasks of 4083 * @css: the css to walk tasks of
4084 * @flags: CSS_TASK_ITER_* flags
3754 * @it: the task iterator to use 4085 * @it: the task iterator to use
3755 * 4086 *
3756 * Initiate iteration through the tasks of @css. The caller can call 4087 * Initiate iteration through the tasks of @css. The caller can call
@@ -3758,7 +4089,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
3758 * returns NULL. On completion of iteration, css_task_iter_end() must be 4089 * returns NULL. On completion of iteration, css_task_iter_end() must be
3759 * called. 4090 * called.
3760 */ 4091 */
3761void css_task_iter_start(struct cgroup_subsys_state *css, 4092void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
3762 struct css_task_iter *it) 4093 struct css_task_iter *it)
3763{ 4094{
3764 /* no one should try to iterate before mounting cgroups */ 4095 /* no one should try to iterate before mounting cgroups */
@@ -3769,6 +4100,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
3769 spin_lock_irq(&css_set_lock); 4100 spin_lock_irq(&css_set_lock);
3770 4101
3771 it->ss = css->ss; 4102 it->ss = css->ss;
4103 it->flags = flags;
3772 4104
3773 if (it->ss) 4105 if (it->ss)
3774 it->cset_pos = &css->cgroup->e_csets[css->ss->id]; 4106 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@ -3826,6 +4158,9 @@ void css_task_iter_end(struct css_task_iter *it)
3826 spin_unlock_irq(&css_set_lock); 4158 spin_unlock_irq(&css_set_lock);
3827 } 4159 }
3828 4160
4161 if (it->cur_dcset)
4162 put_css_set(it->cur_dcset);
4163
3829 if (it->cur_task) 4164 if (it->cur_task)
3830 put_task_struct(it->cur_task); 4165 put_task_struct(it->cur_task);
3831} 4166}
@@ -3842,16 +4177,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
3842{ 4177{
3843 struct kernfs_open_file *of = s->private; 4178 struct kernfs_open_file *of = s->private;
3844 struct css_task_iter *it = of->priv; 4179 struct css_task_iter *it = of->priv;
3845 struct task_struct *task;
3846 4180
3847 do { 4181 return css_task_iter_next(it);
3848 task = css_task_iter_next(it);
3849 } while (task && !thread_group_leader(task));
3850
3851 return task;
3852} 4182}
3853 4183
3854static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) 4184static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4185 unsigned int iter_flags)
3855{ 4186{
3856 struct kernfs_open_file *of = s->private; 4187 struct kernfs_open_file *of = s->private;
3857 struct cgroup *cgrp = seq_css(s)->cgroup; 4188 struct cgroup *cgrp = seq_css(s)->cgroup;
@@ -3869,24 +4200,169 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
3869 if (!it) 4200 if (!it)
3870 return ERR_PTR(-ENOMEM); 4201 return ERR_PTR(-ENOMEM);
3871 of->priv = it; 4202 of->priv = it;
3872 css_task_iter_start(&cgrp->self, it); 4203 css_task_iter_start(&cgrp->self, iter_flags, it);
3873 } else if (!(*pos)++) { 4204 } else if (!(*pos)++) {
3874 css_task_iter_end(it); 4205 css_task_iter_end(it);
3875 css_task_iter_start(&cgrp->self, it); 4206 css_task_iter_start(&cgrp->self, iter_flags, it);
3876 } 4207 }
3877 4208
3878 return cgroup_procs_next(s, NULL, NULL); 4209 return cgroup_procs_next(s, NULL, NULL);
3879} 4210}
3880 4211
4212static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4213{
4214 struct cgroup *cgrp = seq_css(s)->cgroup;
4215
4216 /*
4217 * All processes of a threaded subtree belong to the domain cgroup
4218 * of the subtree. Only threads can be distributed across the
4219 * subtree. Reject reads on cgroup.procs in the subtree proper.
4220 * They're always empty anyway.
4221 */
4222 if (cgroup_is_threaded(cgrp))
4223 return ERR_PTR(-EOPNOTSUPP);
4224
4225 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4226 CSS_TASK_ITER_THREADED);
4227}
4228
3881static int cgroup_procs_show(struct seq_file *s, void *v) 4229static int cgroup_procs_show(struct seq_file *s, void *v)
3882{ 4230{
3883 seq_printf(s, "%d\n", task_tgid_vnr(v)); 4231 seq_printf(s, "%d\n", task_pid_vnr(v));
3884 return 0; 4232 return 0;
3885} 4233}
3886 4234
4235static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4236 struct cgroup *dst_cgrp,
4237 struct super_block *sb)
4238{
4239 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
4240 struct cgroup *com_cgrp = src_cgrp;
4241 struct inode *inode;
4242 int ret;
4243
4244 lockdep_assert_held(&cgroup_mutex);
4245
4246 /* find the common ancestor */
4247 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4248 com_cgrp = cgroup_parent(com_cgrp);
4249
4250 /* %current should be authorized to migrate to the common ancestor */
4251 inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
4252 if (!inode)
4253 return -ENOMEM;
4254
4255 ret = inode_permission(inode, MAY_WRITE);
4256 iput(inode);
4257 if (ret)
4258 return ret;
4259
4260 /*
4261 * If namespaces are delegation boundaries, %current must be able
4262 * to see both source and destination cgroups from its namespace.
4263 */
4264 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4265 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4266 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4267 return -ENOENT;
4268
4269 return 0;
4270}
4271
4272static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
4273 char *buf, size_t nbytes, loff_t off)
4274{
4275 struct cgroup *src_cgrp, *dst_cgrp;
4276 struct task_struct *task;
4277 ssize_t ret;
4278
4279 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4280 if (!dst_cgrp)
4281 return -ENODEV;
4282
4283 task = cgroup_procs_write_start(buf, true);
4284 ret = PTR_ERR_OR_ZERO(task);
4285 if (ret)
4286 goto out_unlock;
4287
4288 /* find the source cgroup */
4289 spin_lock_irq(&css_set_lock);
4290 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4291 spin_unlock_irq(&css_set_lock);
4292
4293 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4294 of->file->f_path.dentry->d_sb);
4295 if (ret)
4296 goto out_finish;
4297
4298 ret = cgroup_attach_task(dst_cgrp, task, true);
4299
4300out_finish:
4301 cgroup_procs_write_finish(task);
4302out_unlock:
4303 cgroup_kn_unlock(of->kn);
4304
4305 return ret ?: nbytes;
4306}
4307
4308static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4309{
4310 return __cgroup_procs_start(s, pos, 0);
4311}
4312
4313static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4314 char *buf, size_t nbytes, loff_t off)
4315{
4316 struct cgroup *src_cgrp, *dst_cgrp;
4317 struct task_struct *task;
4318 ssize_t ret;
4319
4320 buf = strstrip(buf);
4321
4322 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4323 if (!dst_cgrp)
4324 return -ENODEV;
4325
4326 task = cgroup_procs_write_start(buf, false);
4327 ret = PTR_ERR_OR_ZERO(task);
4328 if (ret)
4329 goto out_unlock;
4330
4331 /* find the source cgroup */
4332 spin_lock_irq(&css_set_lock);
4333 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4334 spin_unlock_irq(&css_set_lock);
4335
4336 /* thread migrations follow the cgroup.procs delegation rule */
4337 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4338 of->file->f_path.dentry->d_sb);
4339 if (ret)
4340 goto out_finish;
4341
4342 /* and must be contained in the same domain */
4343 ret = -EOPNOTSUPP;
4344 if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
4345 goto out_finish;
4346
4347 ret = cgroup_attach_task(dst_cgrp, task, false);
4348
4349out_finish:
4350 cgroup_procs_write_finish(task);
4351out_unlock:
4352 cgroup_kn_unlock(of->kn);
4353
4354 return ret ?: nbytes;
4355}
4356
3887/* cgroup core interface files for the default hierarchy */ 4357/* cgroup core interface files for the default hierarchy */
3888static struct cftype cgroup_base_files[] = { 4358static struct cftype cgroup_base_files[] = {
3889 { 4359 {
4360 .name = "cgroup.type",
4361 .flags = CFTYPE_NOT_ON_ROOT,
4362 .seq_show = cgroup_type_show,
4363 .write = cgroup_type_write,
4364 },
4365 {
3890 .name = "cgroup.procs", 4366 .name = "cgroup.procs",
3891 .flags = CFTYPE_NS_DELEGATABLE, 4367 .flags = CFTYPE_NS_DELEGATABLE,
3892 .file_offset = offsetof(struct cgroup, procs_file), 4368 .file_offset = offsetof(struct cgroup, procs_file),
@@ -3897,6 +4373,14 @@ static struct cftype cgroup_base_files[] = {
3897 .write = cgroup_procs_write, 4373 .write = cgroup_procs_write,
3898 }, 4374 },
3899 { 4375 {
4376 .name = "cgroup.threads",
4377 .release = cgroup_procs_release,
4378 .seq_start = cgroup_threads_start,
4379 .seq_next = cgroup_procs_next,
4380 .seq_show = cgroup_procs_show,
4381 .write = cgroup_threads_write,
4382 },
4383 {
3900 .name = "cgroup.controllers", 4384 .name = "cgroup.controllers",
3901 .seq_show = cgroup_controllers_show, 4385 .seq_show = cgroup_controllers_show,
3902 }, 4386 },
@@ -3912,6 +4396,20 @@ static struct cftype cgroup_base_files[] = {
3912 .file_offset = offsetof(struct cgroup, events_file), 4396 .file_offset = offsetof(struct cgroup, events_file),
3913 .seq_show = cgroup_events_show, 4397 .seq_show = cgroup_events_show,
3914 }, 4398 },
4399 {
4400 .name = "cgroup.max.descendants",
4401 .seq_show = cgroup_max_descendants_show,
4402 .write = cgroup_max_descendants_write,
4403 },
4404 {
4405 .name = "cgroup.max.depth",
4406 .seq_show = cgroup_max_depth_show,
4407 .write = cgroup_max_depth_write,
4408 },
4409 {
4410 .name = "cgroup.stat",
4411 .seq_show = cgroup_stat_show,
4412 },
3915 { } /* terminate */ 4413 { } /* terminate */
3916}; 4414};
3917 4415
@@ -4011,9 +4509,15 @@ static void css_release_work_fn(struct work_struct *work)
4011 if (ss->css_released) 4509 if (ss->css_released)
4012 ss->css_released(css); 4510 ss->css_released(css);
4013 } else { 4511 } else {
4512 struct cgroup *tcgrp;
4513
4014 /* cgroup release path */ 4514 /* cgroup release path */
4015 trace_cgroup_release(cgrp); 4515 trace_cgroup_release(cgrp);
4016 4516
4517 for (tcgrp = cgroup_parent(cgrp); tcgrp;
4518 tcgrp = cgroup_parent(tcgrp))
4519 tcgrp->nr_dying_descendants--;
4520
4017 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 4521 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4018 cgrp->id = -1; 4522 cgrp->id = -1;
4019 4523
@@ -4209,9 +4713,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4209 cgrp->root = root; 4713 cgrp->root = root;
4210 cgrp->level = level; 4714 cgrp->level = level;
4211 4715
4212 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) 4716 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
4213 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; 4717 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
4214 4718
4719 if (tcgrp != cgrp)
4720 tcgrp->nr_descendants++;
4721 }
4722
4215 if (notify_on_release(parent)) 4723 if (notify_on_release(parent))
4216 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4724 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4217 4725
@@ -4252,6 +4760,29 @@ out_free_cgrp:
4252 return ERR_PTR(ret); 4760 return ERR_PTR(ret);
4253} 4761}
4254 4762
4763static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
4764{
4765 struct cgroup *cgroup;
4766 int ret = false;
4767 int level = 1;
4768
4769 lockdep_assert_held(&cgroup_mutex);
4770
4771 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
4772 if (cgroup->nr_descendants >= cgroup->max_descendants)
4773 goto fail;
4774
4775 if (level > cgroup->max_depth)
4776 goto fail;
4777
4778 level++;
4779 }
4780
4781 ret = true;
4782fail:
4783 return ret;
4784}
4785
4255int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) 4786int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
4256{ 4787{
4257 struct cgroup *parent, *cgrp; 4788 struct cgroup *parent, *cgrp;
@@ -4266,6 +4797,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
4266 if (!parent) 4797 if (!parent)
4267 return -ENODEV; 4798 return -ENODEV;
4268 4799
4800 if (!cgroup_check_hierarchy_limits(parent)) {
4801 ret = -EAGAIN;
4802 goto out_unlock;
4803 }
4804
4269 cgrp = cgroup_create(parent); 4805 cgrp = cgroup_create(parent);
4270 if (IS_ERR(cgrp)) { 4806 if (IS_ERR(cgrp)) {
4271 ret = PTR_ERR(cgrp); 4807 ret = PTR_ERR(cgrp);
@@ -4417,6 +4953,7 @@ static void kill_css(struct cgroup_subsys_state *css)
4417static int cgroup_destroy_locked(struct cgroup *cgrp) 4953static int cgroup_destroy_locked(struct cgroup *cgrp)
4418 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4954 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4419{ 4955{
4956 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
4420 struct cgroup_subsys_state *css; 4957 struct cgroup_subsys_state *css;
4421 struct cgrp_cset_link *link; 4958 struct cgrp_cset_link *link;
4422 int ssid; 4959 int ssid;
@@ -4461,7 +4998,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4461 */ 4998 */
4462 kernfs_remove(cgrp->kn); 4999 kernfs_remove(cgrp->kn);
4463 5000
4464 cgroup1_check_for_release(cgroup_parent(cgrp)); 5001 if (parent && cgroup_is_threaded(cgrp))
5002 parent->nr_threaded_children--;
5003
5004 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5005 tcgrp->nr_descendants--;
5006 tcgrp->nr_dying_descendants++;
5007 }
5008
5009 cgroup1_check_for_release(parent);
4465 5010
4466 /* put the base reference */ 5011 /* put the base reference */
4467 percpu_ref_kill(&cgrp->self.refcnt); 5012 percpu_ref_kill(&cgrp->self.refcnt);
@@ -4656,11 +5201,17 @@ int __init cgroup_init(void)
4656 5201
4657 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 5202 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4658 5203
5204 /* implicit controllers must be threaded too */
5205 WARN_ON(ss->implicit_on_dfl && !ss->threaded);
5206
4659 if (ss->implicit_on_dfl) 5207 if (ss->implicit_on_dfl)
4660 cgrp_dfl_implicit_ss_mask |= 1 << ss->id; 5208 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
4661 else if (!ss->dfl_cftypes) 5209 else if (!ss->dfl_cftypes)
4662 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; 5210 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
4663 5211
5212 if (ss->threaded)
5213 cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
5214
4664 if (ss->dfl_cftypes == ss->legacy_cftypes) { 5215 if (ss->dfl_cftypes == ss->legacy_cftypes) {
4665 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); 5216 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
4666 } else { 5217 } else {
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e7485786db9b..67230ecf2ce1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -301,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
301static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); 301static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
302 302
303/* 303/*
304 * Cgroup v2 behavior is used when on default hierarchy or the
305 * cgroup_v2_mode flag is set.
306 */
307static inline bool is_in_v2_mode(void)
308{
309 return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
310 (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
311}
312
313/*
304 * This is ugly, but preserves the userspace API for existing cpuset 314 * This is ugly, but preserves the userspace API for existing cpuset
305 * users. If someone tries to mount the "cpuset" filesystem, we 315 * users. If someone tries to mount the "cpuset" filesystem, we
306 * silently switch it to mount "cgroup" instead 316 * silently switch it to mount "cgroup" instead
@@ -490,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
490 500
491 /* On legacy hiearchy, we must be a subset of our parent cpuset. */ 501 /* On legacy hiearchy, we must be a subset of our parent cpuset. */
492 ret = -EACCES; 502 ret = -EACCES;
493 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 503 if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
494 !is_cpuset_subset(trial, par))
495 goto out; 504 goto out;
496 505
497 /* 506 /*
@@ -870,7 +879,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
870 struct css_task_iter it; 879 struct css_task_iter it;
871 struct task_struct *task; 880 struct task_struct *task;
872 881
873 css_task_iter_start(&cs->css, &it); 882 css_task_iter_start(&cs->css, 0, &it);
874 while ((task = css_task_iter_next(&it))) 883 while ((task = css_task_iter_next(&it)))
875 set_cpus_allowed_ptr(task, cs->effective_cpus); 884 set_cpus_allowed_ptr(task, cs->effective_cpus);
876 css_task_iter_end(&it); 885 css_task_iter_end(&it);
@@ -904,8 +913,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
904 * If it becomes empty, inherit the effective mask of the 913 * If it becomes empty, inherit the effective mask of the
905 * parent, which is guaranteed to have some CPUs. 914 * parent, which is guaranteed to have some CPUs.
906 */ 915 */
907 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 916 if (is_in_v2_mode() && cpumask_empty(new_cpus))
908 cpumask_empty(new_cpus))
909 cpumask_copy(new_cpus, parent->effective_cpus); 917 cpumask_copy(new_cpus, parent->effective_cpus);
910 918
911 /* Skip the whole subtree if the cpumask remains the same. */ 919 /* Skip the whole subtree if the cpumask remains the same. */
@@ -922,7 +930,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
922 cpumask_copy(cp->effective_cpus, new_cpus); 930 cpumask_copy(cp->effective_cpus, new_cpus);
923 spin_unlock_irq(&callback_lock); 931 spin_unlock_irq(&callback_lock);
924 932
925 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 933 WARN_ON(!is_in_v2_mode() &&
926 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 934 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
927 935
928 update_tasks_cpumask(cp); 936 update_tasks_cpumask(cp);
@@ -1100,7 +1108,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
1100 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1108 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1101 * is idempotent. Also migrate pages in each mm to new nodes. 1109 * is idempotent. Also migrate pages in each mm to new nodes.
1102 */ 1110 */
1103 css_task_iter_start(&cs->css, &it); 1111 css_task_iter_start(&cs->css, 0, &it);
1104 while ((task = css_task_iter_next(&it))) { 1112 while ((task = css_task_iter_next(&it))) {
1105 struct mm_struct *mm; 1113 struct mm_struct *mm;
1106 bool migrate; 1114 bool migrate;
@@ -1158,8 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1158 * If it becomes empty, inherit the effective mask of the 1166 * If it becomes empty, inherit the effective mask of the
1159 * parent, which is guaranteed to have some MEMs. 1167 * parent, which is guaranteed to have some MEMs.
1160 */ 1168 */
1161 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 1169 if (is_in_v2_mode() && nodes_empty(*new_mems))
1162 nodes_empty(*new_mems))
1163 *new_mems = parent->effective_mems; 1170 *new_mems = parent->effective_mems;
1164 1171
1165 /* Skip the whole subtree if the nodemask remains the same. */ 1172 /* Skip the whole subtree if the nodemask remains the same. */
@@ -1176,7 +1183,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1176 cp->effective_mems = *new_mems; 1183 cp->effective_mems = *new_mems;
1177 spin_unlock_irq(&callback_lock); 1184 spin_unlock_irq(&callback_lock);
1178 1185
1179 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 1186 WARN_ON(!is_in_v2_mode() &&
1180 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1187 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1181 1188
1182 update_tasks_nodemask(cp); 1189 update_tasks_nodemask(cp);
@@ -1293,7 +1300,7 @@ static void update_tasks_flags(struct cpuset *cs)
1293 struct css_task_iter it; 1300 struct css_task_iter it;
1294 struct task_struct *task; 1301 struct task_struct *task;
1295 1302
1296 css_task_iter_start(&cs->css, &it); 1303 css_task_iter_start(&cs->css, 0, &it);
1297 while ((task = css_task_iter_next(&it))) 1304 while ((task = css_task_iter_next(&it)))
1298 cpuset_update_task_spread_flag(cs, task); 1305 cpuset_update_task_spread_flag(cs, task);
1299 css_task_iter_end(&it); 1306 css_task_iter_end(&it);
@@ -1468,7 +1475,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
1468 1475
1469 /* allow moving tasks into an empty cpuset if on default hierarchy */ 1476 /* allow moving tasks into an empty cpuset if on default hierarchy */
1470 ret = -ENOSPC; 1477 ret = -ENOSPC;
1471 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 1478 if (!is_in_v2_mode() &&
1472 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1479 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1473 goto out_unlock; 1480 goto out_unlock;
1474 1481
@@ -1987,7 +1994,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1987 cpuset_inc(); 1994 cpuset_inc();
1988 1995
1989 spin_lock_irq(&callback_lock); 1996 spin_lock_irq(&callback_lock);
1990 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { 1997 if (is_in_v2_mode()) {
1991 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1998 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1992 cs->effective_mems = parent->effective_mems; 1999 cs->effective_mems = parent->effective_mems;
1993 } 2000 }
@@ -2064,7 +2071,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2064 mutex_lock(&cpuset_mutex); 2071 mutex_lock(&cpuset_mutex);
2065 spin_lock_irq(&callback_lock); 2072 spin_lock_irq(&callback_lock);
2066 2073
2067 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { 2074 if (is_in_v2_mode()) {
2068 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2075 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2069 top_cpuset.mems_allowed = node_possible_map; 2076 top_cpuset.mems_allowed = node_possible_map;
2070 } else { 2077 } else {
@@ -2258,7 +2265,7 @@ retry:
2258 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); 2265 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2259 mems_updated = !nodes_equal(new_mems, cs->effective_mems); 2266 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2260 2267
2261 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) 2268 if (is_in_v2_mode())
2262 hotplug_update_tasks(cs, &new_cpus, &new_mems, 2269 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2263 cpus_updated, mems_updated); 2270 cpus_updated, mems_updated);
2264 else 2271 else
@@ -2289,7 +2296,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2289 static cpumask_t new_cpus; 2296 static cpumask_t new_cpus;
2290 static nodemask_t new_mems; 2297 static nodemask_t new_mems;
2291 bool cpus_updated, mems_updated; 2298 bool cpus_updated, mems_updated;
2292 bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); 2299 bool on_dfl = is_in_v2_mode();
2293 2300
2294 mutex_lock(&cpuset_mutex); 2301 mutex_lock(&cpuset_mutex);
2295 2302
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index dac46af22782..f661b4cc5efd 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -114,27 +114,49 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
114{ 114{
115 struct cgroup_subsys_state *css = seq_css(seq); 115 struct cgroup_subsys_state *css = seq_css(seq);
116 struct cgrp_cset_link *link; 116 struct cgrp_cset_link *link;
117 int dead_cnt = 0, extra_refs = 0; 117 int dead_cnt = 0, extra_refs = 0, threaded_csets = 0;
118 118
119 spin_lock_irq(&css_set_lock); 119 spin_lock_irq(&css_set_lock);
120
120 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 121 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
121 struct css_set *cset = link->cset; 122 struct css_set *cset = link->cset;
122 struct task_struct *task; 123 struct task_struct *task;
123 int count = 0; 124 int count = 0;
124 int refcnt = refcount_read(&cset->refcount); 125 int refcnt = refcount_read(&cset->refcount);
125 126
126 seq_printf(seq, " %d", refcnt); 127 /*
127 if (refcnt - cset->nr_tasks > 0) { 128 * Print out the proc_cset and threaded_cset relationship
128 int extra = refcnt - cset->nr_tasks; 129 * and highlight difference between refcount and task_count.
129 130 */
130 seq_printf(seq, " +%d", extra); 131 seq_printf(seq, "css_set %pK", cset);
131 /* 132 if (rcu_dereference_protected(cset->dom_cset, 1) != cset) {
132 * Take out the one additional reference in 133 threaded_csets++;
133 * init_css_set. 134 seq_printf(seq, "=>%pK", cset->dom_cset);
134 */ 135 }
135 if (cset == &init_css_set) 136 if (!list_empty(&cset->threaded_csets)) {
136 extra--; 137 struct css_set *tcset;
137 extra_refs += extra; 138 int idx = 0;
139
140 list_for_each_entry(tcset, &cset->threaded_csets,
141 threaded_csets_node) {
142 seq_puts(seq, idx ? "," : "<=");
143 seq_printf(seq, "%pK", tcset);
144 idx++;
145 }
146 } else {
147 seq_printf(seq, " %d", refcnt);
148 if (refcnt - cset->nr_tasks > 0) {
149 int extra = refcnt - cset->nr_tasks;
150
151 seq_printf(seq, " +%d", extra);
152 /*
153 * Take out the one additional reference in
154 * init_css_set.
155 */
156 if (cset == &init_css_set)
157 extra--;
158 extra_refs += extra;
159 }
138 } 160 }
139 seq_puts(seq, "\n"); 161 seq_puts(seq, "\n");
140 162
@@ -163,10 +185,12 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
163 } 185 }
164 spin_unlock_irq(&css_set_lock); 186 spin_unlock_irq(&css_set_lock);
165 187
166 if (!dead_cnt && !extra_refs) 188 if (!dead_cnt && !extra_refs && !threaded_csets)
167 return 0; 189 return 0;
168 190
169 seq_puts(seq, "\n"); 191 seq_puts(seq, "\n");
192 if (threaded_csets)
193 seq_printf(seq, "threaded css_sets = %d\n", threaded_csets);
170 if (extra_refs) 194 if (extra_refs)
171 seq_printf(seq, "extra references = %d\n", extra_refs); 195 seq_printf(seq, "extra references = %d\n", extra_refs);
172 if (dead_cnt) 196 if (dead_cnt)
@@ -352,6 +376,7 @@ static int __init enable_cgroup_debug(char *str)
352{ 376{
353 debug_cgrp_subsys.dfl_cftypes = debug_files; 377 debug_cgrp_subsys.dfl_cftypes = debug_files;
354 debug_cgrp_subsys.implicit_on_dfl = true; 378 debug_cgrp_subsys.implicit_on_dfl = true;
379 debug_cgrp_subsys.threaded = true;
355 return 1; 380 return 1;
356} 381}
357__setup("cgroup_debug", enable_cgroup_debug); 382__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..08236798d173 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
268 rcu_read_unlock(); 268 rcu_read_unlock();
269 269
270 /* are all tasks frozen? */ 270 /* are all tasks frozen? */
271 css_task_iter_start(css, &it); 271 css_task_iter_start(css, 0, &it);
272 272
273 while ((task = css_task_iter_next(&it))) { 273 while ((task = css_task_iter_next(&it))) {
274 if (freezing(task)) { 274 if (freezing(task)) {
@@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer)
320 struct css_task_iter it; 320 struct css_task_iter it;
321 struct task_struct *task; 321 struct task_struct *task;
322 322
323 css_task_iter_start(&freezer->css, &it); 323 css_task_iter_start(&freezer->css, 0, &it);
324 while ((task = css_task_iter_next(&it))) 324 while ((task = css_task_iter_next(&it)))
325 freeze_task(task); 325 freeze_task(task);
326 css_task_iter_end(&it); 326 css_task_iter_end(&it);
@@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer)
331 struct css_task_iter it; 331 struct css_task_iter it;
332 struct task_struct *task; 332 struct task_struct *task;
333 333
334 css_task_iter_start(&freezer->css, &it); 334 css_task_iter_start(&freezer->css, 0, &it);
335 while ((task = css_task_iter_next(&it))) 335 while ((task = css_task_iter_next(&it)))
336 __thaw_task(task); 336 __thaw_task(task);
337 css_task_iter_end(&it); 337 css_task_iter_end(&it);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 2237201d66d5..9829c67ebc0a 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = {
345 .free = pids_free, 345 .free = pids_free,
346 .legacy_cftypes = pids_files, 346 .legacy_cftypes = pids_files,
347 .dfl_cftypes = pids_files, 347 .dfl_cftypes = pids_files,
348 .threaded = true,
348}; 349};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fb415e3d824b..3e691b75b2db 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11293,5 +11293,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
11293 * controller is not mounted on a legacy hierarchy. 11293 * controller is not mounted on a legacy hierarchy.
11294 */ 11294 */
11295 .implicit_on_dfl = true, 11295 .implicit_on_dfl = true,
11296 .threaded = true,
11296}; 11297};
11297#endif /* CONFIG_CGROUP_PERF */ 11298#endif /* CONFIG_CGROUP_PERF */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ad15850ee157..6532b219b222 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -919,7 +919,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
919 struct css_task_iter it; 919 struct css_task_iter it;
920 struct task_struct *task; 920 struct task_struct *task;
921 921
922 css_task_iter_start(&iter->css, &it); 922 css_task_iter_start(&iter->css, 0, &it);
923 while (!ret && (task = css_task_iter_next(&it))) 923 while (!ret && (task = css_task_iter_next(&it)))
924 ret = fn(task, arg); 924 ret = fn(task, arg);
925 css_task_iter_end(&it); 925 css_task_iter_end(&it);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 029a61ac6cdd..5e4f04004a49 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -100,7 +100,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
100 100
101 cs->classid = (u32)value; 101 cs->classid = (u32)value;
102 102
103 css_task_iter_start(css, &it); 103 css_task_iter_start(css, 0, &it);
104 while ((p = css_task_iter_next(&it))) { 104 while ((p = css_task_iter_next(&it))) {
105 task_lock(p); 105 task_lock(p);
106 iterate_fd(p->files, 0, update_classid_sock, 106 iterate_fd(p->files, 0, update_classid_sock,