aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2017-07-21 11:14:51 -0400
committerTejun Heo <tj@kernel.org>2017-07-21 11:14:51 -0400
commit8cfd8147df67e741d93b8783a3ea8f3c74f93a0e (patch)
tree3a1a0598c3939d4606222e32adf6630f71963297
parent450ee0c1feed657894e0b4bdd48f3974af9d394c (diff)
cgroup: implement cgroup v2 thread support
This patch implements cgroup v2 thread support. The goal of the thread mode is supporting hierarchical accounting and control at thread granularity while staying inside the resource domain model which allows coordination across different resource controllers and handling of anonymous resource consumptions. A cgroup is always created as a domain and can be made threaded by writing to the "cgroup.type" file. When a cgroup becomes threaded, it becomes a member of a threaded subtree which is anchored at the closest ancestor which isn't threaded. The threads of the processes which are in a threaded subtree can be placed anywhere without being restricted by process granularity or no-internal-process constraint. Note that the threads aren't allowed to escape to a different threaded subtree. To be used inside a threaded subtree, a controller should explicitly support threaded mode and be able to handle internal competition in the way which is appropriate for the resource. The root of a threaded subtree, the nearest ancestor which isn't threaded, is called the threaded domain and serves as the resource domain for the whole subtree. This is the last cgroup where domain controllers are operational and where all the domain-level resource consumptions in the subtree are accounted. This allows threaded controllers to operate at thread granularity when requested while staying inside the scope of system-level resource distribution. As the root cgroup is exempt from the no-internal-process constraint, it can serve as both a threaded domain and a parent to normal cgroups, so, unlike non-root cgroups, the root cgroup can have both domain and threaded children. Internally, in a threaded subtree, each css_set has its ->dom_cset pointing to a matching css_set which belongs to the threaded domain. This ensures that thread root level cgroup_subsys_state for all threaded controllers are readily accessible for domain-level operations. 
This patch enables threaded mode for the pids and perf_events controllers. Neither has to worry about domain-level resource consumptions and it's enough to simply set the flag. For more details on the interface and behavior of the thread mode, please refer to the section 2-2-2 in Documentation/cgroup-v2.txt added by this patch. v5: - Dropped silly no-op ->dom_cgrp init from cgroup_create(). Spotted by Waiman. - Documentation updated as suggested by Waiman. - cgroup.type content slightly reformatted. - Mark the debug controller threaded. v4: - Updated to the general idea of marking specific cgroups domain/threaded as suggested by PeterZ. v3: - Dropped "join" and always make mixed children join the parent's threaded subtree. v2: - After discussions with Waiman, support for mixed thread mode is added. This should address the issue that Peter pointed out where any nesting should be avoided for thread subtrees while coexisting with other domain cgroups. - Enabling / disabling thread mode now piggy backs on the existing control mask update mechanism. - Bug fixes and cleanup. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Waiman Long <longman@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org>
-rw-r--r--Documentation/cgroup-v2.txt185
-rw-r--r--include/linux/cgroup-defs.h12
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c5
-rw-r--r--kernel/cgroup/cgroup.c355
-rw-r--r--kernel/cgroup/debug.c1
-rw-r--r--kernel/cgroup/pids.c1
-rw-r--r--kernel/events/core.c1
8 files changed, 522 insertions, 40 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index f01f831a3b11..cb9ea281ab72 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -18,7 +18,9 @@ v1 is available under Documentation/cgroup-v1/.
18 1-2. What is cgroup? 18 1-2. What is cgroup?
19 2. Basic Operations 19 2. Basic Operations
20 2-1. Mounting 20 2-1. Mounting
21 2-2. Organizing Processes 21 2-2. Organizing Processes and Threads
22 2-2-1. Processes
23 2-2-2. Threads
22 2-3. [Un]populated Notification 24 2-3. [Un]populated Notification
23 2-4. Controlling Controllers 25 2-4. Controlling Controllers
24 2-4-1. Enabling and Disabling 26 2-4-1. Enabling and Disabling
@@ -167,8 +169,11 @@ cgroup v2 currently supports the following mount options.
167 Delegation section for details. 169 Delegation section for details.
168 170
169 171
170Organizing Processes 172Organizing Processes and Threads
171-------------------- 173--------------------------------
174
175Processes
176~~~~~~~~~
172 177
173Initially, only the root cgroup exists to which all processes belong. 178Initially, only the root cgroup exists to which all processes belong.
174A child cgroup can be created by creating a sub-directory:: 179A child cgroup can be created by creating a sub-directory::
@@ -219,6 +224,104 @@ is removed subsequently, " (deleted)" is appended to the path::
219 0::/test-cgroup/test-cgroup-nested (deleted) 224 0::/test-cgroup/test-cgroup-nested (deleted)
220 225
221 226
227Threads
228~~~~~~~
229
230cgroup v2 supports thread granularity for a subset of controllers to
231support use cases requiring hierarchical resource distribution across
232the threads of a group of processes. By default, all threads of a
233process belong to the same cgroup, which also serves as the resource
234domain to host resource consumptions which are not specific to a
235process or thread. The thread mode allows threads to be spread across
236a subtree while still maintaining the common resource domain for them.
237
238Controllers which support thread mode are called threaded controllers.
239The ones which don't are called domain controllers.
240
241Marking a cgroup threaded makes it join the resource domain of its
242parent as a threaded cgroup. The parent may be another threaded
243cgroup whose resource domain is further up in the hierarchy. The root
244of a threaded subtree, that is, the nearest ancestor which is not
245threaded, is called threaded domain or thread root interchangeably and
246serves as the resource domain for the entire subtree.
247
248Inside a threaded subtree, threads of a process can be put in
249different cgroups and are not subject to the no internal process
250constraint - threaded controllers can be enabled on non-leaf cgroups
251whether they have threads in them or not.
252
253As the threaded domain cgroup hosts all the domain resource
254consumptions of the subtree, it is considered to have internal
255resource consumptions whether there are processes in it or not and
256can't have populated child cgroups which aren't threaded. Because the
257root cgroup is not subject to no internal process constraint, it can
258serve both as a threaded domain and a parent to domain cgroups.
259
260The current operation mode or type of the cgroup is shown in the
261"cgroup.type" file which indicates whether the cgroup is a normal
262domain, a domain which is serving as the domain of a threaded subtree,
263or a threaded cgroup.
264
265On creation, a cgroup is always a domain cgroup and can be made
266threaded by writing "threaded" to the "cgroup.type" file. The
267operation is single direction::
268
269 # echo threaded > cgroup.type
270
271Once threaded, the cgroup can't be made a domain again. To enable the
272thread mode, the following conditions must be met.
273
274- As the cgroup will join the parent's resource domain, the parent
275 must either be a valid (threaded) domain or a threaded cgroup.
276
277- The cgroup must be empty. No enabled controllers, child cgroups or
278 processes.
279
280Topology-wise, a cgroup can be in an invalid state. Please consider
281the following topology::
282
283 A (threaded domain) - B (threaded) - C (domain, just created)
284
285C is created as a domain but isn't connected to a parent which can
286host child domains. C can't be used until it is turned into a
287threaded cgroup. "cgroup.type" file will report "domain invalid" in
288these cases. Operations which fail due to invalid topology use
289EOPNOTSUPP as the errno.
290
291A domain cgroup is turned into a threaded domain when one of its child
292cgroups becomes threaded or threaded controllers are enabled in the
293"cgroup.subtree_control" file while there are processes in the cgroup.
294A threaded domain reverts to a normal domain when the conditions
295clear.
296
297When read, "cgroup.threads" contains the list of the thread IDs of all
298threads in the cgroup. Except that the operations are per-thread
299instead of per-process, "cgroup.threads" has the same format and
300behaves the same way as "cgroup.procs". While "cgroup.threads" can be
301written to in any cgroup, as it can only move threads inside the same
302threaded domain, its operations are confined inside each threaded
303subtree.
304
305The threaded domain cgroup serves as the resource domain for the whole
306subtree, and, while the threads can be scattered across the subtree,
307all the processes are considered to be in the threaded domain cgroup.
308"cgroup.procs" in a threaded domain cgroup contains the PIDs of all
309processes in the subtree and is not readable in the subtree proper.
310However, "cgroup.procs" can be written to from anywhere in the subtree
311to migrate all threads of the matching process to the cgroup.
312
313Only threaded controllers can be enabled in a threaded subtree. When
314a threaded controller is enabled inside a threaded subtree, it only
315accounts for and controls resource consumptions associated with the
316threads in the cgroup and its descendants. All consumptions which
317aren't tied to a specific thread belong to the threaded domain cgroup.
318
319Because a threaded subtree is exempt from no internal process
320constraint, a threaded controller must be able to handle competition
321between threads in a non-leaf cgroup and its child cgroups. Each
322threaded controller defines how such competitions are handled.
323
324
222[Un]populated Notification 325[Un]populated Notification
223-------------------------- 326--------------------------
224 327
@@ -302,15 +405,15 @@ disabled if one or more children have it enabled.
302No Internal Process Constraint 405No Internal Process Constraint
303~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 406~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
304 407
305Non-root cgroups can only distribute resources to their children when 408Non-root cgroups can distribute domain resources to their children
306they don't have any processes of their own. In other words, only 409only when they don't have any processes of their own. In other words,
307cgroups which don't contain any processes can have controllers enabled 410only domain cgroups which don't contain any processes can have domain
308in their "cgroup.subtree_control" files. 411controllers enabled in their "cgroup.subtree_control" files.
309 412
310This guarantees that, when a controller is looking at the part of the 413This guarantees that, when a domain controller is looking at the part
311hierarchy which has it enabled, processes are always only on the 414of the hierarchy which has it enabled, processes are always only on
312leaves. This rules out situations where child cgroups compete against 415the leaves. This rules out situations where child cgroups compete
313internal processes of the parent. 416against internal processes of the parent.
314 417
315The root cgroup is exempt from this restriction. Root contains 418The root cgroup is exempt from this restriction. Root contains
316processes and anonymous resource consumption which can't be associated 419processes and anonymous resource consumption which can't be associated
@@ -334,10 +437,10 @@ Model of Delegation
334~~~~~~~~~~~~~~~~~~~ 437~~~~~~~~~~~~~~~~~~~
335 438
336A cgroup can be delegated in two ways. First, to a less privileged 439A cgroup can be delegated in two ways. First, to a less privileged
337user by granting write access of the directory and its "cgroup.procs" 440user by granting write access of the directory and its "cgroup.procs",
338and "cgroup.subtree_control" files to the user. Second, if the 441"cgroup.threads" and "cgroup.subtree_control" files to the user.
339"nsdelegate" mount option is set, automatically to a cgroup namespace 442Second, if the "nsdelegate" mount option is set, automatically to a
340on namespace creation. 443cgroup namespace on namespace creation.
341 444
342Because the resource control interface files in a given directory 445Because the resource control interface files in a given directory
343control the distribution of the parent's resources, the delegatee 446control the distribution of the parent's resources, the delegatee
@@ -644,6 +747,29 @@ Core Interface Files
644 747
645All cgroup core files are prefixed with "cgroup." 748All cgroup core files are prefixed with "cgroup."
646 749
750 cgroup.type
751
752 A read-write single value file which exists on non-root
753 cgroups.
754
755 When read, it indicates the current type of the cgroup, which
756 can be one of the following values.
757
758 - "domain" : A normal valid domain cgroup.
759
760 - "domain threaded" : A threaded domain cgroup which is
761 serving as the root of a threaded subtree.
762
763 - "domain invalid" : A cgroup which is in an invalid state.
764 It can't be populated or have controllers enabled. It may
765 be allowed to become a threaded cgroup.
766
767 - "threaded" : A threaded cgroup which is a member of a
768 threaded subtree.
769
770 A cgroup can be turned into a threaded cgroup by writing
771 "threaded" to this file.
772
647 cgroup.procs 773 cgroup.procs
648 A read-write new-line separated values file which exists on 774 A read-write new-line separated values file which exists on
649 all cgroups. 775 all cgroups.
@@ -666,6 +792,35 @@ All cgroup core files are prefixed with "cgroup."
666 When delegating a sub-hierarchy, write access to this file 792 When delegating a sub-hierarchy, write access to this file
667 should be granted along with the containing directory. 793 should be granted along with the containing directory.
668 794
795 In a threaded cgroup, reading this file fails with EOPNOTSUPP
796 as all the processes belong to the thread root. Writing is
797 supported and moves every thread of the process to the cgroup.
798
799 cgroup.threads
800 A read-write new-line separated values file which exists on
801 all cgroups.
802
803 When read, it lists the TIDs of all threads which belong to
804 the cgroup one-per-line. The TIDs are not ordered and the
805 same TID may show up more than once if the thread got moved to
806 another cgroup and then back or the TID got recycled while
807 reading.
808
809 A TID can be written to migrate the thread associated with the
810 TID to the cgroup. The writer should match all of the
811 following conditions.
812
813 - It must have write access to the "cgroup.threads" file.
814
815 - The cgroup that the thread is currently in must be in the
816 same resource domain as the destination cgroup.
817
818 - It must have write access to the "cgroup.procs" file of the
819 common ancestor of the source and destination cgroups.
820
821 When delegating a sub-hierarchy, write access to this file
822 should be granted along with the containing directory.
823
669 cgroup.controllers 824 cgroup.controllers
670 A read-only space separated values file which exists on all 825 A read-only space separated values file which exists on all
671 cgroups. 826 cgroups.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 651c4363c85e..9d741959f218 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -522,6 +522,18 @@ struct cgroup_subsys {
522 bool implicit_on_dfl:1; 522 bool implicit_on_dfl:1;
523 523
524 /* 524 /*
525 * If %true, the controller supports threaded mode on the default
526 * hierarchy. In a threaded subtree, both process granularity and
527 * no-internal-process constraint are ignored and a threaded
528 * controller should be able to handle that.
529 *
530 * Note that as an implicit controller is automatically enabled on
531 * all cgroups on the default hierarchy, it should also be
532 * threaded. implicit && !threaded is not supported.
533 */
534 bool threaded:1;
535
536 /*
525 * If %false, this subsystem is properly hierarchical - 537 * If %false, this subsystem is properly hierarchical -
526 * configuration, resource accounting and restriction on a parent 538 * configuration, resource accounting and restriction on a parent
527 * cgroup cover those of its children. If %true, hierarchy support 539 * cgroup cover those of its children. If %true, hierarchy support
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 0e81c6109e91..f10eb19ddf04 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -170,7 +170,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
170 struct cgroup_root *root, unsigned long magic, 170 struct cgroup_root *root, unsigned long magic,
171 struct cgroup_namespace *ns); 171 struct cgroup_namespace *ns);
172 172
173bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); 173int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
174void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); 174void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
175void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, 175void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
176 struct cgroup_mgctx *mgctx); 176 struct cgroup_mgctx *mgctx);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 167aaab04bf9..f0e8601b13cb 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
99 if (cgroup_on_dfl(to)) 99 if (cgroup_on_dfl(to))
100 return -EINVAL; 100 return -EINVAL;
101 101
102 if (!cgroup_may_migrate_to(to)) 102 ret = cgroup_migrate_vet_dst(to);
103 return -EBUSY; 103 if (ret)
104 return ret;
104 105
105 mutex_lock(&cgroup_mutex); 106 mutex_lock(&cgroup_mutex);
106 107
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a1d59af274a9..c396e701c206 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
162/* some controllers are implicitly enabled on the default hierarchy */ 162/* some controllers are implicitly enabled on the default hierarchy */
163static u16 cgrp_dfl_implicit_ss_mask; 163static u16 cgrp_dfl_implicit_ss_mask;
164 164
165/* some controllers can be threaded on the default hierarchy */
166static u16 cgrp_dfl_threaded_ss_mask;
167
165/* The list of hierarchy roots */ 168/* The list of hierarchy roots */
166LIST_HEAD(cgroup_roots); 169LIST_HEAD(cgroup_roots);
167static int cgroup_root_count; 170static int cgroup_root_count;
@@ -335,14 +338,93 @@ static bool cgroup_is_threaded(struct cgroup *cgrp)
335 return cgrp->dom_cgrp != cgrp; 338 return cgrp->dom_cgrp != cgrp;
336} 339}
337 340
341/* can @cgrp host both domain and threaded children? */
342static bool cgroup_is_mixable(struct cgroup *cgrp)
343{
344 /*
345 * Root isn't under domain level resource control exempting it from
346 * the no-internal-process constraint, so it can serve as a thread
347 * root and a parent of resource domains at the same time.
348 */
349 return !cgroup_parent(cgrp);
350}
351
352/* can @cgrp become a thread root? should always be true for a thread root */
353static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
354{
355 /* mixables don't care */
356 if (cgroup_is_mixable(cgrp))
357 return true;
358
359 /* domain roots can't be nested under threaded */
360 if (cgroup_is_threaded(cgrp))
361 return false;
362
363 /* can only have either domain or threaded children */
364 if (cgrp->nr_populated_domain_children)
365 return false;
366
367 /* and no domain controllers can be enabled */
368 if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
369 return false;
370
371 return true;
372}
373
374/* is @cgrp root of a threaded subtree? */
375static bool cgroup_is_thread_root(struct cgroup *cgrp)
376{
377 /* thread root should be a domain */
378 if (cgroup_is_threaded(cgrp))
379 return false;
380
381 /* a domain w/ threaded children is a thread root */
382 if (cgrp->nr_threaded_children)
383 return true;
384
385 /*
386 * A domain which has tasks and explicit threaded controllers
387 * enabled is a thread root.
388 */
389 if (cgroup_has_tasks(cgrp) &&
390 (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
391 return true;
392
393 return false;
394}
395
396/* a domain which isn't connected to the root w/o breakage can't be used */
397static bool cgroup_is_valid_domain(struct cgroup *cgrp)
398{
399 /* the cgroup itself can be a thread root */
400 if (cgroup_is_threaded(cgrp))
401 return false;
402
403 /* but the ancestors can't be unless mixable */
404 while ((cgrp = cgroup_parent(cgrp))) {
405 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
406 return false;
407 if (cgroup_is_threaded(cgrp))
408 return false;
409 }
410
411 return true;
412}
413
338/* subsystems visibly enabled on a cgroup */ 414/* subsystems visibly enabled on a cgroup */
339static u16 cgroup_control(struct cgroup *cgrp) 415static u16 cgroup_control(struct cgroup *cgrp)
340{ 416{
341 struct cgroup *parent = cgroup_parent(cgrp); 417 struct cgroup *parent = cgroup_parent(cgrp);
342 u16 root_ss_mask = cgrp->root->subsys_mask; 418 u16 root_ss_mask = cgrp->root->subsys_mask;
343 419
344 if (parent) 420 if (parent) {
345 return parent->subtree_control; 421 u16 ss_mask = parent->subtree_control;
422
423 /* threaded cgroups can only have threaded controllers */
424 if (cgroup_is_threaded(cgrp))
425 ss_mask &= cgrp_dfl_threaded_ss_mask;
426 return ss_mask;
427 }
346 428
347 if (cgroup_on_dfl(cgrp)) 429 if (cgroup_on_dfl(cgrp))
348 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | 430 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -355,8 +437,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
355{ 437{
356 struct cgroup *parent = cgroup_parent(cgrp); 438 struct cgroup *parent = cgroup_parent(cgrp);
357 439
358 if (parent) 440 if (parent) {
359 return parent->subtree_ss_mask; 441 u16 ss_mask = parent->subtree_ss_mask;
442
443 /* threaded cgroups can only have threaded controllers */
444 if (cgroup_is_threaded(cgrp))
445 ss_mask &= cgrp_dfl_threaded_ss_mask;
446 return ss_mask;
447 }
360 448
361 return cgrp->root->subsys_mask; 449 return cgrp->root->subsys_mask;
362} 450}
@@ -2237,17 +2325,40 @@ out_release_tset:
2237} 2325}
2238 2326
2239/** 2327/**
2240 * cgroup_may_migrate_to - verify whether a cgroup can be migration destination 2328 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
2241 * @dst_cgrp: destination cgroup to test 2329 * @dst_cgrp: destination cgroup to test
2242 * 2330 *
2243 * On the default hierarchy, except for the root, subtree_control must be 2331 * On the default hierarchy, except for the mixable, (possible) thread root
2244 * zero for migration destination cgroups with tasks so that child cgroups 2332 * and threaded cgroups, subtree_control must be zero for migration
2245 * don't compete against tasks. 2333 * destination cgroups with tasks so that child cgroups don't compete
2334 * against tasks.
2246 */ 2335 */
2247bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) 2336int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2248{ 2337{
2249 return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || 2338 /* v1 doesn't have any restriction */
2250 !dst_cgrp->subtree_control; 2339 if (!cgroup_on_dfl(dst_cgrp))
2340 return 0;
2341
2342 /* verify @dst_cgrp can host resources */
2343 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2344 return -EOPNOTSUPP;
2345
2346 /* mixables don't care */
2347 if (cgroup_is_mixable(dst_cgrp))
2348 return 0;
2349
2350 /*
2351 * If @dst_cgrp is already or can become a thread root or is
2352 * threaded, it doesn't matter.
2353 */
2354 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2355 return 0;
2356
2357 /* apply no-internal-process constraint */
2358 if (dst_cgrp->subtree_control)
2359 return -EBUSY;
2360
2361 return 0;
2251} 2362}
2252 2363
2253/** 2364/**
@@ -2452,8 +2563,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2452 struct task_struct *task; 2563 struct task_struct *task;
2453 int ret; 2564 int ret;
2454 2565
2455 if (!cgroup_may_migrate_to(dst_cgrp)) 2566 ret = cgroup_migrate_vet_dst(dst_cgrp);
2456 return -EBUSY; 2567 if (ret)
2568 return ret;
2457 2569
2458 /* look up all src csets */ 2570 /* look up all src csets */
2459 spin_lock_irq(&css_set_lock); 2571 spin_lock_irq(&css_set_lock);
@@ -2881,6 +2993,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
2881 cgroup_apply_control_disable(cgrp); 2993 cgroup_apply_control_disable(cgrp);
2882} 2994}
2883 2995
2996static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
2997{
2998 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
2999
3000 /* if nothing is getting enabled, nothing to worry about */
3001 if (!enable)
3002 return 0;
3003
3004 /* can @cgrp host any resources? */
3005 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3006 return -EOPNOTSUPP;
3007
3008 /* mixables don't care */
3009 if (cgroup_is_mixable(cgrp))
3010 return 0;
3011
3012 if (domain_enable) {
3013 /* can't enable domain controllers inside a thread subtree */
3014 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3015 return -EOPNOTSUPP;
3016 } else {
3017 /*
3018 * Threaded controllers can handle internal competitions
3019 * and are always allowed inside a (prospective) thread
3020 * subtree.
3021 */
3022 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3023 return 0;
3024 }
3025
3026 /*
3027 * Controllers can't be enabled for a cgroup with tasks to avoid
3028 * child cgroups competing against tasks.
3029 */
3030 if (cgroup_has_tasks(cgrp))
3031 return -EBUSY;
3032
3033 return 0;
3034}
3035
2884/* change the enabled child controllers for a cgroup in the default hierarchy */ 3036/* change the enabled child controllers for a cgroup in the default hierarchy */
2885static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, 3037static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2886 char *buf, size_t nbytes, 3038 char *buf, size_t nbytes,
@@ -2956,14 +3108,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2956 goto out_unlock; 3108 goto out_unlock;
2957 } 3109 }
2958 3110
2959 /* 3111 ret = cgroup_vet_subtree_control_enable(cgrp, enable);
2960 * Except for the root, subtree_control must be zero for a cgroup 3112 if (ret)
2961 * with tasks so that child cgroups don't compete against tasks.
2962 */
2963 if (enable && cgroup_parent(cgrp) && cgroup_has_tasks(cgrp)) {
2964 ret = -EBUSY;
2965 goto out_unlock; 3113 goto out_unlock;
2966 }
2967 3114
2968 /* save and update control masks and prepare csses */ 3115 /* save and update control masks and prepare csses */
2969 cgroup_save_control(cgrp); 3116 cgroup_save_control(cgrp);
@@ -2982,6 +3129,84 @@ out_unlock:
2982 return ret ?: nbytes; 3129 return ret ?: nbytes;
2983} 3130}
2984 3131
3132static int cgroup_enable_threaded(struct cgroup *cgrp)
3133{
3134 struct cgroup *parent = cgroup_parent(cgrp);
3135 struct cgroup *dom_cgrp = parent->dom_cgrp;
3136 int ret;
3137
3138 lockdep_assert_held(&cgroup_mutex);
3139
3140 /* noop if already threaded */
3141 if (cgroup_is_threaded(cgrp))
3142 return 0;
3143
3144 /* we're joining the parent's domain, ensure its validity */
3145 if (!cgroup_is_valid_domain(dom_cgrp) ||
3146 !cgroup_can_be_thread_root(dom_cgrp))
3147 return -EOPNOTSUPP;
3148
3149 /*
3150 * Allow enabling thread mode only on empty cgroups to avoid
3151 * implicit migrations and recursive operations.
3152 */
3153 if (cgroup_has_tasks(cgrp) || css_has_online_children(&cgrp->self))
3154 return -EBUSY;
3155
3156 /*
3157 * The following shouldn't cause actual migrations and should
3158 * always succeed.
3159 */
3160 cgroup_save_control(cgrp);
3161
3162 cgrp->dom_cgrp = dom_cgrp;
3163 ret = cgroup_apply_control(cgrp);
3164 if (!ret)
3165 parent->nr_threaded_children++;
3166 else
3167 cgrp->dom_cgrp = cgrp;
3168
3169 cgroup_finalize_control(cgrp, ret);
3170 return ret;
3171}
3172
3173static int cgroup_type_show(struct seq_file *seq, void *v)
3174{
3175 struct cgroup *cgrp = seq_css(seq)->cgroup;
3176
3177 if (cgroup_is_threaded(cgrp))
3178 seq_puts(seq, "threaded\n");
3179 else if (!cgroup_is_valid_domain(cgrp))
3180 seq_puts(seq, "domain invalid\n");
3181 else if (cgroup_is_thread_root(cgrp))
3182 seq_puts(seq, "domain threaded\n");
3183 else
3184 seq_puts(seq, "domain\n");
3185
3186 return 0;
3187}
3188
3189static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3190 size_t nbytes, loff_t off)
3191{
3192 struct cgroup *cgrp;
3193 int ret;
3194
3195 /* only switching to threaded mode is supported */
3196 if (strcmp(strstrip(buf), "threaded"))
3197 return -EINVAL;
3198
3199 cgrp = cgroup_kn_lock_live(of->kn, false);
3200 if (!cgrp)
3201 return -ENOENT;
3202
3203 /* threaded can only be enabled */
3204 ret = cgroup_enable_threaded(cgrp);
3205
3206 cgroup_kn_unlock(of->kn);
3207 return ret ?: nbytes;
3208}
3209
2985static int cgroup_events_show(struct seq_file *seq, void *v) 3210static int cgroup_events_show(struct seq_file *seq, void *v)
2986{ 3211{
2987 seq_printf(seq, "populated %d\n", 3212 seq_printf(seq, "populated %d\n",
@@ -3867,12 +4092,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
3867 return css_task_iter_next(it); 4092 return css_task_iter_next(it);
3868} 4093}
3869 4094
3870static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) 4095static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4096 unsigned int iter_flags)
3871{ 4097{
3872 struct kernfs_open_file *of = s->private; 4098 struct kernfs_open_file *of = s->private;
3873 struct cgroup *cgrp = seq_css(s)->cgroup; 4099 struct cgroup *cgrp = seq_css(s)->cgroup;
3874 struct css_task_iter *it = of->priv; 4100 struct css_task_iter *it = of->priv;
3875 unsigned iter_flags = CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED;
3876 4101
3877 /* 4102 /*
3878 * When a seq_file is seeked, it's always traversed sequentially 4103 * When a seq_file is seeked, it's always traversed sequentially
@@ -3895,6 +4120,23 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
3895 return cgroup_procs_next(s, NULL, NULL); 4120 return cgroup_procs_next(s, NULL, NULL);
3896} 4121}
3897 4122
4123static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4124{
4125 struct cgroup *cgrp = seq_css(s)->cgroup;
4126
4127 /*
4128 * All processes of a threaded subtree belong to the domain cgroup
4129 * of the subtree. Only threads can be distributed across the
4130 * subtree. Reject reads on cgroup.procs in the subtree proper.
4131 * They're always empty anyway.
4132 */
4133 if (cgroup_is_threaded(cgrp))
4134 return ERR_PTR(-EOPNOTSUPP);
4135
4136 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4137 CSS_TASK_ITER_THREADED);
4138}
4139
3898static int cgroup_procs_show(struct seq_file *s, void *v) 4140static int cgroup_procs_show(struct seq_file *s, void *v)
3899{ 4141{
3900 seq_printf(s, "%d\n", task_pid_vnr(v)); 4142 seq_printf(s, "%d\n", task_pid_vnr(v));
@@ -3974,9 +4216,64 @@ out_unlock:
3974 return ret ?: nbytes; 4216 return ret ?: nbytes;
3975} 4217}
3976 4218
4219static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4220{
4221 return __cgroup_procs_start(s, pos, 0);
4222}
4223
4224static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4225 char *buf, size_t nbytes, loff_t off)
4226{
4227 struct cgroup *src_cgrp, *dst_cgrp;
4228 struct task_struct *task;
4229 ssize_t ret;
4230
4231 buf = strstrip(buf);
4232
4233 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4234 if (!dst_cgrp)
4235 return -ENODEV;
4236
4237 task = cgroup_procs_write_start(buf, false);
4238 ret = PTR_ERR_OR_ZERO(task);
4239 if (ret)
4240 goto out_unlock;
4241
4242 /* find the source cgroup */
4243 spin_lock_irq(&css_set_lock);
4244 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4245 spin_unlock_irq(&css_set_lock);
4246
4247 /* thread migrations follow the cgroup.procs delegation rule */
4248 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4249 of->file->f_path.dentry->d_sb);
4250 if (ret)
4251 goto out_finish;
4252
4253 /* and must be contained in the same domain */
4254 ret = -EOPNOTSUPP;
4255 if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
4256 goto out_finish;
4257
4258 ret = cgroup_attach_task(dst_cgrp, task, false);
4259
4260out_finish:
4261 cgroup_procs_write_finish(task);
4262out_unlock:
4263 cgroup_kn_unlock(of->kn);
4264
4265 return ret ?: nbytes;
4266}
4267
3977/* cgroup core interface files for the default hierarchy */ 4268/* cgroup core interface files for the default hierarchy */
3978static struct cftype cgroup_base_files[] = { 4269static struct cftype cgroup_base_files[] = {
3979 { 4270 {
4271 .name = "cgroup.type",
4272 .flags = CFTYPE_NOT_ON_ROOT,
4273 .seq_show = cgroup_type_show,
4274 .write = cgroup_type_write,
4275 },
4276 {
3980 .name = "cgroup.procs", 4277 .name = "cgroup.procs",
3981 .flags = CFTYPE_NS_DELEGATABLE, 4278 .flags = CFTYPE_NS_DELEGATABLE,
3982 .file_offset = offsetof(struct cgroup, procs_file), 4279 .file_offset = offsetof(struct cgroup, procs_file),
@@ -3987,6 +4284,14 @@ static struct cftype cgroup_base_files[] = {
3987 .write = cgroup_procs_write, 4284 .write = cgroup_procs_write,
3988 }, 4285 },
3989 { 4286 {
4287 .name = "cgroup.threads",
4288 .release = cgroup_procs_release,
4289 .seq_start = cgroup_threads_start,
4290 .seq_next = cgroup_procs_next,
4291 .seq_show = cgroup_procs_show,
4292 .write = cgroup_threads_write,
4293 },
4294 {
3990 .name = "cgroup.controllers", 4295 .name = "cgroup.controllers",
3991 .seq_show = cgroup_controllers_show, 4296 .seq_show = cgroup_controllers_show,
3992 }, 4297 },
@@ -4753,11 +5058,17 @@ int __init cgroup_init(void)
4753 5058
4754 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 5059 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4755 5060
5061 /* implicit controllers must be threaded too */
5062 WARN_ON(ss->implicit_on_dfl && !ss->threaded);
5063
4756 if (ss->implicit_on_dfl) 5064 if (ss->implicit_on_dfl)
4757 cgrp_dfl_implicit_ss_mask |= 1 << ss->id; 5065 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
4758 else if (!ss->dfl_cftypes) 5066 else if (!ss->dfl_cftypes)
4759 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; 5067 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
4760 5068
5069 if (ss->threaded)
5070 cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
5071
4761 if (ss->dfl_cftypes == ss->legacy_cftypes) { 5072 if (ss->dfl_cftypes == ss->legacy_cftypes) {
4762 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); 5073 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
4763 } else { 5074 } else {
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index dac46af22782..787a242fa69d 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -352,6 +352,7 @@ static int __init enable_cgroup_debug(char *str)
352{ 352{
353 debug_cgrp_subsys.dfl_cftypes = debug_files; 353 debug_cgrp_subsys.dfl_cftypes = debug_files;
354 debug_cgrp_subsys.implicit_on_dfl = true; 354 debug_cgrp_subsys.implicit_on_dfl = true;
355 debug_cgrp_subsys.threaded = true;
355 return 1; 356 return 1;
356} 357}
357__setup("cgroup_debug", enable_cgroup_debug); 358__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 2237201d66d5..9829c67ebc0a 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = {
345 .free = pids_free, 345 .free = pids_free,
346 .legacy_cftypes = pids_files, 346 .legacy_cftypes = pids_files,
347 .dfl_cftypes = pids_files, 347 .dfl_cftypes = pids_files,
348 .threaded = true,
348}; 349};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1538df9b2b65..ec78247da310 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11210,5 +11210,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
11210 * controller is not mounted on a legacy hierarchy. 11210 * controller is not mounted on a legacy hierarchy.
11211 */ 11211 */
11212 .implicit_on_dfl = true, 11212 .implicit_on_dfl = true,
11213 .threaded = true,
11213}; 11214};
11214#endif /* CONFIG_CGROUP_PERF */ 11215#endif /* CONFIG_CGROUP_PERF */