diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-02 11:04:23 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-02 11:04:23 -0400 |
commit | 8bdc69b764013a9b5ebeef7df8f314f1066c5d79 (patch) | |
tree | 335dcb29c9ba06142917121d551575d360eca63e | |
parent | 76ec51ef5edfe540bbc3c61b860f88deb8e6a37b (diff) | |
parent | 20f1f4b5ffb870631bf4a4e7c7ba10e3528ae6a6 (diff) |
Merge branch 'for-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- a new PIDs controller is added. It turns out that PIDs are actually
an independent resource from kmem due to the limited PID space.
- more core preparations for the v2 interface. Once cpu side interface
is settled, it should be ready for lifting the devel mask.
for-4.3-unified-base was temporarily branched so that other trees
(block) can pull cgroup core changes that blkcg changes depend on.
- a non-critical idr_preload usage bug fix.
* 'for-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup: pids: fix invalid get/put usage
cgroup: introduce cgroup_subsys->legacy_name
cgroup: don't print subsystems for the default hierarchy
cgroup: make cftype->private a unsigned long
cgroup: export cgrp_dfl_root
cgroup: define controller file conventions
cgroup: fix idr_preload usage
cgroup: add documentation for the PIDs controller
cgroup: implement the PIDs subsystem
cgroup: allow a cgroup subsystem to reject a fork
-rw-r--r-- | CREDITS | 5 | ||||
-rw-r--r-- | Documentation/cgroups/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/cgroups/pids.txt | 85 | ||||
-rw-r--r-- | Documentation/cgroups/unified-hierarchy.txt | 80 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 15 | ||||
-rw-r--r-- | include/linux/cgroup.h | 24 | ||||
-rw-r--r-- | include/linux/cgroup_subsys.h | 28 | ||||
-rw-r--r-- | init/Kconfig | 16 | ||||
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/cgroup.c | 122 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 2 | ||||
-rw-r--r-- | kernel/cgroup_pids.c | 355 | ||||
-rw-r--r-- | kernel/fork.c | 17 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 |
14 files changed, 717 insertions, 37 deletions
@@ -3219,6 +3219,11 @@ S: 69 rue Dunois | |||
3219 | S: 75013 Paris | 3219 | S: 75013 Paris |
3220 | S: France | 3220 | S: France |
3221 | 3221 | ||
3222 | N: Aleksa Sarai | ||
3223 | E: cyphar@cyphar.com | ||
3224 | W: https://www.cyphar.com/ | ||
3225 | D: `pids` cgroup subsystem | ||
3226 | |||
3222 | N: Dipankar Sarma | 3227 | N: Dipankar Sarma |
3223 | E: dipankar@in.ibm.com | 3228 | E: dipankar@in.ibm.com |
3224 | D: RCU | 3229 | D: RCU |
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX index 96ce071a3633..3f5a40f57d4a 100644 --- a/Documentation/cgroups/00-INDEX +++ b/Documentation/cgroups/00-INDEX | |||
@@ -22,6 +22,8 @@ net_cls.txt | |||
22 | - Network classifier cgroups details and usages. | 22 | - Network classifier cgroups details and usages. |
23 | net_prio.txt | 23 | net_prio.txt |
24 | - Network priority cgroups details and usages. | 24 | - Network priority cgroups details and usages. |
25 | pids.txt | ||
26 | - Process number cgroups details and usages. | ||
25 | resource_counter.txt | 27 | resource_counter.txt |
26 | - Resource Counter API. | 28 | - Resource Counter API. |
27 | unified-hierarchy.txt | 29 | unified-hierarchy.txt |
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroups/pids.txt new file mode 100644 index 000000000000..1a078b5d281a --- /dev/null +++ b/Documentation/cgroups/pids.txt | |||
@@ -0,0 +1,85 @@ | |||
1 | Process Number Controller | ||
2 | ========================= | ||
3 | |||
4 | Abstract | ||
5 | -------- | ||
6 | |||
7 | The process number controller is used to allow a cgroup hierarchy to stop any | ||
8 | new tasks from being fork()'d or clone()'d after a certain limit is reached. | ||
9 | |||
10 | Since it is trivial to hit the task limit without hitting any kmemcg limits in | ||
11 | place, PIDs are a fundamental resource. As such, PID exhaustion must be | ||
12 | preventable in the scope of a cgroup hierarchy by allowing resource limiting of | ||
13 | the number of tasks in a cgroup. | ||
14 | |||
15 | Usage | ||
16 | ----- | ||
17 | |||
18 | In order to use the `pids` controller, set the maximum number of tasks in | ||
19 | pids.max (this is not available in the root cgroup for obvious reasons). The | ||
20 | number of processes currently in the cgroup is given by pids.current. | ||
21 | |||
22 | Organisational operations are not blocked by cgroup policies, so it is possible | ||
23 | to have pids.current > pids.max. This can be done by either setting the limit to | ||
24 | be smaller than pids.current, or attaching enough processes to the cgroup such | ||
25 | that pids.current > pids.max. However, it is not possible to violate a cgroup | ||
26 | policy through fork() or clone(). fork() and clone() will return -EAGAIN if the | ||
27 | creation of a new process would cause a cgroup policy to be violated. | ||
28 | |||
29 | To set a cgroup to have no limit, set pids.max to "max". This is the default for | ||
30 | all new cgroups (N.B. that PID limits are hierarchical, so the most stringent | ||
31 | limit in the hierarchy is followed). | ||
32 | |||
33 | pids.current tracks all child cgroup hierarchies, so parent/pids.current is a | ||
34 | superset of parent/child/pids.current. | ||
35 | |||
36 | Example | ||
37 | ------- | ||
38 | |||
39 | First, we mount the pids controller: | ||
40 | # mkdir -p /sys/fs/cgroup/pids | ||
41 | # mount -t cgroup -o pids none /sys/fs/cgroup/pids | ||
42 | |||
43 | Then we create a hierarchy, set limits and attach processes to it: | ||
44 | # mkdir -p /sys/fs/cgroup/pids/parent/child | ||
45 | # echo 2 > /sys/fs/cgroup/pids/parent/pids.max | ||
46 | # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs | ||
47 | # cat /sys/fs/cgroup/pids/parent/pids.current | ||
48 | 2 | ||
49 | # | ||
50 | |||
51 | It should be noted that attempts to overcome the set limit (2 in this case) will | ||
52 | fail: | ||
53 | |||
54 | # cat /sys/fs/cgroup/pids/parent/pids.current | ||
55 | 2 | ||
56 | # ( /bin/echo "Here's some processes for you." | cat ) | ||
57 | sh: fork: Resource temporarily unavailable | ||
58 | # | ||
59 | |||
60 | Even if we migrate to a child cgroup (which doesn't have a set limit), we will | ||
61 | not be able to overcome the most stringent limit in the hierarchy (in this case, | ||
62 | parent's): | ||
63 | |||
64 | # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs | ||
65 | # cat /sys/fs/cgroup/pids/parent/pids.current | ||
66 | 2 | ||
67 | # cat /sys/fs/cgroup/pids/parent/child/pids.current | ||
68 | 2 | ||
69 | # cat /sys/fs/cgroup/pids/parent/child/pids.max | ||
70 | max | ||
71 | # ( /bin/echo "Here's some processes for you." | cat ) | ||
72 | sh: fork: Resource temporarily unavailable | ||
73 | # | ||
74 | |||
75 | We can set a limit that is smaller than pids.current, which will stop any new | ||
76 | processes from being forked at all (note that the shell itself counts towards | ||
77 | pids.current): | ||
78 | |||
79 | # echo 1 > /sys/fs/cgroup/pids/parent/pids.max | ||
80 | # /bin/echo "We can't even spawn a single process now." | ||
81 | sh: fork: Resource temporarily unavailable | ||
82 | # echo 0 > /sys/fs/cgroup/pids/parent/pids.max | ||
83 | # /bin/echo "We can't even spawn a single process now." | ||
84 | sh: fork: Resource temporarily unavailable | ||
85 | # | ||
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 86847a7647ab..1ee9caf29e57 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt | |||
@@ -23,10 +23,13 @@ CONTENTS | |||
23 | 5. Other Changes | 23 | 5. Other Changes |
24 | 5-1. [Un]populated Notification | 24 | 5-1. [Un]populated Notification |
25 | 5-2. Other Core Changes | 25 | 5-2. Other Core Changes |
26 | 5-3. Per-Controller Changes | 26 | 5-3. Controller File Conventions |
27 | 5-3-1. blkio | 27 | 5-3-1. Format |
28 | 5-3-2. cpuset | 28 | 5-3-2. Control Knobs |
29 | 5-3-3. memory | 29 | 5-4. Per-Controller Changes |
30 | 5-4-1. blkio | ||
31 | 5-4-2. cpuset | ||
32 | 5-4-3. memory | ||
30 | 6. Planned Changes | 33 | 6. Planned Changes |
31 | 6-1. CAP for resource control | 34 | 6-1. CAP for resource control |
32 | 35 | ||
@@ -372,14 +375,75 @@ supported and the interface files "release_agent" and | |||
372 | - The "cgroup.clone_children" file is removed. | 375 | - The "cgroup.clone_children" file is removed. |
373 | 376 | ||
374 | 377 | ||
375 | 5-3. Per-Controller Changes | 378 | 5-3. Controller File Conventions |
376 | 379 | ||
377 | 5-3-1. blkio | 380 | 5-3-1. Format |
381 | |||
382 | In general, all controller files should be in one of the following | ||
383 | formats whenever possible. | ||
384 | |||
385 | - Values only files | ||
386 | |||
387 | VAL0 VAL1...\n | ||
388 | |||
389 | - Flat keyed files | ||
390 | |||
391 | KEY0 VAL0\n | ||
392 | KEY1 VAL1\n | ||
393 | ... | ||
394 | |||
395 | - Nested keyed files | ||
396 | |||
397 | KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01... | ||
398 | KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11... | ||
399 | ... | ||
400 | |||
401 | For a writeable file, the format for writing should generally match | ||
402 | reading; however, controllers may allow omitting later fields or | ||
403 | implement restricted shortcuts for most common use cases. | ||
404 | |||
405 | For both flat and nested keyed files, only the values for a single key | ||
406 | can be written at a time. For nested keyed files, the sub key pairs | ||
407 | may be specified in any order and not all pairs have to be specified. | ||
408 | |||
409 | |||
410 | 5-3-2. Control Knobs | ||
411 | |||
412 | - Settings for a single feature should generally be implemented in a | ||
413 | single file. | ||
414 | |||
415 | - In general, the root cgroup should be exempt from resource control | ||
416 | and thus shouldn't have resource control knobs. | ||
417 | |||
418 | - If a controller implements ratio based resource distribution, the | ||
419 | control knob should be named "weight" and have the range [1, 10000] | ||
420 | and 100 should be the default value. The values are chosen to allow | ||
421 | enough and symmetric bias in both directions while keeping it | ||
422 | intuitive (the default is 100%). | ||
423 | |||
424 | - If a controller implements an absolute resource guarantee and/or | ||
425 | limit, the control knobs should be named "min" and "max" | ||
426 | respectively. If a controller implements best effort resource | ||
427 | guarantee and/or limit, the control knobs should be named "low" and | ||
428 | "high" respectively. | ||
429 | |||
430 | In the above four control files, the special token "max" should be | ||
431 | used to represent upward infinity for both reading and writing. | ||
432 | |||
433 | - If a setting has configurable default value and specific overrides, | ||
434 | the default settings should be keyed with "default" and appear as | ||
435 | the first entry in the file. Specific entries can use "default" as | ||
436 | its value to indicate inheritance of the default value. | ||
437 | |||
438 | |||
439 | 5-4. Per-Controller Changes | ||
440 | |||
441 | 5-4-1. blkio | ||
378 | 442 | ||
379 | - blk-throttle becomes properly hierarchical. | 443 | - blk-throttle becomes properly hierarchical. |
380 | 444 | ||
381 | 445 | ||
382 | 5-3-2. cpuset | 446 | 5-4-2. cpuset |
383 | 447 | ||
384 | - Tasks are kept in empty cpusets after hotplug and take on the masks | 448 | - Tasks are kept in empty cpusets after hotplug and take on the masks |
385 | of the nearest non-empty ancestor, instead of being moved to it. | 449 | of the nearest non-empty ancestor, instead of being moved to it. |
@@ -388,7 +452,7 @@ supported and the interface files "release_agent" and | |||
388 | masks of the nearest non-empty ancestor. | 452 | masks of the nearest non-empty ancestor. |
389 | 453 | ||
390 | 454 | ||
391 | 5-3-3. memory | 455 | 5-4-3. memory |
392 | 456 | ||
393 | - use_hierarchy is on by default and the cgroup file for the flag is | 457 | - use_hierarchy is on by default and the cgroup file for the flag is |
394 | not created. | 458 | not created. |
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 93755a629299..4d8fcf2187dc 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
@@ -34,12 +34,17 @@ struct seq_file; | |||
34 | 34 | ||
35 | /* define the enumeration of all cgroup subsystems */ | 35 | /* define the enumeration of all cgroup subsystems */ |
36 | #define SUBSYS(_x) _x ## _cgrp_id, | 36 | #define SUBSYS(_x) _x ## _cgrp_id, |
37 | #define SUBSYS_TAG(_t) CGROUP_ ## _t, \ | ||
38 | __unused_tag_ ## _t = CGROUP_ ## _t - 1, | ||
37 | enum cgroup_subsys_id { | 39 | enum cgroup_subsys_id { |
38 | #include <linux/cgroup_subsys.h> | 40 | #include <linux/cgroup_subsys.h> |
39 | CGROUP_SUBSYS_COUNT, | 41 | CGROUP_SUBSYS_COUNT, |
40 | }; | 42 | }; |
43 | #undef SUBSYS_TAG | ||
41 | #undef SUBSYS | 44 | #undef SUBSYS |
42 | 45 | ||
46 | #define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START) | ||
47 | |||
43 | /* bits in struct cgroup_subsys_state flags field */ | 48 | /* bits in struct cgroup_subsys_state flags field */ |
44 | enum { | 49 | enum { |
45 | CSS_NO_REF = (1 << 0), /* no reference counting for this css */ | 50 | CSS_NO_REF = (1 << 0), /* no reference counting for this css */ |
@@ -318,7 +323,7 @@ struct cftype { | |||
318 | * end of cftype array. | 323 | * end of cftype array. |
319 | */ | 324 | */ |
320 | char name[MAX_CFTYPE_NAME]; | 325 | char name[MAX_CFTYPE_NAME]; |
321 | int private; | 326 | unsigned long private; |
322 | /* | 327 | /* |
323 | * If not 0, file mode is set to this value, otherwise it will | 328 | * If not 0, file mode is set to this value, otherwise it will |
324 | * be figured out automatically | 329 | * be figured out automatically |
@@ -406,7 +411,9 @@ struct cgroup_subsys { | |||
406 | struct cgroup_taskset *tset); | 411 | struct cgroup_taskset *tset); |
407 | void (*attach)(struct cgroup_subsys_state *css, | 412 | void (*attach)(struct cgroup_subsys_state *css, |
408 | struct cgroup_taskset *tset); | 413 | struct cgroup_taskset *tset); |
409 | void (*fork)(struct task_struct *task); | 414 | int (*can_fork)(struct task_struct *task, void **priv_p); |
415 | void (*cancel_fork)(struct task_struct *task, void *priv); | ||
416 | void (*fork)(struct task_struct *task, void *priv); | ||
410 | void (*exit)(struct cgroup_subsys_state *css, | 417 | void (*exit)(struct cgroup_subsys_state *css, |
411 | struct cgroup_subsys_state *old_css, | 418 | struct cgroup_subsys_state *old_css, |
412 | struct task_struct *task); | 419 | struct task_struct *task); |
@@ -434,6 +441,9 @@ struct cgroup_subsys { | |||
434 | int id; | 441 | int id; |
435 | const char *name; | 442 | const char *name; |
436 | 443 | ||
444 | /* optional, initialized automatically during boot if not set */ | ||
445 | const char *legacy_name; | ||
446 | |||
437 | /* link to parent, protected by cgroup_lock() */ | 447 | /* link to parent, protected by cgroup_lock() */ |
438 | struct cgroup_root *root; | 448 | struct cgroup_root *root; |
439 | 449 | ||
@@ -491,6 +501,7 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) | |||
491 | 501 | ||
492 | #else /* CONFIG_CGROUPS */ | 502 | #else /* CONFIG_CGROUPS */ |
493 | 503 | ||
504 | #define CGROUP_CANFORK_COUNT 0 | ||
494 | #define CGROUP_SUBSYS_COUNT 0 | 505 | #define CGROUP_SUBSYS_COUNT 0 |
495 | 506 | ||
496 | static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} | 507 | static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a593e299162e..eb7ca55f72ef 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -22,6 +22,15 @@ | |||
22 | 22 | ||
23 | #ifdef CONFIG_CGROUPS | 23 | #ifdef CONFIG_CGROUPS |
24 | 24 | ||
25 | /* | ||
26 | * All weight knobs on the default hierarchy should use the following min, | ||
27 | * default and max values. The default value is the logarithmic center of | ||
28 | * MIN and MAX and allows 100x to be expressed in both directions. | ||
29 | */ | ||
30 | #define CGROUP_WEIGHT_MIN 1 | ||
31 | #define CGROUP_WEIGHT_DFL 100 | ||
32 | #define CGROUP_WEIGHT_MAX 10000 | ||
33 | |||
25 | /* a css_task_iter should be treated as an opaque object */ | 34 | /* a css_task_iter should be treated as an opaque object */ |
26 | struct css_task_iter { | 35 | struct css_task_iter { |
27 | struct cgroup_subsys *ss; | 36 | struct cgroup_subsys *ss; |
@@ -62,7 +71,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
62 | struct pid *pid, struct task_struct *tsk); | 71 | struct pid *pid, struct task_struct *tsk); |
63 | 72 | ||
64 | void cgroup_fork(struct task_struct *p); | 73 | void cgroup_fork(struct task_struct *p); |
65 | void cgroup_post_fork(struct task_struct *p); | 74 | extern int cgroup_can_fork(struct task_struct *p, |
75 | void *ss_priv[CGROUP_CANFORK_COUNT]); | ||
76 | extern void cgroup_cancel_fork(struct task_struct *p, | ||
77 | void *ss_priv[CGROUP_CANFORK_COUNT]); | ||
78 | extern void cgroup_post_fork(struct task_struct *p, | ||
79 | void *old_ss_priv[CGROUP_CANFORK_COUNT]); | ||
66 | void cgroup_exit(struct task_struct *p); | 80 | void cgroup_exit(struct task_struct *p); |
67 | 81 | ||
68 | int cgroup_init_early(void); | 82 | int cgroup_init_early(void); |
@@ -524,7 +538,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats, | |||
524 | struct dentry *dentry) { return -EINVAL; } | 538 | struct dentry *dentry) { return -EINVAL; } |
525 | 539 | ||
526 | static inline void cgroup_fork(struct task_struct *p) {} | 540 | static inline void cgroup_fork(struct task_struct *p) {} |
527 | static inline void cgroup_post_fork(struct task_struct *p) {} | 541 | static inline int cgroup_can_fork(struct task_struct *p, |
542 | void *ss_priv[CGROUP_CANFORK_COUNT]) | ||
543 | { return 0; } | ||
544 | static inline void cgroup_cancel_fork(struct task_struct *p, | ||
545 | void *ss_priv[CGROUP_CANFORK_COUNT]) {} | ||
546 | static inline void cgroup_post_fork(struct task_struct *p, | ||
547 | void *ss_priv[CGROUP_CANFORK_COUNT]) {} | ||
528 | static inline void cgroup_exit(struct task_struct *p) {} | 548 | static inline void cgroup_exit(struct task_struct *p) {} |
529 | 549 | ||
530 | static inline int cgroup_init_early(void) { return 0; } | 550 | static inline int cgroup_init_early(void) { return 0; } |
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e4a96fb14403..1f36945fd23d 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h | |||
@@ -3,6 +3,17 @@ | |||
3 | * | 3 | * |
4 | * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. | 4 | * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. |
5 | */ | 5 | */ |
6 | |||
7 | /* | ||
8 | * This file *must* be included with SUBSYS() defined. | ||
9 | * SUBSYS_TAG() is a noop if undefined. | ||
10 | */ | ||
11 | |||
12 | #ifndef SUBSYS_TAG | ||
13 | #define __TMP_SUBSYS_TAG | ||
14 | #define SUBSYS_TAG(_x) | ||
15 | #endif | ||
16 | |||
6 | #if IS_ENABLED(CONFIG_CPUSETS) | 17 | #if IS_ENABLED(CONFIG_CPUSETS) |
7 | SUBSYS(cpuset) | 18 | SUBSYS(cpuset) |
8 | #endif | 19 | #endif |
@@ -48,11 +59,28 @@ SUBSYS(hugetlb) | |||
48 | #endif | 59 | #endif |
49 | 60 | ||
50 | /* | 61 | /* |
62 | * Subsystems that implement the can_fork() family of callbacks. | ||
63 | */ | ||
64 | SUBSYS_TAG(CANFORK_START) | ||
65 | |||
66 | #if IS_ENABLED(CONFIG_CGROUP_PIDS) | ||
67 | SUBSYS(pids) | ||
68 | #endif | ||
69 | |||
70 | SUBSYS_TAG(CANFORK_END) | ||
71 | |||
72 | /* | ||
51 | * The following subsystems are not supported on the default hierarchy. | 73 | * The following subsystems are not supported on the default hierarchy. |
52 | */ | 74 | */ |
53 | #if IS_ENABLED(CONFIG_CGROUP_DEBUG) | 75 | #if IS_ENABLED(CONFIG_CGROUP_DEBUG) |
54 | SUBSYS(debug) | 76 | SUBSYS(debug) |
55 | #endif | 77 | #endif |
78 | |||
79 | #ifdef __TMP_SUBSYS_TAG | ||
80 | #undef __TMP_SUBSYS_TAG | ||
81 | #undef SUBSYS_TAG | ||
82 | #endif | ||
83 | |||
56 | /* | 84 | /* |
57 | * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. | 85 | * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. |
58 | */ | 86 | */ |
diff --git a/init/Kconfig b/init/Kconfig index ba1e6eaf4c36..bb9b4dd55889 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -947,6 +947,22 @@ config CGROUP_FREEZER | |||
947 | Provides a way to freeze and unfreeze all tasks in a | 947 | Provides a way to freeze and unfreeze all tasks in a |
948 | cgroup. | 948 | cgroup. |
949 | 949 | ||
950 | config CGROUP_PIDS | ||
951 | bool "PIDs cgroup subsystem" | ||
952 | help | ||
953 | Provides enforcement of process number limits in the scope of a | ||
954 | cgroup. Any attempt to fork more processes than is allowed in the | ||
955 | cgroup will fail. PIDs are fundamentally a global resource because it | ||
956 | is fairly trivial to reach PID exhaustion before you reach even a | ||
957 | conservative kmemcg limit. As a result, it is possible to grind a | ||
958 | system to halt without being limited by other cgroup policies. The | ||
959 | PIDs cgroup subsystem is designed to stop this from happening. | ||
960 | |||
961 | It should be noted that organisational operations (such as attaching | ||
962 | to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem, | ||
963 | since the PIDs limit only affects a process's ability to fork, not to | ||
964 | attach to a cgroup. | ||
965 | |||
950 | config CGROUP_DEVICE | 966 | config CGROUP_DEVICE |
951 | bool "Device controller for cgroups" | 967 | bool "Device controller for cgroups" |
952 | help | 968 | help |
diff --git a/kernel/Makefile b/kernel/Makefile index 43c4c920f30a..718fb8afab7a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o | |||
55 | obj-$(CONFIG_COMPAT) += compat.o | 55 | obj-$(CONFIG_COMPAT) += compat.o |
56 | obj-$(CONFIG_CGROUPS) += cgroup.o | 56 | obj-$(CONFIG_CGROUPS) += cgroup.o |
57 | obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o | 57 | obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o |
58 | obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o | ||
58 | obj-$(CONFIG_CPUSETS) += cpuset.o | 59 | obj-$(CONFIG_CPUSETS) += cpuset.o |
59 | obj-$(CONFIG_UTS_NS) += utsname.o | 60 | obj-$(CONFIG_UTS_NS) += utsname.o |
60 | obj-$(CONFIG_USER_NS) += user_namespace.o | 61 | obj-$(CONFIG_USER_NS) += user_namespace.o |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b89f3168411b..f3f5cd5e2c0d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = { | |||
145 | * part of that cgroup. | 145 | * part of that cgroup. |
146 | */ | 146 | */ |
147 | struct cgroup_root cgrp_dfl_root; | 147 | struct cgroup_root cgrp_dfl_root; |
148 | EXPORT_SYMBOL_GPL(cgrp_dfl_root); | ||
148 | 149 | ||
149 | /* | 150 | /* |
150 | * The default hierarchy always exists but is hidden until mounted for the | 151 | * The default hierarchy always exists but is hidden until mounted for the |
@@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1; | |||
186 | static unsigned long have_fork_callback __read_mostly; | 187 | static unsigned long have_fork_callback __read_mostly; |
187 | static unsigned long have_exit_callback __read_mostly; | 188 | static unsigned long have_exit_callback __read_mostly; |
188 | 189 | ||
190 | /* Ditto for the can_fork callback. */ | ||
191 | static unsigned long have_canfork_callback __read_mostly; | ||
192 | |||
189 | static struct cftype cgroup_dfl_base_files[]; | 193 | static struct cftype cgroup_dfl_base_files[]; |
190 | static struct cftype cgroup_legacy_base_files[]; | 194 | static struct cftype cgroup_legacy_base_files[]; |
191 | 195 | ||
@@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | |||
207 | 211 | ||
208 | idr_preload(gfp_mask); | 212 | idr_preload(gfp_mask); |
209 | spin_lock_bh(&cgroup_idr_lock); | 213 | spin_lock_bh(&cgroup_idr_lock); |
210 | ret = idr_alloc(idr, ptr, start, end, gfp_mask); | 214 | ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); |
211 | spin_unlock_bh(&cgroup_idr_lock); | 215 | spin_unlock_bh(&cgroup_idr_lock); |
212 | idr_preload_end(); | 216 | idr_preload_end(); |
213 | return ret; | 217 | return ret; |
@@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations; | |||
1027 | static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, | 1031 | static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, |
1028 | char *buf) | 1032 | char *buf) |
1029 | { | 1033 | { |
1034 | struct cgroup_subsys *ss = cft->ss; | ||
1035 | |||
1030 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && | 1036 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && |
1031 | !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) | 1037 | !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) |
1032 | snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", | 1038 | snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", |
1033 | cft->ss->name, cft->name); | 1039 | cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, |
1040 | cft->name); | ||
1034 | else | 1041 | else |
1035 | strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); | 1042 | strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); |
1036 | return buf; | 1043 | return buf; |
@@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1332 | struct cgroup_subsys *ss; | 1339 | struct cgroup_subsys *ss; |
1333 | int ssid; | 1340 | int ssid; |
1334 | 1341 | ||
1335 | for_each_subsys(ss, ssid) | 1342 | if (root != &cgrp_dfl_root) |
1336 | if (root->subsys_mask & (1 << ssid)) | 1343 | for_each_subsys(ss, ssid) |
1337 | seq_printf(seq, ",%s", ss->name); | 1344 | if (root->subsys_mask & (1 << ssid)) |
1345 | seq_printf(seq, ",%s", ss->legacy_name); | ||
1338 | if (root->flags & CGRP_ROOT_NOPREFIX) | 1346 | if (root->flags & CGRP_ROOT_NOPREFIX) |
1339 | seq_puts(seq, ",noprefix"); | 1347 | seq_puts(seq, ",noprefix"); |
1340 | if (root->flags & CGRP_ROOT_XATTR) | 1348 | if (root->flags & CGRP_ROOT_XATTR) |
@@ -1447,7 +1455,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1447 | } | 1455 | } |
1448 | 1456 | ||
1449 | for_each_subsys(ss, i) { | 1457 | for_each_subsys(ss, i) { |
1450 | if (strcmp(token, ss->name)) | 1458 | if (strcmp(token, ss->legacy_name)) |
1451 | continue; | 1459 | continue; |
1452 | if (ss->disabled) | 1460 | if (ss->disabled) |
1453 | continue; | 1461 | continue; |
@@ -1666,7 +1674,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1666 | 1674 | ||
1667 | lockdep_assert_held(&cgroup_mutex); | 1675 | lockdep_assert_held(&cgroup_mutex); |
1668 | 1676 | ||
1669 | ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); | 1677 | ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); |
1670 | if (ret < 0) | 1678 | if (ret < 0) |
1671 | goto out; | 1679 | goto out; |
1672 | root_cgrp->id = ret; | 1680 | root_cgrp->id = ret; |
@@ -4579,7 +4587,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
4579 | if (err) | 4587 | if (err) |
4580 | goto err_free_css; | 4588 | goto err_free_css; |
4581 | 4589 | ||
4582 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); | 4590 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); |
4583 | if (err < 0) | 4591 | if (err < 0) |
4584 | goto err_free_percpu_ref; | 4592 | goto err_free_percpu_ref; |
4585 | css->id = err; | 4593 | css->id = err; |
@@ -4656,7 +4664,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4656 | * Temporarily set the pointer to NULL, so idr_find() won't return | 4664 | * Temporarily set the pointer to NULL, so idr_find() won't return |
4657 | * a half-baked cgroup. | 4665 | * a half-baked cgroup. |
4658 | */ | 4666 | */ |
4659 | cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); | 4667 | cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); |
4660 | if (cgrp->id < 0) { | 4668 | if (cgrp->id < 0) { |
4661 | ret = -ENOMEM; | 4669 | ret = -ENOMEM; |
4662 | goto out_cancel_ref; | 4670 | goto out_cancel_ref; |
@@ -4955,6 +4963,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) | |||
4955 | 4963 | ||
4956 | have_fork_callback |= (bool)ss->fork << ss->id; | 4964 | have_fork_callback |= (bool)ss->fork << ss->id; |
4957 | have_exit_callback |= (bool)ss->exit << ss->id; | 4965 | have_exit_callback |= (bool)ss->exit << ss->id; |
4966 | have_canfork_callback |= (bool)ss->can_fork << ss->id; | ||
4958 | 4967 | ||
4959 | /* At system boot, before all subsystems have been | 4968 | /* At system boot, before all subsystems have been |
4960 | * registered, no tasks have been forked, so we don't | 4969 | * registered, no tasks have been forked, so we don't |
@@ -4993,6 +5002,8 @@ int __init cgroup_init_early(void) | |||
4993 | 5002 | ||
4994 | ss->id = i; | 5003 | ss->id = i; |
4995 | ss->name = cgroup_subsys_name[i]; | 5004 | ss->name = cgroup_subsys_name[i]; |
5005 | if (!ss->legacy_name) | ||
5006 | ss->legacy_name = cgroup_subsys_name[i]; | ||
4996 | 5007 | ||
4997 | if (ss->early_init) | 5008 | if (ss->early_init) |
4998 | cgroup_init_subsys(ss, true); | 5009 | cgroup_init_subsys(ss, true); |
@@ -5136,9 +5147,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5136 | continue; | 5147 | continue; |
5137 | 5148 | ||
5138 | seq_printf(m, "%d:", root->hierarchy_id); | 5149 | seq_printf(m, "%d:", root->hierarchy_id); |
5139 | for_each_subsys(ss, ssid) | 5150 | if (root != &cgrp_dfl_root) |
5140 | if (root->subsys_mask & (1 << ssid)) | 5151 | for_each_subsys(ss, ssid) |
5141 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 5152 | if (root->subsys_mask & (1 << ssid)) |
5153 | seq_printf(m, "%s%s", count++ ? "," : "", | ||
5154 | ss->legacy_name); | ||
5142 | if (strlen(root->name)) | 5155 | if (strlen(root->name)) |
5143 | seq_printf(m, "%sname=%s", count ? "," : "", | 5156 | seq_printf(m, "%sname=%s", count ? "," : "", |
5144 | root->name); | 5157 | root->name); |
@@ -5178,7 +5191,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
5178 | 5191 | ||
5179 | for_each_subsys(ss, i) | 5192 | for_each_subsys(ss, i) |
5180 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 5193 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
5181 | ss->name, ss->root->hierarchy_id, | 5194 | ss->legacy_name, ss->root->hierarchy_id, |
5182 | atomic_read(&ss->root->nr_cgrps), !ss->disabled); | 5195 | atomic_read(&ss->root->nr_cgrps), !ss->disabled); |
5183 | 5196 | ||
5184 | mutex_unlock(&cgroup_mutex); | 5197 | mutex_unlock(&cgroup_mutex); |
@@ -5197,6 +5210,19 @@ static const struct file_operations proc_cgroupstats_operations = { | |||
5197 | .release = single_release, | 5210 | .release = single_release, |
5198 | }; | 5211 | }; |
5199 | 5212 | ||
5213 | static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) | ||
5214 | { | ||
5215 | if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) | ||
5216 | return &ss_priv[i - CGROUP_CANFORK_START]; | ||
5217 | return NULL; | ||
5218 | } | ||
5219 | |||
5220 | static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) | ||
5221 | { | ||
5222 | void **private = subsys_canfork_priv_p(ss_priv, i); | ||
5223 | return private ? *private : NULL; | ||
5224 | } | ||
5225 | |||
5200 | /** | 5226 | /** |
5201 | * cgroup_fork - initialize cgroup related fields during copy_process() | 5227 | * cgroup_fork - initialize cgroup related fields during copy_process() |
5202 | * @child: pointer to task_struct of forking parent process. | 5228 | * @child: pointer to task_struct of forking parent process. |
@@ -5212,6 +5238,57 @@ void cgroup_fork(struct task_struct *child) | |||
5212 | } | 5238 | } |
5213 | 5239 | ||
5214 | /** | 5240 | /** |
5241 | * cgroup_can_fork - called on a new task before the process is exposed | ||
5242 | * @child: the task in question. | ||
5243 | * | ||
5244 | * This calls the subsystem can_fork() callbacks. If the can_fork() callback | ||
5245 | * returns an error, the fork aborts with that error code. This allows for | ||
5246 | * a cgroup subsystem to conditionally allow or deny new forks. | ||
5247 | */ | ||
5248 | int cgroup_can_fork(struct task_struct *child, | ||
5249 | void *ss_priv[CGROUP_CANFORK_COUNT]) | ||
5250 | { | ||
5251 | struct cgroup_subsys *ss; | ||
5252 | int i, j, ret; | ||
5253 | |||
5254 | for_each_subsys_which(ss, i, &have_canfork_callback) { | ||
5255 | ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); | ||
5256 | if (ret) | ||
5257 | goto out_revert; | ||
5258 | } | ||
5259 | |||
5260 | return 0; | ||
5261 | |||
5262 | out_revert: | ||
5263 | for_each_subsys(ss, j) { | ||
5264 | if (j >= i) | ||
5265 | break; | ||
5266 | if (ss->cancel_fork) | ||
5267 | ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); | ||
5268 | } | ||
5269 | |||
5270 | return ret; | ||
5271 | } | ||
5272 | |||
5273 | /** | ||
5274 | * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() | ||
5275 | * @child: the task in question | ||
5276 | * | ||
5277 | * This calls the cancel_fork() callbacks if a fork failed *after* | ||
5278 | * cgroup_can_fork() succeded. | ||
5279 | */ | ||
5280 | void cgroup_cancel_fork(struct task_struct *child, | ||
5281 | void *ss_priv[CGROUP_CANFORK_COUNT]) | ||
5282 | { | ||
5283 | struct cgroup_subsys *ss; | ||
5284 | int i; | ||
5285 | |||
5286 | for_each_subsys(ss, i) | ||
5287 | if (ss->cancel_fork) | ||
5288 | ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); | ||
5289 | } | ||
5290 | |||
5291 | /** | ||
5215 | * cgroup_post_fork - called on a new task after adding it to the task list | 5292 | * cgroup_post_fork - called on a new task after adding it to the task list |
5216 | * @child: the task in question | 5293 | * @child: the task in question |
5217 | * | 5294 | * |
@@ -5221,7 +5298,8 @@ void cgroup_fork(struct task_struct *child) | |||
5221 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its | 5298 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
5222 | * list. | 5299 | * list. |
5223 | */ | 5300 | */ |
5224 | void cgroup_post_fork(struct task_struct *child) | 5301 | void cgroup_post_fork(struct task_struct *child, |
5302 | void *old_ss_priv[CGROUP_CANFORK_COUNT]) | ||
5225 | { | 5303 | { |
5226 | struct cgroup_subsys *ss; | 5304 | struct cgroup_subsys *ss; |
5227 | int i; | 5305 | int i; |
@@ -5266,7 +5344,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
5266 | * and addition to css_set. | 5344 | * and addition to css_set. |
5267 | */ | 5345 | */ |
5268 | for_each_subsys_which(ss, i, &have_fork_callback) | 5346 | for_each_subsys_which(ss, i, &have_fork_callback) |
5269 | ss->fork(child); | 5347 | ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); |
5270 | } | 5348 | } |
5271 | 5349 | ||
5272 | /** | 5350 | /** |
@@ -5400,12 +5478,14 @@ static int __init cgroup_disable(char *str) | |||
5400 | continue; | 5478 | continue; |
5401 | 5479 | ||
5402 | for_each_subsys(ss, i) { | 5480 | for_each_subsys(ss, i) { |
5403 | if (!strcmp(token, ss->name)) { | 5481 | if (strcmp(token, ss->name) && |
5404 | ss->disabled = 1; | 5482 | strcmp(token, ss->legacy_name)) |
5405 | printk(KERN_INFO "Disabling %s control group" | 5483 | continue; |
5406 | " subsystem\n", ss->name); | 5484 | |
5407 | break; | 5485 | ss->disabled = 1; |
5408 | } | 5486 | printk(KERN_INFO "Disabling %s control group subsystem\n", |
5487 | ss->name); | ||
5488 | break; | ||
5409 | } | 5489 | } |
5410 | } | 5490 | } |
5411 | return 1; | 5491 | return 1; |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 92b98cc0ee76..f1b30ad5dc6d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
203 | * to do anything as freezer_attach() will put @task into the appropriate | 203 | * to do anything as freezer_attach() will put @task into the appropriate |
204 | * state. | 204 | * state. |
205 | */ | 205 | */ |
206 | static void freezer_fork(struct task_struct *task) | 206 | static void freezer_fork(struct task_struct *task, void *private) |
207 | { | 207 | { |
208 | struct freezer *freezer; | 208 | struct freezer *freezer; |
209 | 209 | ||
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c new file mode 100644 index 000000000000..806cd7693ac8 --- /dev/null +++ b/kernel/cgroup_pids.c | |||
@@ -0,0 +1,355 @@ | |||
1 | /* | ||
2 | * Process number limiting controller for cgroups. | ||
3 | * | ||
4 | * Used to allow a cgroup hierarchy to stop any new processes from fork()ing | ||
5 | * after a certain limit is reached. | ||
6 | * | ||
7 | * Since it is trivial to hit the task limit without hitting any kmemcg limits | ||
8 | * in place, PIDs are a fundamental resource. As such, PID exhaustion must be | ||
9 | * preventable in the scope of a cgroup hierarchy by allowing resource limiting | ||
10 | * of the number of tasks in a cgroup. | ||
11 | * | ||
12 | * In order to use the `pids` controller, set the maximum number of tasks in | ||
13 | * pids.max (this is not available in the root cgroup for obvious reasons). The | ||
14 | * number of processes currently in the cgroup is given by pids.current. | ||
15 | * Organisational operations are not blocked by cgroup policies, so it is | ||
16 | * possible to have pids.current > pids.max. However, it is not possible to | ||
17 | * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking | ||
18 | * would cause a cgroup policy to be violated. | ||
19 | * | ||
20 | * To set a cgroup to have no limit, set pids.max to "max". This is the default | ||
21 | * for all new cgroups (N.B. that PID limits are hierarchical, so the most | ||
22 | * stringent limit in the hierarchy is followed). | ||
23 | * | ||
24 | * pids.current tracks all child cgroup hierarchies, so parent/pids.current is | ||
25 | * a superset of parent/child/pids.current. | ||
26 | * | ||
27 | * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> | ||
28 | * | ||
29 | * This file is subject to the terms and conditions of version 2 of the GNU | ||
30 | * General Public License. See the file COPYING in the main directory of the | ||
31 | * Linux distribution for more details. | ||
32 | */ | ||
33 | |||
34 | #include <linux/kernel.h> | ||
35 | #include <linux/threads.h> | ||
36 | #include <linux/atomic.h> | ||
37 | #include <linux/cgroup.h> | ||
38 | #include <linux/slab.h> | ||
39 | |||
40 | #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) | ||
41 | #define PIDS_MAX_STR "max" | ||
42 | |||
43 | struct pids_cgroup { | ||
44 | struct cgroup_subsys_state css; | ||
45 | |||
46 | /* | ||
47 | * Use 64-bit types so that we can safely represent "max" as | ||
48 | * %PIDS_MAX = (%PID_MAX_LIMIT + 1). | ||
49 | */ | ||
50 | atomic64_t counter; | ||
51 | int64_t limit; | ||
52 | }; | ||
53 | |||
54 | static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) | ||
55 | { | ||
56 | return container_of(css, struct pids_cgroup, css); | ||
57 | } | ||
58 | |||
59 | static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) | ||
60 | { | ||
61 | return css_pids(pids->css.parent); | ||
62 | } | ||
63 | |||
64 | static struct cgroup_subsys_state * | ||
65 | pids_css_alloc(struct cgroup_subsys_state *parent) | ||
66 | { | ||
67 | struct pids_cgroup *pids; | ||
68 | |||
69 | pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); | ||
70 | if (!pids) | ||
71 | return ERR_PTR(-ENOMEM); | ||
72 | |||
73 | pids->limit = PIDS_MAX; | ||
74 | atomic64_set(&pids->counter, 0); | ||
75 | return &pids->css; | ||
76 | } | ||
77 | |||
78 | static void pids_css_free(struct cgroup_subsys_state *css) | ||
79 | { | ||
80 | kfree(css_pids(css)); | ||
81 | } | ||
82 | |||
83 | /** | ||
84 | * pids_cancel - uncharge the local pid count | ||
85 | * @pids: the pid cgroup state | ||
86 | * @num: the number of pids to cancel | ||
87 | * | ||
88 | * This function will WARN if the pid count goes under 0, because such a case is | ||
89 | * a bug in the pids controller proper. | ||
90 | */ | ||
91 | static void pids_cancel(struct pids_cgroup *pids, int num) | ||
92 | { | ||
93 | /* | ||
94 | * A negative count (or overflow for that matter) is invalid, | ||
95 | * and indicates a bug in the `pids` controller proper. | ||
96 | */ | ||
97 | WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); | ||
98 | } | ||
99 | |||
100 | /** | ||
101 | * pids_uncharge - hierarchically uncharge the pid count | ||
102 | * @pids: the pid cgroup state | ||
103 | * @num: the number of pids to uncharge | ||
104 | */ | ||
105 | static void pids_uncharge(struct pids_cgroup *pids, int num) | ||
106 | { | ||
107 | struct pids_cgroup *p; | ||
108 | |||
109 | for (p = pids; p; p = parent_pids(p)) | ||
110 | pids_cancel(p, num); | ||
111 | } | ||
112 | |||
113 | /** | ||
114 | * pids_charge - hierarchically charge the pid count | ||
115 | * @pids: the pid cgroup state | ||
116 | * @num: the number of pids to charge | ||
117 | * | ||
118 | * This function does *not* follow the pid limit set. It cannot fail and the new | ||
119 | * pid count may exceed the limit. This is only used for reverting failed | ||
120 | * attaches, where there is no other way out than violating the limit. | ||
121 | */ | ||
122 | static void pids_charge(struct pids_cgroup *pids, int num) | ||
123 | { | ||
124 | struct pids_cgroup *p; | ||
125 | |||
126 | for (p = pids; p; p = parent_pids(p)) | ||
127 | atomic64_add(num, &p->counter); | ||
128 | } | ||
129 | |||
130 | /** | ||
131 | * pids_try_charge - hierarchically try to charge the pid count | ||
132 | * @pids: the pid cgroup state | ||
133 | * @num: the number of pids to charge | ||
134 | * | ||
135 | * This function follows the set limit. It will fail if the charge would cause | ||
136 | * the new value to exceed the hierarchical limit. Returns 0 if the charge | ||
137 | * succeded, otherwise -EAGAIN. | ||
138 | */ | ||
139 | static int pids_try_charge(struct pids_cgroup *pids, int num) | ||
140 | { | ||
141 | struct pids_cgroup *p, *q; | ||
142 | |||
143 | for (p = pids; p; p = parent_pids(p)) { | ||
144 | int64_t new = atomic64_add_return(num, &p->counter); | ||
145 | |||
146 | /* | ||
147 | * Since new is capped to the maximum number of pid_t, if | ||
148 | * p->limit is %PIDS_MAX then we know that this test will never | ||
149 | * fail. | ||
150 | */ | ||
151 | if (new > p->limit) | ||
152 | goto revert; | ||
153 | } | ||
154 | |||
155 | return 0; | ||
156 | |||
157 | revert: | ||
158 | for (q = pids; q != p; q = parent_pids(q)) | ||
159 | pids_cancel(q, num); | ||
160 | pids_cancel(p, num); | ||
161 | |||
162 | return -EAGAIN; | ||
163 | } | ||
164 | |||
165 | static int pids_can_attach(struct cgroup_subsys_state *css, | ||
166 | struct cgroup_taskset *tset) | ||
167 | { | ||
168 | struct pids_cgroup *pids = css_pids(css); | ||
169 | struct task_struct *task; | ||
170 | |||
171 | cgroup_taskset_for_each(task, tset) { | ||
172 | struct cgroup_subsys_state *old_css; | ||
173 | struct pids_cgroup *old_pids; | ||
174 | |||
175 | /* | ||
176 | * No need to pin @old_css between here and cancel_attach() | ||
177 | * because cgroup core protects it from being freed before | ||
178 | * the migration completes or fails. | ||
179 | */ | ||
180 | old_css = task_css(task, pids_cgrp_id); | ||
181 | old_pids = css_pids(old_css); | ||
182 | |||
183 | pids_charge(pids, 1); | ||
184 | pids_uncharge(old_pids, 1); | ||
185 | } | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static void pids_cancel_attach(struct cgroup_subsys_state *css, | ||
191 | struct cgroup_taskset *tset) | ||
192 | { | ||
193 | struct pids_cgroup *pids = css_pids(css); | ||
194 | struct task_struct *task; | ||
195 | |||
196 | cgroup_taskset_for_each(task, tset) { | ||
197 | struct cgroup_subsys_state *old_css; | ||
198 | struct pids_cgroup *old_pids; | ||
199 | |||
200 | old_css = task_css(task, pids_cgrp_id); | ||
201 | old_pids = css_pids(old_css); | ||
202 | |||
203 | pids_charge(old_pids, 1); | ||
204 | pids_uncharge(pids, 1); | ||
205 | } | ||
206 | } | ||
207 | |||
208 | static int pids_can_fork(struct task_struct *task, void **priv_p) | ||
209 | { | ||
210 | struct cgroup_subsys_state *css; | ||
211 | struct pids_cgroup *pids; | ||
212 | int err; | ||
213 | |||
214 | /* | ||
215 | * Use the "current" task_css for the pids subsystem as the tentative | ||
216 | * css. It is possible we will charge the wrong hierarchy, in which | ||
217 | * case we will forcefully revert/reapply the charge on the right | ||
218 | * hierarchy after it is committed to the task proper. | ||
219 | */ | ||
220 | css = task_get_css(current, pids_cgrp_id); | ||
221 | pids = css_pids(css); | ||
222 | |||
223 | err = pids_try_charge(pids, 1); | ||
224 | if (err) | ||
225 | goto err_css_put; | ||
226 | |||
227 | *priv_p = css; | ||
228 | return 0; | ||
229 | |||
230 | err_css_put: | ||
231 | css_put(css); | ||
232 | return err; | ||
233 | } | ||
234 | |||
235 | static void pids_cancel_fork(struct task_struct *task, void *priv) | ||
236 | { | ||
237 | struct cgroup_subsys_state *css = priv; | ||
238 | struct pids_cgroup *pids = css_pids(css); | ||
239 | |||
240 | pids_uncharge(pids, 1); | ||
241 | css_put(css); | ||
242 | } | ||
243 | |||
244 | static void pids_fork(struct task_struct *task, void *priv) | ||
245 | { | ||
246 | struct cgroup_subsys_state *css; | ||
247 | struct cgroup_subsys_state *old_css = priv; | ||
248 | struct pids_cgroup *pids; | ||
249 | struct pids_cgroup *old_pids = css_pids(old_css); | ||
250 | |||
251 | css = task_get_css(task, pids_cgrp_id); | ||
252 | pids = css_pids(css); | ||
253 | |||
254 | /* | ||
255 | * If the association has changed, we have to revert and reapply the | ||
256 | * charge/uncharge on the wrong hierarchy to the current one. Since | ||
257 | * the association can only change due to an organisation event, its | ||
258 | * okay for us to ignore the limit in this case. | ||
259 | */ | ||
260 | if (pids != old_pids) { | ||
261 | pids_uncharge(old_pids, 1); | ||
262 | pids_charge(pids, 1); | ||
263 | } | ||
264 | |||
265 | css_put(css); | ||
266 | css_put(old_css); | ||
267 | } | ||
268 | |||
269 | static void pids_exit(struct cgroup_subsys_state *css, | ||
270 | struct cgroup_subsys_state *old_css, | ||
271 | struct task_struct *task) | ||
272 | { | ||
273 | struct pids_cgroup *pids = css_pids(old_css); | ||
274 | |||
275 | pids_uncharge(pids, 1); | ||
276 | } | ||
277 | |||
278 | static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, | ||
279 | size_t nbytes, loff_t off) | ||
280 | { | ||
281 | struct cgroup_subsys_state *css = of_css(of); | ||
282 | struct pids_cgroup *pids = css_pids(css); | ||
283 | int64_t limit; | ||
284 | int err; | ||
285 | |||
286 | buf = strstrip(buf); | ||
287 | if (!strcmp(buf, PIDS_MAX_STR)) { | ||
288 | limit = PIDS_MAX; | ||
289 | goto set_limit; | ||
290 | } | ||
291 | |||
292 | err = kstrtoll(buf, 0, &limit); | ||
293 | if (err) | ||
294 | return err; | ||
295 | |||
296 | if (limit < 0 || limit >= PIDS_MAX) | ||
297 | return -EINVAL; | ||
298 | |||
299 | set_limit: | ||
300 | /* | ||
301 | * Limit updates don't need to be mutex'd, since it isn't | ||
302 | * critical that any racing fork()s follow the new limit. | ||
303 | */ | ||
304 | pids->limit = limit; | ||
305 | return nbytes; | ||
306 | } | ||
307 | |||
308 | static int pids_max_show(struct seq_file *sf, void *v) | ||
309 | { | ||
310 | struct cgroup_subsys_state *css = seq_css(sf); | ||
311 | struct pids_cgroup *pids = css_pids(css); | ||
312 | int64_t limit = pids->limit; | ||
313 | |||
314 | if (limit >= PIDS_MAX) | ||
315 | seq_printf(sf, "%s\n", PIDS_MAX_STR); | ||
316 | else | ||
317 | seq_printf(sf, "%lld\n", limit); | ||
318 | |||
319 | return 0; | ||
320 | } | ||
321 | |||
322 | static s64 pids_current_read(struct cgroup_subsys_state *css, | ||
323 | struct cftype *cft) | ||
324 | { | ||
325 | struct pids_cgroup *pids = css_pids(css); | ||
326 | |||
327 | return atomic64_read(&pids->counter); | ||
328 | } | ||
329 | |||
330 | static struct cftype pids_files[] = { | ||
331 | { | ||
332 | .name = "max", | ||
333 | .write = pids_max_write, | ||
334 | .seq_show = pids_max_show, | ||
335 | .flags = CFTYPE_NOT_ON_ROOT, | ||
336 | }, | ||
337 | { | ||
338 | .name = "current", | ||
339 | .read_s64 = pids_current_read, | ||
340 | }, | ||
341 | { } /* terminate */ | ||
342 | }; | ||
343 | |||
344 | struct cgroup_subsys pids_cgrp_subsys = { | ||
345 | .css_alloc = pids_css_alloc, | ||
346 | .css_free = pids_css_free, | ||
347 | .can_attach = pids_can_attach, | ||
348 | .cancel_attach = pids_cancel_attach, | ||
349 | .can_fork = pids_can_fork, | ||
350 | .cancel_fork = pids_cancel_fork, | ||
351 | .fork = pids_fork, | ||
352 | .exit = pids_exit, | ||
353 | .legacy_cftypes = pids_files, | ||
354 | .dfl_cftypes = pids_files, | ||
355 | }; | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 2b1a61cddc19..03aa2e6de7a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1246,6 +1246,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1246 | { | 1246 | { |
1247 | int retval; | 1247 | int retval; |
1248 | struct task_struct *p; | 1248 | struct task_struct *p; |
1249 | void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; | ||
1249 | 1250 | ||
1250 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | 1251 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) |
1251 | return ERR_PTR(-EINVAL); | 1252 | return ERR_PTR(-EINVAL); |
@@ -1518,6 +1519,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1518 | p->task_works = NULL; | 1519 | p->task_works = NULL; |
1519 | 1520 | ||
1520 | /* | 1521 | /* |
1522 | * Ensure that the cgroup subsystem policies allow the new process to be | ||
1523 | * forked. It should be noted the the new process's css_set can be changed | ||
1524 | * between here and cgroup_post_fork() if an organisation operation is in | ||
1525 | * progress. | ||
1526 | */ | ||
1527 | retval = cgroup_can_fork(p, cgrp_ss_priv); | ||
1528 | if (retval) | ||
1529 | goto bad_fork_free_pid; | ||
1530 | |||
1531 | /* | ||
1521 | * Make it visible to the rest of the system, but dont wake it up yet. | 1532 | * Make it visible to the rest of the system, but dont wake it up yet. |
1522 | * Need tasklist lock for parent etc handling! | 1533 | * Need tasklist lock for parent etc handling! |
1523 | */ | 1534 | */ |
@@ -1553,7 +1564,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1553 | spin_unlock(¤t->sighand->siglock); | 1564 | spin_unlock(¤t->sighand->siglock); |
1554 | write_unlock_irq(&tasklist_lock); | 1565 | write_unlock_irq(&tasklist_lock); |
1555 | retval = -ERESTARTNOINTR; | 1566 | retval = -ERESTARTNOINTR; |
1556 | goto bad_fork_free_pid; | 1567 | goto bad_fork_cancel_cgroup; |
1557 | } | 1568 | } |
1558 | 1569 | ||
1559 | if (likely(p->pid)) { | 1570 | if (likely(p->pid)) { |
@@ -1595,7 +1606,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1595 | write_unlock_irq(&tasklist_lock); | 1606 | write_unlock_irq(&tasklist_lock); |
1596 | 1607 | ||
1597 | proc_fork_connector(p); | 1608 | proc_fork_connector(p); |
1598 | cgroup_post_fork(p); | 1609 | cgroup_post_fork(p, cgrp_ss_priv); |
1599 | if (clone_flags & CLONE_THREAD) | 1610 | if (clone_flags & CLONE_THREAD) |
1600 | threadgroup_change_end(current); | 1611 | threadgroup_change_end(current); |
1601 | perf_event_fork(p); | 1612 | perf_event_fork(p); |
@@ -1605,6 +1616,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1605 | 1616 | ||
1606 | return p; | 1617 | return p; |
1607 | 1618 | ||
1619 | bad_fork_cancel_cgroup: | ||
1620 | cgroup_cancel_fork(p, cgrp_ss_priv); | ||
1608 | bad_fork_free_pid: | 1621 | bad_fork_free_pid: |
1609 | if (pid != &init_struct_pid) | 1622 | if (pid != &init_struct_pid) |
1610 | free_pid(pid); | 1623 | free_pid(pid); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b864ecee0e1..d8420c233ff7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -8133,7 +8133,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
8133 | sched_offline_group(tg); | 8133 | sched_offline_group(tg); |
8134 | } | 8134 | } |
8135 | 8135 | ||
8136 | static void cpu_cgroup_fork(struct task_struct *task) | 8136 | static void cpu_cgroup_fork(struct task_struct *task, void *private) |
8137 | { | 8137 | { |
8138 | sched_move_task(task); | 8138 | sched_move_task(task); |
8139 | } | 8139 | } |