aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-02 11:04:23 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-02 11:04:23 -0400
commit8bdc69b764013a9b5ebeef7df8f314f1066c5d79 (patch)
tree335dcb29c9ba06142917121d551575d360eca63e
parent76ec51ef5edfe540bbc3c61b860f88deb8e6a37b (diff)
parent20f1f4b5ffb870631bf4a4e7c7ba10e3528ae6a6 (diff)
Merge branch 'for-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - a new PIDs controller is added. It turns out that PIDs are actually an independent resource from kmem due to the limited PID space. - more core preparations for the v2 interface. Once cpu side interface is settled, it should be ready for lifting the devel mask. for-4.3-unified-base was temporarily branched so that other trees (block) can pull cgroup core changes that blkcg changes depend on. - a non-critical idr_preload usage bug fix. * 'for-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: pids: fix invalid get/put usage cgroup: introduce cgroup_subsys->legacy_name cgroup: don't print subsystems for the default hierarchy cgroup: make cftype->private a unsigned long cgroup: export cgrp_dfl_root cgroup: define controller file conventions cgroup: fix idr_preload usage cgroup: add documentation for the PIDs controller cgroup: implement the PIDs subsystem cgroup: allow a cgroup subsystem to reject a fork
-rw-r--r--CREDITS5
-rw-r--r--Documentation/cgroups/00-INDEX2
-rw-r--r--Documentation/cgroups/pids.txt85
-rw-r--r--Documentation/cgroups/unified-hierarchy.txt80
-rw-r--r--include/linux/cgroup-defs.h15
-rw-r--r--include/linux/cgroup.h24
-rw-r--r--include/linux/cgroup_subsys.h28
-rw-r--r--init/Kconfig16
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cgroup.c122
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cgroup_pids.c355
-rw-r--r--kernel/fork.c17
-rw-r--r--kernel/sched/core.c2
14 files changed, 717 insertions, 37 deletions
diff --git a/CREDITS b/CREDITS
index 1d616640bbf6..4fcf9cd8544c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3219,6 +3219,11 @@ S: 69 rue Dunois
3219S: 75013 Paris 3219S: 75013 Paris
3220S: France 3220S: France
3221 3221
3222N: Aleksa Sarai
3223E: cyphar@cyphar.com
3224W: https://www.cyphar.com/
3225D: `pids` cgroup subsystem
3226
3222N: Dipankar Sarma 3227N: Dipankar Sarma
3223E: dipankar@in.ibm.com 3228E: dipankar@in.ibm.com
3224D: RCU 3229D: RCU
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
index 96ce071a3633..3f5a40f57d4a 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -22,6 +22,8 @@ net_cls.txt
22 - Network classifier cgroups details and usages. 22 - Network classifier cgroups details and usages.
23net_prio.txt 23net_prio.txt
24 - Network priority cgroups details and usages. 24 - Network priority cgroups details and usages.
25pids.txt
26 - Process number cgroups details and usages.
25resource_counter.txt 27resource_counter.txt
26 - Resource Counter API. 28 - Resource Counter API.
27unified-hierarchy.txt 29unified-hierarchy.txt
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroups/pids.txt
new file mode 100644
index 000000000000..1a078b5d281a
--- /dev/null
+++ b/Documentation/cgroups/pids.txt
@@ -0,0 +1,85 @@
1 Process Number Controller
2 =========================
3
4Abstract
5--------
6
7The process number controller is used to allow a cgroup hierarchy to stop any
8new tasks from being fork()'d or clone()'d after a certain limit is reached.
9
10Since it is trivial to hit the task limit without hitting any kmemcg limits in
11place, PIDs are a fundamental resource. As such, PID exhaustion must be
12preventable in the scope of a cgroup hierarchy by allowing resource limiting of
13the number of tasks in a cgroup.
14
15Usage
16-----
17
18In order to use the `pids` controller, set the maximum number of tasks in
19pids.max (this is not available in the root cgroup for obvious reasons). The
20number of processes currently in the cgroup is given by pids.current.
21
22Organisational operations are not blocked by cgroup policies, so it is possible
23to have pids.current > pids.max. This can be done by either setting the limit to
24be smaller than pids.current, or attaching enough processes to the cgroup such
25that pids.current > pids.max. However, it is not possible to violate a cgroup
26policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
27creation of a new process would cause a cgroup policy to be violated.
28
29To set a cgroup to have no limit, set pids.max to "max". This is the default for
30all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
31limit in the hierarchy is followed).
32
33pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
34superset of parent/child/pids.current.
35
36Example
37-------
38
39First, we mount the pids controller:
40# mkdir -p /sys/fs/cgroup/pids
41# mount -t cgroup -o pids none /sys/fs/cgroup/pids
42
43Then we create a hierarchy, set limits and attach processes to it:
44# mkdir -p /sys/fs/cgroup/pids/parent/child
45# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
46# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
47# cat /sys/fs/cgroup/pids/parent/pids.current
482
49#
50
51It should be noted that attempts to overcome the set limit (2 in this case) will
52fail:
53
54# cat /sys/fs/cgroup/pids/parent/pids.current
552
56# ( /bin/echo "Here's some processes for you." | cat )
57sh: fork: Resource temporarily unavailable
58#
59
60Even if we migrate to a child cgroup (which doesn't have a set limit), we will
61not be able to overcome the most stringent limit in the hierarchy (in this case,
62parent's):
63
64# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
65# cat /sys/fs/cgroup/pids/parent/pids.current
662
67# cat /sys/fs/cgroup/pids/parent/child/pids.current
682
69# cat /sys/fs/cgroup/pids/parent/child/pids.max
70max
71# ( /bin/echo "Here's some processes for you." | cat )
72sh: fork: Resource temporarily unavailable
73#
74
75We can set a limit that is smaller than pids.current, which will stop any new
76processes from being forked at all (note that the shell itself counts towards
77pids.current):
78
79# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
80# /bin/echo "We can't even spawn a single process now."
81sh: fork: Resource temporarily unavailable
82# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
83# /bin/echo "We can't even spawn a single process now."
84sh: fork: Resource temporarily unavailable
85#
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 86847a7647ab..1ee9caf29e57 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -23,10 +23,13 @@ CONTENTS
235. Other Changes 235. Other Changes
24 5-1. [Un]populated Notification 24 5-1. [Un]populated Notification
25 5-2. Other Core Changes 25 5-2. Other Core Changes
26 5-3. Per-Controller Changes 26 5-3. Controller File Conventions
27 5-3-1. blkio 27 5-3-1. Format
28 5-3-2. cpuset 28 5-3-2. Control Knobs
29 5-3-3. memory 29 5-4. Per-Controller Changes
30 5-4-1. blkio
31 5-4-2. cpuset
32 5-4-3. memory
306. Planned Changes 336. Planned Changes
31 6-1. CAP for resource control 34 6-1. CAP for resource control
32 35
@@ -372,14 +375,75 @@ supported and the interface files "release_agent" and
372- The "cgroup.clone_children" file is removed. 375- The "cgroup.clone_children" file is removed.
373 376
374 377
3755-3. Per-Controller Changes 3785-3. Controller File Conventions
376 379
3775-3-1. blkio 3805-3-1. Format
381
382In general, all controller files should be in one of the following
383formats whenever possible.
384
385- Values only files
386
387 VAL0 VAL1...\n
388
389- Flat keyed files
390
391 KEY0 VAL0\n
392 KEY1 VAL1\n
393 ...
394
395- Nested keyed files
396
397 KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
398 KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
399 ...
400
401For a writeable file, the format for writing should generally match
402reading; however, controllers may allow omitting later fields or
403implement restricted shortcuts for most common use cases.
404
405For both flat and nested keyed files, only the values for a single key
406can be written at a time. For nested keyed files, the sub key pairs
407may be specified in any order and not all pairs have to be specified.
408
409
4105-3-2. Control Knobs
411
412- Settings for a single feature should generally be implemented in a
413 single file.
414
415- In general, the root cgroup should be exempt from resource control
416 and thus shouldn't have resource control knobs.
417
418- If a controller implements ratio based resource distribution, the
419 control knob should be named "weight" and have the range [1, 10000]
420 and 100 should be the default value. The values are chosen to allow
421 enough and symmetric bias in both directions while keeping it
422 intuitive (the default is 100%).
423
424- If a controller implements an absolute resource guarantee and/or
425 limit, the control knobs should be named "min" and "max"
426 respectively. If a controller implements best effort resource
427 guarantee and/or limit, the control knobs should be named "low" and
428 "high" respectively.
429
430 In the above four control files, the special token "max" should be
431 used to represent upward infinity for both reading and writing.
432
433- If a setting has configurable default value and specific overrides,
434 the default settings should be keyed with "default" and appear as
435 the first entry in the file. Specific entries can use "default" as
436 its value to indicate inheritance of the default value.
437
438
4395-4. Per-Controller Changes
440
4415-4-1. blkio
378 442
379- blk-throttle becomes properly hierarchical. 443- blk-throttle becomes properly hierarchical.
380 444
381 445
3825-3-2. cpuset 4465-4-2. cpuset
383 447
384- Tasks are kept in empty cpusets after hotplug and take on the masks 448- Tasks are kept in empty cpusets after hotplug and take on the masks
385 of the nearest non-empty ancestor, instead of being moved to it. 449 of the nearest non-empty ancestor, instead of being moved to it.
@@ -388,7 +452,7 @@ supported and the interface files "release_agent" and
388 masks of the nearest non-empty ancestor. 452 masks of the nearest non-empty ancestor.
389 453
390 454
3915-3-3. memory 4555-4-3. memory
392 456
393- use_hierarchy is on by default and the cgroup file for the flag is 457- use_hierarchy is on by default and the cgroup file for the flag is
394 not created. 458 not created.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 93755a629299..4d8fcf2187dc 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -34,12 +34,17 @@ struct seq_file;
34 34
35/* define the enumeration of all cgroup subsystems */ 35/* define the enumeration of all cgroup subsystems */
36#define SUBSYS(_x) _x ## _cgrp_id, 36#define SUBSYS(_x) _x ## _cgrp_id,
37#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
38 __unused_tag_ ## _t = CGROUP_ ## _t - 1,
37enum cgroup_subsys_id { 39enum cgroup_subsys_id {
38#include <linux/cgroup_subsys.h> 40#include <linux/cgroup_subsys.h>
39 CGROUP_SUBSYS_COUNT, 41 CGROUP_SUBSYS_COUNT,
40}; 42};
43#undef SUBSYS_TAG
41#undef SUBSYS 44#undef SUBSYS
42 45
46#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
47
43/* bits in struct cgroup_subsys_state flags field */ 48/* bits in struct cgroup_subsys_state flags field */
44enum { 49enum {
45 CSS_NO_REF = (1 << 0), /* no reference counting for this css */ 50 CSS_NO_REF = (1 << 0), /* no reference counting for this css */
@@ -318,7 +323,7 @@ struct cftype {
318 * end of cftype array. 323 * end of cftype array.
319 */ 324 */
320 char name[MAX_CFTYPE_NAME]; 325 char name[MAX_CFTYPE_NAME];
321 int private; 326 unsigned long private;
322 /* 327 /*
323 * If not 0, file mode is set to this value, otherwise it will 328 * If not 0, file mode is set to this value, otherwise it will
324 * be figured out automatically 329 * be figured out automatically
@@ -406,7 +411,9 @@ struct cgroup_subsys {
406 struct cgroup_taskset *tset); 411 struct cgroup_taskset *tset);
407 void (*attach)(struct cgroup_subsys_state *css, 412 void (*attach)(struct cgroup_subsys_state *css,
408 struct cgroup_taskset *tset); 413 struct cgroup_taskset *tset);
409 void (*fork)(struct task_struct *task); 414 int (*can_fork)(struct task_struct *task, void **priv_p);
415 void (*cancel_fork)(struct task_struct *task, void *priv);
416 void (*fork)(struct task_struct *task, void *priv);
410 void (*exit)(struct cgroup_subsys_state *css, 417 void (*exit)(struct cgroup_subsys_state *css,
411 struct cgroup_subsys_state *old_css, 418 struct cgroup_subsys_state *old_css,
412 struct task_struct *task); 419 struct task_struct *task);
@@ -434,6 +441,9 @@ struct cgroup_subsys {
434 int id; 441 int id;
435 const char *name; 442 const char *name;
436 443
444 /* optional, initialized automatically during boot if not set */
445 const char *legacy_name;
446
437 /* link to parent, protected by cgroup_lock() */ 447 /* link to parent, protected by cgroup_lock() */
438 struct cgroup_root *root; 448 struct cgroup_root *root;
439 449
@@ -491,6 +501,7 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
491 501
492#else /* CONFIG_CGROUPS */ 502#else /* CONFIG_CGROUPS */
493 503
504#define CGROUP_CANFORK_COUNT 0
494#define CGROUP_SUBSYS_COUNT 0 505#define CGROUP_SUBSYS_COUNT 0
495 506
496static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} 507static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a593e299162e..eb7ca55f72ef 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,15 @@
22 22
23#ifdef CONFIG_CGROUPS 23#ifdef CONFIG_CGROUPS
24 24
25/*
26 * All weight knobs on the default hierarchy should use the following min,
27 * default and max values. The default value is the logarithmic center of
28 * MIN and MAX and allows 100x to be expressed in both directions.
29 */
30#define CGROUP_WEIGHT_MIN 1
31#define CGROUP_WEIGHT_DFL 100
32#define CGROUP_WEIGHT_MAX 10000
33
25/* a css_task_iter should be treated as an opaque object */ 34/* a css_task_iter should be treated as an opaque object */
26struct css_task_iter { 35struct css_task_iter {
27 struct cgroup_subsys *ss; 36 struct cgroup_subsys *ss;
@@ -62,7 +71,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
62 struct pid *pid, struct task_struct *tsk); 71 struct pid *pid, struct task_struct *tsk);
63 72
64void cgroup_fork(struct task_struct *p); 73void cgroup_fork(struct task_struct *p);
65void cgroup_post_fork(struct task_struct *p); 74extern int cgroup_can_fork(struct task_struct *p,
75 void *ss_priv[CGROUP_CANFORK_COUNT]);
76extern void cgroup_cancel_fork(struct task_struct *p,
77 void *ss_priv[CGROUP_CANFORK_COUNT]);
78extern void cgroup_post_fork(struct task_struct *p,
79 void *old_ss_priv[CGROUP_CANFORK_COUNT]);
66void cgroup_exit(struct task_struct *p); 80void cgroup_exit(struct task_struct *p);
67 81
68int cgroup_init_early(void); 82int cgroup_init_early(void);
@@ -524,7 +538,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
524 struct dentry *dentry) { return -EINVAL; } 538 struct dentry *dentry) { return -EINVAL; }
525 539
526static inline void cgroup_fork(struct task_struct *p) {} 540static inline void cgroup_fork(struct task_struct *p) {}
527static inline void cgroup_post_fork(struct task_struct *p) {} 541static inline int cgroup_can_fork(struct task_struct *p,
542 void *ss_priv[CGROUP_CANFORK_COUNT])
543{ return 0; }
544static inline void cgroup_cancel_fork(struct task_struct *p,
545 void *ss_priv[CGROUP_CANFORK_COUNT]) {}
546static inline void cgroup_post_fork(struct task_struct *p,
547 void *ss_priv[CGROUP_CANFORK_COUNT]) {}
528static inline void cgroup_exit(struct task_struct *p) {} 548static inline void cgroup_exit(struct task_struct *p) {}
529 549
530static inline int cgroup_init_early(void) { return 0; } 550static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb14403..1f36945fd23d 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,6 +3,17 @@
3 * 3 *
4 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. 4 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
5 */ 5 */
6
7/*
8 * This file *must* be included with SUBSYS() defined.
9 * SUBSYS_TAG() is a noop if undefined.
10 */
11
12#ifndef SUBSYS_TAG
13#define __TMP_SUBSYS_TAG
14#define SUBSYS_TAG(_x)
15#endif
16
6#if IS_ENABLED(CONFIG_CPUSETS) 17#if IS_ENABLED(CONFIG_CPUSETS)
7SUBSYS(cpuset) 18SUBSYS(cpuset)
8#endif 19#endif
@@ -48,11 +59,28 @@ SUBSYS(hugetlb)
48#endif 59#endif
49 60
50/* 61/*
62 * Subsystems that implement the can_fork() family of callbacks.
63 */
64SUBSYS_TAG(CANFORK_START)
65
66#if IS_ENABLED(CONFIG_CGROUP_PIDS)
67SUBSYS(pids)
68#endif
69
70SUBSYS_TAG(CANFORK_END)
71
72/*
51 * The following subsystems are not supported on the default hierarchy. 73 * The following subsystems are not supported on the default hierarchy.
52 */ 74 */
53#if IS_ENABLED(CONFIG_CGROUP_DEBUG) 75#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
54SUBSYS(debug) 76SUBSYS(debug)
55#endif 77#endif
78
79#ifdef __TMP_SUBSYS_TAG
80#undef __TMP_SUBSYS_TAG
81#undef SUBSYS_TAG
82#endif
83
56/* 84/*
57 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. 85 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
58 */ 86 */
diff --git a/init/Kconfig b/init/Kconfig
index ba1e6eaf4c36..bb9b4dd55889 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -947,6 +947,22 @@ config CGROUP_FREEZER
947 Provides a way to freeze and unfreeze all tasks in a 947 Provides a way to freeze and unfreeze all tasks in a
948 cgroup. 948 cgroup.
949 949
950config CGROUP_PIDS
951 bool "PIDs cgroup subsystem"
952 help
953 Provides enforcement of process number limits in the scope of a
954 cgroup. Any attempt to fork more processes than is allowed in the
955 cgroup will fail. PIDs are fundamentally a global resource because it
956 is fairly trivial to reach PID exhaustion before you reach even a
957 conservative kmemcg limit. As a result, it is possible to grind a
958 system to halt without being limited by other cgroup policies. The
959 PIDs cgroup subsystem is designed to stop this from happening.
960
961 It should be noted that organisational operations (such as attaching
962 to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
963 since the PIDs limit only affects a process's ability to fork, not to
964 attach to a cgroup.
965
950config CGROUP_DEVICE 966config CGROUP_DEVICE
951 bool "Device controller for cgroups" 967 bool "Device controller for cgroups"
952 help 968 help
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..718fb8afab7a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
55obj-$(CONFIG_COMPAT) += compat.o 55obj-$(CONFIG_COMPAT) += compat.o
56obj-$(CONFIG_CGROUPS) += cgroup.o 56obj-$(CONFIG_CGROUPS) += cgroup.o
57obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 57obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
58obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
58obj-$(CONFIG_CPUSETS) += cpuset.o 59obj-$(CONFIG_CPUSETS) += cpuset.o
59obj-$(CONFIG_UTS_NS) += utsname.o 60obj-$(CONFIG_UTS_NS) += utsname.o
60obj-$(CONFIG_USER_NS) += user_namespace.o 61obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b89f3168411b..f3f5cd5e2c0d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = {
145 * part of that cgroup. 145 * part of that cgroup.
146 */ 146 */
147struct cgroup_root cgrp_dfl_root; 147struct cgroup_root cgrp_dfl_root;
148EXPORT_SYMBOL_GPL(cgrp_dfl_root);
148 149
149/* 150/*
150 * The default hierarchy always exists but is hidden until mounted for the 151 * The default hierarchy always exists but is hidden until mounted for the
@@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1;
186static unsigned long have_fork_callback __read_mostly; 187static unsigned long have_fork_callback __read_mostly;
187static unsigned long have_exit_callback __read_mostly; 188static unsigned long have_exit_callback __read_mostly;
188 189
190/* Ditto for the can_fork callback. */
191static unsigned long have_canfork_callback __read_mostly;
192
189static struct cftype cgroup_dfl_base_files[]; 193static struct cftype cgroup_dfl_base_files[];
190static struct cftype cgroup_legacy_base_files[]; 194static struct cftype cgroup_legacy_base_files[];
191 195
@@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
207 211
208 idr_preload(gfp_mask); 212 idr_preload(gfp_mask);
209 spin_lock_bh(&cgroup_idr_lock); 213 spin_lock_bh(&cgroup_idr_lock);
210 ret = idr_alloc(idr, ptr, start, end, gfp_mask); 214 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
211 spin_unlock_bh(&cgroup_idr_lock); 215 spin_unlock_bh(&cgroup_idr_lock);
212 idr_preload_end(); 216 idr_preload_end();
213 return ret; 217 return ret;
@@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations;
1027static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, 1031static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1028 char *buf) 1032 char *buf)
1029{ 1033{
1034 struct cgroup_subsys *ss = cft->ss;
1035
1030 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && 1036 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1031 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) 1037 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1032 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", 1038 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1033 cft->ss->name, cft->name); 1039 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1040 cft->name);
1034 else 1041 else
1035 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 1042 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1036 return buf; 1043 return buf;
@@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq,
1332 struct cgroup_subsys *ss; 1339 struct cgroup_subsys *ss;
1333 int ssid; 1340 int ssid;
1334 1341
1335 for_each_subsys(ss, ssid) 1342 if (root != &cgrp_dfl_root)
1336 if (root->subsys_mask & (1 << ssid)) 1343 for_each_subsys(ss, ssid)
1337 seq_printf(seq, ",%s", ss->name); 1344 if (root->subsys_mask & (1 << ssid))
1345 seq_printf(seq, ",%s", ss->legacy_name);
1338 if (root->flags & CGRP_ROOT_NOPREFIX) 1346 if (root->flags & CGRP_ROOT_NOPREFIX)
1339 seq_puts(seq, ",noprefix"); 1347 seq_puts(seq, ",noprefix");
1340 if (root->flags & CGRP_ROOT_XATTR) 1348 if (root->flags & CGRP_ROOT_XATTR)
@@ -1447,7 +1455,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1447 } 1455 }
1448 1456
1449 for_each_subsys(ss, i) { 1457 for_each_subsys(ss, i) {
1450 if (strcmp(token, ss->name)) 1458 if (strcmp(token, ss->legacy_name))
1451 continue; 1459 continue;
1452 if (ss->disabled) 1460 if (ss->disabled)
1453 continue; 1461 continue;
@@ -1666,7 +1674,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1666 1674
1667 lockdep_assert_held(&cgroup_mutex); 1675 lockdep_assert_held(&cgroup_mutex);
1668 1676
1669 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); 1677 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1670 if (ret < 0) 1678 if (ret < 0)
1671 goto out; 1679 goto out;
1672 root_cgrp->id = ret; 1680 root_cgrp->id = ret;
@@ -4579,7 +4587,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4579 if (err) 4587 if (err)
4580 goto err_free_css; 4588 goto err_free_css;
4581 4589
4582 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); 4590 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
4583 if (err < 0) 4591 if (err < 0)
4584 goto err_free_percpu_ref; 4592 goto err_free_percpu_ref;
4585 css->id = err; 4593 css->id = err;
@@ -4656,7 +4664,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4656 * Temporarily set the pointer to NULL, so idr_find() won't return 4664 * Temporarily set the pointer to NULL, so idr_find() won't return
4657 * a half-baked cgroup. 4665 * a half-baked cgroup.
4658 */ 4666 */
4659 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); 4667 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
4660 if (cgrp->id < 0) { 4668 if (cgrp->id < 0) {
4661 ret = -ENOMEM; 4669 ret = -ENOMEM;
4662 goto out_cancel_ref; 4670 goto out_cancel_ref;
@@ -4955,6 +4963,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4955 4963
4956 have_fork_callback |= (bool)ss->fork << ss->id; 4964 have_fork_callback |= (bool)ss->fork << ss->id;
4957 have_exit_callback |= (bool)ss->exit << ss->id; 4965 have_exit_callback |= (bool)ss->exit << ss->id;
4966 have_canfork_callback |= (bool)ss->can_fork << ss->id;
4958 4967
4959 /* At system boot, before all subsystems have been 4968 /* At system boot, before all subsystems have been
4960 * registered, no tasks have been forked, so we don't 4969 * registered, no tasks have been forked, so we don't
@@ -4993,6 +5002,8 @@ int __init cgroup_init_early(void)
4993 5002
4994 ss->id = i; 5003 ss->id = i;
4995 ss->name = cgroup_subsys_name[i]; 5004 ss->name = cgroup_subsys_name[i];
5005 if (!ss->legacy_name)
5006 ss->legacy_name = cgroup_subsys_name[i];
4996 5007
4997 if (ss->early_init) 5008 if (ss->early_init)
4998 cgroup_init_subsys(ss, true); 5009 cgroup_init_subsys(ss, true);
@@ -5136,9 +5147,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5136 continue; 5147 continue;
5137 5148
5138 seq_printf(m, "%d:", root->hierarchy_id); 5149 seq_printf(m, "%d:", root->hierarchy_id);
5139 for_each_subsys(ss, ssid) 5150 if (root != &cgrp_dfl_root)
5140 if (root->subsys_mask & (1 << ssid)) 5151 for_each_subsys(ss, ssid)
5141 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 5152 if (root->subsys_mask & (1 << ssid))
5153 seq_printf(m, "%s%s", count++ ? "," : "",
5154 ss->legacy_name);
5142 if (strlen(root->name)) 5155 if (strlen(root->name))
5143 seq_printf(m, "%sname=%s", count ? "," : "", 5156 seq_printf(m, "%sname=%s", count ? "," : "",
5144 root->name); 5157 root->name);
@@ -5178,7 +5191,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
5178 5191
5179 for_each_subsys(ss, i) 5192 for_each_subsys(ss, i)
5180 seq_printf(m, "%s\t%d\t%d\t%d\n", 5193 seq_printf(m, "%s\t%d\t%d\t%d\n",
5181 ss->name, ss->root->hierarchy_id, 5194 ss->legacy_name, ss->root->hierarchy_id,
5182 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 5195 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
5183 5196
5184 mutex_unlock(&cgroup_mutex); 5197 mutex_unlock(&cgroup_mutex);
@@ -5197,6 +5210,19 @@ static const struct file_operations proc_cgroupstats_operations = {
5197 .release = single_release, 5210 .release = single_release,
5198}; 5211};
5199 5212
5213static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
5214{
5215 if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
5216 return &ss_priv[i - CGROUP_CANFORK_START];
5217 return NULL;
5218}
5219
5220static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
5221{
5222 void **private = subsys_canfork_priv_p(ss_priv, i);
5223 return private ? *private : NULL;
5224}
5225
5200/** 5226/**
5201 * cgroup_fork - initialize cgroup related fields during copy_process() 5227 * cgroup_fork - initialize cgroup related fields during copy_process()
5202 * @child: pointer to task_struct of forking parent process. 5228 * @child: pointer to task_struct of forking parent process.
@@ -5212,6 +5238,57 @@ void cgroup_fork(struct task_struct *child)
5212} 5238}
5213 5239
5214/** 5240/**
5241 * cgroup_can_fork - called on a new task before the process is exposed
5242 * @child: the task in question.
5243 *
5244 * This calls the subsystem can_fork() callbacks. If the can_fork() callback
5245 * returns an error, the fork aborts with that error code. This allows for
5246 * a cgroup subsystem to conditionally allow or deny new forks.
5247 */
5248int cgroup_can_fork(struct task_struct *child,
5249 void *ss_priv[CGROUP_CANFORK_COUNT])
5250{
5251 struct cgroup_subsys *ss;
5252 int i, j, ret;
5253
5254 for_each_subsys_which(ss, i, &have_canfork_callback) {
5255 ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
5256 if (ret)
5257 goto out_revert;
5258 }
5259
5260 return 0;
5261
5262out_revert:
5263 for_each_subsys(ss, j) {
5264 if (j >= i)
5265 break;
5266 if (ss->cancel_fork)
5267 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
5268 }
5269
5270 return ret;
5271}
5272
5273/**
5274 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5275 * @child: the task in question
5276 *
5277 * This calls the cancel_fork() callbacks if a fork failed *after*
5278 * cgroup_can_fork() succeeded.
5279 */
5280void cgroup_cancel_fork(struct task_struct *child,
5281 void *ss_priv[CGROUP_CANFORK_COUNT])
5282{
5283 struct cgroup_subsys *ss;
5284 int i;
5285
5286 for_each_subsys(ss, i)
5287 if (ss->cancel_fork)
5288 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
5289}
5290
5291/**
5215 * cgroup_post_fork - called on a new task after adding it to the task list 5292 * cgroup_post_fork - called on a new task after adding it to the task list
5216 * @child: the task in question 5293 * @child: the task in question
5217 * 5294 *
@@ -5221,7 +5298,8 @@ void cgroup_fork(struct task_struct *child)
5221 * cgroup_task_iter_start() - to guarantee that the new task ends up on its 5298 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5222 * list. 5299 * list.
5223 */ 5300 */
5224void cgroup_post_fork(struct task_struct *child) 5301void cgroup_post_fork(struct task_struct *child,
5302 void *old_ss_priv[CGROUP_CANFORK_COUNT])
5225{ 5303{
5226 struct cgroup_subsys *ss; 5304 struct cgroup_subsys *ss;
5227 int i; 5305 int i;
@@ -5266,7 +5344,7 @@ void cgroup_post_fork(struct task_struct *child)
5266 * and addition to css_set. 5344 * and addition to css_set.
5267 */ 5345 */
5268 for_each_subsys_which(ss, i, &have_fork_callback) 5346 for_each_subsys_which(ss, i, &have_fork_callback)
5269 ss->fork(child); 5347 ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
5270} 5348}
5271 5349
5272/** 5350/**
@@ -5400,12 +5478,14 @@ static int __init cgroup_disable(char *str)
5400 continue; 5478 continue;
5401 5479
5402 for_each_subsys(ss, i) { 5480 for_each_subsys(ss, i) {
5403 if (!strcmp(token, ss->name)) { 5481 if (strcmp(token, ss->name) &&
5404 ss->disabled = 1; 5482 strcmp(token, ss->legacy_name))
5405 printk(KERN_INFO "Disabling %s control group" 5483 continue;
5406 " subsystem\n", ss->name); 5484
5407 break; 5485 ss->disabled = 1;
5408 } 5486 printk(KERN_INFO "Disabling %s control group subsystem\n",
5487 ss->name);
5488 break;
5409 } 5489 }
5410 } 5490 }
5411 return 1; 5491 return 1;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
203 * to do anything as freezer_attach() will put @task into the appropriate 203 * to do anything as freezer_attach() will put @task into the appropriate
204 * state. 204 * state.
205 */ 205 */
206static void freezer_fork(struct task_struct *task) 206static void freezer_fork(struct task_struct *task, void *private)
207{ 207{
208 struct freezer *freezer; 208 struct freezer *freezer;
209 209
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..806cd7693ac8
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,355 @@
1/*
2 * Process number limiting controller for cgroups.
3 *
4 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
5 * after a certain limit is reached.
6 *
7 * Since it is trivial to hit the task limit without hitting any kmemcg limits
8 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
9 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
10 * of the number of tasks in a cgroup.
11 *
12 * In order to use the `pids` controller, set the maximum number of tasks in
13 * pids.max (this is not available in the root cgroup for obvious reasons). The
14 * number of processes currently in the cgroup is given by pids.current.
15 * Organisational operations are not blocked by cgroup policies, so it is
16 * possible to have pids.current > pids.max. However, it is not possible to
17 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
18 * would cause a cgroup policy to be violated.
19 *
20 * To set a cgroup to have no limit, set pids.max to "max". This is the default
21 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
22 * stringent limit in the hierarchy is followed).
23 *
24 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
25 * a superset of parent/child/pids.current.
26 *
27 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
28 *
29 * This file is subject to the terms and conditions of version 2 of the GNU
30 * General Public License. See the file COPYING in the main directory of the
31 * Linux distribution for more details.
32 */
33
34#include <linux/kernel.h>
35#include <linux/threads.h>
36#include <linux/atomic.h>
37#include <linux/cgroup.h>
38#include <linux/slab.h>
39
40#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
41#define PIDS_MAX_STR "max"
42
/* Per-cgroup state of the pids controller. */
struct pids_cgroup {
	/* embedded css; first member, so css_pids() is effectively a cast */
	struct cgroup_subsys_state css;

	/*
	 * Use 64-bit types so that we can safely represent "max" as
	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
	 */
	atomic64_t counter;	/* pids.current: tasks charged to this subtree */
	int64_t limit;		/* pids.max: %PIDS_MAX means "no limit" */
};
53
54static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
55{
56 return container_of(css, struct pids_cgroup, css);
57}
58
59static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
60{
61 return css_pids(pids->css.parent);
62}
63
64static struct cgroup_subsys_state *
65pids_css_alloc(struct cgroup_subsys_state *parent)
66{
67 struct pids_cgroup *pids;
68
69 pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
70 if (!pids)
71 return ERR_PTR(-ENOMEM);
72
73 pids->limit = PIDS_MAX;
74 atomic64_set(&pids->counter, 0);
75 return &pids->css;
76}
77
/* Release the state allocated by pids_css_alloc(). */
static void pids_css_free(struct cgroup_subsys_state *css)
{
	struct pids_cgroup *pids = css_pids(css);

	kfree(pids);
}
82
83/**
84 * pids_cancel - uncharge the local pid count
85 * @pids: the pid cgroup state
86 * @num: the number of pids to cancel
87 *
88 * This function will WARN if the pid count goes under 0, because such a case is
89 * a bug in the pids controller proper.
90 */
91static void pids_cancel(struct pids_cgroup *pids, int num)
92{
93 /*
94 * A negative count (or overflow for that matter) is invalid,
95 * and indicates a bug in the `pids` controller proper.
96 */
97 WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
98}
99
/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 *
 * Walks from @pids up to the root, removing @num at every level.
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *walk = pids;

	while (walk) {
		pids_cancel(walk, num);
		walk = parent_pids(walk);
	}
}
112
113/**
114 * pids_charge - hierarchically charge the pid count
115 * @pids: the pid cgroup state
116 * @num: the number of pids to charge
117 *
118 * This function does *not* follow the pid limit set. It cannot fail and the new
119 * pid count may exceed the limit. This is only used for reverting failed
120 * attaches, where there is no other way out than violating the limit.
121 */
122static void pids_charge(struct pids_cgroup *pids, int num)
123{
124 struct pids_cgroup *p;
125
126 for (p = pids; p; p = parent_pids(p))
127 atomic64_add(num, &p->counter);
128}
129
130/**
131 * pids_try_charge - hierarchically try to charge the pid count
132 * @pids: the pid cgroup state
133 * @num: the number of pids to charge
134 *
135 * This function follows the set limit. It will fail if the charge would cause
136 * the new value to exceed the hierarchical limit. Returns 0 if the charge
137 * succeded, otherwise -EAGAIN.
138 */
139static int pids_try_charge(struct pids_cgroup *pids, int num)
140{
141 struct pids_cgroup *p, *q;
142
143 for (p = pids; p; p = parent_pids(p)) {
144 int64_t new = atomic64_add_return(num, &p->counter);
145
146 /*
147 * Since new is capped to the maximum number of pid_t, if
148 * p->limit is %PIDS_MAX then we know that this test will never
149 * fail.
150 */
151 if (new > p->limit)
152 goto revert;
153 }
154
155 return 0;
156
157revert:
158 for (q = pids; q != p; q = parent_pids(q))
159 pids_cancel(q, num);
160 pids_cancel(p, num);
161
162 return -EAGAIN;
163}
164
165static int pids_can_attach(struct cgroup_subsys_state *css,
166 struct cgroup_taskset *tset)
167{
168 struct pids_cgroup *pids = css_pids(css);
169 struct task_struct *task;
170
171 cgroup_taskset_for_each(task, tset) {
172 struct cgroup_subsys_state *old_css;
173 struct pids_cgroup *old_pids;
174
175 /*
176 * No need to pin @old_css between here and cancel_attach()
177 * because cgroup core protects it from being freed before
178 * the migration completes or fails.
179 */
180 old_css = task_css(task, pids_cgrp_id);
181 old_pids = css_pids(old_css);
182
183 pids_charge(pids, 1);
184 pids_uncharge(old_pids, 1);
185 }
186
187 return 0;
188}
189
190static void pids_cancel_attach(struct cgroup_subsys_state *css,
191 struct cgroup_taskset *tset)
192{
193 struct pids_cgroup *pids = css_pids(css);
194 struct task_struct *task;
195
196 cgroup_taskset_for_each(task, tset) {
197 struct cgroup_subsys_state *old_css;
198 struct pids_cgroup *old_pids;
199
200 old_css = task_css(task, pids_cgrp_id);
201 old_pids = css_pids(old_css);
202
203 pids_charge(old_pids, 1);
204 pids_uncharge(pids, 1);
205 }
206}
207
208static int pids_can_fork(struct task_struct *task, void **priv_p)
209{
210 struct cgroup_subsys_state *css;
211 struct pids_cgroup *pids;
212 int err;
213
214 /*
215 * Use the "current" task_css for the pids subsystem as the tentative
216 * css. It is possible we will charge the wrong hierarchy, in which
217 * case we will forcefully revert/reapply the charge on the right
218 * hierarchy after it is committed to the task proper.
219 */
220 css = task_get_css(current, pids_cgrp_id);
221 pids = css_pids(css);
222
223 err = pids_try_charge(pids, 1);
224 if (err)
225 goto err_css_put;
226
227 *priv_p = css;
228 return 0;
229
230err_css_put:
231 css_put(css);
232 return err;
233}
234
/* The fork failed: revert the charge taken in pids_can_fork(). */
static void pids_cancel_fork(struct task_struct *task, void *priv)
{
	struct cgroup_subsys_state *css = priv;

	pids_uncharge(css_pids(css), 1);
	css_put(css);
}
243
244static void pids_fork(struct task_struct *task, void *priv)
245{
246 struct cgroup_subsys_state *css;
247 struct cgroup_subsys_state *old_css = priv;
248 struct pids_cgroup *pids;
249 struct pids_cgroup *old_pids = css_pids(old_css);
250
251 css = task_get_css(task, pids_cgrp_id);
252 pids = css_pids(css);
253
254 /*
255 * If the association has changed, we have to revert and reapply the
256 * charge/uncharge on the wrong hierarchy to the current one. Since
257 * the association can only change due to an organisation event, its
258 * okay for us to ignore the limit in this case.
259 */
260 if (pids != old_pids) {
261 pids_uncharge(old_pids, 1);
262 pids_charge(pids, 1);
263 }
264
265 css_put(css);
266 css_put(old_css);
267}
268
/* A task died: drop its charge from the css it belonged to (@old_css). */
static void pids_exit(struct cgroup_subsys_state *css,
		      struct cgroup_subsys_state *old_css,
		      struct task_struct *task)
{
	pids_uncharge(css_pids(old_css), 1);
}
277
278static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
279 size_t nbytes, loff_t off)
280{
281 struct cgroup_subsys_state *css = of_css(of);
282 struct pids_cgroup *pids = css_pids(css);
283 int64_t limit;
284 int err;
285
286 buf = strstrip(buf);
287 if (!strcmp(buf, PIDS_MAX_STR)) {
288 limit = PIDS_MAX;
289 goto set_limit;
290 }
291
292 err = kstrtoll(buf, 0, &limit);
293 if (err)
294 return err;
295
296 if (limit < 0 || limit >= PIDS_MAX)
297 return -EINVAL;
298
299set_limit:
300 /*
301 * Limit updates don't need to be mutex'd, since it isn't
302 * critical that any racing fork()s follow the new limit.
303 */
304 pids->limit = limit;
305 return nbytes;
306}
307
308static int pids_max_show(struct seq_file *sf, void *v)
309{
310 struct cgroup_subsys_state *css = seq_css(sf);
311 struct pids_cgroup *pids = css_pids(css);
312 int64_t limit = pids->limit;
313
314 if (limit >= PIDS_MAX)
315 seq_printf(sf, "%s\n", PIDS_MAX_STR);
316 else
317 seq_printf(sf, "%lld\n", limit);
318
319 return 0;
320}
321
322static s64 pids_current_read(struct cgroup_subsys_state *css,
323 struct cftype *cft)
324{
325 struct pids_cgroup *pids = css_pids(css);
326
327 return atomic64_read(&pids->counter);
328}
329
/* Control files exposed by the pids controller. */
static struct cftype pids_files[] = {
	{
		/* maximum number of tasks; "max" means unlimited */
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		/* the root cgroup cannot be limited */
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		/* current number of tasks charged to this subtree */
		.name = "current",
		.read_s64 = pids_current_read,
	},
	{ }	/* terminate */
};
343
/*
 * Subsystem descriptor for the pids controller.  The same cftypes are
 * registered on both the legacy and the default (unified) hierarchies.
 */
struct cgroup_subsys pids_cgrp_subsys = {
	.css_alloc	= pids_css_alloc,
	.css_free	= pids_css_free,
	.can_attach	= pids_can_attach,
	.cancel_attach	= pids_cancel_attach,
	.can_fork	= pids_can_fork,
	.cancel_fork	= pids_cancel_fork,
	.fork		= pids_fork,
	.exit		= pids_exit,
	.legacy_cftypes	= pids_files,
	.dfl_cftypes	= pids_files,
};
diff --git a/kernel/fork.c b/kernel/fork.c
index 2b1a61cddc19..03aa2e6de7a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1246,6 +1246,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1246{ 1246{
1247 int retval; 1247 int retval;
1248 struct task_struct *p; 1248 struct task_struct *p;
1249 void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
1249 1250
1250 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1251 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1251 return ERR_PTR(-EINVAL); 1252 return ERR_PTR(-EINVAL);
@@ -1518,6 +1519,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1518 p->task_works = NULL; 1519 p->task_works = NULL;
1519 1520
1520 /* 1521 /*
1522 * Ensure that the cgroup subsystem policies allow the new process to be
 1523 * forked. It should be noted that the new process's css_set can be changed
1524 * between here and cgroup_post_fork() if an organisation operation is in
1525 * progress.
1526 */
1527 retval = cgroup_can_fork(p, cgrp_ss_priv);
1528 if (retval)
1529 goto bad_fork_free_pid;
1530
1531 /*
1521 * Make it visible to the rest of the system, but dont wake it up yet. 1532 * Make it visible to the rest of the system, but dont wake it up yet.
1522 * Need tasklist lock for parent etc handling! 1533 * Need tasklist lock for parent etc handling!
1523 */ 1534 */
@@ -1553,7 +1564,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1553 spin_unlock(&current->sighand->siglock); 1564 spin_unlock(&current->sighand->siglock);
1554 write_unlock_irq(&tasklist_lock); 1565 write_unlock_irq(&tasklist_lock);
1555 retval = -ERESTARTNOINTR; 1566 retval = -ERESTARTNOINTR;
1556 goto bad_fork_free_pid; 1567 goto bad_fork_cancel_cgroup;
1557 } 1568 }
1558 1569
1559 if (likely(p->pid)) { 1570 if (likely(p->pid)) {
@@ -1595,7 +1606,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1595 write_unlock_irq(&tasklist_lock); 1606 write_unlock_irq(&tasklist_lock);
1596 1607
1597 proc_fork_connector(p); 1608 proc_fork_connector(p);
1598 cgroup_post_fork(p); 1609 cgroup_post_fork(p, cgrp_ss_priv);
1599 if (clone_flags & CLONE_THREAD) 1610 if (clone_flags & CLONE_THREAD)
1600 threadgroup_change_end(current); 1611 threadgroup_change_end(current);
1601 perf_event_fork(p); 1612 perf_event_fork(p);
@@ -1605,6 +1616,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1605 1616
1606 return p; 1617 return p;
1607 1618
1619bad_fork_cancel_cgroup:
1620 cgroup_cancel_fork(p, cgrp_ss_priv);
1608bad_fork_free_pid: 1621bad_fork_free_pid:
1609 if (pid != &init_struct_pid) 1622 if (pid != &init_struct_pid)
1610 free_pid(pid); 1623 free_pid(pid);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b864ecee0e1..d8420c233ff7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8133,7 +8133,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
8133 sched_offline_group(tg); 8133 sched_offline_group(tg);
8134} 8134}
8135 8135
8136static void cpu_cgroup_fork(struct task_struct *task) 8136static void cpu_cgroup_fork(struct task_struct *task, void *private)
8137{ 8137{
8138 sched_move_task(task); 8138 sched_move_task(task);
8139} 8139}