aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAleksa Sarai <cyphar@cyphar.com>2015-06-09 07:32:10 -0400
committerTejun Heo <tj@kernel.org>2015-07-14 17:29:23 -0400
commit49b786ea146f69c371df18e81ce0a2d5839f865c (patch)
tree8e7abdd61fb2a8e5d3b7ffbf263fc36d8f9969f5
parent7e47682ea555e7c1edef1d8fd96e2aa4c12abe59 (diff)
cgroup: implement the PIDs subsystem
Adds a new single-purpose PIDs subsystem to limit the number of tasks that can be forked inside a cgroup. Essentially this is an implementation of RLIMIT_NPROC that applies to a cgroup rather than a process tree. However, it should be noted that organisational operations (adding and removing tasks from a PIDs hierarchy) will *not* be prevented. Rather, the number of tasks in the hierarchy cannot exceed the limit through forking. This is due to the fact that, in the unified hierarchy, attach cannot fail (and it is not possible for a task to overcome its PIDs cgroup policy limit by attaching to a child cgroup -- even if migrating mid-fork it must be able to fork in the parent first). PIDs are fundamentally a global resource, and it is possible to reach PID exhaustion inside a cgroup without hitting any reasonable kmemcg policy. Once you've hit PID exhaustion, you're only in a marginally better state than OOM. This subsystem allows PID exhaustion inside a cgroup to be prevented. Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> Signed-off-by: Tejun Heo <tj@kernel.org>
-rw-r--r--CREDITS5
-rw-r--r--include/linux/cgroup_subsys.h5
-rw-r--r--init/Kconfig16
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cgroup_pids.c366
5 files changed, 393 insertions, 0 deletions
diff --git a/CREDITS b/CREDITS
index 1d616640bbf6..4fcf9cd8544c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3219,6 +3219,11 @@ S: 69 rue Dunois
3219S: 75013 Paris 3219S: 75013 Paris
3220S: France 3220S: France
3221 3221
3222N: Aleksa Sarai
3223E: cyphar@cyphar.com
3224W: https://www.cyphar.com/
3225D: `pids` cgroup subsystem
3226
3222N: Dipankar Sarma 3227N: Dipankar Sarma
3223E: dipankar@in.ibm.com 3228E: dipankar@in.ibm.com
3224D: RCU 3229D: RCU
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ec43bce7e1ea..1f36945fd23d 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -62,6 +62,11 @@ SUBSYS(hugetlb)
62 * Subsystems that implement the can_fork() family of callbacks. 62 * Subsystems that implement the can_fork() family of callbacks.
63 */ 63 */
64SUBSYS_TAG(CANFORK_START) 64SUBSYS_TAG(CANFORK_START)
65
66#if IS_ENABLED(CONFIG_CGROUP_PIDS)
67SUBSYS(pids)
68#endif
69
65SUBSYS_TAG(CANFORK_END) 70SUBSYS_TAG(CANFORK_END)
66 71
67/* 72/*
diff --git a/init/Kconfig b/init/Kconfig
index af09b4fb43d2..2184b34cbf73 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -955,6 +955,22 @@ config CGROUP_FREEZER
955 Provides a way to freeze and unfreeze all tasks in a 955 Provides a way to freeze and unfreeze all tasks in a
956 cgroup. 956 cgroup.
957 957
958config CGROUP_PIDS
959 bool "PIDs cgroup subsystem"
960 help
961 Provides enforcement of process number limits in the scope of a
962 cgroup. Any attempt to fork more processes than is allowed in the
963 cgroup will fail. PIDs are fundamentally a global resource because it
964 is fairly trivial to reach PID exhaustion before you reach even a
965 conservative kmemcg limit. As a result, it is possible to grind a
966 system to a halt without being limited by other cgroup policies. The
967 PIDs cgroup subsystem is designed to stop this from happening.
968
969 It should be noted that organisational operations (such as attaching
970 to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
971 since the PIDs limit only affects a process's ability to fork, not to
972 attach to a cgroup.
973
958config CGROUP_DEVICE 974config CGROUP_DEVICE
959 bool "Device controller for cgroups" 975 bool "Device controller for cgroups"
960 help 976 help
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..718fb8afab7a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
55obj-$(CONFIG_COMPAT) += compat.o 55obj-$(CONFIG_COMPAT) += compat.o
56obj-$(CONFIG_CGROUPS) += cgroup.o 56obj-$(CONFIG_CGROUPS) += cgroup.o
57obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 57obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
58obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
58obj-$(CONFIG_CPUSETS) += cpuset.o 59obj-$(CONFIG_CPUSETS) += cpuset.o
59obj-$(CONFIG_UTS_NS) += utsname.o 60obj-$(CONFIG_UTS_NS) += utsname.o
60obj-$(CONFIG_USER_NS) += user_namespace.o 61obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..d75488824ae2
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,366 @@
1/*
2 * Process number limiting controller for cgroups.
3 *
4 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
5 * after a certain limit is reached.
6 *
7 * Since it is trivial to hit the task limit without hitting any kmemcg limits
8 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
9 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
10 * of the number of tasks in a cgroup.
11 *
12 * In order to use the `pids` controller, set the maximum number of tasks in
13 * pids.max (this is not available in the root cgroup for obvious reasons). The
14 * number of processes currently in the cgroup is given by pids.current.
15 * Organisational operations are not blocked by cgroup policies, so it is
16 * possible to have pids.current > pids.max. However, it is not possible to
17 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
18 * would cause a cgroup policy to be violated.
19 *
20 * To set a cgroup to have no limit, set pids.max to "max". This is the default
21 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
22 * stringent limit in the hierarchy is followed).
23 *
24 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
25 * a superset of parent/child/pids.current.
26 *
27 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
28 *
29 * This file is subject to the terms and conditions of version 2 of the GNU
30 * General Public License. See the file COPYING in the main directory of the
31 * Linux distribution for more details.
32 */
33
34#include <linux/kernel.h>
35#include <linux/threads.h>
36#include <linux/atomic.h>
37#include <linux/cgroup.h>
38#include <linux/slab.h>
39
40#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
41#define PIDS_MAX_STR "max"
42
43struct pids_cgroup {
44 struct cgroup_subsys_state css;
45
46 /*
47 * Use 64-bit types so that we can safely represent "max" as
48 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
49 */
50 atomic64_t counter;
51 int64_t limit;
52};
53
54static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
55{
56 return container_of(css, struct pids_cgroup, css);
57}
58
59static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
60{
61 return css_pids(pids->css.parent);
62}
63
64static struct cgroup_subsys_state *
65pids_css_alloc(struct cgroup_subsys_state *parent)
66{
67 struct pids_cgroup *pids;
68
69 pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
70 if (!pids)
71 return ERR_PTR(-ENOMEM);
72
73 pids->limit = PIDS_MAX;
74 atomic64_set(&pids->counter, 0);
75 return &pids->css;
76}
77
/* Release the pids_cgroup that embeds @css (allocated by pids_css_alloc()). */
static void pids_css_free(struct cgroup_subsys_state *css)
{
	struct pids_cgroup *pids = css_pids(css);

	kfree(pids);
}
82
83/**
84 * pids_cancel - uncharge the local pid count
85 * @pids: the pid cgroup state
86 * @num: the number of pids to cancel
87 *
88 * This function will WARN if the pid count goes under 0, because such a case is
89 * a bug in the pids controller proper.
90 */
91static void pids_cancel(struct pids_cgroup *pids, int num)
92{
93 /*
94 * A negative count (or overflow for that matter) is invalid,
95 * and indicates a bug in the `pids` controller proper.
96 */
97 WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
98}
99
/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 *
 * Removes @num from @pids and from every one of its ancestors.
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *level = pids;

	while (level) {
		pids_cancel(level, num);
		level = parent_pids(level);
	}
}
112
113/**
114 * pids_charge - hierarchically charge the pid count
115 * @pids: the pid cgroup state
116 * @num: the number of pids to charge
117 *
118 * This function does *not* follow the pid limit set. It cannot fail and the new
119 * pid count may exceed the limit. This is only used for reverting failed
120 * attaches, where there is no other way out than violating the limit.
121 */
122static void pids_charge(struct pids_cgroup *pids, int num)
123{
124 struct pids_cgroup *p;
125
126 for (p = pids; p; p = parent_pids(p))
127 atomic64_add(num, &p->counter);
128}
129
130/**
131 * pids_try_charge - hierarchically try to charge the pid count
132 * @pids: the pid cgroup state
133 * @num: the number of pids to charge
134 *
135 * This function follows the set limit. It will fail if the charge would cause
136 * the new value to exceed the hierarchical limit. Returns 0 if the charge
137 * succeded, otherwise -EAGAIN.
138 */
139static int pids_try_charge(struct pids_cgroup *pids, int num)
140{
141 struct pids_cgroup *p, *q;
142
143 for (p = pids; p; p = parent_pids(p)) {
144 int64_t new = atomic64_add_return(num, &p->counter);
145
146 /*
147 * Since new is capped to the maximum number of pid_t, if
148 * p->limit is %PIDS_MAX then we know that this test will never
149 * fail.
150 */
151 if (new > p->limit)
152 goto revert;
153 }
154
155 return 0;
156
157revert:
158 for (q = pids; q != p; q = parent_pids(q))
159 pids_cancel(q, num);
160 pids_cancel(p, num);
161
162 return -EAGAIN;
163}
164
165static int pids_can_attach(struct cgroup_subsys_state *css,
166 struct cgroup_taskset *tset)
167{
168 struct pids_cgroup *pids = css_pids(css);
169 struct task_struct *task;
170
171 cgroup_taskset_for_each(task, tset) {
172 struct cgroup_subsys_state *old_css;
173 struct pids_cgroup *old_pids;
174
175 /*
176 * Grab a ref to each task's css. We don't drop the ref until
177 * we either fail and hit ->cancel_attach() or succeed and hit
178 * ->attach().
179 */
180 old_css = task_get_css(task, pids_cgrp_id);
181 old_pids = css_pids(old_css);
182
183 pids_charge(pids, 1);
184 pids_uncharge(old_pids, 1);
185 }
186
187 return 0;
188}
189
190static void pids_cancel_attach(struct cgroup_subsys_state *css,
191 struct cgroup_taskset *tset)
192{
193 struct pids_cgroup *pids = css_pids(css);
194 struct task_struct *task;
195
196 cgroup_taskset_for_each(task, tset) {
197 struct cgroup_subsys_state *old_css;
198 struct pids_cgroup *old_pids;
199
200 old_css = task_css(task, pids_cgrp_id);
201 old_pids = css_pids(old_css);
202
203 pids_charge(old_pids, 1);
204 pids_uncharge(pids, 1);
205 css_put(old_css);
206 }
207}
208
209static void pids_attach(struct cgroup_subsys_state *css,
210 struct cgroup_taskset *tset)
211{
212 struct task_struct *task;
213
214 cgroup_taskset_for_each(task, tset)
215 css_put(task_css(task, pids_cgrp_id));
216}
217
218static int pids_can_fork(struct task_struct *task, void **priv_p)
219{
220 struct cgroup_subsys_state *css;
221 struct pids_cgroup *pids;
222 int err;
223
224 /*
225 * Use the "current" task_css for the pids subsystem as the tentative
226 * css. It is possible we will charge the wrong hierarchy, in which
227 * case we will forcefully revert/reapply the charge on the right
228 * hierarchy after it is committed to the task proper.
229 */
230 css = task_get_css(current, pids_cgrp_id);
231 pids = css_pids(css);
232
233 err = pids_try_charge(pids, 1);
234 if (err)
235 goto err_css_put;
236
237 *priv_p = css;
238 return 0;
239
240err_css_put:
241 css_put(css);
242 return err;
243}
244
/**
 * pids_cancel_fork - undo pids_can_fork() for a fork that did not complete
 * @task: the task that was being created (not consulted here)
 * @priv: the css stored by pids_can_fork()
 *
 * Returns the pid to the hierarchy that was charged and drops the css
 * reference taken in pids_can_fork().
 */
static void pids_cancel_fork(struct task_struct *task, void *priv)
{
	struct cgroup_subsys_state *charged_css = priv;

	pids_uncharge(css_pids(charged_css), 1);
	css_put(charged_css);
}
253
254static void pids_fork(struct task_struct *task, void *priv)
255{
256 struct cgroup_subsys_state *css;
257 struct cgroup_subsys_state *old_css = priv;
258 struct pids_cgroup *pids;
259 struct pids_cgroup *old_pids = css_pids(old_css);
260
261 css = task_get_css(task, pids_cgrp_id);
262 pids = css_pids(css);
263
264 /*
265 * If the association has changed, we have to revert and reapply the
266 * charge/uncharge on the wrong hierarchy to the current one. Since
267 * the association can only change due to an organisation event, its
268 * okay for us to ignore the limit in this case.
269 */
270 if (pids != old_pids) {
271 pids_uncharge(old_pids, 1);
272 pids_charge(pids, 1);
273 }
274
275 css_put(css);
276 css_put(old_css);
277}
278
/*
 * Exit hook: give back the pid that was charged for @task by uncharging the
 * hierarchy rooted at @old_css. @css and @task are not consulted.
 */
static void pids_exit(struct cgroup_subsys_state *css,
		      struct cgroup_subsys_state *old_css,
		      struct task_struct *task)
{
	pids_uncharge(css_pids(old_css), 1);
}
287
288static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
289 size_t nbytes, loff_t off)
290{
291 struct cgroup_subsys_state *css = of_css(of);
292 struct pids_cgroup *pids = css_pids(css);
293 int64_t limit;
294 int err;
295
296 buf = strstrip(buf);
297 if (!strcmp(buf, PIDS_MAX_STR)) {
298 limit = PIDS_MAX;
299 goto set_limit;
300 }
301
302 err = kstrtoll(buf, 0, &limit);
303 if (err)
304 return err;
305
306 if (limit < 0 || limit >= PIDS_MAX)
307 return -EINVAL;
308
309set_limit:
310 /*
311 * Limit updates don't need to be mutex'd, since it isn't
312 * critical that any racing fork()s follow the new limit.
313 */
314 pids->limit = limit;
315 return nbytes;
316}
317
318static int pids_max_show(struct seq_file *sf, void *v)
319{
320 struct cgroup_subsys_state *css = seq_css(sf);
321 struct pids_cgroup *pids = css_pids(css);
322 int64_t limit = pids->limit;
323
324 if (limit >= PIDS_MAX)
325 seq_printf(sf, "%s\n", PIDS_MAX_STR);
326 else
327 seq_printf(sf, "%lld\n", limit);
328
329 return 0;
330}
331
332static s64 pids_current_read(struct cgroup_subsys_state *css,
333 struct cftype *cft)
334{
335 struct pids_cgroup *pids = css_pids(css);
336
337 return atomic64_read(&pids->counter);
338}
339
340static struct cftype pids_files[] = {
341 {
342 .name = "max",
343 .write = pids_max_write,
344 .seq_show = pids_max_show,
345 .flags = CFTYPE_NOT_ON_ROOT,
346 },
347 {
348 .name = "current",
349 .read_s64 = pids_current_read,
350 },
351 { } /* terminate */
352};
353
354struct cgroup_subsys pids_cgrp_subsys = {
355 .css_alloc = pids_css_alloc,
356 .css_free = pids_css_free,
357 .attach = pids_attach,
358 .can_attach = pids_can_attach,
359 .cancel_attach = pids_cancel_attach,
360 .can_fork = pids_can_fork,
361 .cancel_fork = pids_cancel_fork,
362 .fork = pids_fork,
363 .exit = pids_exit,
364 .legacy_cftypes = pids_files,
365 .dfl_cftypes = pids_files,
366};