cgroup: implement the PIDs subsystem

Adds a new single-purpose PIDs subsystem to limit the number of tasks that can be forked inside a cgroup. Essentially this is an implementation of RLIMIT_NPROC that applies to a cgroup rather than a process tree. However, it should be noted that organisational operations (adding and removing tasks from a PIDs hierarchy) will *not* be prevented. Rather, the number of tasks in the hierarchy cannot exceed the limit through forking. This is due to the fact that, in the unified hierarchy, attach cannot fail (and it is not possible for a task to overcome its PIDs cgroup policy limit by attaching to a child cgroup -- even if migrating mid-fork it must be able to fork in the parent first). PIDs are fundamentally a global resource, and it is possible to reach PID exhaustion inside a cgroup without hitting any reasonable kmemcg policy. Once you've hit PID exhaustion, you're only in a marginally better state than OOM. This subsystem allows PID exhaustion inside a cgroup to be prevented. Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> Signed-off-by: Tejun Heo <tj@kernel.org>
author: Aleksa Sarai <cyphar@cyphar.com> 2015-06-09 07:32:10 -0400
committer: Tejun Heo <tj@kernel.org> 2015-07-14 17:29:23 -0400
commit: 49b786ea146f69c371df18e81ce0a2d5839f865c (patch)
tree: 8e7abdd61fb2a8e5d3b7ffbf263fc36d8f9969f5
parent: 7e47682ea555e7c1edef1d8fd96e2aa4c12abe59 (diff)
5 files changed, 393 insertions, 0 deletions
diff --git a/CREDITS b/CREDITS
index 1d616640bbf6..4fcf9cd8544c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3219,6 +3219,11 @@ S: 69 rue Dunois
 S: 75013 Paris
 S: France
+N: Aleksa Sarai
+E: cyphar@cyphar.com
+W: https://www.cyphar.com/
+D: `pids` cgroup subsystem
 N: Dipankar Sarma
 E: dipankar@in.ibm.com
 D: RCU
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ec43bce7e1ea..1f36945fd23d 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -62,6 +62,11 @@ SUBSYS(hugetlb)
 * Subsystems that implement the can_fork() family of callbacks.
 */
 SUBSYS_TAG(CANFORK_START)
+#if IS_ENABLED(CONFIG_CGROUP_PIDS)
+SUBSYS(pids)
+#endif
 SUBSYS_TAG(CANFORK_END)
 /*
diff --git a/init/Kconfig b/init/Kconfig
index af09b4fb43d2..2184b34cbf73 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -955,6 +955,22 @@ config CGROUP_FREEZER
          Provides a way to freeze and unfreeze all tasks in a
          cgroup.
+config CGROUP_PIDS
+        bool "PIDs cgroup subsystem"
+        help
+          Provides enforcement of process number limits in the scope of a
+          cgroup. Any attempt to fork more processes than is allowed in the
+          cgroup will fail. PIDs are fundamentally a global resource because it
+          is fairly trivial to reach PID exhaustion before you reach even a
+          conservative kmemcg limit. As a result, it is possible to grind a
+          system to a halt without being limited by other cgroup policies. The
+          PIDs cgroup subsystem is designed to stop this from happening.
+          It should be noted that organisational operations (such as attaching
+          to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
+          since the PIDs limit only affects a process's ability to fork, not to
+          attach to a cgroup.
 config CGROUP_DEVICE
        bool "Device controller for cgroups"
        help
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..718fb8afab7a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..d75488824ae2
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,366 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+struct pids_cgroup {
+        struct cgroup_subsys_state      css;    /* embedded css; css_pids() maps it back with container_of(). NOTE(review): css_pids() is also applied to a possibly-NULL parent css, so this presumably has to stay the first member - confirm before reordering. */
+        /*
+         * Use 64-bit types so that we can safely represent "max" as
+         * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+         *
+         * @counter: number of tasks currently charged to this cgroup. The
+         *           charge helpers walk up through every ancestor, so the
+         *           value also covers tasks charged to descendants.
+         * @limit:   largest value pids_try_charge() lets @counter reach;
+         *           %PIDS_MAX means "no limit".
+         */
+        atomic64_t                      counter;
+        int64_t                         limit;
+};
+/**
+ * css_pids - map a generic css back to the pids_cgroup that embeds it
+ * @css: the css embedded in a struct pids_cgroup, or NULL
+ *
+ * Returns NULL for a NULL @css. parent_pids() feeds pids->css.parent straight
+ * into this helper and every hierarchy walk in this file stops on a NULL
+ * result, so the NULL case is handled explicitly here rather than relying on
+ * css being the first member of struct pids_cgroup.
+ */
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+        return css ? container_of(css, struct pids_cgroup, css) : NULL;
+}
+/*
+ * parent_pids - step from @pids to the pids_cgroup of its parent css.
+ * The hierarchy walks in this file stop once this yields a NULL pointer.
+ */
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+        struct cgroup_subsys_state *parent_css = pids->css.parent;
+        return css_pids(parent_css);
+}
+/*
+ * pids_css_alloc - allocate the pids controller state for a new cgroup.
+ * The new state starts with nothing charged and with no limit (%PIDS_MAX).
+ * @parent is not consulted. Returns the embedded css on success, or
+ * ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+        struct pids_cgroup *new_pids = kzalloc(sizeof(*new_pids), GFP_KERNEL);
+        if (new_pids == NULL)
+                return ERR_PTR(-ENOMEM);
+        atomic64_set(&new_pids->counter, 0);
+        new_pids->limit = PIDS_MAX;
+        return &new_pids->css;
+}
+/* pids_css_free - release the pids_cgroup that embeds @css. */
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+        struct pids_cgroup *pids = css_pids(css);
+        kfree(pids);
+}
+/**
+ * pids_cancel - drop @num pids from one cgroup's own counter
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * Only @pids itself is touched; ancestors are left alone. A counter that
+ * ends up negative means the controller's accounting is broken, so that
+ * case triggers a one-time WARN.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+        /* True when the counter is negative after the subtraction. */
+        bool went_negative = atomic64_add_negative(-num, &pids->counter);
+        WARN_ON_ONCE(went_negative);
+}
+/**
+ * pids_uncharge - drop @num pids from a cgroup and all of its ancestors
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+        struct pids_cgroup *level = pids;
+        while (level) {
+                pids_cancel(level, num);
+                level = parent_pids(level);
+        }
+}
+/**
+ * pids_charge - add @num pids to a cgroup and all of its ancestors
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * The configured limit is *not* consulted: this cannot fail, and the
+ * resulting count may exceed the limit. It is meant for organisational
+ * moves (attach, attach rollback, re-homing a charge after fork), where
+ * there is no way out other than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+        struct pids_cgroup *level = pids;
+        while (level) {
+                atomic64_add(num, &level->counter);
+                level = parent_pids(level);
+        }
+}
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the limit of @pids or of any of its ancestors.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN; on failure every
+ * counter that was already bumped is restored.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+        struct pids_cgroup *over_limit = NULL;
+        struct pids_cgroup *level;
+        for (level = pids; level; level = parent_pids(level)) {
+                /*
+                 * The counter is bumped first and checked afterwards. A
+                 * limit of %PIDS_MAX lies above any count the pid space
+                 * allows, so an unlimited level never trips this test.
+                 */
+                if (atomic64_add_return(num, &level->counter) > level->limit) {
+                        over_limit = level;
+                        break;
+                }
+        }
+        if (!over_limit)
+                return 0;
+        /* Undo every level charged so far, the failing one included. */
+        for (level = pids; ; level = parent_pids(level)) {
+                pids_cancel(level, num);
+                if (level == over_limit)
+                        break;
+        }
+        return -EAGAIN;
+}
+/*
+ * pids_can_attach - move the charge of every task in @tset from the cgroup
+ * it currently belongs to over to the destination @css.
+ *
+ * The limit is not enforced here (pids_charge() ignores it), so this always
+ * returns 0: organisational moves are never refused by the pids controller.
+ */
+static int pids_can_attach(struct cgroup_subsys_state *css,
+                           struct cgroup_taskset *tset)
+{
+        struct pids_cgroup *pids = css_pids(css);
+        struct task_struct *task;
+        cgroup_taskset_for_each(task, tset) {
+                struct cgroup_subsys_state *old_css;
+                struct pids_cgroup *old_pids;
+                /*
+                 * Deliberately do not pin @old_css. ->attach() has no way
+                 * to find the source css again in order to drop such a
+                 * reference (by then task_css() names the destination), so
+                 * a get here could never be paired with the right put. The
+                 * cgroup core is relied upon to keep the source css alive
+                 * until the migration completes or is cancelled.
+                 */
+                old_css = task_css(task, pids_cgrp_id);
+                old_pids = css_pids(old_css);
+                pids_charge(pids, 1);
+                pids_uncharge(old_pids, 1);
+        }
+        return 0;
+}
+/*
+ * pids_cancel_attach - undo pids_can_attach() for a migration that failed:
+ * give the charge back to each task's current (source) cgroup and take it
+ * away from the destination @css again.
+ */
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+                               struct cgroup_taskset *tset)
+{
+        struct pids_cgroup *pids = css_pids(css);
+        struct task_struct *task;
+        cgroup_taskset_for_each(task, tset) {
+                struct cgroup_subsys_state *old_css;
+                struct pids_cgroup *old_pids;
+                old_css = task_css(task, pids_cgrp_id);
+                old_pids = css_pids(old_css);
+                pids_charge(old_pids, 1);
+                pids_uncharge(pids, 1);
+        }
+}
+/*
+ * pids_attach - completion hook for a successful migration.
+ *
+ * Intentionally empty: the charge was already moved in pids_can_attach() and
+ * no css reference is held across the migration, so there is nothing to
+ * release. The function is kept so that the ->attach slot in
+ * pids_cgrp_subsys still resolves.
+ */
+static void pids_attach(struct cgroup_subsys_state *css,
+                        struct cgroup_taskset *tset)
+{
+}
+/*
+ * pids_can_fork - reserve one pid for the child that is about to be created.
+ *
+ * The charge is made against the pids cgroup of "current" (@task is not
+ * used). On success the pinned css is handed back through @priv_p for
+ * pids_fork() / pids_cancel_fork() to release, and 0 is returned. If the
+ * hierarchy is at its limit the css reference is dropped again and the
+ * error from pids_try_charge() (-EAGAIN) is returned.
+ */
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+        struct cgroup_subsys_state *css;
+        int err;
+        /*
+         * "current" may be moved to another cgroup before the fork is
+         * committed, in which case the wrong hierarchy gets charged here.
+         * pids_fork() notices that and forcefully moves the charge to the
+         * hierarchy the child really ended up in.
+         */
+        css = task_get_css(current, pids_cgrp_id);
+        err = pids_try_charge(css_pids(css), 1);
+        if (err) {
+                css_put(css);
+                return err;
+        }
+        *priv_p = css;
+        return 0;
+}
+/*
+ * pids_cancel_fork - the fork was abandoned: return the pid reserved by
+ * pids_can_fork() and drop the css reference it stored in @priv.
+ */
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+        struct cgroup_subsys_state *charged_css = priv;
+        pids_uncharge(css_pids(charged_css), 1);
+        css_put(charged_css);
+}
+/*
+ * pids_fork - commit the charge taken in pids_can_fork() for the new @task.
+ *
+ * @priv is the css that was charged tentatively. Both that reference and the
+ * one taken here on the child's actual css are dropped before returning.
+ */
+static void pids_fork(struct task_struct *task, void *priv)
+{
+        struct cgroup_subsys_state *charged_css = priv;
+        struct cgroup_subsys_state *actual_css = task_get_css(task, pids_cgrp_id);
+        struct pids_cgroup *charged = css_pids(charged_css);
+        struct pids_cgroup *actual = css_pids(actual_css);
+        /*
+         * If the association has changed, the tentative charge sits on the
+         * wrong hierarchy and has to be moved to the current one. Since the
+         * association can only change due to an organisation event, it's
+         * okay for us to ignore the limit in this case.
+         */
+        if (actual != charged) {
+                pids_uncharge(charged, 1);
+                pids_charge(actual, 1);
+        }
+        css_put(actual_css);
+        css_put(charged_css);
+}
+/*
+ * pids_exit - release the pid that was charged for an exiting task.
+ * The uncharge is applied to @old_css and its ancestors; @css and @task are
+ * not used.
+ */
+static void pids_exit(struct cgroup_subsys_state *css,
+                      struct cgroup_subsys_state *old_css,
+                      struct task_struct *task)
+{
+        pids_uncharge(css_pids(old_css), 1);
+}
+/*
+ * pids_max_write - write handler for the "max" control file.
+ *
+ * Accepts either the literal string "max" (no limit) or an integer in the
+ * range [0, PIDS_MAX). Surrounding whitespace is stripped first. Returns
+ * @nbytes on success, the kstrtoll() error for unparsable input, or -EINVAL
+ * for an out-of-range number.
+ */
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+                              size_t nbytes, loff_t off)
+{
+        struct pids_cgroup *pids = css_pids(of_css(of));
+        int64_t limit;
+        int err;
+        buf = strstrip(buf);
+        if (strcmp(buf, PIDS_MAX_STR) == 0) {
+                limit = PIDS_MAX;
+        } else {
+                err = kstrtoll(buf, 0, &limit);
+                if (err)
+                        return err;
+                if (limit < 0 || limit >= PIDS_MAX)
+                        return -EINVAL;
+        }
+        /*
+         * Limit updates don't need to be mutex'd, since it isn't
+         * critical that any racing fork()s follow the new limit.
+         */
+        pids->limit = limit;
+        return nbytes;
+}
+/*
+ * pids_max_show - read handler for the "max" control file.
+ * Prints the numeric limit, or the string "max" when no limit is set.
+ * Always returns 0.
+ */
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+        int64_t limit = css_pids(seq_css(sf))->limit;
+        if (limit < PIDS_MAX) {
+                seq_printf(sf, "%lld\n", limit);
+                return 0;
+        }
+        seq_printf(sf, "%s\n", PIDS_MAX_STR);
+        return 0;
+}
+/*
+ * pids_current_read - read handler for the "current" control file.
+ * Returns the present value of this cgroup's pid counter; @cft is unused.
+ */
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+                             struct cftype *cft)
+{
+        return atomic64_read(&css_pids(css)->counter);
+}
+/* Control files exported by the pids controller. */
+static struct cftype pids_files[] = {
+        {
+                /* Limit: readable and writable, absent on the root cgroup. */
+                .name = "max",
+                .flags = CFTYPE_NOT_ON_ROOT,
+                .seq_show = pids_max_show,
+                .write = pids_max_write,
+        },
+        {
+                /* Current hierarchical pid count; no write handler is set. */
+                .name = "current",
+                .read_s64 = pids_current_read,
+        },
+        { }     /* terminate */
+};
+/* Callback and control-file table for the pids controller. */
+struct cgroup_subsys pids_cgrp_subsys = {
+        /* css lifetime */
+        .css_alloc      = pids_css_alloc,
+        .css_free       = pids_css_free,
+        /* task migration between cgroups */
+        .can_attach     = pids_can_attach,
+        .cancel_attach  = pids_cancel_attach,
+        .attach         = pids_attach,
+        /* task creation and exit */
+        .can_fork       = pids_can_fork,
+        .cancel_fork    = pids_cancel_fork,
+        .fork           = pids_fork,
+        .exit           = pids_exit,
+        /* the same control files are used on both hierarchy types */
+        .dfl_cftypes    = pids_files,
+        .legacy_cftypes = pids_files,
+};
author	Aleksa Sarai <cyphar@cyphar.com>	2015-06-09 07:32:10 -0400
committer	Tejun Heo <tj@kernel.org>	2015-07-14 17:29:23 -0400
commit	49b786ea146f69c371df18e81ce0a2d5839f865c (patch)
tree	8e7abdd61fb2a8e5d3b7ffbf263fc36d8f9969f5
parent	7e47682ea555e7c1edef1d8fd96e2aa4c12abe59 (diff)

diff --git a/CREDITS b/CREDITS index 1d616640bbf6..4fcf9cd8544c 100644 --- a/CREDITS +++ b/CREDITS
@@ -3219,6 +3219,11 @@ S: 69 rue Dunois
3219	S: 75013 Paris	3219	S: 75013 Paris
3220	S: France	3220	S: France
3221		3221
		3222	N: Aleksa Sarai
		3223	E: cyphar@cyphar.com
		3224	W: https://www.cyphar.com/
		3225	D: `pids` cgroup subsystem
		3226
3222	N: Dipankar Sarma	3227	N: Dipankar Sarma
3223	E: dipankar@in.ibm.com	3228	E: dipankar@in.ibm.com
3224	D: RCU	3229	D: RCU


diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ec43bce7e1ea..1f36945fd23d 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h
@@ -62,6 +62,11 @@ SUBSYS(hugetlb)
62	* Subsystems that implement the can_fork() family of callbacks.	62	* Subsystems that implement the can_fork() family of callbacks.
63	*/	63	*/
64	SUBSYS_TAG(CANFORK_START)	64	SUBSYS_TAG(CANFORK_START)
		65
		66	#if IS_ENABLED(CONFIG_CGROUP_PIDS)
		67	SUBSYS(pids)
		68	#endif
		69
65	SUBSYS_TAG(CANFORK_END)	70	SUBSYS_TAG(CANFORK_END)
66		71
67	/*	72	/*


diff --git a/init/Kconfig b/init/Kconfig index af09b4fb43d2..2184b34cbf73 100644 --- a/init/Kconfig +++ b/init/Kconfig
@@ -955,6 +955,22 @@ config CGROUP_FREEZER
955	Provides a way to freeze and unfreeze all tasks in a	955	Provides a way to freeze and unfreeze all tasks in a
956	cgroup.	956	cgroup.
957		957
		958	config CGROUP_PIDS
		959	bool "PIDs cgroup subsystem"
		960	help
		961	Provides enforcement of process number limits in the scope of a
		962	cgroup. Any attempt to fork more processes than is allowed in the
		963	cgroup will fail. PIDs are fundamentally a global resource because it
		964	is fairly trivial to reach PID exhaustion before you reach even a
		965	conservative kmemcg limit. As a result, it is possible to grind a
		966	system to a halt without being limited by other cgroup policies. The
		966	system to halt without being limited by other cgroup policies. The
		967	PIDs cgroup subsystem is designed to stop this from happening.
		968
		969	It should be noted that organisational operations (such as attaching
		970	to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
		971	since the PIDs limit only affects a process's ability to fork, not to
		972	attach to a cgroup.
		973
958	config CGROUP_DEVICE	974	config CGROUP_DEVICE
959	bool "Device controller for cgroups"	975	bool "Device controller for cgroups"
960	help	976	help


diff --git a/kernel/Makefile b/kernel/Makefile index 43c4c920f30a..718fb8afab7a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
55	obj-$(CONFIG_COMPAT) += compat.o	55	obj-$(CONFIG_COMPAT) += compat.o
56	obj-$(CONFIG_CGROUPS) += cgroup.o	56	obj-$(CONFIG_CGROUPS) += cgroup.o
57	obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o	57	obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
		58	obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
58	obj-$(CONFIG_CPUSETS) += cpuset.o	59	obj-$(CONFIG_CPUSETS) += cpuset.o
59	obj-$(CONFIG_UTS_NS) += utsname.o	60	obj-$(CONFIG_UTS_NS) += utsname.o
60	obj-$(CONFIG_USER_NS) += user_namespace.o	61	obj-$(CONFIG_USER_NS) += user_namespace.o


diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c new file mode 100644 index 000000000000..d75488824ae2 --- /dev/null +++ b/kernel/cgroup_pids.c
@@ -0,0 +1,366 @@
		1	/*
		2	* Process number limiting controller for cgroups.
		3	*
		4	* Used to allow a cgroup hierarchy to stop any new processes from fork()ing
		5	* after a certain limit is reached.
		6	*
		7	* Since it is trivial to hit the task limit without hitting any kmemcg limits
		8	* in place, PIDs are a fundamental resource. As such, PID exhaustion must be
		9	* preventable in the scope of a cgroup hierarchy by allowing resource limiting
		10	* of the number of tasks in a cgroup.
		11	*
		12	* In order to use the `pids` controller, set the maximum number of tasks in
		13	* pids.max (this is not available in the root cgroup for obvious reasons). The
		14	* number of processes currently in the cgroup is given by pids.current.
		15	* Organisational operations are not blocked by cgroup policies, so it is
		16	* possible to have pids.current > pids.max. However, it is not possible to
		17	* violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
		18	* would cause a cgroup policy to be violated.
		19	*
		20	* To set a cgroup to have no limit, set pids.max to "max". This is the default
		21	* for all new cgroups (N.B. that PID limits are hierarchical, so the most
		22	* stringent limit in the hierarchy is followed).
		23	*
		24	* pids.current tracks all child cgroup hierarchies, so parent/pids.current is
		25	* a superset of parent/child/pids.current.
		26	*
		27	* Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
		28	*
		29	* This file is subject to the terms and conditions of version 2 of the GNU
		30	* General Public License. See the file COPYING in the main directory of the
		31	* Linux distribution for more details.
		32	*/
		33
		34	#include <linux/kernel.h>
		35	#include <linux/threads.h>
		36	#include <linux/atomic.h>
		37	#include <linux/cgroup.h>
		38	#include <linux/slab.h>
		39
		40	#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
		41	#define PIDS_MAX_STR "max"
		42
		43	struct pids_cgroup {
		44	struct cgroup_subsys_state css;
		45
		46	/*
		47	* Use 64-bit types so that we can safely represent "max" as
		48	* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
		49	*/
		50	atomic64_t counter;
		51	int64_t limit;
		52	};
		53
		54	static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
		55	{
		56	return container_of(css, struct pids_cgroup, css);
		57	}
		58
		59	static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
		60	{
		61	return css_pids(pids->css.parent);
		62	}
		63
		64	static struct cgroup_subsys_state *
		65	pids_css_alloc(struct cgroup_subsys_state *parent)
		66	{
		67	struct pids_cgroup *pids;
		68
		69	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
		70	if (!pids)
		71	return ERR_PTR(-ENOMEM);
		72
		73	pids->limit = PIDS_MAX;
		74	atomic64_set(&pids->counter, 0);
		75	return &pids->css;
		76	}
		77
		78	static void pids_css_free(struct cgroup_subsys_state *css)
		79	{
		80	kfree(css_pids(css));
		81	}
		82
		83	/**
		84	* pids_cancel - uncharge the local pid count
		85	* @pids: the pid cgroup state
		86	* @num: the number of pids to cancel
		87	*
		88	* This function will WARN if the pid count goes under 0, because such a case is
		89	* a bug in the pids controller proper.
		90	*/
		91	static void pids_cancel(struct pids_cgroup *pids, int num)
		92	{
		93	/*
		94	* A negative count (or overflow for that matter) is invalid,
		95	* and indicates a bug in the `pids` controller proper.
		96	*/
		97	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
		98	}
		99
		100	/**
		101	* pids_uncharge - hierarchically uncharge the pid count
		102	* @pids: the pid cgroup state
		103	* @num: the number of pids to uncharge
		104	*/
		105	static void pids_uncharge(struct pids_cgroup *pids, int num)
		106	{
		107	struct pids_cgroup *p;
		108
		109	for (p = pids; p; p = parent_pids(p))
		110	pids_cancel(p, num);
		111	}
		112
		113	/**
		114	* pids_charge - hierarchically charge the pid count
		115	* @pids: the pid cgroup state
		116	* @num: the number of pids to charge
		117	*
		118	* This function does not follow the pid limit set. It cannot fail and the new
		119	* pid count may exceed the limit. This is only used for reverting failed
		120	* attaches, where there is no other way out than violating the limit.
		121	*/
		122	static void pids_charge(struct pids_cgroup *pids, int num)
		123	{
		124	struct pids_cgroup *p;
		125
		126	for (p = pids; p; p = parent_pids(p))
		127	atomic64_add(num, &p->counter);
		128	}
		129
		130	/**
		131	* pids_try_charge - hierarchically try to charge the pid count
		132	* @pids: the pid cgroup state
		133	* @num: the number of pids to charge
		134	*
		135	* This function follows the set limit. It will fail if the charge would cause
		136	* the new value to exceed the hierarchical limit. Returns 0 if the charge
		137	* succeeded, otherwise -EAGAIN.
		138	*/
		139	static int pids_try_charge(struct pids_cgroup *pids, int num)
		140	{
		141	struct pids_cgroup *p, *q;
		142
		143	for (p = pids; p; p = parent_pids(p)) {
		144	int64_t new = atomic64_add_return(num, &p->counter);
		145
		146	/*
		147	* Since new is capped to the maximum number of pid_t, if
		148	* p->limit is %PIDS_MAX then we know that this test will never
		149	* fail.
		150	*/
		151	if (new > p->limit)
		152	goto revert;
		153	}
		154
		155	return 0;
		156
		157	revert:
		158	for (q = pids; q != p; q = parent_pids(q))
		159	pids_cancel(q, num);
		160	pids_cancel(p, num);
		161
		162	return -EAGAIN;
		163	}
		164
		165	static int pids_can_attach(struct cgroup_subsys_state *css,
		166	struct cgroup_taskset *tset)
		167	{
		168	struct pids_cgroup *pids = css_pids(css);
		169	struct task_struct *task;
		170
		171	cgroup_taskset_for_each(task, tset) {
		172	struct cgroup_subsys_state *old_css;
		173	struct pids_cgroup *old_pids;
		174
		175	/*
		176	* Grab a ref to each task's css. We don't drop the ref until
		177	* we either fail and hit ->cancel_attach() or succeed and hit
		178	* ->attach().
		179	*/
		180	old_css = task_get_css(task, pids_cgrp_id);
		181	old_pids = css_pids(old_css);
		182
		183	pids_charge(pids, 1);
		184	pids_uncharge(old_pids, 1);
		185	}
		186
		187	return 0;
		188	}
		189
		190	static void pids_cancel_attach(struct cgroup_subsys_state *css,
		191	struct cgroup_taskset *tset)
		192	{
		193	struct pids_cgroup *pids = css_pids(css);
		194	struct task_struct *task;
		195
		196	cgroup_taskset_for_each(task, tset) {
		197	struct cgroup_subsys_state *old_css;
		198	struct pids_cgroup *old_pids;
		199
		200	old_css = task_css(task, pids_cgrp_id);
		201	old_pids = css_pids(old_css);
		202
		203	pids_charge(old_pids, 1);
		204	pids_uncharge(pids, 1);
		205	css_put(old_css);
		206	}
		207	}
		208
		209	static void pids_attach(struct cgroup_subsys_state *css,
		210	struct cgroup_taskset *tset)
		211	{
		212	struct task_struct *task;
		213
		214	cgroup_taskset_for_each(task, tset)
		215	css_put(task_css(task, pids_cgrp_id));
		216	}
		217
		218	static int pids_can_fork(struct task_struct *task, void **priv_p)
		219	{
		220	struct cgroup_subsys_state *css;
		221	struct pids_cgroup *pids;
		222	int err;
		223
		224	/*
		225	* Use the "current" task_css for the pids subsystem as the tentative
		226	* css. It is possible we will charge the wrong hierarchy, in which
		227	* case we will forcefully revert/reapply the charge on the right
		228	* hierarchy after it is committed to the task proper.
		229	*/
		230	css = task_get_css(current, pids_cgrp_id);
		231	pids = css_pids(css);
		232
		233	err = pids_try_charge(pids, 1);
		234	if (err)
		235	goto err_css_put;
		236
		237	*priv_p = css;
		238	return 0;
		239
		240	err_css_put:
		241	css_put(css);
		242	return err;
		243	}
		244
		245	static void pids_cancel_fork(struct task_struct *task, void *priv)
		246	{
		247	struct cgroup_subsys_state *css = priv;
		248	struct pids_cgroup *pids = css_pids(css);
		249
		250	pids_uncharge(pids, 1);
		251	css_put(css);
		252	}
		253
		254	static void pids_fork(struct task_struct *task, void *priv)
		255	{
		256	struct cgroup_subsys_state *css;
		257	struct cgroup_subsys_state *old_css = priv;
		258	struct pids_cgroup *pids;
		259	struct pids_cgroup *old_pids = css_pids(old_css);
		260
		261	css = task_get_css(task, pids_cgrp_id);
		262	pids = css_pids(css);
		263
		264	/*
		265	* If the association has changed, we have to revert and reapply the
		266	* charge/uncharge on the wrong hierarchy to the current one. Since
		267	* the association can only change due to an organisation event, it's
		268	* okay for us to ignore the limit in this case.
		269	*/
		270	if (pids != old_pids) {
		271	pids_uncharge(old_pids, 1);
		272	pids_charge(pids, 1);
		273	}
		274
		275	css_put(css);
		276	css_put(old_css);
		277	}
		278
		279	static void pids_exit(struct cgroup_subsys_state *css,
		280	struct cgroup_subsys_state *old_css,
		281	struct task_struct *task)
		282	{
		283	struct pids_cgroup *pids = css_pids(old_css);
		284
		285	pids_uncharge(pids, 1);
		286	}
		287
		288	static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
		289	size_t nbytes, loff_t off)
		290	{
		291	struct cgroup_subsys_state *css = of_css(of);
		292	struct pids_cgroup *pids = css_pids(css);
		293	int64_t limit;
		294	int err;
		295
		296	buf = strstrip(buf);
		297	if (!strcmp(buf, PIDS_MAX_STR)) {
		298	limit = PIDS_MAX;
		299	goto set_limit;
		300	}
		301
		302	err = kstrtoll(buf, 0, &limit);
		303	if (err)
		304	return err;
		305
		306	if (limit < 0 || limit >= PIDS_MAX)
		307	return -EINVAL;
		308
		309	set_limit:
		310	/*
		311	* Limit updates don't need to be mutex'd, since it isn't
		312	* critical that any racing fork()s follow the new limit.
		313	*/
		314	pids->limit = limit;
		315	return nbytes;
		316	}
		317
		318	static int pids_max_show(struct seq_file *sf, void *v)
		319	{
		320	struct cgroup_subsys_state *css = seq_css(sf);
		321	struct pids_cgroup *pids = css_pids(css);
		322	int64_t limit = pids->limit;
		323
		324	if (limit >= PIDS_MAX)
		325	seq_printf(sf, "%s\n", PIDS_MAX_STR);
		326	else
		327	seq_printf(sf, "%lld\n", limit);
		328
		329	return 0;
		330	}
		331
		332	static s64 pids_current_read(struct cgroup_subsys_state *css,
		333	struct cftype *cft)
		334	{
		335	struct pids_cgroup *pids = css_pids(css);
		336
		337	return atomic64_read(&pids->counter);
		338	}
		339
		340	static struct cftype pids_files[] = {
		341	{
		342	.name = "max",
		343	.write = pids_max_write,
		344	.seq_show = pids_max_show,
		345	.flags = CFTYPE_NOT_ON_ROOT,
		346	},
		347	{
		348	.name = "current",
		349	.read_s64 = pids_current_read,
		350	},
		351	{ } /* terminate */
		352	};
		353
		354	struct cgroup_subsys pids_cgrp_subsys = {
		355	.css_alloc = pids_css_alloc,
		356	.css_free = pids_css_free,
		357	.attach = pids_attach,
		358	.can_attach = pids_can_attach,
		359	.cancel_attach = pids_cancel_attach,
		360	.can_fork = pids_can_fork,
		361	.cancel_fork = pids_cancel_fork,
		362	.fork = pids_fork,
		363	.exit = pids_exit,
		364	.legacy_cftypes = pids_files,
		365	.dfl_cftypes = pids_files,
		366	};