author	Paul Menage <menage@google.com>	2007-10-19 02:39:33 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 14:53:36 -0400
commit	b4f48b6363c81ca743ef46943ef23fd72e60f679 (patch)
tree	40437b78e2d7a7d9d71e7bd63bc96e1ad02daa94 /kernel
parent	355e0c48b757b7fcc79ccb98fda8105ed37a1598 (diff)
Task Control Groups: add fork()/exit() hooks
This adds the necessary hooks to the fork() and exit() paths to ensure
that new children inherit their parent's cgroup assignments, and that
exiting processes release reference counts on their cgroups.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/cgroup.c	121
-rw-r--r--	kernel/exit.c	2
-rw-r--r--	kernel/fork.c	14
3 files changed, 135 insertions(+), 2 deletions(-)
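
The machinery added below dispatches to per-subsystem fork()/exit() methods through struct cgroup_subsys, and the need_forkexit_callback flag lets the loops be skipped entirely when no subsystem registers either hook. As a rough illustration of how a controller could consume these hooks, here is a minimal sketch of a hypothetical subsystem that keeps a live-task count. Only the callback signatures (ss->fork(ss, child) and ss->exit(ss, tsk)) are taken from this patch; the subsystem name, the counter, and the omitted registration details are invented for illustration.

/* Illustrative sketch only - not part of this patch. A hypothetical
 * subsystem that tracks how many live tasks it has seen, using the
 * fork/exit callbacks that this patch starts invoking. */
#include <linux/cgroup.h>
#include <asm/atomic.h>

static atomic_t example_task_count = ATOMIC_INIT(0);

static void example_fork(struct cgroup_subsys *ss, struct task_struct *child)
{
	/* Runs from cgroup_fork_callbacks(), before the child is on
	 * the tasklist, so nothing else can touch the task yet. */
	atomic_inc(&example_task_count);
}

static void example_exit(struct cgroup_subsys *ss, struct task_struct *tsk)
{
	/* Runs from cgroup_exit() in do_exit(), while the task can
	 * still handle notify_on_release(). */
	atomic_dec(&example_task_count);
}

/* The remaining cgroup_subsys fields (create/destroy, subsys_id, ...)
 * are omitted here; a real controller would have to fill them in. */
struct cgroup_subsys example_subsys = {
	.name = "example",
	.fork = example_fork,
	.exit = example_exit,
};
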
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f4c4dce9558f..7bb520aaf0a3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* Each task_struct has an embedded css_set, so the get/put
+ * operation simply takes a reference count on all the cgroups
+ * referenced by subsystems in this css_set. This can end up
+ * multiple-counting some cgroups, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * cgroup once would require taking a global lock to ensure that no
+ * subsystems moved between hierarchies while we were doing so.
+ *
+ * Possible TODO: decide at boot time based on the number of
+ * registered subsystems and the number of CPUs or NUMA nodes whether
+ * it's better for performance to ref-count every subsystem, or to
+ * take a global lock and only add one ref count to each hierarchy.
+ */
+static void get_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_inc(&cg->subsys[i]->cgroup->count);
+}
+
+static void put_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
 /*
  * There is one global cgroup mutex. We also require taking
  * task_lock() when dereferencing a task's cgroup subsys pointers.
@@ -1587,3 +1614,97 @@ int __init cgroup_init(void)
 out:
 	return err;
 }
+
+/**
+ * cgroup_fork - attach newly forked task to its parents cgroup.
+ * @tsk: pointer to task_struct of forking parent process.
+ *
+ * Description: A task inherits its parent's cgroup at fork().
+ *
+ * A pointer to the shared css_set was automatically copied in
+ * fork.c by dup_task_struct(). However, we ignore that copy, since
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * might no longer be a valid cgroup pointer. attach_task() might
+ * have already changed current->cgroup, allowing the previously
+ * referenced cgroup to be removed and freed.
+ *
+ * At the point that cgroup_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+	rcu_read_lock();
+	child->cgroups = rcu_dereference(current->cgroups);
+	get_css_set(&child->cgroups);
+	rcu_read_unlock();
+}
+
+/**
+ * cgroup_fork_callbacks - called on a new task very soon before
+ * adding it to the tasklist. No need to take any locks since no-one
+ * can be operating on this task
+ */
+void cgroup_fork_callbacks(struct task_struct *child)
+{
+	if (need_forkexit_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->fork)
+				ss->fork(ss, child);
+		}
+	}
+}
+
+/**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cgroup from @tsk and release it.
+ *
+ * Note that cgroups marked notify_on_release force every task in
+ * them to take the global cgroup_mutex mutex when exiting.
+ * This could impact scaling on very large systems. Be reluctant to
+ * use notify_on_release cgroups where very high task exit scaling
+ * is required on large systems.
+ *
+ * the_top_cgroup_hack:
+ *
+ *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
+ *
+ *    We call cgroup_exit() while the task is still competent to
+ *    handle notify_on_release(), then leave the task attached to the
+ *    root cgroup in each hierarchy for the remainder of its exit.
+ *
+ *    To do this properly, we would increment the reference count on
+ *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
+ *    code we would add a second cgroup function call, to drop that
+ *    reference. This would just create an unnecessary hot spot on
+ *    the top_cgroup reference count, to no avail.
+ *
+ *    Normally, holding a reference to a cgroup without bumping its
+ *    count is unsafe. The cgroup could go away, or someone could
+ *    attach us to a different cgroup, decrementing the count on
+ *    the first cgroup that we never incremented. But in this case,
+ *    top_cgroup isn't going away, and either task has PF_EXITING set,
+ *    which wards off any attach_task() attempts, or task is a failed
+ *    fork, never visible to attach_task.
+ *
+ */
+void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+{
+	int i;
+
+	if (run_callbacks && need_forkexit_callback) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit)
+				ss->exit(ss, tsk);
+		}
+	}
+	/* Reassign the task to the init_css_set. */
+	task_lock(tsk);
+	put_css_set(&tsk->cgroups);
+	tsk->cgroups = init_task.cgroups;
+	task_unlock(tsk);
+}
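
The get_css_set()/put_css_set() comment above notes that a cgroup can be counted more than once. A small sketch of why that happens and why it is harmless (the helper and the indices a/b are hypothetical, for illustration only): when two subsystem slots in a css_set belong to the same hierarchy, their css->cgroup pointers alias the same struct cgroup, so its count is bumped once per slot. Because the count is only ever tested as busy/not-busy, the inflation does not matter; making it exact would require the global lock the comment mentions.

/* Illustration only - shows the aliasing that get_css_set() above
 * tolerates. 'a' and 'b' stand for two subsystem indices that are
 * bound to the same hierarchy. */
static int counts_same_cgroup_twice(struct css_set *cg, int a, int b)
{
	/* When this holds, get_css_set() increments that one cgroup's
	 * ->count twice for a single task - harmless, since ->count
	 * is only a busy/not-busy indicator. */
	return cg->subsys[a]->cgroup == cg->subsys[b]->cgroup;
}
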
diff --git a/kernel/exit.c b/kernel/exit.c
index 2c704c86edb3..44ff6147556a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -32,6 +32,7 @@
 #include <linux/delayacct.h>
 #include <linux/freezer.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 #include <linux/posix-timers.h>
@@ -973,6 +974,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	check_stack_usage();
 	exit_thread();
 	cpuset_exit(tsk);
+	cgroup_exit(tsk, 1);
 	exit_keys(tsk);
 
 	if (group_dead && tsk->signal->leader)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2ce28f165e31..e7c181454dca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -30,6 +30,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
@@ -979,6 +980,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p = NULL;
+	int cgroup_callbacks_done = 0;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1088,12 +1090,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cpuset_fork(p);
+	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_copy(p->mempolicy);
 	if (IS_ERR(p->mempolicy)) {
 		retval = PTR_ERR(p->mempolicy);
 		p->mempolicy = NULL;
-		goto bad_fork_cleanup_cpuset;
+		goto bad_fork_cleanup_cgroup;
 	}
 	mpol_fix_fork_child_flag(p);
 #endif
@@ -1204,6 +1207,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
+	/* Now that the task is set up, run cgroup callbacks if
+	 * necessary. We need to run them before the task is visible
+	 * on the tasklist. */
+	cgroup_fork_callbacks(p);
+	cgroup_callbacks_done = 1;
+
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
@@ -1318,9 +1327,10 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_cgroup:
 #endif
 	cpuset_exit(p);
+	cgroup_exit(p, cgroup_callbacks_done);
 bad_fork_cleanup_delays_binfmt:
 	delayacct_tsk_free(p);
 	if (p->binfmt)
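
A note on the error path above: copy_process() passes cgroup_callbacks_done rather than a constant 1 to cgroup_exit(), so subsystem exit() callbacks run only if the matching fork() callbacks already ran, while the css_set reference taken in cgroup_fork() is dropped either way. A simplified, illustrative control-flow sketch (the failure predicates, the error code, and the function name are placeholders, not kernel code):

#include <linux/cgroup.h>
#include <linux/err.h>

/* Placeholder failure predicates, for illustration only. */
static int early_step_fails(void) { return 0; }
static int late_step_fails(void) { return 0; }

/* Simplified sketch of the copy_process() paths relevant to the
 * cgroup hooks added by this patch. */
static struct task_struct *copy_process_sketch(struct task_struct *p)
{
	int cgroup_callbacks_done = 0;

	cgroup_fork(p);			/* take css_set reference */

	if (early_step_fails())
		goto bad_fork;		/* ss->fork() never ran */

	cgroup_fork_callbacks(p);	/* run each ss->fork() */
	cgroup_callbacks_done = 1;

	if (late_step_fails())
		goto bad_fork;		/* ss->fork() did run */

	return p;

bad_fork:
	/* Runs ss->exit() only when cgroup_callbacks_done is set, and
	 * always drops the reference taken by cgroup_fork(). */
	cgroup_exit(p, cgroup_callbacks_done);
	return ERR_PTR(-EINVAL);
}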