 include/linux/cgroup.h |   6 +
 kernel/cgroup.c        | 121 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c          |   2 +
 kernel/fork.c          |  14 ++++++++++++--
 4 files changed, 141 insertions(+), 2 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e95143c884b2..792ad74be170 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -25,6 +25,9 @@ extern int cgroup_init(void);
 extern void cgroup_init_smp(void);
 extern void cgroup_lock(void);
 extern void cgroup_unlock(void);
+extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork_callbacks(struct task_struct *p);
+extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 
 /* Per-subsystem/per-cgroup state maintained by the system. */
 struct cgroup_subsys_state {
@@ -223,6 +226,9 @@ int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 static inline void cgroup_init_smp(void) {}
+static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
 static inline void cgroup_lock(void) {}
 static inline void cgroup_unlock(void) {}
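
The declarations above follow the usual compiled-out-feature idiom: real prototypes when CONFIG_CGROUPS is enabled, empty static inline stubs otherwise, so fork() and exit() can call the hooks unconditionally with no #ifdef at the call site. A minimal userspace sketch of that idiom, using illustrative names (HAVE_FEATURE, feature_fork) that are not part of the patch:

#include <stdio.h>

struct task { const char *name; };

#ifdef HAVE_FEATURE
/* Real implementation, analogous to the CONFIG_CGROUPS=y case. */
void feature_fork(struct task *p) { printf("hook ran for %s\n", p->name); }
#else
/* Compiled-out stub: the call vanishes entirely after inlining. */
static inline void feature_fork(struct task *p) {}
#endif

int main(void)
{
	struct task t = { "child" };
	feature_fork(&t);	/* call site is identical either way */
	return 0;
}

Built with -DHAVE_FEATURE the hook runs; without it the program still compiles and the call costs nothing, which is how the cgroup_fork()/cgroup_exit() stubs keep kernel/fork.c and kernel/exit.c free of #ifdefs.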
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f4c4dce9558f..7bb520aaf0a3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* Each task_struct has an embedded css_set, so the get/put
+ * operation simply takes a reference count on all the cgroups
+ * referenced by subsystems in this css_set. This can end up
+ * multiple-counting some cgroups, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * cgroup once would require taking a global lock to ensure that no
+ * subsystems moved between hierarchies while we were doing so.
+ *
+ * Possible TODO: decide at boot time based on the number of
+ * registered subsystems and the number of CPUs or NUMA nodes whether
+ * it's better for performance to ref-count every subsystem, or to
+ * take a global lock and only add one ref count to each hierarchy.
+ */
+static void get_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_inc(&cg->subsys[i]->cgroup->count);
+}
+
+static void put_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
 /*
  * There is one global cgroup mutex. We also require taking
  * task_lock() when dereferencing a task's cgroup subsys pointers.
@@ -1587,3 +1614,97 @@ int __init cgroup_init(void)
 out:
 	return err;
 }
+
+/**
+ * cgroup_fork - attach newly forked task to its parent's cgroup.
+ * @child: pointer to task_struct of the forked child process.
+ *
+ * Description: A task inherits its parent's cgroup at fork().
+ *
+ * A pointer to the shared css_set was automatically copied in
+ * fork.c by dup_task_struct(). However, we ignore that copy, since
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * might no longer be a valid cgroup pointer. attach_task() might
+ * have already changed current->cgroup, allowing the previously
+ * referenced cgroup to be removed and freed.
+ *
+ * At the point that cgroup_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+	rcu_read_lock();
+	child->cgroups = rcu_dereference(current->cgroups);
+	get_css_set(&child->cgroups);
+	rcu_read_unlock();
+}
+
+/**
+ * cgroup_fork_callbacks - called on a new task very soon before
+ * adding it to the tasklist. No need to take any locks since no-one
+ * can be operating on this task.
+ */
+void cgroup_fork_callbacks(struct task_struct *child)
+{
+	if (need_forkexit_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->fork)
+				ss->fork(ss, child);
+		}
+	}
+}
+
+/**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ * @run_callbacks: run exit callbacks?
+ *
+ * Description: Detach cgroup from @tsk and release it.
+ *
+ * Note that cgroups marked notify_on_release force every task in
+ * them to take the global cgroup_mutex mutex when exiting.
+ * This could impact scaling on very large systems. Be reluctant to
+ * use notify_on_release cgroups where very high task exit scaling
+ * is required on large systems.
+ *
+ * the_top_cgroup_hack:
+ *
+ * Set the exiting task's cgroup to the root cgroup (top_cgroup).
+ *
+ * We call cgroup_exit() while the task is still competent to
+ * handle notify_on_release(), then leave the task attached to the
+ * root cgroup in each hierarchy for the remainder of its exit.
+ *
+ * To do this properly, we would increment the reference count on
+ * top_cgroup, and near the very end of the kernel/exit.c do_exit()
+ * code we would add a second cgroup function call, to drop that
+ * reference. This would just create an unnecessary hot spot on
+ * the top_cgroup reference count, to no avail.
+ *
+ * Normally, holding a reference to a cgroup without bumping its
+ * count is unsafe. The cgroup could go away, or someone could
+ * attach us to a different cgroup, decrementing the count on
+ * the first cgroup that we never incremented. But in this case,
+ * top_cgroup isn't going away, and either the task has PF_EXITING
+ * set, which wards off any attach_task() attempts, or the task is
+ * a failed fork, never visible to attach_task().
+ */
+void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+{
+	int i;
+
+	if (run_callbacks && need_forkexit_callback) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit)
+				ss->exit(ss, tsk);
+		}
+	}
+	/* Reassign the task to the init_css_set. */
+	task_lock(tsk);
+	put_css_set(&tsk->cgroups);
+	tsk->cgroups = init_task.cgroups;
+	task_unlock(tsk);
+}
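
Taken together, the two functions above give a task's css_set a simple lifecycle: cgroup_fork() copies the parent's css_set under RCU and takes a reference, and cgroup_exit() drops that reference and parks the task on the init css_set, relying on the_top_cgroup_hack to avoid bumping the root's counter. A standalone sketch of that lifecycle with toy types (not the kernel's structures):

#include <stdio.h>

#define SUBSYS_COUNT 1	/* keep the toy minimal */

struct toy_cgroup { int count; const char *name; };
struct toy_css_set { struct toy_cgroup *cg[SUBSYS_COUNT]; };
struct toy_task { struct toy_css_set cgroups; };

static struct toy_cgroup top = { 0, "top" };		/* never freed */
static struct toy_task init_task = { { { &top } } };

static void toy_get(struct toy_css_set *s)
{
	for (int i = 0; i < SUBSYS_COUNT; i++)
		s->cg[i]->count++;
}

static void toy_put(struct toy_css_set *s)
{
	for (int i = 0; i < SUBSYS_COUNT; i++)
		s->cg[i]->count--;
}

/* Like cgroup_fork(): inherit the parent's css_set by struct copy. */
static void toy_fork(struct toy_task *child, struct toy_task *parent)
{
	child->cgroups = parent->cgroups;
	toy_get(&child->cgroups);
}

/* Like cgroup_exit(): drop the ref, fall back to the init css_set.
 * No reference is taken on "top"; it is static and never goes away,
 * mirroring the_top_cgroup_hack described above. */
static void toy_exit(struct toy_task *tsk)
{
	toy_put(&tsk->cgroups);
	tsk->cgroups = init_task.cgroups;
}

int main(void)
{
	struct toy_cgroup grp = { 1, "mygroup" };	/* parent's own ref */
	struct toy_task parent = { { { &grp } } };
	struct toy_task child;

	toy_fork(&child, &parent);
	printf("%s count after fork: %d\n", grp.name, grp.count);	/* 2 */
	toy_exit(&child);
	printf("%s count after exit: %d\n", grp.name, grp.count);	/* 1 */
	return 0;
}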
diff --git a/kernel/exit.c b/kernel/exit.c
index 2c704c86edb3..44ff6147556a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -32,6 +32,7 @@
 #include <linux/delayacct.h>
 #include <linux/freezer.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 #include <linux/posix-timers.h>
@@ -973,6 +974,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	check_stack_usage();
 	exit_thread();
 	cpuset_exit(tsk);
+	cgroup_exit(tsk, 1);
 	exit_keys(tsk);
 
 	if (group_dead && tsk->signal->leader)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2ce28f165e31..e7c181454dca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -30,6 +30,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
@@ -979,6 +980,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p = NULL;
+	int cgroup_callbacks_done = 0;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1088,12 +1090,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cpuset_fork(p);
+	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_copy(p->mempolicy);
 	if (IS_ERR(p->mempolicy)) {
 		retval = PTR_ERR(p->mempolicy);
 		p->mempolicy = NULL;
-		goto bad_fork_cleanup_cpuset;
+		goto bad_fork_cleanup_cgroup;
 	}
 	mpol_fix_fork_child_flag(p);
 #endif
@@ -1204,6 +1207,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
+	/* Now that the task is set up, run cgroup callbacks if
+	 * necessary. We need to run them before the task is visible
+	 * on the tasklist. */
+	cgroup_fork_callbacks(p);
+	cgroup_callbacks_done = 1;
+
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
@@ -1318,9 +1327,10 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_cgroup:
 #endif
 	cpuset_exit(p);
+	cgroup_exit(p, cgroup_callbacks_done);
 bad_fork_cleanup_delays_binfmt:
 	delayacct_tsk_free(p);
 	if (p->binfmt)
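
The cgroup_callbacks_done flag above solves an ordering problem in copy_process(): every failure jumps into a shared cleanup chain that ends up calling cgroup_exit(), but the subsystem exit callbacks must only run if the fork callbacks already did. A standalone sketch of that pattern (toy_copy_process and do_step are illustrative names, not kernel functions):

#include <stdbool.h>
#include <stdio.h>

static bool do_step(const char *name, bool fail)
{
	printf("%s: %s\n", name, fail ? "failed" : "ok");
	return !fail;
}

static int toy_copy_process(bool fail_early, bool fail_late)
{
	int callbacks_done = 0;	/* mirrors cgroup_callbacks_done */

	if (!do_step("early setup", fail_early))
		goto bad_fork;

	do_step("fork callbacks", false);
	callbacks_done = 1;

	if (!do_step("late setup", fail_late))
		goto bad_fork;
	return 0;

bad_fork:
	/* Mirrors cgroup_exit(p, cgroup_callbacks_done): exit callbacks
	 * run only for a child that already saw the fork callbacks. */
	printf("cleanup: run exit callbacks? %s\n\n",
	       callbacks_done ? "yes" : "no");
	return -1;
}

int main(void)
{
	toy_copy_process(true, false);	/* fails before the callbacks */
	toy_copy_process(false, true);	/* fails after the callbacks */
	return 0;
}

This asymmetry is also why cgroup_exit() takes run_callbacks as an argument: do_exit() always passes 1, while the fork error path passes whatever copy_process() recorded.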