author     Paul Menage <menage@google.com>                       2007-10-19 02:39:33 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-10-19 14:53:36 -0400
commit     b4f48b6363c81ca743ef46943ef23fd72e60f679 (patch)
tree       40437b78e2d7a7d9d71e7bd63bc96e1ad02daa94
parent     355e0c48b757b7fcc79ccb98fda8105ed37a1598 (diff)
Task Control Groups: add fork()/exit() hooks
This adds the necessary hooks to the fork() and exit() paths to ensure
that new children inherit their parent's cgroup assignments, and that
exiting processes release reference counts on their cgroups.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/cgroup.h  |   6
-rw-r--r--  kernel/cgroup.c         | 121
-rw-r--r--  kernel/exit.c           |   2
-rw-r--r--  kernel/fork.c           |  14
4 files changed, 141 insertions(+), 2 deletions(-)
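
The hooks land at three points in the task lifecycle: cgroup_fork() early in
copy_process(), cgroup_fork_callbacks() just before the new task becomes
visible on the tasklist, and cgroup_exit() from do_exit(). The following is a
minimal, self-contained user-space sketch of that ordering; struct task_struct
is stubbed and the printf tracing is illustrative, not kernel code:

#include <stdio.h>

struct task_struct { int dummy; };	/* stub; opaque in this sketch */

static void cgroup_fork(struct task_struct *child)
{
	(void)child;
	printf("cgroup_fork: inherit parent's css_set, take refs\n");
}

static void cgroup_fork_callbacks(struct task_struct *child)
{
	(void)child;
	printf("cgroup_fork_callbacks: run per-subsystem ->fork hooks\n");
}

static void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	(void)tsk;
	printf("cgroup_exit: run ->exit hooks (%d), drop refs\n", run_callbacks);
}

int main(void)
{
	struct task_struct child = { 0 };

	/* copy_process(): early, right after cpuset_fork() */
	cgroup_fork(&child);
	/* copy_process(): late, just before tasklist insertion */
	cgroup_fork_callbacks(&child);
	/* do_exit(): after cpuset_exit() */
	cgroup_exit(&child, 1);
	return 0;
}
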
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e95143c884b2..792ad74be170 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -25,6 +25,9 @@ extern int cgroup_init(void);
 extern void cgroup_init_smp(void);
 extern void cgroup_lock(void);
 extern void cgroup_unlock(void);
+extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork_callbacks(struct task_struct *p);
+extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 
 /* Per-subsystem/per-cgroup state maintained by the system. */
 struct cgroup_subsys_state {
@@ -223,6 +226,9 @@ int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 static inline void cgroup_init_smp(void) {}
+static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
 static inline void cgroup_lock(void) {}
 static inline void cgroup_unlock(void) {}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f4c4dce9558f..7bb520aaf0a3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* Each task_struct has an embedded css_set, so the get/put
+ * operation simply takes a reference count on all the cgroups
+ * referenced by subsystems in this css_set. This can end up
+ * multiple-counting some cgroups, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * cgroup once would require taking a global lock to ensure that no
+ * subsystems moved between hierarchies while we were doing so.
+ *
+ * Possible TODO: decide at boot time based on the number of
+ * registered subsystems and the number of CPUs or NUMA nodes whether
+ * it's better for performance to ref-count every subsystem, or to
+ * take a global lock and only add one ref count to each hierarchy.
+ */
+static void get_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_inc(&cg->subsys[i]->cgroup->count);
+}
+
+static void put_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
 /*
  * There is one global cgroup mutex. We also require taking
  * task_lock() when dereferencing a task's cgroup subsys pointers.
@@ -1587,3 +1614,97 @@ int __init cgroup_init(void)
 out:
 	return err;
 }
+
+/**
+ * cgroup_fork - attach a newly forked task to its parent's cgroup.
+ * @child: pointer to task_struct of the new (child) task.
+ *
+ * Description: A task inherits its parent's cgroup at fork().
+ *
+ * A pointer to the shared css_set was automatically copied in
+ * fork.c by dup_task_struct(). However, we ignore that copy, since
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * it might no longer be a valid cgroup pointer. attach_task() might
+ * have already changed current->cgroup, allowing the previously
+ * referenced cgroup to be removed and freed.
+ *
+ * At the point that cgroup_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+	rcu_read_lock();
+	child->cgroups = rcu_dereference(current->cgroups);
+	get_css_set(&child->cgroups);
+	rcu_read_unlock();
+}
+
+/**
+ * cgroup_fork_callbacks - run the per-subsystem fork callbacks for a new task.
+ * @child: pointer to task_struct of the new task.
+ *
+ * Called on a new task very soon before adding it to the tasklist.
+ * No need to take any locks since no one can be operating on this
+ * task yet.
+ */
+void cgroup_fork_callbacks(struct task_struct *child)
+{
+	if (need_forkexit_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->fork)
+				ss->fork(ss, child);
+		}
+	}
+}
+
+/**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ * @run_callbacks: whether to run the per-subsystem exit callbacks
+ *
+ * Description: Detach cgroup from @tsk and release it.
+ *
+ * Note that cgroups marked notify_on_release force every task in
+ * them to take the global cgroup_mutex when exiting. This could
+ * impact scaling on very large systems. Be reluctant to use
+ * notify_on_release cgroups where very high task exit scaling
+ * is required on large systems.
+ *
+ * the_top_cgroup_hack:
+ *
+ * Set the exiting task's cgroup to the root cgroup (top_cgroup).
+ *
+ * We call cgroup_exit() while the task is still competent to
+ * handle notify_on_release(), then leave the task attached to the
+ * root cgroup in each hierarchy for the remainder of its exit.
+ *
+ * To do this properly, we would increment the reference count on
+ * top_cgroup, and near the very end of the kernel/exit.c do_exit()
+ * code we would add a second cgroup function call, to drop that
+ * reference. This would just create an unnecessary hot spot on
+ * the top_cgroup reference count, to no avail.
+ *
+ * Normally, holding a reference to a cgroup without bumping its
+ * count is unsafe. The cgroup could go away, or someone could
+ * attach us to a different cgroup, decrementing the count on
+ * the first cgroup that we never incremented. But in this case,
+ * top_cgroup isn't going away, and either the task has PF_EXITING
+ * set, which wards off any attach_task() attempts, or the task is a
+ * failed fork, never visible to attach_task().
+ */
+void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+{
+	int i;
+
+	if (run_callbacks && need_forkexit_callback) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit)
+				ss->exit(ss, tsk);
+		}
+	}
+	/* Reassign the task to the init_css_set. */
+	task_lock(tsk);
+	put_css_set(&tsk->cgroups);
+	tsk->cgroups = init_task.cgroups;
+	task_unlock(tsk);
+}
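
The comment above get_css_set() notes that the same cgroup may be counted once
per subsystem that references it. A small self-contained user-space model
makes that multiple counting concrete; the model_* names and the NSUBSYS value
are hypothetical, and C11 stdatomic stands in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

#define NSUBSYS 3	/* assumed number of subsystems for this model */

struct model_cgroup { atomic_int count; };
struct model_css_set { struct model_cgroup *subsys[NSUBSYS]; };

/* take one reference per subsystem slot, as get_css_set() does */
static void model_get(struct model_css_set *cg)
{
	for (int i = 0; i < NSUBSYS; i++)
		atomic_fetch_add(&cg->subsys[i]->count, 1);
}

/* drop one reference per subsystem slot, as put_css_set() does */
static void model_put(struct model_css_set *cg)
{
	for (int i = 0; i < NSUBSYS; i++)
		atomic_fetch_sub(&cg->subsys[i]->count, 1);
}

int main(void)
{
	struct model_cgroup root = { 0 };
	/* all three subsystem slots point at the same cgroup */
	struct model_css_set set = { { &root, &root, &root } };

	model_get(&set);
	printf("count after get: %d\n", atomic_load(&root.count)); /* 3 */
	model_put(&set);
	printf("count after put: %d\n", atomic_load(&root.count)); /* 0 */
	return 0;
}

With all three slots pointing at one cgroup, a single get raises the count by
three and a single put returns it to zero. Only the zero/non-zero distinction
matters, which is exactly the busy/not-busy indicator the comment describes.
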
diff --git a/kernel/exit.c b/kernel/exit.c
index 2c704c86edb3..44ff6147556a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -32,6 +32,7 @@
 #include <linux/delayacct.h>
 #include <linux/freezer.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 #include <linux/posix-timers.h>
@@ -973,6 +974,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	check_stack_usage();
 	exit_thread();
 	cpuset_exit(tsk);
+	cgroup_exit(tsk, 1);
 	exit_keys(tsk);
 
 	if (group_dead && tsk->signal->leader)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2ce28f165e31..e7c181454dca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -30,6 +30,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
@@ -979,6 +980,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p = NULL;
+	int cgroup_callbacks_done = 0;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1088,12 +1090,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cpuset_fork(p);
+	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_copy(p->mempolicy);
 	if (IS_ERR(p->mempolicy)) {
 		retval = PTR_ERR(p->mempolicy);
 		p->mempolicy = NULL;
-		goto bad_fork_cleanup_cpuset;
+		goto bad_fork_cleanup_cgroup;
 	}
 	mpol_fix_fork_child_flag(p);
 #endif
@@ -1204,6 +1207,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
+	/* Now that the task is set up, run cgroup callbacks if
+	 * necessary. We need to run them before the task is visible
+	 * on the tasklist. */
+	cgroup_fork_callbacks(p);
+	cgroup_callbacks_done = 1;
+
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
@@ -1318,9 +1327,10 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_cgroup:
 #endif
 	cpuset_exit(p);
+	cgroup_exit(p, cgroup_callbacks_done);
 bad_fork_cleanup_delays_binfmt:
 	delayacct_tsk_free(p);
 	if (p->binfmt)
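
For readers wiring a subsystem into these hooks: cgroup_fork_callbacks() and
cgroup_exit() above invoke ss->fork(ss, child) and ss->exit(ss, tsk) on each
registered subsystem. A hypothetical subsystem would wire its callbacks
roughly like this; the name and bodies are illustrative, and only the
->fork/->exit signatures are taken from the call sites in this patch:

/* Hypothetical example subsystem -- not part of this patch. */
static void example_fork(struct cgroup_subsys *ss,
			 struct task_struct *child)
{
	/* set up per-task state; runs before the child is on the tasklist */
}

static void example_exit(struct cgroup_subsys *ss,
			 struct task_struct *tsk)
{
	/* tear down per-task state; runs from do_exit() via cgroup_exit() */
}

struct cgroup_subsys example_subsys = {
	.name = "example",
	.fork = example_fork,
	.exit = example_exit,
};
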