aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c121
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c14
3 files changed, 135 insertions, 2 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f4c4dce9558f..7bb520aaf0a3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
132#define for_each_root(_root) \ 132#define for_each_root(_root) \
133list_for_each_entry(_root, &roots, root_list) 133list_for_each_entry(_root, &roots, root_list)
134 134
135/* Each task_struct has an embedded css_set, so the get/put
136 * operations simply take or drop a reference count on every cgroup
137 * referenced by subsystems in this css_set. This can end up
138 * multiple-counting some cgroups, but that's OK - the ref-count is
139 * just a busy/not-busy indicator; ensuring that we only count each
140 * cgroup once would require taking a global lock to ensure that no
141 * subsystems moved between hierarchies while we were doing so.
142 *
143 * Possible TODO: decide at boot time based on the number of
144 * registered subsystems and the number of CPUs or NUMA nodes whether
145 * it's better for performance to ref-count every subsystem, or to
146 * take a global lock and only add one ref count to each hierarchy.
147 */
148static void get_css_set(struct css_set *cg)
149{
150 int i;
151 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
152 atomic_inc(&cg->subsys[i]->cgroup->count); /* one count per subsystem slot, hence the multiple-counting */
153}
154
/* Counterpart of get_css_set(): for every subsystem slot in @cg, drop
 * one count on the cgroup that cg->subsys[i] points at. A cgroup that
 * is referenced through several slots is decremented once per slot.
 */
155static void put_css_set(struct css_set *cg)
156{
157 int i;
158 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
159 atomic_dec(&cg->subsys[i]->cgroup->count);
160}
161
135/* 162/*
136 * There is one global cgroup mutex. We also require taking 163 * There is one global cgroup mutex. We also require taking
137 * task_lock() when dereferencing a task's cgroup subsys pointers. 164 * task_lock() when dereferencing a task's cgroup subsys pointers.
@@ -1587,3 +1614,97 @@ int __init cgroup_init(void)
1587out: 1614out:
1588 return err; 1615 return err;
1589} 1616}
1617
1618/**
1619 * cgroup_fork - attach newly forked task to its parent's cgroup.
1620 * @child: pointer to task_struct of the newly forked child task.
1621 *
1622 * Description: A task inherits its parent's cgroup at fork().
1623 *
1624 * A pointer to the shared css_set was automatically copied in
1625 * fork.c by dup_task_struct(). However, we ignore that copy, since
1626 * it was not made under the protection of RCU or cgroup_mutex, so
1627 * might no longer be a valid cgroup pointer. attach_task() might
1628 * have already changed current->cgroup, allowing the previously
1629 * referenced cgroup to be removed and freed.
1630 *
1631 * At the point that cgroup_fork() is called, 'current' is the parent
1632 * task, and the passed argument 'child' points to the child task.
1633 */
1634void cgroup_fork(struct task_struct *child)
1635{
1636 rcu_read_lock();
1637 child->cgroups = rcu_dereference(current->cgroups); /* re-read from the parent under RCU, replacing the earlier copy */
1638 get_css_set(&child->cgroups); /* take the child's own counts on those cgroups */
1639 rcu_read_unlock();
1640}
1641
1642/**
1643 * cgroup_fork_callbacks - called on a new task very soon before
1644 * adding it to the tasklist. No need to take any locks since no-one
1645 * can be operating on this task.
 * @child: the new task; it is passed to each subsystem's fork callback.
 *
 * Does nothing unless need_forkexit_callback is set. Otherwise walks
 * subsys[0..CGROUP_SUBSYS_COUNT-1] and calls ss->fork(ss, child) for
 * every subsystem that provides a fork callback.
1646 */
1647void cgroup_fork_callbacks(struct task_struct *child)
1648{
1649 if (need_forkexit_callback) {
1650 int i;
1651 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1652 struct cgroup_subsys *ss = subsys[i];
1653 if (ss->fork)
1654 ss->fork(ss, child);
1655 }
1656 }
1657}
1658
1659/**
1660 * cgroup_exit - detach cgroup from exiting task
1661 * @tsk: pointer to task_struct of exiting process
 * @run_callbacks: if nonzero (and need_forkexit_callback is set), call
 * the exit callback of every subsystem that provides one before
 * detaching. do_exit() passes 1; the copy_process() error path passes
 * its cgroup_callbacks_done flag, so the exit callbacks are skipped
 * when cgroup_fork_callbacks() has not yet run for the task.
1662 *
1663 * Description: Detach cgroup from @tsk and release it.
1664 *
1665 * Note that cgroups marked notify_on_release force every task in
1666 * them to take the global cgroup_mutex mutex when exiting.
1667 * This could impact scaling on very large systems. Be reluctant to
1668 * use notify_on_release cgroups where very high task exit scaling
1669 * is required on large systems.
1670 *
1671 * the_top_cgroup_hack:
1672 *
1673 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
1674 *
1675 * We call cgroup_exit() while the task is still competent to
1676 * handle notify_on_release(), then leave the task attached to the
1677 * root cgroup in each hierarchy for the remainder of its exit.
1678 *
1679 * To do this properly, we would increment the reference count on
1680 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
1681 * code we would add a second cgroup function call, to drop that
1682 * reference. This would just create an unnecessary hot spot on
1683 * the top_cgroup reference count, to no avail.
1684 *
1685 * Normally, holding a reference to a cgroup without bumping its
1686 * count is unsafe. The cgroup could go away, or someone could
1687 * attach us to a different cgroup, decrementing the count on
1688 * the first cgroup that we never incremented. But in this case,
1689 * top_cgroup isn't going away, and either task has PF_EXITING set,
1690 * which wards off any attach_task() attempts, or task is a failed
1691 * fork, never visible to attach_task.
1692 *
1693 */
1694void cgroup_exit(struct task_struct *tsk, int run_callbacks)
1695{
1696 int i;
1697
1698 if (run_callbacks && need_forkexit_callback) {
1699 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1700 struct cgroup_subsys *ss = subsys[i];
1701 if (ss->exit)
1702 ss->exit(ss, tsk);
1703 }
1704 }
1705 /* Reassign the task to the init_css_set. */
1706 task_lock(tsk);
1707 put_css_set(&tsk->cgroups); /* drop the counts held through the task's current css_set */
1708 tsk->cgroups = init_task.cgroups; /* no get_css_set() here: see the_top_cgroup_hack above */
1709 task_unlock(tsk);
1710}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2c704c86edb3..44ff6147556a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -32,6 +32,7 @@
32#include <linux/delayacct.h> 32#include <linux/delayacct.h>
33#include <linux/freezer.h> 33#include <linux/freezer.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/cgroup.h>
35#include <linux/syscalls.h> 36#include <linux/syscalls.h>
36#include <linux/signal.h> 37#include <linux/signal.h>
37#include <linux/posix-timers.h> 38#include <linux/posix-timers.h>
@@ -973,6 +974,7 @@ fastcall NORET_TYPE void do_exit(long code)
973 check_stack_usage(); 974 check_stack_usage();
974 exit_thread(); 975 exit_thread();
975 cpuset_exit(tsk); 976 cpuset_exit(tsk);
977 cgroup_exit(tsk, 1);
976 exit_keys(tsk); 978 exit_keys(tsk);
977 979
978 if (group_dead && tsk->signal->leader) 980 if (group_dead && tsk->signal->leader)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2ce28f165e31..e7c181454dca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -30,6 +30,7 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/cpuset.h> 32#include <linux/cpuset.h>
33#include <linux/cgroup.h>
33#include <linux/security.h> 34#include <linux/security.h>
34#include <linux/swap.h> 35#include <linux/swap.h>
35#include <linux/syscalls.h> 36#include <linux/syscalls.h>
@@ -979,6 +980,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
979{ 980{
980 int retval; 981 int retval;
981 struct task_struct *p = NULL; 982 struct task_struct *p = NULL;
983 int cgroup_callbacks_done = 0;
982 984
983 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 985 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
984 return ERR_PTR(-EINVAL); 986 return ERR_PTR(-EINVAL);
@@ -1088,12 +1090,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1088 p->io_context = NULL; 1090 p->io_context = NULL;
1089 p->audit_context = NULL; 1091 p->audit_context = NULL;
1090 cpuset_fork(p); 1092 cpuset_fork(p);
1093 cgroup_fork(p);
1091#ifdef CONFIG_NUMA 1094#ifdef CONFIG_NUMA
1092 p->mempolicy = mpol_copy(p->mempolicy); 1095 p->mempolicy = mpol_copy(p->mempolicy);
1093 if (IS_ERR(p->mempolicy)) { 1096 if (IS_ERR(p->mempolicy)) {
1094 retval = PTR_ERR(p->mempolicy); 1097 retval = PTR_ERR(p->mempolicy);
1095 p->mempolicy = NULL; 1098 p->mempolicy = NULL;
1096 goto bad_fork_cleanup_cpuset; 1099 goto bad_fork_cleanup_cgroup;
1097 } 1100 }
1098 mpol_fix_fork_child_flag(p); 1101 mpol_fix_fork_child_flag(p);
1099#endif 1102#endif
@@ -1204,6 +1207,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1204 /* Perform scheduler related setup. Assign this task to a CPU. */ 1207 /* Perform scheduler related setup. Assign this task to a CPU. */
1205 sched_fork(p, clone_flags); 1208 sched_fork(p, clone_flags);
1206 1209
1210 /* Now that the task is set up, run cgroup callbacks if
1211 * necessary. We need to run them before the task is visible
1212 * on the tasklist. */
1213 cgroup_fork_callbacks(p);
1214 cgroup_callbacks_done = 1;
1215
1207 /* Need tasklist lock for parent etc handling! */ 1216 /* Need tasklist lock for parent etc handling! */
1208 write_lock_irq(&tasklist_lock); 1217 write_lock_irq(&tasklist_lock);
1209 1218
@@ -1318,9 +1327,10 @@ bad_fork_cleanup_security:
1318bad_fork_cleanup_policy: 1327bad_fork_cleanup_policy:
1319#ifdef CONFIG_NUMA 1328#ifdef CONFIG_NUMA
1320 mpol_free(p->mempolicy); 1329 mpol_free(p->mempolicy);
1321bad_fork_cleanup_cpuset: 1330bad_fork_cleanup_cgroup:
1322#endif 1331#endif
1323 cpuset_exit(p); 1332 cpuset_exit(p);
1333 cgroup_exit(p, cgroup_callbacks_done);
1324bad_fork_cleanup_delays_binfmt: 1334bad_fork_cleanup_delays_binfmt:
1325 delayacct_tsk_free(p); 1335 delayacct_tsk_free(p);
1326 if (p->binfmt) 1336 if (p->binfmt)