 include/linux/cgroup.h |   6 ++
 kernel/cgroup.c        | 121 ++++++++++++++++++
 kernel/exit.c          |   2 ++
 kernel/fork.c          |  14 ++++--
 4 files changed, 141 insertions(+), 2 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e95143c884b2..792ad74be170 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -25,6 +25,9 @@ extern int cgroup_init(void);
 extern void cgroup_init_smp(void);
 extern void cgroup_lock(void);
 extern void cgroup_unlock(void);
+extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork_callbacks(struct task_struct *p);
+extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 
 /* Per-subsystem/per-cgroup state maintained by the system. */
 struct cgroup_subsys_state {
@@ -223,6 +226,9 @@ int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 static inline void cgroup_init_smp(void) {}
+static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
 static inline void cgroup_lock(void) {}
 static inline void cgroup_unlock(void) {}
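The empty static inline stubs in the !CONFIG_CGROUPS branch exist so that generic code such as kernel/fork.c and kernel/exit.c can call the three hooks unconditionally; the preprocessor, not the call site, decides whether they do anything. A minimal standalone sketch of the same pattern, with a hypothetical CONFIG_TRACEHOOK option and tracehook_fork() function standing in for the cgroup names:

#include <stdio.h>

struct task { int id; };

#ifdef CONFIG_TRACEHOOK                 /* hypothetical config option */
void tracehook_fork(struct task *p);    /* real version lives elsewhere */
#else
/* When the feature is compiled out, the stub inlines away to nothing,
 * so call sites need no #ifdef of their own. */
static inline void tracehook_fork(struct task *p) { (void)p; }
#endif

int main(void)
{
	struct task child = { .id = 2 };
	tracehook_fork(&child);         /* unconditional call site */
	printf("forked task %d\n", child.id);
	return 0;
}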
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f4c4dce9558f..7bb520aaf0a3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* Each task_struct has an embedded css_set, so the get/put
+ * operation simply takes a reference count on all the cgroups
+ * referenced by subsystems in this css_set. This can end up
+ * multiple-counting some cgroups, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * cgroup once would require taking a global lock to ensure that no
+ * subsystems moved between hierarchies while we were doing so.
+ *
+ * Possible TODO: decide at boot time based on the number of
+ * registered subsystems and the number of CPUs or NUMA nodes whether
+ * it's better for performance to ref-count every subsystem, or to
+ * take a global lock and only add one ref count to each hierarchy.
+ */
+static void get_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_inc(&cg->subsys[i]->cgroup->count);
+}
+
+static void put_css_set(struct css_set *cg)
+{
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
 /*
  * There is one global cgroup mutex. We also require taking
  * task_lock() when dereferencing a task's cgroup subsys pointers.
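The multiple-counting that the comment above describes is easiest to see concretely: when two subsystems are bound to the same hierarchy, their subsys[] entries point at the same struct cgroup, so the loop increments that cgroup's count once per subsystem. A standalone userspace model (the type definitions are simplified stand-ins, not the kernel's; plain int replaces atomic_t):

#include <stdio.h>

#define CGROUP_SUBSYS_COUNT 2          /* two subsystems for the demo */

struct cgroup { int count; };          /* atomic_t count in the kernel */
struct cgroup_subsys_state { struct cgroup *cgroup; };
struct css_set { struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; };

static void get_css_set(struct css_set *cg)
{
	int i;
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
		cg->subsys[i]->cgroup->count++;  /* atomic_inc() in the kernel */
}

int main(void)
{
	/* Two subsystems bound to the same hierarchy: both css pointers
	 * lead to one struct cgroup, so a single get_css_set() call
	 * bumps its count twice - harmless, because the count is only
	 * a busy/not-busy indicator. */
	struct cgroup shared = { 0 };
	struct cgroup_subsys_state a = { &shared }, b = { &shared };
	struct css_set cg = { { &a, &b } };

	get_css_set(&cg);
	printf("shared cgroup count = %d\n", shared.count);  /* prints 2 */
	return 0;
}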
@@ -1587,3 +1614,97 @@ int __init cgroup_init(void)
 out:
 	return err;
 }
+
+/**
+ * cgroup_fork - attach a newly forked task to its parent's cgroup.
+ * @child: pointer to task_struct of the forking parent's new child.
+ *
+ * Description: A task inherits its parent's cgroup at fork().
+ *
+ * A pointer to the shared css_set was automatically copied in
+ * fork.c by dup_task_struct(). However, we ignore that copy, since
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * might no longer be a valid cgroup pointer. attach_task() might
+ * have already changed current->cgroup, allowing the previously
+ * referenced cgroup to be removed and freed.
+ *
+ * At the point that cgroup_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+	rcu_read_lock();
+	child->cgroups = rcu_dereference(current->cgroups);
+	get_css_set(&child->cgroups);
+	rcu_read_unlock();
+}
+
+/**
+ * cgroup_fork_callbacks - called on a new task very soon before
+ * adding it to the tasklist. No need to take any locks since no one
+ * can be operating on this task.
+ */
+void cgroup_fork_callbacks(struct task_struct *child)
+{
+	if (need_forkexit_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->fork)
+				ss->fork(ss, child);
+		}
+	}
+}
+
+/**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cgroup from @tsk and release it.
+ *
+ * Note that cgroups marked notify_on_release force every task in
+ * them to take the global cgroup_mutex when exiting.
+ * This could impact scaling on very large systems. Be reluctant to
+ * use notify_on_release cgroups where very high task exit scaling
+ * is required on large systems.
+ *
+ * the_top_cgroup_hack:
+ *
+ * Set the exiting task's cgroup to the root cgroup (top_cgroup).
+ *
+ * We call cgroup_exit() while the task is still competent to
+ * handle notify_on_release(), then leave the task attached to the
+ * root cgroup in each hierarchy for the remainder of its exit.
+ *
+ * To do this properly, we would increment the reference count on
+ * top_cgroup, and near the very end of the kernel/exit.c do_exit()
+ * code we would add a second cgroup function call, to drop that
+ * reference. This would just create an unnecessary hot spot on
+ * the top_cgroup reference count, to no avail.
+ *
+ * Normally, holding a reference to a cgroup without bumping its
+ * count is unsafe. The cgroup could go away, or someone could
+ * attach us to a different cgroup, decrementing the count on
+ * the first cgroup that we never incremented. But in this case,
+ * top_cgroup isn't going away, and either the task has PF_EXITING
+ * set, which wards off any attach_task() attempts, or the task is
+ * a failed fork, never visible to attach_task().
+ */
+void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+{
+	int i;
+
+	if (run_callbacks && need_forkexit_callback) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit)
+				ss->exit(ss, tsk);
+		}
+	}
+	/* Reassign the task to the init_css_set. */
+	task_lock(tsk);
+	put_css_set(&tsk->cgroups);
+	tsk->cgroups = init_task.cgroups;
+	task_unlock(tsk);
+}
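Both new functions dispatch the same way: walk the fixed subsys[] array and invoke whichever per-subsystem callback is non-NULL. A standalone C model of that shape (the two example subsystems and all names here are invented; only the loop structure mirrors cgroup_fork_callbacks() and cgroup_exit()):

#include <stdio.h>

struct task { int pid; };
struct subsys {
	const char *name;
	void (*fork)(struct subsys *ss, struct task *t);
	void (*exit)(struct subsys *ss, struct task *t);
};

static void count_fork(struct subsys *ss, struct task *t)
{
	printf("%s: task %d forked\n", ss->name, t->pid);
}

static void count_exit(struct subsys *ss, struct task *t)
{
	printf("%s: task %d exiting\n", ss->name, t->pid);
}

static struct subsys count_ss = { "count", count_fork, count_exit };
static struct subsys quiet_ss = { "quiet", NULL, NULL };  /* no callbacks */

#define SUBSYS_COUNT 2
static struct subsys *subsys[SUBSYS_COUNT] = { &count_ss, &quiet_ss };

int main(void)
{
	struct task t = { .pid = 42 };
	int i;

	for (i = 0; i < SUBSYS_COUNT; i++)  /* cgroup_fork_callbacks() shape */
		if (subsys[i]->fork)
			subsys[i]->fork(subsys[i], &t);
	for (i = 0; i < SUBSYS_COUNT; i++)  /* cgroup_exit() callback shape */
		if (subsys[i]->exit)
			subsys[i]->exit(subsys[i], &t);
	return 0;
}

A NULL callback is simply skipped, so subsystems that do not care about fork or exit pay only the cost of the need_forkexit_callback check.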
diff --git a/kernel/exit.c b/kernel/exit.c
index 2c704c86edb3..44ff6147556a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -32,6 +32,7 @@
 #include <linux/delayacct.h>
 #include <linux/freezer.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 #include <linux/posix-timers.h>
@@ -973,6 +974,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	check_stack_usage();
 	exit_thread();
 	cpuset_exit(tsk);
+	cgroup_exit(tsk, 1);
 	exit_keys(tsk);
 
 	if (group_dead && tsk->signal->leader)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2ce28f165e31..e7c181454dca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -30,6 +30,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
@@ -979,6 +980,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p = NULL;
+	int cgroup_callbacks_done = 0;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1088,12 +1090,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cpuset_fork(p);
+	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_copy(p->mempolicy);
 	if (IS_ERR(p->mempolicy)) {
 		retval = PTR_ERR(p->mempolicy);
 		p->mempolicy = NULL;
-		goto bad_fork_cleanup_cpuset;
+		goto bad_fork_cleanup_cgroup;
 	}
 	mpol_fix_fork_child_flag(p);
 #endif
@@ -1204,6 +1207,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
+	/* Now that the task is set up, run cgroup callbacks if
+	 * necessary. We need to run them before the task is visible
+	 * on the tasklist. */
+	cgroup_fork_callbacks(p);
+	cgroup_callbacks_done = 1;
+
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
@@ -1318,9 +1327,10 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_cgroup:
 #endif
 	cpuset_exit(p);
+	cgroup_exit(p, cgroup_callbacks_done);
 bad_fork_cleanup_delays_binfmt:
 	delayacct_tsk_free(p);
 	if (p->binfmt)
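One detail in the error path is easy to miss: any cleanup that falls through bad_fork_cleanup_cgroup calls cgroup_exit() to drop the css_set references taken by cgroup_fork(), but the cgroup_callbacks_done flag ensures the subsystem ->exit callbacks fire only if cgroup_fork_callbacks() already ran. A minimal standalone sketch of that pairing discipline (all names invented):

#include <stdio.h>

static void hook_fork(void) { puts("  fork callbacks run"); }
static void hook_exit(void) { puts("  exit callbacks run"); }

/* Mirrors cgroup_exit(tsk, run_callbacks): the reference-dropping
 * part is unconditional, the callbacks run only when the matching
 * fork callbacks really happened. */
static void cleanup(int callbacks_done)
{
	if (callbacks_done)
		hook_exit();
	puts("  refs dropped");        /* put_css_set() analogue */
}

static int copy_process(int fail_early)
{
	int callbacks_done = 0;        /* cgroup_callbacks_done analogue */

	if (fail_early)                /* failure before the fork hooks */
		goto bad_fork;

	hook_fork();                   /* cgroup_fork_callbacks() analogue */
	callbacks_done = 1;

	/* a later failure falls through to the same cleanup */
bad_fork:
	cleanup(callbacks_done);
	return -1;
}

int main(void)
{
	puts("early failure:");
	copy_process(1);
	puts("late failure:");
	copy_process(0);
	return 0;
}

Without the flag, an allocation failure before cgroup_fork_callbacks() would run subsystem exit callbacks for a task whose fork callbacks never fired, leaving any per-subsystem accounting unbalanced.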