diff options
-rw-r--r-- | include/linux/cgroup.h | 1 | ||||
-rw-r--r-- | kernel/cgroup.c | 22 | ||||
-rw-r--r-- | kernel/cpuset.c | 167 |
3 files changed, 145 insertions, 45 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8675c691d3e2..ff9055fc3d2a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -318,6 +318,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cont, | |||
318 | struct cgroup_iter *it); | 318 | struct cgroup_iter *it); |
319 | void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it); | 319 | void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it); |
320 | int cgroup_scan_tasks(struct cgroup_scanner *scan); | 320 | int cgroup_scan_tasks(struct cgroup_scanner *scan); |
321 | int cgroup_attach_task(struct cgroup *, struct task_struct *); | ||
321 | 322 | ||
322 | #else /* !CONFIG_CGROUPS */ | 323 | #else /* !CONFIG_CGROUPS */ |
323 | 324 | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bcc7a6e8e3c0..2c5cccbe12e2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -489,7 +489,7 @@ static struct css_set *find_css_set( | |||
489 | * Any task can increment and decrement the count field without lock. | 489 | * Any task can increment and decrement the count field without lock. |
490 | * So in general, code holding cgroup_mutex can't rely on the count | 490 | * So in general, code holding cgroup_mutex can't rely on the count |
491 | * field not changing. However, if the count goes to zero, then only | 491 | * field not changing. However, if the count goes to zero, then only |
492 | * attach_task() can increment it again. Because a count of zero | 492 | * cgroup_attach_task() can increment it again. Because a count of zero |
493 | * means that no tasks are currently attached, therefore there is no | 493 | * means that no tasks are currently attached, therefore there is no |
494 | * way a task attached to that cgroup can fork (the other way to | 494 | * way a task attached to that cgroup can fork (the other way to |
495 | * increment the count). So code holding cgroup_mutex can safely | 495 | * increment the count). So code holding cgroup_mutex can safely |
@@ -520,17 +520,17 @@ static struct css_set *find_css_set( | |||
520 | * The task_lock() exception | 520 | * The task_lock() exception |
521 | * | 521 | * |
522 | * The need for this exception arises from the action of | 522 | * The need for this exception arises from the action of |
523 | * attach_task(), which overwrites one tasks cgroup pointer with | 523 | * cgroup_attach_task(), which overwrites one tasks cgroup pointer with |
524 | * another. It does so using cgroup_mutexe, however there are | 524 | * another. It does so using cgroup_mutexe, however there are |
525 | * several performance critical places that need to reference | 525 | * several performance critical places that need to reference |
526 | * task->cgroup without the expense of grabbing a system global | 526 | * task->cgroup without the expense of grabbing a system global |
527 | * mutex. Therefore except as noted below, when dereferencing or, as | 527 | * mutex. Therefore except as noted below, when dereferencing or, as |
528 | * in attach_task(), modifying a task'ss cgroup pointer we use | 528 | * in cgroup_attach_task(), modifying a task's cgroup pointer we use |
529 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 529 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
530 | * the task_struct routinely used for such matters. | 530 | * the task_struct routinely used for such matters. |
531 | * | 531 | * |
532 | * P.S. One more locking exception. RCU is used to guard the | 532 | * P.S. One more locking exception. RCU is used to guard the |
533 | * update of a tasks cgroup pointer by attach_task() | 533 | * update of a tasks cgroup pointer by cgroup_attach_task() |
534 | */ | 534 | */ |
535 | 535 | ||
536 | /** | 536 | /** |
@@ -1194,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp, | |||
1194 | * Call holding cgroup_mutex. May take task_lock of | 1194 | * Call holding cgroup_mutex. May take task_lock of |
1195 | * the task 'pid' during call. | 1195 | * the task 'pid' during call. |
1196 | */ | 1196 | */ |
1197 | static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1197 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
1198 | { | 1198 | { |
1199 | int retval = 0; | 1199 | int retval = 0; |
1200 | struct cgroup_subsys *ss; | 1200 | struct cgroup_subsys *ss; |
@@ -1287,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) | |||
1287 | get_task_struct(tsk); | 1287 | get_task_struct(tsk); |
1288 | } | 1288 | } |
1289 | 1289 | ||
1290 | ret = attach_task(cgrp, tsk); | 1290 | ret = cgroup_attach_task(cgrp, tsk); |
1291 | put_task_struct(tsk); | 1291 | put_task_struct(tsk); |
1292 | return ret; | 1292 | return ret; |
1293 | } | 1293 | } |
@@ -2514,7 +2514,7 @@ out: | |||
2514 | * - Used for /proc/<pid>/cgroup. | 2514 | * - Used for /proc/<pid>/cgroup. |
2515 | * - No need to task_lock(tsk) on this tsk->cgroup reference, as it | 2515 | * - No need to task_lock(tsk) on this tsk->cgroup reference, as it |
2516 | * doesn't really matter if tsk->cgroup changes after we read it, | 2516 | * doesn't really matter if tsk->cgroup changes after we read it, |
2517 | * and we take cgroup_mutex, keeping attach_task() from changing it | 2517 | * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it |
2518 | * anyway. No need to check that tsk->cgroup != NULL, thanks to | 2518 | * anyway. No need to check that tsk->cgroup != NULL, thanks to |
2519 | * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks | 2519 | * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks |
2520 | * cgroup to top_cgroup. | 2520 | * cgroup to top_cgroup. |
@@ -2625,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = { | |||
2625 | * A pointer to the shared css_set was automatically copied in | 2625 | * A pointer to the shared css_set was automatically copied in |
2626 | * fork.c by dup_task_struct(). However, we ignore that copy, since | 2626 | * fork.c by dup_task_struct(). However, we ignore that copy, since |
2627 | * it was not made under the protection of RCU or cgroup_mutex, so | 2627 | * it was not made under the protection of RCU or cgroup_mutex, so |
2628 | * might no longer be a valid cgroup pointer. attach_task() might | 2628 | * might no longer be a valid cgroup pointer. cgroup_attach_task() might |
2629 | * have already changed current->cgroups, allowing the previously | 2629 | * have already changed current->cgroups, allowing the previously |
2630 | * referenced cgroup group to be removed and freed. | 2630 | * referenced cgroup group to be removed and freed. |
2631 | * | 2631 | * |
@@ -2704,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
2704 | * attach us to a different cgroup, decrementing the count on | 2704 | * attach us to a different cgroup, decrementing the count on |
2705 | * the first cgroup that we never incremented. But in this case, | 2705 | * the first cgroup that we never incremented. But in this case, |
2706 | * top_cgroup isn't going away, and either task has PF_EXITING set, | 2706 | * top_cgroup isn't going away, and either task has PF_EXITING set, |
2707 | * which wards off any attach_task() attempts, or task is a failed | 2707 | * which wards off any cgroup_attach_task() attempts, or task is a failed |
2708 | * fork, never visible to attach_task. | 2708 | * fork, never visible to cgroup_attach_task. |
2709 | * | 2709 | * |
2710 | */ | 2710 | */ |
2711 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 2711 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
@@ -2845,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) | |||
2845 | } | 2845 | } |
2846 | 2846 | ||
2847 | /* All seems fine. Finish by moving the task into the new cgroup */ | 2847 | /* All seems fine. Finish by moving the task into the new cgroup */ |
2848 | ret = attach_task(child, tsk); | 2848 | ret = cgroup_attach_task(child, tsk); |
2849 | mutex_unlock(&cgroup_mutex); | 2849 | mutex_unlock(&cgroup_mutex); |
2850 | 2850 | ||
2851 | out_release: | 2851 | out_release: |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cfaf6419d817..d94a8f7c4c29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -56,6 +56,8 @@ | |||
56 | #include <asm/atomic.h> | 56 | #include <asm/atomic.h> |
57 | #include <linux/mutex.h> | 57 | #include <linux/mutex.h> |
58 | #include <linux/kfifo.h> | 58 | #include <linux/kfifo.h> |
59 | #include <linux/workqueue.h> | ||
60 | #include <linux/cgroup.h> | ||
59 | 61 | ||
60 | /* | 62 | /* |
61 | * Tracks how many cpusets are currently defined in system. | 63 | * Tracks how many cpusets are currently defined in system. |
@@ -96,6 +98,9 @@ struct cpuset { | |||
96 | 98 | ||
97 | /* partition number for rebuild_sched_domains() */ | 99 | /* partition number for rebuild_sched_domains() */ |
98 | int pn; | 100 | int pn; |
101 | |||
102 | /* used for walking a cpuset hierarchy */ | ||
103 | struct list_head stack_list; | ||
99 | }; | 104 | }; |
100 | 105 | ||
101 | /* Retrieve the cpuset for a cgroup */ | 106 | /* Retrieve the cpuset for a cgroup */ |
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
111 | return container_of(task_subsys_state(task, cpuset_subsys_id), | 116 | return container_of(task_subsys_state(task, cpuset_subsys_id), |
112 | struct cpuset, css); | 117 | struct cpuset, css); |
113 | } | 118 | } |
114 | 119 | struct cpuset_hotplug_scanner { | |
120 | struct cgroup_scanner scan; | ||
121 | struct cgroup *to; | ||
122 | }; | ||
115 | 123 | ||
116 | /* bits in struct cpuset flags field */ | 124 | /* bits in struct cpuset flags field */ |
117 | typedef enum { | 125 | typedef enum { |
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void) | |||
1687 | return 0; | 1695 | return 0; |
1688 | } | 1696 | } |
1689 | 1697 | ||
1698 | /** | ||
1699 | * cpuset_do_move_task - move a given task to another cpuset | ||
1700 | * @tsk: pointer to the task_struct of the task to move | ||
1701 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | ||
1702 | * | ||
1703 | * Called by cgroup_scan_tasks() for each task in a cgroup. | ||
1704 | * Returns nothing; the result of cgroup_attach_task() is ignored. | ||
1705 | */ | ||
1706 | void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) | ||
1707 | { | ||
1708 | struct cpuset_hotplug_scanner *chsp; | ||
1709 | |||
1710 | chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); | ||
1711 | cgroup_attach_task(chsp->to, tsk); | ||
1712 | } | ||
1713 | |||
1714 | /** | ||
1715 | * move_member_tasks_to_cpuset - move tasks from one cpuset to another | ||
1716 | * @from: cpuset in which the tasks currently reside | ||
1717 | * @to: cpuset to which the tasks will be moved | ||
1718 | * | ||
1719 | * Called with cgroup_mutex held. | ||
1720 | * callback_mutex must not be held, as cgroup_attach_task() will take it. | ||
1721 | * | ||
1722 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | ||
1723 | * calling callback functions for each. | ||
1724 | */ | ||
1725 | static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | ||
1726 | { | ||
1727 | struct cpuset_hotplug_scanner scan; | ||
1728 | |||
1729 | scan.scan.cg = from->css.cgroup; | ||
1730 | scan.scan.test_task = NULL; /* select all tasks in cgroup */ | ||
1731 | scan.scan.process_task = cpuset_do_move_task; | ||
1732 | scan.scan.heap = NULL; | ||
1733 | scan.to = to->css.cgroup; | ||
1734 | |||
1735 | if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) | ||
1736 | printk(KERN_ERR "move_member_tasks_to_cpuset: " | ||
1737 | "cgroup_scan_tasks failed\n"); | ||
1738 | } | ||
1739 | |||
1690 | /* | 1740 | /* |
1691 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1741 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs |
1692 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1742 | * or memory nodes, we need to walk over the cpuset hierarchy, |
1693 | * removing that CPU or node from all cpusets. If this removes the | 1743 | * removing that CPU or node from all cpusets. If this removes the |
1694 | * last CPU or node from a cpuset, then the guarantee_online_cpus() | 1744 | * last CPU or node from a cpuset, then move the tasks in the empty |
1695 | * or guarantee_online_mems() code will use that emptied cpusets | 1745 | * cpuset to its next-highest non-empty parent. |
1696 | * parent online CPUs or nodes. Cpusets that were already empty of | ||
1697 | * CPUs or nodes are left empty. | ||
1698 | * | ||
1699 | * This routine is intentionally inefficient in a couple of regards. | ||
1700 | * It will check all cpusets in a subtree even if the top cpuset of | ||
1701 | * the subtree has no offline CPUs or nodes. It checks both CPUs and | ||
1702 | * nodes, even though the caller could have been coded to know that | ||
1703 | * only one of CPUs or nodes needed to be checked on a given call. | ||
1704 | * This was done to minimize text size rather than cpu cycles. | ||
1705 | * | 1746 | * |
1706 | * Call with both manage_mutex and callback_mutex held. | 1747 | * The parent cpuset has some superset of the 'mems' nodes that the |
1748 | * newly empty cpuset held, so no migration of memory is necessary. | ||
1707 | * | 1749 | * |
1708 | * Recursive, on depth of cpuset subtree. | 1750 | * Called with cgroup_mutex held; callback_mutex must not be held (the caller drops it first) |
1709 | */ | 1751 | */ |
1752 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | ||
1753 | { | ||
1754 | struct cpuset *parent; | ||
1755 | |||
1756 | /* the cgroup's css_sets list is in use if there are tasks | ||
1757 | in the cpuset; the list is empty if there are none; | ||
1758 | the cs->css.refcnt seems always 0 */ | ||
1759 | if (list_empty(&cs->css.cgroup->css_sets)) | ||
1760 | return; | ||
1710 | 1761 | ||
1711 | static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) | 1762 | /* |
1763 | * Find its next-highest non-empty parent, (top cpuset | ||
1764 | * has online cpus, so can't be empty). | ||
1765 | */ | ||
1766 | parent = cs->parent; | ||
1767 | while (cpus_empty(parent->cpus_allowed)) { | ||
1768 | /* | ||
1769 | * this empty cpuset should now be considered to | ||
1770 | * have been used, and therefore eligible for | ||
1771 | * release when empty (if it is notify_on_release) | ||
1772 | */ | ||
1773 | parent = parent->parent; | ||
1774 | } | ||
1775 | |||
1776 | move_member_tasks_to_cpuset(cs, parent); | ||
1777 | } | ||
1778 | |||
1779 | /* | ||
1780 | * Walk the specified cpuset subtree and look for empty cpusets. | ||
1781 | * The tasks of such cpuset must be moved to a parent cpuset. | ||
1782 | * | ||
1783 | * Note that such a notify_on_release cpuset must have had, at some time, | ||
1784 | * member tasks or cpuset descendants and cpus and memory, before it can | ||
1785 | * be a candidate for release. | ||
1786 | * | ||
1787 | * Called with cgroup_mutex held. We take callback_mutex to modify | ||
1788 | * cpus_allowed and mems_allowed. | ||
1789 | * | ||
1790 | * This walk processes the tree from top to bottom, completing one layer | ||
1791 | * before dropping down to the next. It always processes a node before | ||
1792 | * any of its children. | ||
1793 | * | ||
1794 | * For now, since we lack memory hot unplug, we'll never see a cpuset | ||
1795 | * that has tasks along with an empty 'mems'. But if we did see such | ||
1796 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. | ||
1797 | */ | ||
1798 | static void scan_for_empty_cpusets(const struct cpuset *root) | ||
1712 | { | 1799 | { |
1800 | struct cpuset *cp; /* scans cpusets being updated */ | ||
1801 | struct cpuset *child; /* scans child cpusets of cp */ | ||
1802 | struct list_head queue; | ||
1713 | struct cgroup *cont; | 1803 | struct cgroup *cont; |
1714 | struct cpuset *c; | ||
1715 | 1804 | ||
1716 | /* Each of our child cpusets mems must be online */ | 1805 | INIT_LIST_HEAD(&queue); |
1717 | list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { | 1806 | |
1718 | c = cgroup_cs(cont); | 1807 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
1719 | guarantee_online_cpus_mems_in_subtree(c); | 1808 | |
1720 | if (!cpus_empty(c->cpus_allowed)) | 1809 | mutex_lock(&callback_mutex); |
1721 | guarantee_online_cpus(c, &c->cpus_allowed); | 1810 | while (!list_empty(&queue)) { |
1722 | if (!nodes_empty(c->mems_allowed)) | 1811 | cp = container_of(queue.next, struct cpuset, stack_list); |
1723 | guarantee_online_mems(c, &c->mems_allowed); | 1812 | list_del(queue.next); |
1813 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
1814 | child = cgroup_cs(cont); | ||
1815 | list_add_tail(&child->stack_list, &queue); | ||
1816 | } | ||
1817 | cont = cp->css.cgroup; | ||
1818 | /* Remove offline cpus and mems from this cpuset. */ | ||
1819 | cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); | ||
1820 | nodes_and(cp->mems_allowed, cp->mems_allowed, | ||
1821 | node_states[N_HIGH_MEMORY]); | ||
1822 | if ((cpus_empty(cp->cpus_allowed) || | ||
1823 | nodes_empty(cp->mems_allowed))) { | ||
1824 | /* Move tasks from the empty cpuset to a parent */ | ||
1825 | mutex_unlock(&callback_mutex); | ||
1826 | remove_tasks_in_empty_cpuset(cp); | ||
1827 | mutex_lock(&callback_mutex); | ||
1828 | } | ||
1724 | } | 1829 | } |
1830 | mutex_unlock(&callback_mutex); | ||
1831 | return; | ||
1725 | } | 1832 | } |
1726 | 1833 | ||
1727 | /* | 1834 | /* |
1728 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | 1835 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track |
1729 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | 1836 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to |
1730 | * track what's online after any CPU or memory node hotplug or unplug | 1837 | * track what's online after any CPU or memory node hotplug or unplug event. |
1731 | * event. | ||
1732 | * | ||
1733 | * To ensure that we don't remove a CPU or node from the top cpuset | ||
1734 | * that is currently in use by a child cpuset (which would violate | ||
1735 | * the rule that cpusets must be subsets of their parent), we first | ||
1736 | * call the recursive routine guarantee_online_cpus_mems_in_subtree(). | ||
1737 | * | 1838 | * |
1738 | * Since there are two callers of this routine, one for CPU hotplug | 1839 | * Since there are two callers of this routine, one for CPU hotplug |
1739 | * events and one for memory node hotplug events, we could have coded | 1840 | * events and one for memory node hotplug events, we could have coded |
@@ -1744,13 +1845,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) | |||
1744 | static void common_cpu_mem_hotplug_unplug(void) | 1845 | static void common_cpu_mem_hotplug_unplug(void) |
1745 | { | 1846 | { |
1746 | cgroup_lock(); | 1847 | cgroup_lock(); |
1747 | mutex_lock(&callback_mutex); | ||
1748 | 1848 | ||
1749 | guarantee_online_cpus_mems_in_subtree(&top_cpuset); | ||
1750 | top_cpuset.cpus_allowed = cpu_online_map; | 1849 | top_cpuset.cpus_allowed = cpu_online_map; |
1751 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 1850 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
1851 | scan_for_empty_cpusets(&top_cpuset); | ||
1752 | 1852 | ||
1753 | mutex_unlock(&callback_mutex); | ||
1754 | cgroup_unlock(); | 1853 | cgroup_unlock(); |
1755 | } | 1854 | } |
1756 | 1855 | ||