Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c | 167
1 file changed, 133 insertions(+), 34 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..d94a8f7c4c29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,8 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 #include <linux/kfifo.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
 
 /*
  * Tracks how many cpusets are currently defined in system.
@@ -96,6 +98,9 @@ struct cpuset {
 
 	/* partition number for rebuild_sched_domains() */
 	int pn;
+
+	/* used for walking a cpuset hierarchy */
+	struct list_head stack_list;
 };
 
 /* Retrieve the cpuset for a cgroup */
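The new stack_list member is what later lets the hotplug path queue cpusets without allocating memory: each cpuset carries its own embedded list linkage, and the walker simply chains those nodes together. Below is a rough userspace sketch of that intrusive-linkage idea; the names (struct node, struct cpuset_like) are made up for illustration, and the single-pointer node is a simplified stand-in for the kernel's struct list_head, not the real API.

#include <stddef.h>
#include <stdio.h>

/* Illustrative userspace sketch only -- not kernel code; names are hypothetical. */

/* Simplified, singly linked stand-in for the kernel's struct list_head. */
struct node {
	struct node *next;
};

/* Recover the enclosing structure from a pointer to its embedded member. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Toy object that, like struct cpuset, carries its own traversal linkage. */
struct cpuset_like {
	const char *name;
	struct node stack_list;
};

int main(void)
{
	struct cpuset_like a = { "a", { NULL } };
	struct cpuset_like b = { "b", { NULL } };
	struct node *head = NULL;

	/* Link the objects through their embedded nodes: no allocation needed. */
	a.stack_list.next = head;
	head = &a.stack_list;
	b.stack_list.next = head;
	head = &b.stack_list;

	/* Walk the list and recover each owner with container_of(). */
	for (struct node *n = head; n; n = n->next) {
		struct cpuset_like *cs = container_of(n, struct cpuset_like, stack_list);
		printf("visiting %s\n", cs->name);
	}
	return 0;
}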
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 	return container_of(task_subsys_state(task, cpuset_subsys_id),
 			    struct cpuset, css);
 }
-
+struct cpuset_hotplug_scanner {
+	struct cgroup_scanner scan;
+	struct cgroup *to;
+};
 
 /* bits in struct cpuset flags field */
 typedef enum {
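struct cpuset_hotplug_scanner wraps the generic struct cgroup_scanner and adds the one extra piece of state the move needs: the destination cgroup. The per-task callback only ever receives a struct cgroup_scanner *, so it climbs back to the wrapper with container_of(), as cpuset_do_move_task() does later in this patch. Here is a rough userspace sketch of that callback-plus-wrapper pattern; everything in it (struct scanner, visit_all(), move_item()) is invented for illustration and is not the cgroup API.

#include <stddef.h>
#include <stdio.h>

/* Illustrative userspace sketch only -- not kernel code; names are hypothetical. */

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Generic scanner: knows how to visit items and carries a per-item callback. */
struct scanner {
	void (*process)(int item, struct scanner *scan);
};

/* Caller-specific wrapper: adds the extra context the callback needs. */
struct move_scanner {
	struct scanner scan;	/* must be embedded, not a pointer */
	const char *destination;
};

static void visit_all(struct scanner *scan)
{
	for (int item = 0; item < 3; item++)	/* stand-in for iterating tasks */
		scan->process(item, scan);
}

static void move_item(int item, struct scanner *scan)
{
	/* Recover the wrapper from the embedded member, as cpuset_do_move_task() does. */
	struct move_scanner *ms = container_of(scan, struct move_scanner, scan);
	printf("moving item %d to %s\n", item, ms->destination);
}

int main(void)
{
	struct move_scanner ms = { { move_item }, "parent" };
	visit_all(&ms.scan);
	return 0;
}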
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void)
 	return 0;
 }
 
+/**
+ * cpuset_do_move_task - move a given task to another cpuset
+ * @tsk: pointer to the task_struct of the task to move
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ * Return nonzero to stop the walk through the tasks.
+ */
+void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+{
+	struct cpuset_hotplug_scanner *chsp;
+
+	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
+	cgroup_attach_task(chsp->to, tsk);
+}
+
+/**
+ * move_member_tasks_to_cpuset - move tasks from one cpuset to another
+ * @from: cpuset in which the tasks currently reside
+ * @to: cpuset to which the tasks will be moved
+ *
+ * Called with manage_mutex held.
+ * callback_mutex must not be held, as attach_task() will take it.
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+	struct cpuset_hotplug_scanner scan;
+
+	scan.scan.cg = from->css.cgroup;
+	scan.scan.test_task = NULL; /* select all tasks in cgroup */
+	scan.scan.process_task = cpuset_do_move_task;
+	scan.scan.heap = NULL;
+	scan.to = to->css.cgroup;
+
+	if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+		printk(KERN_ERR "move_member_tasks_to_cpuset: "
+				"cgroup_scan_tasks failed\n");
+}
+
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes. Cpusets that were already empty of
- * CPUs or nodes are left empty.
- *
- * This routine is intentionally inefficient in a couple of regards.
- * It will check all cpusets in a subtree even if the top cpuset of
- * the subtree has no offline CPUs or nodes. It checks both CPUs and
- * nodes, even though the caller could have been coded to know that
- * only one of CPUs or nodes needed to be checked on a given call.
- * This was done to minimize text size rather than cpu cycles.
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
  *
- * Call with both manage_mutex and callback_mutex held.
+ * The parent cpuset has some superset of the 'mems' nodes that the
+ * newly empty cpuset held, so no migration of memory is necessary.
  *
- * Recursive, on depth of cpuset subtree.
+ * Called with manage_mutex held; callback_mutex must not be held.
  */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+	struct cpuset *parent;
+
+	/* the cgroup's css_sets list is in use if there are tasks
+	   in the cpuset; the list is empty if there are none;
+	   the cs->css.refcnt seems always 0 */
+	if (list_empty(&cs->css.cgroup->css_sets))
+		return;
 
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+	/*
+	 * Find its next-highest non-empty parent, (top cpuset
+	 * has online cpus, so can't be empty).
+	 */
+	parent = cs->parent;
+	while (cpus_empty(parent->cpus_allowed)) {
+		/*
+		 * this empty cpuset should now be considered to
+		 * have been used, and therefore eligible for
+		 * release when empty (if it is notify_on_release)
+		 */
+		parent = parent->parent;
+	}
+
+	move_member_tasks_to_cpuset(cs, parent);
+}
+
+/*
+ * Walk the specified cpuset subtree and look for empty cpusets.
+ * The tasks of such cpuset must be moved to a parent cpuset.
+ *
+ * Note that such a notify_on_release cpuset must have had, at some time,
+ * member tasks or cpuset descendants and cpus and memory, before it can
+ * be a candidate for release.
+ *
+ * Called with manage_mutex held. We take callback_mutex to modify
+ * cpus_allowed and mems_allowed.
+ *
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next. It always processes a node before
+ * any of its children.
+ *
+ * For now, since we lack memory hot unplug, we'll never see a cpuset
+ * that has tasks along with an empty 'mems'. But if we did see such
+ * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ */
+static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+	struct cpuset *cp;	/* scans cpusets being updated */
+	struct cpuset *child;	/* scans child cpusets of cp */
+	struct list_head queue;
 	struct cgroup *cont;
-	struct cpuset *c;
 
-	/* Each of our child cpusets mems must be online */
-	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
-		c = cgroup_cs(cont);
-		guarantee_online_cpus_mems_in_subtree(c);
-		if (!cpus_empty(c->cpus_allowed))
-			guarantee_online_cpus(c, &c->cpus_allowed);
-		if (!nodes_empty(c->mems_allowed))
-			guarantee_online_mems(c, &c->mems_allowed);
+	INIT_LIST_HEAD(&queue);
+
+	list_add_tail((struct list_head *)&root->stack_list, &queue);
+
+	mutex_lock(&callback_mutex);
+	while (!list_empty(&queue)) {
+		cp = container_of(queue.next, struct cpuset, stack_list);
+		list_del(queue.next);
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			list_add_tail(&child->stack_list, &queue);
+		}
+		cont = cp->css.cgroup;
+		/* Remove offline cpus and mems from this cpuset. */
+		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+		nodes_and(cp->mems_allowed, cp->mems_allowed,
+						node_states[N_HIGH_MEMORY]);
+		if ((cpus_empty(cp->cpus_allowed) ||
+			nodes_empty(cp->mems_allowed))) {
+			/* Move tasks from the empty cpuset to a parent */
+			mutex_unlock(&callback_mutex);
+			remove_tasks_in_empty_cpuset(cp);
+			mutex_lock(&callback_mutex);
+		}
 	}
+	mutex_unlock(&callback_mutex);
+	return;
 }
 
 /*
  * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
  * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug
- * event.
- *
- * To ensure that we don't remove a CPU or node from the top cpuset
- * that is currently in use by a child cpuset (which would violate
- * the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ * track what's online after any CPU or memory node hotplug or unplug event.
  *
  * Since there are two callers of this routine, one for CPU hotplug
  * events and one for memory node hotplug events, we could have coded
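The heart of the patch is scan_for_empty_cpusets(), which replaces the recursive guarantee_online_cpus_mems_in_subtree() with an iterative breadth-first walk: the root goes on a queue, each visited cpuset enqueues its children, has its cpus and mems trimmed to what is still online, and, if that leaves it empty, hands its tasks to the nearest non-empty ancestor found by remove_tasks_in_empty_cpuset(). Because a parent is always processed before its children, the ancestor walk only ever sees already-trimmed masks. The following is a rough, self-contained userspace sketch of the same shape of walk over a toy tree; the types and names are hypothetical, and a plain unsigned bitmask stands in for cpus_allowed.

#include <stdio.h>

/* Illustrative userspace sketch only -- not kernel code; names are hypothetical. */

#define MAX_NODES 8

/* Toy stand-in for a cpuset: a cpu bitmask plus tree linkage. */
struct cs {
	const char *name;
	unsigned cpus;		/* bitmask of allowed CPUs */
	struct cs *parent;
	struct cs *child[2];
};

/* Walk up to the nearest ancestor that still has CPUs; as with top_cpuset,
 * the root is assumed to keep at least one online CPU, so this terminates. */
static struct cs *nonempty_ancestor(struct cs *c)
{
	struct cs *p = c->parent;
	while (p->cpus == 0)
		p = p->parent;
	return p;
}

/* Breadth-first: a parent is always trimmed before its children are visited. */
static void scan_for_empty(struct cs *root, unsigned online)
{
	struct cs *queue[MAX_NODES];
	int head = 0, tail = 0;

	queue[tail++] = root;
	while (head < tail) {
		struct cs *cp = queue[head++];

		for (int i = 0; i < 2; i++)
			if (cp->child[i])
				queue[tail++] = cp->child[i];

		cp->cpus &= online;	/* drop offlined CPUs, like cpus_and() */
		if (cp->cpus == 0 && cp->parent)
			/* the kernel drops callback_mutex here before moving tasks */
			printf("%s is empty: move its tasks to %s\n",
			       cp->name, nonempty_ancestor(cp)->name);
	}
}

int main(void)
{
	struct cs root = { "root", 0xf, NULL,  { NULL, NULL } };
	struct cs mid  = { "mid",  0x4, &root, { NULL, NULL } };
	struct cs leaf = { "leaf", 0x4, &mid,  { NULL, NULL } };

	root.child[0] = &mid;
	mid.child[0] = &leaf;

	scan_for_empty(&root, 0x3);	/* CPUs 2 and 3 went offline */
	return 0;
}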
@@ -1744,13 +1845,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
 static void common_cpu_mem_hotplug_unplug(void)
 {
 	cgroup_lock();
-	mutex_lock(&callback_mutex);
 
-	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+	scan_for_empty_cpusets(&top_cpuset);
 
-	mutex_unlock(&callback_mutex);
 	cgroup_unlock();
 }
 
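The final hunk changes the locking: common_cpu_mem_hotplug_unplug() no longer wraps the whole update in callback_mutex. It refreshes the top cpuset's masks, and scan_for_empty_cpusets() takes callback_mutex itself, dropping it around each call to remove_tasks_in_empty_cpuset() because the task-attach path acquires that same mutex. Below is a small pthreads sketch of that drop-the-lock-around-the-callback pattern, with hypothetical names; it only illustrates the ordering, not the kernel locking primitives.

#include <pthread.h>
#include <stdio.h>

/* Illustrative userspace sketch only -- not kernel code; names are hypothetical. */

static pthread_mutex_t callback_lock = PTHREAD_MUTEX_INITIALIZER;

/* The "move the tasks" path takes the lock itself, so callers must not hold it. */
static void move_tasks_out_of(const char *name)
{
	pthread_mutex_lock(&callback_lock);
	printf("moving tasks out of %s\n", name);
	pthread_mutex_unlock(&callback_lock);
}

/* Mirrors the shape of scan_for_empty_cpusets(): hold the lock while editing
 * state, but drop it around the call that re-acquires it. */
static void scan(void)
{
	pthread_mutex_lock(&callback_lock);
	/* ... trim masks under the lock ... */

	pthread_mutex_unlock(&callback_lock);	/* would deadlock if still held */
	move_tasks_out_of("empty_cpuset");
	pthread_mutex_lock(&callback_lock);

	/* ... continue the walk ... */
	pthread_mutex_unlock(&callback_lock);
}

int main(void)
{
	scan();
	return 0;
}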