aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cpuset.c
diff options
context:
space:
mode:
author: Cliff Wickman <cpw@sgi.com> 2008-02-07 03:14:43 -0500
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-07 11:42:22 -0500
commit: 956db3ca0606e78456786ef19fd4dc7a5151a6e1 (patch)
tree: 0bef3d107df1115ecf76e342f30ecee67a7f3705 /kernel/cpuset.c
parent: 31a7df01fd0cd786f60873a921aecafac148c290 (diff)
hotplug cpu: move tasks in empty cpusets to parent
This patch corrects a situation that occurs when one disables all the cpus in a cpuset.

Currently, the disabled (cpu-less) cpuset inherits the cpus of its parent, which is incorrect because it may then overlap its cpu-exclusive sibling.

Tasks of an empty cpuset should be moved to the cpuset which is the parent of their current cpuset or, if the parent cpuset has no cpus either, to its parent, and so on up the hierarchy. The empty cpuset should then be released (if it is flagged notify_on_release).

Depends on the cgroup_scan_tasks() function (proposed by David Rientjes) to iterate through all tasks in the cpu-less cpuset. We are deliberately avoiding a walk of the tasklist.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--kernel/cpuset.c167
1 files changed, 133 insertions, 34 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..d94a8f7c4c29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,8 @@
56#include <asm/atomic.h> 56#include <asm/atomic.h>
57#include <linux/mutex.h> 57#include <linux/mutex.h>
58#include <linux/kfifo.h> 58#include <linux/kfifo.h>
59#include <linux/workqueue.h>
60#include <linux/cgroup.h>
59 61
60/* 62/*
61 * Tracks how many cpusets are currently defined in system. 63 * Tracks how many cpusets are currently defined in system.
@@ -96,6 +98,9 @@ struct cpuset {
96 98
97 /* partition number for rebuild_sched_domains() */ 99 /* partition number for rebuild_sched_domains() */
98 int pn; 100 int pn;
101
102 /* used for walking a cpuset hierarchy */
103 struct list_head stack_list;
99}; 104};
100 105
101/* Retrieve the cpuset for a cgroup */ 106/* Retrieve the cpuset for a cgroup */
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
111 return container_of(task_subsys_state(task, cpuset_subsys_id), 116 return container_of(task_subsys_state(task, cpuset_subsys_id),
112 struct cpuset, css); 117 struct cpuset, css);
113} 118}
114 119struct cpuset_hotplug_scanner { /* per-scan state used to move every task of one cgroup into another */
120 struct cgroup_scanner scan; /* embedded scanner handed to cgroup_scan_tasks(); container_of() recovers this struct from it */
121 struct cgroup *to; /* destination cgroup given to cgroup_attach_task() */
122};
115 123
116/* bits in struct cpuset flags field */ 124/* bits in struct cpuset flags field */
117typedef enum { 125typedef enum {
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void)
1687 return 0; 1695 return 0;
1688} 1696}
1689 1697
1698/**
1699 * cpuset_do_move_task - move a given task to another cpuset
1700 * @tsk: pointer to the task_struct of the task to move
1701 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
1702 *
1703 * Called by cgroup_scan_tasks() for each task in a cgroup.
1704 * Returns nothing; the result of cgroup_attach_task() is not checked here.
1705 */
1706void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
1707{
1708 struct cpuset_hotplug_scanner *chsp;
1709
1710 chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); /* step back to the enclosing scanner to reach ->to */
1711 cgroup_attach_task(chsp->to, tsk);
1712}
1713
1714/**
1715 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
1716 * @from: cpuset in which the tasks currently reside
1717 * @to: cpuset to which the tasks will be moved
1718 *
1719 * Called with manage_sem held (NOTE(review): stale lock name; the call chain visible in this file enters via cgroup_lock() - confirm).
1720 * callback_mutex must not be held, as attach_task() will take it.
1721 *
1722 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1723 * calling the process_task callback (cpuset_do_move_task) for each; a nonzero result is only logged.
1724 */
1725static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1726{
1727 struct cpuset_hotplug_scanner scan;
1728
1729 scan.scan.cg = from->css.cgroup;
1730 scan.scan.test_task = NULL; /* select all tasks in cgroup */
1731 scan.scan.process_task = cpuset_do_move_task;
1732 scan.scan.heap = NULL;
1733 scan.to = to->css.cgroup;
1734
1735 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) /* NOTE(review): cast relies on 'scan' being the first member; &scan.scan would not */
1736 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1737 "cgroup_scan_tasks failed\n");
1738}
1739
1690/* 1740/*
1691 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1741 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
1692 * or memory nodes, we need to walk over the cpuset hierarchy, 1742 * or memory nodes, we need to walk over the cpuset hierarchy,
1693 * removing that CPU or node from all cpusets. If this removes the 1743 * removing that CPU or node from all cpusets. If this removes the
1694 * last CPU or node from a cpuset, then the guarantee_online_cpus() 1744 * last CPU or node from a cpuset, then move the tasks in the empty
1695 * or guarantee_online_mems() code will use that emptied cpusets 1745 * cpuset to its next-highest non-empty parent.
1696 * parent online CPUs or nodes. Cpusets that were already empty of
1697 * CPUs or nodes are left empty.
1698 *
1699 * This routine is intentionally inefficient in a couple of regards.
1700 * It will check all cpusets in a subtree even if the top cpuset of
1701 * the subtree has no offline CPUs or nodes. It checks both CPUs and
1702 * nodes, even though the caller could have been coded to know that
1703 * only one of CPUs or nodes needed to be checked on a given call.
1704 * This was done to minimize text size rather than cpu cycles.
1705 * 1746 *
1706 * Call with both manage_mutex and callback_mutex held. 1747 * The parent cpuset has some superset of the 'mems' nodes that the
1748 * newly empty cpuset held, so no migration of memory is necessary.
1707 * 1749 *
1708 * Recursive, on depth of cpuset subtree. 1750 * Called with manage_mutex held and callback_mutex dropped: scan_for_empty_cpusets() unlocks callback_mutex around this call
1709 */ 1751 */
1752static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1753{
1754 struct cpuset *parent;
1755
1756 /* the cgroup's css_sets list is in use if there are tasks
1757 in the cpuset; the list is empty if there are none;
1758 the cs->css.refcnt seems always 0 */
1759 if (list_empty(&cs->css.cgroup->css_sets))
1760 return;
1710 1761
1711static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) 1762 /*
1763 * Find its next-highest non-empty parent, (top cpuset
1764 * has online cpus, so can't be empty). NOTE(review): only cpus_allowed is tested; the caller also gets here when mems_allowed is empty.
1765 */
1766 parent = cs->parent;
1767 while (cpus_empty(parent->cpus_allowed)) {
1768 /*
1769 * this empty cpuset should now be considered to
1770 * have been used, and therefore eligible for
1771 * release when empty (if it is notify_on_release)
1772 */
1773 parent = parent->parent;
1774 }
1775
1776 move_member_tasks_to_cpuset(cs, parent);
1777}
1778
1779/*
1780 * Walk the specified cpuset subtree and look for empty cpusets.
1781 * The tasks of such cpuset must be moved to a parent cpuset.
1782 *
1783 * Note that such a notify_on_release cpuset must have had, at some time,
1784 * member tasks or cpuset descendants and cpus and memory, before it can
1785 * be a candidate for release.
1786 *
1787 * Called with manage_mutex held. We take callback_mutex to modify
1788 * cpus_allowed and mems_allowed.
1789 *
1790 * This walk processes the tree from top to bottom, completing one layer
1791 * before dropping down to the next. It always processes a node before
1792 * any of its children.
1793 *
1794 * For now, since we lack memory hot unplug, we'll never see a cpuset
1795 * that has tasks along with an empty 'mems'. But if we did see such
1796 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1797 */
1798static void scan_for_empty_cpusets(const struct cpuset *root)
1712{ 1799{
1800 struct cpuset *cp; /* scans cpusets being updated */
1801 struct cpuset *child; /* scans child cpusets of cp */
1802 struct list_head queue;
1713 struct cgroup *cont; 1803 struct cgroup *cont;
1714 struct cpuset *c;
1715 1804
1716 /* Each of our child cpusets mems must be online */ 1805 INIT_LIST_HEAD(&queue);
1717 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 1806
1718 c = cgroup_cs(cont); 1807 list_add_tail((struct list_head *)&root->stack_list, &queue); /* seed the queue with root; the cast drops root's const */
1719 guarantee_online_cpus_mems_in_subtree(c); 1808
1720 if (!cpus_empty(c->cpus_allowed)) 1809 mutex_lock(&callback_mutex);
1721 guarantee_online_cpus(c, &c->cpus_allowed); 1810 while (!list_empty(&queue)) {
1722 if (!nodes_empty(c->mems_allowed)) 1811 cp = container_of(queue.next, struct cpuset, stack_list);
1723 guarantee_online_mems(c, &c->mems_allowed); 1812 list_del(queue.next); /* pop the head entry just read into cp */
1813 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1814 child = cgroup_cs(cont);
1815 list_add_tail(&child->stack_list, &queue); /* children go to the tail, giving the layer-by-layer order */
1816 }
1817 cont = cp->css.cgroup;
1818 /* Remove offline cpus and mems from this cpuset. */
1819 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
1820 nodes_and(cp->mems_allowed, cp->mems_allowed,
1821 node_states[N_HIGH_MEMORY]);
1822 if ((cpus_empty(cp->cpus_allowed) ||
1823 nodes_empty(cp->mems_allowed))) {
1824 /* Move tasks from the empty cpuset to a parent */
1825 mutex_unlock(&callback_mutex); /* the task-moving path must run without callback_mutex (see move_member_tasks_to_cpuset()) */
1826 remove_tasks_in_empty_cpuset(cp);
1827 mutex_lock(&callback_mutex);
1828 }
1724 } 1829 }
1830 mutex_unlock(&callback_mutex);
1831 return;
1725} 1832}
1726 1833
1727/* 1834/*
1728 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track 1835 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1729 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to 1836 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1730 * track what's online after any CPU or memory node hotplug or unplug 1837 * track what's online after any CPU or memory node hotplug or unplug event.
1731 * event.
1732 *
1733 * To ensure that we don't remove a CPU or node from the top cpuset
1734 * that is currently in use by a child cpuset (which would violate
1735 * the rule that cpusets must be subsets of their parent), we first
1736 * call the recursive routine guarantee_online_cpus_mems_in_subtree().
1737 * 1838 *
1738 * Since there are two callers of this routine, one for CPU hotplug 1839 * Since there are two callers of this routine, one for CPU hotplug
1739 * events and one for memory node hotplug events, we could have coded 1840 * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1845,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1744static void common_cpu_mem_hotplug_unplug(void) 1845static void common_cpu_mem_hotplug_unplug(void)
1745{ 1846{
1746 cgroup_lock(); 1847 cgroup_lock();
1747 mutex_lock(&callback_mutex);
1748 1848
1749 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
1750 top_cpuset.cpus_allowed = cpu_online_map; 1849 top_cpuset.cpus_allowed = cpu_online_map; /* NOTE(review): this store and the next now run under cgroup_lock() only, not callback_mutex - confirm readers tolerate that */
1751 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 1850 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1851 scan_for_empty_cpusets(&top_cpuset); /* takes and releases callback_mutex itself while trimming each cpuset */
1752 1852
1753 mutex_unlock(&callback_mutex);
1754 cgroup_unlock(); 1853 cgroup_unlock();
1755} 1854}
1756 1855