aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cpuset.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- kernel/cpuset.c 167
1 files changed, 133 insertions, 34 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..d94a8f7c4c29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,8 @@
56#include <asm/atomic.h> 56#include <asm/atomic.h>
57#include <linux/mutex.h> 57#include <linux/mutex.h>
58#include <linux/kfifo.h> 58#include <linux/kfifo.h>
59#include <linux/workqueue.h>
60#include <linux/cgroup.h>
59 61
60/* 62/*
61 * Tracks how many cpusets are currently defined in system. 63 * Tracks how many cpusets are currently defined in system.
@@ -96,6 +98,9 @@ struct cpuset {
96 98
97 /* partition number for rebuild_sched_domains() */ 99 /* partition number for rebuild_sched_domains() */
98 int pn; 100 int pn;
101
102 /* used for walking a cpuset hierarchy */
103 struct list_head stack_list;
99}; 104};
100 105
101/* Retrieve the cpuset for a cgroup */ 106/* Retrieve the cpuset for a cgroup */
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
111 return container_of(task_subsys_state(task, cpuset_subsys_id), 116 return container_of(task_subsys_state(task, cpuset_subsys_id),
112 struct cpuset, css); 117 struct cpuset, css);
113} 118}
114 119struct cpuset_hotplug_scanner {
120 struct cgroup_scanner scan;
121 struct cgroup *to;
122};
115 123
116/* bits in struct cpuset flags field */ 124/* bits in struct cpuset flags field */
117typedef enum { 125typedef enum {
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void)
1687 return 0; 1695 return 0;
1688} 1696}
1689 1697
1698/**
1699 * cpuset_do_move_task - move a given task to another cpuset
1700 * @tsk: pointer to the task_struct of the task to move
1701 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
1702 *
1703 * Called by cgroup_scan_tasks() for each task in a cgroup.
1704 * Returns nothing (void), so it cannot stop the walk through the tasks.
1705 */
1706void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
1707{
1708 struct cpuset_hotplug_scanner *chsp;
1709
1710 chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
1711 cgroup_attach_task(chsp->to, tsk);
1712}
1713
1714/**
1715 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
1716 * @from: cpuset in which the tasks currently reside
1717 * @to: cpuset to which the tasks will be moved
1718 *
1719 * Called with manage_mutex held
1720 * callback_mutex must not be held, as attach_task() will take it.
1721 *
1722 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1723 * calling callback functions for each.
1724 */
1725static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1726{
1727 struct cpuset_hotplug_scanner scan;
1728
1729 scan.scan.cg = from->css.cgroup;
1730 scan.scan.test_task = NULL; /* select all tasks in cgroup */
1731 scan.scan.process_task = cpuset_do_move_task;
1732 scan.scan.heap = NULL;
1733 scan.to = to->css.cgroup;
1734
1735 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
1736 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1737 "cgroup_scan_tasks failed\n");
1738}
1739
1690/* 1740/*
1691 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1741 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
1692 * or memory nodes, we need to walk over the cpuset hierarchy, 1742 * or memory nodes, we need to walk over the cpuset hierarchy,
1693 * removing that CPU or node from all cpusets. If this removes the 1743 * removing that CPU or node from all cpusets. If this removes the
1694 * last CPU or node from a cpuset, then the guarantee_online_cpus() 1744 * last CPU or node from a cpuset, then move the tasks in the empty
1695 * or guarantee_online_mems() code will use that emptied cpusets 1745 * cpuset to its next-highest non-empty parent.
1696 * parent online CPUs or nodes. Cpusets that were already empty of
1697 * CPUs or nodes are left empty.
1698 *
1699 * This routine is intentionally inefficient in a couple of regards.
1700 * It will check all cpusets in a subtree even if the top cpuset of
1701 * the subtree has no offline CPUs or nodes. It checks both CPUs and
1702 * nodes, even though the caller could have been coded to know that
1703 * only one of CPUs or nodes needed to be checked on a given call.
1704 * This was done to minimize text size rather than cpu cycles.
1705 * 1746 *
1706 * Call with both manage_mutex and callback_mutex held. 1747 * The parent cpuset has some superset of the 'mems' nodes that the
1748 * newly empty cpuset held, so no migration of memory is necessary.
1707 * 1749 *
1708 * Recursive, on depth of cpuset subtree. 1750 * Called with manage_mutex held; callback_mutex must not be held
1709 */ 1751 */
1752static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1753{
1754 struct cpuset *parent;
1755
1756 /* the cgroup's css_sets list is in use if there are tasks
1757 in the cpuset; the list is empty if there are none;
1758 the cs->css.refcnt seems always 0 */
1759 if (list_empty(&cs->css.cgroup->css_sets))
1760 return;
1710 1761
1711static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) 1762 /*
1763 * Find its next-highest non-empty parent, (top cpuset
1764 * has online cpus, so can't be empty).
1765 */
1766 parent = cs->parent;
1767 while (cpus_empty(parent->cpus_allowed)) {
1768 /*
1769 * this empty cpuset should now be considered to
1770 * have been used, and therefore eligible for
1771 * release when empty (if it is notify_on_release)
1772 */
1773 parent = parent->parent;
1774 }
1775
1776 move_member_tasks_to_cpuset(cs, parent);
1777}
1778
1779/*
1780 * Walk the specified cpuset subtree and look for empty cpusets.
1781 * The tasks of such a cpuset must be moved to a parent cpuset.
1782 *
1783 * Note that such a notify_on_release cpuset must have had, at some time,
1784 * member tasks or cpuset descendants and cpus and memory, before it can
1785 * be a candidate for release.
1786 *
1787 * Called with manage_mutex held. We take callback_mutex to modify
1788 * cpus_allowed and mems_allowed.
1789 *
1790 * This walk processes the tree from top to bottom, completing one layer
1791 * before dropping down to the next. It always processes a node before
1792 * any of its children.
1793 *
1794 * For now, since we lack memory hot unplug, we'll never see a cpuset
1795 * that has tasks along with an empty 'mems'. But if we did see such
1796 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1797 */
1798static void scan_for_empty_cpusets(const struct cpuset *root)
1712{ 1799{
1800 struct cpuset *cp; /* scans cpusets being updated */
1801 struct cpuset *child; /* scans child cpusets of cp */
1802 struct list_head queue;
1713 struct cgroup *cont; 1803 struct cgroup *cont;
1714 struct cpuset *c;
1715 1804
1716 /* Each of our child cpusets mems must be online */ 1805 INIT_LIST_HEAD(&queue);
1717 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 1806
1718 c = cgroup_cs(cont); 1807 list_add_tail((struct list_head *)&root->stack_list, &queue);
1719 guarantee_online_cpus_mems_in_subtree(c); 1808
1720 if (!cpus_empty(c->cpus_allowed)) 1809 mutex_lock(&callback_mutex);
1721 guarantee_online_cpus(c, &c->cpus_allowed); 1810 while (!list_empty(&queue)) {
1722 if (!nodes_empty(c->mems_allowed)) 1811 cp = container_of(queue.next, struct cpuset, stack_list);
1723 guarantee_online_mems(c, &c->mems_allowed); 1812 list_del(queue.next);
1813 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1814 child = cgroup_cs(cont);
1815 list_add_tail(&child->stack_list, &queue);
1816 }
1817 cont = cp->css.cgroup;
1818 /* Remove offline cpus and mems from this cpuset. */
1819 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
1820 nodes_and(cp->mems_allowed, cp->mems_allowed,
1821 node_states[N_HIGH_MEMORY]);
1822 if ((cpus_empty(cp->cpus_allowed) ||
1823 nodes_empty(cp->mems_allowed))) {
1824 /* Move tasks from the empty cpuset to a parent */
1825 mutex_unlock(&callback_mutex);
1826 remove_tasks_in_empty_cpuset(cp);
1827 mutex_lock(&callback_mutex);
1828 }
1724 } 1829 }
1830 mutex_unlock(&callback_mutex);
1831 return;
1725} 1832}
1726 1833
1727/* 1834/*
1728 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track 1835 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1729 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to 1836 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1730 * track what's online after any CPU or memory node hotplug or unplug 1837 * track what's online after any CPU or memory node hotplug or unplug event.
1731 * event.
1732 *
1733 * To ensure that we don't remove a CPU or node from the top cpuset
1734 * that is currently in use by a child cpuset (which would violate
1735 * the rule that cpusets must be subsets of their parent), we first
1736 * call the recursive routine guarantee_online_cpus_mems_in_subtree().
1737 * 1838 *
1738 * Since there are two callers of this routine, one for CPU hotplug 1839 * Since there are two callers of this routine, one for CPU hotplug
1739 * events and one for memory node hotplug events, we could have coded 1840 * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1845,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1744static void common_cpu_mem_hotplug_unplug(void) 1845static void common_cpu_mem_hotplug_unplug(void)
1745{ 1846{
1746 cgroup_lock(); 1847 cgroup_lock();
1747 mutex_lock(&callback_mutex);
1748 1848
1749 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
1750 top_cpuset.cpus_allowed = cpu_online_map; 1849 top_cpuset.cpus_allowed = cpu_online_map;
1751 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 1850 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1851 scan_for_empty_cpusets(&top_cpuset);
1752 1852
1753 mutex_unlock(&callback_mutex);
1754 cgroup_unlock(); 1853 cgroup_unlock();
1755} 1854}
1756 1855