 include/linux/cgroup.h |   1 +
 kernel/cgroup.c        |  22 +-
 kernel/cpuset.c        | 167 ++++++++++-----
 3 files changed, 145 insertions(+), 45 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8675c691d3e..ff9055fc3d2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -318,6 +318,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cont,
 					struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
+int cgroup_attach_task(struct cgroup *, struct task_struct *);
 
 #else /* !CONFIG_CGROUPS */
 
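
This hunk publishes cgroup_attach_task(), until now the static attach_task() in kernel/cgroup.c, so other subsystems can move a task between cgroups. A minimal sketch of a caller, assuming only the API shown in this patch; the helper move_one_task() is hypothetical, and cgroup_lock()/cgroup_unlock() are taken to wrap cgroup_mutex as the kernel/cgroup.c comments below describe:

    #include <linux/cgroup.h>
    #include <linux/sched.h>

    /* Hypothetical caller: move @tsk into cgroup @dst. */
    static int move_one_task(struct cgroup *dst, struct task_struct *tsk)
    {
    	int ret;

    	cgroup_lock();	/* cgroup_attach_task() runs under cgroup_mutex */
    	ret = cgroup_attach_task(dst, tsk);
    	cgroup_unlock();
    	return ret;
    }
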
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bcc7a6e8e3c..2c5cccbe12e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -489,7 +489,7 @@ static struct css_set *find_css_set(
  * Any task can increment and decrement the count field without lock.
  * So in general, code holding cgroup_mutex can't rely on the count
  * field not changing. However, if the count goes to zero, then only
- * attach_task() can increment it again. Because a count of zero
+ * cgroup_attach_task() can increment it again. Because a count of zero
  * means that no tasks are currently attached, therefore there is no
  * way a task attached to that cgroup can fork (the other way to
  * increment the count). So code holding cgroup_mutex can safely
@@ -520,17 +520,17 @@ static struct css_set *find_css_set(
  * The task_lock() exception
  *
  * The need for this exception arises from the action of
- * attach_task(), which overwrites one task's cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
  * another. It does so using cgroup_mutex, however there are
  * several performance critical places that need to reference
  * task->cgroup without the expense of grabbing a system global
  * mutex. Therefore except as noted below, when dereferencing or, as
- * in attach_task(), modifying a task's cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
  * the task_struct routinely used for such matters.
  *
  * P.S. One more locking exception. RCU is used to guard the
- * update of a task's cgroup pointer by attach_task()
+ * update of a task's cgroup pointer by cgroup_attach_task()
  */
 
 /**
@@ -1194,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
  * Call holding cgroup_mutex. May take task_lock of
  * the task 'pid' during call.
  */
-static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	int retval = 0;
 	struct cgroup_subsys *ss;
@@ -1287,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
 		get_task_struct(tsk);
 	}
 
-	ret = attach_task(cgrp, tsk);
+	ret = cgroup_attach_task(cgrp, tsk);
 	put_task_struct(tsk);
 	return ret;
 }
@@ -2514,7 +2514,7 @@ out:
  *  - Used for /proc/<pid>/cgroup.
  *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
  *    doesn't really matter if tsk->cgroup changes after we read it,
- *    and we take cgroup_mutex, keeping attach_task() from changing it
+ *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
  *    anyway. No need to check that tsk->cgroup != NULL, thanks to
  *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
  *    cgroup to top_cgroup.
@@ -2625,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = {
  * A pointer to the shared css_set was automatically copied in
  * fork.c by dup_task_struct(). However, we ignore that copy, since
  * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. attach_task() might
+ * might no longer be a valid cgroup pointer. cgroup_attach_task() might
  * have already changed current->cgroups, allowing the previously
  * referenced cgroup group to be removed and freed.
  *
@@ -2704,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child)
  * attach us to a different cgroup, decrementing the count on
  * the first cgroup that we never incremented. But in this case,
  * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any attach_task() attempts, or task is a failed
- * fork, never visible to attach_task.
+ * which wards off any cgroup_attach_task() attempts, or task is a failed
+ * fork, never visible to cgroup_attach_task.
  *
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
@@ -2845,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 	}
 
 	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = attach_task(child, tsk);
+	ret = cgroup_attach_task(child, tsk);
 	mutex_unlock(&cgroup_mutex);
 
  out_release:
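
The locking comments above boil down to: writers of a task's cgroup pointer (cgroup_attach_task()) hold cgroup_mutex and task_lock(), while hot-path readers may rely on RCU alone. A sketch of the read side under those rules, assuming the task_subsys_state() accessor from include/linux/cgroup.h of this era; the function peek_task_css() is hypothetical:

    #include <linux/cgroup.h>
    #include <linux/rcupdate.h>

    /* Hypothetical reader: inspect a task's css without cgroup_mutex. */
    static void peek_task_css(struct task_struct *tsk, int subsys_id)
    {
    	struct cgroup_subsys_state *css;

    	rcu_read_lock();	/* pins the pointer cgroup_attach_task() may swap */
    	css = task_subsys_state(tsk, subsys_id);
    	/* use css here; don't dereference it after rcu_read_unlock() */
    	rcu_read_unlock();
    }
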
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d81..d94a8f7c4c2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,8 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 #include <linux/kfifo.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
 
 /*
  * Tracks how many cpusets are currently defined in system.
@@ -96,6 +98,9 @@ struct cpuset {
 
 	/* partition number for rebuild_sched_domains() */
 	int pn;
+
+	/* used for walking a cpuset hierarchy */
+	struct list_head stack_list;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 	return container_of(task_subsys_state(task, cpuset_subsys_id),
 			    struct cpuset, css);
 }
-
+struct cpuset_hotplug_scanner {
+	struct cgroup_scanner scan;
+	struct cgroup *to;
+};
 
 /* bits in struct cpuset flags field */
 typedef enum {
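
The new cpuset_hotplug_scanner simply wraps the generic struct cgroup_scanner so that extra per-scan state (the destination cgroup "to") can ride along to the process_task callback, which recovers the wrapper with container_of(). A sketch of the pattern, mirroring cpuset_do_move_task() further down; names prefixed my_ are hypothetical:

    /* Hypothetical wrapper: the cgroup_scanner must be embedded, not pointed to. */
    struct my_scanner {
    	struct cgroup_scanner scan;
    	struct cgroup *to;	/* extra state for the callback */
    };

    static void my_process_task(struct task_struct *tsk,
    				struct cgroup_scanner *scan)
    {
    	struct my_scanner *ms = container_of(scan, struct my_scanner, scan);

    	/* ms->to is visible here, though only &ms->scan was passed around */
    	cgroup_attach_task(ms->to, tsk);
    }
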
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void)
 	return 0;
 }
 
+/**
+ * cpuset_do_move_task - move a given task to another cpuset
+ * @tsk: pointer to task_struct of the task to move
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ * Return nonzero to stop the walk through the tasks.
+ */
+void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+{
+	struct cpuset_hotplug_scanner *chsp;
+
+	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
+	cgroup_attach_task(chsp->to, tsk);
+}
+
+/**
+ * move_member_tasks_to_cpuset - move tasks from one cpuset to another
+ * @from: cpuset in which the tasks currently reside
+ * @to: cpuset to which the tasks will be moved
+ *
+ * Called with manage_mutex held.
+ * callback_mutex must not be held, as cgroup_attach_task() will take it.
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+	struct cpuset_hotplug_scanner scan;
+
+	scan.scan.cg = from->css.cgroup;
+	scan.scan.test_task = NULL; /* select all tasks in cgroup */
+	scan.scan.process_task = cpuset_do_move_task;
+	scan.scan.heap = NULL;
+	scan.to = to->css.cgroup;
+
+	if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+		printk(KERN_ERR "move_member_tasks_to_cpuset: "
+				"cgroup_scan_tasks failed\n");
+}
+
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes. Cpusets that were already empty of
- * CPUs or nodes are left empty.
- *
- * This routine is intentionally inefficient in a couple of regards.
- * It will check all cpusets in a subtree even if the top cpuset of
- * the subtree has no offline CPUs or nodes. It checks both CPUs and
- * nodes, even though the caller could have been coded to know that
- * only one of CPUs or nodes needed to be checked on a given call.
- * This was done to minimize text size rather than cpu cycles.
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
  *
- * Call with both manage_mutex and callback_mutex held.
+ * The parent cpuset has some superset of the 'mems' nodes that the
+ * newly empty cpuset held, so no migration of memory is necessary.
  *
- * Recursive, on depth of cpuset subtree.
+ * Called with both manage_mutex and callback_mutex held.
  */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+	struct cpuset *parent;
+
+	/* the cgroup's css_sets list is in use if there are tasks
+	   in the cpuset; the list is empty if there are none;
+	   the cs->css.refcnt seems always 0 */
+	if (list_empty(&cs->css.cgroup->css_sets))
+		return;
 
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+	/*
+	 * Find its next-highest non-empty parent (the top cpuset
+	 * has online cpus, so it can't be empty).
+	 */
+	parent = cs->parent;
+	while (cpus_empty(parent->cpus_allowed)) {
+		/*
+		 * This empty cpuset should now be considered to
+		 * have been used, and therefore eligible for
+		 * release when empty (if it is notify_on_release).
+		 */
+		parent = parent->parent;
+	}
+
+	move_member_tasks_to_cpuset(cs, parent);
+}
+
+/*
+ * Walk the specified cpuset subtree and look for empty cpusets.
+ * The tasks of such a cpuset must be moved to a parent cpuset.
+ *
+ * Note that such a notify_on_release cpuset must have had, at some time,
+ * member tasks or cpuset descendants and cpus and memory, before it can
+ * be a candidate for release.
+ *
+ * Called with manage_mutex held. We take callback_mutex to modify
+ * cpus_allowed and mems_allowed.
+ *
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next. It always processes a node before
+ * any of its children.
+ *
+ * For now, since we lack memory hot unplug, we'll never see a cpuset
+ * that has tasks along with an empty 'mems'. But if we did see such
+ * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ */
+static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+	struct cpuset *cp;	/* scans cpusets being updated */
+	struct cpuset *child;	/* scans child cpusets of cp */
+	struct list_head queue;
 	struct cgroup *cont;
-	struct cpuset *c;
 
-	/* Each of our child cpusets mems must be online */
-	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
-		c = cgroup_cs(cont);
-		guarantee_online_cpus_mems_in_subtree(c);
-		if (!cpus_empty(c->cpus_allowed))
-			guarantee_online_cpus(c, &c->cpus_allowed);
-		if (!nodes_empty(c->mems_allowed))
-			guarantee_online_mems(c, &c->mems_allowed);
+	INIT_LIST_HEAD(&queue);
+
+	list_add_tail((struct list_head *)&root->stack_list, &queue);
+
+	mutex_lock(&callback_mutex);
+	while (!list_empty(&queue)) {
+		cp = container_of(queue.next, struct cpuset, stack_list);
+		list_del(queue.next);
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			list_add_tail(&child->stack_list, &queue);
+		}
+		cont = cp->css.cgroup;
+		/* Remove offline cpus and mems from this cpuset. */
+		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+		nodes_and(cp->mems_allowed, cp->mems_allowed,
+						node_states[N_HIGH_MEMORY]);
+		if ((cpus_empty(cp->cpus_allowed) ||
+		     nodes_empty(cp->mems_allowed))) {
+			/* Move tasks from the empty cpuset to a parent */
+			mutex_unlock(&callback_mutex);
+			remove_tasks_in_empty_cpuset(cp);
+			mutex_lock(&callback_mutex);
+		}
 	}
+	mutex_unlock(&callback_mutex);
+	return;
 }
 
 /*
  * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
  * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug
- * event.
- *
- * To ensure that we don't remove a CPU or node from the top cpuset
- * that is currently in use by a child cpuset (which would violate
- * the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ * track what's online after any CPU or memory node hotplug or unplug event.
  *
  * Since there are two callers of this routine, one for CPU hotplug
  * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1845,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
 static void common_cpu_mem_hotplug_unplug(void)
 {
 	cgroup_lock();
-	mutex_lock(&callback_mutex);
 
-	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+	scan_for_empty_cpusets(&top_cpuset);
 
-	mutex_unlock(&callback_mutex);
 	cgroup_unlock();
 }
 
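
For reference, the queue discipline in scan_for_empty_cpusets() distilled to its skeleton: the new stack_list member threads each cpuset onto a temporary FIFO, so the tree is visited breadth-first and a parent is always processed before its children. This restatement is a sketch, not part of the patch; it takes a non-const root, which avoids the (struct list_head *) cast the patch needs for its const argument:

    static void walk_cpusets_top_down(struct cpuset *root)
    {
    	LIST_HEAD(queue);	/* FIFO of cpusets awaiting a visit */
    	struct cpuset *cp;
    	struct cgroup *cont;

    	list_add_tail(&root->stack_list, &queue);
    	while (!list_empty(&queue)) {
    		/* dequeue from the head... */
    		cp = list_entry(queue.next, struct cpuset, stack_list);
    		list_del(queue.next);
    		/* ...and enqueue the children at the tail */
    		list_for_each_entry(cont, &cp->css.cgroup->children, sibling)
    			list_add_tail(&cgroup_cs(cont)->stack_list, &queue);
    		/* visit cp here: parents always come before descendants */
    	}
    }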