path: root/kernel
author    Cliff Wickman <cpw@sgi.com>    2008-02-07 03:14:43 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org>    2008-02-07 11:42:22 -0500
commit    956db3ca0606e78456786ef19fd4dc7a5151a6e1 (patch)
tree      0bef3d107df1115ecf76e342f30ecee67a7f3705 /kernel
parent    31a7df01fd0cd786f60873a921aecafac148c290 (diff)
hotplug cpu: move tasks in empty cpusets to parent
This patch corrects a situation that occurs when one disables all the cpus in a cpuset. Currently, the disabled (cpu-less) cpuset inherits the cpus of its parent, which is incorrect because it may then overlap its cpu-exclusive sibling.

Tasks of an empty cpuset should be moved to the cpuset which is the parent of their current cpuset. Or if the parent cpuset has no cpus, to its parent, etc. And the empty cpuset should be released (if it is flagged notify_on_release).

Depends on the cgroup_scan_tasks() function (proposed by David Rientjes) to iterate through all tasks in the cpu-less cpuset. We are deliberately avoiding a walk of the tasklist.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--    kernel/cgroup.c    22
-rw-r--r--    kernel/cpuset.c    167
2 files changed, 144 insertions(+), 45 deletions(-)
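In short: when a hotplug event leaves a cpuset with no CPUs, its tasks are reattached, via cgroup_scan_tasks() and cgroup_attach_task(), to the nearest ancestor cpuset that still has CPUs, and the emptied cpuset becomes eligible for release if it is flagged notify_on_release. A minimal userspace sketch of that ancestor walk, with illustrative types and names (the real logic is remove_tasks_in_empty_cpuset() in the cpuset.c hunks below):

#include <stdio.h>

struct toy_cpuset {
	const char *name;
	int ncpus;                  /* 0 == all of this cpuset's cpus went offline */
	struct toy_cpuset *parent;  /* NULL only for the top cpuset */
};

/*
 * Mirrors the loop in remove_tasks_in_empty_cpuset(): the top cpuset
 * always tracks the online cpus, so the walk is guaranteed to stop.
 */
static struct toy_cpuset *nearest_nonempty_ancestor(struct toy_cpuset *cs)
{
	struct toy_cpuset *parent = cs->parent;

	while (parent->ncpus == 0)
		parent = parent->parent;
	return parent;
}

int main(void)
{
	struct toy_cpuset top = { "top", 4, NULL };
	struct toy_cpuset a = { "top/a", 0, &top };	/* also emptied */
	struct toy_cpuset b = { "top/a/b", 0, &a };	/* all cpus offlined */

	/* Tasks of top/a/b end up attached to "top". */
	printf("tasks of %s move to %s\n",
	       b.name, nearest_nonempty_ancestor(&b)->name);
	return 0;
}

The termination guarantee in the sketch is the same one the patch relies on: the top cpuset always tracks cpu_online_map, so it can never be empty.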
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bcc7a6e8e3c0..2c5cccbe12e2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -489,7 +489,7 @@ static struct css_set *find_css_set(
  * Any task can increment and decrement the count field without lock.
  * So in general, code holding cgroup_mutex can't rely on the count
  * field not changing. However, if the count goes to zero, then only
- * attach_task() can increment it again. Because a count of zero
+ * cgroup_attach_task() can increment it again. Because a count of zero
  * means that no tasks are currently attached, therefore there is no
  * way a task attached to that cgroup can fork (the other way to
  * increment the count). So code holding cgroup_mutex can safely
@@ -520,17 +520,17 @@ static struct css_set *find_css_set(
  * The task_lock() exception
  *
  * The need for this exception arises from the action of
- * attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
  * another. It does so using cgroup_mutexe, however there are
  * several performance critical places that need to reference
  * task->cgroup without the expense of grabbing a system global
  * mutex. Therefore except as noted below, when dereferencing or, as
- * in attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
  * the task_struct routinely used for such matters.
  *
  * P.S. One more locking exception. RCU is used to guard the
- * update of a tasks cgroup pointer by attach_task()
+ * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
 /**
@@ -1194,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
  * Call holding cgroup_mutex. May take task_lock of
  * the task 'pid' during call.
  */
-static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	int retval = 0;
 	struct cgroup_subsys *ss;
@@ -1287,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
 		get_task_struct(tsk);
 	}
 
-	ret = attach_task(cgrp, tsk);
+	ret = cgroup_attach_task(cgrp, tsk);
 	put_task_struct(tsk);
 	return ret;
 }
@@ -2514,7 +2514,7 @@ out:
  *  - Used for /proc/<pid>/cgroup.
  *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
  *    doesn't really matter if tsk->cgroup changes after we read it,
- *    and we take cgroup_mutex, keeping attach_task() from changing it
+ *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
  *    anyway. No need to check that tsk->cgroup != NULL, thanks to
  *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
  *    cgroup to top_cgroup.
@@ -2625,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = {
  * A pointer to the shared css_set was automatically copied in
  * fork.c by dup_task_struct(). However, we ignore that copy, since
  * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. attach_task() might
+ * might no longer be a valid cgroup pointer. cgroup_attach_task() might
  * have already changed current->cgroups, allowing the previously
  * referenced cgroup group to be removed and freed.
  *
@@ -2704,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child)
  * attach us to a different cgroup, decrementing the count on
  * the first cgroup that we never incremented. But in this case,
  * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any attach_task() attempts, or task is a failed
- * fork, never visible to attach_task.
+ * which wards off any cgroup_attach_task() attempts, or task is a failed
+ * fork, never visible to cgroup_attach_task.
  *
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
@@ -2845,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 	}
 
 	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = attach_task(child, tsk);
+	ret = cgroup_attach_task(child, tsk);
 	mutex_unlock(&cgroup_mutex);
 
 out_release:
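The cgroup.c hunks above rename the file-local attach_task() to a global cgroup_attach_task() so that cpuset.c can call it. In the cpuset.c changes that follow, the destination cgroup is carried to the cgroup_scan_tasks() callback by embedding a struct cgroup_scanner inside a struct cpuset_hotplug_scanner and recovering the wrapper with container_of(). A self-contained userspace sketch of that embedding pattern (stand-in types and names, not the kernel's):

#include <stdio.h>
#include <stddef.h>

/* Minimal stand-in for the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Plays the role of struct cgroup_scanner: the walker only knows this type. */
struct scanner {
	void (*process)(struct scanner *scan, int task_id);
};

/* Plays the role of struct cpuset_hotplug_scanner: scanner plus extra state. */
struct hotplug_scanner {
	struct scanner scan;
	const char *to;		/* stands in for the destination cgroup */
};

static void move_task(struct scanner *scan, int task_id)
{
	/* Recover the wrapper, as cpuset_do_move_task() does. */
	struct hotplug_scanner *hs =
		container_of(scan, struct hotplug_scanner, scan);

	printf("task %d -> %s\n", task_id, hs->to);
}

int main(void)
{
	struct hotplug_scanner hs = {
		.scan.process = move_task,
		.to = "parent cpuset",
	};
	struct scanner *scan = &hs.scan;	/* the walker sees only this */
	int t;

	for (t = 100; t < 103; t++)
		scan->process(scan, t);		/* callback recovers the rest */
	return 0;
}

cpuset_do_move_task() in the diff below performs exactly this recovery before calling cgroup_attach_task(chsp->to, tsk).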
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..d94a8f7c4c29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,8 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 #include <linux/kfifo.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
 
 /*
  * Tracks how many cpusets are currently defined in system.
@@ -96,6 +98,9 @@ struct cpuset {
 
 	/* partition number for rebuild_sched_domains() */
 	int pn;
+
+	/* used for walking a cpuset heirarchy */
+	struct list_head stack_list;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 	return container_of(task_subsys_state(task, cpuset_subsys_id),
 			    struct cpuset, css);
 }
-
+struct cpuset_hotplug_scanner {
+	struct cgroup_scanner scan;
+	struct cgroup *to;
+};
 
 /* bits in struct cpuset flags field */
 typedef enum {
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void)
 	return 0;
 }
 
+/**
+ * cpuset_do_move_task - move a given task to another cpuset
+ * @tsk: pointer to task_struct the task to move
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ * Return nonzero to stop the walk through the tasks.
+ */
+void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+{
+	struct cpuset_hotplug_scanner *chsp;
+
+	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
+	cgroup_attach_task(chsp->to, tsk);
+}
+
+/**
+ * move_member_tasks_to_cpuset - move tasks from one cpuset to another
+ * @from: cpuset in which the tasks currently reside
+ * @to: cpuset to which the tasks will be moved
+ *
+ * Called with manage_sem held
+ * callback_mutex must not be held, as attach_task() will take it.
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+	struct cpuset_hotplug_scanner scan;
+
+	scan.scan.cg = from->css.cgroup;
+	scan.scan.test_task = NULL; /* select all tasks in cgroup */
+	scan.scan.process_task = cpuset_do_move_task;
+	scan.scan.heap = NULL;
+	scan.to = to->css.cgroup;
+
+	if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+		printk(KERN_ERR "move_member_tasks_to_cpuset: "
+				"cgroup_scan_tasks failed\n");
+}
+
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes. Cpusets that were already empty of
- * CPUs or nodes are left empty.
- *
- * This routine is intentionally inefficient in a couple of regards.
- * It will check all cpusets in a subtree even if the top cpuset of
- * the subtree has no offline CPUs or nodes. It checks both CPUs and
- * nodes, even though the caller could have been coded to know that
- * only one of CPUs or nodes needed to be checked on a given call.
- * This was done to minimize text size rather than cpu cycles.
- *
- * Call with both manage_mutex and callback_mutex held.
- *
- * Recursive, on depth of cpuset subtree.
- */
-
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ *
+ * The parent cpuset has some superset of the 'mems' nodes that the
+ * newly empty cpuset held, so no migration of memory is necessary.
+ *
+ * Called with both manage_sem and callback_sem held
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+	struct cpuset *parent;
+
+	/* the cgroup's css_sets list is in use if there are tasks
+	   in the cpuset; the list is empty if there are none;
+	   the cs->css.refcnt seems always 0 */
+	if (list_empty(&cs->css.cgroup->css_sets))
+		return;
+
+	/*
+	 * Find its next-highest non-empty parent, (top cpuset
+	 * has online cpus, so can't be empty).
+	 */
+	parent = cs->parent;
+	while (cpus_empty(parent->cpus_allowed)) {
+		/*
+		 * this empty cpuset should now be considered to
+		 * have been used, and therefore eligible for
+		 * release when empty (if it is notify_on_release)
+		 */
+		parent = parent->parent;
+	}
+
+	move_member_tasks_to_cpuset(cs, parent);
+}
+
+/*
+ * Walk the specified cpuset subtree and look for empty cpusets.
+ * The tasks of such cpuset must be moved to a parent cpuset.
+ *
+ * Note that such a notify_on_release cpuset must have had, at some time,
+ * member tasks or cpuset descendants and cpus and memory, before it can
+ * be a candidate for release.
+ *
+ * Called with manage_mutex held. We take callback_mutex to modify
+ * cpus_allowed and mems_allowed.
+ *
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next. It always processes a node before
+ * any of its children.
+ *
+ * For now, since we lack memory hot unplug, we'll never see a cpuset
+ * that has tasks along with an empty 'mems'. But if we did see such
+ * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ */
+static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+	struct cpuset *cp;	/* scans cpusets being updated */
+	struct cpuset *child;	/* scans child cpusets of cp */
+	struct list_head queue;
 	struct cgroup *cont;
-	struct cpuset *c;
 
-	/* Each of our child cpusets mems must be online */
-	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
-		c = cgroup_cs(cont);
-		guarantee_online_cpus_mems_in_subtree(c);
-		if (!cpus_empty(c->cpus_allowed))
-			guarantee_online_cpus(c, &c->cpus_allowed);
-		if (!nodes_empty(c->mems_allowed))
-			guarantee_online_mems(c, &c->mems_allowed);
+	INIT_LIST_HEAD(&queue);
+
+	list_add_tail((struct list_head *)&root->stack_list, &queue);
+
+	mutex_lock(&callback_mutex);
+	while (!list_empty(&queue)) {
+		cp = container_of(queue.next, struct cpuset, stack_list);
+		list_del(queue.next);
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			list_add_tail(&child->stack_list, &queue);
+		}
+		cont = cp->css.cgroup;
+		/* Remove offline cpus and mems from this cpuset. */
+		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+		nodes_and(cp->mems_allowed, cp->mems_allowed,
+						node_states[N_HIGH_MEMORY]);
+		if ((cpus_empty(cp->cpus_allowed) ||
+		     nodes_empty(cp->mems_allowed))) {
+			/* Move tasks from the empty cpuset to a parent */
+			mutex_unlock(&callback_mutex);
+			remove_tasks_in_empty_cpuset(cp);
+			mutex_lock(&callback_mutex);
+		}
 	}
+	mutex_unlock(&callback_mutex);
+	return;
 }
 
 /*
  * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
  * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug
- * event.
- *
- * To ensure that we don't remove a CPU or node from the top cpuset
- * that is currently in use by a child cpuset (which would violate
- * the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ * track what's online after any CPU or memory node hotplug or unplug event.
  *
  * Since there are two callers of this routine, one for CPU hotplug
  * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1845,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
 static void common_cpu_mem_hotplug_unplug(void)
 {
 	cgroup_lock();
-	mutex_lock(&callback_mutex);
 
-	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+	scan_for_empty_cpusets(&top_cpuset);
 
-	mutex_unlock(&callback_mutex);
 	cgroup_unlock();
 }
 
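scan_for_empty_cpusets() above replaces the old recursive subtree walk with an iterative top-down traversal: a FIFO queue is seeded with the root cpuset, and each dequeued cpuset enqueues its children before its own cpus and mems are trimmed, so every cpuset is processed before any of its descendants. A minimal userspace model of that traversal, using a fixed array in place of the kernel's stack_list/list_head queue (illustrative types and names):

#include <stdio.h>

#define MAX_NODES 16

struct toy_node {
	const char *name;
	struct toy_node *children[4];
	int nr_children;
};

static void walk_top_down(struct toy_node *root)
{
	struct toy_node *queue[MAX_NODES];
	int head = 0, tail = 0, i;

	queue[tail++] = root;			/* list_add_tail(root, &queue) */
	while (head < tail) {
		struct toy_node *n = queue[head++];	/* container_of() + list_del() */

		for (i = 0; i < n->nr_children; i++)
			queue[tail++] = n->children[i];	/* enqueue children */
		printf("visit %s\n", n->name);	/* trim cpus/mems, maybe now empty */
	}
}

int main(void)
{
	struct toy_node c = { .name = "top/a/c" }, b = { .name = "top/b" };
	struct toy_node a = { .name = "top/a", .children = { &c }, .nr_children = 1 };
	struct toy_node top = { .name = "top", .children = { &a, &b }, .nr_children = 2 };

	walk_top_down(&top);	/* prints: top, top/a, top/b, top/a/c */
	return 0;
}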