sched: replace migration_thread with cpu_stop

Currently migration_thread is serving three purposes - migration pusher, context to execute active_load_balance() and forced context switcher for expedited RCU synchronize_sched. All three roles are hardcoded into migration_thread() and determining which job is scheduled is slightly messy. This patch kills migration_thread and replaces all three uses with cpu_stop. The three different roles of migration_thread() are split into three separate cpu_stop callbacks - migration_cpu_stop(), active_load_balance_cpu_stop() and synchronize_sched_expedited_cpu_stop() - and each use case now simply asks cpu_stop to execute the callback as necessary. synchronize_sched_expedited() was implemented with private preallocated resources and custom multi-cpu queueing and waiting logic, both of which are provided by cpu_stop. synchronize_sched_expedited_count is made atomic and all other shared resources along with the mutex are dropped. synchronize_sched_expedited() also implemented a check to detect cases where not all the callbacks got executed on their assigned cpus and fall back to synchronize_sched(). If called with cpu hotplug blocked, cpu_stop already guarantees that and the condition cannot happen; otherwise, stop_machine() would break. However, this patch preserves the paranoid check using a cpumask to record on which cpus the stopper ran so that it can serve as a bisection point if something actually goes wrong there. Because the internal execution state is no longer visible, rcu_expedited_torture_stats() is removed. This patch also renames cpu_stop threads from "stopper/%d" to "migration/%d". The names of these threads ultimately don't matter and there's no reason to make unnecessary userland-visible changes. With this patch applied, stop_machine() and sched now share the same resources. stop_machine() is faster without wasting any resources and sched migration users are much cleaner.
Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Dipankar Sarma <dipankar@in.ibm.com> Cc: Josh Triplett <josh@freedesktop.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Dimitri Sivanich <sivanich@sgi.com>
author: Tejun Heo <tj@kernel.org> 2010-05-06 12:49:21 -0400
committer: Tejun Heo <tj@kernel.org> 2010-05-06 12:49:21 -0400
commit: 969c79215a35b06e5e3efe69b9412f858df7856c (patch)
tree: 4256378687c8bc2011ec35a3e28bc3b6473e912c /kernel/sched.c
parent: 3fc1f1e27a5b807791d72e5d992aa33b668a6626 (diff)
1 files changed, 91 insertions, 224 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 4956ed092838..f1d577a0a8ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
-#include <linux/kthread.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/stop_machine.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
@@ -539,15 +539,13 @@ struct rq {
        int post_schedule;
        int active_balance;
        int push_cpu;
+        struct cpu_stop_work active_balance_work;
        /* cpu of this runqueue: */
        int cpu;
        int online;
        unsigned long avg_load_per_task;
-        struct task_struct *migration_thread;
-        struct list_head migration_queue;
        u64 rt_avg;
        u64 age_stamp;
        u64 idle_stamp;
@@ -2037,21 +2035,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        __set_task_cpu(p, new_cpu);
 }
-struct migration_req {
+struct migration_arg {
-        struct list_head list;
        struct task_struct *task;
        int dest_cpu;
-        struct completion done;
 };
+static int migration_cpu_stop(void *data);
 /*
 * The task's runqueue lock must be held.
 * Returns true if you have to wait for migration thread.
 */
-static int
+static bool migrate_task(struct task_struct *p, int dest_cpu)
-migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 {
        struct rq *rq = task_rq(p);
@@ -2059,15 +2054,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
         * If the task is not on a runqueue (and not running), then
         * the next wake-up will properly place the task.
         */
-        if (!p->se.on_rq && !task_running(rq, p))
+        return p->se.on_rq || task_running(rq, p);
-                return 0;
-        init_completion(&req->done);
-        req->task = p;
-        req->dest_cpu = dest_cpu;
-        list_add(&req->list, &rq->migration_queue);
-        return 1;
 }
 /*
@@ -3110,7 +3097,6 @@ static void update_cpu_load(struct rq *this_rq)
 void sched_exec(void)
 {
        struct task_struct *p = current;
-        struct migration_req req;
        unsigned long flags;
        struct rq *rq;
        int dest_cpu;
@@ -3124,17 +3110,11 @@ void sched_exec(void)
         * select_task_rq() can race against ->cpus_allowed
         */
        if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-            likely(cpu_active(dest_cpu)) &&
+            likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
-            migrate_task(p, dest_cpu, &req)) {
+                struct migration_arg arg = { p, dest_cpu };
-                /* Need to wait for migration thread (might exit: take ref). */
-                struct task_struct *mt = rq->migration_thread;
-                get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-                wake_up_process(mt);
+                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-                put_task_struct(mt);
-                wait_for_completion(&req.done);
                return;
        }
 unlock:
@@ -5290,17 +5270,15 @@ static inline void sched_init_granularity(void)
 /*
 * This is how migration works:
 *
- * 1) we queue a struct migration_req structure in the source CPU's
+ * 1) we invoke migration_cpu_stop() on the target CPU using
- *    runqueue and wake up that CPU's migration thread.
+ *    stop_one_cpu().
- * 2) we down() the locked semaphore => thread blocks.
+ * 2) stopper starts to run (implicitly forcing the migrated thread
- * 3) migration thread wakes up (implicitly it forces the migrated
+ *    off the CPU)
- *    thread off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) it gets the migration request and checks whether the migrated
+ * 4) if it's in the wrong runqueue then the migration thread removes
- *    task is still in the wrong runqueue.
- * 5) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
- * 6) migration thread up()s the semaphore.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
- * 7) we wake up and the migration is done.
+ *    is done.
 */
 /*
@@ -5314,9 +5292,9 @@ static inline void sched_init_granularity(void)
 */
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-        struct migration_req req;
        unsigned long flags;
        struct rq *rq;
+        unsigned int dest_cpu;
        int ret = 0;
        /*
@@ -5354,15 +5332,12 @@ again:
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
-        if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
+        dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+        if (migrate_task(p, dest_cpu)) {
+                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
-                struct task_struct *mt = rq->migration_thread;
-                get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-                wake_up_process(mt);
+                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-                put_task_struct(mt);
-                wait_for_completion(&req.done);
                tlb_migrate_finish(p->mm);
                return 0;
        }
@@ -5420,70 +5395,22 @@ fail:
        return ret;
 }
-#define RCU_MIGRATION_IDLE      0
-#define RCU_MIGRATION_NEED_QS   1
-#define RCU_MIGRATION_GOT_QS    2
-#define RCU_MIGRATION_MUST_SYNC 3
 /*
- * migration_thread - this is a highprio system thread that performs
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
- * thread migration by bumping thread off CPU then 'pushing' onto
+ * and performs thread migration by bumping thread off CPU then
- * another runqueue.
+ * 'pushing' onto another runqueue.
 */
-static int migration_thread(void *data)
+static int migration_cpu_stop(void *data)
 {
-        int badcpu;
+        struct migration_arg *arg = data;
-        int cpu = (long)data;
-        struct rq *rq;
-        rq = cpu_rq(cpu);
-        BUG_ON(rq->migration_thread != current);
-        set_current_state(TASK_INTERRUPTIBLE);
-        while (!kthread_should_stop()) {
-                struct migration_req *req;
-                struct list_head *head;
-                raw_spin_lock_irq(&rq->lock);
-                if (cpu_is_offline(cpu)) {
-                        raw_spin_unlock_irq(&rq->lock);
-                        break;
-                }
-                if (rq->active_balance) {
-                        active_load_balance(rq, cpu);
-                        rq->active_balance = 0;
-                }
-                head = &rq->migration_queue;
-                if (list_empty(head)) {
-                        raw_spin_unlock_irq(&rq->lock);
-                        schedule();
-                        set_current_state(TASK_INTERRUPTIBLE);
-                        continue;
-                }
-                req = list_entry(head->next, struct migration_req, list);
-                list_del_init(head->next);
-                if (req->task != NULL) {
-                        raw_spin_unlock(&rq->lock);
-                        __migrate_task(req->task, cpu, req->dest_cpu);
-                } else if (likely(cpu == (badcpu = smp_processor_id()))) {
-                        req->dest_cpu = RCU_MIGRATION_GOT_QS;
-                        raw_spin_unlock(&rq->lock);
-                } else {
-                        req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
-                        raw_spin_unlock(&rq->lock);
-                        WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
-                }
-                local_irq_enable();
-                complete(&req->done);
-        }
-        __set_current_state(TASK_RUNNING);
+        /*
+         * The original target cpu might have gone down and we might
+         * be on another cpu but it doesn't matter.
+         */
+        local_irq_disable();
+        __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+        local_irq_enable();
        return 0;
 }
@@ -5850,35 +5777,20 @@ static void set_rq_offline(struct rq *rq)
 static int __cpuinit
 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
-        struct task_struct *p;
        int cpu = (long)hcpu;
        unsigned long flags;
-        struct rq *rq;
+        struct rq *rq = cpu_rq(cpu);
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-                p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
-                if (IS_ERR(p))
-                        return NOTIFY_BAD;
-                kthread_bind(p, cpu);
-                /* Must be high prio: stop_machine expects to yield to it. */
-                rq = task_rq_lock(p, &flags);
-                __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-                task_rq_unlock(rq, &flags);
-                get_task_struct(p);
-                cpu_rq(cpu)->migration_thread = p;
                rq->calc_load_update = calc_load_update;
                break;
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-                /* Strictly unnecessary, as first user will wake it. */
-                wake_up_process(cpu_rq(cpu)->migration_thread);
                /* Update our root-domain */
-                rq = cpu_rq(cpu);
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5889,25 +5801,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 #ifdef CONFIG_HOTPLUG_CPU
-        case CPU_UP_CANCELED:
-        case CPU_UP_CANCELED_FROZEN:
-                if (!cpu_rq(cpu)->migration_thread)
-                        break;
-                /* Unbind it from offline cpu so it can run. Fall thru. */
-                kthread_bind(cpu_rq(cpu)->migration_thread,
-                             cpumask_any(cpu_online_mask));
-                kthread_stop(cpu_rq(cpu)->migration_thread);
-                put_task_struct(cpu_rq(cpu)->migration_thread);
-                cpu_rq(cpu)->migration_thread = NULL;
-                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                migrate_live_tasks(cpu);
-                rq = cpu_rq(cpu);
-                kthread_stop(rq->migration_thread);
-                put_task_struct(rq->migration_thread);
-                rq->migration_thread = NULL;
                /* Idle task back to normal (off runqueue, low prio) */
                raw_spin_lock_irq(&rq->lock);
                deactivate_task(rq, rq->idle, 0);
@@ -5918,29 +5814,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                migrate_nr_uninterruptible(rq);
                BUG_ON(rq->nr_running != 0);
                calc_global_load_remove(rq);
-                /*
-                 * No need to migrate the tasks: it was best-effort if
-                 * they didn't take sched_hotcpu_mutex. Just wake up
-                 * the requestors.
-                 */
-                raw_spin_lock_irq(&rq->lock);
-                while (!list_empty(&rq->migration_queue)) {
-                        struct migration_req *req;
-                        req = list_entry(rq->migration_queue.next,
-                                         struct migration_req, list);
-                        list_del_init(&req->list);
-                        raw_spin_unlock_irq(&rq->lock);
-                        complete(&req->done);
-                        raw_spin_lock_irq(&rq->lock);
-                }
-                raw_spin_unlock_irq(&rq->lock);
                break;
        case CPU_DYING:
        case CPU_DYING_FROZEN:
                /* Update our root-domain */
-                rq = cpu_rq(cpu);
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7757,10 +7635,8 @@ void __init sched_init(void)
                rq->push_cpu = 0;
                rq->cpu = i;
                rq->online = 0;
-                rq->migration_thread = NULL;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
-                INIT_LIST_HEAD(&rq->migration_queue);
                rq_attach_root(rq, &def_root_domain);
 #endif
                init_rq_hrtick(rq);
@@ -9054,43 +8930,39 @@ struct cgroup_subsys cpuacct_subsys = {
 #ifndef CONFIG_SMP
-int rcu_expedited_torture_stats(char *page)
-{
-        return 0;
-}
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
 void synchronize_sched_expedited(void)
 {
+        /*
+         * There must be a full memory barrier on each affected CPU
+         * between the time that try_stop_cpus() is called and the
+         * time that it returns.
+         *
+         * In the current initial implementation of cpu_stop, the
+         * above condition is already met when the control reaches
+         * this point and the following smp_mb() is not strictly
+         * necessary.  Do smp_mb() anyway for documentation and
+         * robustness against future implementation changes.
+         */
+        smp_mb();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 #else /* #ifndef CONFIG_SMP */
-static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
+static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-static DEFINE_MUTEX(rcu_sched_expedited_mutex);
-#define RCU_EXPEDITED_STATE_POST -2
-#define RCU_EXPEDITED_STATE_IDLE -1
-static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+static int synchronize_sched_expedited_cpu_stop(void *data)
-int rcu_expedited_torture_stats(char *page)
 {
-        int cnt = 0;
+        static DEFINE_SPINLOCK(done_mask_lock);
-        int cpu;
+        struct cpumask *done_mask = data;
-        cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
+        if (done_mask) {
-        for_each_online_cpu(cpu) {
+                spin_lock(&done_mask_lock);
-                 cnt += sprintf(&page[cnt], " %d:%d",
+                cpumask_set_cpu(smp_processor_id(), done_mask);
-                                cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
+                spin_unlock(&done_mask_lock);
        }
-        cnt += sprintf(&page[cnt], "\n");
+        return 0;
-        return cnt;
 }
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-static long synchronize_sched_expedited_count;
 /*
 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9104,60 +8976,55 @@ static long synchronize_sched_expedited_count;
 */
 void synchronize_sched_expedited(void)
 {
-        int cpu;
+        cpumask_var_t done_mask_var;
-        unsigned long flags;
+        struct cpumask *done_mask = NULL;
-        bool need_full_sync = 0;
+        int snap, trycount = 0;
-        struct rq *rq;
-        struct migration_req *req;
+        /*
-        long snap;
+         * done_mask is used to check that all cpus actually have
-        int trycount = 0;
+         * finished running the stopper, which is guaranteed by
+         * stop_cpus() if it's called with cpu hotplug blocked.  Keep
+         * the paranoia for now but it's best effort if cpumask is off
+         * stack.
+         */
+        if (zalloc_cpumask_var(&done_mask_var, GFP_ATOMIC))
+                done_mask = done_mask_var;
        smp_mb();  /* ensure prior mod happens before capturing snap. */
-        snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+        snap = atomic_read(&synchronize_sched_expedited_count) + 1;
        get_online_cpus();
-        while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+        while (try_stop_cpus(cpu_online_mask,
+                             synchronize_sched_expedited_cpu_stop,
+                             done_mask) == -EAGAIN) {
                put_online_cpus();
                if (trycount++ < 10)
                        udelay(trycount * num_online_cpus());
                else {
                        synchronize_sched();
-                        return;
+                        goto free_out;
                }
-                if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+                if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
                        smp_mb(); /* ensure test happens before caller kfree */
-                        return;
+                        goto free_out;
                }
                get_online_cpus();
        }
-        rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
+        atomic_inc(&synchronize_sched_expedited_count);
-        for_each_online_cpu(cpu) {
+        if (done_mask)
-                rq = cpu_rq(cpu);
+                cpumask_xor(done_mask, done_mask, cpu_online_mask);
-                req = &per_cpu(rcu_migration_req, cpu);
-                init_completion(&req->done);
-                req->task = NULL;
-                req->dest_cpu = RCU_MIGRATION_NEED_QS;
-                raw_spin_lock_irqsave(&rq->lock, flags);
-                list_add(&req->list, &rq->migration_queue);
-                raw_spin_unlock_irqrestore(&rq->lock, flags);
-                wake_up_process(rq->migration_thread);
-        }
-        for_each_online_cpu(cpu) {
-                rcu_expedited_state = cpu;
-                req = &per_cpu(rcu_migration_req, cpu);
-                rq = cpu_rq(cpu);
-                wait_for_completion(&req->done);
-                raw_spin_lock_irqsave(&rq->lock, flags);
-                if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
-                        need_full_sync = 1;
-                req->dest_cpu = RCU_MIGRATION_IDLE;
-                raw_spin_unlock_irqrestore(&rq->lock, flags);
-        }
-        rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
-        synchronize_sched_expedited_count++;
-        mutex_unlock(&rcu_sched_expedited_mutex);
        put_online_cpus();
-        if (need_full_sync)
+        /* paranoia - this can't happen */
+        if (done_mask && cpumask_weight(done_mask)) {
+                char buf[80];
+                cpulist_scnprintf(buf, sizeof(buf), done_mask);
+                WARN_ONCE(1, "synchronize_sched_expedited: cpu online and done masks disagree on %d cpus: %s\n",
+                          cpumask_weight(done_mask), buf);
                synchronize_sched();
+        }
+free_out:
+        free_cpumask_var(done_mask_var);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
author	Tejun Heo <tj@kernel.org>	2010-05-06 12:49:21 -0400
committer	Tejun Heo <tj@kernel.org>	2010-05-06 12:49:21 -0400
commit	969c79215a35b06e5e3efe69b9412f858df7856c (patch)
tree	4256378687c8bc2011ec35a3e28bc3b6473e912c /kernel/sched.c
parent	3fc1f1e27a5b807791d72e5d992aa33b668a6626 (diff)

diff --git a/kernel/sched.c b/kernel/sched.c index 4956ed092838..f1d577a0a8ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55	#include <linux/cpu.h>	55	#include <linux/cpu.h>
56	#include <linux/cpuset.h>	56	#include <linux/cpuset.h>
57	#include <linux/percpu.h>	57	#include <linux/percpu.h>
58	#include <linux/kthread.h>
59	#include <linux/proc_fs.h>	58	#include <linux/proc_fs.h>
60	#include <linux/seq_file.h>	59	#include <linux/seq_file.h>
		60	#include <linux/stop_machine.h>
61	#include <linux/sysctl.h>	61	#include <linux/sysctl.h>
62	#include <linux/syscalls.h>	62	#include <linux/syscalls.h>
63	#include <linux/times.h>	63	#include <linux/times.h>
@@ -539,15 +539,13 @@ struct rq {
539	int post_schedule;	539	int post_schedule;
540	int active_balance;	540	int active_balance;
541	int push_cpu;	541	int push_cpu;
		542	struct cpu_stop_work active_balance_work;
542	/* cpu of this runqueue: */	543	/* cpu of this runqueue: */
543	int cpu;	544	int cpu;
544	int online;	545	int online;
545		546
546	unsigned long avg_load_per_task;	547	unsigned long avg_load_per_task;
547		548
548	struct task_struct *migration_thread;
549	struct list_head migration_queue;
550
551	u64 rt_avg;	549	u64 rt_avg;
552	u64 age_stamp;	550	u64 age_stamp;
553	u64 idle_stamp;	551	u64 idle_stamp;
@@ -2037,21 +2035,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2037	__set_task_cpu(p, new_cpu);	2035	__set_task_cpu(p, new_cpu);
2038	}	2036	}
2039		2037
2040	struct migration_req {	2038	struct migration_arg {
2041	struct list_head list;
2042
2043	struct task_struct *task;	2039	struct task_struct *task;
2044	int dest_cpu;	2040	int dest_cpu;
2045
2046	struct completion done;
2047	};	2041	};
2048		2042
		2043	static int migration_cpu_stop(void *data);
		2044
2049	/*	2045	/*
2050	* The task's runqueue lock must be held.	2046	* The task's runqueue lock must be held.
2051	* Returns true if you have to wait for migration thread.	2047	* Returns true if you have to wait for migration thread.
2052	*/	2048	*/
2053	static int	2049	static bool migrate_task(struct task_struct *p, int dest_cpu)
2054	migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2055	{	2050	{
2056	struct rq *rq = task_rq(p);	2051	struct rq *rq = task_rq(p);
2057		2052
@@ -2059,15 +2054,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2059	* If the task is not on a runqueue (and not running), then	2054	* If the task is not on a runqueue (and not running), then
2060	* the next wake-up will properly place the task.	2055	* the next wake-up will properly place the task.
2061	*/	2056	*/
2062	if (!p->se.on_rq && !task_running(rq, p))	2057	return p->se.on_rq || task_running(rq, p);
2063	return 0;
2064
2065	init_completion(&req->done);
2066	req->task = p;
2067	req->dest_cpu = dest_cpu;
2068	list_add(&req->list, &rq->migration_queue);
2069
2070	return 1;
2071	}	2058	}
2072		2059
2073	/*	2060	/*
@@ -3110,7 +3097,6 @@ static void update_cpu_load(struct rq *this_rq)
3110	void sched_exec(void)	3097	void sched_exec(void)
3111	{	3098	{
3112	struct task_struct *p = current;	3099	struct task_struct *p = current;
3113	struct migration_req req;
3114	unsigned long flags;	3100	unsigned long flags;
3115	struct rq *rq;	3101	struct rq *rq;
3116	int dest_cpu;	3102	int dest_cpu;
@@ -3124,17 +3110,11 @@ void sched_exec(void)
3124	* select_task_rq() can race against ->cpus_allowed	3110	* select_task_rq() can race against ->cpus_allowed
3125	*/	3111	*/
3126	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&	3112	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3127	likely(cpu_active(dest_cpu)) &&	3113	likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3128	migrate_task(p, dest_cpu, &req)) {	3114	struct migration_arg arg = { p, dest_cpu };
3129	/* Need to wait for migration thread (might exit: take ref). */
3130	struct task_struct *mt = rq->migration_thread;
3131		3115
3132	get_task_struct(mt);
3133	task_rq_unlock(rq, &flags);	3116	task_rq_unlock(rq, &flags);
3134	wake_up_process(mt);	3117	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3135	put_task_struct(mt);
3136	wait_for_completion(&req.done);
3137
3138	return;	3118	return;
3139	}	3119	}
3140	unlock:	3120	unlock:
@@ -5290,17 +5270,15 @@ static inline void sched_init_granularity(void)
5290	/*	5270	/*
5291	* This is how migration works:	5271	* This is how migration works:
5292	*	5272	*
5293	* 1) we queue a struct migration_req structure in the source CPU's	5273	* 1) we invoke migration_cpu_stop() on the target CPU using
5294	* runqueue and wake up that CPU's migration thread.	5274	* stop_one_cpu().
5295	* 2) we down() the locked semaphore => thread blocks.	5275	* 2) stopper starts to run (implicitly forcing the migrated thread
5296	* 3) migration thread wakes up (implicitly it forces the migrated	5276	* off the CPU)
5297	* thread off the CPU)	5277	* 3) it checks whether the migrated task is still in the wrong runqueue.
5298	* 4) it gets the migration request and checks whether the migrated	5278	* 4) if it's in the wrong runqueue then the migration thread removes
5299	* task is still in the wrong runqueue.
5300	* 5) if it's in the wrong runqueue then the migration thread removes
5301	* it and puts it into the right queue.	5279	* it and puts it into the right queue.
5302	* 6) migration thread up()s the semaphore.	5280	* 5) stopper completes and stop_one_cpu() returns and the migration
5303	* 7) we wake up and the migration is done.	5281	* is done.
5304	*/	5282	*/
5305		5283
5306	/*	5284	/*
@@ -5314,9 +5292,9 @@ static inline void sched_init_granularity(void)
5314	*/	5292	*/
5315	int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)	5293	int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5316	{	5294	{
5317	struct migration_req req;
5318	unsigned long flags;	5295	unsigned long flags;
5319	struct rq *rq;	5296	struct rq *rq;
		5297	unsigned int dest_cpu;
5320	int ret = 0;	5298	int ret = 0;
5321		5299
5322	/*	5300	/*
@@ -5354,15 +5332,12 @@ again:
5354	if (cpumask_test_cpu(task_cpu(p), new_mask))	5332	if (cpumask_test_cpu(task_cpu(p), new_mask))
5355	goto out;	5333	goto out;
5356		5334
5357	if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {	5335	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
		5336	if (migrate_task(p, dest_cpu)) {
		5337	struct migration_arg arg = { p, dest_cpu };
5358	/* Need help from migration thread: drop lock and wait. */	5338	/* Need help from migration thread: drop lock and wait. */
5359	struct task_struct *mt = rq->migration_thread;
5360
5361	get_task_struct(mt);
5362	task_rq_unlock(rq, &flags);	5339	task_rq_unlock(rq, &flags);
5363	wake_up_process(mt);	5340	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5364	put_task_struct(mt);
5365	wait_for_completion(&req.done);
5366	tlb_migrate_finish(p->mm);	5341	tlb_migrate_finish(p->mm);
5367	return 0;	5342	return 0;
5368	}	5343	}
@@ -5420,70 +5395,22 @@ fail:
5420	return ret;	5395	return ret;
5421	}	5396	}
5422		5397
5423	#define RCU_MIGRATION_IDLE 0
5424	#define RCU_MIGRATION_NEED_QS 1
5425	#define RCU_MIGRATION_GOT_QS 2
5426	#define RCU_MIGRATION_MUST_SYNC 3
5427
5428	/*	5398	/*
5429	* migration_thread - this is a highprio system thread that performs	5399	* migration_cpu_stop - this will be executed by a highprio stopper thread
5430	* thread migration by bumping thread off CPU then 'pushing' onto	5400	* and performs thread migration by bumping thread off CPU then
5431	* another runqueue.	5401	* 'pushing' onto another runqueue.
5432	*/	5402	*/
5433	static int migration_thread(void *data)	5403	static int migration_cpu_stop(void *data)
5434	{	5404	{
5435	int badcpu;	5405	struct migration_arg *arg = data;
5436	int cpu = (long)data;
5437	struct rq *rq;
5438
5439	rq = cpu_rq(cpu);
5440	BUG_ON(rq->migration_thread != current);
5441
5442	set_current_state(TASK_INTERRUPTIBLE);
5443	while (!kthread_should_stop()) {
5444	struct migration_req *req;
5445	struct list_head *head;
5446
5447	raw_spin_lock_irq(&rq->lock);
5448
5449	if (cpu_is_offline(cpu)) {
5450	raw_spin_unlock_irq(&rq->lock);
5451	break;
5452	}
5453
5454	if (rq->active_balance) {
5455	active_load_balance(rq, cpu);
5456	rq->active_balance = 0;
5457	}
5458
5459	head = &rq->migration_queue;
5460
5461	if (list_empty(head)) {
5462	raw_spin_unlock_irq(&rq->lock);
5463	schedule();
5464	set_current_state(TASK_INTERRUPTIBLE);
5465	continue;
5466	}
5467	req = list_entry(head->next, struct migration_req, list);
5468	list_del_init(head->next);
5469
5470	if (req->task != NULL) {
5471	raw_spin_unlock(&rq->lock);
5472	__migrate_task(req->task, cpu, req->dest_cpu);
5473	} else if (likely(cpu == (badcpu = smp_processor_id()))) {
5474	req->dest_cpu = RCU_MIGRATION_GOT_QS;
5475	raw_spin_unlock(&rq->lock);
5476	} else {
5477	req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5478	raw_spin_unlock(&rq->lock);
5479	WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5480	}
5481	local_irq_enable();
5482
5483	complete(&req->done);
5484	}
5485	__set_current_state(TASK_RUNNING);
5486		5406
		5407	/*
		5408	* The original target cpu might have gone down and we might
		5409	* be on another cpu but it doesn't matter.
		5410	*/
		5411	local_irq_disable();
		5412	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
		5413	local_irq_enable();
5487	return 0;	5414	return 0;
5488	}	5415	}
5489		5416
@@ -5850,35 +5777,20 @@ static void set_rq_offline(struct rq *rq)
5850	static int __cpuinit	5777	static int __cpuinit
5851	migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)	5778	migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5852	{	5779	{
5853	struct task_struct *p;
5854	int cpu = (long)hcpu;	5780	int cpu = (long)hcpu;
5855	unsigned long flags;	5781	unsigned long flags;
5856	struct rq *rq;	5782	struct rq *rq = cpu_rq(cpu);
5857		5783
5858	switch (action) {	5784	switch (action) {
5859		5785
5860	case CPU_UP_PREPARE:	5786	case CPU_UP_PREPARE:
5861	case CPU_UP_PREPARE_FROZEN:	5787	case CPU_UP_PREPARE_FROZEN:
5862	p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5863	if (IS_ERR(p))
5864	return NOTIFY_BAD;
5865	kthread_bind(p, cpu);
5866	/* Must be high prio: stop_machine expects to yield to it. */
5867	rq = task_rq_lock(p, &flags);
5868	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5869	task_rq_unlock(rq, &flags);
5870	get_task_struct(p);
5871	cpu_rq(cpu)->migration_thread = p;
5872	rq->calc_load_update = calc_load_update;	5788	rq->calc_load_update = calc_load_update;
5873	break;	5789	break;
5874		5790
5875	case CPU_ONLINE:	5791	case CPU_ONLINE:
5876	case CPU_ONLINE_FROZEN:	5792	case CPU_ONLINE_FROZEN:
5877	/* Strictly unnecessary, as first user will wake it. */
5878	wake_up_process(cpu_rq(cpu)->migration_thread);
5879
5880	/* Update our root-domain */	5793	/* Update our root-domain */
5881	rq = cpu_rq(cpu);
5882	raw_spin_lock_irqsave(&rq->lock, flags);	5794	raw_spin_lock_irqsave(&rq->lock, flags);
5883	if (rq->rd) {	5795	if (rq->rd) {
5884	BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));	5796	BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5889,25 +5801,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5889	break;	5801	break;
5890		5802
5891	#ifdef CONFIG_HOTPLUG_CPU	5803	#ifdef CONFIG_HOTPLUG_CPU
5892	case CPU_UP_CANCELED:
5893	case CPU_UP_CANCELED_FROZEN:
5894	if (!cpu_rq(cpu)->migration_thread)
5895	break;
5896	/* Unbind it from offline cpu so it can run. Fall thru. */
5897	kthread_bind(cpu_rq(cpu)->migration_thread,
5898	cpumask_any(cpu_online_mask));
5899	kthread_stop(cpu_rq(cpu)->migration_thread);
5900	put_task_struct(cpu_rq(cpu)->migration_thread);
5901	cpu_rq(cpu)->migration_thread = NULL;
5902	break;
5903
5904	case CPU_DEAD:	5804	case CPU_DEAD:
5905	case CPU_DEAD_FROZEN:	5805	case CPU_DEAD_FROZEN:
5906	migrate_live_tasks(cpu);	5806	migrate_live_tasks(cpu);
5907	rq = cpu_rq(cpu);
5908	kthread_stop(rq->migration_thread);
5909	put_task_struct(rq->migration_thread);
5910	rq->migration_thread = NULL;
5911	/* Idle task back to normal (off runqueue, low prio) */	5807	/* Idle task back to normal (off runqueue, low prio) */
5912	raw_spin_lock_irq(&rq->lock);	5808	raw_spin_lock_irq(&rq->lock);
5913	deactivate_task(rq, rq->idle, 0);	5809	deactivate_task(rq, rq->idle, 0);
@@ -5918,29 +5814,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5918	migrate_nr_uninterruptible(rq);	5814	migrate_nr_uninterruptible(rq);
5919	BUG_ON(rq->nr_running != 0);	5815	BUG_ON(rq->nr_running != 0);
5920	calc_global_load_remove(rq);	5816	calc_global_load_remove(rq);
5921	/*
5922	* No need to migrate the tasks: it was best-effort if
5923	* they didn't take sched_hotcpu_mutex. Just wake up
5924	* the requestors.
5925	*/
5926	raw_spin_lock_irq(&rq->lock);
5927	while (!list_empty(&rq->migration_queue)) {
5928	struct migration_req *req;
5929
5930	req = list_entry(rq->migration_queue.next,
5931	struct migration_req, list);
5932	list_del_init(&req->list);
5933	raw_spin_unlock_irq(&rq->lock);
5934	complete(&req->done);
5935	raw_spin_lock_irq(&rq->lock);
5936	}
5937	raw_spin_unlock_irq(&rq->lock);
5938	break;	5817	break;
5939		5818
5940	case CPU_DYING:	5819	case CPU_DYING:
5941	case CPU_DYING_FROZEN:	5820	case CPU_DYING_FROZEN:
5942	/* Update our root-domain */	5821	/* Update our root-domain */
5943	rq = cpu_rq(cpu);
5944	raw_spin_lock_irqsave(&rq->lock, flags);	5822	raw_spin_lock_irqsave(&rq->lock, flags);
5945	if (rq->rd) {	5823	if (rq->rd) {
5946	BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));	5824	BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7757,10 +7635,8 @@ void __init sched_init(void)
7757	rq->push_cpu = 0;	7635	rq->push_cpu = 0;
7758	rq->cpu = i;	7636	rq->cpu = i;
7759	rq->online = 0;	7637	rq->online = 0;
7760	rq->migration_thread = NULL;
7761	rq->idle_stamp = 0;	7638	rq->idle_stamp = 0;
7762	rq->avg_idle = 2*sysctl_sched_migration_cost;	7639	rq->avg_idle = 2*sysctl_sched_migration_cost;
7763	INIT_LIST_HEAD(&rq->migration_queue);
7764	rq_attach_root(rq, &def_root_domain);	7640	rq_attach_root(rq, &def_root_domain);
7765	#endif	7641	#endif
7766	init_rq_hrtick(rq);	7642	init_rq_hrtick(rq);
@@ -9054,43 +8930,39 @@ struct cgroup_subsys cpuacct_subsys = {
9054		8930
9055	#ifndef CONFIG_SMP	8931	#ifndef CONFIG_SMP
9056		8932
9057	int rcu_expedited_torture_stats(char *page)
9058	{
9059	return 0;
9060	}
9061	EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9062
9063	void synchronize_sched_expedited(void)	8933	void synchronize_sched_expedited(void)
9064	{	8934	{
		8935	/*
		8936	* There must be a full memory barrier on each affected CPU
		8937	* between the time that try_stop_cpus() is called and the
		8938	* time that it returns.
		8939	*
		8940	* In the current initial implementation of cpu_stop, the
		8941	* above condition is already met when the control reaches
		8942	* this point and the following smp_mb() is not strictly
		8943	* necessary. Do smp_mb() anyway for documentation and
		8944	* robustness against future implementation changes.
		8945	*/
		8946	smp_mb();
9065	}	8947	}
9066	EXPORT_SYMBOL_GPL(synchronize_sched_expedited);	8948	EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9067		8949
9068	#else /* #ifndef CONFIG_SMP */	8950	#else /* #ifndef CONFIG_SMP */
9069		8951
9070	static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);	8952	static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9071	static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9072
9073	#define RCU_EXPEDITED_STATE_POST -2
9074	#define RCU_EXPEDITED_STATE_IDLE -1
9075		8953
9076	static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;	8954	static int synchronize_sched_expedited_cpu_stop(void *data)
9077
9078	int rcu_expedited_torture_stats(char *page)
9079	{	8955	{
9080	int cnt = 0;	8956	static DEFINE_SPINLOCK(done_mask_lock);
9081	int cpu;	8957	struct cpumask *done_mask = data;
9082		8958
9083	cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);	8959	if (done_mask) {
9084	for_each_online_cpu(cpu) {	8960	spin_lock(&done_mask_lock);
9085	cnt += sprintf(&page[cnt], " %d:%d",	8961	cpumask_set_cpu(smp_processor_id(), done_mask);
9086	cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);	8962	spin_unlock(&done_mask_lock);
9087	}	8963	}
9088	cnt += sprintf(&page[cnt], "\n");	8964	return 0;
9089	return cnt;
9090	}	8965	}
9091	EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9092
9093	static long synchronize_sched_expedited_count;
9094		8966
9095	/*	8967	/*
9096	* Wait for an rcu-sched grace period to elapse, but use "big hammer"	8968	* Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9104,60 +8976,55 @@ static long synchronize_sched_expedited_count;
9104	*/	8976	*/
9105	void synchronize_sched_expedited(void)	8977	void synchronize_sched_expedited(void)
9106	{	8978	{
9107	int cpu;	8979	cpumask_var_t done_mask_var;
9108	unsigned long flags;	8980	struct cpumask *done_mask = NULL;
9109	bool need_full_sync = 0;	8981	int snap, trycount = 0;
9110	struct rq *rq;	8982
9111	struct migration_req *req;	8983	/*
9112	long snap;	8984	* done_mask is used to check that all cpus actually have
9113	int trycount = 0;	8985	* finished running the stopper, which is guaranteed by
		8986	* stop_cpus() if it's called with cpu hotplug blocked. Keep
		8987	* the paranoia for now but it's best effort if cpumask is off
		8988	* stack.
		8989	*/
		8990	if (zalloc_cpumask_var(&done_mask_var, GFP_ATOMIC))
		8991	done_mask = done_mask_var;
9114		8992
9115	smp_mb(); /* ensure prior mod happens before capturing snap. */	8993	smp_mb(); /* ensure prior mod happens before capturing snap. */
9116	snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;	8994	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9117	get_online_cpus();	8995	get_online_cpus();
9118	while (!mutex_trylock(&rcu_sched_expedited_mutex)) {	8996	while (try_stop_cpus(cpu_online_mask,
		8997	synchronize_sched_expedited_cpu_stop,
		8998	done_mask) == -EAGAIN) {
9119	put_online_cpus();	8999	put_online_cpus();
9120	if (trycount++ < 10)	9000	if (trycount++ < 10)
9121	udelay(trycount * num_online_cpus());	9001	udelay(trycount * num_online_cpus());
9122	else {	9002	else {
9123	synchronize_sched();	9003	synchronize_sched();
9124	return;	9004	goto free_out;
9125	}	9005	}
9126	if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {	9006	if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9127	smp_mb(); /* ensure test happens before caller kfree */	9007	smp_mb(); /* ensure test happens before caller kfree */
9128	return;	9008	goto free_out;
9129	}	9009	}
9130	get_online_cpus();	9010	get_online_cpus();
9131	}	9011	}
9132	rcu_expedited_state = RCU_EXPEDITED_STATE_POST;	9012	atomic_inc(&synchronize_sched_expedited_count);
9133	for_each_online_cpu(cpu) {	9013	if (done_mask)
9134	rq = cpu_rq(cpu);	9014	cpumask_xor(done_mask, done_mask, cpu_online_mask);
9135	req = &per_cpu(rcu_migration_req, cpu);
9136	init_completion(&req->done);
9137	req->task = NULL;
9138	req->dest_cpu = RCU_MIGRATION_NEED_QS;
9139	raw_spin_lock_irqsave(&rq->lock, flags);
9140	list_add(&req->list, &rq->migration_queue);
9141	raw_spin_unlock_irqrestore(&rq->lock, flags);
9142	wake_up_process(rq->migration_thread);
9143	}
9144	for_each_online_cpu(cpu) {
9145	rcu_expedited_state = cpu;
9146	req = &per_cpu(rcu_migration_req, cpu);
9147	rq = cpu_rq(cpu);
9148	wait_for_completion(&req->done);
9149	raw_spin_lock_irqsave(&rq->lock, flags);
9150	if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9151	need_full_sync = 1;
9152	req->dest_cpu = RCU_MIGRATION_IDLE;
9153	raw_spin_unlock_irqrestore(&rq->lock, flags);
9154	}
9155	rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9156	synchronize_sched_expedited_count++;
9157	mutex_unlock(&rcu_sched_expedited_mutex);
9158	put_online_cpus();	9015	put_online_cpus();
9159	if (need_full_sync)	9016
		9017	/* paranoia - this can't happen */
		9018	if (done_mask && cpumask_weight(done_mask)) {
		9019	char buf[80];
		9020
		9021	cpulist_scnprintf(buf, sizeof(buf), done_mask);
		9022	WARN_ONCE(1, "synchronize_sched_expedited: cpu online and done masks disagree on %d cpus: %s\n",
		9023	cpumask_weight(done_mask), buf);
9160	synchronize_sched();	9024	synchronize_sched();
		9025	}
		9026	free_out:
		9027	free_cpumask_var(done_mask_var);
9161	}	9028	}
9162	EXPORT_SYMBOL_GPL(synchronize_sched_expedited);	9029	EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9163		9030