diff options
author | Tejun Heo <tj@kernel.org> | 2010-05-06 12:49:21 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2010-05-06 12:49:21 -0400 |
commit | 969c79215a35b06e5e3efe69b9412f858df7856c (patch) | |
tree | 4256378687c8bc2011ec35a3e28bc3b6473e912c /kernel | |
parent | 3fc1f1e27a5b807791d72e5d992aa33b668a6626 (diff) |
sched: replace migration_thread with cpu_stop
Currently migration_thread is serving three purposes - migration
pusher, context to execute active_load_balance() and forced context
switcher for expedited RCU synchronize_sched. All three roles are
hardcoded into migration_thread() and determining which job is
scheduled is slightly messy.
This patch kills migration_thread and replaces all three uses with
cpu_stop. The three different roles of migration_thread() are
split into three separate cpu_stop callbacks -
migration_cpu_stop(), active_load_balance_cpu_stop() and
synchronize_sched_expedited_cpu_stop() - and each use case now simply
asks cpu_stop to execute the callback as necessary.
synchronize_sched_expedited() was implemented with private
preallocated resources and custom multi-cpu queueing and waiting
logic, both of which are provided by cpu_stop.
synchronize_sched_expedited_count is made atomic and all other shared
resources along with the mutex are dropped.
synchronize_sched_expedited() also implemented a check to detect cases
where not all the callbacks got executed on their assigned cpus and
fall back to synchronize_sched(). If called with cpu hotplug blocked,
cpu_stop already guarantees that and the condition cannot happen;
otherwise, stop_machine() would break. However, this patch preserves
the paranoid check using a cpumask to record on which cpus the stopper
ran so that it can serve as a bisection point if something actually
goes wrong there.
Because the internal execution state is no longer visible,
rcu_expedited_torture_stats() is removed.
This patch also renames cpu_stop threads from "stopper/%d" to
"migration/%d". The names of these threads ultimately don't matter
and there's no reason to make unnecessary userland visible changes.
With this patch applied, stop_machine() and sched now share the same
resources. stop_machine() is faster without wasting any resources and
sched migration users are much cleaner.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Josh Triplett <josh@freedesktop.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/rcutorture.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 315 | ||||
-rw-r--r-- | kernel/sched_fair.c | 48 | ||||
-rw-r--r-- | kernel/stop_machine.c | 2 |
4 files changed, 127 insertions, 240 deletions
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 58df55bf83ed..2b676f3a0f26 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = { | |||
669 | .sync = synchronize_sched_expedited, | 669 | .sync = synchronize_sched_expedited, |
670 | .cb_barrier = NULL, | 670 | .cb_barrier = NULL, |
671 | .fqs = rcu_sched_force_quiescent_state, | 671 | .fqs = rcu_sched_force_quiescent_state, |
672 | .stats = rcu_expedited_torture_stats, | 672 | .stats = NULL, |
673 | .irq_capable = 1, | 673 | .irq_capable = 1, |
674 | .name = "sched_expedited" | 674 | .name = "sched_expedited" |
675 | }; | 675 | }; |
diff --git a/kernel/sched.c b/kernel/sched.c index 4956ed092838..f1d577a0a8ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -55,9 +55,9 @@ | |||
55 | #include <linux/cpu.h> | 55 | #include <linux/cpu.h> |
56 | #include <linux/cpuset.h> | 56 | #include <linux/cpuset.h> |
57 | #include <linux/percpu.h> | 57 | #include <linux/percpu.h> |
58 | #include <linux/kthread.h> | ||
59 | #include <linux/proc_fs.h> | 58 | #include <linux/proc_fs.h> |
60 | #include <linux/seq_file.h> | 59 | #include <linux/seq_file.h> |
60 | #include <linux/stop_machine.h> | ||
61 | #include <linux/sysctl.h> | 61 | #include <linux/sysctl.h> |
62 | #include <linux/syscalls.h> | 62 | #include <linux/syscalls.h> |
63 | #include <linux/times.h> | 63 | #include <linux/times.h> |
@@ -539,15 +539,13 @@ struct rq { | |||
539 | int post_schedule; | 539 | int post_schedule; |
540 | int active_balance; | 540 | int active_balance; |
541 | int push_cpu; | 541 | int push_cpu; |
542 | struct cpu_stop_work active_balance_work; | ||
542 | /* cpu of this runqueue: */ | 543 | /* cpu of this runqueue: */ |
543 | int cpu; | 544 | int cpu; |
544 | int online; | 545 | int online; |
545 | 546 | ||
546 | unsigned long avg_load_per_task; | 547 | unsigned long avg_load_per_task; |
547 | 548 | ||
548 | struct task_struct *migration_thread; | ||
549 | struct list_head migration_queue; | ||
550 | |||
551 | u64 rt_avg; | 549 | u64 rt_avg; |
552 | u64 age_stamp; | 550 | u64 age_stamp; |
553 | u64 idle_stamp; | 551 | u64 idle_stamp; |
@@ -2037,21 +2035,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2037 | __set_task_cpu(p, new_cpu); | 2035 | __set_task_cpu(p, new_cpu); |
2038 | } | 2036 | } |
2039 | 2037 | ||
2040 | struct migration_req { | 2038 | struct migration_arg { |
2041 | struct list_head list; | ||
2042 | |||
2043 | struct task_struct *task; | 2039 | struct task_struct *task; |
2044 | int dest_cpu; | 2040 | int dest_cpu; |
2045 | |||
2046 | struct completion done; | ||
2047 | }; | 2041 | }; |
2048 | 2042 | ||
2043 | static int migration_cpu_stop(void *data); | ||
2044 | |||
2049 | /* | 2045 | /* |
2050 | * The task's runqueue lock must be held. | 2046 | * The task's runqueue lock must be held. |
2051 | * Returns true if you have to wait for migration thread. | 2047 | * Returns true if you have to wait for migration thread. |
2052 | */ | 2048 | */ |
2053 | static int | 2049 | static bool migrate_task(struct task_struct *p, int dest_cpu) |
2054 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | ||
2055 | { | 2050 | { |
2056 | struct rq *rq = task_rq(p); | 2051 | struct rq *rq = task_rq(p); |
2057 | 2052 | ||
@@ -2059,15 +2054,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
2059 | * If the task is not on a runqueue (and not running), then | 2054 | * If the task is not on a runqueue (and not running), then |
2060 | * the next wake-up will properly place the task. | 2055 | * the next wake-up will properly place the task. |
2061 | */ | 2056 | */ |
2062 | if (!p->se.on_rq && !task_running(rq, p)) | 2057 | return p->se.on_rq || task_running(rq, p); |
2063 | return 0; | ||
2064 | |||
2065 | init_completion(&req->done); | ||
2066 | req->task = p; | ||
2067 | req->dest_cpu = dest_cpu; | ||
2068 | list_add(&req->list, &rq->migration_queue); | ||
2069 | |||
2070 | return 1; | ||
2071 | } | 2058 | } |
2072 | 2059 | ||
2073 | /* | 2060 | /* |
@@ -3110,7 +3097,6 @@ static void update_cpu_load(struct rq *this_rq) | |||
3110 | void sched_exec(void) | 3097 | void sched_exec(void) |
3111 | { | 3098 | { |
3112 | struct task_struct *p = current; | 3099 | struct task_struct *p = current; |
3113 | struct migration_req req; | ||
3114 | unsigned long flags; | 3100 | unsigned long flags; |
3115 | struct rq *rq; | 3101 | struct rq *rq; |
3116 | int dest_cpu; | 3102 | int dest_cpu; |
@@ -3124,17 +3110,11 @@ void sched_exec(void) | |||
3124 | * select_task_rq() can race against ->cpus_allowed | 3110 | * select_task_rq() can race against ->cpus_allowed |
3125 | */ | 3111 | */ |
3126 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | 3112 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && |
3127 | likely(cpu_active(dest_cpu)) && | 3113 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { |
3128 | migrate_task(p, dest_cpu, &req)) { | 3114 | struct migration_arg arg = { p, dest_cpu }; |
3129 | /* Need to wait for migration thread (might exit: take ref). */ | ||
3130 | struct task_struct *mt = rq->migration_thread; | ||
3131 | 3115 | ||
3132 | get_task_struct(mt); | ||
3133 | task_rq_unlock(rq, &flags); | 3116 | task_rq_unlock(rq, &flags); |
3134 | wake_up_process(mt); | 3117 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
3135 | put_task_struct(mt); | ||
3136 | wait_for_completion(&req.done); | ||
3137 | |||
3138 | return; | 3118 | return; |
3139 | } | 3119 | } |
3140 | unlock: | 3120 | unlock: |
@@ -5290,17 +5270,15 @@ static inline void sched_init_granularity(void) | |||
5290 | /* | 5270 | /* |
5291 | * This is how migration works: | 5271 | * This is how migration works: |
5292 | * | 5272 | * |
5293 | * 1) we queue a struct migration_req structure in the source CPU's | 5273 | * 1) we invoke migration_cpu_stop() on the target CPU using |
5294 | * runqueue and wake up that CPU's migration thread. | 5274 | * stop_one_cpu(). |
5295 | * 2) we down() the locked semaphore => thread blocks. | 5275 | * 2) stopper starts to run (implicitly forcing the migrated thread |
5296 | * 3) migration thread wakes up (implicitly it forces the migrated | 5276 | * off the CPU) |
5297 | * thread off the CPU) | 5277 | * 3) it checks whether the migrated task is still in the wrong runqueue. |
5298 | * 4) it gets the migration request and checks whether the migrated | 5278 | * 4) if it's in the wrong runqueue then the migration thread removes |
5299 | * task is still in the wrong runqueue. | ||
5300 | * 5) if it's in the wrong runqueue then the migration thread removes | ||
5301 | * it and puts it into the right queue. | 5279 | * it and puts it into the right queue. |
5302 | * 6) migration thread up()s the semaphore. | 5280 | * 5) stopper completes and stop_one_cpu() returns and the migration |
5303 | * 7) we wake up and the migration is done. | 5281 | * is done. |
5304 | */ | 5282 | */ |
5305 | 5283 | ||
5306 | /* | 5284 | /* |
@@ -5314,9 +5292,9 @@ static inline void sched_init_granularity(void) | |||
5314 | */ | 5292 | */ |
5315 | int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | 5293 | int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) |
5316 | { | 5294 | { |
5317 | struct migration_req req; | ||
5318 | unsigned long flags; | 5295 | unsigned long flags; |
5319 | struct rq *rq; | 5296 | struct rq *rq; |
5297 | unsigned int dest_cpu; | ||
5320 | int ret = 0; | 5298 | int ret = 0; |
5321 | 5299 | ||
5322 | /* | 5300 | /* |
@@ -5354,15 +5332,12 @@ again: | |||
5354 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 5332 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
5355 | goto out; | 5333 | goto out; |
5356 | 5334 | ||
5357 | if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { | 5335 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5336 | if (migrate_task(p, dest_cpu)) { | ||
5337 | struct migration_arg arg = { p, dest_cpu }; | ||
5358 | /* Need help from migration thread: drop lock and wait. */ | 5338 | /* Need help from migration thread: drop lock and wait. */ |
5359 | struct task_struct *mt = rq->migration_thread; | ||
5360 | |||
5361 | get_task_struct(mt); | ||
5362 | task_rq_unlock(rq, &flags); | 5339 | task_rq_unlock(rq, &flags); |
5363 | wake_up_process(mt); | 5340 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
5364 | put_task_struct(mt); | ||
5365 | wait_for_completion(&req.done); | ||
5366 | tlb_migrate_finish(p->mm); | 5341 | tlb_migrate_finish(p->mm); |
5367 | return 0; | 5342 | return 0; |
5368 | } | 5343 | } |
@@ -5420,70 +5395,22 @@ fail: | |||
5420 | return ret; | 5395 | return ret; |
5421 | } | 5396 | } |
5422 | 5397 | ||
5423 | #define RCU_MIGRATION_IDLE 0 | ||
5424 | #define RCU_MIGRATION_NEED_QS 1 | ||
5425 | #define RCU_MIGRATION_GOT_QS 2 | ||
5426 | #define RCU_MIGRATION_MUST_SYNC 3 | ||
5427 | |||
5428 | /* | 5398 | /* |
5429 | * migration_thread - this is a highprio system thread that performs | 5399 | * migration_cpu_stop - this will be executed by a highprio stopper thread |
5430 | * thread migration by bumping thread off CPU then 'pushing' onto | 5400 | * and performs thread migration by bumping thread off CPU then |
5431 | * another runqueue. | 5401 | * 'pushing' onto another runqueue. |
5432 | */ | 5402 | */ |
5433 | static int migration_thread(void *data) | 5403 | static int migration_cpu_stop(void *data) |
5434 | { | 5404 | { |
5435 | int badcpu; | 5405 | struct migration_arg *arg = data; |
5436 | int cpu = (long)data; | ||
5437 | struct rq *rq; | ||
5438 | |||
5439 | rq = cpu_rq(cpu); | ||
5440 | BUG_ON(rq->migration_thread != current); | ||
5441 | |||
5442 | set_current_state(TASK_INTERRUPTIBLE); | ||
5443 | while (!kthread_should_stop()) { | ||
5444 | struct migration_req *req; | ||
5445 | struct list_head *head; | ||
5446 | |||
5447 | raw_spin_lock_irq(&rq->lock); | ||
5448 | |||
5449 | if (cpu_is_offline(cpu)) { | ||
5450 | raw_spin_unlock_irq(&rq->lock); | ||
5451 | break; | ||
5452 | } | ||
5453 | |||
5454 | if (rq->active_balance) { | ||
5455 | active_load_balance(rq, cpu); | ||
5456 | rq->active_balance = 0; | ||
5457 | } | ||
5458 | |||
5459 | head = &rq->migration_queue; | ||
5460 | |||
5461 | if (list_empty(head)) { | ||
5462 | raw_spin_unlock_irq(&rq->lock); | ||
5463 | schedule(); | ||
5464 | set_current_state(TASK_INTERRUPTIBLE); | ||
5465 | continue; | ||
5466 | } | ||
5467 | req = list_entry(head->next, struct migration_req, list); | ||
5468 | list_del_init(head->next); | ||
5469 | |||
5470 | if (req->task != NULL) { | ||
5471 | raw_spin_unlock(&rq->lock); | ||
5472 | __migrate_task(req->task, cpu, req->dest_cpu); | ||
5473 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { | ||
5474 | req->dest_cpu = RCU_MIGRATION_GOT_QS; | ||
5475 | raw_spin_unlock(&rq->lock); | ||
5476 | } else { | ||
5477 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; | ||
5478 | raw_spin_unlock(&rq->lock); | ||
5479 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); | ||
5480 | } | ||
5481 | local_irq_enable(); | ||
5482 | |||
5483 | complete(&req->done); | ||
5484 | } | ||
5485 | __set_current_state(TASK_RUNNING); | ||
5486 | 5406 | ||
5407 | /* | ||
5408 | * The original target cpu might have gone down and we might | ||
5409 | * be on another cpu but it doesn't matter. | ||
5410 | */ | ||
5411 | local_irq_disable(); | ||
5412 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); | ||
5413 | local_irq_enable(); | ||
5487 | return 0; | 5414 | return 0; |
5488 | } | 5415 | } |
5489 | 5416 | ||
@@ -5850,35 +5777,20 @@ static void set_rq_offline(struct rq *rq) | |||
5850 | static int __cpuinit | 5777 | static int __cpuinit |
5851 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | 5778 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
5852 | { | 5779 | { |
5853 | struct task_struct *p; | ||
5854 | int cpu = (long)hcpu; | 5780 | int cpu = (long)hcpu; |
5855 | unsigned long flags; | 5781 | unsigned long flags; |
5856 | struct rq *rq; | 5782 | struct rq *rq = cpu_rq(cpu); |
5857 | 5783 | ||
5858 | switch (action) { | 5784 | switch (action) { |
5859 | 5785 | ||
5860 | case CPU_UP_PREPARE: | 5786 | case CPU_UP_PREPARE: |
5861 | case CPU_UP_PREPARE_FROZEN: | 5787 | case CPU_UP_PREPARE_FROZEN: |
5862 | p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); | ||
5863 | if (IS_ERR(p)) | ||
5864 | return NOTIFY_BAD; | ||
5865 | kthread_bind(p, cpu); | ||
5866 | /* Must be high prio: stop_machine expects to yield to it. */ | ||
5867 | rq = task_rq_lock(p, &flags); | ||
5868 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
5869 | task_rq_unlock(rq, &flags); | ||
5870 | get_task_struct(p); | ||
5871 | cpu_rq(cpu)->migration_thread = p; | ||
5872 | rq->calc_load_update = calc_load_update; | 5788 | rq->calc_load_update = calc_load_update; |
5873 | break; | 5789 | break; |
5874 | 5790 | ||
5875 | case CPU_ONLINE: | 5791 | case CPU_ONLINE: |
5876 | case CPU_ONLINE_FROZEN: | 5792 | case CPU_ONLINE_FROZEN: |
5877 | /* Strictly unnecessary, as first user will wake it. */ | ||
5878 | wake_up_process(cpu_rq(cpu)->migration_thread); | ||
5879 | |||
5880 | /* Update our root-domain */ | 5793 | /* Update our root-domain */ |
5881 | rq = cpu_rq(cpu); | ||
5882 | raw_spin_lock_irqsave(&rq->lock, flags); | 5794 | raw_spin_lock_irqsave(&rq->lock, flags); |
5883 | if (rq->rd) { | 5795 | if (rq->rd) { |
5884 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 5796 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
@@ -5889,25 +5801,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5889 | break; | 5801 | break; |
5890 | 5802 | ||
5891 | #ifdef CONFIG_HOTPLUG_CPU | 5803 | #ifdef CONFIG_HOTPLUG_CPU |
5892 | case CPU_UP_CANCELED: | ||
5893 | case CPU_UP_CANCELED_FROZEN: | ||
5894 | if (!cpu_rq(cpu)->migration_thread) | ||
5895 | break; | ||
5896 | /* Unbind it from offline cpu so it can run. Fall thru. */ | ||
5897 | kthread_bind(cpu_rq(cpu)->migration_thread, | ||
5898 | cpumask_any(cpu_online_mask)); | ||
5899 | kthread_stop(cpu_rq(cpu)->migration_thread); | ||
5900 | put_task_struct(cpu_rq(cpu)->migration_thread); | ||
5901 | cpu_rq(cpu)->migration_thread = NULL; | ||
5902 | break; | ||
5903 | |||
5904 | case CPU_DEAD: | 5804 | case CPU_DEAD: |
5905 | case CPU_DEAD_FROZEN: | 5805 | case CPU_DEAD_FROZEN: |
5906 | migrate_live_tasks(cpu); | 5806 | migrate_live_tasks(cpu); |
5907 | rq = cpu_rq(cpu); | ||
5908 | kthread_stop(rq->migration_thread); | ||
5909 | put_task_struct(rq->migration_thread); | ||
5910 | rq->migration_thread = NULL; | ||
5911 | /* Idle task back to normal (off runqueue, low prio) */ | 5807 | /* Idle task back to normal (off runqueue, low prio) */ |
5912 | raw_spin_lock_irq(&rq->lock); | 5808 | raw_spin_lock_irq(&rq->lock); |
5913 | deactivate_task(rq, rq->idle, 0); | 5809 | deactivate_task(rq, rq->idle, 0); |
@@ -5918,29 +5814,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5918 | migrate_nr_uninterruptible(rq); | 5814 | migrate_nr_uninterruptible(rq); |
5919 | BUG_ON(rq->nr_running != 0); | 5815 | BUG_ON(rq->nr_running != 0); |
5920 | calc_global_load_remove(rq); | 5816 | calc_global_load_remove(rq); |
5921 | /* | ||
5922 | * No need to migrate the tasks: it was best-effort if | ||
5923 | * they didn't take sched_hotcpu_mutex. Just wake up | ||
5924 | * the requestors. | ||
5925 | */ | ||
5926 | raw_spin_lock_irq(&rq->lock); | ||
5927 | while (!list_empty(&rq->migration_queue)) { | ||
5928 | struct migration_req *req; | ||
5929 | |||
5930 | req = list_entry(rq->migration_queue.next, | ||
5931 | struct migration_req, list); | ||
5932 | list_del_init(&req->list); | ||
5933 | raw_spin_unlock_irq(&rq->lock); | ||
5934 | complete(&req->done); | ||
5935 | raw_spin_lock_irq(&rq->lock); | ||
5936 | } | ||
5937 | raw_spin_unlock_irq(&rq->lock); | ||
5938 | break; | 5817 | break; |
5939 | 5818 | ||
5940 | case CPU_DYING: | 5819 | case CPU_DYING: |
5941 | case CPU_DYING_FROZEN: | 5820 | case CPU_DYING_FROZEN: |
5942 | /* Update our root-domain */ | 5821 | /* Update our root-domain */ |
5943 | rq = cpu_rq(cpu); | ||
5944 | raw_spin_lock_irqsave(&rq->lock, flags); | 5822 | raw_spin_lock_irqsave(&rq->lock, flags); |
5945 | if (rq->rd) { | 5823 | if (rq->rd) { |
5946 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 5824 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
@@ -7757,10 +7635,8 @@ void __init sched_init(void) | |||
7757 | rq->push_cpu = 0; | 7635 | rq->push_cpu = 0; |
7758 | rq->cpu = i; | 7636 | rq->cpu = i; |
7759 | rq->online = 0; | 7637 | rq->online = 0; |
7760 | rq->migration_thread = NULL; | ||
7761 | rq->idle_stamp = 0; | 7638 | rq->idle_stamp = 0; |
7762 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 7639 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
7763 | INIT_LIST_HEAD(&rq->migration_queue); | ||
7764 | rq_attach_root(rq, &def_root_domain); | 7640 | rq_attach_root(rq, &def_root_domain); |
7765 | #endif | 7641 | #endif |
7766 | init_rq_hrtick(rq); | 7642 | init_rq_hrtick(rq); |
@@ -9054,43 +8930,39 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9054 | 8930 | ||
9055 | #ifndef CONFIG_SMP | 8931 | #ifndef CONFIG_SMP |
9056 | 8932 | ||
9057 | int rcu_expedited_torture_stats(char *page) | ||
9058 | { | ||
9059 | return 0; | ||
9060 | } | ||
9061 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
9062 | |||
9063 | void synchronize_sched_expedited(void) | 8933 | void synchronize_sched_expedited(void) |
9064 | { | 8934 | { |
8935 | /* | ||
8936 | * There must be a full memory barrier on each affected CPU | ||
8937 | * between the time that try_stop_cpus() is called and the | ||
8938 | * time that it returns. | ||
8939 | * | ||
8940 | * In the current initial implementation of cpu_stop, the | ||
8941 | * above condition is already met when the control reaches | ||
8942 | * this point and the following smp_mb() is not strictly | ||
8943 | * necessary. Do smp_mb() anyway for documentation and | ||
8944 | * robustness against future implementation changes. | ||
8945 | */ | ||
8946 | smp_mb(); | ||
9065 | } | 8947 | } |
9066 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | 8948 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); |
9067 | 8949 | ||
9068 | #else /* #ifndef CONFIG_SMP */ | 8950 | #else /* #ifndef CONFIG_SMP */ |
9069 | 8951 | ||
9070 | static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); | 8952 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); |
9071 | static DEFINE_MUTEX(rcu_sched_expedited_mutex); | ||
9072 | |||
9073 | #define RCU_EXPEDITED_STATE_POST -2 | ||
9074 | #define RCU_EXPEDITED_STATE_IDLE -1 | ||
9075 | 8953 | ||
9076 | static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | 8954 | static int synchronize_sched_expedited_cpu_stop(void *data) |
9077 | |||
9078 | int rcu_expedited_torture_stats(char *page) | ||
9079 | { | 8955 | { |
9080 | int cnt = 0; | 8956 | static DEFINE_SPINLOCK(done_mask_lock); |
9081 | int cpu; | 8957 | struct cpumask *done_mask = data; |
9082 | 8958 | ||
9083 | cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); | 8959 | if (done_mask) { |
9084 | for_each_online_cpu(cpu) { | 8960 | spin_lock(&done_mask_lock); |
9085 | cnt += sprintf(&page[cnt], " %d:%d", | 8961 | cpumask_set_cpu(smp_processor_id(), done_mask); |
9086 | cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); | 8962 | spin_unlock(&done_mask_lock); |
9087 | } | 8963 | } |
9088 | cnt += sprintf(&page[cnt], "\n"); | 8964 | return 0; |
9089 | return cnt; | ||
9090 | } | 8965 | } |
9091 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
9092 | |||
9093 | static long synchronize_sched_expedited_count; | ||
9094 | 8966 | ||
9095 | /* | 8967 | /* |
9096 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | 8968 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" |
@@ -9104,60 +8976,55 @@ static long synchronize_sched_expedited_count; | |||
9104 | */ | 8976 | */ |
9105 | void synchronize_sched_expedited(void) | 8977 | void synchronize_sched_expedited(void) |
9106 | { | 8978 | { |
9107 | int cpu; | 8979 | cpumask_var_t done_mask_var; |
9108 | unsigned long flags; | 8980 | struct cpumask *done_mask = NULL; |
9109 | bool need_full_sync = 0; | 8981 | int snap, trycount = 0; |
9110 | struct rq *rq; | 8982 | |
9111 | struct migration_req *req; | 8983 | /* |
9112 | long snap; | 8984 | * done_mask is used to check that all cpus actually have |
9113 | int trycount = 0; | 8985 | * finished running the stopper, which is guaranteed by |
8986 | * stop_cpus() if it's called with cpu hotplug blocked. Keep | ||
8987 | * the paranoia for now but it's best effort if cpumask is off | ||
8988 | * stack. | ||
8989 | */ | ||
8990 | if (zalloc_cpumask_var(&done_mask_var, GFP_ATOMIC)) | ||
8991 | done_mask = done_mask_var; | ||
9114 | 8992 | ||
9115 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | 8993 | smp_mb(); /* ensure prior mod happens before capturing snap. */ |
9116 | snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; | 8994 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; |
9117 | get_online_cpus(); | 8995 | get_online_cpus(); |
9118 | while (!mutex_trylock(&rcu_sched_expedited_mutex)) { | 8996 | while (try_stop_cpus(cpu_online_mask, |
8997 | synchronize_sched_expedited_cpu_stop, | ||
8998 | done_mask) == -EAGAIN) { | ||
9119 | put_online_cpus(); | 8999 | put_online_cpus(); |
9120 | if (trycount++ < 10) | 9000 | if (trycount++ < 10) |
9121 | udelay(trycount * num_online_cpus()); | 9001 | udelay(trycount * num_online_cpus()); |
9122 | else { | 9002 | else { |
9123 | synchronize_sched(); | 9003 | synchronize_sched(); |
9124 | return; | 9004 | goto free_out; |
9125 | } | 9005 | } |
9126 | if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { | 9006 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { |
9127 | smp_mb(); /* ensure test happens before caller kfree */ | 9007 | smp_mb(); /* ensure test happens before caller kfree */ |
9128 | return; | 9008 | goto free_out; |
9129 | } | 9009 | } |
9130 | get_online_cpus(); | 9010 | get_online_cpus(); |
9131 | } | 9011 | } |
9132 | rcu_expedited_state = RCU_EXPEDITED_STATE_POST; | 9012 | atomic_inc(&synchronize_sched_expedited_count); |
9133 | for_each_online_cpu(cpu) { | 9013 | if (done_mask) |
9134 | rq = cpu_rq(cpu); | 9014 | cpumask_xor(done_mask, done_mask, cpu_online_mask); |
9135 | req = &per_cpu(rcu_migration_req, cpu); | ||
9136 | init_completion(&req->done); | ||
9137 | req->task = NULL; | ||
9138 | req->dest_cpu = RCU_MIGRATION_NEED_QS; | ||
9139 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
9140 | list_add(&req->list, &rq->migration_queue); | ||
9141 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
9142 | wake_up_process(rq->migration_thread); | ||
9143 | } | ||
9144 | for_each_online_cpu(cpu) { | ||
9145 | rcu_expedited_state = cpu; | ||
9146 | req = &per_cpu(rcu_migration_req, cpu); | ||
9147 | rq = cpu_rq(cpu); | ||
9148 | wait_for_completion(&req->done); | ||
9149 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
9150 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) | ||
9151 | need_full_sync = 1; | ||
9152 | req->dest_cpu = RCU_MIGRATION_IDLE; | ||
9153 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
9154 | } | ||
9155 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
9156 | synchronize_sched_expedited_count++; | ||
9157 | mutex_unlock(&rcu_sched_expedited_mutex); | ||
9158 | put_online_cpus(); | 9015 | put_online_cpus(); |
9159 | if (need_full_sync) | 9016 | |
9017 | /* paranoia - this can't happen */ | ||
9018 | if (done_mask && cpumask_weight(done_mask)) { | ||
9019 | char buf[80]; | ||
9020 | |||
9021 | cpulist_scnprintf(buf, sizeof(buf), done_mask); | ||
9022 | WARN_ONCE(1, "synchronize_sched_expedited: cpu online and done masks disagree on %d cpus: %s\n", | ||
9023 | cpumask_weight(done_mask), buf); | ||
9160 | synchronize_sched(); | 9024 | synchronize_sched(); |
9025 | } | ||
9026 | free_out: | ||
9027 | free_cpumask_var(done_mask_var); | ||
9161 | } | 9028 | } |
9162 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | 9029 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); |
9163 | 9030 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cbd8b8a296d1..217e4a9393e4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -2798,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | |||
2798 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 2798 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
2799 | } | 2799 | } |
2800 | 2800 | ||
2801 | static int active_load_balance_cpu_stop(void *data); | ||
2802 | |||
2801 | /* | 2803 | /* |
2802 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2804 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2803 | * tasks if there is an imbalance. | 2805 | * tasks if there is an imbalance. |
@@ -2887,8 +2889,9 @@ redo: | |||
2887 | if (need_active_balance(sd, sd_idle, idle)) { | 2889 | if (need_active_balance(sd, sd_idle, idle)) { |
2888 | raw_spin_lock_irqsave(&busiest->lock, flags); | 2890 | raw_spin_lock_irqsave(&busiest->lock, flags); |
2889 | 2891 | ||
2890 | /* don't kick the migration_thread, if the curr | 2892 | /* don't kick the active_load_balance_cpu_stop, |
2891 | * task on busiest cpu can't be moved to this_cpu | 2893 | * if the curr task on busiest cpu can't be |
2894 | * moved to this_cpu | ||
2892 | */ | 2895 | */ |
2893 | if (!cpumask_test_cpu(this_cpu, | 2896 | if (!cpumask_test_cpu(this_cpu, |
2894 | &busiest->curr->cpus_allowed)) { | 2897 | &busiest->curr->cpus_allowed)) { |
@@ -2898,14 +2901,22 @@ redo: | |||
2898 | goto out_one_pinned; | 2901 | goto out_one_pinned; |
2899 | } | 2902 | } |
2900 | 2903 | ||
2904 | /* | ||
2905 | * ->active_balance synchronizes accesses to | ||
2906 | * ->active_balance_work. Once set, it's cleared | ||
2907 | * only after active load balance is finished. | ||
2908 | */ | ||
2901 | if (!busiest->active_balance) { | 2909 | if (!busiest->active_balance) { |
2902 | busiest->active_balance = 1; | 2910 | busiest->active_balance = 1; |
2903 | busiest->push_cpu = this_cpu; | 2911 | busiest->push_cpu = this_cpu; |
2904 | active_balance = 1; | 2912 | active_balance = 1; |
2905 | } | 2913 | } |
2906 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 2914 | raw_spin_unlock_irqrestore(&busiest->lock, flags); |
2915 | |||
2907 | if (active_balance) | 2916 | if (active_balance) |
2908 | wake_up_process(busiest->migration_thread); | 2917 | stop_one_cpu_nowait(cpu_of(busiest), |
2918 | active_load_balance_cpu_stop, busiest, | ||
2919 | &busiest->active_balance_work); | ||
2909 | 2920 | ||
2910 | /* | 2921 | /* |
2911 | * We've kicked active balancing, reset the failure | 2922 | * We've kicked active balancing, reset the failure |
@@ -3012,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3012 | } | 3023 | } |
3013 | 3024 | ||
3014 | /* | 3025 | /* |
3015 | * active_load_balance is run by migration threads. It pushes running tasks | 3026 | * active_load_balance_cpu_stop is run by cpu stopper. It pushes |
3016 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | 3027 | * running tasks off the busiest CPU onto idle CPUs. It requires at |
3017 | * running on each physical CPU where possible, and avoids physical / | 3028 | * least 1 task to be running on each physical CPU where possible, and |
3018 | * logical imbalances. | 3029 | * avoids physical / logical imbalances. |
3019 | * | ||
3020 | * Called with busiest_rq locked. | ||
3021 | */ | 3030 | */ |
3022 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | 3031 | static int active_load_balance_cpu_stop(void *data) |
3023 | { | 3032 | { |
3033 | struct rq *busiest_rq = data; | ||
3034 | int busiest_cpu = cpu_of(busiest_rq); | ||
3024 | int target_cpu = busiest_rq->push_cpu; | 3035 | int target_cpu = busiest_rq->push_cpu; |
3036 | struct rq *target_rq = cpu_rq(target_cpu); | ||
3025 | struct sched_domain *sd; | 3037 | struct sched_domain *sd; |
3026 | struct rq *target_rq; | 3038 | |
3039 | raw_spin_lock_irq(&busiest_rq->lock); | ||
3040 | |||
3041 | /* make sure the requested cpu hasn't gone down in the meantime */ | ||
3042 | if (unlikely(busiest_cpu != smp_processor_id() || | ||
3043 | !busiest_rq->active_balance)) | ||
3044 | goto out_unlock; | ||
3027 | 3045 | ||
3028 | /* Is there any task to move? */ | 3046 | /* Is there any task to move? */ |
3029 | if (busiest_rq->nr_running <= 1) | 3047 | if (busiest_rq->nr_running <= 1) |
3030 | return; | 3048 | goto out_unlock; |
3031 | |||
3032 | target_rq = cpu_rq(target_cpu); | ||
3033 | 3049 | ||
3034 | /* | 3050 | /* |
3035 | * This condition is "impossible", if it occurs | 3051 | * This condition is "impossible", if it occurs |
@@ -3058,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3058 | schedstat_inc(sd, alb_failed); | 3074 | schedstat_inc(sd, alb_failed); |
3059 | } | 3075 | } |
3060 | double_unlock_balance(busiest_rq, target_rq); | 3076 | double_unlock_balance(busiest_rq, target_rq); |
3077 | out_unlock: | ||
3078 | busiest_rq->active_balance = 0; | ||
3079 | raw_spin_unlock_irq(&busiest_rq->lock); | ||
3080 | return 0; | ||
3061 | } | 3081 | } |
3062 | 3082 | ||
3063 | #ifdef CONFIG_NO_HZ | 3083 | #ifdef CONFIG_NO_HZ |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 884c7a1afeed..5b20141a5ec1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -301,7 +301,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
301 | case CPU_UP_PREPARE: | 301 | case CPU_UP_PREPARE: |
302 | BUG_ON(stopper->thread || stopper->enabled || | 302 | BUG_ON(stopper->thread || stopper->enabled || |
303 | !list_empty(&stopper->works)); | 303 | !list_empty(&stopper->works)); |
304 | p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d", | 304 | p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", |
305 | cpu); | 305 | cpu); |
306 | if (IS_ERR(p)) | 306 | if (IS_ERR(p)) |
307 | return NOTIFY_BAD; | 307 | return NOTIFY_BAD; |