aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2010-05-08 12:11:19 -0400
committer Ingo Molnar <mingo@elte.hu> 2010-05-08 12:11:19 -0400
commit e7858f52a5cb868289a72264534a1f05f3340c6c (patch)
tree aa7308603cf30d8aec6e45ecaddc6c8ed29d2edb
parent 27a9da6538ee18046d7bff8e36a9f783542c54c3 (diff)
parent bbf1bb3eee86f2eef2baa14e600be454d09109ee (diff)
Merge branch 'cpu_stop' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into sched/core
-rw-r--r-- Documentation/RCU/torture.txt 10
-rw-r--r-- arch/s390/kernel/time.c 1
-rw-r--r-- drivers/xen/manage.c 14
-rw-r--r-- include/linux/rcutiny.h 2
-rw-r--r-- include/linux/rcutree.h 1
-rw-r--r-- include/linux/stop_machine.h 122
-rw-r--r-- kernel/Makefile 2
-rw-r--r-- kernel/cpu.c 8
-rw-r--r-- kernel/module.c 14
-rw-r--r-- kernel/rcutorture.c 2
-rw-r--r-- kernel/sched.c 285
-rw-r--r-- kernel/sched_fair.c 48
-rw-r--r-- kernel/stop_machine.c 534
13 files changed, 604 insertions, 439 deletions
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 0e50bc2aa1e2..5d9016795fd8 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
185 state: -1 / 0:0 3:0 4:0
186
187As before, the first four lines are similar to those for RCU.
188The last line shows the task-migration state. The first number is
189-1 if synchronize_sched_expedited() is idle, -2 if in the process of
190posting wakeups to the migration kthreads, and N when waiting on CPU N.
191Each of the colon-separated fields following the "/" is a CPU:state pair.
192Valid states are "0" for idle, "1" for waiting for quiescent state,
193"2" for passed through quiescent state, and "3" when a race with a
194CPU-hotplug event forces use of the synchronize_sched() primitive.
195 185
196 186
197USAGE 187USAGE
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index d906bf19c14a..a2163c95eb98 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -391,7 +391,6 @@ static void __init time_init_wq(void)
391 if (time_sync_wq) 391 if (time_sync_wq)
392 return; 392 return;
393 time_sync_wq = create_singlethread_workqueue("timesync"); 393 time_sync_wq = create_singlethread_workqueue("timesync");
394 stop_machine_create();
395} 394}
396 395
397/* 396/*
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 2ac4440e7b08..8943b8ccee1a 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -80,12 +80,6 @@ static void do_suspend(void)
80 80
81 shutting_down = SHUTDOWN_SUSPEND; 81 shutting_down = SHUTDOWN_SUSPEND;
82 82
83 err = stop_machine_create();
84 if (err) {
85 printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err);
86 goto out;
87 }
88
89#ifdef CONFIG_PREEMPT 83#ifdef CONFIG_PREEMPT
90 /* If the kernel is preemptible, we need to freeze all the processes 84 /* If the kernel is preemptible, we need to freeze all the processes
91 to prevent them from being in the middle of a pagetable update 85 to prevent them from being in the middle of a pagetable update
@@ -93,7 +87,7 @@ static void do_suspend(void)
93 err = freeze_processes(); 87 err = freeze_processes();
94 if (err) { 88 if (err) {
95 printk(KERN_ERR "xen suspend: freeze failed %d\n", err); 89 printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
96 goto out_destroy_sm; 90 goto out;
97 } 91 }
98#endif 92#endif
99 93
@@ -136,12 +130,8 @@ out_resume:
136out_thaw: 130out_thaw:
137#ifdef CONFIG_PREEMPT 131#ifdef CONFIG_PREEMPT
138 thaw_processes(); 132 thaw_processes();
139
140out_destroy_sm:
141#endif
142 stop_machine_destroy();
143
144out: 133out:
134#endif
145 shutting_down = SHUTDOWN_INVALID; 135 shutting_down = SHUTDOWN_INVALID;
146} 136}
147#endif /* CONFIG_PM_SLEEP */ 137#endif /* CONFIG_PM_SLEEP */
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index a5195875480a..0006b2df00e1 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void)
60 return 0; 60 return 0;
61} 61}
62 62
63extern int rcu_expedited_torture_stats(char *page);
64
65static inline void rcu_force_quiescent_state(void) 63static inline void rcu_force_quiescent_state(void)
66{ 64{
67} 65}
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 42cc3a04779e..24e467e526b8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,7 +35,6 @@ struct notifier_block;
35extern void rcu_sched_qs(int cpu); 35extern void rcu_sched_qs(int cpu);
36extern void rcu_bh_qs(int cpu); 36extern void rcu_bh_qs(int cpu);
37extern int rcu_needs_cpu(int cpu); 37extern int rcu_needs_cpu(int cpu);
38extern int rcu_expedited_torture_stats(char *page);
39 38
40#ifdef CONFIG_TREE_PREEMPT_RCU 39#ifdef CONFIG_TREE_PREEMPT_RCU
41 40
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index baba3a23a814..6b524a0d02e4 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -1,13 +1,101 @@
1#ifndef _LINUX_STOP_MACHINE 1#ifndef _LINUX_STOP_MACHINE
2#define _LINUX_STOP_MACHINE 2#define _LINUX_STOP_MACHINE
3/* "Bogolock": stop the entire machine, disable interrupts. This is a 3
4 very heavy lock, which is equivalent to grabbing every spinlock
5 (and more). So the "read" side to such a lock is anything which
6 disables preempt. */
7#include <linux/cpu.h> 4#include <linux/cpu.h>
8#include <linux/cpumask.h> 5#include <linux/cpumask.h>
6#include <linux/list.h>
9#include <asm/system.h> 7#include <asm/system.h>
10 8
9/*
10 * stop_cpu[s]() is a simplistic per-cpu maximum priority cpu
11 * monopolization mechanism. The caller can specify a non-sleeping
12 * function to be executed on a single or multiple cpus preempting all
13 * other processes and monopolizing those cpus until it finishes.
14 *
15 * Resources for this mechanism are preallocated when a cpu is brought
16 * up and requests are guaranteed to be served as long as the target
17 * cpus are online.
18 */
19typedef int (*cpu_stop_fn_t)(void *arg);
20
21#ifdef CONFIG_SMP
22
23struct cpu_stop_work {
24 struct list_head list; /* cpu_stopper->works */
25 cpu_stop_fn_t fn;
26 void *arg;
27 struct cpu_stop_done *done;
28};
29
30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
31void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
32 struct cpu_stop_work *work_buf);
33int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
34int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
35
36#else /* CONFIG_SMP */
37
38#include <linux/workqueue.h>
39
40struct cpu_stop_work {
41 struct work_struct work;
42 cpu_stop_fn_t fn;
43 void *arg;
44};
45
46static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
47{
48 int ret = -ENOENT;
49 preempt_disable();
50 if (cpu == smp_processor_id())
51 ret = fn(arg);
52 preempt_enable();
53 return ret;
54}
55
56static void stop_one_cpu_nowait_workfn(struct work_struct *work)
57{
58 struct cpu_stop_work *stwork =
59 container_of(work, struct cpu_stop_work, work);
60 preempt_disable();
61 stwork->fn(stwork->arg);
62 preempt_enable();
63}
64
65static inline void stop_one_cpu_nowait(unsigned int cpu,
66 cpu_stop_fn_t fn, void *arg,
67 struct cpu_stop_work *work_buf)
68{
69 if (cpu == smp_processor_id()) {
70 INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn);
71 work_buf->fn = fn;
72 work_buf->arg = arg;
73 schedule_work(&work_buf->work);
74 }
75}
76
77static inline int stop_cpus(const struct cpumask *cpumask,
78 cpu_stop_fn_t fn, void *arg)
79{
80 if (cpumask_test_cpu(raw_smp_processor_id(), cpumask))
81 return stop_one_cpu(raw_smp_processor_id(), fn, arg);
82 return -ENOENT;
83}
84
85static inline int try_stop_cpus(const struct cpumask *cpumask,
86 cpu_stop_fn_t fn, void *arg)
87{
88 return stop_cpus(cpumask, fn, arg);
89}
90
91#endif /* CONFIG_SMP */
92
93/*
94 * stop_machine "Bogolock": stop the entire machine, disable
95 * interrupts. This is a very heavy lock, which is equivalent to
96 * grabbing every spinlock (and more). So the "read" side to such a
97 * lock is anything which disables preempt.
98 */
11#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) 99#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
12 100
13/** 101/**
@@ -36,24 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
36 */ 124 */
37int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); 125int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
38 126
39/** 127#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */
40 * stop_machine_create: create all stop_machine threads
41 *
42 * Description: This causes all stop_machine threads to be created before
43 * stop_machine actually gets called. This can be used by subsystems that
44 * need a non failing stop_machine infrastructure.
45 */
46int stop_machine_create(void);
47
48/**
49 * stop_machine_destroy: destroy all stop_machine threads
50 *
51 * Description: This causes all stop_machine threads which were created with
52 * stop_machine_create to be destroyed again.
53 */
54void stop_machine_destroy(void);
55
56#else
57 128
58static inline int stop_machine(int (*fn)(void *), void *data, 129static inline int stop_machine(int (*fn)(void *), void *data,
59 const struct cpumask *cpus) 130 const struct cpumask *cpus)
@@ -65,8 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
65 return ret; 136 return ret;
66} 137}
67 138
68static inline int stop_machine_create(void) { return 0; } 139#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
69static inline void stop_machine_destroy(void) { } 140#endif /* _LINUX_STOP_MACHINE */
70
71#endif /* CONFIG_SMP */
72#endif /* _LINUX_STOP_MACHINE */
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b5..149e18ef1ab1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 68obj-$(CONFIG_PID_NS) += pid_namespace.o
69obj-$(CONFIG_IKCONFIG) += configs.o 69obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 914aedcde849..545777574779 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -266,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
266{ 266{
267 int err; 267 int err;
268 268
269 err = stop_machine_create();
270 if (err)
271 return err;
272 cpu_maps_update_begin(); 269 cpu_maps_update_begin();
273 270
274 if (cpu_hotplug_disabled) { 271 if (cpu_hotplug_disabled) {
@@ -280,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
280 277
281out: 278out:
282 cpu_maps_update_done(); 279 cpu_maps_update_done();
283 stop_machine_destroy();
284 return err; 280 return err;
285} 281}
286EXPORT_SYMBOL(cpu_down); 282EXPORT_SYMBOL(cpu_down);
@@ -361,9 +357,6 @@ int disable_nonboot_cpus(void)
361{ 357{
362 int cpu, first_cpu, error; 358 int cpu, first_cpu, error;
363 359
364 error = stop_machine_create();
365 if (error)
366 return error;
367 cpu_maps_update_begin(); 360 cpu_maps_update_begin();
368 first_cpu = cpumask_first(cpu_online_mask); 361 first_cpu = cpumask_first(cpu_online_mask);
369 /* 362 /*
@@ -394,7 +387,6 @@ int disable_nonboot_cpus(void)
394 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 387 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
395 } 388 }
396 cpu_maps_update_done(); 389 cpu_maps_update_done();
397 stop_machine_destroy();
398 return error; 390 return error;
399} 391}
400 392
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026a..0838246d8c94 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -723,16 +723,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
723 return -EFAULT; 723 return -EFAULT;
724 name[MODULE_NAME_LEN-1] = '\0'; 724 name[MODULE_NAME_LEN-1] = '\0';
725 725
726 /* Create stop_machine threads since free_module relies on 726 if (mutex_lock_interruptible(&module_mutex) != 0)
727 * a non-failing stop_machine call. */ 727 return -EINTR;
728 ret = stop_machine_create();
729 if (ret)
730 return ret;
731
732 if (mutex_lock_interruptible(&module_mutex) != 0) {
733 ret = -EINTR;
734 goto out_stop;
735 }
736 728
737 mod = find_module(name); 729 mod = find_module(name);
738 if (!mod) { 730 if (!mod) {
@@ -792,8 +784,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
792 784
793 out: 785 out:
794 mutex_unlock(&module_mutex); 786 mutex_unlock(&module_mutex);
795out_stop:
796 stop_machine_destroy();
797 return ret; 787 return ret;
798} 788}
799 789
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83ed..2b676f3a0f26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 671 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 672 .stats = NULL,
673 .irq_capable = 1, 673 .irq_capable = 1,
674 .name = "sched_expedited" 674 .name = "sched_expedited"
675}; 675};
diff --git a/kernel/sched.c b/kernel/sched.c
index 11ac0eb0bce7..39aa9c7e22c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -539,15 +539,13 @@ struct rq {
539 int post_schedule; 539 int post_schedule;
540 int active_balance; 540 int active_balance;
541 int push_cpu; 541 int push_cpu;
542 struct cpu_stop_work active_balance_work;
542 /* cpu of this runqueue: */ 543 /* cpu of this runqueue: */
543 int cpu; 544 int cpu;
544 int online; 545 int online;
545 546
546 unsigned long avg_load_per_task; 547 unsigned long avg_load_per_task;
547 548
548 struct task_struct *migration_thread;
549 struct list_head migration_queue;
550
551 u64 rt_avg; 549 u64 rt_avg;
552 u64 age_stamp; 550 u64 age_stamp;
553 u64 idle_stamp; 551 u64 idle_stamp;
@@ -2037,21 +2035,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2037 __set_task_cpu(p, new_cpu); 2035 __set_task_cpu(p, new_cpu);
2038} 2036}
2039 2037
2040struct migration_req { 2038struct migration_arg {
2041 struct list_head list;
2042
2043 struct task_struct *task; 2039 struct task_struct *task;
2044 int dest_cpu; 2040 int dest_cpu;
2045
2046 struct completion done;
2047}; 2041};
2048 2042
2043static int migration_cpu_stop(void *data);
2044
2049/* 2045/*
2050 * The task's runqueue lock must be held. 2046 * The task's runqueue lock must be held.
2051 * Returns true if you have to wait for migration thread. 2047 * Returns true if you have to wait for migration thread.
2052 */ 2048 */
2053static int 2049static bool migrate_task(struct task_struct *p, int dest_cpu)
2054migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2055{ 2050{
2056 struct rq *rq = task_rq(p); 2051 struct rq *rq = task_rq(p);
2057 2052
@@ -2059,15 +2054,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2059 * If the task is not on a runqueue (and not running), then 2054 * If the task is not on a runqueue (and not running), then
2060 * the next wake-up will properly place the task. 2055 * the next wake-up will properly place the task.
2061 */ 2056 */
2062 if (!p->se.on_rq && !task_running(rq, p)) 2057 return p->se.on_rq || task_running(rq, p);
2063 return 0;
2064
2065 init_completion(&req->done);
2066 req->task = p;
2067 req->dest_cpu = dest_cpu;
2068 list_add(&req->list, &rq->migration_queue);
2069
2070 return 1;
2071} 2058}
2072 2059
2073/* 2060/*
@@ -3110,7 +3097,6 @@ static void update_cpu_load(struct rq *this_rq)
3110void sched_exec(void) 3097void sched_exec(void)
3111{ 3098{
3112 struct task_struct *p = current; 3099 struct task_struct *p = current;
3113 struct migration_req req;
3114 unsigned long flags; 3100 unsigned long flags;
3115 struct rq *rq; 3101 struct rq *rq;
3116 int dest_cpu; 3102 int dest_cpu;
@@ -3124,17 +3110,11 @@ void sched_exec(void)
3124 * select_task_rq() can race against ->cpus_allowed 3110 * select_task_rq() can race against ->cpus_allowed
3125 */ 3111 */
3126 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3112 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3127 likely(cpu_active(dest_cpu)) && 3113 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3128 migrate_task(p, dest_cpu, &req)) { 3114 struct migration_arg arg = { p, dest_cpu };
3129 /* Need to wait for migration thread (might exit: take ref). */
3130 struct task_struct *mt = rq->migration_thread;
3131 3115
3132 get_task_struct(mt);
3133 task_rq_unlock(rq, &flags); 3116 task_rq_unlock(rq, &flags);
3134 wake_up_process(mt); 3117 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3135 put_task_struct(mt);
3136 wait_for_completion(&req.done);
3137
3138 return; 3118 return;
3139 } 3119 }
3140unlock: 3120unlock:
@@ -5290,17 +5270,15 @@ static inline void sched_init_granularity(void)
5290/* 5270/*
5291 * This is how migration works: 5271 * This is how migration works:
5292 * 5272 *
5293 * 1) we queue a struct migration_req structure in the source CPU's 5273 * 1) we invoke migration_cpu_stop() on the target CPU using
5294 * runqueue and wake up that CPU's migration thread. 5274 * stop_one_cpu().
5295 * 2) we down() the locked semaphore => thread blocks. 5275 * 2) stopper starts to run (implicitly forcing the migrated thread
5296 * 3) migration thread wakes up (implicitly it forces the migrated 5276 * off the CPU)
5297 * thread off the CPU) 5277 * 3) it checks whether the migrated task is still in the wrong runqueue.
5298 * 4) it gets the migration request and checks whether the migrated 5278 * 4) if it's in the wrong runqueue then the migration thread removes
5299 * task is still in the wrong runqueue.
5300 * 5) if it's in the wrong runqueue then the migration thread removes
5301 * it and puts it into the right queue. 5279 * it and puts it into the right queue.
5302 * 6) migration thread up()s the semaphore. 5280 * 5) stopper completes and stop_one_cpu() returns and the migration
5303 * 7) we wake up and the migration is done. 5281 * is done.
5304 */ 5282 */
5305 5283
5306/* 5284/*
@@ -5314,9 +5292,9 @@ static inline void sched_init_granularity(void)
5314 */ 5292 */
5315int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5293int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5316{ 5294{
5317 struct migration_req req;
5318 unsigned long flags; 5295 unsigned long flags;
5319 struct rq *rq; 5296 struct rq *rq;
5297 unsigned int dest_cpu;
5320 int ret = 0; 5298 int ret = 0;
5321 5299
5322 /* 5300 /*
@@ -5354,15 +5332,12 @@ again:
5354 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5332 if (cpumask_test_cpu(task_cpu(p), new_mask))
5355 goto out; 5333 goto out;
5356 5334
5357 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5335 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5336 if (migrate_task(p, dest_cpu)) {
5337 struct migration_arg arg = { p, dest_cpu };
5358 /* Need help from migration thread: drop lock and wait. */ 5338 /* Need help from migration thread: drop lock and wait. */
5359 struct task_struct *mt = rq->migration_thread;
5360
5361 get_task_struct(mt);
5362 task_rq_unlock(rq, &flags); 5339 task_rq_unlock(rq, &flags);
5363 wake_up_process(mt); 5340 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5364 put_task_struct(mt);
5365 wait_for_completion(&req.done);
5366 tlb_migrate_finish(p->mm); 5341 tlb_migrate_finish(p->mm);
5367 return 0; 5342 return 0;
5368 } 5343 }
@@ -5420,70 +5395,22 @@ fail:
5420 return ret; 5395 return ret;
5421} 5396}
5422 5397
5423#define RCU_MIGRATION_IDLE 0
5424#define RCU_MIGRATION_NEED_QS 1
5425#define RCU_MIGRATION_GOT_QS 2
5426#define RCU_MIGRATION_MUST_SYNC 3
5427
5428/* 5398/*
5429 * migration_thread - this is a highprio system thread that performs 5399 * migration_cpu_stop - this will be executed by a highprio stopper thread
5430 * thread migration by bumping thread off CPU then 'pushing' onto 5400 * and performs thread migration by bumping thread off CPU then
5431 * another runqueue. 5401 * 'pushing' onto another runqueue.
5432 */ 5402 */
5433static int migration_thread(void *data) 5403static int migration_cpu_stop(void *data)
5434{ 5404{
5435 int badcpu; 5405 struct migration_arg *arg = data;
5436 int cpu = (long)data;
5437 struct rq *rq;
5438
5439 rq = cpu_rq(cpu);
5440 BUG_ON(rq->migration_thread != current);
5441
5442 set_current_state(TASK_INTERRUPTIBLE);
5443 while (!kthread_should_stop()) {
5444 struct migration_req *req;
5445 struct list_head *head;
5446
5447 raw_spin_lock_irq(&rq->lock);
5448
5449 if (cpu_is_offline(cpu)) {
5450 raw_spin_unlock_irq(&rq->lock);
5451 break;
5452 }
5453
5454 if (rq->active_balance) {
5455 active_load_balance(rq, cpu);
5456 rq->active_balance = 0;
5457 }
5458
5459 head = &rq->migration_queue;
5460
5461 if (list_empty(head)) {
5462 raw_spin_unlock_irq(&rq->lock);
5463 schedule();
5464 set_current_state(TASK_INTERRUPTIBLE);
5465 continue;
5466 }
5467 req = list_entry(head->next, struct migration_req, list);
5468 list_del_init(head->next);
5469
5470 if (req->task != NULL) {
5471 raw_spin_unlock(&rq->lock);
5472 __migrate_task(req->task, cpu, req->dest_cpu);
5473 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5474 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5475 raw_spin_unlock(&rq->lock);
5476 } else {
5477 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5478 raw_spin_unlock(&rq->lock);
5479 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5480 }
5481 local_irq_enable();
5482
5483 complete(&req->done);
5484 }
5485 __set_current_state(TASK_RUNNING);
5486 5406
5407 /*
5408 * The original target cpu might have gone down and we might
5409 * be on another cpu but it doesn't matter.
5410 */
5411 local_irq_disable();
5412 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5413 local_irq_enable();
5487 return 0; 5414 return 0;
5488} 5415}
5489 5416
@@ -5850,35 +5777,20 @@ static void set_rq_offline(struct rq *rq)
5850static int __cpuinit 5777static int __cpuinit
5851migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5778migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5852{ 5779{
5853 struct task_struct *p;
5854 int cpu = (long)hcpu; 5780 int cpu = (long)hcpu;
5855 unsigned long flags; 5781 unsigned long flags;
5856 struct rq *rq; 5782 struct rq *rq = cpu_rq(cpu);
5857 5783
5858 switch (action) { 5784 switch (action) {
5859 5785
5860 case CPU_UP_PREPARE: 5786 case CPU_UP_PREPARE:
5861 case CPU_UP_PREPARE_FROZEN: 5787 case CPU_UP_PREPARE_FROZEN:
5862 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5863 if (IS_ERR(p))
5864 return NOTIFY_BAD;
5865 kthread_bind(p, cpu);
5866 /* Must be high prio: stop_machine expects to yield to it. */
5867 rq = task_rq_lock(p, &flags);
5868 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5869 task_rq_unlock(rq, &flags);
5870 get_task_struct(p);
5871 cpu_rq(cpu)->migration_thread = p;
5872 rq->calc_load_update = calc_load_update; 5788 rq->calc_load_update = calc_load_update;
5873 break; 5789 break;
5874 5790
5875 case CPU_ONLINE: 5791 case CPU_ONLINE:
5876 case CPU_ONLINE_FROZEN: 5792 case CPU_ONLINE_FROZEN:
5877 /* Strictly unnecessary, as first user will wake it. */
5878 wake_up_process(cpu_rq(cpu)->migration_thread);
5879
5880 /* Update our root-domain */ 5793 /* Update our root-domain */
5881 rq = cpu_rq(cpu);
5882 raw_spin_lock_irqsave(&rq->lock, flags); 5794 raw_spin_lock_irqsave(&rq->lock, flags);
5883 if (rq->rd) { 5795 if (rq->rd) {
5884 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5796 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5889,25 +5801,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5889 break; 5801 break;
5890 5802
5891#ifdef CONFIG_HOTPLUG_CPU 5803#ifdef CONFIG_HOTPLUG_CPU
5892 case CPU_UP_CANCELED:
5893 case CPU_UP_CANCELED_FROZEN:
5894 if (!cpu_rq(cpu)->migration_thread)
5895 break;
5896 /* Unbind it from offline cpu so it can run. Fall thru. */
5897 kthread_bind(cpu_rq(cpu)->migration_thread,
5898 cpumask_any(cpu_online_mask));
5899 kthread_stop(cpu_rq(cpu)->migration_thread);
5900 put_task_struct(cpu_rq(cpu)->migration_thread);
5901 cpu_rq(cpu)->migration_thread = NULL;
5902 break;
5903
5904 case CPU_DEAD: 5804 case CPU_DEAD:
5905 case CPU_DEAD_FROZEN: 5805 case CPU_DEAD_FROZEN:
5906 migrate_live_tasks(cpu); 5806 migrate_live_tasks(cpu);
5907 rq = cpu_rq(cpu);
5908 kthread_stop(rq->migration_thread);
5909 put_task_struct(rq->migration_thread);
5910 rq->migration_thread = NULL;
5911 /* Idle task back to normal (off runqueue, low prio) */ 5807 /* Idle task back to normal (off runqueue, low prio) */
5912 raw_spin_lock_irq(&rq->lock); 5808 raw_spin_lock_irq(&rq->lock);
5913 deactivate_task(rq, rq->idle, 0); 5809 deactivate_task(rq, rq->idle, 0);
@@ -5918,29 +5814,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5918 migrate_nr_uninterruptible(rq); 5814 migrate_nr_uninterruptible(rq);
5919 BUG_ON(rq->nr_running != 0); 5815 BUG_ON(rq->nr_running != 0);
5920 calc_global_load_remove(rq); 5816 calc_global_load_remove(rq);
5921 /*
5922 * No need to migrate the tasks: it was best-effort if
5923 * they didn't take sched_hotcpu_mutex. Just wake up
5924 * the requestors.
5925 */
5926 raw_spin_lock_irq(&rq->lock);
5927 while (!list_empty(&rq->migration_queue)) {
5928 struct migration_req *req;
5929
5930 req = list_entry(rq->migration_queue.next,
5931 struct migration_req, list);
5932 list_del_init(&req->list);
5933 raw_spin_unlock_irq(&rq->lock);
5934 complete(&req->done);
5935 raw_spin_lock_irq(&rq->lock);
5936 }
5937 raw_spin_unlock_irq(&rq->lock);
5938 break; 5817 break;
5939 5818
5940 case CPU_DYING: 5819 case CPU_DYING:
5941 case CPU_DYING_FROZEN: 5820 case CPU_DYING_FROZEN:
5942 /* Update our root-domain */ 5821 /* Update our root-domain */
5943 rq = cpu_rq(cpu);
5944 raw_spin_lock_irqsave(&rq->lock, flags); 5822 raw_spin_lock_irqsave(&rq->lock, flags);
5945 if (rq->rd) { 5823 if (rq->rd) {
5946 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5824 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7757,10 +7635,8 @@ void __init sched_init(void)
7757 rq->push_cpu = 0; 7635 rq->push_cpu = 0;
7758 rq->cpu = i; 7636 rq->cpu = i;
7759 rq->online = 0; 7637 rq->online = 0;
7760 rq->migration_thread = NULL;
7761 rq->idle_stamp = 0; 7638 rq->idle_stamp = 0;
7762 rq->avg_idle = 2*sysctl_sched_migration_cost; 7639 rq->avg_idle = 2*sysctl_sched_migration_cost;
7763 INIT_LIST_HEAD(&rq->migration_queue);
7764 rq_attach_root(rq, &def_root_domain); 7640 rq_attach_root(rq, &def_root_domain);
7765#endif 7641#endif
7766 init_rq_hrtick(rq); 7642 init_rq_hrtick(rq);
@@ -9054,43 +8930,32 @@ struct cgroup_subsys cpuacct_subsys = {
9054 8930
9055#ifndef CONFIG_SMP 8931#ifndef CONFIG_SMP
9056 8932
9057int rcu_expedited_torture_stats(char *page)
9058{
9059 return 0;
9060}
9061EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9062
9063void synchronize_sched_expedited(void) 8933void synchronize_sched_expedited(void)
9064{ 8934{
8935 barrier();
9065} 8936}
9066EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8937EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9067 8938
9068#else /* #ifndef CONFIG_SMP */ 8939#else /* #ifndef CONFIG_SMP */
9069 8940
9070static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8941static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9071static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9072
9073#define RCU_EXPEDITED_STATE_POST -2
9074#define RCU_EXPEDITED_STATE_IDLE -1
9075
9076static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9077 8942
9078int rcu_expedited_torture_stats(char *page) 8943static int synchronize_sched_expedited_cpu_stop(void *data)
9079{ 8944{
9080 int cnt = 0; 8945 /*
9081 int cpu; 8946 * There must be a full memory barrier on each affected CPU
9082 8947 * between the time that try_stop_cpus() is called and the
9083 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8948 * time that it returns.
9084 for_each_online_cpu(cpu) { 8949 *
9085 cnt += sprintf(&page[cnt], " %d:%d", 8950 * In the current initial implementation of cpu_stop, the
9086 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8951 * above condition is already met when the control reaches
9087 } 8952 * this point and the following smp_mb() is not strictly
9088 cnt += sprintf(&page[cnt], "\n"); 8953 * necessary. Do smp_mb() anyway for documentation and
9089 return cnt; 8954 * robustness against future implementation changes.
8955 */
8956 smp_mb(); /* See above comment block. */
8957 return 0;
9090} 8958}
9091EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9092
9093static long synchronize_sched_expedited_count;
9094 8959
9095/* 8960/*
9096 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8961 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9104,18 +8969,14 @@ static long synchronize_sched_expedited_count;
9104 */ 8969 */
9105void synchronize_sched_expedited(void) 8970void synchronize_sched_expedited(void)
9106{ 8971{
9107 int cpu; 8972 int snap, trycount = 0;
9108 unsigned long flags;
9109 bool need_full_sync = 0;
9110 struct rq *rq;
9111 struct migration_req *req;
9112 long snap;
9113 int trycount = 0;
9114 8973
9115 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8974 smp_mb(); /* ensure prior mod happens before capturing snap. */
9116 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8975 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9117 get_online_cpus(); 8976 get_online_cpus();
9118 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8977 while (try_stop_cpus(cpu_online_mask,
8978 synchronize_sched_expedited_cpu_stop,
8979 NULL) == -EAGAIN) {
9119 put_online_cpus(); 8980 put_online_cpus();
9120 if (trycount++ < 10) 8981 if (trycount++ < 10)
9121 udelay(trycount * num_online_cpus()); 8982 udelay(trycount * num_online_cpus());
@@ -9123,41 +8984,15 @@ void synchronize_sched_expedited(void)
9123 synchronize_sched(); 8984 synchronize_sched();
9124 return; 8985 return;
9125 } 8986 }
9126 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8987 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9127 smp_mb(); /* ensure test happens before caller kfree */ 8988 smp_mb(); /* ensure test happens before caller kfree */
9128 return; 8989 return;
9129 } 8990 }
9130 get_online_cpus(); 8991 get_online_cpus();
9131 } 8992 }
9132 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8993 atomic_inc(&synchronize_sched_expedited_count);
9133 for_each_online_cpu(cpu) { 8994 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9134 rq = cpu_rq(cpu);
9135 req = &per_cpu(rcu_migration_req, cpu);
9136 init_completion(&req->done);
9137 req->task = NULL;
9138 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9139 raw_spin_lock_irqsave(&rq->lock, flags);
9140 list_add(&req->list, &rq->migration_queue);
9141 raw_spin_unlock_irqrestore(&rq->lock, flags);
9142 wake_up_process(rq->migration_thread);
9143 }
9144 for_each_online_cpu(cpu) {
9145 rcu_expedited_state = cpu;
9146 req = &per_cpu(rcu_migration_req, cpu);
9147 rq = cpu_rq(cpu);
9148 wait_for_completion(&req->done);
9149 raw_spin_lock_irqsave(&rq->lock, flags);
9150 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9151 need_full_sync = 1;
9152 req->dest_cpu = RCU_MIGRATION_IDLE;
9153 raw_spin_unlock_irqrestore(&rq->lock, flags);
9154 }
9155 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9156 synchronize_sched_expedited_count++;
9157 mutex_unlock(&rcu_sched_expedited_mutex);
9158 put_online_cpus(); 8995 put_online_cpus();
9159 if (need_full_sync)
9160 synchronize_sched();
9161} 8996}
9162EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8997EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9163 8998
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cbd8b8a296d1..217e4a9393e4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2798,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2798 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2798 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2799} 2799}
2800 2800
2801static int active_load_balance_cpu_stop(void *data);
2802
2801/* 2803/*
2802 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2804 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2803 * tasks if there is an imbalance. 2805 * tasks if there is an imbalance.
@@ -2887,8 +2889,9 @@ redo:
2887 if (need_active_balance(sd, sd_idle, idle)) { 2889 if (need_active_balance(sd, sd_idle, idle)) {
2888 raw_spin_lock_irqsave(&busiest->lock, flags); 2890 raw_spin_lock_irqsave(&busiest->lock, flags);
2889 2891
2890 /* don't kick the migration_thread, if the curr 2892 /* don't kick the active_load_balance_cpu_stop,
2891 * task on busiest cpu can't be moved to this_cpu 2893 * if the curr task on busiest cpu can't be
2894 * moved to this_cpu
2892 */ 2895 */
2893 if (!cpumask_test_cpu(this_cpu, 2896 if (!cpumask_test_cpu(this_cpu,
2894 &busiest->curr->cpus_allowed)) { 2897 &busiest->curr->cpus_allowed)) {
@@ -2898,14 +2901,22 @@ redo:
2898 goto out_one_pinned; 2901 goto out_one_pinned;
2899 } 2902 }
2900 2903
2904 /*
2905 * ->active_balance synchronizes accesses to
2906 * ->active_balance_work. Once set, it's cleared
2907 * only after active load balance is finished.
2908 */
2901 if (!busiest->active_balance) { 2909 if (!busiest->active_balance) {
2902 busiest->active_balance = 1; 2910 busiest->active_balance = 1;
2903 busiest->push_cpu = this_cpu; 2911 busiest->push_cpu = this_cpu;
2904 active_balance = 1; 2912 active_balance = 1;
2905 } 2913 }
2906 raw_spin_unlock_irqrestore(&busiest->lock, flags); 2914 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2915
2907 if (active_balance) 2916 if (active_balance)
2908 wake_up_process(busiest->migration_thread); 2917 stop_one_cpu_nowait(cpu_of(busiest),
2918 active_load_balance_cpu_stop, busiest,
2919 &busiest->active_balance_work);
2909 2920
2910 /* 2921 /*
2911 * We've kicked active balancing, reset the failure 2922 * We've kicked active balancing, reset the failure
@@ -3012,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3012} 3023}
3013 3024
3014/* 3025/*
3015 * active_load_balance is run by migration threads. It pushes running tasks 3026 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3016 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3027 * running tasks off the busiest CPU onto idle CPUs. It requires at
3017 * running on each physical CPU where possible, and avoids physical / 3028 * least 1 task to be running on each physical CPU where possible, and
3018 * logical imbalances. 3029 * avoids physical / logical imbalances.
3019 *
3020 * Called with busiest_rq locked.
3021 */ 3030 */
3022static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3031static int active_load_balance_cpu_stop(void *data)
3023{ 3032{
3033 struct rq *busiest_rq = data;
3034 int busiest_cpu = cpu_of(busiest_rq);
3024 int target_cpu = busiest_rq->push_cpu; 3035 int target_cpu = busiest_rq->push_cpu;
3036 struct rq *target_rq = cpu_rq(target_cpu);
3025 struct sched_domain *sd; 3037 struct sched_domain *sd;
3026 struct rq *target_rq; 3038
3039 raw_spin_lock_irq(&busiest_rq->lock);
3040
3041 /* make sure the requested cpu hasn't gone down in the meantime */
3042 if (unlikely(busiest_cpu != smp_processor_id() ||
3043 !busiest_rq->active_balance))
3044 goto out_unlock;
3027 3045
3028 /* Is there any task to move? */ 3046 /* Is there any task to move? */
3029 if (busiest_rq->nr_running <= 1) 3047 if (busiest_rq->nr_running <= 1)
3030 return; 3048 goto out_unlock;
3031
3032 target_rq = cpu_rq(target_cpu);
3033 3049
3034 /* 3050 /*
3035 * This condition is "impossible", if it occurs 3051 * This condition is "impossible", if it occurs
@@ -3058,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3058 schedstat_inc(sd, alb_failed); 3074 schedstat_inc(sd, alb_failed);
3059 } 3075 }
3060 double_unlock_balance(busiest_rq, target_rq); 3076 double_unlock_balance(busiest_rq, target_rq);
3077out_unlock:
3078 busiest_rq->active_balance = 0;
3079 raw_spin_unlock_irq(&busiest_rq->lock);
3080 return 0;
3061} 3081}
3062 3082
3063#ifdef CONFIG_NO_HZ 3083#ifdef CONFIG_NO_HZ
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79c..ef51d1fcf5e6 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,381 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct cpu_stop_work *work;
298 struct task_struct *p;
299
300 switch (action & ~CPU_TASKS_FROZEN) {
301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
305 cpu);
306 if (IS_ERR(p))
307 return NOTIFY_BAD;
308 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
309 get_task_struct(p);
310 stopper->thread = p;
311 break;
312
313 case CPU_ONLINE:
314 kthread_bind(stopper->thread, cpu);
315 /* strictly unnecessary, as first user will wake it */
316 wake_up_process(stopper->thread);
317 /* mark enabled */
318 spin_lock_irq(&stopper->lock);
319 stopper->enabled = true;
320 spin_unlock_irq(&stopper->lock);
321 break;
322
323#ifdef CONFIG_HOTPLUG_CPU
324 case CPU_UP_CANCELED:
325 case CPU_DEAD:
326 /* kill the stopper */
327 kthread_stop(stopper->thread);
328 /* drain remaining works */
329 spin_lock_irq(&stopper->lock);
330 list_for_each_entry(work, &stopper->works, list)
331 cpu_stop_signal_done(work->done, false);
332 stopper->enabled = false;
333 spin_unlock_irq(&stopper->lock);
334 /* release the stopper */
335 put_task_struct(stopper->thread);
336 stopper->thread = NULL;
337 break;
338#endif
339 }
340
341 return NOTIFY_OK;
342}
343
344/*
345 * Give it a higher priority so that cpu stopper is available to other
346 * cpu notifiers. It currently shares the same priority as sched
347 * migration_notifier.
348 */
349static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
350 .notifier_call = cpu_stop_cpu_callback,
351 .priority = 10,
352};
353
354static int __init cpu_stop_init(void)
355{
356 void *bcpu = (void *)(long)smp_processor_id();
357 unsigned int cpu;
358 int err;
359
360 for_each_possible_cpu(cpu) {
361 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
362
363 spin_lock_init(&stopper->lock);
364 INIT_LIST_HEAD(&stopper->works);
365 }
366
367 /* start one for the boot cpu */
368 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
369 bcpu);
370 BUG_ON(err == NOTIFY_BAD);
371 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
372 register_cpu_notifier(&cpu_stop_cpu_notifier);
373
374 return 0;
375}
376early_initcall(cpu_stop_init);
377
378#ifdef CONFIG_STOP_MACHINE
15 379
16/* This controls the threads on each CPU. */ 380/* This controls the threads on each CPU. */
17enum stopmachine_state { 381enum stopmachine_state {
@@ -26,174 +390,94 @@ enum stopmachine_state {
26 /* Exit */ 390 /* Exit */
27 STOPMACHINE_EXIT, 391 STOPMACHINE_EXIT,
28}; 392};
29static enum stopmachine_state state;
30 393
31struct stop_machine_data { 394struct stop_machine_data {
32 int (*fn)(void *); 395 int (*fn)(void *);
33 void *data; 396 void *data;
34 int fnret; 397 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
398 unsigned int num_threads;
399 const struct cpumask *active_cpus;
400
401 enum stopmachine_state state;
402 atomic_t thread_ack;
35}; 403};
36 404
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 405static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 406 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void __percpu *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 407{
52 /* Reset ack counter. */ 408 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 409 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 410 smp_wmb();
55 state = newstate; 411 smdata->state = newstate;
56} 412}
57 413
58/* Last one to ack a state moves to the next state. */ 414/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 415static void ack_state(struct stop_machine_data *smdata)
60{ 416{
61 if (atomic_dec_and_test(&thread_ack)) 417 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 418 set_state(smdata, smdata->state + 1);
63} 419}
64 420
65/* This is the actual function which stops the CPU. It runs 421/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 422static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 423{
424 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 425 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 426 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 427 bool is_active;
72 int err; 428
429 if (!smdata->active_cpus)
430 is_active = cpu == cpumask_first(cpu_online_mask);
431 else
432 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 433
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 434 /* Simple state machine */
82 do { 435 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 436 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 437 cpu_relax();
85 if (state != curstate) { 438 if (smdata->state != curstate) {
86 curstate = state; 439 curstate = smdata->state;
87 switch (curstate) { 440 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 441 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 442 local_irq_disable();
90 hard_irq_disable(); 443 hard_irq_disable();
91 break; 444 break;
92 case STOPMACHINE_RUN: 445 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 446 if (is_active)
94 * is needed to tell that something failed. */ 447 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 448 break;
99 default: 449 default:
100 break; 450 break;
101 } 451 }
102 ack_state(); 452 ack_state(smdata);
103 } 453 }
104 } while (curstate != STOPMACHINE_EXIT); 454 } while (curstate != STOPMACHINE_EXIT);
105 455
106 local_irq_enable(); 456 local_irq_enable();
457 return err;
107} 458}
108 459
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 460int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 461{
154 struct work_struct *sm_work; 462 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 463 .num_threads = num_online_cpus(),
156 464 .active_cpus = cpus };
157 /* Set up initial state. */ 465
158 mutex_lock(&lock); 466 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 467 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 468 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 469}
184 470
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 471int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 472{
187 int ret; 473 int ret;
188 474
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 475 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 476 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 477 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 478 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 479 return ret;
198} 480}
199EXPORT_SYMBOL_GPL(stop_machine); 481EXPORT_SYMBOL_GPL(stop_machine);
482
483#endif /* CONFIG_STOP_MACHINE */