aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.hz2
-rw-r--r--kernel/Kconfig.preempt13
-rw-r--r--kernel/Makefile6
-rw-r--r--kernel/cpu.c164
-rw-r--r--kernel/cpuset.c14
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/hrtimer.c256
-rw-r--r--kernel/kthread.c12
-rw-r--r--kernel/latencytop.c239
-rw-r--r--kernel/lockdep.c12
-rw-r--r--kernel/module.c27
-rw-r--r--kernel/posix-cpu-timers.c30
-rw-r--r--kernel/printk.c57
-rw-r--r--kernel/profile.c99
-rw-r--r--kernel/rcuclassic.c575
-rw-r--r--kernel/rcupdate.c576
-rw-r--r--kernel/rcupreempt.c953
-rw-r--r--kernel/rcupreempt_trace.c330
-rw-r--r--kernel/rcutorture.c6
-rw-r--r--kernel/sched.c1384
-rw-r--r--kernel/sched_debug.c5
-rw-r--r--kernel/sched_fair.c391
-rw-r--r--kernel/sched_idletask.c42
-rw-r--r--kernel/sched_rt.c1112
-rw-r--r--kernel/softlockup.c116
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sysctl.c77
-rw-r--r--kernel/time/tick-sched.c13
-rw-r--r--kernel/timer.c3
-rw-r--r--kernel/user.c47
-rw-r--r--kernel/workqueue.c35
31 files changed, 5242 insertions, 1369 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 4af15802ccd4..526128a2e622 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -54,3 +54,5 @@ config HZ
54 default 300 if HZ_300 54 default 300 if HZ_300
55 default 1000 if HZ_1000 55 default 1000 if HZ_1000
56 56
57config SCHED_HRTICK
58 def_bool HIGH_RES_TIMERS && X86
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c64ce9c14207..0669b70fa6a3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,14 +52,13 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_BKL 55config RCU_TRACE
56 bool "Preempt The Big Kernel Lock" 56 bool "Enable tracing for RCU - currently stats in debugfs"
57 depends on SMP || PREEMPT 57 select DEBUG_FS
58 default y 58 default y
59 help 59 help
60 This option reduces the latency of the kernel by making the 60 This option provides tracing in RCU which presents stats
61 big kernel lock preemptible. 61 in debugfs for debugging RCU implementation.
62 62
63 Say Y here if you are building a kernel for a desktop system. 63 Say Y here if you want to enable RCU tracing
64 Say N if you are unsure. 64 Say N if you are unsure.
65
diff --git a/kernel/Makefile b/kernel/Makefile
index dfa96956dae0..390d42146267 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -52,11 +52,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
52obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 52obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
53obj-$(CONFIG_SECCOMP) += seccomp.o 53obj-$(CONFIG_SECCOMP) += seccomp.o
54obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 54obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
55obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
56obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
57ifeq ($(CONFIG_PREEMPT_RCU),y)
58obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
59endif
55obj-$(CONFIG_RELAY) += relay.o 60obj-$(CONFIG_RELAY) += relay.o
56obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 61obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
57obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 62obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
58obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 63obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
59obj-$(CONFIG_MARKERS) += marker.o 64obj-$(CONFIG_MARKERS) += marker.o
65obj-$(CONFIG_LATENCYTOP) += latencytop.o
60 66
61ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 67ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
62# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 68# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6b3a0c15144f..e0d3a4f56ecb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,9 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 19static DEFINE_MUTEX(cpu_add_remove_lock);
20static DEFINE_MUTEX(cpu_bitmask_lock);
21 20
22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 21static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
23 22
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
26 */ 25 */
27static int cpu_hotplug_disabled; 26static int cpu_hotplug_disabled;
28 27
29#ifdef CONFIG_HOTPLUG_CPU 28static struct {
29 struct task_struct *active_writer;
30 struct mutex lock; /* Synchronizes accesses to refcount, */
31 /*
32 * Also blocks the new readers during
33 * an ongoing cpu hotplug operation.
34 */
35 int refcount;
36 wait_queue_head_t writer_queue;
37} cpu_hotplug;
30 38
31/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ 39#define writer_exists() (cpu_hotplug.active_writer != NULL)
32static struct task_struct *recursive;
33static int recursive_depth;
34 40
35void lock_cpu_hotplug(void) 41void __init cpu_hotplug_init(void)
36{ 42{
37 struct task_struct *tsk = current; 43 cpu_hotplug.active_writer = NULL;
38 44 mutex_init(&cpu_hotplug.lock);
39 if (tsk == recursive) { 45 cpu_hotplug.refcount = 0;
40 static int warnings = 10; 46 init_waitqueue_head(&cpu_hotplug.writer_queue);
41 if (warnings) { 47}
42 printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); 48
43 WARN_ON(1); 49#ifdef CONFIG_HOTPLUG_CPU
44 warnings--; 50
45 } 51void get_online_cpus(void)
46 recursive_depth++; 52{
53 might_sleep();
54 if (cpu_hotplug.active_writer == current)
47 return; 55 return;
48 } 56 mutex_lock(&cpu_hotplug.lock);
49 mutex_lock(&cpu_bitmask_lock); 57 cpu_hotplug.refcount++;
50 recursive = tsk; 58 mutex_unlock(&cpu_hotplug.lock);
59
51} 60}
52EXPORT_SYMBOL_GPL(lock_cpu_hotplug); 61EXPORT_SYMBOL_GPL(get_online_cpus);
53 62
54void unlock_cpu_hotplug(void) 63void put_online_cpus(void)
55{ 64{
56 WARN_ON(recursive != current); 65 if (cpu_hotplug.active_writer == current)
57 if (recursive_depth) {
58 recursive_depth--;
59 return; 66 return;
60 } 67 mutex_lock(&cpu_hotplug.lock);
61 recursive = NULL; 68 cpu_hotplug.refcount--;
62 mutex_unlock(&cpu_bitmask_lock); 69
70 if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
71 wake_up(&cpu_hotplug.writer_queue);
72
73 mutex_unlock(&cpu_hotplug.lock);
74
63} 75}
64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 76EXPORT_SYMBOL_GPL(put_online_cpus);
65 77
66#endif /* CONFIG_HOTPLUG_CPU */ 78#endif /* CONFIG_HOTPLUG_CPU */
67 79
80/*
81 * The following two APIs must be used when attempting
82 * to serialize the updates to cpu_online_map, cpu_present_map.
83 */
84void cpu_maps_update_begin(void)
85{
86 mutex_lock(&cpu_add_remove_lock);
87}
88
89void cpu_maps_update_done(void)
90{
91 mutex_unlock(&cpu_add_remove_lock);
92}
93
94/*
95 * This ensures that the hotplug operation can begin only when the
96 * refcount goes to zero.
97 *
98 * Note that during a cpu-hotplug operation, the new readers, if any,
99 * will be blocked by the cpu_hotplug.lock
100 *
101 * Since cpu_hotplug_begin is always called after invoking
102 * cpu_maps_update_begin, we can be sure that only one writer is active.
103 *
104 * Note that theoretically, there is a possibility of a livelock:
105 * - Refcount goes to zero, last reader wakes up the sleeping
106 * writer.
107 * - Last reader unlocks the cpu_hotplug.lock.
108 * - A new reader arrives at this moment, bumps up the refcount.
109 * - The writer acquires the cpu_hotplug.lock, finds the refcount
110 * non zero and goes to sleep again.
111 *
112 * However, this is very difficult to achieve in practice since
113 * get_online_cpus() is not an api which is called all that often.
114 *
115 */
116static void cpu_hotplug_begin(void)
117{
118 DECLARE_WAITQUEUE(wait, current);
119
120 mutex_lock(&cpu_hotplug.lock);
121
122 cpu_hotplug.active_writer = current;
123 add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
124 while (cpu_hotplug.refcount) {
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 mutex_unlock(&cpu_hotplug.lock);
127 schedule();
128 mutex_lock(&cpu_hotplug.lock);
129 }
130 remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
131}
132
133static void cpu_hotplug_done(void)
134{
135 cpu_hotplug.active_writer = NULL;
136 mutex_unlock(&cpu_hotplug.lock);
137}
68/* Need to know about CPUs going up/down? */ 138/* Need to know about CPUs going up/down? */
69int __cpuinit register_cpu_notifier(struct notifier_block *nb) 139int __cpuinit register_cpu_notifier(struct notifier_block *nb)
70{ 140{
71 int ret; 141 int ret;
72 mutex_lock(&cpu_add_remove_lock); 142 cpu_maps_update_begin();
73 ret = raw_notifier_chain_register(&cpu_chain, nb); 143 ret = raw_notifier_chain_register(&cpu_chain, nb);
74 mutex_unlock(&cpu_add_remove_lock); 144 cpu_maps_update_done();
75 return ret; 145 return ret;
76} 146}
77 147
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
81 151
82void unregister_cpu_notifier(struct notifier_block *nb) 152void unregister_cpu_notifier(struct notifier_block *nb)
83{ 153{
84 mutex_lock(&cpu_add_remove_lock); 154 cpu_maps_update_begin();
85 raw_notifier_chain_unregister(&cpu_chain, nb); 155 raw_notifier_chain_unregister(&cpu_chain, nb);
86 mutex_unlock(&cpu_add_remove_lock); 156 cpu_maps_update_done();
87} 157}
88EXPORT_SYMBOL(unregister_cpu_notifier); 158EXPORT_SYMBOL(unregister_cpu_notifier);
89 159
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
147 if (!cpu_online(cpu)) 217 if (!cpu_online(cpu))
148 return -EINVAL; 218 return -EINVAL;
149 219
150 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); 220 cpu_hotplug_begin();
151 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 221 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
152 hcpu, -1, &nr_calls); 222 hcpu, -1, &nr_calls);
153 if (err == NOTIFY_BAD) { 223 if (err == NOTIFY_BAD) {
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
166 cpu_clear(cpu, tmp); 236 cpu_clear(cpu, tmp);
167 set_cpus_allowed(current, tmp); 237 set_cpus_allowed(current, tmp);
168 238
169 mutex_lock(&cpu_bitmask_lock);
170 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
171 mutex_unlock(&cpu_bitmask_lock);
172 240
173 if (IS_ERR(p) || cpu_online(cpu)) { 241 if (IS_ERR(p) || cpu_online(cpu)) {
174 /* CPU didn't die: tell everyone. Can't complain. */ 242 /* CPU didn't die: tell everyone. Can't complain. */
@@ -202,7 +270,7 @@ out_thread:
202out_allowed: 270out_allowed:
203 set_cpus_allowed(current, old_allowed); 271 set_cpus_allowed(current, old_allowed);
204out_release: 272out_release:
205 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); 273 cpu_hotplug_done();
206 return err; 274 return err;
207} 275}
208 276
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu)
210{ 278{
211 int err = 0; 279 int err = 0;
212 280
213 mutex_lock(&cpu_add_remove_lock); 281 cpu_maps_update_begin();
214 if (cpu_hotplug_disabled) 282 if (cpu_hotplug_disabled)
215 err = -EBUSY; 283 err = -EBUSY;
216 else 284 else
217 err = _cpu_down(cpu, 0); 285 err = _cpu_down(cpu, 0);
218 286
219 mutex_unlock(&cpu_add_remove_lock); 287 cpu_maps_update_done();
220 return err; 288 return err;
221} 289}
222#endif /*CONFIG_HOTPLUG_CPU*/ 290#endif /*CONFIG_HOTPLUG_CPU*/
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
231 if (cpu_online(cpu) || !cpu_present(cpu)) 299 if (cpu_online(cpu) || !cpu_present(cpu))
232 return -EINVAL; 300 return -EINVAL;
233 301
234 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); 302 cpu_hotplug_begin();
235 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 303 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
236 -1, &nr_calls); 304 -1, &nr_calls);
237 if (ret == NOTIFY_BAD) { 305 if (ret == NOTIFY_BAD) {
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
243 } 311 }
244 312
245 /* Arch-specific enabling code. */ 313 /* Arch-specific enabling code. */
246 mutex_lock(&cpu_bitmask_lock);
247 ret = __cpu_up(cpu); 314 ret = __cpu_up(cpu);
248 mutex_unlock(&cpu_bitmask_lock);
249 if (ret != 0) 315 if (ret != 0)
250 goto out_notify; 316 goto out_notify;
251 BUG_ON(!cpu_online(cpu)); 317 BUG_ON(!cpu_online(cpu));
@@ -257,7 +323,7 @@ out_notify:
257 if (ret != 0) 323 if (ret != 0)
258 __raw_notifier_call_chain(&cpu_chain, 324 __raw_notifier_call_chain(&cpu_chain,
259 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 325 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
260 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); 326 cpu_hotplug_done();
261 327
262 return ret; 328 return ret;
263} 329}
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu)
275 return -EINVAL; 341 return -EINVAL;
276 } 342 }
277 343
278 mutex_lock(&cpu_add_remove_lock); 344 cpu_maps_update_begin();
279 if (cpu_hotplug_disabled) 345 if (cpu_hotplug_disabled)
280 err = -EBUSY; 346 err = -EBUSY;
281 else 347 else
282 err = _cpu_up(cpu, 0); 348 err = _cpu_up(cpu, 0);
283 349
284 mutex_unlock(&cpu_add_remove_lock); 350 cpu_maps_update_done();
285 return err; 351 return err;
286} 352}
287 353
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void)
292{ 358{
293 int cpu, first_cpu, error = 0; 359 int cpu, first_cpu, error = 0;
294 360
295 mutex_lock(&cpu_add_remove_lock); 361 cpu_maps_update_begin();
296 first_cpu = first_cpu(cpu_online_map); 362 first_cpu = first_cpu(cpu_online_map);
297 /* We take down all of the non-boot CPUs in one shot to avoid races 363 /* We take down all of the non-boot CPUs in one shot to avoid races
298 * with the userspace trying to use the CPU hotplug at the same time 364 * with the userspace trying to use the CPU hotplug at the same time
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void)
319 } else { 385 } else {
320 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 386 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
321 } 387 }
322 mutex_unlock(&cpu_add_remove_lock); 388 cpu_maps_update_done();
323 return error; 389 return error;
324} 390}
325 391
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void)
328 int cpu, error; 394 int cpu, error;
329 395
330 /* Allow everyone to use the CPU hotplug again */ 396 /* Allow everyone to use the CPU hotplug again */
331 mutex_lock(&cpu_add_remove_lock); 397 cpu_maps_update_begin();
332 cpu_hotplug_disabled = 0; 398 cpu_hotplug_disabled = 0;
333 if (cpus_empty(frozen_cpus)) 399 if (cpus_empty(frozen_cpus))
334 goto out; 400 goto out;
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void)
344 } 410 }
345 cpus_clear(frozen_cpus); 411 cpus_clear(frozen_cpus);
346out: 412out:
347 mutex_unlock(&cpu_add_remove_lock); 413 cpu_maps_update_done();
348} 414}
349#endif /* CONFIG_PM_SLEEP_SMP */ 415#endif /* CONFIG_PM_SLEEP_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 50f5dc463688..cfaf6419d817 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
537 * 537 *
538 * Call with cgroup_mutex held. May take callback_mutex during 538 * Call with cgroup_mutex held. May take callback_mutex during
539 * call due to the kfifo_alloc() and kmalloc() calls. May nest 539 * call due to the kfifo_alloc() and kmalloc() calls. May nest
540 * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 540 * a call to the get_online_cpus()/put_online_cpus() pair.
541 * Must not be called holding callback_mutex, because we must not 541 * Must not be called holding callback_mutex, because we must not
542 * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere 542 * call get_online_cpus() while holding callback_mutex. Elsewhere
543 * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. 543 * the kernel nests callback_mutex inside get_online_cpus() calls.
544 * So the reverse nesting would risk an ABBA deadlock. 544 * So the reverse nesting would risk an ABBA deadlock.
545 * 545 *
546 * The three key local variables below are: 546 * The three key local variables below are:
@@ -691,9 +691,9 @@ restart:
691 691
692rebuild: 692rebuild:
693 /* Have scheduler rebuild sched domains */ 693 /* Have scheduler rebuild sched domains */
694 lock_cpu_hotplug(); 694 get_online_cpus();
695 partition_sched_domains(ndoms, doms); 695 partition_sched_domains(ndoms, doms);
696 unlock_cpu_hotplug(); 696 put_online_cpus();
697 697
698done: 698done:
699 if (q && !IS_ERR(q)) 699 if (q && !IS_ERR(q))
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
1617 * 1617 *
1618 * If the cpuset being removed has its flag 'sched_load_balance' 1618 * If the cpuset being removed has its flag 'sched_load_balance'
1619 * enabled, then simulate turning sched_load_balance off, which 1619 * enabled, then simulate turning sched_load_balance off, which
1620 * will call rebuild_sched_domains(). The lock_cpu_hotplug() 1620 * will call rebuild_sched_domains(). The get_online_cpus()
1621 * call in rebuild_sched_domains() must not be made while holding 1621 * call in rebuild_sched_domains() must not be made while holding
1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside 1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1623 * lock_cpu_hotplug() calls. So the reverse nesting would risk an 1623 * get_online_cpus() calls. So the reverse nesting would risk an
1624 * ABBA deadlock. 1624 * ABBA deadlock.
1625 */ 1625 */
1626 1626
diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff281009..39d22b3357de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1045,6 +1045,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1045 copy_flags(clone_flags, p); 1045 copy_flags(clone_flags, p);
1046 INIT_LIST_HEAD(&p->children); 1046 INIT_LIST_HEAD(&p->children);
1047 INIT_LIST_HEAD(&p->sibling); 1047 INIT_LIST_HEAD(&p->sibling);
1048#ifdef CONFIG_PREEMPT_RCU
1049 p->rcu_read_lock_nesting = 0;
1050 p->rcu_flipctr_idx = 0;
1051#endif /* #ifdef CONFIG_PREEMPT_RCU */
1048 p->vfork_done = NULL; 1052 p->vfork_done = NULL;
1049 spin_lock_init(&p->alloc_lock); 1053 spin_lock_init(&p->alloc_lock);
1050 1054
@@ -1059,6 +1063,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1059 p->prev_utime = cputime_zero; 1063 p->prev_utime = cputime_zero;
1060 p->prev_stime = cputime_zero; 1064 p->prev_stime = cputime_zero;
1061 1065
1066#ifdef CONFIG_DETECT_SOFTLOCKUP
1067 p->last_switch_count = 0;
1068 p->last_switch_timestamp = 0;
1069#endif
1070
1062#ifdef CONFIG_TASK_XACCT 1071#ifdef CONFIG_TASK_XACCT
1063 p->rchar = 0; /* I/O counter: bytes read */ 1072 p->rchar = 0; /* I/O counter: bytes read */
1064 p->wchar = 0; /* I/O counter: bytes written */ 1073 p->wchar = 0; /* I/O counter: bytes written */
@@ -1196,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1196#ifdef TIF_SYSCALL_EMU 1205#ifdef TIF_SYSCALL_EMU
1197 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1206 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1198#endif 1207#endif
1208 clear_all_latency_tracing(p);
1199 1209
1200 /* Our parent execution domain becomes current domain 1210 /* Our parent execution domain becomes current domain
1201 These must match for thread signalling to apply */ 1211 These must match for thread signalling to apply */
@@ -1237,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1237 * parent's CPU). This avoids alot of nasty races. 1247 * parent's CPU). This avoids alot of nasty races.
1238 */ 1248 */
1239 p->cpus_allowed = current->cpus_allowed; 1249 p->cpus_allowed = current->cpus_allowed;
1250 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1240 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || 1251 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1241 !cpu_online(task_cpu(p)))) 1252 !cpu_online(task_cpu(p))))
1242 set_task_cpu(p, smp_processor_id()); 1253 set_task_cpu(p, smp_processor_id());
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f994bb8065e6..bd5d6b5060bc 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
325} 325}
326#endif /* BITS_PER_LONG >= 64 */ 326#endif /* BITS_PER_LONG >= 64 */
327 327
328/*
329 * Check, whether the timer is on the callback pending list
330 */
331static inline int hrtimer_cb_pending(const struct hrtimer *timer)
332{
333 return timer->state & HRTIMER_STATE_PENDING;
334}
335
336/*
337 * Remove a timer from the callback pending list
338 */
339static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
340{
341 list_del_init(&timer->cb_entry);
342}
343
328/* High resolution timer related functions */ 344/* High resolution timer related functions */
329#ifdef CONFIG_HIGH_RES_TIMERS 345#ifdef CONFIG_HIGH_RES_TIMERS
330 346
@@ -494,29 +510,12 @@ void hres_timers_resume(void)
494} 510}
495 511
496/* 512/*
497 * Check, whether the timer is on the callback pending list
498 */
499static inline int hrtimer_cb_pending(const struct hrtimer *timer)
500{
501 return timer->state & HRTIMER_STATE_PENDING;
502}
503
504/*
505 * Remove a timer from the callback pending list
506 */
507static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
508{
509 list_del_init(&timer->cb_entry);
510}
511
512/*
513 * Initialize the high resolution related parts of cpu_base 513 * Initialize the high resolution related parts of cpu_base
514 */ 514 */
515static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) 515static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
516{ 516{
517 base->expires_next.tv64 = KTIME_MAX; 517 base->expires_next.tv64 = KTIME_MAX;
518 base->hres_active = 0; 518 base->hres_active = 0;
519 INIT_LIST_HEAD(&base->cb_pending);
520} 519}
521 520
522/* 521/*
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
524 */ 523 */
525static inline void hrtimer_init_timer_hres(struct hrtimer *timer) 524static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
526{ 525{
527 INIT_LIST_HEAD(&timer->cb_entry);
528} 526}
529 527
530/* 528/*
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
618{ 616{
619 return 0; 617 return 0;
620} 618}
621static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
622static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
623static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 619static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
624static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 620static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
621static inline int hrtimer_reprogram(struct hrtimer *timer,
622 struct hrtimer_clock_base *base)
623{
624 return 0;
625}
625 626
626#endif /* CONFIG_HIGH_RES_TIMERS */ 627#endif /* CONFIG_HIGH_RES_TIMERS */
627 628
@@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1001 clock_id = CLOCK_MONOTONIC; 1002 clock_id = CLOCK_MONOTONIC;
1002 1003
1003 timer->base = &cpu_base->clock_base[clock_id]; 1004 timer->base = &cpu_base->clock_base[clock_id];
1005 INIT_LIST_HEAD(&timer->cb_entry);
1004 hrtimer_init_timer_hres(timer); 1006 hrtimer_init_timer_hres(timer);
1005 1007
1006#ifdef CONFIG_TIMER_STATS 1008#ifdef CONFIG_TIMER_STATS
@@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1030} 1032}
1031EXPORT_SYMBOL_GPL(hrtimer_get_res); 1033EXPORT_SYMBOL_GPL(hrtimer_get_res);
1032 1034
1035static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1036{
1037 spin_lock_irq(&cpu_base->lock);
1038
1039 while (!list_empty(&cpu_base->cb_pending)) {
1040 enum hrtimer_restart (*fn)(struct hrtimer *);
1041 struct hrtimer *timer;
1042 int restart;
1043
1044 timer = list_entry(cpu_base->cb_pending.next,
1045 struct hrtimer, cb_entry);
1046
1047 timer_stats_account_hrtimer(timer);
1048
1049 fn = timer->function;
1050 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1051 spin_unlock_irq(&cpu_base->lock);
1052
1053 restart = fn(timer);
1054
1055 spin_lock_irq(&cpu_base->lock);
1056
1057 timer->state &= ~HRTIMER_STATE_CALLBACK;
1058 if (restart == HRTIMER_RESTART) {
1059 BUG_ON(hrtimer_active(timer));
1060 /*
1061 * Enqueue the timer, allow reprogramming of the event
1062 * device
1063 */
1064 enqueue_hrtimer(timer, timer->base, 1);
1065 } else if (hrtimer_active(timer)) {
1066 /*
1067 * If the timer was rearmed on another CPU, reprogram
1068 * the event device.
1069 */
1070 if (timer->base->first == &timer->node)
1071 hrtimer_reprogram(timer, timer->base);
1072 }
1073 }
1074 spin_unlock_irq(&cpu_base->lock);
1075}
1076
1077static void __run_hrtimer(struct hrtimer *timer)
1078{
1079 struct hrtimer_clock_base *base = timer->base;
1080 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1081 enum hrtimer_restart (*fn)(struct hrtimer *);
1082 int restart;
1083
1084 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1085 timer_stats_account_hrtimer(timer);
1086
1087 fn = timer->function;
1088 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
1089 /*
1090 * Used for scheduler timers, avoid lock inversion with
1091 * rq->lock and tasklist_lock.
1092 *
1093 * These timers are required to deal with enqueue expiry
1094 * themselves and are not allowed to migrate.
1095 */
1096 spin_unlock(&cpu_base->lock);
1097 restart = fn(timer);
1098 spin_lock(&cpu_base->lock);
1099 } else
1100 restart = fn(timer);
1101
1102 /*
1103 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
1104 * reprogramming of the event hardware. This happens at the end of this
1105 * function anyway.
1106 */
1107 if (restart != HRTIMER_NORESTART) {
1108 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1109 enqueue_hrtimer(timer, base, 0);
1110 }
1111 timer->state &= ~HRTIMER_STATE_CALLBACK;
1112}
1113
1033#ifdef CONFIG_HIGH_RES_TIMERS 1114#ifdef CONFIG_HIGH_RES_TIMERS
1034 1115
1035/* 1116/*
@@ -1087,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1087 continue; 1168 continue;
1088 } 1169 }
1089 1170
1090 __remove_hrtimer(timer, base, 1171 __run_hrtimer(timer);
1091 HRTIMER_STATE_CALLBACK, 0);
1092 timer_stats_account_hrtimer(timer);
1093
1094 /*
1095 * Note: We clear the CALLBACK bit after
1096 * enqueue_hrtimer to avoid reprogramming of
1097 * the event hardware. This happens at the end
1098 * of this function anyway.
1099 */
1100 if (timer->function(timer) != HRTIMER_NORESTART) {
1101 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1102 enqueue_hrtimer(timer, base, 0);
1103 }
1104 timer->state &= ~HRTIMER_STATE_CALLBACK;
1105 } 1172 }
1106 spin_unlock(&cpu_base->lock); 1173 spin_unlock(&cpu_base->lock);
1107 base++; 1174 base++;
@@ -1122,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1122 1189
1123static void run_hrtimer_softirq(struct softirq_action *h) 1190static void run_hrtimer_softirq(struct softirq_action *h)
1124{ 1191{
1125 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1192 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
1126 1193}
1127 spin_lock_irq(&cpu_base->lock);
1128
1129 while (!list_empty(&cpu_base->cb_pending)) {
1130 enum hrtimer_restart (*fn)(struct hrtimer *);
1131 struct hrtimer *timer;
1132 int restart;
1133
1134 timer = list_entry(cpu_base->cb_pending.next,
1135 struct hrtimer, cb_entry);
1136 1194
1137 timer_stats_account_hrtimer(timer); 1195#endif /* CONFIG_HIGH_RES_TIMERS */
1138 1196
1139 fn = timer->function; 1197/*
1140 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); 1198 * Called from timer softirq every jiffy, expire hrtimers:
1141 spin_unlock_irq(&cpu_base->lock); 1199 *
1200 * For HRT it's the fallback code to run the softirq in the timer
1201 * softirq context in case the hrtimer initialization failed or has
1202 * not been done yet.
1203 */
1204void hrtimer_run_pending(void)
1205{
1206 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1142 1207
1143 restart = fn(timer); 1208 if (hrtimer_hres_active())
1209 return;
1144 1210
1145 spin_lock_irq(&cpu_base->lock); 1211 /*
1212 * This _is_ ugly: We have to check in the softirq context,
1213 * whether we can switch to highres and / or nohz mode. The
1214 * clocksource switch happens in the timer interrupt with
1215 * xtime_lock held. Notification from there only sets the
1216 * check bit in the tick_oneshot code, otherwise we might
1217 * deadlock vs. xtime_lock.
1218 */
1219 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1220 hrtimer_switch_to_hres();
1146 1221
1147 timer->state &= ~HRTIMER_STATE_CALLBACK; 1222 run_hrtimer_pending(cpu_base);
1148 if (restart == HRTIMER_RESTART) {
1149 BUG_ON(hrtimer_active(timer));
1150 /*
1151 * Enqueue the timer, allow reprogramming of the event
1152 * device
1153 */
1154 enqueue_hrtimer(timer, timer->base, 1);
1155 } else if (hrtimer_active(timer)) {
1156 /*
1157 * If the timer was rearmed on another CPU, reprogram
1158 * the event device.
1159 */
1160 if (timer->base->first == &timer->node)
1161 hrtimer_reprogram(timer, timer->base);
1162 }
1163 }
1164 spin_unlock_irq(&cpu_base->lock);
1165} 1223}
1166 1224
1167#endif /* CONFIG_HIGH_RES_TIMERS */
1168
1169/* 1225/*
1170 * Expire the per base hrtimer-queue: 1226 * Called from hardirq context every jiffy
1171 */ 1227 */
1172static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, 1228static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1173 int index) 1229 int index)
@@ -1181,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1181 if (base->get_softirq_time) 1237 if (base->get_softirq_time)
1182 base->softirq_time = base->get_softirq_time(); 1238 base->softirq_time = base->get_softirq_time();
1183 1239
1184 spin_lock_irq(&cpu_base->lock); 1240 spin_lock(&cpu_base->lock);
1185 1241
1186 while ((node = base->first)) { 1242 while ((node = base->first)) {
1187 struct hrtimer *timer; 1243 struct hrtimer *timer;
1188 enum hrtimer_restart (*fn)(struct hrtimer *);
1189 int restart;
1190 1244
1191 timer = rb_entry(node, struct hrtimer, node); 1245 timer = rb_entry(node, struct hrtimer, node);
1192 if (base->softirq_time.tv64 <= timer->expires.tv64) 1246 if (base->softirq_time.tv64 <= timer->expires.tv64)
1193 break; 1247 break;
1194 1248
1195#ifdef CONFIG_HIGH_RES_TIMERS 1249 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1196 WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); 1250 __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
1197#endif 1251 list_add_tail(&timer->cb_entry,
1198 timer_stats_account_hrtimer(timer); 1252 &base->cpu_base->cb_pending);
1199 1253 continue;
1200 fn = timer->function;
1201 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1202 spin_unlock_irq(&cpu_base->lock);
1203
1204 restart = fn(timer);
1205
1206 spin_lock_irq(&cpu_base->lock);
1207
1208 timer->state &= ~HRTIMER_STATE_CALLBACK;
1209 if (restart != HRTIMER_NORESTART) {
1210 BUG_ON(hrtimer_active(timer));
1211 enqueue_hrtimer(timer, base, 0);
1212 } 1254 }
1255
1256 __run_hrtimer(timer);
1213 } 1257 }
1214 spin_unlock_irq(&cpu_base->lock); 1258 spin_unlock(&cpu_base->lock);
1215} 1259}
1216 1260
1217/*
1218 * Called from timer softirq every jiffy, expire hrtimers:
1219 *
1220 * For HRT its the fall back code to run the softirq in the timer
1221 * softirq context in case the hrtimer initialization failed or has
1222 * not been done yet.
1223 */
1224void hrtimer_run_queues(void) 1261void hrtimer_run_queues(void)
1225{ 1262{
1226 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1263 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1229,18 +1266,6 @@ void hrtimer_run_queues(void)
1229 if (hrtimer_hres_active()) 1266 if (hrtimer_hres_active())
1230 return; 1267 return;
1231 1268
1232 /*
1233 * This _is_ ugly: We have to check in the softirq context,
1234 * whether we can switch to highres and / or nohz mode. The
1235 * clocksource switch happens in the timer interrupt with
1236 * xtime_lock held. Notification from there only sets the
1237 * check bit in the tick_oneshot code, otherwise we might
1238 * deadlock vs. xtime_lock.
1239 */
1240 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1241 if (hrtimer_switch_to_hres())
1242 return;
1243
1244 hrtimer_get_softirq_time(cpu_base); 1269 hrtimer_get_softirq_time(cpu_base);
1245 1270
1246 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1271 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
@@ -1268,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1268 sl->timer.function = hrtimer_wakeup; 1293 sl->timer.function = hrtimer_wakeup;
1269 sl->task = task; 1294 sl->task = task;
1270#ifdef CONFIG_HIGH_RES_TIMERS 1295#ifdef CONFIG_HIGH_RES_TIMERS
1271 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; 1296 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1272#endif 1297#endif
1273} 1298}
1274 1299
@@ -1279,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1279 do { 1304 do {
1280 set_current_state(TASK_INTERRUPTIBLE); 1305 set_current_state(TASK_INTERRUPTIBLE);
1281 hrtimer_start(&t->timer, t->timer.expires, mode); 1306 hrtimer_start(&t->timer, t->timer.expires, mode);
1307 if (!hrtimer_active(&t->timer))
1308 t->task = NULL;
1282 1309
1283 if (likely(t->task)) 1310 if (likely(t->task))
1284 schedule(); 1311 schedule();
@@ -1389,6 +1416,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1389 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1416 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1390 cpu_base->clock_base[i].cpu_base = cpu_base; 1417 cpu_base->clock_base[i].cpu_base = cpu_base;
1391 1418
1419 INIT_LIST_HEAD(&cpu_base->cb_pending);
1392 hrtimer_init_hres(cpu_base); 1420 hrtimer_init_hres(cpu_base);
1393} 1421}
1394 1422
diff --git a/kernel/kthread.c b/kernel/kthread.c
index dcfe724300eb..0ac887882f90 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -15,6 +15,8 @@
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18#define KTHREAD_NICE_LEVEL (-5)
19
18static DEFINE_SPINLOCK(kthread_create_lock); 20static DEFINE_SPINLOCK(kthread_create_lock);
19static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
20struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create)
94 if (pid < 0) { 96 if (pid < 0) {
95 create->result = ERR_PTR(pid); 97 create->result = ERR_PTR(pid);
96 } else { 98 } else {
99 struct sched_param param = { .sched_priority = 0 };
97 wait_for_completion(&create->started); 100 wait_for_completion(&create->started);
98 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
99 create->result = find_task_by_pid(pid); 102 create->result = find_task_by_pid(pid);
100 read_unlock(&tasklist_lock); 103 read_unlock(&tasklist_lock);
104 /*
105 * root may have changed our (kthreadd's) priority or CPU mask.
106 * The kernel thread should not inherit these properties.
107 */
108 sched_setscheduler(create->result, SCHED_NORMAL, &param);
109 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
110 set_cpus_allowed(create->result, CPU_MASK_ALL);
101 } 111 }
102 complete(&create->done); 112 complete(&create->done);
103} 113}
@@ -221,7 +231,7 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 231 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 232 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 233 ignore_signals(tsk);
224 set_user_nice(tsk, -5); 234 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed(tsk, CPU_MASK_ALL); 235 set_cpus_allowed(tsk, CPU_MASK_ALL);
226 236
227 current->flags |= PF_NOFREEZE; 237 current->flags |= PF_NOFREEZE;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
new file mode 100644
index 000000000000..b4e3c85abe74
--- /dev/null
+++ b/kernel/latencytop.c
@@ -0,0 +1,239 @@
1/*
2 * latencytop.c: Latency display infrastructure
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/latencytop.h>
13#include <linux/kallsyms.h>
14#include <linux/seq_file.h>
15#include <linux/notifier.h>
16#include <linux/spinlock.h>
17#include <linux/proc_fs.h>
18#include <linux/module.h>
19#include <linux/sched.h>
20#include <linux/list.h>
21#include <linux/slab.h>
22#include <linux/stacktrace.h>
23
24static DEFINE_SPINLOCK(latency_lock);
25
26#define MAXLR 128
27static struct latency_record latency_record[MAXLR];
28
29int latencytop_enabled;
30
31void clear_all_latency_tracing(struct task_struct *p)
32{
33 unsigned long flags;
34
35 if (!latencytop_enabled)
36 return;
37
38 spin_lock_irqsave(&latency_lock, flags);
39 memset(&p->latency_record, 0, sizeof(p->latency_record));
40 p->latency_record_count = 0;
41 spin_unlock_irqrestore(&latency_lock, flags);
42}
43
44static void clear_global_latency_tracing(void)
45{
46 unsigned long flags;
47
48 spin_lock_irqsave(&latency_lock, flags);
49 memset(&latency_record, 0, sizeof(latency_record));
50 spin_unlock_irqrestore(&latency_lock, flags);
51}
52
53static void __sched
54account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
55{
56 int firstnonnull = MAXLR + 1;
57 int i;
58
59 if (!latencytop_enabled)
60 return;
61
62 /* skip kernel threads for now */
63 if (!tsk->mm)
64 return;
65
66 for (i = 0; i < MAXLR; i++) {
67 int q;
68 int same = 1;
69 /* Nothing stored: */
70 if (!latency_record[i].backtrace[0]) {
71 if (firstnonnull > i)
72 firstnonnull = i;
73 continue;
74 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
76 if (latency_record[i].backtrace[q] !=
77 lat->backtrace[q])
78 same = 0;
79 if (same && lat->backtrace[q] == 0)
80 break;
81 if (same && lat->backtrace[q] == ULONG_MAX)
82 break;
83 }
84 if (same) {
85 latency_record[i].count++;
86 latency_record[i].time += lat->time;
87 if (lat->time > latency_record[i].max)
88 latency_record[i].max = lat->time;
89 return;
90 }
91 }
92
93 i = firstnonnull;
94 if (i >= MAXLR - 1)
95 return;
96
97 /* Allocted a new one: */
98 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
99}
100
101static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
102{
103 struct stack_trace trace;
104
105 memset(&trace, 0, sizeof(trace));
106 trace.max_entries = LT_BACKTRACEDEPTH;
107 trace.entries = &lat->backtrace[0];
108 trace.skip = 0;
109 save_stack_trace_tsk(tsk, &trace);
110}
111
112void __sched
113account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
114{
115 unsigned long flags;
116 int i, q;
117 struct latency_record lat;
118
119 if (!latencytop_enabled)
120 return;
121
122 /* Long interruptible waits are generally user requested... */
123 if (inter && usecs > 5000)
124 return;
125
126 memset(&lat, 0, sizeof(lat));
127 lat.count = 1;
128 lat.time = usecs;
129 lat.max = usecs;
130 store_stacktrace(tsk, &lat);
131
132 spin_lock_irqsave(&latency_lock, flags);
133
134 account_global_scheduler_latency(tsk, &lat);
135
136 /*
137 * short term hack; if we're > 32 we stop; future we recycle:
138 */
139 tsk->latency_record_count++;
140 if (tsk->latency_record_count >= LT_SAVECOUNT)
141 goto out_unlock;
142
143 for (i = 0; i < LT_SAVECOUNT ; i++) {
144 struct latency_record *mylat;
145 int same = 1;
146 mylat = &tsk->latency_record[i];
147 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
148 if (mylat->backtrace[q] !=
149 lat.backtrace[q])
150 same = 0;
151 if (same && lat.backtrace[q] == 0)
152 break;
153 if (same && lat.backtrace[q] == ULONG_MAX)
154 break;
155 }
156 if (same) {
157 mylat->count++;
158 mylat->time += lat.time;
159 if (lat.time > mylat->max)
160 mylat->max = lat.time;
161 goto out_unlock;
162 }
163 }
164
165 /* Allocated a new one: */
166 i = tsk->latency_record_count;
167 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
168
169out_unlock:
170 spin_unlock_irqrestore(&latency_lock, flags);
171}
172
173static int lstats_show(struct seq_file *m, void *v)
174{
175 int i;
176
177 seq_puts(m, "Latency Top version : v0.1\n");
178
179 for (i = 0; i < MAXLR; i++) {
180 if (latency_record[i].backtrace[0]) {
181 int q;
182 seq_printf(m, "%i %li %li ",
183 latency_record[i].count,
184 latency_record[i].time,
185 latency_record[i].max);
186 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
187 char sym[KSYM_NAME_LEN];
188 char *c;
189 if (!latency_record[i].backtrace[q])
190 break;
191 if (latency_record[i].backtrace[q] == ULONG_MAX)
192 break;
193 sprint_symbol(sym, latency_record[i].backtrace[q]);
194 c = strchr(sym, '+');
195 if (c)
196 *c = 0;
197 seq_printf(m, "%s ", sym);
198 }
199 seq_printf(m, "\n");
200 }
201 }
202 return 0;
203}
204
205static ssize_t
206lstats_write(struct file *file, const char __user *buf, size_t count,
207 loff_t *offs)
208{
209 clear_global_latency_tracing();
210
211 return count;
212}
213
214static int lstats_open(struct inode *inode, struct file *filp)
215{
216 return single_open(filp, lstats_show, NULL);
217}
218
219static struct file_operations lstats_fops = {
220 .open = lstats_open,
221 .read = seq_read,
222 .write = lstats_write,
223 .llseek = seq_lseek,
224 .release = single_release,
225};
226
227static int __init init_lstats_procfs(void)
228{
229 struct proc_dir_entry *pe;
230
231 pe = create_proc_entry("latency_stats", 0644, NULL);
232 if (!pe)
233 return -ENOMEM;
234
235 pe->proc_fops = &lstats_fops;
236
237 return 0;
238}
239__initcall(init_lstats_procfs);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e2c07ece367d..3574379f4d62 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3206,7 +3206,11 @@ retry:
3206 3206
3207EXPORT_SYMBOL_GPL(debug_show_all_locks); 3207EXPORT_SYMBOL_GPL(debug_show_all_locks);
3208 3208
3209void debug_show_held_locks(struct task_struct *task) 3209/*
3210 * Careful: only use this function if you are sure that
3211 * the task cannot run in parallel!
3212 */
3213void __debug_show_held_locks(struct task_struct *task)
3210{ 3214{
3211 if (unlikely(!debug_locks)) { 3215 if (unlikely(!debug_locks)) {
3212 printk("INFO: lockdep is turned off.\n"); 3216 printk("INFO: lockdep is turned off.\n");
@@ -3214,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
3214 } 3218 }
3215 lockdep_print_held_locks(task); 3219 lockdep_print_held_locks(task);
3216} 3220}
3221EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3222
3223void debug_show_held_locks(struct task_struct *task)
3224{
3225 __debug_show_held_locks(task);
3226}
3217 3227
3218EXPORT_SYMBOL_GPL(debug_show_held_locks); 3228EXPORT_SYMBOL_GPL(debug_show_held_locks);
3219 3229
diff --git a/kernel/module.c b/kernel/module.c
index dcb8a2cbf75e..1bb4c5e0d56e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -496,6 +496,8 @@ static struct module_attribute modinfo_##field = { \
496MODINFO_ATTR(version); 496MODINFO_ATTR(version);
497MODINFO_ATTR(srcversion); 497MODINFO_ATTR(srcversion);
498 498
499static char last_unloaded_module[MODULE_NAME_LEN+1];
500
499#ifdef CONFIG_MODULE_UNLOAD 501#ifdef CONFIG_MODULE_UNLOAD
500/* Init the unload section of the module. */ 502/* Init the unload section of the module. */
501static void module_unload_init(struct module *mod) 503static void module_unload_init(struct module *mod)
@@ -719,6 +721,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
719 mod->exit(); 721 mod->exit();
720 mutex_lock(&module_mutex); 722 mutex_lock(&module_mutex);
721 } 723 }
724 /* Store the name of the last unloaded module for diagnostic purposes */
725 sprintf(last_unloaded_module, mod->name);
722 free_module(mod); 726 free_module(mod);
723 727
724 out: 728 out:
@@ -2357,21 +2361,30 @@ static void m_stop(struct seq_file *m, void *p)
2357 mutex_unlock(&module_mutex); 2361 mutex_unlock(&module_mutex);
2358} 2362}
2359 2363
2360static char *taint_flags(unsigned int taints, char *buf) 2364static char *module_flags(struct module *mod, char *buf)
2361{ 2365{
2362 int bx = 0; 2366 int bx = 0;
2363 2367
2364 if (taints) { 2368 if (mod->taints ||
2369 mod->state == MODULE_STATE_GOING ||
2370 mod->state == MODULE_STATE_COMING) {
2365 buf[bx++] = '('; 2371 buf[bx++] = '(';
2366 if (taints & TAINT_PROPRIETARY_MODULE) 2372 if (mod->taints & TAINT_PROPRIETARY_MODULE)
2367 buf[bx++] = 'P'; 2373 buf[bx++] = 'P';
2368 if (taints & TAINT_FORCED_MODULE) 2374 if (mod->taints & TAINT_FORCED_MODULE)
2369 buf[bx++] = 'F'; 2375 buf[bx++] = 'F';
2370 /* 2376 /*
2371 * TAINT_FORCED_RMMOD: could be added. 2377 * TAINT_FORCED_RMMOD: could be added.
2372 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 2378 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
2373 * apply to modules. 2379 * apply to modules.
2374 */ 2380 */
2381
2382 /* Show a - for module-is-being-unloaded */
2383 if (mod->state == MODULE_STATE_GOING)
2384 buf[bx++] = '-';
2385 /* Show a + for module-is-being-loaded */
2386 if (mod->state == MODULE_STATE_COMING)
2387 buf[bx++] = '+';
2375 buf[bx++] = ')'; 2388 buf[bx++] = ')';
2376 } 2389 }
2377 buf[bx] = '\0'; 2390 buf[bx] = '\0';
@@ -2398,7 +2411,7 @@ static int m_show(struct seq_file *m, void *p)
2398 2411
2399 /* Taints info */ 2412 /* Taints info */
2400 if (mod->taints) 2413 if (mod->taints)
2401 seq_printf(m, " %s", taint_flags(mod->taints, buf)); 2414 seq_printf(m, " %s", module_flags(mod, buf));
2402 2415
2403 seq_printf(m, "\n"); 2416 seq_printf(m, "\n");
2404 return 0; 2417 return 0;
@@ -2493,7 +2506,9 @@ void print_modules(void)
2493 2506
2494 printk("Modules linked in:"); 2507 printk("Modules linked in:");
2495 list_for_each_entry(mod, &modules, list) 2508 list_for_each_entry(mod, &modules, list)
2496 printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); 2509 printk(" %s%s", mod->name, module_flags(mod, buf));
2510 if (last_unloaded_module[0])
2511 printk(" [last unloaded: %s]", last_unloaded_module);
2497 printk("\n"); 2512 printk("\n");
2498} 2513}
2499 2514
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 68c96376e84a..0b7c82ac467e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk,
967{ 967{
968 int maxfire; 968 int maxfire;
969 struct list_head *timers = tsk->cpu_timers; 969 struct list_head *timers = tsk->cpu_timers;
970 struct signal_struct *const sig = tsk->signal;
970 971
971 maxfire = 20; 972 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 973 tsk->it_prof_expires = cputime_zero;
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk,
1011 t->firing = 1; 1012 t->firing = 1;
1012 list_move_tail(&t->entry, firing); 1013 list_move_tail(&t->entry, firing);
1013 } 1014 }
1015
1016 /*
1017 * Check for the special case thread timers.
1018 */
1019 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
1020 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
1021 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
1022
1023 if (hard != RLIM_INFINITY &&
1024 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
1025 /*
1026 * At the hard limit, we just die.
1027 * No need to calculate anything else now.
1028 */
1029 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1030 return;
1031 }
1032 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
1033 /*
1034 * At the soft limit, send a SIGXCPU every second.
1035 */
1036 if (sig->rlim[RLIMIT_RTTIME].rlim_cur
1037 < sig->rlim[RLIMIT_RTTIME].rlim_max) {
1038 sig->rlim[RLIMIT_RTTIME].rlim_cur +=
1039 USEC_PER_SEC;
1040 }
1041 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1042 }
1043 }
1014} 1044}
1015 1045
1016/* 1046/*
diff --git a/kernel/printk.c b/kernel/printk.c
index 89011bf8c106..423a8c765a57 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -573,11 +573,6 @@ static int __init printk_time_setup(char *str)
573 573
574__setup("time", printk_time_setup); 574__setup("time", printk_time_setup);
575 575
576__attribute__((weak)) unsigned long long printk_clock(void)
577{
578 return sched_clock();
579}
580
581/* Check if we have any console registered that can be called early in boot. */ 576/* Check if we have any console registered that can be called early in boot. */
582static int have_callable_console(void) 577static int have_callable_console(void)
583{ 578{
@@ -628,30 +623,57 @@ asmlinkage int printk(const char *fmt, ...)
628/* cpu currently holding logbuf_lock */ 623/* cpu currently holding logbuf_lock */
629static volatile unsigned int printk_cpu = UINT_MAX; 624static volatile unsigned int printk_cpu = UINT_MAX;
630 625
626const char printk_recursion_bug_msg [] =
627 KERN_CRIT "BUG: recent printk recursion!\n";
628static int printk_recursion_bug;
629
631asmlinkage int vprintk(const char *fmt, va_list args) 630asmlinkage int vprintk(const char *fmt, va_list args)
632{ 631{
632 static int log_level_unknown = 1;
633 static char printk_buf[1024];
634
633 unsigned long flags; 635 unsigned long flags;
634 int printed_len; 636 int printed_len = 0;
637 int this_cpu;
635 char *p; 638 char *p;
636 static char printk_buf[1024];
637 static int log_level_unknown = 1;
638 639
639 boot_delay_msec(); 640 boot_delay_msec();
640 641
641 preempt_disable(); 642 preempt_disable();
642 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
643 /* If a crash is occurring during printk() on this CPU,
644 * make sure we can't deadlock */
645 zap_locks();
646
647 /* This stops the holder of console_sem just where we want him */ 643 /* This stops the holder of console_sem just where we want him */
648 raw_local_irq_save(flags); 644 raw_local_irq_save(flags);
645 this_cpu = smp_processor_id();
646
647 /*
648 * Ouch, printk recursed into itself!
649 */
650 if (unlikely(printk_cpu == this_cpu)) {
651 /*
652 * If a crash is occurring during printk() on this CPU,
653 * then try to get the crash message out but make sure
654 * we can't deadlock. Otherwise just return to avoid the
655 * recursion and return - but flag the recursion so that
656 * it can be printed at the next appropriate moment:
657 */
658 if (!oops_in_progress) {
659 printk_recursion_bug = 1;
660 goto out_restore_irqs;
661 }
662 zap_locks();
663 }
664
649 lockdep_off(); 665 lockdep_off();
650 spin_lock(&logbuf_lock); 666 spin_lock(&logbuf_lock);
651 printk_cpu = smp_processor_id(); 667 printk_cpu = this_cpu;
652 668
669 if (printk_recursion_bug) {
670 printk_recursion_bug = 0;
671 strcpy(printk_buf, printk_recursion_bug_msg);
672 printed_len = sizeof(printk_recursion_bug_msg);
673 }
653 /* Emit the output into the temporary buffer */ 674 /* Emit the output into the temporary buffer */
654 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 675 printed_len += vscnprintf(printk_buf + printed_len,
676 sizeof(printk_buf), fmt, args);
655 677
656 /* 678 /*
657 * Copy the output into log_buf. If the caller didn't provide 679 * Copy the output into log_buf. If the caller didn't provide
@@ -680,7 +702,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
680 loglev_char = default_message_loglevel 702 loglev_char = default_message_loglevel
681 + '0'; 703 + '0';
682 } 704 }
683 t = printk_clock(); 705 t = 0;
706 if (system_state != SYSTEM_BOOTING)
707 t = ktime_to_ns(ktime_get());
684 nanosec_rem = do_div(t, 1000000000); 708 nanosec_rem = do_div(t, 1000000000);
685 tlen = sprintf(tbuf, 709 tlen = sprintf(tbuf,
686 "<%c>[%5lu.%06lu] ", 710 "<%c>[%5lu.%06lu] ",
@@ -744,6 +768,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
744 printk_cpu = UINT_MAX; 768 printk_cpu = UINT_MAX;
745 spin_unlock(&logbuf_lock); 769 spin_unlock(&logbuf_lock);
746 lockdep_on(); 770 lockdep_on();
771out_restore_irqs:
747 raw_local_irq_restore(flags); 772 raw_local_irq_restore(flags);
748 } 773 }
749 774
diff --git a/kernel/profile.c b/kernel/profile.c
index 5e95330e5120..e64c2da11c0f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
52static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
53#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
54 54
55static int __init profile_setup(char * str) 55static int __init profile_setup(char *str)
56{ 56{
57 static char __initdata schedstr[] = "schedule"; 57 static char __initdata schedstr[] = "schedule";
58 static char __initdata sleepstr[] = "sleep"; 58 static char __initdata sleepstr[] = "sleep";
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup);
104 104
105void __init profile_init(void) 105void __init profile_init(void)
106{ 106{
107 if (!prof_on) 107 if (!prof_on)
108 return; 108 return;
109 109
110 /* only text is profiled */ 110 /* only text is profiled */
111 prof_len = (_etext - _stext) >> prof_shift; 111 prof_len = (_etext - _stext) >> prof_shift;
112 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 112 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
113} 113}
114 114
115/* Profile event notifications */ 115/* Profile event notifications */
116 116
117#ifdef CONFIG_PROFILING 117#ifdef CONFIG_PROFILING
118 118
119static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); 119static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
120static ATOMIC_NOTIFIER_HEAD(task_free_notifier); 120static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
121static BLOCKING_NOTIFIER_HEAD(munmap_notifier); 121static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
122 122
123void profile_task_exit(struct task_struct * task) 123void profile_task_exit(struct task_struct *task)
124{ 124{
125 blocking_notifier_call_chain(&task_exit_notifier, 0, task); 125 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
126} 126}
127 127
128int profile_handoff_task(struct task_struct * task) 128int profile_handoff_task(struct task_struct *task)
129{ 129{
130 int ret; 130 int ret;
131 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); 131 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr)
137 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); 137 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
138} 138}
139 139
140int task_handoff_register(struct notifier_block * n) 140int task_handoff_register(struct notifier_block *n)
141{ 141{
142 return atomic_notifier_chain_register(&task_free_notifier, n); 142 return atomic_notifier_chain_register(&task_free_notifier, n);
143} 143}
144EXPORT_SYMBOL_GPL(task_handoff_register);
144 145
145int task_handoff_unregister(struct notifier_block * n) 146int task_handoff_unregister(struct notifier_block *n)
146{ 147{
147 return atomic_notifier_chain_unregister(&task_free_notifier, n); 148 return atomic_notifier_chain_unregister(&task_free_notifier, n);
148} 149}
150EXPORT_SYMBOL_GPL(task_handoff_unregister);
149 151
150int profile_event_register(enum profile_type type, struct notifier_block * n) 152int profile_event_register(enum profile_type type, struct notifier_block *n)
151{ 153{
152 int err = -EINVAL; 154 int err = -EINVAL;
153 155
154 switch (type) { 156 switch (type) {
155 case PROFILE_TASK_EXIT: 157 case PROFILE_TASK_EXIT:
156 err = blocking_notifier_chain_register( 158 err = blocking_notifier_chain_register(
157 &task_exit_notifier, n); 159 &task_exit_notifier, n);
158 break; 160 break;
159 case PROFILE_MUNMAP: 161 case PROFILE_MUNMAP:
160 err = blocking_notifier_chain_register( 162 err = blocking_notifier_chain_register(
161 &munmap_notifier, n); 163 &munmap_notifier, n);
162 break; 164 break;
163 } 165 }
164 166
165 return err; 167 return err;
166} 168}
169EXPORT_SYMBOL_GPL(profile_event_register);
167 170
168 171int profile_event_unregister(enum profile_type type, struct notifier_block *n)
169int profile_event_unregister(enum profile_type type, struct notifier_block * n)
170{ 172{
171 int err = -EINVAL; 173 int err = -EINVAL;
172 174
173 switch (type) { 175 switch (type) {
174 case PROFILE_TASK_EXIT: 176 case PROFILE_TASK_EXIT:
175 err = blocking_notifier_chain_unregister( 177 err = blocking_notifier_chain_unregister(
176 &task_exit_notifier, n); 178 &task_exit_notifier, n);
177 break; 179 break;
178 case PROFILE_MUNMAP: 180 case PROFILE_MUNMAP:
179 err = blocking_notifier_chain_unregister( 181 err = blocking_notifier_chain_unregister(
180 &munmap_notifier, n); 182 &munmap_notifier, n);
181 break; 183 break;
182 } 184 }
183 185
184 return err; 186 return err;
185} 187}
188EXPORT_SYMBOL_GPL(profile_event_unregister);
186 189
187int register_timer_hook(int (*hook)(struct pt_regs *)) 190int register_timer_hook(int (*hook)(struct pt_regs *))
188{ 191{
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *))
191 timer_hook = hook; 194 timer_hook = hook;
192 return 0; 195 return 0;
193} 196}
197EXPORT_SYMBOL_GPL(register_timer_hook);
194 198
195void unregister_timer_hook(int (*hook)(struct pt_regs *)) 199void unregister_timer_hook(int (*hook)(struct pt_regs *))
196{ 200{
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
199 /* make sure all CPUs see the NULL hook */ 203 /* make sure all CPUs see the NULL hook */
200 synchronize_sched(); /* Allow ongoing interrupts to complete. */ 204 synchronize_sched(); /* Allow ongoing interrupts to complete. */
201} 205}
202
203EXPORT_SYMBOL_GPL(register_timer_hook);
204EXPORT_SYMBOL_GPL(unregister_timer_hook); 206EXPORT_SYMBOL_GPL(unregister_timer_hook);
205EXPORT_SYMBOL_GPL(task_handoff_register);
206EXPORT_SYMBOL_GPL(task_handoff_unregister);
207EXPORT_SYMBOL_GPL(profile_event_register);
208EXPORT_SYMBOL_GPL(profile_event_unregister);
209 207
210#endif /* CONFIG_PROFILING */ 208#endif /* CONFIG_PROFILING */
211 209
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
366 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); 364 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
367 } 365 }
368 break; 366 break;
369 out_free: 367out_free:
370 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 368 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
371 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 369 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
372 __free_page(page); 370 __free_page(page);
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits)
409 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 407 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
410} 408}
411#endif /* !CONFIG_SMP */ 409#endif /* !CONFIG_SMP */
412
413EXPORT_SYMBOL_GPL(profile_hits); 410EXPORT_SYMBOL_GPL(profile_hits);
414 411
415void profile_tick(int type) 412void profile_tick(int type)
@@ -427,7 +424,7 @@ void profile_tick(int type)
427#include <asm/uaccess.h> 424#include <asm/uaccess.h>
428#include <asm/ptrace.h> 425#include <asm/ptrace.h>
429 426
430static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, 427static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
431 int count, int *eof, void *data) 428 int count, int *eof, void *data)
432{ 429{
433 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); 430 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
437 return len; 434 return len;
438} 435}
439 436
440static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, 437static int prof_cpu_mask_write_proc(struct file *file,
441 unsigned long count, void *data) 438 const char __user *buffer, unsigned long count, void *data)
442{ 439{
443 cpumask_t *mask = (cpumask_t *)data; 440 cpumask_t *mask = (cpumask_t *)data;
444 unsigned long full_count = count, err; 441 unsigned long full_count = count, err;
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
457 struct proc_dir_entry *entry; 454 struct proc_dir_entry *entry;
458 455
459 /* create /proc/irq/prof_cpu_mask */ 456 /* create /proc/irq/prof_cpu_mask */
460 if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) 457 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
458 if (!entry)
461 return; 459 return;
462 entry->data = (void *)&prof_cpu_mask; 460 entry->data = (void *)&prof_cpu_mask;
463 entry->read_proc = prof_cpu_mask_read_proc; 461 entry->read_proc = prof_cpu_mask_read_proc;
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
475{ 473{
476 unsigned long p = *ppos; 474 unsigned long p = *ppos;
477 ssize_t read; 475 ssize_t read;
478 char * pnt; 476 char *pnt;
479 unsigned int sample_step = 1 << prof_shift; 477 unsigned int sample_step = 1 << prof_shift;
480 478
481 profile_flip_buffers(); 479 profile_flip_buffers();
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
486 read = 0; 484 read = 0;
487 485
488 while (p < sizeof(unsigned int) && count > 0) { 486 while (p < sizeof(unsigned int) && count > 0) {
489 if (put_user(*((char *)(&sample_step)+p),buf)) 487 if (put_user(*((char *)(&sample_step)+p), buf))
490 return -EFAULT; 488 return -EFAULT;
491 buf++; p++; count--; read++; 489 buf++; p++; count--; read++;
492 } 490 }
493 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 491 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
494 if (copy_to_user(buf,(void *)pnt,count)) 492 if (copy_to_user(buf, (void *)pnt, count))
495 return -EFAULT; 493 return -EFAULT;
496 read += count; 494 read += count;
497 *ppos += read; 495 *ppos += read;
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
508 size_t count, loff_t *ppos) 506 size_t count, loff_t *ppos)
509{ 507{
510#ifdef CONFIG_SMP 508#ifdef CONFIG_SMP
511 extern int setup_profiling_timer (unsigned int multiplier); 509 extern int setup_profiling_timer(unsigned int multiplier);
512 510
513 if (count == sizeof(int)) { 511 if (count == sizeof(int)) {
514 unsigned int multiplier; 512 unsigned int multiplier;
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void)
591 return 0; 589 return 0;
592 if (create_hash_tables()) 590 if (create_hash_tables())
593 return -1; 591 return -1;
594 if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) 592 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
593 if (!entry)
595 return 0; 594 return 0;
596 entry->proc_fops = &proc_profile_operations; 595 entry->proc_fops = &proc_profile_operations;
597 entry->size = (1+prof_len) * sizeof(atomic_t); 596 entry->size = (1+prof_len) * sizeof(atomic_t);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
new file mode 100644
index 000000000000..f4ffbd0f306f
--- /dev/null
+++ b/kernel/rcuclassic.c
@@ -0,0 +1,575 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50
51#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Lockdep map named "rcu_read_lock". It is only built when
 * CONFIG_DEBUG_LOCK_ALLOC is set and is exported GPL-only; its users
 * are not visible in this file.
 */
52static struct lock_class_key rcu_lock_key;
53struct lockdep_map rcu_lock_map =
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55EXPORT_SYMBOL_GPL(rcu_lock_map);
56#endif
57
58
59/*
 * Definition for rcupdate control block.
 *
 * There is one control block per callback flavour: rcu_ctrlblk is used
 * together with the per-cpu rcu_data (call_rcu()), rcu_bh_ctrlblk
 * together with rcu_bh_data (call_rcu_bh()). Both start with
 * .cur == .completed and an empty cpumask, which is the state
 * rcu_start_batch() requires before it opens a new batch.
 * NOTE(review): the reason for the start value -300 is not visible
 * here; presumably it makes the counters cross zero soon after boot --
 * confirm.
 */
60static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300,
62 .completed = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE,
65};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300,
68 .completed = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE,
71};
72
/*
 * Per-cpu callback state, one instance per flavour. The fields are set
 * up by rcu_init_percpu_data() when a cpu is prepared for coming online.
 */
73DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
74DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
75
/*
 * Tunables, also registered through module_param() further down:
 *  blimit   - default cap on the callbacks invoked by one rcu_do_batch()
 *             pass (copied into rdp->blimit).
 *  qhimark  - once a cpu's qlen exceeds this, call_rcu()/call_rcu_bh()
 *             lift the cap (rdp->blimit = INT_MAX) and force a
 *             quiescent state.
 *  qlowmark - once qlen has dropped to this or below, rcu_do_batch()
 *             restores the cap to blimit.
 */
76static int blimit = 10;
77static int qhimark = 10000;
78static int qlowmark = 100;
79
80#ifdef CONFIG_SMP
/*
 * Push the current batch towards completion: request a reschedule on
 * this cpu and, the first time this is called for a batch, send a
 * reschedule IPI to every other cpu still set in rcp->cpumask.
 * rcp->signaled limits the IPIs to one round per batch; it is cleared
 * again by rcu_start_batch().
 *
 * @rdp: per-cpu data of the calling cpu (rdp->cpu is excluded from the IPIs)
 * @rcp: control block of the flavour being forced
 *
 * The callers in this file (call_rcu(), call_rcu_bh()) invoke this with
 * local interrupts disabled.
 */
81static void force_quiescent_state(struct rcu_data *rdp,
82 struct rcu_ctrlblk *rcp)
83{
84 int cpu;
85 cpumask_t cpumask;
86 set_need_resched();
87 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1;
89 /*
90 * Don't send IPI to itself. With irqs disabled,
91 * rdp->cpu is the current cpu.
92 */
 /* Work on a local copy so that rcp->cpumask itself is not modified. */
93 cpumask = rcp->cpumask;
94 cpu_clear(rdp->cpu, cpumask);
95 for_each_cpu_mask(cpu, cpumask)
96 smp_send_reschedule(cpu);
97 }
98}
99#else
/* Without CONFIG_SMP there is no other cpu to signal: only request a reschedule. */
100static inline void force_quiescent_state(struct rcu_data *rdp,
101 struct rcu_ctrlblk *rcp)
102{
103 set_need_resched();
104}
105#endif
106
107/**
108 * call_rcu - Queue an RCU callback for invocation after a grace period.
109 * @head: structure to be used for queueing the RCU updates.
110 * @func: actual update function to be invoked after the grace period
111 *
112 * The update function will be invoked some time after a full grace
113 * period elapses, in other words after all currently executing RCU
114 * read-side critical sections have completed. RCU read-side critical
115 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
116 * and may be nested.
 *
 * The callback is appended to the tail of the calling cpu's rcu_data
 * "nxt" list with local interrupts disabled; the previous interrupt
 * state is restored before returning. If the cpu's queue length then
 * exceeds qhimark, the per-pass callback limit is lifted and
 * force_quiescent_state() is called.
117 */
118void call_rcu(struct rcu_head *head,
119 void (*func)(struct rcu_head *rcu))
120{
121 unsigned long flags;
122 struct rcu_data *rdp;
123
124 head->func = func;
125 head->next = NULL;
126 local_irq_save(flags);
127 rdp = &__get_cpu_var(rcu_data);
 /* nxttail points at the last ->next slot (or at nxtlist when empty). */
128 *rdp->nxttail = head;
129 rdp->nxttail = &head->next;
130 if (unlikely(++rdp->qlen > qhimark)) {
 /* Too many callbacks queued: let rcu_do_batch() run without a cap. */
131 rdp->blimit = INT_MAX;
132 force_quiescent_state(rdp, &rcu_ctrlblk);
133 }
134 local_irq_restore(flags);
135}
136EXPORT_SYMBOL_GPL(call_rcu);
137
138/**
139 * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period.
140 * @head: structure to be used for queueing the RCU updates.
141 * @func: actual update function to be invoked after the grace period
142 *
143 * The update function will be invoked some time after a full grace
144 * period elapses, in other words after all currently executing RCU
145 * read-side critical sections have completed. call_rcu_bh() assumes
146 * that the read-side critical sections end on completion of a softirq
147 * handler. This means that read-side critical sections in process
148 * context must not be interrupted by softirqs. This interface is to be
149 * used when most of the read-side critical sections are in softirq context.
150 * RCU read-side critical sections are delimited by rcu_read_lock() and
151 * rcu_read_unlock(), if in interrupt context or rcu_read_lock_bh()
152 * and rcu_read_unlock_bh(), if in process context. These may be nested.
 *
 * Same queueing as call_rcu(), but on the calling cpu's rcu_bh_data list
 * and against rcu_bh_ctrlblk.
153 */
154void call_rcu_bh(struct rcu_head *head,
155 void (*func)(struct rcu_head *rcu))
156{
157 unsigned long flags;
158 struct rcu_data *rdp;
159
160 head->func = func;
161 head->next = NULL;
162 local_irq_save(flags);
163 rdp = &__get_cpu_var(rcu_bh_data);
 /* Append at the tail of this cpu's "nxt" list. */
164 *rdp->nxttail = head;
165 rdp->nxttail = &head->next;
166
167 if (unlikely(++rdp->qlen > qhimark)) {
 /* Queue grew past qhimark: lift the per-pass cap and force progress. */
168 rdp->blimit = INT_MAX;
169 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
170 }
171
172 local_irq_restore(flags);
173}
174EXPORT_SYMBOL_GPL(call_rcu_bh);
175
176/*
177 * Return the number of RCU batches processed thus far. Useful
178 * for debug and statistics.
 *
 * The value returned is rcu_ctrlblk.completed, read without taking
 * rcu_ctrlblk.lock. That counter is initialised to -300 in this file,
 * so it is a batch number rather than a count starting from zero.
179 */
180long rcu_batches_completed(void)
181{
182 return rcu_ctrlblk.completed;
183}
184EXPORT_SYMBOL_GPL(rcu_batches_completed);
185
186/*
187 * Return the number of RCU batches processed thus far. Useful
188 * for debug and statistics.
 *
 * This is the "bh" flavour: the value returned is
 * rcu_bh_ctrlblk.completed, read without taking the control block lock.
 * That counter is initialised to -300 in this file, so it is a batch
 * number rather than a count starting from zero.
189 */
190long rcu_batches_completed_bh(void)
191{
192 return rcu_bh_ctrlblk.completed;
193}
194EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
195
196/*
 * Raises the softirq for processing rcu_callbacks.
 *
 * RCU_SOFTIRQ is bound to rcu_process_callbacks() in rcu_online_cpu().
 * Called from rcu_do_batch() when callbacks are left over and from
 * rcu_check_callbacks().
 */
197static inline void raise_rcu_softirq(void)
198{
199 raise_softirq(RCU_SOFTIRQ);
200 /*
201 * The smp_mb() here is required to ensure that this cpu's
202 * __rcu_process_callbacks() reads the most recently updated
203 * value of rcu->cur.
204 */
205 smp_mb();
206}
207
208/*
209 * Invoke the completed RCU callbacks. They are expected to be in
210 * a per-cpu list.
 *
 * At most rdp->blimit callbacks from rdp->donelist are invoked per call.
 * If some are left over, the softirq is raised again so that the rest
 * is handled in a later pass.
 *
 * @rdp: per-cpu data whose donelist is to be processed
211 */
212static void rcu_do_batch(struct rcu_data *rdp)
213{
214 struct rcu_head *next, *list;
215 int count = 0;
216
217 list = rdp->donelist;
218 while (list) {
 /*
  * Fetch ->next before calling the callback.
  * NOTE(review): presumably because the callback may free the
  * rcu_head it is passed -- confirm.
  */
219 next = list->next;
220 prefetch(next);
221 list->func(list);
222 list = next;
223 if (++count >= rdp->blimit)
224 break;
225 }
226 rdp->donelist = list;
227
 /* qlen is also incremented by call_rcu()/call_rcu_bh() with irqs off. */
228 local_irq_disable();
229 rdp->qlen -= count;
230 local_irq_enable();
 /* Backlog drained far enough: go back to the normal per-pass cap. */
231 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
232 rdp->blimit = blimit;
233
234 if (!rdp->donelist)
 /* List is empty: point the tail back at the list head. */
235 rdp->donetail = &rdp->donelist;
236 else
237 raise_rcu_softirq();
238}
239
240/*
241 * Grace period handling:
242 * The grace period handling consists of two steps:
243 * - A new grace period is started.
244 * This is done by rcu_start_batch. The start is not broadcast to
245 * all cpus, they must pick this up by comparing rcp->cur with
246 * rdp->quiescbatch. All online cpus that are not in nohz_cpu_mask
247 * are recorded in the rcu_ctrlblk.cpumask bitmap.
248 * - All cpus must go through a quiescent state.
249 * Since the start of the grace period is not broadcast, at least two
250 * calls to rcu_check_quiescent_state are required:
251 * The first call just notices that a new grace period is running. The
252 * following calls check if there was a quiescent state since the beginning
253 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
254 * the bitmap is empty, then the grace period is completed.
255 * rcu_check_quiescent_state calls rcu_start_batch() (through cpu_quiet())
256 * to start the next grace period (if necessary).
257 */
258/*
259 * Register a new batch of callbacks, and start it up if there is currently no
260 * active batch and the batch to be registered has not already occurred.
261 * Caller must hold rcu_ctrlblk.lock.
 *
 * Nothing happens unless rcp->next_pending is set and the previous batch
 * has completed (rcp->completed == rcp->cur). Otherwise rcp->cur is
 * advanced, rcp->cpumask is reloaded with the cpus that still have to
 * report a quiescent state, and rcp->signaled is cleared so that
 * force_quiescent_state() may send IPIs again for the new batch.
262 */
263static void rcu_start_batch(struct rcu_ctrlblk *rcp)
264{
265 if (rcp->next_pending &&
266 rcp->completed == rcp->cur) {
267 rcp->next_pending = 0;
268 /*
269 * next_pending == 0 must be visible in
270 * __rcu_process_callbacks() before it can see new value of cur.
271 */
272 smp_wmb();
273 rcp->cur++;
274
275 /*
276 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
277 * barrier. Otherwise it can cause tickless idle CPUs to be
278 * included in rcp->cpumask, which will extend grace periods
279 * unnecessarily.
280 */
281 smp_mb();
 /* Wait for every online cpu that is not in nohz_cpu_mask. */
282 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
283
284 rcp->signaled = 0;
285 }
286}
287
288/*
289 * cpu went through a quiescent state since the beginning of the grace period.
290 * Clear it from the cpu mask and complete the grace period if it was the last
291 * cpu. Start another grace period if someone has further entries pending
 *
 * @cpu: cpu to remove from rcp->cpumask
 * @rcp: control block of the flavour concerned
 *
 * Both callers in this file (rcu_check_quiescent_state() and
 * __rcu_offline_cpu()) hold rcp->lock around the call.
292 */
293static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
294{
295 cpu_clear(cpu, rcp->cpumask);
296 if (cpus_empty(rcp->cpumask)) {
297 /* batch completed ! */
298 rcp->completed = rcp->cur;
 /* Only opens a new batch if rcp->next_pending is set. */
299 rcu_start_batch(rcp);
300 }
301}
302
303/*
304 * Check if the cpu has gone through a quiescent state (say context
305 * switch). If so and if it already hasn't done so in this RCU
306 * quiescent cycle, then indicate that it has done so.
 *
 * @rcp: control block of the flavour being checked
 * @rdp: per-cpu data of that flavour
 *
 * rdp->passed_quiesc is only cleared here; it is set elsewhere.
 * NOTE(review): presumably by rcu_qsctr_inc()/rcu_bh_qsctr_inc(), which
 * are not defined in this file -- confirm.
307 */
308static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
309 struct rcu_data *rdp)
310{
311 if (rdp->quiescbatch != rcp->cur) {
312 /* start new grace period: */
 /* This cpu has just noticed the new batch; arm the quiescent-state check. */
313 rdp->qs_pending = 1;
314 rdp->passed_quiesc = 0;
315 rdp->quiescbatch = rcp->cur;
316 return;
317 }
318
319 /* Grace period already completed for this cpu?
320 * qs_pending is checked instead of the actual bitmap to avoid
321 * cacheline thrashing.
322 */
323 if (!rdp->qs_pending)
324 return;
325
326 /*
327 * Was there a quiescent state since the beginning of the grace
328 * period? If no, then exit and wait for the next call.
329 */
330 if (!rdp->passed_quiesc)
331 return;
332 rdp->qs_pending = 0;
333
 /* The shared cpumask is only modified under rcp->lock. */
334 spin_lock(&rcp->lock);
335 /*
336 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
337 * during cpu startup. Ignore the quiescent state.
338 */
339 if (likely(rdp->quiescbatch == rcp->cur))
340 cpu_quiet(rdp->cpu, rcp);
341
342 spin_unlock(&rcp->lock);
343}
344
345
346#ifdef CONFIG_HOTPLUG_CPU
347
348/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
349 * locking requirements, the list it's pulling from has to belong to a cpu
350 * which is dead and hence not processing interrupts.
 *
 * Appends @list to the end of this_rdp's "nxt" list. Interrupts are
 * disabled around the splice and unconditionally re-enabled afterwards.
 *
 * @this_rdp: per-cpu data receiving the callbacks
 * @list: first callback of the list to move, NULL if that list is empty
 * @tail: tail pointer of the list being moved; it becomes the new
 *        nxttail of this_rdp when @list is not empty
351 */
352static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
353 struct rcu_head **tail)
354{
355 local_irq_disable();
356 *this_rdp->nxttail = list;
 /* An empty source list leaves the destination tail untouched. */
357 if (list)
358 this_rdp->nxttail = tail;
359 local_irq_enable();
360}
361
/*
 * Handle one flavour for a cpu that has gone offline: make sure the
 * current batch no longer waits for that cpu, then move all of its
 * callbacks to the cpu running this function.
 *
 * @this_rdp: per-cpu data of the cpu doing the cleanup
 * @rcp: control block of the flavour
 * @rdp: per-cpu data of the cpu that went offline
 */
362static void __rcu_offline_cpu(struct rcu_data *this_rdp,
363 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
364{
365 /* if the cpu going offline owns the grace period
366 * we can block indefinitely waiting for it, so flush
367 * it here
368 */
369 spin_lock_bh(&rcp->lock);
 /* cur != completed: a batch is in flight and may be waiting on this cpu. */
370 if (rcp->cur != rcp->completed)
371 cpu_quiet(rdp->cpu, rcp);
372 spin_unlock_bh(&rcp->lock);
 /*
  * All three lists are appended to this_rdp's "nxt" list, in the order
  * done, cur, nxt.
  */
373 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
374 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
375 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
376}
377
/*
 * Called from rcu_cpu_notify() for CPU_DEAD/CPU_DEAD_FROZEN: hand both
 * flavours of the offlined cpu's callbacks over to the cpu running this
 * function.
 *
 * @cpu: the cpu that went offline
 */
378static void rcu_offline_cpu(int cpu)
379{
 /* Each get_cpu_var() here is balanced by a put_cpu_var() below. */
380 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
381 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
382
383 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
384 &per_cpu(rcu_data, cpu));
385 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
386 &per_cpu(rcu_bh_data, cpu));
387 put_cpu_var(rcu_data);
388 put_cpu_var(rcu_bh_data);
389}
390
391#else
392
/* Without CONFIG_HOTPLUG_CPU there is nothing to clean up. */
393static void rcu_offline_cpu(int cpu)
394{
395}
396
397#endif
398
399/*
400 * This does the RCU processing work from softirq context.
 *
 * For one flavour it performs, in order:
 *  1. move the "cur" list to the "done" list if the batch it waits for
 *     has completed,
 *  2. promote the "nxt" list to "cur", assign it a batch number and ask
 *     for that batch to be started,
 *  3. report a quiescent state for this cpu if one is due,
 *  4. invoke callbacks from the "done" list.
 *
 * @rcp: control block of the flavour
 * @rdp: this cpu's per-cpu data for that flavour
401 */
402static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
403 struct rcu_data *rdp)
404{
 /*
  * NOTE(review): rcu_batch_before() is defined elsewhere; this reads as
  * "rcp->completed has reached rdp->batch" -- confirm.
  */
405 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
406 *rdp->donetail = rdp->curlist;
407 rdp->donetail = rdp->curtail;
408 rdp->curlist = NULL;
409 rdp->curtail = &rdp->curlist;
410 }
411
412 if (rdp->nxtlist && !rdp->curlist) {
 /* call_rcu()/call_rcu_bh() append to the nxt list with irqs off. */
413 local_irq_disable();
414 rdp->curlist = rdp->nxtlist;
415 rdp->curtail = rdp->nxttail;
416 rdp->nxtlist = NULL;
417 rdp->nxttail = &rdp->nxtlist;
418 local_irq_enable();
419
420 /*
421 * start the next batch of callbacks
422 */
423
424 /* determine batch number */
425 rdp->batch = rcp->cur + 1;
426 /* see the comment and corresponding wmb() in
427 * the rcu_start_batch()
428 */
429 smp_rmb();
430
 /* Unlocked check first; the lock is only taken when a request is needed. */
431 if (!rcp->next_pending) {
432 /* and start it/schedule start if it's a new batch */
433 spin_lock(&rcp->lock);
434 rcp->next_pending = 1;
435 rcu_start_batch(rcp);
436 spin_unlock(&rcp->lock);
437 }
438 }
439
440 rcu_check_quiescent_state(rcp, rdp);
441 if (rdp->donelist)
442 rcu_do_batch(rdp);
443}
444
/*
 * RCU_SOFTIRQ handler (registered in rcu_online_cpu()): process both
 * flavours for the cpu it runs on. The softirq_action argument is unused.
 */
445static void rcu_process_callbacks(struct softirq_action *unused)
446{
447 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
448 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
449}
450
/*
 * Tell whether __rcu_process_callbacks() would have work to do for the
 * given flavour and per-cpu data. The fields are read without locking.
 *
 * Returns 1 if any of the conditions below holds, 0 otherwise.
 */
451static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
452{
453 /* This cpu has pending rcu entries and the grace period
454 * for them has completed.
455 */
456 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
457 return 1;
458
459 /* This cpu has no pending entries, but there are new entries */
460 if (!rdp->curlist && rdp->nxtlist)
461 return 1;
462
463 /* This cpu has finished callbacks to invoke */
464 if (rdp->donelist)
465 return 1;
466
467 /* The rcu core waits for a quiescent state from the cpu */
468 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
469 return 1;
470
471 /* nothing to do */
472 return 0;
473}
474
475/*
476 * Check to see if there is any immediate RCU-related work to be done
477 * by the current CPU, returning 1 if so. This function is part of the
478 * RCU implementation; it is -not- an exported member of the RCU API.
 *
 * @cpu: cpu whose per-cpu data is inspected (via per_cpu(), so the code
 *       itself does not require it to be the calling cpu)
 *
 * Both flavours are checked; the result is the logical OR.
479 */
480int rcu_pending(int cpu)
481{
482 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
483 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
484}
485
486/*
487 * Check to see if any future RCU-related work will need to be done
488 * by the current CPU, even if none need be done immediately, returning
489 * 1 if so. This function is part of the RCU implementation; it is -not-
490 * an exported member of the RCU API.
 *
 * @cpu: cpu whose per-cpu data is inspected
 *
 * "Future work" here means a non-empty "cur" list in either flavour,
 * i.e. callbacks still waiting for their batch; anything rcu_pending()
 * reports counts as well.
491 */
492int rcu_needs_cpu(int cpu)
493{
494 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
495 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
496
497 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
498}
499
/*
 * Note quiescent states for @cpu where the current context allows it,
 * then raise RCU_SOFTIRQ so that the callbacks get processed.
 *
 * @cpu: cpu to account the quiescent state to
 * @user: non-zero is treated as a quiescent state for both flavours
 *
 * NOTE(review): the caller is not visible in this file; presumably the
 * periodic timer tick, with @user telling whether user mode was
 * interrupted -- confirm. rcu_qsctr_inc()/rcu_bh_qsctr_inc() are defined
 * elsewhere.
 */
500void rcu_check_callbacks(int cpu, int user)
501{
 /*
  * Both flavours: @user is set, or the cpu is idle, not in softirq and
  * hardirq_count() shows at most one hardirq level.
  */
502 if (user ||
503 (idle_cpu(cpu) && !in_softirq() &&
504 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
505 rcu_qsctr_inc(cpu);
506 rcu_bh_qsctr_inc(cpu);
507 } else if (!in_softirq())
 /* Not in softirq: counts for the bh flavour only. */
508 rcu_bh_qsctr_inc(cpu);
509 raise_rcu_softirq();
510}
511
/*
 * Reset one flavour's per-cpu data for @cpu: zero the structure, point
 * the three tail pointers at their (empty) list heads, mark the cpu as
 * being in step with the last completed batch and install the default
 * callback limit.
 *
 * @cpu: cpu number stored in rdp->cpu
 * @rcp: control block of the flavour
 * @rdp: per-cpu data to initialise
 */
512static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
513 struct rcu_data *rdp)
514{
515 memset(rdp, 0, sizeof(*rdp));
516 rdp->curtail = &rdp->curlist;
517 rdp->nxttail = &rdp->nxtlist;
518 rdp->donetail = &rdp->donelist;
519 rdp->quiescbatch = rcp->completed;
520 rdp->qs_pending = 0;
521 rdp->cpu = cpu;
522 rdp->blimit = blimit;
523}
524
/*
 * Prepare @cpu for RCU: initialise the per-cpu data of both flavours and
 * register rcu_process_callbacks() as the RCU_SOFTIRQ handler. The
 * open_softirq() call is repeated each time this function runs.
 */
525static void __cpuinit rcu_online_cpu(int cpu)
526{
527 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
528 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
529
530 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
531 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
532 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
533}
534
/*
 * CPU hotplug notifier callback.
 *
 * @self: notifier block (unused)
 * @action: hotplug event
 * @hcpu: cpu number, passed as a pointer-sized integer
 *
 * CPU_UP_PREPARE[_FROZEN] initialises the cpu's RCU data,
 * CPU_DEAD[_FROZEN] migrates its callbacks away; every other event is
 * ignored. Always returns NOTIFY_OK.
 */
535static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
536 unsigned long action, void *hcpu)
537{
538 long cpu = (long)hcpu;
539
540 switch (action) {
541 case CPU_UP_PREPARE:
542 case CPU_UP_PREPARE_FROZEN:
543 rcu_online_cpu(cpu);
544 break;
545 case CPU_DEAD:
546 case CPU_DEAD_FROZEN:
547 rcu_offline_cpu(cpu);
548 break;
549 default:
550 break;
551 }
552 return NOTIFY_OK;
553}
554
/* Notifier block registered by __rcu_init() to receive cpu hotplug events. */
555static struct notifier_block __cpuinitdata rcu_nb = {
556 .notifier_call = rcu_cpu_notify,
557};
558
559/*
560 * Initializes rcu mechanism. Assumed to be called early.
561 * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
562 * Note that rcu_qsctr and friends are implicitly
563 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
 *
 * The cpu running this function is set up by calling the notifier
 * callback directly with CPU_UP_PREPARE; all other cpus are handled
 * through the registered notifier.
564 */
565void __init __rcu_init(void)
566{
567 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
568 (void *)(long)smp_processor_id());
569 /* Register notifier for non-boot CPUs */
570 register_cpu_notifier(&rcu_nb);
571}
572
/*
 * Register the three tunables as int parameters; the permission
 * argument passed to module_param() is 0.
 */
573module_param(blimit, int, 0);
574module_param(qhimark, int, 0);
575module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f2c1a04e9b18..760dfc233a00 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -15,7 +15,7 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2001 18 * Copyright IBM Corporation, 2001
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
@@ -35,165 +35,57 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/spinlock.h> 36#include <linux/spinlock.h>
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h> 38#include <linux/interrupt.h>
40#include <linux/sched.h> 39#include <linux/sched.h>
41#include <asm/atomic.h> 40#include <asm/atomic.h>
42#include <linux/bitops.h> 41#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h> 42#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h> 43#include <linux/percpu.h>
47#include <linux/notifier.h> 44#include <linux/notifier.h>
48#include <linux/cpu.h> 45#include <linux/cpu.h>
49#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/module.h>
50 48
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 49struct rcu_synchronize {
52static struct lock_class_key rcu_lock_key; 50 struct rcu_head head;
53struct lockdep_map rcu_lock_map = 51 struct completion completion;
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Definition for rcupdate control block. */
60static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300,
62 .completed = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE,
65};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300,
68 .completed = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE,
71}; 52};
72 53
73DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
74DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
75
76/* Fake initialization required by compiler */
77static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
78static int blimit = 10;
79static int qhimark = 10000;
80static int qlowmark = 100;
81
82static atomic_t rcu_barrier_cpu_count; 55static atomic_t rcu_barrier_cpu_count;
83static DEFINE_MUTEX(rcu_barrier_mutex); 56static DEFINE_MUTEX(rcu_barrier_mutex);
84static struct completion rcu_barrier_completion; 57static struct completion rcu_barrier_completion;
85 58
86#ifdef CONFIG_SMP 59/* Because of FASTCALL declaration of complete, we use this wrapper */
87static void force_quiescent_state(struct rcu_data *rdp, 60static void wakeme_after_rcu(struct rcu_head *head)
88 struct rcu_ctrlblk *rcp)
89{
90 int cpu;
91 cpumask_t cpumask;
92 set_need_resched();
93 if (unlikely(!rcp->signaled)) {
94 rcp->signaled = 1;
95 /*
96 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu.
98 */
99 cpumask = rcp->cpumask;
100 cpu_clear(rdp->cpu, cpumask);
101 for_each_cpu_mask(cpu, cpumask)
102 smp_send_reschedule(cpu);
103 }
104}
105#else
106static inline void force_quiescent_state(struct rcu_data *rdp,
107 struct rcu_ctrlblk *rcp)
108{ 61{
109 set_need_resched(); 62 struct rcu_synchronize *rcu;
63
64 rcu = container_of(head, struct rcu_synchronize, head);
65 complete(&rcu->completion);
110} 66}
111#endif
112 67
113/** 68/**
114 * call_rcu - Queue an RCU callback for invocation after a grace period. 69 * synchronize_rcu - wait until a grace period has elapsed.
115 * @head: structure to be used for queueing the RCU updates.
116 * @func: actual update function to be invoked after the grace period
117 * 70 *
118 * The update function will be invoked some time after a full grace 71 * Control will return to the caller some time after a full grace
119 * period elapses, in other words after all currently executing RCU 72 * period has elapsed, in other words after all currently executing RCU
120 * read-side critical sections have completed. RCU read-side critical 73 * read-side critical sections have completed. RCU read-side critical
121 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 74 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
122 * and may be nested. 75 * and may be nested.
123 */ 76 */
124void fastcall call_rcu(struct rcu_head *head, 77void synchronize_rcu(void)
125 void (*func)(struct rcu_head *rcu))
126{
127 unsigned long flags;
128 struct rcu_data *rdp;
129
130 head->func = func;
131 head->next = NULL;
132 local_irq_save(flags);
133 rdp = &__get_cpu_var(rcu_data);
134 *rdp->nxttail = head;
135 rdp->nxttail = &head->next;
136 if (unlikely(++rdp->qlen > qhimark)) {
137 rdp->blimit = INT_MAX;
138 force_quiescent_state(rdp, &rcu_ctrlblk);
139 }
140 local_irq_restore(flags);
141}
142
143/**
144 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
145 * @head: structure to be used for queueing the RCU updates.
146 * @func: actual update function to be invoked after the grace period
147 *
148 * The update function will be invoked some time after a full grace
149 * period elapses, in other words after all currently executing RCU
150 * read-side critical sections have completed. call_rcu_bh() assumes
151 * that the read-side critical sections end on completion of a softirq
152 * handler. This means that read-side critical sections in process
153 * context must not be interrupted by softirqs. This interface is to be
154 * used when most of the read-side critical sections are in softirq context.
155 * RCU read-side critical sections are delimited by rcu_read_lock() and
156 * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
157 * and rcu_read_unlock_bh(), if in process context. These may be nested.
158 */
159void fastcall call_rcu_bh(struct rcu_head *head,
160 void (*func)(struct rcu_head *rcu))
161{ 78{
162 unsigned long flags; 79 struct rcu_synchronize rcu;
163 struct rcu_data *rdp;
164
165 head->func = func;
166 head->next = NULL;
167 local_irq_save(flags);
168 rdp = &__get_cpu_var(rcu_bh_data);
169 *rdp->nxttail = head;
170 rdp->nxttail = &head->next;
171
172 if (unlikely(++rdp->qlen > qhimark)) {
173 rdp->blimit = INT_MAX;
174 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
175 }
176
177 local_irq_restore(flags);
178}
179 80
180/* 81 init_completion(&rcu.completion);
181 * Return the number of RCU batches processed thus far. Useful 82 /* Will wake me after RCU finished */
182 * for debug and statistics. 83 call_rcu(&rcu.head, wakeme_after_rcu);
183 */
184long rcu_batches_completed(void)
185{
186 return rcu_ctrlblk.completed;
187}
188 84
189/* 85 /* Wait for it */
190 * Return the number of RCU batches processed thus far. Useful 86 wait_for_completion(&rcu.completion);
191 * for debug and statistics.
192 */
193long rcu_batches_completed_bh(void)
194{
195 return rcu_bh_ctrlblk.completed;
196} 87}
88EXPORT_SYMBOL_GPL(synchronize_rcu);
197 89
198static void rcu_barrier_callback(struct rcu_head *notused) 90static void rcu_barrier_callback(struct rcu_head *notused)
199{ 91{
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused)
207static void rcu_barrier_func(void *notused) 99static void rcu_barrier_func(void *notused)
208{ 100{
209 int cpu = smp_processor_id(); 101 int cpu = smp_processor_id();
210 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 102 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
211 struct rcu_head *head;
212 103
213 head = &rdp->barrier;
214 atomic_inc(&rcu_barrier_cpu_count); 104 atomic_inc(&rcu_barrier_cpu_count);
215 call_rcu(head, rcu_barrier_callback); 105 call_rcu(head, rcu_barrier_callback);
216} 106}
@@ -225,420 +115,24 @@ void rcu_barrier(void)
225 mutex_lock(&rcu_barrier_mutex); 115 mutex_lock(&rcu_barrier_mutex);
226 init_completion(&rcu_barrier_completion); 116 init_completion(&rcu_barrier_completion);
227 atomic_set(&rcu_barrier_cpu_count, 0); 117 atomic_set(&rcu_barrier_cpu_count, 0);
118 /*
119 * The queueing of callbacks in all CPUs must be atomic with
120 * respect to RCU, otherwise one CPU may queue a callback,
121 * wait for a grace period, decrement barrier count and call
122 * complete(), while other CPUs have not yet queued anything.
123 * So, we need to make sure that grace periods cannot complete
124 * until all the callbacks are queued.
125 */
126 rcu_read_lock();
228 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 127 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
128 rcu_read_unlock();
229 wait_for_completion(&rcu_barrier_completion); 129 wait_for_completion(&rcu_barrier_completion);
230 mutex_unlock(&rcu_barrier_mutex); 130 mutex_unlock(&rcu_barrier_mutex);
231} 131}
232EXPORT_SYMBOL_GPL(rcu_barrier); 132EXPORT_SYMBOL_GPL(rcu_barrier);
233 133
234/*
235 * Invoke the completed RCU callbacks. They are expected to be in
236 * a per-cpu list.
237 */
238static void rcu_do_batch(struct rcu_data *rdp)
239{
240 struct rcu_head *next, *list;
241 int count = 0;
242
243 list = rdp->donelist;
244 while (list) {
245 next = list->next;
246 prefetch(next);
247 list->func(list);
248 list = next;
249 if (++count >= rdp->blimit)
250 break;
251 }
252 rdp->donelist = list;
253
254 local_irq_disable();
255 rdp->qlen -= count;
256 local_irq_enable();
257 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
258 rdp->blimit = blimit;
259
260 if (!rdp->donelist)
261 rdp->donetail = &rdp->donelist;
262 else
263 tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
264}
265
266/*
267 * Grace period handling:
268 * The grace period handling consists out of two steps:
269 * - A new grace period is started.
270 * This is done by rcu_start_batch. The start is not broadcasted to
271 * all cpus, they must pick this up by comparing rcp->cur with
272 * rdp->quiescbatch. All cpus are recorded in the
273 * rcu_ctrlblk.cpumask bitmap.
274 * - All cpus must go through a quiescent state.
275 * Since the start of the grace period is not broadcasted, at least two
276 * calls to rcu_check_quiescent_state are required:
277 * The first call just notices that a new grace period is running. The
278 * following calls check if there was a quiescent state since the beginning
279 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
280 * the bitmap is empty, then the grace period is completed.
281 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
282 * period (if necessary).
283 */
284/*
285 * Register a new batch of callbacks, and start it up if there is currently no
286 * active batch and the batch to be registered has not already occurred.
287 * Caller must hold rcu_ctrlblk.lock.
288 */
289static void rcu_start_batch(struct rcu_ctrlblk *rcp)
290{
291 if (rcp->next_pending &&
292 rcp->completed == rcp->cur) {
293 rcp->next_pending = 0;
294 /*
295 * next_pending == 0 must be visible in
296 * __rcu_process_callbacks() before it can see new value of cur.
297 */
298 smp_wmb();
299 rcp->cur++;
300
301 /*
302 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
303 * Barrier Otherwise it can cause tickless idle CPUs to be
304 * included in rcp->cpumask, which will extend graceperiods
305 * unnecessarily.
306 */
307 smp_mb();
308 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
309
310 rcp->signaled = 0;
311 }
312}
313
314/*
315 * cpu went through a quiescent state since the beginning of the grace period.
316 * Clear it from the cpu mask and complete the grace period if it was the last
317 * cpu. Start another grace period if someone has further entries pending
318 */
319static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
320{
321 cpu_clear(cpu, rcp->cpumask);
322 if (cpus_empty(rcp->cpumask)) {
323 /* batch completed ! */
324 rcp->completed = rcp->cur;
325 rcu_start_batch(rcp);
326 }
327}
328
329/*
330 * Check if the cpu has gone through a quiescent state (say context
331 * switch). If so and if it already hasn't done so in this RCU
332 * quiescent cycle, then indicate that it has done so.
333 */
334static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
335 struct rcu_data *rdp)
336{
337 if (rdp->quiescbatch != rcp->cur) {
338 /* start new grace period: */
339 rdp->qs_pending = 1;
340 rdp->passed_quiesc = 0;
341 rdp->quiescbatch = rcp->cur;
342 return;
343 }
344
345 /* Grace period already completed for this cpu?
346 * qs_pending is checked instead of the actual bitmap to avoid
347 * cacheline trashing.
348 */
349 if (!rdp->qs_pending)
350 return;
351
352 /*
353 * Was there a quiescent state since the beginning of the grace
354 * period? If no, then exit and wait for the next call.
355 */
356 if (!rdp->passed_quiesc)
357 return;
358 rdp->qs_pending = 0;
359
360 spin_lock(&rcp->lock);
361 /*
362 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
363 * during cpu startup. Ignore the quiescent state.
364 */
365 if (likely(rdp->quiescbatch == rcp->cur))
366 cpu_quiet(rdp->cpu, rcp);
367
368 spin_unlock(&rcp->lock);
369}
370
371
372#ifdef CONFIG_HOTPLUG_CPU
373
374/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
375 * locking requirements, the list it's pulling from has to belong to a cpu
376 * which is dead and hence not processing interrupts.
377 */
378static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
379 struct rcu_head **tail)
380{
381 local_irq_disable();
382 *this_rdp->nxttail = list;
383 if (list)
384 this_rdp->nxttail = tail;
385 local_irq_enable();
386}
387
388static void __rcu_offline_cpu(struct rcu_data *this_rdp,
389 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
390{
391 /* if the cpu going offline owns the grace period
392 * we can block indefinitely waiting for it, so flush
393 * it here
394 */
395 spin_lock_bh(&rcp->lock);
396 if (rcp->cur != rcp->completed)
397 cpu_quiet(rdp->cpu, rcp);
398 spin_unlock_bh(&rcp->lock);
399 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
400 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
401 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
402}
403
404static void rcu_offline_cpu(int cpu)
405{
406 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
407 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
408
409 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
410 &per_cpu(rcu_data, cpu));
411 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
412 &per_cpu(rcu_bh_data, cpu));
413 put_cpu_var(rcu_data);
414 put_cpu_var(rcu_bh_data);
415 tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
416}
417
418#else
419
420static void rcu_offline_cpu(int cpu)
421{
422}
423
424#endif
425
426/*
427 * This does the RCU processing work from tasklet context.
428 */
429static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
430 struct rcu_data *rdp)
431{
432 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
433 *rdp->donetail = rdp->curlist;
434 rdp->donetail = rdp->curtail;
435 rdp->curlist = NULL;
436 rdp->curtail = &rdp->curlist;
437 }
438
439 if (rdp->nxtlist && !rdp->curlist) {
440 local_irq_disable();
441 rdp->curlist = rdp->nxtlist;
442 rdp->curtail = rdp->nxttail;
443 rdp->nxtlist = NULL;
444 rdp->nxttail = &rdp->nxtlist;
445 local_irq_enable();
446
447 /*
448 * start the next batch of callbacks
449 */
450
451 /* determine batch number */
452 rdp->batch = rcp->cur + 1;
453 /* see the comment and corresponding wmb() in
454 * the rcu_start_batch()
455 */
456 smp_rmb();
457
458 if (!rcp->next_pending) {
459 /* and start it/schedule start if it's a new batch */
460 spin_lock(&rcp->lock);
461 rcp->next_pending = 1;
462 rcu_start_batch(rcp);
463 spin_unlock(&rcp->lock);
464 }
465 }
466
467 rcu_check_quiescent_state(rcp, rdp);
468 if (rdp->donelist)
469 rcu_do_batch(rdp);
470}
471
472static void rcu_process_callbacks(unsigned long unused)
473{
474 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
475 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
476}
477
478static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
479{
480 /* This cpu has pending rcu entries and the grace period
481 * for them has completed.
482 */
483 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
484 return 1;
485
486 /* This cpu has no pending entries, but there are new entries */
487 if (!rdp->curlist && rdp->nxtlist)
488 return 1;
489
490 /* This cpu has finished callbacks to invoke */
491 if (rdp->donelist)
492 return 1;
493
494 /* The rcu core waits for a quiescent state from the cpu */
495 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
496 return 1;
497
498 /* nothing to do */
499 return 0;
500}
501
502/*
503 * Check to see if there is any immediate RCU-related work to be done
504 * by the current CPU, returning 1 if so. This function is part of the
505 * RCU implementation; it is -not- an exported member of the RCU API.
506 */
507int rcu_pending(int cpu)
508{
509 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
510 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
511}
512
513/*
514 * Check to see if any future RCU-related work will need to be done
515 * by the current CPU, even if none need be done immediately, returning
516 * 1 if so. This function is part of the RCU implementation; it is -not-
517 * an exported member of the RCU API.
518 */
519int rcu_needs_cpu(int cpu)
520{
521 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
522 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
523
524 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
525}
526
527void rcu_check_callbacks(int cpu, int user)
528{
529 if (user ||
530 (idle_cpu(cpu) && !in_softirq() &&
531 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
532 rcu_qsctr_inc(cpu);
533 rcu_bh_qsctr_inc(cpu);
534 } else if (!in_softirq())
535 rcu_bh_qsctr_inc(cpu);
536 tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
537}
538
539static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
540 struct rcu_data *rdp)
541{
542 memset(rdp, 0, sizeof(*rdp));
543 rdp->curtail = &rdp->curlist;
544 rdp->nxttail = &rdp->nxtlist;
545 rdp->donetail = &rdp->donelist;
546 rdp->quiescbatch = rcp->completed;
547 rdp->qs_pending = 0;
548 rdp->cpu = cpu;
549 rdp->blimit = blimit;
550}
551
552static void __cpuinit rcu_online_cpu(int cpu)
553{
554 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
555 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
556
557 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
558 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
559 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
560}
561
562static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
563 unsigned long action, void *hcpu)
564{
565 long cpu = (long)hcpu;
566 switch (action) {
567 case CPU_UP_PREPARE:
568 case CPU_UP_PREPARE_FROZEN:
569 rcu_online_cpu(cpu);
570 break;
571 case CPU_DEAD:
572 case CPU_DEAD_FROZEN:
573 rcu_offline_cpu(cpu);
574 break;
575 default:
576 break;
577 }
578 return NOTIFY_OK;
579}
580
581static struct notifier_block __cpuinitdata rcu_nb = {
582 .notifier_call = rcu_cpu_notify,
583};
584
585/*
586 * Initializes rcu mechanism. Assumed to be called early.
587 * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
588 * Note that rcu_qsctr and friends are implicitly
589 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
590 */
591void __init rcu_init(void) 134void __init rcu_init(void)
592{ 135{
593 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 136 __rcu_init();
594 (void *)(long)smp_processor_id());
595 /* Register notifier for non-boot CPUs */
596 register_cpu_notifier(&rcu_nb);
597}
598
599struct rcu_synchronize {
600 struct rcu_head head;
601 struct completion completion;
602};
603
604/* Because of FASTCALL declaration of complete, we use this wrapper */
605static void wakeme_after_rcu(struct rcu_head *head)
606{
607 struct rcu_synchronize *rcu;
608
609 rcu = container_of(head, struct rcu_synchronize, head);
610 complete(&rcu->completion);
611} 137}
612 138
613/**
614 * synchronize_rcu - wait until a grace period has elapsed.
615 *
616 * Control will return to the caller some time after a full grace
617 * period has elapsed, in other words after all currently executing RCU
618 * read-side critical sections have completed. RCU read-side critical
619 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
620 * and may be nested.
621 *
622 * If your read-side code is not protected by rcu_read_lock(), do -not-
623 * use synchronize_rcu().
624 */
625void synchronize_rcu(void)
626{
627 struct rcu_synchronize rcu;
628
629 init_completion(&rcu.completion);
630 /* Will wake me after RCU finished */
631 call_rcu(&rcu.head, wakeme_after_rcu);
632
633 /* Wait for it */
634 wait_for_completion(&rcu.completion);
635}
636
637module_param(blimit, int, 0);
638module_param(qhimark, int, 0);
639module_param(qlowmark, int, 0);
640EXPORT_SYMBOL_GPL(rcu_batches_completed);
641EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
642EXPORT_SYMBOL_GPL(call_rcu);
643EXPORT_SYMBOL_GPL(call_rcu_bh);
644EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
new file mode 100644
index 000000000000..987cfb7ade89
--- /dev/null
+++ b/kernel/rcupreempt.c
@@ -0,0 +1,953 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * Papers: http://www.rdrop.com/users/paulmck/RCU
27 *
28 * Design Document: http://lwn.net/Articles/253651/
29 *
30 * For detailed explanation of Read-Copy Update mechanism see -
31 * Documentation/RCU/ *.txt
32 *
33 */
34#include <linux/types.h>
35#include <linux/kernel.h>
36#include <linux/init.h>
37#include <linux/spinlock.h>
38#include <linux/smp.h>
39#include <linux/rcupdate.h>
40#include <linux/interrupt.h>
41#include <linux/sched.h>
42#include <asm/atomic.h>
43#include <linux/bitops.h>
44#include <linux/module.h>
45#include <linux/completion.h>
46#include <linux/moduleparam.h>
47#include <linux/percpu.h>
48#include <linux/notifier.h>
49#include <linux/rcupdate.h>
50#include <linux/cpu.h>
51#include <linux/random.h>
52#include <linux/delay.h>
53#include <linux/byteorder/swabb.h>
54#include <linux/cpumask.h>
55#include <linux/rcupreempt_trace.h>
56
57/*
58 * Macro that prevents the compiler from reordering accesses, but does
59 * absolutely -nothing- to prevent CPUs from reordering. This is used
60 * only to mediate communication between mainline code and hardware
61 * interrupt and NMI handlers.
62 */
63#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
64
65/*
66 * PREEMPT_RCU data structures.
67 */
68
69/*
70 * GP_STAGES specifies the number of times the state machine has
71 * to go through all the rcu_try_flip_states (see below)
72 * in a single Grace Period.
73 *
74 * GP in GP_STAGES stands for Grace Period ;)
75 */
76#define GP_STAGES 2
77struct rcu_data {
78 spinlock_t lock; /* Protect rcu_data fields. */
79 long completed; /* Number of last completed batch. */
80 int waitlistcount;
81 struct tasklet_struct rcu_tasklet;
82 struct rcu_head *nextlist;
83 struct rcu_head **nexttail;
84 struct rcu_head *waitlist[GP_STAGES];
85 struct rcu_head **waittail[GP_STAGES];
86 struct rcu_head *donelist;
87 struct rcu_head **donetail;
88 long rcu_flipctr[2];
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130struct rcu_ctrlblk {
131 spinlock_t fliplock; /* Protect state-machine transitions. */
132 long completed; /* Number of last completed batch. */
133 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
134 the rcu state machine */
135};
136
137static DEFINE_PER_CPU(struct rcu_data, rcu_data);
138static struct rcu_ctrlblk rcu_ctrlblk = {
139 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
140 .completed = 0,
141 .rcu_try_flip_state = rcu_try_flip_idle_state,
142};
143
144
145#ifdef CONFIG_RCU_TRACE
146static char *rcu_try_flip_state_names[] =
147 { "idle", "waitack", "waitzero", "waitmb" };
148#endif /* #ifdef CONFIG_RCU_TRACE */
149
150static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
151
152/*
153 * Enum and per-CPU flag to determine when each CPU has seen
154 * the most recent counter flip.
155 */
156
157enum rcu_flip_flag_values {
158 rcu_flip_seen, /* Steady/initial state, last flip seen. */
159 /* Only GP detector can update. */
160 rcu_flipped /* Flip just completed, need confirmation. */
161 /* Only corresponding CPU can update. */
162};
163static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
164 = rcu_flip_seen;
165
166/*
167 * Enum and per-CPU flag to determine when each CPU has executed the
168 * needed memory barrier to fence in memory references from its last RCU
169 * read-side critical section in the just-completed grace period.
170 */
171
172enum rcu_mb_flag_values {
173 rcu_mb_done, /* Steady/initial state, no mb()s required. */
174 /* Only GP detector can update. */
175 rcu_mb_needed /* Flip just completed, need an mb(). */
176 /* Only corresponding CPU can update. */
177};
178static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
179 = rcu_mb_done;
180
181/*
182 * RCU_DATA_ME: find the current CPU's rcu_data structure.
183 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
184 */
185#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
186#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
187
/*
 * Helper macro for tracing when the appropriate rcu_data is not
 * cached in a local variable, but where the CPU number is so cached.
 *
 * Like the other two helpers below, this expands without a trailing
 * semicolon: the call site supplies it, so the macro behaves as a
 * single statement and is safe in an unbraced if/else.
 */
#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace))

/*
 * Helper macro for tracing when the appropriate rcu_data is not
 * cached in a local variable.
 */
#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace))

/*
 * Helper macro for tracing when the appropriate rcu_data is pointed
 * to by a local variable.
 */
#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace))
205
206/*
207 * Return the number of RCU batches processed thus far. Useful
208 * for debug and statistics.
209 */
210long rcu_batches_completed(void)
211{
212 return rcu_ctrlblk.completed;
213}
214EXPORT_SYMBOL_GPL(rcu_batches_completed);
215
216EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
217
218void __rcu_read_lock(void)
219{
220 int idx;
221 struct task_struct *t = current;
222 int nesting;
223
224 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
225 if (nesting != 0) {
226
227 /* An earlier rcu_read_lock() covers us, just count it. */
228
229 t->rcu_read_lock_nesting = nesting + 1;
230
231 } else {
232 unsigned long flags;
233
234 /*
235 * We disable interrupts for the following reasons:
236 * - If we get scheduling clock interrupt here, and we
237 * end up acking the counter flip, it's like a promise
238 * that we will never increment the old counter again.
239 * Thus we will break that promise if that
240 * scheduling clock interrupt happens between the time
241 * we pick the .completed field and the time that we
242 * increment our counter.
243 *
244 * - We don't want to be preempted out here.
245 *
246 * NMIs can still occur, of course, and might themselves
247 * contain rcu_read_lock().
248 */
249
250 local_irq_save(flags);
251
252 /*
253 * Outermost nesting of rcu_read_lock(), so increment
254 * the current counter for the current CPU. Use volatile
255 * casts to prevent the compiler from reordering.
256 */
257
258 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
259 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
260
261 /*
262 * Now that the per-CPU counter has been incremented, we
263 * are protected from races with rcu_read_lock() invoked
264 * from NMI handlers on this CPU. We can therefore safely
265 * increment the nesting counter, relieving further NMIs
266 * of the need to increment the per-CPU counter.
267 */
268
269 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
270
271 /*
272 * Now that we have prevented any NMIs from storing
273 * to the ->rcu_flipctr_idx, we can safely use it to
274 * remember which counter to decrement in the matching
275 * rcu_read_unlock().
276 */
277
278 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
279 local_irq_restore(flags);
280 }
281}
282EXPORT_SYMBOL_GPL(__rcu_read_lock);
283
284void __rcu_read_unlock(void)
285{
286 int idx;
287 struct task_struct *t = current;
288 int nesting;
289
290 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
291 if (nesting > 1) {
292
293 /*
294 * We are still protected by the enclosing rcu_read_lock(),
295 * so simply decrement the counter.
296 */
297
298 t->rcu_read_lock_nesting = nesting - 1;
299
300 } else {
301 unsigned long flags;
302
303 /*
304 * Disable local interrupts to prevent the grace-period
305 * detection state machine from seeing us half-done.
306 * NMIs can still occur, of course, and might themselves
307 * contain rcu_read_lock() and rcu_read_unlock().
308 */
309
310 local_irq_save(flags);
311
312 /*
313 * Outermost nesting of rcu_read_unlock(), so we must
314 * decrement the current counter for the current CPU.
315 * This must be done carefully, because NMIs can
316 * occur at any point in this code, and any rcu_read_lock()
317 * and rcu_read_unlock() pairs in the NMI handlers
318 * must interact non-destructively with this code.
319 * Lots of volatile casts, and -very- careful ordering.
320 *
321 * Changes to this code, including this one, must be
322 * inspected, validated, and tested extremely carefully!!!
323 */
324
325 /*
326 * First, pick up the index.
327 */
328
329 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
330
331 /*
332 * Now that we have fetched the counter index, it is
333 * safe to decrement the per-task RCU nesting counter.
334 * After this, any interrupts or NMIs will increment and
335 * decrement the per-CPU counters.
336 */
337 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
338
339 /*
340 * With the nesting count decremented, the per-CPU counter may follow.
341 * NMIs that occur after the store above will route their
342 * rcu_read_lock() calls through this "else" clause, and
343 * will thus start incrementing the per-CPU counter on
344 * their own. They will also clobber ->rcu_flipctr_idx,
345 * but that is OK, since we have already fetched it.
346 */
347
348 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
349 local_irq_restore(flags);
350 }
351}
352EXPORT_SYMBOL_GPL(__rcu_read_unlock);
353
354/*
355 * If a global counter flip has occurred since the last time that we
356 * advanced callbacks, advance them. Hardware interrupts must be
357 * disabled when calling this function.
358 */
359static void __rcu_advance_callbacks(struct rcu_data *rdp)
360{
361 int cpu;
362 int i;
363 int wlc = 0;
364
365 if (rdp->completed != rcu_ctrlblk.completed) {
366 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
367 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
368 rdp->donetail = rdp->waittail[GP_STAGES - 1];
369 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
370 }
371 for (i = GP_STAGES - 2; i >= 0; i--) {
372 if (rdp->waitlist[i] != NULL) {
373 rdp->waitlist[i + 1] = rdp->waitlist[i];
374 rdp->waittail[i + 1] = rdp->waittail[i];
375 wlc++;
376 } else {
377 rdp->waitlist[i + 1] = NULL;
378 rdp->waittail[i + 1] =
379 &rdp->waitlist[i + 1];
380 }
381 }
382 if (rdp->nextlist != NULL) {
383 rdp->waitlist[0] = rdp->nextlist;
384 rdp->waittail[0] = rdp->nexttail;
385 wlc++;
386 rdp->nextlist = NULL;
387 rdp->nexttail = &rdp->nextlist;
388 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
389 } else {
390 rdp->waitlist[0] = NULL;
391 rdp->waittail[0] = &rdp->waitlist[0];
392 }
393 rdp->waitlistcount = wlc;
394 rdp->completed = rcu_ctrlblk.completed;
395 }
396
397 /*
398 * Check to see if this CPU needs to report that it has seen
399 * the most recent counter flip, thereby declaring that all
400 * subsequent rcu_read_lock() invocations will respect this flip.
401 */
402
403 cpu = raw_smp_processor_id();
404 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
405 smp_mb(); /* Subsequent counter accesses must see new value */
406 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
407 smp_mb(); /* Subsequent RCU read-side critical sections */
408 /* seen -after- acknowledgement. */
409 }
410}
411
412/*
413 * Get here when RCU is idle. Decide whether we need to
414 * move out of idle state, and return non-zero if so.
415 * "Straightforward" approach for the moment, might later
416 * use callback-list lengths, grace-period duration, or
417 * some such to determine when to exit idle state.
418 * Might also need a pre-idle test that does not acquire
419 * the lock, but let's get the simple case working first...
420 */
421
422static int
423rcu_try_flip_idle(void)
424{
425 int cpu;
426
427 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
428 if (!rcu_pending(smp_processor_id())) {
429 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
430 return 0;
431 }
432
433 /*
434 * Do the flip.
435 */
436
437 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
438 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
439
440 /*
441 * Need a memory barrier so that other CPUs see the new
442 * counter value before they see the subsequent change of all
443 * the rcu_flip_flag instances to rcu_flipped.
444 */
445
446 smp_mb(); /* see above block comment. */
447
448 /* Now ask each CPU for acknowledgement of the flip. */
449
450 for_each_cpu_mask(cpu, rcu_cpu_online_map)
451 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
452
453 return 1;
454}
455
456/*
457 * Wait for CPUs to acknowledge the flip.
458 */
459
460static int
461rcu_try_flip_waitack(void)
462{
463 int cpu;
464
465 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
466 for_each_cpu_mask(cpu, rcu_cpu_online_map)
467 if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
468 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
469 return 0;
470 }
471
472 /*
473 * Make sure our checks above don't bleed into subsequent
474 * waiting for the sum of the counters to reach zero.
475 */
476
477 smp_mb(); /* see above block comment. */
478 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
479 return 1;
480}
481
482/*
483 * Wait for collective ``last'' counter to reach zero,
484 * then tell all CPUs to do an end-of-grace-period memory barrier.
485 */
486
487static int
488rcu_try_flip_waitzero(void)
489{
490 int cpu;
491 int lastidx = !(rcu_ctrlblk.completed & 0x1);
492 int sum = 0;
493
494 /* Check to see if the sum of the "last" counters is zero. */
495
496 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
497 for_each_cpu_mask(cpu, rcu_cpu_online_map)
498 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
499 if (sum != 0) {
500 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
501 return 0;
502 }
503
504 /*
505 * This ensures that the other CPUs see the call for
506 * memory barriers -after- the sum to zero has been
507 * detected here
508 */
509 smp_mb(); /* ^^^^^^^^^^^^ */
510
511 /* Call for a memory barrier from each CPU. */
512 for_each_cpu_mask(cpu, rcu_cpu_online_map)
513 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
514
515 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
516 return 1;
517}
518
519/*
520 * Wait for all CPUs to do their end-of-grace-period memory barrier.
521 * Return 1 once all CPUs have done so, 0 otherwise.
522 */
523
524static int
525rcu_try_flip_waitmb(void)
526{
527 int cpu;
528
529 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
530 for_each_cpu_mask(cpu, rcu_cpu_online_map)
531 if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
532 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
533 return 0;
534 }
535
536 smp_mb(); /* Ensure that the above checks precede any following flip. */
537 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
538 return 1;
539}
540
541/*
542 * Attempt a single flip of the counters. Remember, a single flip does
543 * -not- constitute a grace period. Instead, the interval between
544 * at least GP_STAGES consecutive flips is a grace period.
545 *
546 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
547 * on a large SMP, they might want to use a hierarchical organization of
548 * the per-CPU-counter pairs.
549 */
550static void rcu_try_flip(void)
551{
552 unsigned long flags;
553
554 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
555 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
556 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
557 return;
558 }
559
560 /*
561 * Take the next transition(s) through the RCU grace-period
562 * flip-counter state machine.
563 */
564
565 switch (rcu_ctrlblk.rcu_try_flip_state) {
566 case rcu_try_flip_idle_state:
567 if (rcu_try_flip_idle())
568 rcu_ctrlblk.rcu_try_flip_state =
569 rcu_try_flip_waitack_state;
570 break;
571 case rcu_try_flip_waitack_state:
572 if (rcu_try_flip_waitack())
573 rcu_ctrlblk.rcu_try_flip_state =
574 rcu_try_flip_waitzero_state;
575 break;
576 case rcu_try_flip_waitzero_state:
577 if (rcu_try_flip_waitzero())
578 rcu_ctrlblk.rcu_try_flip_state =
579 rcu_try_flip_waitmb_state;
580 break;
581 case rcu_try_flip_waitmb_state:
582 if (rcu_try_flip_waitmb())
583 rcu_ctrlblk.rcu_try_flip_state =
584 rcu_try_flip_idle_state;
585 }
586 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
587}
588
589/*
590 * Check to see if this CPU needs to do a memory barrier in order to
591 * ensure that any prior RCU read-side critical sections have committed
592 * their counter manipulations and critical-section memory references
593 * before declaring the grace period to be completed.
594 */
595static void rcu_check_mb(int cpu)
596{
597 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
598 smp_mb(); /* Ensure RCU read-side accesses are visible. */
599 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
600 }
601}
602
603void rcu_check_callbacks(int cpu, int user)
604{
605 unsigned long flags;
606 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
607
608 rcu_check_mb(cpu);
609 if (rcu_ctrlblk.completed == rdp->completed)
610 rcu_try_flip();
611 spin_lock_irqsave(&rdp->lock, flags);
612 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
613 __rcu_advance_callbacks(rdp);
614 if (rdp->donelist == NULL) {
615 spin_unlock_irqrestore(&rdp->lock, flags);
616 } else {
617 spin_unlock_irqrestore(&rdp->lock, flags);
618 raise_softirq(RCU_SOFTIRQ);
619 }
620}
621
622/*
623 * Needed by dynticks, to make sure all RCU processing has finished
624 * when we go idle:
625 */
626void rcu_advance_callbacks(int cpu, int user)
627{
628 unsigned long flags;
629 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
630
631 if (rcu_ctrlblk.completed == rdp->completed) {
632 rcu_try_flip();
633 if (rcu_ctrlblk.completed == rdp->completed)
634 return;
635 }
636 spin_lock_irqsave(&rdp->lock, flags);
637 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
638 __rcu_advance_callbacks(rdp);
639 spin_unlock_irqrestore(&rdp->lock, flags);
640}
641
642#ifdef CONFIG_HOTPLUG_CPU
643#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
644 *dsttail = srclist; \
645 if (srclist != NULL) { \
646 dsttail = srctail; \
647 srclist = NULL; \
648 srctail = &srclist;\
649 } \
650 } while (0)
651
652void rcu_offline_cpu(int cpu)
653{
654 int i;
655 struct rcu_head *list = NULL;
656 unsigned long flags;
657 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
658 struct rcu_head **tail = &list;
659
660 /*
661 * Remove all callbacks from the newly dead CPU, retaining order.
662 * Otherwise rcu_barrier() will fail
663 */
664
665 spin_lock_irqsave(&rdp->lock, flags);
666 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
667 for (i = GP_STAGES - 1; i >= 0; i--)
668 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
669 list, tail);
670 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
671 spin_unlock_irqrestore(&rdp->lock, flags);
672 rdp->waitlistcount = 0;
673
674 /* Disengage the newly dead CPU from the grace-period computation. */
675
676 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
677 rcu_check_mb(cpu);
678 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
679 smp_mb(); /* Subsequent counter accesses must see new value */
680 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
681 smp_mb(); /* Subsequent RCU read-side critical sections */
682 /* seen -after- acknowledgement. */
683 }
684
685 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
686 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
687
688 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
689 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
690
691 cpu_clear(cpu, rcu_cpu_online_map);
692
693 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
694
695 /*
696 * Place the removed callbacks on the current CPU's queue.
697 * Make them all start a new grace period: simple approach,
698 * in theory could starve a given set of callbacks, but
699 * you would need to be doing some serious CPU hotplugging
700 * to make this happen. If this becomes a problem, adding
701 * a synchronize_rcu() to the hotplug path would be a simple
702 * fix.
703 */
704
705 rdp = RCU_DATA_ME();
706 spin_lock_irqsave(&rdp->lock, flags);
707 *rdp->nexttail = list;
708 if (list)
709 rdp->nexttail = tail;
710 spin_unlock_irqrestore(&rdp->lock, flags);
711}
712
713void __devinit rcu_online_cpu(int cpu)
714{
715 unsigned long flags;
716
717 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
718 cpu_set(cpu, rcu_cpu_online_map);
719 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
720}
721
722#else /* #ifdef CONFIG_HOTPLUG_CPU */
723
724void rcu_offline_cpu(int cpu)
725{
726}
727
728void __devinit rcu_online_cpu(int cpu)
729{
730}
731
732#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
733
734static void rcu_process_callbacks(struct softirq_action *unused)
735{
736 unsigned long flags;
737 struct rcu_head *next, *list;
738 struct rcu_data *rdp = RCU_DATA_ME();
739
740 spin_lock_irqsave(&rdp->lock, flags);
741 list = rdp->donelist;
742 if (list == NULL) {
743 spin_unlock_irqrestore(&rdp->lock, flags);
744 return;
745 }
746 rdp->donelist = NULL;
747 rdp->donetail = &rdp->donelist;
748 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
749 spin_unlock_irqrestore(&rdp->lock, flags);
750 while (list) {
751 next = list->next;
752 list->func(list);
753 list = next;
754 RCU_TRACE_ME(rcupreempt_trace_invoke);
755 }
756}
757
758void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
759{
760 unsigned long flags;
761 struct rcu_data *rdp;
762
763 head->func = func;
764 head->next = NULL;
765 local_irq_save(flags);
766 rdp = RCU_DATA_ME();
767 spin_lock(&rdp->lock);
768 __rcu_advance_callbacks(rdp);
769 *rdp->nexttail = head;
770 rdp->nexttail = &head->next;
771 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
772 spin_unlock(&rdp->lock);
773 local_irq_restore(flags);
774}
775EXPORT_SYMBOL_GPL(call_rcu);
776
777/*
778 * Wait until all currently running preempt_disable() code segments
779 * (including hardware-irq-disable segments) complete. Note that
780 * in -rt this does -not- necessarily result in all currently executing
781 * interrupt -handlers- having completed.
782 */
783void __synchronize_sched(void)
784{
785 cpumask_t oldmask;
786 int cpu;
787
788 if (sched_getaffinity(0, &oldmask) < 0)
789 oldmask = cpu_possible_map;
790 for_each_online_cpu(cpu) {
791 sched_setaffinity(0, cpumask_of_cpu(cpu));
792 schedule();
793 }
794 sched_setaffinity(0, oldmask);
795}
796EXPORT_SYMBOL_GPL(__synchronize_sched);
797
798/*
799 * Check to see if any future RCU-related work will need to be done
800 * by the current CPU, even if none need be done immediately, returning
801 * 1 if so. Assumes that notifiers would take care of handling any
802 * outstanding requests from the RCU core.
803 *
804 * This function is part of the RCU implementation; it is -not-
805 * an exported member of the RCU API.
806 */
807int rcu_needs_cpu(int cpu)
808{
809 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
810
811 return (rdp->donelist != NULL ||
812 !!rdp->waitlistcount ||
813 rdp->nextlist != NULL);
814}
815
816int rcu_pending(int cpu)
817{
818 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
819
820 /* The CPU has at least one callback queued somewhere. */
821
822 if (rdp->donelist != NULL ||
823 !!rdp->waitlistcount ||
824 rdp->nextlist != NULL)
825 return 1;
826
827 /* The RCU core needs an acknowledgement from this CPU. */
828
829 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
830 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
831 return 1;
832
833 /* This CPU has fallen behind the global grace-period number. */
834
835 if (rdp->completed != rcu_ctrlblk.completed)
836 return 1;
837
838 /* Nothing needed from this CPU. */
839
840 return 0;
841}
842
843static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
844 unsigned long action, void *hcpu)
845{
846 long cpu = (long)hcpu;
847
848 switch (action) {
849 case CPU_UP_PREPARE:
850 case CPU_UP_PREPARE_FROZEN:
851 rcu_online_cpu(cpu);
852 break;
853 case CPU_UP_CANCELED:
854 case CPU_UP_CANCELED_FROZEN:
855 case CPU_DEAD:
856 case CPU_DEAD_FROZEN:
857 rcu_offline_cpu(cpu);
858 break;
859 default:
860 break;
861 }
862 return NOTIFY_OK;
863}
864
865static struct notifier_block __cpuinitdata rcu_nb = {
866 .notifier_call = rcu_cpu_notify,
867};
868
869void __init __rcu_init(void)
870{
871 int cpu;
872 int i;
873 struct rcu_data *rdp;
874
875 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
876 for_each_possible_cpu(cpu) {
877 rdp = RCU_DATA_CPU(cpu);
878 spin_lock_init(&rdp->lock);
879 rdp->completed = 0;
880 rdp->waitlistcount = 0;
881 rdp->nextlist = NULL;
882 rdp->nexttail = &rdp->nextlist;
883 for (i = 0; i < GP_STAGES; i++) {
884 rdp->waitlist[i] = NULL;
885 rdp->waittail[i] = &rdp->waitlist[i];
886 }
887 rdp->donelist = NULL;
888 rdp->donetail = &rdp->donelist;
889 rdp->rcu_flipctr[0] = 0;
890 rdp->rcu_flipctr[1] = 0;
891 }
892 register_cpu_notifier(&rcu_nb);
893
894 /*
895 * We don't need protection against CPU-Hotplug here
896 * since
897 * a) If a CPU comes online while we are iterating over the
898 * cpu_online_map below, we would only end up making a
899 * duplicate call to rcu_online_cpu() which sets the corresponding
900 * CPU's mask in the rcu_cpu_online_map.
901 *
902 * b) A CPU cannot go offline at this point in time since the user
903 * does not have access to the sysfs interface, nor do we
904 * suspend the system.
905 */
906 for_each_online_cpu(cpu)
907 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
908
909 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
910}
911
912/*
913 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
914 */
915void synchronize_kernel(void)
916{
917 synchronize_rcu();
918}
919
920#ifdef CONFIG_RCU_TRACE
921long *rcupreempt_flipctr(int cpu)
922{
923 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
924}
925EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
926
927int rcupreempt_flip_flag(int cpu)
928{
929 return per_cpu(rcu_flip_flag, cpu);
930}
931EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
932
933int rcupreempt_mb_flag(int cpu)
934{
935 return per_cpu(rcu_mb_flag, cpu);
936}
937EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
938
939char *rcupreempt_try_flip_state_name(void)
940{
941 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
942}
943EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
944
945struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
946{
947 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
948
949 return &rdp->trace;
950}
951EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
952
953#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
new file mode 100644
index 000000000000..49ac4947af24
--- /dev/null
+++ b/kernel/rcupreempt_trace.c
@@ -0,0 +1,330 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h>
43#include <linux/mutex.h>
44#include <linux/rcupreempt_trace.h>
45#include <linux/debugfs.h>
46
47static struct mutex rcupreempt_trace_mutex;
48static char *rcupreempt_trace_buf;
49#define RCUPREEMPT_TRACE_BUF_SIZE 4096
50
51void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
52{
53 trace->done_length += trace->wait_length;
54 trace->done_add += trace->wait_length;
55 trace->wait_length = 0;
56}
57void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
58{
59 trace->wait_length += trace->next_length;
60 trace->wait_add += trace->next_length;
61 trace->next_length = 0;
62}
63void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
64{
65 atomic_inc(&trace->rcu_try_flip_1);
66}
67void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
68{
69 atomic_inc(&trace->rcu_try_flip_e1);
70}
71void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
72{
73 trace->rcu_try_flip_i1++;
74}
75void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
76{
77 trace->rcu_try_flip_ie1++;
78}
79void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
80{
81 trace->rcu_try_flip_g1++;
82}
83void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
84{
85 trace->rcu_try_flip_a1++;
86}
87void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
88{
89 trace->rcu_try_flip_ae1++;
90}
91void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
92{
93 trace->rcu_try_flip_a2++;
94}
95void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
96{
97 trace->rcu_try_flip_z1++;
98}
99void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
100{
101 trace->rcu_try_flip_ze1++;
102}
103void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
104{
105 trace->rcu_try_flip_z2++;
106}
107void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
108{
109 trace->rcu_try_flip_m1++;
110}
111void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
112{
113 trace->rcu_try_flip_me1++;
114}
115void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
116{
117 trace->rcu_try_flip_m2++;
118}
119void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
120{
121 trace->rcu_check_callbacks++;
122}
123void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
124{
125 trace->done_remove += trace->done_length;
126 trace->done_length = 0;
127}
128void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
129{
130 atomic_inc(&trace->done_invoked);
131}
132void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
133{
134 trace->next_add++;
135 trace->next_length++;
136}
137
138static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
139{
140 struct rcupreempt_trace *cp;
141 int cpu;
142
143 memset(sp, 0, sizeof(*sp));
144 for_each_possible_cpu(cpu) {
145 cp = rcupreempt_trace_cpu(cpu);
146 sp->next_length += cp->next_length;
147 sp->next_add += cp->next_add;
148 sp->wait_length += cp->wait_length;
149 sp->wait_add += cp->wait_add;
150 sp->done_length += cp->done_length;
151 sp->done_add += cp->done_add;
152 sp->done_remove += cp->done_remove;
153 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
154 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
155 atomic_set(&sp->rcu_try_flip_1,
156 atomic_read(&cp->rcu_try_flip_1));
157 atomic_set(&sp->rcu_try_flip_e1,
158 atomic_read(&cp->rcu_try_flip_e1));
159 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
160 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
161 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
162 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
163 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
164 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
165 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
166 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
167 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
168 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
169 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
170 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
171 }
172}
173
174static ssize_t rcustats_read(struct file *filp, char __user *buffer,
175 size_t count, loff_t *ppos)
176{
177 struct rcupreempt_trace trace;
178 ssize_t bcount;
179 int cnt = 0;
180
181 rcupreempt_trace_sum(&trace);
182 mutex_lock(&rcupreempt_trace_mutex);
183 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
184 "ggp=%ld rcc=%ld\n",
185 rcu_batches_completed(),
186 trace.rcu_check_callbacks);
187 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
188 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
189 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
190 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
191
192 trace.next_add, trace.next_length,
193 trace.wait_add, trace.wait_length,
194 trace.done_add, trace.done_length,
195 trace.done_remove, atomic_read(&trace.done_invoked),
196 atomic_read(&trace.rcu_try_flip_1),
197 atomic_read(&trace.rcu_try_flip_e1),
198 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
199 trace.rcu_try_flip_g1,
200 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
201 trace.rcu_try_flip_a2,
202 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
203 trace.rcu_try_flip_z2,
204 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
205 trace.rcu_try_flip_m2);
206 bcount = simple_read_from_buffer(buffer, count, ppos,
207 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
208 mutex_unlock(&rcupreempt_trace_mutex);
209 return bcount;
210}
211
212static ssize_t rcugp_read(struct file *filp, char __user *buffer,
213 size_t count, loff_t *ppos)
214{
215 long oldgp = rcu_batches_completed();
216 ssize_t bcount;
217
218 mutex_lock(&rcupreempt_trace_mutex);
219 synchronize_rcu();
220 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
221 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
222 bcount = simple_read_from_buffer(buffer, count, ppos,
223 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
224 mutex_unlock(&rcupreempt_trace_mutex);
225 return bcount;
226}
227
228static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
229 size_t count, loff_t *ppos)
230{
231 int cnt = 0;
232 int cpu;
233 int f = rcu_batches_completed() & 0x1;
234 ssize_t bcount;
235
236 mutex_lock(&rcupreempt_trace_mutex);
237
238 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
239 "CPU last cur F M\n");
240 for_each_online_cpu(cpu) {
241 long *flipctr = rcupreempt_flipctr(cpu);
242 cnt += snprintf(&rcupreempt_trace_buf[cnt],
243 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
244 "%3d %4ld %3ld %d %d\n",
245 cpu,
246 flipctr[!f],
247 flipctr[f],
248 rcupreempt_flip_flag(cpu),
249 rcupreempt_mb_flag(cpu));
250 }
251 cnt += snprintf(&rcupreempt_trace_buf[cnt],
252 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
253 "ggp = %ld, state = %s\n",
254 rcu_batches_completed(),
255 rcupreempt_try_flip_state_name());
256 cnt += snprintf(&rcupreempt_trace_buf[cnt],
257 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
258 "\n");
259 bcount = simple_read_from_buffer(buffer, count, ppos,
260 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
261 mutex_unlock(&rcupreempt_trace_mutex);
262 return bcount;
263}
264
265static struct file_operations rcustats_fops = {
266 .owner = THIS_MODULE,
267 .read = rcustats_read,
268};
269
270static struct file_operations rcugp_fops = {
271 .owner = THIS_MODULE,
272 .read = rcugp_read,
273};
274
275static struct file_operations rcuctrs_fops = {
276 .owner = THIS_MODULE,
277 .read = rcuctrs_read,
278};
279
280static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
281static int rcupreempt_debugfs_init(void)
282{
283 rcudir = debugfs_create_dir("rcu", NULL);
284 if (!rcudir)
285 goto out;
286 statdir = debugfs_create_file("rcustats", 0444, rcudir,
287 NULL, &rcustats_fops);
288 if (!statdir)
289 goto free_out;
290
291 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
292 if (!gpdir)
293 goto free_out;
294
295 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
296 NULL, &rcuctrs_fops);
297 if (!ctrsdir)
298 goto free_out;
299 return 0;
300free_out:
301 if (statdir)
302 debugfs_remove(statdir);
303 if (gpdir)
304 debugfs_remove(gpdir);
305 debugfs_remove(rcudir);
306out:
307 return 1;
308}
309
310static int __init rcupreempt_trace_init(void)
311{
312 mutex_init(&rcupreempt_trace_mutex);
313 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
314 if (!rcupreempt_trace_buf)
315 return 1;
316 return rcupreempt_debugfs_init();
317}
318
319static void __exit rcupreempt_trace_cleanup(void)
320{
321 debugfs_remove(statdir);
322 debugfs_remove(gpdir);
323 debugfs_remove(ctrsdir);
324 debugfs_remove(rcudir);
325 kfree(rcupreempt_trace_buf);
326}
327
328
329module_init(rcupreempt_trace_init);
330module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c3e165c2318f..fd599829e72a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
726 cpumask_t tmp_mask = CPU_MASK_ALL; 726 cpumask_t tmp_mask = CPU_MASK_ALL;
727 int i; 727 int i;
728 728
729 lock_cpu_hotplug(); 729 get_online_cpus();
730 730
731 /* No point in shuffling if there is only one online CPU (ex: UP) */ 731 /* No point in shuffling if there is only one online CPU (ex: UP) */
732 if (num_online_cpus() == 1) { 732 if (num_online_cpus() == 1) {
733 unlock_cpu_hotplug(); 733 put_online_cpus();
734 return; 734 return;
735 } 735 }
736 736
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
762 else 762 else
763 rcu_idle_cpu--; 763 rcu_idle_cpu--;
764 764
765 unlock_cpu_hotplug(); 765 put_online_cpus();
766} 766}
767 767
768/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 768/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11ca6df3..524285e46fa7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -22,6 +22,8 @@
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
25 */ 27 */
26 28
27#include <linux/mm.h> 29#include <linux/mm.h>
@@ -63,6 +65,7 @@
63#include <linux/reciprocal_div.h> 65#include <linux/reciprocal_div.h>
64#include <linux/unistd.h> 66#include <linux/unistd.h>
65#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
66 69
67#include <asm/tlb.h> 70#include <asm/tlb.h>
68#include <asm/irq_regs.h> 71#include <asm/irq_regs.h>
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
96#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 99#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
97 100
98/* 101/*
99 * Some helpers for converting nanosecond timing to jiffy resolution 102 * Helpers for converting nanosecond timing to jiffy resolution
100 */ 103 */
101#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 104#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
102#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ))
103 105
104#define NICE_0_LOAD SCHED_LOAD_SCALE 106#define NICE_0_LOAD SCHED_LOAD_SCALE
105#define NICE_0_SHIFT SCHED_LOAD_SHIFT 107#define NICE_0_SHIFT SCHED_LOAD_SHIFT
@@ -159,6 +161,8 @@ struct rt_prio_array {
159 161
160struct cfs_rq; 162struct cfs_rq;
161 163
164static LIST_HEAD(task_groups);
165
162/* task group related information */ 166/* task group related information */
163struct task_group { 167struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -168,10 +172,50 @@ struct task_group {
168 struct sched_entity **se; 172 struct sched_entity **se;
169 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
170 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
175
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /*
182 * shares assigned to a task group governs how much of cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more is
184 * the cpu bandwidth allocated to it.
185 *
186 * For ex, lets say that there are three task groups, A, B and C which
187 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
188 * cpu bandwidth allocated by the scheduler to task groups A, B and C
189 * should be:
190 *
191 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
192 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
193 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
194 *
195 * The weight assigned to a task group's schedulable entities on every
196 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
197 * group's shares. For ex: lets say that task group A has been
198 * assigned shares of 1000 and there are two CPUs in a system. Then,
199 *
200 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
201 *
202 * Note: It's not necessary that each of a task group's schedulable
203 * entities have the same weight on all CPUs. If the group
204 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
205 * better distribution of weight could be:
206 *
207 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
208 * tg_A->se[1]->load.weight = 1/3 * 2000 = 667
209 *
210 * rebalance_shares() is responsible for distributing the shares of a
211 * task groups like this among the group's schedulable entities across
212 * cpus.
213 *
214 */
171 unsigned long shares; 215 unsigned long shares;
172 /* spinlock to serialize modification to shares */ 216
173 spinlock_t lock;
174 struct rcu_head rcu; 217 struct rcu_head rcu;
218 struct list_head list;
175}; 219};
176 220
177/* Default task group's sched entity on each cpu */ 221/* Default task group's sched entity on each cpu */
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
179/* Default task group's cfs_rq on each cpu */ 223/* Default task group's cfs_rq on each cpu */
180static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
181 225
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
182static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229static struct sched_entity *init_sched_entity_p[NR_CPUS];
183static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
184 231
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS];
234
235/* task_group_mutex serializes add/remove of task groups and also changes to
236 * a task group's cpu shares.
237 */
238static DEFINE_MUTEX(task_group_mutex);
239
240/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex);
242
243#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task;
246static int load_balance_monitor(void *unused);
247#endif
248
249static void set_se_shares(struct sched_entity *se, unsigned long shares);
250
185/* Default task group. 251/* Default task group.
186 * Every task in system belong to this group at bootup. 252 * Every task in system belong to this group at bootup.
187 */ 253 */
188struct task_group init_task_group = { 254struct task_group init_task_group = {
189 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
190 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257
258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p,
191}; 260};
192 261
193#ifdef CONFIG_FAIR_USER_SCHED 262#ifdef CONFIG_FAIR_USER_SCHED
194# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD 263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
195#else 264#else
196# define INIT_TASK_GRP_LOAD NICE_0_LOAD 265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
197#endif 266#endif
198 267
199static int init_task_group_load = INIT_TASK_GRP_LOAD; 268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
200 271
201/* return group to which a task belongs */ 272/* return group to which a task belongs */
202static inline struct task_group *task_group(struct task_struct *p) 273static inline struct task_group *task_group(struct task_struct *p)
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p)
215} 286}
216 287
217/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
218static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) 289static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
219{ 290{
220 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
221 p->se.parent = task_group(p)->se[cpu]; 292 p->se.parent = task_group(p)->se[cpu];
293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu];
296}
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306}
307
308static inline void lock_doms_cur(void)
309{
310 mutex_lock(&doms_cur_mutex);
311}
312
313static inline void unlock_doms_cur(void)
314{
315 mutex_unlock(&doms_cur_mutex);
222} 316}
223 317
224#else 318#else
225 319
226static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } 320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { }
227 325
228#endif /* CONFIG_FAIR_GROUP_SCHED */ 326#endif /* CONFIG_FAIR_GROUP_SCHED */
229 327
@@ -264,10 +362,56 @@ struct cfs_rq {
264/* Real-Time classes' related field in a runqueue: */ 362/* Real-Time classes' related field in a runqueue: */
265struct rt_rq { 363struct rt_rq {
266 struct rt_prio_array active; 364 struct rt_prio_array active;
267 int rt_load_balance_idx; 365 unsigned long rt_nr_running;
268 struct list_head *rt_load_balance_head, *rt_load_balance_curr; 366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */
368#endif
369#ifdef CONFIG_SMP
370 unsigned long rt_nr_migratory;
371 int overloaded;
372#endif
373 int rt_throttled;
374 u64 rt_time;
375
376#ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq;
378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg;
380 struct sched_rt_entity *rt_se;
381#endif
269}; 382};
270 383
384#ifdef CONFIG_SMP
385
386/*
387 * We add the notion of a root-domain which will be used to define per-domain
388 * variables. Each exclusive cpuset essentially defines an island domain by
389 * fully partitioning the member cpus from any other cpuset. Whenever a new
390 * exclusive cpuset is created, we also create and attach a new root-domain
391 * object.
392 *
393 */
394struct root_domain {
395 atomic_t refcount;
396 cpumask_t span;
397 cpumask_t online;
398
399 /*
400 * The "RT overload" flag: it gets set if a CPU has more than
401 * one runnable RT task.
402 */
403 cpumask_t rto_mask;
404 atomic_t rto_count;
405};
406
407/*
408 * By default the system creates a single root-domain with all cpus as
409 * members (mimicking the global state we have today).
410 */
411static struct root_domain def_root_domain;
412
413#endif
414
271/* 415/*
272 * This is the main, per-CPU runqueue data structure. 416 * This is the main, per-CPU runqueue data structure.
273 * 417 *
@@ -296,11 +440,15 @@ struct rq {
296 u64 nr_switches; 440 u64 nr_switches;
297 441
298 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt;
444 u64 rt_period_expire;
445 int rt_throttled;
446
299#ifdef CONFIG_FAIR_GROUP_SCHED 447#ifdef CONFIG_FAIR_GROUP_SCHED
300 /* list of leaf cfs_rq on this cpu: */ 448 /* list of leaf cfs_rq on this cpu: */
301 struct list_head leaf_cfs_rq_list; 449 struct list_head leaf_cfs_rq_list;
450 struct list_head leaf_rt_rq_list;
302#endif 451#endif
303 struct rt_rq rt;
304 452
305 /* 453 /*
306 * This is part of a global counter where only the total sum 454 * This is part of a global counter where only the total sum
@@ -317,7 +465,7 @@ struct rq {
317 u64 clock, prev_clock_raw; 465 u64 clock, prev_clock_raw;
318 s64 clock_max_delta; 466 s64 clock_max_delta;
319 467
320 unsigned int clock_warps, clock_overflows; 468 unsigned int clock_warps, clock_overflows, clock_underflows;
321 u64 idle_clock; 469 u64 idle_clock;
322 unsigned int clock_deep_idle_events; 470 unsigned int clock_deep_idle_events;
323 u64 tick_timestamp; 471 u64 tick_timestamp;
@@ -325,6 +473,7 @@ struct rq {
325 atomic_t nr_iowait; 473 atomic_t nr_iowait;
326 474
327#ifdef CONFIG_SMP 475#ifdef CONFIG_SMP
476 struct root_domain *rd;
328 struct sched_domain *sd; 477 struct sched_domain *sd;
329 478
330 /* For active balancing */ 479 /* For active balancing */
@@ -337,6 +486,12 @@ struct rq {
337 struct list_head migration_queue; 486 struct list_head migration_queue;
338#endif 487#endif
339 488
489#ifdef CONFIG_SCHED_HRTICK
490 unsigned long hrtick_flags;
491 ktime_t hrtick_expire;
492 struct hrtimer hrtick_timer;
493#endif
494
340#ifdef CONFIG_SCHEDSTATS 495#ifdef CONFIG_SCHEDSTATS
341 /* latency stats */ 496 /* latency stats */
342 struct sched_info rq_sched_info; 497 struct sched_info rq_sched_info;
@@ -363,7 +518,6 @@ struct rq {
363}; 518};
364 519
365static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 520static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
366static DEFINE_MUTEX(sched_hotcpu_mutex);
367 521
368static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 522static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
369{ 523{
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq)
441#define task_rq(p) cpu_rq(task_cpu(p)) 595#define task_rq(p) cpu_rq(task_cpu(p))
442#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 596#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
443 597
598unsigned long rt_needs_cpu(int cpu)
599{
600 struct rq *rq = cpu_rq(cpu);
601 u64 delta;
602
603 if (!rq->rt_throttled)
604 return 0;
605
606 if (rq->clock > rq->rt_period_expire)
607 return 1;
608
609 delta = rq->rt_period_expire - rq->clock;
610 do_div(delta, NSEC_PER_SEC / HZ);
611
612 return (unsigned long)delta;
613}
614
444/* 615/*
445 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 616 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
446 */ 617 */
@@ -459,6 +630,8 @@ enum {
459 SCHED_FEAT_START_DEBIT = 4, 630 SCHED_FEAT_START_DEBIT = 4,
460 SCHED_FEAT_TREE_AVG = 8, 631 SCHED_FEAT_TREE_AVG = 8,
461 SCHED_FEAT_APPROX_AVG = 16, 632 SCHED_FEAT_APPROX_AVG = 16,
633 SCHED_FEAT_HRTICK = 32,
634 SCHED_FEAT_DOUBLE_TICK = 64,
462}; 635};
463 636
464const_debug unsigned int sysctl_sched_features = 637const_debug unsigned int sysctl_sched_features =
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features =
466 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 639 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
467 SCHED_FEAT_START_DEBIT * 1 | 640 SCHED_FEAT_START_DEBIT * 1 |
468 SCHED_FEAT_TREE_AVG * 0 | 641 SCHED_FEAT_TREE_AVG * 0 |
469 SCHED_FEAT_APPROX_AVG * 0; 642 SCHED_FEAT_APPROX_AVG * 0 |
643 SCHED_FEAT_HRTICK * 1 |
644 SCHED_FEAT_DOUBLE_TICK * 0;
470 645
471#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 646#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
472 647
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features =
477const_debug unsigned int sysctl_sched_nr_migrate = 32; 652const_debug unsigned int sysctl_sched_nr_migrate = 32;
478 653
479/* 654/*
655 * period over which we measure -rt task cpu usage in ms.
656 * default: 1s
657 */
658const_debug unsigned int sysctl_sched_rt_period = 1000;
659
660#define SCHED_RT_FRAC_SHIFT 16
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
662
663/*
664 * ratio of time -rt tasks may consume.
665 * default: 95%
666 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259;
668
669/*
480 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
481 * clock constructed from sched_clock(): 671 * clock constructed from sched_clock():
482 */ 672 */
@@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
668 struct rq *rq = cpu_rq(smp_processor_id()); 858 struct rq *rq = cpu_rq(smp_processor_id());
669 u64 now = sched_clock(); 859 u64 now = sched_clock();
670 860
671 touch_softlockup_watchdog();
672 rq->idle_clock += delta_ns; 861 rq->idle_clock += delta_ns;
673 /* 862 /*
674 * Override the previous timestamp and ignore all 863 * Override the previous timestamp and ignore all
@@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
680 rq->prev_clock_raw = now; 869 rq->prev_clock_raw = now;
681 rq->clock += delta_ns; 870 rq->clock += delta_ns;
682 spin_unlock(&rq->lock); 871 spin_unlock(&rq->lock);
872 touch_softlockup_watchdog();
683} 873}
684EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 874EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
685 875
876static void __resched_task(struct task_struct *p, int tif_bit);
877
878static inline void resched_task(struct task_struct *p)
879{
880 __resched_task(p, TIF_NEED_RESCHED);
881}
882
883#ifdef CONFIG_SCHED_HRTICK
884/*
885 * Use HR-timers to deliver accurate preemption points.
886 *
887 * It's all a bit involved since we cannot program an hrt while holding the
888 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
889 * reschedule event.
890 *
891 * When we get rescheduled we reprogram the hrtick_timer outside of the
892 * rq->lock.
893 */
894static inline void resched_hrt(struct task_struct *p)
895{
896 __resched_task(p, TIF_HRTICK_RESCHED);
897}
898
899static inline void resched_rq(struct rq *rq)
900{
901 unsigned long flags;
902
903 spin_lock_irqsave(&rq->lock, flags);
904 resched_task(rq->curr);
905 spin_unlock_irqrestore(&rq->lock, flags);
906}
907
908enum {
909 HRTICK_SET, /* re-programm hrtick_timer */
910 HRTICK_RESET, /* not a new slice */
911};
912
913/*
914 * Use hrtick when:
915 * - enabled by features
916 * - hrtimer is actually high res
917 */
918static inline int hrtick_enabled(struct rq *rq)
919{
920 if (!sched_feat(HRTICK))
921 return 0;
922 return hrtimer_is_hres_active(&rq->hrtick_timer);
923}
924
925/*
926 * Called to set the hrtick timer state.
927 *
928 * called with rq->lock held and irqs disabled
929 */
930static void hrtick_start(struct rq *rq, u64 delay, int reset)
931{
932 assert_spin_locked(&rq->lock);
933
934 /*
935 * preempt at: now + delay
936 */
937 rq->hrtick_expire =
938 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
939 /*
940 * indicate we need to program the timer
941 */
942 __set_bit(HRTICK_SET, &rq->hrtick_flags);
943 if (reset)
944 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
945
946 /*
947 * New slices are called from the schedule path and don't need a
948 * forced reschedule.
949 */
950 if (reset)
951 resched_hrt(rq->curr);
952}
953
954static void hrtick_clear(struct rq *rq)
955{
956 if (hrtimer_active(&rq->hrtick_timer))
957 hrtimer_cancel(&rq->hrtick_timer);
958}
959
960/*
961 * Update the timer from the possible pending state.
962 */
963static void hrtick_set(struct rq *rq)
964{
965 ktime_t time;
966 int set, reset;
967 unsigned long flags;
968
969 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
970
971 spin_lock_irqsave(&rq->lock, flags);
972 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
973 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
974 time = rq->hrtick_expire;
975 clear_thread_flag(TIF_HRTICK_RESCHED);
976 spin_unlock_irqrestore(&rq->lock, flags);
977
978 if (set) {
979 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
980 if (reset && !hrtimer_active(&rq->hrtick_timer))
981 resched_rq(rq);
982 } else
983 hrtick_clear(rq);
984}
985
986/*
987 * High-resolution timer tick.
988 * Runs from hardirq context with interrupts disabled.
989 */
990static enum hrtimer_restart hrtick(struct hrtimer *timer)
991{
992 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
993
994 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
995
996 spin_lock(&rq->lock);
997 __update_rq_clock(rq);
998 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
999 spin_unlock(&rq->lock);
1000
1001 return HRTIMER_NORESTART;
1002}
1003
1004static inline void init_rq_hrtick(struct rq *rq)
1005{
1006 rq->hrtick_flags = 0;
1007 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1008 rq->hrtick_timer.function = hrtick;
1009 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1010}
1011
1012void hrtick_resched(void)
1013{
1014 struct rq *rq;
1015 unsigned long flags;
1016
1017 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1018 return;
1019
1020 local_irq_save(flags);
1021 rq = cpu_rq(smp_processor_id());
1022 hrtick_set(rq);
1023 local_irq_restore(flags);
1024}
1025#else
1026static inline void hrtick_clear(struct rq *rq)
1027{
1028}
1029
1030static inline void hrtick_set(struct rq *rq)
1031{
1032}
1033
1034static inline void init_rq_hrtick(struct rq *rq)
1035{
1036}
1037
1038void hrtick_resched(void)
1039{
1040}
1041#endif
1042
686/* 1043/*
687 * resched_task - mark a task 'to be rescheduled now'. 1044 * resched_task - mark a task 'to be rescheduled now'.
688 * 1045 *
@@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
696#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1053#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
697#endif 1054#endif
698 1055
699static void resched_task(struct task_struct *p) 1056static void __resched_task(struct task_struct *p, int tif_bit)
700{ 1057{
701 int cpu; 1058 int cpu;
702 1059
703 assert_spin_locked(&task_rq(p)->lock); 1060 assert_spin_locked(&task_rq(p)->lock);
704 1061
705 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1062 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
706 return; 1063 return;
707 1064
708 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1065 set_tsk_thread_flag(p, tif_bit);
709 1066
710 cpu = task_cpu(p); 1067 cpu = task_cpu(p);
711 if (cpu == smp_processor_id()) 1068 if (cpu == smp_processor_id())
@@ -728,10 +1085,10 @@ static void resched_cpu(int cpu)
728 spin_unlock_irqrestore(&rq->lock, flags); 1085 spin_unlock_irqrestore(&rq->lock, flags);
729} 1086}
730#else 1087#else
731static inline void resched_task(struct task_struct *p) 1088static void __resched_task(struct task_struct *p, int tif_bit)
732{ 1089{
733 assert_spin_locked(&task_rq(p)->lock); 1090 assert_spin_locked(&task_rq(p)->lock);
734 set_tsk_need_resched(p); 1091 set_tsk_thread_flag(p, tif_bit);
735} 1092}
736#endif 1093#endif
737 1094
@@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
871static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1228static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
872#endif 1229#endif
873 1230
1231static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1232{
1233 update_load_add(&rq->load, load);
1234}
1235
1236static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1237{
1238 update_load_sub(&rq->load, load);
1239}
1240
1241#ifdef CONFIG_SMP
1242static unsigned long source_load(int cpu, int type);
1243static unsigned long target_load(int cpu, int type);
1244static unsigned long cpu_avg_load_per_task(int cpu);
1245static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1246#endif /* CONFIG_SMP */
1247
874#include "sched_stats.h" 1248#include "sched_stats.h"
875#include "sched_idletask.c" 1249#include "sched_idletask.c"
876#include "sched_fair.c" 1250#include "sched_fair.c"
@@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
881 1255
882#define sched_class_highest (&rt_sched_class) 1256#define sched_class_highest (&rt_sched_class)
883 1257
884/*
885 * Update delta_exec, delta_fair fields for rq.
886 *
887 * delta_fair clock advances at a rate inversely proportional to
888 * total load (rq->load.weight) on the runqueue, while
889 * delta_exec advances at the same rate as wall-clock (provided
890 * cpu is not idle).
891 *
892 * delta_exec / delta_fair is a measure of the (smoothened) load on this
893 * runqueue over any given interval. This (smoothened) load is used
894 * during load balance.
895 *
896 * This function is called /before/ updating rq->load
897 * and when switching tasks.
898 */
899static inline void inc_load(struct rq *rq, const struct task_struct *p)
900{
901 update_load_add(&rq->load, p->se.load.weight);
902}
903
904static inline void dec_load(struct rq *rq, const struct task_struct *p)
905{
906 update_load_sub(&rq->load, p->se.load.weight);
907}
908
909static void inc_nr_running(struct task_struct *p, struct rq *rq) 1258static void inc_nr_running(struct task_struct *p, struct rq *rq)
910{ 1259{
911 rq->nr_running++; 1260 rq->nr_running++;
912 inc_load(rq, p);
913} 1261}
914 1262
915static void dec_nr_running(struct task_struct *p, struct rq *rq) 1263static void dec_nr_running(struct task_struct *p, struct rq *rq)
916{ 1264{
917 rq->nr_running--; 1265 rq->nr_running--;
918 dec_load(rq, p);
919} 1266}
920 1267
921static void set_load_weight(struct task_struct *p) 1268static void set_load_weight(struct task_struct *p)
@@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu)
1039 1386
1040static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1387static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1041{ 1388{
1042 set_task_cfs_rq(p, cpu); 1389 set_task_rq(p, cpu);
1043#ifdef CONFIG_SMP 1390#ifdef CONFIG_SMP
1044 /* 1391 /*
1045 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1392 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1051#endif 1398#endif
1052} 1399}
1053 1400
1401static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1402 const struct sched_class *prev_class,
1403 int oldprio, int running)
1404{
1405 if (prev_class != p->sched_class) {
1406 if (prev_class->switched_from)
1407 prev_class->switched_from(rq, p, running);
1408 p->sched_class->switched_to(rq, p, running);
1409 } else
1410 p->sched_class->prio_changed(rq, p, oldprio, running);
1411}
1412
1054#ifdef CONFIG_SMP 1413#ifdef CONFIG_SMP
1055 1414
1056/* 1415/*
1057 * Is this task likely cache-hot: 1416 * Is this task likely cache-hot:
1058 */ 1417 */
1059static inline int 1418static int
1060task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1419task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1061{ 1420{
1062 s64 delta; 1421 s64 delta;
@@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type)
1281/* 1640/*
1282 * Return the average load per task on the cpu's run queue 1641 * Return the average load per task on the cpu's run queue
1283 */ 1642 */
1284static inline unsigned long cpu_avg_load_per_task(int cpu) 1643static unsigned long cpu_avg_load_per_task(int cpu)
1285{ 1644{
1286 struct rq *rq = cpu_rq(cpu); 1645 struct rq *rq = cpu_rq(cpu);
1287 unsigned long total = weighted_cpuload(cpu); 1646 unsigned long total = weighted_cpuload(cpu);
@@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag)
1438 1797
1439#endif /* CONFIG_SMP */ 1798#endif /* CONFIG_SMP */
1440 1799
1441/*
1442 * wake_idle() will wake a task on an idle cpu if task->cpu is
1443 * not idle and an idle cpu is available. The span of cpus to
1444 * search starts with cpus closest then further out as needed,
1445 * so we always favor a closer, idle cpu.
1446 *
1447 * Returns the CPU we should wake onto.
1448 */
1449#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1450static int wake_idle(int cpu, struct task_struct *p)
1451{
1452 cpumask_t tmp;
1453 struct sched_domain *sd;
1454 int i;
1455
1456 /*
1457 * If it is idle, then it is the best cpu to run this task.
1458 *
1459 * This cpu is also the best, if it has more than one task already.
1460 * Siblings must be also busy(in most cases) as they didn't already
1461 * pickup the extra load from this cpu and hence we need not check
1462 * sibling runqueue info. This will avoid the checks and cache miss
1463 * penalities associated with that.
1464 */
1465 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1466 return cpu;
1467
1468 for_each_domain(cpu, sd) {
1469 if (sd->flags & SD_WAKE_IDLE) {
1470 cpus_and(tmp, sd->span, p->cpus_allowed);
1471 for_each_cpu_mask(i, tmp) {
1472 if (idle_cpu(i)) {
1473 if (i != task_cpu(p)) {
1474 schedstat_inc(p,
1475 se.nr_wakeups_idle);
1476 }
1477 return i;
1478 }
1479 }
1480 } else {
1481 break;
1482 }
1483 }
1484 return cpu;
1485}
1486#else
1487static inline int wake_idle(int cpu, struct task_struct *p)
1488{
1489 return cpu;
1490}
1491#endif
1492
1493/*** 1800/***
1494 * try_to_wake_up - wake up a thread 1801 * try_to_wake_up - wake up a thread
1495 * @p: the to-be-woken-up thread 1802 * @p: the to-be-woken-up thread
@@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1510 unsigned long flags; 1817 unsigned long flags;
1511 long old_state; 1818 long old_state;
1512 struct rq *rq; 1819 struct rq *rq;
1513#ifdef CONFIG_SMP
1514 struct sched_domain *sd, *this_sd = NULL;
1515 unsigned long load, this_load;
1516 int new_cpu;
1517#endif
1518 1820
1519 rq = task_rq_lock(p, &flags); 1821 rq = task_rq_lock(p, &flags);
1520 old_state = p->state; 1822 old_state = p->state;
@@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1532 if (unlikely(task_running(rq, p))) 1834 if (unlikely(task_running(rq, p)))
1533 goto out_activate; 1835 goto out_activate;
1534 1836
1535 new_cpu = cpu; 1837 cpu = p->sched_class->select_task_rq(p, sync);
1536 1838 if (cpu != orig_cpu) {
1537 schedstat_inc(rq, ttwu_count); 1839 set_task_cpu(p, cpu);
1538 if (cpu == this_cpu) {
1539 schedstat_inc(rq, ttwu_local);
1540 goto out_set_cpu;
1541 }
1542
1543 for_each_domain(this_cpu, sd) {
1544 if (cpu_isset(cpu, sd->span)) {
1545 schedstat_inc(sd, ttwu_wake_remote);
1546 this_sd = sd;
1547 break;
1548 }
1549 }
1550
1551 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1552 goto out_set_cpu;
1553
1554 /*
1555 * Check for affine wakeup and passive balancing possibilities.
1556 */
1557 if (this_sd) {
1558 int idx = this_sd->wake_idx;
1559 unsigned int imbalance;
1560
1561 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1562
1563 load = source_load(cpu, idx);
1564 this_load = target_load(this_cpu, idx);
1565
1566 new_cpu = this_cpu; /* Wake to this CPU if we can */
1567
1568 if (this_sd->flags & SD_WAKE_AFFINE) {
1569 unsigned long tl = this_load;
1570 unsigned long tl_per_task;
1571
1572 /*
1573 * Attract cache-cold tasks on sync wakeups:
1574 */
1575 if (sync && !task_hot(p, rq->clock, this_sd))
1576 goto out_set_cpu;
1577
1578 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1579 tl_per_task = cpu_avg_load_per_task(this_cpu);
1580
1581 /*
1582 * If sync wakeup then subtract the (maximum possible)
1583 * effect of the currently running task from the load
1584 * of the current CPU:
1585 */
1586 if (sync)
1587 tl -= current->se.load.weight;
1588
1589 if ((tl <= load &&
1590 tl + target_load(cpu, idx) <= tl_per_task) ||
1591 100*(tl + p->se.load.weight) <= imbalance*load) {
1592 /*
1593 * This domain has SD_WAKE_AFFINE and
1594 * p is cache cold in this domain, and
1595 * there is no bad imbalance.
1596 */
1597 schedstat_inc(this_sd, ttwu_move_affine);
1598 schedstat_inc(p, se.nr_wakeups_affine);
1599 goto out_set_cpu;
1600 }
1601 }
1602
1603 /*
1604 * Start passive balancing when half the imbalance_pct
1605 * limit is reached.
1606 */
1607 if (this_sd->flags & SD_WAKE_BALANCE) {
1608 if (imbalance*this_load <= 100*load) {
1609 schedstat_inc(this_sd, ttwu_move_balance);
1610 schedstat_inc(p, se.nr_wakeups_passive);
1611 goto out_set_cpu;
1612 }
1613 }
1614 }
1615
1616 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1617out_set_cpu:
1618 new_cpu = wake_idle(new_cpu, p);
1619 if (new_cpu != cpu) {
1620 set_task_cpu(p, new_cpu);
1621 task_rq_unlock(rq, &flags); 1840 task_rq_unlock(rq, &flags);
1622 /* might preempt at this point */ 1841 /* might preempt at this point */
1623 rq = task_rq_lock(p, &flags); 1842 rq = task_rq_lock(p, &flags);
@@ -1631,6 +1850,21 @@ out_set_cpu:
1631 cpu = task_cpu(p); 1850 cpu = task_cpu(p);
1632 } 1851 }
1633 1852
1853#ifdef CONFIG_SCHEDSTATS
1854 schedstat_inc(rq, ttwu_count);
1855 if (cpu == this_cpu)
1856 schedstat_inc(rq, ttwu_local);
1857 else {
1858 struct sched_domain *sd;
1859 for_each_domain(this_cpu, sd) {
1860 if (cpu_isset(cpu, sd->span)) {
1861 schedstat_inc(sd, ttwu_wake_remote);
1862 break;
1863 }
1864 }
1865 }
1866#endif
1867
1634out_activate: 1868out_activate:
1635#endif /* CONFIG_SMP */ 1869#endif /* CONFIG_SMP */
1636 schedstat_inc(p, se.nr_wakeups); 1870 schedstat_inc(p, se.nr_wakeups);
@@ -1649,6 +1883,10 @@ out_activate:
1649 1883
1650out_running: 1884out_running:
1651 p->state = TASK_RUNNING; 1885 p->state = TASK_RUNNING;
1886#ifdef CONFIG_SMP
1887 if (p->sched_class->task_wake_up)
1888 p->sched_class->task_wake_up(rq, p);
1889#endif
1652out: 1890out:
1653 task_rq_unlock(rq, &flags); 1891 task_rq_unlock(rq, &flags);
1654 1892
@@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p)
1691 p->se.wait_max = 0; 1929 p->se.wait_max = 0;
1692#endif 1930#endif
1693 1931
1694 INIT_LIST_HEAD(&p->run_list); 1932 INIT_LIST_HEAD(&p->rt.run_list);
1695 p->se.on_rq = 0; 1933 p->se.on_rq = 0;
1696 1934
1697#ifdef CONFIG_PREEMPT_NOTIFIERS 1935#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1771 inc_nr_running(p, rq); 2009 inc_nr_running(p, rq);
1772 } 2010 }
1773 check_preempt_curr(rq, p); 2011 check_preempt_curr(rq, p);
2012#ifdef CONFIG_SMP
2013 if (p->sched_class->task_wake_up)
2014 p->sched_class->task_wake_up(rq, p);
2015#endif
1774 task_rq_unlock(rq, &flags); 2016 task_rq_unlock(rq, &flags);
1775} 2017}
1776 2018
@@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1891 prev_state = prev->state; 2133 prev_state = prev->state;
1892 finish_arch_switch(prev); 2134 finish_arch_switch(prev);
1893 finish_lock_switch(rq, prev); 2135 finish_lock_switch(rq, prev);
2136#ifdef CONFIG_SMP
2137 if (current->sched_class->post_schedule)
2138 current->sched_class->post_schedule(rq);
2139#endif
2140
1894 fire_sched_in_preempt_notifiers(current); 2141 fire_sched_in_preempt_notifiers(current);
1895 if (mm) 2142 if (mm)
1896 mmdrop(mm); 2143 mmdrop(mm);
@@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2124/* 2371/*
2125 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2372 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2126 */ 2373 */
2127static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2374static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2128 __releases(this_rq->lock) 2375 __releases(this_rq->lock)
2129 __acquires(busiest->lock) 2376 __acquires(busiest->lock)
2130 __acquires(this_rq->lock) 2377 __acquires(this_rq->lock)
2131{ 2378{
2379 int ret = 0;
2380
2132 if (unlikely(!irqs_disabled())) { 2381 if (unlikely(!irqs_disabled())) {
2133 /* printk() doesn't work good under rq->lock */ 2382 /* printk() doesn't work good under rq->lock */
2134 spin_unlock(&this_rq->lock); 2383 spin_unlock(&this_rq->lock);
@@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2139 spin_unlock(&this_rq->lock); 2388 spin_unlock(&this_rq->lock);
2140 spin_lock(&busiest->lock); 2389 spin_lock(&busiest->lock);
2141 spin_lock(&this_rq->lock); 2390 spin_lock(&this_rq->lock);
2391 ret = 1;
2142 } else 2392 } else
2143 spin_lock(&busiest->lock); 2393 spin_lock(&busiest->lock);
2144 } 2394 }
2395 return ret;
2145} 2396}
2146 2397
2147/* 2398/*
@@ -3485,12 +3736,14 @@ void scheduler_tick(void)
3485 /* 3736 /*
3486 * Let rq->clock advance by at least TICK_NSEC: 3737 * Let rq->clock advance by at least TICK_NSEC:
3487 */ 3738 */
3488 if (unlikely(rq->clock < next_tick)) 3739 if (unlikely(rq->clock < next_tick)) {
3489 rq->clock = next_tick; 3740 rq->clock = next_tick;
3741 rq->clock_underflows++;
3742 }
3490 rq->tick_timestamp = rq->clock; 3743 rq->tick_timestamp = rq->clock;
3491 update_cpu_load(rq); 3744 update_cpu_load(rq);
3492 if (curr != rq->idle) /* FIXME: needed? */ 3745 curr->sched_class->task_tick(rq, curr, 0);
3493 curr->sched_class->task_tick(rq, curr); 3746 update_sched_rt_period(rq);
3494 spin_unlock(&rq->lock); 3747 spin_unlock(&rq->lock);
3495 3748
3496#ifdef CONFIG_SMP 3749#ifdef CONFIG_SMP
@@ -3636,6 +3889,8 @@ need_resched_nonpreemptible:
3636 3889
3637 schedule_debug(prev); 3890 schedule_debug(prev);
3638 3891
3892 hrtick_clear(rq);
3893
3639 /* 3894 /*
3640 * Do the rq-clock update outside the rq lock: 3895 * Do the rq-clock update outside the rq lock:
3641 */ 3896 */
@@ -3654,6 +3909,11 @@ need_resched_nonpreemptible:
3654 switch_count = &prev->nvcsw; 3909 switch_count = &prev->nvcsw;
3655 } 3910 }
3656 3911
3912#ifdef CONFIG_SMP
3913 if (prev->sched_class->pre_schedule)
3914 prev->sched_class->pre_schedule(rq, prev);
3915#endif
3916
3657 if (unlikely(!rq->nr_running)) 3917 if (unlikely(!rq->nr_running))
3658 idle_balance(cpu, rq); 3918 idle_balance(cpu, rq);
3659 3919
@@ -3668,14 +3928,20 @@ need_resched_nonpreemptible:
3668 ++*switch_count; 3928 ++*switch_count;
3669 3929
3670 context_switch(rq, prev, next); /* unlocks the rq */ 3930 context_switch(rq, prev, next); /* unlocks the rq */
3931 /*
3932 * the context switch might have flipped the stack from under
3933 * us, hence refresh the local variables.
3934 */
3935 cpu = smp_processor_id();
3936 rq = cpu_rq(cpu);
3671 } else 3937 } else
3672 spin_unlock_irq(&rq->lock); 3938 spin_unlock_irq(&rq->lock);
3673 3939
3674 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3940 hrtick_set(rq);
3675 cpu = smp_processor_id(); 3941
3676 rq = cpu_rq(cpu); 3942 if (unlikely(reacquire_kernel_lock(current) < 0))
3677 goto need_resched_nonpreemptible; 3943 goto need_resched_nonpreemptible;
3678 } 3944
3679 preempt_enable_no_resched(); 3945 preempt_enable_no_resched();
3680 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3946 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3681 goto need_resched; 3947 goto need_resched;
@@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule);
3691asmlinkage void __sched preempt_schedule(void) 3957asmlinkage void __sched preempt_schedule(void)
3692{ 3958{
3693 struct thread_info *ti = current_thread_info(); 3959 struct thread_info *ti = current_thread_info();
3694#ifdef CONFIG_PREEMPT_BKL
3695 struct task_struct *task = current; 3960 struct task_struct *task = current;
3696 int saved_lock_depth; 3961 int saved_lock_depth;
3697#endif 3962
3698 /* 3963 /*
3699 * If there is a non-zero preempt_count or interrupts are disabled, 3964 * If there is a non-zero preempt_count or interrupts are disabled,
3700 * we do not want to preempt the current task. Just return.. 3965 * we do not want to preempt the current task. Just return..
@@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void)
3710 * clear ->lock_depth so that schedule() doesnt 3975 * clear ->lock_depth so that schedule() doesnt
3711 * auto-release the semaphore: 3976 * auto-release the semaphore:
3712 */ 3977 */
3713#ifdef CONFIG_PREEMPT_BKL
3714 saved_lock_depth = task->lock_depth; 3978 saved_lock_depth = task->lock_depth;
3715 task->lock_depth = -1; 3979 task->lock_depth = -1;
3716#endif
3717 schedule(); 3980 schedule();
3718#ifdef CONFIG_PREEMPT_BKL
3719 task->lock_depth = saved_lock_depth; 3981 task->lock_depth = saved_lock_depth;
3720#endif
3721 sub_preempt_count(PREEMPT_ACTIVE); 3982 sub_preempt_count(PREEMPT_ACTIVE);
3722 3983
3723 /* 3984 /*
@@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule);
3738asmlinkage void __sched preempt_schedule_irq(void) 3999asmlinkage void __sched preempt_schedule_irq(void)
3739{ 4000{
3740 struct thread_info *ti = current_thread_info(); 4001 struct thread_info *ti = current_thread_info();
3741#ifdef CONFIG_PREEMPT_BKL
3742 struct task_struct *task = current; 4002 struct task_struct *task = current;
3743 int saved_lock_depth; 4003 int saved_lock_depth;
3744#endif 4004
3745 /* Catch callers which need to be fixed */ 4005 /* Catch callers which need to be fixed */
3746 BUG_ON(ti->preempt_count || !irqs_disabled()); 4006 BUG_ON(ti->preempt_count || !irqs_disabled());
3747 4007
@@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void)
3753 * clear ->lock_depth so that schedule() doesnt 4013 * clear ->lock_depth so that schedule() doesnt
3754 * auto-release the semaphore: 4014 * auto-release the semaphore:
3755 */ 4015 */
3756#ifdef CONFIG_PREEMPT_BKL
3757 saved_lock_depth = task->lock_depth; 4016 saved_lock_depth = task->lock_depth;
3758 task->lock_depth = -1; 4017 task->lock_depth = -1;
3759#endif
3760 local_irq_enable(); 4018 local_irq_enable();
3761 schedule(); 4019 schedule();
3762 local_irq_disable(); 4020 local_irq_disable();
3763#ifdef CONFIG_PREEMPT_BKL
3764 task->lock_depth = saved_lock_depth; 4021 task->lock_depth = saved_lock_depth;
3765#endif
3766 sub_preempt_count(PREEMPT_ACTIVE); 4022 sub_preempt_count(PREEMPT_ACTIVE);
3767 4023
3768 /* 4024 /*
@@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4019 unsigned long flags; 4275 unsigned long flags;
4020 int oldprio, on_rq, running; 4276 int oldprio, on_rq, running;
4021 struct rq *rq; 4277 struct rq *rq;
4278 const struct sched_class *prev_class = p->sched_class;
4022 4279
4023 BUG_ON(prio < 0 || prio > MAX_PRIO); 4280 BUG_ON(prio < 0 || prio > MAX_PRIO);
4024 4281
@@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4044 if (on_rq) { 4301 if (on_rq) {
4045 if (running) 4302 if (running)
4046 p->sched_class->set_curr_task(rq); 4303 p->sched_class->set_curr_task(rq);
4304
4047 enqueue_task(rq, p, 0); 4305 enqueue_task(rq, p, 0);
4048 /* 4306
4049 * Reschedule if we are currently running on this runqueue and 4307 check_class_changed(rq, p, prev_class, oldprio, running);
4050 * our priority decreased, or if we are not currently running on
4051 * this runqueue and our priority is higher than the current's
4052 */
4053 if (running) {
4054 if (p->prio > oldprio)
4055 resched_task(rq->curr);
4056 } else {
4057 check_preempt_curr(rq, p);
4058 }
4059 } 4308 }
4060 task_rq_unlock(rq, &flags); 4309 task_rq_unlock(rq, &flags);
4061} 4310}
@@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice)
4087 goto out_unlock; 4336 goto out_unlock;
4088 } 4337 }
4089 on_rq = p->se.on_rq; 4338 on_rq = p->se.on_rq;
4090 if (on_rq) { 4339 if (on_rq)
4091 dequeue_task(rq, p, 0); 4340 dequeue_task(rq, p, 0);
4092 dec_load(rq, p);
4093 }
4094 4341
4095 p->static_prio = NICE_TO_PRIO(nice); 4342 p->static_prio = NICE_TO_PRIO(nice);
4096 set_load_weight(p); 4343 set_load_weight(p);
@@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice)
4100 4347
4101 if (on_rq) { 4348 if (on_rq) {
4102 enqueue_task(rq, p, 0); 4349 enqueue_task(rq, p, 0);
4103 inc_load(rq, p);
4104 /* 4350 /*
4105 * If the task increased its priority or is running and 4351 * If the task increased its priority or is running and
4106 * lowered its priority, then reschedule its CPU: 4352 * lowered its priority, then reschedule its CPU:
@@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy,
4258{ 4504{
4259 int retval, oldprio, oldpolicy = -1, on_rq, running; 4505 int retval, oldprio, oldpolicy = -1, on_rq, running;
4260 unsigned long flags; 4506 unsigned long flags;
4507 const struct sched_class *prev_class = p->sched_class;
4261 struct rq *rq; 4508 struct rq *rq;
4262 4509
4263 /* may grab non-irq protected spin_locks */ 4510 /* may grab non-irq protected spin_locks */
@@ -4351,18 +4598,10 @@ recheck:
4351 if (on_rq) { 4598 if (on_rq) {
4352 if (running) 4599 if (running)
4353 p->sched_class->set_curr_task(rq); 4600 p->sched_class->set_curr_task(rq);
4601
4354 activate_task(rq, p, 0); 4602 activate_task(rq, p, 0);
4355 /* 4603
4356 * Reschedule if we are currently running on this runqueue and 4604 check_class_changed(rq, p, prev_class, oldprio, running);
4357 * our priority decreased, or if we are not currently running on
4358 * this runqueue and our priority is higher than the current's
4359 */
4360 if (running) {
4361 if (p->prio > oldprio)
4362 resched_task(rq->curr);
4363 } else {
4364 check_preempt_curr(rq, p);
4365 }
4366 } 4605 }
4367 __task_rq_unlock(rq); 4606 __task_rq_unlock(rq);
4368 spin_unlock_irqrestore(&p->pi_lock, flags); 4607 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4490 struct task_struct *p; 4729 struct task_struct *p;
4491 int retval; 4730 int retval;
4492 4731
4493 mutex_lock(&sched_hotcpu_mutex); 4732 get_online_cpus();
4494 read_lock(&tasklist_lock); 4733 read_lock(&tasklist_lock);
4495 4734
4496 p = find_process_by_pid(pid); 4735 p = find_process_by_pid(pid);
4497 if (!p) { 4736 if (!p) {
4498 read_unlock(&tasklist_lock); 4737 read_unlock(&tasklist_lock);
4499 mutex_unlock(&sched_hotcpu_mutex); 4738 put_online_cpus();
4500 return -ESRCH; 4739 return -ESRCH;
4501 } 4740 }
4502 4741
@@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4536 } 4775 }
4537out_unlock: 4776out_unlock:
4538 put_task_struct(p); 4777 put_task_struct(p);
4539 mutex_unlock(&sched_hotcpu_mutex); 4778 put_online_cpus();
4540 return retval; 4779 return retval;
4541} 4780}
4542 4781
@@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4593 struct task_struct *p; 4832 struct task_struct *p;
4594 int retval; 4833 int retval;
4595 4834
4596 mutex_lock(&sched_hotcpu_mutex); 4835 get_online_cpus();
4597 read_lock(&tasklist_lock); 4836 read_lock(&tasklist_lock);
4598 4837
4599 retval = -ESRCH; 4838 retval = -ESRCH;
@@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4609 4848
4610out_unlock: 4849out_unlock:
4611 read_unlock(&tasklist_lock); 4850 read_unlock(&tasklist_lock);
4612 mutex_unlock(&sched_hotcpu_mutex); 4851 put_online_cpus();
4613 4852
4614 return retval; 4853 return retval;
4615} 4854}
@@ -4683,7 +4922,8 @@ static void __cond_resched(void)
4683 } while (need_resched()); 4922 } while (need_resched());
4684} 4923}
4685 4924
4686int __sched cond_resched(void) 4925#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
4926int __sched _cond_resched(void)
4687{ 4927{
4688 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4928 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4689 system_state == SYSTEM_RUNNING) { 4929 system_state == SYSTEM_RUNNING) {
@@ -4692,7 +4932,8 @@ int __sched cond_resched(void)
4692 } 4932 }
4693 return 0; 4933 return 0;
4694} 4934}
4695EXPORT_SYMBOL(cond_resched); 4935EXPORT_SYMBOL(_cond_resched);
4936#endif
4696 4937
4697/* 4938/*
4698 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4939 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4890,7 +5131,7 @@ out_unlock:
4890 5131
4891static const char stat_nam[] = "RSDTtZX"; 5132static const char stat_nam[] = "RSDTtZX";
4892 5133
4893static void show_task(struct task_struct *p) 5134void sched_show_task(struct task_struct *p)
4894{ 5135{
4895 unsigned long free = 0; 5136 unsigned long free = 0;
4896 unsigned state; 5137 unsigned state;
@@ -4920,8 +5161,7 @@ static void show_task(struct task_struct *p)
4920 printk(KERN_CONT "%5lu %5d %6d\n", free, 5161 printk(KERN_CONT "%5lu %5d %6d\n", free,
4921 task_pid_nr(p), task_pid_nr(p->real_parent)); 5162 task_pid_nr(p), task_pid_nr(p->real_parent));
4922 5163
4923 if (state != TASK_RUNNING) 5164 show_stack(p, NULL);
4924 show_stack(p, NULL);
4925} 5165}
4926 5166
4927void show_state_filter(unsigned long state_filter) 5167void show_state_filter(unsigned long state_filter)
@@ -4943,7 +5183,7 @@ void show_state_filter(unsigned long state_filter)
4943 */ 5183 */
4944 touch_nmi_watchdog(); 5184 touch_nmi_watchdog();
4945 if (!state_filter || (p->state & state_filter)) 5185 if (!state_filter || (p->state & state_filter))
4946 show_task(p); 5186 sched_show_task(p);
4947 } while_each_thread(g, p); 5187 } while_each_thread(g, p);
4948 5188
4949 touch_all_softlockup_watchdogs(); 5189 touch_all_softlockup_watchdogs();
@@ -4992,11 +5232,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4992 spin_unlock_irqrestore(&rq->lock, flags); 5232 spin_unlock_irqrestore(&rq->lock, flags);
4993 5233
4994 /* Set the preempt count _outside_ the spinlocks! */ 5234 /* Set the preempt count _outside_ the spinlocks! */
4995#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4996 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4997#else
4998 task_thread_info(idle)->preempt_count = 0; 5235 task_thread_info(idle)->preempt_count = 0;
4999#endif 5236
5000 /* 5237 /*
5001 * The idle tasks have their own, simple scheduling class: 5238 * The idle tasks have their own, simple scheduling class:
5002 */ 5239 */
@@ -5077,7 +5314,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5077 goto out; 5314 goto out;
5078 } 5315 }
5079 5316
5080 p->cpus_allowed = new_mask; 5317 if (p->sched_class->set_cpus_allowed)
5318 p->sched_class->set_cpus_allowed(p, &new_mask);
5319 else {
5320 p->cpus_allowed = new_mask;
5321 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5322 }
5323
5081 /* Can the task run on the task's current CPU? If so, we're done */ 5324 /* Can the task run on the task's current CPU? If so, we're done */
5082 if (cpu_isset(task_cpu(p), new_mask)) 5325 if (cpu_isset(task_cpu(p), new_mask))
5083 goto out; 5326 goto out;
@@ -5569,9 +5812,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5569 struct rq *rq; 5812 struct rq *rq;
5570 5813
5571 switch (action) { 5814 switch (action) {
5572 case CPU_LOCK_ACQUIRE:
5573 mutex_lock(&sched_hotcpu_mutex);
5574 break;
5575 5815
5576 case CPU_UP_PREPARE: 5816 case CPU_UP_PREPARE:
5577 case CPU_UP_PREPARE_FROZEN: 5817 case CPU_UP_PREPARE_FROZEN:
@@ -5590,6 +5830,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5590 case CPU_ONLINE_FROZEN: 5830 case CPU_ONLINE_FROZEN:
5591 /* Strictly unnecessary, as first user will wake it. */ 5831 /* Strictly unnecessary, as first user will wake it. */
5592 wake_up_process(cpu_rq(cpu)->migration_thread); 5832 wake_up_process(cpu_rq(cpu)->migration_thread);
5833
5834 /* Update our root-domain */
5835 rq = cpu_rq(cpu);
5836 spin_lock_irqsave(&rq->lock, flags);
5837 if (rq->rd) {
5838 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5839 cpu_set(cpu, rq->rd->online);
5840 }
5841 spin_unlock_irqrestore(&rq->lock, flags);
5593 break; 5842 break;
5594 5843
5595#ifdef CONFIG_HOTPLUG_CPU 5844#ifdef CONFIG_HOTPLUG_CPU
@@ -5640,10 +5889,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5640 } 5889 }
5641 spin_unlock_irq(&rq->lock); 5890 spin_unlock_irq(&rq->lock);
5642 break; 5891 break;
5643#endif 5892
5644 case CPU_LOCK_RELEASE: 5893 case CPU_DOWN_PREPARE:
5645 mutex_unlock(&sched_hotcpu_mutex); 5894 /* Update our root-domain */
5895 rq = cpu_rq(cpu);
5896 spin_lock_irqsave(&rq->lock, flags);
5897 if (rq->rd) {
5898 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5899 cpu_clear(cpu, rq->rd->online);
5900 }
5901 spin_unlock_irqrestore(&rq->lock, flags);
5646 break; 5902 break;
5903#endif
5647 } 5904 }
5648 return NOTIFY_OK; 5905 return NOTIFY_OK;
5649} 5906}
@@ -5831,11 +6088,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5831 return 1; 6088 return 1;
5832} 6089}
5833 6090
6091static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6092{
6093 unsigned long flags;
6094 const struct sched_class *class;
6095
6096 spin_lock_irqsave(&rq->lock, flags);
6097
6098 if (rq->rd) {
6099 struct root_domain *old_rd = rq->rd;
6100
6101 for (class = sched_class_highest; class; class = class->next) {
6102 if (class->leave_domain)
6103 class->leave_domain(rq);
6104 }
6105
6106 cpu_clear(rq->cpu, old_rd->span);
6107 cpu_clear(rq->cpu, old_rd->online);
6108
6109 if (atomic_dec_and_test(&old_rd->refcount))
6110 kfree(old_rd);
6111 }
6112
6113 atomic_inc(&rd->refcount);
6114 rq->rd = rd;
6115
6116 cpu_set(rq->cpu, rd->span);
6117 if (cpu_isset(rq->cpu, cpu_online_map))
6118 cpu_set(rq->cpu, rd->online);
6119
6120 for (class = sched_class_highest; class; class = class->next) {
6121 if (class->join_domain)
6122 class->join_domain(rq);
6123 }
6124
6125 spin_unlock_irqrestore(&rq->lock, flags);
6126}
6127
6128static void init_rootdomain(struct root_domain *rd)
6129{
6130 memset(rd, 0, sizeof(*rd));
6131
6132 cpus_clear(rd->span);
6133 cpus_clear(rd->online);
6134}
6135
6136static void init_defrootdomain(void)
6137{
6138 init_rootdomain(&def_root_domain);
6139 atomic_set(&def_root_domain.refcount, 1);
6140}
6141
6142static struct root_domain *alloc_rootdomain(void)
6143{
6144 struct root_domain *rd;
6145
6146 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6147 if (!rd)
6148 return NULL;
6149
6150 init_rootdomain(rd);
6151
6152 return rd;
6153}
6154
5834/* 6155/*
5835 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6156 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5836 * hold the hotplug lock. 6157 * hold the hotplug lock.
5837 */ 6158 */
5838static void cpu_attach_domain(struct sched_domain *sd, int cpu) 6159static void
6160cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5839{ 6161{
5840 struct rq *rq = cpu_rq(cpu); 6162 struct rq *rq = cpu_rq(cpu);
5841 struct sched_domain *tmp; 6163 struct sched_domain *tmp;
@@ -5860,6 +6182,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5860 6182
5861 sched_domain_debug(sd, cpu); 6183 sched_domain_debug(sd, cpu);
5862 6184
6185 rq_attach_root(rq, rd);
5863 rcu_assign_pointer(rq->sd, sd); 6186 rcu_assign_pointer(rq->sd, sd);
5864} 6187}
5865 6188
@@ -6228,6 +6551,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6228static int build_sched_domains(const cpumask_t *cpu_map) 6551static int build_sched_domains(const cpumask_t *cpu_map)
6229{ 6552{
6230 int i; 6553 int i;
6554 struct root_domain *rd;
6231#ifdef CONFIG_NUMA 6555#ifdef CONFIG_NUMA
6232 struct sched_group **sched_group_nodes = NULL; 6556 struct sched_group **sched_group_nodes = NULL;
6233 int sd_allnodes = 0; 6557 int sd_allnodes = 0;
@@ -6244,6 +6568,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6244 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6568 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6245#endif 6569#endif
6246 6570
6571 rd = alloc_rootdomain();
6572 if (!rd) {
6573 printk(KERN_WARNING "Cannot alloc root domain\n");
6574 return -ENOMEM;
6575 }
6576
6247 /* 6577 /*
6248 * Set up domains for cpus specified by the cpu_map. 6578 * Set up domains for cpus specified by the cpu_map.
6249 */ 6579 */
@@ -6460,7 +6790,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6460#else 6790#else
6461 sd = &per_cpu(phys_domains, i); 6791 sd = &per_cpu(phys_domains, i);
6462#endif 6792#endif
6463 cpu_attach_domain(sd, i); 6793 cpu_attach_domain(sd, rd, i);
6464 } 6794 }
6465 6795
6466 return 0; 6796 return 0;
@@ -6518,7 +6848,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6518 unregister_sched_domain_sysctl(); 6848 unregister_sched_domain_sysctl();
6519 6849
6520 for_each_cpu_mask(i, *cpu_map) 6850 for_each_cpu_mask(i, *cpu_map)
6521 cpu_attach_domain(NULL, i); 6851 cpu_attach_domain(NULL, &def_root_domain, i);
6522 synchronize_sched(); 6852 synchronize_sched();
6523 arch_destroy_sched_domains(cpu_map); 6853 arch_destroy_sched_domains(cpu_map);
6524} 6854}
@@ -6548,6 +6878,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6548{ 6878{
6549 int i, j; 6879 int i, j;
6550 6880
6881 lock_doms_cur();
6882
6551 /* always unregister in case we don't destroy any domains */ 6883 /* always unregister in case we don't destroy any domains */
6552 unregister_sched_domain_sysctl(); 6884 unregister_sched_domain_sysctl();
6553 6885
@@ -6588,6 +6920,8 @@ match2:
6588 ndoms_cur = ndoms_new; 6920 ndoms_cur = ndoms_new;
6589 6921
6590 register_sched_domain_sysctl(); 6922 register_sched_domain_sysctl();
6923
6924 unlock_doms_cur();
6591} 6925}
6592 6926
6593#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6927#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -6595,10 +6929,10 @@ static int arch_reinit_sched_domains(void)
6595{ 6929{
6596 int err; 6930 int err;
6597 6931
6598 mutex_lock(&sched_hotcpu_mutex); 6932 get_online_cpus();
6599 detach_destroy_domains(&cpu_online_map); 6933 detach_destroy_domains(&cpu_online_map);
6600 err = arch_init_sched_domains(&cpu_online_map); 6934 err = arch_init_sched_domains(&cpu_online_map);
6601 mutex_unlock(&sched_hotcpu_mutex); 6935 put_online_cpus();
6602 6936
6603 return err; 6937 return err;
6604} 6938}
@@ -6709,12 +7043,12 @@ void __init sched_init_smp(void)
6709{ 7043{
6710 cpumask_t non_isolated_cpus; 7044 cpumask_t non_isolated_cpus;
6711 7045
6712 mutex_lock(&sched_hotcpu_mutex); 7046 get_online_cpus();
6713 arch_init_sched_domains(&cpu_online_map); 7047 arch_init_sched_domains(&cpu_online_map);
6714 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7048 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6715 if (cpus_empty(non_isolated_cpus)) 7049 if (cpus_empty(non_isolated_cpus))
6716 cpu_set(smp_processor_id(), non_isolated_cpus); 7050 cpu_set(smp_processor_id(), non_isolated_cpus);
6717 mutex_unlock(&sched_hotcpu_mutex); 7051 put_online_cpus();
6718 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7052 /* XXX: Theoretical race here - CPU may be hotplugged now */
6719 hotcpu_notifier(update_sched_domains, 0); 7053 hotcpu_notifier(update_sched_domains, 0);
6720 7054
@@ -6722,6 +7056,21 @@ void __init sched_init_smp(void)
6722 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7056 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6723 BUG(); 7057 BUG();
6724 sched_init_granularity(); 7058 sched_init_granularity();
7059
7060#ifdef CONFIG_FAIR_GROUP_SCHED
7061 if (nr_cpu_ids == 1)
7062 return;
7063
7064 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7065 "group_balance");
7066 if (!IS_ERR(lb_monitor_task)) {
7067 lb_monitor_task->flags |= PF_NOFREEZE;
7068 wake_up_process(lb_monitor_task);
7069 } else {
7070 printk(KERN_ERR "Could not create load balance monitor thread"
7071 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
7072 }
7073#endif
6725} 7074}
6726#else 7075#else
6727void __init sched_init_smp(void) 7076void __init sched_init_smp(void)
@@ -6746,13 +7095,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6746 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7095 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6747} 7096}
6748 7097
7098static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7099{
7100 struct rt_prio_array *array;
7101 int i;
7102
7103 array = &rt_rq->active;
7104 for (i = 0; i < MAX_RT_PRIO; i++) {
7105 INIT_LIST_HEAD(array->queue + i);
7106 __clear_bit(i, array->bitmap);
7107 }
7108 /* delimiter for bitsearch: */
7109 __set_bit(MAX_RT_PRIO, array->bitmap);
7110
7111#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
7112 rt_rq->highest_prio = MAX_RT_PRIO;
7113#endif
7114#ifdef CONFIG_SMP
7115 rt_rq->rt_nr_migratory = 0;
7116 rt_rq->overloaded = 0;
7117#endif
7118
7119 rt_rq->rt_time = 0;
7120 rt_rq->rt_throttled = 0;
7121
7122#ifdef CONFIG_FAIR_GROUP_SCHED
7123 rt_rq->rq = rq;
7124#endif
7125}
7126
7127#ifdef CONFIG_FAIR_GROUP_SCHED
7128static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7129 struct cfs_rq *cfs_rq, struct sched_entity *se,
7130 int cpu, int add)
7131{
7132 tg->cfs_rq[cpu] = cfs_rq;
7133 init_cfs_rq(cfs_rq, rq);
7134 cfs_rq->tg = tg;
7135 if (add)
7136 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7137
7138 tg->se[cpu] = se;
7139 se->cfs_rq = &rq->cfs;
7140 se->my_q = cfs_rq;
7141 se->load.weight = tg->shares;
7142 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7143 se->parent = NULL;
7144}
7145
7146static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7147 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7148 int cpu, int add)
7149{
7150 tg->rt_rq[cpu] = rt_rq;
7151 init_rt_rq(rt_rq, rq);
7152 rt_rq->tg = tg;
7153 rt_rq->rt_se = rt_se;
7154 if (add)
7155 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7156
7157 tg->rt_se[cpu] = rt_se;
7158 rt_se->rt_rq = &rq->rt;
7159 rt_se->my_q = rt_rq;
7160 rt_se->parent = NULL;
7161 INIT_LIST_HEAD(&rt_se->run_list);
7162}
7163#endif
7164
6749void __init sched_init(void) 7165void __init sched_init(void)
6750{ 7166{
6751 int highest_cpu = 0; 7167 int highest_cpu = 0;
6752 int i, j; 7168 int i, j;
6753 7169
7170#ifdef CONFIG_SMP
7171 init_defrootdomain();
7172#endif
7173
7174#ifdef CONFIG_FAIR_GROUP_SCHED
7175 list_add(&init_task_group.list, &task_groups);
7176#endif
7177
6754 for_each_possible_cpu(i) { 7178 for_each_possible_cpu(i) {
6755 struct rt_prio_array *array;
6756 struct rq *rq; 7179 struct rq *rq;
6757 7180
6758 rq = cpu_rq(i); 7181 rq = cpu_rq(i);
@@ -6761,52 +7184,39 @@ void __init sched_init(void)
6761 rq->nr_running = 0; 7184 rq->nr_running = 0;
6762 rq->clock = 1; 7185 rq->clock = 1;
6763 init_cfs_rq(&rq->cfs, rq); 7186 init_cfs_rq(&rq->cfs, rq);
7187 init_rt_rq(&rq->rt, rq);
6764#ifdef CONFIG_FAIR_GROUP_SCHED 7188#ifdef CONFIG_FAIR_GROUP_SCHED
6765 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6766 {
6767 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6768 struct sched_entity *se =
6769 &per_cpu(init_sched_entity, i);
6770
6771 init_cfs_rq_p[i] = cfs_rq;
6772 init_cfs_rq(cfs_rq, rq);
6773 cfs_rq->tg = &init_task_group;
6774 list_add(&cfs_rq->leaf_cfs_rq_list,
6775 &rq->leaf_cfs_rq_list);
6776
6777 init_sched_entity_p[i] = se;
6778 se->cfs_rq = &rq->cfs;
6779 se->my_q = cfs_rq;
6780 se->load.weight = init_task_group_load;
6781 se->load.inv_weight =
6782 div64_64(1ULL<<32, init_task_group_load);
6783 se->parent = NULL;
6784 }
6785 init_task_group.shares = init_task_group_load; 7189 init_task_group.shares = init_task_group_load;
6786 spin_lock_init(&init_task_group.lock); 7190 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7191 init_tg_cfs_entry(rq, &init_task_group,
7192 &per_cpu(init_cfs_rq, i),
7193 &per_cpu(init_sched_entity, i), i, 1);
7194
7195 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7196 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7197 init_tg_rt_entry(rq, &init_task_group,
7198 &per_cpu(init_rt_rq, i),
7199 &per_cpu(init_sched_rt_entity, i), i, 1);
6787#endif 7200#endif
7201 rq->rt_period_expire = 0;
7202 rq->rt_throttled = 0;
6788 7203
6789 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7204 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6790 rq->cpu_load[j] = 0; 7205 rq->cpu_load[j] = 0;
6791#ifdef CONFIG_SMP 7206#ifdef CONFIG_SMP
6792 rq->sd = NULL; 7207 rq->sd = NULL;
7208 rq->rd = NULL;
6793 rq->active_balance = 0; 7209 rq->active_balance = 0;
6794 rq->next_balance = jiffies; 7210 rq->next_balance = jiffies;
6795 rq->push_cpu = 0; 7211 rq->push_cpu = 0;
6796 rq->cpu = i; 7212 rq->cpu = i;
6797 rq->migration_thread = NULL; 7213 rq->migration_thread = NULL;
6798 INIT_LIST_HEAD(&rq->migration_queue); 7214 INIT_LIST_HEAD(&rq->migration_queue);
7215 rq_attach_root(rq, &def_root_domain);
6799#endif 7216#endif
7217 init_rq_hrtick(rq);
6800 atomic_set(&rq->nr_iowait, 0); 7218 atomic_set(&rq->nr_iowait, 0);
6801
6802 array = &rq->rt.active;
6803 for (j = 0; j < MAX_RT_PRIO; j++) {
6804 INIT_LIST_HEAD(array->queue + j);
6805 __clear_bit(j, array->bitmap);
6806 }
6807 highest_cpu = i; 7219 highest_cpu = i;
6808 /* delimiter for bitsearch: */
6809 __set_bit(MAX_RT_PRIO, array->bitmap);
6810 } 7220 }
6811 7221
6812 set_load_weight(&init_task); 7222 set_load_weight(&init_task);
@@ -6975,12 +7385,187 @@ void set_curr_task(int cpu, struct task_struct *p)
6975 7385
6976#ifdef CONFIG_FAIR_GROUP_SCHED 7386#ifdef CONFIG_FAIR_GROUP_SCHED
6977 7387
7388#ifdef CONFIG_SMP
7389/*
7390 * distribute shares of all task groups among their schedulable entities,
7391 * to reflect load distribution across cpus.
7392 */
7393static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7394{
7395 struct cfs_rq *cfs_rq;
7396 struct rq *rq = cpu_rq(this_cpu);
7397 cpumask_t sdspan = sd->span;
7398 int balanced = 1;
7399
7400 /* Walk thr' all the task groups that we have */
7401 for_each_leaf_cfs_rq(rq, cfs_rq) {
7402 int i;
7403 unsigned long total_load = 0, total_shares;
7404 struct task_group *tg = cfs_rq->tg;
7405
7406 /* Gather total task load of this group across cpus */
7407 for_each_cpu_mask(i, sdspan)
7408 total_load += tg->cfs_rq[i]->load.weight;
7409
7410 /* Nothing to do if this group has no load */
7411 if (!total_load)
7412 continue;
7413
7414 /*
7415 * tg->shares represents the number of cpu shares the task group
7416 * is eligible to hold on a single cpu. On N cpus, it is
7417 * eligible to hold (N * tg->shares) number of cpu shares.
7418 */
7419 total_shares = tg->shares * cpus_weight(sdspan);
7420
7421 /*
7422 * redistribute total_shares across cpus as per the task load
7423 * distribution.
7424 */
7425 for_each_cpu_mask(i, sdspan) {
7426 unsigned long local_load, local_shares;
7427
7428 local_load = tg->cfs_rq[i]->load.weight;
7429 local_shares = (local_load * total_shares) / total_load;
7430 if (!local_shares)
7431 local_shares = MIN_GROUP_SHARES;
7432 if (local_shares == tg->se[i]->load.weight)
7433 continue;
7434
7435 spin_lock_irq(&cpu_rq(i)->lock);
7436 set_se_shares(tg->se[i], local_shares);
7437 spin_unlock_irq(&cpu_rq(i)->lock);
7438 balanced = 0;
7439 }
7440 }
7441
7442 return balanced;
7443}
7444
7445/*
7446 * How frequently should we rebalance_shares() across cpus?
7447 *
7448 * The more frequently we rebalance shares, the more accurate is the fairness
7449 * of cpu bandwidth distribution between task groups. However higher frequency
7450 * also implies increased scheduling overhead.
7451 *
7452 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7453 * consecutive calls to rebalance_shares() in the same sched domain.
7454 *
7455 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7456 * consecutive calls to rebalance_shares() in the same sched domain.
7457 *
7458 * These settings allow for the appropriate trade-off between accuracy of
7459 * fairness and the associated overhead.
7460 *
7461 */
7462
7463/* default: 8ms, units: milliseconds */
7464const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7465
7466/* default: 128ms, units: milliseconds */
7467const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7468
7469/* kernel thread that runs rebalance_shares() periodically */
7470static int load_balance_monitor(void *unused)
7471{
7472 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7473 struct sched_param schedparm;
7474 int ret;
7475
7476 /*
7477 * We don't want this thread's execution to be limited by the shares
7478 * assigned to default group (init_task_group). Hence make it run
7479 * as a SCHED_RR RT task at the lowest priority.
7480 */
7481 schedparm.sched_priority = 1;
7482 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7483 if (ret)
7484 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7485 " monitor thread (error = %d) \n", ret);
7486
7487 while (!kthread_should_stop()) {
7488 int i, cpu, balanced = 1;
7489
7490 /* Prevent cpus going down or coming up */
7491 get_online_cpus();
7492 /* lockout changes to doms_cur[] array */
7493 lock_doms_cur();
7494 /*
7495 * Enter a rcu read-side critical section to safely walk rq->sd
7496 * chain on various cpus and to walk task group list
7497 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7498 */
7499 rcu_read_lock();
7500
7501 for (i = 0; i < ndoms_cur; i++) {
7502 cpumask_t cpumap = doms_cur[i];
7503 struct sched_domain *sd = NULL, *sd_prev = NULL;
7504
7505 cpu = first_cpu(cpumap);
7506
7507 /* Find the highest domain at which to balance shares */
7508 for_each_domain(cpu, sd) {
7509 if (!(sd->flags & SD_LOAD_BALANCE))
7510 continue;
7511 sd_prev = sd;
7512 }
7513
7514 sd = sd_prev;
7515 /* sd == NULL? No load balance reqd in this domain */
7516 if (!sd)
7517 continue;
7518
7519 balanced &= rebalance_shares(sd, cpu);
7520 }
7521
7522 rcu_read_unlock();
7523
7524 unlock_doms_cur();
7525 put_online_cpus();
7526
7527 if (!balanced)
7528 timeout = sysctl_sched_min_bal_int_shares;
7529 else if (timeout < sysctl_sched_max_bal_int_shares)
7530 timeout *= 2;
7531
7532 msleep_interruptible(timeout);
7533 }
7534
7535 return 0;
7536}
7537#endif /* CONFIG_SMP */
7538
7539static void free_sched_group(struct task_group *tg)
7540{
7541 int i;
7542
7543 for_each_possible_cpu(i) {
7544 if (tg->cfs_rq)
7545 kfree(tg->cfs_rq[i]);
7546 if (tg->se)
7547 kfree(tg->se[i]);
7548 if (tg->rt_rq)
7549 kfree(tg->rt_rq[i]);
7550 if (tg->rt_se)
7551 kfree(tg->rt_se[i]);
7552 }
7553
7554 kfree(tg->cfs_rq);
7555 kfree(tg->se);
7556 kfree(tg->rt_rq);
7557 kfree(tg->rt_se);
7558 kfree(tg);
7559}
7560
6978/* allocate runqueue etc for a new task group */ 7561/* allocate runqueue etc for a new task group */
6979struct task_group *sched_create_group(void) 7562struct task_group *sched_create_group(void)
6980{ 7563{
6981 struct task_group *tg; 7564 struct task_group *tg;
6982 struct cfs_rq *cfs_rq; 7565 struct cfs_rq *cfs_rq;
6983 struct sched_entity *se; 7566 struct sched_entity *se;
7567 struct rt_rq *rt_rq;
7568 struct sched_rt_entity *rt_se;
6984 struct rq *rq; 7569 struct rq *rq;
6985 int i; 7570 int i;
6986 7571
@@ -6994,97 +7579,89 @@ struct task_group *sched_create_group(void)
6994 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7579 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6995 if (!tg->se) 7580 if (!tg->se)
6996 goto err; 7581 goto err;
7582 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7583 if (!tg->rt_rq)
7584 goto err;
7585 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7586 if (!tg->rt_se)
7587 goto err;
7588
7589 tg->shares = NICE_0_LOAD;
7590 tg->rt_ratio = 0; /* XXX */
6997 7591
6998 for_each_possible_cpu(i) { 7592 for_each_possible_cpu(i) {
6999 rq = cpu_rq(i); 7593 rq = cpu_rq(i);
7000 7594
7001 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, 7595 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
7002 cpu_to_node(i)); 7596 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7003 if (!cfs_rq) 7597 if (!cfs_rq)
7004 goto err; 7598 goto err;
7005 7599
7006 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, 7600 se = kmalloc_node(sizeof(struct sched_entity),
7007 cpu_to_node(i)); 7601 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7008 if (!se) 7602 if (!se)
7009 goto err; 7603 goto err;
7010 7604
7011 memset(cfs_rq, 0, sizeof(struct cfs_rq)); 7605 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7012 memset(se, 0, sizeof(struct sched_entity)); 7606 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7607 if (!rt_rq)
7608 goto err;
7013 7609
7014 tg->cfs_rq[i] = cfs_rq; 7610 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7015 init_cfs_rq(cfs_rq, rq); 7611 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7016 cfs_rq->tg = tg; 7612 if (!rt_se)
7613 goto err;
7017 7614
7018 tg->se[i] = se; 7615 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7019 se->cfs_rq = &rq->cfs; 7616 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7020 se->my_q = cfs_rq;
7021 se->load.weight = NICE_0_LOAD;
7022 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
7023 se->parent = NULL;
7024 } 7617 }
7025 7618
7619 lock_task_group_list();
7026 for_each_possible_cpu(i) { 7620 for_each_possible_cpu(i) {
7027 rq = cpu_rq(i); 7621 rq = cpu_rq(i);
7028 cfs_rq = tg->cfs_rq[i]; 7622 cfs_rq = tg->cfs_rq[i];
7029 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7623 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7624 rt_rq = tg->rt_rq[i];
7625 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7030 } 7626 }
7031 7627 list_add_rcu(&tg->list, &task_groups);
7032 tg->shares = NICE_0_LOAD; 7628 unlock_task_group_list();
7033 spin_lock_init(&tg->lock);
7034 7629
7035 return tg; 7630 return tg;
7036 7631
7037err: 7632err:
7038 for_each_possible_cpu(i) { 7633 free_sched_group(tg);
7039 if (tg->cfs_rq)
7040 kfree(tg->cfs_rq[i]);
7041 if (tg->se)
7042 kfree(tg->se[i]);
7043 }
7044 kfree(tg->cfs_rq);
7045 kfree(tg->se);
7046 kfree(tg);
7047
7048 return ERR_PTR(-ENOMEM); 7634 return ERR_PTR(-ENOMEM);
7049} 7635}
7050 7636
7051/* rcu callback to free various structures associated with a task group */ 7637/* rcu callback to free various structures associated with a task group */
7052static void free_sched_group(struct rcu_head *rhp) 7638static void free_sched_group_rcu(struct rcu_head *rhp)
7053{ 7639{
7054 struct task_group *tg = container_of(rhp, struct task_group, rcu);
7055 struct cfs_rq *cfs_rq;
7056 struct sched_entity *se;
7057 int i;
7058
7059 /* now it should be safe to free those cfs_rqs */ 7640 /* now it should be safe to free those cfs_rqs */
7060 for_each_possible_cpu(i) { 7641 free_sched_group(container_of(rhp, struct task_group, rcu));
7061 cfs_rq = tg->cfs_rq[i];
7062 kfree(cfs_rq);
7063
7064 se = tg->se[i];
7065 kfree(se);
7066 }
7067
7068 kfree(tg->cfs_rq);
7069 kfree(tg->se);
7070 kfree(tg);
7071} 7642}
7072 7643
7073/* Destroy runqueue etc associated with a task group */ 7644/* Destroy runqueue etc associated with a task group */
7074void sched_destroy_group(struct task_group *tg) 7645void sched_destroy_group(struct task_group *tg)
7075{ 7646{
7076 struct cfs_rq *cfs_rq = NULL; 7647 struct cfs_rq *cfs_rq = NULL;
7648 struct rt_rq *rt_rq = NULL;
7077 int i; 7649 int i;
7078 7650
7651 lock_task_group_list();
7079 for_each_possible_cpu(i) { 7652 for_each_possible_cpu(i) {
7080 cfs_rq = tg->cfs_rq[i]; 7653 cfs_rq = tg->cfs_rq[i];
7081 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7654 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7655 rt_rq = tg->rt_rq[i];
7656 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7082 } 7657 }
7658 list_del_rcu(&tg->list);
7659 unlock_task_group_list();
7083 7660
7084 BUG_ON(!cfs_rq); 7661 BUG_ON(!cfs_rq);
7085 7662
7086 /* wait for possible concurrent references to cfs_rqs complete */ 7663 /* wait for possible concurrent references to cfs_rqs complete */
7087 call_rcu(&tg->rcu, free_sched_group); 7664 call_rcu(&tg->rcu, free_sched_group_rcu);
7088} 7665}
7089 7666
7090/* change task's runqueue when it moves between groups. 7667/* change task's runqueue when it moves between groups.
@@ -7100,11 +7677,6 @@ void sched_move_task(struct task_struct *tsk)
7100 7677
7101 rq = task_rq_lock(tsk, &flags); 7678 rq = task_rq_lock(tsk, &flags);
7102 7679
7103 if (tsk->sched_class != &fair_sched_class) {
7104 set_task_cfs_rq(tsk, task_cpu(tsk));
7105 goto done;
7106 }
7107
7108 update_rq_clock(rq); 7680 update_rq_clock(rq);
7109 7681
7110 running = task_current(rq, tsk); 7682 running = task_current(rq, tsk);
@@ -7116,7 +7688,7 @@ void sched_move_task(struct task_struct *tsk)
7116 tsk->sched_class->put_prev_task(rq, tsk); 7688 tsk->sched_class->put_prev_task(rq, tsk);
7117 } 7689 }
7118 7690
7119 set_task_cfs_rq(tsk, task_cpu(tsk)); 7691 set_task_rq(tsk, task_cpu(tsk));
7120 7692
7121 if (on_rq) { 7693 if (on_rq) {
7122 if (unlikely(running)) 7694 if (unlikely(running))
@@ -7124,53 +7696,82 @@ void sched_move_task(struct task_struct *tsk)
7124 enqueue_task(rq, tsk, 0); 7696 enqueue_task(rq, tsk, 0);
7125 } 7697 }
7126 7698
7127done:
7128 task_rq_unlock(rq, &flags); 7699 task_rq_unlock(rq, &flags);
7129} 7700}
7130 7701
7702/* rq->lock to be locked by caller */
7131static void set_se_shares(struct sched_entity *se, unsigned long shares) 7703static void set_se_shares(struct sched_entity *se, unsigned long shares)
7132{ 7704{
7133 struct cfs_rq *cfs_rq = se->cfs_rq; 7705 struct cfs_rq *cfs_rq = se->cfs_rq;
7134 struct rq *rq = cfs_rq->rq; 7706 struct rq *rq = cfs_rq->rq;
7135 int on_rq; 7707 int on_rq;
7136 7708
7137 spin_lock_irq(&rq->lock); 7709 if (!shares)
7710 shares = MIN_GROUP_SHARES;
7138 7711
7139 on_rq = se->on_rq; 7712 on_rq = se->on_rq;
7140 if (on_rq) 7713 if (on_rq) {
7141 dequeue_entity(cfs_rq, se, 0); 7714 dequeue_entity(cfs_rq, se, 0);
7715 dec_cpu_load(rq, se->load.weight);
7716 }
7142 7717
7143 se->load.weight = shares; 7718 se->load.weight = shares;
7144 se->load.inv_weight = div64_64((1ULL<<32), shares); 7719 se->load.inv_weight = div64_64((1ULL<<32), shares);
7145 7720
7146 if (on_rq) 7721 if (on_rq) {
7147 enqueue_entity(cfs_rq, se, 0); 7722 enqueue_entity(cfs_rq, se, 0);
7148 7723 inc_cpu_load(rq, se->load.weight);
7149 spin_unlock_irq(&rq->lock); 7724 }
7150} 7725}
7151 7726
7152int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7727int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7153{ 7728{
7154 int i; 7729 int i;
7730 struct cfs_rq *cfs_rq;
7731 struct rq *rq;
7732
7733 lock_task_group_list();
7734 if (tg->shares == shares)
7735 goto done;
7736
7737 if (shares < MIN_GROUP_SHARES)
7738 shares = MIN_GROUP_SHARES;
7155 7739
7156 /* 7740 /*
7157 * A weight of 0 or 1 can cause arithmetics problems. 7741 * Prevent any load balance activity (rebalance_shares,
7158 * (The default weight is 1024 - so there's no practical 7742 * load_balance_fair) from referring to this group first,
7159 * limitation from this.) 7743 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7160 */ 7744 */
7161 if (shares < 2) 7745 for_each_possible_cpu(i) {
7162 shares = 2; 7746 cfs_rq = tg->cfs_rq[i];
7747 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7748 }
7163 7749
7164 spin_lock(&tg->lock); 7750 /* wait for any ongoing reference to this group to finish */
7165 if (tg->shares == shares) 7751 synchronize_sched();
7166 goto done;
7167 7752
7753 /*
7754 * Now we are free to modify the group's share on each cpu
7755 * w/o tripping rebalance_share or load_balance_fair.
7756 */
7168 tg->shares = shares; 7757 tg->shares = shares;
7169 for_each_possible_cpu(i) 7758 for_each_possible_cpu(i) {
7759 spin_lock_irq(&cpu_rq(i)->lock);
7170 set_se_shares(tg->se[i], shares); 7760 set_se_shares(tg->se[i], shares);
7761 spin_unlock_irq(&cpu_rq(i)->lock);
7762 }
7171 7763
7764 /*
7765 * Enable load balance activity on this group, by inserting it back on
7766 * each cpu's rq->leaf_cfs_rq_list.
7767 */
7768 for_each_possible_cpu(i) {
7769 rq = cpu_rq(i);
7770 cfs_rq = tg->cfs_rq[i];
7771 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7772 }
7172done: 7773done:
7173 spin_unlock(&tg->lock); 7774 unlock_task_group_list();
7174 return 0; 7775 return 0;
7175} 7776}
7176 7777
@@ -7179,6 +7780,31 @@ unsigned long sched_group_shares(struct task_group *tg)
7179 return tg->shares; 7780 return tg->shares;
7180} 7781}
7181 7782
7783/*
7784 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
7785 */
7786int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7787{
7788 struct task_group *tgi;
7789 unsigned long total = 0;
7790
7791 rcu_read_lock();
7792 list_for_each_entry_rcu(tgi, &task_groups, list)
7793 total += tgi->rt_ratio;
7794 rcu_read_unlock();
7795
7796 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7797 return -EINVAL;
7798
7799 tg->rt_ratio = rt_ratio;
7800 return 0;
7801}
7802
7803unsigned long sched_group_rt_ratio(struct task_group *tg)
7804{
7805 return tg->rt_ratio;
7806}
7807
7182#endif /* CONFIG_FAIR_GROUP_SCHED */ 7808#endif /* CONFIG_FAIR_GROUP_SCHED */
7183 7809
7184#ifdef CONFIG_FAIR_CGROUP_SCHED 7810#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7254,12 +7880,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7254 return (u64) tg->shares; 7880 return (u64) tg->shares;
7255} 7881}
7256 7882
7883static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7884 u64 rt_ratio_val)
7885{
7886 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7887}
7888
7889static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7890{
7891 struct task_group *tg = cgroup_tg(cgrp);
7892
7893 return (u64) tg->rt_ratio;
7894}
7895
7257static struct cftype cpu_files[] = { 7896static struct cftype cpu_files[] = {
7258 { 7897 {
7259 .name = "shares", 7898 .name = "shares",
7260 .read_uint = cpu_shares_read_uint, 7899 .read_uint = cpu_shares_read_uint,
7261 .write_uint = cpu_shares_write_uint, 7900 .write_uint = cpu_shares_write_uint,
7262 }, 7901 },
7902 {
7903 .name = "rt_ratio",
7904 .read_uint = cpu_rt_ratio_read_uint,
7905 .write_uint = cpu_rt_ratio_write_uint,
7906 },
7263}; 7907};
7264 7908
7265static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7909static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 80fbbfc04290..4b5e24cf2f4a 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu)
179 PN(prev_clock_raw); 179 PN(prev_clock_raw);
180 P(clock_warps); 180 P(clock_warps);
181 P(clock_overflows); 181 P(clock_overflows);
182 P(clock_underflows);
182 P(clock_deep_idle_events); 183 P(clock_deep_idle_events);
183 PN(clock_max_delta); 184 PN(clock_max_delta);
184 P(cpu_load[0]); 185 P(cpu_load[0]);
@@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
299 PN(se.exec_max); 300 PN(se.exec_max);
300 PN(se.slice_max); 301 PN(se.slice_max);
301 PN(se.wait_max); 302 PN(se.wait_max);
303 PN(se.wait_sum);
304 P(se.wait_count);
302 P(sched_info.bkl_count); 305 P(sched_info.bkl_count);
303 P(se.nr_migrations); 306 P(se.nr_migrations);
304 P(se.nr_migrations_cold); 307 P(se.nr_migrations_cold);
@@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p)
366{ 369{
367#ifdef CONFIG_SCHEDSTATS 370#ifdef CONFIG_SCHEDSTATS
368 p->se.wait_max = 0; 371 p->se.wait_max = 0;
372 p->se.wait_sum = 0;
373 p->se.wait_count = 0;
369 p->se.sleep_max = 0; 374 p->se.sleep_max = 0;
370 p->se.sum_sleep_runtime = 0; 375 p->se.sum_sleep_runtime = 0;
371 p->se.block_max = 0; 376 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index da7c061e7206..72e25c7a3a18 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -20,6 +20,8 @@
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21 */ 21 */
22 22
23#include <linux/latencytop.h>
24
23/* 25/*
24 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
25 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running)
248 unsigned long nr_latency = sched_nr_latency; 250 unsigned long nr_latency = sched_nr_latency;
249 251
250 if (unlikely(nr_running > nr_latency)) { 252 if (unlikely(nr_running > nr_latency)) {
253 period = sysctl_sched_min_granularity;
251 period *= nr_running; 254 period *= nr_running;
252 do_div(period, nr_latency);
253 } 255 }
254 256
255 return period; 257 return period;
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
383{ 385{
384 schedstat_set(se->wait_max, max(se->wait_max, 386 schedstat_set(se->wait_max, max(se->wait_max,
385 rq_of(cfs_rq)->clock - se->wait_start)); 387 rq_of(cfs_rq)->clock - se->wait_start));
388 schedstat_set(se->wait_count, se->wait_count + 1);
389 schedstat_set(se->wait_sum, se->wait_sum +
390 rq_of(cfs_rq)->clock - se->wait_start);
386 schedstat_set(se->wait_start, 0); 391 schedstat_set(se->wait_start, 0);
387} 392}
388 393
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
434#ifdef CONFIG_SCHEDSTATS 439#ifdef CONFIG_SCHEDSTATS
435 if (se->sleep_start) { 440 if (se->sleep_start) {
436 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 441 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
442 struct task_struct *tsk = task_of(se);
437 443
438 if ((s64)delta < 0) 444 if ((s64)delta < 0)
439 delta = 0; 445 delta = 0;
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
443 449
444 se->sleep_start = 0; 450 se->sleep_start = 0;
445 se->sum_sleep_runtime += delta; 451 se->sum_sleep_runtime += delta;
452
453 account_scheduler_latency(tsk, delta >> 10, 1);
446 } 454 }
447 if (se->block_start) { 455 if (se->block_start) {
448 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 456 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
457 struct task_struct *tsk = task_of(se);
449 458
450 if ((s64)delta < 0) 459 if ((s64)delta < 0)
451 delta = 0; 460 delta = 0;
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
462 * time that the task spent sleeping: 471 * time that the task spent sleeping:
463 */ 472 */
464 if (unlikely(prof_on == SLEEP_PROFILING)) { 473 if (unlikely(prof_on == SLEEP_PROFILING)) {
465 struct task_struct *tsk = task_of(se);
466 474
467 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 475 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
468 delta >> 20); 476 delta >> 20);
469 } 477 }
478 account_scheduler_latency(tsk, delta >> 10, 0);
470 } 479 }
471#endif 480#endif
472} 481}
@@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
642 cfs_rq->curr = NULL; 651 cfs_rq->curr = NULL;
643} 652}
644 653
645static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 654static void
655entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
646{ 656{
647 /* 657 /*
648 * Update run-time statistics of the 'current'. 658 * Update run-time statistics of the 'current'.
649 */ 659 */
650 update_curr(cfs_rq); 660 update_curr(cfs_rq);
651 661
662#ifdef CONFIG_SCHED_HRTICK
663 /*
664 * queued ticks are scheduled to match the slice, so don't bother
665 * validating it and just reschedule.
666 */
667 if (queued)
668 return resched_task(rq_of(cfs_rq)->curr);
669 /*
670 * don't let the period tick interfere with the hrtick preemption
671 */
672 if (!sched_feat(DOUBLE_TICK) &&
673 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
674 return;
675#endif
676
652 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 677 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
653 check_preempt_tick(cfs_rq, curr); 678 check_preempt_tick(cfs_rq, curr);
654} 679}
@@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
690 715
691/* Iterate thr' all leaf cfs_rq's on a runqueue */ 716/* Iterate thr' all leaf cfs_rq's on a runqueue */
692#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 717#define for_each_leaf_cfs_rq(rq, cfs_rq) \
693 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 718 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
694 719
695/* Do the two (enqueued) entities belong to the same group ? */ 720/* Do the two (enqueued) entities belong to the same group ? */
696static inline int 721static inline int
@@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
707 return se->parent; 732 return se->parent;
708} 733}
709 734
735#define GROUP_IMBALANCE_PCT 20
736
710#else /* CONFIG_FAIR_GROUP_SCHED */ 737#else /* CONFIG_FAIR_GROUP_SCHED */
711 738
712#define for_each_sched_entity(se) \ 739#define for_each_sched_entity(se) \
@@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
752 779
753#endif /* CONFIG_FAIR_GROUP_SCHED */ 780#endif /* CONFIG_FAIR_GROUP_SCHED */
754 781
782#ifdef CONFIG_SCHED_HRTICK
783static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
784{
785 int requeue = rq->curr == p;
786 struct sched_entity *se = &p->se;
787 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788
789 WARN_ON(task_rq(p) != rq);
790
791 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
792 u64 slice = sched_slice(cfs_rq, se);
793 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
794 s64 delta = slice - ran;
795
796 if (delta < 0) {
797 if (rq->curr == p)
798 resched_task(p);
799 return;
800 }
801
802 /*
803 * Don't schedule slices shorter than 10000ns, that just
804 * doesn't make sense. Rely on vruntime for fairness.
805 */
806 if (!requeue)
807 delta = max(10000LL, delta);
808
809 hrtick_start(rq, delta, requeue);
810 }
811}
812#else
813static inline void
814hrtick_start_fair(struct rq *rq, struct task_struct *p)
815{
816}
817#endif
818
755/* 819/*
756 * The enqueue_task method is called before nr_running is 820 * The enqueue_task method is called before nr_running is
757 * increased. Here we update the fair scheduling stats and 821 * increased. Here we update the fair scheduling stats and
@@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
760static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 824static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
761{ 825{
762 struct cfs_rq *cfs_rq; 826 struct cfs_rq *cfs_rq;
763 struct sched_entity *se = &p->se; 827 struct sched_entity *se = &p->se,
828 *topse = NULL; /* Highest schedulable entity */
829 int incload = 1;
764 830
765 for_each_sched_entity(se) { 831 for_each_sched_entity(se) {
766 if (se->on_rq) 832 topse = se;
833 if (se->on_rq) {
834 incload = 0;
767 break; 835 break;
836 }
768 cfs_rq = cfs_rq_of(se); 837 cfs_rq = cfs_rq_of(se);
769 enqueue_entity(cfs_rq, se, wakeup); 838 enqueue_entity(cfs_rq, se, wakeup);
770 wakeup = 1; 839 wakeup = 1;
771 } 840 }
841 /* Increment cpu load if we just enqueued the first task of a group on
842 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
843 * at the highest grouping level.
844 */
845 if (incload)
846 inc_cpu_load(rq, topse->load.weight);
847
848 hrtick_start_fair(rq, rq->curr);
772} 849}
773 850
774/* 851/*
@@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
779static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 856static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
780{ 857{
781 struct cfs_rq *cfs_rq; 858 struct cfs_rq *cfs_rq;
782 struct sched_entity *se = &p->se; 859 struct sched_entity *se = &p->se,
860 *topse = NULL; /* Highest schedulable entity */
861 int decload = 1;
783 862
784 for_each_sched_entity(se) { 863 for_each_sched_entity(se) {
864 topse = se;
785 cfs_rq = cfs_rq_of(se); 865 cfs_rq = cfs_rq_of(se);
786 dequeue_entity(cfs_rq, se, sleep); 866 dequeue_entity(cfs_rq, se, sleep);
787 /* Don't dequeue parent if it has other entities besides us */ 867 /* Don't dequeue parent if it has other entities besides us */
788 if (cfs_rq->load.weight) 868 if (cfs_rq->load.weight) {
869 if (parent_entity(se))
870 decload = 0;
789 break; 871 break;
872 }
790 sleep = 1; 873 sleep = 1;
791 } 874 }
875 /* Decrement cpu load if we just dequeued the last task of a group on
876 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
877 * at the highest grouping level.
878 */
879 if (decload)
880 dec_cpu_load(rq, topse->load.weight);
881
882 hrtick_start_fair(rq, rq->curr);
792} 883}
793 884
794/* 885/*
@@ -836,6 +927,154 @@ static void yield_task_fair(struct rq *rq)
836} 927}
837 928
838/* 929/*
930 * wake_idle() will wake a task on an idle cpu if task->cpu is
931 * not idle and an idle cpu is available. The span of cpus to
932 * search starts with cpus closest then further out as needed,
933 * so we always favor a closer, idle cpu.
934 *
935 * Returns the CPU we should wake onto.
936 */
937#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
938static int wake_idle(int cpu, struct task_struct *p)
939{
940 cpumask_t tmp;
941 struct sched_domain *sd;
942 int i;
943
944 /*
945 * If it is idle, then it is the best cpu to run this task.
946 *
947 * This cpu is also the best, if it has more than one task already.
948 * Siblings must be also busy(in most cases) as they didn't already
949 * pickup the extra load from this cpu and hence we need not check
950 * sibling runqueue info. This will avoid the checks and cache miss
950 * penalties associated with that.
952 */
953 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
954 return cpu;
955
956 for_each_domain(cpu, sd) {
957 if (sd->flags & SD_WAKE_IDLE) {
958 cpus_and(tmp, sd->span, p->cpus_allowed);
959 for_each_cpu_mask(i, tmp) {
960 if (idle_cpu(i)) {
961 if (i != task_cpu(p)) {
962 schedstat_inc(p,
963 se.nr_wakeups_idle);
964 }
965 return i;
966 }
967 }
968 } else {
969 break;
970 }
971 }
972 return cpu;
973}
974#else
975static inline int wake_idle(int cpu, struct task_struct *p)
976{
977 return cpu;
978}
979#endif
980
981#ifdef CONFIG_SMP
982static int select_task_rq_fair(struct task_struct *p, int sync)
983{
984 int cpu, this_cpu;
985 struct rq *rq;
986 struct sched_domain *sd, *this_sd = NULL;
987 int new_cpu;
988
989 cpu = task_cpu(p);
990 rq = task_rq(p);
991 this_cpu = smp_processor_id();
992 new_cpu = cpu;
993
994 if (cpu == this_cpu)
995 goto out_set_cpu;
996
997 for_each_domain(this_cpu, sd) {
998 if (cpu_isset(cpu, sd->span)) {
999 this_sd = sd;
1000 break;
1001 }
1002 }
1003
1004 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu;
1006
1007 /*
1008 * Check for affine wakeup and passive balancing possibilities.
1009 */
1010 if (this_sd) {
1011 int idx = this_sd->wake_idx;
1012 unsigned int imbalance;
1013 unsigned long load, this_load;
1014
1015 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1016
1017 load = source_load(cpu, idx);
1018 this_load = target_load(this_cpu, idx);
1019
1020 new_cpu = this_cpu; /* Wake to this CPU if we can */
1021
1022 if (this_sd->flags & SD_WAKE_AFFINE) {
1023 unsigned long tl = this_load;
1024 unsigned long tl_per_task;
1025
1026 /*
1027 * Attract cache-cold tasks on sync wakeups:
1028 */
1029 if (sync && !task_hot(p, rq->clock, this_sd))
1030 goto out_set_cpu;
1031
1032 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1033 tl_per_task = cpu_avg_load_per_task(this_cpu);
1034
1035 /*
1036 * If sync wakeup then subtract the (maximum possible)
1037 * effect of the currently running task from the load
1038 * of the current CPU:
1039 */
1040 if (sync)
1041 tl -= current->se.load.weight;
1042
1043 if ((tl <= load &&
1044 tl + target_load(cpu, idx) <= tl_per_task) ||
1045 100*(tl + p->se.load.weight) <= imbalance*load) {
1046 /*
1047 * This domain has SD_WAKE_AFFINE and
1048 * p is cache cold in this domain, and
1049 * there is no bad imbalance.
1050 */
1051 schedstat_inc(this_sd, ttwu_move_affine);
1052 schedstat_inc(p, se.nr_wakeups_affine);
1053 goto out_set_cpu;
1054 }
1055 }
1056
1057 /*
1058 * Start passive balancing when half the imbalance_pct
1059 * limit is reached.
1060 */
1061 if (this_sd->flags & SD_WAKE_BALANCE) {
1062 if (imbalance*this_load <= 100*load) {
1063 schedstat_inc(this_sd, ttwu_move_balance);
1064 schedstat_inc(p, se.nr_wakeups_passive);
1065 goto out_set_cpu;
1066 }
1067 }
1068 }
1069
1070 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1071out_set_cpu:
1072 return wake_idle(new_cpu, p);
1073}
1074#endif /* CONFIG_SMP */
1075
1076
1077/*
839 * Preempt the current task with a newly woken task if needed: 1078 * Preempt the current task with a newly woken task if needed:
840 */ 1079 */
841static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1080static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
@@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
876 1115
877static struct task_struct *pick_next_task_fair(struct rq *rq) 1116static struct task_struct *pick_next_task_fair(struct rq *rq)
878{ 1117{
1118 struct task_struct *p;
879 struct cfs_rq *cfs_rq = &rq->cfs; 1119 struct cfs_rq *cfs_rq = &rq->cfs;
880 struct sched_entity *se; 1120 struct sched_entity *se;
881 1121
@@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
887 cfs_rq = group_cfs_rq(se); 1127 cfs_rq = group_cfs_rq(se);
888 } while (cfs_rq); 1128 } while (cfs_rq);
889 1129
890 return task_of(se); 1130 p = task_of(se);
1131 hrtick_start_fair(rq, p);
1132
1133 return p;
891} 1134}
892 1135
893/* 1136/*
@@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
944 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1187 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
945} 1188}
946 1189
947#ifdef CONFIG_FAIR_GROUP_SCHED
948static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
949{
950 struct sched_entity *curr;
951 struct task_struct *p;
952
953 if (!cfs_rq->nr_running)
954 return MAX_PRIO;
955
956 curr = cfs_rq->curr;
957 if (!curr)
958 curr = __pick_next_entity(cfs_rq);
959
960 p = task_of(curr);
961
962 return p->prio;
963}
964#endif
965
966static unsigned long 1190static unsigned long
967load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1191load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
968 unsigned long max_load_move, 1192 unsigned long max_load_move,
@@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
972 struct cfs_rq *busy_cfs_rq; 1196 struct cfs_rq *busy_cfs_rq;
973 long rem_load_move = max_load_move; 1197 long rem_load_move = max_load_move;
974 struct rq_iterator cfs_rq_iterator; 1198 struct rq_iterator cfs_rq_iterator;
1199 unsigned long load_moved;
975 1200
976 cfs_rq_iterator.start = load_balance_start_fair; 1201 cfs_rq_iterator.start = load_balance_start_fair;
977 cfs_rq_iterator.next = load_balance_next_fair; 1202 cfs_rq_iterator.next = load_balance_next_fair;
978 1203
979 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1204 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
980#ifdef CONFIG_FAIR_GROUP_SCHED 1205#ifdef CONFIG_FAIR_GROUP_SCHED
981 struct cfs_rq *this_cfs_rq; 1206 struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
982 long imbalance; 1207 unsigned long maxload, task_load, group_weight;
983 unsigned long maxload; 1208 unsigned long thisload, per_task_load;
1209 struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
1210
1211 task_load = busy_cfs_rq->load.weight;
1212 group_weight = se->load.weight;
984 1213
985 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1214 /*
1215 * 'group_weight' is contributed by tasks of total weight
1216 * 'task_load'. To move 'rem_load_move' worth of weight only,
1217 * we need to move a maximum task load of:
1218 *
1219 * maxload = (rem_load_move / group_weight) * task_load;
1220 */
1221 maxload = (rem_load_move * task_load) / group_weight;
986 1222
987 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1223 if (!maxload || !task_load)
988 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
989 if (imbalance <= 0)
990 continue; 1224 continue;
991 1225
992 /* Don't pull more than imbalance/2 */ 1226 per_task_load = task_load / busy_cfs_rq->nr_running;
993 imbalance /= 2; 1227 /*
994 maxload = min(rem_load_move, imbalance); 1228 * balance_tasks will try to forcibly move at least one task if
1229 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
1230 * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
1231 */
1232 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
1233 continue;
995 1234
996 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1235 /* Disable priority-based load balance */
1236 *this_best_prio = 0;
1237 thisload = this_cfs_rq->load.weight;
997#else 1238#else
998# define maxload rem_load_move 1239# define maxload rem_load_move
999#endif 1240#endif
@@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1002 * load_balance_[start|next]_fair iterators 1243 * load_balance_[start|next]_fair iterators
1003 */ 1244 */
1004 cfs_rq_iterator.arg = busy_cfs_rq; 1245 cfs_rq_iterator.arg = busy_cfs_rq;
1005 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, 1246 load_moved = balance_tasks(this_rq, this_cpu, busiest,
1006 maxload, sd, idle, all_pinned, 1247 maxload, sd, idle, all_pinned,
1007 this_best_prio, 1248 this_best_prio,
1008 &cfs_rq_iterator); 1249 &cfs_rq_iterator);
1009 1250
1251#ifdef CONFIG_FAIR_GROUP_SCHED
1252 /*
1253 * load_moved holds the task load that was moved. The
1254 * effective (group) weight moved would be:
1255 * load_moved_eff = load_moved/task_load * group_weight;
1256 */
1257 load_moved = (group_weight * load_moved) / task_load;
1258
1259 /* Adjust shares on both cpus to reflect load_moved */
1260 group_weight -= load_moved;
1261 set_se_shares(se, group_weight);
1262
1263 se = busy_cfs_rq->tg->se[this_cpu];
1264 if (!thisload)
1265 group_weight = load_moved;
1266 else
1267 group_weight = se->load.weight + load_moved;
1268 set_se_shares(se, group_weight);
1269#endif
1270
1271 rem_load_move -= load_moved;
1272
1010 if (rem_load_move <= 0) 1273 if (rem_load_move <= 0)
1011 break; 1274 break;
1012 } 1275 }
@@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1042/* 1305/*
1043 * scheduler tick hitting a task of our scheduling class: 1306 * scheduler tick hitting a task of our scheduling class:
1044 */ 1307 */
1045static void task_tick_fair(struct rq *rq, struct task_struct *curr) 1308static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1046{ 1309{
1047 struct cfs_rq *cfs_rq; 1310 struct cfs_rq *cfs_rq;
1048 struct sched_entity *se = &curr->se; 1311 struct sched_entity *se = &curr->se;
1049 1312
1050 for_each_sched_entity(se) { 1313 for_each_sched_entity(se) {
1051 cfs_rq = cfs_rq_of(se); 1314 cfs_rq = cfs_rq_of(se);
1052 entity_tick(cfs_rq, se); 1315 entity_tick(cfs_rq, se, queued);
1053 } 1316 }
1054} 1317}
1055 1318
@@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1087 resched_task(rq->curr); 1350 resched_task(rq->curr);
1088} 1351}
1089 1352
1353/*
1354 * Priority of the task has changed. Check to see if we preempt
1355 * the current task.
1356 */
1357static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1358 int oldprio, int running)
1359{
1360 /*
1361 * Reschedule if we are currently running on this runqueue and
1362 * our priority decreased, or if we are not currently running on
1363 * this runqueue and our priority is higher than the current's
1364 */
1365 if (running) {
1366 if (p->prio > oldprio)
1367 resched_task(rq->curr);
1368 } else
1369 check_preempt_curr(rq, p);
1370}
1371
1372/*
1373 * We switched to the sched_fair class.
1374 */
1375static void switched_to_fair(struct rq *rq, struct task_struct *p,
1376 int running)
1377{
1378 /*
1379 * We were most likely switched from sched_rt, so
1380 * kick off the schedule if running, otherwise just see
1381 * if we can still preempt the current task.
1382 */
1383 if (running)
1384 resched_task(rq->curr);
1385 else
1386 check_preempt_curr(rq, p);
1387}
1388
1090/* Account for a task changing its policy or group. 1389/* Account for a task changing its policy or group.
1091 * 1390 *
1092 * This routine is mostly called to set cfs_rq->curr field when a task 1391 * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = {
1108 .enqueue_task = enqueue_task_fair, 1407 .enqueue_task = enqueue_task_fair,
1109 .dequeue_task = dequeue_task_fair, 1408 .dequeue_task = dequeue_task_fair,
1110 .yield_task = yield_task_fair, 1409 .yield_task = yield_task_fair,
1410#ifdef CONFIG_SMP
1411 .select_task_rq = select_task_rq_fair,
1412#endif /* CONFIG_SMP */
1111 1413
1112 .check_preempt_curr = check_preempt_wakeup, 1414 .check_preempt_curr = check_preempt_wakeup,
1113 1415
@@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = {
1122 .set_curr_task = set_curr_task_fair, 1424 .set_curr_task = set_curr_task_fair,
1123 .task_tick = task_tick_fair, 1425 .task_tick = task_tick_fair,
1124 .task_new = task_new_fair, 1426 .task_new = task_new_fair,
1427
1428 .prio_changed = prio_changed_fair,
1429 .switched_to = switched_to_fair,
1125}; 1430};
1126 1431
1127#ifdef CONFIG_SCHED_DEBUG 1432#ifdef CONFIG_SCHED_DEBUG
@@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1132#ifdef CONFIG_FAIR_GROUP_SCHED 1437#ifdef CONFIG_FAIR_GROUP_SCHED
1133 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); 1438 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1134#endif 1439#endif
1440 rcu_read_lock();
1135 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1441 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1136 print_cfs_rq(m, cpu, cfs_rq); 1442 print_cfs_rq(m, cpu, cfs_rq);
1443 rcu_read_unlock();
1137} 1444}
1138#endif 1445#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bf9c25c15b8b..2bcafa375633 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -5,6 +5,12 @@
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync)
10{
11 return task_cpu(p); /* IDLE tasks are never migrated */
12}
13#endif /* CONFIG_SMP */
8/* 14/*
9 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
10 */ 16 */
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
55} 61}
56#endif 62#endif
57 63
58static void task_tick_idle(struct rq *rq, struct task_struct *curr) 64static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
59{ 65{
60} 66}
61 67
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq)
63{ 69{
64} 70}
65 71
72static void switched_to_idle(struct rq *rq, struct task_struct *p,
73 int running)
74{
75 /* Can this actually happen?? */
76 if (running)
77 resched_task(rq->curr);
78 else
79 check_preempt_curr(rq, p);
80}
81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
83 int oldprio, int running)
84{
85 /* This can happen for hot plug CPUS */
86
87 /*
88 * Reschedule if we are currently running on this runqueue and
89 * our priority decreased, or if we are not currently running on
90 * this runqueue and our priority is higher than the current's
91 */
92 if (running) {
93 if (p->prio > oldprio)
94 resched_task(rq->curr);
95 } else
96 check_preempt_curr(rq, p);
97}
98
66/* 99/*
67 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
68 */ 101 */
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = {
72 105
73 /* dequeue is not valid, we print a debug message there: */ 106 /* dequeue is not valid, we print a debug message there: */
74 .dequeue_task = dequeue_task_idle, 107 .dequeue_task = dequeue_task_idle,
108#ifdef CONFIG_SMP
109 .select_task_rq = select_task_rq_idle,
110#endif /* CONFIG_SMP */
75 111
76 .check_preempt_curr = check_preempt_curr_idle, 112 .check_preempt_curr = check_preempt_curr_idle,
77 113
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = {
85 121
86 .set_curr_task = set_curr_task_idle, 122 .set_curr_task = set_curr_task_idle,
87 .task_tick = task_tick_idle, 123 .task_tick = task_tick_idle,
124
125 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle,
127
88 /* no .task_new for idle tasks */ 128 /* no .task_new for idle tasks */
89}; 129};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9ba3daa03475..274b40d7bef2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,217 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_SMP
7
8static inline int rt_overloaded(struct rq *rq)
9{
10 return atomic_read(&rq->rd->rto_count);
11}
12
13static inline void rt_set_overload(struct rq *rq)
14{
15 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /*
17 * Make sure the mask is visible before we set
18 * the overload count. That is checked to determine
19 * if we should look at the mask. It would be a shame
20 * if we looked at the mask, but the mask was not
21 * updated yet.
22 */
23 wmb();
24 atomic_inc(&rq->rd->rto_count);
25}
26
27static inline void rt_clear_overload(struct rq *rq)
28{
29 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask);
32}
33
34static void update_rt_migration(struct rq *rq)
35{
36 if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
37 if (!rq->rt.overloaded) {
38 rt_set_overload(rq);
39 rq->rt.overloaded = 1;
40 }
41 } else if (rq->rt.overloaded) {
42 rt_clear_overload(rq);
43 rq->rt.overloaded = 0;
44 }
45}
46#endif /* CONFIG_SMP */
47
48static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
49{
50 return container_of(rt_se, struct task_struct, rt);
51}
52
53static inline int on_rt_rq(struct sched_rt_entity *rt_se)
54{
55 return !list_empty(&rt_se->run_list);
56}
57
58#ifdef CONFIG_FAIR_GROUP_SCHED
59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
61{
62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC;
64
65 return rt_rq->tg->rt_ratio;
66}
67
68#define for_each_leaf_rt_rq(rt_rq, rq) \
69 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
70
71static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
72{
73 return rt_rq->rq;
74}
75
76static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
77{
78 return rt_se->rt_rq;
79}
80
81#define for_each_sched_rt_entity(rt_se) \
82 for (; rt_se; rt_se = rt_se->parent)
83
84static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
85{
86 return rt_se->my_q;
87}
88
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95
96 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
97 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
98
99 enqueue_rt_entity(rt_se);
100 if (rt_rq->highest_prio < curr->prio)
101 resched_task(curr);
102 }
103}
104
105static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
106{
107 struct sched_rt_entity *rt_se = rt_rq->rt_se;
108
109 if (rt_se && on_rt_rq(rt_se))
110 dequeue_rt_entity(rt_se);
111}
112
113#else
114
115static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
116{
117 return sysctl_sched_rt_ratio;
118}
119
120#define for_each_leaf_rt_rq(rt_rq, rq) \
121 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
122
123static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
124{
125 return container_of(rt_rq, struct rq, rt);
126}
127
128static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
129{
130 struct task_struct *p = rt_task_of(rt_se);
131 struct rq *rq = task_rq(p);
132
133 return &rq->rt;
134}
135
136#define for_each_sched_rt_entity(rt_se) \
137 for (; rt_se; rt_se = NULL)
138
139static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
140{
141 return NULL;
142}
143
144static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
145{
146}
147
148static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
149{
150}
151
152#endif
153
154static inline int rt_se_prio(struct sched_rt_entity *rt_se)
155{
156#ifdef CONFIG_FAIR_GROUP_SCHED
157 struct rt_rq *rt_rq = group_rt_rq(rt_se);
158
159 if (rt_rq)
160 return rt_rq->highest_prio;
161#endif
162
163 return rt_task_of(rt_se)->prio;
164}
165
166static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
167{
168 unsigned int rt_ratio = sched_rt_ratio(rt_rq);
169 u64 period, ratio;
170
171 if (rt_ratio == SCHED_RT_FRAC)
172 return 0;
173
174 if (rt_rq->rt_throttled)
175 return 1;
176
177 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
178 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
179
180 if (rt_rq->rt_time > ratio) {
181 struct rq *rq = rq_of_rt_rq(rt_rq);
182
183 rq->rt_throttled = 1;
184 rt_rq->rt_throttled = 1;
185
186 sched_rt_ratio_dequeue(rt_rq);
187 return 1;
188 }
189
190 return 0;
191}
192
193static void update_sched_rt_period(struct rq *rq)
194{
195 struct rt_rq *rt_rq;
196 u64 period;
197
198 while (rq->clock > rq->rt_period_expire) {
199 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
200 rq->rt_period_expire += period;
201
202 for_each_leaf_rt_rq(rt_rq, rq) {
203 unsigned long rt_ratio = sched_rt_ratio(rt_rq);
204 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
205
206 rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
207 if (rt_rq->rt_throttled) {
208 rt_rq->rt_throttled = 0;
209 sched_rt_ratio_enqueue(rt_rq);
210 }
211 }
212
213 rq->rt_throttled = 0;
214 }
215}
216
6/* 217/*
7 * Update the current task's runtime statistics. Skip current tasks that 218 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 219 * are not in our scheduling class.
@@ -10,6 +221,8 @@
10static void update_curr_rt(struct rq *rq) 221static void update_curr_rt(struct rq *rq)
11{ 222{
12 struct task_struct *curr = rq->curr; 223 struct task_struct *curr = rq->curr;
224 struct sched_rt_entity *rt_se = &curr->rt;
225 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
13 u64 delta_exec; 226 u64 delta_exec;
14 227
15 if (!task_has_rt_policy(curr)) 228 if (!task_has_rt_policy(curr))
@@ -24,47 +237,228 @@ static void update_curr_rt(struct rq *rq)
24 curr->se.sum_exec_runtime += delta_exec; 237 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = rq->clock; 238 curr->se.exec_start = rq->clock;
26 cpuacct_charge(curr, delta_exec); 239 cpuacct_charge(curr, delta_exec);
240
241 rt_rq->rt_time += delta_exec;
242 /*
243 * might make it a tad more accurate:
244 *
245 * update_sched_rt_period(rq);
246 */
247 if (sched_rt_ratio_exceeded(rt_rq))
248 resched_task(curr);
27} 249}
28 250
29static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 251static inline
252void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
253{
254 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
255 rt_rq->rt_nr_running++;
256#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
257 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
258 rt_rq->highest_prio = rt_se_prio(rt_se);
259#endif
260#ifdef CONFIG_SMP
261 if (rt_se->nr_cpus_allowed > 1) {
262 struct rq *rq = rq_of_rt_rq(rt_rq);
263 rq->rt.rt_nr_migratory++;
264 }
265
266 update_rt_migration(rq_of_rt_rq(rt_rq));
267#endif
268}
269
270static inline
271void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
272{
273 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
274 WARN_ON(!rt_rq->rt_nr_running);
275 rt_rq->rt_nr_running--;
276#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
277 if (rt_rq->rt_nr_running) {
278 struct rt_prio_array *array;
279
280 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
281 if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
282 /* recalculate */
283 array = &rt_rq->active;
284 rt_rq->highest_prio =
285 sched_find_first_bit(array->bitmap);
286 } /* otherwise leave rq->highest prio alone */
287 } else
288 rt_rq->highest_prio = MAX_RT_PRIO;
289#endif
290#ifdef CONFIG_SMP
291 if (rt_se->nr_cpus_allowed > 1) {
292 struct rq *rq = rq_of_rt_rq(rt_rq);
293 rq->rt.rt_nr_migratory--;
294 }
295
296 update_rt_migration(rq_of_rt_rq(rt_rq));
297#endif /* CONFIG_SMP */
298}
299
300static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
301{
302 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
303 struct rt_prio_array *array = &rt_rq->active;
304 struct rt_rq *group_rq = group_rt_rq(rt_se);
305
306 if (group_rq && group_rq->rt_throttled)
307 return;
308
309 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
310 __set_bit(rt_se_prio(rt_se), array->bitmap);
311
312 inc_rt_tasks(rt_se, rt_rq);
313}
314
315static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
30{ 316{
31 struct rt_prio_array *array = &rq->rt.active; 317 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
318 struct rt_prio_array *array = &rt_rq->active;
319
320 list_del_init(&rt_se->run_list);
321 if (list_empty(array->queue + rt_se_prio(rt_se)))
322 __clear_bit(rt_se_prio(rt_se), array->bitmap);
32 323
33 list_add_tail(&p->run_list, array->queue + p->prio); 324 dec_rt_tasks(rt_se, rt_rq);
34 __set_bit(p->prio, array->bitmap); 325}
326
327/*
328 * Because the prio of an upper entry depends on the lower
329 * entries, we must remove entries top - down.
330 *
331 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
332 * doesn't matter much for now, as h=2 for GROUP_SCHED.
333 */
334static void dequeue_rt_stack(struct task_struct *p)
335{
336 struct sched_rt_entity *rt_se, *top_se;
337
338 /*
339 * dequeue all, top - down.
340 */
341 do {
342 rt_se = &p->rt;
343 top_se = NULL;
344 for_each_sched_rt_entity(rt_se) {
345 if (on_rt_rq(rt_se))
346 top_se = rt_se;
347 }
348 if (top_se)
349 dequeue_rt_entity(top_se);
350 } while (top_se);
35} 351}
36 352
37/* 353/*
38 * Adding/removing a task to/from a priority array: 354 * Adding/removing a task to/from a priority array:
39 */ 355 */
356static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
357{
358 struct sched_rt_entity *rt_se = &p->rt;
359
360 if (wakeup)
361 rt_se->timeout = 0;
362
363 dequeue_rt_stack(p);
364
365 /*
366 * enqueue everybody, bottom - up.
367 */
368 for_each_sched_rt_entity(rt_se)
369 enqueue_rt_entity(rt_se);
370
371 inc_cpu_load(rq, p->se.load.weight);
372}
373
40static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 374static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
41{ 375{
42 struct rt_prio_array *array = &rq->rt.active; 376 struct sched_rt_entity *rt_se = &p->rt;
377 struct rt_rq *rt_rq;
43 378
44 update_curr_rt(rq); 379 update_curr_rt(rq);
45 380
46 list_del(&p->run_list); 381 dequeue_rt_stack(p);
47 if (list_empty(array->queue + p->prio)) 382
48 __clear_bit(p->prio, array->bitmap); 383 /*
384 * re-enqueue all non-empty rt_rq entities.
385 */
386 for_each_sched_rt_entity(rt_se) {
387 rt_rq = group_rt_rq(rt_se);
388 if (rt_rq && rt_rq->rt_nr_running)
389 enqueue_rt_entity(rt_se);
390 }
391
392 dec_cpu_load(rq, p->se.load.weight);
49} 393}
50 394
51/* 395/*
52 * Put task to the end of the run list without the overhead of dequeue 396 * Put task to the end of the run list without the overhead of dequeue
53 * followed by enqueue. 397 * followed by enqueue.
54 */ 398 */
399static
400void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
401{
402 struct rt_prio_array *array = &rt_rq->active;
403
404 list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
405}
406
55static void requeue_task_rt(struct rq *rq, struct task_struct *p) 407static void requeue_task_rt(struct rq *rq, struct task_struct *p)
56{ 408{
57 struct rt_prio_array *array = &rq->rt.active; 409 struct sched_rt_entity *rt_se = &p->rt;
410 struct rt_rq *rt_rq;
58 411
59 list_move_tail(&p->run_list, array->queue + p->prio); 412 for_each_sched_rt_entity(rt_se) {
413 rt_rq = rt_rq_of_se(rt_se);
414 requeue_rt_entity(rt_rq, rt_se);
415 }
60} 416}
61 417
62static void 418static void yield_task_rt(struct rq *rq)
63yield_task_rt(struct rq *rq)
64{ 419{
65 requeue_task_rt(rq, rq->curr); 420 requeue_task_rt(rq, rq->curr);
66} 421}
67 422
423#ifdef CONFIG_SMP
424static int find_lowest_rq(struct task_struct *task);
425
426static int select_task_rq_rt(struct task_struct *p, int sync)
427{
428 struct rq *rq = task_rq(p);
429
430 /*
431 * If the current task is an RT task, then
432 * try to see if we can wake this RT task up on another
433 * runqueue. Otherwise simply start this RT task
434 * on its current runqueue.
435 *
436 * We want to avoid overloading runqueues. Even if
437 * the RT task is of higher priority than the current RT task.
438 * RT tasks behave differently than other tasks. If
439 * one gets preempted, we try to push it off to another queue.
440 * So trying to keep a preempting RT task on the same
441 * cache hot CPU will force the running RT task to
442 * a cold CPU. So we waste all the cache for the lower
443 * RT task in hopes of saving some of a RT task
444 * that is just being woken and probably will have
445 * cold cache anyway.
446 */
447 if (unlikely(rt_task(rq->curr)) &&
448 (p->rt.nr_cpus_allowed > 1)) {
449 int cpu = find_lowest_rq(p);
450
451 return (cpu == -1) ? task_cpu(p) : cpu;
452 }
453
454 /*
455 * Otherwise, just let it ride on the affined RQ and the
456 * post-schedule router will push the preempted task away
457 */
458 return task_cpu(p);
459}
460#endif /* CONFIG_SMP */
461
68/* 462/*
69 * Preempt the current task with a newly woken task if needed: 463 * Preempt the current task with a newly woken task if needed:
70 */ 464 */
@@ -74,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
74 resched_task(rq->curr); 468 resched_task(rq->curr);
75} 469}
76 470
77static struct task_struct *pick_next_task_rt(struct rq *rq) 471static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
472 struct rt_rq *rt_rq)
78{ 473{
79 struct rt_prio_array *array = &rq->rt.active; 474 struct rt_prio_array *array = &rt_rq->active;
80 struct task_struct *next; 475 struct sched_rt_entity *next = NULL;
81 struct list_head *queue; 476 struct list_head *queue;
82 int idx; 477 int idx;
83 478
84 idx = sched_find_first_bit(array->bitmap); 479 idx = sched_find_first_bit(array->bitmap);
85 if (idx >= MAX_RT_PRIO) 480 BUG_ON(idx >= MAX_RT_PRIO);
86 return NULL;
87 481
88 queue = array->queue + idx; 482 queue = array->queue + idx;
89 next = list_entry(queue->next, struct task_struct, run_list); 483 next = list_entry(queue->next, struct sched_rt_entity, run_list);
90
91 next->se.exec_start = rq->clock;
92 484
93 return next; 485 return next;
94} 486}
95 487
488static struct task_struct *pick_next_task_rt(struct rq *rq)
489{
490 struct sched_rt_entity *rt_se;
491 struct task_struct *p;
492 struct rt_rq *rt_rq;
493
494 rt_rq = &rq->rt;
495
496 if (unlikely(!rt_rq->rt_nr_running))
497 return NULL;
498
499 if (sched_rt_ratio_exceeded(rt_rq))
500 return NULL;
501
502 do {
503 rt_se = pick_next_rt_entity(rq, rt_rq);
504 BUG_ON(!rt_se);
505 rt_rq = group_rt_rq(rt_se);
506 } while (rt_rq);
507
508 p = rt_task_of(rt_se);
509 p->se.exec_start = rq->clock;
510 return p;
511}
512
96static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 513static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
97{ 514{
98 update_curr_rt(rq); 515 update_curr_rt(rq);
@@ -100,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
100} 517}
101 518
102#ifdef CONFIG_SMP 519#ifdef CONFIG_SMP
103/* 520
104 * Load-balancing iterator. Note: while the runqueue stays locked 521/* Only try algorithms three times */
105 * during the whole iteration, the current task might be 522#define RT_MAX_TRIES 3
106 * dequeued so the iterator has to be dequeue-safe. Here we 523
107 * achieve that by always pre-iterating before returning 524static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
108 * the current task: 525static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
109 */ 526
110static struct task_struct *load_balance_start_rt(void *arg) 527static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
111{ 528{
112 struct rq *rq = arg; 529 if (!task_running(rq, p) &&
113 struct rt_prio_array *array = &rq->rt.active; 530 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
114 struct list_head *head, *curr; 531 (p->rt.nr_cpus_allowed > 1))
115 struct task_struct *p; 532 return 1;
533 return 0;
534}
535
536/* Return the second highest RT task, NULL otherwise */
537static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
538{
539 struct task_struct *next = NULL;
540 struct sched_rt_entity *rt_se;
541 struct rt_prio_array *array;
542 struct rt_rq *rt_rq;
116 int idx; 543 int idx;
117 544
118 idx = sched_find_first_bit(array->bitmap); 545 for_each_leaf_rt_rq(rt_rq, rq) {
119 if (idx >= MAX_RT_PRIO) 546 array = &rt_rq->active;
120 return NULL; 547 idx = sched_find_first_bit(array->bitmap);
548 next_idx:
549 if (idx >= MAX_RT_PRIO)
550 continue;
551 if (next && next->prio < idx)
552 continue;
553 list_for_each_entry(rt_se, array->queue + idx, run_list) {
554 struct task_struct *p = rt_task_of(rt_se);
555 if (pick_rt_task(rq, p, cpu)) {
556 next = p;
557 break;
558 }
559 }
560 if (!next) {
561 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
562 goto next_idx;
563 }
564 }
121 565
122 head = array->queue + idx; 566 return next;
123 curr = head->prev; 567}
124 568
125 p = list_entry(curr, struct task_struct, run_list); 569static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
126 570
127 curr = curr->prev; 571static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
572{
573 int lowest_prio = -1;
574 int lowest_cpu = -1;
575 int count = 0;
576 int cpu;
128 577
129 rq->rt.rt_load_balance_idx = idx; 578 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr;
132 579
133 return p; 580 /*
581 * Scan each rq for the lowest prio.
582 */
583 for_each_cpu_mask(cpu, *lowest_mask) {
584 struct rq *rq = cpu_rq(cpu);
585
586 /* We look for lowest RT prio or non-rt CPU */
587 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
588 /*
589 * if we already found a low RT queue
590 * and now we found this non-rt queue
591 * clear the mask and set our bit.
592 * Otherwise just return the queue as is
593 * and the count==1 will cause the algorithm
594 * to use the first bit found.
595 */
596 if (lowest_cpu != -1) {
597 cpus_clear(*lowest_mask);
598 cpu_set(rq->cpu, *lowest_mask);
599 }
600 return 1;
601 }
602
603 /* no locking for now */
604 if ((rq->rt.highest_prio > task->prio)
605 && (rq->rt.highest_prio >= lowest_prio)) {
606 if (rq->rt.highest_prio > lowest_prio) {
607 /* new low - clear old data */
608 lowest_prio = rq->rt.highest_prio;
609 lowest_cpu = cpu;
610 count = 0;
611 }
612 count++;
613 } else
614 cpu_clear(cpu, *lowest_mask);
615 }
616
617 /*
618 * Clear out all the set bits that represent
619 * runqueues that were of higher prio than
620 * the lowest_prio.
621 */
622 if (lowest_cpu > 0) {
623 /*
624 * Perhaps we could add another cpumask op to
625 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
626 * Then that could be optimized to use memset and such.
627 */
628 for_each_cpu_mask(cpu, *lowest_mask) {
629 if (cpu >= lowest_cpu)
630 break;
631 cpu_clear(cpu, *lowest_mask);
632 }
633 }
634
635 return count;
134} 636}
135 637
136static struct task_struct *load_balance_next_rt(void *arg) 638static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
137{ 639{
138 struct rq *rq = arg; 640 int first;
139 struct rt_prio_array *array = &rq->rt.active; 641
140 struct list_head *head, *curr; 642 /* "this_cpu" is cheaper to preempt than a remote processor */
141 struct task_struct *p; 643 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
142 int idx; 644 return this_cpu;
645
646 first = first_cpu(*mask);
647 if (first != NR_CPUS)
648 return first;
649
650 return -1;
651}
652
653static int find_lowest_rq(struct task_struct *task)
654{
655 struct sched_domain *sd;
656 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
657 int this_cpu = smp_processor_id();
658 int cpu = task_cpu(task);
659 int count = find_lowest_cpus(task, lowest_mask);
143 660
144 idx = rq->rt.rt_load_balance_idx; 661 if (!count)
145 head = rq->rt.rt_load_balance_head; 662 return -1; /* No targets found */
146 curr = rq->rt.rt_load_balance_curr;
147 663
148 /* 664 /*
149 * If we arrived back to the head again then 665 * There is no sense in performing an optimal search if only one
150 * iterate to the next queue (if any): 666 * target is found.
151 */ 667 */
152 if (unlikely(head == curr)) { 668 if (count == 1)
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); 669 return first_cpu(*lowest_mask);
154 670
155 if (next_idx >= MAX_RT_PRIO) 671 /*
156 return NULL; 672 * At this point we have built a mask of cpus representing the
673 * lowest priority tasks in the system. Now we want to elect
674 * the best one based on our affinity and topology.
675 *
676 * We prioritize the last cpu that the task executed on since
677 * it is most likely cache-hot in that location.
678 */
679 if (cpu_isset(cpu, *lowest_mask))
680 return cpu;
681
682 /*
683 * Otherwise, we consult the sched_domains span maps to figure
684 * out which cpu is logically closest to our hot cache data.
685 */
686 if (this_cpu == cpu)
687 this_cpu = -1; /* Skip this_cpu opt if the same */
688
689 for_each_domain(cpu, sd) {
690 if (sd->flags & SD_WAKE_AFFINE) {
691 cpumask_t domain_mask;
692 int best_cpu;
157 693
158 idx = next_idx; 694 cpus_and(domain_mask, sd->span, *lowest_mask);
159 head = array->queue + idx;
160 curr = head->prev;
161 695
162 rq->rt.rt_load_balance_idx = idx; 696 best_cpu = pick_optimal_cpu(this_cpu,
163 rq->rt.rt_load_balance_head = head; 697 &domain_mask);
698 if (best_cpu != -1)
699 return best_cpu;
700 }
164 } 701 }
165 702
166 p = list_entry(curr, struct task_struct, run_list); 703 /*
704 * And finally, if there were no matches within the domains
705 * just give the caller *something* to work with from the compatible
706 * locations.
707 */
708 return pick_optimal_cpu(this_cpu, lowest_mask);
709}
167 710
168 curr = curr->prev; 711/* Will lock the rq it finds */
712static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
713{
714 struct rq *lowest_rq = NULL;
715 int tries;
716 int cpu;
169 717
170 rq->rt.rt_load_balance_curr = curr; 718 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
719 cpu = find_lowest_rq(task);
171 720
172 return p; 721 if ((cpu == -1) || (cpu == rq->cpu))
722 break;
723
724 lowest_rq = cpu_rq(cpu);
725
726 /* if the prio of this runqueue changed, try again */
727 if (double_lock_balance(rq, lowest_rq)) {
728 /*
729 * We had to unlock the run queue. In
730 * the mean time, task could have
731 * migrated already or had its affinity changed.
732 * Also make sure that it wasn't scheduled on its rq.
733 */
734 if (unlikely(task_rq(task) != rq ||
735 !cpu_isset(lowest_rq->cpu,
736 task->cpus_allowed) ||
737 task_running(rq, task) ||
738 !task->se.on_rq)) {
739
740 spin_unlock(&lowest_rq->lock);
741 lowest_rq = NULL;
742 break;
743 }
744 }
745
746 /* If this rq is still suitable use it. */
747 if (lowest_rq->rt.highest_prio > task->prio)
748 break;
749
750 /* try again */
751 spin_unlock(&lowest_rq->lock);
752 lowest_rq = NULL;
753 }
754
755 return lowest_rq;
756}
757
758/*
759 * If the current CPU has more than one RT task, see if the non
760 * running task can migrate over to a CPU that is running a task
761 * of lesser priority.
762 */
763static int push_rt_task(struct rq *rq)
764{
765 struct task_struct *next_task;
766 struct rq *lowest_rq;
767 int ret = 0;
768 int paranoid = RT_MAX_TRIES;
769
770 if (!rq->rt.overloaded)
771 return 0;
772
773 next_task = pick_next_highest_task_rt(rq, -1);
774 if (!next_task)
775 return 0;
776
777 retry:
778 if (unlikely(next_task == rq->curr)) {
779 WARN_ON(1);
780 return 0;
781 }
782
783 /*
784 * It's possible that the next_task slipped in of
785 * higher priority than current. If that's the case
786 * just reschedule current.
787 */
788 if (unlikely(next_task->prio < rq->curr->prio)) {
789 resched_task(rq->curr);
790 return 0;
791 }
792
793 /* We might release rq lock */
794 get_task_struct(next_task);
795
796 /* find_lock_lowest_rq locks the rq if found */
797 lowest_rq = find_lock_lowest_rq(next_task, rq);
798 if (!lowest_rq) {
799 struct task_struct *task;
800 /*
801 * find_lock_lowest_rq releases rq->lock
802 * so it is possible that next_task has changed.
803 * If it has, then try again.
804 */
805 task = pick_next_highest_task_rt(rq, -1);
806 if (unlikely(task != next_task) && task && paranoid--) {
807 put_task_struct(next_task);
808 next_task = task;
809 goto retry;
810 }
811 goto out;
812 }
813
814 deactivate_task(rq, next_task, 0);
815 set_task_cpu(next_task, lowest_rq->cpu);
816 activate_task(lowest_rq, next_task, 0);
817
818 resched_task(lowest_rq->curr);
819
820 spin_unlock(&lowest_rq->lock);
821
822 ret = 1;
823out:
824 put_task_struct(next_task);
825
826 return ret;
827}
828
829/*
830 * TODO: Currently we just use the second highest prio task on
831 * the queue, and stop when it can't migrate (or there's
832 * no more RT tasks). There may be a case where a lower
833 * priority RT task has a different affinity than the
834 * higher RT task. In this case the lower RT task could
835 * possibly be able to migrate whereas the higher priority
836 * RT task could not. We currently ignore this issue.
837 * Enhancements are welcome!
838 */
839static void push_rt_tasks(struct rq *rq)
840{
841 /* push_rt_task will return true if it moved an RT */
842 while (push_rt_task(rq))
843 ;
844}
845
846static int pull_rt_task(struct rq *this_rq)
847{
848 int this_cpu = this_rq->cpu, ret = 0, cpu;
849 struct task_struct *p, *next;
850 struct rq *src_rq;
851
852 if (likely(!rt_overloaded(this_rq)))
853 return 0;
854
855 next = pick_next_task_rt(this_rq);
856
857 for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
858 if (this_cpu == cpu)
859 continue;
860
861 src_rq = cpu_rq(cpu);
862 /*
863 * We can potentially drop this_rq's lock in
864 * double_lock_balance, and another CPU could
865 * steal our next task - hence we must cause
866 * the caller to recalculate the next task
867 * in that case:
868 */
869 if (double_lock_balance(this_rq, src_rq)) {
870 struct task_struct *old_next = next;
871
872 next = pick_next_task_rt(this_rq);
873 if (next != old_next)
874 ret = 1;
875 }
876
877 /*
878 * Are there still pullable RT tasks?
879 */
880 if (src_rq->rt.rt_nr_running <= 1)
881 goto skip;
882
883 p = pick_next_highest_task_rt(src_rq, this_cpu);
884
885 /*
886 * Do we have an RT task that preempts
887 * the to-be-scheduled task?
888 */
889 if (p && (!next || (p->prio < next->prio))) {
890 WARN_ON(p == src_rq->curr);
891 WARN_ON(!p->se.on_rq);
892
893 /*
894 * There's a chance that p is higher in priority
895 * than what's currently running on its cpu.
896 * This is just that p is waking up and hasn't
897 * had a chance to schedule. We only pull
898 * p if it is lower in priority than the
899 * current task on the run queue or
900 * this_rq next task is lower in prio than
901 * the current task on that rq.
902 */
903 if (p->prio < src_rq->curr->prio ||
904 (next && next->prio < src_rq->curr->prio))
905 goto skip;
906
907 ret = 1;
908
909 deactivate_task(src_rq, p, 0);
910 set_task_cpu(p, this_cpu);
911 activate_task(this_rq, p, 0);
912 /*
913 * We continue with the search, just in
914 * case there's an even higher prio task
915 * in another runqueue. (low likelihood
916 * but possible)
917 *
918 * Update next so that we won't pick a task
919 * on another cpu with a priority lower (or equal)
920 * than the one we just picked.
921 */
922 next = p;
923
924 }
925 skip:
926 spin_unlock(&src_rq->lock);
927 }
928
929 return ret;
930}
931
932static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
933{
934 /* Try to pull RT tasks here if we lower this rq's prio */
935 if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
936 pull_rt_task(rq);
937}
938
939static void post_schedule_rt(struct rq *rq)
940{
941 /*
942 * If we have more than one rt_task queued, then
943 * see if we can push the other rt_tasks off to other CPUS.
944 * Note we may release the rq lock, and since
945 * the lock was owned by prev, we need to release it
946 * first via finish_lock_switch and then reacquire it here.
947 */
948 if (unlikely(rq->rt.overloaded)) {
949 spin_lock_irq(&rq->lock);
950 push_rt_tasks(rq);
951 spin_unlock_irq(&rq->lock);
952 }
953}
954
955
956static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
957{
958 if (!task_running(rq, p) &&
959 (p->prio >= rq->rt.highest_prio) &&
960 rq->rt.overloaded)
961 push_rt_tasks(rq);
173} 962}
174 963
175static unsigned long 964static unsigned long
@@ -178,38 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
178 struct sched_domain *sd, enum cpu_idle_type idle, 967 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, int *this_best_prio) 968 int *all_pinned, int *this_best_prio)
180{ 969{
181 struct rq_iterator rt_rq_iterator; 970 /* don't touch RT tasks */
182 971 return 0;
183 rt_rq_iterator.start = load_balance_start_rt;
184 rt_rq_iterator.next = load_balance_next_rt;
185 /* pass 'busiest' rq argument into
186 * load_balance_[start|next]_rt iterators
187 */
188 rt_rq_iterator.arg = busiest;
189
190 return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
191 idle, all_pinned, this_best_prio, &rt_rq_iterator);
192} 972}
193 973
194static int 974static int
195move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 975move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
196 struct sched_domain *sd, enum cpu_idle_type idle) 976 struct sched_domain *sd, enum cpu_idle_type idle)
197{ 977{
198 struct rq_iterator rt_rq_iterator; 978 /* don't touch RT tasks */
979 return 0;
980}
981
982static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
983{
984 int weight = cpus_weight(*new_mask);
985
986 BUG_ON(!rt_task(p));
199 987
200 rt_rq_iterator.start = load_balance_start_rt; 988 /*
201 rt_rq_iterator.next = load_balance_next_rt; 989 * Update the migration status of the RQ if we have an RT task
202 rt_rq_iterator.arg = busiest; 990 * which is running AND changing its weight value.
991 */
992 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
993 struct rq *rq = task_rq(p);
994
995 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
996 rq->rt.rt_nr_migratory++;
997 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
998 BUG_ON(!rq->rt.rt_nr_migratory);
999 rq->rt.rt_nr_migratory--;
1000 }
1001
1002 update_rt_migration(rq);
1003 }
203 1004
204 return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 1005 p->cpus_allowed = *new_mask;
205 &rt_rq_iterator); 1006 p->rt.nr_cpus_allowed = weight;
206} 1007}
207#endif
208 1008
209static void task_tick_rt(struct rq *rq, struct task_struct *p) 1009/* Assumes rq->lock is held */
1010static void join_domain_rt(struct rq *rq)
1011{
1012 if (rq->rt.overloaded)
1013 rt_set_overload(rq);
1014}
1015
1016/* Assumes rq->lock is held */
1017static void leave_domain_rt(struct rq *rq)
1018{
1019 if (rq->rt.overloaded)
1020 rt_clear_overload(rq);
1021}
1022
1023/*
1024 * When switching from the rt queue, we bring ourselves to a position
1025 * that we might want to pull RT tasks from other runqueues.
1026 */
1027static void switched_from_rt(struct rq *rq, struct task_struct *p,
1028 int running)
1029{
1030 /*
1031 * If there are other RT tasks then we will reschedule
1032 * and the scheduling of the other RT tasks will handle
1033 * the balancing. But if we are the last RT task
1034 * we may need to handle the pulling of RT tasks
1035 * now.
1036 */
1037 if (!rq->rt.rt_nr_running)
1038 pull_rt_task(rq);
1039}
1040#endif /* CONFIG_SMP */
1041
1042/*
1043 * When switching a task to RT, we may overload the runqueue
1044 * with RT tasks. In this case we try to push them off to
1045 * other runqueues.
1046 */
1047static void switched_to_rt(struct rq *rq, struct task_struct *p,
1048 int running)
1049{
1050 int check_resched = 1;
1051
1052 /*
1053 * If we are already running, then there's nothing
1054 * that needs to be done. But if we are not running
1055 * we may need to preempt the current running task.
1056 * If that current running task is also an RT task
1057 * then see if we can move to another run queue.
1058 */
1059 if (!running) {
1060#ifdef CONFIG_SMP
1061 if (rq->rt.overloaded && push_rt_task(rq) &&
1062 /* Don't resched if we changed runqueues */
1063 rq != task_rq(p))
1064 check_resched = 0;
1065#endif /* CONFIG_SMP */
1066 if (check_resched && p->prio < rq->curr->prio)
1067 resched_task(rq->curr);
1068 }
1069}
1070
1071/*
1072 * Priority of the task has changed. This may cause
1073 * us to initiate a push or pull.
1074 */
1075static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1076 int oldprio, int running)
1077{
1078 if (running) {
1079#ifdef CONFIG_SMP
1080 /*
1081 * If our priority decreases while running, we
1082 * may need to pull tasks to this runqueue.
1083 */
1084 if (oldprio < p->prio)
1085 pull_rt_task(rq);
1086 /*
1087 * If there's a higher priority task waiting to run
1088 * then reschedule.
1089 */
1090 if (p->prio > rq->rt.highest_prio)
1091 resched_task(p);
1092#else
1093 /* For UP simply resched on drop of prio */
1094 if (oldprio < p->prio)
1095 resched_task(p);
1096#endif /* CONFIG_SMP */
1097 } else {
1098 /*
1099 * This task is not running, but if it is
1100 * greater than the current running task
1101 * then reschedule.
1102 */
1103 if (p->prio < rq->curr->prio)
1104 resched_task(rq->curr);
1105 }
1106}
1107
1108static void watchdog(struct rq *rq, struct task_struct *p)
1109{
1110 unsigned long soft, hard;
1111
1112 if (!p->signal)
1113 return;
1114
1115 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1116 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1117
1118 if (soft != RLIM_INFINITY) {
1119 unsigned long next;
1120
1121 p->rt.timeout++;
1122 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1123 if (p->rt.timeout > next)
1124 p->it_sched_expires = p->se.sum_exec_runtime;
1125 }
1126}
1127
1128static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
210{ 1129{
211 update_curr_rt(rq); 1130 update_curr_rt(rq);
212 1131
1132 watchdog(rq, p);
1133
213 /* 1134 /*
214 * RR tasks need a special form of timeslice management. 1135 * RR tasks need a special form of timeslice management.
215 * FIFO tasks have no timeslices. 1136 * FIFO tasks have no timeslices.
@@ -217,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
217 if (p->policy != SCHED_RR) 1138 if (p->policy != SCHED_RR)
218 return; 1139 return;
219 1140
220 if (--p->time_slice) 1141 if (--p->rt.time_slice)
221 return; 1142 return;
222 1143
223 p->time_slice = DEF_TIMESLICE; 1144 p->rt.time_slice = DEF_TIMESLICE;
224 1145
225 /* 1146 /*
226 * Requeue to the end of queue if we are not the only element 1147 * Requeue to the end of queue if we are not the only element
227 * on the queue: 1148 * on the queue:
228 */ 1149 */
229 if (p->run_list.prev != p->run_list.next) { 1150 if (p->rt.run_list.prev != p->rt.run_list.next) {
230 requeue_task_rt(rq, p); 1151 requeue_task_rt(rq, p);
231 set_tsk_need_resched(p); 1152 set_tsk_need_resched(p);
232 } 1153 }
@@ -244,6 +1165,9 @@ const struct sched_class rt_sched_class = {
244 .enqueue_task = enqueue_task_rt, 1165 .enqueue_task = enqueue_task_rt,
245 .dequeue_task = dequeue_task_rt, 1166 .dequeue_task = dequeue_task_rt,
246 .yield_task = yield_task_rt, 1167 .yield_task = yield_task_rt,
1168#ifdef CONFIG_SMP
1169 .select_task_rq = select_task_rq_rt,
1170#endif /* CONFIG_SMP */
247 1171
248 .check_preempt_curr = check_preempt_curr_rt, 1172 .check_preempt_curr = check_preempt_curr_rt,
249 1173
@@ -253,8 +1177,18 @@ const struct sched_class rt_sched_class = {
253#ifdef CONFIG_SMP 1177#ifdef CONFIG_SMP
254 .load_balance = load_balance_rt, 1178 .load_balance = load_balance_rt,
255 .move_one_task = move_one_task_rt, 1179 .move_one_task = move_one_task_rt,
1180 .set_cpus_allowed = set_cpus_allowed_rt,
1181 .join_domain = join_domain_rt,
1182 .leave_domain = leave_domain_rt,
1183 .pre_schedule = pre_schedule_rt,
1184 .post_schedule = post_schedule_rt,
1185 .task_wake_up = task_wake_up_rt,
1186 .switched_from = switched_from_rt,
256#endif 1187#endif
257 1188
258 .set_curr_task = set_curr_task_rt, 1189 .set_curr_task = set_curr_task_rt,
259 .task_tick = task_tick_rt, 1190 .task_tick = task_tick_rt,
1191
1192 .prio_changed = prio_changed_rt,
1193 .switched_to = switched_to_rt,
260}; 1194};
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 11df812263c8..c1d76552446e 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/cpu.h> 10#include <linux/cpu.h>
11#include <linux/nmi.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/freezer.h> 14#include <linux/freezer.h>
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp);
23static DEFINE_PER_CPU(unsigned long, print_timestamp); 24static DEFINE_PER_CPU(unsigned long, print_timestamp);
24static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 25static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
25 26
26static int did_panic; 27static int __read_mostly did_panic;
27int softlockup_thresh = 10; 28unsigned long __read_mostly softlockup_thresh = 60;
28 29
29static int 30static int
30softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) 31softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
45 */ 46 */
46static unsigned long get_timestamp(int this_cpu) 47static unsigned long get_timestamp(int this_cpu)
47{ 48{
48 return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ 49 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
49} 50}
50 51
51void touch_softlockup_watchdog(void) 52void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)
100 101
101 now = get_timestamp(this_cpu); 102 now = get_timestamp(this_cpu);
102 103
103 /* Wake up the high-prio watchdog task every second: */ 104 /* Warn about unreasonable delays: */
104 if (now > (touch_timestamp + 1))
105 wake_up_process(per_cpu(watchdog_task, this_cpu));
106
107 /* Warn about unreasonable 10+ seconds delays: */
108 if (now <= (touch_timestamp + softlockup_thresh)) 105 if (now <= (touch_timestamp + softlockup_thresh))
109 return; 106 return;
110 107
@@ -122,11 +119,93 @@ void softlockup_tick(void)
122} 119}
123 120
124/* 121/*
122 * Have a reasonable limit on the number of tasks checked:
123 */
124unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
125
126/*
127 * Zero means infinite timeout - no checking done:
128 */
129unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
130
131unsigned long __read_mostly sysctl_hung_task_warnings = 10;
132
133/*
134 * Only do the hung-tasks check on one CPU:
135 */
136static int check_cpu __read_mostly = -1;
137
138static void check_hung_task(struct task_struct *t, unsigned long now)
139{
140 unsigned long switch_count = t->nvcsw + t->nivcsw;
141
142 if (t->flags & PF_FROZEN)
143 return;
144
145 if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
146 t->last_switch_count = switch_count;
147 t->last_switch_timestamp = now;
148 return;
149 }
150 if ((long)(now - t->last_switch_timestamp) <
151 sysctl_hung_task_timeout_secs)
152 return;
153 if (sysctl_hung_task_warnings < 0)
154 return;
155 sysctl_hung_task_warnings--;
156
157 /*
158 * Ok, the task did not get scheduled for more than 2 minutes,
159 * complain:
160 */
161 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
162 "%ld seconds.\n", t->comm, t->pid,
163 sysctl_hung_task_timeout_secs);
164 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
165 " disables this message.\n");
166 sched_show_task(t);
167 __debug_show_held_locks(t);
168
169 t->last_switch_timestamp = now;
170 touch_nmi_watchdog();
171}
172
173/*
174 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
175 * a really long time (120 seconds). If that happens, print out
176 * a warning.
177 */
178static void check_hung_uninterruptible_tasks(int this_cpu)
179{
180 int max_count = sysctl_hung_task_check_count;
181 unsigned long now = get_timestamp(this_cpu);
182 struct task_struct *g, *t;
183
184 /*
185 * If the system crashed already then all bets are off,
186 * do not report extra hung tasks:
187 */
188 if ((tainted & TAINT_DIE) || did_panic)
189 return;
190
191 read_lock(&tasklist_lock);
192 do_each_thread(g, t) {
193 if (!--max_count)
194 break;
195 if (t->state & TASK_UNINTERRUPTIBLE)
196 check_hung_task(t, now);
197 } while_each_thread(g, t);
198
199 read_unlock(&tasklist_lock);
200}
201
202/*
125 * The watchdog thread - runs every second and touches the timestamp. 203 * The watchdog thread - runs every second and touches the timestamp.
126 */ 204 */
127static int watchdog(void *__bind_cpu) 205static int watchdog(void *__bind_cpu)
128{ 206{
129 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 207 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
208 int this_cpu = (long)__bind_cpu;
130 209
131 sched_setscheduler(current, SCHED_FIFO, &param); 210 sched_setscheduler(current, SCHED_FIFO, &param);
132 211
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu)
135 214
136 /* 215 /*
137 * Run briefly once per second to reset the softlockup timestamp. 216 * Run briefly once per second to reset the softlockup timestamp.
138 * If this gets delayed for more than 10 seconds then the 217 * If this gets delayed for more than 60 seconds then the
139 * debug-printout triggers in softlockup_tick(). 218 * debug-printout triggers in softlockup_tick().
140 */ 219 */
141 while (!kthread_should_stop()) { 220 while (!kthread_should_stop()) {
142 set_current_state(TASK_INTERRUPTIBLE);
143 touch_softlockup_watchdog(); 221 touch_softlockup_watchdog();
144 schedule(); 222 msleep_interruptible(10000);
223
224 if (this_cpu != check_cpu)
225 continue;
226
227 if (sysctl_hung_task_timeout_secs)
228 check_hung_uninterruptible_tasks(this_cpu);
145 } 229 }
146 230
147 return 0; 231 return 0;
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
171 break; 255 break;
172 case CPU_ONLINE: 256 case CPU_ONLINE:
173 case CPU_ONLINE_FROZEN: 257 case CPU_ONLINE_FROZEN:
258 check_cpu = any_online_cpu(cpu_online_map);
174 wake_up_process(per_cpu(watchdog_task, hotcpu)); 259 wake_up_process(per_cpu(watchdog_task, hotcpu));
175 break; 260 break;
176#ifdef CONFIG_HOTPLUG_CPU 261#ifdef CONFIG_HOTPLUG_CPU
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
181 /* Unbind so it can run. Fall thru. */ 266 /* Unbind so it can run. Fall thru. */
182 kthread_bind(per_cpu(watchdog_task, hotcpu), 267 kthread_bind(per_cpu(watchdog_task, hotcpu),
183 any_online_cpu(cpu_online_map)); 268 any_online_cpu(cpu_online_map));
269 case CPU_DOWN_PREPARE:
270 case CPU_DOWN_PREPARE_FROZEN:
271 if (hotcpu == check_cpu) {
272 cpumask_t temp_cpu_online_map = cpu_online_map;
273
274 cpu_clear(hotcpu, temp_cpu_online_map);
275 check_cpu = any_online_cpu(temp_cpu_online_map);
276 }
277 break;
184 case CPU_DEAD: 278 case CPU_DEAD:
185 case CPU_DEAD_FROZEN: 279 case CPU_DEAD_FROZEN:
186 p = per_cpu(watchdog_task, hotcpu); 280 p = per_cpu(watchdog_task, hotcpu);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 319821ef78af..51b5ee53571a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
203 int ret; 203 int ret;
204 204
205 /* No CPUs can come up or down during this. */ 205 /* No CPUs can come up or down during this. */
206 lock_cpu_hotplug(); 206 get_online_cpus();
207 p = __stop_machine_run(fn, data, cpu); 207 p = __stop_machine_run(fn, data, cpu);
208 if (!IS_ERR(p)) 208 if (!IS_ERR(p))
209 ret = kthread_stop(p); 209 ret = kthread_stop(p);
210 else 210 else
211 ret = PTR_ERR(p); 211 ret = PTR_ERR(p);
212 unlock_cpu_hotplug(); 212 put_online_cpus();
213 213
214 return ret; 214 return ret;
215} 215}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c68f68dcc605..8e96558cb8f3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -81,6 +81,7 @@ extern int compat_log;
81extern int maps_protect; 81extern int maps_protect;
82extern int sysctl_stat_interval; 82extern int sysctl_stat_interval;
83extern int audit_argv_kb; 83extern int audit_argv_kb;
84extern int latencytop_enabled;
84 85
85/* Constants used for minimum and maximum */ 86/* Constants used for minimum and maximum */
86#ifdef CONFIG_DETECT_SOFTLOCKUP 87#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -306,9 +307,43 @@ static struct ctl_table kern_table[] = {
306 .procname = "sched_nr_migrate", 307 .procname = "sched_nr_migrate",
307 .data = &sysctl_sched_nr_migrate, 308 .data = &sysctl_sched_nr_migrate,
308 .maxlen = sizeof(unsigned int), 309 .maxlen = sizeof(unsigned int),
309 .mode = 644, 310 .mode = 0644,
311 .proc_handler = &proc_dointvec,
312 },
313 {
314 .ctl_name = CTL_UNNUMBERED,
315 .procname = "sched_rt_period_ms",
316 .data = &sysctl_sched_rt_period,
317 .maxlen = sizeof(unsigned int),
318 .mode = 0644,
310 .proc_handler = &proc_dointvec, 319 .proc_handler = &proc_dointvec,
311 }, 320 },
321 {
322 .ctl_name = CTL_UNNUMBERED,
323 .procname = "sched_rt_ratio",
324 .data = &sysctl_sched_rt_ratio,
325 .maxlen = sizeof(unsigned int),
326 .mode = 0644,
327 .proc_handler = &proc_dointvec,
328 },
329#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
330 {
331 .ctl_name = CTL_UNNUMBERED,
332 .procname = "sched_min_bal_int_shares",
333 .data = &sysctl_sched_min_bal_int_shares,
334 .maxlen = sizeof(unsigned int),
335 .mode = 0644,
336 .proc_handler = &proc_dointvec,
337 },
338 {
339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_max_bal_int_shares",
341 .data = &sysctl_sched_max_bal_int_shares,
342 .maxlen = sizeof(unsigned int),
343 .mode = 0644,
344 .proc_handler = &proc_dointvec,
345 },
346#endif
312#endif 347#endif
313 { 348 {
314 .ctl_name = CTL_UNNUMBERED, 349 .ctl_name = CTL_UNNUMBERED,
@@ -382,6 +417,15 @@ static struct ctl_table kern_table[] = {
382 .proc_handler = &proc_dointvec_taint, 417 .proc_handler = &proc_dointvec_taint,
383 }, 418 },
384#endif 419#endif
420#ifdef CONFIG_LATENCYTOP
421 {
422 .procname = "latencytop",
423 .data = &latencytop_enabled,
424 .maxlen = sizeof(int),
425 .mode = 0644,
426 .proc_handler = &proc_dointvec,
427 },
428#endif
385#ifdef CONFIG_SECURITY_CAPABILITIES 429#ifdef CONFIG_SECURITY_CAPABILITIES
386 { 430 {
387 .procname = "cap-bound", 431 .procname = "cap-bound",
@@ -728,13 +772,40 @@ static struct ctl_table kern_table[] = {
728 .ctl_name = CTL_UNNUMBERED, 772 .ctl_name = CTL_UNNUMBERED,
729 .procname = "softlockup_thresh", 773 .procname = "softlockup_thresh",
730 .data = &softlockup_thresh, 774 .data = &softlockup_thresh,
731 .maxlen = sizeof(int), 775 .maxlen = sizeof(unsigned long),
732 .mode = 0644, 776 .mode = 0644,
733 .proc_handler = &proc_dointvec_minmax, 777 .proc_handler = &proc_doulongvec_minmax,
734 .strategy = &sysctl_intvec, 778 .strategy = &sysctl_intvec,
735 .extra1 = &one, 779 .extra1 = &one,
736 .extra2 = &sixty, 780 .extra2 = &sixty,
737 }, 781 },
782 {
783 .ctl_name = CTL_UNNUMBERED,
784 .procname = "hung_task_check_count",
785 .data = &sysctl_hung_task_check_count,
786 .maxlen = sizeof(unsigned long),
787 .mode = 0644,
788 .proc_handler = &proc_doulongvec_minmax,
789 .strategy = &sysctl_intvec,
790 },
791 {
792 .ctl_name = CTL_UNNUMBERED,
793 .procname = "hung_task_timeout_secs",
794 .data = &sysctl_hung_task_timeout_secs,
795 .maxlen = sizeof(unsigned long),
796 .mode = 0644,
797 .proc_handler = &proc_doulongvec_minmax,
798 .strategy = &sysctl_intvec,
799 },
800 {
801 .ctl_name = CTL_UNNUMBERED,
802 .procname = "hung_task_warnings",
803 .data = &sysctl_hung_task_warnings,
804 .maxlen = sizeof(unsigned long),
805 .mode = 0644,
806 .proc_handler = &proc_doulongvec_minmax,
807 .strategy = &sysctl_intvec,
808 },
738#endif 809#endif
739#ifdef CONFIG_COMPAT 810#ifdef CONFIG_COMPAT
740 { 811 {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb89fa8db110..1a21b6fdb674 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
153void tick_nohz_stop_sched_tick(void) 153void tick_nohz_stop_sched_tick(void)
154{ 154{
155 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 155 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
156 unsigned long rt_jiffies;
156 struct tick_sched *ts; 157 struct tick_sched *ts;
157 ktime_t last_update, expires, now, delta; 158 ktime_t last_update, expires, now, delta;
158 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 159 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
216 next_jiffies = get_next_timer_interrupt(last_jiffies); 217 next_jiffies = get_next_timer_interrupt(last_jiffies);
217 delta_jiffies = next_jiffies - last_jiffies; 218 delta_jiffies = next_jiffies - last_jiffies;
218 219
220 rt_jiffies = rt_needs_cpu(cpu);
221 if (rt_jiffies && rt_jiffies < delta_jiffies)
222 delta_jiffies = rt_jiffies;
223
219 if (rcu_needs_cpu(cpu)) 224 if (rcu_needs_cpu(cpu))
220 delta_jiffies = 1; 225 delta_jiffies = 1;
221 /* 226 /*
@@ -509,7 +514,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
509{ 514{
510 struct tick_sched *ts = 515 struct tick_sched *ts =
511 container_of(timer, struct tick_sched, sched_timer); 516 container_of(timer, struct tick_sched, sched_timer);
512 struct hrtimer_cpu_base *base = timer->base->cpu_base;
513 struct pt_regs *regs = get_irq_regs(); 517 struct pt_regs *regs = get_irq_regs();
514 ktime_t now = ktime_get(); 518 ktime_t now = ktime_get();
515 int cpu = smp_processor_id(); 519 int cpu = smp_processor_id();
@@ -547,15 +551,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
547 touch_softlockup_watchdog(); 551 touch_softlockup_watchdog();
548 ts->idle_jiffies++; 552 ts->idle_jiffies++;
549 } 553 }
550 /*
551 * update_process_times() might take tasklist_lock, hence
552 * drop the base lock. sched-tick hrtimers are per-CPU and
553 * never accessible by userspace APIs, so this is safe to do.
554 */
555 spin_unlock(&base->lock);
556 update_process_times(user_mode(regs)); 554 update_process_times(user_mode(regs));
557 profile_tick(CPU_PROFILING); 555 profile_tick(CPU_PROFILING);
558 spin_lock(&base->lock);
559 } 556 }
560 557
561 /* Do not restart, when we are in the idle loop */ 558 /* Do not restart, when we are in the idle loop */
diff --git a/kernel/timer.c b/kernel/timer.c
index 2a00c22203f3..f739dfb539ce 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -896,7 +896,7 @@ static void run_timer_softirq(struct softirq_action *h)
896{ 896{
897 tvec_base_t *base = __get_cpu_var(tvec_bases); 897 tvec_base_t *base = __get_cpu_var(tvec_bases);
898 898
899 hrtimer_run_queues(); 899 hrtimer_run_pending();
900 900
901 if (time_after_eq(jiffies, base->timer_jiffies)) 901 if (time_after_eq(jiffies, base->timer_jiffies))
902 __run_timers(base); 902 __run_timers(base);
@@ -907,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h)
907 */ 907 */
908void run_local_timers(void) 908void run_local_timers(void)
909{ 909{
910 hrtimer_run_queues();
910 raise_softirq(TIMER_SOFTIRQ); 911 raise_softirq(TIMER_SOFTIRQ);
911 softlockup_tick(); 912 softlockup_tick();
912} 913}
diff --git a/kernel/user.c b/kernel/user.c
index ab4fd706993b..bc1c48d35cb3 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -319,7 +319,7 @@ void free_uid(struct user_struct *up)
319struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 319struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
320{ 320{
321 struct hlist_head *hashent = uidhashentry(ns, uid); 321 struct hlist_head *hashent = uidhashentry(ns, uid);
322 struct user_struct *up; 322 struct user_struct *up, *new;
323 323
324 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 324 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
325 * atomic. 325 * atomic.
@@ -331,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
331 spin_unlock_irq(&uidhash_lock); 331 spin_unlock_irq(&uidhash_lock);
332 332
333 if (!up) { 333 if (!up) {
334 struct user_struct *new;
335
336 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); 334 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
337 if (!new) { 335 if (!new)
338 uids_mutex_unlock(); 336 goto out_unlock;
339 return NULL;
340 }
341 337
342 new->uid = uid; 338 new->uid = uid;
343 atomic_set(&new->__count, 1); 339 atomic_set(&new->__count, 1);
@@ -353,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
353#endif 349#endif
354 new->locked_shm = 0; 350 new->locked_shm = 0;
355 351
356 if (alloc_uid_keyring(new, current) < 0) { 352 if (alloc_uid_keyring(new, current) < 0)
357 kmem_cache_free(uid_cachep, new); 353 goto out_free_user;
358 uids_mutex_unlock();
359 return NULL;
360 }
361 354
362 if (sched_create_user(new) < 0) { 355 if (sched_create_user(new) < 0)
363 key_put(new->uid_keyring); 356 goto out_put_keys;
364 key_put(new->session_keyring);
365 kmem_cache_free(uid_cachep, new);
366 uids_mutex_unlock();
367 return NULL;
368 }
369 357
370 if (uids_user_create(new)) { 358 if (uids_user_create(new))
371 sched_destroy_user(new); 359 goto out_destoy_sched;
372 key_put(new->uid_keyring);
373 key_put(new->session_keyring);
374 kmem_cache_free(uid_cachep, new);
375 uids_mutex_unlock();
376 return NULL;
377 }
378 360
379 /* 361 /*
380 * Before adding this, check whether we raced 362 * Before adding this, check whether we raced
@@ -402,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
402 uids_mutex_unlock(); 384 uids_mutex_unlock();
403 385
404 return up; 386 return up;
387
388out_destoy_sched:
389 sched_destroy_user(new);
390out_put_keys:
391 key_put(new->uid_keyring);
392 key_put(new->session_keyring);
393out_free_user:
394 kmem_cache_free(uid_cachep, new);
395out_unlock:
396 uids_mutex_unlock();
397 return NULL;
405} 398}
406 399
407void switch_uid(struct user_struct *new_user) 400void switch_uid(struct user_struct *new_user)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8db0b597509e..52db48e7f6e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -67,9 +67,8 @@ struct workqueue_struct {
67#endif 67#endif
68}; 68};
69 69
70/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 70/* Serializes the accesses to the list of workqueues. */
71 threads to each one as cpus come/go. */ 71static DEFINE_SPINLOCK(workqueue_lock);
72static DEFINE_MUTEX(workqueue_mutex);
73static LIST_HEAD(workqueues); 72static LIST_HEAD(workqueues);
74 73
75static int singlethread_cpu __read_mostly; 74static int singlethread_cpu __read_mostly;
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
592 * Returns zero on success. 591 * Returns zero on success.
593 * Returns -ve errno on failure. 592 * Returns -ve errno on failure.
594 * 593 *
595 * Appears to be racy against CPU hotplug.
596 *
597 * schedule_on_each_cpu() is very slow. 594 * schedule_on_each_cpu() is very slow.
598 */ 595 */
599int schedule_on_each_cpu(work_func_t func) 596int schedule_on_each_cpu(work_func_t func)
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
605 if (!works) 602 if (!works)
606 return -ENOMEM; 603 return -ENOMEM;
607 604
608 preempt_disable(); /* CPU hotplug */ 605 get_online_cpus();
609 for_each_online_cpu(cpu) { 606 for_each_online_cpu(cpu) {
610 struct work_struct *work = per_cpu_ptr(works, cpu); 607 struct work_struct *work = per_cpu_ptr(works, cpu);
611 608
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
613 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 610 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
614 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); 611 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
615 } 612 }
616 preempt_enable();
617 flush_workqueue(keventd_wq); 613 flush_workqueue(keventd_wq);
614 put_online_cpus();
618 free_percpu(works); 615 free_percpu(works);
619 return 0; 616 return 0;
620} 617}
@@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
750 err = create_workqueue_thread(cwq, singlethread_cpu); 747 err = create_workqueue_thread(cwq, singlethread_cpu);
751 start_workqueue_thread(cwq, -1); 748 start_workqueue_thread(cwq, -1);
752 } else { 749 } else {
753 mutex_lock(&workqueue_mutex); 750 get_online_cpus();
751 spin_lock(&workqueue_lock);
754 list_add(&wq->list, &workqueues); 752 list_add(&wq->list, &workqueues);
753 spin_unlock(&workqueue_lock);
755 754
756 for_each_possible_cpu(cpu) { 755 for_each_possible_cpu(cpu) {
757 cwq = init_cpu_workqueue(wq, cpu); 756 cwq = init_cpu_workqueue(wq, cpu);
@@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
760 err = create_workqueue_thread(cwq, cpu); 759 err = create_workqueue_thread(cwq, cpu);
761 start_workqueue_thread(cwq, cpu); 760 start_workqueue_thread(cwq, cpu);
762 } 761 }
763 mutex_unlock(&workqueue_mutex); 762 put_online_cpus();
764 } 763 }
765 764
766 if (err) { 765 if (err) {
@@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
775{ 774{
776 /* 775 /*
777 * Our caller is either destroy_workqueue() or CPU_DEAD, 776 * Our caller is either destroy_workqueue() or CPU_DEAD,
778 * workqueue_mutex protects cwq->thread 777 * get_online_cpus() protects cwq->thread.
779 */ 778 */
780 if (cwq->thread == NULL) 779 if (cwq->thread == NULL)
781 return; 780 return;
@@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
810 struct cpu_workqueue_struct *cwq; 809 struct cpu_workqueue_struct *cwq;
811 int cpu; 810 int cpu;
812 811
813 mutex_lock(&workqueue_mutex); 812 get_online_cpus();
813 spin_lock(&workqueue_lock);
814 list_del(&wq->list); 814 list_del(&wq->list);
815 mutex_unlock(&workqueue_mutex); 815 spin_unlock(&workqueue_lock);
816 put_online_cpus();
816 817
817 for_each_cpu_mask(cpu, *cpu_map) { 818 for_each_cpu_mask(cpu, *cpu_map) {
818 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 819 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
835 action &= ~CPU_TASKS_FROZEN; 836 action &= ~CPU_TASKS_FROZEN;
836 837
837 switch (action) { 838 switch (action) {
838 case CPU_LOCK_ACQUIRE:
839 mutex_lock(&workqueue_mutex);
840 return NOTIFY_OK;
841
842 case CPU_LOCK_RELEASE:
843 mutex_unlock(&workqueue_mutex);
844 return NOTIFY_OK;
845 839
846 case CPU_UP_PREPARE: 840 case CPU_UP_PREPARE:
847 cpu_set(cpu, cpu_populated_map); 841 cpu_set(cpu, cpu_populated_map);
@@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
854 case CPU_UP_PREPARE: 848 case CPU_UP_PREPARE:
855 if (!create_workqueue_thread(cwq, cpu)) 849 if (!create_workqueue_thread(cwq, cpu))
856 break; 850 break;
857 printk(KERN_ERR "workqueue for %i failed\n", cpu); 851 printk(KERN_ERR "workqueue [%s] for %i failed\n",
852 wq->name, cpu);
858 return NOTIFY_BAD; 853 return NOTIFY_BAD;
859 854
860 case CPU_ONLINE: 855 case CPU_ONLINE: