Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz             |    2
-rw-r--r--  kernel/Kconfig.preempt        |   13
-rw-r--r--  kernel/Makefile               |    8
-rw-r--r--  kernel/acct.c                 |    2
-rw-r--r--  kernel/backtracetest.c        |   48
-rw-r--r--  kernel/cpu.c                  |  164
-rw-r--r--  kernel/cpuset.c               |   14
-rw-r--r--  kernel/extable.c              |    3
-rw-r--r--  kernel/fork.c                 |   49
-rw-r--r--  kernel/futex.c                |   51
-rw-r--r--  kernel/hrtimer.c              |  266
-rw-r--r--  kernel/irq/chip.c             |    9
-rw-r--r--  kernel/irq/manage.c           |    3
-rw-r--r--  kernel/irq/proc.c             |   21
-rw-r--r--  kernel/irq/spurious.c         |    5
-rw-r--r--  kernel/kallsyms.c             |   11
-rw-r--r--  kernel/kexec.c                |    1
-rw-r--r--  kernel/kmod.c                 |   13
-rw-r--r--  kernel/kprobes.c              |    2
-rw-r--r--  kernel/ksysfs.c               |   82
-rw-r--r--  kernel/kthread.c              |   12
-rw-r--r--  kernel/latencytop.c           |  239
-rw-r--r--  kernel/lockdep.c              |   44
-rw-r--r--  kernel/module.c               |  286
-rw-r--r--  kernel/panic.c                |   41
-rw-r--r--  kernel/params.c               |   44
-rw-r--r--  kernel/posix-cpu-timers.c     |   30
-rw-r--r--  kernel/power/disk.c           |   20
-rw-r--r--  kernel/power/main.c           |   26
-rw-r--r--  kernel/power/pm.c             |    4
-rw-r--r--  kernel/power/power.h          |    4
-rw-r--r--  kernel/printk.c               |   68
-rw-r--r--  kernel/profile.c              |   99
-rw-r--r--  kernel/ptrace.c               |  173
-rw-r--r--  kernel/rcuclassic.c           |  575
-rw-r--r--  kernel/rcupdate.c             |  576
-rw-r--r--  kernel/rcupreempt.c           |  953
-rw-r--r--  kernel/rcupreempt_trace.c     |  330
-rw-r--r--  kernel/rcutorture.c           |    6
-rw-r--r--  kernel/rtmutex-tester.c       |    2
-rw-r--r--  kernel/rwsem.c                |    5
-rw-r--r--  kernel/sched.c                | 1429
-rw-r--r--  kernel/sched_debug.c          |   13
-rw-r--r--  kernel/sched_fair.c           |  400
-rw-r--r--  kernel/sched_idletask.c       |   42
-rw-r--r--  kernel/sched_rt.c             | 1114
-rw-r--r--  kernel/signal.c               |    4
-rw-r--r--  kernel/softirq.c              |   11
-rw-r--r--  kernel/softlockup.c           |  116
-rw-r--r--  kernel/spinlock.c             |    3
-rw-r--r--  kernel/stop_machine.c         |    4
-rw-r--r--  kernel/sys_ni.c               |    1
-rw-r--r--  kernel/sysctl.c               |  283
-rw-r--r--  kernel/sysctl_check.c         |   33
-rw-r--r--  kernel/test_kprobes.c         |  216
-rw-r--r--  kernel/time/clockevents.c     |   18
-rw-r--r--  kernel/time/clocksource.c     |   33
-rw-r--r--  kernel/time/tick-broadcast.c  |   63
-rw-r--r--  kernel/time/tick-internal.h   |    2
-rw-r--r--  kernel/time/tick-sched.c      |   89
-rw-r--r--  kernel/time/timekeeping.c     |   30
-rw-r--r--  kernel/time/timer_stats.c     |    2
-rw-r--r--  kernel/timer.c                |   93
-rw-r--r--  kernel/user.c                 |  152
-rw-r--r--  kernel/workqueue.c            |   40
65 files changed, 6558 insertions, 1937 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 4af15802ccd4..526128a2e622 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -54,3 +54,5 @@ config HZ
54 default 300 if HZ_300 54 default 300 if HZ_300
55 default 1000 if HZ_1000 55 default 1000 if HZ_1000
56 56
57config SCHED_HRTICK
58 def_bool HIGH_RES_TIMERS && X86
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c64ce9c14207..0669b70fa6a3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,14 +52,13 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_BKL 55config RCU_TRACE
56 bool "Preempt The Big Kernel Lock" 56 bool "Enable tracing for RCU - currently stats in debugfs"
57 depends on SMP || PREEMPT 57 select DEBUG_FS
58 default y 58 default y
59 help 59 help
60 This option reduces the latency of the kernel by making the 60 This option provides tracing in RCU which presents stats
61 big kernel lock preemptible. 61 in debugfs for debugging RCU implementation.
62 62
63 Say Y here if you are building a kernel for a desktop system. 63 Say Y here if you want to enable RCU tracing
64 Say N if you are unsure. 64 Say N if you are unsure.
65
diff --git a/kernel/Makefile b/kernel/Makefile
index dfa96956dae0..8885627ea021 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
36obj-$(CONFIG_PM) += power/ 36obj-$(CONFIG_PM) += power/
37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
38obj-$(CONFIG_KEXEC) += kexec.o 38obj-$(CONFIG_KEXEC) += kexec.o
39obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
39obj-$(CONFIG_COMPAT) += compat.o 40obj-$(CONFIG_COMPAT) += compat.o
40obj-$(CONFIG_CGROUPS) += cgroup.o 41obj-$(CONFIG_CGROUPS) += cgroup.o
41obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o 42obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
@@ -43,6 +44,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o
43obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 44obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
44obj-$(CONFIG_IKCONFIG) += configs.o 45obj-$(CONFIG_IKCONFIG) += configs.o
45obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 46obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
47obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
46obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 48obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
47obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 49obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
48obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 50obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -52,11 +54,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
52obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 54obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
53obj-$(CONFIG_SECCOMP) += seccomp.o 55obj-$(CONFIG_SECCOMP) += seccomp.o
54obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 56obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
57obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
58obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
59ifeq ($(CONFIG_PREEMPT_RCU),y)
60obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
61endif
55obj-$(CONFIG_RELAY) += relay.o 62obj-$(CONFIG_RELAY) += relay.o
56obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 63obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
57obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 64obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
58obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 65obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
59obj-$(CONFIG_MARKERS) += marker.o 66obj-$(CONFIG_MARKERS) += marker.o
67obj-$(CONFIG_LATENCYTOP) += latencytop.o
60 68
61ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 69ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
62# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 70# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index cf19547cc9e4..521dfa53cb99 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -482,7 +482,7 @@ static void do_acct_process(struct file *file)
482#endif 482#endif
483#if ACCT_VERSION==3 483#if ACCT_VERSION==3
484 ac.ac_pid = current->tgid; 484 ac.ac_pid = current->tgid;
485 ac.ac_ppid = current->parent->tgid; 485 ac.ac_ppid = current->real_parent->tgid;
486#endif 486#endif
487 487
488 spin_lock_irq(&current->sighand->siglock); 488 spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
new file mode 100644
index 000000000000..d1a7605c5b8f
--- /dev/null
+++ b/kernel/backtracetest.c
@@ -0,0 +1,48 @@
1/*
2 * Simple stack backtrace regression test module
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13#include <linux/module.h>
14#include <linux/sched.h>
15#include <linux/delay.h>
16
17static struct timer_list backtrace_timer;
18
19static void backtrace_test_timer(unsigned long data)
20{
21 printk("Testing a backtrace from irq context.\n");
22 printk("The following trace is a kernel self test and not a bug!\n");
23 dump_stack();
24}
25static int backtrace_regression_test(void)
26{
27 printk("====[ backtrace testing ]===========\n");
28 printk("Testing a backtrace from process context.\n");
29 printk("The following trace is a kernel self test and not a bug!\n");
30 dump_stack();
31
32 init_timer(&backtrace_timer);
33 backtrace_timer.function = backtrace_test_timer;
34 mod_timer(&backtrace_timer, jiffies + 10);
35
36 msleep(10);
37 printk("====[ end of backtrace testing ]====\n");
38 return 0;
39}
40
41static void exitf(void)
42{
43}
44
45module_init(backtrace_regression_test);
46module_exit(exitf);
47MODULE_LICENSE("GPL");
48MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6b3a0c15144f..e0d3a4f56ecb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,9 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 19static DEFINE_MUTEX(cpu_add_remove_lock);
20static DEFINE_MUTEX(cpu_bitmask_lock);
21 20
22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 21static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
23 22
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
26 */ 25 */
27static int cpu_hotplug_disabled; 26static int cpu_hotplug_disabled;
28 27
29#ifdef CONFIG_HOTPLUG_CPU 28static struct {
29 struct task_struct *active_writer;
30 struct mutex lock; /* Synchronizes accesses to refcount, */
31 /*
32 * Also blocks the new readers during
33 * an ongoing cpu hotplug operation.
34 */
35 int refcount;
36 wait_queue_head_t writer_queue;
37} cpu_hotplug;
30 38
31/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ 39#define writer_exists() (cpu_hotplug.active_writer != NULL)
32static struct task_struct *recursive;
33static int recursive_depth;
34 40
35void lock_cpu_hotplug(void) 41void __init cpu_hotplug_init(void)
36{ 42{
37 struct task_struct *tsk = current; 43 cpu_hotplug.active_writer = NULL;
38 44 mutex_init(&cpu_hotplug.lock);
39 if (tsk == recursive) { 45 cpu_hotplug.refcount = 0;
40 static int warnings = 10; 46 init_waitqueue_head(&cpu_hotplug.writer_queue);
41 if (warnings) { 47}
42 printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); 48
43 WARN_ON(1); 49#ifdef CONFIG_HOTPLUG_CPU
44 warnings--; 50
45 } 51void get_online_cpus(void)
46 recursive_depth++; 52{
53 might_sleep();
54 if (cpu_hotplug.active_writer == current)
47 return; 55 return;
48 } 56 mutex_lock(&cpu_hotplug.lock);
49 mutex_lock(&cpu_bitmask_lock); 57 cpu_hotplug.refcount++;
50 recursive = tsk; 58 mutex_unlock(&cpu_hotplug.lock);
59
51} 60}
52EXPORT_SYMBOL_GPL(lock_cpu_hotplug); 61EXPORT_SYMBOL_GPL(get_online_cpus);
53 62
54void unlock_cpu_hotplug(void) 63void put_online_cpus(void)
55{ 64{
56 WARN_ON(recursive != current); 65 if (cpu_hotplug.active_writer == current)
57 if (recursive_depth) {
58 recursive_depth--;
59 return; 66 return;
60 } 67 mutex_lock(&cpu_hotplug.lock);
61 recursive = NULL; 68 cpu_hotplug.refcount--;
62 mutex_unlock(&cpu_bitmask_lock); 69
70 if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
71 wake_up(&cpu_hotplug.writer_queue);
72
73 mutex_unlock(&cpu_hotplug.lock);
74
63} 75}
64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 76EXPORT_SYMBOL_GPL(put_online_cpus);
65 77
66#endif /* CONFIG_HOTPLUG_CPU */ 78#endif /* CONFIG_HOTPLUG_CPU */
67 79
80/*
81 * The following two API's must be used when attempting
82 * to serialize the updates to cpu_online_map, cpu_present_map.
83 */
84void cpu_maps_update_begin(void)
85{
86 mutex_lock(&cpu_add_remove_lock);
87}
88
89void cpu_maps_update_done(void)
90{
91 mutex_unlock(&cpu_add_remove_lock);
92}
93
94/*
95 * This ensures that the hotplug operation can begin only when the
96 * refcount goes to zero.
97 *
98 * Note that during a cpu-hotplug operation, the new readers, if any,
99 * will be blocked by the cpu_hotplug.lock
100 *
 101 * Since cpu_hotplug_begin() is always called after invoking
 102 * cpu_maps_update_begin(), we can be sure that only one writer is active.
103 *
104 * Note that theoretically, there is a possibility of a livelock:
105 * - Refcount goes to zero, last reader wakes up the sleeping
106 * writer.
107 * - Last reader unlocks the cpu_hotplug.lock.
108 * - A new reader arrives at this moment, bumps up the refcount.
109 * - The writer acquires the cpu_hotplug.lock finds the refcount
110 * non zero and goes to sleep again.
111 *
112 * However, this is very difficult to achieve in practice since
113 * get_online_cpus() not an api which is called all that often.
114 *
115 */
116static void cpu_hotplug_begin(void)
117{
118 DECLARE_WAITQUEUE(wait, current);
119
120 mutex_lock(&cpu_hotplug.lock);
121
122 cpu_hotplug.active_writer = current;
123 add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
124 while (cpu_hotplug.refcount) {
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 mutex_unlock(&cpu_hotplug.lock);
127 schedule();
128 mutex_lock(&cpu_hotplug.lock);
129 }
130 remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
131}
132
133static void cpu_hotplug_done(void)
134{
135 cpu_hotplug.active_writer = NULL;
136 mutex_unlock(&cpu_hotplug.lock);
137}
68/* Need to know about CPUs going up/down? */ 138/* Need to know about CPUs going up/down? */
69int __cpuinit register_cpu_notifier(struct notifier_block *nb) 139int __cpuinit register_cpu_notifier(struct notifier_block *nb)
70{ 140{
71 int ret; 141 int ret;
72 mutex_lock(&cpu_add_remove_lock); 142 cpu_maps_update_begin();
73 ret = raw_notifier_chain_register(&cpu_chain, nb); 143 ret = raw_notifier_chain_register(&cpu_chain, nb);
74 mutex_unlock(&cpu_add_remove_lock); 144 cpu_maps_update_done();
75 return ret; 145 return ret;
76} 146}
77 147
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
81 151
82void unregister_cpu_notifier(struct notifier_block *nb) 152void unregister_cpu_notifier(struct notifier_block *nb)
83{ 153{
84 mutex_lock(&cpu_add_remove_lock); 154 cpu_maps_update_begin();
85 raw_notifier_chain_unregister(&cpu_chain, nb); 155 raw_notifier_chain_unregister(&cpu_chain, nb);
86 mutex_unlock(&cpu_add_remove_lock); 156 cpu_maps_update_done();
87} 157}
88EXPORT_SYMBOL(unregister_cpu_notifier); 158EXPORT_SYMBOL(unregister_cpu_notifier);
89 159
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
147 if (!cpu_online(cpu)) 217 if (!cpu_online(cpu))
148 return -EINVAL; 218 return -EINVAL;
149 219
150 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); 220 cpu_hotplug_begin();
151 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 221 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
152 hcpu, -1, &nr_calls); 222 hcpu, -1, &nr_calls);
153 if (err == NOTIFY_BAD) { 223 if (err == NOTIFY_BAD) {
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
166 cpu_clear(cpu, tmp); 236 cpu_clear(cpu, tmp);
167 set_cpus_allowed(current, tmp); 237 set_cpus_allowed(current, tmp);
168 238
169 mutex_lock(&cpu_bitmask_lock);
170 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
171 mutex_unlock(&cpu_bitmask_lock);
172 240
173 if (IS_ERR(p) || cpu_online(cpu)) { 241 if (IS_ERR(p) || cpu_online(cpu)) {
174 /* CPU didn't die: tell everyone. Can't complain. */ 242 /* CPU didn't die: tell everyone. Can't complain. */
@@ -202,7 +270,7 @@ out_thread:
202out_allowed: 270out_allowed:
203 set_cpus_allowed(current, old_allowed); 271 set_cpus_allowed(current, old_allowed);
204out_release: 272out_release:
205 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); 273 cpu_hotplug_done();
206 return err; 274 return err;
207} 275}
208 276
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu)
210{ 278{
211 int err = 0; 279 int err = 0;
212 280
213 mutex_lock(&cpu_add_remove_lock); 281 cpu_maps_update_begin();
214 if (cpu_hotplug_disabled) 282 if (cpu_hotplug_disabled)
215 err = -EBUSY; 283 err = -EBUSY;
216 else 284 else
217 err = _cpu_down(cpu, 0); 285 err = _cpu_down(cpu, 0);
218 286
219 mutex_unlock(&cpu_add_remove_lock); 287 cpu_maps_update_done();
220 return err; 288 return err;
221} 289}
222#endif /*CONFIG_HOTPLUG_CPU*/ 290#endif /*CONFIG_HOTPLUG_CPU*/
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
231 if (cpu_online(cpu) || !cpu_present(cpu)) 299 if (cpu_online(cpu) || !cpu_present(cpu))
232 return -EINVAL; 300 return -EINVAL;
233 301
234 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); 302 cpu_hotplug_begin();
235 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 303 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
236 -1, &nr_calls); 304 -1, &nr_calls);
237 if (ret == NOTIFY_BAD) { 305 if (ret == NOTIFY_BAD) {
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
243 } 311 }
244 312
245 /* Arch-specific enabling code. */ 313 /* Arch-specific enabling code. */
246 mutex_lock(&cpu_bitmask_lock);
247 ret = __cpu_up(cpu); 314 ret = __cpu_up(cpu);
248 mutex_unlock(&cpu_bitmask_lock);
249 if (ret != 0) 315 if (ret != 0)
250 goto out_notify; 316 goto out_notify;
251 BUG_ON(!cpu_online(cpu)); 317 BUG_ON(!cpu_online(cpu));
@@ -257,7 +323,7 @@ out_notify:
257 if (ret != 0) 323 if (ret != 0)
258 __raw_notifier_call_chain(&cpu_chain, 324 __raw_notifier_call_chain(&cpu_chain,
259 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 325 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
260 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); 326 cpu_hotplug_done();
261 327
262 return ret; 328 return ret;
263} 329}
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu)
275 return -EINVAL; 341 return -EINVAL;
276 } 342 }
277 343
278 mutex_lock(&cpu_add_remove_lock); 344 cpu_maps_update_begin();
279 if (cpu_hotplug_disabled) 345 if (cpu_hotplug_disabled)
280 err = -EBUSY; 346 err = -EBUSY;
281 else 347 else
282 err = _cpu_up(cpu, 0); 348 err = _cpu_up(cpu, 0);
283 349
284 mutex_unlock(&cpu_add_remove_lock); 350 cpu_maps_update_done();
285 return err; 351 return err;
286} 352}
287 353
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void)
292{ 358{
293 int cpu, first_cpu, error = 0; 359 int cpu, first_cpu, error = 0;
294 360
295 mutex_lock(&cpu_add_remove_lock); 361 cpu_maps_update_begin();
296 first_cpu = first_cpu(cpu_online_map); 362 first_cpu = first_cpu(cpu_online_map);
297 /* We take down all of the non-boot CPUs in one shot to avoid races 363 /* We take down all of the non-boot CPUs in one shot to avoid races
298 * with the userspace trying to use the CPU hotplug at the same time 364 * with the userspace trying to use the CPU hotplug at the same time
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void)
319 } else { 385 } else {
320 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 386 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
321 } 387 }
322 mutex_unlock(&cpu_add_remove_lock); 388 cpu_maps_update_done();
323 return error; 389 return error;
324} 390}
325 391
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void)
328 int cpu, error; 394 int cpu, error;
329 395
330 /* Allow everyone to use the CPU hotplug again */ 396 /* Allow everyone to use the CPU hotplug again */
331 mutex_lock(&cpu_add_remove_lock); 397 cpu_maps_update_begin();
332 cpu_hotplug_disabled = 0; 398 cpu_hotplug_disabled = 0;
333 if (cpus_empty(frozen_cpus)) 399 if (cpus_empty(frozen_cpus))
334 goto out; 400 goto out;
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void)
344 } 410 }
345 cpus_clear(frozen_cpus); 411 cpus_clear(frozen_cpus);
346out: 412out:
347 mutex_unlock(&cpu_add_remove_lock); 413 cpu_maps_update_done();
348} 414}
349#endif /* CONFIG_PM_SLEEP_SMP */ 415#endif /* CONFIG_PM_SLEEP_SMP */
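For context, a minimal sketch (not part of the patch) of how a reader is expected to migrate from the removed lock_cpu_hotplug()/unlock_cpu_hotplug() pair to the refcounted API added above; walk_online_cpus() is a hypothetical helper, only illustrating that cpu_online_map stays stable between get_online_cpus() and put_online_cpus():

	#include <linux/cpu.h>		/* get_online_cpus(), put_online_cpus() */
	#include <linux/cpumask.h>

	/* Hypothetical reader: iterate online CPUs without racing a hotplug writer. */
	static void walk_online_cpus(void (*fn)(int cpu))
	{
		int cpu;

		get_online_cpus();	/* was: lock_cpu_hotplug(); */
		for_each_online_cpu(cpu)
			fn(cpu);
		put_online_cpus();	/* was: unlock_cpu_hotplug(); */
	}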
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 50f5dc463688..cfaf6419d817 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
537 * 537 *
538 * Call with cgroup_mutex held. May take callback_mutex during 538 * Call with cgroup_mutex held. May take callback_mutex during
539 * call due to the kfifo_alloc() and kmalloc() calls. May nest 539 * call due to the kfifo_alloc() and kmalloc() calls. May nest
540 * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 540 * a call to the get_online_cpus()/put_online_cpus() pair.
541 * Must not be called holding callback_mutex, because we must not 541 * Must not be called holding callback_mutex, because we must not
542 * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere 542 * call get_online_cpus() while holding callback_mutex. Elsewhere
543 * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. 543 * the kernel nests callback_mutex inside get_online_cpus() calls.
544 * So the reverse nesting would risk an ABBA deadlock. 544 * So the reverse nesting would risk an ABBA deadlock.
545 * 545 *
546 * The three key local variables below are: 546 * The three key local variables below are:
@@ -691,9 +691,9 @@ restart:
691 691
692rebuild: 692rebuild:
693 /* Have scheduler rebuild sched domains */ 693 /* Have scheduler rebuild sched domains */
694 lock_cpu_hotplug(); 694 get_online_cpus();
695 partition_sched_domains(ndoms, doms); 695 partition_sched_domains(ndoms, doms);
696 unlock_cpu_hotplug(); 696 put_online_cpus();
697 697
698done: 698done:
699 if (q && !IS_ERR(q)) 699 if (q && !IS_ERR(q))
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
1617 * 1617 *
1618 * If the cpuset being removed has its flag 'sched_load_balance' 1618 * If the cpuset being removed has its flag 'sched_load_balance'
1619 * enabled, then simulate turning sched_load_balance off, which 1619 * enabled, then simulate turning sched_load_balance off, which
1620 * will call rebuild_sched_domains(). The lock_cpu_hotplug() 1620 * will call rebuild_sched_domains(). The get_online_cpus()
1621 * call in rebuild_sched_domains() must not be made while holding 1621 * call in rebuild_sched_domains() must not be made while holding
1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside 1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1623 * lock_cpu_hotplug() calls. So the reverse nesting would risk an 1623 * get_online_cpus() calls. So the reverse nesting would risk an
1624 * ABBA deadlock. 1624 * ABBA deadlock.
1625 */ 1625 */
1626 1626
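A short sketch of the lock ordering the updated comments describe: the cpu hotplug read side is taken outside, callback_mutex inside; the reverse nesting is the ABBA deadlock the comment warns about. The callback_mutex here is only a stand-in for cpuset's private mutex, and rebuild_domains_example() is hypothetical:

	#include <linux/cpu.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(callback_mutex);	/* stand-in for cpuset's callback_mutex */

	static void rebuild_domains_example(void)
	{
		get_online_cpus();
		mutex_lock(&callback_mutex);
		/* ... read cpus_allowed masks, rebuild sched domains ... */
		mutex_unlock(&callback_mutex);
		put_online_cpus();
	}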
diff --git a/kernel/extable.c b/kernel/extable.c
index 7fe262855317..a26cb2e17023 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr)
46 addr <= (unsigned long)_etext) 46 addr <= (unsigned long)_etext)
47 return 1; 47 return 1;
48 48
49 if (addr >= (unsigned long)_sinittext && 49 if (system_state == SYSTEM_BOOTING &&
50 addr >= (unsigned long)_sinittext &&
50 addr <= (unsigned long)_einittext) 51 addr <= (unsigned long)_einittext)
51 return 1; 52 return 1;
52 return 0; 53 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff281009..05e0b6f4365b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -51,6 +51,7 @@
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h> 52#include <linux/tty.h>
53#include <linux/proc_fs.h> 53#include <linux/proc_fs.h>
54#include <linux/blkdev.h>
54 55
55#include <asm/pgtable.h> 56#include <asm/pgtable.h>
56#include <asm/pgalloc.h> 57#include <asm/pgalloc.h>
@@ -392,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
392 destroy_context(mm); 393 destroy_context(mm);
393 free_mm(mm); 394 free_mm(mm);
394} 395}
396EXPORT_SYMBOL_GPL(__mmdrop);
395 397
396/* 398/*
397 * Decrement the use count and release all resources for an mm. 399 * Decrement the use count and release all resources for an mm.
@@ -791,6 +793,31 @@ out:
791 return error; 793 return error;
792} 794}
793 795
796static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
797{
798#ifdef CONFIG_BLOCK
799 struct io_context *ioc = current->io_context;
800
801 if (!ioc)
802 return 0;
803 /*
804 * Share io context with parent, if CLONE_IO is set
805 */
806 if (clone_flags & CLONE_IO) {
807 tsk->io_context = ioc_task_link(ioc);
808 if (unlikely(!tsk->io_context))
809 return -ENOMEM;
810 } else if (ioprio_valid(ioc->ioprio)) {
811 tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
812 if (unlikely(!tsk->io_context))
813 return -ENOMEM;
814
815 tsk->io_context->ioprio = ioc->ioprio;
816 }
817#endif
818 return 0;
819}
820
794/* 821/*
795 * Helper to unshare the files of the current task. 822 * Helper to unshare the files of the current task.
796 * We don't want to expose copy_files internals to 823 * We don't want to expose copy_files internals to
@@ -1045,6 +1072,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1045 copy_flags(clone_flags, p); 1072 copy_flags(clone_flags, p);
1046 INIT_LIST_HEAD(&p->children); 1073 INIT_LIST_HEAD(&p->children);
1047 INIT_LIST_HEAD(&p->sibling); 1074 INIT_LIST_HEAD(&p->sibling);
1075#ifdef CONFIG_PREEMPT_RCU
1076 p->rcu_read_lock_nesting = 0;
1077 p->rcu_flipctr_idx = 0;
1078#endif /* #ifdef CONFIG_PREEMPT_RCU */
1048 p->vfork_done = NULL; 1079 p->vfork_done = NULL;
1049 spin_lock_init(&p->alloc_lock); 1080 spin_lock_init(&p->alloc_lock);
1050 1081
@@ -1059,6 +1090,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1059 p->prev_utime = cputime_zero; 1090 p->prev_utime = cputime_zero;
1060 p->prev_stime = cputime_zero; 1091 p->prev_stime = cputime_zero;
1061 1092
1093#ifdef CONFIG_DETECT_SOFTLOCKUP
1094 p->last_switch_count = 0;
1095 p->last_switch_timestamp = 0;
1096#endif
1097
1062#ifdef CONFIG_TASK_XACCT 1098#ifdef CONFIG_TASK_XACCT
1063 p->rchar = 0; /* I/O counter: bytes read */ 1099 p->rchar = 0; /* I/O counter: bytes read */
1064 p->wchar = 0; /* I/O counter: bytes written */ 1100 p->wchar = 0; /* I/O counter: bytes written */
@@ -1147,15 +1183,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1147 goto bad_fork_cleanup_mm; 1183 goto bad_fork_cleanup_mm;
1148 if ((retval = copy_namespaces(clone_flags, p))) 1184 if ((retval = copy_namespaces(clone_flags, p)))
1149 goto bad_fork_cleanup_keys; 1185 goto bad_fork_cleanup_keys;
1186 if ((retval = copy_io(clone_flags, p)))
1187 goto bad_fork_cleanup_namespaces;
1150 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1188 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
1151 if (retval) 1189 if (retval)
1152 goto bad_fork_cleanup_namespaces; 1190 goto bad_fork_cleanup_io;
1153 1191
1154 if (pid != &init_struct_pid) { 1192 if (pid != &init_struct_pid) {
1155 retval = -ENOMEM; 1193 retval = -ENOMEM;
1156 pid = alloc_pid(task_active_pid_ns(p)); 1194 pid = alloc_pid(task_active_pid_ns(p));
1157 if (!pid) 1195 if (!pid)
1158 goto bad_fork_cleanup_namespaces; 1196 goto bad_fork_cleanup_io;
1159 1197
1160 if (clone_flags & CLONE_NEWPID) { 1198 if (clone_flags & CLONE_NEWPID) {
1161 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1199 retval = pid_ns_prepare_proc(task_active_pid_ns(p));
@@ -1196,6 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1196#ifdef TIF_SYSCALL_EMU 1234#ifdef TIF_SYSCALL_EMU
1197 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1235 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1198#endif 1236#endif
1237 clear_all_latency_tracing(p);
1199 1238
1200 /* Our parent execution domain becomes current domain 1239 /* Our parent execution domain becomes current domain
1201 These must match for thread signalling to apply */ 1240 These must match for thread signalling to apply */
@@ -1224,9 +1263,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1224 /* Need tasklist lock for parent etc handling! */ 1263 /* Need tasklist lock for parent etc handling! */
1225 write_lock_irq(&tasklist_lock); 1264 write_lock_irq(&tasklist_lock);
1226 1265
1227 /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
1228 p->ioprio = current->ioprio;
1229
1230 /* 1266 /*
1231 * The task hasn't been attached yet, so its cpus_allowed mask will 1267 * The task hasn't been attached yet, so its cpus_allowed mask will
1232 * not be changed, nor will its assigned CPU. 1268 * not be changed, nor will its assigned CPU.
@@ -1237,6 +1273,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1237 * parent's CPU). This avoids alot of nasty races. 1273 * parent's CPU). This avoids alot of nasty races.
1238 */ 1274 */
1239 p->cpus_allowed = current->cpus_allowed; 1275 p->cpus_allowed = current->cpus_allowed;
1276 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1240 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || 1277 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1241 !cpu_online(task_cpu(p)))) 1278 !cpu_online(task_cpu(p))))
1242 set_task_cpu(p, smp_processor_id()); 1279 set_task_cpu(p, smp_processor_id());
@@ -1317,6 +1354,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1317bad_fork_free_pid: 1354bad_fork_free_pid:
1318 if (pid != &init_struct_pid) 1355 if (pid != &init_struct_pid)
1319 free_pid(pid); 1356 free_pid(pid);
1357bad_fork_cleanup_io:
1358 put_io_context(p->io_context);
1320bad_fork_cleanup_namespaces: 1359bad_fork_cleanup_namespaces:
1321 exit_task_namespaces(p); 1360 exit_task_namespaces(p);
1322bad_fork_cleanup_keys: 1361bad_fork_cleanup_keys:
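The copy_io() hunk above shares the parent's io_context only when CLONE_IO is passed. A rough userspace illustration, assuming a libc that does not yet define CLONE_IO (the value below matches the kernel header added in this series) and using a throwaway child that simply inherits the shared io context:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#ifndef CLONE_IO
	#define CLONE_IO 0x80000000	/* share io context with the parent */
	#endif

	static char child_stack[64 * 1024];

	static int child_fn(void *arg)
	{
		/* Child shares the parent's io_context (and thus its ioprio). */
		return 0;
	}

	int main(void)
	{
		int pid = clone(child_fn, child_stack + sizeof(child_stack),
				CLONE_IO | SIGCHLD, NULL);
		if (pid < 0) {
			perror("clone");
			return 1;
		}
		waitpid(pid, NULL, 0);
		return 0;
	}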
diff --git a/kernel/futex.c b/kernel/futex.c
index 172a1aeeafdb..db9824de8bf0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1097,15 +1097,15 @@ static void unqueue_me_pi(struct futex_q *q)
1097} 1097}
1098 1098
1099/* 1099/*
1100 * Fixup the pi_state owner with current. 1100 * Fixup the pi_state owner with the new owner.
1101 * 1101 *
1102 * Must be called with hash bucket lock held and mm->sem held for non 1102 * Must be called with hash bucket lock held and mm->sem held for non
1103 * private futexes. 1103 * private futexes.
1104 */ 1104 */
1105static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1105static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1106 struct task_struct *curr) 1106 struct task_struct *newowner)
1107{ 1107{
1108 u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS; 1108 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1109 struct futex_pi_state *pi_state = q->pi_state; 1109 struct futex_pi_state *pi_state = q->pi_state;
1110 u32 uval, curval, newval; 1110 u32 uval, curval, newval;
1111 int ret; 1111 int ret;
@@ -1119,12 +1119,12 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1119 } else 1119 } else
1120 newtid |= FUTEX_OWNER_DIED; 1120 newtid |= FUTEX_OWNER_DIED;
1121 1121
1122 pi_state->owner = curr; 1122 pi_state->owner = newowner;
1123 1123
1124 spin_lock_irq(&curr->pi_lock); 1124 spin_lock_irq(&newowner->pi_lock);
1125 WARN_ON(!list_empty(&pi_state->list)); 1125 WARN_ON(!list_empty(&pi_state->list));
1126 list_add(&pi_state->list, &curr->pi_state_list); 1126 list_add(&pi_state->list, &newowner->pi_state_list);
1127 spin_unlock_irq(&curr->pi_lock); 1127 spin_unlock_irq(&newowner->pi_lock);
1128 1128
1129 /* 1129 /*
1130 * We own it, so we have to replace the pending owner 1130 * We own it, so we have to replace the pending owner
@@ -1508,9 +1508,40 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1508 * when we were on the way back before we locked the 1508 * when we were on the way back before we locked the
1509 * hash bucket. 1509 * hash bucket.
1510 */ 1510 */
1511 if (q.pi_state->owner == curr && 1511 if (q.pi_state->owner == curr) {
1512 rt_mutex_trylock(&q.pi_state->pi_mutex)) { 1512 /*
1513 ret = 0; 1513 * Try to get the rt_mutex now. This might
1514 * fail as some other task acquired the
1515 * rt_mutex after we removed ourself from the
1516 * rt_mutex waiters list.
1517 */
1518 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1519 ret = 0;
1520 else {
1521 /*
1522 * pi_state is incorrect, some other
1523 * task did a lock steal and we
1524 * returned due to timeout or signal
1525 * without taking the rt_mutex. Too
1526 * late. We can access the
1527 * rt_mutex_owner without locking, as
1528 * the other task is now blocked on
1529 * the hash bucket lock. Fix the state
1530 * up.
1531 */
1532 struct task_struct *owner;
1533 int res;
1534
1535 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1536 res = fixup_pi_state_owner(uaddr, &q, owner);
1537
1538 WARN_ON(rt_mutex_owner(&q.pi_state->pi_mutex) !=
1539 owner);
1540
1541 /* propagate -EFAULT, if the fixup failed */
1542 if (res)
1543 ret = res;
1544 }
1514 } else { 1545 } else {
1515 /* 1546 /*
1516 * Paranoia check. If we did not take the lock 1547 * Paranoia check. If we did not take the lock
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 22a25142e4cf..bd5d6b5060bc 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
325} 325}
326#endif /* BITS_PER_LONG >= 64 */ 326#endif /* BITS_PER_LONG >= 64 */
327 327
328/*
329 * Check, whether the timer is on the callback pending list
330 */
331static inline int hrtimer_cb_pending(const struct hrtimer *timer)
332{
333 return timer->state & HRTIMER_STATE_PENDING;
334}
335
336/*
337 * Remove a timer from the callback pending list
338 */
339static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
340{
341 list_del_init(&timer->cb_entry);
342}
343
328/* High resolution timer related functions */ 344/* High resolution timer related functions */
329#ifdef CONFIG_HIGH_RES_TIMERS 345#ifdef CONFIG_HIGH_RES_TIMERS
330 346
@@ -494,29 +510,12 @@ void hres_timers_resume(void)
494} 510}
495 511
496/* 512/*
497 * Check, whether the timer is on the callback pending list
498 */
499static inline int hrtimer_cb_pending(const struct hrtimer *timer)
500{
501 return timer->state & HRTIMER_STATE_PENDING;
502}
503
504/*
505 * Remove a timer from the callback pending list
506 */
507static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
508{
509 list_del_init(&timer->cb_entry);
510}
511
512/*
513 * Initialize the high resolution related parts of cpu_base 513 * Initialize the high resolution related parts of cpu_base
514 */ 514 */
515static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) 515static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
516{ 516{
517 base->expires_next.tv64 = KTIME_MAX; 517 base->expires_next.tv64 = KTIME_MAX;
518 base->hres_active = 0; 518 base->hres_active = 0;
519 INIT_LIST_HEAD(&base->cb_pending);
520} 519}
521 520
522/* 521/*
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
524 */ 523 */
525static inline void hrtimer_init_timer_hres(struct hrtimer *timer) 524static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
526{ 525{
527 INIT_LIST_HEAD(&timer->cb_entry);
528} 526}
529 527
530/* 528/*
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
618{ 616{
619 return 0; 617 return 0;
620} 618}
621static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
622static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
623static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 619static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
624static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 620static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
621static inline int hrtimer_reprogram(struct hrtimer *timer,
622 struct hrtimer_clock_base *base)
623{
624 return 0;
625}
625 626
626#endif /* CONFIG_HIGH_RES_TIMERS */ 627#endif /* CONFIG_HIGH_RES_TIMERS */
627 628
@@ -850,6 +851,14 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
850#ifdef CONFIG_TIME_LOW_RES 851#ifdef CONFIG_TIME_LOW_RES
851 tim = ktime_add(tim, base->resolution); 852 tim = ktime_add(tim, base->resolution);
852#endif 853#endif
854 /*
855 * Careful here: User space might have asked for a
856 * very long sleep, so the add above might result in a
857 * negative number, which enqueues the timer in front
858 * of the queue.
859 */
860 if (tim.tv64 < 0)
861 tim.tv64 = KTIME_MAX;
853 } 862 }
854 timer->expires = tim; 863 timer->expires = tim;
855 864
@@ -993,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
993 clock_id = CLOCK_MONOTONIC; 1002 clock_id = CLOCK_MONOTONIC;
994 1003
995 timer->base = &cpu_base->clock_base[clock_id]; 1004 timer->base = &cpu_base->clock_base[clock_id];
1005 INIT_LIST_HEAD(&timer->cb_entry);
996 hrtimer_init_timer_hres(timer); 1006 hrtimer_init_timer_hres(timer);
997 1007
998#ifdef CONFIG_TIMER_STATS 1008#ifdef CONFIG_TIMER_STATS
@@ -1022,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1022} 1032}
1023EXPORT_SYMBOL_GPL(hrtimer_get_res); 1033EXPORT_SYMBOL_GPL(hrtimer_get_res);
1024 1034
1035static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1036{
1037 spin_lock_irq(&cpu_base->lock);
1038
1039 while (!list_empty(&cpu_base->cb_pending)) {
1040 enum hrtimer_restart (*fn)(struct hrtimer *);
1041 struct hrtimer *timer;
1042 int restart;
1043
1044 timer = list_entry(cpu_base->cb_pending.next,
1045 struct hrtimer, cb_entry);
1046
1047 timer_stats_account_hrtimer(timer);
1048
1049 fn = timer->function;
1050 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1051 spin_unlock_irq(&cpu_base->lock);
1052
1053 restart = fn(timer);
1054
1055 spin_lock_irq(&cpu_base->lock);
1056
1057 timer->state &= ~HRTIMER_STATE_CALLBACK;
1058 if (restart == HRTIMER_RESTART) {
1059 BUG_ON(hrtimer_active(timer));
1060 /*
1061 * Enqueue the timer, allow reprogramming of the event
1062 * device
1063 */
1064 enqueue_hrtimer(timer, timer->base, 1);
1065 } else if (hrtimer_active(timer)) {
1066 /*
1067 * If the timer was rearmed on another CPU, reprogram
1068 * the event device.
1069 */
1070 if (timer->base->first == &timer->node)
1071 hrtimer_reprogram(timer, timer->base);
1072 }
1073 }
1074 spin_unlock_irq(&cpu_base->lock);
1075}
1076
1077static void __run_hrtimer(struct hrtimer *timer)
1078{
1079 struct hrtimer_clock_base *base = timer->base;
1080 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1081 enum hrtimer_restart (*fn)(struct hrtimer *);
1082 int restart;
1083
1084 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1085 timer_stats_account_hrtimer(timer);
1086
1087 fn = timer->function;
1088 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
1089 /*
1090 * Used for scheduler timers, avoid lock inversion with
1091 * rq->lock and tasklist_lock.
1092 *
1093 * These timers are required to deal with enqueue expiry
1094 * themselves and are not allowed to migrate.
1095 */
1096 spin_unlock(&cpu_base->lock);
1097 restart = fn(timer);
1098 spin_lock(&cpu_base->lock);
1099 } else
1100 restart = fn(timer);
1101
1102 /*
1103 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
1104 * reprogramming of the event hardware. This happens at the end of this
1105 * function anyway.
1106 */
1107 if (restart != HRTIMER_NORESTART) {
1108 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1109 enqueue_hrtimer(timer, base, 0);
1110 }
1111 timer->state &= ~HRTIMER_STATE_CALLBACK;
1112}
1113
1025#ifdef CONFIG_HIGH_RES_TIMERS 1114#ifdef CONFIG_HIGH_RES_TIMERS
1026 1115
1027/* 1116/*
@@ -1079,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1079 continue; 1168 continue;
1080 } 1169 }
1081 1170
1082 __remove_hrtimer(timer, base, 1171 __run_hrtimer(timer);
1083 HRTIMER_STATE_CALLBACK, 0);
1084 timer_stats_account_hrtimer(timer);
1085
1086 /*
1087 * Note: We clear the CALLBACK bit after
1088 * enqueue_hrtimer to avoid reprogramming of
1089 * the event hardware. This happens at the end
1090 * of this function anyway.
1091 */
1092 if (timer->function(timer) != HRTIMER_NORESTART) {
1093 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1094 enqueue_hrtimer(timer, base, 0);
1095 }
1096 timer->state &= ~HRTIMER_STATE_CALLBACK;
1097 } 1172 }
1098 spin_unlock(&cpu_base->lock); 1173 spin_unlock(&cpu_base->lock);
1099 base++; 1174 base++;
@@ -1114,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1114 1189
1115static void run_hrtimer_softirq(struct softirq_action *h) 1190static void run_hrtimer_softirq(struct softirq_action *h)
1116{ 1191{
1117 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1192 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
1118 1193}
1119 spin_lock_irq(&cpu_base->lock);
1120
1121 while (!list_empty(&cpu_base->cb_pending)) {
1122 enum hrtimer_restart (*fn)(struct hrtimer *);
1123 struct hrtimer *timer;
1124 int restart;
1125
1126 timer = list_entry(cpu_base->cb_pending.next,
1127 struct hrtimer, cb_entry);
1128 1194
1129 timer_stats_account_hrtimer(timer); 1195#endif /* CONFIG_HIGH_RES_TIMERS */
1130 1196
1131 fn = timer->function; 1197/*
1132 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); 1198 * Called from timer softirq every jiffy, expire hrtimers:
1133 spin_unlock_irq(&cpu_base->lock); 1199 *
1200 * For HRT its the fall back code to run the softirq in the timer
1201 * softirq context in case the hrtimer initialization failed or has
1202 * not been done yet.
1203 */
1204void hrtimer_run_pending(void)
1205{
1206 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1134 1207
1135 restart = fn(timer); 1208 if (hrtimer_hres_active())
1209 return;
1136 1210
1137 spin_lock_irq(&cpu_base->lock); 1211 /*
1212 * This _is_ ugly: We have to check in the softirq context,
1213 * whether we can switch to highres and / or nohz mode. The
1214 * clocksource switch happens in the timer interrupt with
1215 * xtime_lock held. Notification from there only sets the
1216 * check bit in the tick_oneshot code, otherwise we might
1217 * deadlock vs. xtime_lock.
1218 */
1219 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1220 hrtimer_switch_to_hres();
1138 1221
1139 timer->state &= ~HRTIMER_STATE_CALLBACK; 1222 run_hrtimer_pending(cpu_base);
1140 if (restart == HRTIMER_RESTART) {
1141 BUG_ON(hrtimer_active(timer));
1142 /*
1143 * Enqueue the timer, allow reprogramming of the event
1144 * device
1145 */
1146 enqueue_hrtimer(timer, timer->base, 1);
1147 } else if (hrtimer_active(timer)) {
1148 /*
1149 * If the timer was rearmed on another CPU, reprogram
1150 * the event device.
1151 */
1152 if (timer->base->first == &timer->node)
1153 hrtimer_reprogram(timer, timer->base);
1154 }
1155 }
1156 spin_unlock_irq(&cpu_base->lock);
1157} 1223}
1158 1224
1159#endif /* CONFIG_HIGH_RES_TIMERS */
1160
1161/* 1225/*
1162 * Expire the per base hrtimer-queue: 1226 * Called from hardirq context every jiffy
1163 */ 1227 */
1164static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, 1228static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1165 int index) 1229 int index)
@@ -1173,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1173 if (base->get_softirq_time) 1237 if (base->get_softirq_time)
1174 base->softirq_time = base->get_softirq_time(); 1238 base->softirq_time = base->get_softirq_time();
1175 1239
1176 spin_lock_irq(&cpu_base->lock); 1240 spin_lock(&cpu_base->lock);
1177 1241
1178 while ((node = base->first)) { 1242 while ((node = base->first)) {
1179 struct hrtimer *timer; 1243 struct hrtimer *timer;
1180 enum hrtimer_restart (*fn)(struct hrtimer *);
1181 int restart;
1182 1244
1183 timer = rb_entry(node, struct hrtimer, node); 1245 timer = rb_entry(node, struct hrtimer, node);
1184 if (base->softirq_time.tv64 <= timer->expires.tv64) 1246 if (base->softirq_time.tv64 <= timer->expires.tv64)
1185 break; 1247 break;
1186 1248
1187#ifdef CONFIG_HIGH_RES_TIMERS 1249 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1188 WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); 1250 __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
1189#endif 1251 list_add_tail(&timer->cb_entry,
1190 timer_stats_account_hrtimer(timer); 1252 &base->cpu_base->cb_pending);
1191 1253 continue;
1192 fn = timer->function;
1193 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1194 spin_unlock_irq(&cpu_base->lock);
1195
1196 restart = fn(timer);
1197
1198 spin_lock_irq(&cpu_base->lock);
1199
1200 timer->state &= ~HRTIMER_STATE_CALLBACK;
1201 if (restart != HRTIMER_NORESTART) {
1202 BUG_ON(hrtimer_active(timer));
1203 enqueue_hrtimer(timer, base, 0);
1204 } 1254 }
1255
1256 __run_hrtimer(timer);
1205 } 1257 }
1206 spin_unlock_irq(&cpu_base->lock); 1258 spin_unlock(&cpu_base->lock);
1207} 1259}
1208 1260
1209/*
1210 * Called from timer softirq every jiffy, expire hrtimers:
1211 *
1212 * For HRT its the fall back code to run the softirq in the timer
1213 * softirq context in case the hrtimer initialization failed or has
1214 * not been done yet.
1215 */
1216void hrtimer_run_queues(void) 1261void hrtimer_run_queues(void)
1217{ 1262{
1218 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1263 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1221,18 +1266,6 @@ void hrtimer_run_queues(void)
1221 if (hrtimer_hres_active()) 1266 if (hrtimer_hres_active())
1222 return; 1267 return;
1223 1268
1224 /*
1225 * This _is_ ugly: We have to check in the softirq context,
1226 * whether we can switch to highres and / or nohz mode. The
1227 * clocksource switch happens in the timer interrupt with
1228 * xtime_lock held. Notification from there only sets the
1229 * check bit in the tick_oneshot code, otherwise we might
1230 * deadlock vs. xtime_lock.
1231 */
1232 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1233 if (hrtimer_switch_to_hres())
1234 return;
1235
1236 hrtimer_get_softirq_time(cpu_base); 1269 hrtimer_get_softirq_time(cpu_base);
1237 1270
1238 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1271 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
@@ -1260,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1260 sl->timer.function = hrtimer_wakeup; 1293 sl->timer.function = hrtimer_wakeup;
1261 sl->task = task; 1294 sl->task = task;
1262#ifdef CONFIG_HIGH_RES_TIMERS 1295#ifdef CONFIG_HIGH_RES_TIMERS
1263 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; 1296 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1264#endif 1297#endif
1265} 1298}
1266 1299
@@ -1271,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1271 do { 1304 do {
1272 set_current_state(TASK_INTERRUPTIBLE); 1305 set_current_state(TASK_INTERRUPTIBLE);
1273 hrtimer_start(&t->timer, t->timer.expires, mode); 1306 hrtimer_start(&t->timer, t->timer.expires, mode);
1307 if (!hrtimer_active(&t->timer))
1308 t->task = NULL;
1274 1309
1275 if (likely(t->task)) 1310 if (likely(t->task))
1276 schedule(); 1311 schedule();
@@ -1370,7 +1405,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1370/* 1405/*
1371 * Functions related to boot-time initialization: 1406 * Functions related to boot-time initialization:
1372 */ 1407 */
1373static void __devinit init_hrtimers_cpu(int cpu) 1408static void __cpuinit init_hrtimers_cpu(int cpu)
1374{ 1409{
1375 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1410 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1376 int i; 1411 int i;
@@ -1381,6 +1416,7 @@ static void __devinit init_hrtimers_cpu(int cpu)
1381 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1416 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1382 cpu_base->clock_base[i].cpu_base = cpu_base; 1417 cpu_base->clock_base[i].cpu_base = cpu_base;
1383 1418
1419 INIT_LIST_HEAD(&cpu_base->cb_pending);
1384 hrtimer_init_hres(cpu_base); 1420 hrtimer_init_hres(cpu_base);
1385} 1421}
1386 1422
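A minimal in-kernel user that is unaffected by the callback reshuffle above: the timer function still returns HRTIMER_RESTART or HRTIMER_NORESTART, and the core decides whether it runs from the hrtimer interrupt, the cb_pending list, or hrtimer_run_queues(). The names here are illustrative only:

	#include <linux/hrtimer.h>
	#include <linux/kernel.h>
	#include <linux/ktime.h>

	static struct hrtimer sample_timer;	/* hypothetical example timer */

	static enum hrtimer_restart sample_timer_fn(struct hrtimer *t)
	{
		printk(KERN_INFO "sample hrtimer fired\n");
		return HRTIMER_NORESTART;	/* one-shot; core clears CALLBACK state */
	}

	static void sample_timer_setup(void)
	{
		hrtimer_init(&sample_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		sample_timer.function = sample_timer_fn;
		/* fire once, one second from now */
		hrtimer_start(&sample_timer, ktime_set(1, 0), HRTIMER_MODE_REL);
	}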
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 9b5dff6b3f6a..44019ce30a14 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -297,18 +297,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
297 297
298 if (unlikely(desc->status & IRQ_INPROGRESS)) 298 if (unlikely(desc->status & IRQ_INPROGRESS))
299 goto out_unlock; 299 goto out_unlock;
300 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
300 kstat_cpu(cpu).irqs[irq]++; 301 kstat_cpu(cpu).irqs[irq]++;
301 302
302 action = desc->action; 303 action = desc->action;
303 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 304 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
304 if (desc->chip->mask)
305 desc->chip->mask(irq);
306 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
307 desc->status |= IRQ_PENDING;
308 goto out_unlock; 305 goto out_unlock;
309 }
310 306
311 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
312 desc->status |= IRQ_INPROGRESS; 307 desc->status |= IRQ_INPROGRESS;
313 spin_unlock(&desc->lock); 308 spin_unlock(&desc->lock);
314 309
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1f314221d534..438a01464287 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id)
479 return; 479 return;
480 } 480 }
481 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); 481 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
482#ifdef CONFIG_DEBUG_SHIRQ
483 dump_stack();
484#endif
482 spin_unlock_irqrestore(&desc->lock, flags); 485 spin_unlock_irqrestore(&desc->lock, flags);
483 return; 486 return;
484 } 487 }
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 50b81b98046a..c2f2ccb0549a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
75 75
76#endif 76#endif
77 77
78static int irq_spurious_read(char *page, char **start, off_t off,
79 int count, int *eof, void *data)
80{
81 struct irq_desc *d = &irq_desc[(long) data];
82 return sprintf(page, "count %u\n"
83 "unhandled %u\n"
84 "last_unhandled %u ms\n",
85 d->irq_count,
86 d->irqs_unhandled,
87 jiffies_to_msecs(d->last_unhandled));
88}
89
78#define MAX_NAMELEN 128 90#define MAX_NAMELEN 128
79 91
80static int name_unique(unsigned int irq, struct irqaction *new_action) 92static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
118void register_irq_proc(unsigned int irq) 130void register_irq_proc(unsigned int irq)
119{ 131{
120 char name [MAX_NAMELEN]; 132 char name [MAX_NAMELEN];
133 struct proc_dir_entry *entry;
121 134
122 if (!root_irq_dir || 135 if (!root_irq_dir ||
123 (irq_desc[irq].chip == &no_irq_chip) || 136 (irq_desc[irq].chip == &no_irq_chip) ||
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq)
132 145
133#ifdef CONFIG_SMP 146#ifdef CONFIG_SMP
134 { 147 {
135 struct proc_dir_entry *entry;
136
137 /* create /proc/irq/<irq>/smp_affinity */ 148 /* create /proc/irq/<irq>/smp_affinity */
138 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); 149 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
139 150
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq)
144 } 155 }
145 } 156 }
146#endif 157#endif
158
159 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
160 if (entry) {
161 entry->data = (void *)(long)irq;
162 entry->read_proc = irq_spurious_read;
163 }
147} 164}
148 165
149#undef MAX_NAMELEN 166#undef MAX_NAMELEN
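The new proc entry exposes per-IRQ spurious statistics in the three-line format produced by irq_spurious_read() above. A small userspace sketch that dumps it (path layout as created by register_irq_proc()):

	#include <stdio.h>

	static int dump_irq_spurious(int irq)
	{
		char path[64], line[128];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/irq/%d/spurious", irq);
		f = fopen(path, "r");
		if (!f)
			return -1;
		/* prints: count / unhandled / last_unhandled lines */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}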
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 32b161972fad..a6b2bc831dd0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/moduleparam.h>
13 14
14static int irqfixup __read_mostly; 15static int irqfixup __read_mostly;
15 16
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str)
225} 226}
226 227
227__setup("noirqdebug", noirqdebug_setup); 228__setup("noirqdebug", noirqdebug_setup);
229module_param(noirqdebug, bool, 0644);
230MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
228 231
229static int __init irqfixup_setup(char *str) 232static int __init irqfixup_setup(char *str)
230{ 233{
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str)
236} 239}
237 240
238__setup("irqfixup", irqfixup_setup); 241__setup("irqfixup", irqfixup_setup);
242module_param(irqfixup, int, 0644);
243MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode");
239 244
240static int __init irqpoll_setup(char *str) 245static int __init irqpoll_setup(char *str)
241{ 246{
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2fc25810509e..7dadc71ce516 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -233,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr,
233int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, 233int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
234 unsigned long *offset) 234 unsigned long *offset)
235{ 235{
236 char namebuf[KSYM_NAME_LEN];
236 if (is_ksym_addr(addr)) 237 if (is_ksym_addr(addr))
237 return !!get_symbol_pos(addr, symbolsize, offset); 238 return !!get_symbol_pos(addr, symbolsize, offset);
238 239
239 return !!module_address_lookup(addr, symbolsize, offset, NULL); 240 return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
240} 241}
241 242
242/* 243/*
@@ -251,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr,
251 unsigned long *offset, 252 unsigned long *offset,
252 char **modname, char *namebuf) 253 char **modname, char *namebuf)
253{ 254{
254 const char *msym;
255
256 namebuf[KSYM_NAME_LEN - 1] = 0; 255 namebuf[KSYM_NAME_LEN - 1] = 0;
257 namebuf[0] = 0; 256 namebuf[0] = 0;
258 257
@@ -268,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr,
268 } 267 }
269 268
270 /* see if it's in a module */ 269 /* see if it's in a module */
271 msym = module_address_lookup(addr, symbolsize, offset, modname); 270 return module_address_lookup(addr, symbolsize, offset, modname,
272 if (msym) 271 namebuf);
273 return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
274
275 return NULL; 272 return NULL;
276} 273}
277 274
diff --git a/kernel/kexec.c b/kernel/kexec.c
index aa74a1ef2da8..9a26eec9eb04 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1404,6 +1404,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1404 VMCOREINFO_OFFSET(list_head, next); 1404 VMCOREINFO_OFFSET(list_head, next);
1405 VMCOREINFO_OFFSET(list_head, prev); 1405 VMCOREINFO_OFFSET(list_head, prev);
1406 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1406 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1407 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1407 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1408 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1408 1409
1409 arch_crash_save_vmcoreinfo(); 1410 arch_crash_save_vmcoreinfo();
diff --git a/kernel/kmod.c b/kernel/kmod.c
index c6a4f8aebeba..bb7df2a28bd7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -451,13 +451,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
451 enum umh_wait wait) 451 enum umh_wait wait)
452{ 452{
453 DECLARE_COMPLETION_ONSTACK(done); 453 DECLARE_COMPLETION_ONSTACK(done);
454 int retval; 454 int retval = 0;
455 455
456 helper_lock(); 456 helper_lock();
457 if (sub_info->path[0] == '\0') { 457 if (sub_info->path[0] == '\0')
458 retval = 0;
459 goto out; 458 goto out;
460 }
461 459
462 if (!khelper_wq || usermodehelper_disabled) { 460 if (!khelper_wq || usermodehelper_disabled) {
463 retval = -EBUSY; 461 retval = -EBUSY;
@@ -468,13 +466,14 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
468 sub_info->wait = wait; 466 sub_info->wait = wait;
469 467
470 queue_work(khelper_wq, &sub_info->work); 468 queue_work(khelper_wq, &sub_info->work);
471 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 469 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
472 return 0; 470 goto unlock;
473 wait_for_completion(&done); 471 wait_for_completion(&done);
474 retval = sub_info->retval; 472 retval = sub_info->retval;
475 473
476 out: 474out:
477 call_usermodehelper_freeinfo(sub_info); 475 call_usermodehelper_freeinfo(sub_info);
476unlock:
478 helper_unlock(); 477 helper_unlock();
479 return retval; 478 return retval;
480} 479}
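For reference, a hedged sketch of a caller of the usermode-helper path touched above, assuming the four-argument call_usermodehelper() wrapper of this kernel series. With UMH_NO_WAIT the spawned work frees sub_info itself, which is why the hunk must skip call_usermodehelper_freeinfo() yet still reach helper_unlock():

	#include <linux/kmod.h>

	/* Illustrative only: run a helper binary and wait for it to exit. */
	static int run_helper_example(void)
	{
		char *argv[] = { "/bin/true", NULL };
		char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

		return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	}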
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e3a5d817ac9b..d0493eafea3e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -824,6 +824,8 @@ static int __init init_kprobes(void)
824 if (!err) 824 if (!err)
825 err = register_die_notifier(&kprobe_exceptions_nb); 825 err = register_die_notifier(&kprobe_exceptions_nb);
826 826
827 if (!err)
828 init_test_probes();
827 return err; 829 return err;
828} 830}
829 831
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 65daa5373ca6..e53bc30e9ba5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -17,30 +17,34 @@
17#include <linux/sched.h> 17#include <linux/sched.h>
18 18
19#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
20static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
21 21
22#define KERNEL_ATTR_RW(_name) \ 22#define KERNEL_ATTR_RW(_name) \
23static struct subsys_attribute _name##_attr = \ 23static struct kobj_attribute _name##_attr = \
24 __ATTR(_name, 0644, _name##_show, _name##_store) 24 __ATTR(_name, 0644, _name##_show, _name##_store)
25 25
26#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 26#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
27/* current uevent sequence number */ 27/* current uevent sequence number */
28static ssize_t uevent_seqnum_show(struct kset *kset, char *page) 28static ssize_t uevent_seqnum_show(struct kobject *kobj,
29 struct kobj_attribute *attr, char *buf)
29{ 30{
30 return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); 31 return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
31} 32}
32KERNEL_ATTR_RO(uevent_seqnum); 33KERNEL_ATTR_RO(uevent_seqnum);
33 34
 34/* uevent helper program, used during early boot */ 35/* uevent helper program, used during early boot */
35static ssize_t uevent_helper_show(struct kset *kset, char *page) 36static ssize_t uevent_helper_show(struct kobject *kobj,
37 struct kobj_attribute *attr, char *buf)
36{ 38{
37 return sprintf(page, "%s\n", uevent_helper); 39 return sprintf(buf, "%s\n", uevent_helper);
38} 40}
39static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) 41static ssize_t uevent_helper_store(struct kobject *kobj,
42 struct kobj_attribute *attr,
43 const char *buf, size_t count)
40{ 44{
41 if (count+1 > UEVENT_HELPER_PATH_LEN) 45 if (count+1 > UEVENT_HELPER_PATH_LEN)
42 return -ENOENT; 46 return -ENOENT;
43 memcpy(uevent_helper, page, count); 47 memcpy(uevent_helper, buf, count);
44 uevent_helper[count] = '\0'; 48 uevent_helper[count] = '\0';
45 if (count && uevent_helper[count-1] == '\n') 49 if (count && uevent_helper[count-1] == '\n')
46 uevent_helper[count-1] = '\0'; 50 uevent_helper[count-1] = '\0';
@@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper);
50#endif 54#endif
51 55
52#ifdef CONFIG_KEXEC 56#ifdef CONFIG_KEXEC
53static ssize_t kexec_loaded_show(struct kset *kset, char *page) 57static ssize_t kexec_loaded_show(struct kobject *kobj,
58 struct kobj_attribute *attr, char *buf)
54{ 59{
55 return sprintf(page, "%d\n", !!kexec_image); 60 return sprintf(buf, "%d\n", !!kexec_image);
56} 61}
57KERNEL_ATTR_RO(kexec_loaded); 62KERNEL_ATTR_RO(kexec_loaded);
58 63
59static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) 64static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
65 struct kobj_attribute *attr, char *buf)
60{ 66{
61 return sprintf(page, "%d\n", !!kexec_crash_image); 67 return sprintf(buf, "%d\n", !!kexec_crash_image);
62} 68}
63KERNEL_ATTR_RO(kexec_crash_loaded); 69KERNEL_ATTR_RO(kexec_crash_loaded);
64 70
65static ssize_t vmcoreinfo_show(struct kset *kset, char *page) 71static ssize_t vmcoreinfo_show(struct kobject *kobj,
72 struct kobj_attribute *attr, char *buf)
66{ 73{
67 return sprintf(page, "%lx %x\n", 74 return sprintf(buf, "%lx %x\n",
68 paddr_vmcoreinfo_note(), 75 paddr_vmcoreinfo_note(),
69 (unsigned int)vmcoreinfo_max_size); 76 (unsigned int)vmcoreinfo_max_size);
70} 77}
@@ -94,8 +101,8 @@ static struct bin_attribute notes_attr = {
94 .read = &notes_read, 101 .read = &notes_read,
95}; 102};
96 103
97decl_subsys(kernel, NULL, NULL); 104struct kobject *kernel_kobj;
98EXPORT_SYMBOL_GPL(kernel_subsys); 105EXPORT_SYMBOL_GPL(kernel_kobj);
99 106
100static struct attribute * kernel_attrs[] = { 107static struct attribute * kernel_attrs[] = {
101#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 108#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
@@ -116,24 +123,39 @@ static struct attribute_group kernel_attr_group = {
116 123
117static int __init ksysfs_init(void) 124static int __init ksysfs_init(void)
118{ 125{
119 int error = subsystem_register(&kernel_subsys); 126 int error;
120 if (!error)
121 error = sysfs_create_group(&kernel_subsys.kobj,
122 &kernel_attr_group);
123 127
124 if (!error && notes_size > 0) { 128 kernel_kobj = kobject_create_and_add("kernel", NULL);
125 notes_attr.size = notes_size; 129 if (!kernel_kobj) {
126 error = sysfs_create_bin_file(&kernel_subsys.kobj, 130 error = -ENOMEM;
127 &notes_attr); 131 goto exit;
128 } 132 }
133 error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
134 if (error)
135 goto kset_exit;
129 136
130 /* 137 if (notes_size > 0) {
131 * Create "/sys/kernel/uids" directory and corresponding root user's 138 notes_attr.size = notes_size;
132 * directory under it. 139 error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
133 */ 140 if (error)
134 if (!error) 141 goto group_exit;
135 error = uids_kobject_init(); 142 }
136 143
144 /* create the /sys/kernel/uids/ directory */
145 error = uids_sysfs_init();
146 if (error)
147 goto notes_exit;
148
149 return 0;
150
151notes_exit:
152 if (notes_size > 0)
153 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
154group_exit:
155 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
156kset_exit:
157 kobject_put(kernel_kobj);
158exit:
137 return error; 159 return error;
138} 160}
139 161
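
The ksysfs.c conversion above replaces struct subsys_attribute show/store callbacks with struct kobj_attribute ones and builds /sys/kernel from kobject_create_and_add() plus sysfs_create_group(), with a full unwind chain on failure. A minimal module-style sketch of the same pattern, assuming a kernel with this interface; the "example"/"value" names are made up and error unwinding is trimmed:

/*
 * Minimal sketch of the kobj_attribute pattern used above; creates
 * /sys/kernel/example/value.  Names are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/init.h>
#include <linux/module.h>

static int value;
static struct kobject *example_kobj;

static ssize_t value_show(struct kobject *kobj, struct kobj_attribute *attr,
                          char *buf)
{
    return sprintf(buf, "%d\n", value);
}

static ssize_t value_store(struct kobject *kobj, struct kobj_attribute *attr,
                           const char *buf, size_t count)
{
    sscanf(buf, "%d", &value);
    return count;
}

static struct kobj_attribute value_attr =
    __ATTR(value, 0644, value_show, value_store);

static struct attribute *example_attrs[] = {
    &value_attr.attr,
    NULL,
};

static struct attribute_group example_group = {
    .attrs = example_attrs,
};

static int __init example_init(void)
{
    example_kobj = kobject_create_and_add("example", kernel_kobj);
    if (!example_kobj)
        return -ENOMEM;
    return sysfs_create_group(example_kobj, &example_group);
}

static void __exit example_exit(void)
{
    kobject_put(example_kobj);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");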
diff --git a/kernel/kthread.c b/kernel/kthread.c
index dcfe724300eb..0ac887882f90 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -15,6 +15,8 @@
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18#define KTHREAD_NICE_LEVEL (-5)
19
18static DEFINE_SPINLOCK(kthread_create_lock); 20static DEFINE_SPINLOCK(kthread_create_lock);
19static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
20struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create)
94 if (pid < 0) { 96 if (pid < 0) {
95 create->result = ERR_PTR(pid); 97 create->result = ERR_PTR(pid);
96 } else { 98 } else {
99 struct sched_param param = { .sched_priority = 0 };
97 wait_for_completion(&create->started); 100 wait_for_completion(&create->started);
98 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
99 create->result = find_task_by_pid(pid); 102 create->result = find_task_by_pid(pid);
100 read_unlock(&tasklist_lock); 103 read_unlock(&tasklist_lock);
104 /*
105 * root may have changed our (kthreadd's) priority or CPU mask.
106 * The kernel thread should not inherit these properties.
107 */
108 sched_setscheduler(create->result, SCHED_NORMAL, &param);
109 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
110 set_cpus_allowed(create->result, CPU_MASK_ALL);
101 } 111 }
102 complete(&create->done); 112 complete(&create->done);
103} 113}
@@ -221,7 +231,7 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 231 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 232 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 233 ignore_signals(tsk);
224 set_user_nice(tsk, -5); 234 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed(tsk, CPU_MASK_ALL); 235 set_cpus_allowed(tsk, CPU_MASK_ALL);
226 236
227 current->flags |= PF_NOFREEZE; 237 current->flags |= PF_NOFREEZE;
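
The kthread.c hunk above resets the policy, nice level and CPU mask of each freshly created kernel thread, since root may have tuned kthreadd itself and those settings should not be inherited. A rough userspace analog of the same idea, resetting inherited scheduling attributes in a forked child (illustrative only; run as an unprivileged user the calls are effectively no-ops because the parent is already SCHED_OTHER):

/*
 * Userspace analog, not kernel code: the child undoes whatever
 * policy/affinity tuning the parent carried, instead of inheriting it.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid == 0) {
        struct sched_param param = { .sched_priority = 0 };
        cpu_set_t all;
        int cpu;

        /* back to the default policy, whatever the parent used */
        sched_setscheduler(0, SCHED_OTHER, &param);

        /* allow every CPU again (extra bits are ignored) */
        CPU_ZERO(&all);
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
            CPU_SET(cpu, &all);
        sched_setaffinity(0, sizeof(all), &all);

        printf("child: policy=%d\n", sched_getscheduler(0));
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    return 0;
}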
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
new file mode 100644
index 000000000000..b4e3c85abe74
--- /dev/null
+++ b/kernel/latencytop.c
@@ -0,0 +1,239 @@
1/*
2 * latencytop.c: Latency display infrastructure
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/latencytop.h>
13#include <linux/kallsyms.h>
14#include <linux/seq_file.h>
15#include <linux/notifier.h>
16#include <linux/spinlock.h>
17#include <linux/proc_fs.h>
18#include <linux/module.h>
19#include <linux/sched.h>
20#include <linux/list.h>
21#include <linux/slab.h>
22#include <linux/stacktrace.h>
23
24static DEFINE_SPINLOCK(latency_lock);
25
26#define MAXLR 128
27static struct latency_record latency_record[MAXLR];
28
29int latencytop_enabled;
30
31void clear_all_latency_tracing(struct task_struct *p)
32{
33 unsigned long flags;
34
35 if (!latencytop_enabled)
36 return;
37
38 spin_lock_irqsave(&latency_lock, flags);
39 memset(&p->latency_record, 0, sizeof(p->latency_record));
40 p->latency_record_count = 0;
41 spin_unlock_irqrestore(&latency_lock, flags);
42}
43
44static void clear_global_latency_tracing(void)
45{
46 unsigned long flags;
47
48 spin_lock_irqsave(&latency_lock, flags);
49 memset(&latency_record, 0, sizeof(latency_record));
50 spin_unlock_irqrestore(&latency_lock, flags);
51}
52
53static void __sched
54account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
55{
56 int firstnonnull = MAXLR + 1;
57 int i;
58
59 if (!latencytop_enabled)
60 return;
61
62 /* skip kernel threads for now */
63 if (!tsk->mm)
64 return;
65
66 for (i = 0; i < MAXLR; i++) {
67 int q;
68 int same = 1;
69 /* Nothing stored: */
70 if (!latency_record[i].backtrace[0]) {
71 if (firstnonnull > i)
72 firstnonnull = i;
73 continue;
74 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
76 if (latency_record[i].backtrace[q] !=
77 lat->backtrace[q])
78 same = 0;
79 if (same && lat->backtrace[q] == 0)
80 break;
81 if (same && lat->backtrace[q] == ULONG_MAX)
82 break;
83 }
84 if (same) {
85 latency_record[i].count++;
86 latency_record[i].time += lat->time;
87 if (lat->time > latency_record[i].max)
88 latency_record[i].max = lat->time;
89 return;
90 }
91 }
92
93 i = firstnonnull;
94 if (i >= MAXLR - 1)
95 return;
96
 97 /* Allocated a new one: */
98 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
99}
100
101static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
102{
103 struct stack_trace trace;
104
105 memset(&trace, 0, sizeof(trace));
106 trace.max_entries = LT_BACKTRACEDEPTH;
107 trace.entries = &lat->backtrace[0];
108 trace.skip = 0;
109 save_stack_trace_tsk(tsk, &trace);
110}
111
112void __sched
113account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
114{
115 unsigned long flags;
116 int i, q;
117 struct latency_record lat;
118
119 if (!latencytop_enabled)
120 return;
121
122 /* Long interruptible waits are generally user requested... */
123 if (inter && usecs > 5000)
124 return;
125
126 memset(&lat, 0, sizeof(lat));
127 lat.count = 1;
128 lat.time = usecs;
129 lat.max = usecs;
130 store_stacktrace(tsk, &lat);
131
132 spin_lock_irqsave(&latency_lock, flags);
133
134 account_global_scheduler_latency(tsk, &lat);
135
136 /*
137 * short term hack; if we're > 32 we stop; future we recycle:
138 */
139 tsk->latency_record_count++;
140 if (tsk->latency_record_count >= LT_SAVECOUNT)
141 goto out_unlock;
142
143 for (i = 0; i < LT_SAVECOUNT ; i++) {
144 struct latency_record *mylat;
145 int same = 1;
146 mylat = &tsk->latency_record[i];
147 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
148 if (mylat->backtrace[q] !=
149 lat.backtrace[q])
150 same = 0;
151 if (same && lat.backtrace[q] == 0)
152 break;
153 if (same && lat.backtrace[q] == ULONG_MAX)
154 break;
155 }
156 if (same) {
157 mylat->count++;
158 mylat->time += lat.time;
159 if (lat.time > mylat->max)
160 mylat->max = lat.time;
161 goto out_unlock;
162 }
163 }
164
165 /* Allocated a new one: */
166 i = tsk->latency_record_count;
167 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
168
169out_unlock:
170 spin_unlock_irqrestore(&latency_lock, flags);
171}
172
173static int lstats_show(struct seq_file *m, void *v)
174{
175 int i;
176
177 seq_puts(m, "Latency Top version : v0.1\n");
178
179 for (i = 0; i < MAXLR; i++) {
180 if (latency_record[i].backtrace[0]) {
181 int q;
182 seq_printf(m, "%i %li %li ",
183 latency_record[i].count,
184 latency_record[i].time,
185 latency_record[i].max);
186 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
187 char sym[KSYM_NAME_LEN];
188 char *c;
189 if (!latency_record[i].backtrace[q])
190 break;
191 if (latency_record[i].backtrace[q] == ULONG_MAX)
192 break;
193 sprint_symbol(sym, latency_record[i].backtrace[q]);
194 c = strchr(sym, '+');
195 if (c)
196 *c = 0;
197 seq_printf(m, "%s ", sym);
198 }
199 seq_printf(m, "\n");
200 }
201 }
202 return 0;
203}
204
205static ssize_t
206lstats_write(struct file *file, const char __user *buf, size_t count,
207 loff_t *offs)
208{
209 clear_global_latency_tracing();
210
211 return count;
212}
213
214static int lstats_open(struct inode *inode, struct file *filp)
215{
216 return single_open(filp, lstats_show, NULL);
217}
218
219static struct file_operations lstats_fops = {
220 .open = lstats_open,
221 .read = seq_read,
222 .write = lstats_write,
223 .llseek = seq_lseek,
224 .release = single_release,
225};
226
227static int __init init_lstats_procfs(void)
228{
229 struct proc_dir_entry *pe;
230
231 pe = create_proc_entry("latency_stats", 0644, NULL);
232 if (!pe)
233 return -ENOMEM;
234
235 pe->proc_fops = &lstats_fops;
236
237 return 0;
238}
239__initcall(init_lstats_procfs);
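
lstats_show() above writes one version header and then, per global record, the hit count, total and maximum latency in microseconds followed by the backtrace symbols; writing anything to the file clears the table via lstats_write(). A small reader for that format, assuming latency collection is enabled on the running kernel:

/*
 * Reader for the /proc/latency_stats format produced by lstats_show():
 * "<count> <total_usec> <max_usec> <sym> <sym> ...".  Sketch only;
 * field meanings are taken from the code above.
 */
#include <stdio.h>

int main(void)
{
    char line[512];
    FILE *f = fopen("/proc/latency_stats", "r");

    if (!f) {
        perror("/proc/latency_stats");
        return 1;
    }

    /* skip the "Latency Top version : v0.1" header */
    if (!fgets(line, sizeof(line), f))
        return 1;

    while (fgets(line, sizeof(line), f)) {
        long count, total, max;
        int consumed = 0;

        if (sscanf(line, "%ld %ld %ld %n",
                   &count, &total, &max, &consumed) < 3)
            continue;
        /* the rest of the line is the backtrace symbols */
        printf("hits=%ld total=%ldus max=%ldus  %s",
               count, total, max, line + consumed);
    }
    fclose(f);
    return 0;
}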
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0f389621bb6b..3574379f4d62 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2654,10 +2654,15 @@ static void check_flags(unsigned long flags)
2654 if (!debug_locks) 2654 if (!debug_locks)
2655 return; 2655 return;
2656 2656
2657 if (irqs_disabled_flags(flags)) 2657 if (irqs_disabled_flags(flags)) {
2658 DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled); 2658 if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
2659 else 2659 printk("possible reason: unannotated irqs-off.\n");
2660 DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled); 2660 }
2661 } else {
2662 if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
2663 printk("possible reason: unannotated irqs-on.\n");
2664 }
2665 }
2661 2666
2662 /* 2667 /*
2663 * We dont accurately track softirq state in e.g. 2668 * We dont accurately track softirq state in e.g.
@@ -2927,7 +2932,7 @@ static void zap_class(struct lock_class *class)
2927 2932
2928} 2933}
2929 2934
2930static inline int within(void *addr, void *start, unsigned long size) 2935static inline int within(const void *addr, void *start, unsigned long size)
2931{ 2936{
2932 return addr >= start && addr < start + size; 2937 return addr >= start && addr < start + size;
2933} 2938}
@@ -2938,9 +2943,10 @@ void lockdep_free_key_range(void *start, unsigned long size)
2938 struct list_head *head; 2943 struct list_head *head;
2939 unsigned long flags; 2944 unsigned long flags;
2940 int i; 2945 int i;
2946 int locked;
2941 2947
2942 raw_local_irq_save(flags); 2948 raw_local_irq_save(flags);
2943 graph_lock(); 2949 locked = graph_lock();
2944 2950
2945 /* 2951 /*
2946 * Unhash all classes that were created by this module: 2952 * Unhash all classes that were created by this module:
@@ -2949,12 +2955,16 @@ void lockdep_free_key_range(void *start, unsigned long size)
2949 head = classhash_table + i; 2955 head = classhash_table + i;
2950 if (list_empty(head)) 2956 if (list_empty(head))
2951 continue; 2957 continue;
2952 list_for_each_entry_safe(class, next, head, hash_entry) 2958 list_for_each_entry_safe(class, next, head, hash_entry) {
2953 if (within(class->key, start, size)) 2959 if (within(class->key, start, size))
2954 zap_class(class); 2960 zap_class(class);
2961 else if (within(class->name, start, size))
2962 zap_class(class);
2963 }
2955 } 2964 }
2956 2965
2957 graph_unlock(); 2966 if (locked)
2967 graph_unlock();
2958 raw_local_irq_restore(flags); 2968 raw_local_irq_restore(flags);
2959} 2969}
2960 2970
@@ -2964,6 +2974,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2964 struct list_head *head; 2974 struct list_head *head;
2965 unsigned long flags; 2975 unsigned long flags;
2966 int i, j; 2976 int i, j;
2977 int locked;
2967 2978
2968 raw_local_irq_save(flags); 2979 raw_local_irq_save(flags);
2969 2980
@@ -2982,7 +2993,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2982 * Debug check: in the end all mapped classes should 2993 * Debug check: in the end all mapped classes should
2983 * be gone. 2994 * be gone.
2984 */ 2995 */
2985 graph_lock(); 2996 locked = graph_lock();
2986 for (i = 0; i < CLASSHASH_SIZE; i++) { 2997 for (i = 0; i < CLASSHASH_SIZE; i++) {
2987 head = classhash_table + i; 2998 head = classhash_table + i;
2988 if (list_empty(head)) 2999 if (list_empty(head))
@@ -2995,7 +3006,8 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2995 } 3006 }
2996 } 3007 }
2997 } 3008 }
2998 graph_unlock(); 3009 if (locked)
3010 graph_unlock();
2999 3011
3000out_restore: 3012out_restore:
3001 raw_local_irq_restore(flags); 3013 raw_local_irq_restore(flags);
@@ -3194,7 +3206,11 @@ retry:
3194 3206
3195EXPORT_SYMBOL_GPL(debug_show_all_locks); 3207EXPORT_SYMBOL_GPL(debug_show_all_locks);
3196 3208
3197void debug_show_held_locks(struct task_struct *task) 3209/*
3210 * Careful: only use this function if you are sure that
3211 * the task cannot run in parallel!
3212 */
3213void __debug_show_held_locks(struct task_struct *task)
3198{ 3214{
3199 if (unlikely(!debug_locks)) { 3215 if (unlikely(!debug_locks)) {
3200 printk("INFO: lockdep is turned off.\n"); 3216 printk("INFO: lockdep is turned off.\n");
@@ -3202,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
3202 } 3218 }
3203 lockdep_print_held_locks(task); 3219 lockdep_print_held_locks(task);
3204} 3220}
3221EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3222
3223void debug_show_held_locks(struct task_struct *task)
3224{
3225 __debug_show_held_locks(task);
3226}
3205 3227
3206EXPORT_SYMBOL_GPL(debug_show_held_locks); 3228EXPORT_SYMBOL_GPL(debug_show_held_locks);
3207 3229
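
Both lockdep hunks above capture the return value of graph_lock() and only call graph_unlock() when the lock was actually taken, since graph_lock() can bail out once debug_locks has been cleared. A standalone sketch of that "unlock only if you locked" rule using a trylock (not lockdep code; names made up):

/*
 * Remember whether the lock attempt succeeded and only unlock in that
 * case, mirroring the locked/graph_unlock() handling above.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t graph_lock_m = PTHREAD_MUTEX_INITIALIZER;

static void zap_entries(void)
{
    int locked = (pthread_mutex_trylock(&graph_lock_m) == 0);

    /* ... walk and clean up the data structure here ... */

    if (locked)
        pthread_mutex_unlock(&graph_lock_m);
}

int main(void)
{
    zap_entries();
    printf("done\n");
    return 0;
}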
diff --git a/kernel/module.c b/kernel/module.c
index 91fe6958b6e1..bd60278ee703 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,8 +47,6 @@
47#include <asm/cacheflush.h> 47#include <asm/cacheflush.h>
48#include <linux/license.h> 48#include <linux/license.h>
49 49
50extern int module_sysfs_initialized;
51
52#if 0 50#if 0
53#define DEBUGP printk 51#define DEBUGP printk
54#else 52#else
@@ -67,6 +65,9 @@ extern int module_sysfs_initialized;
67static DEFINE_MUTEX(module_mutex); 65static DEFINE_MUTEX(module_mutex);
68static LIST_HEAD(modules); 66static LIST_HEAD(modules);
69 67
68/* Waiting for a module to finish initializing? */
69static DECLARE_WAIT_QUEUE_HEAD(module_wq);
70
70static BLOCKING_NOTIFIER_HEAD(module_notify_list); 71static BLOCKING_NOTIFIER_HEAD(module_notify_list);
71 72
72int register_module_notifier(struct notifier_block * nb) 73int register_module_notifier(struct notifier_block * nb)
@@ -86,8 +87,11 @@ EXPORT_SYMBOL(unregister_module_notifier);
86static inline int strong_try_module_get(struct module *mod) 87static inline int strong_try_module_get(struct module *mod)
87{ 88{
88 if (mod && mod->state == MODULE_STATE_COMING) 89 if (mod && mod->state == MODULE_STATE_COMING)
90 return -EBUSY;
91 if (try_module_get(mod))
89 return 0; 92 return 0;
90 return try_module_get(mod); 93 else
94 return -ENOENT;
91} 95}
92 96
93static inline void add_taint_module(struct module *mod, unsigned flag) 97static inline void add_taint_module(struct module *mod, unsigned flag)
@@ -426,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
426 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 430 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
427} 431}
428 432
433static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
434{
435 int cpu;
436
437 for_each_possible_cpu(cpu)
438 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
439}
440
429static int percpu_modinit(void) 441static int percpu_modinit(void)
430{ 442{
431 pcpu_num_used = 2; 443 pcpu_num_used = 2;
@@ -498,6 +510,8 @@ static struct module_attribute modinfo_##field = { \
498MODINFO_ATTR(version); 510MODINFO_ATTR(version);
499MODINFO_ATTR(srcversion); 511MODINFO_ATTR(srcversion);
500 512
513static char last_unloaded_module[MODULE_NAME_LEN+1];
514
501#ifdef CONFIG_MODULE_UNLOAD 515#ifdef CONFIG_MODULE_UNLOAD
502/* Init the unload section of the module. */ 516/* Init the unload section of the module. */
503static void module_unload_init(struct module *mod) 517static void module_unload_init(struct module *mod)
@@ -539,11 +553,21 @@ static int already_uses(struct module *a, struct module *b)
539static int use_module(struct module *a, struct module *b) 553static int use_module(struct module *a, struct module *b)
540{ 554{
541 struct module_use *use; 555 struct module_use *use;
542 int no_warn; 556 int no_warn, err;
543 557
544 if (b == NULL || already_uses(a, b)) return 1; 558 if (b == NULL || already_uses(a, b)) return 1;
545 559
546 if (!strong_try_module_get(b)) 560 /* If we're interrupted or time out, we fail. */
561 if (wait_event_interruptible_timeout(
562 module_wq, (err = strong_try_module_get(b)) != -EBUSY,
563 30 * HZ) <= 0) {
564 printk("%s: gave up waiting for init of module %s.\n",
565 a->name, b->name);
566 return 0;
567 }
568
569 /* If strong_try_module_get() returned a different error, we fail. */
570 if (err)
547 return 0; 571 return 0;
548 572
549 DEBUGP("Allocating new usage for %s.\n", a->name); 573 DEBUGP("Allocating new usage for %s.\n", a->name);
@@ -721,6 +745,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
721 mod->exit(); 745 mod->exit();
722 mutex_lock(&module_mutex); 746 mutex_lock(&module_mutex);
723 } 747 }
748 /* Store the name of the last unloaded module for diagnostic purposes */
749 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
724 free_module(mod); 750 free_module(mod);
725 751
726 out: 752 out:
@@ -814,7 +840,7 @@ static inline void module_unload_free(struct module *mod)
814 840
815static inline int use_module(struct module *a, struct module *b) 841static inline int use_module(struct module *a, struct module *b)
816{ 842{
817 return strong_try_module_get(b); 843 return strong_try_module_get(b) == 0;
818} 844}
819 845
820static inline void module_unload_init(struct module *mod) 846static inline void module_unload_init(struct module *mod)
@@ -1122,7 +1148,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1122 ++loaded; 1148 ++loaded;
1123 } 1149 }
1124 1150
1125 notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes"); 1151 notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
1126 if (!notes_attrs->dir) 1152 if (!notes_attrs->dir)
1127 goto out; 1153 goto out;
1128 1154
@@ -1212,6 +1238,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1212int mod_sysfs_init(struct module *mod) 1238int mod_sysfs_init(struct module *mod)
1213{ 1239{
1214 int err; 1240 int err;
1241 struct kobject *kobj;
1215 1242
1216 if (!module_sysfs_initialized) { 1243 if (!module_sysfs_initialized) {
1217 printk(KERN_ERR "%s: module sysfs not initialized\n", 1244 printk(KERN_ERR "%s: module sysfs not initialized\n",
@@ -1219,15 +1246,25 @@ int mod_sysfs_init(struct module *mod)
1219 err = -EINVAL; 1246 err = -EINVAL;
1220 goto out; 1247 goto out;
1221 } 1248 }
1222 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); 1249
1223 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); 1250 kobj = kset_find_obj(module_kset, mod->name);
1224 if (err) 1251 if (kobj) {
1252 printk(KERN_ERR "%s: module is already loaded\n", mod->name);
1253 kobject_put(kobj);
1254 err = -EINVAL;
1225 goto out; 1255 goto out;
1226 kobj_set_kset_s(&mod->mkobj, module_subsys); 1256 }
1257
1227 mod->mkobj.mod = mod; 1258 mod->mkobj.mod = mod;
1228 1259
1229 kobject_init(&mod->mkobj.kobj); 1260 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
1261 mod->mkobj.kobj.kset = module_kset;
1262 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
1263 "%s", mod->name);
1264 if (err)
1265 kobject_put(&mod->mkobj.kobj);
1230 1266
1267 /* delay uevent until full sysfs population */
1231out: 1268out:
1232 return err; 1269 return err;
1233} 1270}
@@ -1238,12 +1275,7 @@ int mod_sysfs_setup(struct module *mod,
1238{ 1275{
1239 int err; 1276 int err;
1240 1277
1241 /* delay uevent until full sysfs population */ 1278 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1242 err = kobject_add(&mod->mkobj.kobj);
1243 if (err)
1244 goto out;
1245
1246 mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders");
1247 if (!mod->holders_dir) { 1279 if (!mod->holders_dir) {
1248 err = -ENOMEM; 1280 err = -ENOMEM;
1249 goto out_unreg; 1281 goto out_unreg;
@@ -1263,11 +1295,9 @@ int mod_sysfs_setup(struct module *mod,
1263out_unreg_param: 1295out_unreg_param:
1264 module_param_sysfs_remove(mod); 1296 module_param_sysfs_remove(mod);
1265out_unreg_holders: 1297out_unreg_holders:
1266 kobject_unregister(mod->holders_dir); 1298 kobject_put(mod->holders_dir);
1267out_unreg: 1299out_unreg:
1268 kobject_del(&mod->mkobj.kobj);
1269 kobject_put(&mod->mkobj.kobj); 1300 kobject_put(&mod->mkobj.kobj);
1270out:
1271 return err; 1301 return err;
1272} 1302}
1273#endif 1303#endif
@@ -1276,9 +1306,20 @@ static void mod_kobject_remove(struct module *mod)
1276{ 1306{
1277 module_remove_modinfo_attrs(mod); 1307 module_remove_modinfo_attrs(mod);
1278 module_param_sysfs_remove(mod); 1308 module_param_sysfs_remove(mod);
1279 kobject_unregister(mod->mkobj.drivers_dir); 1309 kobject_put(mod->mkobj.drivers_dir);
1280 kobject_unregister(mod->holders_dir); 1310 kobject_put(mod->holders_dir);
1281 kobject_unregister(&mod->mkobj.kobj); 1311 kobject_put(&mod->mkobj.kobj);
1312}
1313
1314/*
1315 * link the module with the whole machine is stopped with interrupts off
1316 * - this defends against kallsyms not taking locks
1317 */
1318static int __link_module(void *_mod)
1319{
1320 struct module *mod = _mod;
1321 list_add(&mod->list, &modules);
1322 return 0;
1282} 1323}
1283 1324
1284/* 1325/*
@@ -1330,7 +1371,7 @@ void *__symbol_get(const char *symbol)
1330 1371
1331 preempt_disable(); 1372 preempt_disable();
1332 value = __find_symbol(symbol, &owner, &crc, 1); 1373 value = __find_symbol(symbol, &owner, &crc, 1);
1333 if (value && !strong_try_module_get(owner)) 1374 if (value && strong_try_module_get(owner) != 0)
1334 value = 0; 1375 value = 0;
1335 preempt_enable(); 1376 preempt_enable();
1336 1377
@@ -1884,16 +1925,16 @@ static struct module *load_module(void __user *umod,
1884 /* Now we've moved module, initialize linked lists, etc. */ 1925 /* Now we've moved module, initialize linked lists, etc. */
1885 module_unload_init(mod); 1926 module_unload_init(mod);
1886 1927
1887 /* Initialize kobject, so we can reference it. */ 1928 /* add kobject, so we can reference it. */
1888 err = mod_sysfs_init(mod); 1929 err = mod_sysfs_init(mod);
1889 if (err) 1930 if (err)
1890 goto cleanup; 1931 goto free_unload;
1891 1932
1892 /* Set up license info based on the info section */ 1933 /* Set up license info based on the info section */
1893 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1934 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1894 1935
1895 if (strcmp(mod->name, "ndiswrapper") == 0) 1936 if (strcmp(mod->name, "ndiswrapper") == 0)
1896 add_taint(TAINT_PROPRIETARY_MODULE); 1937 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1897 if (strcmp(mod->name, "driverloader") == 0) 1938 if (strcmp(mod->name, "driverloader") == 0)
1898 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1939 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1899 1940
@@ -2023,6 +2064,11 @@ static struct module *load_module(void __user *umod,
2023 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2064 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2024 mod->name); 2065 mod->name);
2025 2066
2067 /* Now sew it into the lists so we can get lockdep and oops
2068 * info during argument parsing. Noone should access us, since
2069 * strong_try_module_get() will fail. */
2070 stop_machine_run(__link_module, mod, NR_CPUS);
2071
2026 /* Size of section 0 is 0, so this works well if no params */ 2072 /* Size of section 0 is 0, so this works well if no params */
2027 err = parse_args(mod->name, mod->args, 2073 err = parse_args(mod->name, mod->args,
2028 (struct kernel_param *) 2074 (struct kernel_param *)
@@ -2031,7 +2077,7 @@ static struct module *load_module(void __user *umod,
2031 / sizeof(struct kernel_param), 2077 / sizeof(struct kernel_param),
2032 NULL); 2078 NULL);
2033 if (err < 0) 2079 if (err < 0)
2034 goto arch_cleanup; 2080 goto unlink;
2035 2081
2036 err = mod_sysfs_setup(mod, 2082 err = mod_sysfs_setup(mod,
2037 (struct kernel_param *) 2083 (struct kernel_param *)
@@ -2039,7 +2085,7 @@ static struct module *load_module(void __user *umod,
2039 sechdrs[setupindex].sh_size 2085 sechdrs[setupindex].sh_size
2040 / sizeof(struct kernel_param)); 2086 / sizeof(struct kernel_param));
2041 if (err < 0) 2087 if (err < 0)
2042 goto arch_cleanup; 2088 goto unlink;
2043 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2089 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2044 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2090 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2045 2091
@@ -2054,9 +2100,13 @@ static struct module *load_module(void __user *umod,
2054 /* Done! */ 2100 /* Done! */
2055 return mod; 2101 return mod;
2056 2102
2057 arch_cleanup: 2103 unlink:
2104 stop_machine_run(__unlink_module, mod, NR_CPUS);
2058 module_arch_cleanup(mod); 2105 module_arch_cleanup(mod);
2059 cleanup: 2106 cleanup:
2107 kobject_del(&mod->mkobj.kobj);
2108 kobject_put(&mod->mkobj.kobj);
2109 free_unload:
2060 module_unload_free(mod); 2110 module_unload_free(mod);
2061 module_free(mod, mod->module_init); 2111 module_free(mod, mod->module_init);
2062 free_core: 2112 free_core:
@@ -2076,17 +2126,6 @@ static struct module *load_module(void __user *umod,
2076 goto free_hdr; 2126 goto free_hdr;
2077} 2127}
2078 2128
2079/*
2080 * link the module with the whole machine is stopped with interrupts off
2081 * - this defends against kallsyms not taking locks
2082 */
2083static int __link_module(void *_mod)
2084{
2085 struct module *mod = _mod;
2086 list_add(&mod->list, &modules);
2087 return 0;
2088}
2089
2090/* This is where the real work happens */ 2129/* This is where the real work happens */
2091asmlinkage long 2130asmlinkage long
2092sys_init_module(void __user *umod, 2131sys_init_module(void __user *umod,
@@ -2111,10 +2150,6 @@ sys_init_module(void __user *umod,
2111 return PTR_ERR(mod); 2150 return PTR_ERR(mod);
2112 } 2151 }
2113 2152
2114 /* Now sew it into the lists. They won't access us, since
2115 strong_try_module_get() will fail. */
2116 stop_machine_run(__link_module, mod, NR_CPUS);
2117
2118 /* Drop lock so they can recurse */ 2153 /* Drop lock so they can recurse */
2119 mutex_unlock(&module_mutex); 2154 mutex_unlock(&module_mutex);
2120 2155
@@ -2133,6 +2168,7 @@ sys_init_module(void __user *umod,
2133 mutex_lock(&module_mutex); 2168 mutex_lock(&module_mutex);
2134 free_module(mod); 2169 free_module(mod);
2135 mutex_unlock(&module_mutex); 2170 mutex_unlock(&module_mutex);
2171 wake_up(&module_wq);
2136 return ret; 2172 return ret;
2137 } 2173 }
2138 2174
@@ -2147,6 +2183,7 @@ sys_init_module(void __user *umod,
2147 mod->init_size = 0; 2183 mod->init_size = 0;
2148 mod->init_text_size = 0; 2184 mod->init_text_size = 0;
2149 mutex_unlock(&module_mutex); 2185 mutex_unlock(&module_mutex);
2186 wake_up(&module_wq);
2150 2187
2151 return 0; 2188 return 0;
2152} 2189}
@@ -2211,32 +2248,41 @@ static const char *get_ksymbol(struct module *mod,
2211 return mod->strtab + mod->symtab[best].st_name; 2248 return mod->strtab + mod->symtab[best].st_name;
2212} 2249}
2213 2250
2214/* For kallsyms to ask for address resolution. NULL means not found. 2251/* For kallsyms to ask for address resolution. NULL means not found. Careful
2215 We don't lock, as this is used for oops resolution and races are a 2252 * not to lock to avoid deadlock on oopses, simply disable preemption. */
2216 lesser concern. */ 2253char *module_address_lookup(unsigned long addr,
2217const char *module_address_lookup(unsigned long addr, 2254 unsigned long *size,
2218 unsigned long *size, 2255 unsigned long *offset,
2219 unsigned long *offset, 2256 char **modname,
2220 char **modname) 2257 char *namebuf)
2221{ 2258{
2222 struct module *mod; 2259 struct module *mod;
2260 const char *ret = NULL;
2223 2261
2262 preempt_disable();
2224 list_for_each_entry(mod, &modules, list) { 2263 list_for_each_entry(mod, &modules, list) {
2225 if (within(addr, mod->module_init, mod->init_size) 2264 if (within(addr, mod->module_init, mod->init_size)
2226 || within(addr, mod->module_core, mod->core_size)) { 2265 || within(addr, mod->module_core, mod->core_size)) {
2227 if (modname) 2266 if (modname)
2228 *modname = mod->name; 2267 *modname = mod->name;
2229 return get_ksymbol(mod, addr, size, offset); 2268 ret = get_ksymbol(mod, addr, size, offset);
2269 break;
2230 } 2270 }
2231 } 2271 }
2232 return NULL; 2272 /* Make a copy in here where it's safe */
2273 if (ret) {
2274 strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
2275 ret = namebuf;
2276 }
2277 preempt_enable();
2278 return (char *)ret;
2233} 2279}
2234 2280
2235int lookup_module_symbol_name(unsigned long addr, char *symname) 2281int lookup_module_symbol_name(unsigned long addr, char *symname)
2236{ 2282{
2237 struct module *mod; 2283 struct module *mod;
2238 2284
2239 mutex_lock(&module_mutex); 2285 preempt_disable();
2240 list_for_each_entry(mod, &modules, list) { 2286 list_for_each_entry(mod, &modules, list) {
2241 if (within(addr, mod->module_init, mod->init_size) || 2287 if (within(addr, mod->module_init, mod->init_size) ||
2242 within(addr, mod->module_core, mod->core_size)) { 2288 within(addr, mod->module_core, mod->core_size)) {
@@ -2246,12 +2292,12 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2246 if (!sym) 2292 if (!sym)
2247 goto out; 2293 goto out;
2248 strlcpy(symname, sym, KSYM_NAME_LEN); 2294 strlcpy(symname, sym, KSYM_NAME_LEN);
2249 mutex_unlock(&module_mutex); 2295 preempt_enable();
2250 return 0; 2296 return 0;
2251 } 2297 }
2252 } 2298 }
2253out: 2299out:
2254 mutex_unlock(&module_mutex); 2300 preempt_enable();
2255 return -ERANGE; 2301 return -ERANGE;
2256} 2302}
2257 2303
@@ -2260,7 +2306,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2260{ 2306{
2261 struct module *mod; 2307 struct module *mod;
2262 2308
2263 mutex_lock(&module_mutex); 2309 preempt_disable();
2264 list_for_each_entry(mod, &modules, list) { 2310 list_for_each_entry(mod, &modules, list) {
2265 if (within(addr, mod->module_init, mod->init_size) || 2311 if (within(addr, mod->module_init, mod->init_size) ||
2266 within(addr, mod->module_core, mod->core_size)) { 2312 within(addr, mod->module_core, mod->core_size)) {
@@ -2273,12 +2319,12 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2273 strlcpy(modname, mod->name, MODULE_NAME_LEN); 2319 strlcpy(modname, mod->name, MODULE_NAME_LEN);
2274 if (name) 2320 if (name)
2275 strlcpy(name, sym, KSYM_NAME_LEN); 2321 strlcpy(name, sym, KSYM_NAME_LEN);
2276 mutex_unlock(&module_mutex); 2322 preempt_enable();
2277 return 0; 2323 return 0;
2278 } 2324 }
2279 } 2325 }
2280out: 2326out:
2281 mutex_unlock(&module_mutex); 2327 preempt_enable();
2282 return -ERANGE; 2328 return -ERANGE;
2283} 2329}
2284 2330
@@ -2287,7 +2333,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2287{ 2333{
2288 struct module *mod; 2334 struct module *mod;
2289 2335
2290 mutex_lock(&module_mutex); 2336 preempt_disable();
2291 list_for_each_entry(mod, &modules, list) { 2337 list_for_each_entry(mod, &modules, list) {
2292 if (symnum < mod->num_symtab) { 2338 if (symnum < mod->num_symtab) {
2293 *value = mod->symtab[symnum].st_value; 2339 *value = mod->symtab[symnum].st_value;
@@ -2296,12 +2342,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2296 KSYM_NAME_LEN); 2342 KSYM_NAME_LEN);
2297 strlcpy(module_name, mod->name, MODULE_NAME_LEN); 2343 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2298 *exported = is_exported(name, mod); 2344 *exported = is_exported(name, mod);
2299 mutex_unlock(&module_mutex); 2345 preempt_enable();
2300 return 0; 2346 return 0;
2301 } 2347 }
2302 symnum -= mod->num_symtab; 2348 symnum -= mod->num_symtab;
2303 } 2349 }
2304 mutex_unlock(&module_mutex); 2350 preempt_enable();
2305 return -ERANGE; 2351 return -ERANGE;
2306} 2352}
2307 2353
@@ -2324,6 +2370,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2324 unsigned long ret = 0; 2370 unsigned long ret = 0;
2325 2371
2326 /* Don't lock: we're in enough trouble already. */ 2372 /* Don't lock: we're in enough trouble already. */
2373 preempt_disable();
2327 if ((colon = strchr(name, ':')) != NULL) { 2374 if ((colon = strchr(name, ':')) != NULL) {
2328 *colon = '\0'; 2375 *colon = '\0';
2329 if ((mod = find_module(name)) != NULL) 2376 if ((mod = find_module(name)) != NULL)
@@ -2334,6 +2381,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2334 if ((ret = mod_find_symname(mod, name)) != 0) 2381 if ((ret = mod_find_symname(mod, name)) != 0)
2335 break; 2382 break;
2336 } 2383 }
2384 preempt_enable();
2337 return ret; 2385 return ret;
2338} 2386}
2339#endif /* CONFIG_KALLSYMS */ 2387#endif /* CONFIG_KALLSYMS */
@@ -2355,21 +2403,30 @@ static void m_stop(struct seq_file *m, void *p)
2355 mutex_unlock(&module_mutex); 2403 mutex_unlock(&module_mutex);
2356} 2404}
2357 2405
2358static char *taint_flags(unsigned int taints, char *buf) 2406static char *module_flags(struct module *mod, char *buf)
2359{ 2407{
2360 int bx = 0; 2408 int bx = 0;
2361 2409
2362 if (taints) { 2410 if (mod->taints ||
2411 mod->state == MODULE_STATE_GOING ||
2412 mod->state == MODULE_STATE_COMING) {
2363 buf[bx++] = '('; 2413 buf[bx++] = '(';
2364 if (taints & TAINT_PROPRIETARY_MODULE) 2414 if (mod->taints & TAINT_PROPRIETARY_MODULE)
2365 buf[bx++] = 'P'; 2415 buf[bx++] = 'P';
2366 if (taints & TAINT_FORCED_MODULE) 2416 if (mod->taints & TAINT_FORCED_MODULE)
2367 buf[bx++] = 'F'; 2417 buf[bx++] = 'F';
2368 /* 2418 /*
2369 * TAINT_FORCED_RMMOD: could be added. 2419 * TAINT_FORCED_RMMOD: could be added.
2370 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 2420 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
2371 * apply to modules. 2421 * apply to modules.
2372 */ 2422 */
2423
2424 /* Show a - for module-is-being-unloaded */
2425 if (mod->state == MODULE_STATE_GOING)
2426 buf[bx++] = '-';
2427 /* Show a + for module-is-being-loaded */
2428 if (mod->state == MODULE_STATE_COMING)
2429 buf[bx++] = '+';
2373 buf[bx++] = ')'; 2430 buf[bx++] = ')';
2374 } 2431 }
2375 buf[bx] = '\0'; 2432 buf[bx] = '\0';
@@ -2396,7 +2453,7 @@ static int m_show(struct seq_file *m, void *p)
2396 2453
2397 /* Taints info */ 2454 /* Taints info */
2398 if (mod->taints) 2455 if (mod->taints)
2399 seq_printf(m, " %s", taint_flags(mod->taints, buf)); 2456 seq_printf(m, " %s", module_flags(mod, buf));
2400 2457
2401 seq_printf(m, "\n"); 2458 seq_printf(m, "\n");
2402 return 0; 2459 return 0;
@@ -2491,97 +2548,12 @@ void print_modules(void)
2491 2548
2492 printk("Modules linked in:"); 2549 printk("Modules linked in:");
2493 list_for_each_entry(mod, &modules, list) 2550 list_for_each_entry(mod, &modules, list)
2494 printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); 2551 printk(" %s%s", mod->name, module_flags(mod, buf));
2552 if (last_unloaded_module[0])
2553 printk(" [last unloaded: %s]", last_unloaded_module);
2495 printk("\n"); 2554 printk("\n");
2496} 2555}
2497 2556
2498#ifdef CONFIG_SYSFS
2499static char *make_driver_name(struct device_driver *drv)
2500{
2501 char *driver_name;
2502
2503 driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2,
2504 GFP_KERNEL);
2505 if (!driver_name)
2506 return NULL;
2507
2508 sprintf(driver_name, "%s:%s", drv->bus->name, drv->name);
2509 return driver_name;
2510}
2511
2512static void module_create_drivers_dir(struct module_kobject *mk)
2513{
2514 if (!mk || mk->drivers_dir)
2515 return;
2516
2517 mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers");
2518}
2519
2520void module_add_driver(struct module *mod, struct device_driver *drv)
2521{
2522 char *driver_name;
2523 int no_warn;
2524 struct module_kobject *mk = NULL;
2525
2526 if (!drv)
2527 return;
2528
2529 if (mod)
2530 mk = &mod->mkobj;
2531 else if (drv->mod_name) {
2532 struct kobject *mkobj;
2533
2534 /* Lookup built-in module entry in /sys/modules */
2535 mkobj = kset_find_obj(&module_subsys, drv->mod_name);
2536 if (mkobj) {
2537 mk = container_of(mkobj, struct module_kobject, kobj);
2538 /* remember our module structure */
2539 drv->mkobj = mk;
2540 /* kset_find_obj took a reference */
2541 kobject_put(mkobj);
2542 }
2543 }
2544
2545 if (!mk)
2546 return;
2547
2548 /* Don't check return codes; these calls are idempotent */
2549 no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module");
2550 driver_name = make_driver_name(drv);
2551 if (driver_name) {
2552 module_create_drivers_dir(mk);
2553 no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj,
2554 driver_name);
2555 kfree(driver_name);
2556 }
2557}
2558EXPORT_SYMBOL(module_add_driver);
2559
2560void module_remove_driver(struct device_driver *drv)
2561{
2562 struct module_kobject *mk = NULL;
2563 char *driver_name;
2564
2565 if (!drv)
2566 return;
2567
2568 sysfs_remove_link(&drv->kobj, "module");
2569
2570 if (drv->owner)
2571 mk = &drv->owner->mkobj;
2572 else if (drv->mkobj)
2573 mk = drv->mkobj;
2574 if (mk && mk->drivers_dir) {
2575 driver_name = make_driver_name(drv);
2576 if (driver_name) {
2577 sysfs_remove_link(mk->drivers_dir, driver_name);
2578 kfree(driver_name);
2579 }
2580 }
2581}
2582EXPORT_SYMBOL(module_remove_driver);
2583#endif
2584
2585#ifdef CONFIG_MODVERSIONS 2557#ifdef CONFIG_MODVERSIONS
2586/* Generate the signature for struct module here, too, for modversions. */ 2558/* Generate the signature for struct module here, too, for modversions. */
2587void struct_module(struct module *mod) { return; } 2559void struct_module(struct module *mod) { return; }
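
Among the module.c changes above, module_address_lookup() now resolves the symbol and copies it into the caller-supplied namebuf while preemption is still disabled, instead of returning a pointer into module data (the matching kallsyms.c caller is at the top of this section). A userspace analog of copying out under the lock rather than leaking an internal pointer; the table and names are invented for the example:

/*
 * Resolve under the lock and copy into the caller's buffer, so no
 * pointer into protected data escapes the critical section.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define NAME_LEN 64

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct { unsigned long addr; const char *name; } table[] = {
    { 0x1000, "do_something" },
    { 0x2000, "do_something_else" },
};

static char *symbol_lookup(unsigned long addr, char *namebuf)
{
    char *ret = NULL;
    size_t i;

    pthread_mutex_lock(&table_lock);
    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (table[i].addr == addr) {
            /* copy while the entry is guaranteed to exist */
            strncpy(namebuf, table[i].name, NAME_LEN - 1);
            namebuf[NAME_LEN - 1] = '\0';
            ret = namebuf;
            break;
        }
    }
    pthread_mutex_unlock(&table_lock);
    return ret;
}

int main(void)
{
    char buf[NAME_LEN];

    printf("0x2000 -> %s\n", symbol_lookup(0x2000, buf) ? buf : "?");
    return 0;
}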
diff --git a/kernel/panic.c b/kernel/panic.c
index 6f6e03e91595..d9e90cfe3298 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -19,6 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h>
23#include <linux/kallsyms.h>
22 24
23int panic_on_oops; 25int panic_on_oops;
24int tainted; 26int tainted;
@@ -266,13 +268,52 @@ void oops_enter(void)
266} 268}
267 269
268/* 270/*
271 * 64-bit random ID for oopses:
272 */
273static u64 oops_id;
274
275static int init_oops_id(void)
276{
277 if (!oops_id)
278 get_random_bytes(&oops_id, sizeof(oops_id));
279
280 return 0;
281}
282late_initcall(init_oops_id);
283
284static void print_oops_end_marker(void)
285{
286 init_oops_id();
287 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
288 (unsigned long long)oops_id);
289}
290
291/*
269 * Called when the architecture exits its oops handler, after printing 292 * Called when the architecture exits its oops handler, after printing
270 * everything. 293 * everything.
271 */ 294 */
272void oops_exit(void) 295void oops_exit(void)
273{ 296{
274 do_oops_enter_exit(); 297 do_oops_enter_exit();
298 print_oops_end_marker();
299}
300
301#ifdef WANT_WARN_ON_SLOWPATH
302void warn_on_slowpath(const char *file, int line)
303{
304 char function[KSYM_SYMBOL_LEN];
305 unsigned long caller = (unsigned long) __builtin_return_address(0);
306 sprint_symbol(function, caller);
307
308 printk(KERN_WARNING "------------[ cut here ]------------\n");
309 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
310 line, function);
311 print_modules();
312 dump_stack();
313 print_oops_end_marker();
275} 314}
315EXPORT_SYMBOL(warn_on_slowpath);
316#endif
276 317
277#ifdef CONFIG_CC_STACKPROTECTOR 318#ifdef CONFIG_CC_STACKPROTECTOR
278/* 319/*
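
panic.c above adds a lazily seeded 64-bit oops_id that is printed as an end-of-trace marker, plus warn_on_slowpath() so WARN_ON() sites report file, line and the resolved caller. A userspace sketch of the lazy-id-plus-marker part; the format string mirrors print_oops_end_marker(), everything else is made up:

/*
 * Lazily seed a 64-bit id on first use and stamp every trace with it,
 * like init_oops_id()/print_oops_end_marker() above.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t oops_id;

static void init_oops_id(void)
{
    if (!oops_id) {
        FILE *f = fopen("/dev/urandom", "r");

        if (f) {
            if (fread(&oops_id, sizeof(oops_id), 1, f) != 1)
                oops_id = 1;    /* fall back to something non-zero */
            fclose(f);
        }
    }
}

static void print_oops_end_marker(void)
{
    init_oops_id();
    printf("---[ end trace %016llx ]---\n",
           (unsigned long long)oops_id);
}

int main(void)
{
    print_oops_end_marker();
    print_oops_end_marker();    /* same id both times */
    return 0;
}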
diff --git a/kernel/params.c b/kernel/params.c
index 2a4c51487e72..42fe5e6126c0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp)
376 376
377extern struct kernel_param __start___param[], __stop___param[]; 377extern struct kernel_param __start___param[], __stop___param[];
378 378
379#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
380
381struct param_attribute 379struct param_attribute
382{ 380{
383 struct module_attribute mattr; 381 struct module_attribute mattr;
@@ -472,7 +470,7 @@ param_sysfs_setup(struct module_kobject *mk,
472 sizeof(mp->grp.attrs[0])); 470 sizeof(mp->grp.attrs[0]));
473 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); 471 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
474 472
475 mp = kmalloc(size[0] + size[1], GFP_KERNEL); 473 mp = kzalloc(size[0] + size[1], GFP_KERNEL);
476 if (!mp) 474 if (!mp)
477 return ERR_PTR(-ENOMEM); 475 return ERR_PTR(-ENOMEM);
478 476
@@ -560,11 +558,10 @@ static void __init kernel_param_sysfs_setup(const char *name,
560 BUG_ON(!mk); 558 BUG_ON(!mk);
561 559
562 mk->mod = THIS_MODULE; 560 mk->mod = THIS_MODULE;
563 kobj_set_kset_s(mk, module_subsys); 561 mk->kobj.kset = module_kset;
564 kobject_set_name(&mk->kobj, name); 562 ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
565 kobject_init(&mk->kobj);
566 ret = kobject_add(&mk->kobj);
567 if (ret) { 563 if (ret) {
564 kobject_put(&mk->kobj);
568 printk(KERN_ERR "Module '%s' failed to be added to sysfs, " 565 printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
569 "error number %d\n", name, ret); 566 "error number %d\n", name, ret);
570 printk(KERN_ERR "The system will be unstable now.\n"); 567 printk(KERN_ERR "The system will be unstable now.\n");
@@ -588,7 +585,7 @@ static void __init param_sysfs_builtin(void)
588{ 585{
589 struct kernel_param *kp, *kp_begin = NULL; 586 struct kernel_param *kp, *kp_begin = NULL;
590 unsigned int i, name_len, count = 0; 587 unsigned int i, name_len, count = 0;
591 char modname[MAX_KBUILD_MODNAME + 1] = ""; 588 char modname[MODULE_NAME_LEN + 1] = "";
592 589
593 for (i=0; i < __stop___param - __start___param; i++) { 590 for (i=0; i < __stop___param - __start___param; i++) {
594 char *dot; 591 char *dot;
@@ -596,12 +593,12 @@ static void __init param_sysfs_builtin(void)
596 593
597 kp = &__start___param[i]; 594 kp = &__start___param[i];
598 max_name_len = 595 max_name_len =
599 min_t(size_t, MAX_KBUILD_MODNAME, strlen(kp->name)); 596 min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
600 597
601 dot = memchr(kp->name, '.', max_name_len); 598 dot = memchr(kp->name, '.', max_name_len);
602 if (!dot) { 599 if (!dot) {
603 DEBUGP("couldn't find period in first %d characters " 600 DEBUGP("couldn't find period in first %d characters "
604 "of %s\n", MAX_KBUILD_MODNAME, kp->name); 601 "of %s\n", MODULE_NAME_LEN, kp->name);
605 continue; 602 continue;
606 } 603 }
607 name_len = dot - kp->name; 604 name_len = dot - kp->name;
@@ -679,8 +676,6 @@ static struct sysfs_ops module_sysfs_ops = {
679 .store = module_attr_store, 676 .store = module_attr_store,
680}; 677};
681 678
682static struct kobj_type module_ktype;
683
684static int uevent_filter(struct kset *kset, struct kobject *kobj) 679static int uevent_filter(struct kset *kset, struct kobject *kobj)
685{ 680{
686 struct kobj_type *ktype = get_ktype(kobj); 681 struct kobj_type *ktype = get_ktype(kobj);
@@ -694,10 +689,10 @@ static struct kset_uevent_ops module_uevent_ops = {
694 .filter = uevent_filter, 689 .filter = uevent_filter,
695}; 690};
696 691
697decl_subsys(module, &module_ktype, &module_uevent_ops); 692struct kset *module_kset;
698int module_sysfs_initialized; 693int module_sysfs_initialized;
699 694
700static struct kobj_type module_ktype = { 695struct kobj_type module_ktype = {
701 .sysfs_ops = &module_sysfs_ops, 696 .sysfs_ops = &module_sysfs_ops,
702}; 697};
703 698
@@ -706,13 +701,11 @@ static struct kobj_type module_ktype = {
706 */ 701 */
707static int __init param_sysfs_init(void) 702static int __init param_sysfs_init(void)
708{ 703{
709 int ret; 704 module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
710 705 if (!module_kset) {
711 ret = subsystem_register(&module_subsys); 706 printk(KERN_WARNING "%s (%d): error creating kset\n",
712 if (ret < 0) { 707 __FILE__, __LINE__);
713 printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", 708 return -ENOMEM;
714 __FILE__, __LINE__, ret);
715 return ret;
716 } 709 }
717 module_sysfs_initialized = 1; 710 module_sysfs_initialized = 1;
718 711
@@ -722,14 +715,7 @@ static int __init param_sysfs_init(void)
722} 715}
723subsys_initcall(param_sysfs_init); 716subsys_initcall(param_sysfs_init);
724 717
725#else 718#endif /* CONFIG_SYSFS */
726#if 0
727static struct sysfs_ops module_sysfs_ops = {
728 .show = NULL,
729 .store = NULL,
730};
731#endif
732#endif
733 719
734EXPORT_SYMBOL(param_set_byte); 720EXPORT_SYMBOL(param_set_byte);
735EXPORT_SYMBOL(param_get_byte); 721EXPORT_SYMBOL(param_get_byte);
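
params.c above drops the declared module subsystem in favour of a kset created at boot with kset_create_and_add(), and per-module kobjects are initialised against module_ktype with kobject_init_and_add() (see mod_sysfs_init() in the module.c hunk). A compressed module-style sketch of that kset-plus-kobject pairing, assuming the same API; the "demo" names are hypothetical and no attributes are attached:

/*
 * Creates /sys/kernel/demo/ as a kset and one object "first" inside it,
 * following the put-on-failure rule used by mod_sysfs_init() above.
 */
#include <linux/kobject.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/module.h>

struct demo_obj {
    struct kobject kobj;
};

static struct kset *demo_kset;
static struct demo_obj *first;

static void demo_release(struct kobject *kobj)
{
    kfree(container_of(kobj, struct demo_obj, kobj));
}

static struct kobj_type demo_ktype = {
    .release = demo_release,
};

static int __init demo_init(void)
{
    int err;

    demo_kset = kset_create_and_add("demo", NULL, kernel_kobj);
    if (!demo_kset)
        return -ENOMEM;

    first = kzalloc(sizeof(*first), GFP_KERNEL);
    if (!first) {
        kset_unregister(demo_kset);
        return -ENOMEM;
    }

    first->kobj.kset = demo_kset;
    err = kobject_init_and_add(&first->kobj, &demo_ktype, NULL, "first");
    if (err) {
        /* same rule as mod_sysfs_init(): put the kobject on failure */
        kobject_put(&first->kobj);
        kset_unregister(demo_kset);
    }
    return err;
}

static void __exit demo_exit(void)
{
    kobject_put(&first->kobj);
    kset_unregister(demo_kset);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");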
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 68c96376e84a..0b7c82ac467e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk,
967{ 967{
968 int maxfire; 968 int maxfire;
969 struct list_head *timers = tsk->cpu_timers; 969 struct list_head *timers = tsk->cpu_timers;
970 struct signal_struct *const sig = tsk->signal;
970 971
971 maxfire = 20; 972 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 973 tsk->it_prof_expires = cputime_zero;
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk,
1011 t->firing = 1; 1012 t->firing = 1;
1012 list_move_tail(&t->entry, firing); 1013 list_move_tail(&t->entry, firing);
1013 } 1014 }
1015
1016 /*
1017 * Check for the special case thread timers.
1018 */
1019 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
1020 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
1021 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
1022
1023 if (hard != RLIM_INFINITY &&
1024 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
1025 /*
1026 * At the hard limit, we just die.
1027 * No need to calculate anything else now.
1028 */
1029 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1030 return;
1031 }
1032 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
1033 /*
1034 * At the soft limit, send a SIGXCPU every second.
1035 */
1036 if (sig->rlim[RLIMIT_RTTIME].rlim_cur
1037 < sig->rlim[RLIMIT_RTTIME].rlim_max) {
1038 sig->rlim[RLIMIT_RTTIME].rlim_cur +=
1039 USEC_PER_SEC;
1040 }
1041 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1042 }
1043 }
1014} 1044}
1015 1045
1016/* 1046/*
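
The new check in check_thread_timers() enforces RLIMIT_RTTIME for realtime tasks: once the soft limit is exceeded the task gets SIGXCPU once a second (the soft limit is bumped by USEC_PER_SEC each time), and past the hard limit it is killed with SIGKILL. A small test program, assuming a kernel with this patch and permission to use SCHED_FIFO; the limit is expressed in microseconds of busy realtime execution, and note that a FIFO busy loop can starve its CPU until the hard limit fires:

/*
 * Exercise the RLIMIT_RTTIME enforcement added above.  If the libc
 * headers predate the feature, RLIMIT_RTTIME is 15 on Linux.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

#ifndef RLIMIT_RTTIME
#define RLIMIT_RTTIME 15
#endif

static volatile sig_atomic_t xcpu_count;

static void on_xcpu(int sig)
{
    xcpu_count++;
}

int main(void)
{
    struct sched_param sp = { .sched_priority = 1 };
    /* soft limit 0.5s, hard limit 2s of busy RT time, in microseconds */
    struct rlimit rl = { .rlim_cur = 500000, .rlim_max = 2000000 };
    int printed = 0;

    signal(SIGXCPU, on_xcpu);

    if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0) {
        perror("sched_setscheduler");
        return 1;
    }
    if (setrlimit(RLIMIT_RTTIME, &rl) < 0) {
        perror("setrlimit");
        return 1;
    }

    /* busy loop without sleeping: SIGXCPU after ~0.5s, SIGKILL at ~2s */
    for (;;) {
        if (xcpu_count != printed) {
            printed = xcpu_count;
            fprintf(stderr, "got SIGXCPU #%d\n", printed);
        }
    }
    return 0;
}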
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 05b64790fe83..b138b431e271 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -567,7 +567,8 @@ static const char * const hibernation_modes[] = {
567 * supports it (as determined by having hibernation_ops). 567 * supports it (as determined by having hibernation_ops).
568 */ 568 */
569 569
570static ssize_t disk_show(struct kset *kset, char *buf) 570static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
571 char *buf)
571{ 572{
572 int i; 573 int i;
573 char *start = buf; 574 char *start = buf;
@@ -597,7 +598,8 @@ static ssize_t disk_show(struct kset *kset, char *buf)
597} 598}
598 599
599 600
600static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) 601static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
602 const char *buf, size_t n)
601{ 603{
602 int error = 0; 604 int error = 0;
603 int i; 605 int i;
@@ -642,13 +644,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
642 644
643power_attr(disk); 645power_attr(disk);
644 646
645static ssize_t resume_show(struct kset *kset, char *buf) 647static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
648 char *buf)
646{ 649{
647 return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), 650 return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
648 MINOR(swsusp_resume_device)); 651 MINOR(swsusp_resume_device));
649} 652}
650 653
651static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) 654static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
655 const char *buf, size_t n)
652{ 656{
653 unsigned int maj, min; 657 unsigned int maj, min;
654 dev_t res; 658 dev_t res;
@@ -674,12 +678,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
674 678
675power_attr(resume); 679power_attr(resume);
676 680
677static ssize_t image_size_show(struct kset *kset, char *buf) 681static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
682 char *buf)
678{ 683{
679 return sprintf(buf, "%lu\n", image_size); 684 return sprintf(buf, "%lu\n", image_size);
680} 685}
681 686
682static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) 687static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
688 const char *buf, size_t n)
683{ 689{
684 unsigned long size; 690 unsigned long size;
685 691
@@ -708,7 +714,7 @@ static struct attribute_group attr_group = {
708 714
709static int __init pm_disk_init(void) 715static int __init pm_disk_init(void)
710{ 716{
711 return sysfs_create_group(&power_subsys.kobj, &attr_group); 717 return sysfs_create_group(power_kobj, &attr_group);
712} 718}
713 719
714core_initcall(pm_disk_init); 720core_initcall(pm_disk_init);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3cdf95b1dc92..efc08360e627 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -28,6 +28,9 @@ BLOCKING_NOTIFIER_HEAD(pm_chain_head);
28 28
29DEFINE_MUTEX(pm_mutex); 29DEFINE_MUTEX(pm_mutex);
30 30
31unsigned int pm_flags;
32EXPORT_SYMBOL(pm_flags);
33
31#ifdef CONFIG_SUSPEND 34#ifdef CONFIG_SUSPEND
32 35
33/* This is just an arbitrary number */ 36/* This is just an arbitrary number */
@@ -273,8 +276,7 @@ EXPORT_SYMBOL(pm_suspend);
273 276
274#endif /* CONFIG_SUSPEND */ 277#endif /* CONFIG_SUSPEND */
275 278
276decl_subsys(power,NULL,NULL); 279struct kobject *power_kobj;
277
278 280
279/** 281/**
280 * state - control system power state. 282 * state - control system power state.
@@ -287,7 +289,8 @@ decl_subsys(power,NULL,NULL);
287 * proper enumerated value, and initiates a suspend transition. 289 * proper enumerated value, and initiates a suspend transition.
288 */ 290 */
289 291
290static ssize_t state_show(struct kset *kset, char *buf) 292static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
293 char *buf)
291{ 294{
292 char *s = buf; 295 char *s = buf;
293#ifdef CONFIG_SUSPEND 296#ifdef CONFIG_SUSPEND
@@ -308,7 +311,8 @@ static ssize_t state_show(struct kset *kset, char *buf)
308 return (s - buf); 311 return (s - buf);
309} 312}
310 313
311static ssize_t state_store(struct kset *kset, const char *buf, size_t n) 314static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
315 const char *buf, size_t n)
312{ 316{
313#ifdef CONFIG_SUSPEND 317#ifdef CONFIG_SUSPEND
314 suspend_state_t state = PM_SUSPEND_STANDBY; 318 suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -345,13 +349,15 @@ power_attr(state);
345#ifdef CONFIG_PM_TRACE 349#ifdef CONFIG_PM_TRACE
346int pm_trace_enabled; 350int pm_trace_enabled;
347 351
348static ssize_t pm_trace_show(struct kset *kset, char *buf) 352static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
353 char *buf)
349{ 354{
350 return sprintf(buf, "%d\n", pm_trace_enabled); 355 return sprintf(buf, "%d\n", pm_trace_enabled);
351} 356}
352 357
353static ssize_t 358static ssize_t
354pm_trace_store(struct kset *kset, const char *buf, size_t n) 359pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
360 const char *buf, size_t n)
355{ 361{
356 int val; 362 int val;
357 363
@@ -383,10 +389,10 @@ static struct attribute_group attr_group = {
383 389
384static int __init pm_init(void) 390static int __init pm_init(void)
385{ 391{
386 int error = subsystem_register(&power_subsys); 392 power_kobj = kobject_create_and_add("power", NULL);
387 if (!error) 393 if (!power_kobj)
388 error = sysfs_create_group(&power_subsys.kobj,&attr_group); 394 return -ENOMEM;
389 return error; 395 return sysfs_create_group(power_kobj, &attr_group);
390} 396}
391 397
392core_initcall(pm_init); 398core_initcall(pm_init);
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index c50d15266c10..60c73fa670d5 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -27,8 +27,6 @@
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29 29
30int pm_active;
31
32/* 30/*
33 * Locking notes: 31 * Locking notes:
34 * pm_devs_lock can be a semaphore providing pm ops are not called 32 * pm_devs_lock can be a semaphore providing pm ops are not called
@@ -204,6 +202,4 @@ int pm_send_all(pm_request_t rqst, void *data)
204 202
205EXPORT_SYMBOL(pm_register); 203EXPORT_SYMBOL(pm_register);
206EXPORT_SYMBOL(pm_send_all); 204EXPORT_SYMBOL(pm_send_all);
207EXPORT_SYMBOL(pm_active);
208
209 205
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 195dc4611764..2093c3a9a994 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -54,7 +54,7 @@ extern int pfn_is_nosave(unsigned long);
54extern struct mutex pm_mutex; 54extern struct mutex pm_mutex;
55 55
56#define power_attr(_name) \ 56#define power_attr(_name) \
57static struct subsys_attribute _name##_attr = { \ 57static struct kobj_attribute _name##_attr = { \
58 .attr = { \ 58 .attr = { \
59 .name = __stringify(_name), \ 59 .name = __stringify(_name), \
60 .mode = 0644, \ 60 .mode = 0644, \
@@ -63,8 +63,6 @@ static struct subsys_attribute _name##_attr = { \
63 .store = _name##_store, \ 63 .store = _name##_store, \
64} 64}
65 65
66extern struct kset power_subsys;
67
68/* Preferred image size in bytes (default 500 MB) */ 66/* Preferred image size in bytes (default 500 MB) */
69extern unsigned long image_size; 67extern unsigned long image_size;
70extern int in_suspend; 68extern int in_suspend;
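
Illustration only (not in the patch): with the new macro definition above, an invocation such as power_attr(state) now expands to a kobj_attribute roughly as follows. The .show initializer sits in the elided part of the hunk and is inferred from the .store line that is visible.

    /* Approximate expansion of power_attr(state) under the new macro. */
    static struct kobj_attribute state_attr = {
            .attr   = {
                    .name = "state",
                    .mode = 0644,
            },
            .show   = state_show,
            .store  = state_store,
    };
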
diff --git a/kernel/printk.c b/kernel/printk.c
index a30fe33de395..29ae1e99cde0 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -36,6 +36,13 @@
36 36
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/*
40 * Architectures can override it:
41 */
42void __attribute__((weak)) early_printk(const char *fmt, ...)
43{
44}
45
39#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 46#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
40 47
41/* printk's without a loglevel use this.. */ 48/* printk's without a loglevel use this.. */
@@ -448,10 +455,10 @@ static int __init ignore_loglevel_setup(char *str)
448 ignore_loglevel = 1; 455 ignore_loglevel = 1;
449 printk(KERN_INFO "debug: ignoring loglevel setting.\n"); 456 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
450 457
451 return 1; 458 return 0;
452} 459}
453 460
454__setup("ignore_loglevel", ignore_loglevel_setup); 461early_param("ignore_loglevel", ignore_loglevel_setup);
455 462
456/* 463/*
457 * Write out chars from start to end - 1 inclusive 464 * Write out chars from start to end - 1 inclusive
@@ -573,11 +580,6 @@ static int __init printk_time_setup(char *str)
573 580
574__setup("time", printk_time_setup); 581__setup("time", printk_time_setup);
575 582
576__attribute__((weak)) unsigned long long printk_clock(void)
577{
578 return sched_clock();
579}
580
581/* Check if we have any console registered that can be called early in boot. */ 583/* Check if we have any console registered that can be called early in boot. */
582static int have_callable_console(void) 584static int have_callable_console(void)
583{ 585{
@@ -628,30 +630,57 @@ asmlinkage int printk(const char *fmt, ...)
628/* cpu currently holding logbuf_lock */ 630/* cpu currently holding logbuf_lock */
629static volatile unsigned int printk_cpu = UINT_MAX; 631static volatile unsigned int printk_cpu = UINT_MAX;
630 632
633const char printk_recursion_bug_msg [] =
634 KERN_CRIT "BUG: recent printk recursion!\n";
635static int printk_recursion_bug;
636
631asmlinkage int vprintk(const char *fmt, va_list args) 637asmlinkage int vprintk(const char *fmt, va_list args)
632{ 638{
639 static int log_level_unknown = 1;
640 static char printk_buf[1024];
641
633 unsigned long flags; 642 unsigned long flags;
634 int printed_len; 643 int printed_len = 0;
644 int this_cpu;
635 char *p; 645 char *p;
636 static char printk_buf[1024];
637 static int log_level_unknown = 1;
638 646
639 boot_delay_msec(); 647 boot_delay_msec();
640 648
641 preempt_disable(); 649 preempt_disable();
642 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
643 /* If a crash is occurring during printk() on this CPU,
644 * make sure we can't deadlock */
645 zap_locks();
646
647 /* This stops the holder of console_sem just where we want him */ 650 /* This stops the holder of console_sem just where we want him */
648 raw_local_irq_save(flags); 651 raw_local_irq_save(flags);
652 this_cpu = smp_processor_id();
653
654 /*
655 * Ouch, printk recursed into itself!
656 */
657 if (unlikely(printk_cpu == this_cpu)) {
658 /*
659 * If a crash is occurring during printk() on this CPU,
660 * then try to get the crash message out but make sure
661 * we can't deadlock. Otherwise just return to avoid the
662 * recursion - but flag the recursion so that it can be
663 * printed at the next appropriate moment:
664 */
665 if (!oops_in_progress) {
666 printk_recursion_bug = 1;
667 goto out_restore_irqs;
668 }
669 zap_locks();
670 }
671
649 lockdep_off(); 672 lockdep_off();
650 spin_lock(&logbuf_lock); 673 spin_lock(&logbuf_lock);
651 printk_cpu = smp_processor_id(); 674 printk_cpu = this_cpu;
652 675
676 if (printk_recursion_bug) {
677 printk_recursion_bug = 0;
678 strcpy(printk_buf, printk_recursion_bug_msg);
679 printed_len = sizeof(printk_recursion_bug_msg);
680 }
653 /* Emit the output into the temporary buffer */ 681 /* Emit the output into the temporary buffer */
654 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 682 printed_len += vscnprintf(printk_buf + printed_len,
683 sizeof(printk_buf), fmt, args);
655 684
656 /* 685 /*
657 * Copy the output into log_buf. If the caller didn't provide 686 * Copy the output into log_buf. If the caller didn't provide
@@ -680,7 +709,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
680 loglev_char = default_message_loglevel 709 loglev_char = default_message_loglevel
681 + '0'; 710 + '0';
682 } 711 }
683 t = printk_clock(); 712 t = cpu_clock(printk_cpu);
684 nanosec_rem = do_div(t, 1000000000); 713 nanosec_rem = do_div(t, 1000000000);
685 tlen = sprintf(tbuf, 714 tlen = sprintf(tbuf,
686 "<%c>[%5lu.%06lu] ", 715 "<%c>[%5lu.%06lu] ",
@@ -744,6 +773,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
744 printk_cpu = UINT_MAX; 773 printk_cpu = UINT_MAX;
745 spin_unlock(&logbuf_lock); 774 spin_unlock(&logbuf_lock);
746 lockdep_on(); 775 lockdep_on();
776out_restore_irqs:
747 raw_local_irq_restore(flags); 777 raw_local_irq_restore(flags);
748 } 778 }
749 779
@@ -817,7 +847,7 @@ __setup("console=", console_setup);
817 * commonly to provide a default console (ie from PROM variables) when 847 * commonly to provide a default console (ie from PROM variables) when
818 * the user has not supplied one. 848 * the user has not supplied one.
819 */ 849 */
820int __init add_preferred_console(char *name, int idx, char *options) 850int add_preferred_console(char *name, int idx, char *options)
821{ 851{
822 struct console_cmdline *c; 852 struct console_cmdline *c;
823 int i; 853 int i;
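
Since printk.c now provides a weak early_printk() stub (first hunk above), an architecture can supply its own definition. The following is a hedged sketch, not taken from any architecture; early_serial_putc() is a hypothetical low-level output routine and the buffer size is arbitrary.

    /* Hedged sketch of an architecture-side early_printk() override. */
    #include <linux/kernel.h>

    static void early_serial_putc(char c)
    {
            /* platform-specific: write 'c' to a debug UART; left empty here */
    }

    void early_printk(const char *fmt, ...)
    {
            char buf[512];
            va_list ap;
            int i, n;

            va_start(ap, fmt);
            n = vscnprintf(buf, sizeof(buf), fmt, ap);
            va_end(ap);

            for (i = 0; i < n; i++)
                    early_serial_putc(buf[i]);
    }
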
diff --git a/kernel/profile.c b/kernel/profile.c
index 5e95330e5120..e64c2da11c0f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
52static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
53#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
54 54
55static int __init profile_setup(char * str) 55static int __init profile_setup(char *str)
56{ 56{
57 static char __initdata schedstr[] = "schedule"; 57 static char __initdata schedstr[] = "schedule";
58 static char __initdata sleepstr[] = "sleep"; 58 static char __initdata sleepstr[] = "sleep";
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup);
104 104
105void __init profile_init(void) 105void __init profile_init(void)
106{ 106{
107 if (!prof_on) 107 if (!prof_on)
108 return; 108 return;
109 109
110 /* only text is profiled */ 110 /* only text is profiled */
111 prof_len = (_etext - _stext) >> prof_shift; 111 prof_len = (_etext - _stext) >> prof_shift;
112 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 112 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
113} 113}
114 114
115/* Profile event notifications */ 115/* Profile event notifications */
116 116
117#ifdef CONFIG_PROFILING 117#ifdef CONFIG_PROFILING
118 118
119static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); 119static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
120static ATOMIC_NOTIFIER_HEAD(task_free_notifier); 120static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
121static BLOCKING_NOTIFIER_HEAD(munmap_notifier); 121static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
122 122
123void profile_task_exit(struct task_struct * task) 123void profile_task_exit(struct task_struct *task)
124{ 124{
125 blocking_notifier_call_chain(&task_exit_notifier, 0, task); 125 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
126} 126}
127 127
128int profile_handoff_task(struct task_struct * task) 128int profile_handoff_task(struct task_struct *task)
129{ 129{
130 int ret; 130 int ret;
131 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); 131 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr)
137 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); 137 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
138} 138}
139 139
140int task_handoff_register(struct notifier_block * n) 140int task_handoff_register(struct notifier_block *n)
141{ 141{
142 return atomic_notifier_chain_register(&task_free_notifier, n); 142 return atomic_notifier_chain_register(&task_free_notifier, n);
143} 143}
144EXPORT_SYMBOL_GPL(task_handoff_register);
144 145
145int task_handoff_unregister(struct notifier_block * n) 146int task_handoff_unregister(struct notifier_block *n)
146{ 147{
147 return atomic_notifier_chain_unregister(&task_free_notifier, n); 148 return atomic_notifier_chain_unregister(&task_free_notifier, n);
148} 149}
150EXPORT_SYMBOL_GPL(task_handoff_unregister);
149 151
150int profile_event_register(enum profile_type type, struct notifier_block * n) 152int profile_event_register(enum profile_type type, struct notifier_block *n)
151{ 153{
152 int err = -EINVAL; 154 int err = -EINVAL;
153 155
154 switch (type) { 156 switch (type) {
155 case PROFILE_TASK_EXIT: 157 case PROFILE_TASK_EXIT:
156 err = blocking_notifier_chain_register( 158 err = blocking_notifier_chain_register(
157 &task_exit_notifier, n); 159 &task_exit_notifier, n);
158 break; 160 break;
159 case PROFILE_MUNMAP: 161 case PROFILE_MUNMAP:
160 err = blocking_notifier_chain_register( 162 err = blocking_notifier_chain_register(
161 &munmap_notifier, n); 163 &munmap_notifier, n);
162 break; 164 break;
163 } 165 }
164 166
165 return err; 167 return err;
166} 168}
169EXPORT_SYMBOL_GPL(profile_event_register);
167 170
168 171int profile_event_unregister(enum profile_type type, struct notifier_block *n)
169int profile_event_unregister(enum profile_type type, struct notifier_block * n)
170{ 172{
171 int err = -EINVAL; 173 int err = -EINVAL;
172 174
173 switch (type) { 175 switch (type) {
174 case PROFILE_TASK_EXIT: 176 case PROFILE_TASK_EXIT:
175 err = blocking_notifier_chain_unregister( 177 err = blocking_notifier_chain_unregister(
176 &task_exit_notifier, n); 178 &task_exit_notifier, n);
177 break; 179 break;
178 case PROFILE_MUNMAP: 180 case PROFILE_MUNMAP:
179 err = blocking_notifier_chain_unregister( 181 err = blocking_notifier_chain_unregister(
180 &munmap_notifier, n); 182 &munmap_notifier, n);
181 break; 183 break;
182 } 184 }
183 185
184 return err; 186 return err;
185} 187}
188EXPORT_SYMBOL_GPL(profile_event_unregister);
186 189
187int register_timer_hook(int (*hook)(struct pt_regs *)) 190int register_timer_hook(int (*hook)(struct pt_regs *))
188{ 191{
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *))
191 timer_hook = hook; 194 timer_hook = hook;
192 return 0; 195 return 0;
193} 196}
197EXPORT_SYMBOL_GPL(register_timer_hook);
194 198
195void unregister_timer_hook(int (*hook)(struct pt_regs *)) 199void unregister_timer_hook(int (*hook)(struct pt_regs *))
196{ 200{
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
199 /* make sure all CPUs see the NULL hook */ 203 /* make sure all CPUs see the NULL hook */
200 synchronize_sched(); /* Allow ongoing interrupts to complete. */ 204 synchronize_sched(); /* Allow ongoing interrupts to complete. */
201} 205}
202
203EXPORT_SYMBOL_GPL(register_timer_hook);
204EXPORT_SYMBOL_GPL(unregister_timer_hook); 206EXPORT_SYMBOL_GPL(unregister_timer_hook);
205EXPORT_SYMBOL_GPL(task_handoff_register);
206EXPORT_SYMBOL_GPL(task_handoff_unregister);
207EXPORT_SYMBOL_GPL(profile_event_register);
208EXPORT_SYMBOL_GPL(profile_event_unregister);
209 207
210#endif /* CONFIG_PROFILING */ 208#endif /* CONFIG_PROFILING */
211 209
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
366 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); 364 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
367 } 365 }
368 break; 366 break;
369 out_free: 367out_free:
370 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 368 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
371 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 369 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
372 __free_page(page); 370 __free_page(page);
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits)
409 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 407 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
410} 408}
411#endif /* !CONFIG_SMP */ 409#endif /* !CONFIG_SMP */
412
413EXPORT_SYMBOL_GPL(profile_hits); 410EXPORT_SYMBOL_GPL(profile_hits);
414 411
415void profile_tick(int type) 412void profile_tick(int type)
@@ -427,7 +424,7 @@ void profile_tick(int type)
427#include <asm/uaccess.h> 424#include <asm/uaccess.h>
428#include <asm/ptrace.h> 425#include <asm/ptrace.h>
429 426
430static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, 427static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
431 int count, int *eof, void *data) 428 int count, int *eof, void *data)
432{ 429{
433 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); 430 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
437 return len; 434 return len;
438} 435}
439 436
440static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, 437static int prof_cpu_mask_write_proc(struct file *file,
441 unsigned long count, void *data) 438 const char __user *buffer, unsigned long count, void *data)
442{ 439{
443 cpumask_t *mask = (cpumask_t *)data; 440 cpumask_t *mask = (cpumask_t *)data;
444 unsigned long full_count = count, err; 441 unsigned long full_count = count, err;
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
457 struct proc_dir_entry *entry; 454 struct proc_dir_entry *entry;
458 455
459 /* create /proc/irq/prof_cpu_mask */ 456 /* create /proc/irq/prof_cpu_mask */
460 if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) 457 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
458 if (!entry)
461 return; 459 return;
462 entry->data = (void *)&prof_cpu_mask; 460 entry->data = (void *)&prof_cpu_mask;
463 entry->read_proc = prof_cpu_mask_read_proc; 461 entry->read_proc = prof_cpu_mask_read_proc;
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
475{ 473{
476 unsigned long p = *ppos; 474 unsigned long p = *ppos;
477 ssize_t read; 475 ssize_t read;
478 char * pnt; 476 char *pnt;
479 unsigned int sample_step = 1 << prof_shift; 477 unsigned int sample_step = 1 << prof_shift;
480 478
481 profile_flip_buffers(); 479 profile_flip_buffers();
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
486 read = 0; 484 read = 0;
487 485
488 while (p < sizeof(unsigned int) && count > 0) { 486 while (p < sizeof(unsigned int) && count > 0) {
489 if (put_user(*((char *)(&sample_step)+p),buf)) 487 if (put_user(*((char *)(&sample_step)+p), buf))
490 return -EFAULT; 488 return -EFAULT;
491 buf++; p++; count--; read++; 489 buf++; p++; count--; read++;
492 } 490 }
493 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 491 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
494 if (copy_to_user(buf,(void *)pnt,count)) 492 if (copy_to_user(buf, (void *)pnt, count))
495 return -EFAULT; 493 return -EFAULT;
496 read += count; 494 read += count;
497 *ppos += read; 495 *ppos += read;
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
508 size_t count, loff_t *ppos) 506 size_t count, loff_t *ppos)
509{ 507{
510#ifdef CONFIG_SMP 508#ifdef CONFIG_SMP
511 extern int setup_profiling_timer (unsigned int multiplier); 509 extern int setup_profiling_timer(unsigned int multiplier);
512 510
513 if (count == sizeof(int)) { 511 if (count == sizeof(int)) {
514 unsigned int multiplier; 512 unsigned int multiplier;
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void)
591 return 0; 589 return 0;
592 if (create_hash_tables()) 590 if (create_hash_tables())
593 return -1; 591 return -1;
594 if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) 592 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
593 if (!entry)
595 return 0; 594 return 0;
596 entry->proc_fops = &proc_profile_operations; 595 entry->proc_fops = &proc_profile_operations;
597 entry->size = (1+prof_len) * sizeof(atomic_t); 596 entry->size = (1+prof_len) * sizeof(atomic_t);
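
With the EXPORT_SYMBOL_GPL annotations now placed next to their functions, a module can hook the profiling notifier chains. A hedged sketch of a PROFILE_TASK_EXIT notifier follows (not part of the patch); all my_* names are hypothetical.

    /* Hedged sketch: module registering a task-exit profiling notifier. */
    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/notifier.h>
    #include <linux/profile.h>
    #include <linux/sched.h>

    static int my_task_exit(struct notifier_block *nb, unsigned long val,
                            void *data)
    {
            struct task_struct *task = data;

            printk(KERN_INFO "task %d is exiting\n", task->pid);
            return NOTIFY_OK;
    }

    static struct notifier_block my_exit_nb = {
            .notifier_call = my_task_exit,
    };

    static int __init my_profile_init(void)
    {
            return profile_event_register(PROFILE_TASK_EXIT, &my_exit_nb);
    }

    static void __exit my_profile_exit(void)
    {
            profile_event_unregister(PROFILE_TASK_EXIT, &my_exit_nb);
    }

    module_init(my_profile_init);
    module_exit(my_profile_exit);
    MODULE_LICENSE("GPL");
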
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 26f9923baddc..b0d4ab4dfd3d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -120,7 +120,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 return ret; 120 return ret;
121} 121}
122 122
123static int may_attach(struct task_struct *task) 123int __ptrace_may_attach(struct task_struct *task)
124{ 124{
125 /* May we inspect the given task? 125 /* May we inspect the given task?
126 * This check is used both for attaching with ptrace 126 * This check is used both for attaching with ptrace
@@ -154,7 +154,7 @@ int ptrace_may_attach(struct task_struct *task)
154{ 154{
155 int err; 155 int err;
156 task_lock(task); 156 task_lock(task);
157 err = may_attach(task); 157 err = __ptrace_may_attach(task);
158 task_unlock(task); 158 task_unlock(task);
159 return !err; 159 return !err;
160} 160}
@@ -196,7 +196,7 @@ repeat:
196 /* the same process cannot be attached many times */ 196 /* the same process cannot be attached many times */
197 if (task->ptrace & PT_PTRACED) 197 if (task->ptrace & PT_PTRACED)
198 goto bad; 198 goto bad;
199 retval = may_attach(task); 199 retval = __ptrace_may_attach(task);
200 if (retval) 200 if (retval)
201 goto bad; 201 goto bad;
202 202
@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
366 return error; 366 return error;
367} 367}
368 368
369
370#ifdef PTRACE_SINGLESTEP
371#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
372#else
373#define is_singlestep(request) 0
374#endif
375
376#ifdef PTRACE_SINGLEBLOCK
377#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
378#else
379#define is_singleblock(request) 0
380#endif
381
382#ifdef PTRACE_SYSEMU
383#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP)
384#else
385#define is_sysemu_singlestep(request) 0
386#endif
387
388static int ptrace_resume(struct task_struct *child, long request, long data)
389{
390 if (!valid_signal(data))
391 return -EIO;
392
393 if (request == PTRACE_SYSCALL)
394 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
395 else
396 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
397
398#ifdef TIF_SYSCALL_EMU
399 if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
400 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
401 else
402 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
403#endif
404
405 if (is_singleblock(request)) {
406 if (unlikely(!arch_has_block_step()))
407 return -EIO;
408 user_enable_block_step(child);
409 } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
410 if (unlikely(!arch_has_single_step()))
411 return -EIO;
412 user_enable_single_step(child);
413 }
414 else
415 user_disable_single_step(child);
416
417 child->exit_code = data;
418 wake_up_process(child);
419
420 return 0;
421}
422
369int ptrace_request(struct task_struct *child, long request, 423int ptrace_request(struct task_struct *child, long request,
370 long addr, long data) 424 long addr, long data)
371{ 425{
372 int ret = -EIO; 426 int ret = -EIO;
373 427
374 switch (request) { 428 switch (request) {
429 case PTRACE_PEEKTEXT:
430 case PTRACE_PEEKDATA:
431 return generic_ptrace_peekdata(child, addr, data);
432 case PTRACE_POKETEXT:
433 case PTRACE_POKEDATA:
434 return generic_ptrace_pokedata(child, addr, data);
435
375#ifdef PTRACE_OLDSETOPTIONS 436#ifdef PTRACE_OLDSETOPTIONS
376 case PTRACE_OLDSETOPTIONS: 437 case PTRACE_OLDSETOPTIONS:
377#endif 438#endif
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request,
390 case PTRACE_DETACH: /* detach a process that was attached. */ 451 case PTRACE_DETACH: /* detach a process that was attached. */
391 ret = ptrace_detach(child, data); 452 ret = ptrace_detach(child, data);
392 break; 453 break;
454
455#ifdef PTRACE_SINGLESTEP
456 case PTRACE_SINGLESTEP:
457#endif
458#ifdef PTRACE_SINGLEBLOCK
459 case PTRACE_SINGLEBLOCK:
460#endif
461#ifdef PTRACE_SYSEMU
462 case PTRACE_SYSEMU:
463 case PTRACE_SYSEMU_SINGLESTEP:
464#endif
465 case PTRACE_SYSCALL:
466 case PTRACE_CONT:
467 return ptrace_resume(child, request, data);
468
469 case PTRACE_KILL:
470 if (child->exit_state) /* already dead */
471 return 0;
472 return ptrace_resume(child, request, SIGKILL);
473
393 default: 474 default:
394 break; 475 break;
395 } 476 }
@@ -470,6 +551,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
470 lock_kernel(); 551 lock_kernel();
471 if (request == PTRACE_TRACEME) { 552 if (request == PTRACE_TRACEME) {
472 ret = ptrace_traceme(); 553 ret = ptrace_traceme();
554 if (!ret)
555 arch_ptrace_attach(current);
473 goto out; 556 goto out;
474 } 557 }
475 558
@@ -524,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
524 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); 607 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
525 return (copied == sizeof(data)) ? 0 : -EIO; 608 return (copied == sizeof(data)) ? 0 : -EIO;
526} 609}
610
611#ifdef CONFIG_COMPAT
612#include <linux/compat.h>
613
614int compat_ptrace_request(struct task_struct *child, compat_long_t request,
615 compat_ulong_t addr, compat_ulong_t data)
616{
617 compat_ulong_t __user *datap = compat_ptr(data);
618 compat_ulong_t word;
619 int ret;
620
621 switch (request) {
622 case PTRACE_PEEKTEXT:
623 case PTRACE_PEEKDATA:
624 ret = access_process_vm(child, addr, &word, sizeof(word), 0);
625 if (ret != sizeof(word))
626 ret = -EIO;
627 else
628 ret = put_user(word, datap);
629 break;
630
631 case PTRACE_POKETEXT:
632 case PTRACE_POKEDATA:
633 ret = access_process_vm(child, addr, &data, sizeof(data), 1);
634 ret = (ret != sizeof(data) ? -EIO : 0);
635 break;
636
637 case PTRACE_GETEVENTMSG:
638 ret = put_user((compat_ulong_t) child->ptrace_message, datap);
639 break;
640
641 default:
642 ret = ptrace_request(child, request, addr, data);
643 }
644
645 return ret;
646}
647
648#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
649asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
650 compat_long_t addr, compat_long_t data)
651{
652 struct task_struct *child;
653 long ret;
654
655 /*
656 * This lock_kernel fixes a subtle race with suid exec
657 */
658 lock_kernel();
659 if (request == PTRACE_TRACEME) {
660 ret = ptrace_traceme();
661 goto out;
662 }
663
664 child = ptrace_get_task_struct(pid);
665 if (IS_ERR(child)) {
666 ret = PTR_ERR(child);
667 goto out;
668 }
669
670 if (request == PTRACE_ATTACH) {
671 ret = ptrace_attach(child);
672 /*
673 * Some architectures need to do book-keeping after
674 * a ptrace attach.
675 */
676 if (!ret)
677 arch_ptrace_attach(child);
678 goto out_put_task_struct;
679 }
680
681 ret = ptrace_check_attach(child, request == PTRACE_KILL);
682 if (!ret)
683 ret = compat_arch_ptrace(child, request, addr, data);
684
685 out_put_task_struct:
686 put_task_struct(child);
687 out:
688 unlock_kernel();
689 return ret;
690}
691#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
692
693#endif /* CONFIG_COMPAT */
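
For context on the requests the new ptrace_resume() helper handles generically, here is a hedged user-space sketch (not from the patch) that single-steps a child a few times and then continues it; /bin/true and the step count are arbitrary, and error handling is minimal.

    /* Hedged user-space sketch exercising PTRACE_SINGLESTEP and PTRACE_CONT. */
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t child = fork();
            int i;

            if (child == 0) {
                    ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                    execl("/bin/true", "true", (char *)NULL);
                    _exit(1);
            }

            waitpid(child, NULL, 0);                /* child stops at exec */

            for (i = 0; i < 5; i++) {               /* a few single steps */
                    if (ptrace(PTRACE_SINGLESTEP, child, NULL, NULL) < 0)
                            break;
                    waitpid(child, NULL, 0);
            }

            ptrace(PTRACE_CONT, child, NULL, NULL); /* let it run to completion */
            waitpid(child, NULL, 0);
            return 0;
    }
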
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
new file mode 100644
index 000000000000..f4ffbd0f306f
--- /dev/null
+++ b/kernel/rcuclassic.c
@@ -0,0 +1,575 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50
51#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key;
53struct lockdep_map rcu_lock_map =
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55EXPORT_SYMBOL_GPL(rcu_lock_map);
56#endif
57
58
59/* Definition for rcupdate control block. */
60static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300,
62 .completed = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE,
65};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300,
68 .completed = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE,
71};
72
73DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
74DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
75
76static int blimit = 10;
77static int qhimark = 10000;
78static int qlowmark = 100;
79
80#ifdef CONFIG_SMP
81static void force_quiescent_state(struct rcu_data *rdp,
82 struct rcu_ctrlblk *rcp)
83{
84 int cpu;
85 cpumask_t cpumask;
86 set_need_resched();
87 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1;
89 /*
90 * Don't send IPI to itself. With irqs disabled,
91 * rdp->cpu is the current cpu.
92 */
93 cpumask = rcp->cpumask;
94 cpu_clear(rdp->cpu, cpumask);
95 for_each_cpu_mask(cpu, cpumask)
96 smp_send_reschedule(cpu);
97 }
98}
99#else
100static inline void force_quiescent_state(struct rcu_data *rdp,
101 struct rcu_ctrlblk *rcp)
102{
103 set_need_resched();
104}
105#endif
106
107/**
108 * call_rcu - Queue an RCU callback for invocation after a grace period.
109 * @head: structure to be used for queueing the RCU updates.
110 * @func: actual update function to be invoked after the grace period
111 *
112 * The update function will be invoked some time after a full grace
113 * period elapses, in other words after all currently executing RCU
114 * read-side critical sections have completed. RCU read-side critical
115 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
116 * and may be nested.
117 */
118void call_rcu(struct rcu_head *head,
119 void (*func)(struct rcu_head *rcu))
120{
121 unsigned long flags;
122 struct rcu_data *rdp;
123
124 head->func = func;
125 head->next = NULL;
126 local_irq_save(flags);
127 rdp = &__get_cpu_var(rcu_data);
128 *rdp->nxttail = head;
129 rdp->nxttail = &head->next;
130 if (unlikely(++rdp->qlen > qhimark)) {
131 rdp->blimit = INT_MAX;
132 force_quiescent_state(rdp, &rcu_ctrlblk);
133 }
134 local_irq_restore(flags);
135}
136EXPORT_SYMBOL_GPL(call_rcu);
137
138/**
139 * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period.
140 * @head: structure to be used for queueing the RCU updates.
141 * @func: actual update function to be invoked after the grace period
142 *
143 * The update function will be invoked some time after a full grace
144 * period elapses, in other words after all currently executing RCU
145 * read-side critical sections have completed. call_rcu_bh() assumes
146 * that the read-side critical sections end on completion of a softirq
147 * handler. This means that read-side critical sections in process
148 * context must not be interrupted by softirqs. This interface is to be
149 * used when most of the read-side critical sections are in softirq context.
150 * RCU read-side critical sections are delimited by rcu_read_lock() and
151 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
152 * and rcu_read_unlock_bh(), if in process context. These may be nested.
153 */
154void call_rcu_bh(struct rcu_head *head,
155 void (*func)(struct rcu_head *rcu))
156{
157 unsigned long flags;
158 struct rcu_data *rdp;
159
160 head->func = func;
161 head->next = NULL;
162 local_irq_save(flags);
163 rdp = &__get_cpu_var(rcu_bh_data);
164 *rdp->nxttail = head;
165 rdp->nxttail = &head->next;
166
167 if (unlikely(++rdp->qlen > qhimark)) {
168 rdp->blimit = INT_MAX;
169 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
170 }
171
172 local_irq_restore(flags);
173}
174EXPORT_SYMBOL_GPL(call_rcu_bh);
175
176/*
177 * Return the number of RCU batches processed thus far. Useful
178 * for debug and statistics.
179 */
180long rcu_batches_completed(void)
181{
182 return rcu_ctrlblk.completed;
183}
184EXPORT_SYMBOL_GPL(rcu_batches_completed);
185
186/*
187 * Return the number of RCU batches processed thus far. Useful
188 * for debug and statistics.
189 */
190long rcu_batches_completed_bh(void)
191{
192 return rcu_bh_ctrlblk.completed;
193}
194EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
195
196/* Raises the softirq for processing rcu_callbacks. */
197static inline void raise_rcu_softirq(void)
198{
199 raise_softirq(RCU_SOFTIRQ);
200 /*
201 * The smp_mb() here is required to ensure that this cpu's
202 * __rcu_process_callbacks() reads the most recently updated
203 * value of rcu->cur.
204 */
205 smp_mb();
206}
207
208/*
209 * Invoke the completed RCU callbacks. They are expected to be in
210 * a per-cpu list.
211 */
212static void rcu_do_batch(struct rcu_data *rdp)
213{
214 struct rcu_head *next, *list;
215 int count = 0;
216
217 list = rdp->donelist;
218 while (list) {
219 next = list->next;
220 prefetch(next);
221 list->func(list);
222 list = next;
223 if (++count >= rdp->blimit)
224 break;
225 }
226 rdp->donelist = list;
227
228 local_irq_disable();
229 rdp->qlen -= count;
230 local_irq_enable();
231 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
232 rdp->blimit = blimit;
233
234 if (!rdp->donelist)
235 rdp->donetail = &rdp->donelist;
236 else
237 raise_rcu_softirq();
238}
239
240/*
241 * Grace period handling:
242 * The grace period handling consists of two steps:
243 * - A new grace period is started.
244 * This is done by rcu_start_batch. The start is not broadcasted to
245 * all cpus, they must pick this up by comparing rcp->cur with
246 * rdp->quiescbatch. All cpus are recorded in the
247 * rcu_ctrlblk.cpumask bitmap.
248 * - All cpus must go through a quiescent state.
249 * Since the start of the grace period is not broadcasted, at least two
250 * calls to rcu_check_quiescent_state are required:
251 * The first call just notices that a new grace period is running. The
252 * following calls check if there was a quiescent state since the beginning
253 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
254 * the bitmap is empty, then the grace period is completed.
255 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
256 * period (if necessary).
257 */
258/*
259 * Register a new batch of callbacks, and start it up if there is currently no
260 * active batch and the batch to be registered has not already occurred.
261 * Caller must hold rcu_ctrlblk.lock.
262 */
263static void rcu_start_batch(struct rcu_ctrlblk *rcp)
264{
265 if (rcp->next_pending &&
266 rcp->completed == rcp->cur) {
267 rcp->next_pending = 0;
268 /*
269 * next_pending == 0 must be visible in
270 * __rcu_process_callbacks() before it can see new value of cur.
271 */
272 smp_wmb();
273 rcp->cur++;
274
275 /*
276 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
277 * barrier. Otherwise it can cause tickless idle CPUs to be
278 * included in rcp->cpumask, which will extend grace periods
279 * unnecessarily.
280 */
281 smp_mb();
282 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
283
284 rcp->signaled = 0;
285 }
286}
287
288/*
289 * cpu went through a quiescent state since the beginning of the grace period.
290 * Clear it from the cpu mask and complete the grace period if it was the last
291 * cpu. Start another grace period if someone has further entries pending
292 */
293static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
294{
295 cpu_clear(cpu, rcp->cpumask);
296 if (cpus_empty(rcp->cpumask)) {
297 /* batch completed ! */
298 rcp->completed = rcp->cur;
299 rcu_start_batch(rcp);
300 }
301}
302
303/*
304 * Check if the cpu has gone through a quiescent state (say context
305 * switch). If so and if it already hasn't done so in this RCU
306 * quiescent cycle, then indicate that it has done so.
307 */
308static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
309 struct rcu_data *rdp)
310{
311 if (rdp->quiescbatch != rcp->cur) {
312 /* start new grace period: */
313 rdp->qs_pending = 1;
314 rdp->passed_quiesc = 0;
315 rdp->quiescbatch = rcp->cur;
316 return;
317 }
318
319 /* Grace period already completed for this cpu?
320 * qs_pending is checked instead of the actual bitmap to avoid
321 * cacheline thrashing.
322 */
323 if (!rdp->qs_pending)
324 return;
325
326 /*
327 * Was there a quiescent state since the beginning of the grace
328 * period? If no, then exit and wait for the next call.
329 */
330 if (!rdp->passed_quiesc)
331 return;
332 rdp->qs_pending = 0;
333
334 spin_lock(&rcp->lock);
335 /*
336 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
337 * during cpu startup. Ignore the quiescent state.
338 */
339 if (likely(rdp->quiescbatch == rcp->cur))
340 cpu_quiet(rdp->cpu, rcp);
341
342 spin_unlock(&rcp->lock);
343}
344
345
346#ifdef CONFIG_HOTPLUG_CPU
347
348/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
349 * the locking requirements; the list it's pulling from has to belong to a cpu
350 * which is dead and hence not processing interrupts.
351 */
352static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
353 struct rcu_head **tail)
354{
355 local_irq_disable();
356 *this_rdp->nxttail = list;
357 if (list)
358 this_rdp->nxttail = tail;
359 local_irq_enable();
360}
361
362static void __rcu_offline_cpu(struct rcu_data *this_rdp,
363 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
364{
365 /* if the cpu going offline owns the grace period
366 * we can block indefinitely waiting for it, so flush
367 * it here
368 */
369 spin_lock_bh(&rcp->lock);
370 if (rcp->cur != rcp->completed)
371 cpu_quiet(rdp->cpu, rcp);
372 spin_unlock_bh(&rcp->lock);
373 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
374 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
375 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
376}
377
378static void rcu_offline_cpu(int cpu)
379{
380 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
381 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
382
383 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
384 &per_cpu(rcu_data, cpu));
385 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
386 &per_cpu(rcu_bh_data, cpu));
387 put_cpu_var(rcu_data);
388 put_cpu_var(rcu_bh_data);
389}
390
391#else
392
393static void rcu_offline_cpu(int cpu)
394{
395}
396
397#endif
398
399/*
400 * This does the RCU processing work from softirq context.
401 */
402static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
403 struct rcu_data *rdp)
404{
405 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
406 *rdp->donetail = rdp->curlist;
407 rdp->donetail = rdp->curtail;
408 rdp->curlist = NULL;
409 rdp->curtail = &rdp->curlist;
410 }
411
412 if (rdp->nxtlist && !rdp->curlist) {
413 local_irq_disable();
414 rdp->curlist = rdp->nxtlist;
415 rdp->curtail = rdp->nxttail;
416 rdp->nxtlist = NULL;
417 rdp->nxttail = &rdp->nxtlist;
418 local_irq_enable();
419
420 /*
421 * start the next batch of callbacks
422 */
423
424 /* determine batch number */
425 rdp->batch = rcp->cur + 1;
426 /* see the comment and corresponding wmb() in
427 * the rcu_start_batch()
428 */
429 smp_rmb();
430
431 if (!rcp->next_pending) {
432 /* and start it/schedule start if it's a new batch */
433 spin_lock(&rcp->lock);
434 rcp->next_pending = 1;
435 rcu_start_batch(rcp);
436 spin_unlock(&rcp->lock);
437 }
438 }
439
440 rcu_check_quiescent_state(rcp, rdp);
441 if (rdp->donelist)
442 rcu_do_batch(rdp);
443}
444
445static void rcu_process_callbacks(struct softirq_action *unused)
446{
447 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
448 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
449}
450
451static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
452{
453 /* This cpu has pending rcu entries and the grace period
454 * for them has completed.
455 */
456 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
457 return 1;
458
459 /* This cpu has no pending entries, but there are new entries */
460 if (!rdp->curlist && rdp->nxtlist)
461 return 1;
462
463 /* This cpu has finished callbacks to invoke */
464 if (rdp->donelist)
465 return 1;
466
467 /* The rcu core waits for a quiescent state from the cpu */
468 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
469 return 1;
470
471 /* nothing to do */
472 return 0;
473}
474
475/*
476 * Check to see if there is any immediate RCU-related work to be done
477 * by the current CPU, returning 1 if so. This function is part of the
478 * RCU implementation; it is -not- an exported member of the RCU API.
479 */
480int rcu_pending(int cpu)
481{
482 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
483 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
484}
485
486/*
487 * Check to see if any future RCU-related work will need to be done
488 * by the current CPU, even if none need be done immediately, returning
489 * 1 if so. This function is part of the RCU implementation; it is -not-
490 * an exported member of the RCU API.
491 */
492int rcu_needs_cpu(int cpu)
493{
494 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
495 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
496
497 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
498}
499
500void rcu_check_callbacks(int cpu, int user)
501{
502 if (user ||
503 (idle_cpu(cpu) && !in_softirq() &&
504 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
505 rcu_qsctr_inc(cpu);
506 rcu_bh_qsctr_inc(cpu);
507 } else if (!in_softirq())
508 rcu_bh_qsctr_inc(cpu);
509 raise_rcu_softirq();
510}
511
512static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
513 struct rcu_data *rdp)
514{
515 memset(rdp, 0, sizeof(*rdp));
516 rdp->curtail = &rdp->curlist;
517 rdp->nxttail = &rdp->nxtlist;
518 rdp->donetail = &rdp->donelist;
519 rdp->quiescbatch = rcp->completed;
520 rdp->qs_pending = 0;
521 rdp->cpu = cpu;
522 rdp->blimit = blimit;
523}
524
525static void __cpuinit rcu_online_cpu(int cpu)
526{
527 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
528 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
529
530 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
531 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
532 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
533}
534
535static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
536 unsigned long action, void *hcpu)
537{
538 long cpu = (long)hcpu;
539
540 switch (action) {
541 case CPU_UP_PREPARE:
542 case CPU_UP_PREPARE_FROZEN:
543 rcu_online_cpu(cpu);
544 break;
545 case CPU_DEAD:
546 case CPU_DEAD_FROZEN:
547 rcu_offline_cpu(cpu);
548 break;
549 default:
550 break;
551 }
552 return NOTIFY_OK;
553}
554
555static struct notifier_block __cpuinitdata rcu_nb = {
556 .notifier_call = rcu_cpu_notify,
557};
558
559/*
560 * Initializes rcu mechanism. Assumed to be called early.
561 * That is, before the local timer (SMP) or jiffies timer (UP) is set up.
562 * Note that rcu_qsctr and friends are implicitly
563 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
564 */
565void __init __rcu_init(void)
566{
567 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
568 (void *)(long)smp_processor_id());
569 /* Register notifier for non-boot CPUs */
570 register_cpu_notifier(&rcu_nb);
571}
572
573module_param(blimit, int, 0);
574module_param(qhimark, int, 0);
575module_param(qlowmark, int, 0);
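
The kernel-doc in the new file above describes call_rcu() queueing a callback that runs after a grace period. As a hedged usage sketch (not part of the file), the classic publish/retire pattern looks roughly like this; struct my_entry and the helper names are hypothetical.

    /* Hedged sketch of the call_rcu() update-side pattern. */
    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct my_entry {
            int value;
            struct rcu_head rcu;
    };

    static struct my_entry *current_entry;
    static DEFINE_SPINLOCK(update_lock);

    static void my_entry_free(struct rcu_head *head)
    {
            kfree(container_of(head, struct my_entry, rcu));
    }

    /* Reader side: may run concurrently with an update. */
    static int my_entry_read(void)
    {
            struct my_entry *e;
            int val = -1;

            rcu_read_lock();
            e = rcu_dereference(current_entry);
            if (e)
                    val = e->value;
            rcu_read_unlock();
            return val;
    }

    /* Update side: publish a new entry, free the old one after a grace period. */
    static int my_entry_update(int value)
    {
            struct my_entry *new, *old;

            new = kmalloc(sizeof(*new), GFP_KERNEL);
            if (!new)
                    return -ENOMEM;
            new->value = value;

            spin_lock(&update_lock);
            old = current_entry;
            rcu_assign_pointer(current_entry, new);
            spin_unlock(&update_lock);

            if (old)
                    call_rcu(&old->rcu, my_entry_free);
            return 0;
    }
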
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a66d4d1615f7..760dfc233a00 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -15,7 +15,7 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2001 18 * Copyright IBM Corporation, 2001
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
@@ -35,165 +35,57 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/spinlock.h> 36#include <linux/spinlock.h>
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h> 38#include <linux/interrupt.h>
40#include <linux/sched.h> 39#include <linux/sched.h>
41#include <asm/atomic.h> 40#include <asm/atomic.h>
42#include <linux/bitops.h> 41#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h> 42#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h> 43#include <linux/percpu.h>
47#include <linux/notifier.h> 44#include <linux/notifier.h>
48#include <linux/cpu.h> 45#include <linux/cpu.h>
49#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/module.h>
50 48
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 49struct rcu_synchronize {
52static struct lock_class_key rcu_lock_key; 50 struct rcu_head head;
53struct lockdep_map rcu_lock_map = 51 struct completion completion;
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Definition for rcupdate control block. */
60static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300,
62 .completed = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE,
65};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300,
68 .completed = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE,
71}; 52};
72 53
73DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
74DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
75
76/* Fake initialization required by compiler */
77static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
78static int blimit = 10;
79static int qhimark = 10000;
80static int qlowmark = 100;
81
82static atomic_t rcu_barrier_cpu_count; 55static atomic_t rcu_barrier_cpu_count;
83static DEFINE_MUTEX(rcu_barrier_mutex); 56static DEFINE_MUTEX(rcu_barrier_mutex);
84static struct completion rcu_barrier_completion; 57static struct completion rcu_barrier_completion;
85 58
86#ifdef CONFIG_SMP 59/* Because of FASTCALL declaration of complete, we use this wrapper */
87static void force_quiescent_state(struct rcu_data *rdp, 60static void wakeme_after_rcu(struct rcu_head *head)
88 struct rcu_ctrlblk *rcp)
89{
90 int cpu;
91 cpumask_t cpumask;
92 set_need_resched();
93 if (unlikely(!rcp->signaled)) {
94 rcp->signaled = 1;
95 /*
96 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu.
98 */
99 cpumask = rcp->cpumask;
100 cpu_clear(rdp->cpu, cpumask);
101 for_each_cpu_mask(cpu, cpumask)
102 smp_send_reschedule(cpu);
103 }
104}
105#else
106static inline void force_quiescent_state(struct rcu_data *rdp,
107 struct rcu_ctrlblk *rcp)
108{ 61{
109 set_need_resched(); 62 struct rcu_synchronize *rcu;
63
64 rcu = container_of(head, struct rcu_synchronize, head);
65 complete(&rcu->completion);
110} 66}
111#endif
112 67
113/** 68/**
114 * call_rcu - Queue an RCU callback for invocation after a grace period. 69 * synchronize_rcu - wait until a grace period has elapsed.
115 * @head: structure to be used for queueing the RCU updates.
116 * @func: actual update function to be invoked after the grace period
117 * 70 *
118 * The update function will be invoked some time after a full grace 71 * Control will return to the caller some time after a full grace
119 * period elapses, in other words after all currently executing RCU 72 * period has elapsed, in other words after all currently executing RCU
120 * read-side critical sections have completed. RCU read-side critical 73 * read-side critical sections have completed. RCU read-side critical
121 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 74 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
122 * and may be nested. 75 * and may be nested.
123 */ 76 */
124void fastcall call_rcu(struct rcu_head *head, 77void synchronize_rcu(void)
125 void (*func)(struct rcu_head *rcu))
126{
127 unsigned long flags;
128 struct rcu_data *rdp;
129
130 head->func = func;
131 head->next = NULL;
132 local_irq_save(flags);
133 rdp = &__get_cpu_var(rcu_data);
134 *rdp->nxttail = head;
135 rdp->nxttail = &head->next;
136 if (unlikely(++rdp->qlen > qhimark)) {
137 rdp->blimit = INT_MAX;
138 force_quiescent_state(rdp, &rcu_ctrlblk);
139 }
140 local_irq_restore(flags);
141}
142
143/**
144 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
145 * @head: structure to be used for queueing the RCU updates.
146 * @func: actual update function to be invoked after the grace period
147 *
148 * The update function will be invoked some time after a full grace
149 * period elapses, in other words after all currently executing RCU
150 * read-side critical sections have completed. call_rcu_bh() assumes
151 * that the read-side critical sections end on completion of a softirq
152 * handler. This means that read-side critical sections in process
153 * context must not be interrupted by softirqs. This interface is to be
154 * used when most of the read-side critical sections are in softirq context.
155 * RCU read-side critical sections are delimited by rcu_read_lock() and
156 * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
157 * and rcu_read_unlock_bh(), if in process context. These may be nested.
158 */
159void fastcall call_rcu_bh(struct rcu_head *head,
160 void (*func)(struct rcu_head *rcu))
161{ 78{
162 unsigned long flags; 79 struct rcu_synchronize rcu;
163 struct rcu_data *rdp;
164
165 head->func = func;
166 head->next = NULL;
167 local_irq_save(flags);
168 rdp = &__get_cpu_var(rcu_bh_data);
169 *rdp->nxttail = head;
170 rdp->nxttail = &head->next;
171
172 if (unlikely(++rdp->qlen > qhimark)) {
173 rdp->blimit = INT_MAX;
174 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
175 }
176
177 local_irq_restore(flags);
178}
179 80
180/* 81 init_completion(&rcu.completion);
181 * Return the number of RCU batches processed thus far. Useful 82 /* Will wake me after RCU finished */
182 * for debug and statistics. 83 call_rcu(&rcu.head, wakeme_after_rcu);
183 */
184long rcu_batches_completed(void)
185{
186 return rcu_ctrlblk.completed;
187}
188 84
189/* 85 /* Wait for it */
190 * Return the number of RCU batches processed thus far. Useful 86 wait_for_completion(&rcu.completion);
191 * for debug and statistics.
192 */
193long rcu_batches_completed_bh(void)
194{
195 return rcu_bh_ctrlblk.completed;
196} 87}
88EXPORT_SYMBOL_GPL(synchronize_rcu);
197 89
198static void rcu_barrier_callback(struct rcu_head *notused) 90static void rcu_barrier_callback(struct rcu_head *notused)
199{ 91{
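
The synchronize_rcu() added above is the blocking counterpart to call_rcu(): it returns only after a full grace period has elapsed. A hedged usage sketch (not from the patch) follows; struct my_config and the surrounding names are hypothetical, and the caller is assumed to serialize updaters, e.g. with a mutex.

    /* Hedged sketch: replace-and-wait update using synchronize_rcu(). */
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct my_config {
            int setting;
    };

    static struct my_config *active_config;

    /* Caller is assumed to hold whatever lock serializes updates. */
    static void my_config_replace(struct my_config *new)
    {
            struct my_config *old = active_config;

            rcu_assign_pointer(active_config, new);
            synchronize_rcu();      /* wait for all pre-existing readers */
            kfree(old);             /* no reader can still reference it */
    }
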
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused)
207static void rcu_barrier_func(void *notused) 99static void rcu_barrier_func(void *notused)
208{ 100{
209 int cpu = smp_processor_id(); 101 int cpu = smp_processor_id();
210 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 102 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
211 struct rcu_head *head;
212 103
213 head = &rdp->barrier;
214 atomic_inc(&rcu_barrier_cpu_count); 104 atomic_inc(&rcu_barrier_cpu_count);
215 call_rcu(head, rcu_barrier_callback); 105 call_rcu(head, rcu_barrier_callback);
216} 106}
@@ -225,420 +115,24 @@ void rcu_barrier(void)
225 mutex_lock(&rcu_barrier_mutex); 115 mutex_lock(&rcu_barrier_mutex);
226 init_completion(&rcu_barrier_completion); 116 init_completion(&rcu_barrier_completion);
227 atomic_set(&rcu_barrier_cpu_count, 0); 117 atomic_set(&rcu_barrier_cpu_count, 0);
118 /*
119 * The queueing of callbacks in all CPUs must be atomic with
120 * respect to RCU, otherwise one CPU may queue a callback,
121 * wait for a grace period, decrement barrier count and call
122 * complete(), while other CPUs have not yet queued anything.
123 * So, we need to make sure that grace periods cannot complete
124 * until all the callbacks are queued.
125 */
126 rcu_read_lock();
228 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 127 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
128 rcu_read_unlock();
229 wait_for_completion(&rcu_barrier_completion); 129 wait_for_completion(&rcu_barrier_completion);
230 mutex_unlock(&rcu_barrier_mutex); 130 mutex_unlock(&rcu_barrier_mutex);
231} 131}
232EXPORT_SYMBOL_GPL(rcu_barrier); 132EXPORT_SYMBOL_GPL(rcu_barrier);
233 133
234/*
235 * Invoke the completed RCU callbacks. They are expected to be in
236 * a per-cpu list.
237 */
238static void rcu_do_batch(struct rcu_data *rdp)
239{
240 struct rcu_head *next, *list;
241 int count = 0;
242
243 list = rdp->donelist;
244 while (list) {
245 next = list->next;
246 prefetch(next);
247 list->func(list);
248 list = next;
249 if (++count >= rdp->blimit)
250 break;
251 }
252 rdp->donelist = list;
253
254 local_irq_disable();
255 rdp->qlen -= count;
256 local_irq_enable();
257 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
258 rdp->blimit = blimit;
259
260 if (!rdp->donelist)
261 rdp->donetail = &rdp->donelist;
262 else
263 tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
264}
265
266/*
267 * Grace period handling:
268 * The grace period handling consists out of two steps:
269 * - A new grace period is started.
270 * This is done by rcu_start_batch. The start is not broadcasted to
271 * all cpus, they must pick this up by comparing rcp->cur with
272 * rdp->quiescbatch. All cpus are recorded in the
273 * rcu_ctrlblk.cpumask bitmap.
274 * - All cpus must go through a quiescent state.
275 * Since the start of the grace period is not broadcasted, at least two
276 * calls to rcu_check_quiescent_state are required:
277 * The first call just notices that a new grace period is running. The
278 * following calls check if there was a quiescent state since the beginning
279 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
280 * the bitmap is empty, then the grace period is completed.
281 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
282 * period (if necessary).
283 */
284/*
285 * Register a new batch of callbacks, and start it up if there is currently no
286 * active batch and the batch to be registered has not already occurred.
287 * Caller must hold rcu_ctrlblk.lock.
288 */
289static void rcu_start_batch(struct rcu_ctrlblk *rcp)
290{
291 if (rcp->next_pending &&
292 rcp->completed == rcp->cur) {
293 rcp->next_pending = 0;
294 /*
295 * next_pending == 0 must be visible in
296 * __rcu_process_callbacks() before it can see new value of cur.
297 */
298 smp_wmb();
299 rcp->cur++;
300
301 /*
302 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
303 * Barrier Otherwise it can cause tickless idle CPUs to be
304 * included in rcp->cpumask, which will extend graceperiods
305 * unnecessarily.
306 */
307 smp_mb();
308 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
309
310 rcp->signaled = 0;
311 }
312}
313
314/*
315 * cpu went through a quiescent state since the beginning of the grace period.
316 * Clear it from the cpu mask and complete the grace period if it was the last
317 * cpu. Start another grace period if someone has further entries pending
318 */
319static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
320{
321 cpu_clear(cpu, rcp->cpumask);
322 if (cpus_empty(rcp->cpumask)) {
323 /* batch completed ! */
324 rcp->completed = rcp->cur;
325 rcu_start_batch(rcp);
326 }
327}
328
329/*
330 * Check if the cpu has gone through a quiescent state (say context
331 * switch). If so and if it already hasn't done so in this RCU
332 * quiescent cycle, then indicate that it has done so.
333 */
334static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
335 struct rcu_data *rdp)
336{
337 if (rdp->quiescbatch != rcp->cur) {
338 /* start new grace period: */
339 rdp->qs_pending = 1;
340 rdp->passed_quiesc = 0;
341 rdp->quiescbatch = rcp->cur;
342 return;
343 }
344
345 /* Grace period already completed for this cpu?
346 * qs_pending is checked instead of the actual bitmap to avoid
347 * cacheline trashing.
348 */
349 if (!rdp->qs_pending)
350 return;
351
352 /*
353 * Was there a quiescent state since the beginning of the grace
354 * period? If no, then exit and wait for the next call.
355 */
356 if (!rdp->passed_quiesc)
357 return;
358 rdp->qs_pending = 0;
359
360 spin_lock(&rcp->lock);
361 /*
362 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
363 * during cpu startup. Ignore the quiescent state.
364 */
365 if (likely(rdp->quiescbatch == rcp->cur))
366 cpu_quiet(rdp->cpu, rcp);
367
368 spin_unlock(&rcp->lock);
369}
370
371
372#ifdef CONFIG_HOTPLUG_CPU
373
374/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
375 * locking requirements, the list it's pulling from has to belong to a cpu
376 * which is dead and hence not processing interrupts.
377 */
378static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
379 struct rcu_head **tail)
380{
381 local_irq_disable();
382 *this_rdp->nxttail = list;
383 if (list)
384 this_rdp->nxttail = tail;
385 local_irq_enable();
386}
387
388static void __rcu_offline_cpu(struct rcu_data *this_rdp,
389 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
390{
391 /* if the cpu going offline owns the grace period
392 * we can block indefinitely waiting for it, so flush
393 * it here
394 */
395 spin_lock_bh(&rcp->lock);
396 if (rcp->cur != rcp->completed)
397 cpu_quiet(rdp->cpu, rcp);
398 spin_unlock_bh(&rcp->lock);
399 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
400 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
401 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
402}
403
404static void rcu_offline_cpu(int cpu)
405{
406 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
407 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
408
409 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
410 &per_cpu(rcu_data, cpu));
411 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
412 &per_cpu(rcu_bh_data, cpu));
413 put_cpu_var(rcu_data);
414 put_cpu_var(rcu_bh_data);
415 tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
416}
417
418#else
419
420static void rcu_offline_cpu(int cpu)
421{
422}
423
424#endif
425
426/*
427 * This does the RCU processing work from tasklet context.
428 */
429static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
430 struct rcu_data *rdp)
431{
432 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
433 *rdp->donetail = rdp->curlist;
434 rdp->donetail = rdp->curtail;
435 rdp->curlist = NULL;
436 rdp->curtail = &rdp->curlist;
437 }
438
439 if (rdp->nxtlist && !rdp->curlist) {
440 local_irq_disable();
441 rdp->curlist = rdp->nxtlist;
442 rdp->curtail = rdp->nxttail;
443 rdp->nxtlist = NULL;
444 rdp->nxttail = &rdp->nxtlist;
445 local_irq_enable();
446
447 /*
448 * start the next batch of callbacks
449 */
450
451 /* determine batch number */
452 rdp->batch = rcp->cur + 1;
453 /* see the comment and corresponding wmb() in
454 * the rcu_start_batch()
455 */
456 smp_rmb();
457
458 if (!rcp->next_pending) {
459 /* and start it/schedule start if it's a new batch */
460 spin_lock(&rcp->lock);
461 rcp->next_pending = 1;
462 rcu_start_batch(rcp);
463 spin_unlock(&rcp->lock);
464 }
465 }
466
467 rcu_check_quiescent_state(rcp, rdp);
468 if (rdp->donelist)
469 rcu_do_batch(rdp);
470}
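
The list juggling in __rcu_process_callbacks() is easier to see in isolation. Below is a single-threaded user-space sketch of the same three-stage pipeline -- callbacks enter nxtlist, sit on curlist while their grace period runs, and are invoked from donelist -- using the same tail-pointer technique. The toy_* names are hypothetical, and interrupt disabling and batch-number tracking are replaced by a simple flag.

#include <stdio.h>

/* Toy callback, mirroring the shape of struct rcu_head. */
struct toy_head {
	struct toy_head *next;
	void (*func)(struct toy_head *);
};

/* Three stages: newly queued -> waiting for a grace period -> ready to run. */
struct toy_rdp {
	struct toy_head *nxtlist, **nxttail;
	struct toy_head *curlist, **curtail;
	struct toy_head *donelist, **donetail;
};

static void toy_init(struct toy_rdp *rdp)
{
	rdp->nxtlist = NULL;  rdp->nxttail = &rdp->nxtlist;
	rdp->curlist = NULL;  rdp->curtail = &rdp->curlist;
	rdp->donelist = NULL; rdp->donetail = &rdp->donelist;
}

/* call_rcu(): append to the "next" list through its tail pointer. */
static void toy_call(struct toy_rdp *rdp, struct toy_head *h,
		     void (*func)(struct toy_head *))
{
	h->func = func;
	h->next = NULL;
	*rdp->nxttail = h;
	rdp->nxttail = &h->next;
}

/* One processing pass: promote the lists one stage and run what is ready. */
static void toy_process(struct toy_rdp *rdp, int grace_period_over)
{
	if (rdp->curlist && grace_period_over) {
		*rdp->donetail = rdp->curlist;		/* splice cur onto done */
		rdp->donetail = rdp->curtail;
		rdp->curlist = NULL;
		rdp->curtail = &rdp->curlist;
	}
	if (rdp->nxtlist && !rdp->curlist) {		/* start the next batch */
		rdp->curlist = rdp->nxtlist;
		rdp->curtail = rdp->nxttail;
		rdp->nxtlist = NULL;
		rdp->nxttail = &rdp->nxtlist;
	}
	while (rdp->donelist) {				/* rcu_do_batch() */
		struct toy_head *h = rdp->donelist;

		rdp->donelist = h->next;
		h->func(h);
	}
	rdp->donetail = &rdp->donelist;
}

static void hello(struct toy_head *h)
{
	printf("callback %p invoked\n", (void *)h);
}

int main(void)
{
	struct toy_rdp rdp;
	struct toy_head a, b;

	toy_init(&rdp);
	toy_call(&rdp, &a, hello);
	toy_process(&rdp, 0);	/* a moves to curlist, nothing ready yet */
	toy_call(&rdp, &b, hello);
	toy_process(&rdp, 1);	/* a's grace period ended: a runs, b advances */
	toy_process(&rdp, 1);	/* b's grace period ended: b runs */
	return 0;
}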
471
472static void rcu_process_callbacks(unsigned long unused)
473{
474 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
475 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
476}
477
478static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
479{
480 /* This cpu has pending rcu entries and the grace period
481 * for them has completed.
482 */
483 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
484 return 1;
485
486 /* This cpu has no pending entries, but there are new entries */
487 if (!rdp->curlist && rdp->nxtlist)
488 return 1;
489
490 /* This cpu has finished callbacks to invoke */
491 if (rdp->donelist)
492 return 1;
493
494 /* The rcu core waits for a quiescent state from the cpu */
495 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
496 return 1;
497
498 /* nothing to do */
499 return 0;
500}
501
502/*
503 * Check to see if there is any immediate RCU-related work to be done
504 * by the current CPU, returning 1 if so. This function is part of the
505 * RCU implementation; it is -not- an exported member of the RCU API.
506 */
507int rcu_pending(int cpu)
508{
509 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
510 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
511}
512
513/*
514 * Check to see if any future RCU-related work will need to be done
515 * by the current CPU, even if none need be done immediately, returning
516 * 1 if so. This function is part of the RCU implementation; it is -not-
517 * an exported member of the RCU API.
518 */
519int rcu_needs_cpu(int cpu)
520{
521 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
522 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
523
524 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
525}
526
527void rcu_check_callbacks(int cpu, int user)
528{
529 if (user ||
530 (idle_cpu(cpu) && !in_softirq() &&
531 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
532 rcu_qsctr_inc(cpu);
533 rcu_bh_qsctr_inc(cpu);
534 } else if (!in_softirq())
535 rcu_bh_qsctr_inc(cpu);
536 tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
537}
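
The condition above says the timer tick counts as a quiescent state when it interrupted user mode, or when it interrupted the idle loop with no softirq in progress and no hardirq nested beneath the tick itself (hardirq_count() <= (1 << HARDIRQ_SHIFT) means only the tick's own hardirq level is active); if that fails but we are still outside softirq context, the tick at least counts as a quiescent state for RCU-bh. A toy restatement of the main test, with descriptive parameter names rather than the kernel's macros:

static int tick_saw_quiescent_state(int in_user_mode, int cpu_was_idle,
				    int in_softirq, int hardirq_nesting)
{
	return in_user_mode ||
	       (cpu_was_idle && !in_softirq && hardirq_nesting <= 1);
}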
538
539static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
540 struct rcu_data *rdp)
541{
542 memset(rdp, 0, sizeof(*rdp));
543 rdp->curtail = &rdp->curlist;
544 rdp->nxttail = &rdp->nxtlist;
545 rdp->donetail = &rdp->donelist;
546 rdp->quiescbatch = rcp->completed;
547 rdp->qs_pending = 0;
548 rdp->cpu = cpu;
549 rdp->blimit = blimit;
550}
551
552static void __devinit rcu_online_cpu(int cpu)
553{
554 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
555 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
556
557 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
558 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
559 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
560}
561
562static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
563 unsigned long action, void *hcpu)
564{
565 long cpu = (long)hcpu;
566 switch (action) {
567 case CPU_UP_PREPARE:
568 case CPU_UP_PREPARE_FROZEN:
569 rcu_online_cpu(cpu);
570 break;
571 case CPU_DEAD:
572 case CPU_DEAD_FROZEN:
573 rcu_offline_cpu(cpu);
574 break;
575 default:
576 break;
577 }
578 return NOTIFY_OK;
579}
580
581static struct notifier_block __cpuinitdata rcu_nb = {
582 .notifier_call = rcu_cpu_notify,
583};
584
585/*
586 * Initializes the RCU mechanism. Assumed to be called early,
587 * that is, before the local timer (SMP) or jiffy timer (UP) is set up.
588 * Note that rcu_qsctr and friends are implicitly
589 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
590 */
591void __init rcu_init(void) 134void __init rcu_init(void)
592{ 135{
593 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 136 __rcu_init();
594 (void *)(long)smp_processor_id());
595 /* Register notifier for non-boot CPUs */
596 register_cpu_notifier(&rcu_nb);
597}
598
599struct rcu_synchronize {
600 struct rcu_head head;
601 struct completion completion;
602};
603
604/* Because of FASTCALL declaration of complete, we use this wrapper */
605static void wakeme_after_rcu(struct rcu_head *head)
606{
607 struct rcu_synchronize *rcu;
608
609 rcu = container_of(head, struct rcu_synchronize, head);
610 complete(&rcu->completion);
611} 137}
612 138
613/**
614 * synchronize_rcu - wait until a grace period has elapsed.
615 *
616 * Control will return to the caller some time after a full grace
617 * period has elapsed, in other words after all currently executing RCU
618 * read-side critical sections have completed. RCU read-side critical
619 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
620 * and may be nested.
621 *
622 * If your read-side code is not protected by rcu_read_lock(), do -not-
623 * use synchronize_rcu().
624 */
625void synchronize_rcu(void)
626{
627 struct rcu_synchronize rcu;
628
629 init_completion(&rcu.completion);
630 /* Will wake me after RCU finished */
631 call_rcu(&rcu.head, wakeme_after_rcu);
632
633 /* Wait for it */
634 wait_for_completion(&rcu.completion);
635}
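
As a usage illustration of the API documented above, the classic update pattern publishes a new version of a structure, waits for a grace period, and only then frees the old version. This is a sketch for kernel context; struct foo, global_foo, read_a() and update_a() are made-up names, and serialization of concurrent updaters (e.g. by a spinlock) is assumed but not shown.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int a;
};
static struct foo *global_foo;		/* readers access this pointer */

/* Reader: may run concurrently with update_a(). */
static int read_a(void)
{
	struct foo *p;
	int a;

	rcu_read_lock();
	p = rcu_dereference(global_foo);
	a = p ? p->a : -1;
	rcu_read_unlock();
	return a;
}

/* Updater: publish the new version, wait a grace period, free the old one. */
static void update_a(struct foo *newp, int a)
{
	struct foo *oldp = global_foo;	/* updaters serialized elsewhere */

	newp->a = a;
	rcu_assign_pointer(global_foo, newp);
	synchronize_rcu();	/* every pre-existing reader has finished */
	kfree(oldp);
}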
636
637module_param(blimit, int, 0);
638module_param(qhimark, int, 0);
639module_param(qlowmark, int, 0);
640EXPORT_SYMBOL_GPL(rcu_batches_completed);
641EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
642EXPORT_SYMBOL_GPL(call_rcu);
643EXPORT_SYMBOL_GPL(call_rcu_bh);
644EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
new file mode 100644
index 000000000000..987cfb7ade89
--- /dev/null
+++ b/kernel/rcupreempt.c
@@ -0,0 +1,953 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * Papers: http://www.rdrop.com/users/paulmck/RCU
27 *
28 * Design Document: http://lwn.net/Articles/253651/
29 *
30 * For detailed explanation of Read-Copy Update mechanism see -
31 * Documentation/RCU/ *.txt
32 *
33 */
34#include <linux/types.h>
35#include <linux/kernel.h>
36#include <linux/init.h>
37#include <linux/spinlock.h>
38#include <linux/smp.h>
39#include <linux/rcupdate.h>
40#include <linux/interrupt.h>
41#include <linux/sched.h>
42#include <asm/atomic.h>
43#include <linux/bitops.h>
44#include <linux/module.h>
45#include <linux/completion.h>
46#include <linux/moduleparam.h>
47#include <linux/percpu.h>
48#include <linux/notifier.h>
49#include <linux/rcupdate.h>
50#include <linux/cpu.h>
51#include <linux/random.h>
52#include <linux/delay.h>
53#include <linux/byteorder/swabb.h>
54#include <linux/cpumask.h>
55#include <linux/rcupreempt_trace.h>
56
57/*
58 * Macro that prevents the compiler from reordering accesses, but does
59 * absolutely -nothing- to prevent CPUs from reordering. This is used
60 * only to mediate communication between mainline code and hardware
61 * interrupt and NMI handlers.
62 */
63#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
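
A small illustration of what this macro buys: it forces the compiler to emit exactly one load (or store) per source-level access of a shared variable, so communication with interrupt and NMI handlers is not optimized away. shared_flag and wait_for_flag() below are hypothetical; note that ACCESS_ONCE() constrains only the compiler, not CPU memory ordering.

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

static int shared_flag;		/* hypothetical; set from an IRQ/NMI handler */

static void wait_for_flag(void)
{
	/*
	 * Without the volatile cast the compiler may load shared_flag once
	 * and spin on a stale register copy; ACCESS_ONCE() forces one real
	 * load per iteration.  CPU-level ordering still needs barriers.
	 */
	while (!ACCESS_ONCE(shared_flag))
		;
}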
64
65/*
66 * PREEMPT_RCU data structures.
67 */
68
69/*
70 * GP_STAGES specifies the number of times the state machine has
71 * to go through all the rcu_try_flip_states (see below)
72 * in a single Grace Period.
73 *
74 * GP in GP_STAGES stands for Grace Period ;)
75 */
76#define GP_STAGES 2
77struct rcu_data {
78 spinlock_t lock; /* Protect rcu_data fields. */
79 long completed; /* Number of last completed batch. */
80 int waitlistcount;
81 struct tasklet_struct rcu_tasklet;
82 struct rcu_head *nextlist;
83 struct rcu_head **nexttail;
84 struct rcu_head *waitlist[GP_STAGES];
85 struct rcu_head **waittail[GP_STAGES];
86 struct rcu_head *donelist;
87 struct rcu_head **donetail;
88 long rcu_flipctr[2];
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I".
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
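
A concrete example of why only the sum is meaningful: with preemptible RCU a task can enter rcu_read_lock() on CPU 0, bumping CPU 0's "old" counter to +1, be preempted and migrated, and execute the matching rcu_read_unlock() on CPU 1, dropping CPU 1's "old" counter to -1. Neither per-CPU counter is zero, yet their sum 1 + (-1) = 0 correctly indicates that no reader from the old counting period remains -- which is exactly what rcu_try_flip_waitzero() below checks.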
129
130struct rcu_ctrlblk {
131 spinlock_t fliplock; /* Protect state-machine transitions. */
132 long completed; /* Number of last completed batch. */
133 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
134 the rcu state machine */
135};
136
137static DEFINE_PER_CPU(struct rcu_data, rcu_data);
138static struct rcu_ctrlblk rcu_ctrlblk = {
139 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
140 .completed = 0,
141 .rcu_try_flip_state = rcu_try_flip_idle_state,
142};
143
144
145#ifdef CONFIG_RCU_TRACE
146static char *rcu_try_flip_state_names[] =
147 { "idle", "waitack", "waitzero", "waitmb" };
148#endif /* #ifdef CONFIG_RCU_TRACE */
149
150static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
151
152/*
153 * Enum and per-CPU flag to determine when each CPU has seen
154 * the most recent counter flip.
155 */
156
157enum rcu_flip_flag_values {
158 rcu_flip_seen, /* Steady/initial state, last flip seen. */
159 /* Only GP detector can update. */
160 rcu_flipped /* Flip just completed, need confirmation. */
161 /* Only corresponding CPU can update. */
162};
163static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
164 = rcu_flip_seen;
165
166/*
167 * Enum and per-CPU flag to determine when each CPU has executed the
168 * needed memory barrier to fence in memory references from its last RCU
169 * read-side critical section in the just-completed grace period.
170 */
171
172enum rcu_mb_flag_values {
173 rcu_mb_done, /* Steady/initial state, no mb()s required. */
174 /* Only GP detector can update. */
175 rcu_mb_needed /* Flip just completed, need an mb(). */
176 /* Only corresponding CPU can update. */
177};
178static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
179 = rcu_mb_done;
180
181/*
182 * RCU_DATA_ME: find the current CPU's rcu_data structure.
183 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
184 */
185#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
186#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
187
188/*
189 * Helper macro for tracing when the appropriate rcu_data is not
190 * cached in a local variable, but where the CPU number is so cached.
191 */
192#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
193
194/*
195 * Helper macro for tracing when the appropriate rcu_data is not
196 * cached in a local variable.
197 */
198#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
199
200/*
201 * Helper macro for tracing when the appropriate rcu_data is pointed
202 * to by a local variable.
203 */
204#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
205
206/*
207 * Return the number of RCU batches processed thus far. Useful
208 * for debug and statistics.
209 */
210long rcu_batches_completed(void)
211{
212 return rcu_ctrlblk.completed;
213}
214EXPORT_SYMBOL_GPL(rcu_batches_completed);
215
216EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
217
218void __rcu_read_lock(void)
219{
220 int idx;
221 struct task_struct *t = current;
222 int nesting;
223
224 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
225 if (nesting != 0) {
226
227 /* An earlier rcu_read_lock() covers us, just count it. */
228
229 t->rcu_read_lock_nesting = nesting + 1;
230
231 } else {
232 unsigned long flags;
233
234 /*
235 * We disable interrupts for the following reasons:
236 * - If we get a scheduling clock interrupt here, and we
237 * end up acking the counter flip, it's like a promise
238 * that we will never increment the old counter again.
239 * Thus we will break that promise if that
240 * scheduling clock interrupt happens between the time
241 * we pick the .completed field and the time that we
242 * increment our counter.
243 *
244 * - We don't want to be preempted out here.
245 *
246 * NMIs can still occur, of course, and might themselves
247 * contain rcu_read_lock().
248 */
249
250 local_irq_save(flags);
251
252 /*
253 * Outermost nesting of rcu_read_lock(), so increment
254 * the current counter for the current CPU. Use volatile
255 * casts to prevent the compiler from reordering.
256 */
257
258 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
259 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
260
261 /*
262 * Now that the per-CPU counter has been incremented, we
263 * are protected from races with rcu_read_lock() invoked
264 * from NMI handlers on this CPU. We can therefore safely
265 * increment the nesting counter, relieving further NMIs
266 * of the need to increment the per-CPU counter.
267 */
268
269 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
270
271 /*
272 * Now that we have prevented any NMIs from storing
273 * to the ->rcu_flipctr_idx, we can safely use it to
274 * remember which counter to decrement in the matching
275 * rcu_read_unlock().
276 */
277
278 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
279 local_irq_restore(flags);
280 }
281}
282EXPORT_SYMBOL_GPL(__rcu_read_lock);
283
284void __rcu_read_unlock(void)
285{
286 int idx;
287 struct task_struct *t = current;
288 int nesting;
289
290 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
291 if (nesting > 1) {
292
293 /*
294 * We are still protected by the enclosing rcu_read_lock(),
295 * so simply decrement the counter.
296 */
297
298 t->rcu_read_lock_nesting = nesting - 1;
299
300 } else {
301 unsigned long flags;
302
303 /*
304 * Disable local interrupts to prevent the grace-period
305 * detection state machine from seeing us half-done.
306 * NMIs can still occur, of course, and might themselves
307 * contain rcu_read_lock() and rcu_read_unlock().
308 */
309
310 local_irq_save(flags);
311
312 /*
313 * Outermost nesting of rcu_read_unlock(), so we must
314 * decrement the current counter for the current CPU.
315 * This must be done carefully, because NMIs can
316 * occur at any point in this code, and any rcu_read_lock()
317 * and rcu_read_unlock() pairs in the NMI handlers
318 * must interact non-destructively with this code.
319 * Lots of volatile casts, and -very- careful ordering.
320 *
321 * Changes to this code, including this one, must be
322 * inspected, validated, and tested extremely carefully!!!
323 */
324
325 /*
326 * First, pick up the index.
327 */
328
329 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
330
331 /*
332 * Now that we have fetched the counter index, it is
333 * safe to decrement the per-task RCU nesting counter.
334 * After this, any interrupts or NMIs will increment and
335 * decrement the per-CPU counters.
336 */
337 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
338
339 /*
340 * Now that this task's nesting count has been decremented,
341 * NMIs that occur after this statement will route their
342 * rcu_read_lock() calls through this "else" clause, and
343 * will thus start incrementing the per-CPU counter on
344 * their own. They will also clobber ->rcu_flipctr_idx,
345 * but that is OK, since we have already fetched it.
346 */
347
348 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
349 local_irq_restore(flags);
350 }
351}
352EXPORT_SYMBOL_GPL(__rcu_read_unlock);
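
The interplay of the two functions above can be modelled in a few lines of user-space C: the index of the counter pair is sampled once, at the outermost rcu_read_lock(), and reused at the matching outermost rcu_read_unlock(), even if the grace-period machinery flips ->completed in between. This single-threaded sketch ignores per-CPU placement, interrupts and NMIs; all toy_* names are hypothetical.

#include <stdio.h>

static long completed;		/* grace-period counter; low bit selects the index */
static long flipctr[2];		/* one CPU's rcu_flipctr[] pair */
static int nesting;		/* stands in for t->rcu_read_lock_nesting */
static int saved_idx;		/* stands in for t->rcu_flipctr_idx */

static void toy_read_lock(void)
{
	if (nesting++ == 0) {
		saved_idx = completed & 0x1;	/* sample the index once */
		flipctr[saved_idx]++;
	}
}

static void toy_read_unlock(void)
{
	if (--nesting == 0)
		flipctr[saved_idx]--;	/* same counter, even after a flip */
}

int main(void)
{
	toy_read_lock();	/* outermost: takes flipctr[0] */
	completed++;		/* the grace-period machinery flips the index */
	toy_read_lock();	/* nested: only the nesting count changes */
	toy_read_unlock();
	toy_read_unlock();	/* releases flipctr[0], not flipctr[1] */
	printf("flipctr = { %ld, %ld }\n", flipctr[0], flipctr[1]);
	return 0;
}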
353
354/*
355 * If a global counter flip has occurred since the last time that we
356 * advanced callbacks, advance them. Hardware interrupts must be
357 * disabled when calling this function.
358 */
359static void __rcu_advance_callbacks(struct rcu_data *rdp)
360{
361 int cpu;
362 int i;
363 int wlc = 0;
364
365 if (rdp->completed != rcu_ctrlblk.completed) {
366 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
367 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
368 rdp->donetail = rdp->waittail[GP_STAGES - 1];
369 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
370 }
371 for (i = GP_STAGES - 2; i >= 0; i--) {
372 if (rdp->waitlist[i] != NULL) {
373 rdp->waitlist[i + 1] = rdp->waitlist[i];
374 rdp->waittail[i + 1] = rdp->waittail[i];
375 wlc++;
376 } else {
377 rdp->waitlist[i + 1] = NULL;
378 rdp->waittail[i + 1] =
379 &rdp->waitlist[i + 1];
380 }
381 }
382 if (rdp->nextlist != NULL) {
383 rdp->waitlist[0] = rdp->nextlist;
384 rdp->waittail[0] = rdp->nexttail;
385 wlc++;
386 rdp->nextlist = NULL;
387 rdp->nexttail = &rdp->nextlist;
388 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
389 } else {
390 rdp->waitlist[0] = NULL;
391 rdp->waittail[0] = &rdp->waitlist[0];
392 }
393 rdp->waitlistcount = wlc;
394 rdp->completed = rcu_ctrlblk.completed;
395 }
396
397 /*
398 * Check to see if this CPU needs to report that it has seen
399 * the most recent counter flip, thereby declaring that all
400 * subsequent rcu_read_lock() invocations will respect this flip.
401 */
402
403 cpu = raw_smp_processor_id();
404 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
405 smp_mb(); /* Subsequent counter accesses must see new value */
406 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
407 smp_mb(); /* Subsequent RCU read-side critical sections */
408 /* seen -after- acknowledgement. */
409 }
410}
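
The stage shifting above is the preemptible counterpart of the classic nxtlist/curlist/donelist pipeline, except that a callback must ride through all GP_STAGES wait stages before it is invoked. A toy model that only tracks how many callbacks sit at each stage makes the movement visible; the toy_* names are hypothetical and locking and tracing are omitted.

#include <stdio.h>
#include <string.h>

#define GP_STAGES 2

/* Only the *counts* of callbacks at each stage are modelled here. */
struct toy_rdp {
	long completed;
	int next, wait[GP_STAGES], done;
};

/* Mirror of the advancement above: one stage per counter flip observed. */
static void toy_advance(struct toy_rdp *rdp, long global_completed)
{
	if (rdp->completed != global_completed) {
		rdp->done += rdp->wait[GP_STAGES - 1];
		memmove(&rdp->wait[1], &rdp->wait[0],
			(GP_STAGES - 1) * sizeof(rdp->wait[0]));
		rdp->wait[0] = rdp->next;
		rdp->next = 0;
		rdp->completed = global_completed;
	}
}

int main(void)
{
	struct toy_rdp rdp = { .completed = 0, .next = 3 };	/* 3 queued callbacks */
	long gp;

	for (gp = 1; gp <= GP_STAGES + 1; gp++) {
		toy_advance(&rdp, gp);
		printf("after flip %ld: next=%d wait={%d,%d} done=%d\n",
		       gp, rdp.next, rdp.wait[0], rdp.wait[1], rdp.done);
	}
	return 0;
}

The three queued callbacks only reach the done count once they have passed through both wait stages, which is the sense in which a grace period spans at least GP_STAGES consecutive flips.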
411
412/*
413 * Get here when RCU is idle. Decide whether we need to
414 * move out of idle state, and return non-zero if so.
415 * "Straightforward" approach for the moment, might later
416 * use callback-list lengths, grace-period duration, or
417 * some such to determine when to exit idle state.
418 * Might also need a pre-idle test that does not acquire
419 * the lock, but let's get the simple case working first...
420 */
421
422static int
423rcu_try_flip_idle(void)
424{
425 int cpu;
426
427 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
428 if (!rcu_pending(smp_processor_id())) {
429 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
430 return 0;
431 }
432
433 /*
434 * Do the flip.
435 */
436
437 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
438 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
439
440 /*
441 * Need a memory barrier so that other CPUs see the new
442 * counter value before they see the subsequent change of all
443 * the rcu_flip_flag instances to rcu_flipped.
444 */
445
446 smp_mb(); /* see above block comment. */
447
448 /* Now ask each CPU for acknowledgement of the flip. */
449
450 for_each_cpu_mask(cpu, rcu_cpu_online_map)
451 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
452
453 return 1;
454}
455
456/*
457 * Wait for CPUs to acknowledge the flip.
458 */
459
460static int
461rcu_try_flip_waitack(void)
462{
463 int cpu;
464
465 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
466 for_each_cpu_mask(cpu, rcu_cpu_online_map)
467 if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
468 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
469 return 0;
470 }
471
472 /*
473 * Make sure our checks above don't bleed into subsequent
474 * waiting for the sum of the counters to reach zero.
475 */
476
477 smp_mb(); /* see above block comment. */
478 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
479 return 1;
480}
481
482/*
483 * Wait for collective ``last'' counter to reach zero,
484 * then tell all CPUs to do an end-of-grace-period memory barrier.
485 */
486
487static int
488rcu_try_flip_waitzero(void)
489{
490 int cpu;
491 int lastidx = !(rcu_ctrlblk.completed & 0x1);
492 int sum = 0;
493
494 /* Check to see if the sum of the "last" counters is zero. */
495
496 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
497 for_each_cpu_mask(cpu, rcu_cpu_online_map)
498 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
499 if (sum != 0) {
500 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
501 return 0;
502 }
503
504 /*
505 * This ensures that the other CPUs see the call for
506 * memory barriers -after- the sum has been observed to reach
507 * zero here
508 */
509 smp_mb(); /* ^^^^^^^^^^^^ */
510
511 /* Call for a memory barrier from each CPU. */
512 for_each_cpu_mask(cpu, rcu_cpu_online_map)
513 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
514
515 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
516 return 1;
517}
518
519/*
520 * Wait for all CPUs to do their end-of-grace-period memory barrier.
521 * Return 1 once all CPUs have done so, otherwise return 0.
522 */
523
524static int
525rcu_try_flip_waitmb(void)
526{
527 int cpu;
528
529 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
530 for_each_cpu_mask(cpu, rcu_cpu_online_map)
531 if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
532 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
533 return 0;
534 }
535
536 smp_mb(); /* Ensure that the above checks precede any following flip. */
537 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
538 return 1;
539}
540
541/*
542 * Attempt a single flip of the counters. Remember, a single flip does
543 * -not- constitute a grace period. Instead, the interval between
544 * at least GP_STAGES consecutive flips is a grace period.
545 *
546 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
547 * on a large SMP, they might want to use a hierarchical organization of
548 * the per-CPU-counter pairs.
549 */
550static void rcu_try_flip(void)
551{
552 unsigned long flags;
553
554 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
555 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
556 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
557 return;
558 }
559
560 /*
561 * Take the next transition(s) through the RCU grace-period
562 * flip-counter state machine.
563 */
564
565 switch (rcu_ctrlblk.rcu_try_flip_state) {
566 case rcu_try_flip_idle_state:
567 if (rcu_try_flip_idle())
568 rcu_ctrlblk.rcu_try_flip_state =
569 rcu_try_flip_waitack_state;
570 break;
571 case rcu_try_flip_waitack_state:
572 if (rcu_try_flip_waitack())
573 rcu_ctrlblk.rcu_try_flip_state =
574 rcu_try_flip_waitzero_state;
575 break;
576 case rcu_try_flip_waitzero_state:
577 if (rcu_try_flip_waitzero())
578 rcu_ctrlblk.rcu_try_flip_state =
579 rcu_try_flip_waitmb_state;
580 break;
581 case rcu_try_flip_waitmb_state:
582 if (rcu_try_flip_waitmb())
583 rcu_ctrlblk.rcu_try_flip_state =
584 rcu_try_flip_idle_state;
585 }
586 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
587}
588
589/*
590 * Check to see if this CPU needs to do a memory barrier in order to
591 * ensure that any prior RCU read-side critical sections have committed
592 * their counter manipulations and critical-section memory references
593 * before declaring the grace period to be completed.
594 */
595static void rcu_check_mb(int cpu)
596{
597 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
598 smp_mb(); /* Ensure RCU read-side accesses are visible. */
599 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
600 }
601}
602
603void rcu_check_callbacks(int cpu, int user)
604{
605 unsigned long flags;
606 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
607
608 rcu_check_mb(cpu);
609 if (rcu_ctrlblk.completed == rdp->completed)
610 rcu_try_flip();
611 spin_lock_irqsave(&rdp->lock, flags);
612 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
613 __rcu_advance_callbacks(rdp);
614 if (rdp->donelist == NULL) {
615 spin_unlock_irqrestore(&rdp->lock, flags);
616 } else {
617 spin_unlock_irqrestore(&rdp->lock, flags);
618 raise_softirq(RCU_SOFTIRQ);
619 }
620}
621
622/*
623 * Needed by dynticks, to make sure all RCU processing has finished
624 * when we go idle:
625 */
626void rcu_advance_callbacks(int cpu, int user)
627{
628 unsigned long flags;
629 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
630
631 if (rcu_ctrlblk.completed == rdp->completed) {
632 rcu_try_flip();
633 if (rcu_ctrlblk.completed == rdp->completed)
634 return;
635 }
636 spin_lock_irqsave(&rdp->lock, flags);
637 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
638 __rcu_advance_callbacks(rdp);
639 spin_unlock_irqrestore(&rdp->lock, flags);
640}
641
642#ifdef CONFIG_HOTPLUG_CPU
643#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
644 *dsttail = srclist; \
645 if (srclist != NULL) { \
646 dsttail = srctail; \
647 srclist = NULL; \
648 srctail = &srclist;\
649 } \
650 } while (0)
651
652void rcu_offline_cpu(int cpu)
653{
654 int i;
655 struct rcu_head *list = NULL;
656 unsigned long flags;
657 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
658 struct rcu_head **tail = &list;
659
660 /*
661 * Remove all callbacks from the newly dead CPU, retaining order.
662 * Otherwise rcu_barrier() will fail
663 */
664
665 spin_lock_irqsave(&rdp->lock, flags);
666 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
667 for (i = GP_STAGES - 1; i >= 0; i--)
668 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
669 list, tail);
670 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
671 spin_unlock_irqrestore(&rdp->lock, flags);
672 rdp->waitlistcount = 0;
673
674 /* Disengage the newly dead CPU from the grace-period computation. */
675
676 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
677 rcu_check_mb(cpu);
678 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
679 smp_mb(); /* Subsequent counter accesses must see new value */
680 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
681 smp_mb(); /* Subsequent RCU read-side critical sections */
682 /* seen -after- acknowledgement. */
683 }
684
685 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
686 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
687
688 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
689 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
690
691 cpu_clear(cpu, rcu_cpu_online_map);
692
693 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
694
695 /*
696 * Place the removed callbacks on the current CPU's queue.
697 * Make them all start a new grace period: simple approach,
698 * in theory could starve a given set of callbacks, but
699 * you would need to be doing some serious CPU hotplugging
700 * to make this happen. If this becomes a problem, adding
701 * a synchronize_rcu() to the hotplug path would be a simple
702 * fix.
703 */
704
705 rdp = RCU_DATA_ME();
706 spin_lock_irqsave(&rdp->lock, flags);
707 *rdp->nexttail = list;
708 if (list)
709 rdp->nexttail = tail;
710 spin_unlock_irqrestore(&rdp->lock, flags);
711}
712
713void __devinit rcu_online_cpu(int cpu)
714{
715 unsigned long flags;
716
717 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
718 cpu_set(cpu, rcu_cpu_online_map);
719 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
720}
721
722#else /* #ifdef CONFIG_HOTPLUG_CPU */
723
724void rcu_offline_cpu(int cpu)
725{
726}
727
728void __devinit rcu_online_cpu(int cpu)
729{
730}
731
732#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
733
734static void rcu_process_callbacks(struct softirq_action *unused)
735{
736 unsigned long flags;
737 struct rcu_head *next, *list;
738 struct rcu_data *rdp = RCU_DATA_ME();
739
740 spin_lock_irqsave(&rdp->lock, flags);
741 list = rdp->donelist;
742 if (list == NULL) {
743 spin_unlock_irqrestore(&rdp->lock, flags);
744 return;
745 }
746 rdp->donelist = NULL;
747 rdp->donetail = &rdp->donelist;
748 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
749 spin_unlock_irqrestore(&rdp->lock, flags);
750 while (list) {
751 next = list->next;
752 list->func(list);
753 list = next;
754 RCU_TRACE_ME(rcupreempt_trace_invoke);
755 }
756}
757
758void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
759{
760 unsigned long flags;
761 struct rcu_data *rdp;
762
763 head->func = func;
764 head->next = NULL;
765 local_irq_save(flags);
766 rdp = RCU_DATA_ME();
767 spin_lock(&rdp->lock);
768 __rcu_advance_callbacks(rdp);
769 *rdp->nexttail = head;
770 rdp->nexttail = &head->next;
771 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
772 spin_unlock(&rdp->lock);
773 local_irq_restore(flags);
774}
775EXPORT_SYMBOL_GPL(call_rcu);
776
777/*
778 * Wait until all currently running preempt_disable() code segments
779 * (including hardware-irq-disable segments) complete. Note that
780 * in -rt this does -not- necessarily result in all currently executing
781 * interrupt -handlers- having completed.
782 */
783void __synchronize_sched(void)
784{
785 cpumask_t oldmask;
786 int cpu;
787
788 if (sched_getaffinity(0, &oldmask) < 0)
789 oldmask = cpu_possible_map;
790 for_each_online_cpu(cpu) {
791 sched_setaffinity(0, cpumask_of_cpu(cpu));
792 schedule();
793 }
794 sched_setaffinity(0, oldmask);
795}
796EXPORT_SYMBOL_GPL(__synchronize_sched);
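
The trick used by __synchronize_sched() above -- binding itself to each online CPU in turn so that every CPU is forced through at least one context switch -- has a direct user-space analogue built on the sched_setaffinity(2) system call. The sketch below only mirrors the mechanism; it does not provide the kernel's preempt_disable()-based guarantee, and it simply skips CPUs it cannot bind to (for example, offline ones).

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

/* Hop the calling thread across every CPU that was online at boot. */
static int visit_every_cpu(void)
{
	cpu_set_t old, one;
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	int cpu;

	if (sched_getaffinity(0, sizeof(old), &old) != 0)
		return -1;
	for (cpu = 0; cpu < ncpus; cpu++) {
		CPU_ZERO(&one);
		CPU_SET(cpu, &one);
		if (sched_setaffinity(0, sizeof(one), &one) != 0)
			continue;	/* CPU unavailable; skip it */
		sched_yield();		/* ensure we actually ran there */
	}
	return sched_setaffinity(0, sizeof(old), &old);
}

int main(void)
{
	printf("visited all CPUs: %s\n", visit_every_cpu() == 0 ? "yes" : "no");
	return 0;
}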
797
798/*
799 * Check to see if any future RCU-related work will need to be done
800 * by the current CPU, even if none need be done immediately, returning
801 * 1 if so. Assumes that notifiers would take care of handling any
802 * outstanding requests from the RCU core.
803 *
804 * This function is part of the RCU implementation; it is -not-
805 * an exported member of the RCU API.
806 */
807int rcu_needs_cpu(int cpu)
808{
809 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
810
811 return (rdp->donelist != NULL ||
812 !!rdp->waitlistcount ||
813 rdp->nextlist != NULL);
814}
815
816int rcu_pending(int cpu)
817{
818 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
819
820 /* The CPU has at least one callback queued somewhere. */
821
822 if (rdp->donelist != NULL ||
823 !!rdp->waitlistcount ||
824 rdp->nextlist != NULL)
825 return 1;
826
827 /* The RCU core needs an acknowledgement from this CPU. */
828
829 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
830 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
831 return 1;
832
833 /* This CPU has fallen behind the global grace-period number. */
834
835 if (rdp->completed != rcu_ctrlblk.completed)
836 return 1;
837
838 /* Nothing needed from this CPU. */
839
840 return 0;
841}
842
843static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
844 unsigned long action, void *hcpu)
845{
846 long cpu = (long)hcpu;
847
848 switch (action) {
849 case CPU_UP_PREPARE:
850 case CPU_UP_PREPARE_FROZEN:
851 rcu_online_cpu(cpu);
852 break;
853 case CPU_UP_CANCELED:
854 case CPU_UP_CANCELED_FROZEN:
855 case CPU_DEAD:
856 case CPU_DEAD_FROZEN:
857 rcu_offline_cpu(cpu);
858 break;
859 default:
860 break;
861 }
862 return NOTIFY_OK;
863}
864
865static struct notifier_block __cpuinitdata rcu_nb = {
866 .notifier_call = rcu_cpu_notify,
867};
868
869void __init __rcu_init(void)
870{
871 int cpu;
872 int i;
873 struct rcu_data *rdp;
874
875 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
876 for_each_possible_cpu(cpu) {
877 rdp = RCU_DATA_CPU(cpu);
878 spin_lock_init(&rdp->lock);
879 rdp->completed = 0;
880 rdp->waitlistcount = 0;
881 rdp->nextlist = NULL;
882 rdp->nexttail = &rdp->nextlist;
883 for (i = 0; i < GP_STAGES; i++) {
884 rdp->waitlist[i] = NULL;
885 rdp->waittail[i] = &rdp->waitlist[i];
886 }
887 rdp->donelist = NULL;
888 rdp->donetail = &rdp->donelist;
889 rdp->rcu_flipctr[0] = 0;
890 rdp->rcu_flipctr[1] = 0;
891 }
892 register_cpu_notifier(&rcu_nb);
893
894 /*
895 * We don't need protection against CPU-Hotplug here
896 * since
897 * a) If a CPU comes online while we are iterating over the
898 * cpu_online_map below, we would only end up making a
899 * duplicate call to rcu_online_cpu() which sets the corresponding
900 * CPU's bit in rcu_cpu_online_map.
901 *
902 * b) A CPU cannot go offline at this point in time since the user
903 * does not have access to the sysfs interface, nor do we
904 * suspend the system.
905 */
906 for_each_online_cpu(cpu)
907 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
908
909 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
910}
911
912/*
913 * Deprecated; use synchronize_rcu() or synchronize_sched() instead.
914 */
915void synchronize_kernel(void)
916{
917 synchronize_rcu();
918}
919
920#ifdef CONFIG_RCU_TRACE
921long *rcupreempt_flipctr(int cpu)
922{
923 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
924}
925EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
926
927int rcupreempt_flip_flag(int cpu)
928{
929 return per_cpu(rcu_flip_flag, cpu);
930}
931EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
932
933int rcupreempt_mb_flag(int cpu)
934{
935 return per_cpu(rcu_mb_flag, cpu);
936}
937EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
938
939char *rcupreempt_try_flip_state_name(void)
940{
941 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
942}
943EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
944
945struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
946{
947 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
948
949 return &rdp->trace;
950}
951EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
952
953#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
new file mode 100644
index 000000000000..49ac4947af24
--- /dev/null
+++ b/kernel/rcupreempt_trace.c
@@ -0,0 +1,330 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h>
43#include <linux/mutex.h>
44#include <linux/rcupreempt_trace.h>
45#include <linux/debugfs.h>
46
47static struct mutex rcupreempt_trace_mutex;
48static char *rcupreempt_trace_buf;
49#define RCUPREEMPT_TRACE_BUF_SIZE 4096
50
51void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
52{
53 trace->done_length += trace->wait_length;
54 trace->done_add += trace->wait_length;
55 trace->wait_length = 0;
56}
57void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
58{
59 trace->wait_length += trace->next_length;
60 trace->wait_add += trace->next_length;
61 trace->next_length = 0;
62}
63void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
64{
65 atomic_inc(&trace->rcu_try_flip_1);
66}
67void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
68{
69 atomic_inc(&trace->rcu_try_flip_e1);
70}
71void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
72{
73 trace->rcu_try_flip_i1++;
74}
75void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
76{
77 trace->rcu_try_flip_ie1++;
78}
79void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
80{
81 trace->rcu_try_flip_g1++;
82}
83void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
84{
85 trace->rcu_try_flip_a1++;
86}
87void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
88{
89 trace->rcu_try_flip_ae1++;
90}
91void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
92{
93 trace->rcu_try_flip_a2++;
94}
95void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
96{
97 trace->rcu_try_flip_z1++;
98}
99void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
100{
101 trace->rcu_try_flip_ze1++;
102}
103void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
104{
105 trace->rcu_try_flip_z2++;
106}
107void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
108{
109 trace->rcu_try_flip_m1++;
110}
111void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
112{
113 trace->rcu_try_flip_me1++;
114}
115void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
116{
117 trace->rcu_try_flip_m2++;
118}
119void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
120{
121 trace->rcu_check_callbacks++;
122}
123void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
124{
125 trace->done_remove += trace->done_length;
126 trace->done_length = 0;
127}
128void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
129{
130 atomic_inc(&trace->done_invoked);
131}
132void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
133{
134 trace->next_add++;
135 trace->next_length++;
136}
137
138static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
139{
140 struct rcupreempt_trace *cp;
141 int cpu;
142
143 memset(sp, 0, sizeof(*sp));
144 for_each_possible_cpu(cpu) {
145 cp = rcupreempt_trace_cpu(cpu);
146 sp->next_length += cp->next_length;
147 sp->next_add += cp->next_add;
148 sp->wait_length += cp->wait_length;
149 sp->wait_add += cp->wait_add;
150 sp->done_length += cp->done_length;
151 sp->done_add += cp->done_add;
152 sp->done_remove += cp->done_remove;
153 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
154 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
155 atomic_set(&sp->rcu_try_flip_1,
156 atomic_read(&cp->rcu_try_flip_1));
157 atomic_set(&sp->rcu_try_flip_e1,
158 atomic_read(&cp->rcu_try_flip_e1));
159 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
160 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
161 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
162 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
163 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
164 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
165 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
166 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
167 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
168 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
169 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
170 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
171 }
172}
173
174static ssize_t rcustats_read(struct file *filp, char __user *buffer,
175 size_t count, loff_t *ppos)
176{
177 struct rcupreempt_trace trace;
178 ssize_t bcount;
179 int cnt = 0;
180
181 rcupreempt_trace_sum(&trace);
182 mutex_lock(&rcupreempt_trace_mutex);
183 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
184 "ggp=%ld rcc=%ld\n",
185 rcu_batches_completed(),
186 trace.rcu_check_callbacks);
187 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
188 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
189 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
190 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
191
192 trace.next_add, trace.next_length,
193 trace.wait_add, trace.wait_length,
194 trace.done_add, trace.done_length,
195 trace.done_remove, atomic_read(&trace.done_invoked),
196 atomic_read(&trace.rcu_try_flip_1),
197 atomic_read(&trace.rcu_try_flip_e1),
198 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
199 trace.rcu_try_flip_g1,
200 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
201 trace.rcu_try_flip_a2,
202 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
203 trace.rcu_try_flip_z2,
204 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
205 trace.rcu_try_flip_m2);
206 bcount = simple_read_from_buffer(buffer, count, ppos,
207 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
208 mutex_unlock(&rcupreempt_trace_mutex);
209 return bcount;
210}
211
212static ssize_t rcugp_read(struct file *filp, char __user *buffer,
213 size_t count, loff_t *ppos)
214{
215 long oldgp = rcu_batches_completed();
216 ssize_t bcount;
217
218 mutex_lock(&rcupreempt_trace_mutex);
219 synchronize_rcu();
220 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
221 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
222 bcount = simple_read_from_buffer(buffer, count, ppos,
223 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
224 mutex_unlock(&rcupreempt_trace_mutex);
225 return bcount;
226}
227
228static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
229 size_t count, loff_t *ppos)
230{
231 int cnt = 0;
232 int cpu;
233 int f = rcu_batches_completed() & 0x1;
234 ssize_t bcount;
235
236 mutex_lock(&rcupreempt_trace_mutex);
237
238 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
239 "CPU last cur F M\n");
240 for_each_online_cpu(cpu) {
241 long *flipctr = rcupreempt_flipctr(cpu);
242 cnt += snprintf(&rcupreempt_trace_buf[cnt],
243 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
244 "%3d %4ld %3ld %d %d\n",
245 cpu,
246 flipctr[!f],
247 flipctr[f],
248 rcupreempt_flip_flag(cpu),
249 rcupreempt_mb_flag(cpu));
250 }
251 cnt += snprintf(&rcupreempt_trace_buf[cnt],
252 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
253 "ggp = %ld, state = %s\n",
254 rcu_batches_completed(),
255 rcupreempt_try_flip_state_name());
256 cnt += snprintf(&rcupreempt_trace_buf[cnt],
257 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
258 "\n");
259 bcount = simple_read_from_buffer(buffer, count, ppos,
260 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
261 mutex_unlock(&rcupreempt_trace_mutex);
262 return bcount;
263}
264
265static struct file_operations rcustats_fops = {
266 .owner = THIS_MODULE,
267 .read = rcustats_read,
268};
269
270static struct file_operations rcugp_fops = {
271 .owner = THIS_MODULE,
272 .read = rcugp_read,
273};
274
275static struct file_operations rcuctrs_fops = {
276 .owner = THIS_MODULE,
277 .read = rcuctrs_read,
278};
279
280static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
281static int rcupreempt_debugfs_init(void)
282{
283 rcudir = debugfs_create_dir("rcu", NULL);
284 if (!rcudir)
285 goto out;
286 statdir = debugfs_create_file("rcustats", 0444, rcudir,
287 NULL, &rcustats_fops);
288 if (!statdir)
289 goto free_out;
290
291 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
292 if (!gpdir)
293 goto free_out;
294
295 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
296 NULL, &rcuctrs_fops);
297 if (!ctrsdir)
298 goto free_out;
299 return 0;
300free_out:
301 if (statdir)
302 debugfs_remove(statdir);
303 if (gpdir)
304 debugfs_remove(gpdir);
305 debugfs_remove(rcudir);
306out:
307 return 1;
308}
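
With CONFIG_RCU_TRACE=y and debugfs mounted (conventionally at /sys/kernel/debug), the three files created above can simply be read as text. A small user-space reader, assuming that mount point:

#include <stdio.h>

int main(void)
{
	static const char *files[] = {
		"/sys/kernel/debug/rcu/rcustats",
		"/sys/kernel/debug/rcu/rcugp",
		"/sys/kernel/debug/rcu/rcuctrs",
	};
	char line[256];
	unsigned int i;

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		FILE *f = fopen(files[i], "r");

		if (!f) {
			perror(files[i]);
			continue;
		}
		printf("==> %s <==\n", files[i]);
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}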
309
310static int __init rcupreempt_trace_init(void)
311{
312 mutex_init(&rcupreempt_trace_mutex);
313 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
314 if (!rcupreempt_trace_buf)
315 return 1;
316 return rcupreempt_debugfs_init();
317}
318
319static void __exit rcupreempt_trace_cleanup(void)
320{
321 debugfs_remove(statdir);
322 debugfs_remove(gpdir);
323 debugfs_remove(ctrsdir);
324 debugfs_remove(rcudir);
325 kfree(rcupreempt_trace_buf);
326}
327
328
329module_init(rcupreempt_trace_init);
330module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c3e165c2318f..fd599829e72a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
726 cpumask_t tmp_mask = CPU_MASK_ALL; 726 cpumask_t tmp_mask = CPU_MASK_ALL;
727 int i; 727 int i;
728 728
729 lock_cpu_hotplug(); 729 get_online_cpus();
730 730
731 /* No point in shuffling if there is only one online CPU (ex: UP) */ 731 /* No point in shuffling if there is only one online CPU (ex: UP) */
732 if (num_online_cpus() == 1) { 732 if (num_online_cpus() == 1) {
733 unlock_cpu_hotplug(); 733 put_online_cpus();
734 return; 734 return;
735 } 735 }
736 736
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
762 else 762 else
763 rcu_idle_cpu--; 763 rcu_idle_cpu--;
764 764
765 unlock_cpu_hotplug(); 765 put_online_cpus();
766} 766}
767 767
768/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 768/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index e3055ba69159..092e4c620af9 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
394static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 394static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
395 395
396static struct sysdev_class rttest_sysclass = { 396static struct sysdev_class rttest_sysclass = {
397 set_kset_name("rttest"), 397 .name = "rttest",
398}; 398};
399 399
400static int init_test_thread(int id) 400static int init_test_thread(int id)
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 1ec620c03064..cae050b05f5e 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -6,6 +6,7 @@
6 6
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/sched.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/rwsem.h> 11#include <linux/rwsem.h>
11 12
@@ -15,7 +16,7 @@
15/* 16/*
16 * lock for reading 17 * lock for reading
17 */ 18 */
18void down_read(struct rw_semaphore *sem) 19void __sched down_read(struct rw_semaphore *sem)
19{ 20{
20 might_sleep(); 21 might_sleep();
21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 22 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -42,7 +43,7 @@ EXPORT_SYMBOL(down_read_trylock);
42/* 43/*
43 * lock for writing 44 * lock for writing
44 */ 45 */
45void down_write(struct rw_semaphore *sem) 46void __sched down_write(struct rw_semaphore *sem)
46{ 47{
47 might_sleep(); 48 might_sleep();
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 49 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
diff --git a/kernel/sched.c b/kernel/sched.c
index d2f77fab0f46..9474b23c28bf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -22,6 +22,8 @@
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
25 */ 27 */
26 28
27#include <linux/mm.h> 29#include <linux/mm.h>
@@ -63,6 +65,7 @@
63#include <linux/reciprocal_div.h> 65#include <linux/reciprocal_div.h>
64#include <linux/unistd.h> 66#include <linux/unistd.h>
65#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
66 69
67#include <asm/tlb.h> 70#include <asm/tlb.h>
68#include <asm/irq_regs.h> 71#include <asm/irq_regs.h>
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
96#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 99#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
97 100
98/* 101/*
99 * Some helpers for converting nanosecond timing to jiffy resolution 102 * Helpers for converting nanosecond timing to jiffy resolution
100 */ 103 */
101#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 104#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
102#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ))
103 105
104#define NICE_0_LOAD SCHED_LOAD_SCALE 106#define NICE_0_LOAD SCHED_LOAD_SCALE
105#define NICE_0_SHIFT SCHED_LOAD_SHIFT 107#define NICE_0_SHIFT SCHED_LOAD_SHIFT
@@ -159,6 +161,8 @@ struct rt_prio_array {
159 161
160struct cfs_rq; 162struct cfs_rq;
161 163
164static LIST_HEAD(task_groups);
165
162/* task group related information */ 166/* task group related information */
163struct task_group { 167struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -168,10 +172,50 @@ struct task_group {
168 struct sched_entity **se; 172 struct sched_entity **se;
169 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
170 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
175
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /*
182 * shares assigned to a task group governs how much of cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more is
184 * the cpu bandwidth allocated to it.
185 *
186 * For ex, lets say that there are three task groups, A, B and C which
187 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
188 * cpu bandwidth allocated by the scheduler to task groups A, B and C
189 * should be:
190 *
191 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
192 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
193 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
194 *
195 * The weight assigned to a task group's schedulable entities on every
196 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
197 * group's shares. For ex: lets say that task group A has been
198 * assigned shares of 1000 and there are two CPUs in a system. Then,
199 *
200 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
201 *
202 * Note: It's not necessary that each of a task's group schedulable
203 * entity have the same weight on all CPUs. If the group
204 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
205 * better distribution of weight could be:
206 *
207 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
208 * tg_A->se[1]->load.weight = 1/2 * 2000 = 667
209 *
210 * rebalance_shares() is responsible for distributing the shares of a
211 * task groups like this among the group's schedulable entities across
212 * cpus.
213 *
214 */
171 unsigned long shares; 215 unsigned long shares;
172 /* spinlock to serialize modification to shares */ 216
173 spinlock_t lock;
174 struct rcu_head rcu; 217 struct rcu_head rcu;
218 struct list_head list;
175}; 219};
176 220
177/* Default task group's sched entity on each cpu */ 221/* Default task group's sched entity on each cpu */
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
179/* Default task group's cfs_rq on each cpu */ 223/* Default task group's cfs_rq on each cpu */
180static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
181 225
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
182static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229static struct sched_entity *init_sched_entity_p[NR_CPUS];
183static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
184 231
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS];
234
235/* task_group_mutex serializes add/remove of task groups and also changes to
236 * a task group's cpu shares.
237 */
238static DEFINE_MUTEX(task_group_mutex);
239
240/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex);
242
243#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task;
246static int load_balance_monitor(void *unused);
247#endif
248
249static void set_se_shares(struct sched_entity *se, unsigned long shares);
250
185/* Default task group. 251/* Default task group.
186 * Every task in system belong to this group at bootup. 252 * Every task in system belong to this group at bootup.
187 */ 253 */
188struct task_group init_task_group = { 254struct task_group init_task_group = {
189 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
190 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257
258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p,
191}; 260};
192 261
193#ifdef CONFIG_FAIR_USER_SCHED 262#ifdef CONFIG_FAIR_USER_SCHED
194# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD 263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
195#else 264#else
196# define INIT_TASK_GRP_LOAD NICE_0_LOAD 265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
197#endif 266#endif
198 267
199static int init_task_group_load = INIT_TASK_GRP_LOAD; 268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
200 271
201/* return group to which a task belongs */ 272/* return group to which a task belongs */
202static inline struct task_group *task_group(struct task_struct *p) 273static inline struct task_group *task_group(struct task_struct *p)
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p)
215} 286}
216 287
217/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
218static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) 289static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
219{ 290{
220 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
221 p->se.parent = task_group(p)->se[cpu]; 292 p->se.parent = task_group(p)->se[cpu];
293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu];
296}
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306}
307
308static inline void lock_doms_cur(void)
309{
310 mutex_lock(&doms_cur_mutex);
311}
312
313static inline void unlock_doms_cur(void)
314{
315 mutex_unlock(&doms_cur_mutex);
222} 316}
223 317
224#else 318#else
225 319
226static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } 320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { }
227 325
228#endif /* CONFIG_FAIR_GROUP_SCHED */ 326#endif /* CONFIG_FAIR_GROUP_SCHED */
229 327
@@ -264,11 +362,57 @@ struct cfs_rq {
264/* Real-Time classes' related field in a runqueue: */ 362/* Real-Time classes' related field in a runqueue: */
265struct rt_rq { 363struct rt_rq {
266 struct rt_prio_array active; 364 struct rt_prio_array active;
267 int rt_load_balance_idx; 365 unsigned long rt_nr_running;
268 struct list_head *rt_load_balance_head, *rt_load_balance_curr; 366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */
368#endif
369#ifdef CONFIG_SMP
370 unsigned long rt_nr_migratory;
371 int overloaded;
372#endif
373 int rt_throttled;
374 u64 rt_time;
375
376#ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq;
378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg;
380 struct sched_rt_entity *rt_se;
381#endif
382};
383
384#ifdef CONFIG_SMP
385
386/*
387 * We add the notion of a root-domain which will be used to define per-domain
388 * variables. Each exclusive cpuset essentially defines an island domain by
389 * fully partitioning the member cpus from any other cpuset. Whenever a new
390 * exclusive cpuset is created, we also create and attach a new root-domain
391 * object.
392 *
393 */
394struct root_domain {
395 atomic_t refcount;
396 cpumask_t span;
397 cpumask_t online;
398
399 /*
400 * The "RT overload" flag: it gets set if a CPU has more than
401 * one runnable RT task.
402 */
403 cpumask_t rto_mask;
404 atomic_t rto_count;
269}; 405};
270 406
271/* 407/*
408 * By default the system creates a single root-domain with all cpus as
409 * members (mimicking the global state we have today).
410 */
411static struct root_domain def_root_domain;
412
413#endif
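The rto_mask/rto_count pair above lets the scheduler ask "does any CPU in this root-domain hold more than one runnable RT task?" with a single counter read instead of scanning the mask. A minimal user-space sketch of that bookkeeping, using C11 atomics and a 64-bit word in place of cpumask_t (struct rt_overload and its helpers are illustrative names, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the root_domain fields: one bit per CPU plus a counter. */
struct rt_overload {
    atomic_ulong rto_mask;   /* bit i set => CPU i has >1 runnable RT task */
    atomic_int   rto_count;  /* number of bits set, for a cheap fast path  */
};

static void rt_set_overload(struct rt_overload *rd, int cpu)
{
    unsigned long bit = 1UL << cpu;

    if (!(atomic_fetch_or(&rd->rto_mask, bit) & bit))
        atomic_fetch_add(&rd->rto_count, 1);
}

static void rt_clear_overload(struct rt_overload *rd, int cpu)
{
    unsigned long bit = 1UL << cpu;

    if (atomic_fetch_and(&rd->rto_mask, ~bit) & bit)
        atomic_fetch_sub(&rd->rto_count, 1);
}

static bool rt_overloaded(struct rt_overload *rd)
{
    /* Fast path: a single counter read instead of scanning the mask. */
    return atomic_load(&rd->rto_count) != 0;
}

int main(void)
{
    struct rt_overload rd = { 0, 0 };

    rt_set_overload(&rd, 3);
    printf("overloaded: %d\n", rt_overloaded(&rd));  /* 1 */
    rt_clear_overload(&rd, 3);
    printf("overloaded: %d\n", rt_overloaded(&rd));  /* 0 */
    return 0;
}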
414
415/*
272 * This is the main, per-CPU runqueue data structure. 416 * This is the main, per-CPU runqueue data structure.
273 * 417 *
274 * Locking rule: those places that want to lock multiple runqueues 418 * Locking rule: those places that want to lock multiple runqueues
@@ -296,11 +440,15 @@ struct rq {
296 u64 nr_switches; 440 u64 nr_switches;
297 441
298 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt;
444 u64 rt_period_expire;
445 int rt_throttled;
446
299#ifdef CONFIG_FAIR_GROUP_SCHED 447#ifdef CONFIG_FAIR_GROUP_SCHED
300 /* list of leaf cfs_rq on this cpu: */ 448 /* list of leaf cfs_rq on this cpu: */
301 struct list_head leaf_cfs_rq_list; 449 struct list_head leaf_cfs_rq_list;
450 struct list_head leaf_rt_rq_list;
302#endif 451#endif
303 struct rt_rq rt;
304 452
305 /* 453 /*
306 * This is part of a global counter where only the total sum 454 * This is part of a global counter where only the total sum
@@ -317,7 +465,7 @@ struct rq {
317 u64 clock, prev_clock_raw; 465 u64 clock, prev_clock_raw;
318 s64 clock_max_delta; 466 s64 clock_max_delta;
319 467
320 unsigned int clock_warps, clock_overflows; 468 unsigned int clock_warps, clock_overflows, clock_underflows;
321 u64 idle_clock; 469 u64 idle_clock;
322 unsigned int clock_deep_idle_events; 470 unsigned int clock_deep_idle_events;
323 u64 tick_timestamp; 471 u64 tick_timestamp;
@@ -325,6 +473,7 @@ struct rq {
325 atomic_t nr_iowait; 473 atomic_t nr_iowait;
326 474
327#ifdef CONFIG_SMP 475#ifdef CONFIG_SMP
476 struct root_domain *rd;
328 struct sched_domain *sd; 477 struct sched_domain *sd;
329 478
330 /* For active balancing */ 479 /* For active balancing */
@@ -337,6 +486,12 @@ struct rq {
337 struct list_head migration_queue; 486 struct list_head migration_queue;
338#endif 487#endif
339 488
489#ifdef CONFIG_SCHED_HRTICK
490 unsigned long hrtick_flags;
491 ktime_t hrtick_expire;
492 struct hrtimer hrtick_timer;
493#endif
494
340#ifdef CONFIG_SCHEDSTATS 495#ifdef CONFIG_SCHEDSTATS
341 /* latency stats */ 496 /* latency stats */
342 struct sched_info rq_sched_info; 497 struct sched_info rq_sched_info;
@@ -363,7 +518,6 @@ struct rq {
363}; 518};
364 519
365static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 520static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
366static DEFINE_MUTEX(sched_hotcpu_mutex);
367 521
368static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 522static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
369{ 523{
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq)
441#define task_rq(p) cpu_rq(task_cpu(p)) 595#define task_rq(p) cpu_rq(task_cpu(p))
442#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 596#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
443 597
598unsigned long rt_needs_cpu(int cpu)
599{
600 struct rq *rq = cpu_rq(cpu);
601 u64 delta;
602
603 if (!rq->rt_throttled)
604 return 0;
605
606 if (rq->clock > rq->rt_period_expire)
607 return 1;
608
609 delta = rq->rt_period_expire - rq->clock;
610 do_div(delta, NSEC_PER_SEC / HZ);
611
612 return (unsigned long)delta;
613}
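rt_needs_cpu() above reports, in timer ticks, how long until the throttled RT period expires, using do_div() to convert nanoseconds into ticks. A standalone sketch of the same conversion (NSEC_PER_SEC and an assumed HZ of 250 are defined locally; ticks_until() is an illustrative name, not a kernel function):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ           250              /* assumed tick rate for the example */

/* How many ticks until 'expire', given the current clock, both in ns. */
static unsigned long ticks_until(uint64_t clock_ns, uint64_t expire_ns)
{
    uint64_t delta;

    if (clock_ns > expire_ns)
        return 1;                     /* already past: need the CPU now */

    delta = expire_ns - clock_ns;
    return (unsigned long)(delta / (NSEC_PER_SEC / HZ));
}

int main(void)
{
    /* 10 ms left until the RT period expires => 2 ticks at HZ=250. */
    printf("%lu\n", ticks_until(0, 10 * 1000 * 1000ULL));
    return 0;
}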
614
444/* 615/*
445 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 616 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
446 */ 617 */
@@ -459,6 +630,8 @@ enum {
459 SCHED_FEAT_START_DEBIT = 4, 630 SCHED_FEAT_START_DEBIT = 4,
460 SCHED_FEAT_TREE_AVG = 8, 631 SCHED_FEAT_TREE_AVG = 8,
461 SCHED_FEAT_APPROX_AVG = 16, 632 SCHED_FEAT_APPROX_AVG = 16,
633 SCHED_FEAT_HRTICK = 32,
634 SCHED_FEAT_DOUBLE_TICK = 64,
462}; 635};
463 636
464const_debug unsigned int sysctl_sched_features = 637const_debug unsigned int sysctl_sched_features =
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features =
466 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 639 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
467 SCHED_FEAT_START_DEBIT * 1 | 640 SCHED_FEAT_START_DEBIT * 1 |
468 SCHED_FEAT_TREE_AVG * 0 | 641 SCHED_FEAT_TREE_AVG * 0 |
469 SCHED_FEAT_APPROX_AVG * 0; 642 SCHED_FEAT_APPROX_AVG * 0 |
643 SCHED_FEAT_HRTICK * 1 |
644 SCHED_FEAT_DOUBLE_TICK * 0;
470 645
471#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 646#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
472 647
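The new HRTICK and DOUBLE_TICK entries extend the existing feature bitmask: each feature is a power of two, defaults are composed by multiplying each bit by 0 or 1, and sched_feat() tests the mask. A small self-contained sketch of the same pattern (flag names shortened, feat_enabled() is an illustrative macro):

#include <stdio.h>

/* Each feature is a distinct bit; defaults are composed by multiplying
 * the bit by 0 or 1, exactly as in the sysctl_sched_features initializer. */
enum {
    FEAT_NEW_FAIR_SLEEPERS = 1,
    FEAT_WAKEUP_PREEMPT    = 2,
    FEAT_START_DEBIT       = 4,
    FEAT_HRTICK            = 32,
    FEAT_DOUBLE_TICK       = 64,
};

static unsigned int features =
    FEAT_NEW_FAIR_SLEEPERS * 1 |
    FEAT_WAKEUP_PREEMPT    * 1 |
    FEAT_START_DEBIT       * 1 |
    FEAT_HRTICK            * 1 |
    FEAT_DOUBLE_TICK       * 0;

#define feat_enabled(x) ((features & (x)) != 0)

int main(void)
{
    printf("HRTICK: %d, DOUBLE_TICK: %d\n",
           feat_enabled(FEAT_HRTICK), feat_enabled(FEAT_DOUBLE_TICK));
    return 0;
}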
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features =
477const_debug unsigned int sysctl_sched_nr_migrate = 32; 652const_debug unsigned int sysctl_sched_nr_migrate = 32;
478 653
479/* 654/*
655 * period over which we measure -rt task cpu usage in ms.
656 * default: 1s
657 */
658const_debug unsigned int sysctl_sched_rt_period = 1000;
659
660#define SCHED_RT_FRAC_SHIFT 16
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
662
663/*
664 * ratio of time -rt tasks may consume.
665 * default: 95%
666 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259;
668
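sysctl_sched_rt_ratio is a 16-bit fixed-point fraction: 62259/65536 ≈ 0.95, i.e. RT tasks may consume roughly 95% of each sysctl_sched_rt_period. A sketch of that arithmetic, assuming the budget is simply period * ratio >> 16 (the exact enforcement lives in sched_rt.c and is not reproduced here):

#include <stdio.h>

#define SCHED_RT_FRAC_SHIFT 16
#define SCHED_RT_FRAC       (1UL << SCHED_RT_FRAC_SHIFT)   /* 65536 */

int main(void)
{
    unsigned int  rt_ratio  = 62259;   /* ~95% as a 16-bit fraction */
    unsigned long period_ms = 1000;    /* sysctl_sched_rt_period    */

    /* RT budget within one period: period * ratio / 2^16. */
    unsigned long budget_ms = (period_ms * rt_ratio) >> SCHED_RT_FRAC_SHIFT;

    printf("ratio = %.4f, budget = %lu ms of every %lu ms\n",
           (double)rt_ratio / SCHED_RT_FRAC, budget_ms, period_ms);
    return 0;
}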
669/*
480 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
481 * clock constructed from sched_clock(): 671 * clock constructed from sched_clock():
482 */ 672 */
@@ -488,7 +678,12 @@ unsigned long long cpu_clock(int cpu)
488 678
489 local_irq_save(flags); 679 local_irq_save(flags);
490 rq = cpu_rq(cpu); 680 rq = cpu_rq(cpu);
491 update_rq_clock(rq); 681 /*
682 * Only call sched_clock() if the scheduler has already been
683 * initialized (some code might call cpu_clock() very early):
684 */
685 if (rq->idle)
686 update_rq_clock(rq);
492 now = rq->clock; 687 now = rq->clock;
493 local_irq_restore(flags); 688 local_irq_restore(flags);
494 689
@@ -503,10 +698,15 @@ EXPORT_SYMBOL_GPL(cpu_clock);
503# define finish_arch_switch(prev) do { } while (0) 698# define finish_arch_switch(prev) do { } while (0)
504#endif 699#endif
505 700
701static inline int task_current(struct rq *rq, struct task_struct *p)
702{
703 return rq->curr == p;
704}
705
506#ifndef __ARCH_WANT_UNLOCKED_CTXSW 706#ifndef __ARCH_WANT_UNLOCKED_CTXSW
507static inline int task_running(struct rq *rq, struct task_struct *p) 707static inline int task_running(struct rq *rq, struct task_struct *p)
508{ 708{
509 return rq->curr == p; 709 return task_current(rq, p);
510} 710}
511 711
512static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 712static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
@@ -535,7 +735,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
535#ifdef CONFIG_SMP 735#ifdef CONFIG_SMP
536 return p->oncpu; 736 return p->oncpu;
537#else 737#else
538 return rq->curr == p; 738 return task_current(rq, p);
539#endif 739#endif
540} 740}
541 741
@@ -669,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
669 rq->prev_clock_raw = now; 869 rq->prev_clock_raw = now;
670 rq->clock += delta_ns; 870 rq->clock += delta_ns;
671 spin_unlock(&rq->lock); 871 spin_unlock(&rq->lock);
872 touch_softlockup_watchdog();
672} 873}
673EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 874EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
674 875
876static void __resched_task(struct task_struct *p, int tif_bit);
877
878static inline void resched_task(struct task_struct *p)
879{
880 __resched_task(p, TIF_NEED_RESCHED);
881}
882
883#ifdef CONFIG_SCHED_HRTICK
884/*
885 * Use HR-timers to deliver accurate preemption points.
886 *
 887 * It's all a bit involved since we cannot program an hrt while holding the
 888 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
889 * reschedule event.
890 *
891 * When we get rescheduled we reprogram the hrtick_timer outside of the
892 * rq->lock.
893 */
894static inline void resched_hrt(struct task_struct *p)
895{
896 __resched_task(p, TIF_HRTICK_RESCHED);
897}
898
899static inline void resched_rq(struct rq *rq)
900{
901 unsigned long flags;
902
903 spin_lock_irqsave(&rq->lock, flags);
904 resched_task(rq->curr);
905 spin_unlock_irqrestore(&rq->lock, flags);
906}
907
908enum {
 909	HRTICK_SET,		/* re-program hrtick_timer */
910 HRTICK_RESET, /* not a new slice */
911};
912
913/*
914 * Use hrtick when:
915 * - enabled by features
916 * - hrtimer is actually high res
917 */
918static inline int hrtick_enabled(struct rq *rq)
919{
920 if (!sched_feat(HRTICK))
921 return 0;
922 return hrtimer_is_hres_active(&rq->hrtick_timer);
923}
924
925/*
926 * Called to set the hrtick timer state.
927 *
928 * called with rq->lock held and irqs disabled
929 */
930static void hrtick_start(struct rq *rq, u64 delay, int reset)
931{
932 assert_spin_locked(&rq->lock);
933
934 /*
935 * preempt at: now + delay
936 */
937 rq->hrtick_expire =
938 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
939 /*
940 * indicate we need to program the timer
941 */
942 __set_bit(HRTICK_SET, &rq->hrtick_flags);
943 if (reset)
944 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
945
946 /*
947 * New slices are called from the schedule path and don't need a
948 * forced reschedule.
949 */
950 if (reset)
951 resched_hrt(rq->curr);
952}
953
954static void hrtick_clear(struct rq *rq)
955{
956 if (hrtimer_active(&rq->hrtick_timer))
957 hrtimer_cancel(&rq->hrtick_timer);
958}
959
960/*
961 * Update the timer from the possible pending state.
962 */
963static void hrtick_set(struct rq *rq)
964{
965 ktime_t time;
966 int set, reset;
967 unsigned long flags;
968
969 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
970
971 spin_lock_irqsave(&rq->lock, flags);
972 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
973 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
974 time = rq->hrtick_expire;
975 clear_thread_flag(TIF_HRTICK_RESCHED);
976 spin_unlock_irqrestore(&rq->lock, flags);
977
978 if (set) {
979 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
980 if (reset && !hrtimer_active(&rq->hrtick_timer))
981 resched_rq(rq);
982 } else
983 hrtick_clear(rq);
984}
985
986/*
987 * High-resolution timer tick.
988 * Runs from hardirq context with interrupts disabled.
989 */
990static enum hrtimer_restart hrtick(struct hrtimer *timer)
991{
992 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
993
994 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
995
996 spin_lock(&rq->lock);
997 __update_rq_clock(rq);
998 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
999 spin_unlock(&rq->lock);
1000
1001 return HRTIMER_NORESTART;
1002}
1003
1004static inline void init_rq_hrtick(struct rq *rq)
1005{
1006 rq->hrtick_flags = 0;
1007 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1008 rq->hrtick_timer.function = hrtick;
1009 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1010}
1011
1012void hrtick_resched(void)
1013{
1014 struct rq *rq;
1015 unsigned long flags;
1016
1017 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1018 return;
1019
1020 local_irq_save(flags);
1021 rq = cpu_rq(smp_processor_id());
1022 hrtick_set(rq);
1023 local_irq_restore(flags);
1024}
1025#else
1026static inline void hrtick_clear(struct rq *rq)
1027{
1028}
1029
1030static inline void hrtick_set(struct rq *rq)
1031{
1032}
1033
1034static inline void init_rq_hrtick(struct rq *rq)
1035{
1036}
1037
1038void hrtick_resched(void)
1039{
1040}
1041#endif
1042
675/* 1043/*
676 * resched_task - mark a task 'to be rescheduled now'. 1044 * resched_task - mark a task 'to be rescheduled now'.
677 * 1045 *
@@ -685,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
685#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1053#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
686#endif 1054#endif
687 1055
688static void resched_task(struct task_struct *p) 1056static void __resched_task(struct task_struct *p, int tif_bit)
689{ 1057{
690 int cpu; 1058 int cpu;
691 1059
692 assert_spin_locked(&task_rq(p)->lock); 1060 assert_spin_locked(&task_rq(p)->lock);
693 1061
694 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1062 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
695 return; 1063 return;
696 1064
697 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1065 set_tsk_thread_flag(p, tif_bit);
698 1066
699 cpu = task_cpu(p); 1067 cpu = task_cpu(p);
700 if (cpu == smp_processor_id()) 1068 if (cpu == smp_processor_id())
@@ -717,10 +1085,10 @@ static void resched_cpu(int cpu)
717 spin_unlock_irqrestore(&rq->lock, flags); 1085 spin_unlock_irqrestore(&rq->lock, flags);
718} 1086}
719#else 1087#else
720static inline void resched_task(struct task_struct *p) 1088static void __resched_task(struct task_struct *p, int tif_bit)
721{ 1089{
722 assert_spin_locked(&task_rq(p)->lock); 1090 assert_spin_locked(&task_rq(p)->lock);
723 set_tsk_need_resched(p); 1091 set_tsk_thread_flag(p, tif_bit);
724} 1092}
725#endif 1093#endif
726 1094
@@ -860,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
860static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1228static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
861#endif 1229#endif
862 1230
1231static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1232{
1233 update_load_add(&rq->load, load);
1234}
1235
1236static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1237{
1238 update_load_sub(&rq->load, load);
1239}
1240
1241#ifdef CONFIG_SMP
1242static unsigned long source_load(int cpu, int type);
1243static unsigned long target_load(int cpu, int type);
1244static unsigned long cpu_avg_load_per_task(int cpu);
1245static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1246#endif /* CONFIG_SMP */
1247
863#include "sched_stats.h" 1248#include "sched_stats.h"
864#include "sched_idletask.c" 1249#include "sched_idletask.c"
865#include "sched_fair.c" 1250#include "sched_fair.c"
@@ -870,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
870 1255
871#define sched_class_highest (&rt_sched_class) 1256#define sched_class_highest (&rt_sched_class)
872 1257
873/* 1258static void inc_nr_running(struct rq *rq)
874 * Update delta_exec, delta_fair fields for rq.
875 *
876 * delta_fair clock advances at a rate inversely proportional to
877 * total load (rq->load.weight) on the runqueue, while
878 * delta_exec advances at the same rate as wall-clock (provided
879 * cpu is not idle).
880 *
881 * delta_exec / delta_fair is a measure of the (smoothened) load on this
882 * runqueue over any given interval. This (smoothened) load is used
883 * during load balance.
884 *
885 * This function is called /before/ updating rq->load
886 * and when switching tasks.
887 */
888static inline void inc_load(struct rq *rq, const struct task_struct *p)
889{
890 update_load_add(&rq->load, p->se.load.weight);
891}
892
893static inline void dec_load(struct rq *rq, const struct task_struct *p)
894{
895 update_load_sub(&rq->load, p->se.load.weight);
896}
897
898static void inc_nr_running(struct task_struct *p, struct rq *rq)
899{ 1259{
900 rq->nr_running++; 1260 rq->nr_running++;
901 inc_load(rq, p);
902} 1261}
903 1262
904static void dec_nr_running(struct task_struct *p, struct rq *rq) 1263static void dec_nr_running(struct rq *rq)
905{ 1264{
906 rq->nr_running--; 1265 rq->nr_running--;
907 dec_load(rq, p);
908} 1266}
909 1267
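inc_nr_running()/dec_nr_running() no longer take the task: the weighted-load aggregate is adjusted explicitly through inc_cpu_load()/dec_cpu_load() wherever the per-class enqueue/dequeue paths know the weight. A toy sketch of keeping such an aggregate in step with the queue length (struct runqueue here is a stand-in, not the kernel's struct rq):

#include <stdio.h>

struct load_weight { unsigned long weight; };
struct runqueue {
    unsigned long      nr_running;
    struct load_weight load;        /* sum of the weights of queued tasks */
};

/* Mirrors inc_cpu_load()/dec_cpu_load(): the aggregate is adjusted by an
 * explicit weight, not by dereferencing a task pointer. */
static void inc_cpu_load(struct runqueue *rq, unsigned long w) { rq->load.weight += w; }
static void dec_cpu_load(struct runqueue *rq, unsigned long w) { rq->load.weight -= w; }

static void enqueue(struct runqueue *rq, unsigned long weight)
{
    inc_cpu_load(rq, weight);
    rq->nr_running++;
}

static void dequeue(struct runqueue *rq, unsigned long weight)
{
    dec_cpu_load(rq, weight);
    rq->nr_running--;
}

int main(void)
{
    struct runqueue rq = { 0, { 0 } };

    enqueue(&rq, 1024);   /* NICE_0_LOAD-sized task */
    enqueue(&rq, 335);
    dequeue(&rq, 1024);
    printf("nr_running=%lu weight=%lu\n", rq.nr_running, rq.load.weight);
    return 0;
}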
910static void set_load_weight(struct task_struct *p) 1268static void set_load_weight(struct task_struct *p)
@@ -996,7 +1354,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
996 rq->nr_uninterruptible--; 1354 rq->nr_uninterruptible--;
997 1355
998 enqueue_task(rq, p, wakeup); 1356 enqueue_task(rq, p, wakeup);
999 inc_nr_running(p, rq); 1357 inc_nr_running(rq);
1000} 1358}
1001 1359
1002/* 1360/*
@@ -1008,7 +1366,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1008 rq->nr_uninterruptible++; 1366 rq->nr_uninterruptible++;
1009 1367
1010 dequeue_task(rq, p, sleep); 1368 dequeue_task(rq, p, sleep);
1011 dec_nr_running(p, rq); 1369 dec_nr_running(rq);
1012} 1370}
1013 1371
1014/** 1372/**
@@ -1028,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu)
1028 1386
1029static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1387static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1030{ 1388{
1031 set_task_cfs_rq(p, cpu); 1389 set_task_rq(p, cpu);
1032#ifdef CONFIG_SMP 1390#ifdef CONFIG_SMP
1033 /* 1391 /*
1034 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1392 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -1040,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1040#endif 1398#endif
1041} 1399}
1042 1400
1401static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1402 const struct sched_class *prev_class,
1403 int oldprio, int running)
1404{
1405 if (prev_class != p->sched_class) {
1406 if (prev_class->switched_from)
1407 prev_class->switched_from(rq, p, running);
1408 p->sched_class->switched_to(rq, p, running);
1409 } else
1410 p->sched_class->prio_changed(rq, p, oldprio, running);
1411}
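check_class_changed() picks the right notification: switched_from/switched_to when the task moved between scheduling classes, prio_changed when only the priority moved. A compact sketch of the same function-pointer dispatch (struct sched_class_ops and the rt_* callbacks are invented for the example):

#include <stdio.h>

struct task;          /* opaque in this sketch */

struct sched_class_ops {
    void (*switched_from)(struct task *p, int running);
    void (*switched_to)(struct task *p, int running);
    void (*prio_changed)(struct task *p, int oldprio, int running);
};

/* Class change => from/to hooks; same class => prio_changed hook. */
static void class_changed(const struct sched_class_ops *prev,
                          const struct sched_class_ops *cur,
                          struct task *p, int oldprio, int running)
{
    if (prev != cur) {
        if (prev->switched_from)
            prev->switched_from(p, running);
        cur->switched_to(p, running);
    } else {
        cur->prio_changed(p, oldprio, running);
    }
}

static void rt_switched_to(struct task *p, int running)
{ (void)p; printf("now an RT task (running=%d)\n", running); }

static void rt_prio_changed(struct task *p, int oldprio, int running)
{ (void)p; printf("RT prio changed from %d (running=%d)\n", oldprio, running); }

int main(void)
{
    struct sched_class_ops fair = { 0, 0, 0 };
    struct sched_class_ops rt   = { 0, rt_switched_to, rt_prio_changed };

    class_changed(&fair, &rt, 0, 120, 1);   /* class change            */
    class_changed(&rt, &rt, 0, 90, 1);      /* priority-only change    */
    return 0;
}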
1412
1043#ifdef CONFIG_SMP 1413#ifdef CONFIG_SMP
1044 1414
1045/* 1415/*
1046 * Is this task likely cache-hot: 1416 * Is this task likely cache-hot:
1047 */ 1417 */
1048static inline int 1418static int
1049task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1419task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1050{ 1420{
1051 s64 delta; 1421 s64 delta;
@@ -1270,7 +1640,7 @@ static unsigned long target_load(int cpu, int type)
1270/* 1640/*
1271 * Return the average load per task on the cpu's run queue 1641 * Return the average load per task on the cpu's run queue
1272 */ 1642 */
1273static inline unsigned long cpu_avg_load_per_task(int cpu) 1643static unsigned long cpu_avg_load_per_task(int cpu)
1274{ 1644{
1275 struct rq *rq = cpu_rq(cpu); 1645 struct rq *rq = cpu_rq(cpu);
1276 unsigned long total = weighted_cpuload(cpu); 1646 unsigned long total = weighted_cpuload(cpu);
@@ -1427,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag)
1427 1797
1428#endif /* CONFIG_SMP */ 1798#endif /* CONFIG_SMP */
1429 1799
1430/*
1431 * wake_idle() will wake a task on an idle cpu if task->cpu is
1432 * not idle and an idle cpu is available. The span of cpus to
1433 * search starts with cpus closest then further out as needed,
1434 * so we always favor a closer, idle cpu.
1435 *
1436 * Returns the CPU we should wake onto.
1437 */
1438#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1439static int wake_idle(int cpu, struct task_struct *p)
1440{
1441 cpumask_t tmp;
1442 struct sched_domain *sd;
1443 int i;
1444
1445 /*
1446 * If it is idle, then it is the best cpu to run this task.
1447 *
1448 * This cpu is also the best, if it has more than one task already.
1449 * Siblings must be also busy(in most cases) as they didn't already
1450 * pickup the extra load from this cpu and hence we need not check
1451 * sibling runqueue info. This will avoid the checks and cache miss
1452 * penalities associated with that.
1453 */
1454 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1455 return cpu;
1456
1457 for_each_domain(cpu, sd) {
1458 if (sd->flags & SD_WAKE_IDLE) {
1459 cpus_and(tmp, sd->span, p->cpus_allowed);
1460 for_each_cpu_mask(i, tmp) {
1461 if (idle_cpu(i)) {
1462 if (i != task_cpu(p)) {
1463 schedstat_inc(p,
1464 se.nr_wakeups_idle);
1465 }
1466 return i;
1467 }
1468 }
1469 } else {
1470 break;
1471 }
1472 }
1473 return cpu;
1474}
1475#else
1476static inline int wake_idle(int cpu, struct task_struct *p)
1477{
1478 return cpu;
1479}
1480#endif
1481
1482/*** 1800/***
1483 * try_to_wake_up - wake up a thread 1801 * try_to_wake_up - wake up a thread
1484 * @p: the to-be-woken-up thread 1802 * @p: the to-be-woken-up thread
@@ -1499,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1499 unsigned long flags; 1817 unsigned long flags;
1500 long old_state; 1818 long old_state;
1501 struct rq *rq; 1819 struct rq *rq;
1502#ifdef CONFIG_SMP
1503 struct sched_domain *sd, *this_sd = NULL;
1504 unsigned long load, this_load;
1505 int new_cpu;
1506#endif
1507 1820
1508 rq = task_rq_lock(p, &flags); 1821 rq = task_rq_lock(p, &flags);
1509 old_state = p->state; 1822 old_state = p->state;
@@ -1521,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1521 if (unlikely(task_running(rq, p))) 1834 if (unlikely(task_running(rq, p)))
1522 goto out_activate; 1835 goto out_activate;
1523 1836
1524 new_cpu = cpu; 1837 cpu = p->sched_class->select_task_rq(p, sync);
1525 1838 if (cpu != orig_cpu) {
1526 schedstat_inc(rq, ttwu_count); 1839 set_task_cpu(p, cpu);
1527 if (cpu == this_cpu) {
1528 schedstat_inc(rq, ttwu_local);
1529 goto out_set_cpu;
1530 }
1531
1532 for_each_domain(this_cpu, sd) {
1533 if (cpu_isset(cpu, sd->span)) {
1534 schedstat_inc(sd, ttwu_wake_remote);
1535 this_sd = sd;
1536 break;
1537 }
1538 }
1539
1540 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1541 goto out_set_cpu;
1542
1543 /*
1544 * Check for affine wakeup and passive balancing possibilities.
1545 */
1546 if (this_sd) {
1547 int idx = this_sd->wake_idx;
1548 unsigned int imbalance;
1549
1550 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1551
1552 load = source_load(cpu, idx);
1553 this_load = target_load(this_cpu, idx);
1554
1555 new_cpu = this_cpu; /* Wake to this CPU if we can */
1556
1557 if (this_sd->flags & SD_WAKE_AFFINE) {
1558 unsigned long tl = this_load;
1559 unsigned long tl_per_task;
1560
1561 /*
1562 * Attract cache-cold tasks on sync wakeups:
1563 */
1564 if (sync && !task_hot(p, rq->clock, this_sd))
1565 goto out_set_cpu;
1566
1567 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1568 tl_per_task = cpu_avg_load_per_task(this_cpu);
1569
1570 /*
1571 * If sync wakeup then subtract the (maximum possible)
1572 * effect of the currently running task from the load
1573 * of the current CPU:
1574 */
1575 if (sync)
1576 tl -= current->se.load.weight;
1577
1578 if ((tl <= load &&
1579 tl + target_load(cpu, idx) <= tl_per_task) ||
1580 100*(tl + p->se.load.weight) <= imbalance*load) {
1581 /*
1582 * This domain has SD_WAKE_AFFINE and
1583 * p is cache cold in this domain, and
1584 * there is no bad imbalance.
1585 */
1586 schedstat_inc(this_sd, ttwu_move_affine);
1587 schedstat_inc(p, se.nr_wakeups_affine);
1588 goto out_set_cpu;
1589 }
1590 }
1591
1592 /*
1593 * Start passive balancing when half the imbalance_pct
1594 * limit is reached.
1595 */
1596 if (this_sd->flags & SD_WAKE_BALANCE) {
1597 if (imbalance*this_load <= 100*load) {
1598 schedstat_inc(this_sd, ttwu_move_balance);
1599 schedstat_inc(p, se.nr_wakeups_passive);
1600 goto out_set_cpu;
1601 }
1602 }
1603 }
1604
1605 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1606out_set_cpu:
1607 new_cpu = wake_idle(new_cpu, p);
1608 if (new_cpu != cpu) {
1609 set_task_cpu(p, new_cpu);
1610 task_rq_unlock(rq, &flags); 1840 task_rq_unlock(rq, &flags);
1611 /* might preempt at this point */ 1841 /* might preempt at this point */
1612 rq = task_rq_lock(p, &flags); 1842 rq = task_rq_lock(p, &flags);
@@ -1620,6 +1850,21 @@ out_set_cpu:
1620 cpu = task_cpu(p); 1850 cpu = task_cpu(p);
1621 } 1851 }
1622 1852
1853#ifdef CONFIG_SCHEDSTATS
1854 schedstat_inc(rq, ttwu_count);
1855 if (cpu == this_cpu)
1856 schedstat_inc(rq, ttwu_local);
1857 else {
1858 struct sched_domain *sd;
1859 for_each_domain(this_cpu, sd) {
1860 if (cpu_isset(cpu, sd->span)) {
1861 schedstat_inc(sd, ttwu_wake_remote);
1862 break;
1863 }
1864 }
1865 }
1866#endif
1867
1623out_activate: 1868out_activate:
1624#endif /* CONFIG_SMP */ 1869#endif /* CONFIG_SMP */
1625 schedstat_inc(p, se.nr_wakeups); 1870 schedstat_inc(p, se.nr_wakeups);
@@ -1638,6 +1883,10 @@ out_activate:
1638 1883
1639out_running: 1884out_running:
1640 p->state = TASK_RUNNING; 1885 p->state = TASK_RUNNING;
1886#ifdef CONFIG_SMP
1887 if (p->sched_class->task_wake_up)
1888 p->sched_class->task_wake_up(rq, p);
1889#endif
1641out: 1890out:
1642 task_rq_unlock(rq, &flags); 1891 task_rq_unlock(rq, &flags);
1643 1892
@@ -1679,7 +1928,7 @@ static void __sched_fork(struct task_struct *p)
1679 p->se.wait_max = 0; 1928 p->se.wait_max = 0;
1680#endif 1929#endif
1681 1930
1682 INIT_LIST_HEAD(&p->run_list); 1931 INIT_LIST_HEAD(&p->rt.run_list);
1683 p->se.on_rq = 0; 1932 p->se.on_rq = 0;
1684 1933
1685#ifdef CONFIG_PREEMPT_NOTIFIERS 1934#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1756,9 +2005,13 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1756 * management (if any): 2005 * management (if any):
1757 */ 2006 */
1758 p->sched_class->task_new(rq, p); 2007 p->sched_class->task_new(rq, p);
1759 inc_nr_running(p, rq); 2008 inc_nr_running(rq);
1760 } 2009 }
1761 check_preempt_curr(rq, p); 2010 check_preempt_curr(rq, p);
2011#ifdef CONFIG_SMP
2012 if (p->sched_class->task_wake_up)
2013 p->sched_class->task_wake_up(rq, p);
2014#endif
1762 task_rq_unlock(rq, &flags); 2015 task_rq_unlock(rq, &flags);
1763} 2016}
1764 2017
@@ -1879,6 +2132,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1879 prev_state = prev->state; 2132 prev_state = prev->state;
1880 finish_arch_switch(prev); 2133 finish_arch_switch(prev);
1881 finish_lock_switch(rq, prev); 2134 finish_lock_switch(rq, prev);
2135#ifdef CONFIG_SMP
2136 if (current->sched_class->post_schedule)
2137 current->sched_class->post_schedule(rq);
2138#endif
2139
1882 fire_sched_in_preempt_notifiers(current); 2140 fire_sched_in_preempt_notifiers(current);
1883 if (mm) 2141 if (mm)
1884 mmdrop(mm); 2142 mmdrop(mm);
@@ -2112,11 +2370,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2112/* 2370/*
2113 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2371 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2114 */ 2372 */
2115static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2373static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2116 __releases(this_rq->lock) 2374 __releases(this_rq->lock)
2117 __acquires(busiest->lock) 2375 __acquires(busiest->lock)
2118 __acquires(this_rq->lock) 2376 __acquires(this_rq->lock)
2119{ 2377{
2378 int ret = 0;
2379
2120 if (unlikely(!irqs_disabled())) { 2380 if (unlikely(!irqs_disabled())) {
2121 /* printk() doesn't work good under rq->lock */ 2381 /* printk() doesn't work good under rq->lock */
2122 spin_unlock(&this_rq->lock); 2382 spin_unlock(&this_rq->lock);
@@ -2127,9 +2387,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2127 spin_unlock(&this_rq->lock); 2387 spin_unlock(&this_rq->lock);
2128 spin_lock(&busiest->lock); 2388 spin_lock(&busiest->lock);
2129 spin_lock(&this_rq->lock); 2389 spin_lock(&this_rq->lock);
2390 ret = 1;
2130 } else 2391 } else
2131 spin_lock(&busiest->lock); 2392 spin_lock(&busiest->lock);
2132 } 2393 }
2394 return ret;
2133} 2395}
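double_lock_balance() now tells its caller whether this_rq->lock had to be dropped to respect the lock-ordering rule (lower-address lock first), so the caller can revalidate anything it read beforehand. A pthread-based sketch of the same idea, with trylock standing in for spin_trylock (not the kernel's locking primitives):

#include <pthread.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; int nr_running; };

/* Lock 'busiest' while already holding 'this_rq'.  Locks are ordered by
 * address; returns 1 if this_rq->lock had to be dropped and retaken. */
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
    int dropped = 0;

    if (pthread_mutex_trylock(&busiest->lock) != 0) {
        if (busiest < this_rq) {
            /* Wrong order: back off and take them low-address first. */
            pthread_mutex_unlock(&this_rq->lock);
            pthread_mutex_lock(&busiest->lock);
            pthread_mutex_lock(&this_rq->lock);
            dropped = 1;
        } else {
            pthread_mutex_lock(&busiest->lock);
        }
    }
    return dropped;
}

int main(void)
{
    struct rq a = { PTHREAD_MUTEX_INITIALIZER, 0 };
    struct rq b = { PTHREAD_MUTEX_INITIALIZER, 0 };

    pthread_mutex_lock(&a.lock);
    printf("dropped=%d\n", double_lock_balance(&a, &b));
    pthread_mutex_unlock(&b.lock);
    pthread_mutex_unlock(&a.lock);
    return 0;
}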
2134 2396
2135/* 2397/*
@@ -3328,7 +3590,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3328 3590
3329 rq = task_rq_lock(p, &flags); 3591 rq = task_rq_lock(p, &flags);
3330 ns = p->se.sum_exec_runtime; 3592 ns = p->se.sum_exec_runtime;
3331 if (rq->curr == p) { 3593 if (task_current(rq, p)) {
3332 update_rq_clock(rq); 3594 update_rq_clock(rq);
3333 delta_exec = rq->clock - p->se.exec_start; 3595 delta_exec = rq->clock - p->se.exec_start;
3334 if ((s64)delta_exec > 0) 3596 if ((s64)delta_exec > 0)
@@ -3473,12 +3735,14 @@ void scheduler_tick(void)
3473 /* 3735 /*
3474 * Let rq->clock advance by at least TICK_NSEC: 3736 * Let rq->clock advance by at least TICK_NSEC:
3475 */ 3737 */
3476 if (unlikely(rq->clock < next_tick)) 3738 if (unlikely(rq->clock < next_tick)) {
3477 rq->clock = next_tick; 3739 rq->clock = next_tick;
3740 rq->clock_underflows++;
3741 }
3478 rq->tick_timestamp = rq->clock; 3742 rq->tick_timestamp = rq->clock;
3479 update_cpu_load(rq); 3743 update_cpu_load(rq);
3480 if (curr != rq->idle) /* FIXME: needed? */ 3744 curr->sched_class->task_tick(rq, curr, 0);
3481 curr->sched_class->task_tick(rq, curr); 3745 update_sched_rt_period(rq);
3482 spin_unlock(&rq->lock); 3746 spin_unlock(&rq->lock);
3483 3747
3484#ifdef CONFIG_SMP 3748#ifdef CONFIG_SMP
@@ -3624,6 +3888,8 @@ need_resched_nonpreemptible:
3624 3888
3625 schedule_debug(prev); 3889 schedule_debug(prev);
3626 3890
3891 hrtick_clear(rq);
3892
3627 /* 3893 /*
3628 * Do the rq-clock update outside the rq lock: 3894 * Do the rq-clock update outside the rq lock:
3629 */ 3895 */
@@ -3642,6 +3908,11 @@ need_resched_nonpreemptible:
3642 switch_count = &prev->nvcsw; 3908 switch_count = &prev->nvcsw;
3643 } 3909 }
3644 3910
3911#ifdef CONFIG_SMP
3912 if (prev->sched_class->pre_schedule)
3913 prev->sched_class->pre_schedule(rq, prev);
3914#endif
3915
3645 if (unlikely(!rq->nr_running)) 3916 if (unlikely(!rq->nr_running))
3646 idle_balance(cpu, rq); 3917 idle_balance(cpu, rq);
3647 3918
@@ -3656,14 +3927,20 @@ need_resched_nonpreemptible:
3656 ++*switch_count; 3927 ++*switch_count;
3657 3928
3658 context_switch(rq, prev, next); /* unlocks the rq */ 3929 context_switch(rq, prev, next); /* unlocks the rq */
3930 /*
3931 * the context switch might have flipped the stack from under
3932 * us, hence refresh the local variables.
3933 */
3934 cpu = smp_processor_id();
3935 rq = cpu_rq(cpu);
3659 } else 3936 } else
3660 spin_unlock_irq(&rq->lock); 3937 spin_unlock_irq(&rq->lock);
3661 3938
3662 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3939 hrtick_set(rq);
3663 cpu = smp_processor_id(); 3940
3664 rq = cpu_rq(cpu); 3941 if (unlikely(reacquire_kernel_lock(current) < 0))
3665 goto need_resched_nonpreemptible; 3942 goto need_resched_nonpreemptible;
3666 } 3943
3667 preempt_enable_no_resched(); 3944 preempt_enable_no_resched();
3668 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3945 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3669 goto need_resched; 3946 goto need_resched;
@@ -3679,10 +3956,9 @@ EXPORT_SYMBOL(schedule);
3679asmlinkage void __sched preempt_schedule(void) 3956asmlinkage void __sched preempt_schedule(void)
3680{ 3957{
3681 struct thread_info *ti = current_thread_info(); 3958 struct thread_info *ti = current_thread_info();
3682#ifdef CONFIG_PREEMPT_BKL
3683 struct task_struct *task = current; 3959 struct task_struct *task = current;
3684 int saved_lock_depth; 3960 int saved_lock_depth;
3685#endif 3961
3686 /* 3962 /*
3687 * If there is a non-zero preempt_count or interrupts are disabled, 3963 * If there is a non-zero preempt_count or interrupts are disabled,
3688 * we do not want to preempt the current task. Just return.. 3964 * we do not want to preempt the current task. Just return..
@@ -3698,14 +3974,10 @@ asmlinkage void __sched preempt_schedule(void)
3698 * clear ->lock_depth so that schedule() doesnt 3974 * clear ->lock_depth so that schedule() doesnt
3699 * auto-release the semaphore: 3975 * auto-release the semaphore:
3700 */ 3976 */
3701#ifdef CONFIG_PREEMPT_BKL
3702 saved_lock_depth = task->lock_depth; 3977 saved_lock_depth = task->lock_depth;
3703 task->lock_depth = -1; 3978 task->lock_depth = -1;
3704#endif
3705 schedule(); 3979 schedule();
3706#ifdef CONFIG_PREEMPT_BKL
3707 task->lock_depth = saved_lock_depth; 3980 task->lock_depth = saved_lock_depth;
3708#endif
3709 sub_preempt_count(PREEMPT_ACTIVE); 3981 sub_preempt_count(PREEMPT_ACTIVE);
3710 3982
3711 /* 3983 /*
@@ -3726,10 +3998,9 @@ EXPORT_SYMBOL(preempt_schedule);
3726asmlinkage void __sched preempt_schedule_irq(void) 3998asmlinkage void __sched preempt_schedule_irq(void)
3727{ 3999{
3728 struct thread_info *ti = current_thread_info(); 4000 struct thread_info *ti = current_thread_info();
3729#ifdef CONFIG_PREEMPT_BKL
3730 struct task_struct *task = current; 4001 struct task_struct *task = current;
3731 int saved_lock_depth; 4002 int saved_lock_depth;
3732#endif 4003
3733 /* Catch callers which need to be fixed */ 4004 /* Catch callers which need to be fixed */
3734 BUG_ON(ti->preempt_count || !irqs_disabled()); 4005 BUG_ON(ti->preempt_count || !irqs_disabled());
3735 4006
@@ -3741,16 +4012,12 @@ asmlinkage void __sched preempt_schedule_irq(void)
3741 * clear ->lock_depth so that schedule() doesnt 4012 * clear ->lock_depth so that schedule() doesnt
3742 * auto-release the semaphore: 4013 * auto-release the semaphore:
3743 */ 4014 */
3744#ifdef CONFIG_PREEMPT_BKL
3745 saved_lock_depth = task->lock_depth; 4015 saved_lock_depth = task->lock_depth;
3746 task->lock_depth = -1; 4016 task->lock_depth = -1;
3747#endif
3748 local_irq_enable(); 4017 local_irq_enable();
3749 schedule(); 4018 schedule();
3750 local_irq_disable(); 4019 local_irq_disable();
3751#ifdef CONFIG_PREEMPT_BKL
3752 task->lock_depth = saved_lock_depth; 4020 task->lock_depth = saved_lock_depth;
3753#endif
3754 sub_preempt_count(PREEMPT_ACTIVE); 4021 sub_preempt_count(PREEMPT_ACTIVE);
3755 4022
3756 /* 4023 /*
@@ -4016,6 +4283,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4016 unsigned long flags; 4283 unsigned long flags;
4017 int oldprio, on_rq, running; 4284 int oldprio, on_rq, running;
4018 struct rq *rq; 4285 struct rq *rq;
4286 const struct sched_class *prev_class = p->sched_class;
4019 4287
4020 BUG_ON(prio < 0 || prio > MAX_PRIO); 4288 BUG_ON(prio < 0 || prio > MAX_PRIO);
4021 4289
@@ -4024,7 +4292,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4024 4292
4025 oldprio = p->prio; 4293 oldprio = p->prio;
4026 on_rq = p->se.on_rq; 4294 on_rq = p->se.on_rq;
4027 running = task_running(rq, p); 4295 running = task_current(rq, p);
4028 if (on_rq) { 4296 if (on_rq) {
4029 dequeue_task(rq, p, 0); 4297 dequeue_task(rq, p, 0);
4030 if (running) 4298 if (running)
@@ -4041,18 +4309,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4041 if (on_rq) { 4309 if (on_rq) {
4042 if (running) 4310 if (running)
4043 p->sched_class->set_curr_task(rq); 4311 p->sched_class->set_curr_task(rq);
4312
4044 enqueue_task(rq, p, 0); 4313 enqueue_task(rq, p, 0);
4045 /* 4314
4046 * Reschedule if we are currently running on this runqueue and 4315 check_class_changed(rq, p, prev_class, oldprio, running);
4047 * our priority decreased, or if we are not currently running on
4048 * this runqueue and our priority is higher than the current's
4049 */
4050 if (running) {
4051 if (p->prio > oldprio)
4052 resched_task(rq->curr);
4053 } else {
4054 check_preempt_curr(rq, p);
4055 }
4056 } 4316 }
4057 task_rq_unlock(rq, &flags); 4317 task_rq_unlock(rq, &flags);
4058} 4318}
@@ -4084,10 +4344,8 @@ void set_user_nice(struct task_struct *p, long nice)
4084 goto out_unlock; 4344 goto out_unlock;
4085 } 4345 }
4086 on_rq = p->se.on_rq; 4346 on_rq = p->se.on_rq;
4087 if (on_rq) { 4347 if (on_rq)
4088 dequeue_task(rq, p, 0); 4348 dequeue_task(rq, p, 0);
4089 dec_load(rq, p);
4090 }
4091 4349
4092 p->static_prio = NICE_TO_PRIO(nice); 4350 p->static_prio = NICE_TO_PRIO(nice);
4093 set_load_weight(p); 4351 set_load_weight(p);
@@ -4097,7 +4355,6 @@ void set_user_nice(struct task_struct *p, long nice)
4097 4355
4098 if (on_rq) { 4356 if (on_rq) {
4099 enqueue_task(rq, p, 0); 4357 enqueue_task(rq, p, 0);
4100 inc_load(rq, p);
4101 /* 4358 /*
4102 * If the task increased its priority or is running and 4359 * If the task increased its priority or is running and
4103 * lowered its priority, then reschedule its CPU: 4360 * lowered its priority, then reschedule its CPU:
@@ -4255,6 +4512,7 @@ int sched_setscheduler(struct task_struct *p, int policy,
4255{ 4512{
4256 int retval, oldprio, oldpolicy = -1, on_rq, running; 4513 int retval, oldprio, oldpolicy = -1, on_rq, running;
4257 unsigned long flags; 4514 unsigned long flags;
4515 const struct sched_class *prev_class = p->sched_class;
4258 struct rq *rq; 4516 struct rq *rq;
4259 4517
4260 /* may grab non-irq protected spin_locks */ 4518 /* may grab non-irq protected spin_locks */
@@ -4335,7 +4593,7 @@ recheck:
4335 } 4593 }
4336 update_rq_clock(rq); 4594 update_rq_clock(rq);
4337 on_rq = p->se.on_rq; 4595 on_rq = p->se.on_rq;
4338 running = task_running(rq, p); 4596 running = task_current(rq, p);
4339 if (on_rq) { 4597 if (on_rq) {
4340 deactivate_task(rq, p, 0); 4598 deactivate_task(rq, p, 0);
4341 if (running) 4599 if (running)
@@ -4348,18 +4606,10 @@ recheck:
4348 if (on_rq) { 4606 if (on_rq) {
4349 if (running) 4607 if (running)
4350 p->sched_class->set_curr_task(rq); 4608 p->sched_class->set_curr_task(rq);
4609
4351 activate_task(rq, p, 0); 4610 activate_task(rq, p, 0);
4352 /* 4611
4353 * Reschedule if we are currently running on this runqueue and 4612 check_class_changed(rq, p, prev_class, oldprio, running);
4354 * our priority decreased, or if we are not currently running on
4355 * this runqueue and our priority is higher than the current's
4356 */
4357 if (running) {
4358 if (p->prio > oldprio)
4359 resched_task(rq->curr);
4360 } else {
4361 check_preempt_curr(rq, p);
4362 }
4363 } 4613 }
4364 __task_rq_unlock(rq); 4614 __task_rq_unlock(rq);
4365 spin_unlock_irqrestore(&p->pi_lock, flags); 4615 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4487,13 +4737,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4487 struct task_struct *p; 4737 struct task_struct *p;
4488 int retval; 4738 int retval;
4489 4739
4490 mutex_lock(&sched_hotcpu_mutex); 4740 get_online_cpus();
4491 read_lock(&tasklist_lock); 4741 read_lock(&tasklist_lock);
4492 4742
4493 p = find_process_by_pid(pid); 4743 p = find_process_by_pid(pid);
4494 if (!p) { 4744 if (!p) {
4495 read_unlock(&tasklist_lock); 4745 read_unlock(&tasklist_lock);
4496 mutex_unlock(&sched_hotcpu_mutex); 4746 put_online_cpus();
4497 return -ESRCH; 4747 return -ESRCH;
4498 } 4748 }
4499 4749
@@ -4533,7 +4783,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4533 } 4783 }
4534out_unlock: 4784out_unlock:
4535 put_task_struct(p); 4785 put_task_struct(p);
4536 mutex_unlock(&sched_hotcpu_mutex); 4786 put_online_cpus();
4537 return retval; 4787 return retval;
4538} 4788}
4539 4789
@@ -4590,7 +4840,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4590 struct task_struct *p; 4840 struct task_struct *p;
4591 int retval; 4841 int retval;
4592 4842
4593 mutex_lock(&sched_hotcpu_mutex); 4843 get_online_cpus();
4594 read_lock(&tasklist_lock); 4844 read_lock(&tasklist_lock);
4595 4845
4596 retval = -ESRCH; 4846 retval = -ESRCH;
@@ -4606,7 +4856,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4606 4856
4607out_unlock: 4857out_unlock:
4608 read_unlock(&tasklist_lock); 4858 read_unlock(&tasklist_lock);
4609 mutex_unlock(&sched_hotcpu_mutex); 4859 put_online_cpus();
4610 4860
4611 return retval; 4861 return retval;
4612} 4862}
@@ -4680,7 +4930,8 @@ static void __cond_resched(void)
4680 } while (need_resched()); 4930 } while (need_resched());
4681} 4931}
4682 4932
4683int __sched cond_resched(void) 4933#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
4934int __sched _cond_resched(void)
4684{ 4935{
4685 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4936 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4686 system_state == SYSTEM_RUNNING) { 4937 system_state == SYSTEM_RUNNING) {
@@ -4689,7 +4940,8 @@ int __sched cond_resched(void)
4689 } 4940 }
4690 return 0; 4941 return 0;
4691} 4942}
4692EXPORT_SYMBOL(cond_resched); 4943EXPORT_SYMBOL(_cond_resched);
4944#endif
4693 4945
4694/* 4946/*
4695 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4947 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4701,19 +4953,15 @@ EXPORT_SYMBOL(cond_resched);
4701 */ 4953 */
4702int cond_resched_lock(spinlock_t *lock) 4954int cond_resched_lock(spinlock_t *lock)
4703{ 4955{
4956 int resched = need_resched() && system_state == SYSTEM_RUNNING;
4704 int ret = 0; 4957 int ret = 0;
4705 4958
4706 if (need_lockbreak(lock)) { 4959 if (spin_needbreak(lock) || resched) {
4707 spin_unlock(lock); 4960 spin_unlock(lock);
4708 cpu_relax(); 4961 if (resched && need_resched())
4709 ret = 1; 4962 __cond_resched();
4710 spin_lock(lock); 4963 else
4711 } 4964 cpu_relax();
4712 if (need_resched() && system_state == SYSTEM_RUNNING) {
4713 spin_release(&lock->dep_map, 1, _THIS_IP_);
4714 _raw_spin_unlock(lock);
4715 preempt_enable_no_resched();
4716 __cond_resched();
4717 ret = 1; 4965 ret = 1;
4718 spin_lock(lock); 4966 spin_lock(lock);
4719 } 4967 }
@@ -4887,7 +5135,7 @@ out_unlock:
4887 5135
4888static const char stat_nam[] = "RSDTtZX"; 5136static const char stat_nam[] = "RSDTtZX";
4889 5137
4890static void show_task(struct task_struct *p) 5138void sched_show_task(struct task_struct *p)
4891{ 5139{
4892 unsigned long free = 0; 5140 unsigned long free = 0;
4893 unsigned state; 5141 unsigned state;
@@ -4915,10 +5163,9 @@ static void show_task(struct task_struct *p)
4915 } 5163 }
4916#endif 5164#endif
4917 printk(KERN_CONT "%5lu %5d %6d\n", free, 5165 printk(KERN_CONT "%5lu %5d %6d\n", free,
4918 task_pid_nr(p), task_pid_nr(p->parent)); 5166 task_pid_nr(p), task_pid_nr(p->real_parent));
4919 5167
4920 if (state != TASK_RUNNING) 5168 show_stack(p, NULL);
4921 show_stack(p, NULL);
4922} 5169}
4923 5170
4924void show_state_filter(unsigned long state_filter) 5171void show_state_filter(unsigned long state_filter)
@@ -4940,7 +5187,7 @@ void show_state_filter(unsigned long state_filter)
4940 */ 5187 */
4941 touch_nmi_watchdog(); 5188 touch_nmi_watchdog();
4942 if (!state_filter || (p->state & state_filter)) 5189 if (!state_filter || (p->state & state_filter))
4943 show_task(p); 5190 sched_show_task(p);
4944 } while_each_thread(g, p); 5191 } while_each_thread(g, p);
4945 5192
4946 touch_all_softlockup_watchdogs(); 5193 touch_all_softlockup_watchdogs();
@@ -4989,11 +5236,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4989 spin_unlock_irqrestore(&rq->lock, flags); 5236 spin_unlock_irqrestore(&rq->lock, flags);
4990 5237
4991 /* Set the preempt count _outside_ the spinlocks! */ 5238 /* Set the preempt count _outside_ the spinlocks! */
4992#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4993 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4994#else
4995 task_thread_info(idle)->preempt_count = 0; 5239 task_thread_info(idle)->preempt_count = 0;
4996#endif 5240
4997 /* 5241 /*
4998 * The idle tasks have their own, simple scheduling class: 5242 * The idle tasks have their own, simple scheduling class:
4999 */ 5243 */
@@ -5074,7 +5318,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5074 goto out; 5318 goto out;
5075 } 5319 }
5076 5320
5077 p->cpus_allowed = new_mask; 5321 if (p->sched_class->set_cpus_allowed)
5322 p->sched_class->set_cpus_allowed(p, &new_mask);
5323 else {
5324 p->cpus_allowed = new_mask;
5325 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5326 }
5327
5078 /* Can the task run on the task's current CPU? If so, we're done */ 5328 /* Can the task run on the task's current CPU? If so, we're done */
5079 if (cpu_isset(task_cpu(p), new_mask)) 5329 if (cpu_isset(task_cpu(p), new_mask))
5080 goto out; 5330 goto out;
@@ -5566,9 +5816,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5566 struct rq *rq; 5816 struct rq *rq;
5567 5817
5568 switch (action) { 5818 switch (action) {
5569 case CPU_LOCK_ACQUIRE:
5570 mutex_lock(&sched_hotcpu_mutex);
5571 break;
5572 5819
5573 case CPU_UP_PREPARE: 5820 case CPU_UP_PREPARE:
5574 case CPU_UP_PREPARE_FROZEN: 5821 case CPU_UP_PREPARE_FROZEN:
@@ -5587,6 +5834,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5587 case CPU_ONLINE_FROZEN: 5834 case CPU_ONLINE_FROZEN:
5588 /* Strictly unnecessary, as first user will wake it. */ 5835 /* Strictly unnecessary, as first user will wake it. */
5589 wake_up_process(cpu_rq(cpu)->migration_thread); 5836 wake_up_process(cpu_rq(cpu)->migration_thread);
5837
5838 /* Update our root-domain */
5839 rq = cpu_rq(cpu);
5840 spin_lock_irqsave(&rq->lock, flags);
5841 if (rq->rd) {
5842 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5843 cpu_set(cpu, rq->rd->online);
5844 }
5845 spin_unlock_irqrestore(&rq->lock, flags);
5590 break; 5846 break;
5591 5847
5592#ifdef CONFIG_HOTPLUG_CPU 5848#ifdef CONFIG_HOTPLUG_CPU
@@ -5637,10 +5893,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5637 } 5893 }
5638 spin_unlock_irq(&rq->lock); 5894 spin_unlock_irq(&rq->lock);
5639 break; 5895 break;
5640#endif 5896
5641 case CPU_LOCK_RELEASE: 5897 case CPU_DOWN_PREPARE:
5642 mutex_unlock(&sched_hotcpu_mutex); 5898 /* Update our root-domain */
5899 rq = cpu_rq(cpu);
5900 spin_lock_irqsave(&rq->lock, flags);
5901 if (rq->rd) {
5902 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5903 cpu_clear(cpu, rq->rd->online);
5904 }
5905 spin_unlock_irqrestore(&rq->lock, flags);
5643 break; 5906 break;
5907#endif
5644 } 5908 }
5645 return NOTIFY_OK; 5909 return NOTIFY_OK;
5646} 5910}
@@ -5828,11 +6092,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5828 return 1; 6092 return 1;
5829} 6093}
5830 6094
6095static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6096{
6097 unsigned long flags;
6098 const struct sched_class *class;
6099
6100 spin_lock_irqsave(&rq->lock, flags);
6101
6102 if (rq->rd) {
6103 struct root_domain *old_rd = rq->rd;
6104
6105 for (class = sched_class_highest; class; class = class->next) {
6106 if (class->leave_domain)
6107 class->leave_domain(rq);
6108 }
6109
6110 cpu_clear(rq->cpu, old_rd->span);
6111 cpu_clear(rq->cpu, old_rd->online);
6112
6113 if (atomic_dec_and_test(&old_rd->refcount))
6114 kfree(old_rd);
6115 }
6116
6117 atomic_inc(&rd->refcount);
6118 rq->rd = rd;
6119
6120 cpu_set(rq->cpu, rd->span);
6121 if (cpu_isset(rq->cpu, cpu_online_map))
6122 cpu_set(rq->cpu, rd->online);
6123
6124 for (class = sched_class_highest; class; class = class->next) {
6125 if (class->join_domain)
6126 class->join_domain(rq);
6127 }
6128
6129 spin_unlock_irqrestore(&rq->lock, flags);
6130}
6131
6132static void init_rootdomain(struct root_domain *rd)
6133{
6134 memset(rd, 0, sizeof(*rd));
6135
6136 cpus_clear(rd->span);
6137 cpus_clear(rd->online);
6138}
6139
6140static void init_defrootdomain(void)
6141{
6142 init_rootdomain(&def_root_domain);
6143 atomic_set(&def_root_domain.refcount, 1);
6144}
6145
6146static struct root_domain *alloc_rootdomain(void)
6147{
6148 struct root_domain *rd;
6149
6150 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6151 if (!rd)
6152 return NULL;
6153
6154 init_rootdomain(rd);
6155
6156 return rd;
6157}
6158
5831/* 6159/*
5832 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6160 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5833 * hold the hotplug lock. 6161 * hold the hotplug lock.
5834 */ 6162 */
5835static void cpu_attach_domain(struct sched_domain *sd, int cpu) 6163static void
6164cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5836{ 6165{
5837 struct rq *rq = cpu_rq(cpu); 6166 struct rq *rq = cpu_rq(cpu);
5838 struct sched_domain *tmp; 6167 struct sched_domain *tmp;
@@ -5857,6 +6186,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5857 6186
5858 sched_domain_debug(sd, cpu); 6187 sched_domain_debug(sd, cpu);
5859 6188
6189 rq_attach_root(rq, rd);
5860 rcu_assign_pointer(rq->sd, sd); 6190 rcu_assign_pointer(rq->sd, sd);
5861} 6191}
5862 6192
@@ -6225,6 +6555,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6225static int build_sched_domains(const cpumask_t *cpu_map) 6555static int build_sched_domains(const cpumask_t *cpu_map)
6226{ 6556{
6227 int i; 6557 int i;
6558 struct root_domain *rd;
6228#ifdef CONFIG_NUMA 6559#ifdef CONFIG_NUMA
6229 struct sched_group **sched_group_nodes = NULL; 6560 struct sched_group **sched_group_nodes = NULL;
6230 int sd_allnodes = 0; 6561 int sd_allnodes = 0;
@@ -6241,6 +6572,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6241 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6572 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6242#endif 6573#endif
6243 6574
6575 rd = alloc_rootdomain();
6576 if (!rd) {
6577 printk(KERN_WARNING "Cannot alloc root domain\n");
6578 return -ENOMEM;
6579 }
6580
6244 /* 6581 /*
6245 * Set up domains for cpus specified by the cpu_map. 6582 * Set up domains for cpus specified by the cpu_map.
6246 */ 6583 */
@@ -6457,7 +6794,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6457#else 6794#else
6458 sd = &per_cpu(phys_domains, i); 6795 sd = &per_cpu(phys_domains, i);
6459#endif 6796#endif
6460 cpu_attach_domain(sd, i); 6797 cpu_attach_domain(sd, rd, i);
6461 } 6798 }
6462 6799
6463 return 0; 6800 return 0;
@@ -6515,7 +6852,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6515 unregister_sched_domain_sysctl(); 6852 unregister_sched_domain_sysctl();
6516 6853
6517 for_each_cpu_mask(i, *cpu_map) 6854 for_each_cpu_mask(i, *cpu_map)
6518 cpu_attach_domain(NULL, i); 6855 cpu_attach_domain(NULL, &def_root_domain, i);
6519 synchronize_sched(); 6856 synchronize_sched();
6520 arch_destroy_sched_domains(cpu_map); 6857 arch_destroy_sched_domains(cpu_map);
6521} 6858}
@@ -6545,6 +6882,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6545{ 6882{
6546 int i, j; 6883 int i, j;
6547 6884
6885 lock_doms_cur();
6886
6548 /* always unregister in case we don't destroy any domains */ 6887 /* always unregister in case we don't destroy any domains */
6549 unregister_sched_domain_sysctl(); 6888 unregister_sched_domain_sysctl();
6550 6889
@@ -6585,6 +6924,8 @@ match2:
6585 ndoms_cur = ndoms_new; 6924 ndoms_cur = ndoms_new;
6586 6925
6587 register_sched_domain_sysctl(); 6926 register_sched_domain_sysctl();
6927
6928 unlock_doms_cur();
6588} 6929}
6589 6930
6590#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6931#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -6592,10 +6933,10 @@ static int arch_reinit_sched_domains(void)
6592{ 6933{
6593 int err; 6934 int err;
6594 6935
6595 mutex_lock(&sched_hotcpu_mutex); 6936 get_online_cpus();
6596 detach_destroy_domains(&cpu_online_map); 6937 detach_destroy_domains(&cpu_online_map);
6597 err = arch_init_sched_domains(&cpu_online_map); 6938 err = arch_init_sched_domains(&cpu_online_map);
6598 mutex_unlock(&sched_hotcpu_mutex); 6939 put_online_cpus();
6599 6940
6600 return err; 6941 return err;
6601} 6942}
@@ -6706,12 +7047,12 @@ void __init sched_init_smp(void)
6706{ 7047{
6707 cpumask_t non_isolated_cpus; 7048 cpumask_t non_isolated_cpus;
6708 7049
6709 mutex_lock(&sched_hotcpu_mutex); 7050 get_online_cpus();
6710 arch_init_sched_domains(&cpu_online_map); 7051 arch_init_sched_domains(&cpu_online_map);
6711 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7052 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6712 if (cpus_empty(non_isolated_cpus)) 7053 if (cpus_empty(non_isolated_cpus))
6713 cpu_set(smp_processor_id(), non_isolated_cpus); 7054 cpu_set(smp_processor_id(), non_isolated_cpus);
6714 mutex_unlock(&sched_hotcpu_mutex); 7055 put_online_cpus();
6715 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7056 /* XXX: Theoretical race here - CPU may be hotplugged now */
6716 hotcpu_notifier(update_sched_domains, 0); 7057 hotcpu_notifier(update_sched_domains, 0);
6717 7058
@@ -6719,6 +7060,21 @@ void __init sched_init_smp(void)
6719 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7060 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6720 BUG(); 7061 BUG();
6721 sched_init_granularity(); 7062 sched_init_granularity();
7063
7064#ifdef CONFIG_FAIR_GROUP_SCHED
7065 if (nr_cpu_ids == 1)
7066 return;
7067
7068 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7069 "group_balance");
7070 if (!IS_ERR(lb_monitor_task)) {
7071 lb_monitor_task->flags |= PF_NOFREEZE;
7072 wake_up_process(lb_monitor_task);
7073 } else {
7074 printk(KERN_ERR "Could not create load balance monitor thread"
7075 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
7076 }
7077#endif
6722} 7078}
6723#else 7079#else
6724void __init sched_init_smp(void) 7080void __init sched_init_smp(void)
@@ -6743,13 +7099,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6743 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7099 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6744} 7100}
6745 7101
7102static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7103{
7104 struct rt_prio_array *array;
7105 int i;
7106
7107 array = &rt_rq->active;
7108 for (i = 0; i < MAX_RT_PRIO; i++) {
7109 INIT_LIST_HEAD(array->queue + i);
7110 __clear_bit(i, array->bitmap);
7111 }
7112 /* delimiter for bitsearch: */
7113 __set_bit(MAX_RT_PRIO, array->bitmap);
7114
7115#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
7116 rt_rq->highest_prio = MAX_RT_PRIO;
7117#endif
7118#ifdef CONFIG_SMP
7119 rt_rq->rt_nr_migratory = 0;
7120 rt_rq->overloaded = 0;
7121#endif
7122
7123 rt_rq->rt_time = 0;
7124 rt_rq->rt_throttled = 0;
7125
7126#ifdef CONFIG_FAIR_GROUP_SCHED
7127 rt_rq->rq = rq;
7128#endif
7129}
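The RT prio array pairs one list head per priority with a bitmap; init_rt_rq() sets bit MAX_RT_PRIO as a delimiter so the bit search always terminates. A scaled-down sketch (8 priorities instead of 100, a queue length instead of real list heads, __builtin_ctz as the find-first-set):

#include <stdio.h>

#define NPRIO 8                      /* scaled-down MAX_RT_PRIO for the sketch */

struct prio_array {
    unsigned int bitmap;             /* bit p set => queue[p] is non-empty */
    int          queue_len[NPRIO];   /* stands in for the per-prio list heads */
};

static void init_prio_array(struct prio_array *a)
{
    a->bitmap = 0;
    for (int i = 0; i < NPRIO; i++)
        a->queue_len[i] = 0;
    /* Delimiter for the bit search: guarantees the scan always finds a bit. */
    a->bitmap |= 1u << NPRIO;
}

static void enqueue(struct prio_array *a, int prio)
{
    a->queue_len[prio]++;
    a->bitmap |= 1u << prio;
}

static int highest_prio(const struct prio_array *a)
{
    /* Lowest set bit == numerically highest priority (0 is highest);
     * the sentinel bit means the bitmap is never zero here. */
    return __builtin_ctz(a->bitmap);
}

int main(void)
{
    struct prio_array a;

    init_prio_array(&a);
    printf("empty: %d (== sentinel %d)\n", highest_prio(&a), NPRIO);
    enqueue(&a, 5);
    enqueue(&a, 2);
    printf("best prio: %d\n", highest_prio(&a));   /* 2 */
    return 0;
}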
7130
7131#ifdef CONFIG_FAIR_GROUP_SCHED
7132static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7133 struct cfs_rq *cfs_rq, struct sched_entity *se,
7134 int cpu, int add)
7135{
7136 tg->cfs_rq[cpu] = cfs_rq;
7137 init_cfs_rq(cfs_rq, rq);
7138 cfs_rq->tg = tg;
7139 if (add)
7140 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7141
7142 tg->se[cpu] = se;
7143 se->cfs_rq = &rq->cfs;
7144 se->my_q = cfs_rq;
7145 se->load.weight = tg->shares;
7146 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7147 se->parent = NULL;
7148}
7149
7150static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7151 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7152 int cpu, int add)
7153{
7154 tg->rt_rq[cpu] = rt_rq;
7155 init_rt_rq(rt_rq, rq);
7156 rt_rq->tg = tg;
7157 rt_rq->rt_se = rt_se;
7158 if (add)
7159 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7160
7161 tg->rt_se[cpu] = rt_se;
7162 rt_se->rt_rq = &rq->rt;
7163 rt_se->my_q = rt_rq;
7164 rt_se->parent = NULL;
7165 INIT_LIST_HEAD(&rt_se->run_list);
7166}
7167#endif
7168
6746void __init sched_init(void) 7169void __init sched_init(void)
6747{ 7170{
6748 int highest_cpu = 0; 7171 int highest_cpu = 0;
6749 int i, j; 7172 int i, j;
6750 7173
7174#ifdef CONFIG_SMP
7175 init_defrootdomain();
7176#endif
7177
7178#ifdef CONFIG_FAIR_GROUP_SCHED
7179 list_add(&init_task_group.list, &task_groups);
7180#endif
7181
6751 for_each_possible_cpu(i) { 7182 for_each_possible_cpu(i) {
6752 struct rt_prio_array *array;
6753 struct rq *rq; 7183 struct rq *rq;
6754 7184
6755 rq = cpu_rq(i); 7185 rq = cpu_rq(i);
@@ -6758,52 +7188,39 @@ void __init sched_init(void)
6758 rq->nr_running = 0; 7188 rq->nr_running = 0;
6759 rq->clock = 1; 7189 rq->clock = 1;
6760 init_cfs_rq(&rq->cfs, rq); 7190 init_cfs_rq(&rq->cfs, rq);
7191 init_rt_rq(&rq->rt, rq);
6761#ifdef CONFIG_FAIR_GROUP_SCHED 7192#ifdef CONFIG_FAIR_GROUP_SCHED
6762 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6763 {
6764 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6765 struct sched_entity *se =
6766 &per_cpu(init_sched_entity, i);
6767
6768 init_cfs_rq_p[i] = cfs_rq;
6769 init_cfs_rq(cfs_rq, rq);
6770 cfs_rq->tg = &init_task_group;
6771 list_add(&cfs_rq->leaf_cfs_rq_list,
6772 &rq->leaf_cfs_rq_list);
6773
6774 init_sched_entity_p[i] = se;
6775 se->cfs_rq = &rq->cfs;
6776 se->my_q = cfs_rq;
6777 se->load.weight = init_task_group_load;
6778 se->load.inv_weight =
6779 div64_64(1ULL<<32, init_task_group_load);
6780 se->parent = NULL;
6781 }
6782 init_task_group.shares = init_task_group_load; 7193 init_task_group.shares = init_task_group_load;
6783 spin_lock_init(&init_task_group.lock); 7194 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7195 init_tg_cfs_entry(rq, &init_task_group,
7196 &per_cpu(init_cfs_rq, i),
7197 &per_cpu(init_sched_entity, i), i, 1);
7198
7199 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7200 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7201 init_tg_rt_entry(rq, &init_task_group,
7202 &per_cpu(init_rt_rq, i),
7203 &per_cpu(init_sched_rt_entity, i), i, 1);
6784#endif 7204#endif
7205 rq->rt_period_expire = 0;
7206 rq->rt_throttled = 0;
6785 7207
6786 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7208 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6787 rq->cpu_load[j] = 0; 7209 rq->cpu_load[j] = 0;
6788#ifdef CONFIG_SMP 7210#ifdef CONFIG_SMP
6789 rq->sd = NULL; 7211 rq->sd = NULL;
7212 rq->rd = NULL;
6790 rq->active_balance = 0; 7213 rq->active_balance = 0;
6791 rq->next_balance = jiffies; 7214 rq->next_balance = jiffies;
6792 rq->push_cpu = 0; 7215 rq->push_cpu = 0;
6793 rq->cpu = i; 7216 rq->cpu = i;
6794 rq->migration_thread = NULL; 7217 rq->migration_thread = NULL;
6795 INIT_LIST_HEAD(&rq->migration_queue); 7218 INIT_LIST_HEAD(&rq->migration_queue);
7219 rq_attach_root(rq, &def_root_domain);
6796#endif 7220#endif
7221 init_rq_hrtick(rq);
6797 atomic_set(&rq->nr_iowait, 0); 7222 atomic_set(&rq->nr_iowait, 0);
6798
6799 array = &rq->rt.active;
6800 for (j = 0; j < MAX_RT_PRIO; j++) {
6801 INIT_LIST_HEAD(array->queue + j);
6802 __clear_bit(j, array->bitmap);
6803 }
6804 highest_cpu = i; 7223 highest_cpu = i;
6805 /* delimiter for bitsearch: */
6806 __set_bit(MAX_RT_PRIO, array->bitmap);
6807 } 7224 }
6808 7225
6809 set_load_weight(&init_task); 7226 set_load_weight(&init_task);
@@ -6972,12 +7389,187 @@ void set_curr_task(int cpu, struct task_struct *p)
6972 7389
6973#ifdef CONFIG_FAIR_GROUP_SCHED 7390#ifdef CONFIG_FAIR_GROUP_SCHED
6974 7391
7392#ifdef CONFIG_SMP
7393/*
7394 * distribute shares of all task groups among their schedulable entities,
7395 * to reflect load distribution across cpus.
7396 */
7397static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7398{
7399 struct cfs_rq *cfs_rq;
7400 struct rq *rq = cpu_rq(this_cpu);
7401 cpumask_t sdspan = sd->span;
7402 int balanced = 1;
7403
7404 /* Walk thr' all the task groups that we have */
7405 for_each_leaf_cfs_rq(rq, cfs_rq) {
7406 int i;
7407 unsigned long total_load = 0, total_shares;
7408 struct task_group *tg = cfs_rq->tg;
7409
7410 /* Gather total task load of this group across cpus */
7411 for_each_cpu_mask(i, sdspan)
7412 total_load += tg->cfs_rq[i]->load.weight;
7413
7414 /* Nothing to do if this group has no load */
7415 if (!total_load)
7416 continue;
7417
7418 /*
7419 * tg->shares represents the number of cpu shares the task group
7420 * is eligible to hold on a single cpu. On N cpus, it is
7421 * eligible to hold (N * tg->shares) number of cpu shares.
7422 */
7423 total_shares = tg->shares * cpus_weight(sdspan);
7424
7425 /*
7426 * redistribute total_shares across cpus as per the task load
7427 * distribution.
7428 */
7429 for_each_cpu_mask(i, sdspan) {
7430 unsigned long local_load, local_shares;
7431
7432 local_load = tg->cfs_rq[i]->load.weight;
7433 local_shares = (local_load * total_shares) / total_load;
7434 if (!local_shares)
7435 local_shares = MIN_GROUP_SHARES;
7436 if (local_shares == tg->se[i]->load.weight)
7437 continue;
7438
7439 spin_lock_irq(&cpu_rq(i)->lock);
7440 set_se_shares(tg->se[i], local_shares);
7441 spin_unlock_irq(&cpu_rq(i)->lock);
7442 balanced = 0;
7443 }
7444 }
7445
7446 return balanced;
7447}
7448
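As a rough illustration of the redistribution performed by rebalance_shares(), with hypothetical numbers not taken from the patch: a group with tg->shares = 1024 spanning a 2-CPU domain is eligible for 2 * 1024 = 2048 shares in total; if its per-CPU task loads are 3072 and 1024, it ends up with 1536 and 512 shares respectively. A minimal userspace sketch of that arithmetic, with MIN_GROUP_SHARES assumed to be the small floor the patch clamps to:

#include <stdio.h>

#define MIN_GROUP_SHARES 2	/* assumed floor, mirroring the patch's clamp */

int main(void)
{
	unsigned long load[2] = { 3072, 1024 };	/* hypothetical per-cpu group load */
	unsigned long shares = 1024;		/* tg->shares */
	unsigned long ncpus = 2;
	unsigned long total_load = 0, total_shares = shares * ncpus;
	unsigned long i;

	for (i = 0; i < ncpus; i++)
		total_load += load[i];

	for (i = 0; i < ncpus; i++) {
		unsigned long local_shares = (load[i] * total_shares) / total_load;

		if (!local_shares)
			local_shares = MIN_GROUP_SHARES;
		printf("cpu%lu: load %lu -> %lu shares\n", i, load[i], local_shares);
	}
	return 0;
}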
7449/*
7450 * How frequently should we rebalance_shares() across cpus?
7451 *
7452 * The more frequently we rebalance shares, the more accurate is the fairness
7453 * of cpu bandwidth distribution between task groups. However higher frequency
7454 * also implies increased scheduling overhead.
7455 *
7456 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7457 * consecutive calls to rebalance_shares() in the same sched domain.
7458 *
7459 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7460 * consecutive calls to rebalance_shares() in the same sched domain.
7461 *
 7462 * These settings allow for the appropriate trade-off between accuracy of
7463 * fairness and the associated overhead.
7464 *
7465 */
7466
7467/* default: 8ms, units: milliseconds */
7468const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7469
7470/* default: 128ms, units: milliseconds */
7471const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7472
7473/* kernel thread that runs rebalance_shares() periodically */
7474static int load_balance_monitor(void *unused)
7475{
7476 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7477 struct sched_param schedparm;
7478 int ret;
7479
7480 /*
7481 * We don't want this thread's execution to be limited by the shares
7482 * assigned to default group (init_task_group). Hence make it run
7483 * as a SCHED_RR RT task at the lowest priority.
7484 */
7485 schedparm.sched_priority = 1;
7486 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7487 if (ret)
7488 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7489 " monitor thread (error = %d) \n", ret);
7490
7491 while (!kthread_should_stop()) {
7492 int i, cpu, balanced = 1;
7493
7494 /* Prevent cpus going down or coming up */
7495 get_online_cpus();
7496 /* lockout changes to doms_cur[] array */
7497 lock_doms_cur();
7498 /*
7499 * Enter a rcu read-side critical section to safely walk rq->sd
7500 * chain on various cpus and to walk task group list
7501 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7502 */
7503 rcu_read_lock();
7504
7505 for (i = 0; i < ndoms_cur; i++) {
7506 cpumask_t cpumap = doms_cur[i];
7507 struct sched_domain *sd = NULL, *sd_prev = NULL;
7508
7509 cpu = first_cpu(cpumap);
7510
7511 /* Find the highest domain at which to balance shares */
7512 for_each_domain(cpu, sd) {
7513 if (!(sd->flags & SD_LOAD_BALANCE))
7514 continue;
7515 sd_prev = sd;
7516 }
7517
7518 sd = sd_prev;
7519 /* sd == NULL? No load balance reqd in this domain */
7520 if (!sd)
7521 continue;
7522
7523 balanced &= rebalance_shares(sd, cpu);
7524 }
7525
7526 rcu_read_unlock();
7527
7528 unlock_doms_cur();
7529 put_online_cpus();
7530
7531 if (!balanced)
7532 timeout = sysctl_sched_min_bal_int_shares;
7533 else if (timeout < sysctl_sched_max_bal_int_shares)
7534 timeout *= 2;
7535
7536 msleep_interruptible(timeout);
7537 }
7538
7539 return 0;
7540}
7541#endif /* CONFIG_SMP */
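The interval handling at the bottom of load_balance_monitor() is a plain exponential backoff between the two sysctls above: reset to the minimum whenever a pass actually had to move shares, double up to the maximum while everything stays balanced. A standalone sketch of that policy using the 8 ms / 128 ms defaults from the patch:

#include <stdio.h>

static unsigned int min_interval = 8;	/* sysctl_sched_min_bal_int_shares */
static unsigned int max_interval = 128;	/* sysctl_sched_max_bal_int_shares */

/* next sleep interval, given whether the last pass found everything balanced */
static unsigned int next_interval(unsigned int timeout, int balanced)
{
	if (!balanced)
		return min_interval;	/* shares moved: poll again soon */
	if (timeout < max_interval)
		timeout *= 2;		/* nothing to do: back off */
	return timeout;
}

int main(void)
{
	int balanced[] = { 1, 1, 1, 0, 1, 1, 1, 1 };	/* hypothetical history */
	unsigned int timeout = min_interval;
	unsigned int i;

	for (i = 0; i < sizeof(balanced) / sizeof(balanced[0]); i++) {
		timeout = next_interval(timeout, balanced[i]);
		printf("pass %u: balanced=%d -> sleep %u ms\n",
		       i, balanced[i], timeout);
	}
	return 0;
}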
7542
7543static void free_sched_group(struct task_group *tg)
7544{
7545 int i;
7546
7547 for_each_possible_cpu(i) {
7548 if (tg->cfs_rq)
7549 kfree(tg->cfs_rq[i]);
7550 if (tg->se)
7551 kfree(tg->se[i]);
7552 if (tg->rt_rq)
7553 kfree(tg->rt_rq[i]);
7554 if (tg->rt_se)
7555 kfree(tg->rt_se[i]);
7556 }
7557
7558 kfree(tg->cfs_rq);
7559 kfree(tg->se);
7560 kfree(tg->rt_rq);
7561 kfree(tg->rt_se);
7562 kfree(tg);
7563}
7564
6975/* allocate runqueue etc for a new task group */ 7565/* allocate runqueue etc for a new task group */
6976struct task_group *sched_create_group(void) 7566struct task_group *sched_create_group(void)
6977{ 7567{
6978 struct task_group *tg; 7568 struct task_group *tg;
6979 struct cfs_rq *cfs_rq; 7569 struct cfs_rq *cfs_rq;
6980 struct sched_entity *se; 7570 struct sched_entity *se;
7571 struct rt_rq *rt_rq;
7572 struct sched_rt_entity *rt_se;
6981 struct rq *rq; 7573 struct rq *rq;
6982 int i; 7574 int i;
6983 7575
@@ -6991,97 +7583,89 @@ struct task_group *sched_create_group(void)
6991 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7583 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6992 if (!tg->se) 7584 if (!tg->se)
6993 goto err; 7585 goto err;
7586 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7587 if (!tg->rt_rq)
7588 goto err;
7589 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7590 if (!tg->rt_se)
7591 goto err;
7592
7593 tg->shares = NICE_0_LOAD;
7594 tg->rt_ratio = 0; /* XXX */
6994 7595
6995 for_each_possible_cpu(i) { 7596 for_each_possible_cpu(i) {
6996 rq = cpu_rq(i); 7597 rq = cpu_rq(i);
6997 7598
6998 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, 7599 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
6999 cpu_to_node(i)); 7600 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7000 if (!cfs_rq) 7601 if (!cfs_rq)
7001 goto err; 7602 goto err;
7002 7603
7003 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, 7604 se = kmalloc_node(sizeof(struct sched_entity),
7004 cpu_to_node(i)); 7605 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7005 if (!se) 7606 if (!se)
7006 goto err; 7607 goto err;
7007 7608
7008 memset(cfs_rq, 0, sizeof(struct cfs_rq)); 7609 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7009 memset(se, 0, sizeof(struct sched_entity)); 7610 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7611 if (!rt_rq)
7612 goto err;
7010 7613
7011 tg->cfs_rq[i] = cfs_rq; 7614 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7012 init_cfs_rq(cfs_rq, rq); 7615 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7013 cfs_rq->tg = tg; 7616 if (!rt_se)
7617 goto err;
7014 7618
7015 tg->se[i] = se; 7619 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7016 se->cfs_rq = &rq->cfs; 7620 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7017 se->my_q = cfs_rq;
7018 se->load.weight = NICE_0_LOAD;
7019 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
7020 se->parent = NULL;
7021 } 7621 }
7022 7622
7623 lock_task_group_list();
7023 for_each_possible_cpu(i) { 7624 for_each_possible_cpu(i) {
7024 rq = cpu_rq(i); 7625 rq = cpu_rq(i);
7025 cfs_rq = tg->cfs_rq[i]; 7626 cfs_rq = tg->cfs_rq[i];
7026 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7627 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7628 rt_rq = tg->rt_rq[i];
7629 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7027 } 7630 }
7028 7631 list_add_rcu(&tg->list, &task_groups);
7029 tg->shares = NICE_0_LOAD; 7632 unlock_task_group_list();
7030 spin_lock_init(&tg->lock);
7031 7633
7032 return tg; 7634 return tg;
7033 7635
7034err: 7636err:
7035 for_each_possible_cpu(i) { 7637 free_sched_group(tg);
7036 if (tg->cfs_rq)
7037 kfree(tg->cfs_rq[i]);
7038 if (tg->se)
7039 kfree(tg->se[i]);
7040 }
7041 kfree(tg->cfs_rq);
7042 kfree(tg->se);
7043 kfree(tg);
7044
7045 return ERR_PTR(-ENOMEM); 7638 return ERR_PTR(-ENOMEM);
7046} 7639}
7047 7640
7048/* rcu callback to free various structures associated with a task group */ 7641/* rcu callback to free various structures associated with a task group */
7049static void free_sched_group(struct rcu_head *rhp) 7642static void free_sched_group_rcu(struct rcu_head *rhp)
7050{ 7643{
7051 struct task_group *tg = container_of(rhp, struct task_group, rcu);
7052 struct cfs_rq *cfs_rq;
7053 struct sched_entity *se;
7054 int i;
7055
7056 /* now it should be safe to free those cfs_rqs */ 7644 /* now it should be safe to free those cfs_rqs */
7057 for_each_possible_cpu(i) { 7645 free_sched_group(container_of(rhp, struct task_group, rcu));
7058 cfs_rq = tg->cfs_rq[i];
7059 kfree(cfs_rq);
7060
7061 se = tg->se[i];
7062 kfree(se);
7063 }
7064
7065 kfree(tg->cfs_rq);
7066 kfree(tg->se);
7067 kfree(tg);
7068} 7646}
7069 7647
7070/* Destroy runqueue etc associated with a task group */ 7648/* Destroy runqueue etc associated with a task group */
7071void sched_destroy_group(struct task_group *tg) 7649void sched_destroy_group(struct task_group *tg)
7072{ 7650{
7073 struct cfs_rq *cfs_rq = NULL; 7651 struct cfs_rq *cfs_rq = NULL;
7652 struct rt_rq *rt_rq = NULL;
7074 int i; 7653 int i;
7075 7654
7655 lock_task_group_list();
7076 for_each_possible_cpu(i) { 7656 for_each_possible_cpu(i) {
7077 cfs_rq = tg->cfs_rq[i]; 7657 cfs_rq = tg->cfs_rq[i];
7078 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7658 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7659 rt_rq = tg->rt_rq[i];
7660 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7079 } 7661 }
7662 list_del_rcu(&tg->list);
7663 unlock_task_group_list();
7080 7664
7081 BUG_ON(!cfs_rq); 7665 BUG_ON(!cfs_rq);
7082 7666
7083 /* wait for possible concurrent references to cfs_rqs complete */ 7667 /* wait for possible concurrent references to cfs_rqs complete */
7084 call_rcu(&tg->rcu, free_sched_group); 7668 call_rcu(&tg->rcu, free_sched_group_rcu);
7085} 7669}
7086 7670
7087/* change task's runqueue when it moves between groups. 7671/* change task's runqueue when it moves between groups.
@@ -7097,14 +7681,9 @@ void sched_move_task(struct task_struct *tsk)
7097 7681
7098 rq = task_rq_lock(tsk, &flags); 7682 rq = task_rq_lock(tsk, &flags);
7099 7683
7100 if (tsk->sched_class != &fair_sched_class) {
7101 set_task_cfs_rq(tsk, task_cpu(tsk));
7102 goto done;
7103 }
7104
7105 update_rq_clock(rq); 7684 update_rq_clock(rq);
7106 7685
7107 running = task_running(rq, tsk); 7686 running = task_current(rq, tsk);
7108 on_rq = tsk->se.on_rq; 7687 on_rq = tsk->se.on_rq;
7109 7688
7110 if (on_rq) { 7689 if (on_rq) {
@@ -7113,7 +7692,7 @@ void sched_move_task(struct task_struct *tsk)
7113 tsk->sched_class->put_prev_task(rq, tsk); 7692 tsk->sched_class->put_prev_task(rq, tsk);
7114 } 7693 }
7115 7694
7116 set_task_cfs_rq(tsk, task_cpu(tsk)); 7695 set_task_rq(tsk, task_cpu(tsk));
7117 7696
7118 if (on_rq) { 7697 if (on_rq) {
7119 if (unlikely(running)) 7698 if (unlikely(running))
@@ -7121,45 +7700,82 @@ void sched_move_task(struct task_struct *tsk)
7121 enqueue_task(rq, tsk, 0); 7700 enqueue_task(rq, tsk, 0);
7122 } 7701 }
7123 7702
7124done:
7125 task_rq_unlock(rq, &flags); 7703 task_rq_unlock(rq, &flags);
7126} 7704}
7127 7705
7706/* rq->lock to be locked by caller */
7128static void set_se_shares(struct sched_entity *se, unsigned long shares) 7707static void set_se_shares(struct sched_entity *se, unsigned long shares)
7129{ 7708{
7130 struct cfs_rq *cfs_rq = se->cfs_rq; 7709 struct cfs_rq *cfs_rq = se->cfs_rq;
7131 struct rq *rq = cfs_rq->rq; 7710 struct rq *rq = cfs_rq->rq;
7132 int on_rq; 7711 int on_rq;
7133 7712
7134 spin_lock_irq(&rq->lock); 7713 if (!shares)
7714 shares = MIN_GROUP_SHARES;
7135 7715
7136 on_rq = se->on_rq; 7716 on_rq = se->on_rq;
7137 if (on_rq) 7717 if (on_rq) {
7138 dequeue_entity(cfs_rq, se, 0); 7718 dequeue_entity(cfs_rq, se, 0);
7719 dec_cpu_load(rq, se->load.weight);
7720 }
7139 7721
7140 se->load.weight = shares; 7722 se->load.weight = shares;
7141 se->load.inv_weight = div64_64((1ULL<<32), shares); 7723 se->load.inv_weight = div64_64((1ULL<<32), shares);
7142 7724
7143 if (on_rq) 7725 if (on_rq) {
7144 enqueue_entity(cfs_rq, se, 0); 7726 enqueue_entity(cfs_rq, se, 0);
7145 7727 inc_cpu_load(rq, se->load.weight);
7146 spin_unlock_irq(&rq->lock); 7728 }
7147} 7729}
7148 7730
7149int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7731int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7150{ 7732{
7151 int i; 7733 int i;
7734 struct cfs_rq *cfs_rq;
7735 struct rq *rq;
7152 7736
7153 spin_lock(&tg->lock); 7737 lock_task_group_list();
7154 if (tg->shares == shares) 7738 if (tg->shares == shares)
7155 goto done; 7739 goto done;
7156 7740
7741 if (shares < MIN_GROUP_SHARES)
7742 shares = MIN_GROUP_SHARES;
7743
7744 /*
7745 * Prevent any load balance activity (rebalance_shares,
7746 * load_balance_fair) from referring to this group first,
7747 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7748 */
7749 for_each_possible_cpu(i) {
7750 cfs_rq = tg->cfs_rq[i];
7751 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7752 }
7753
7754 /* wait for any ongoing reference to this group to finish */
7755 synchronize_sched();
7756
7757 /*
7758 * Now we are free to modify the group's share on each cpu
7759 * w/o tripping rebalance_share or load_balance_fair.
7760 */
7157 tg->shares = shares; 7761 tg->shares = shares;
7158 for_each_possible_cpu(i) 7762 for_each_possible_cpu(i) {
7763 spin_lock_irq(&cpu_rq(i)->lock);
7159 set_se_shares(tg->se[i], shares); 7764 set_se_shares(tg->se[i], shares);
7765 spin_unlock_irq(&cpu_rq(i)->lock);
7766 }
7160 7767
7768 /*
7769 * Enable load balance activity on this group, by inserting it back on
7770 * each cpu's rq->leaf_cfs_rq_list.
7771 */
7772 for_each_possible_cpu(i) {
7773 rq = cpu_rq(i);
7774 cfs_rq = tg->cfs_rq[i];
7775 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7776 }
7161done: 7777done:
7162 spin_unlock(&tg->lock); 7778 unlock_task_group_list();
7163 return 0; 7779 return 0;
7164} 7780}
7165 7781
@@ -7168,6 +7784,31 @@ unsigned long sched_group_shares(struct task_group *tg)
7168 return tg->shares; 7784 return tg->shares;
7169} 7785}
7170 7786
7787/*
7788 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
7789 */
7790int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7791{
7792 struct task_group *tgi;
7793 unsigned long total = 0;
7794
7795 rcu_read_lock();
7796 list_for_each_entry_rcu(tgi, &task_groups, list)
7797 total += tgi->rt_ratio;
7798 rcu_read_unlock();
7799
7800 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7801 return -EINVAL;
7802
7803 tg->rt_ratio = rt_ratio;
7804 return 0;
7805}
7806
7807unsigned long sched_group_rt_ratio(struct task_group *tg)
7808{
7809 return tg->rt_ratio;
7810}
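The admission check in sched_group_set_rt_ratio() only accepts a new ratio when the sum over all groups, with this group's old value swapped out for the proposed one, still fits under sysctl_sched_rt_ratio. A small model of that check with made-up numbers (a global limit of 60000 and three groups):

#include <stdio.h>

/* model of the admission check in sched_group_set_rt_ratio() */
static int set_rt_ratio(unsigned long *ratios, unsigned long n,
			unsigned long idx, unsigned long new_ratio,
			unsigned long global_limit)
{
	unsigned long total = 0, i;

	for (i = 0; i < n; i++)
		total += ratios[i];

	/* swap this group's old ratio for the proposed one */
	if (total + new_ratio - ratios[idx] > global_limit)
		return -1;			/* -EINVAL in the kernel */

	ratios[idx] = new_ratio;
	return 0;
}

int main(void)
{
	/* made-up numbers: three groups under a global limit of 60000 */
	unsigned long ratios[3] = { 20000, 20000, 0 };
	unsigned long limit = 60000;

	printf("set group 2 to 20000: %s\n",
	       set_rt_ratio(ratios, 3, 2, 20000, limit) ? "rejected" : "ok");
	printf("raise group 2 to 30000: %s\n",
	       set_rt_ratio(ratios, 3, 2, 30000, limit) ? "rejected" : "ok");
	return 0;
}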
7811
7171#endif /* CONFIG_FAIR_GROUP_SCHED */ 7812#endif /* CONFIG_FAIR_GROUP_SCHED */
7172 7813
7173#ifdef CONFIG_FAIR_CGROUP_SCHED 7814#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7243,12 +7884,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7243 return (u64) tg->shares; 7884 return (u64) tg->shares;
7244} 7885}
7245 7886
7887static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7888 u64 rt_ratio_val)
7889{
7890 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7891}
7892
7893static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7894{
7895 struct task_group *tg = cgroup_tg(cgrp);
7896
7897 return (u64) tg->rt_ratio;
7898}
7899
7246static struct cftype cpu_files[] = { 7900static struct cftype cpu_files[] = {
7247 { 7901 {
7248 .name = "shares", 7902 .name = "shares",
7249 .read_uint = cpu_shares_read_uint, 7903 .read_uint = cpu_shares_read_uint,
7250 .write_uint = cpu_shares_write_uint, 7904 .write_uint = cpu_shares_write_uint,
7251 }, 7905 },
7906 {
7907 .name = "rt_ratio",
7908 .read_uint = cpu_rt_ratio_read_uint,
7909 .write_uint = cpu_rt_ratio_write_uint,
7910 },
7252}; 7911};
7253 7912
7254static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7913static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
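Since the cgroup subsystem here is named "cpu", the two cftype entries above should surface as per-group files cpu.shares and cpu.rt_ratio. A hedged userspace sketch of driving them from C; the mount point /dev/cpuctl, the group name "browser" and the written values are made up for illustration:

#include <stdio.h>

/* write a decimal value into a cgroup control file; returns 0 on success */
static int cg_write(const char *path, unsigned long long val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%llu\n", val);
	return fclose(f) ? -1 : 0;
}

int main(void)
{
	/* assumed layout: cpu controller mounted at /dev/cpuctl, group "browser" */
	cg_write("/dev/cpuctl/browser/cpu.shares", 512);
	cg_write("/dev/cpuctl/browser/cpu.rt_ratio", 6553);	/* arbitrary value */
	return 0;
}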
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index d30467b47ddd..4b5e24cf2f4a 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -31,9 +31,9 @@
31/* 31/*
32 * Ease the printing of nsec fields: 32 * Ease the printing of nsec fields:
33 */ 33 */
34static long long nsec_high(long long nsec) 34static long long nsec_high(unsigned long long nsec)
35{ 35{
36 if (nsec < 0) { 36 if ((long long)nsec < 0) {
37 nsec = -nsec; 37 nsec = -nsec;
38 do_div(nsec, 1000000); 38 do_div(nsec, 1000000);
39 return -nsec; 39 return -nsec;
@@ -43,9 +43,9 @@ static long long nsec_high(long long nsec)
43 return nsec; 43 return nsec;
44} 44}
45 45
46static unsigned long nsec_low(long long nsec) 46static unsigned long nsec_low(unsigned long long nsec)
47{ 47{
48 if (nsec < 0) 48 if ((long long)nsec < 0)
49 nsec = -nsec; 49 nsec = -nsec;
50 50
51 return do_div(nsec, 1000000); 51 return do_div(nsec, 1000000);
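nsec_high()/nsec_low() split a nanosecond count into whole milliseconds and the sub-millisecond remainder so it can be printed as a decimal, e.g. 1234567890 ns as 1234.567890; the signedness fix above keeps that split working for negative deltas too. A quick userspace check of the positive case:

#include <stdio.h>

int main(void)
{
	long long nsec = 1234567890LL;
	long long high = nsec / 1000000;	/* nsec_high(): whole milliseconds */
	long low = (long)(nsec % 1000000);	/* nsec_low(): the remainder       */

	printf("%lld.%06ld\n", high, low);	/* -> 1234.567890 */
	return 0;
}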
@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu)
179 PN(prev_clock_raw); 179 PN(prev_clock_raw);
180 P(clock_warps); 180 P(clock_warps);
181 P(clock_overflows); 181 P(clock_overflows);
182 P(clock_underflows);
182 P(clock_deep_idle_events); 183 P(clock_deep_idle_events);
183 PN(clock_max_delta); 184 PN(clock_max_delta);
184 P(cpu_load[0]); 185 P(cpu_load[0]);
@@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
299 PN(se.exec_max); 300 PN(se.exec_max);
300 PN(se.slice_max); 301 PN(se.slice_max);
301 PN(se.wait_max); 302 PN(se.wait_max);
303 PN(se.wait_sum);
304 P(se.wait_count);
302 P(sched_info.bkl_count); 305 P(sched_info.bkl_count);
303 P(se.nr_migrations); 306 P(se.nr_migrations);
304 P(se.nr_migrations_cold); 307 P(se.nr_migrations_cold);
@@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p)
366{ 369{
367#ifdef CONFIG_SCHEDSTATS 370#ifdef CONFIG_SCHEDSTATS
368 p->se.wait_max = 0; 371 p->se.wait_max = 0;
372 p->se.wait_sum = 0;
373 p->se.wait_count = 0;
369 p->se.sleep_max = 0; 374 p->se.sleep_max = 0;
370 p->se.sum_sleep_runtime = 0; 375 p->se.sum_sleep_runtime = 0;
371 p->se.block_max = 0; 376 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c33f0ceb3de9..6c091d6e159d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -20,6 +20,8 @@
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21 */ 21 */
22 22
23#include <linux/latencytop.h>
24
23/* 25/*
24 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
25 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running)
248 unsigned long nr_latency = sched_nr_latency; 250 unsigned long nr_latency = sched_nr_latency;
249 251
250 if (unlikely(nr_running > nr_latency)) { 252 if (unlikely(nr_running > nr_latency)) {
253 period = sysctl_sched_min_granularity;
251 period *= nr_running; 254 period *= nr_running;
252 do_div(period, nr_latency);
253 } 255 }
254 256
255 return period; 257 return period;
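With this change the scheduling period grows linearly from sysctl_sched_min_granularity once nr_running exceeds sched_nr_latency, instead of stretching sysctl_sched_latency by a division. Assuming single-CPU defaults of a 20 ms latency and 4 ms minimum granularity (so nr_latency = 5, both values are assumptions for illustration), up to 5 runnable tasks share the 20 ms period while 8 tasks get 8 * 4 ms = 32 ms:

#include <stdio.h>

int main(void)
{
	/* assumed single-CPU defaults: 20 ms latency, 4 ms minimum granularity */
	unsigned long long latency_ns = 20000000ULL;
	unsigned long long min_gran_ns = 4000000ULL;
	unsigned long long nr_latency = latency_ns / min_gran_ns;	/* 5 */
	unsigned long long nr_running;

	for (nr_running = 1; nr_running <= 8; nr_running++) {
		unsigned long long period = latency_ns;

		if (nr_running > nr_latency)
			period = min_gran_ns * nr_running;	/* new behaviour */

		printf("%llu tasks -> period %llu ns\n", nr_running, period);
	}
	return 0;
}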
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
383{ 385{
384 schedstat_set(se->wait_max, max(se->wait_max, 386 schedstat_set(se->wait_max, max(se->wait_max,
385 rq_of(cfs_rq)->clock - se->wait_start)); 387 rq_of(cfs_rq)->clock - se->wait_start));
388 schedstat_set(se->wait_count, se->wait_count + 1);
389 schedstat_set(se->wait_sum, se->wait_sum +
390 rq_of(cfs_rq)->clock - se->wait_start);
386 schedstat_set(se->wait_start, 0); 391 schedstat_set(se->wait_start, 0);
387} 392}
388 393
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
434#ifdef CONFIG_SCHEDSTATS 439#ifdef CONFIG_SCHEDSTATS
435 if (se->sleep_start) { 440 if (se->sleep_start) {
436 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 441 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
442 struct task_struct *tsk = task_of(se);
437 443
438 if ((s64)delta < 0) 444 if ((s64)delta < 0)
439 delta = 0; 445 delta = 0;
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
443 449
444 se->sleep_start = 0; 450 se->sleep_start = 0;
445 se->sum_sleep_runtime += delta; 451 se->sum_sleep_runtime += delta;
452
453 account_scheduler_latency(tsk, delta >> 10, 1);
446 } 454 }
447 if (se->block_start) { 455 if (se->block_start) {
448 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 456 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
457 struct task_struct *tsk = task_of(se);
449 458
450 if ((s64)delta < 0) 459 if ((s64)delta < 0)
451 delta = 0; 460 delta = 0;
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
462 * time that the task spent sleeping: 471 * time that the task spent sleeping:
463 */ 472 */
464 if (unlikely(prof_on == SLEEP_PROFILING)) { 473 if (unlikely(prof_on == SLEEP_PROFILING)) {
465 struct task_struct *tsk = task_of(se);
466 474
467 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 475 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
468 delta >> 20); 476 delta >> 20);
469 } 477 }
478 account_scheduler_latency(tsk, delta >> 10, 0);
470 } 479 }
471#endif 480#endif
472} 481}
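account_scheduler_latency() receives delta >> 10, i.e. the sleep or block time scaled from nanoseconds to roughly microseconds (a divide by 1024 rather than 1000), with the final argument distinguishing the interruptible-sleep path (1) from the uninterruptible-block path (0); the SLEEP_PROFILING hit keeps its coarser delta >> 20 scaling. A quick check of the scaling:

#include <stdio.h>

int main(void)
{
	unsigned long long delta_ns = 5000000ULL;	/* a 5 ms sleep */

	/* delta >> 10 approximates ns -> us by dividing by 1024, not 1000 */
	printf("reported to latencytop: %llu us\n", delta_ns >> 10);	/* 4882 */
	return 0;
}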
@@ -511,8 +520,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
511 520
512 if (!initial) { 521 if (!initial) {
513 /* sleeps up to a single latency don't count. */ 522 if (sched_feat(NEW_FAIR_SLEEPERS))
514 if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && 523 if (sched_feat(NEW_FAIR_SLEEPERS))
515 task_of(se)->policy != SCHED_BATCH)
516 vruntime -= sysctl_sched_latency; 524 vruntime -= sysctl_sched_latency;
517 525
518 /* ensure we never gain time by being placed backwards. */ 526 /* ensure we never gain time by being placed backwards. */
@@ -643,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
643 cfs_rq->curr = NULL; 651 cfs_rq->curr = NULL;
644} 652}
645 653
646static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 654static void
655entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
647{ 656{
648 /* 657 /*
649 * Update run-time statistics of the 'current'. 658 * Update run-time statistics of the 'current'.
650 */ 659 */
651 update_curr(cfs_rq); 660 update_curr(cfs_rq);
652 661
662#ifdef CONFIG_SCHED_HRTICK
663 /*
664 * queued ticks are scheduled to match the slice, so don't bother
665 * validating it and just reschedule.
666 */
667 if (queued)
668 return resched_task(rq_of(cfs_rq)->curr);
669 /*
670 * don't let the period tick interfere with the hrtick preemption
671 */
672 if (!sched_feat(DOUBLE_TICK) &&
673 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
674 return;
675#endif
676
653 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 677 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
654 check_preempt_tick(cfs_rq, curr); 678 check_preempt_tick(cfs_rq, curr);
655} 679}
@@ -691,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
691 715
692/* Iterate thr' all leaf cfs_rq's on a runqueue */ 716/* Iterate thr' all leaf cfs_rq's on a runqueue */
693#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 717#define for_each_leaf_cfs_rq(rq, cfs_rq) \
694 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 718 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
695 719
696/* Do the two (enqueued) entities belong to the same group ? */ 720/* Do the two (enqueued) entities belong to the same group ? */
697static inline int 721static inline int
@@ -708,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
708 return se->parent; 732 return se->parent;
709} 733}
710 734
735#define GROUP_IMBALANCE_PCT 20
736
711#else /* CONFIG_FAIR_GROUP_SCHED */ 737#else /* CONFIG_FAIR_GROUP_SCHED */
712 738
713#define for_each_sched_entity(se) \ 739#define for_each_sched_entity(se) \
@@ -753,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
753 779
754#endif /* CONFIG_FAIR_GROUP_SCHED */ 780#endif /* CONFIG_FAIR_GROUP_SCHED */
755 781
782#ifdef CONFIG_SCHED_HRTICK
783static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
784{
785 int requeue = rq->curr == p;
786 struct sched_entity *se = &p->se;
787 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788
789 WARN_ON(task_rq(p) != rq);
790
791 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
792 u64 slice = sched_slice(cfs_rq, se);
793 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
794 s64 delta = slice - ran;
795
796 if (delta < 0) {
797 if (rq->curr == p)
798 resched_task(p);
799 return;
800 }
801
802 /*
803 * Don't schedule slices shorter than 10000ns, that just
804 * doesn't make sense. Rely on vruntime for fairness.
805 */
806 if (!requeue)
807 delta = max(10000LL, delta);
808
809 hrtick_start(rq, delta, requeue);
810 }
811}
812#else
813static inline void
814hrtick_start_fair(struct rq *rq, struct task_struct *p)
815{
816}
817#endif
818
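hrtick_start_fair() arms the per-runqueue high-resolution timer to expire exactly when the current slice runs out: delta = slice - ran, an already-overrun slice reschedules immediately, and when the task is not already rq->curr the delay is clamped to at least 10 us. With a hypothetical 12 ms slice of which 9.5 ms has been consumed, the timer is programmed 2.5 ms out; a minimal sketch of that decision:

#include <stdio.h>

int main(void)
{
	/* hypothetical numbers: 12 ms slice, 9.5 ms of it already consumed */
	long long slice = 12000000LL;
	long long ran = 9500000LL;
	long long delta = slice - ran;
	int requeue = 0;		/* the task is not rq->curr here */

	if (delta < 0) {
		printf("slice already used up: reschedule immediately\n");
		return 0;
	}

	/* don't bother with sub-10us timers; vruntime keeps things fair */
	if (!requeue && delta < 10000)
		delta = 10000;

	printf("arm the hrtick %lld ns from now\n", delta);	/* 2500000 */
	return 0;
}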
756/* 819/*
757 * The enqueue_task method is called before nr_running is 820 * The enqueue_task method is called before nr_running is
758 * increased. Here we update the fair scheduling stats and 821 * increased. Here we update the fair scheduling stats and
@@ -761,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
761static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 824static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
762{ 825{
763 struct cfs_rq *cfs_rq; 826 struct cfs_rq *cfs_rq;
764 struct sched_entity *se = &p->se; 827 struct sched_entity *se = &p->se,
828 *topse = NULL; /* Highest schedulable entity */
829 int incload = 1;
765 830
766 for_each_sched_entity(se) { 831 for_each_sched_entity(se) {
767 if (se->on_rq) 832 topse = se;
833 if (se->on_rq) {
834 incload = 0;
768 break; 835 break;
836 }
769 cfs_rq = cfs_rq_of(se); 837 cfs_rq = cfs_rq_of(se);
770 enqueue_entity(cfs_rq, se, wakeup); 838 enqueue_entity(cfs_rq, se, wakeup);
771 wakeup = 1; 839 wakeup = 1;
772 } 840 }
841 /* Increment cpu load if we just enqueued the first task of a group on
842 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
843 * at the highest grouping level.
844 */
845 if (incload)
846 inc_cpu_load(rq, topse->load.weight);
847
848 hrtick_start_fair(rq, rq->curr);
773} 849}
774 850
775/* 851/*
@@ -780,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
780static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 856static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
781{ 857{
782 struct cfs_rq *cfs_rq; 858 struct cfs_rq *cfs_rq;
783 struct sched_entity *se = &p->se; 859 struct sched_entity *se = &p->se,
860 *topse = NULL; /* Highest schedulable entity */
861 int decload = 1;
784 862
785 for_each_sched_entity(se) { 863 for_each_sched_entity(se) {
864 topse = se;
786 cfs_rq = cfs_rq_of(se); 865 cfs_rq = cfs_rq_of(se);
787 dequeue_entity(cfs_rq, se, sleep); 866 dequeue_entity(cfs_rq, se, sleep);
788 /* Don't dequeue parent if it has other entities besides us */ 867 /* Don't dequeue parent if it has other entities besides us */
789 if (cfs_rq->load.weight) 868 if (cfs_rq->load.weight) {
869 if (parent_entity(se))
870 decload = 0;
790 break; 871 break;
872 }
791 sleep = 1; 873 sleep = 1;
792 } 874 }
875 /* Decrement cpu load if we just dequeued the last task of a group on
876 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
877 * at the highest grouping level.
878 */
879 if (decload)
880 dec_cpu_load(rq, topse->load.weight);
881
882 hrtick_start_fair(rq, rq->curr);
793} 883}
794 884
795/* 885/*
@@ -837,6 +927,154 @@ static void yield_task_fair(struct rq *rq)
837} 927}
838 928
839/* 929/*
930 * wake_idle() will wake a task on an idle cpu if task->cpu is
931 * not idle and an idle cpu is available. The span of cpus to
932 * search starts with cpus closest then further out as needed,
933 * so we always favor a closer, idle cpu.
934 *
935 * Returns the CPU we should wake onto.
936 */
937#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
938static int wake_idle(int cpu, struct task_struct *p)
939{
940 cpumask_t tmp;
941 struct sched_domain *sd;
942 int i;
943
944 /*
945 * If it is idle, then it is the best cpu to run this task.
946 *
947 * This cpu is also the best, if it has more than one task already.
948 * Siblings must be also busy(in most cases) as they didn't already
949 * pickup the extra load from this cpu and hence we need not check
950 * sibling runqueue info. This will avoid the checks and cache miss
951 * penalties associated with that.
952 */
953 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
954 return cpu;
955
956 for_each_domain(cpu, sd) {
957 if (sd->flags & SD_WAKE_IDLE) {
958 cpus_and(tmp, sd->span, p->cpus_allowed);
959 for_each_cpu_mask(i, tmp) {
960 if (idle_cpu(i)) {
961 if (i != task_cpu(p)) {
962 schedstat_inc(p,
963 se.nr_wakeups_idle);
964 }
965 return i;
966 }
967 }
968 } else {
969 break;
970 }
971 }
972 return cpu;
973}
974#else
975static inline int wake_idle(int cpu, struct task_struct *p)
976{
977 return cpu;
978}
979#endif
980
981#ifdef CONFIG_SMP
982static int select_task_rq_fair(struct task_struct *p, int sync)
983{
984 int cpu, this_cpu;
985 struct rq *rq;
986 struct sched_domain *sd, *this_sd = NULL;
987 int new_cpu;
988
989 cpu = task_cpu(p);
990 rq = task_rq(p);
991 this_cpu = smp_processor_id();
992 new_cpu = cpu;
993
994 if (cpu == this_cpu)
995 goto out_set_cpu;
996
997 for_each_domain(this_cpu, sd) {
998 if (cpu_isset(cpu, sd->span)) {
999 this_sd = sd;
1000 break;
1001 }
1002 }
1003
1004 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu;
1006
1007 /*
1008 * Check for affine wakeup and passive balancing possibilities.
1009 */
1010 if (this_sd) {
1011 int idx = this_sd->wake_idx;
1012 unsigned int imbalance;
1013 unsigned long load, this_load;
1014
1015 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1016
1017 load = source_load(cpu, idx);
1018 this_load = target_load(this_cpu, idx);
1019
1020 new_cpu = this_cpu; /* Wake to this CPU if we can */
1021
1022 if (this_sd->flags & SD_WAKE_AFFINE) {
1023 unsigned long tl = this_load;
1024 unsigned long tl_per_task;
1025
1026 /*
1027 * Attract cache-cold tasks on sync wakeups:
1028 */
1029 if (sync && !task_hot(p, rq->clock, this_sd))
1030 goto out_set_cpu;
1031
1032 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1033 tl_per_task = cpu_avg_load_per_task(this_cpu);
1034
1035 /*
1036 * If sync wakeup then subtract the (maximum possible)
1037 * effect of the currently running task from the load
1038 * of the current CPU:
1039 */
1040 if (sync)
1041 tl -= current->se.load.weight;
1042
1043 if ((tl <= load &&
1044 tl + target_load(cpu, idx) <= tl_per_task) ||
1045 100*(tl + p->se.load.weight) <= imbalance*load) {
1046 /*
1047 * This domain has SD_WAKE_AFFINE and
1048 * p is cache cold in this domain, and
1049 * there is no bad imbalance.
1050 */
1051 schedstat_inc(this_sd, ttwu_move_affine);
1052 schedstat_inc(p, se.nr_wakeups_affine);
1053 goto out_set_cpu;
1054 }
1055 }
1056
1057 /*
1058 * Start passive balancing when half the imbalance_pct
1059 * limit is reached.
1060 */
1061 if (this_sd->flags & SD_WAKE_BALANCE) {
1062 if (imbalance*this_load <= 100*load) {
1063 schedstat_inc(this_sd, ttwu_move_balance);
1064 schedstat_inc(p, se.nr_wakeups_passive);
1065 goto out_set_cpu;
1066 }
1067 }
1068 }
1069
1070 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1071out_set_cpu:
1072 return wake_idle(new_cpu, p);
1073}
1074#endif /* CONFIG_SMP */
1075
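The wake-affine decision above reduces to percentage arithmetic on the sched-domain's imbalance_pct. Taking a hypothetical imbalance_pct of 125, the effective threshold is imbalance = 100 + (125 - 100) / 2 = 112, and one of the two conditions for pulling the wakee onto the waking CPU is 100 * (this_load + p->se.load.weight) <= imbalance * load of its previous CPU (the sync and per-task-load adjustments are left out of this sketch). A numeric check with made-up loads:

#include <stdio.h>

int main(void)
{
	/* hypothetical values, mirroring the variables in select_task_rq_fair() */
	unsigned int imbalance_pct = 125;
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;	/* 112 */
	unsigned long load = 2048;	/* source_load() of the previous cpu  */
	unsigned long this_load = 1024;	/* target_load() of the waking cpu    */
	unsigned long p_weight = 1024;	/* p->se.load.weight                  */

	if (100 * (this_load + p_weight) <= imbalance * load)
		printf("wake affine: run on the waker's cpu\n");
	else
		printf("leave the task on its previous cpu\n");
	return 0;
}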
1076
1077/*
840 * Preempt the current task with a newly woken task if needed: 1078 * Preempt the current task with a newly woken task if needed:
841 */ 1079 */
842static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1080static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
@@ -868,7 +1106,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
868 } 1106 }
869 1107
870 gran = sysctl_sched_wakeup_granularity; 1108 gran = sysctl_sched_wakeup_granularity;
871 if (unlikely(se->load.weight != NICE_0_LOAD)) 1109 /*
1110 * More easily preempt - nice tasks, while not making
1111 * it harder for + nice tasks.
1112 */
1113 if (unlikely(se->load.weight > NICE_0_LOAD))
872 gran = calc_delta_fair(gran, &se->load); 1114 gran = calc_delta_fair(gran, &se->load);
873 1115
874 if (pse->vruntime + gran < se->vruntime) 1116 if (pse->vruntime + gran < se->vruntime)
@@ -877,6 +1119,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
877 1119
878static struct task_struct *pick_next_task_fair(struct rq *rq) 1120static struct task_struct *pick_next_task_fair(struct rq *rq)
879{ 1121{
1122 struct task_struct *p;
880 struct cfs_rq *cfs_rq = &rq->cfs; 1123 struct cfs_rq *cfs_rq = &rq->cfs;
881 struct sched_entity *se; 1124 struct sched_entity *se;
882 1125
@@ -888,7 +1131,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
888 cfs_rq = group_cfs_rq(se); 1131 cfs_rq = group_cfs_rq(se);
889 } while (cfs_rq); 1132 } while (cfs_rq);
890 1133
891 return task_of(se); 1134 p = task_of(se);
1135 hrtick_start_fair(rq, p);
1136
1137 return p;
892} 1138}
893 1139
894/* 1140/*
@@ -945,25 +1191,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
945 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1191 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
946} 1192}
947 1193
948#ifdef CONFIG_FAIR_GROUP_SCHED
949static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
950{
951 struct sched_entity *curr;
952 struct task_struct *p;
953
954 if (!cfs_rq->nr_running)
955 return MAX_PRIO;
956
957 curr = cfs_rq->curr;
958 if (!curr)
959 curr = __pick_next_entity(cfs_rq);
960
961 p = task_of(curr);
962
963 return p->prio;
964}
965#endif
966
967static unsigned long 1194static unsigned long
968load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1195load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
969 unsigned long max_load_move, 1196 unsigned long max_load_move,
@@ -973,28 +1200,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
973 struct cfs_rq *busy_cfs_rq; 1200 struct cfs_rq *busy_cfs_rq;
974 long rem_load_move = max_load_move; 1201 long rem_load_move = max_load_move;
975 struct rq_iterator cfs_rq_iterator; 1202 struct rq_iterator cfs_rq_iterator;
1203 unsigned long load_moved;
976 1204
977 cfs_rq_iterator.start = load_balance_start_fair; 1205 cfs_rq_iterator.start = load_balance_start_fair;
978 cfs_rq_iterator.next = load_balance_next_fair; 1206 cfs_rq_iterator.next = load_balance_next_fair;
979 1207
980 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1208 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
981#ifdef CONFIG_FAIR_GROUP_SCHED 1209#ifdef CONFIG_FAIR_GROUP_SCHED
982 struct cfs_rq *this_cfs_rq; 1210 struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
983 long imbalance; 1211 unsigned long maxload, task_load, group_weight;
984 unsigned long maxload; 1212 unsigned long thisload, per_task_load;
1213 struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
1214
1215 task_load = busy_cfs_rq->load.weight;
1216 group_weight = se->load.weight;
985 1217
986 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1218 /*
1219 * 'group_weight' is contributed by tasks of total weight
1220 * 'task_load'. To move 'rem_load_move' worth of weight only,
1221 * we need to move a maximum task load of:
1222 *
1223 * maxload = (remload / group_weight) * task_load;
1224 */
1225 maxload = (rem_load_move * task_load) / group_weight;
987 1226
988 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1227 if (!maxload || !task_load)
989 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
990 if (imbalance <= 0)
991 continue; 1228 continue;
992 1229
993 /* Don't pull more than imbalance/2 */ 1230 per_task_load = task_load / busy_cfs_rq->nr_running;
994 imbalance /= 2; 1231 /*
 995 maxload = min(rem_load_move, imbalance); 1232 * balance_tasks will try to forcibly move at least one task if
 1233 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
 1234 * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
1235 */
1236 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
1237 continue;
996 1238
997 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1239 /* Disable priority-based load balance */
1240 *this_best_prio = 0;
1241 thisload = this_cfs_rq->load.weight;
998#else 1242#else
999# define maxload rem_load_move 1243# define maxload rem_load_move
1000#endif 1244#endif
@@ -1003,11 +1247,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1003 * load_balance_[start|next]_fair iterators 1247 * load_balance_[start|next]_fair iterators
1004 */ 1248 */
1005 cfs_rq_iterator.arg = busy_cfs_rq; 1249 cfs_rq_iterator.arg = busy_cfs_rq;
1006 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, 1250 load_moved = balance_tasks(this_rq, this_cpu, busiest,
1007 maxload, sd, idle, all_pinned, 1251 maxload, sd, idle, all_pinned,
1008 this_best_prio, 1252 this_best_prio,
1009 &cfs_rq_iterator); 1253 &cfs_rq_iterator);
1010 1254
1255#ifdef CONFIG_FAIR_GROUP_SCHED
1256 /*
1257 * load_moved holds the task load that was moved. The
1258 * effective (group) weight moved would be:
1259 * load_moved_eff = load_moved/task_load * group_weight;
1260 */
1261 load_moved = (group_weight * load_moved) / task_load;
1262
1263 /* Adjust shares on both cpus to reflect load_moved */
1264 group_weight -= load_moved;
1265 set_se_shares(se, group_weight);
1266
1267 se = busy_cfs_rq->tg->se[this_cpu];
1268 if (!thisload)
1269 group_weight = load_moved;
1270 else
1271 group_weight = se->load.weight + load_moved;
1272 set_se_shares(se, group_weight);
1273#endif
1274
1275 rem_load_move -= load_moved;
1276
1011 if (rem_load_move <= 0) 1277 if (rem_load_move <= 0)
1012 break; 1278 break;
1013 } 1279 }
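The group-scheduling math in this hunk converts between two scales: the weight of the tasks inside the group on the busiest CPU (task_load = busy_cfs_rq->load.weight) and the group's own weight there (group_weight = se->load.weight). With made-up figures, a group worth 1024 on the busiest CPU but carrying 4096 of task weight turns a remaining imbalance of 512 into maxload = 512 * 4096 / 1024 = 2048 of task weight to pull, and actually moving 1024 of task weight corresponds to 1024 * 1024 / 4096 = 256 of group weight, which is then shifted from the busiest CPU's share to this CPU's:

#include <stdio.h>

int main(void)
{
	/* hypothetical figures for one busy_cfs_rq during load balancing */
	unsigned long task_load = 4096;		/* busy_cfs_rq->load.weight     */
	unsigned long group_weight = 1024;	/* tg->se[busiest]->load.weight */
	unsigned long rem_load_move = 512;	/* imbalance still to move      */
	unsigned long nr_running = 4;
	unsigned long load_moved = 1024;	/* what balance_tasks() moved   */
	unsigned long maxload, per_task_load, moved_eff;

	maxload = (rem_load_move * task_load) / group_weight;
	per_task_load = task_load / nr_running;

	/* skip the group if maxload is under 20% (GROUP_IMBALANCE_PCT) of it */
	if (100 * maxload < 20 * per_task_load) {
		printf("group skipped\n");
		return 0;
	}

	moved_eff = (group_weight * load_moved) / task_load;

	printf("maxload=%lu task weight, effective group weight moved=%lu\n",
	       maxload, moved_eff);
	printf("busiest cpu shares: %lu -> %lu\n",
	       group_weight, group_weight - moved_eff);
	return 0;
}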
@@ -1043,14 +1309,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1043/* 1309/*
1044 * scheduler tick hitting a task of our scheduling class: 1310 * scheduler tick hitting a task of our scheduling class:
1045 */ 1311 */
1046static void task_tick_fair(struct rq *rq, struct task_struct *curr) 1312static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1047{ 1313{
1048 struct cfs_rq *cfs_rq; 1314 struct cfs_rq *cfs_rq;
1049 struct sched_entity *se = &curr->se; 1315 struct sched_entity *se = &curr->se;
1050 1316
1051 for_each_sched_entity(se) { 1317 for_each_sched_entity(se) {
1052 cfs_rq = cfs_rq_of(se); 1318 cfs_rq = cfs_rq_of(se);
1053 entity_tick(cfs_rq, se); 1319 entity_tick(cfs_rq, se, queued);
1054 } 1320 }
1055} 1321}
1056 1322
@@ -1088,6 +1354,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1088 resched_task(rq->curr); 1354 resched_task(rq->curr);
1089} 1355}
1090 1356
1357/*
1358 * Priority of the task has changed. Check to see if we preempt
1359 * the current task.
1360 */
1361static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1362 int oldprio, int running)
1363{
1364 /*
1365 * Reschedule if we are currently running on this runqueue and
1366 * our priority decreased, or if we are not currently running on
1367 * this runqueue and our priority is higher than the current's
1368 */
1369 if (running) {
1370 if (p->prio > oldprio)
1371 resched_task(rq->curr);
1372 } else
1373 check_preempt_curr(rq, p);
1374}
1375
1376/*
1377 * We switched to the sched_fair class.
1378 */
1379static void switched_to_fair(struct rq *rq, struct task_struct *p,
1380 int running)
1381{
1382 /*
1383 * We were most likely switched from sched_rt, so
1384 * kick off the schedule if running, otherwise just see
1385 * if we can still preempt the current task.
1386 */
1387 if (running)
1388 resched_task(rq->curr);
1389 else
1390 check_preempt_curr(rq, p);
1391}
1392
1091/* Account for a task changing its policy or group. 1393/* Account for a task changing its policy or group.
1092 * 1394 *
1093 * This routine is mostly called to set cfs_rq->curr field when a task 1395 * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1109,6 +1411,9 @@ static const struct sched_class fair_sched_class = {
1109 .enqueue_task = enqueue_task_fair, 1411 .enqueue_task = enqueue_task_fair,
1110 .dequeue_task = dequeue_task_fair, 1412 .dequeue_task = dequeue_task_fair,
1111 .yield_task = yield_task_fair, 1413 .yield_task = yield_task_fair,
1414#ifdef CONFIG_SMP
1415 .select_task_rq = select_task_rq_fair,
1416#endif /* CONFIG_SMP */
1112 1417
1113 .check_preempt_curr = check_preempt_wakeup, 1418 .check_preempt_curr = check_preempt_wakeup,
1114 1419
@@ -1123,6 +1428,9 @@ static const struct sched_class fair_sched_class = {
1123 .set_curr_task = set_curr_task_fair, 1428 .set_curr_task = set_curr_task_fair,
1124 .task_tick = task_tick_fair, 1429 .task_tick = task_tick_fair,
1125 .task_new = task_new_fair, 1430 .task_new = task_new_fair,
1431
1432 .prio_changed = prio_changed_fair,
1433 .switched_to = switched_to_fair,
1126}; 1434};
1127 1435
1128#ifdef CONFIG_SCHED_DEBUG 1436#ifdef CONFIG_SCHED_DEBUG
@@ -1133,7 +1441,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1133#ifdef CONFIG_FAIR_GROUP_SCHED 1441#ifdef CONFIG_FAIR_GROUP_SCHED
1134 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); 1442 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1135#endif 1443#endif
1444 rcu_read_lock();
1136 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1445 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1137 print_cfs_rq(m, cpu, cfs_rq); 1446 print_cfs_rq(m, cpu, cfs_rq);
1447 rcu_read_unlock();
1138} 1448}
1139#endif 1449#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bf9c25c15b8b..2bcafa375633 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -5,6 +5,12 @@
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync)
10{
11 return task_cpu(p); /* IDLE tasks are never migrated */
12}
13#endif /* CONFIG_SMP */
8/* 14/*
9 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
10 */ 16 */
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
55} 61}
56#endif 62#endif
57 63
58static void task_tick_idle(struct rq *rq, struct task_struct *curr) 64static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
59{ 65{
60} 66}
61 67
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq)
63{ 69{
64} 70}
65 71
72static void switched_to_idle(struct rq *rq, struct task_struct *p,
73 int running)
74{
75 /* Can this actually happen?? */
76 if (running)
77 resched_task(rq->curr);
78 else
79 check_preempt_curr(rq, p);
80}
81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
83 int oldprio, int running)
84{
85 /* This can happen for hot-plugged CPUs */
86
87 /*
88 * Reschedule if we are currently running on this runqueue and
89 * our priority decreased, or if we are not currently running on
90 * this runqueue and our priority is higher than the current's
91 */
92 if (running) {
93 if (p->prio > oldprio)
94 resched_task(rq->curr);
95 } else
96 check_preempt_curr(rq, p);
97}
98
66/* 99/*
67 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
68 */ 101 */
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = {
72 105
73 /* dequeue is not valid, we print a debug message there: */ 106 /* dequeue is not valid, we print a debug message there: */
74 .dequeue_task = dequeue_task_idle, 107 .dequeue_task = dequeue_task_idle,
108#ifdef CONFIG_SMP
109 .select_task_rq = select_task_rq_idle,
110#endif /* CONFIG_SMP */
75 111
76 .check_preempt_curr = check_preempt_curr_idle, 112 .check_preempt_curr = check_preempt_curr_idle,
77 113
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = {
85 121
86 .set_curr_task = set_curr_task_idle, 122 .set_curr_task = set_curr_task_idle,
87 .task_tick = task_tick_idle, 123 .task_tick = task_tick_idle,
124
125 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle,
127
88 /* no .task_new for idle tasks */ 128 /* no .task_new for idle tasks */
89}; 129};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ee9c8b6529e9..274b40d7bef2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,217 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_SMP
7
8static inline int rt_overloaded(struct rq *rq)
9{
10 return atomic_read(&rq->rd->rto_count);
11}
12
13static inline void rt_set_overload(struct rq *rq)
14{
15 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /*
17 * Make sure the mask is visible before we set
18 * the overload count. That is checked to determine
19 * if we should look at the mask. It would be a shame
20 * if we looked at the mask, but the mask was not
21 * updated yet.
22 */
23 wmb();
24 atomic_inc(&rq->rd->rto_count);
25}
26
27static inline void rt_clear_overload(struct rq *rq)
28{
29 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask);
32}
33
34static void update_rt_migration(struct rq *rq)
35{
36 if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
37 if (!rq->rt.overloaded) {
38 rt_set_overload(rq);
39 rq->rt.overloaded = 1;
40 }
41 } else if (rq->rt.overloaded) {
42 rt_clear_overload(rq);
43 rq->rt.overloaded = 0;
44 }
45}
46#endif /* CONFIG_SMP */
47
48static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
49{
50 return container_of(rt_se, struct task_struct, rt);
51}
52
53static inline int on_rt_rq(struct sched_rt_entity *rt_se)
54{
55 return !list_empty(&rt_se->run_list);
56}
57
58#ifdef CONFIG_FAIR_GROUP_SCHED
59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
61{
62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC;
64
65 return rt_rq->tg->rt_ratio;
66}
67
68#define for_each_leaf_rt_rq(rt_rq, rq) \
69 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
70
71static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
72{
73 return rt_rq->rq;
74}
75
76static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
77{
78 return rt_se->rt_rq;
79}
80
81#define for_each_sched_rt_entity(rt_se) \
82 for (; rt_se; rt_se = rt_se->parent)
83
84static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
85{
86 return rt_se->my_q;
87}
88
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95
96 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
97 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
98
99 enqueue_rt_entity(rt_se);
100 if (rt_rq->highest_prio < curr->prio)
101 resched_task(curr);
102 }
103}
104
105static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
106{
107 struct sched_rt_entity *rt_se = rt_rq->rt_se;
108
109 if (rt_se && on_rt_rq(rt_se))
110 dequeue_rt_entity(rt_se);
111}
112
113#else
114
115static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
116{
117 return sysctl_sched_rt_ratio;
118}
119
120#define for_each_leaf_rt_rq(rt_rq, rq) \
121 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
122
123static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
124{
125 return container_of(rt_rq, struct rq, rt);
126}
127
128static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
129{
130 struct task_struct *p = rt_task_of(rt_se);
131 struct rq *rq = task_rq(p);
132
133 return &rq->rt;
134}
135
136#define for_each_sched_rt_entity(rt_se) \
137 for (; rt_se; rt_se = NULL)
138
139static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
140{
141 return NULL;
142}
143
144static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
145{
146}
147
148static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
149{
150}
151
152#endif
153
154static inline int rt_se_prio(struct sched_rt_entity *rt_se)
155{
156#ifdef CONFIG_FAIR_GROUP_SCHED
157 struct rt_rq *rt_rq = group_rt_rq(rt_se);
158
159 if (rt_rq)
160 return rt_rq->highest_prio;
161#endif
162
163 return rt_task_of(rt_se)->prio;
164}
165
166static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
167{
168 unsigned int rt_ratio = sched_rt_ratio(rt_rq);
169 u64 period, ratio;
170
171 if (rt_ratio == SCHED_RT_FRAC)
172 return 0;
173
174 if (rt_rq->rt_throttled)
175 return 1;
176
177 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
178 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
179
180 if (rt_rq->rt_time > ratio) {
181 struct rq *rq = rq_of_rt_rq(rt_rq);
182
183 rq->rt_throttled = 1;
184 rt_rq->rt_throttled = 1;
185
186 sched_rt_ratio_dequeue(rt_rq);
187 return 1;
188 }
189
190 return 0;
191}
192
193static void update_sched_rt_period(struct rq *rq)
194{
195 struct rt_rq *rt_rq;
196 u64 period;
197
198 while (rq->clock > rq->rt_period_expire) {
199 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
200 rq->rt_period_expire += period;
201
202 for_each_leaf_rt_rq(rt_rq, rq) {
203 unsigned long rt_ratio = sched_rt_ratio(rt_rq);
204 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
205
206 rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
207 if (rt_rq->rt_throttled) {
208 rt_rq->rt_throttled = 0;
209 sched_rt_ratio_enqueue(rt_rq);
210 }
211 }
212
213 rq->rt_throttled = 0;
214 }
215}
216
6/* 217/*
7 * Update the current task's runtime statistics. Skip current tasks that 218 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 219 * are not in our scheduling class.
@@ -10,6 +221,8 @@
10static void update_curr_rt(struct rq *rq) 221static void update_curr_rt(struct rq *rq)
11{ 222{
12 struct task_struct *curr = rq->curr; 223 struct task_struct *curr = rq->curr;
224 struct sched_rt_entity *rt_se = &curr->rt;
225 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
13 u64 delta_exec; 226 u64 delta_exec;
14 227
15 if (!task_has_rt_policy(curr)) 228 if (!task_has_rt_policy(curr))
@@ -24,47 +237,228 @@ static void update_curr_rt(struct rq *rq)
24 curr->se.sum_exec_runtime += delta_exec; 237 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = rq->clock; 238 curr->se.exec_start = rq->clock;
26 cpuacct_charge(curr, delta_exec); 239 cpuacct_charge(curr, delta_exec);
240
241 rt_rq->rt_time += delta_exec;
242 /*
243 * might make it a tad more accurate:
244 *
245 * update_sched_rt_period(rq);
246 */
247 if (sched_rt_ratio_exceeded(rt_rq))
248 resched_task(curr);
27} 249}
28 250
29static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 251static inline
252void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
253{
254 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
255 rt_rq->rt_nr_running++;
256#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
257 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
258 rt_rq->highest_prio = rt_se_prio(rt_se);
259#endif
260#ifdef CONFIG_SMP
261 if (rt_se->nr_cpus_allowed > 1) {
262 struct rq *rq = rq_of_rt_rq(rt_rq);
263 rq->rt.rt_nr_migratory++;
264 }
265
266 update_rt_migration(rq_of_rt_rq(rt_rq));
267#endif
268}
269
270static inline
271void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
272{
273 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
274 WARN_ON(!rt_rq->rt_nr_running);
275 rt_rq->rt_nr_running--;
276#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
277 if (rt_rq->rt_nr_running) {
278 struct rt_prio_array *array;
279
280 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
281 if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
282 /* recalculate */
283 array = &rt_rq->active;
284 rt_rq->highest_prio =
285 sched_find_first_bit(array->bitmap);
286 } /* otherwise leave rt_rq->highest_prio alone */
287 } else
288 rt_rq->highest_prio = MAX_RT_PRIO;
289#endif
290#ifdef CONFIG_SMP
291 if (rt_se->nr_cpus_allowed > 1) {
292 struct rq *rq = rq_of_rt_rq(rt_rq);
293 rq->rt.rt_nr_migratory--;
294 }
295
296 update_rt_migration(rq_of_rt_rq(rt_rq));
297#endif /* CONFIG_SMP */
298}
299
300static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
301{
302 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
303 struct rt_prio_array *array = &rt_rq->active;
304 struct rt_rq *group_rq = group_rt_rq(rt_se);
305
306 if (group_rq && group_rq->rt_throttled)
307 return;
308
309 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
310 __set_bit(rt_se_prio(rt_se), array->bitmap);
311
312 inc_rt_tasks(rt_se, rt_rq);
313}
314
315static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
316{
317 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
318 struct rt_prio_array *array = &rt_rq->active;
319
320 list_del_init(&rt_se->run_list);
321 if (list_empty(array->queue + rt_se_prio(rt_se)))
322 __clear_bit(rt_se_prio(rt_se), array->bitmap);
323
324 dec_rt_tasks(rt_se, rt_rq);
325}
326
327/*
328 * Because the prio of an upper entry depends on the lower
329 * entries, we must remove entries top - down.
330 *
331 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
332 * doesn't matter much for now, as h=2 for GROUP_SCHED.
333 */
334static void dequeue_rt_stack(struct task_struct *p)
30{ 335{
31 struct rt_prio_array *array = &rq->rt.active; 336 struct sched_rt_entity *rt_se, *top_se;
32 337
33 list_add_tail(&p->run_list, array->queue + p->prio); 338 /*
34 __set_bit(p->prio, array->bitmap); 339 * dequeue all, top - down.
340 */
341 do {
342 rt_se = &p->rt;
343 top_se = NULL;
344 for_each_sched_rt_entity(rt_se) {
345 if (on_rt_rq(rt_se))
346 top_se = rt_se;
347 }
348 if (top_se)
349 dequeue_rt_entity(top_se);
350 } while (top_se);
35} 351}
36 352
37/* 353/*
38 * Adding/removing a task to/from a priority array: 354 * Adding/removing a task to/from a priority array:
39 */ 355 */
356static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
357{
358 struct sched_rt_entity *rt_se = &p->rt;
359
360 if (wakeup)
361 rt_se->timeout = 0;
362
363 dequeue_rt_stack(p);
364
365 /*
366 * enqueue everybody, bottom - up.
367 */
368 for_each_sched_rt_entity(rt_se)
369 enqueue_rt_entity(rt_se);
370
371 inc_cpu_load(rq, p->se.load.weight);
372}
373
40static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 374static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
41{ 375{
42 struct rt_prio_array *array = &rq->rt.active; 376 struct sched_rt_entity *rt_se = &p->rt;
377 struct rt_rq *rt_rq;
43 378
44 update_curr_rt(rq); 379 update_curr_rt(rq);
45 380
46 list_del(&p->run_list); 381 dequeue_rt_stack(p);
47 if (list_empty(array->queue + p->prio)) 382
48 __clear_bit(p->prio, array->bitmap); 383 /*
384 * re-enqueue all non-empty rt_rq entities.
385 */
386 for_each_sched_rt_entity(rt_se) {
387 rt_rq = group_rt_rq(rt_se);
388 if (rt_rq && rt_rq->rt_nr_running)
389 enqueue_rt_entity(rt_se);
390 }
391
392 dec_cpu_load(rq, p->se.load.weight);
49} 393}
50 394
51/* 395/*
52 * Put task to the end of the run list without the overhead of dequeue 396 * Put task to the end of the run list without the overhead of dequeue
53 * followed by enqueue. 397 * followed by enqueue.
54 */ 398 */
399static
400void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
401{
402 struct rt_prio_array *array = &rt_rq->active;
403
404 list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
405}
406
55static void requeue_task_rt(struct rq *rq, struct task_struct *p) 407static void requeue_task_rt(struct rq *rq, struct task_struct *p)
56{ 408{
57 struct rt_prio_array *array = &rq->rt.active; 409 struct sched_rt_entity *rt_se = &p->rt;
410 struct rt_rq *rt_rq;
58 411
59 list_move_tail(&p->run_list, array->queue + p->prio); 412 for_each_sched_rt_entity(rt_se) {
413 rt_rq = rt_rq_of_se(rt_se);
414 requeue_rt_entity(rt_rq, rt_se);
415 }
60} 416}
61 417
62static void 418static void yield_task_rt(struct rq *rq)
63yield_task_rt(struct rq *rq)
64{ 419{
65 requeue_task_rt(rq, rq->curr); 420 requeue_task_rt(rq, rq->curr);
66} 421}
67 422
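
For readers following these hunks: every helper above manipulates the same O(1) structure, an rt_prio_array with one bit per RT priority and one FIFO list per priority, where a set bit simply means that priority's list is non-empty and sched_find_first_bit() returns the best occupied slot. The standalone userspace sketch below models that bookkeeping with plain arrays; the names are invented for illustration and this is not kernel code.

/* Standalone model of the rt_prio_array bookkeeping above (illustrative only). */
#include <stdio.h>
#include <string.h>

#define MODEL_MAX_RT_PRIO 100
#define MODEL_QUEUE_DEPTH 8

struct model_prio_array {
        unsigned char bitmap[MODEL_MAX_RT_PRIO];           /* 1 = list non-empty */
        int queue[MODEL_MAX_RT_PRIO][MODEL_QUEUE_DEPTH];   /* FIFO of task ids   */
        int count[MODEL_MAX_RT_PRIO];
};

static void model_enqueue(struct model_prio_array *a, int prio, int task)
{
        /* list_add_tail() + __set_bit(); this toy has no overflow check */
        a->queue[prio][a->count[prio]++] = task;
        a->bitmap[prio] = 1;
}

static int model_pick_next(const struct model_prio_array *a)
{
        int prio;

        /* sched_find_first_bit(): lowest index == highest RT priority */
        for (prio = 0; prio < MODEL_MAX_RT_PRIO; prio++)
                if (a->bitmap[prio])
                        return a->queue[prio][0];          /* head of that FIFO  */
        return -1;                                         /* nothing queued     */
}

int main(void)
{
        struct model_prio_array a;

        memset(&a, 0, sizeof(a));
        model_enqueue(&a, 50, 1);
        model_enqueue(&a, 10, 2);                  /* numerically lower = higher priority */
        printf("next task id: %d\n", model_pick_next(&a)); /* prints 2 */
        return 0;
}
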
423#ifdef CONFIG_SMP
424static int find_lowest_rq(struct task_struct *task);
425
426static int select_task_rq_rt(struct task_struct *p, int sync)
427{
428 struct rq *rq = task_rq(p);
429
430 /*
431 * If the current task is an RT task, then
432 * try to see if we can wake this RT task up on another
433 * runqueue. Otherwise simply start this RT task
434 * on its current runqueue.
435 *
436 * We want to avoid overloading runqueues, even if
437 * the RT task is of higher priority than the current RT task.
438 * RT tasks behave differently than other tasks. If
439 * one gets preempted, we try to push it off to another queue.
440 * So trying to keep a preempting RT task on the same
441 * cache hot CPU will force the running RT task to
442 * a cold CPU. So we waste all the cache for the lower
443 * RT task in hopes of saving some of a RT task
444 * that is just being woken and probably will have
445 * cold cache anyway.
446 */
447 if (unlikely(rt_task(rq->curr)) &&
448 (p->rt.nr_cpus_allowed > 1)) {
449 int cpu = find_lowest_rq(p);
450
451 return (cpu == -1) ? task_cpu(p) : cpu;
452 }
453
454 /*
455 * Otherwise, just let it ride on the affined RQ and the
456 * post-schedule router will push the preempted task away
457 */
458 return task_cpu(p);
459}
460#endif /* CONFIG_SMP */
461
68/* 462/*
69 * Preempt the current task with a newly woken task if needed: 463 * Preempt the current task with a newly woken task if needed:
70 */ 464 */
@@ -74,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
74 resched_task(rq->curr); 468 resched_task(rq->curr);
75} 469}
76 470
77static struct task_struct *pick_next_task_rt(struct rq *rq) 471static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
472 struct rt_rq *rt_rq)
78{ 473{
79 struct rt_prio_array *array = &rq->rt.active; 474 struct rt_prio_array *array = &rt_rq->active;
80 struct task_struct *next; 475 struct sched_rt_entity *next = NULL;
81 struct list_head *queue; 476 struct list_head *queue;
82 int idx; 477 int idx;
83 478
84 idx = sched_find_first_bit(array->bitmap); 479 idx = sched_find_first_bit(array->bitmap);
85 if (idx >= MAX_RT_PRIO) 480 BUG_ON(idx >= MAX_RT_PRIO);
86 return NULL;
87 481
88 queue = array->queue + idx; 482 queue = array->queue + idx;
89 next = list_entry(queue->next, struct task_struct, run_list); 483 next = list_entry(queue->next, struct sched_rt_entity, run_list);
90
91 next->se.exec_start = rq->clock;
92 484
93 return next; 485 return next;
94} 486}
95 487
488static struct task_struct *pick_next_task_rt(struct rq *rq)
489{
490 struct sched_rt_entity *rt_se;
491 struct task_struct *p;
492 struct rt_rq *rt_rq;
493
494 rt_rq = &rq->rt;
495
496 if (unlikely(!rt_rq->rt_nr_running))
497 return NULL;
498
499 if (sched_rt_ratio_exceeded(rt_rq))
500 return NULL;
501
502 do {
503 rt_se = pick_next_rt_entity(rq, rt_rq);
504 BUG_ON(!rt_se);
505 rt_rq = group_rt_rq(rt_se);
506 } while (rt_rq);
507
508 p = rt_task_of(rt_se);
509 p->se.exec_start = rq->clock;
510 return p;
511}
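
pick_next_task_rt() above descends the group hierarchy: pick the highest-priority entity at the current level, and if that entity is itself a group, repeat the pick inside its rt_rq until a plain task is reached. Below is a minimal standalone model of that do/while descent, assuming a toy tree where each entity either carries a task id or points at a child run queue (invented names, not kernel code).

/* Standalone model of the group-hierarchy descent (illustrative only). */
#include <stdio.h>

struct model_rt_rq;

struct model_rt_entity {
        int task_id;                     /* valid when child == NULL          */
        struct model_rt_rq *child;       /* non-NULL for a group entity       */
};

struct model_rt_rq {
        struct model_rt_entity *highest; /* stand-in for pick_next_rt_entity() */
};

static int model_pick_task(struct model_rt_rq *rt_rq)
{
        struct model_rt_entity *se;

        do {
                se = rt_rq->highest;     /* pick best entity at this level    */
                rt_rq = se->child;       /* group_rt_rq(): NULL for a task    */
        } while (rt_rq);

        return se->task_id;              /* rt_task_of()                      */
}

int main(void)
{
        struct model_rt_entity leaf = { .task_id = 42, .child = NULL };
        struct model_rt_rq group_rq = { .highest = &leaf };
        struct model_rt_entity group = { .task_id = -1, .child = &group_rq };
        struct model_rt_rq root = { .highest = &group };

        printf("picked task %d\n", model_pick_task(&root)); /* prints 42 */
        return 0;
}
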
512
96static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 513static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
97{ 514{
98 update_curr_rt(rq); 515 update_curr_rt(rq);
@@ -100,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
100} 517}
101 518
102#ifdef CONFIG_SMP 519#ifdef CONFIG_SMP
103/* 520
104 * Load-balancing iterator. Note: while the runqueue stays locked 521/* Only try algorithms three times */
105 * during the whole iteration, the current task might be 522#define RT_MAX_TRIES 3
106 * dequeued so the iterator has to be dequeue-safe. Here we 523
107 * achieve that by always pre-iterating before returning 524static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
108 * the current task: 525static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
109 */ 526
110static struct task_struct *load_balance_start_rt(void *arg) 527static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
111{ 528{
112 struct rq *rq = arg; 529 if (!task_running(rq, p) &&
113 struct rt_prio_array *array = &rq->rt.active; 530 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
114 struct list_head *head, *curr; 531 (p->rt.nr_cpus_allowed > 1))
115 struct task_struct *p; 532 return 1;
533 return 0;
534}
535
536/* Return the second highest RT task, NULL otherwise */
537static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
538{
539 struct task_struct *next = NULL;
540 struct sched_rt_entity *rt_se;
541 struct rt_prio_array *array;
542 struct rt_rq *rt_rq;
116 int idx; 543 int idx;
117 544
118 idx = sched_find_first_bit(array->bitmap); 545 for_each_leaf_rt_rq(rt_rq, rq) {
119 if (idx >= MAX_RT_PRIO) 546 array = &rt_rq->active;
120 return NULL; 547 idx = sched_find_first_bit(array->bitmap);
548 next_idx:
549 if (idx >= MAX_RT_PRIO)
550 continue;
551 if (next && next->prio < idx)
552 continue;
553 list_for_each_entry(rt_se, array->queue + idx, run_list) {
554 struct task_struct *p = rt_task_of(rt_se);
555 if (pick_rt_task(rq, p, cpu)) {
556 next = p;
557 break;
558 }
559 }
560 if (!next) {
561 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
562 goto next_idx;
563 }
564 }
565
566 return next;
567}
121 568
122 head = array->queue + idx; 569static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
123 curr = head->prev;
124 570
125 p = list_entry(curr, struct task_struct, run_list); 571static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
572{
573 int lowest_prio = -1;
574 int lowest_cpu = -1;
575 int count = 0;
576 int cpu;
126 577
127 curr = curr->prev; 578 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
128 579
129 rq->rt.rt_load_balance_idx = idx; 580 /*
130 rq->rt.rt_load_balance_head = head; 581 * Scan each rq for the lowest prio.
131 rq->rt.rt_load_balance_curr = curr; 582 */
583 for_each_cpu_mask(cpu, *lowest_mask) {
584 struct rq *rq = cpu_rq(cpu);
132 585
133 return p; 586 /* We look for lowest RT prio or non-rt CPU */
587 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
588 /*
589 * if we already found a low RT queue
590 * and now we found this non-rt queue
591 * clear the mask and set our bit.
592 * Otherwise just return the queue as is
593 * and the count==1 will cause the algorithm
594 * to use the first bit found.
595 */
596 if (lowest_cpu != -1) {
597 cpus_clear(*lowest_mask);
598 cpu_set(rq->cpu, *lowest_mask);
599 }
600 return 1;
601 }
602
603 /* no locking for now */
604 if ((rq->rt.highest_prio > task->prio)
605 && (rq->rt.highest_prio >= lowest_prio)) {
606 if (rq->rt.highest_prio > lowest_prio) {
607 /* new low - clear old data */
608 lowest_prio = rq->rt.highest_prio;
609 lowest_cpu = cpu;
610 count = 0;
611 }
612 count++;
613 } else
614 cpu_clear(cpu, *lowest_mask);
615 }
616
617 /*
618 * Clear out all the set bits that represent
619 * runqueues that were of higher prio than
620 * the lowest_prio.
621 */
622 if (lowest_cpu > 0) {
623 /*
624 * Perhaps we could add another cpumask op to
625 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
626 * Then that could be optimized to use memset and such.
627 */
628 for_each_cpu_mask(cpu, *lowest_mask) {
629 if (cpu >= lowest_cpu)
630 break;
631 cpu_clear(cpu, *lowest_mask);
632 }
633 }
634
635 return count;
134} 636}
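
find_lowest_cpus() above boils down to: among CPUs whose highest queued RT priority is numerically greater than (i.e. weaker than) the waking task's, keep the subset with the weakest value, and treat a CPU with no RT work at all as an immediate winner. A standalone userspace model of that selection follows (illustrative only; the real code additionally intersects with task->cpus_allowed and the root domain's online mask).

/* Standalone model of picking the lowest-priority CPUs (illustrative only). */
#include <stdio.h>

#define NCPUS       4
#define MAX_RT_PRIO 100   /* >= MAX_RT_PRIO means "no RT task queued" */

static int lowest_prio_cpus(const int highest_prio[NCPUS], int task_prio,
                            unsigned int *mask)
{
        int lowest = -1, count = 0, cpu;

        *mask = 0;
        for (cpu = 0; cpu < NCPUS; cpu++) {
                int prio = highest_prio[cpu];

                if (prio >= MAX_RT_PRIO) {        /* no RT work at all: best case */
                        *mask = 1u << cpu;
                        return 1;
                }
                if (prio <= task_prio)            /* our task could not preempt   */
                        continue;
                if (prio > lowest) {              /* strictly weaker RT load found */
                        lowest = prio;
                        *mask = 0;
                        count = 0;
                }
                if (prio == lowest) {
                        *mask |= 1u << cpu;
                        count++;
                }
        }
        return count;
}

int main(void)
{
        int highest[NCPUS] = { 10, 60, 60, 30 };  /* lower number = higher prio */
        unsigned int mask;
        int n = lowest_prio_cpus(highest, 40, &mask);

        printf("%d candidate(s), mask 0x%x\n", n, mask); /* CPUs 1 and 2 */
        return 0;
}
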
135 637
136static struct task_struct *load_balance_next_rt(void *arg) 638static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
137{ 639{
138 struct rq *rq = arg; 640 int first;
139 struct rt_prio_array *array = &rq->rt.active; 641
140 struct list_head *head, *curr; 642 /* "this_cpu" is cheaper to preempt than a remote processor */
141 struct task_struct *p; 643 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
142 int idx; 644 return this_cpu;
645
646 first = first_cpu(*mask);
647 if (first != NR_CPUS)
648 return first;
649
650 return -1;
651}
652
653static int find_lowest_rq(struct task_struct *task)
654{
655 struct sched_domain *sd;
656 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
657 int this_cpu = smp_processor_id();
658 int cpu = task_cpu(task);
659 int count = find_lowest_cpus(task, lowest_mask);
660
661 if (!count)
662 return -1; /* No targets found */
143 663
144 idx = rq->rt.rt_load_balance_idx; 664 /*
145 head = rq->rt.rt_load_balance_head; 665 * There is no sense in performing an optimal search if only one
146 curr = rq->rt.rt_load_balance_curr; 666 * target is found.
667 */
668 if (count == 1)
669 return first_cpu(*lowest_mask);
670
671 /*
672 * At this point we have built a mask of cpus representing the
673 * lowest priority tasks in the system. Now we want to elect
674 * the best one based on our affinity and topology.
675 *
676 * We prioritize the last cpu that the task executed on since
677 * it is most likely cache-hot in that location.
678 */
679 if (cpu_isset(cpu, *lowest_mask))
680 return cpu;
147 681
148 /* 682 /*
149 * If we arrived back to the head again then 683 * Otherwise, we consult the sched_domains span maps to figure
150 * iterate to the next queue (if any): 684 * out which cpu is logically closest to our hot cache data.
151 */ 685 */
152 if (unlikely(head == curr)) { 686 if (this_cpu == cpu)
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); 687 this_cpu = -1; /* Skip this_cpu opt if the same */
154 688
155 if (next_idx >= MAX_RT_PRIO) 689 for_each_domain(cpu, sd) {
156 return NULL; 690 if (sd->flags & SD_WAKE_AFFINE) {
691 cpumask_t domain_mask;
692 int best_cpu;
157 693
158 idx = next_idx; 694 cpus_and(domain_mask, sd->span, *lowest_mask);
159 head = array->queue + idx;
160 curr = head->prev;
161 695
162 rq->rt.rt_load_balance_idx = idx; 696 best_cpu = pick_optimal_cpu(this_cpu,
163 rq->rt.rt_load_balance_head = head; 697 &domain_mask);
698 if (best_cpu != -1)
699 return best_cpu;
700 }
164 } 701 }
165 702
166 p = list_entry(curr, struct task_struct, run_list); 703 /*
704 * And finally, if there were no matches within the domains
705 * just give the caller *something* to work with from the compatible
706 * locations.
707 */
708 return pick_optimal_cpu(this_cpu, lowest_mask);
709}
167 710
168 curr = curr->prev; 711/* Will lock the rq it finds */
712static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
713{
714 struct rq *lowest_rq = NULL;
715 int tries;
716 int cpu;
169 717
170 rq->rt.rt_load_balance_curr = curr; 718 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
719 cpu = find_lowest_rq(task);
171 720
172 return p; 721 if ((cpu == -1) || (cpu == rq->cpu))
722 break;
723
724 lowest_rq = cpu_rq(cpu);
725
726 /* if the prio of this runqueue changed, try again */
727 if (double_lock_balance(rq, lowest_rq)) {
728 /*
729 * We had to unlock the run queue. In
730 * the meantime, the task could have
731 * migrated already or had its affinity changed.
732 * Also make sure that it wasn't scheduled on its rq.
733 */
734 if (unlikely(task_rq(task) != rq ||
735 !cpu_isset(lowest_rq->cpu,
736 task->cpus_allowed) ||
737 task_running(rq, task) ||
738 !task->se.on_rq)) {
739
740 spin_unlock(&lowest_rq->lock);
741 lowest_rq = NULL;
742 break;
743 }
744 }
745
746 /* If this rq is still suitable use it. */
747 if (lowest_rq->rt.highest_prio > task->prio)
748 break;
749
750 /* try again */
751 spin_unlock(&lowest_rq->lock);
752 lowest_rq = NULL;
753 }
754
755 return lowest_rq;
756}
757
758/*
759 * If the current CPU has more than one RT task, see if the non
760 * running task can migrate over to a CPU that is running a task
761 * of lesser priority.
762 */
763static int push_rt_task(struct rq *rq)
764{
765 struct task_struct *next_task;
766 struct rq *lowest_rq;
767 int ret = 0;
768 int paranoid = RT_MAX_TRIES;
769
770 if (!rq->rt.overloaded)
771 return 0;
772
773 next_task = pick_next_highest_task_rt(rq, -1);
774 if (!next_task)
775 return 0;
776
777 retry:
778 if (unlikely(next_task == rq->curr)) {
779 WARN_ON(1);
780 return 0;
781 }
782
783 /*
784 * It's possible that the next_task slipped in with a
785 * higher priority than current. If that's the case
786 * just reschedule current.
787 */
788 if (unlikely(next_task->prio < rq->curr->prio)) {
789 resched_task(rq->curr);
790 return 0;
791 }
792
793 /* We might release rq lock */
794 get_task_struct(next_task);
795
796 /* find_lock_lowest_rq locks the rq if found */
797 lowest_rq = find_lock_lowest_rq(next_task, rq);
798 if (!lowest_rq) {
799 struct task_struct *task;
800 /*
801 * find_lock_lowest_rq releases rq->lock
802 * so it is possible that next_task has changed.
803 * If it has, then try again.
804 */
805 task = pick_next_highest_task_rt(rq, -1);
806 if (unlikely(task != next_task) && task && paranoid--) {
807 put_task_struct(next_task);
808 next_task = task;
809 goto retry;
810 }
811 goto out;
812 }
813
814 deactivate_task(rq, next_task, 0);
815 set_task_cpu(next_task, lowest_rq->cpu);
816 activate_task(lowest_rq, next_task, 0);
817
818 resched_task(lowest_rq->curr);
819
820 spin_unlock(&lowest_rq->lock);
821
822 ret = 1;
823out:
824 put_task_struct(next_task);
825
826 return ret;
827}
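
push_rt_task() above has to tolerate the fact that find_lock_lowest_rq() may drop and retake rq->lock: during that window the previously chosen task may no longer be the right one, so the code re-picks and retries, bounded by RT_MAX_TRIES. Below is a standalone sketch of that bounded re-check-and-retry shape, with the locking reduced to a fake "world changed" flag (illustrative only, not kernel code).

/* Standalone model of a bounded re-check-and-retry loop (illustrative only). */
#include <stdio.h>

#define RT_MAX_TRIES 3

static int current_best = 1;   /* stands in for pick_next_highest_task_rt() */
static int world_moved;        /* set once to simulate a concurrent change  */

static int pick_best(void)
{
        return current_best;
}

static int lock_and_verify(int candidate)
{
        if (!world_moved) {            /* while the "rq lock" was dropped ...   */
                world_moved = 1;
                current_best = 2;      /* ... someone else changed the best task */
        }
        return candidate == current_best;
}

int main(void)
{
        int tries, candidate = pick_best();

        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
                if (lock_and_verify(candidate)) {
                        printf("pushed candidate %d (attempt %d)\n",
                               candidate, tries + 1);
                        return 0;
                }
                candidate = pick_best();   /* re-pick and retry, bounded     */
        }
        printf("gave up after %d attempts\n", RT_MAX_TRIES);
        return 0;
}
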
828
829/*
830 * TODO: Currently we just use the second highest prio task on
831 * the queue, and stop when it can't migrate (or there's
832 * no more RT tasks). There may be a case where a lower
833 * priority RT task has a different affinity than the
834 * higher RT task. In this case the lower RT task could
835 * possibly be able to migrate whereas the higher priority
836 * RT task could not. We currently ignore this issue.
837 * Enhancements are welcome!
838 */
839static void push_rt_tasks(struct rq *rq)
840{
841 /* push_rt_task will return true if it moved an RT */
842 while (push_rt_task(rq))
843 ;
844}
845
846static int pull_rt_task(struct rq *this_rq)
847{
848 int this_cpu = this_rq->cpu, ret = 0, cpu;
849 struct task_struct *p, *next;
850 struct rq *src_rq;
851
852 if (likely(!rt_overloaded(this_rq)))
853 return 0;
854
855 next = pick_next_task_rt(this_rq);
856
857 for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
858 if (this_cpu == cpu)
859 continue;
860
861 src_rq = cpu_rq(cpu);
862 /*
863 * We can potentially drop this_rq's lock in
864 * double_lock_balance, and another CPU could
865 * steal our next task - hence we must cause
866 * the caller to recalculate the next task
867 * in that case:
868 */
869 if (double_lock_balance(this_rq, src_rq)) {
870 struct task_struct *old_next = next;
871
872 next = pick_next_task_rt(this_rq);
873 if (next != old_next)
874 ret = 1;
875 }
876
877 /*
878 * Are there still pullable RT tasks?
879 */
880 if (src_rq->rt.rt_nr_running <= 1)
881 goto skip;
882
883 p = pick_next_highest_task_rt(src_rq, this_cpu);
884
885 /*
886 * Do we have an RT task that preempts
887 * the to-be-scheduled task?
888 */
889 if (p && (!next || (p->prio < next->prio))) {
890 WARN_ON(p == src_rq->curr);
891 WARN_ON(!p->se.on_rq);
892
893 /*
894 * There's a chance that p is higher in priority
895 * than what's currently running on its cpu.
896 * This is just that p is waking up and hasn't
897 * had a chance to schedule. We only pull
898 * p if it is lower in priority than the
899 * current task on the run queue or
900 * this_rq's next task is lower in prio than
901 * the current task on that rq.
902 */
903 if (p->prio < src_rq->curr->prio ||
904 (next && next->prio < src_rq->curr->prio))
905 goto skip;
906
907 ret = 1;
908
909 deactivate_task(src_rq, p, 0);
910 set_task_cpu(p, this_cpu);
911 activate_task(this_rq, p, 0);
912 /*
913 * We continue with the search, just in
914 * case there's an even higher prio task
915 * in another runqueue. (low likelihood
916 * but possible)
917 *
918 * Update next so that we won't pick a task
919 * on another cpu with a priority lower (or equal)
920 * than the one we just picked.
921 */
922 next = p;
923
924 }
925 skip:
926 spin_unlock(&src_rq->lock);
927 }
928
929 return ret;
930}
931
932static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
933{
934 /* Try to pull RT tasks here if we lower this rq's prio */
935 if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
936 pull_rt_task(rq);
937}
938
939static void post_schedule_rt(struct rq *rq)
940{
941 /*
942 * If we have more than one rt_task queued, then
943 * see if we can push the other rt_tasks off to other CPUS.
944 * Note we may release the rq lock, and since
945 * the lock was owned by prev, we need to release it
946 * first via finish_lock_switch and then reacquire it here.
947 */
948 if (unlikely(rq->rt.overloaded)) {
949 spin_lock_irq(&rq->lock);
950 push_rt_tasks(rq);
951 spin_unlock_irq(&rq->lock);
952 }
953}
954
955
956static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
957{
958 if (!task_running(rq, p) &&
959 (p->prio >= rq->rt.highest_prio) &&
960 rq->rt.overloaded)
961 push_rt_tasks(rq);
173} 962}
174 963
175static unsigned long 964static unsigned long
@@ -178,36 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
178 struct sched_domain *sd, enum cpu_idle_type idle, 967 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, int *this_best_prio) 968 int *all_pinned, int *this_best_prio)
180{ 969{
181 struct rq_iterator rt_rq_iterator; 970 /* don't touch RT tasks */
182 971 return 0;
183 rt_rq_iterator.start = load_balance_start_rt;
184 rt_rq_iterator.next = load_balance_next_rt;
185 /* pass 'busiest' rq argument into
186 * load_balance_[start|next]_rt iterators
187 */
188 rt_rq_iterator.arg = busiest;
189
190 return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
191 idle, all_pinned, this_best_prio, &rt_rq_iterator);
192} 972}
193 973
194static int 974static int
195move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 975move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
196 struct sched_domain *sd, enum cpu_idle_type idle) 976 struct sched_domain *sd, enum cpu_idle_type idle)
197{ 977{
198 struct rq_iterator rt_rq_iterator; 978 /* don't touch RT tasks */
979 return 0;
980}
981
982static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
983{
984 int weight = cpus_weight(*new_mask);
199 985
200 rt_rq_iterator.start = load_balance_start_rt; 986 BUG_ON(!rt_task(p));
201 rt_rq_iterator.next = load_balance_next_rt; 987
202 rt_rq_iterator.arg = busiest; 988 /*
989 * Update the migration status of the RQ if we have an RT task
990 * which is running AND changing its weight value.
991 */
992 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
993 struct rq *rq = task_rq(p);
203 994
204 return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 995 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
205 &rt_rq_iterator); 996 rq->rt.rt_nr_migratory++;
997 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
998 BUG_ON(!rq->rt.rt_nr_migratory);
999 rq->rt.rt_nr_migratory--;
1000 }
1001
1002 update_rt_migration(rq);
1003 }
1004
1005 p->cpus_allowed = *new_mask;
1006 p->rt.nr_cpus_allowed = weight;
206} 1007}
207#endif
208 1008
209static void task_tick_rt(struct rq *rq, struct task_struct *p) 1009/* Assumes rq->lock is held */
1010static void join_domain_rt(struct rq *rq)
210{ 1011{
1012 if (rq->rt.overloaded)
1013 rt_set_overload(rq);
1014}
1015
1016/* Assumes rq->lock is held */
1017static void leave_domain_rt(struct rq *rq)
1018{
1019 if (rq->rt.overloaded)
1020 rt_clear_overload(rq);
1021}
1022
1023/*
1024 * When we switch from the rt queue, we bring ourselves to a position
1025 * where we might want to pull RT tasks from other runqueues.
1026 */
1027static void switched_from_rt(struct rq *rq, struct task_struct *p,
1028 int running)
1029{
1030 /*
1031 * If there are other RT tasks then we will reschedule
1032 * and the scheduling of the other RT tasks will handle
1033 * the balancing. But if we are the last RT task
1034 * we may need to handle the pulling of RT tasks
1035 * now.
1036 */
1037 if (!rq->rt.rt_nr_running)
1038 pull_rt_task(rq);
1039}
1040#endif /* CONFIG_SMP */
1041
1042/*
1043 * When switching a task to RT, we may overload the runqueue
1044 * with RT tasks. In this case we try to push them off to
1045 * other runqueues.
1046 */
1047static void switched_to_rt(struct rq *rq, struct task_struct *p,
1048 int running)
1049{
1050 int check_resched = 1;
1051
1052 /*
1053 * If we are already running, then there's nothing
1054 * that needs to be done. But if we are not running
1055 * we may need to preempt the current running task.
1056 * If that current running task is also an RT task
1057 * then see if we can move to another run queue.
1058 */
1059 if (!running) {
1060#ifdef CONFIG_SMP
1061 if (rq->rt.overloaded && push_rt_task(rq) &&
1062 /* Don't resched if we changed runqueues */
1063 rq != task_rq(p))
1064 check_resched = 0;
1065#endif /* CONFIG_SMP */
1066 if (check_resched && p->prio < rq->curr->prio)
1067 resched_task(rq->curr);
1068 }
1069}
1070
1071/*
1072 * Priority of the task has changed. This may cause
1073 * us to initiate a push or pull.
1074 */
1075static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1076 int oldprio, int running)
1077{
1078 if (running) {
1079#ifdef CONFIG_SMP
1080 /*
1081 * If our priority decreases while running, we
1082 * may need to pull tasks to this runqueue.
1083 */
1084 if (oldprio < p->prio)
1085 pull_rt_task(rq);
1086 /*
1087 * If there's a higher priority task waiting to run
1088 * then reschedule.
1089 */
1090 if (p->prio > rq->rt.highest_prio)
1091 resched_task(p);
1092#else
1093 /* For UP simply resched on drop of prio */
1094 if (oldprio < p->prio)
1095 resched_task(p);
1096#endif /* CONFIG_SMP */
1097 } else {
1098 /*
1099 * This task is not running, but if it is
1100 * higher in priority than the current running task
1101 * then reschedule.
1102 */
1103 if (p->prio < rq->curr->prio)
1104 resched_task(rq->curr);
1105 }
1106}
1107
1108static void watchdog(struct rq *rq, struct task_struct *p)
1109{
1110 unsigned long soft, hard;
1111
1112 if (!p->signal)
1113 return;
1114
1115 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1116 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1117
1118 if (soft != RLIM_INFINITY) {
1119 unsigned long next;
1120
1121 p->rt.timeout++;
1122 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1123 if (p->rt.timeout > next)
1124 p->it_sched_expires = p->se.sum_exec_runtime;
1125 }
1126}
1127
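
The watchdog() helper above enforces RLIMIT_RTTIME by counting scheduler ticks in p->rt.timeout and comparing against the limit converted from microseconds to ticks. Below is a standalone illustration of just that conversion, assuming HZ=1000 so that one tick equals 1000 microseconds (illustrative only, not kernel code).

/* Standalone illustration of the RLIMIT_RTTIME tick conversion (illustrative only). */
#include <stdio.h>

#define HZ           1000UL
#define USEC_PER_SEC 1000000UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long soft = 950000;   /* 950 ms of RT runtime allowed        */
        unsigned long hard = 2000000;  /* 2 s hard cap                        */
        unsigned long limit = soft < hard ? soft : hard;
        unsigned long ticks = DIV_ROUND_UP(limit, USEC_PER_SEC / HZ);

        /* p->rt.timeout is bumped once per tick; past this count it fires. */
        printf("limit of %lu us == %lu ticks at HZ=%lu\n", limit, ticks, HZ);
        return 0;
}
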
1128static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1129{
1130 update_curr_rt(rq);
1131
1132 watchdog(rq, p);
1133
211 /* 1134 /*
212 * RR tasks need a special form of timeslice management. 1135 * RR tasks need a special form of timeslice management.
213 * FIFO tasks have no timeslices. 1136 * FIFO tasks have no timeslices.
@@ -215,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
215 if (p->policy != SCHED_RR) 1138 if (p->policy != SCHED_RR)
216 return; 1139 return;
217 1140
218 if (--p->time_slice) 1141 if (--p->rt.time_slice)
219 return; 1142 return;
220 1143
221 p->time_slice = DEF_TIMESLICE; 1144 p->rt.time_slice = DEF_TIMESLICE;
222 1145
223 /* 1146 /*
224 * Requeue to the end of queue if we are not the only element 1147 * Requeue to the end of queue if we are not the only element
225 * on the queue: 1148 * on the queue:
226 */ 1149 */
227 if (p->run_list.prev != p->run_list.next) { 1150 if (p->rt.run_list.prev != p->rt.run_list.next) {
228 requeue_task_rt(rq, p); 1151 requeue_task_rt(rq, p);
229 set_tsk_need_resched(p); 1152 set_tsk_need_resched(p);
230 } 1153 }
@@ -242,6 +1165,9 @@ const struct sched_class rt_sched_class = {
242 .enqueue_task = enqueue_task_rt, 1165 .enqueue_task = enqueue_task_rt,
243 .dequeue_task = dequeue_task_rt, 1166 .dequeue_task = dequeue_task_rt,
244 .yield_task = yield_task_rt, 1167 .yield_task = yield_task_rt,
1168#ifdef CONFIG_SMP
1169 .select_task_rq = select_task_rq_rt,
1170#endif /* CONFIG_SMP */
245 1171
246 .check_preempt_curr = check_preempt_curr_rt, 1172 .check_preempt_curr = check_preempt_curr_rt,
247 1173
@@ -251,8 +1177,18 @@ const struct sched_class rt_sched_class = {
251#ifdef CONFIG_SMP 1177#ifdef CONFIG_SMP
252 .load_balance = load_balance_rt, 1178 .load_balance = load_balance_rt,
253 .move_one_task = move_one_task_rt, 1179 .move_one_task = move_one_task_rt,
1180 .set_cpus_allowed = set_cpus_allowed_rt,
1181 .join_domain = join_domain_rt,
1182 .leave_domain = leave_domain_rt,
1183 .pre_schedule = pre_schedule_rt,
1184 .post_schedule = post_schedule_rt,
1185 .task_wake_up = task_wake_up_rt,
1186 .switched_from = switched_from_rt,
254#endif 1187#endif
255 1188
256 .set_curr_task = set_curr_task_rt, 1189 .set_curr_task = set_curr_task_rt,
257 .task_tick = task_tick_rt, 1190 .task_tick = task_tick_rt,
1191
1192 .prio_changed = prio_changed_rt,
1193 .switched_to = switched_to_rt,
258}; 1194};
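
Taken together, the new .select_task_rq, .pre_schedule, .post_schedule, .task_wake_up, .switched_from/.switched_to and .prio_changed entries show the shape of the change: all of the push/pull policy hangs off optional per-class function pointers, so the core scheduler stays class-agnostic. A tiny standalone model of that ops-table pattern follows (invented names, not the kernel's struct sched_class).

/* Standalone model of a sched_class-style ops table (illustrative only). */
#include <stdio.h>

struct model_rq { int cpu; };

struct model_sched_class {
        void (*pre_schedule)(struct model_rq *rq);   /* optional hook */
        void (*post_schedule)(struct model_rq *rq);  /* optional hook */
};

static void rt_pre(struct model_rq *rq)  { printf("pull check on cpu %d\n", rq->cpu); }
static void rt_post(struct model_rq *rq) { printf("push check on cpu %d\n", rq->cpu); }

static const struct model_sched_class model_rt_class = {
        .pre_schedule  = rt_pre,
        .post_schedule = rt_post,
};

/* Core-scheduler side: call hooks only if the class provides them. */
static void model_schedule(struct model_rq *rq, const struct model_sched_class *class)
{
        if (class->pre_schedule)
                class->pre_schedule(rq);
        /* ... the context switch itself would happen here ... */
        if (class->post_schedule)
                class->post_schedule(rq);
}

int main(void)
{
        struct model_rq rq = { .cpu = 0 };

        model_schedule(&rq, &model_rt_class);
        return 0;
}
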
diff --git a/kernel/signal.c b/kernel/signal.c
index 657aa16d97cb..8054dd4e2d76 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
733 current->comm, task_pid_nr(current), signr); 733 current->comm, task_pid_nr(current), signr);
734 734
735#if defined(__i386__) && !defined(__arch_um__) 735#if defined(__i386__) && !defined(__arch_um__)
736 printk("code at %08lx: ", regs->eip); 736 printk("code at %08lx: ", regs->ip);
737 { 737 {
738 int i; 738 int i;
739 for (i = 0; i < 16; i++) { 739 for (i = 0; i < 16; i++) {
740 unsigned char insn; 740 unsigned char insn;
741 741
742 __get_user(insn, (unsigned char *)(regs->eip + i)); 742 __get_user(insn, (unsigned char *)(regs->ip + i));
743 printk("%02x ", insn); 743 printk("%02x ", insn);
744 } 744 }
745 } 745 }
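
The hunk above is mechanical fallout of the x86 register-name unification (regs->eip became regs->ip); the surrounding loop just dumps 16 code bytes at the faulting instruction pointer via __get_user() so a bad pointer cannot fault the kernel again. Below is a rough userspace analogue of that hex dump, reading an ordinary buffer instead of regs->ip, for illustration only.

/* Rough userspace analogue of the 16-byte hex dump above (illustrative only). */
#include <stdio.h>

static void dump_bytes(const void *addr)
{
        const unsigned char *p = addr;
        int i;

        printf("code at %p: ", addr);
        for (i = 0; i < 16; i++)
                printf("%02x ", p[i]);
        printf("\n");
}

int main(void)
{
        /* In the kernel the address would be regs->ip; here any buffer will do. */
        unsigned char buf[16] = { 0x55, 0x48, 0x89, 0xe5 };  /* arbitrary bytes */

        dump_bytes(buf);
        return 0;
}
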
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bd89bc4eb0b9..d7837d45419e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -3,7 +3,9 @@
3 * 3 *
4 * Copyright (C) 1992 Linus Torvalds 4 * Copyright (C) 1992 Linus Torvalds
5 * 5 *
6 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 6 * Distribute under GPLv2.
7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
7 */ 9 */
8 10
9#include <linux/module.h> 11#include <linux/module.h>
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void)
278 */ 280 */
279void irq_enter(void) 281void irq_enter(void)
280{ 282{
283#ifdef CONFIG_NO_HZ
284 int cpu = smp_processor_id();
285 if (idle_cpu(cpu) && !in_interrupt())
286 tick_nohz_stop_idle(cpu);
287#endif
281 __irq_enter(); 288 __irq_enter();
282#ifdef CONFIG_NO_HZ 289#ifdef CONFIG_NO_HZ
283 if (idle_cpu(smp_processor_id())) 290 if (idle_cpu(cpu))
284 tick_nohz_update_jiffies(); 291 tick_nohz_update_jiffies();
285#endif 292#endif
286} 293}
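
The irq_enter() change above exists because, with NO_HZ, an idle CPU's sleep interval has to be closed out the moment an interrupt arrives, before __irq_enter() marks the CPU as being in interrupt context; the call order suggests tick_nohz_stop_idle() does exactly that accounting. Below is a standalone sketch of that "close the idle interval on wakeup" bookkeeping (illustrative only, with a fake clock).

/* Standalone sketch of closing an idle interval when an IRQ arrives (illustrative only). */
#include <stdio.h>

static unsigned long long now_ns;      /* fake clock for the example        */
static unsigned long long idle_start;  /* 0 means "not idle"                */
static unsigned long long idle_total;

static void model_enter_idle(void)  { idle_start = now_ns; }

static void model_irq_enter(void)
{
        if (idle_start) {                      /* tick_nohz_stop_idle() role */
                idle_total += now_ns - idle_start;
                idle_start = 0;
        }
        /* __irq_enter() would raise the hardirq count here. */
}

int main(void)
{
        now_ns = 1000; model_enter_idle();
        now_ns = 6000; model_irq_enter();      /* device interrupt wakes us  */
        printf("idle time accounted: %llu ns\n", idle_total); /* 5000 */
        return 0;
}
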
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 11df812263c8..c1d76552446e 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/cpu.h> 10#include <linux/cpu.h>
11#include <linux/nmi.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/freezer.h> 14#include <linux/freezer.h>
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp);
23static DEFINE_PER_CPU(unsigned long, print_timestamp); 24static DEFINE_PER_CPU(unsigned long, print_timestamp);
24static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 25static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
25 26
26static int did_panic; 27static int __read_mostly did_panic;
27int softlockup_thresh = 10; 28unsigned long __read_mostly softlockup_thresh = 60;
28 29
29static int 30static int
30softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) 31softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
45 */ 46 */
46static unsigned long get_timestamp(int this_cpu) 47static unsigned long get_timestamp(int this_cpu)
47{ 48{
48 return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ 49 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
49} 50}
50 51
51void touch_softlockup_watchdog(void) 52void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)
100 101
101 now = get_timestamp(this_cpu); 102 now = get_timestamp(this_cpu);
102 103
103 /* Wake up the high-prio watchdog task every second: */ 104 /* Warn about unreasonable delays: */
104 if (now > (touch_timestamp + 1))
105 wake_up_process(per_cpu(watchdog_task, this_cpu));
106
107 /* Warn about unreasonable 10+ seconds delays: */
108 if (now <= (touch_timestamp + softlockup_thresh)) 105 if (now <= (touch_timestamp + softlockup_thresh))
109 return; 106 return;
110 107
@@ -122,11 +119,93 @@ void softlockup_tick(void)
122} 119}
123 120
124/* 121/*
122 * Have a reasonable limit on the number of tasks checked:
123 */
124unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
125
126/*
127 * Zero means infinite timeout - no checking done:
128 */
129unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
130
131unsigned long __read_mostly sysctl_hung_task_warnings = 10;
132
133/*
134 * Only do the hung-tasks check on one CPU:
135 */
136static int check_cpu __read_mostly = -1;
137
138static void check_hung_task(struct task_struct *t, unsigned long now)
139{
140 unsigned long switch_count = t->nvcsw + t->nivcsw;
141
142 if (t->flags & PF_FROZEN)
143 return;
144
145 if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
146 t->last_switch_count = switch_count;
147 t->last_switch_timestamp = now;
148 return;
149 }
150 if ((long)(now - t->last_switch_timestamp) <
151 sysctl_hung_task_timeout_secs)
152 return;
153 if (sysctl_hung_task_warnings < 0)
154 return;
155 sysctl_hung_task_warnings--;
156
157 /*
158 * Ok, the task did not get scheduled for more than 2 minutes,
159 * complain:
160 */
161 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
162 "%ld seconds.\n", t->comm, t->pid,
163 sysctl_hung_task_timeout_secs);
164 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
165 " disables this message.\n");
166 sched_show_task(t);
167 __debug_show_held_locks(t);
168
169 t->last_switch_timestamp = now;
170 touch_nmi_watchdog();
171}
172
173/*
174 * Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
175 * a really long time (120 seconds). If that happens, print out
176 * a warning.
177 */
178static void check_hung_uninterruptible_tasks(int this_cpu)
179{
180 int max_count = sysctl_hung_task_check_count;
181 unsigned long now = get_timestamp(this_cpu);
182 struct task_struct *g, *t;
183
184 /*
185 * If the system crashed already then all bets are off,
186 * do not report extra hung tasks:
187 */
188 if ((tainted & TAINT_DIE) || did_panic)
189 return;
190
191 read_lock(&tasklist_lock);
192 do_each_thread(g, t) {
193 if (!--max_count)
194 break;
195 if (t->state & TASK_UNINTERRUPTIBLE)
196 check_hung_task(t, now);
197 } while_each_thread(g, t);
198
199 read_unlock(&tasklist_lock);
200}
201
202/*
125 * The watchdog thread - runs every second and touches the timestamp. 203 * The watchdog thread - runs every second and touches the timestamp.
126 */ 204 */
127static int watchdog(void *__bind_cpu) 205static int watchdog(void *__bind_cpu)
128{ 206{
129 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 207 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
208 int this_cpu = (long)__bind_cpu;
130 209
131 sched_setscheduler(current, SCHED_FIFO, &param); 210 sched_setscheduler(current, SCHED_FIFO, &param);
132 211
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu)
135 214
136 /* 215 /*
137 * Run briefly once per second to reset the softlockup timestamp. 216 * Run briefly once per second to reset the softlockup timestamp.
138 * If this gets delayed for more than 10 seconds then the 217 * If this gets delayed for more than 60 seconds then the
139 * debug-printout triggers in softlockup_tick(). 218 * debug-printout triggers in softlockup_tick().
140 */ 219 */
141 while (!kthread_should_stop()) { 220 while (!kthread_should_stop()) {
142 set_current_state(TASK_INTERRUPTIBLE);
143 touch_softlockup_watchdog(); 221 touch_softlockup_watchdog();
144 schedule(); 222 msleep_interruptible(10000);
223
224 if (this_cpu != check_cpu)
225 continue;
226
227 if (sysctl_hung_task_timeout_secs)
228 check_hung_uninterruptible_tasks(this_cpu);
145 } 229 }
146 230
147 return 0; 231 return 0;
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
171 break; 255 break;
172 case CPU_ONLINE: 256 case CPU_ONLINE:
173 case CPU_ONLINE_FROZEN: 257 case CPU_ONLINE_FROZEN:
258 check_cpu = any_online_cpu(cpu_online_map);
174 wake_up_process(per_cpu(watchdog_task, hotcpu)); 259 wake_up_process(per_cpu(watchdog_task, hotcpu));
175 break; 260 break;
176#ifdef CONFIG_HOTPLUG_CPU 261#ifdef CONFIG_HOTPLUG_CPU
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
181 /* Unbind so it can run. Fall thru. */ 266 /* Unbind so it can run. Fall thru. */
182 kthread_bind(per_cpu(watchdog_task, hotcpu), 267 kthread_bind(per_cpu(watchdog_task, hotcpu),
183 any_online_cpu(cpu_online_map)); 268 any_online_cpu(cpu_online_map));
269 case CPU_DOWN_PREPARE:
270 case CPU_DOWN_PREPARE_FROZEN:
271 if (hotcpu == check_cpu) {
272 cpumask_t temp_cpu_online_map = cpu_online_map;
273
274 cpu_clear(hotcpu, temp_cpu_online_map);
275 check_cpu = any_online_cpu(temp_cpu_online_map);
276 }
277 break;
184 case CPU_DEAD: 278 case CPU_DEAD:
185 case CPU_DEAD_FROZEN: 279 case CPU_DEAD_FROZEN:
186 p = per_cpu(watchdog_task, hotcpu); 280 p = per_cpu(watchdog_task, hotcpu);
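
The hung-task detector added above rests on one observation: a task that makes any progress increments its context-switch count (nvcsw + nivcsw), so a count that stays unchanged across sysctl_hung_task_timeout_secs of wall time means the task sat in TASK_UNINTERRUPTIBLE for the whole interval. Below is a standalone model of check_hung_task()'s bookkeeping (illustrative only, not kernel code).

/* Standalone model of the hung-task check (illustrative only). */
#include <stdio.h>

struct model_task {
        unsigned long switch_count;          /* nvcsw + nivcsw               */
        unsigned long last_switch_count;
        unsigned long last_switch_timestamp; /* seconds                      */
};

static unsigned long timeout_secs = 120;

static int model_check_hung(struct model_task *t, unsigned long now)
{
        if (t->switch_count != t->last_switch_count || !t->last_switch_timestamp) {
                t->last_switch_count = t->switch_count;   /* made progress   */
                t->last_switch_timestamp = now;
                return 0;
        }
        if (now - t->last_switch_timestamp < timeout_secs)
                return 0;                                 /* not stuck long enough */

        t->last_switch_timestamp = now;                   /* rate-limit the warning */
        return 1;
}

int main(void)
{
        struct model_task t = { .switch_count = 7 };

        model_check_hung(&t, 10);                 /* records the baseline    */
        printf("hung after 200s of no switches: %d\n", model_check_hung(&t, 210));
        return 0;
}
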
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index cd72424c2662..ae28c8245123 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock);
65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
66 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 66 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
67 */ 67 */
68#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ 68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 defined(CONFIG_DEBUG_LOCK_ALLOC)
70 69
71void __lockfunc _read_lock(rwlock_t *lock) 70void __lockfunc _read_lock(rwlock_t *lock)
72{ 71{
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 319821ef78af..51b5ee53571a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
203 int ret; 203 int ret;
204 204
205 /* No CPUs can come up or down during this. */ 205 /* No CPUs can come up or down during this. */
206 lock_cpu_hotplug(); 206 get_online_cpus();
207 p = __stop_machine_run(fn, data, cpu); 207 p = __stop_machine_run(fn, data, cpu);
208 if (!IS_ERR(p)) 208 if (!IS_ERR(p))
209 ret = kthread_stop(p); 209 ret = kthread_stop(p);
210 else 210 else
211 ret = PTR_ERR(p); 211 ret = PTR_ERR(p);
212 unlock_cpu_hotplug(); 212 put_online_cpus();
213 213
214 return ret; 214 return ret;
215} 215}
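
The stop_machine_run() hunk is part of the wider lock_cpu_hotplug() -> get_online_cpus() conversion: instead of one big mutex, readers take a reference that CPU hotplug must wait to drain. Below is a deliberately simplified standalone model of that get/put gate (illustrative only; the real primitive sleeps and serializes against writers properly).

/* Standalone model of a get/put refcount gating a state change (illustrative only). */
#include <stdio.h>

static int online_users;   /* readers that need the CPU set to stay stable */

static void model_get_online_cpus(void) { online_users++; }
static void model_put_online_cpus(void) { online_users--; }

static int model_cpu_down_allowed(void)
{
        return online_users == 0;  /* hotplug must wait for all users */
}

int main(void)
{
        model_get_online_cpus();
        printf("hotplug allowed while held: %d\n", model_cpu_down_allowed()); /* 0 */
        model_put_online_cpus();
        printf("hotplug allowed after put:  %d\n", model_cpu_down_allowed()); /* 1 */
        return 0;
}
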
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 56cb009a4b35..beee5b3b68a2 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -131,6 +131,7 @@ cond_syscall(sys32_sysctl);
131cond_syscall(ppc_rtas); 131cond_syscall(ppc_rtas);
132cond_syscall(sys_spu_run); 132cond_syscall(sys_spu_run);
133cond_syscall(sys_spu_create); 133cond_syscall(sys_spu_create);
134cond_syscall(sys_subpage_prot);
134 135
135/* mmu depending weak syscall entries */ 136/* mmu depending weak syscall entries */
136cond_syscall(sys_mprotect); 137cond_syscall(sys_mprotect);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8ac51714b08c..357b68ba23ec 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -53,6 +53,7 @@
53#ifdef CONFIG_X86 53#ifdef CONFIG_X86
54#include <asm/nmi.h> 54#include <asm/nmi.h>
55#include <asm/stacktrace.h> 55#include <asm/stacktrace.h>
56#include <asm/io.h>
56#endif 57#endif
57 58
58static int deprecated_sysctl_warning(struct __sysctl_args *args); 59static int deprecated_sysctl_warning(struct __sysctl_args *args);
@@ -81,6 +82,7 @@ extern int compat_log;
81extern int maps_protect; 82extern int maps_protect;
82extern int sysctl_stat_interval; 83extern int sysctl_stat_interval;
83extern int audit_argv_kb; 84extern int audit_argv_kb;
85extern int latencytop_enabled;
84 86
85/* Constants used for minimum and maximum */ 87/* Constants used for minimum and maximum */
86#ifdef CONFIG_DETECT_SOFTLOCKUP 88#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -156,8 +158,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
156#endif 158#endif
157 159
158static struct ctl_table root_table[]; 160static struct ctl_table root_table[];
159static struct ctl_table_header root_table_header = 161static struct ctl_table_root sysctl_table_root;
160 { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; 162static struct ctl_table_header root_table_header = {
163 .ctl_table = root_table,
164 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
165 .root = &sysctl_table_root,
166};
167static struct ctl_table_root sysctl_table_root = {
168 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
169 .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
170};
161 171
162static struct ctl_table kern_table[]; 172static struct ctl_table kern_table[];
163static struct ctl_table vm_table[]; 173static struct ctl_table vm_table[];
@@ -191,14 +201,6 @@ static struct ctl_table root_table[] = {
191 .mode = 0555, 201 .mode = 0555,
192 .child = vm_table, 202 .child = vm_table,
193 }, 203 },
194#ifdef CONFIG_NET
195 {
196 .ctl_name = CTL_NET,
197 .procname = "net",
198 .mode = 0555,
199 .child = net_table,
200 },
201#endif
202 { 204 {
203 .ctl_name = CTL_FS, 205 .ctl_name = CTL_FS,
204 .procname = "fs", 206 .procname = "fs",
@@ -225,10 +227,10 @@ static struct ctl_table root_table[] = {
225}; 227};
226 228
227#ifdef CONFIG_SCHED_DEBUG 229#ifdef CONFIG_SCHED_DEBUG
228static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ 230static int min_sched_granularity_ns = 100000; /* 100 usecs */
229static unsigned long max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 231static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
230static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ 232static int min_wakeup_granularity_ns; /* 0 usecs */
231static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 233static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
232#endif 234#endif
233 235
234static struct ctl_table kern_table[] = { 236static struct ctl_table kern_table[] = {
@@ -306,9 +308,43 @@ static struct ctl_table kern_table[] = {
306 .procname = "sched_nr_migrate", 308 .procname = "sched_nr_migrate",
307 .data = &sysctl_sched_nr_migrate, 309 .data = &sysctl_sched_nr_migrate,
308 .maxlen = sizeof(unsigned int), 310 .maxlen = sizeof(unsigned int),
309 .mode = 644, 311 .mode = 0644,
312 .proc_handler = &proc_dointvec,
313 },
314 {
315 .ctl_name = CTL_UNNUMBERED,
316 .procname = "sched_rt_period_ms",
317 .data = &sysctl_sched_rt_period,
318 .maxlen = sizeof(unsigned int),
319 .mode = 0644,
310 .proc_handler = &proc_dointvec, 320 .proc_handler = &proc_dointvec,
311 }, 321 },
322 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_rt_ratio",
325 .data = &sysctl_sched_rt_ratio,
326 .maxlen = sizeof(unsigned int),
327 .mode = 0644,
328 .proc_handler = &proc_dointvec,
329 },
330#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
331 {
332 .ctl_name = CTL_UNNUMBERED,
333 .procname = "sched_min_bal_int_shares",
334 .data = &sysctl_sched_min_bal_int_shares,
335 .maxlen = sizeof(unsigned int),
336 .mode = 0644,
337 .proc_handler = &proc_dointvec,
338 },
339 {
340 .ctl_name = CTL_UNNUMBERED,
341 .procname = "sched_max_bal_int_shares",
342 .data = &sysctl_sched_max_bal_int_shares,
343 .maxlen = sizeof(unsigned int),
344 .mode = 0644,
345 .proc_handler = &proc_dointvec,
346 },
347#endif
312#endif 348#endif
313 { 349 {
314 .ctl_name = CTL_UNNUMBERED, 350 .ctl_name = CTL_UNNUMBERED,
@@ -382,6 +418,15 @@ static struct ctl_table kern_table[] = {
382 .proc_handler = &proc_dointvec_taint, 418 .proc_handler = &proc_dointvec_taint,
383 }, 419 },
384#endif 420#endif
421#ifdef CONFIG_LATENCYTOP
422 {
423 .procname = "latencytop",
424 .data = &latencytop_enabled,
425 .maxlen = sizeof(int),
426 .mode = 0644,
427 .proc_handler = &proc_dointvec,
428 },
429#endif
385#ifdef CONFIG_SECURITY_CAPABILITIES 430#ifdef CONFIG_SECURITY_CAPABILITIES
386 { 431 {
387 .procname = "cap-bound", 432 .procname = "cap-bound",
@@ -683,6 +728,14 @@ static struct ctl_table kern_table[] = {
683 .mode = 0644, 728 .mode = 0644,
684 .proc_handler = &proc_dointvec, 729 .proc_handler = &proc_dointvec,
685 }, 730 },
731 {
732 .ctl_name = CTL_UNNUMBERED,
733 .procname = "io_delay_type",
734 .data = &io_delay_type,
735 .maxlen = sizeof(int),
736 .mode = 0644,
737 .proc_handler = &proc_dointvec,
738 },
686#endif 739#endif
687#if defined(CONFIG_MMU) 740#if defined(CONFIG_MMU)
688 { 741 {
@@ -728,13 +781,40 @@ static struct ctl_table kern_table[] = {
728 .ctl_name = CTL_UNNUMBERED, 781 .ctl_name = CTL_UNNUMBERED,
729 .procname = "softlockup_thresh", 782 .procname = "softlockup_thresh",
730 .data = &softlockup_thresh, 783 .data = &softlockup_thresh,
731 .maxlen = sizeof(int), 784 .maxlen = sizeof(unsigned long),
732 .mode = 0644, 785 .mode = 0644,
733 .proc_handler = &proc_dointvec_minmax, 786 .proc_handler = &proc_doulongvec_minmax,
734 .strategy = &sysctl_intvec, 787 .strategy = &sysctl_intvec,
735 .extra1 = &one, 788 .extra1 = &one,
736 .extra2 = &sixty, 789 .extra2 = &sixty,
737 }, 790 },
791 {
792 .ctl_name = CTL_UNNUMBERED,
793 .procname = "hung_task_check_count",
794 .data = &sysctl_hung_task_check_count,
795 .maxlen = sizeof(unsigned long),
796 .mode = 0644,
797 .proc_handler = &proc_doulongvec_minmax,
798 .strategy = &sysctl_intvec,
799 },
800 {
801 .ctl_name = CTL_UNNUMBERED,
802 .procname = "hung_task_timeout_secs",
803 .data = &sysctl_hung_task_timeout_secs,
804 .maxlen = sizeof(unsigned long),
805 .mode = 0644,
806 .proc_handler = &proc_doulongvec_minmax,
807 .strategy = &sysctl_intvec,
808 },
809 {
810 .ctl_name = CTL_UNNUMBERED,
811 .procname = "hung_task_warnings",
812 .data = &sysctl_hung_task_warnings,
813 .maxlen = sizeof(unsigned long),
814 .mode = 0644,
815 .proc_handler = &proc_doulongvec_minmax,
816 .strategy = &sysctl_intvec,
817 },
738#endif 818#endif
739#ifdef CONFIG_COMPAT 819#ifdef CONFIG_COMPAT
740 { 820 {
@@ -906,11 +986,11 @@ static struct ctl_table vm_table[] = {
906 }, 986 },
907 { 987 {
908 .ctl_name = CTL_UNNUMBERED, 988 .ctl_name = CTL_UNNUMBERED,
909 .procname = "hugetlb_dynamic_pool", 989 .procname = "nr_overcommit_hugepages",
910 .data = &hugetlb_dynamic_pool, 990 .data = &nr_overcommit_huge_pages,
911 .maxlen = sizeof(hugetlb_dynamic_pool), 991 .maxlen = sizeof(nr_overcommit_huge_pages),
912 .mode = 0644, 992 .mode = 0644,
913 .proc_handler = &proc_dointvec, 993 .proc_handler = &proc_doulongvec_minmax,
914 }, 994 },
915#endif 995#endif
916 { 996 {
@@ -1300,12 +1380,27 @@ void sysctl_head_finish(struct ctl_table_header *head)
1300 spin_unlock(&sysctl_lock); 1380 spin_unlock(&sysctl_lock);
1301} 1381}
1302 1382
1303struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) 1383static struct list_head *
1384lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1385{
1386 struct list_head *header_list;
1387 header_list = &root->header_list;
1388 if (root->lookup)
1389 header_list = root->lookup(root, namespaces);
1390 return header_list;
1391}
1392
1393struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1394 struct ctl_table_header *prev)
1304{ 1395{
1396 struct ctl_table_root *root;
1397 struct list_head *header_list;
1305 struct ctl_table_header *head; 1398 struct ctl_table_header *head;
1306 struct list_head *tmp; 1399 struct list_head *tmp;
1400
1307 spin_lock(&sysctl_lock); 1401 spin_lock(&sysctl_lock);
1308 if (prev) { 1402 if (prev) {
1403 head = prev;
1309 tmp = &prev->ctl_entry; 1404 tmp = &prev->ctl_entry;
1310 unuse_table(prev); 1405 unuse_table(prev);
1311 goto next; 1406 goto next;
@@ -1319,14 +1414,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1319 spin_unlock(&sysctl_lock); 1414 spin_unlock(&sysctl_lock);
1320 return head; 1415 return head;
1321 next: 1416 next:
1417 root = head->root;
1322 tmp = tmp->next; 1418 tmp = tmp->next;
1323 if (tmp == &root_table_header.ctl_entry) 1419 header_list = lookup_header_list(root, namespaces);
1324 break; 1420 if (tmp != header_list)
1421 continue;
1422
1423 do {
1424 root = list_entry(root->root_list.next,
1425 struct ctl_table_root, root_list);
1426 if (root == &sysctl_table_root)
1427 goto out;
1428 header_list = lookup_header_list(root, namespaces);
1429 } while (list_empty(header_list));
1430 tmp = header_list->next;
1325 } 1431 }
1432out:
1326 spin_unlock(&sysctl_lock); 1433 spin_unlock(&sysctl_lock);
1327 return NULL; 1434 return NULL;
1328} 1435}
1329 1436
1437struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1438{
1439 return __sysctl_head_next(current->nsproxy, prev);
1440}
1441
1442void register_sysctl_root(struct ctl_table_root *root)
1443{
1444 spin_lock(&sysctl_lock);
1445 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1446 spin_unlock(&sysctl_lock);
1447}
1448
1330#ifdef CONFIG_SYSCTL_SYSCALL 1449#ifdef CONFIG_SYSCTL_SYSCALL
1331int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1450int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1332 void __user *newval, size_t newlen) 1451 void __user *newval, size_t newlen)
@@ -1483,18 +1602,21 @@ static __init int sysctl_init(void)
1483{ 1602{
1484 int err; 1603 int err;
1485 sysctl_set_parent(NULL, root_table); 1604 sysctl_set_parent(NULL, root_table);
1486 err = sysctl_check_table(root_table); 1605 err = sysctl_check_table(current->nsproxy, root_table);
1487 return 0; 1606 return 0;
1488} 1607}
1489 1608
1490core_initcall(sysctl_init); 1609core_initcall(sysctl_init);
1491 1610
1492/** 1611/**
1493 * register_sysctl_table - register a sysctl hierarchy 1612 * __register_sysctl_paths - register a sysctl hierarchy
1613 * @root: List of sysctl headers to register on
1614 * @namespaces: Data to compute which lists of sysctl entries are visible
1615 * @path: The path to the directory the sysctl table is in.
1494 * @table: the top-level table structure 1616 * @table: the top-level table structure
1495 * 1617 *
1496 * Register a sysctl table hierarchy. @table should be a filled in ctl_table 1618 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1497 * array. An entry with a ctl_name of 0 terminates the table. 1619 * array. A completely 0 filled entry terminates the table.
1498 * 1620 *
1499 * The members of the &struct ctl_table structure are used as follows: 1621 * The members of the &struct ctl_table structure are used as follows:
1500 * 1622 *
@@ -1557,25 +1679,99 @@ core_initcall(sysctl_init);
1557 * This routine returns %NULL on a failure to register, and a pointer 1679 * This routine returns %NULL on a failure to register, and a pointer
1558 * to the table header on success. 1680 * to the table header on success.
1559 */ 1681 */
1560struct ctl_table_header *register_sysctl_table(struct ctl_table * table) 1682struct ctl_table_header *__register_sysctl_paths(
1683 struct ctl_table_root *root,
1684 struct nsproxy *namespaces,
1685 const struct ctl_path *path, struct ctl_table *table)
1561{ 1686{
1562 struct ctl_table_header *tmp; 1687 struct list_head *header_list;
1563 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); 1688 struct ctl_table_header *header;
1564 if (!tmp) 1689 struct ctl_table *new, **prevp;
1690 unsigned int n, npath;
1691
1692 /* Count the path components */
1693 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
1694 ;
1695
1696 /*
1697 * For each path component, allocate a 2-element ctl_table array.
1698 * The first array element will be filled with the sysctl entry
1699 * for this, the second will be the sentinel (ctl_name == 0).
1700 *
1701 * We allocate everything in one go so that we don't have to
1702 * worry about freeing additional memory in unregister_sysctl_table.
1703 */
1704 header = kzalloc(sizeof(struct ctl_table_header) +
1705 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1706 if (!header)
1565 return NULL; 1707 return NULL;
1566 tmp->ctl_table = table; 1708
1567 INIT_LIST_HEAD(&tmp->ctl_entry); 1709 new = (struct ctl_table *) (header + 1);
1568 tmp->used = 0; 1710
1569 tmp->unregistering = NULL; 1711 /* Now connect the dots */
1570 sysctl_set_parent(NULL, table); 1712 prevp = &header->ctl_table;
1571 if (sysctl_check_table(tmp->ctl_table)) { 1713 for (n = 0; n < npath; ++n, ++path) {
1572 kfree(tmp); 1714 /* Copy the procname */
1715 new->procname = path->procname;
1716 new->ctl_name = path->ctl_name;
1717 new->mode = 0555;
1718
1719 *prevp = new;
1720 prevp = &new->child;
1721
1722 new += 2;
1723 }
1724 *prevp = table;
1725 header->ctl_table_arg = table;
1726
1727 INIT_LIST_HEAD(&header->ctl_entry);
1728 header->used = 0;
1729 header->unregistering = NULL;
1730 header->root = root;
1731 sysctl_set_parent(NULL, header->ctl_table);
1732 if (sysctl_check_table(namespaces, header->ctl_table)) {
1733 kfree(header);
1573 return NULL; 1734 return NULL;
1574 } 1735 }
1575 spin_lock(&sysctl_lock); 1736 spin_lock(&sysctl_lock);
1576 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1737 header_list = lookup_header_list(root, namespaces);
1738 list_add_tail(&header->ctl_entry, header_list);
1577 spin_unlock(&sysctl_lock); 1739 spin_unlock(&sysctl_lock);
1578 return tmp; 1740
1741 return header;
1742}
1743
1744/**
1745 * register_sysctl_paths - register a sysctl table hierarchy
1746 * @path: The path to the directory the sysctl table is in.
1747 * @table: the top-level table structure
1748 *
1749 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1750 * array. A completely 0 filled entry terminates the table.
1751 *
1752 * See __register_sysctl_paths for more details.
1753 */
1754struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1755 struct ctl_table *table)
1756{
1757 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1758 path, table);
1759}
1760
1761/**
1762 * register_sysctl_table - register a sysctl table hierarchy
1763 * @table: the top-level table structure
1764 *
1765 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1766 * array. A completely 0 filled entry terminates the table.
1767 *
1768 * See register_sysctl_paths for more details.
1769 */
1770struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1771{
1772 static const struct ctl_path null_path[] = { {} };
1773
1774 return register_sysctl_paths(null_path, table);
1579} 1775}
1580 1776
1581/** 1777/**
@@ -1604,6 +1800,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
1604 return NULL; 1800 return NULL;
1605} 1801}
1606 1802
1803struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1804 struct ctl_table *table)
1805{
1806 return NULL;
1807}
1808
1607void unregister_sysctl_table(struct ctl_table_header * table) 1809void unregister_sysctl_table(struct ctl_table_header * table)
1608{ 1810{
1609} 1811}
@@ -2662,6 +2864,7 @@ EXPORT_SYMBOL(proc_dostring);
2662EXPORT_SYMBOL(proc_doulongvec_minmax); 2864EXPORT_SYMBOL(proc_doulongvec_minmax);
2663EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2865EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
2664EXPORT_SYMBOL(register_sysctl_table); 2866EXPORT_SYMBOL(register_sysctl_table);
2867EXPORT_SYMBOL(register_sysctl_paths);
2665EXPORT_SYMBOL(sysctl_intvec); 2868EXPORT_SYMBOL(sysctl_intvec);
2666EXPORT_SYMBOL(sysctl_jiffies); 2869EXPORT_SYMBOL(sysctl_jiffies);
2667EXPORT_SYMBOL(sysctl_ms_jiffies); 2870EXPORT_SYMBOL(sysctl_ms_jiffies);
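
__register_sysctl_paths() above leans on a single-allocation trick: the header plus npath two-entry directory tables come from one kzalloc(), each directory's first entry chains to the next level through ->child, and the zero-filled second entry acts as the terminating sentinel. Below is a standalone userspace model of just that chaining (illustrative only; trimmed-down struct, no header bookkeeping).

/* Standalone model of the one-allocation path chaining (illustrative only). */
#include <stdio.h>
#include <stdlib.h>

struct model_table {
        const char *procname;        /* a zeroed entry is the sentinel        */
        struct model_table *child;
};

static struct model_table *model_register(const char *const *path, int npath,
                                          struct model_table *leaf)
{
        struct model_table *dirs, **prevp, *root = NULL;
        int n;

        dirs = calloc(2 * (size_t)npath, sizeof(*dirs));  /* one allocation   */
        if (!dirs)
                return NULL;

        prevp = &root;
        for (n = 0; n < npath; n++) {
                dirs[2 * n].procname = path[n];   /* entry 0: directory name  */
                *prevp = &dirs[2 * n];            /* entry 1 stays zeroed     */
                prevp = &dirs[2 * n].child;
        }
        *prevp = leaf;                            /* hang the caller's table  */
        return root;
}

int main(void)
{
        const char *path[] = { "kernel", "demo" };
        struct model_table leaf = { .procname = "value", .child = NULL };
        struct model_table *head = model_register(path, 2, &leaf);
        const struct model_table *t;

        for (t = head; t; t = t->child)
                printf("%s%s", t->procname, t->child ? "/" : "\n");
        free(head);   /* head is the start of the single calloc'd block */
        return 0;
}
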
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index bed939f82c31..c3206fa50048 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1,6 +1,5 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../arch/s390/appldata/appldata.h"
4#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
5#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
6#include <linux/string.h> 5#include <linux/string.h>
@@ -428,7 +427,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = {
428 {} 427 {}
429}; 428};
430 429
431static struct trans_ctl_table trans_net_ax25_table[] = { 430static struct trans_ctl_table trans_net_ax25_param_table[] = {
432 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, 431 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
433 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, 432 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
434 { NET_AX25_BACKOFF_TYPE, "backoff_type" }, 433 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
@@ -446,6 +445,11 @@ static struct trans_ctl_table trans_net_ax25_table[] = {
446 {} 445 {}
447}; 446};
448 447
448static struct trans_ctl_table trans_net_ax25_table[] = {
449 { 0, NULL, trans_net_ax25_param_table },
450 {}
451};
452
449static struct trans_ctl_table trans_net_bridge_table[] = { 453static struct trans_ctl_table trans_net_bridge_table[] = {
450 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, 454 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
451 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, 455 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
@@ -1338,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table)
1338 } 1342 }
1339} 1343}
1340 1344
1341static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) 1345static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
1346 struct ctl_table *table)
1342{ 1347{
1343 struct ctl_table_header *head; 1348 struct ctl_table_header *head;
1344 struct ctl_table *ref, *test; 1349 struct ctl_table *ref, *test;
@@ -1346,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
1346 1351
1347 depth = sysctl_depth(table); 1352 depth = sysctl_depth(table);
1348 1353
1349 for (head = sysctl_head_next(NULL); head; 1354 for (head = __sysctl_head_next(namespaces, NULL); head;
1350 head = sysctl_head_next(head)) { 1355 head = __sysctl_head_next(namespaces, head)) {
1351 cur_depth = depth; 1356 cur_depth = depth;
1352 ref = head->ctl_table; 1357 ref = head->ctl_table;
1353repeat: 1358repeat:
@@ -1392,13 +1397,14 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
1392 *fail = str; 1397 *fail = str;
1393} 1398}
1394 1399
1395static int sysctl_check_dir(struct ctl_table *table) 1400static int sysctl_check_dir(struct nsproxy *namespaces,
1401 struct ctl_table *table)
1396{ 1402{
1397 struct ctl_table *ref; 1403 struct ctl_table *ref;
1398 int error; 1404 int error;
1399 1405
1400 error = 0; 1406 error = 0;
1401 ref = sysctl_check_lookup(table); 1407 ref = sysctl_check_lookup(namespaces, table);
1402 if (ref) { 1408 if (ref) {
1403 int match = 0; 1409 int match = 0;
1404 if ((!table->procname && !ref->procname) || 1410 if ((!table->procname && !ref->procname) ||
@@ -1423,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table)
1423 return error; 1429 return error;
1424} 1430}
1425 1431
1426static void sysctl_check_leaf(struct ctl_table *table, const char **fail) 1432static void sysctl_check_leaf(struct nsproxy *namespaces,
1433 struct ctl_table *table, const char **fail)
1427{ 1434{
1428 struct ctl_table *ref; 1435 struct ctl_table *ref;
1429 1436
1430 ref = sysctl_check_lookup(table); 1437 ref = sysctl_check_lookup(namespaces, table);
1431 if (ref && (ref != table)) 1438 if (ref && (ref != table))
1432 set_fail(fail, table, "Sysctl already exists"); 1439 set_fail(fail, table, "Sysctl already exists");
1433} 1440}
@@ -1451,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1451 } 1458 }
1452} 1459}
1453 1460
1454int sysctl_check_table(struct ctl_table *table) 1461int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1455{ 1462{
1456 int error = 0; 1463 int error = 0;
1457 for (; table->ctl_name || table->procname; table++) { 1464 for (; table->ctl_name || table->procname; table++) {
@@ -1481,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table)
1481 set_fail(&fail, table, "Directory with extra1"); 1488 set_fail(&fail, table, "Directory with extra1");
1482 if (table->extra2) 1489 if (table->extra2)
1483 set_fail(&fail, table, "Directory with extra2"); 1490 set_fail(&fail, table, "Directory with extra2");
1484 if (sysctl_check_dir(table)) 1491 if (sysctl_check_dir(namespaces, table))
1485 set_fail(&fail, table, "Inconsistent directory names"); 1492 set_fail(&fail, table, "Inconsistent directory names");
1486 } else { 1493 } else {
1487 if ((table->strategy == sysctl_data) || 1494 if ((table->strategy == sysctl_data) ||
@@ -1530,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table)
1530 if (!table->procname && table->proc_handler) 1537 if (!table->procname && table->proc_handler)
1531 set_fail(&fail, table, "proc_handler without procname"); 1538 set_fail(&fail, table, "proc_handler without procname");
1532#endif 1539#endif
1533 sysctl_check_leaf(table, &fail); 1540 sysctl_check_leaf(namespaces, table, &fail);
1534 } 1541 }
1535 sysctl_check_bin_path(table, &fail); 1542 sysctl_check_bin_path(table, &fail);
1536 if (fail) { 1543 if (fail) {
@@ -1538,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table)
1538 error = -EINVAL; 1545 error = -EINVAL;
1539 } 1546 }
1540 if (table->child) 1547 if (table->child)
1541 error |= sysctl_check_table(table->child); 1548 error |= sysctl_check_table(namespaces, table->child);
1542 } 1549 }
1543 return error; 1550 return error;
1544} 1551}
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
new file mode 100644
index 000000000000..88cdb109e13c
--- /dev/null
+++ b/kernel/test_kprobes.c
@@ -0,0 +1,216 @@
1/*
2 * test_kprobes.c - simple sanity test for *probes
3 *
4 * Copyright IBM Corp. 2008
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it would be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 */
16
17#include <linux/kernel.h>
18#include <linux/kprobes.h>
19#include <linux/random.h>
20
21#define div_factor 3
22
23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests;
25
26static noinline u32 kprobe_target(u32 value)
27{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor);
41}
42
43static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
44{
45 preh_val = (rand1 / div_factor);
46 return 0;
47}
48
49static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
50 unsigned long flags)
51{
52 if (preh_val != (rand1 / div_factor)) {
53 handler_errors++;
54 printk(KERN_ERR "Kprobe smoke test failed: "
55 "incorrect value in post_handler\n");
56 }
57 posth_val = preh_val + div_factor;
58}
59
60static struct kprobe kp = {
61 .symbol_name = "kprobe_target",
62 .pre_handler = kp_pre_handler,
63 .post_handler = kp_post_handler
64};
65
66static int test_kprobe(void)
67{
68 int ret;
69
70 ret = register_kprobe(&kp);
71 if (ret < 0) {
72 printk(KERN_ERR "Kprobe smoke test failed: "
73 "register_kprobe returned %d\n", ret);
74 return ret;
75 }
76
77 ret = kprobe_target(rand1);
78 unregister_kprobe(&kp);
79
80 if (preh_val == 0) {
81 printk(KERN_ERR "Kprobe smoke test failed: "
82 "kprobe pre_handler not called\n");
83 handler_errors++;
84 }
85
86 if (posth_val == 0) {
87 printk(KERN_ERR "Kprobe smoke test failed: "
88 "kprobe post_handler not called\n");
89 handler_errors++;
90 }
91
92 return 0;
93}
94
95static u32 j_kprobe_target(u32 value)
96{
97 if (value != rand1) {
98 handler_errors++;
99 printk(KERN_ERR "Kprobe smoke test failed: "
100 "incorrect value in jprobe handler\n");
101 }
102
103 jph_val = rand1;
104 jprobe_return();
105 return 0;
106}
107
108static struct jprobe jp = {
109 .entry = j_kprobe_target,
110 .kp.symbol_name = "kprobe_target"
111};
112
113static int test_jprobe(void)
114{
115 int ret;
116
117 ret = register_jprobe(&jp);
118 if (ret < 0) {
119 printk(KERN_ERR "Kprobe smoke test failed: "
120 "register_jprobe returned %d\n", ret);
121 return ret;
122 }
123
124 ret = kprobe_target(rand1);
125 unregister_jprobe(&jp);
126 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: "
128 "jprobe handler not called\n");
129 handler_errors++;
130 }
131
132 return 0;
133}
134
135#ifdef CONFIG_KRETPROBES
136static u32 krph_val;
137
138static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
139{
140 unsigned long ret = regs_return_value(regs);
141
142 if (ret != (rand1 / div_factor)) {
143 handler_errors++;
144 printk(KERN_ERR "Kprobe smoke test failed: "
145 "incorrect value in kretprobe handler\n");
146 }
147
148 krph_val = (rand1 / div_factor);
149 return 0;
150}
151
152static struct kretprobe rp = {
153 .handler = return_handler,
154 .kp.symbol_name = "kprobe_target"
155};
156
157static int test_kretprobe(void)
158{
159 int ret;
160
161 ret = register_kretprobe(&rp);
162 if (ret < 0) {
163 printk(KERN_ERR "Kprobe smoke test failed: "
164 "register_kretprobe returned %d\n", ret);
165 return ret;
166 }
167
168 ret = kprobe_target(rand1);
169 unregister_kretprobe(&rp);
170 if (krph_val == 0) {
171 printk(KERN_ERR "Kprobe smoke test failed: "
172 "kretprobe handler not called\n");
173 handler_errors++;
174 }
175
176 return 0;
177}
178#endif /* CONFIG_KRETPROBES */
179
180int init_test_probes(void)
181{
182 int ret;
183
184 do {
185 rand1 = random32();
186 } while (rand1 <= div_factor);
187
188 printk(KERN_INFO "Kprobe smoke test started\n");
189 num_tests++;
190 ret = test_kprobe();
191 if (ret < 0)
192 errors++;
193
194 num_tests++;
195 ret = test_jprobe();
196 if (ret < 0)
197 errors++;
198
199#ifdef CONFIG_KRETPROBES
200 num_tests++;
201 ret = test_kretprobe();
202 if (ret < 0)
203 errors++;
204#endif /* CONFIG_KRETPROBES */
205
206 if (errors)
207 printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
208 "%d tests failed\n", errors, num_tests);
209 else if (handler_errors)
210 printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
211 "running handlers\n", handler_errors);
212 else
213 printk(KERN_INFO "Kprobe smoke test passed successfully\n");
214
215 return 0;
216}
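
The new kernel/test_kprobes.c above exercises register_kprobe(), register_jprobe() and register_kretprobe() against its own kprobe_target() function. For readers unfamiliar with the API, the following is a minimal out-of-tree sketch of the same register_kprobe() pattern; it is not part of this patch, and the probed symbol ("do_fork") and module name are assumptions chosen only for illustration.

/*
 * Hedged sketch, not from this patch: a minimal module using the same
 * register_kprobe() pattern the smoke test exercises. The probed symbol
 * ("do_fork") is an assumption; any probe-able symbol would do.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static int sample_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;	/* let the probed instruction execute normally */
}

static struct kprobe sample_kp = {
	.symbol_name	= "do_fork",	/* assumed symbol for illustration */
	.pre_handler	= sample_pre_handler,
};

static int __init sample_init(void)
{
	int ret = register_kprobe(&sample_kp);
	if (ret < 0)
		printk(KERN_ERR "register_kprobe failed: %d\n", ret);
	return ret;
}

static void __exit sample_exit(void)
{
	unregister_kprobe(&sample_kp);
}

module_init(sample_init);
module_exit(sample_exit);
MODULE_LICENSE("GPL");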
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 822beebe664a..3e59fce6dd43 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch,
41{ 41{
42 u64 clc = ((u64) latch << evt->shift); 42 u64 clc = ((u64) latch << evt->shift);
43 43
44 if (unlikely(!evt->mult)) {
45 evt->mult = 1;
46 WARN_ON(1);
47 }
48
44 do_div(clc, evt->mult); 49 do_div(clc, evt->mult);
45 if (clc < 1000) 50 if (clc < 1000)
46 clc = 1000; 51 clc = 1000;
@@ -78,6 +83,11 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
78 unsigned long long clc; 83 unsigned long long clc;
79 int64_t delta; 84 int64_t delta;
80 85
86 if (unlikely(expires.tv64 < 0)) {
87 WARN_ON_ONCE(1);
88 return -ETIME;
89 }
90
81 delta = ktime_to_ns(ktime_sub(expires, now)); 91 delta = ktime_to_ns(ktime_sub(expires, now));
82 92
83 if (delta <= 0) 93 if (delta <= 0)
@@ -146,6 +156,14 @@ static void clockevents_notify_released(void)
146void clockevents_register_device(struct clock_event_device *dev) 156void clockevents_register_device(struct clock_event_device *dev)
147{ 157{
148 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159 /*
160 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
161 * on it, so fix it up and emit a warning:
162 */
163 if (unlikely(!dev->mult)) {
164 dev->mult = 1;
165 WARN_ON(1);
166 }
149 167
150 spin_lock(&clockevents_lock); 168 spin_lock(&clockevents_lock);
151 169
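
The clockevents.c hunks above guard against a multiplicator of zero before do_div() runs in clockevent_delta2ns() and again at device registration. As a rough illustration of why that matters, here is a user-space sketch of the same latch-to-nanoseconds arithmetic with the fixup; it is not from the patch, the upper max_delta_ns clamp is omitted, and the mult/shift values are merely illustrative (roughly a 1.19 MHz PIT with shift 32).

/*
 * Hedged sketch, illustration only: user-space arithmetic mirroring the
 * shift/divide done by clockevent_delta2ns(), showing why mult == 0 has to
 * be fixed up before the division.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t delta2ns(unsigned long latch, uint32_t mult, uint32_t shift)
{
	uint64_t clc = (uint64_t)latch << shift;

	if (mult == 0) {		/* same fixup the patch adds, minus WARN_ON */
		mult = 1;
		fprintf(stderr, "warning: mult was 0, fixed up to 1\n");
	}

	clc /= mult;			/* user-space stand-in for do_div() */
	if (clc < 1000)			/* same lower clamp as the kernel code */
		clc = 1000;
	return clc;
}

int main(void)
{
	/* illustrative values; prints roughly 55 ms worth of nanoseconds */
	printf("%llu ns\n", (unsigned long long)delta2ns(0xffff, 5124677, 32));
	return 0;
}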
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c8a9d13874df..6e9259a5d501 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data)
142 } 142 }
143 143
144 if (!list_empty(&watchdog_list)) { 144 if (!list_empty(&watchdog_list)) {
145 __mod_timer(&watchdog_timer, 145 /* Cycle through CPUs to check if the CPUs stay synchronized to
146 watchdog_timer.expires + WATCHDOG_INTERVAL); 146 * each other. */
147 int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
148 if (next_cpu >= NR_CPUS)
149 next_cpu = first_cpu(cpu_online_map);
150 watchdog_timer.expires += WATCHDOG_INTERVAL;
151 add_timer_on(&watchdog_timer, next_cpu);
147 } 152 }
148 spin_unlock(&watchdog_lock); 153 spin_unlock(&watchdog_lock);
149} 154}
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
165 if (!started && watchdog) { 170 if (!started && watchdog) {
166 watchdog_last = watchdog->read(); 171 watchdog_last = watchdog->read();
167 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 172 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
168 add_timer(&watchdog_timer); 173 add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
169 } 174 }
170 } else { 175 } else {
171 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 176 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
175 if (watchdog) 180 if (watchdog)
176 del_timer(&watchdog_timer); 181 del_timer(&watchdog_timer);
177 watchdog = cs; 182 watchdog = cs;
178 init_timer(&watchdog_timer); 183 init_timer_deferrable(&watchdog_timer);
179 watchdog_timer.function = clocksource_watchdog; 184 watchdog_timer.function = clocksource_watchdog;
180 185
181 /* Reset watchdog cycles */ 186 /* Reset watchdog cycles */
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
186 watchdog_last = watchdog->read(); 191 watchdog_last = watchdog->read();
187 watchdog_timer.expires = 192 watchdog_timer.expires =
188 jiffies + WATCHDOG_INTERVAL; 193 jiffies + WATCHDOG_INTERVAL;
189 add_timer(&watchdog_timer); 194 add_timer_on(&watchdog_timer,
195 first_cpu(cpu_online_map));
190 } 196 }
191 } 197 }
192 } 198 }
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
331 spin_unlock_irqrestore(&clocksource_lock, flags); 337 spin_unlock_irqrestore(&clocksource_lock, flags);
332} 338}
333 339
340/**
341 * clocksource_unregister - remove a registered clocksource
342 */
343void clocksource_unregister(struct clocksource *cs)
344{
345 unsigned long flags;
346
347 spin_lock_irqsave(&clocksource_lock, flags);
348 list_del(&cs->list);
349 if (clocksource_override == cs)
350 clocksource_override = NULL;
351 next_clocksource = select_clocksource();
352 spin_unlock_irqrestore(&clocksource_lock, flags);
353}
354
334#ifdef CONFIG_SYSFS 355#ifdef CONFIG_SYSFS
335/** 356/**
336 * sysfs_show_current_clocksources - sysfs interface for current clocksource 357 * sysfs_show_current_clocksources - sysfs interface for current clocksource
@@ -441,7 +462,7 @@ static SYSDEV_ATTR(available_clocksource, 0600,
441 sysfs_show_available_clocksources, NULL); 462 sysfs_show_available_clocksources, NULL);
442 463
443static struct sysdev_class clocksource_sysclass = { 464static struct sysdev_class clocksource_sysclass = {
444 set_kset_name("clocksource"), 465 .name = "clocksource",
445}; 466};
446 467
447static struct sys_device device_clocksource = { 468static struct sys_device device_clocksource = {
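
The clocksource.c change above makes the watchdog timer hop across the online CPUs (next_cpu()/first_cpu() plus add_timer_on()) instead of re-arming on whichever CPU it last ran on, and it adds clocksource_unregister(). The rotation idiom is sketched below in isolation; this is not code from the patch, and the helper name and interval parameter are made up for illustration.

/*
 * Hedged sketch, not from this patch: re-arm a timer on the next online CPU,
 * wrapping around after the last one, as the watchdog now does.
 */
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static void rearm_on_next_cpu(struct timer_list *timer, unsigned long interval)
{
	int cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);

	if (cpu >= NR_CPUS)			/* wrapped past the last CPU */
		cpu = first_cpu(cpu_online_map);

	timer->expires += interval;		/* keep a fixed period, no drift */
	add_timer_on(timer, cpu);
}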
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index aa82d7bf478a..e1bd50cbbf5d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
126/* 126/*
127 * Broadcast the event to the cpus, which are set in the mask 127 * Broadcast the event to the cpus, which are set in the mask
128 */ 128 */
129int tick_do_broadcast(cpumask_t mask) 129static void tick_do_broadcast(cpumask_t mask)
130{ 130{
131 int ret = 0, cpu = smp_processor_id(); 131 int cpu = smp_processor_id();
132 struct tick_device *td; 132 struct tick_device *td;
133 133
134 /* 134 /*
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask)
138 cpu_clear(cpu, mask); 138 cpu_clear(cpu, mask);
139 td = &per_cpu(tick_cpu_device, cpu); 139 td = &per_cpu(tick_cpu_device, cpu);
140 td->evtdev->event_handler(td->evtdev); 140 td->evtdev->event_handler(td->evtdev);
141 ret = 1;
142 } 141 }
143 142
144 if (!cpus_empty(mask)) { 143 if (!cpus_empty(mask)) {
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask)
151 cpu = first_cpu(mask); 150 cpu = first_cpu(mask);
152 td = &per_cpu(tick_cpu_device, cpu); 151 td = &per_cpu(tick_cpu_device, cpu);
153 td->evtdev->broadcast(mask); 152 td->evtdev->broadcast(mask);
154 ret = 1;
155 } 153 }
156 return ret;
157} 154}
158 155
159/* 156/*
@@ -384,45 +381,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
384} 381}
385 382
386/* 383/*
387 * Reprogram the broadcast device:
388 *
389 * Called with tick_broadcast_lock held and interrupts disabled.
390 */
391static int tick_broadcast_reprogram(void)
392{
393 ktime_t expires = { .tv64 = KTIME_MAX };
394 struct tick_device *td;
395 int cpu;
396
397 /*
398 * Find the event which expires next:
399 */
400 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
401 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
402 td = &per_cpu(tick_cpu_device, cpu);
403 if (td->evtdev->next_event.tv64 < expires.tv64)
404 expires = td->evtdev->next_event;
405 }
406
407 if (expires.tv64 == KTIME_MAX)
408 return 0;
409
410 return tick_broadcast_set_event(expires, 0);
411}
412
413/*
414 * Handle oneshot mode broadcasting 384 * Handle oneshot mode broadcasting
415 */ 385 */
416static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 386static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
417{ 387{
418 struct tick_device *td; 388 struct tick_device *td;
419 cpumask_t mask; 389 cpumask_t mask;
420 ktime_t now; 390 ktime_t now, next_event;
421 int cpu; 391 int cpu;
422 392
423 spin_lock(&tick_broadcast_lock); 393 spin_lock(&tick_broadcast_lock);
424again: 394again:
425 dev->next_event.tv64 = KTIME_MAX; 395 dev->next_event.tv64 = KTIME_MAX;
396 next_event.tv64 = KTIME_MAX;
426 mask = CPU_MASK_NONE; 397 mask = CPU_MASK_NONE;
427 now = ktime_get(); 398 now = ktime_get();
428 /* Find all expired events */ 399 /* Find all expired events */
@@ -431,19 +402,31 @@ again:
431 td = &per_cpu(tick_cpu_device, cpu); 402 td = &per_cpu(tick_cpu_device, cpu);
432 if (td->evtdev->next_event.tv64 <= now.tv64) 403 if (td->evtdev->next_event.tv64 <= now.tv64)
433 cpu_set(cpu, mask); 404 cpu_set(cpu, mask);
405 else if (td->evtdev->next_event.tv64 < next_event.tv64)
406 next_event.tv64 = td->evtdev->next_event.tv64;
434 } 407 }
435 408
436 /* 409 /*
437 * Wakeup the cpus which have an expired event. The broadcast 410 * Wakeup the cpus which have an expired event.
438 * device is reprogrammed in the return from idle code. 411 */
412 tick_do_broadcast(mask);
413
414 /*
415 * Two reasons for reprogram:
416 *
417 * - The global event did not expire any CPU local
418 * events. This happens in dyntick mode, as the maximum PIT
419 * delta is quite small.
420 *
421 * - There are pending events on sleeping CPUs which were not
422 * in the event mask
439 */ 423 */
440 if (!tick_do_broadcast(mask)) { 424 if (next_event.tv64 != KTIME_MAX) {
441 /* 425 /*
442 * The global event did not expire any CPU local 426 * Rearm the broadcast device. If event expired,
443 * events. This happens in dyntick mode, as the 427 * repeat the above
444 * maximum PIT delta is quite small.
445 */ 428 */
446 if (tick_broadcast_reprogram()) 429 if (tick_broadcast_set_event(next_event, 0))
447 goto again; 430 goto again;
448 } 431 }
449 spin_unlock(&tick_broadcast_lock); 432 spin_unlock(&tick_broadcast_lock);
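
The rewritten tick_handle_oneshot_broadcast() above folds the old tick_broadcast_reprogram() pass into the expiry scan: one loop both collects the CPUs whose events have expired and remembers the earliest still-pending expiry for re-arming the broadcast device. The pattern in isolation, as a plain user-space sketch with made-up data (not code from the patch):

/*
 * Hedged sketch, illustration only: single pass that picks out everything
 * already expired and tracks the earliest future expiry for one re-arm.
 */
#include <stdio.h>
#include <stdint.h>

#define N 4

int main(void)
{
	int64_t now = 1000;
	int64_t next_event[N] = { 900, 1500, 1000, 2000 };	/* per-"cpu" expiries */
	int64_t earliest = INT64_MAX;
	int expired_mask = 0, i;

	for (i = 0; i < N; i++) {
		if (next_event[i] <= now)
			expired_mask |= 1 << i;		/* would be woken now */
		else if (next_event[i] < earliest)
			earliest = next_event[i];	/* candidate for re-arm */
	}

	printf("expired mask 0x%x, re-arm at %lld\n",
	       expired_mask, (long long)earliest);
	return 0;
}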
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bb13f2724905..f13f2b7f4fd4 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
70 * Broadcasting support 70 * Broadcasting support
71 */ 71 */
72#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 72#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
73extern int tick_do_broadcast(cpumask_t mask);
74
75extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 73extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
76extern int tick_check_broadcast_device(struct clock_event_device *dev); 74extern int tick_check_broadcast_device(struct clock_event_device *dev);
77extern int tick_is_broadcast_device(struct clock_event_device *dev); 75extern int tick_is_broadcast_device(struct clock_event_device *dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb89fa8db110..63f24b550695 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Started by: Thomas Gleixner and Ingo Molnar 10 * Started by: Thomas Gleixner and Ingo Molnar
11 * 11 *
12 * For licencing details see kernel-base/COPYING 12 * Distribute under GPLv2.
13 */ 13 */
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
@@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void)
143 local_irq_restore(flags); 143 local_irq_restore(flags);
144} 144}
145 145
146void tick_nohz_stop_idle(int cpu)
147{
148 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
149
150 if (ts->idle_active) {
151 ktime_t now, delta;
152 now = ktime_get();
153 delta = ktime_sub(now, ts->idle_entrytime);
154 ts->idle_lastupdate = now;
155 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
156 ts->idle_active = 0;
157 }
158}
159
160static ktime_t tick_nohz_start_idle(int cpu)
161{
162 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
163 ktime_t now, delta;
164
165 now = ktime_get();
166 if (ts->idle_active) {
167 delta = ktime_sub(now, ts->idle_entrytime);
168 ts->idle_lastupdate = now;
169 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
170 }
171 ts->idle_entrytime = now;
172 ts->idle_active = 1;
173 return now;
174}
175
176u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
177{
178 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
179
180 *last_update_time = ktime_to_us(ts->idle_lastupdate);
181 return ktime_to_us(ts->idle_sleeptime);
182}
183
146/** 184/**
147 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 185 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
148 * 186 *
@@ -153,14 +191,16 @@ void tick_nohz_update_jiffies(void)
153void tick_nohz_stop_sched_tick(void) 191void tick_nohz_stop_sched_tick(void)
154{ 192{
155 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
194 unsigned long rt_jiffies;
156 struct tick_sched *ts; 195 struct tick_sched *ts;
157 ktime_t last_update, expires, now, delta; 196 ktime_t last_update, expires, now;
158 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 197 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
159 int cpu; 198 int cpu;
160 199
161 local_irq_save(flags); 200 local_irq_save(flags);
162 201
163 cpu = smp_processor_id(); 202 cpu = smp_processor_id();
203 now = tick_nohz_start_idle(cpu);
164 ts = &per_cpu(tick_cpu_sched, cpu); 204 ts = &per_cpu(tick_cpu_sched, cpu);
165 205
166 /* 206 /*
@@ -192,19 +232,7 @@ void tick_nohz_stop_sched_tick(void)
192 } 232 }
193 } 233 }
194 234
195 now = ktime_get();
196 /*
197 * When called from irq_exit we need to account the idle sleep time
198 * correctly.
199 */
200 if (ts->tick_stopped) {
201 delta = ktime_sub(now, ts->idle_entrytime);
202 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
203 }
204
205 ts->idle_entrytime = now;
206 ts->idle_calls++; 235 ts->idle_calls++;
207
208 /* Read jiffies and the time when jiffies were updated last */ 236 /* Read jiffies and the time when jiffies were updated last */
209 do { 237 do {
210 seq = read_seqbegin(&xtime_lock); 238 seq = read_seqbegin(&xtime_lock);
@@ -216,6 +244,10 @@ void tick_nohz_stop_sched_tick(void)
216 next_jiffies = get_next_timer_interrupt(last_jiffies); 244 next_jiffies = get_next_timer_interrupt(last_jiffies);
217 delta_jiffies = next_jiffies - last_jiffies; 245 delta_jiffies = next_jiffies - last_jiffies;
218 246
247 rt_jiffies = rt_needs_cpu(cpu);
248 if (rt_jiffies && rt_jiffies < delta_jiffies)
249 delta_jiffies = rt_jiffies;
250
219 if (rcu_needs_cpu(cpu)) 251 if (rcu_needs_cpu(cpu))
220 delta_jiffies = 1; 252 delta_jiffies = 1;
221 /* 253 /*
@@ -291,7 +323,7 @@ void tick_nohz_stop_sched_tick(void)
291 /* Check, if the timer was already in the past */ 323 /* Check, if the timer was already in the past */
292 if (hrtimer_active(&ts->sched_timer)) 324 if (hrtimer_active(&ts->sched_timer))
293 goto out; 325 goto out;
294 } else if(!tick_program_event(expires, 0)) 326 } else if (!tick_program_event(expires, 0))
295 goto out; 327 goto out;
296 /* 328 /*
297 * We are past the event already. So we crossed a 329 * We are past the event already. So we crossed a
@@ -332,23 +364,22 @@ void tick_nohz_restart_sched_tick(void)
332 int cpu = smp_processor_id(); 364 int cpu = smp_processor_id();
333 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 365 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
334 unsigned long ticks; 366 unsigned long ticks;
335 ktime_t now, delta; 367 ktime_t now;
336 368
337 if (!ts->tick_stopped) 369 local_irq_disable();
370 tick_nohz_stop_idle(cpu);
371
372 if (!ts->tick_stopped) {
373 local_irq_enable();
338 return; 374 return;
375 }
339 376
340 /* Update jiffies first */ 377 /* Update jiffies first */
341 now = ktime_get();
342
343 local_irq_disable();
344 select_nohz_load_balancer(0); 378 select_nohz_load_balancer(0);
379 now = ktime_get();
345 tick_do_update_jiffies64(now); 380 tick_do_update_jiffies64(now);
346 cpu_clear(cpu, nohz_cpu_mask); 381 cpu_clear(cpu, nohz_cpu_mask);
347 382
348 /* Account the idle time */
349 delta = ktime_sub(now, ts->idle_entrytime);
350 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
351
352 /* 383 /*
353 * We stopped the tick in idle. Update process times would miss the 384 * We stopped the tick in idle. Update process times would miss the
354 * time we slept as update_process_times does only a 1 tick 385 * time we slept as update_process_times does only a 1 tick
@@ -502,14 +533,13 @@ static inline void tick_nohz_switch_to_nohz(void) { }
502 */ 533 */
503#ifdef CONFIG_HIGH_RES_TIMERS 534#ifdef CONFIG_HIGH_RES_TIMERS
504/* 535/*
505 * We rearm the timer until we get disabled by the idle code 536 * We rearm the timer until we get disabled by the idle code.
506 * Called with interrupts disabled and timer->base->cpu_base->lock held. 537 * Called with interrupts disabled and timer->base->cpu_base->lock held.
507 */ 538 */
508static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 539static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
509{ 540{
510 struct tick_sched *ts = 541 struct tick_sched *ts =
511 container_of(timer, struct tick_sched, sched_timer); 542 container_of(timer, struct tick_sched, sched_timer);
512 struct hrtimer_cpu_base *base = timer->base->cpu_base;
513 struct pt_regs *regs = get_irq_regs(); 543 struct pt_regs *regs = get_irq_regs();
514 ktime_t now = ktime_get(); 544 ktime_t now = ktime_get();
515 int cpu = smp_processor_id(); 545 int cpu = smp_processor_id();
@@ -547,15 +577,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
547 touch_softlockup_watchdog(); 577 touch_softlockup_watchdog();
548 ts->idle_jiffies++; 578 ts->idle_jiffies++;
549 } 579 }
550 /*
551 * update_process_times() might take tasklist_lock, hence
552 * drop the base lock. sched-tick hrtimers are per-CPU and
553 * never accessible by userspace APIs, so this is safe to do.
554 */
555 spin_unlock(&base->lock);
556 update_process_times(user_mode(regs)); 580 update_process_times(user_mode(regs));
557 profile_tick(CPU_PROFILING); 581 profile_tick(CPU_PROFILING);
558 spin_lock(&base->lock);
559 } 582 }
560 583
561 /* Do not restart, when we are in the idle loop */ 584 /* Do not restart, when we are in the idle loop */
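
The tick-sched.c hunks above move idle-time accounting into tick_nohz_start_idle()/tick_nohz_stop_idle() and expose the totals through get_cpu_idle_time_us(). A sketch of how a consumer might read that accounting follows; it is not part of the patch, the printing helper is invented, and it assumes the matching prototype is exported via <linux/tick.h> elsewhere in this series.

/*
 * Hedged sketch, not from this patch: read the per-cpu idle accounting
 * exposed by get_cpu_idle_time_us(). Assumes a <linux/tick.h> prototype.
 */
#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <linux/tick.h>

static void print_idle_times(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		u64 last_update;
		u64 idle_us = get_cpu_idle_time_us(cpu, &last_update);

		printk(KERN_INFO "cpu%d: %llu us idle (last update %llu us)\n",
		       cpu, (unsigned long long)idle_us,
		       (unsigned long long)last_update);
	}
}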
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e5e466b27598..092a2366b5a9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void)
82} 82}
83 83
84/** 84/**
85 * __get_realtime_clock_ts - Returns the time of day in a timespec 85 * getnstimeofday - Returns the time of day in a timespec
86 * @ts: pointer to the timespec to be set 86 * @ts: pointer to the timespec to be set
87 * 87 *
88 * Returns the time of day in a timespec. Used by 88 * Returns the time of day in a timespec.
89 * do_gettimeofday() and get_realtime_clock_ts().
90 */ 89 */
91static inline void __get_realtime_clock_ts(struct timespec *ts) 90void getnstimeofday(struct timespec *ts)
92{ 91{
93 unsigned long seq; 92 unsigned long seq;
94 s64 nsecs; 93 s64 nsecs;
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts)
104 timespec_add_ns(ts, nsecs); 103 timespec_add_ns(ts, nsecs);
105} 104}
106 105
107/**
108 * getnstimeofday - Returns the time of day in a timespec
109 * @ts: pointer to the timespec to be set
110 *
111 * Returns the time of day in a timespec.
112 */
113void getnstimeofday(struct timespec *ts)
114{
115 __get_realtime_clock_ts(ts);
116}
117
118EXPORT_SYMBOL(getnstimeofday); 106EXPORT_SYMBOL(getnstimeofday);
119 107
120/** 108/**
121 * do_gettimeofday - Returns the time of day in a timeval 109 * do_gettimeofday - Returns the time of day in a timeval
122 * @tv: pointer to the timeval to be set 110 * @tv: pointer to the timeval to be set
123 * 111 *
124 * NOTE: Users should be converted to using get_realtime_clock_ts() 112 * NOTE: Users should be converted to using getnstimeofday()
125 */ 113 */
126void do_gettimeofday(struct timeval *tv) 114void do_gettimeofday(struct timeval *tv)
127{ 115{
128 struct timespec now; 116 struct timespec now;
129 117
130 __get_realtime_clock_ts(&now); 118 getnstimeofday(&now);
131 tv->tv_sec = now.tv_sec; 119 tv->tv_sec = now.tv_sec;
132 tv->tv_usec = now.tv_nsec/1000; 120 tv->tv_usec = now.tv_nsec/1000;
133} 121}
@@ -198,7 +186,8 @@ static void change_clocksource(void)
198 186
199 clock->error = 0; 187 clock->error = 0;
200 clock->xtime_nsec = 0; 188 clock->xtime_nsec = 0;
201 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 189 clocksource_calculate_interval(clock,
190 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
202 191
203 tick_clock_notify(); 192 tick_clock_notify();
204 193
@@ -255,7 +244,8 @@ void __init timekeeping_init(void)
255 ntp_clear(); 244 ntp_clear();
256 245
257 clock = clocksource_get_next(); 246 clock = clocksource_get_next();
258 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 247 clocksource_calculate_interval(clock,
248 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
259 clock->cycle_last = clocksource_read(clock); 249 clock->cycle_last = clocksource_read(clock);
260 250
261 xtime.tv_sec = sec; 251 xtime.tv_sec = sec;
@@ -335,9 +325,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
335 325
336/* sysfs resume/suspend bits for timekeeping */ 326/* sysfs resume/suspend bits for timekeeping */
337static struct sysdev_class timekeeping_sysclass = { 327static struct sysdev_class timekeeping_sysclass = {
328 .name = "timekeeping",
338 .resume = timekeeping_resume, 329 .resume = timekeeping_resume,
339 .suspend = timekeeping_suspend, 330 .suspend = timekeeping_suspend,
340 set_kset_name("timekeeping"),
341}; 331};
342 332
343static struct sys_device device_timer = { 333static struct sys_device device_timer = {
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c36bb7ed0301..417da8c5bc72 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -26,7 +26,7 @@
26 * the pid and cmdline from the owner process if applicable. 26 * the pid and cmdline from the owner process if applicable.
27 * 27 *
28 * Start/stop data collection: 28 * Start/stop data collection:
29 * # echo 1[0] >/proc/timer_stats 29 * # echo [1|0] >/proc/timer_stats
30 * 30 *
31 * Display the information collected so far: 31 * Display the information collected so far:
32 * # cat /proc/timer_stats 32 * # cat /proc/timer_stats
diff --git a/kernel/timer.c b/kernel/timer.c
index 66d7d8bca1a3..9fbb472b8cf0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64);
58#define TVN_MASK (TVN_SIZE - 1) 58#define TVN_MASK (TVN_SIZE - 1)
59#define TVR_MASK (TVR_SIZE - 1) 59#define TVR_MASK (TVR_SIZE - 1)
60 60
61typedef struct tvec_s { 61struct tvec {
62 struct list_head vec[TVN_SIZE]; 62 struct list_head vec[TVN_SIZE];
63} tvec_t; 63};
64 64
65typedef struct tvec_root_s { 65struct tvec_root {
66 struct list_head vec[TVR_SIZE]; 66 struct list_head vec[TVR_SIZE];
67} tvec_root_t; 67};
68 68
69struct tvec_t_base_s { 69struct tvec_base {
70 spinlock_t lock; 70 spinlock_t lock;
71 struct timer_list *running_timer; 71 struct timer_list *running_timer;
72 unsigned long timer_jiffies; 72 unsigned long timer_jiffies;
73 tvec_root_t tv1; 73 struct tvec_root tv1;
74 tvec_t tv2; 74 struct tvec tv2;
75 tvec_t tv3; 75 struct tvec tv3;
76 tvec_t tv4; 76 struct tvec tv4;
77 tvec_t tv5; 77 struct tvec tv5;
78} ____cacheline_aligned; 78} ____cacheline_aligned;
79 79
80typedef struct tvec_t_base_s tvec_base_t; 80struct tvec_base boot_tvec_bases;
81
82tvec_base_t boot_tvec_bases;
83EXPORT_SYMBOL(boot_tvec_bases); 81EXPORT_SYMBOL(boot_tvec_bases);
84static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 82static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
85 83
86/* 84/*
87 * Note that all tvec_bases is 2 byte aligned and lower bit of 85 * Note that all tvec_bases are 2 byte aligned and lower bit of
88 * base in timer_list is guaranteed to be zero. Use the LSB for 86 * base in timer_list is guaranteed to be zero. Use the LSB for
89 * the new flag to indicate whether the timer is deferrable 87 * the new flag to indicate whether the timer is deferrable
90 */ 88 */
91#define TBASE_DEFERRABLE_FLAG (0x1) 89#define TBASE_DEFERRABLE_FLAG (0x1)
92 90
93/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
94static inline unsigned int tbase_get_deferrable(tvec_base_t *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
95{ 93{
96 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 94 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
97} 95}
98 96
99static inline tvec_base_t *tbase_get_base(tvec_base_t *base) 97static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
100{ 98{
101 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 99 return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
102} 100}
103 101
104static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
105{ 103{
106 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 104 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
107 TBASE_DEFERRABLE_FLAG)); 105 TBASE_DEFERRABLE_FLAG));
108} 106}
109 107
110static inline void 108static inline void
111timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 109timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
112{ 110{
113 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 111 timer->base = (struct tvec_base *)((unsigned long)(new_base) |
114 tbase_get_deferrable(timer->base)); 112 tbase_get_deferrable(timer->base));
115} 113}
116 114
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j)
246EXPORT_SYMBOL_GPL(round_jiffies_relative); 244EXPORT_SYMBOL_GPL(round_jiffies_relative);
247 245
248 246
249static inline void set_running_timer(tvec_base_t *base, 247static inline void set_running_timer(struct tvec_base *base,
250 struct timer_list *timer) 248 struct timer_list *timer)
251{ 249{
252#ifdef CONFIG_SMP 250#ifdef CONFIG_SMP
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base,
254#endif 252#endif
255} 253}
256 254
257static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) 255static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
258{ 256{
259 unsigned long expires = timer->expires; 257 unsigned long expires = timer->expires;
260 unsigned long idx = expires - base->timer_jiffies; 258 unsigned long idx = expires - base->timer_jiffies;
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer,
371 * possible to set timer->base = NULL and drop the lock: the timer remains 369 * possible to set timer->base = NULL and drop the lock: the timer remains
372 * locked. 370 * locked.
373 */ 371 */
374static tvec_base_t *lock_timer_base(struct timer_list *timer, 372static struct tvec_base *lock_timer_base(struct timer_list *timer,
375 unsigned long *flags) 373 unsigned long *flags)
376 __acquires(timer->base->lock) 374 __acquires(timer->base->lock)
377{ 375{
378 tvec_base_t *base; 376 struct tvec_base *base;
379 377
380 for (;;) { 378 for (;;) {
381 tvec_base_t *prelock_base = timer->base; 379 struct tvec_base *prelock_base = timer->base;
382 base = tbase_get_base(prelock_base); 380 base = tbase_get_base(prelock_base);
383 if (likely(base != NULL)) { 381 if (likely(base != NULL)) {
384 spin_lock_irqsave(&base->lock, *flags); 382 spin_lock_irqsave(&base->lock, *flags);
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
393 391
394int __mod_timer(struct timer_list *timer, unsigned long expires) 392int __mod_timer(struct timer_list *timer, unsigned long expires)
395{ 393{
396 tvec_base_t *base, *new_base; 394 struct tvec_base *base, *new_base;
397 unsigned long flags; 395 unsigned long flags;
398 int ret = 0; 396 int ret = 0;
399 397
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer);
445 */ 443 */
446void add_timer_on(struct timer_list *timer, int cpu) 444void add_timer_on(struct timer_list *timer, int cpu)
447{ 445{
448 tvec_base_t *base = per_cpu(tvec_bases, cpu); 446 struct tvec_base *base = per_cpu(tvec_bases, cpu);
449 unsigned long flags; 447 unsigned long flags;
450 448
451 timer_stats_timer_set_start_info(timer); 449 timer_stats_timer_set_start_info(timer);
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer);
508 */ 506 */
509int del_timer(struct timer_list *timer) 507int del_timer(struct timer_list *timer)
510{ 508{
511 tvec_base_t *base; 509 struct tvec_base *base;
512 unsigned long flags; 510 unsigned long flags;
513 int ret = 0; 511 int ret = 0;
514 512
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer);
539 */ 537 */
540int try_to_del_timer_sync(struct timer_list *timer) 538int try_to_del_timer_sync(struct timer_list *timer)
541{ 539{
542 tvec_base_t *base; 540 struct tvec_base *base;
543 unsigned long flags; 541 unsigned long flags;
544 int ret = -1; 542 int ret = -1;
545 543
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer)
591EXPORT_SYMBOL(del_timer_sync); 589EXPORT_SYMBOL(del_timer_sync);
592#endif 590#endif
593 591
594static int cascade(tvec_base_t *base, tvec_t *tv, int index) 592static int cascade(struct tvec_base *base, struct tvec *tv, int index)
595{ 593{
596 /* cascade all the timers from tv up one level */ 594 /* cascade all the timers from tv up one level */
597 struct timer_list *timer, *tmp; 595 struct timer_list *timer, *tmp;
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
620 * This function cascades all vectors and executes all expired timer 618 * This function cascades all vectors and executes all expired timer
621 * vectors. 619 * vectors.
622 */ 620 */
623static inline void __run_timers(tvec_base_t *base) 621static inline void __run_timers(struct tvec_base *base)
624{ 622{
625 struct timer_list *timer; 623 struct timer_list *timer;
626 624
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base)
657 int preempt_count = preempt_count(); 655 int preempt_count = preempt_count();
658 fn(data); 656 fn(data);
659 if (preempt_count != preempt_count()) { 657 if (preempt_count != preempt_count()) {
660 printk(KERN_WARNING "huh, entered %p " 658 printk(KERN_ERR "huh, entered %p "
661 "with preempt_count %08x, exited" 659 "with preempt_count %08x, exited"
662 " with %08x?\n", 660 " with %08x?\n",
663 fn, preempt_count, 661 fn, preempt_count,
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base)
678 * is used on S/390 to stop all activity when a cpus is idle. 676 * is used on S/390 to stop all activity when a cpus is idle.
679 * This functions needs to be called disabled. 677 * This functions needs to be called disabled.
680 */ 678 */
681static unsigned long __next_timer_interrupt(tvec_base_t *base) 679static unsigned long __next_timer_interrupt(struct tvec_base *base)
682{ 680{
683 unsigned long timer_jiffies = base->timer_jiffies; 681 unsigned long timer_jiffies = base->timer_jiffies;
684 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; 682 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
685 int index, slot, array, found = 0; 683 int index, slot, array, found = 0;
686 struct timer_list *nte; 684 struct timer_list *nte;
687 tvec_t *varray[4]; 685 struct tvec *varray[4];
688 686
689 /* Look for timer events in tv1. */ 687 /* Look for timer events in tv1. */
690 index = slot = timer_jiffies & TVR_MASK; 688 index = slot = timer_jiffies & TVR_MASK;
@@ -716,7 +714,7 @@ cascade:
716 varray[3] = &base->tv5; 714 varray[3] = &base->tv5;
717 715
718 for (array = 0; array < 4; array++) { 716 for (array = 0; array < 4; array++) {
719 tvec_t *varp = varray[array]; 717 struct tvec *varp = varray[array];
720 718
721 index = slot = timer_jiffies & TVN_MASK; 719 index = slot = timer_jiffies & TVN_MASK;
722 do { 720 do {
@@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
795 */ 793 */
796unsigned long get_next_timer_interrupt(unsigned long now) 794unsigned long get_next_timer_interrupt(unsigned long now)
797{ 795{
798 tvec_base_t *base = __get_cpu_var(tvec_bases); 796 struct tvec_base *base = __get_cpu_var(tvec_bases);
799 unsigned long expires; 797 unsigned long expires;
800 798
801 spin_lock(&base->lock); 799 spin_lock(&base->lock);
@@ -894,9 +892,9 @@ static inline void calc_load(unsigned long ticks)
894 */ 892 */
895static void run_timer_softirq(struct softirq_action *h) 893static void run_timer_softirq(struct softirq_action *h)
896{ 894{
897 tvec_base_t *base = __get_cpu_var(tvec_bases); 895 struct tvec_base *base = __get_cpu_var(tvec_bases);
898 896
899 hrtimer_run_queues(); 897 hrtimer_run_pending();
900 898
901 if (time_after_eq(jiffies, base->timer_jiffies)) 899 if (time_after_eq(jiffies, base->timer_jiffies))
902 __run_timers(base); 900 __run_timers(base);
@@ -907,6 +905,7 @@ static void run_timer_softirq(struct softirq_action *h)
907 */ 905 */
908void run_local_timers(void) 906void run_local_timers(void)
909{ 907{
908 hrtimer_run_queues();
910 raise_softirq(TIMER_SOFTIRQ); 909 raise_softirq(TIMER_SOFTIRQ);
911 softlockup_tick(); 910 softlockup_tick();
912} 911}
@@ -978,7 +977,7 @@ asmlinkage long sys_getppid(void)
978 int pid; 977 int pid;
979 978
980 rcu_read_lock(); 979 rcu_read_lock();
981 pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns); 980 pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns);
982 rcu_read_unlock(); 981 rcu_read_unlock();
983 982
984 return pid; 983 return pid;
@@ -1226,11 +1225,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1226 */ 1225 */
1227static struct lock_class_key base_lock_keys[NR_CPUS]; 1226static struct lock_class_key base_lock_keys[NR_CPUS];
1228 1227
1229static int __devinit init_timers_cpu(int cpu) 1228static int __cpuinit init_timers_cpu(int cpu)
1230{ 1229{
1231 int j; 1230 int j;
1232 tvec_base_t *base; 1231 struct tvec_base *base;
1233 static char __devinitdata tvec_base_done[NR_CPUS]; 1232 static char __cpuinitdata tvec_base_done[NR_CPUS];
1234 1233
1235 if (!tvec_base_done[cpu]) { 1234 if (!tvec_base_done[cpu]) {
1236 static char boot_done; 1235 static char boot_done;
@@ -1284,7 +1283,7 @@ static int __devinit init_timers_cpu(int cpu)
1284} 1283}
1285 1284
1286#ifdef CONFIG_HOTPLUG_CPU 1285#ifdef CONFIG_HOTPLUG_CPU
1287static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1286static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1288{ 1287{
1289 struct timer_list *timer; 1288 struct timer_list *timer;
1290 1289
@@ -1296,10 +1295,10 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1296 } 1295 }
1297} 1296}
1298 1297
1299static void __devinit migrate_timers(int cpu) 1298static void __cpuinit migrate_timers(int cpu)
1300{ 1299{
1301 tvec_base_t *old_base; 1300 struct tvec_base *old_base;
1302 tvec_base_t *new_base; 1301 struct tvec_base *new_base;
1303 int i; 1302 int i;
1304 1303
1305 BUG_ON(cpu_online(cpu)); 1304 BUG_ON(cpu_online(cpu));
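
The timer.c cleanup above (tvec_t and friends becoming plain struct tags) leaves the TBASE_DEFERRABLE_FLAG trick intact: because struct tvec_base is ____cacheline_aligned, the low bit of every base pointer is known to be zero and is reused to carry the "deferrable" flag, as tbase_get_deferrable()/tbase_get_base()/timer_set_deferrable() show. The pointer-tagging idiom in isolation, as a standalone user-space sketch with an invented struct (not code from the patch):

/*
 * Hedged sketch, illustration only: stash a one-bit flag in the known-zero
 * low bit of an aligned pointer, as timer.c does for the deferrable flag.
 */
#include <stdio.h>
#include <stdint.h>

#define DEFERRABLE_FLAG 0x1UL

struct base { int dummy; } __attribute__((aligned(64)));	/* stand-in for tvec_base */

static unsigned int get_deferrable(struct base *tagged)
{
	return (uintptr_t)tagged & DEFERRABLE_FLAG;
}

static struct base *get_base(struct base *tagged)
{
	return (struct base *)((uintptr_t)tagged & ~DEFERRABLE_FLAG);
}

static struct base *set_deferrable(struct base *b)
{
	return (struct base *)((uintptr_t)b | DEFERRABLE_FLAG);
}

int main(void)
{
	static struct base b;
	struct base *tagged = set_deferrable(&b);

	printf("deferrable=%u, base ok=%d\n",
	       get_deferrable(tagged), get_base(tagged) == &b);
	return 0;
}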
diff --git a/kernel/user.c b/kernel/user.c
index 8320a87f3e5a..bc1c48d35cb3 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { }
115 115
116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) 116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
117 117
118static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ 118static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
119static DEFINE_MUTEX(uids_mutex); 119static DEFINE_MUTEX(uids_mutex);
120 120
121static inline void uids_mutex_lock(void) 121static inline void uids_mutex_lock(void)
@@ -128,86 +128,83 @@ static inline void uids_mutex_unlock(void)
128 mutex_unlock(&uids_mutex); 128 mutex_unlock(&uids_mutex);
129} 129}
130 130
131/* return cpu shares held by the user */ 131/* uid directory attributes */
132static ssize_t cpu_shares_show(struct kset *kset, char *buffer) 132static ssize_t cpu_shares_show(struct kobject *kobj,
133 struct kobj_attribute *attr,
134 char *buf)
133{ 135{
134 struct user_struct *up = container_of(kset, struct user_struct, kset); 136 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
135 137
136 return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); 138 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
137} 139}
138 140
139/* modify cpu shares held by the user */ 141static ssize_t cpu_shares_store(struct kobject *kobj,
140static ssize_t cpu_shares_store(struct kset *kset, const char *buffer, 142 struct kobj_attribute *attr,
141 size_t size) 143 const char *buf, size_t size)
142{ 144{
143 struct user_struct *up = container_of(kset, struct user_struct, kset); 145 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144 unsigned long shares; 146 unsigned long shares;
145 int rc; 147 int rc;
146 148
147 sscanf(buffer, "%lu", &shares); 149 sscanf(buf, "%lu", &shares);
148 150
149 rc = sched_group_set_shares(up->tg, shares); 151 rc = sched_group_set_shares(up->tg, shares);
150 152
151 return (rc ? rc : size); 153 return (rc ? rc : size);
152} 154}
153 155
154static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) 156static struct kobj_attribute cpu_share_attr =
157 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
158
159/* default attributes per uid directory */
160static struct attribute *uids_attributes[] = {
161 &cpu_share_attr.attr,
162 NULL
163};
164
165/* the lifetime of user_struct is not managed by the core (now) */
166static void uids_release(struct kobject *kobj)
155{ 167{
156 sa->attr.name = name; 168 return;
157 sa->attr.mode = mode;
158 sa->show = cpu_shares_show;
159 sa->store = cpu_shares_store;
160} 169}
161 170
162/* Create "/sys/kernel/uids/<uid>" directory and 171static struct kobj_type uids_ktype = {
163 * "/sys/kernel/uids/<uid>/cpu_share" file for this user. 172 .sysfs_ops = &kobj_sysfs_ops,
164 */ 173 .default_attrs = uids_attributes,
165static int user_kobject_create(struct user_struct *up) 174 .release = uids_release,
175};
176
177/* create /sys/kernel/uids/<uid>/cpu_share file for this user */
178static int uids_user_create(struct user_struct *up)
166{ 179{
167 struct kset *kset = &up->kset; 180 struct kobject *kobj = &up->kobj;
168 struct kobject *kobj = &kset->kobj;
169 int error; 181 int error;
170 182
171 memset(kset, 0, sizeof(struct kset)); 183 memset(kobj, 0, sizeof(struct kobject));
172 kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ 184 kobj->kset = uids_kset;
173 kobject_set_name(kobj, "%d", up->uid); 185 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
174 kset_init(kset); 186 if (error) {
175 user_attr_init(&up->user_attr, "cpu_share", 0644); 187 kobject_put(kobj);
176
177 error = kobject_add(kobj);
178 if (error)
179 goto done; 188 goto done;
180 189 }
181 error = sysfs_create_file(kobj, &up->user_attr.attr);
182 if (error)
183 kobject_del(kobj);
184 190
185 kobject_uevent(kobj, KOBJ_ADD); 191 kobject_uevent(kobj, KOBJ_ADD);
186
187done: 192done:
188 return error; 193 return error;
189} 194}
190 195
191/* create these in sysfs filesystem: 196/* create these entries in sysfs:
192 * "/sys/kernel/uids" directory 197 * "/sys/kernel/uids" directory
193 * "/sys/kernel/uids/0" directory (for root user) 198 * "/sys/kernel/uids/0" directory (for root user)
194 * "/sys/kernel/uids/0/cpu_share" file (for root user) 199 * "/sys/kernel/uids/0/cpu_share" file (for root user)
195 */ 200 */
196int __init uids_kobject_init(void) 201int __init uids_sysfs_init(void)
197{ 202{
198 int error; 203 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
199 204 if (!uids_kset)
200 /* create under /sys/kernel dir */ 205 return -ENOMEM;
201 uids_kobject.parent = &kernel_subsys.kobj;
202 uids_kobject.kset = &kernel_subsys;
203 kobject_set_name(&uids_kobject, "uids");
204 kobject_init(&uids_kobject);
205 206
206 error = kobject_add(&uids_kobject); 207 return uids_user_create(&root_user);
207 if (!error)
208 error = user_kobject_create(&root_user);
209
210 return error;
211} 208}
212 209
213/* work function to remove sysfs directory for a user and free up 210/* work function to remove sysfs directory for a user and free up
@@ -216,7 +213,6 @@ int __init uids_kobject_init(void)
216static void remove_user_sysfs_dir(struct work_struct *w) 213static void remove_user_sysfs_dir(struct work_struct *w)
217{ 214{
218 struct user_struct *up = container_of(w, struct user_struct, work); 215 struct user_struct *up = container_of(w, struct user_struct, work);
219 struct kobject *kobj = &up->kset.kobj;
220 unsigned long flags; 216 unsigned long flags;
221 int remove_user = 0; 217 int remove_user = 0;
222 218
@@ -238,9 +234,9 @@ static void remove_user_sysfs_dir(struct work_struct *w)
238 if (!remove_user) 234 if (!remove_user)
239 goto done; 235 goto done;
240 236
241 sysfs_remove_file(kobj, &up->user_attr.attr); 237 kobject_uevent(&up->kobj, KOBJ_REMOVE);
242 kobject_uevent(kobj, KOBJ_REMOVE); 238 kobject_del(&up->kobj);
243 kobject_del(kobj); 239 kobject_put(&up->kobj);
244 240
245 sched_destroy_user(up); 241 sched_destroy_user(up);
246 key_put(up->uid_keyring); 242 key_put(up->uid_keyring);
@@ -267,7 +263,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
267 263
268#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ 264#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
269 265
270static inline int user_kobject_create(struct user_struct *up) { return 0; } 266int uids_sysfs_init(void) { return 0; }
267static inline int uids_user_create(struct user_struct *up) { return 0; }
271static inline void uids_mutex_lock(void) { } 268static inline void uids_mutex_lock(void) { }
272static inline void uids_mutex_unlock(void) { } 269static inline void uids_mutex_unlock(void) { }
273 270
@@ -322,9 +319,9 @@ void free_uid(struct user_struct *up)
322struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 319struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
323{ 320{
324 struct hlist_head *hashent = uidhashentry(ns, uid); 321 struct hlist_head *hashent = uidhashentry(ns, uid);
325 struct user_struct *up; 322 struct user_struct *up, *new;
326 323
327 /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() 324 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
328 * atomic. 325 * atomic.
329 */ 326 */
330 uids_mutex_lock(); 327 uids_mutex_lock();
@@ -334,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
334 spin_unlock_irq(&uidhash_lock); 331 spin_unlock_irq(&uidhash_lock);
335 332
336 if (!up) { 333 if (!up) {
337 struct user_struct *new;
338
339 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); 334 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
340 if (!new) { 335 if (!new)
341 uids_mutex_unlock(); 336 goto out_unlock;
342 return NULL;
343 }
344 337
345 new->uid = uid; 338 new->uid = uid;
346 atomic_set(&new->__count, 1); 339 atomic_set(&new->__count, 1);
@@ -356,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
356#endif 349#endif
357 new->locked_shm = 0; 350 new->locked_shm = 0;
358 351
359 if (alloc_uid_keyring(new, current) < 0) { 352 if (alloc_uid_keyring(new, current) < 0)
360 kmem_cache_free(uid_cachep, new); 353 goto out_free_user;
361 uids_mutex_unlock();
362 return NULL;
363 }
364 354
365 if (sched_create_user(new) < 0) { 355 if (sched_create_user(new) < 0)
366 key_put(new->uid_keyring); 356 goto out_put_keys;
367 key_put(new->session_keyring);
368 kmem_cache_free(uid_cachep, new);
369 uids_mutex_unlock();
370 return NULL;
371 }
372 357
373 if (user_kobject_create(new)) { 358 if (uids_user_create(new))
374 sched_destroy_user(new); 359 goto out_destoy_sched;
375 key_put(new->uid_keyring);
376 key_put(new->session_keyring);
377 kmem_cache_free(uid_cachep, new);
378 uids_mutex_unlock();
379 return NULL;
380 }
381 360
382 /* 361 /*
383 * Before adding this, check whether we raced 362 * Before adding this, check whether we raced
@@ -405,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
405 uids_mutex_unlock(); 384 uids_mutex_unlock();
406 385
407 return up; 386 return up;
387
388out_destoy_sched:
389 sched_destroy_user(new);
390out_put_keys:
391 key_put(new->uid_keyring);
392 key_put(new->session_keyring);
393out_free_user:
394 kmem_cache_free(uid_cachep, new);
395out_unlock:
396 uids_mutex_unlock();
397 return NULL;
408} 398}
409 399
410void switch_uid(struct user_struct *new_user) 400void switch_uid(struct user_struct *new_user)
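
The user.c conversion above replaces the embedded per-uid kset with a plain kobject: a single kset created via kset_create_and_add() under kernel_kobj, member directories added with kobject_init_and_add(), attributes expressed as kobj_attribute show/store pairs. The generic shape of that new-style pattern is sketched below; it is not from the patch, and the "example" names, the static kobject, and the placeholder attribute value are invented for illustration.

/*
 * Hedged sketch, not from this patch: the kset_create_and_add() +
 * kobject_init_and_add() pattern user.c now follows. All names are made up.
 */
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/module.h>

static ssize_t value_show(struct kobject *kobj, struct kobj_attribute *attr,
			  char *buf)
{
	return sprintf(buf, "%d\n", 42);	/* placeholder value */
}

static struct kobj_attribute value_attr = __ATTR(value, 0444, value_show, NULL);

static struct attribute *example_attrs[] = {
	&value_attr.attr,
	NULL
};

static void example_release(struct kobject *kobj) { }

static struct kobj_type example_ktype = {
	.sysfs_ops	= &kobj_sysfs_ops,
	.default_attrs	= example_attrs,
	.release	= example_release,
};

static struct kset *example_kset;
static struct kobject example_kobj;

static int __init example_init(void)
{
	int error;

	/* creates /sys/kernel/example/ */
	example_kset = kset_create_and_add("example", NULL, kernel_kobj);
	if (!example_kset)
		return -ENOMEM;

	example_kobj.kset = example_kset;
	error = kobject_init_and_add(&example_kobj, &example_ktype, NULL, "0");
	if (error) {
		kobject_put(&example_kobj);
		kset_unregister(example_kset);
		return error;
	}
	kobject_uevent(&example_kobj, KOBJ_ADD);
	return 0;
}

static void __exit example_exit(void)
{
	kobject_uevent(&example_kobj, KOBJ_REMOVE);
	kobject_del(&example_kobj);
	kobject_put(&example_kobj);
	kset_unregister(example_kset);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");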
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 52d5e7c9a8e6..52db48e7f6e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -67,9 +67,8 @@ struct workqueue_struct {
67#endif 67#endif
68}; 68};
69 69
70/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 70/* Serializes the accesses to the list of workqueues. */
71 threads to each one as cpus come/go. */ 71static DEFINE_SPINLOCK(workqueue_lock);
72static DEFINE_MUTEX(workqueue_mutex);
73static LIST_HEAD(workqueues); 72static LIST_HEAD(workqueues);
74 73
75static int singlethread_cpu __read_mostly; 74static int singlethread_cpu __read_mostly;
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
592 * Returns zero on success. 591 * Returns zero on success.
593 * Returns -ve errno on failure. 592 * Returns -ve errno on failure.
594 * 593 *
595 * Appears to be racy against CPU hotplug.
596 *
597 * schedule_on_each_cpu() is very slow. 594 * schedule_on_each_cpu() is very slow.
598 */ 595 */
599int schedule_on_each_cpu(work_func_t func) 596int schedule_on_each_cpu(work_func_t func)
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
605 if (!works) 602 if (!works)
606 return -ENOMEM; 603 return -ENOMEM;
607 604
608 preempt_disable(); /* CPU hotplug */ 605 get_online_cpus();
609 for_each_online_cpu(cpu) { 606 for_each_online_cpu(cpu) {
610 struct work_struct *work = per_cpu_ptr(works, cpu); 607 struct work_struct *work = per_cpu_ptr(works, cpu);
611 608
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
613 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 610 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
614 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); 611 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
615 } 612 }
616 preempt_enable();
617 flush_workqueue(keventd_wq); 613 flush_workqueue(keventd_wq);
614 put_online_cpus();
618 free_percpu(works); 615 free_percpu(works);
619 return 0; 616 return 0;
620} 617}
@@ -722,7 +719,8 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
722struct workqueue_struct *__create_workqueue_key(const char *name, 719struct workqueue_struct *__create_workqueue_key(const char *name,
723 int singlethread, 720 int singlethread,
724 int freezeable, 721 int freezeable,
725 struct lock_class_key *key) 722 struct lock_class_key *key,
723 const char *lock_name)
726{ 724{
727 struct workqueue_struct *wq; 725 struct workqueue_struct *wq;
728 struct cpu_workqueue_struct *cwq; 726 struct cpu_workqueue_struct *cwq;
@@ -739,7 +737,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
739 } 737 }
740 738
741 wq->name = name; 739 wq->name = name;
742 lockdep_init_map(&wq->lockdep_map, name, key, 0); 740 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
743 wq->singlethread = singlethread; 741 wq->singlethread = singlethread;
744 wq->freezeable = freezeable; 742 wq->freezeable = freezeable;
745 INIT_LIST_HEAD(&wq->list); 743 INIT_LIST_HEAD(&wq->list);
@@ -749,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
749 err = create_workqueue_thread(cwq, singlethread_cpu); 747 err = create_workqueue_thread(cwq, singlethread_cpu);
750 start_workqueue_thread(cwq, -1); 748 start_workqueue_thread(cwq, -1);
751 } else { 749 } else {
752 mutex_lock(&workqueue_mutex); 750 get_online_cpus();
751 spin_lock(&workqueue_lock);
753 list_add(&wq->list, &workqueues); 752 list_add(&wq->list, &workqueues);
753 spin_unlock(&workqueue_lock);
754 754
755 for_each_possible_cpu(cpu) { 755 for_each_possible_cpu(cpu) {
756 cwq = init_cpu_workqueue(wq, cpu); 756 cwq = init_cpu_workqueue(wq, cpu);
@@ -759,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
759 err = create_workqueue_thread(cwq, cpu); 759 err = create_workqueue_thread(cwq, cpu);
760 start_workqueue_thread(cwq, cpu); 760 start_workqueue_thread(cwq, cpu);
761 } 761 }
762 mutex_unlock(&workqueue_mutex); 762 put_online_cpus();
763 } 763 }
764 764
765 if (err) { 765 if (err) {
@@ -774,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
774{ 774{
775 /* 775 /*
776 * Our caller is either destroy_workqueue() or CPU_DEAD, 776 * Our caller is either destroy_workqueue() or CPU_DEAD,
777 * workqueue_mutex protects cwq->thread 777 * get_online_cpus() protects cwq->thread.
778 */ 778 */
779 if (cwq->thread == NULL) 779 if (cwq->thread == NULL)
780 return; 780 return;
@@ -809,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
809 struct cpu_workqueue_struct *cwq; 809 struct cpu_workqueue_struct *cwq;
810 int cpu; 810 int cpu;
811 811
812 mutex_lock(&workqueue_mutex); 812 get_online_cpus();
813 spin_lock(&workqueue_lock);
813 list_del(&wq->list); 814 list_del(&wq->list);
814 mutex_unlock(&workqueue_mutex); 815 spin_unlock(&workqueue_lock);
816 put_online_cpus();
815 817
816 for_each_cpu_mask(cpu, *cpu_map) { 818 for_each_cpu_mask(cpu, *cpu_map) {
817 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 819 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -834,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
834 action &= ~CPU_TASKS_FROZEN; 836 action &= ~CPU_TASKS_FROZEN;
835 837
836 switch (action) { 838 switch (action) {
837 case CPU_LOCK_ACQUIRE:
838 mutex_lock(&workqueue_mutex);
839 return NOTIFY_OK;
840
841 case CPU_LOCK_RELEASE:
842 mutex_unlock(&workqueue_mutex);
843 return NOTIFY_OK;
844 839
845 case CPU_UP_PREPARE: 840 case CPU_UP_PREPARE:
846 cpu_set(cpu, cpu_populated_map); 841 cpu_set(cpu, cpu_populated_map);
@@ -853,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
853 case CPU_UP_PREPARE: 848 case CPU_UP_PREPARE:
854 if (!create_workqueue_thread(cwq, cpu)) 849 if (!create_workqueue_thread(cwq, cpu))
855 break; 850 break;
856 printk(KERN_ERR "workqueue for %i failed\n", cpu); 851 printk(KERN_ERR "workqueue [%s] for %i failed\n",
852 wq->name, cpu);
857 return NOTIFY_BAD; 853 return NOTIFY_BAD;
858 854
859 case CPU_ONLINE: 855 case CPU_ONLINE: