path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz | 2
-rw-r--r--  kernel/Kconfig.preempt | 13
-rw-r--r--  kernel/Makefile | 8
-rw-r--r--  kernel/backtracetest.c | 48
-rw-r--r--  kernel/cpu.c | 164
-rw-r--r--  kernel/cpuset.c | 14
-rw-r--r--  kernel/extable.c | 3
-rw-r--r--  kernel/fork.c | 49
-rw-r--r--  kernel/hrtimer.c | 256
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq/proc.c | 21
-rw-r--r--  kernel/irq/spurious.c | 5
-rw-r--r--  kernel/kallsyms.c | 11
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/ksysfs.c | 82
-rw-r--r--  kernel/kthread.c | 12
-rw-r--r--  kernel/latencytop.c | 239
-rw-r--r--  kernel/lockdep.c | 19
-rw-r--r--  kernel/module.c | 261
-rw-r--r--  kernel/panic.c | 29
-rw-r--r--  kernel/params.c | 54
-rw-r--r--  kernel/posix-cpu-timers.c | 30
-rw-r--r--  kernel/power/disk.c | 20
-rw-r--r--  kernel/power/main.c | 23
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/printk.c | 62
-rw-r--r--  kernel/profile.c | 99
-rw-r--r--  kernel/ptrace.c | 167
-rw-r--r--  kernel/rcuclassic.c | 575
-rw-r--r--  kernel/rcupdate.c | 576
-rw-r--r--  kernel/rcupreempt.c | 953
-rw-r--r--  kernel/rcupreempt_trace.c | 330
-rw-r--r--  kernel/rcutorture.c | 6
-rw-r--r--  kernel/rtmutex-tester.c | 2
-rw-r--r--  kernel/sched.c | 1400
-rw-r--r--  kernel/sched_debug.c | 5
-rw-r--r--  kernel/sched_fair.c | 391
-rw-r--r--  kernel/sched_idletask.c | 42
-rw-r--r--  kernel/sched_rt.c | 1112
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/softirq.c | 11
-rw-r--r--  kernel/softlockup.c | 116
-rw-r--r--  kernel/spinlock.c | 3
-rw-r--r--  kernel/stop_machine.c | 4
-rw-r--r--  kernel/sysctl.c | 267
-rw-r--r--  kernel/sysctl_check.c | 26
-rw-r--r--  kernel/test_kprobes.c | 216
-rw-r--r--  kernel/time/clockevents.c | 13
-rw-r--r--  kernel/time/clocksource.c | 33
-rw-r--r--  kernel/time/tick-broadcast.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 89
-rw-r--r--  kernel/time/timekeeping.c | 30
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/timer.c | 85
-rw-r--r--  kernel/user.c | 152
-rw-r--r--  kernel/workqueue.c | 35
57 files changed, 6354 insertions, 1833 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 4af15802ccd4..526128a2e622 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -54,3 +54,5 @@ config HZ
54 default 300 if HZ_300 54 default 300 if HZ_300
55 default 1000 if HZ_1000 55 default 1000 if HZ_1000
56 56
57config SCHED_HRTICK
58 def_bool HIGH_RES_TIMERS && X86
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c64ce9c14207..0669b70fa6a3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,14 +52,13 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_BKL 55config RCU_TRACE
56 bool "Preempt The Big Kernel Lock" 56 bool "Enable tracing for RCU - currently stats in debugfs"
57 depends on SMP || PREEMPT 57 select DEBUG_FS
58 default y 58 default y
59 help 59 help
60 This option reduces the latency of the kernel by making the 60 This option provides tracing in RCU which presents stats
61 big kernel lock preemptible. 61 in debugfs for debugging RCU implementation.
62 62
63 Say Y here if you are building a kernel for a desktop system. 63 Say Y here if you want to enable RCU tracing
64 Say N if you are unsure. 64 Say N if you are unsure.
65
diff --git a/kernel/Makefile b/kernel/Makefile
index dfa96956dae0..8885627ea021 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
36obj-$(CONFIG_PM) += power/ 36obj-$(CONFIG_PM) += power/
37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
38obj-$(CONFIG_KEXEC) += kexec.o 38obj-$(CONFIG_KEXEC) += kexec.o
39obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
39obj-$(CONFIG_COMPAT) += compat.o 40obj-$(CONFIG_COMPAT) += compat.o
40obj-$(CONFIG_CGROUPS) += cgroup.o 41obj-$(CONFIG_CGROUPS) += cgroup.o
41obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o 42obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
@@ -43,6 +44,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o
43obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 44obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
44obj-$(CONFIG_IKCONFIG) += configs.o 45obj-$(CONFIG_IKCONFIG) += configs.o
45obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 46obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
47obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
46obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 48obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
47obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 49obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
48obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 50obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -52,11 +54,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
52obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 54obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
53obj-$(CONFIG_SECCOMP) += seccomp.o 55obj-$(CONFIG_SECCOMP) += seccomp.o
54obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 56obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
57obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
58obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
59ifeq ($(CONFIG_PREEMPT_RCU),y)
60obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
61endif
55obj-$(CONFIG_RELAY) += relay.o 62obj-$(CONFIG_RELAY) += relay.o
56obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 63obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
57obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 64obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
58obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 65obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
59obj-$(CONFIG_MARKERS) += marker.o 66obj-$(CONFIG_MARKERS) += marker.o
67obj-$(CONFIG_LATENCYTOP) += latencytop.o
60 68
61ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 69ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
62# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 70# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
new file mode 100644
index 000000000000..d1a7605c5b8f
--- /dev/null
+++ b/kernel/backtracetest.c
@@ -0,0 +1,48 @@
1/*
2 * Simple stack backtrace regression test module
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13#include <linux/module.h>
14#include <linux/sched.h>
15#include <linux/delay.h>
16
17static struct timer_list backtrace_timer;
18
19static void backtrace_test_timer(unsigned long data)
20{
21 printk("Testing a backtrace from irq context.\n");
22 printk("The following trace is a kernel self test and not a bug!\n");
23 dump_stack();
24}
25static int backtrace_regression_test(void)
26{
27 printk("====[ backtrace testing ]===========\n");
28 printk("Testing a backtrace from process context.\n");
29 printk("The following trace is a kernel self test and not a bug!\n");
30 dump_stack();
31
32 init_timer(&backtrace_timer);
33 backtrace_timer.function = backtrace_test_timer;
34 mod_timer(&backtrace_timer, jiffies + 10);
35
36 msleep(10);
37 printk("====[ end of backtrace testing ]====\n");
38 return 0;
39}
40
41static void exitf(void)
42{
43}
44
45module_init(backtrace_regression_test);
46module_exit(exitf);
47MODULE_LICENSE("GPL");
48MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6b3a0c15144f..e0d3a4f56ecb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,9 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 19static DEFINE_MUTEX(cpu_add_remove_lock);
20static DEFINE_MUTEX(cpu_bitmask_lock);
21 20
22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 21static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
23 22
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
26 */ 25 */
27static int cpu_hotplug_disabled; 26static int cpu_hotplug_disabled;
28 27
29#ifdef CONFIG_HOTPLUG_CPU 28static struct {
29 struct task_struct *active_writer;
30 struct mutex lock; /* Synchronizes accesses to refcount, */
31 /*
32 * Also blocks the new readers during
33 * an ongoing cpu hotplug operation.
34 */
35 int refcount;
36 wait_queue_head_t writer_queue;
37} cpu_hotplug;
30 38
31/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ 39#define writer_exists() (cpu_hotplug.active_writer != NULL)
32static struct task_struct *recursive;
33static int recursive_depth;
34 40
35void lock_cpu_hotplug(void) 41void __init cpu_hotplug_init(void)
36{ 42{
37 struct task_struct *tsk = current; 43 cpu_hotplug.active_writer = NULL;
38 44 mutex_init(&cpu_hotplug.lock);
39 if (tsk == recursive) { 45 cpu_hotplug.refcount = 0;
40 static int warnings = 10; 46 init_waitqueue_head(&cpu_hotplug.writer_queue);
41 if (warnings) { 47}
42 printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); 48
43 WARN_ON(1); 49#ifdef CONFIG_HOTPLUG_CPU
44 warnings--; 50
45 } 51void get_online_cpus(void)
46 recursive_depth++; 52{
53 might_sleep();
54 if (cpu_hotplug.active_writer == current)
47 return; 55 return;
48 } 56 mutex_lock(&cpu_hotplug.lock);
49 mutex_lock(&cpu_bitmask_lock); 57 cpu_hotplug.refcount++;
50 recursive = tsk; 58 mutex_unlock(&cpu_hotplug.lock);
59
51} 60}
52EXPORT_SYMBOL_GPL(lock_cpu_hotplug); 61EXPORT_SYMBOL_GPL(get_online_cpus);
53 62
54void unlock_cpu_hotplug(void) 63void put_online_cpus(void)
55{ 64{
56 WARN_ON(recursive != current); 65 if (cpu_hotplug.active_writer == current)
57 if (recursive_depth) {
58 recursive_depth--;
59 return; 66 return;
60 } 67 mutex_lock(&cpu_hotplug.lock);
61 recursive = NULL; 68 cpu_hotplug.refcount--;
62 mutex_unlock(&cpu_bitmask_lock); 69
70 if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
71 wake_up(&cpu_hotplug.writer_queue);
72
73 mutex_unlock(&cpu_hotplug.lock);
74
63} 75}
64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 76EXPORT_SYMBOL_GPL(put_online_cpus);
65 77
66#endif /* CONFIG_HOTPLUG_CPU */ 78#endif /* CONFIG_HOTPLUG_CPU */
67 79
80/*
81 * The following two API's must be used when attempting
82 * to serialize the updates to cpu_online_map, cpu_present_map.
83 */
84void cpu_maps_update_begin(void)
85{
86 mutex_lock(&cpu_add_remove_lock);
87}
88
89void cpu_maps_update_done(void)
90{
91 mutex_unlock(&cpu_add_remove_lock);
92}
93
94/*
95 * This ensures that the hotplug operation can begin only when the
96 * refcount goes to zero.
97 *
98 * Note that during a cpu-hotplug operation, the new readers, if any,
99 * will be blocked by the cpu_hotplug.lock
100 *
 101 * Since cpu_hotplug_begin() is always called after invoking
 102 * cpu_maps_update_begin(), we can be sure that only one writer is active.
103 *
104 * Note that theoretically, there is a possibility of a livelock:
105 * - Refcount goes to zero, last reader wakes up the sleeping
106 * writer.
107 * - Last reader unlocks the cpu_hotplug.lock.
108 * - A new reader arrives at this moment, bumps up the refcount.
 109 * - The writer acquires the cpu_hotplug.lock, finds the refcount
110 * non zero and goes to sleep again.
111 *
112 * However, this is very difficult to achieve in practice since
 113 * get_online_cpus() is not an API which is called all that often.
114 *
115 */
116static void cpu_hotplug_begin(void)
117{
118 DECLARE_WAITQUEUE(wait, current);
119
120 mutex_lock(&cpu_hotplug.lock);
121
122 cpu_hotplug.active_writer = current;
123 add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
124 while (cpu_hotplug.refcount) {
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 mutex_unlock(&cpu_hotplug.lock);
127 schedule();
128 mutex_lock(&cpu_hotplug.lock);
129 }
130 remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
131}
132
133static void cpu_hotplug_done(void)
134{
135 cpu_hotplug.active_writer = NULL;
136 mutex_unlock(&cpu_hotplug.lock);
137}
68/* Need to know about CPUs going up/down? */ 138/* Need to know about CPUs going up/down? */
69int __cpuinit register_cpu_notifier(struct notifier_block *nb) 139int __cpuinit register_cpu_notifier(struct notifier_block *nb)
70{ 140{
71 int ret; 141 int ret;
72 mutex_lock(&cpu_add_remove_lock); 142 cpu_maps_update_begin();
73 ret = raw_notifier_chain_register(&cpu_chain, nb); 143 ret = raw_notifier_chain_register(&cpu_chain, nb);
74 mutex_unlock(&cpu_add_remove_lock); 144 cpu_maps_update_done();
75 return ret; 145 return ret;
76} 146}
77 147
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
81 151
82void unregister_cpu_notifier(struct notifier_block *nb) 152void unregister_cpu_notifier(struct notifier_block *nb)
83{ 153{
84 mutex_lock(&cpu_add_remove_lock); 154 cpu_maps_update_begin();
85 raw_notifier_chain_unregister(&cpu_chain, nb); 155 raw_notifier_chain_unregister(&cpu_chain, nb);
86 mutex_unlock(&cpu_add_remove_lock); 156 cpu_maps_update_done();
87} 157}
88EXPORT_SYMBOL(unregister_cpu_notifier); 158EXPORT_SYMBOL(unregister_cpu_notifier);
89 159
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
147 if (!cpu_online(cpu)) 217 if (!cpu_online(cpu))
148 return -EINVAL; 218 return -EINVAL;
149 219
150 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); 220 cpu_hotplug_begin();
151 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 221 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
152 hcpu, -1, &nr_calls); 222 hcpu, -1, &nr_calls);
153 if (err == NOTIFY_BAD) { 223 if (err == NOTIFY_BAD) {
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
166 cpu_clear(cpu, tmp); 236 cpu_clear(cpu, tmp);
167 set_cpus_allowed(current, tmp); 237 set_cpus_allowed(current, tmp);
168 238
169 mutex_lock(&cpu_bitmask_lock);
170 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
171 mutex_unlock(&cpu_bitmask_lock);
172 240
173 if (IS_ERR(p) || cpu_online(cpu)) { 241 if (IS_ERR(p) || cpu_online(cpu)) {
174 /* CPU didn't die: tell everyone. Can't complain. */ 242 /* CPU didn't die: tell everyone. Can't complain. */
@@ -202,7 +270,7 @@ out_thread:
202out_allowed: 270out_allowed:
203 set_cpus_allowed(current, old_allowed); 271 set_cpus_allowed(current, old_allowed);
204out_release: 272out_release:
205 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); 273 cpu_hotplug_done();
206 return err; 274 return err;
207} 275}
208 276
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu)
210{ 278{
211 int err = 0; 279 int err = 0;
212 280
213 mutex_lock(&cpu_add_remove_lock); 281 cpu_maps_update_begin();
214 if (cpu_hotplug_disabled) 282 if (cpu_hotplug_disabled)
215 err = -EBUSY; 283 err = -EBUSY;
216 else 284 else
217 err = _cpu_down(cpu, 0); 285 err = _cpu_down(cpu, 0);
218 286
219 mutex_unlock(&cpu_add_remove_lock); 287 cpu_maps_update_done();
220 return err; 288 return err;
221} 289}
222#endif /*CONFIG_HOTPLUG_CPU*/ 290#endif /*CONFIG_HOTPLUG_CPU*/
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
231 if (cpu_online(cpu) || !cpu_present(cpu)) 299 if (cpu_online(cpu) || !cpu_present(cpu))
232 return -EINVAL; 300 return -EINVAL;
233 301
234 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); 302 cpu_hotplug_begin();
235 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 303 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
236 -1, &nr_calls); 304 -1, &nr_calls);
237 if (ret == NOTIFY_BAD) { 305 if (ret == NOTIFY_BAD) {
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
243 } 311 }
244 312
245 /* Arch-specific enabling code. */ 313 /* Arch-specific enabling code. */
246 mutex_lock(&cpu_bitmask_lock);
247 ret = __cpu_up(cpu); 314 ret = __cpu_up(cpu);
248 mutex_unlock(&cpu_bitmask_lock);
249 if (ret != 0) 315 if (ret != 0)
250 goto out_notify; 316 goto out_notify;
251 BUG_ON(!cpu_online(cpu)); 317 BUG_ON(!cpu_online(cpu));
@@ -257,7 +323,7 @@ out_notify:
257 if (ret != 0) 323 if (ret != 0)
258 __raw_notifier_call_chain(&cpu_chain, 324 __raw_notifier_call_chain(&cpu_chain,
259 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 325 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
260 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); 326 cpu_hotplug_done();
261 327
262 return ret; 328 return ret;
263} 329}
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu)
275 return -EINVAL; 341 return -EINVAL;
276 } 342 }
277 343
278 mutex_lock(&cpu_add_remove_lock); 344 cpu_maps_update_begin();
279 if (cpu_hotplug_disabled) 345 if (cpu_hotplug_disabled)
280 err = -EBUSY; 346 err = -EBUSY;
281 else 347 else
282 err = _cpu_up(cpu, 0); 348 err = _cpu_up(cpu, 0);
283 349
284 mutex_unlock(&cpu_add_remove_lock); 350 cpu_maps_update_done();
285 return err; 351 return err;
286} 352}
287 353
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void)
292{ 358{
293 int cpu, first_cpu, error = 0; 359 int cpu, first_cpu, error = 0;
294 360
295 mutex_lock(&cpu_add_remove_lock); 361 cpu_maps_update_begin();
296 first_cpu = first_cpu(cpu_online_map); 362 first_cpu = first_cpu(cpu_online_map);
297 /* We take down all of the non-boot CPUs in one shot to avoid races 363 /* We take down all of the non-boot CPUs in one shot to avoid races
298 * with the userspace trying to use the CPU hotplug at the same time 364 * with the userspace trying to use the CPU hotplug at the same time
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void)
319 } else { 385 } else {
320 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 386 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
321 } 387 }
322 mutex_unlock(&cpu_add_remove_lock); 388 cpu_maps_update_done();
323 return error; 389 return error;
324} 390}
325 391
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void)
328 int cpu, error; 394 int cpu, error;
329 395
330 /* Allow everyone to use the CPU hotplug again */ 396 /* Allow everyone to use the CPU hotplug again */
331 mutex_lock(&cpu_add_remove_lock); 397 cpu_maps_update_begin();
332 cpu_hotplug_disabled = 0; 398 cpu_hotplug_disabled = 0;
333 if (cpus_empty(frozen_cpus)) 399 if (cpus_empty(frozen_cpus))
334 goto out; 400 goto out;
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void)
344 } 410 }
345 cpus_clear(frozen_cpus); 411 cpus_clear(frozen_cpus);
346out: 412out:
347 mutex_unlock(&cpu_add_remove_lock); 413 cpu_maps_update_done();
348} 414}
349#endif /* CONFIG_PM_SLEEP_SMP */ 415#endif /* CONFIG_PM_SLEEP_SMP */
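The hunks above replace the old lock_cpu_hotplug()/unlock_cpu_hotplug() pair with a refcounted scheme: any number of readers may hold get_online_cpus() at once, while cpu_hotplug_begin() (the writer) waits until the refcount drains. A minimal reader-side sketch, assuming a kernel of this vintage with CONFIG_HOTPLUG_CPU; the helper name and the printk are illustrative and not part of the patch:

#include <linux/cpu.h>		/* get_online_cpus(), put_online_cpus() */
#include <linux/cpumask.h>
#include <linux/kernel.h>

/* Hypothetical helper: walk the online map while holding the reader-side
 * reference, so a concurrent _cpu_up()/_cpu_down() blocks in
 * cpu_hotplug_begin() until we drop it. */
static void count_online_cpus_example(void)
{
	int cpu, n = 0;

	get_online_cpus();		/* bumps cpu_hotplug.refcount */
	for_each_online_cpu(cpu)
		n++;
	printk(KERN_INFO "%d CPUs online\n", n);
	put_online_cpus();		/* may wake a waiting writer */
}

Unlike the removed cpu_bitmask_lock, this does not serialize readers against each other; only the hotplug writer path has to wait.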
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 50f5dc463688..cfaf6419d817 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
537 * 537 *
538 * Call with cgroup_mutex held. May take callback_mutex during 538 * Call with cgroup_mutex held. May take callback_mutex during
539 * call due to the kfifo_alloc() and kmalloc() calls. May nest 539 * call due to the kfifo_alloc() and kmalloc() calls. May nest
540 * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 540 * a call to the get_online_cpus()/put_online_cpus() pair.
541 * Must not be called holding callback_mutex, because we must not 541 * Must not be called holding callback_mutex, because we must not
542 * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere 542 * call get_online_cpus() while holding callback_mutex. Elsewhere
543 * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. 543 * the kernel nests callback_mutex inside get_online_cpus() calls.
544 * So the reverse nesting would risk an ABBA deadlock. 544 * So the reverse nesting would risk an ABBA deadlock.
545 * 545 *
546 * The three key local variables below are: 546 * The three key local variables below are:
@@ -691,9 +691,9 @@ restart:
691 691
692rebuild: 692rebuild:
693 /* Have scheduler rebuild sched domains */ 693 /* Have scheduler rebuild sched domains */
694 lock_cpu_hotplug(); 694 get_online_cpus();
695 partition_sched_domains(ndoms, doms); 695 partition_sched_domains(ndoms, doms);
696 unlock_cpu_hotplug(); 696 put_online_cpus();
697 697
698done: 698done:
699 if (q && !IS_ERR(q)) 699 if (q && !IS_ERR(q))
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
1617 * 1617 *
1618 * If the cpuset being removed has its flag 'sched_load_balance' 1618 * If the cpuset being removed has its flag 'sched_load_balance'
1619 * enabled, then simulate turning sched_load_balance off, which 1619 * enabled, then simulate turning sched_load_balance off, which
1620 * will call rebuild_sched_domains(). The lock_cpu_hotplug() 1620 * will call rebuild_sched_domains(). The get_online_cpus()
1621 * call in rebuild_sched_domains() must not be made while holding 1621 * call in rebuild_sched_domains() must not be made while holding
1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside 1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1623 * lock_cpu_hotplug() calls. So the reverse nesting would risk an 1623 * get_online_cpus() calls. So the reverse nesting would risk an
1624 * ABBA deadlock. 1624 * ABBA deadlock.
1625 */ 1625 */
1626 1626
diff --git a/kernel/extable.c b/kernel/extable.c
index 7fe262855317..a26cb2e17023 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr)
46 addr <= (unsigned long)_etext) 46 addr <= (unsigned long)_etext)
47 return 1; 47 return 1;
48 48
49 if (addr >= (unsigned long)_sinittext && 49 if (system_state == SYSTEM_BOOTING &&
50 addr >= (unsigned long)_sinittext &&
50 addr <= (unsigned long)_einittext) 51 addr <= (unsigned long)_einittext)
51 return 1; 52 return 1;
52 return 0; 53 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff281009..05e0b6f4365b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -51,6 +51,7 @@
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h> 52#include <linux/tty.h>
53#include <linux/proc_fs.h> 53#include <linux/proc_fs.h>
54#include <linux/blkdev.h>
54 55
55#include <asm/pgtable.h> 56#include <asm/pgtable.h>
56#include <asm/pgalloc.h> 57#include <asm/pgalloc.h>
@@ -392,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
392 destroy_context(mm); 393 destroy_context(mm);
393 free_mm(mm); 394 free_mm(mm);
394} 395}
396EXPORT_SYMBOL_GPL(__mmdrop);
395 397
396/* 398/*
397 * Decrement the use count and release all resources for an mm. 399 * Decrement the use count and release all resources for an mm.
@@ -791,6 +793,31 @@ out:
791 return error; 793 return error;
792} 794}
793 795
796static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
797{
798#ifdef CONFIG_BLOCK
799 struct io_context *ioc = current->io_context;
800
801 if (!ioc)
802 return 0;
803 /*
804 * Share io context with parent, if CLONE_IO is set
805 */
806 if (clone_flags & CLONE_IO) {
807 tsk->io_context = ioc_task_link(ioc);
808 if (unlikely(!tsk->io_context))
809 return -ENOMEM;
810 } else if (ioprio_valid(ioc->ioprio)) {
811 tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
812 if (unlikely(!tsk->io_context))
813 return -ENOMEM;
814
815 tsk->io_context->ioprio = ioc->ioprio;
816 }
817#endif
818 return 0;
819}
820
794/* 821/*
795 * Helper to unshare the files of the current task. 822 * Helper to unshare the files of the current task.
796 * We don't want to expose copy_files internals to 823 * We don't want to expose copy_files internals to
@@ -1045,6 +1072,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1045 copy_flags(clone_flags, p); 1072 copy_flags(clone_flags, p);
1046 INIT_LIST_HEAD(&p->children); 1073 INIT_LIST_HEAD(&p->children);
1047 INIT_LIST_HEAD(&p->sibling); 1074 INIT_LIST_HEAD(&p->sibling);
1075#ifdef CONFIG_PREEMPT_RCU
1076 p->rcu_read_lock_nesting = 0;
1077 p->rcu_flipctr_idx = 0;
1078#endif /* #ifdef CONFIG_PREEMPT_RCU */
1048 p->vfork_done = NULL; 1079 p->vfork_done = NULL;
1049 spin_lock_init(&p->alloc_lock); 1080 spin_lock_init(&p->alloc_lock);
1050 1081
@@ -1059,6 +1090,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1059 p->prev_utime = cputime_zero; 1090 p->prev_utime = cputime_zero;
1060 p->prev_stime = cputime_zero; 1091 p->prev_stime = cputime_zero;
1061 1092
1093#ifdef CONFIG_DETECT_SOFTLOCKUP
1094 p->last_switch_count = 0;
1095 p->last_switch_timestamp = 0;
1096#endif
1097
1062#ifdef CONFIG_TASK_XACCT 1098#ifdef CONFIG_TASK_XACCT
1063 p->rchar = 0; /* I/O counter: bytes read */ 1099 p->rchar = 0; /* I/O counter: bytes read */
1064 p->wchar = 0; /* I/O counter: bytes written */ 1100 p->wchar = 0; /* I/O counter: bytes written */
@@ -1147,15 +1183,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1147 goto bad_fork_cleanup_mm; 1183 goto bad_fork_cleanup_mm;
1148 if ((retval = copy_namespaces(clone_flags, p))) 1184 if ((retval = copy_namespaces(clone_flags, p)))
1149 goto bad_fork_cleanup_keys; 1185 goto bad_fork_cleanup_keys;
1186 if ((retval = copy_io(clone_flags, p)))
1187 goto bad_fork_cleanup_namespaces;
1150 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1188 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
1151 if (retval) 1189 if (retval)
1152 goto bad_fork_cleanup_namespaces; 1190 goto bad_fork_cleanup_io;
1153 1191
1154 if (pid != &init_struct_pid) { 1192 if (pid != &init_struct_pid) {
1155 retval = -ENOMEM; 1193 retval = -ENOMEM;
1156 pid = alloc_pid(task_active_pid_ns(p)); 1194 pid = alloc_pid(task_active_pid_ns(p));
1157 if (!pid) 1195 if (!pid)
1158 goto bad_fork_cleanup_namespaces; 1196 goto bad_fork_cleanup_io;
1159 1197
1160 if (clone_flags & CLONE_NEWPID) { 1198 if (clone_flags & CLONE_NEWPID) {
1161 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1199 retval = pid_ns_prepare_proc(task_active_pid_ns(p));
@@ -1196,6 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1196#ifdef TIF_SYSCALL_EMU 1234#ifdef TIF_SYSCALL_EMU
1197 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1235 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1198#endif 1236#endif
1237 clear_all_latency_tracing(p);
1199 1238
1200 /* Our parent execution domain becomes current domain 1239 /* Our parent execution domain becomes current domain
1201 These must match for thread signalling to apply */ 1240 These must match for thread signalling to apply */
@@ -1224,9 +1263,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1224 /* Need tasklist lock for parent etc handling! */ 1263 /* Need tasklist lock for parent etc handling! */
1225 write_lock_irq(&tasklist_lock); 1264 write_lock_irq(&tasklist_lock);
1226 1265
1227 /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
1228 p->ioprio = current->ioprio;
1229
1230 /* 1266 /*
1231 * The task hasn't been attached yet, so its cpus_allowed mask will 1267 * The task hasn't been attached yet, so its cpus_allowed mask will
1232 * not be changed, nor will its assigned CPU. 1268 * not be changed, nor will its assigned CPU.
@@ -1237,6 +1273,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1237 * parent's CPU). This avoids alot of nasty races. 1273 * parent's CPU). This avoids alot of nasty races.
1238 */ 1274 */
1239 p->cpus_allowed = current->cpus_allowed; 1275 p->cpus_allowed = current->cpus_allowed;
1276 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1240 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || 1277 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1241 !cpu_online(task_cpu(p)))) 1278 !cpu_online(task_cpu(p))))
1242 set_task_cpu(p, smp_processor_id()); 1279 set_task_cpu(p, smp_processor_id());
@@ -1317,6 +1354,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1317bad_fork_free_pid: 1354bad_fork_free_pid:
1318 if (pid != &init_struct_pid) 1355 if (pid != &init_struct_pid)
1319 free_pid(pid); 1356 free_pid(pid);
1357bad_fork_cleanup_io:
1358 put_io_context(p->io_context);
1320bad_fork_cleanup_namespaces: 1359bad_fork_cleanup_namespaces:
1321 exit_task_namespaces(p); 1360 exit_task_namespaces(p);
1322bad_fork_cleanup_keys: 1361bad_fork_cleanup_keys:
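copy_io() above gives the CLONE_IO flag its meaning: with it the child shares the parent's io_context (via ioc_task_link()), without it the child gets a fresh context that only inherits the ioprio. A hypothetical userspace sketch of requesting the shared context through clone(2); the fallback #define mirrors the flag value in <linux/sched.h> of this era, and child_fn plus the stack size are made up for illustration:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef CLONE_IO
#define CLONE_IO 0x80000000	/* share io_context with the parent */
#endif

static int child_fn(void *arg)
{
	/* This task now shares its io_context with the parent, so an
	 * ioprio_set() by either task affects both. */
	printf("child sharing the parent's io_context\n");
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	pid_t pid;

	if (!stack)
		return 1;
	/* The stack grows down on most architectures, so pass its top. */
	pid = clone(child_fn, stack + 64 * 1024, CLONE_IO | SIGCHLD, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}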
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f994bb8065e6..bd5d6b5060bc 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
325} 325}
326#endif /* BITS_PER_LONG >= 64 */ 326#endif /* BITS_PER_LONG >= 64 */
327 327
328/*
329 * Check, whether the timer is on the callback pending list
330 */
331static inline int hrtimer_cb_pending(const struct hrtimer *timer)
332{
333 return timer->state & HRTIMER_STATE_PENDING;
334}
335
336/*
337 * Remove a timer from the callback pending list
338 */
339static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
340{
341 list_del_init(&timer->cb_entry);
342}
343
328/* High resolution timer related functions */ 344/* High resolution timer related functions */
329#ifdef CONFIG_HIGH_RES_TIMERS 345#ifdef CONFIG_HIGH_RES_TIMERS
330 346
@@ -494,29 +510,12 @@ void hres_timers_resume(void)
494} 510}
495 511
496/* 512/*
497 * Check, whether the timer is on the callback pending list
498 */
499static inline int hrtimer_cb_pending(const struct hrtimer *timer)
500{
501 return timer->state & HRTIMER_STATE_PENDING;
502}
503
504/*
505 * Remove a timer from the callback pending list
506 */
507static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
508{
509 list_del_init(&timer->cb_entry);
510}
511
512/*
513 * Initialize the high resolution related parts of cpu_base 513 * Initialize the high resolution related parts of cpu_base
514 */ 514 */
515static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) 515static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
516{ 516{
517 base->expires_next.tv64 = KTIME_MAX; 517 base->expires_next.tv64 = KTIME_MAX;
518 base->hres_active = 0; 518 base->hres_active = 0;
519 INIT_LIST_HEAD(&base->cb_pending);
520} 519}
521 520
522/* 521/*
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
524 */ 523 */
525static inline void hrtimer_init_timer_hres(struct hrtimer *timer) 524static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
526{ 525{
527 INIT_LIST_HEAD(&timer->cb_entry);
528} 526}
529 527
530/* 528/*
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
618{ 616{
619 return 0; 617 return 0;
620} 618}
621static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
622static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
623static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 619static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
624static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 620static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
621static inline int hrtimer_reprogram(struct hrtimer *timer,
622 struct hrtimer_clock_base *base)
623{
624 return 0;
625}
625 626
626#endif /* CONFIG_HIGH_RES_TIMERS */ 627#endif /* CONFIG_HIGH_RES_TIMERS */
627 628
@@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1001 clock_id = CLOCK_MONOTONIC; 1002 clock_id = CLOCK_MONOTONIC;
1002 1003
1003 timer->base = &cpu_base->clock_base[clock_id]; 1004 timer->base = &cpu_base->clock_base[clock_id];
1005 INIT_LIST_HEAD(&timer->cb_entry);
1004 hrtimer_init_timer_hres(timer); 1006 hrtimer_init_timer_hres(timer);
1005 1007
1006#ifdef CONFIG_TIMER_STATS 1008#ifdef CONFIG_TIMER_STATS
@@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1030} 1032}
1031EXPORT_SYMBOL_GPL(hrtimer_get_res); 1033EXPORT_SYMBOL_GPL(hrtimer_get_res);
1032 1034
1035static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1036{
1037 spin_lock_irq(&cpu_base->lock);
1038
1039 while (!list_empty(&cpu_base->cb_pending)) {
1040 enum hrtimer_restart (*fn)(struct hrtimer *);
1041 struct hrtimer *timer;
1042 int restart;
1043
1044 timer = list_entry(cpu_base->cb_pending.next,
1045 struct hrtimer, cb_entry);
1046
1047 timer_stats_account_hrtimer(timer);
1048
1049 fn = timer->function;
1050 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1051 spin_unlock_irq(&cpu_base->lock);
1052
1053 restart = fn(timer);
1054
1055 spin_lock_irq(&cpu_base->lock);
1056
1057 timer->state &= ~HRTIMER_STATE_CALLBACK;
1058 if (restart == HRTIMER_RESTART) {
1059 BUG_ON(hrtimer_active(timer));
1060 /*
1061 * Enqueue the timer, allow reprogramming of the event
1062 * device
1063 */
1064 enqueue_hrtimer(timer, timer->base, 1);
1065 } else if (hrtimer_active(timer)) {
1066 /*
1067 * If the timer was rearmed on another CPU, reprogram
1068 * the event device.
1069 */
1070 if (timer->base->first == &timer->node)
1071 hrtimer_reprogram(timer, timer->base);
1072 }
1073 }
1074 spin_unlock_irq(&cpu_base->lock);
1075}
1076
1077static void __run_hrtimer(struct hrtimer *timer)
1078{
1079 struct hrtimer_clock_base *base = timer->base;
1080 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1081 enum hrtimer_restart (*fn)(struct hrtimer *);
1082 int restart;
1083
1084 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1085 timer_stats_account_hrtimer(timer);
1086
1087 fn = timer->function;
1088 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
1089 /*
1090 * Used for scheduler timers, avoid lock inversion with
1091 * rq->lock and tasklist_lock.
1092 *
1093 * These timers are required to deal with enqueue expiry
1094 * themselves and are not allowed to migrate.
1095 */
1096 spin_unlock(&cpu_base->lock);
1097 restart = fn(timer);
1098 spin_lock(&cpu_base->lock);
1099 } else
1100 restart = fn(timer);
1101
1102 /*
1103 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
1104 * reprogramming of the event hardware. This happens at the end of this
1105 * function anyway.
1106 */
1107 if (restart != HRTIMER_NORESTART) {
1108 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1109 enqueue_hrtimer(timer, base, 0);
1110 }
1111 timer->state &= ~HRTIMER_STATE_CALLBACK;
1112}
1113
1033#ifdef CONFIG_HIGH_RES_TIMERS 1114#ifdef CONFIG_HIGH_RES_TIMERS
1034 1115
1035/* 1116/*
@@ -1087,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1087 continue; 1168 continue;
1088 } 1169 }
1089 1170
1090 __remove_hrtimer(timer, base, 1171 __run_hrtimer(timer);
1091 HRTIMER_STATE_CALLBACK, 0);
1092 timer_stats_account_hrtimer(timer);
1093
1094 /*
1095 * Note: We clear the CALLBACK bit after
1096 * enqueue_hrtimer to avoid reprogramming of
1097 * the event hardware. This happens at the end
1098 * of this function anyway.
1099 */
1100 if (timer->function(timer) != HRTIMER_NORESTART) {
1101 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1102 enqueue_hrtimer(timer, base, 0);
1103 }
1104 timer->state &= ~HRTIMER_STATE_CALLBACK;
1105 } 1172 }
1106 spin_unlock(&cpu_base->lock); 1173 spin_unlock(&cpu_base->lock);
1107 base++; 1174 base++;
@@ -1122,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1122 1189
1123static void run_hrtimer_softirq(struct softirq_action *h) 1190static void run_hrtimer_softirq(struct softirq_action *h)
1124{ 1191{
1125 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1192 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
1126 1193}
1127 spin_lock_irq(&cpu_base->lock);
1128
1129 while (!list_empty(&cpu_base->cb_pending)) {
1130 enum hrtimer_restart (*fn)(struct hrtimer *);
1131 struct hrtimer *timer;
1132 int restart;
1133
1134 timer = list_entry(cpu_base->cb_pending.next,
1135 struct hrtimer, cb_entry);
1136 1194
1137 timer_stats_account_hrtimer(timer); 1195#endif /* CONFIG_HIGH_RES_TIMERS */
1138 1196
1139 fn = timer->function; 1197/*
1140 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); 1198 * Called from timer softirq every jiffy, expire hrtimers:
1141 spin_unlock_irq(&cpu_base->lock); 1199 *
 1200 * For HRT it's the fallback code to run the softirq in the timer
1201 * softirq context in case the hrtimer initialization failed or has
1202 * not been done yet.
1203 */
1204void hrtimer_run_pending(void)
1205{
1206 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1142 1207
1143 restart = fn(timer); 1208 if (hrtimer_hres_active())
1209 return;
1144 1210
1145 spin_lock_irq(&cpu_base->lock); 1211 /*
1212 * This _is_ ugly: We have to check in the softirq context,
1213 * whether we can switch to highres and / or nohz mode. The
1214 * clocksource switch happens in the timer interrupt with
1215 * xtime_lock held. Notification from there only sets the
1216 * check bit in the tick_oneshot code, otherwise we might
1217 * deadlock vs. xtime_lock.
1218 */
1219 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1220 hrtimer_switch_to_hres();
1146 1221
1147 timer->state &= ~HRTIMER_STATE_CALLBACK; 1222 run_hrtimer_pending(cpu_base);
1148 if (restart == HRTIMER_RESTART) {
1149 BUG_ON(hrtimer_active(timer));
1150 /*
1151 * Enqueue the timer, allow reprogramming of the event
1152 * device
1153 */
1154 enqueue_hrtimer(timer, timer->base, 1);
1155 } else if (hrtimer_active(timer)) {
1156 /*
1157 * If the timer was rearmed on another CPU, reprogram
1158 * the event device.
1159 */
1160 if (timer->base->first == &timer->node)
1161 hrtimer_reprogram(timer, timer->base);
1162 }
1163 }
1164 spin_unlock_irq(&cpu_base->lock);
1165} 1223}
1166 1224
1167#endif /* CONFIG_HIGH_RES_TIMERS */
1168
1169/* 1225/*
1170 * Expire the per base hrtimer-queue: 1226 * Called from hardirq context every jiffy
1171 */ 1227 */
1172static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, 1228static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1173 int index) 1229 int index)
@@ -1181,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1181 if (base->get_softirq_time) 1237 if (base->get_softirq_time)
1182 base->softirq_time = base->get_softirq_time(); 1238 base->softirq_time = base->get_softirq_time();
1183 1239
1184 spin_lock_irq(&cpu_base->lock); 1240 spin_lock(&cpu_base->lock);
1185 1241
1186 while ((node = base->first)) { 1242 while ((node = base->first)) {
1187 struct hrtimer *timer; 1243 struct hrtimer *timer;
1188 enum hrtimer_restart (*fn)(struct hrtimer *);
1189 int restart;
1190 1244
1191 timer = rb_entry(node, struct hrtimer, node); 1245 timer = rb_entry(node, struct hrtimer, node);
1192 if (base->softirq_time.tv64 <= timer->expires.tv64) 1246 if (base->softirq_time.tv64 <= timer->expires.tv64)
1193 break; 1247 break;
1194 1248
1195#ifdef CONFIG_HIGH_RES_TIMERS 1249 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1196 WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); 1250 __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
1197#endif 1251 list_add_tail(&timer->cb_entry,
1198 timer_stats_account_hrtimer(timer); 1252 &base->cpu_base->cb_pending);
1199 1253 continue;
1200 fn = timer->function;
1201 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1202 spin_unlock_irq(&cpu_base->lock);
1203
1204 restart = fn(timer);
1205
1206 spin_lock_irq(&cpu_base->lock);
1207
1208 timer->state &= ~HRTIMER_STATE_CALLBACK;
1209 if (restart != HRTIMER_NORESTART) {
1210 BUG_ON(hrtimer_active(timer));
1211 enqueue_hrtimer(timer, base, 0);
1212 } 1254 }
1255
1256 __run_hrtimer(timer);
1213 } 1257 }
1214 spin_unlock_irq(&cpu_base->lock); 1258 spin_unlock(&cpu_base->lock);
1215} 1259}
1216 1260
1217/*
1218 * Called from timer softirq every jiffy, expire hrtimers:
1219 *
1220 * For HRT its the fall back code to run the softirq in the timer
1221 * softirq context in case the hrtimer initialization failed or has
1222 * not been done yet.
1223 */
1224void hrtimer_run_queues(void) 1261void hrtimer_run_queues(void)
1225{ 1262{
1226 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1263 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1229,18 +1266,6 @@ void hrtimer_run_queues(void)
1229 if (hrtimer_hres_active()) 1266 if (hrtimer_hres_active())
1230 return; 1267 return;
1231 1268
1232 /*
1233 * This _is_ ugly: We have to check in the softirq context,
1234 * whether we can switch to highres and / or nohz mode. The
1235 * clocksource switch happens in the timer interrupt with
1236 * xtime_lock held. Notification from there only sets the
1237 * check bit in the tick_oneshot code, otherwise we might
1238 * deadlock vs. xtime_lock.
1239 */
1240 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1241 if (hrtimer_switch_to_hres())
1242 return;
1243
1244 hrtimer_get_softirq_time(cpu_base); 1269 hrtimer_get_softirq_time(cpu_base);
1245 1270
1246 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1271 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
@@ -1268,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1268 sl->timer.function = hrtimer_wakeup; 1293 sl->timer.function = hrtimer_wakeup;
1269 sl->task = task; 1294 sl->task = task;
1270#ifdef CONFIG_HIGH_RES_TIMERS 1295#ifdef CONFIG_HIGH_RES_TIMERS
1271 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; 1296 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1272#endif 1297#endif
1273} 1298}
1274 1299
@@ -1279,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1279 do { 1304 do {
1280 set_current_state(TASK_INTERRUPTIBLE); 1305 set_current_state(TASK_INTERRUPTIBLE);
1281 hrtimer_start(&t->timer, t->timer.expires, mode); 1306 hrtimer_start(&t->timer, t->timer.expires, mode);
1307 if (!hrtimer_active(&t->timer))
1308 t->task = NULL;
1282 1309
1283 if (likely(t->task)) 1310 if (likely(t->task))
1284 schedule(); 1311 schedule();
@@ -1389,6 +1416,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1389 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1416 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1390 cpu_base->clock_base[i].cpu_base = cpu_base; 1417 cpu_base->clock_base[i].cpu_base = cpu_base;
1391 1418
1419 INIT_LIST_HEAD(&cpu_base->cb_pending);
1392 hrtimer_init_hres(cpu_base); 1420 hrtimer_init_hres(cpu_base);
1393} 1421}
1394 1422
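The reshuffling above (run_hrtimer_pending(), __run_hrtimer(), and the cb_pending list moving out of the CONFIG_HIGH_RES_TIMERS block) changes where callbacks are dispatched from, not how a timer is set up. A minimal kernel-side sketch for a kernel of this vintage; demo_timer, demo_fn and demo_arm are invented names, and HRTIMER_CB_SOFTIRQ is the cb_mode that lands the callback on the per-CPU cb_pending list handled above:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/kernel.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_fn(struct hrtimer *timer)
{
	printk(KERN_INFO "demo hrtimer fired\n");
	return HRTIMER_NORESTART;	/* one-shot: do not re-enqueue */
}

static void demo_arm(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_fn;
	demo_timer.cb_mode = HRTIMER_CB_SOFTIRQ;	/* expire via run_hrtimer_pending() */
	hrtimer_start(&demo_timer, ktime_set(1, 0), HRTIMER_MODE_REL);	/* ~1s */
}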
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1f314221d534..438a01464287 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id)
479 return; 479 return;
480 } 480 }
481 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); 481 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
482#ifdef CONFIG_DEBUG_SHIRQ
483 dump_stack();
484#endif
482 spin_unlock_irqrestore(&desc->lock, flags); 485 spin_unlock_irqrestore(&desc->lock, flags);
483 return; 486 return;
484 } 487 }
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 50b81b98046a..c2f2ccb0549a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
75 75
76#endif 76#endif
77 77
78static int irq_spurious_read(char *page, char **start, off_t off,
79 int count, int *eof, void *data)
80{
81 struct irq_desc *d = &irq_desc[(long) data];
82 return sprintf(page, "count %u\n"
83 "unhandled %u\n"
84 "last_unhandled %u ms\n",
85 d->irq_count,
86 d->irqs_unhandled,
87 jiffies_to_msecs(d->last_unhandled));
88}
89
78#define MAX_NAMELEN 128 90#define MAX_NAMELEN 128
79 91
80static int name_unique(unsigned int irq, struct irqaction *new_action) 92static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
118void register_irq_proc(unsigned int irq) 130void register_irq_proc(unsigned int irq)
119{ 131{
120 char name [MAX_NAMELEN]; 132 char name [MAX_NAMELEN];
133 struct proc_dir_entry *entry;
121 134
122 if (!root_irq_dir || 135 if (!root_irq_dir ||
123 (irq_desc[irq].chip == &no_irq_chip) || 136 (irq_desc[irq].chip == &no_irq_chip) ||
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq)
132 145
133#ifdef CONFIG_SMP 146#ifdef CONFIG_SMP
134 { 147 {
135 struct proc_dir_entry *entry;
136
137 /* create /proc/irq/<irq>/smp_affinity */ 148 /* create /proc/irq/<irq>/smp_affinity */
138 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); 149 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
139 150
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq)
144 } 155 }
145 } 156 }
146#endif 157#endif
158
159 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
160 if (entry) {
161 entry->data = (void *)(long)irq;
162 entry->read_proc = irq_spurious_read;
163 }
147} 164}
148 165
149#undef MAX_NAMELEN 166#undef MAX_NAMELEN
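The new /proc/irq/<irq>/spurious file simply dumps the three counters formatted by irq_spurious_read() above. A hypothetical userspace reader; IRQ 16 is an arbitrary example number:

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/irq/16/spurious", "r");	/* any registered IRQ */

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Expected output: "count %u\nunhandled %u\nlast_unhandled %u ms\n" */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}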
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 32b161972fad..a6b2bc831dd0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/moduleparam.h>
13 14
14static int irqfixup __read_mostly; 15static int irqfixup __read_mostly;
15 16
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str)
225} 226}
226 227
227__setup("noirqdebug", noirqdebug_setup); 228__setup("noirqdebug", noirqdebug_setup);
229module_param(noirqdebug, bool, 0644);
230MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
228 231
229static int __init irqfixup_setup(char *str) 232static int __init irqfixup_setup(char *str)
230{ 233{
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str)
236} 239}
237 240
238__setup("irqfixup", irqfixup_setup); 241__setup("irqfixup", irqfixup_setup);
242module_param(irqfixup, int, 0644);
243MODULE_PARM_DESC(irqfixup, "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
239 244
240static int __init irqpoll_setup(char *str) 245static int __init irqpoll_setup(char *str)
241{ 246{
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2fc25810509e..7dadc71ce516 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -233,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr,
233int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, 233int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
234 unsigned long *offset) 234 unsigned long *offset)
235{ 235{
236 char namebuf[KSYM_NAME_LEN];
236 if (is_ksym_addr(addr)) 237 if (is_ksym_addr(addr))
237 return !!get_symbol_pos(addr, symbolsize, offset); 238 return !!get_symbol_pos(addr, symbolsize, offset);
238 239
239 return !!module_address_lookup(addr, symbolsize, offset, NULL); 240 return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
240} 241}
241 242
242/* 243/*
@@ -251,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr,
251 unsigned long *offset, 252 unsigned long *offset,
252 char **modname, char *namebuf) 253 char **modname, char *namebuf)
253{ 254{
254 const char *msym;
255
256 namebuf[KSYM_NAME_LEN - 1] = 0; 255 namebuf[KSYM_NAME_LEN - 1] = 0;
257 namebuf[0] = 0; 256 namebuf[0] = 0;
258 257
@@ -268,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr,
268 } 267 }
269 268
270 /* see if it's in a module */ 269 /* see if it's in a module */
271 msym = module_address_lookup(addr, symbolsize, offset, modname); 270 return module_address_lookup(addr, symbolsize, offset, modname,
272 if (msym) 271 namebuf);
273 return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
274
275 return NULL; 272 return NULL;
276} 273}
277 274
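After this change module_address_lookup() writes the symbol name straight into the caller-supplied buffer, so kallsyms_lookup() no longer needs the strncpy() shown on the removed lines. A small kernel-side sketch of the caller contract; resolve_and_print() and its printk are illustrative only:

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void resolve_and_print(unsigned long addr)
{
	char namebuf[KSYM_NAME_LEN];	/* now filled in for module symbols too */
	unsigned long size, offset;
	char *modname;
	const char *name;

	name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
	if (name)
		printk(KERN_INFO "%lx is %s+%#lx/%#lx [%s]\n",
		       addr, name, offset, size, modname ? modname : "kernel");
}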
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e3a5d817ac9b..d0493eafea3e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -824,6 +824,8 @@ static int __init init_kprobes(void)
824 if (!err) 824 if (!err)
825 err = register_die_notifier(&kprobe_exceptions_nb); 825 err = register_die_notifier(&kprobe_exceptions_nb);
826 826
827 if (!err)
828 init_test_probes();
827 return err; 829 return err;
828} 830}
829 831
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 65daa5373ca6..e53bc30e9ba5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -17,30 +17,34 @@
17#include <linux/sched.h> 17#include <linux/sched.h>
18 18
19#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
20static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
21 21
22#define KERNEL_ATTR_RW(_name) \ 22#define KERNEL_ATTR_RW(_name) \
23static struct subsys_attribute _name##_attr = \ 23static struct kobj_attribute _name##_attr = \
24 __ATTR(_name, 0644, _name##_show, _name##_store) 24 __ATTR(_name, 0644, _name##_show, _name##_store)
25 25
26#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 26#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
27/* current uevent sequence number */ 27/* current uevent sequence number */
28static ssize_t uevent_seqnum_show(struct kset *kset, char *page) 28static ssize_t uevent_seqnum_show(struct kobject *kobj,
29 struct kobj_attribute *attr, char *buf)
29{ 30{
30 return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); 31 return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
31} 32}
32KERNEL_ATTR_RO(uevent_seqnum); 33KERNEL_ATTR_RO(uevent_seqnum);
33 34
 34/* uevent helper program, used during early boot */ 35/* uevent helper program, used during early boot */
35static ssize_t uevent_helper_show(struct kset *kset, char *page) 36static ssize_t uevent_helper_show(struct kobject *kobj,
37 struct kobj_attribute *attr, char *buf)
36{ 38{
37 return sprintf(page, "%s\n", uevent_helper); 39 return sprintf(buf, "%s\n", uevent_helper);
38} 40}
39static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) 41static ssize_t uevent_helper_store(struct kobject *kobj,
42 struct kobj_attribute *attr,
43 const char *buf, size_t count)
40{ 44{
41 if (count+1 > UEVENT_HELPER_PATH_LEN) 45 if (count+1 > UEVENT_HELPER_PATH_LEN)
42 return -ENOENT; 46 return -ENOENT;
43 memcpy(uevent_helper, page, count); 47 memcpy(uevent_helper, buf, count);
44 uevent_helper[count] = '\0'; 48 uevent_helper[count] = '\0';
45 if (count && uevent_helper[count-1] == '\n') 49 if (count && uevent_helper[count-1] == '\n')
46 uevent_helper[count-1] = '\0'; 50 uevent_helper[count-1] = '\0';
@@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper);
50#endif 54#endif
51 55
52#ifdef CONFIG_KEXEC 56#ifdef CONFIG_KEXEC
53static ssize_t kexec_loaded_show(struct kset *kset, char *page) 57static ssize_t kexec_loaded_show(struct kobject *kobj,
58 struct kobj_attribute *attr, char *buf)
54{ 59{
55 return sprintf(page, "%d\n", !!kexec_image); 60 return sprintf(buf, "%d\n", !!kexec_image);
56} 61}
57KERNEL_ATTR_RO(kexec_loaded); 62KERNEL_ATTR_RO(kexec_loaded);
58 63
59static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) 64static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
65 struct kobj_attribute *attr, char *buf)
60{ 66{
61 return sprintf(page, "%d\n", !!kexec_crash_image); 67 return sprintf(buf, "%d\n", !!kexec_crash_image);
62} 68}
63KERNEL_ATTR_RO(kexec_crash_loaded); 69KERNEL_ATTR_RO(kexec_crash_loaded);
64 70
65static ssize_t vmcoreinfo_show(struct kset *kset, char *page) 71static ssize_t vmcoreinfo_show(struct kobject *kobj,
72 struct kobj_attribute *attr, char *buf)
66{ 73{
67 return sprintf(page, "%lx %x\n", 74 return sprintf(buf, "%lx %x\n",
68 paddr_vmcoreinfo_note(), 75 paddr_vmcoreinfo_note(),
69 (unsigned int)vmcoreinfo_max_size); 76 (unsigned int)vmcoreinfo_max_size);
70} 77}
@@ -94,8 +101,8 @@ static struct bin_attribute notes_attr = {
94 .read = &notes_read, 101 .read = &notes_read,
95}; 102};
96 103
97decl_subsys(kernel, NULL, NULL); 104struct kobject *kernel_kobj;
98EXPORT_SYMBOL_GPL(kernel_subsys); 105EXPORT_SYMBOL_GPL(kernel_kobj);
99 106
100static struct attribute * kernel_attrs[] = { 107static struct attribute * kernel_attrs[] = {
101#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 108#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
@@ -116,24 +123,39 @@ static struct attribute_group kernel_attr_group = {
116 123
117static int __init ksysfs_init(void) 124static int __init ksysfs_init(void)
118{ 125{
119 int error = subsystem_register(&kernel_subsys); 126 int error;
120 if (!error)
121 error = sysfs_create_group(&kernel_subsys.kobj,
122 &kernel_attr_group);
123 127
124 if (!error && notes_size > 0) { 128 kernel_kobj = kobject_create_and_add("kernel", NULL);
125 notes_attr.size = notes_size; 129 if (!kernel_kobj) {
126 error = sysfs_create_bin_file(&kernel_subsys.kobj, 130 error = -ENOMEM;
127 &notes_attr); 131 goto exit;
128 } 132 }
133 error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
134 if (error)
135 goto kset_exit;
129 136
130 /* 137 if (notes_size > 0) {
131 * Create "/sys/kernel/uids" directory and corresponding root user's 138 notes_attr.size = notes_size;
132 * directory under it. 139 error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
133 */ 140 if (error)
134 if (!error) 141 goto group_exit;
135 error = uids_kobject_init(); 142 }
136 143
144 /* create the /sys/kernel/uids/ directory */
145 error = uids_sysfs_init();
146 if (error)
147 goto notes_exit;
148
149 return 0;
150
151notes_exit:
152 if (notes_size > 0)
153 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
154group_exit:
155 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
156kset_exit:
157 kobject_put(kernel_kobj);
158exit:
137 return error; 159 return error;
138} 160}
139 161
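ksysfs.c is converted here from the old decl_subsys()/subsys_attribute interface to plain kobjects with kobj_attribute and kobject_create_and_add(). A hedged sketch of the same pattern for code that wants its own directory under /sys/kernel; every "demo" name below is made up:

#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>

static int demo_value;

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%d\n", demo_value);
}

static ssize_t demo_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	sscanf(buf, "%d", &demo_value);
	return count;
}

static struct kobj_attribute demo_attr =
	__ATTR(demo, 0644, demo_show, demo_store);

static struct attribute *demo_attrs[] = {
	&demo_attr.attr,
	NULL,
};

static struct attribute_group demo_group = {
	.attrs = demo_attrs,
};

static struct kobject *demo_kobj;

static int __init demo_init(void)
{
	/* Creates /sys/kernel/demo/demo, parented on the exported kernel_kobj. */
	demo_kobj = kobject_create_and_add("demo", kernel_kobj);
	if (!demo_kobj)
		return -ENOMEM;
	return sysfs_create_group(demo_kobj, &demo_group);
}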
diff --git a/kernel/kthread.c b/kernel/kthread.c
index dcfe724300eb..0ac887882f90 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -15,6 +15,8 @@
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18#define KTHREAD_NICE_LEVEL (-5)
19
18static DEFINE_SPINLOCK(kthread_create_lock); 20static DEFINE_SPINLOCK(kthread_create_lock);
19static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
20struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create)
94 if (pid < 0) { 96 if (pid < 0) {
95 create->result = ERR_PTR(pid); 97 create->result = ERR_PTR(pid);
96 } else { 98 } else {
99 struct sched_param param = { .sched_priority = 0 };
97 wait_for_completion(&create->started); 100 wait_for_completion(&create->started);
98 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
99 create->result = find_task_by_pid(pid); 102 create->result = find_task_by_pid(pid);
100 read_unlock(&tasklist_lock); 103 read_unlock(&tasklist_lock);
104 /*
105 * root may have changed our (kthreadd's) priority or CPU mask.
106 * The kernel thread should not inherit these properties.
107 */
108 sched_setscheduler(create->result, SCHED_NORMAL, &param);
109 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
110 set_cpus_allowed(create->result, CPU_MASK_ALL);
101 } 111 }
102 complete(&create->done); 112 complete(&create->done);
103} 113}
@@ -221,7 +231,7 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 231 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 232 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 233 ignore_signals(tsk);
224 set_user_nice(tsk, -5); 234 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed(tsk, CPU_MASK_ALL); 235 set_cpus_allowed(tsk, CPU_MASK_ALL);
226 236
227 current->flags |= PF_NOFREEZE; 237 current->flags |= PF_NOFREEZE;
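With the hunk above, threads spawned through kthreadd are explicitly reset to SCHED_NORMAL, nice KTHREAD_NICE_LEVEL (-5) and CPU_MASK_ALL, so they no longer inherit whatever priority or affinity root gave kthreadd. A small sketch of the caller side this applies to; the thread function and name are hypothetical:

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_thread_fn(void *data)
{
	/* Starts at SCHED_NORMAL / nice -5 courtesy of create_kthread() above. */
	while (!kthread_should_stop())
		msleep(100);
	return 0;
}

static void demo_start_thread(void)
{
	demo_task = kthread_run(demo_thread_fn, NULL, "demo-kthread");
	if (IS_ERR(demo_task))
		demo_task = NULL;
}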
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
new file mode 100644
index 000000000000..b4e3c85abe74
--- /dev/null
+++ b/kernel/latencytop.c
@@ -0,0 +1,239 @@
1/*
2 * latencytop.c: Latency display infrastructure
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/latencytop.h>
13#include <linux/kallsyms.h>
14#include <linux/seq_file.h>
15#include <linux/notifier.h>
16#include <linux/spinlock.h>
17#include <linux/proc_fs.h>
18#include <linux/module.h>
19#include <linux/sched.h>
20#include <linux/list.h>
21#include <linux/slab.h>
22#include <linux/stacktrace.h>
23
24static DEFINE_SPINLOCK(latency_lock);
25
26#define MAXLR 128
27static struct latency_record latency_record[MAXLR];
28
29int latencytop_enabled;
30
31void clear_all_latency_tracing(struct task_struct *p)
32{
33 unsigned long flags;
34
35 if (!latencytop_enabled)
36 return;
37
38 spin_lock_irqsave(&latency_lock, flags);
39 memset(&p->latency_record, 0, sizeof(p->latency_record));
40 p->latency_record_count = 0;
41 spin_unlock_irqrestore(&latency_lock, flags);
42}
43
44static void clear_global_latency_tracing(void)
45{
46 unsigned long flags;
47
48 spin_lock_irqsave(&latency_lock, flags);
49 memset(&latency_record, 0, sizeof(latency_record));
50 spin_unlock_irqrestore(&latency_lock, flags);
51}
52
53static void __sched
54account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
55{
56 int firstnonnull = MAXLR + 1;
57 int i;
58
59 if (!latencytop_enabled)
60 return;
61
62 /* skip kernel threads for now */
63 if (!tsk->mm)
64 return;
65
66 for (i = 0; i < MAXLR; i++) {
67 int q;
68 int same = 1;
69 /* Nothing stored: */
70 if (!latency_record[i].backtrace[0]) {
71 if (firstnonnull > i)
72 firstnonnull = i;
73 continue;
74 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
76 if (latency_record[i].backtrace[q] !=
77 lat->backtrace[q])
78 same = 0;
79 if (same && lat->backtrace[q] == 0)
80 break;
81 if (same && lat->backtrace[q] == ULONG_MAX)
82 break;
83 }
84 if (same) {
85 latency_record[i].count++;
86 latency_record[i].time += lat->time;
87 if (lat->time > latency_record[i].max)
88 latency_record[i].max = lat->time;
89 return;
90 }
91 }
92
93 i = firstnonnull;
94 if (i >= MAXLR - 1)
95 return;
96
97 /* Allocated a new one: */
98 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
99}
100
101static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
102{
103 struct stack_trace trace;
104
105 memset(&trace, 0, sizeof(trace));
106 trace.max_entries = LT_BACKTRACEDEPTH;
107 trace.entries = &lat->backtrace[0];
108 trace.skip = 0;
109 save_stack_trace_tsk(tsk, &trace);
110}
111
112void __sched
113account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
114{
115 unsigned long flags;
116 int i, q;
117 struct latency_record lat;
118
119 if (!latencytop_enabled)
120 return;
121
122 /* Long interruptible waits are generally user requested... */
123 if (inter && usecs > 5000)
124 return;
125
126 memset(&lat, 0, sizeof(lat));
127 lat.count = 1;
128 lat.time = usecs;
129 lat.max = usecs;
130 store_stacktrace(tsk, &lat);
131
132 spin_lock_irqsave(&latency_lock, flags);
133
134 account_global_scheduler_latency(tsk, &lat);
135
136 /*
137 * short term hack; once we hit LT_SAVECOUNT (32) records we stop; in the future we will recycle them:
138 */
139 tsk->latency_record_count++;
140 if (tsk->latency_record_count >= LT_SAVECOUNT)
141 goto out_unlock;
142
143 for (i = 0; i < LT_SAVECOUNT ; i++) {
144 struct latency_record *mylat;
145 int same = 1;
146 mylat = &tsk->latency_record[i];
147 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
148 if (mylat->backtrace[q] !=
149 lat.backtrace[q])
150 same = 0;
151 if (same && lat.backtrace[q] == 0)
152 break;
153 if (same && lat.backtrace[q] == ULONG_MAX)
154 break;
155 }
156 if (same) {
157 mylat->count++;
158 mylat->time += lat.time;
159 if (lat.time > mylat->max)
160 mylat->max = lat.time;
161 goto out_unlock;
162 }
163 }
164
165 /* Allocated a new one: */
166 i = tsk->latency_record_count;
167 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
168
169out_unlock:
170 spin_unlock_irqrestore(&latency_lock, flags);
171}
172
173static int lstats_show(struct seq_file *m, void *v)
174{
175 int i;
176
177 seq_puts(m, "Latency Top version : v0.1\n");
178
179 for (i = 0; i < MAXLR; i++) {
180 if (latency_record[i].backtrace[0]) {
181 int q;
182 seq_printf(m, "%i %li %li ",
183 latency_record[i].count,
184 latency_record[i].time,
185 latency_record[i].max);
186 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
187 char sym[KSYM_NAME_LEN];
188 char *c;
189 if (!latency_record[i].backtrace[q])
190 break;
191 if (latency_record[i].backtrace[q] == ULONG_MAX)
192 break;
193 sprint_symbol(sym, latency_record[i].backtrace[q]);
194 c = strchr(sym, '+');
195 if (c)
196 *c = 0;
197 seq_printf(m, "%s ", sym);
198 }
199 seq_printf(m, "\n");
200 }
201 }
202 return 0;
203}
204
205static ssize_t
206lstats_write(struct file *file, const char __user *buf, size_t count,
207 loff_t *offs)
208{
209 clear_global_latency_tracing();
210
211 return count;
212}
213
214static int lstats_open(struct inode *inode, struct file *filp)
215{
216 return single_open(filp, lstats_show, NULL);
217}
218
219static struct file_operations lstats_fops = {
220 .open = lstats_open,
221 .read = seq_read,
222 .write = lstats_write,
223 .llseek = seq_lseek,
224 .release = single_release,
225};
226
227static int __init init_lstats_procfs(void)
228{
229 struct proc_dir_entry *pe;
230
231 pe = create_proc_entry("latency_stats", 0644, NULL);
232 if (!pe)
233 return -ENOMEM;
234
235 pe->proc_fops = &lstats_fops;
236
237 return 0;
238}
239__initcall(init_lstats_procfs);
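
The new file publishes its global records through /proc/latency_stats: lstats_show() prints a version line followed by one "count time max backtrace..." line per record, and lstats_write() clears everything on any write. A minimal userspace consumer, assuming a kernel built with this patch and latency tracing enabled:

/* Minimal reader/clearer for /proc/latency_stats (assumes this patch);
 * clearing needs write permission on the proc file. */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char line[4096];
	FILE *f;

	if (argc > 1 && !strcmp(argv[1], "--clear")) {
		f = fopen("/proc/latency_stats", "w");
		if (!f) { perror("open for write"); return 1; }
		fputs("clear\n", f);	/* any write resets the records */
		fclose(f);
		return 0;
	}

	f = fopen("/proc/latency_stats", "r");
	if (!f) { perror("/proc/latency_stats"); return 1; }
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* "count time max backtrace..." per record */
	fclose(f);
	return 0;
}
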
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 4335f12a27c6..3574379f4d62 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2932,7 +2932,7 @@ static void zap_class(struct lock_class *class)
2932 2932
2933} 2933}
2934 2934
2935static inline int within(void *addr, void *start, unsigned long size) 2935static inline int within(const void *addr, void *start, unsigned long size)
2936{ 2936{
2937 return addr >= start && addr < start + size; 2937 return addr >= start && addr < start + size;
2938} 2938}
@@ -2955,9 +2955,12 @@ void lockdep_free_key_range(void *start, unsigned long size)
2955 head = classhash_table + i; 2955 head = classhash_table + i;
2956 if (list_empty(head)) 2956 if (list_empty(head))
2957 continue; 2957 continue;
2958 list_for_each_entry_safe(class, next, head, hash_entry) 2958 list_for_each_entry_safe(class, next, head, hash_entry) {
2959 if (within(class->key, start, size)) 2959 if (within(class->key, start, size))
2960 zap_class(class); 2960 zap_class(class);
2961 else if (within(class->name, start, size))
2962 zap_class(class);
2963 }
2961 } 2964 }
2962 2965
2963 if (locked) 2966 if (locked)
@@ -3203,7 +3206,11 @@ retry:
3203 3206
3204EXPORT_SYMBOL_GPL(debug_show_all_locks); 3207EXPORT_SYMBOL_GPL(debug_show_all_locks);
3205 3208
3206void debug_show_held_locks(struct task_struct *task) 3209/*
3210 * Careful: only use this function if you are sure that
3211 * the task cannot run in parallel!
3212 */
3213void __debug_show_held_locks(struct task_struct *task)
3207{ 3214{
3208 if (unlikely(!debug_locks)) { 3215 if (unlikely(!debug_locks)) {
3209 printk("INFO: lockdep is turned off.\n"); 3216 printk("INFO: lockdep is turned off.\n");
@@ -3211,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
3211 } 3218 }
3212 lockdep_print_held_locks(task); 3219 lockdep_print_held_locks(task);
3213} 3220}
3221EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3222
3223void debug_show_held_locks(struct task_struct *task)
3224{
3225 __debug_show_held_locks(task);
3226}
3214 3227
3215EXPORT_SYMBOL_GPL(debug_show_held_locks); 3228EXPORT_SYMBOL_GPL(debug_show_held_locks);
3216 3229
diff --git a/kernel/module.c b/kernel/module.c
index c2e3e2e98801..bd60278ee703 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,8 +47,6 @@
47#include <asm/cacheflush.h> 47#include <asm/cacheflush.h>
48#include <linux/license.h> 48#include <linux/license.h>
49 49
50extern int module_sysfs_initialized;
51
52#if 0 50#if 0
53#define DEBUGP printk 51#define DEBUGP printk
54#else 52#else
@@ -67,6 +65,9 @@ extern int module_sysfs_initialized;
67static DEFINE_MUTEX(module_mutex); 65static DEFINE_MUTEX(module_mutex);
68static LIST_HEAD(modules); 66static LIST_HEAD(modules);
69 67
68/* Waiting for a module to finish initializing? */
69static DECLARE_WAIT_QUEUE_HEAD(module_wq);
70
70static BLOCKING_NOTIFIER_HEAD(module_notify_list); 71static BLOCKING_NOTIFIER_HEAD(module_notify_list);
71 72
72int register_module_notifier(struct notifier_block * nb) 73int register_module_notifier(struct notifier_block * nb)
@@ -86,8 +87,11 @@ EXPORT_SYMBOL(unregister_module_notifier);
86static inline int strong_try_module_get(struct module *mod) 87static inline int strong_try_module_get(struct module *mod)
87{ 88{
88 if (mod && mod->state == MODULE_STATE_COMING) 89 if (mod && mod->state == MODULE_STATE_COMING)
90 return -EBUSY;
91 if (try_module_get(mod))
89 return 0; 92 return 0;
90 return try_module_get(mod); 93 else
94 return -ENOENT;
91} 95}
92 96
93static inline void add_taint_module(struct module *mod, unsigned flag) 97static inline void add_taint_module(struct module *mod, unsigned flag)
@@ -426,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
426 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 430 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
427} 431}
428 432
433static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
434{
435 int cpu;
436
437 for_each_possible_cpu(cpu)
438 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
439}
440
429static int percpu_modinit(void) 441static int percpu_modinit(void)
430{ 442{
431 pcpu_num_used = 2; 443 pcpu_num_used = 2;
@@ -498,6 +510,8 @@ static struct module_attribute modinfo_##field = { \
498MODINFO_ATTR(version); 510MODINFO_ATTR(version);
499MODINFO_ATTR(srcversion); 511MODINFO_ATTR(srcversion);
500 512
513static char last_unloaded_module[MODULE_NAME_LEN+1];
514
501#ifdef CONFIG_MODULE_UNLOAD 515#ifdef CONFIG_MODULE_UNLOAD
502/* Init the unload section of the module. */ 516/* Init the unload section of the module. */
503static void module_unload_init(struct module *mod) 517static void module_unload_init(struct module *mod)
@@ -539,11 +553,21 @@ static int already_uses(struct module *a, struct module *b)
539static int use_module(struct module *a, struct module *b) 553static int use_module(struct module *a, struct module *b)
540{ 554{
541 struct module_use *use; 555 struct module_use *use;
542 int no_warn; 556 int no_warn, err;
543 557
544 if (b == NULL || already_uses(a, b)) return 1; 558 if (b == NULL || already_uses(a, b)) return 1;
545 559
546 if (!strong_try_module_get(b)) 560 /* If we're interrupted or time out, we fail. */
561 if (wait_event_interruptible_timeout(
562 module_wq, (err = strong_try_module_get(b)) != -EBUSY,
563 30 * HZ) <= 0) {
564 printk("%s: gave up waiting for init of module %s.\n",
565 a->name, b->name);
566 return 0;
567 }
568
569 /* If strong_try_module_get() returned a different error, we fail. */
570 if (err)
547 return 0; 571 return 0;
548 572
549 DEBUGP("Allocating new usage for %s.\n", a->name); 573 DEBUGP("Allocating new usage for %s.\n", a->name);
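
use_module() now blocks for up to 30 seconds on module_wq until the target module leaves MODULE_STATE_COMING instead of failing immediately; sys_init_module() wakes the queue further down in both its success and failure paths. A stand-alone sketch of the same wait/wake pairing as a toy module (all demo_* names are invented; only the wait_event_interruptible_timeout()/wake_up() usage mirrors the patch):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/delay.h>
#include <linux/err.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;

static int demo_waiter(void *unused)
{
	/* <= 0 means interrupted or timed out, the same check use_module() does */
	if (wait_event_interruptible_timeout(demo_wq, demo_ready, 30 * HZ) <= 0)
		printk(KERN_INFO "demo: gave up waiting\n");
	else
		printk(KERN_INFO "demo: condition became true\n");
	return 0;
}

static int __init demo_init(void)
{
	struct task_struct *waiter;

	waiter = kthread_run(demo_waiter, NULL, "demo_waiter");
	if (IS_ERR(waiter))
		return PTR_ERR(waiter);

	msleep(100);		/* let the waiter go to sleep first */
	demo_ready = 1;
	wake_up(&demo_wq);	/* mirrors wake_up(&module_wq) in sys_init_module() */
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
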
@@ -721,6 +745,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
721 mod->exit(); 745 mod->exit();
722 mutex_lock(&module_mutex); 746 mutex_lock(&module_mutex);
723 } 747 }
748 /* Store the name of the last unloaded module for diagnostic purposes */
749 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
724 free_module(mod); 750 free_module(mod);
725 751
726 out: 752 out:
@@ -814,7 +840,7 @@ static inline void module_unload_free(struct module *mod)
814 840
815static inline int use_module(struct module *a, struct module *b) 841static inline int use_module(struct module *a, struct module *b)
816{ 842{
817 return strong_try_module_get(b); 843 return strong_try_module_get(b) == 0;
818} 844}
819 845
820static inline void module_unload_init(struct module *mod) 846static inline void module_unload_init(struct module *mod)
@@ -1122,7 +1148,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1122 ++loaded; 1148 ++loaded;
1123 } 1149 }
1124 1150
1125 notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes"); 1151 notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
1126 if (!notes_attrs->dir) 1152 if (!notes_attrs->dir)
1127 goto out; 1153 goto out;
1128 1154
@@ -1212,6 +1238,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1212int mod_sysfs_init(struct module *mod) 1238int mod_sysfs_init(struct module *mod)
1213{ 1239{
1214 int err; 1240 int err;
1241 struct kobject *kobj;
1215 1242
1216 if (!module_sysfs_initialized) { 1243 if (!module_sysfs_initialized) {
1217 printk(KERN_ERR "%s: module sysfs not initialized\n", 1244 printk(KERN_ERR "%s: module sysfs not initialized\n",
@@ -1219,15 +1246,25 @@ int mod_sysfs_init(struct module *mod)
1219 err = -EINVAL; 1246 err = -EINVAL;
1220 goto out; 1247 goto out;
1221 } 1248 }
1222 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); 1249
1223 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); 1250 kobj = kset_find_obj(module_kset, mod->name);
1224 if (err) 1251 if (kobj) {
1252 printk(KERN_ERR "%s: module is already loaded\n", mod->name);
1253 kobject_put(kobj);
1254 err = -EINVAL;
1225 goto out; 1255 goto out;
1226 kobj_set_kset_s(&mod->mkobj, module_subsys); 1256 }
1257
1227 mod->mkobj.mod = mod; 1258 mod->mkobj.mod = mod;
1228 1259
1229 kobject_init(&mod->mkobj.kobj); 1260 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
1261 mod->mkobj.kobj.kset = module_kset;
1262 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
1263 "%s", mod->name);
1264 if (err)
1265 kobject_put(&mod->mkobj.kobj);
1230 1266
1267 /* delay uevent until full sysfs population */
1231out: 1268out:
1232 return err; 1269 return err;
1233} 1270}
@@ -1238,12 +1275,7 @@ int mod_sysfs_setup(struct module *mod,
1238{ 1275{
1239 int err; 1276 int err;
1240 1277
1241 /* delay uevent until full sysfs population */ 1278 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1242 err = kobject_add(&mod->mkobj.kobj);
1243 if (err)
1244 goto out;
1245
1246 mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders");
1247 if (!mod->holders_dir) { 1279 if (!mod->holders_dir) {
1248 err = -ENOMEM; 1280 err = -ENOMEM;
1249 goto out_unreg; 1281 goto out_unreg;
@@ -1263,11 +1295,9 @@ int mod_sysfs_setup(struct module *mod,
1263out_unreg_param: 1295out_unreg_param:
1264 module_param_sysfs_remove(mod); 1296 module_param_sysfs_remove(mod);
1265out_unreg_holders: 1297out_unreg_holders:
1266 kobject_unregister(mod->holders_dir); 1298 kobject_put(mod->holders_dir);
1267out_unreg: 1299out_unreg:
1268 kobject_del(&mod->mkobj.kobj);
1269 kobject_put(&mod->mkobj.kobj); 1300 kobject_put(&mod->mkobj.kobj);
1270out:
1271 return err; 1301 return err;
1272} 1302}
1273#endif 1303#endif
@@ -1276,9 +1306,20 @@ static void mod_kobject_remove(struct module *mod)
1276{ 1306{
1277 module_remove_modinfo_attrs(mod); 1307 module_remove_modinfo_attrs(mod);
1278 module_param_sysfs_remove(mod); 1308 module_param_sysfs_remove(mod);
1279 kobject_unregister(mod->mkobj.drivers_dir); 1309 kobject_put(mod->mkobj.drivers_dir);
1280 kobject_unregister(mod->holders_dir); 1310 kobject_put(mod->holders_dir);
1281 kobject_unregister(&mod->mkobj.kobj); 1311 kobject_put(&mod->mkobj.kobj);
1312}
1313
1314/*
 1315 * link the module in while the whole machine is stopped with interrupts off
1316 * - this defends against kallsyms not taking locks
1317 */
1318static int __link_module(void *_mod)
1319{
1320 struct module *mod = _mod;
1321 list_add(&mod->list, &modules);
1322 return 0;
1282} 1323}
1283 1324
1284/* 1325/*
@@ -1330,7 +1371,7 @@ void *__symbol_get(const char *symbol)
1330 1371
1331 preempt_disable(); 1372 preempt_disable();
1332 value = __find_symbol(symbol, &owner, &crc, 1); 1373 value = __find_symbol(symbol, &owner, &crc, 1);
1333 if (value && !strong_try_module_get(owner)) 1374 if (value && strong_try_module_get(owner) != 0)
1334 value = 0; 1375 value = 0;
1335 preempt_enable(); 1376 preempt_enable();
1336 1377
@@ -1884,16 +1925,16 @@ static struct module *load_module(void __user *umod,
1884 /* Now we've moved module, initialize linked lists, etc. */ 1925 /* Now we've moved module, initialize linked lists, etc. */
1885 module_unload_init(mod); 1926 module_unload_init(mod);
1886 1927
1887 /* Initialize kobject, so we can reference it. */ 1928 /* add kobject, so we can reference it. */
1888 err = mod_sysfs_init(mod); 1929 err = mod_sysfs_init(mod);
1889 if (err) 1930 if (err)
1890 goto cleanup; 1931 goto free_unload;
1891 1932
1892 /* Set up license info based on the info section */ 1933 /* Set up license info based on the info section */
1893 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1934 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1894 1935
1895 if (strcmp(mod->name, "ndiswrapper") == 0) 1936 if (strcmp(mod->name, "ndiswrapper") == 0)
1896 add_taint(TAINT_PROPRIETARY_MODULE); 1937 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1897 if (strcmp(mod->name, "driverloader") == 0) 1938 if (strcmp(mod->name, "driverloader") == 0)
1898 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1939 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1899 1940
@@ -2023,6 +2064,11 @@ static struct module *load_module(void __user *umod,
2023 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2064 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2024 mod->name); 2065 mod->name);
2025 2066
2067 /* Now sew it into the lists so we can get lockdep and oops
 2068 * info during argument parsing. No one should access us, since
2069 * strong_try_module_get() will fail. */
2070 stop_machine_run(__link_module, mod, NR_CPUS);
2071
2026 /* Size of section 0 is 0, so this works well if no params */ 2072 /* Size of section 0 is 0, so this works well if no params */
2027 err = parse_args(mod->name, mod->args, 2073 err = parse_args(mod->name, mod->args,
2028 (struct kernel_param *) 2074 (struct kernel_param *)
@@ -2031,7 +2077,7 @@ static struct module *load_module(void __user *umod,
2031 / sizeof(struct kernel_param), 2077 / sizeof(struct kernel_param),
2032 NULL); 2078 NULL);
2033 if (err < 0) 2079 if (err < 0)
2034 goto arch_cleanup; 2080 goto unlink;
2035 2081
2036 err = mod_sysfs_setup(mod, 2082 err = mod_sysfs_setup(mod,
2037 (struct kernel_param *) 2083 (struct kernel_param *)
@@ -2039,7 +2085,7 @@ static struct module *load_module(void __user *umod,
2039 sechdrs[setupindex].sh_size 2085 sechdrs[setupindex].sh_size
2040 / sizeof(struct kernel_param)); 2086 / sizeof(struct kernel_param));
2041 if (err < 0) 2087 if (err < 0)
2042 goto arch_cleanup; 2088 goto unlink;
2043 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2089 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2044 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2090 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2045 2091
@@ -2054,9 +2100,13 @@ static struct module *load_module(void __user *umod,
2054 /* Done! */ 2100 /* Done! */
2055 return mod; 2101 return mod;
2056 2102
2057 arch_cleanup: 2103 unlink:
2104 stop_machine_run(__unlink_module, mod, NR_CPUS);
2058 module_arch_cleanup(mod); 2105 module_arch_cleanup(mod);
2059 cleanup: 2106 cleanup:
2107 kobject_del(&mod->mkobj.kobj);
2108 kobject_put(&mod->mkobj.kobj);
2109 free_unload:
2060 module_unload_free(mod); 2110 module_unload_free(mod);
2061 module_free(mod, mod->module_init); 2111 module_free(mod, mod->module_init);
2062 free_core: 2112 free_core:
@@ -2076,17 +2126,6 @@ static struct module *load_module(void __user *umod,
2076 goto free_hdr; 2126 goto free_hdr;
2077} 2127}
2078 2128
2079/*
2080 * link the module with the whole machine is stopped with interrupts off
2081 * - this defends against kallsyms not taking locks
2082 */
2083static int __link_module(void *_mod)
2084{
2085 struct module *mod = _mod;
2086 list_add(&mod->list, &modules);
2087 return 0;
2088}
2089
2090/* This is where the real work happens */ 2129/* This is where the real work happens */
2091asmlinkage long 2130asmlinkage long
2092sys_init_module(void __user *umod, 2131sys_init_module(void __user *umod,
@@ -2111,10 +2150,6 @@ sys_init_module(void __user *umod,
2111 return PTR_ERR(mod); 2150 return PTR_ERR(mod);
2112 } 2151 }
2113 2152
2114 /* Now sew it into the lists. They won't access us, since
2115 strong_try_module_get() will fail. */
2116 stop_machine_run(__link_module, mod, NR_CPUS);
2117
2118 /* Drop lock so they can recurse */ 2153 /* Drop lock so they can recurse */
2119 mutex_unlock(&module_mutex); 2154 mutex_unlock(&module_mutex);
2120 2155
@@ -2133,6 +2168,7 @@ sys_init_module(void __user *umod,
2133 mutex_lock(&module_mutex); 2168 mutex_lock(&module_mutex);
2134 free_module(mod); 2169 free_module(mod);
2135 mutex_unlock(&module_mutex); 2170 mutex_unlock(&module_mutex);
2171 wake_up(&module_wq);
2136 return ret; 2172 return ret;
2137 } 2173 }
2138 2174
@@ -2147,6 +2183,7 @@ sys_init_module(void __user *umod,
2147 mod->init_size = 0; 2183 mod->init_size = 0;
2148 mod->init_text_size = 0; 2184 mod->init_text_size = 0;
2149 mutex_unlock(&module_mutex); 2185 mutex_unlock(&module_mutex);
2186 wake_up(&module_wq);
2150 2187
2151 return 0; 2188 return 0;
2152} 2189}
@@ -2211,14 +2248,13 @@ static const char *get_ksymbol(struct module *mod,
2211 return mod->strtab + mod->symtab[best].st_name; 2248 return mod->strtab + mod->symtab[best].st_name;
2212} 2249}
2213 2250
2214/* For kallsyms to ask for address resolution. NULL means not found. 2251/* For kallsyms to ask for address resolution. NULL means not found. Careful
2215 We don't lock, as this is used for oops resolution and races are a 2252 * not to lock to avoid deadlock on oopses, simply disable preemption. */
2216 lesser concern. */ 2253char *module_address_lookup(unsigned long addr,
2217/* FIXME: Risky: returns a pointer into a module w/o lock */ 2254 unsigned long *size,
2218const char *module_address_lookup(unsigned long addr, 2255 unsigned long *offset,
2219 unsigned long *size, 2256 char **modname,
2220 unsigned long *offset, 2257 char *namebuf)
2221 char **modname)
2222{ 2258{
2223 struct module *mod; 2259 struct module *mod;
2224 const char *ret = NULL; 2260 const char *ret = NULL;
@@ -2233,8 +2269,13 @@ const char *module_address_lookup(unsigned long addr,
2233 break; 2269 break;
2234 } 2270 }
2235 } 2271 }
2272 /* Make a copy in here where it's safe */
2273 if (ret) {
2274 strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
2275 ret = namebuf;
2276 }
2236 preempt_enable(); 2277 preempt_enable();
2237 return ret; 2278 return (char *)ret;
2238} 2279}
2239 2280
2240int lookup_module_symbol_name(unsigned long addr, char *symname) 2281int lookup_module_symbol_name(unsigned long addr, char *symname)
@@ -2362,21 +2403,30 @@ static void m_stop(struct seq_file *m, void *p)
2362 mutex_unlock(&module_mutex); 2403 mutex_unlock(&module_mutex);
2363} 2404}
2364 2405
2365static char *taint_flags(unsigned int taints, char *buf) 2406static char *module_flags(struct module *mod, char *buf)
2366{ 2407{
2367 int bx = 0; 2408 int bx = 0;
2368 2409
2369 if (taints) { 2410 if (mod->taints ||
2411 mod->state == MODULE_STATE_GOING ||
2412 mod->state == MODULE_STATE_COMING) {
2370 buf[bx++] = '('; 2413 buf[bx++] = '(';
2371 if (taints & TAINT_PROPRIETARY_MODULE) 2414 if (mod->taints & TAINT_PROPRIETARY_MODULE)
2372 buf[bx++] = 'P'; 2415 buf[bx++] = 'P';
2373 if (taints & TAINT_FORCED_MODULE) 2416 if (mod->taints & TAINT_FORCED_MODULE)
2374 buf[bx++] = 'F'; 2417 buf[bx++] = 'F';
2375 /* 2418 /*
2376 * TAINT_FORCED_RMMOD: could be added. 2419 * TAINT_FORCED_RMMOD: could be added.
2377 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 2420 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
2378 * apply to modules. 2421 * apply to modules.
2379 */ 2422 */
2423
2424 /* Show a - for module-is-being-unloaded */
2425 if (mod->state == MODULE_STATE_GOING)
2426 buf[bx++] = '-';
2427 /* Show a + for module-is-being-loaded */
2428 if (mod->state == MODULE_STATE_COMING)
2429 buf[bx++] = '+';
2380 buf[bx++] = ')'; 2430 buf[bx++] = ')';
2381 } 2431 }
2382 buf[bx] = '\0'; 2432 buf[bx] = '\0';
@@ -2403,7 +2453,7 @@ static int m_show(struct seq_file *m, void *p)
2403 2453
2404 /* Taints info */ 2454 /* Taints info */
2405 if (mod->taints) 2455 if (mod->taints)
2406 seq_printf(m, " %s", taint_flags(mod->taints, buf)); 2456 seq_printf(m, " %s", module_flags(mod, buf));
2407 2457
2408 seq_printf(m, "\n"); 2458 seq_printf(m, "\n");
2409 return 0; 2459 return 0;
@@ -2498,97 +2548,12 @@ void print_modules(void)
2498 2548
2499 printk("Modules linked in:"); 2549 printk("Modules linked in:");
2500 list_for_each_entry(mod, &modules, list) 2550 list_for_each_entry(mod, &modules, list)
2501 printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); 2551 printk(" %s%s", mod->name, module_flags(mod, buf));
2552 if (last_unloaded_module[0])
2553 printk(" [last unloaded: %s]", last_unloaded_module);
2502 printk("\n"); 2554 printk("\n");
2503} 2555}
2504 2556
2505#ifdef CONFIG_SYSFS
2506static char *make_driver_name(struct device_driver *drv)
2507{
2508 char *driver_name;
2509
2510 driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2,
2511 GFP_KERNEL);
2512 if (!driver_name)
2513 return NULL;
2514
2515 sprintf(driver_name, "%s:%s", drv->bus->name, drv->name);
2516 return driver_name;
2517}
2518
2519static void module_create_drivers_dir(struct module_kobject *mk)
2520{
2521 if (!mk || mk->drivers_dir)
2522 return;
2523
2524 mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers");
2525}
2526
2527void module_add_driver(struct module *mod, struct device_driver *drv)
2528{
2529 char *driver_name;
2530 int no_warn;
2531 struct module_kobject *mk = NULL;
2532
2533 if (!drv)
2534 return;
2535
2536 if (mod)
2537 mk = &mod->mkobj;
2538 else if (drv->mod_name) {
2539 struct kobject *mkobj;
2540
2541 /* Lookup built-in module entry in /sys/modules */
2542 mkobj = kset_find_obj(&module_subsys, drv->mod_name);
2543 if (mkobj) {
2544 mk = container_of(mkobj, struct module_kobject, kobj);
2545 /* remember our module structure */
2546 drv->mkobj = mk;
2547 /* kset_find_obj took a reference */
2548 kobject_put(mkobj);
2549 }
2550 }
2551
2552 if (!mk)
2553 return;
2554
2555 /* Don't check return codes; these calls are idempotent */
2556 no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module");
2557 driver_name = make_driver_name(drv);
2558 if (driver_name) {
2559 module_create_drivers_dir(mk);
2560 no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj,
2561 driver_name);
2562 kfree(driver_name);
2563 }
2564}
2565EXPORT_SYMBOL(module_add_driver);
2566
2567void module_remove_driver(struct device_driver *drv)
2568{
2569 struct module_kobject *mk = NULL;
2570 char *driver_name;
2571
2572 if (!drv)
2573 return;
2574
2575 sysfs_remove_link(&drv->kobj, "module");
2576
2577 if (drv->owner)
2578 mk = &drv->owner->mkobj;
2579 else if (drv->mkobj)
2580 mk = drv->mkobj;
2581 if (mk && mk->drivers_dir) {
2582 driver_name = make_driver_name(drv);
2583 if (driver_name) {
2584 sysfs_remove_link(mk->drivers_dir, driver_name);
2585 kfree(driver_name);
2586 }
2587 }
2588}
2589EXPORT_SYMBOL(module_remove_driver);
2590#endif
2591
2592#ifdef CONFIG_MODVERSIONS 2557#ifdef CONFIG_MODVERSIONS
2593/* Generate the signature for struct module here, too, for modversions. */ 2558/* Generate the signature for struct module here, too, for modversions. */
2594void struct_module(struct module *mod) { return; } 2559void struct_module(struct module *mod) { return; }
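
With module_flags() replacing taint_flags(), the optional parenthesised field at the end of each /proc/modules line (and of the oops "Modules linked in:" list) can now carry '+' for a module still loading and '-' for one being unloaded, next to the taint letters, and print_modules() appends the name of the last unloaded module. A small userspace sketch that prints each module name together with that flag field, assuming the /proc/modules layout produced by m_show() above:

/* Print module names from /proc/modules with the optional trailing
 * flag field, e.g. "(P)" or "(F+)". */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/modules", "r");

	if (!f) { perror("/proc/modules"); return 1; }
	while (fgets(line, sizeof(line), f)) {
		char name[64] = "", *flags;

		line[strcspn(line, "\n")] = '\0';
		sscanf(line, "%63s", name);
		flags = strchr(line, '(');	/* taint/state flags, if any */
		printf("%-24s %s\n", name, flags ? flags : "");
	}
	fclose(f);
	return 0;
}
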
diff --git a/kernel/panic.c b/kernel/panic.c
index da4d6bac270e..d9e90cfe3298 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,6 +20,7 @@
20#include <linux/kexec.h> 20#include <linux/kexec.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h>
23 24
24int panic_on_oops; 25int panic_on_oops;
25int tainted; 26int tainted;
@@ -280,6 +281,13 @@ static int init_oops_id(void)
280} 281}
281late_initcall(init_oops_id); 282late_initcall(init_oops_id);
282 283
284static void print_oops_end_marker(void)
285{
286 init_oops_id();
287 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
288 (unsigned long long)oops_id);
289}
290
283/* 291/*
284 * Called when the architecture exits its oops handler, after printing 292 * Called when the architecture exits its oops handler, after printing
285 * everything. 293 * everything.
@@ -287,11 +295,26 @@ late_initcall(init_oops_id);
287void oops_exit(void) 295void oops_exit(void)
288{ 296{
289 do_oops_enter_exit(); 297 do_oops_enter_exit();
290 init_oops_id(); 298 print_oops_end_marker();
291 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
292 (unsigned long long)oops_id);
293} 299}
294 300
301#ifdef WANT_WARN_ON_SLOWPATH
302void warn_on_slowpath(const char *file, int line)
303{
304 char function[KSYM_SYMBOL_LEN];
305 unsigned long caller = (unsigned long) __builtin_return_address(0);
306 sprint_symbol(function, caller);
307
308 printk(KERN_WARNING "------------[ cut here ]------------\n");
309 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
310 line, function);
311 print_modules();
312 dump_stack();
313 print_oops_end_marker();
314}
315EXPORT_SYMBOL(warn_on_slowpath);
316#endif
317
295#ifdef CONFIG_CC_STACKPROTECTOR 318#ifdef CONFIG_CC_STACKPROTECTOR
296/* 319/*
297 * Called when gcc's -fstack-protector feature is used, and 320 * Called when gcc's -fstack-protector feature is used, and
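
warn_on_slowpath() gives architectures that define WANT_WARN_ON_SLOWPATH an out-of-line WARN_ON() body: the cut-here banner, file:line plus the calling function resolved via sprint_symbol(), the module list, a stack dump and the oops end marker. Roughly how a generic WARN_ON() could be built on top of it (a sketch only; the real asm-generic/bug.h definition may differ in detail):

/* Sketch, not the exact asm-generic/bug.h text. */
#ifdef WANT_WARN_ON_SLOWPATH
#define WARN_ON(condition) ({					\
	int __ret_warn_on = !!(condition);			\
	if (unlikely(__ret_warn_on))				\
		warn_on_slowpath(__FILE__, __LINE__);		\
	unlikely(__ret_warn_on);				\
})
#endif
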
diff --git a/kernel/params.c b/kernel/params.c
index 7686417ee00e..42fe5e6126c0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp)
376 376
377extern struct kernel_param __start___param[], __stop___param[]; 377extern struct kernel_param __start___param[], __stop___param[];
378 378
379#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
380
381struct param_attribute 379struct param_attribute
382{ 380{
383 struct module_attribute mattr; 381 struct module_attribute mattr;
@@ -472,7 +470,7 @@ param_sysfs_setup(struct module_kobject *mk,
472 sizeof(mp->grp.attrs[0])); 470 sizeof(mp->grp.attrs[0]));
473 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); 471 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
474 472
475 mp = kmalloc(size[0] + size[1], GFP_KERNEL); 473 mp = kzalloc(size[0] + size[1], GFP_KERNEL);
476 if (!mp) 474 if (!mp)
477 return ERR_PTR(-ENOMEM); 475 return ERR_PTR(-ENOMEM);
478 476
@@ -560,11 +558,10 @@ static void __init kernel_param_sysfs_setup(const char *name,
560 BUG_ON(!mk); 558 BUG_ON(!mk);
561 559
562 mk->mod = THIS_MODULE; 560 mk->mod = THIS_MODULE;
563 kobj_set_kset_s(mk, module_subsys); 561 mk->kobj.kset = module_kset;
564 kobject_set_name(&mk->kobj, name); 562 ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
565 kobject_init(&mk->kobj);
566 ret = kobject_add(&mk->kobj);
567 if (ret) { 563 if (ret) {
564 kobject_put(&mk->kobj);
568 printk(KERN_ERR "Module '%s' failed to be added to sysfs, " 565 printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
569 "error number %d\n", name, ret); 566 "error number %d\n", name, ret);
570 printk(KERN_ERR "The system will be unstable now.\n"); 567 printk(KERN_ERR "The system will be unstable now.\n");
@@ -588,7 +585,7 @@ static void __init param_sysfs_builtin(void)
588{ 585{
589 struct kernel_param *kp, *kp_begin = NULL; 586 struct kernel_param *kp, *kp_begin = NULL;
590 unsigned int i, name_len, count = 0; 587 unsigned int i, name_len, count = 0;
591 char modname[MAX_KBUILD_MODNAME + 1] = ""; 588 char modname[MODULE_NAME_LEN + 1] = "";
592 589
593 for (i=0; i < __stop___param - __start___param; i++) { 590 for (i=0; i < __stop___param - __start___param; i++) {
594 char *dot; 591 char *dot;
@@ -596,12 +593,12 @@ static void __init param_sysfs_builtin(void)
596 593
597 kp = &__start___param[i]; 594 kp = &__start___param[i];
598 max_name_len = 595 max_name_len =
599 min_t(size_t, MAX_KBUILD_MODNAME, strlen(kp->name)); 596 min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
600 597
601 dot = memchr(kp->name, '.', max_name_len); 598 dot = memchr(kp->name, '.', max_name_len);
602 if (!dot) { 599 if (!dot) {
603 DEBUGP("couldn't find period in first %d characters " 600 DEBUGP("couldn't find period in first %d characters "
604 "of %s\n", MAX_KBUILD_MODNAME, kp->name); 601 "of %s\n", MODULE_NAME_LEN, kp->name);
605 continue; 602 continue;
606 } 603 }
607 name_len = dot - kp->name; 604 name_len = dot - kp->name;
@@ -679,8 +676,6 @@ static struct sysfs_ops module_sysfs_ops = {
679 .store = module_attr_store, 676 .store = module_attr_store,
680}; 677};
681 678
682static struct kobj_type module_ktype;
683
684static int uevent_filter(struct kset *kset, struct kobject *kobj) 679static int uevent_filter(struct kset *kset, struct kobject *kobj)
685{ 680{
686 struct kobj_type *ktype = get_ktype(kobj); 681 struct kobj_type *ktype = get_ktype(kobj);
@@ -694,21 +689,11 @@ static struct kset_uevent_ops module_uevent_ops = {
694 .filter = uevent_filter, 689 .filter = uevent_filter,
695}; 690};
696 691
697decl_subsys(module, &module_ktype, &module_uevent_ops); 692struct kset *module_kset;
698int module_sysfs_initialized; 693int module_sysfs_initialized;
699 694
700static void module_release(struct kobject *kobj) 695struct kobj_type module_ktype = {
701{
702 /*
703 * Stupid empty release function to allow the memory for the kobject to
704 * be properly cleaned up. This will not need to be present for 2.6.25
705 * with the upcoming kobject core rework.
706 */
707}
708
709static struct kobj_type module_ktype = {
710 .sysfs_ops = &module_sysfs_ops, 696 .sysfs_ops = &module_sysfs_ops,
711 .release = module_release,
712}; 697};
713 698
714/* 699/*
@@ -716,13 +701,11 @@ static struct kobj_type module_ktype = {
716 */ 701 */
717static int __init param_sysfs_init(void) 702static int __init param_sysfs_init(void)
718{ 703{
719 int ret; 704 module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
720 705 if (!module_kset) {
721 ret = subsystem_register(&module_subsys); 706 printk(KERN_WARNING "%s (%d): error creating kset\n",
722 if (ret < 0) { 707 __FILE__, __LINE__);
723 printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", 708 return -ENOMEM;
724 __FILE__, __LINE__, ret);
725 return ret;
726 } 709 }
727 module_sysfs_initialized = 1; 710 module_sysfs_initialized = 1;
728 711
@@ -732,14 +715,7 @@ static int __init param_sysfs_init(void)
732} 715}
733subsys_initcall(param_sysfs_init); 716subsys_initcall(param_sysfs_init);
734 717
735#else 718#endif /* CONFIG_SYSFS */
736#if 0
737static struct sysfs_ops module_sysfs_ops = {
738 .show = NULL,
739 .store = NULL,
740};
741#endif
742#endif
743 719
744EXPORT_SYMBOL(param_set_byte); 720EXPORT_SYMBOL(param_set_byte);
745EXPORT_SYMBOL(param_get_byte); 721EXPORT_SYMBOL(param_get_byte);
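
The params.c conversion replaces the old kobject_set_name()/kobject_init()/kobject_add() trio and decl_subsys() with kset_create_and_add() plus kobject_init_and_add(), dropping the reference with kobject_put() when the add fails. A hedged fragment showing the same idiom outside params.c (the demo_* names are invented; the API calls match the ones used above):

#include <linux/kobject.h>
#include <linux/slab.h>

static struct kset *demo_kset;

static void demo_release(struct kobject *kobj)
{
	kfree(kobj);			/* free from the ktype release hook */
}

static struct kobj_type demo_ktype = {
	.release = demo_release,
};

static struct kobject *demo_register(const char *name)
{
	struct kobject *kobj;

	if (!demo_kset)
		demo_kset = kset_create_and_add("demo", NULL, NULL);
	if (!demo_kset)
		return NULL;

	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
	if (!kobj)
		return NULL;

	kobj->kset = demo_kset;		/* shows up under /sys/demo/<name> */
	if (kobject_init_and_add(kobj, &demo_ktype, NULL, "%s", name)) {
		kobject_put(kobj);	/* on failure: put, never kfree() directly */
		return NULL;
	}
	return kobj;
}
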
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 68c96376e84a..0b7c82ac467e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk,
967{ 967{
968 int maxfire; 968 int maxfire;
969 struct list_head *timers = tsk->cpu_timers; 969 struct list_head *timers = tsk->cpu_timers;
970 struct signal_struct *const sig = tsk->signal;
970 971
971 maxfire = 20; 972 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 973 tsk->it_prof_expires = cputime_zero;
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk,
1011 t->firing = 1; 1012 t->firing = 1;
1012 list_move_tail(&t->entry, firing); 1013 list_move_tail(&t->entry, firing);
1013 } 1014 }
1015
1016 /*
1017 * Check for the special case thread timers.
1018 */
1019 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
1020 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
1021 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
1022
1023 if (hard != RLIM_INFINITY &&
1024 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
1025 /*
1026 * At the hard limit, we just die.
1027 * No need to calculate anything else now.
1028 */
1029 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1030 return;
1031 }
1032 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
1033 /*
1034 * At the soft limit, send a SIGXCPU every second.
1035 */
1036 if (sig->rlim[RLIMIT_RTTIME].rlim_cur
1037 < sig->rlim[RLIMIT_RTTIME].rlim_max) {
1038 sig->rlim[RLIMIT_RTTIME].rlim_cur +=
1039 USEC_PER_SEC;
1040 }
1041 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1042 }
1043 }
1014} 1044}
1015 1045
1016/* 1046/*
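
The new block enforces RLIMIT_RTTIME for realtime tasks: past the soft limit the task receives SIGXCPU once per second (the soft limit is nudged up by USEC_PER_SEC each time), and past the hard limit it is killed with SIGKILL; tsk->rt.timeout counts how long the task has run without sleeping. A userspace program that opts into the limit, assuming libc headers and a kernel that already know RLIMIT_RTTIME (run with privilege to get SCHED_FIFO):

#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

static void on_sigxcpu(int sig)
{
	static const char msg[] = "SIGXCPU: over the RT soft limit\n";

	(void)sig;
	write(2, msg, sizeof(msg) - 1);
}

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	struct rlimit rl = { .rlim_cur = 500000,	/* 0.5 s soft, in usec */
			     .rlim_max = 2000000 };	/* 2 s hard, in usec  */

	signal(SIGXCPU, on_sigxcpu);

	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");
	if (setrlimit(RLIMIT_RTTIME, &rl))
		perror("setrlimit(RLIMIT_RTTIME)");

	for (;;)	/* spin without sleeping: trips SIGXCPU, then SIGKILL */
		;
	return 0;
}
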
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 05b64790fe83..b138b431e271 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -567,7 +567,8 @@ static const char * const hibernation_modes[] = {
567 * supports it (as determined by having hibernation_ops). 567 * supports it (as determined by having hibernation_ops).
568 */ 568 */
569 569
570static ssize_t disk_show(struct kset *kset, char *buf) 570static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
571 char *buf)
571{ 572{
572 int i; 573 int i;
573 char *start = buf; 574 char *start = buf;
@@ -597,7 +598,8 @@ static ssize_t disk_show(struct kset *kset, char *buf)
597} 598}
598 599
599 600
600static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) 601static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
602 const char *buf, size_t n)
601{ 603{
602 int error = 0; 604 int error = 0;
603 int i; 605 int i;
@@ -642,13 +644,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
642 644
643power_attr(disk); 645power_attr(disk);
644 646
645static ssize_t resume_show(struct kset *kset, char *buf) 647static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
648 char *buf)
646{ 649{
647 return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), 650 return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
648 MINOR(swsusp_resume_device)); 651 MINOR(swsusp_resume_device));
649} 652}
650 653
651static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) 654static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
655 const char *buf, size_t n)
652{ 656{
653 unsigned int maj, min; 657 unsigned int maj, min;
654 dev_t res; 658 dev_t res;
@@ -674,12 +678,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
674 678
675power_attr(resume); 679power_attr(resume);
676 680
677static ssize_t image_size_show(struct kset *kset, char *buf) 681static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
682 char *buf)
678{ 683{
679 return sprintf(buf, "%lu\n", image_size); 684 return sprintf(buf, "%lu\n", image_size);
680} 685}
681 686
682static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) 687static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
688 const char *buf, size_t n)
683{ 689{
684 unsigned long size; 690 unsigned long size;
685 691
@@ -708,7 +714,7 @@ static struct attribute_group attr_group = {
708 714
709static int __init pm_disk_init(void) 715static int __init pm_disk_init(void)
710{ 716{
711 return sysfs_create_group(&power_subsys.kobj, &attr_group); 717 return sysfs_create_group(power_kobj, &attr_group);
712} 718}
713 719
714core_initcall(pm_disk_init); 720core_initcall(pm_disk_init);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f71c9504a5c5..efc08360e627 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -276,8 +276,7 @@ EXPORT_SYMBOL(pm_suspend);
276 276
277#endif /* CONFIG_SUSPEND */ 277#endif /* CONFIG_SUSPEND */
278 278
279decl_subsys(power,NULL,NULL); 279struct kobject *power_kobj;
280
281 280
282/** 281/**
283 * state - control system power state. 282 * state - control system power state.
@@ -290,7 +289,8 @@ decl_subsys(power,NULL,NULL);
290 * proper enumerated value, and initiates a suspend transition. 289 * proper enumerated value, and initiates a suspend transition.
291 */ 290 */
292 291
293static ssize_t state_show(struct kset *kset, char *buf) 292static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
293 char *buf)
294{ 294{
295 char *s = buf; 295 char *s = buf;
296#ifdef CONFIG_SUSPEND 296#ifdef CONFIG_SUSPEND
@@ -311,7 +311,8 @@ static ssize_t state_show(struct kset *kset, char *buf)
311 return (s - buf); 311 return (s - buf);
312} 312}
313 313
314static ssize_t state_store(struct kset *kset, const char *buf, size_t n) 314static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
315 const char *buf, size_t n)
315{ 316{
316#ifdef CONFIG_SUSPEND 317#ifdef CONFIG_SUSPEND
317 suspend_state_t state = PM_SUSPEND_STANDBY; 318 suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -348,13 +349,15 @@ power_attr(state);
348#ifdef CONFIG_PM_TRACE 349#ifdef CONFIG_PM_TRACE
349int pm_trace_enabled; 350int pm_trace_enabled;
350 351
351static ssize_t pm_trace_show(struct kset *kset, char *buf) 352static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
353 char *buf)
352{ 354{
353 return sprintf(buf, "%d\n", pm_trace_enabled); 355 return sprintf(buf, "%d\n", pm_trace_enabled);
354} 356}
355 357
356static ssize_t 358static ssize_t
357pm_trace_store(struct kset *kset, const char *buf, size_t n) 359pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
360 const char *buf, size_t n)
358{ 361{
359 int val; 362 int val;
360 363
@@ -386,10 +389,10 @@ static struct attribute_group attr_group = {
386 389
387static int __init pm_init(void) 390static int __init pm_init(void)
388{ 391{
389 int error = subsystem_register(&power_subsys); 392 power_kobj = kobject_create_and_add("power", NULL);
390 if (!error) 393 if (!power_kobj)
391 error = sysfs_create_group(&power_subsys.kobj,&attr_group); 394 return -ENOMEM;
392 return error; 395 return sysfs_create_group(power_kobj, &attr_group);
393} 396}
394 397
395core_initcall(pm_init); 398core_initcall(pm_init);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 195dc4611764..2093c3a9a994 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -54,7 +54,7 @@ extern int pfn_is_nosave(unsigned long);
54extern struct mutex pm_mutex; 54extern struct mutex pm_mutex;
55 55
56#define power_attr(_name) \ 56#define power_attr(_name) \
57static struct subsys_attribute _name##_attr = { \ 57static struct kobj_attribute _name##_attr = { \
58 .attr = { \ 58 .attr = { \
59 .name = __stringify(_name), \ 59 .name = __stringify(_name), \
60 .mode = 0644, \ 60 .mode = 0644, \
@@ -63,8 +63,6 @@ static struct subsys_attribute _name##_attr = { \
63 .store = _name##_store, \ 63 .store = _name##_store, \
64} 64}
65 65
66extern struct kset power_subsys;
67
68/* Preferred image size in bytes (default 500 MB) */ 66/* Preferred image size in bytes (default 500 MB) */
69extern unsigned long image_size; 67extern unsigned long image_size;
70extern int in_suspend; 68extern int in_suspend;
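
power_attr() now emits a struct kobj_attribute, matching the new show/store prototypes seen in disk.c and main.c above. Roughly what one generated attribute plus its handlers looks like after expansion ("foo" is a hypothetical attribute name, the value returned is arbitrary):

#include <linux/kernel.h>
#include <linux/kobject.h>

static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%d\n", 42);
}

static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t n)
{
	return n;		/* accept and ignore the input */
}

static struct kobj_attribute foo_attr = {
	.attr = { .name = "foo", .mode = 0644 },
	.show = foo_show,
	.store = foo_store,
};
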
diff --git a/kernel/printk.c b/kernel/printk.c
index 89011bf8c106..58bbec684119 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -36,6 +36,13 @@
36 36
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/*
40 * Architectures can override it:
41 */
42void __attribute__((weak)) early_printk(const char *fmt, ...)
43{
44}
45
39#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 46#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
40 47
41/* printk's without a loglevel use this.. */ 48/* printk's without a loglevel use this.. */
@@ -573,11 +580,6 @@ static int __init printk_time_setup(char *str)
573 580
574__setup("time", printk_time_setup); 581__setup("time", printk_time_setup);
575 582
576__attribute__((weak)) unsigned long long printk_clock(void)
577{
578 return sched_clock();
579}
580
581/* Check if we have any console registered that can be called early in boot. */ 583/* Check if we have any console registered that can be called early in boot. */
582static int have_callable_console(void) 584static int have_callable_console(void)
583{ 585{
@@ -628,30 +630,57 @@ asmlinkage int printk(const char *fmt, ...)
628/* cpu currently holding logbuf_lock */ 630/* cpu currently holding logbuf_lock */
629static volatile unsigned int printk_cpu = UINT_MAX; 631static volatile unsigned int printk_cpu = UINT_MAX;
630 632
633const char printk_recursion_bug_msg [] =
634 KERN_CRIT "BUG: recent printk recursion!\n";
635static int printk_recursion_bug;
636
631asmlinkage int vprintk(const char *fmt, va_list args) 637asmlinkage int vprintk(const char *fmt, va_list args)
632{ 638{
639 static int log_level_unknown = 1;
640 static char printk_buf[1024];
641
633 unsigned long flags; 642 unsigned long flags;
634 int printed_len; 643 int printed_len = 0;
644 int this_cpu;
635 char *p; 645 char *p;
636 static char printk_buf[1024];
637 static int log_level_unknown = 1;
638 646
639 boot_delay_msec(); 647 boot_delay_msec();
640 648
641 preempt_disable(); 649 preempt_disable();
642 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
643 /* If a crash is occurring during printk() on this CPU,
644 * make sure we can't deadlock */
645 zap_locks();
646
647 /* This stops the holder of console_sem just where we want him */ 650 /* This stops the holder of console_sem just where we want him */
648 raw_local_irq_save(flags); 651 raw_local_irq_save(flags);
652 this_cpu = smp_processor_id();
653
654 /*
655 * Ouch, printk recursed into itself!
656 */
657 if (unlikely(printk_cpu == this_cpu)) {
658 /*
659 * If a crash is occurring during printk() on this CPU,
660 * then try to get the crash message out but make sure
661 * we can't deadlock. Otherwise just return to avoid the
662 * recursion and return - but flag the recursion so that
663 * it can be printed at the next appropriate moment:
664 */
665 if (!oops_in_progress) {
666 printk_recursion_bug = 1;
667 goto out_restore_irqs;
668 }
669 zap_locks();
670 }
671
649 lockdep_off(); 672 lockdep_off();
650 spin_lock(&logbuf_lock); 673 spin_lock(&logbuf_lock);
651 printk_cpu = smp_processor_id(); 674 printk_cpu = this_cpu;
652 675
676 if (printk_recursion_bug) {
677 printk_recursion_bug = 0;
678 strcpy(printk_buf, printk_recursion_bug_msg);
679 printed_len = sizeof(printk_recursion_bug_msg);
680 }
653 /* Emit the output into the temporary buffer */ 681 /* Emit the output into the temporary buffer */
654 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 682 printed_len += vscnprintf(printk_buf + printed_len,
683 sizeof(printk_buf), fmt, args);
655 684
656 /* 685 /*
657 * Copy the output into log_buf. If the caller didn't provide 686 * Copy the output into log_buf. If the caller didn't provide
@@ -680,7 +709,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
680 loglev_char = default_message_loglevel 709 loglev_char = default_message_loglevel
681 + '0'; 710 + '0';
682 } 711 }
683 t = printk_clock(); 712 t = cpu_clock(printk_cpu);
684 nanosec_rem = do_div(t, 1000000000); 713 nanosec_rem = do_div(t, 1000000000);
685 tlen = sprintf(tbuf, 714 tlen = sprintf(tbuf,
686 "<%c>[%5lu.%06lu] ", 715 "<%c>[%5lu.%06lu] ",
@@ -744,6 +773,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
744 printk_cpu = UINT_MAX; 773 printk_cpu = UINT_MAX;
745 spin_unlock(&logbuf_lock); 774 spin_unlock(&logbuf_lock);
746 lockdep_on(); 775 lockdep_on();
776out_restore_irqs:
747 raw_local_irq_restore(flags); 777 raw_local_irq_restore(flags);
748 } 778 }
749 779
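
Besides the recursion guard, printk.c now carries an empty __attribute__((weak)) early_printk() stub that an architecture can override with a real implementation. The weak-symbol mechanism itself is easy to try in userspace; a hypothetical two-file demo, unrelated to the kernel build:

/* stub.c - weak default, like the kernel's empty early_printk() */
#include <stdarg.h>
#include <stdio.h>

void __attribute__((weak)) early_printk(const char *fmt, ...)
{
	/* intentionally empty */
}

int main(void)
{
	early_printk("hello, %s\n", "early console");
	return 0;
}

/*
 * strong.c - the "architecture" override; when linked in, it replaces the
 * weak stub:
 *
 *	#include <stdarg.h>
 *	#include <stdio.h>
 *
 *	void early_printk(const char *fmt, ...)
 *	{
 *		va_list ap;
 *		va_start(ap, fmt);
 *		vfprintf(stderr, fmt, ap);
 *		va_end(ap);
 *	}
 *
 * cc stub.c            -> prints nothing
 * cc stub.c strong.c   -> prints the message
 */
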
diff --git a/kernel/profile.c b/kernel/profile.c
index 5e95330e5120..e64c2da11c0f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
52static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
53#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
54 54
55static int __init profile_setup(char * str) 55static int __init profile_setup(char *str)
56{ 56{
57 static char __initdata schedstr[] = "schedule"; 57 static char __initdata schedstr[] = "schedule";
58 static char __initdata sleepstr[] = "sleep"; 58 static char __initdata sleepstr[] = "sleep";
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup);
104 104
105void __init profile_init(void) 105void __init profile_init(void)
106{ 106{
107 if (!prof_on) 107 if (!prof_on)
108 return; 108 return;
109 109
110 /* only text is profiled */ 110 /* only text is profiled */
111 prof_len = (_etext - _stext) >> prof_shift; 111 prof_len = (_etext - _stext) >> prof_shift;
112 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 112 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
113} 113}
114 114
115/* Profile event notifications */ 115/* Profile event notifications */
116 116
117#ifdef CONFIG_PROFILING 117#ifdef CONFIG_PROFILING
118 118
119static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); 119static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
120static ATOMIC_NOTIFIER_HEAD(task_free_notifier); 120static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
121static BLOCKING_NOTIFIER_HEAD(munmap_notifier); 121static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
122 122
123void profile_task_exit(struct task_struct * task) 123void profile_task_exit(struct task_struct *task)
124{ 124{
125 blocking_notifier_call_chain(&task_exit_notifier, 0, task); 125 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
126} 126}
127 127
128int profile_handoff_task(struct task_struct * task) 128int profile_handoff_task(struct task_struct *task)
129{ 129{
130 int ret; 130 int ret;
131 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); 131 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr)
137 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); 137 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
138} 138}
139 139
140int task_handoff_register(struct notifier_block * n) 140int task_handoff_register(struct notifier_block *n)
141{ 141{
142 return atomic_notifier_chain_register(&task_free_notifier, n); 142 return atomic_notifier_chain_register(&task_free_notifier, n);
143} 143}
144EXPORT_SYMBOL_GPL(task_handoff_register);
144 145
145int task_handoff_unregister(struct notifier_block * n) 146int task_handoff_unregister(struct notifier_block *n)
146{ 147{
147 return atomic_notifier_chain_unregister(&task_free_notifier, n); 148 return atomic_notifier_chain_unregister(&task_free_notifier, n);
148} 149}
150EXPORT_SYMBOL_GPL(task_handoff_unregister);
149 151
150int profile_event_register(enum profile_type type, struct notifier_block * n) 152int profile_event_register(enum profile_type type, struct notifier_block *n)
151{ 153{
152 int err = -EINVAL; 154 int err = -EINVAL;
153 155
154 switch (type) { 156 switch (type) {
155 case PROFILE_TASK_EXIT: 157 case PROFILE_TASK_EXIT:
156 err = blocking_notifier_chain_register( 158 err = blocking_notifier_chain_register(
157 &task_exit_notifier, n); 159 &task_exit_notifier, n);
158 break; 160 break;
159 case PROFILE_MUNMAP: 161 case PROFILE_MUNMAP:
160 err = blocking_notifier_chain_register( 162 err = blocking_notifier_chain_register(
161 &munmap_notifier, n); 163 &munmap_notifier, n);
162 break; 164 break;
163 } 165 }
164 166
165 return err; 167 return err;
166} 168}
169EXPORT_SYMBOL_GPL(profile_event_register);
167 170
168 171int profile_event_unregister(enum profile_type type, struct notifier_block *n)
169int profile_event_unregister(enum profile_type type, struct notifier_block * n)
170{ 172{
171 int err = -EINVAL; 173 int err = -EINVAL;
172 174
173 switch (type) { 175 switch (type) {
174 case PROFILE_TASK_EXIT: 176 case PROFILE_TASK_EXIT:
175 err = blocking_notifier_chain_unregister( 177 err = blocking_notifier_chain_unregister(
176 &task_exit_notifier, n); 178 &task_exit_notifier, n);
177 break; 179 break;
178 case PROFILE_MUNMAP: 180 case PROFILE_MUNMAP:
179 err = blocking_notifier_chain_unregister( 181 err = blocking_notifier_chain_unregister(
180 &munmap_notifier, n); 182 &munmap_notifier, n);
181 break; 183 break;
182 } 184 }
183 185
184 return err; 186 return err;
185} 187}
188EXPORT_SYMBOL_GPL(profile_event_unregister);
186 189
187int register_timer_hook(int (*hook)(struct pt_regs *)) 190int register_timer_hook(int (*hook)(struct pt_regs *))
188{ 191{
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *))
191 timer_hook = hook; 194 timer_hook = hook;
192 return 0; 195 return 0;
193} 196}
197EXPORT_SYMBOL_GPL(register_timer_hook);
194 198
195void unregister_timer_hook(int (*hook)(struct pt_regs *)) 199void unregister_timer_hook(int (*hook)(struct pt_regs *))
196{ 200{
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
199 /* make sure all CPUs see the NULL hook */ 203 /* make sure all CPUs see the NULL hook */
200 synchronize_sched(); /* Allow ongoing interrupts to complete. */ 204 synchronize_sched(); /* Allow ongoing interrupts to complete. */
201} 205}
202
203EXPORT_SYMBOL_GPL(register_timer_hook);
204EXPORT_SYMBOL_GPL(unregister_timer_hook); 206EXPORT_SYMBOL_GPL(unregister_timer_hook);
205EXPORT_SYMBOL_GPL(task_handoff_register);
206EXPORT_SYMBOL_GPL(task_handoff_unregister);
207EXPORT_SYMBOL_GPL(profile_event_register);
208EXPORT_SYMBOL_GPL(profile_event_unregister);
209 207
210#endif /* CONFIG_PROFILING */ 208#endif /* CONFIG_PROFILING */
211 209
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
366 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); 364 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
367 } 365 }
368 break; 366 break;
369 out_free: 367out_free:
370 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 368 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
371 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 369 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
372 __free_page(page); 370 __free_page(page);
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits)
409 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 407 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
410} 408}
411#endif /* !CONFIG_SMP */ 409#endif /* !CONFIG_SMP */
412
413EXPORT_SYMBOL_GPL(profile_hits); 410EXPORT_SYMBOL_GPL(profile_hits);
414 411
415void profile_tick(int type) 412void profile_tick(int type)
@@ -427,7 +424,7 @@ void profile_tick(int type)
427#include <asm/uaccess.h> 424#include <asm/uaccess.h>
428#include <asm/ptrace.h> 425#include <asm/ptrace.h>
429 426
430static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, 427static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
431 int count, int *eof, void *data) 428 int count, int *eof, void *data)
432{ 429{
433 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); 430 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
437 return len; 434 return len;
438} 435}
439 436
440static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, 437static int prof_cpu_mask_write_proc(struct file *file,
441 unsigned long count, void *data) 438 const char __user *buffer, unsigned long count, void *data)
442{ 439{
443 cpumask_t *mask = (cpumask_t *)data; 440 cpumask_t *mask = (cpumask_t *)data;
444 unsigned long full_count = count, err; 441 unsigned long full_count = count, err;
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
457 struct proc_dir_entry *entry; 454 struct proc_dir_entry *entry;
458 455
459 /* create /proc/irq/prof_cpu_mask */ 456 /* create /proc/irq/prof_cpu_mask */
460 if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) 457 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
458 if (!entry)
461 return; 459 return;
462 entry->data = (void *)&prof_cpu_mask; 460 entry->data = (void *)&prof_cpu_mask;
463 entry->read_proc = prof_cpu_mask_read_proc; 461 entry->read_proc = prof_cpu_mask_read_proc;
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
475{ 473{
476 unsigned long p = *ppos; 474 unsigned long p = *ppos;
477 ssize_t read; 475 ssize_t read;
478 char * pnt; 476 char *pnt;
479 unsigned int sample_step = 1 << prof_shift; 477 unsigned int sample_step = 1 << prof_shift;
480 478
481 profile_flip_buffers(); 479 profile_flip_buffers();
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
486 read = 0; 484 read = 0;
487 485
488 while (p < sizeof(unsigned int) && count > 0) { 486 while (p < sizeof(unsigned int) && count > 0) {
489 if (put_user(*((char *)(&sample_step)+p),buf)) 487 if (put_user(*((char *)(&sample_step)+p), buf))
490 return -EFAULT; 488 return -EFAULT;
491 buf++; p++; count--; read++; 489 buf++; p++; count--; read++;
492 } 490 }
493 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 491 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
494 if (copy_to_user(buf,(void *)pnt,count)) 492 if (copy_to_user(buf, (void *)pnt, count))
495 return -EFAULT; 493 return -EFAULT;
496 read += count; 494 read += count;
497 *ppos += read; 495 *ppos += read;
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
508 size_t count, loff_t *ppos) 506 size_t count, loff_t *ppos)
509{ 507{
510#ifdef CONFIG_SMP 508#ifdef CONFIG_SMP
511 extern int setup_profiling_timer (unsigned int multiplier); 509 extern int setup_profiling_timer(unsigned int multiplier);
512 510
513 if (count == sizeof(int)) { 511 if (count == sizeof(int)) {
514 unsigned int multiplier; 512 unsigned int multiplier;
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void)
591 return 0; 589 return 0;
592 if (create_hash_tables()) 590 if (create_hash_tables())
593 return -1; 591 return -1;
594 if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) 592 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
593 if (!entry)
595 return 0; 594 return 0;
596 entry->proc_fops = &proc_profile_operations; 595 entry->proc_fops = &proc_profile_operations;
597 entry->size = (1+prof_len) * sizeof(atomic_t); 596 entry->size = (1+prof_len) * sizeof(atomic_t);
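
For context, the prof_cpu_mask handlers above use the legacy read_proc/write_proc procfs interface of this era; a minimal sketch of the same registration pattern, with hypothetical names (example_val, example_read_proc, example_write_proc), might look like this:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h>

static int example_val;

/* read_proc: render the value into the page procfs hands us. */
static int example_read_proc(char *page, char **start, off_t off,
                             int count, int *eof, void *data)
{
        int len = sprintf(page, "%d\n", *(int *)data);

        *eof = 1;
        return len;
}

/* write_proc: parse a decimal value written from user space. */
static int example_write_proc(struct file *file, const char __user *buffer,
                              unsigned long count, void *data)
{
        char kbuf[16];

        if (count >= sizeof(kbuf))
                return -EINVAL;
        if (copy_from_user(kbuf, buffer, count))
                return -EFAULT;
        kbuf[count] = '\0';
        *(int *)data = simple_strtol(kbuf, NULL, 0);
        return count;
}

static int __init example_proc_init(void)
{
        struct proc_dir_entry *entry;

        /* Same create-then-check pattern as the cleanup above. */
        entry = create_proc_entry("example_val", 0600, NULL);
        if (!entry)
                return -ENOMEM;
        entry->data = &example_val;
        entry->read_proc = example_read_proc;
        entry->write_proc = example_write_proc;
        return 0;
}
module_init(example_proc_init);
MODULE_LICENSE("GPL");
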
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c25db863081d..e6e9b8be4b05 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
366 return error; 366 return error;
367} 367}
368 368
369
370#ifdef PTRACE_SINGLESTEP
371#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
372#else
373#define is_singlestep(request) 0
374#endif
375
376#ifdef PTRACE_SINGLEBLOCK
377#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
378#else
379#define is_singleblock(request) 0
380#endif
381
382#ifdef PTRACE_SYSEMU
383#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP)
384#else
385#define is_sysemu_singlestep(request) 0
386#endif
387
388static int ptrace_resume(struct task_struct *child, long request, long data)
389{
390 if (!valid_signal(data))
391 return -EIO;
392
393 if (request == PTRACE_SYSCALL)
394 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
395 else
396 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
397
398#ifdef TIF_SYSCALL_EMU
399 if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
400 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
401 else
402 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
403#endif
404
405 if (is_singleblock(request)) {
406 if (unlikely(!arch_has_block_step()))
407 return -EIO;
408 user_enable_block_step(child);
409 } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
410 if (unlikely(!arch_has_single_step()))
411 return -EIO;
412 user_enable_single_step(child);
413 }
414 else
415 user_disable_single_step(child);
416
417 child->exit_code = data;
418 wake_up_process(child);
419
420 return 0;
421}
422
369int ptrace_request(struct task_struct *child, long request, 423int ptrace_request(struct task_struct *child, long request,
370 long addr, long data) 424 long addr, long data)
371{ 425{
372 int ret = -EIO; 426 int ret = -EIO;
373 427
374 switch (request) { 428 switch (request) {
429 case PTRACE_PEEKTEXT:
430 case PTRACE_PEEKDATA:
431 return generic_ptrace_peekdata(child, addr, data);
432 case PTRACE_POKETEXT:
433 case PTRACE_POKEDATA:
434 return generic_ptrace_pokedata(child, addr, data);
435
375#ifdef PTRACE_OLDSETOPTIONS 436#ifdef PTRACE_OLDSETOPTIONS
376 case PTRACE_OLDSETOPTIONS: 437 case PTRACE_OLDSETOPTIONS:
377#endif 438#endif
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request,
390 case PTRACE_DETACH: /* detach a process that was attached. */ 451 case PTRACE_DETACH: /* detach a process that was attached. */
391 ret = ptrace_detach(child, data); 452 ret = ptrace_detach(child, data);
392 break; 453 break;
454
455#ifdef PTRACE_SINGLESTEP
456 case PTRACE_SINGLESTEP:
457#endif
458#ifdef PTRACE_SINGLEBLOCK
459 case PTRACE_SINGLEBLOCK:
460#endif
461#ifdef PTRACE_SYSEMU
462 case PTRACE_SYSEMU:
463 case PTRACE_SYSEMU_SINGLESTEP:
464#endif
465 case PTRACE_SYSCALL:
466 case PTRACE_CONT:
467 return ptrace_resume(child, request, data);
468
469 case PTRACE_KILL:
470 if (child->exit_state) /* already dead */
471 return 0;
472 return ptrace_resume(child, request, SIGKILL);
473
393 default: 474 default:
394 break; 475 break;
395 } 476 }
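
The resume requests routed through the new ptrace_resume() above (PTRACE_CONT, PTRACE_SYSCALL, the single-step family, and PTRACE_KILL) are what a user-space tracer issues; a minimal sketch of such a tracer, with error handling trimmed, might be:

#include <stdio.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
        pid_t child = fork();
        int status, stops = 0;

        if (child == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);  /* handled early in sys_ptrace() */
                execlp("true", "true", (char *)NULL);
                _exit(1);
        }

        /* Each stop is resumed with PTRACE_SYSCALL, which in the kernel
         * sets TIF_SYSCALL_TRACE and ends up in ptrace_resume(). */
        while (waitpid(child, &status, 0) == child && WIFSTOPPED(status)) {
                stops++;
                if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0)
                        break;
        }
        printf("child stopped %d times\n", stops);
        return 0;
}
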
@@ -470,6 +551,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
470 lock_kernel(); 551 lock_kernel();
471 if (request == PTRACE_TRACEME) { 552 if (request == PTRACE_TRACEME) {
472 ret = ptrace_traceme(); 553 ret = ptrace_traceme();
554 if (!ret)
555 arch_ptrace_attach(current);
473 goto out; 556 goto out;
474 } 557 }
475 558
@@ -524,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
524 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); 607 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
525 return (copied == sizeof(data)) ? 0 : -EIO; 608 return (copied == sizeof(data)) ? 0 : -EIO;
526} 609}
610
611#ifdef CONFIG_COMPAT
612#include <linux/compat.h>
613
614int compat_ptrace_request(struct task_struct *child, compat_long_t request,
615 compat_ulong_t addr, compat_ulong_t data)
616{
617 compat_ulong_t __user *datap = compat_ptr(data);
618 compat_ulong_t word;
619 int ret;
620
621 switch (request) {
622 case PTRACE_PEEKTEXT:
623 case PTRACE_PEEKDATA:
624 ret = access_process_vm(child, addr, &word, sizeof(word), 0);
625 if (ret != sizeof(word))
626 ret = -EIO;
627 else
628 ret = put_user(word, datap);
629 break;
630
631 case PTRACE_POKETEXT:
632 case PTRACE_POKEDATA:
633 ret = access_process_vm(child, addr, &data, sizeof(data), 1);
634 ret = (ret != sizeof(data) ? -EIO : 0);
635 break;
636
637 case PTRACE_GETEVENTMSG:
638 ret = put_user((compat_ulong_t) child->ptrace_message, datap);
639 break;
640
641 default:
642 ret = ptrace_request(child, request, addr, data);
643 }
644
645 return ret;
646}
647
648#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
649asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
650 compat_long_t addr, compat_long_t data)
651{
652 struct task_struct *child;
653 long ret;
654
655 /*
656 * This lock_kernel fixes a subtle race with suid exec
657 */
658 lock_kernel();
659 if (request == PTRACE_TRACEME) {
660 ret = ptrace_traceme();
661 goto out;
662 }
663
664 child = ptrace_get_task_struct(pid);
665 if (IS_ERR(child)) {
666 ret = PTR_ERR(child);
667 goto out;
668 }
669
670 if (request == PTRACE_ATTACH) {
671 ret = ptrace_attach(child);
672 /*
673 * Some architectures need to do book-keeping after
674 * a ptrace attach.
675 */
676 if (!ret)
677 arch_ptrace_attach(child);
678 goto out_put_task_struct;
679 }
680
681 ret = ptrace_check_attach(child, request == PTRACE_KILL);
682 if (!ret)
683 ret = compat_arch_ptrace(child, request, addr, data);
684
685 out_put_task_struct:
686 put_task_struct(child);
687 out:
688 unlock_kernel();
689 return ret;
690}
691#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
692
693#endif /* CONFIG_COMPAT */
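
compat_sys_ptrace() above delegates to the per-architecture compat_arch_ptrace(); a typical implementation is assumed to decode the 32-bit-specific requests itself and fall back to the new compat_ptrace_request() helper for everything else, roughly as in this hypothetical sketch:

#include <linux/ptrace.h>
#include <linux/compat.h>
#include <linux/sched.h>

/* Hypothetical arch-side hook; real versions live under arch/ for each
 * architecture that defines __ARCH_WANT_COMPAT_SYS_PTRACE. */
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
                        compat_ulong_t addr, compat_ulong_t data)
{
        switch (request) {
        /* ... requests needing the 32-bit register layout go here ... */
        default:
                return compat_ptrace_request(child, request, addr, data);
        }
}
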
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
new file mode 100644
index 000000000000..f4ffbd0f306f
--- /dev/null
+++ b/kernel/rcuclassic.c
@@ -0,0 +1,575 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50
51#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key;
53struct lockdep_map rcu_lock_map =
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55EXPORT_SYMBOL_GPL(rcu_lock_map);
56#endif
57
58
59/* Definition for rcupdate control block. */
60static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300,
62 .completed = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE,
65};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300,
68 .completed = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE,
71};
72
73DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
74DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
75
76static int blimit = 10;
77static int qhimark = 10000;
78static int qlowmark = 100;
79
80#ifdef CONFIG_SMP
81static void force_quiescent_state(struct rcu_data *rdp,
82 struct rcu_ctrlblk *rcp)
83{
84 int cpu;
85 cpumask_t cpumask;
86 set_need_resched();
87 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1;
89 /*
90 * Don't send IPI to itself. With irqs disabled,
91 * rdp->cpu is the current cpu.
92 */
93 cpumask = rcp->cpumask;
94 cpu_clear(rdp->cpu, cpumask);
95 for_each_cpu_mask(cpu, cpumask)
96 smp_send_reschedule(cpu);
97 }
98}
99#else
100static inline void force_quiescent_state(struct rcu_data *rdp,
101 struct rcu_ctrlblk *rcp)
102{
103 set_need_resched();
104}
105#endif
106
107/**
108 * call_rcu - Queue an RCU callback for invocation after a grace period.
109 * @head: structure to be used for queueing the RCU updates.
110 * @func: actual update function to be invoked after the grace period
111 *
112 * The update function will be invoked some time after a full grace
113 * period elapses, in other words after all currently executing RCU
114 * read-side critical sections have completed. RCU read-side critical
115 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
116 * and may be nested.
117 */
118void call_rcu(struct rcu_head *head,
119 void (*func)(struct rcu_head *rcu))
120{
121 unsigned long flags;
122 struct rcu_data *rdp;
123
124 head->func = func;
125 head->next = NULL;
126 local_irq_save(flags);
127 rdp = &__get_cpu_var(rcu_data);
128 *rdp->nxttail = head;
129 rdp->nxttail = &head->next;
130 if (unlikely(++rdp->qlen > qhimark)) {
131 rdp->blimit = INT_MAX;
132 force_quiescent_state(rdp, &rcu_ctrlblk);
133 }
134 local_irq_restore(flags);
135}
136EXPORT_SYMBOL_GPL(call_rcu);
137
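
To make the kerneldoc above concrete: the usual deferred-free pattern built on call_rcu() looks roughly like the following sketch (struct and function names are hypothetical):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_node {
        struct list_head list;
        int key;
        struct rcu_head rcu;
};

static void my_node_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct my_node, rcu));
}

/* Caller holds the update-side lock protecting the list. */
static void my_node_del(struct my_node *node)
{
        list_del_rcu(&node->list);                /* readers may still see it */
        call_rcu(&node->rcu, my_node_free_rcu);   /* freed after a grace period */
}
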
138/**
139 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
140 * @head: structure to be used for queueing the RCU updates.
141 * @func: actual update function to be invoked after the grace period
142 *
143 * The update function will be invoked some time after a full grace
144 * period elapses, in other words after all currently executing RCU
145 * read-side critical sections have completed. call_rcu_bh() assumes
146 * that the read-side critical sections end on completion of a softirq
147 * handler. This means that read-side critical sections in process
148 * context must not be interrupted by softirqs. This interface is to be
149 * used when most of the read-side critical sections are in softirq context.
150 * RCU read-side critical sections are delimited by rcu_read_lock() and
 151 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
152 * and rcu_read_unlock_bh(), if in process context. These may be nested.
153 */
154void call_rcu_bh(struct rcu_head *head,
155 void (*func)(struct rcu_head *rcu))
156{
157 unsigned long flags;
158 struct rcu_data *rdp;
159
160 head->func = func;
161 head->next = NULL;
162 local_irq_save(flags);
163 rdp = &__get_cpu_var(rcu_bh_data);
164 *rdp->nxttail = head;
165 rdp->nxttail = &head->next;
166
167 if (unlikely(++rdp->qlen > qhimark)) {
168 rdp->blimit = INT_MAX;
169 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
170 }
171
172 local_irq_restore(flags);
173}
174EXPORT_SYMBOL_GPL(call_rcu_bh);
175
176/*
177 * Return the number of RCU batches processed thus far. Useful
178 * for debug and statistics.
179 */
180long rcu_batches_completed(void)
181{
182 return rcu_ctrlblk.completed;
183}
184EXPORT_SYMBOL_GPL(rcu_batches_completed);
185
186/*
187 * Return the number of RCU batches processed thus far. Useful
188 * for debug and statistics.
189 */
190long rcu_batches_completed_bh(void)
191{
192 return rcu_bh_ctrlblk.completed;
193}
194EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
195
196/* Raises the softirq for processing rcu_callbacks. */
197static inline void raise_rcu_softirq(void)
198{
199 raise_softirq(RCU_SOFTIRQ);
200 /*
201 * The smp_mb() here is required to ensure that this cpu's
202 * __rcu_process_callbacks() reads the most recently updated
 203 * value of rcp->cur.
204 */
205 smp_mb();
206}
207
208/*
209 * Invoke the completed RCU callbacks. They are expected to be in
210 * a per-cpu list.
211 */
212static void rcu_do_batch(struct rcu_data *rdp)
213{
214 struct rcu_head *next, *list;
215 int count = 0;
216
217 list = rdp->donelist;
218 while (list) {
219 next = list->next;
220 prefetch(next);
221 list->func(list);
222 list = next;
223 if (++count >= rdp->blimit)
224 break;
225 }
226 rdp->donelist = list;
227
228 local_irq_disable();
229 rdp->qlen -= count;
230 local_irq_enable();
231 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
232 rdp->blimit = blimit;
233
234 if (!rdp->donelist)
235 rdp->donetail = &rdp->donelist;
236 else
237 raise_rcu_softirq();
238}
239
240/*
241 * Grace period handling:
 242 * The grace period handling consists of two steps:
 243 * - A new grace period is started.
 244 * This is done by rcu_start_batch. The start is not broadcast to
 245 * all CPUs; they must pick this up by comparing rcp->cur with
 246 * rdp->quiescbatch. All CPUs are recorded in the
 247 * rcu_ctrlblk.cpumask bitmap.
 248 * - All CPUs must go through a quiescent state.
 249 * Since the start of the grace period is not broadcast, at least two
 250 * calls to rcu_check_quiescent_state are required:
 251 * The first call just notices that a new grace period is running. The
 252 * following calls check if there was a quiescent state since the beginning
 253 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
 254 * the bitmap is empty, then the grace period is completed.
 255 * rcu_check_quiescent_state calls rcu_start_batch() to start the next grace
256 * period (if necessary).
257 */
258/*
259 * Register a new batch of callbacks, and start it up if there is currently no
260 * active batch and the batch to be registered has not already occurred.
261 * Caller must hold rcu_ctrlblk.lock.
262 */
263static void rcu_start_batch(struct rcu_ctrlblk *rcp)
264{
265 if (rcp->next_pending &&
266 rcp->completed == rcp->cur) {
267 rcp->next_pending = 0;
268 /*
269 * next_pending == 0 must be visible in
270 * __rcu_process_callbacks() before it can see new value of cur.
271 */
272 smp_wmb();
273 rcp->cur++;
274
275 /*
276 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
 277 * barrier. Otherwise it can cause tickless idle CPUs to be
 278 * included in rcp->cpumask, which will extend grace periods
279 * unnecessarily.
280 */
281 smp_mb();
282 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
283
284 rcp->signaled = 0;
285 }
286}
287
288/*
289 * cpu went through a quiescent state since the beginning of the grace period.
290 * Clear it from the cpu mask and complete the grace period if it was the last
291 * cpu. Start another grace period if someone has further entries pending
292 */
293static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
294{
295 cpu_clear(cpu, rcp->cpumask);
296 if (cpus_empty(rcp->cpumask)) {
297 /* batch completed ! */
298 rcp->completed = rcp->cur;
299 rcu_start_batch(rcp);
300 }
301}
302
303/*
304 * Check if the cpu has gone through a quiescent state (say context
 305 * switch). If so, and if it hasn't already done so in this RCU
306 * quiescent cycle, then indicate that it has done so.
307 */
308static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
309 struct rcu_data *rdp)
310{
311 if (rdp->quiescbatch != rcp->cur) {
312 /* start new grace period: */
313 rdp->qs_pending = 1;
314 rdp->passed_quiesc = 0;
315 rdp->quiescbatch = rcp->cur;
316 return;
317 }
318
319 /* Grace period already completed for this cpu?
320 * qs_pending is checked instead of the actual bitmap to avoid
 321 * cacheline thrashing.
322 */
323 if (!rdp->qs_pending)
324 return;
325
326 /*
327 * Was there a quiescent state since the beginning of the grace
328 * period? If no, then exit and wait for the next call.
329 */
330 if (!rdp->passed_quiesc)
331 return;
332 rdp->qs_pending = 0;
333
334 spin_lock(&rcp->lock);
335 /*
336 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
337 * during cpu startup. Ignore the quiescent state.
338 */
339 if (likely(rdp->quiescbatch == rcp->cur))
340 cpu_quiet(rdp->cpu, rcp);
341
342 spin_unlock(&rcp->lock);
343}
344
345
346#ifdef CONFIG_HOTPLUG_CPU
347
348/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
349 * locking requirements, the list it's pulling from has to belong to a cpu
350 * which is dead and hence not processing interrupts.
351 */
352static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
353 struct rcu_head **tail)
354{
355 local_irq_disable();
356 *this_rdp->nxttail = list;
357 if (list)
358 this_rdp->nxttail = tail;
359 local_irq_enable();
360}
361
362static void __rcu_offline_cpu(struct rcu_data *this_rdp,
363 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
364{
365 /* if the cpu going offline owns the grace period
366 * we can block indefinitely waiting for it, so flush
367 * it here
368 */
369 spin_lock_bh(&rcp->lock);
370 if (rcp->cur != rcp->completed)
371 cpu_quiet(rdp->cpu, rcp);
372 spin_unlock_bh(&rcp->lock);
373 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
374 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
375 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
376}
377
378static void rcu_offline_cpu(int cpu)
379{
380 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
381 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
382
383 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
384 &per_cpu(rcu_data, cpu));
385 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
386 &per_cpu(rcu_bh_data, cpu));
387 put_cpu_var(rcu_data);
388 put_cpu_var(rcu_bh_data);
389}
390
391#else
392
393static void rcu_offline_cpu(int cpu)
394{
395}
396
397#endif
398
399/*
400 * This does the RCU processing work from softirq context.
401 */
402static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
403 struct rcu_data *rdp)
404{
405 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
406 *rdp->donetail = rdp->curlist;
407 rdp->donetail = rdp->curtail;
408 rdp->curlist = NULL;
409 rdp->curtail = &rdp->curlist;
410 }
411
412 if (rdp->nxtlist && !rdp->curlist) {
413 local_irq_disable();
414 rdp->curlist = rdp->nxtlist;
415 rdp->curtail = rdp->nxttail;
416 rdp->nxtlist = NULL;
417 rdp->nxttail = &rdp->nxtlist;
418 local_irq_enable();
419
420 /*
421 * start the next batch of callbacks
422 */
423
424 /* determine batch number */
425 rdp->batch = rcp->cur + 1;
426 /* see the comment and corresponding wmb() in
427 * the rcu_start_batch()
428 */
429 smp_rmb();
430
431 if (!rcp->next_pending) {
432 /* and start it/schedule start if it's a new batch */
433 spin_lock(&rcp->lock);
434 rcp->next_pending = 1;
435 rcu_start_batch(rcp);
436 spin_unlock(&rcp->lock);
437 }
438 }
439
440 rcu_check_quiescent_state(rcp, rdp);
441 if (rdp->donelist)
442 rcu_do_batch(rdp);
443}
444
445static void rcu_process_callbacks(struct softirq_action *unused)
446{
447 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
448 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
449}
450
451static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
452{
453 /* This cpu has pending rcu entries and the grace period
454 * for them has completed.
455 */
456 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
457 return 1;
458
459 /* This cpu has no pending entries, but there are new entries */
460 if (!rdp->curlist && rdp->nxtlist)
461 return 1;
462
463 /* This cpu has finished callbacks to invoke */
464 if (rdp->donelist)
465 return 1;
466
467 /* The rcu core waits for a quiescent state from the cpu */
468 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
469 return 1;
470
471 /* nothing to do */
472 return 0;
473}
474
475/*
476 * Check to see if there is any immediate RCU-related work to be done
477 * by the current CPU, returning 1 if so. This function is part of the
478 * RCU implementation; it is -not- an exported member of the RCU API.
479 */
480int rcu_pending(int cpu)
481{
482 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
483 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
484}
485
486/*
487 * Check to see if any future RCU-related work will need to be done
488 * by the current CPU, even if none need be done immediately, returning
489 * 1 if so. This function is part of the RCU implementation; it is -not-
490 * an exported member of the RCU API.
491 */
492int rcu_needs_cpu(int cpu)
493{
494 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
495 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
496
497 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
498}
499
500void rcu_check_callbacks(int cpu, int user)
501{
502 if (user ||
503 (idle_cpu(cpu) && !in_softirq() &&
504 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
505 rcu_qsctr_inc(cpu);
506 rcu_bh_qsctr_inc(cpu);
507 } else if (!in_softirq())
508 rcu_bh_qsctr_inc(cpu);
509 raise_rcu_softirq();
510}
511
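
rcu_check_callbacks() above is driven from the per-CPU scheduling-clock tick; paraphrasing (not part of this patch, and with details trimmed) the caller in kernel/timer.c of this era:

void update_process_times(int user_tick)
{
        int cpu = smp_processor_id();

        /* ... per-task accounting and local timer expiry run first ... */
        if (rcu_pending(cpu))
                rcu_check_callbacks(cpu, user_tick);
        /* ... followed by scheduler_tick() and POSIX CPU timers ... */
}
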
512static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
513 struct rcu_data *rdp)
514{
515 memset(rdp, 0, sizeof(*rdp));
516 rdp->curtail = &rdp->curlist;
517 rdp->nxttail = &rdp->nxtlist;
518 rdp->donetail = &rdp->donelist;
519 rdp->quiescbatch = rcp->completed;
520 rdp->qs_pending = 0;
521 rdp->cpu = cpu;
522 rdp->blimit = blimit;
523}
524
525static void __cpuinit rcu_online_cpu(int cpu)
526{
527 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
528 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
529
530 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
531 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
532 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
533}
534
535static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
536 unsigned long action, void *hcpu)
537{
538 long cpu = (long)hcpu;
539
540 switch (action) {
541 case CPU_UP_PREPARE:
542 case CPU_UP_PREPARE_FROZEN:
543 rcu_online_cpu(cpu);
544 break;
545 case CPU_DEAD:
546 case CPU_DEAD_FROZEN:
547 rcu_offline_cpu(cpu);
548 break;
549 default:
550 break;
551 }
552 return NOTIFY_OK;
553}
554
555static struct notifier_block __cpuinitdata rcu_nb = {
556 .notifier_call = rcu_cpu_notify,
557};
558
559/*
 560 * Initializes the RCU mechanism. Assumed to be called early,
 561 * that is, before the local timer (SMP) or jiffies timer (UP) is set up.
562 * Note that rcu_qsctr and friends are implicitly
563 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
564 */
565void __init __rcu_init(void)
566{
567 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
568 (void *)(long)smp_processor_id());
569 /* Register notifier for non-boot CPUs */
570 register_cpu_notifier(&rcu_nb);
571}
572
573module_param(blimit, int, 0);
574module_param(qhimark, int, 0);
575module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f2c1a04e9b18..760dfc233a00 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -15,7 +15,7 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2001 18 * Copyright IBM Corporation, 2001
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
@@ -35,165 +35,57 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/spinlock.h> 36#include <linux/spinlock.h>
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h> 38#include <linux/interrupt.h>
40#include <linux/sched.h> 39#include <linux/sched.h>
41#include <asm/atomic.h> 40#include <asm/atomic.h>
42#include <linux/bitops.h> 41#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h> 42#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h> 43#include <linux/percpu.h>
47#include <linux/notifier.h> 44#include <linux/notifier.h>
48#include <linux/cpu.h> 45#include <linux/cpu.h>
49#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/module.h>
50 48
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 49struct rcu_synchronize {
52static struct lock_class_key rcu_lock_key; 50 struct rcu_head head;
53struct lockdep_map rcu_lock_map = 51 struct completion completion;
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Definition for rcupdate control block. */
60static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300,
62 .completed = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE,
65};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300,
68 .completed = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE,
71}; 52};
72 53
73DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
74DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
75
76/* Fake initialization required by compiler */
77static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
78static int blimit = 10;
79static int qhimark = 10000;
80static int qlowmark = 100;
81
82static atomic_t rcu_barrier_cpu_count; 55static atomic_t rcu_barrier_cpu_count;
83static DEFINE_MUTEX(rcu_barrier_mutex); 56static DEFINE_MUTEX(rcu_barrier_mutex);
84static struct completion rcu_barrier_completion; 57static struct completion rcu_barrier_completion;
85 58
86#ifdef CONFIG_SMP 59/* Because of FASTCALL declaration of complete, we use this wrapper */
87static void force_quiescent_state(struct rcu_data *rdp, 60static void wakeme_after_rcu(struct rcu_head *head)
88 struct rcu_ctrlblk *rcp)
89{
90 int cpu;
91 cpumask_t cpumask;
92 set_need_resched();
93 if (unlikely(!rcp->signaled)) {
94 rcp->signaled = 1;
95 /*
96 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu.
98 */
99 cpumask = rcp->cpumask;
100 cpu_clear(rdp->cpu, cpumask);
101 for_each_cpu_mask(cpu, cpumask)
102 smp_send_reschedule(cpu);
103 }
104}
105#else
106static inline void force_quiescent_state(struct rcu_data *rdp,
107 struct rcu_ctrlblk *rcp)
108{ 61{
109 set_need_resched(); 62 struct rcu_synchronize *rcu;
63
64 rcu = container_of(head, struct rcu_synchronize, head);
65 complete(&rcu->completion);
110} 66}
111#endif
112 67
113/** 68/**
114 * call_rcu - Queue an RCU callback for invocation after a grace period. 69 * synchronize_rcu - wait until a grace period has elapsed.
115 * @head: structure to be used for queueing the RCU updates.
116 * @func: actual update function to be invoked after the grace period
117 * 70 *
118 * The update function will be invoked some time after a full grace 71 * Control will return to the caller some time after a full grace
119 * period elapses, in other words after all currently executing RCU 72 * period has elapsed, in other words after all currently executing RCU
120 * read-side critical sections have completed. RCU read-side critical 73 * read-side critical sections have completed. RCU read-side critical
121 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 74 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
122 * and may be nested. 75 * and may be nested.
123 */ 76 */
124void fastcall call_rcu(struct rcu_head *head, 77void synchronize_rcu(void)
125 void (*func)(struct rcu_head *rcu))
126{
127 unsigned long flags;
128 struct rcu_data *rdp;
129
130 head->func = func;
131 head->next = NULL;
132 local_irq_save(flags);
133 rdp = &__get_cpu_var(rcu_data);
134 *rdp->nxttail = head;
135 rdp->nxttail = &head->next;
136 if (unlikely(++rdp->qlen > qhimark)) {
137 rdp->blimit = INT_MAX;
138 force_quiescent_state(rdp, &rcu_ctrlblk);
139 }
140 local_irq_restore(flags);
141}
142
143/**
144 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
145 * @head: structure to be used for queueing the RCU updates.
146 * @func: actual update function to be invoked after the grace period
147 *
148 * The update function will be invoked some time after a full grace
149 * period elapses, in other words after all currently executing RCU
150 * read-side critical sections have completed. call_rcu_bh() assumes
151 * that the read-side critical sections end on completion of a softirq
152 * handler. This means that read-side critical sections in process
153 * context must not be interrupted by softirqs. This interface is to be
154 * used when most of the read-side critical sections are in softirq context.
155 * RCU read-side critical sections are delimited by rcu_read_lock() and
156 * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
157 * and rcu_read_unlock_bh(), if in process context. These may be nested.
158 */
159void fastcall call_rcu_bh(struct rcu_head *head,
160 void (*func)(struct rcu_head *rcu))
161{ 78{
162 unsigned long flags; 79 struct rcu_synchronize rcu;
163 struct rcu_data *rdp;
164
165 head->func = func;
166 head->next = NULL;
167 local_irq_save(flags);
168 rdp = &__get_cpu_var(rcu_bh_data);
169 *rdp->nxttail = head;
170 rdp->nxttail = &head->next;
171
172 if (unlikely(++rdp->qlen > qhimark)) {
173 rdp->blimit = INT_MAX;
174 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
175 }
176
177 local_irq_restore(flags);
178}
179 80
180/* 81 init_completion(&rcu.completion);
181 * Return the number of RCU batches processed thus far. Useful 82 /* Will wake me after RCU finished */
182 * for debug and statistics. 83 call_rcu(&rcu.head, wakeme_after_rcu);
183 */
184long rcu_batches_completed(void)
185{
186 return rcu_ctrlblk.completed;
187}
188 84
189/* 85 /* Wait for it */
190 * Return the number of RCU batches processed thus far. Useful 86 wait_for_completion(&rcu.completion);
191 * for debug and statistics.
192 */
193long rcu_batches_completed_bh(void)
194{
195 return rcu_bh_ctrlblk.completed;
196} 87}
88EXPORT_SYMBOL_GPL(synchronize_rcu);
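
The rewritten synchronize_rcu() above is what the publish-then-wait-then-free update pattern relies on; a minimal sketch with hypothetical names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_config {
        int threshold;
};

static struct my_config *cur_config;    /* readers rcu_dereference() this */

/* Caller serializes updaters (e.g. with a mutex). */
static void my_config_replace(struct my_config *newc)
{
        struct my_config *old = cur_config;

        rcu_assign_pointer(cur_config, newc);   /* publish the new version */
        synchronize_rcu();                      /* wait out pre-existing readers */
        kfree(old);                             /* no reader can still hold old */
}
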
197 89
198static void rcu_barrier_callback(struct rcu_head *notused) 90static void rcu_barrier_callback(struct rcu_head *notused)
199{ 91{
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused)
207static void rcu_barrier_func(void *notused) 99static void rcu_barrier_func(void *notused)
208{ 100{
209 int cpu = smp_processor_id(); 101 int cpu = smp_processor_id();
210 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 102 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
211 struct rcu_head *head;
212 103
213 head = &rdp->barrier;
214 atomic_inc(&rcu_barrier_cpu_count); 104 atomic_inc(&rcu_barrier_cpu_count);
215 call_rcu(head, rcu_barrier_callback); 105 call_rcu(head, rcu_barrier_callback);
216} 106}
@@ -225,420 +115,24 @@ void rcu_barrier(void)
225 mutex_lock(&rcu_barrier_mutex); 115 mutex_lock(&rcu_barrier_mutex);
226 init_completion(&rcu_barrier_completion); 116 init_completion(&rcu_barrier_completion);
227 atomic_set(&rcu_barrier_cpu_count, 0); 117 atomic_set(&rcu_barrier_cpu_count, 0);
118 /*
119 * The queueing of callbacks in all CPUs must be atomic with
120 * respect to RCU, otherwise one CPU may queue a callback,
121 * wait for a grace period, decrement barrier count and call
122 * complete(), while other CPUs have not yet queued anything.
123 * So, we need to make sure that grace periods cannot complete
124 * until all the callbacks are queued.
125 */
126 rcu_read_lock();
228 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 127 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
128 rcu_read_unlock();
229 wait_for_completion(&rcu_barrier_completion); 129 wait_for_completion(&rcu_barrier_completion);
230 mutex_unlock(&rcu_barrier_mutex); 130 mutex_unlock(&rcu_barrier_mutex);
231} 131}
232EXPORT_SYMBOL_GPL(rcu_barrier); 132EXPORT_SYMBOL_GPL(rcu_barrier);
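
The comment added above explains why the queueing of the barrier callbacks must sit inside rcu_read_lock(); the typical caller of rcu_barrier() itself is module-unload code that queued callbacks with call_rcu(), sketched hypothetically below:

#include <linux/module.h>
#include <linux/rcupdate.h>

static void __exit my_module_exit(void)
{
        /* Stop queueing new call_rcu() callbacks first ... */
        rcu_barrier();  /* ... then wait for every already-queued one to run. */
        /* Only now is it safe to destroy caches and let the code go away. */
}
module_exit(my_module_exit);
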
233 133
234/*
235 * Invoke the completed RCU callbacks. They are expected to be in
236 * a per-cpu list.
237 */
238static void rcu_do_batch(struct rcu_data *rdp)
239{
240 struct rcu_head *next, *list;
241 int count = 0;
242
243 list = rdp->donelist;
244 while (list) {
245 next = list->next;
246 prefetch(next);
247 list->func(list);
248 list = next;
249 if (++count >= rdp->blimit)
250 break;
251 }
252 rdp->donelist = list;
253
254 local_irq_disable();
255 rdp->qlen -= count;
256 local_irq_enable();
257 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
258 rdp->blimit = blimit;
259
260 if (!rdp->donelist)
261 rdp->donetail = &rdp->donelist;
262 else
263 tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
264}
265
266/*
267 * Grace period handling:
268 * The grace period handling consists out of two steps:
269 * - A new grace period is started.
270 * This is done by rcu_start_batch. The start is not broadcasted to
271 * all cpus, they must pick this up by comparing rcp->cur with
272 * rdp->quiescbatch. All cpus are recorded in the
273 * rcu_ctrlblk.cpumask bitmap.
274 * - All cpus must go through a quiescent state.
275 * Since the start of the grace period is not broadcasted, at least two
276 * calls to rcu_check_quiescent_state are required:
277 * The first call just notices that a new grace period is running. The
278 * following calls check if there was a quiescent state since the beginning
279 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
280 * the bitmap is empty, then the grace period is completed.
281 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
282 * period (if necessary).
283 */
284/*
285 * Register a new batch of callbacks, and start it up if there is currently no
286 * active batch and the batch to be registered has not already occurred.
287 * Caller must hold rcu_ctrlblk.lock.
288 */
289static void rcu_start_batch(struct rcu_ctrlblk *rcp)
290{
291 if (rcp->next_pending &&
292 rcp->completed == rcp->cur) {
293 rcp->next_pending = 0;
294 /*
295 * next_pending == 0 must be visible in
296 * __rcu_process_callbacks() before it can see new value of cur.
297 */
298 smp_wmb();
299 rcp->cur++;
300
301 /*
302 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
303 * Barrier Otherwise it can cause tickless idle CPUs to be
304 * included in rcp->cpumask, which will extend graceperiods
305 * unnecessarily.
306 */
307 smp_mb();
308 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
309
310 rcp->signaled = 0;
311 }
312}
313
314/*
315 * cpu went through a quiescent state since the beginning of the grace period.
316 * Clear it from the cpu mask and complete the grace period if it was the last
317 * cpu. Start another grace period if someone has further entries pending
318 */
319static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
320{
321 cpu_clear(cpu, rcp->cpumask);
322 if (cpus_empty(rcp->cpumask)) {
323 /* batch completed ! */
324 rcp->completed = rcp->cur;
325 rcu_start_batch(rcp);
326 }
327}
328
329/*
330 * Check if the cpu has gone through a quiescent state (say context
331 * switch). If so and if it already hasn't done so in this RCU
332 * quiescent cycle, then indicate that it has done so.
333 */
334static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
335 struct rcu_data *rdp)
336{
337 if (rdp->quiescbatch != rcp->cur) {
338 /* start new grace period: */
339 rdp->qs_pending = 1;
340 rdp->passed_quiesc = 0;
341 rdp->quiescbatch = rcp->cur;
342 return;
343 }
344
345 /* Grace period already completed for this cpu?
346 * qs_pending is checked instead of the actual bitmap to avoid
347 * cacheline trashing.
348 */
349 if (!rdp->qs_pending)
350 return;
351
352 /*
353 * Was there a quiescent state since the beginning of the grace
354 * period? If no, then exit and wait for the next call.
355 */
356 if (!rdp->passed_quiesc)
357 return;
358 rdp->qs_pending = 0;
359
360 spin_lock(&rcp->lock);
361 /*
362 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
363 * during cpu startup. Ignore the quiescent state.
364 */
365 if (likely(rdp->quiescbatch == rcp->cur))
366 cpu_quiet(rdp->cpu, rcp);
367
368 spin_unlock(&rcp->lock);
369}
370
371
372#ifdef CONFIG_HOTPLUG_CPU
373
374/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
375 * locking requirements, the list it's pulling from has to belong to a cpu
376 * which is dead and hence not processing interrupts.
377 */
378static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
379 struct rcu_head **tail)
380{
381 local_irq_disable();
382 *this_rdp->nxttail = list;
383 if (list)
384 this_rdp->nxttail = tail;
385 local_irq_enable();
386}
387
388static void __rcu_offline_cpu(struct rcu_data *this_rdp,
389 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
390{
391 /* if the cpu going offline owns the grace period
392 * we can block indefinitely waiting for it, so flush
393 * it here
394 */
395 spin_lock_bh(&rcp->lock);
396 if (rcp->cur != rcp->completed)
397 cpu_quiet(rdp->cpu, rcp);
398 spin_unlock_bh(&rcp->lock);
399 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
400 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
401 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
402}
403
404static void rcu_offline_cpu(int cpu)
405{
406 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
407 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
408
409 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
410 &per_cpu(rcu_data, cpu));
411 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
412 &per_cpu(rcu_bh_data, cpu));
413 put_cpu_var(rcu_data);
414 put_cpu_var(rcu_bh_data);
415 tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
416}
417
418#else
419
420static void rcu_offline_cpu(int cpu)
421{
422}
423
424#endif
425
426/*
427 * This does the RCU processing work from tasklet context.
428 */
429static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
430 struct rcu_data *rdp)
431{
432 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
433 *rdp->donetail = rdp->curlist;
434 rdp->donetail = rdp->curtail;
435 rdp->curlist = NULL;
436 rdp->curtail = &rdp->curlist;
437 }
438
439 if (rdp->nxtlist && !rdp->curlist) {
440 local_irq_disable();
441 rdp->curlist = rdp->nxtlist;
442 rdp->curtail = rdp->nxttail;
443 rdp->nxtlist = NULL;
444 rdp->nxttail = &rdp->nxtlist;
445 local_irq_enable();
446
447 /*
448 * start the next batch of callbacks
449 */
450
451 /* determine batch number */
452 rdp->batch = rcp->cur + 1;
453 /* see the comment and corresponding wmb() in
454 * the rcu_start_batch()
455 */
456 smp_rmb();
457
458 if (!rcp->next_pending) {
459 /* and start it/schedule start if it's a new batch */
460 spin_lock(&rcp->lock);
461 rcp->next_pending = 1;
462 rcu_start_batch(rcp);
463 spin_unlock(&rcp->lock);
464 }
465 }
466
467 rcu_check_quiescent_state(rcp, rdp);
468 if (rdp->donelist)
469 rcu_do_batch(rdp);
470}
471
472static void rcu_process_callbacks(unsigned long unused)
473{
474 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
475 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
476}
477
478static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
479{
480 /* This cpu has pending rcu entries and the grace period
481 * for them has completed.
482 */
483 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
484 return 1;
485
486 /* This cpu has no pending entries, but there are new entries */
487 if (!rdp->curlist && rdp->nxtlist)
488 return 1;
489
490 /* This cpu has finished callbacks to invoke */
491 if (rdp->donelist)
492 return 1;
493
494 /* The rcu core waits for a quiescent state from the cpu */
495 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
496 return 1;
497
498 /* nothing to do */
499 return 0;
500}
501
502/*
503 * Check to see if there is any immediate RCU-related work to be done
504 * by the current CPU, returning 1 if so. This function is part of the
505 * RCU implementation; it is -not- an exported member of the RCU API.
506 */
507int rcu_pending(int cpu)
508{
509 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
510 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
511}
512
513/*
514 * Check to see if any future RCU-related work will need to be done
515 * by the current CPU, even if none need be done immediately, returning
516 * 1 if so. This function is part of the RCU implementation; it is -not-
517 * an exported member of the RCU API.
518 */
519int rcu_needs_cpu(int cpu)
520{
521 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
522 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
523
524 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
525}
526
527void rcu_check_callbacks(int cpu, int user)
528{
529 if (user ||
530 (idle_cpu(cpu) && !in_softirq() &&
531 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
532 rcu_qsctr_inc(cpu);
533 rcu_bh_qsctr_inc(cpu);
534 } else if (!in_softirq())
535 rcu_bh_qsctr_inc(cpu);
536 tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
537}
538
539static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
540 struct rcu_data *rdp)
541{
542 memset(rdp, 0, sizeof(*rdp));
543 rdp->curtail = &rdp->curlist;
544 rdp->nxttail = &rdp->nxtlist;
545 rdp->donetail = &rdp->donelist;
546 rdp->quiescbatch = rcp->completed;
547 rdp->qs_pending = 0;
548 rdp->cpu = cpu;
549 rdp->blimit = blimit;
550}
551
552static void __cpuinit rcu_online_cpu(int cpu)
553{
554 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
555 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
556
557 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
558 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
559 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
560}
561
562static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
563 unsigned long action, void *hcpu)
564{
565 long cpu = (long)hcpu;
566 switch (action) {
567 case CPU_UP_PREPARE:
568 case CPU_UP_PREPARE_FROZEN:
569 rcu_online_cpu(cpu);
570 break;
571 case CPU_DEAD:
572 case CPU_DEAD_FROZEN:
573 rcu_offline_cpu(cpu);
574 break;
575 default:
576 break;
577 }
578 return NOTIFY_OK;
579}
580
581static struct notifier_block __cpuinitdata rcu_nb = {
582 .notifier_call = rcu_cpu_notify,
583};
584
585/*
586 * Initializes rcu mechanism. Assumed to be called early.
587 * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
588 * Note that rcu_qsctr and friends are implicitly
589 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
590 */
591void __init rcu_init(void) 134void __init rcu_init(void)
592{ 135{
593 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 136 __rcu_init();
594 (void *)(long)smp_processor_id());
595 /* Register notifier for non-boot CPUs */
596 register_cpu_notifier(&rcu_nb);
597}
598
599struct rcu_synchronize {
600 struct rcu_head head;
601 struct completion completion;
602};
603
604/* Because of FASTCALL declaration of complete, we use this wrapper */
605static void wakeme_after_rcu(struct rcu_head *head)
606{
607 struct rcu_synchronize *rcu;
608
609 rcu = container_of(head, struct rcu_synchronize, head);
610 complete(&rcu->completion);
611} 137}
612 138
613/**
614 * synchronize_rcu - wait until a grace period has elapsed.
615 *
616 * Control will return to the caller some time after a full grace
617 * period has elapsed, in other words after all currently executing RCU
618 * read-side critical sections have completed. RCU read-side critical
619 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
620 * and may be nested.
621 *
622 * If your read-side code is not protected by rcu_read_lock(), do -not-
623 * use synchronize_rcu().
624 */
625void synchronize_rcu(void)
626{
627 struct rcu_synchronize rcu;
628
629 init_completion(&rcu.completion);
630 /* Will wake me after RCU finished */
631 call_rcu(&rcu.head, wakeme_after_rcu);
632
633 /* Wait for it */
634 wait_for_completion(&rcu.completion);
635}
636
637module_param(blimit, int, 0);
638module_param(qhimark, int, 0);
639module_param(qlowmark, int, 0);
640EXPORT_SYMBOL_GPL(rcu_batches_completed);
641EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
642EXPORT_SYMBOL_GPL(call_rcu);
643EXPORT_SYMBOL_GPL(call_rcu_bh);
644EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
new file mode 100644
index 000000000000..987cfb7ade89
--- /dev/null
+++ b/kernel/rcupreempt.c
@@ -0,0 +1,953 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * Papers: http://www.rdrop.com/users/paulmck/RCU
27 *
28 * Design Document: http://lwn.net/Articles/253651/
29 *
30 * For detailed explanation of Read-Copy Update mechanism see -
31 * Documentation/RCU/ *.txt
32 *
33 */
34#include <linux/types.h>
35#include <linux/kernel.h>
36#include <linux/init.h>
37#include <linux/spinlock.h>
38#include <linux/smp.h>
39#include <linux/rcupdate.h>
40#include <linux/interrupt.h>
41#include <linux/sched.h>
42#include <asm/atomic.h>
43#include <linux/bitops.h>
44#include <linux/module.h>
45#include <linux/completion.h>
46#include <linux/moduleparam.h>
47#include <linux/percpu.h>
48#include <linux/notifier.h>
49#include <linux/rcupdate.h>
50#include <linux/cpu.h>
51#include <linux/random.h>
52#include <linux/delay.h>
53#include <linux/byteorder/swabb.h>
54#include <linux/cpumask.h>
55#include <linux/rcupreempt_trace.h>
56
57/*
58 * Macro that prevents the compiler from reordering accesses, but does
59 * absolutely -nothing- to prevent CPUs from reordering. This is used
60 * only to mediate communication between mainline code and hardware
61 * interrupt and NMI handlers.
62 */
63#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
64
65/*
66 * PREEMPT_RCU data structures.
67 */
68
69/*
70 * GP_STAGES specifies the number of times the state machine has
 71 * to go through all the rcu_try_flip_states (see below)
72 * in a single Grace Period.
73 *
74 * GP in GP_STAGES stands for Grace Period ;)
75 */
76#define GP_STAGES 2
77struct rcu_data {
78 spinlock_t lock; /* Protect rcu_data fields. */
79 long completed; /* Number of last completed batch. */
80 int waitlistcount;
81 struct tasklet_struct rcu_tasklet;
82 struct rcu_head *nextlist;
83 struct rcu_head **nexttail;
84 struct rcu_head *waitlist[GP_STAGES];
85 struct rcu_head **waittail[GP_STAGES];
86 struct rcu_head *donelist;
87 struct rcu_head **donetail;
88 long rcu_flipctr[2];
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
 101 * Stay here if nothing is happening. Flip the counter if something
 102 * starts happening. Denoted by "I".
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130struct rcu_ctrlblk {
131 spinlock_t fliplock; /* Protect state-machine transitions. */
132 long completed; /* Number of last completed batch. */
133 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
134 the rcu state machine */
135};
136
137static DEFINE_PER_CPU(struct rcu_data, rcu_data);
138static struct rcu_ctrlblk rcu_ctrlblk = {
139 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
140 .completed = 0,
141 .rcu_try_flip_state = rcu_try_flip_idle_state,
142};
143
144
145#ifdef CONFIG_RCU_TRACE
146static char *rcu_try_flip_state_names[] =
147 { "idle", "waitack", "waitzero", "waitmb" };
148#endif /* #ifdef CONFIG_RCU_TRACE */
149
150static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
151
152/*
153 * Enum and per-CPU flag to determine when each CPU has seen
154 * the most recent counter flip.
155 */
156
157enum rcu_flip_flag_values {
158 rcu_flip_seen, /* Steady/initial state, last flip seen. */
159 /* Only GP detector can update. */
160 rcu_flipped /* Flip just completed, need confirmation. */
161 /* Only corresponding CPU can update. */
162};
163static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
164 = rcu_flip_seen;
165
166/*
167 * Enum and per-CPU flag to determine when each CPU has executed the
168 * needed memory barrier to fence in memory references from its last RCU
169 * read-side critical section in the just-completed grace period.
170 */
171
172enum rcu_mb_flag_values {
173 rcu_mb_done, /* Steady/initial state, no mb()s required. */
174 /* Only GP detector can update. */
175 rcu_mb_needed /* Flip just completed, need an mb(). */
176 /* Only corresponding CPU can update. */
177};
178static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
179 = rcu_mb_done;
180
181/*
182 * RCU_DATA_ME: find the current CPU's rcu_data structure.
183 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
184 */
185#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
186#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
187
188/*
189 * Helper macro for tracing when the appropriate rcu_data is not
190 * cached in a local variable, but where the CPU number is so cached.
191 */
192#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
193
194/*
195 * Helper macro for tracing when the appropriate rcu_data is not
196 * cached in a local variable.
197 */
198#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
199
200/*
201 * Helper macro for tracing when the appropriate rcu_data is pointed
202 * to by a local variable.
203 */
204#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
205
206/*
207 * Return the number of RCU batches processed thus far. Useful
208 * for debug and statistics.
209 */
210long rcu_batches_completed(void)
211{
212 return rcu_ctrlblk.completed;
213}
214EXPORT_SYMBOL_GPL(rcu_batches_completed);
215
216EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
217
218void __rcu_read_lock(void)
219{
220 int idx;
221 struct task_struct *t = current;
222 int nesting;
223
224 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
225 if (nesting != 0) {
226
227 /* An earlier rcu_read_lock() covers us, just count it. */
228
229 t->rcu_read_lock_nesting = nesting + 1;
230
231 } else {
232 unsigned long flags;
233
234 /*
235 * We disable interrupts for the following reasons:
236 * - If we get scheduling clock interrupt here, and we
237 * end up acking the counter flip, it's like a promise
238 * that we will never increment the old counter again.
239 * Thus we will break that promise if that
240 * scheduling clock interrupt happens between the time
241 * we pick the .completed field and the time that we
242 * increment our counter.
243 *
244 * - We don't want to be preempted out here.
245 *
246 * NMIs can still occur, of course, and might themselves
247 * contain rcu_read_lock().
248 */
249
250 local_irq_save(flags);
251
252 /*
253 * Outermost nesting of rcu_read_lock(), so increment
254 * the current counter for the current CPU. Use volatile
255 * casts to prevent the compiler from reordering.
256 */
257
258 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
259 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
260
261 /*
262 * Now that the per-CPU counter has been incremented, we
263 * are protected from races with rcu_read_lock() invoked
264 * from NMI handlers on this CPU. We can therefore safely
265 * increment the nesting counter, relieving further NMIs
266 * of the need to increment the per-CPU counter.
267 */
268
269 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
270
271 /*
 272 * Now that we have prevented any NMIs from storing
273 * to the ->rcu_flipctr_idx, we can safely use it to
274 * remember which counter to decrement in the matching
275 * rcu_read_unlock().
276 */
277
278 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
279 local_irq_restore(flags);
280 }
281}
282EXPORT_SYMBOL_GPL(__rcu_read_lock);
283
284void __rcu_read_unlock(void)
285{
286 int idx;
287 struct task_struct *t = current;
288 int nesting;
289
290 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
291 if (nesting > 1) {
292
293 /*
294 * We are still protected by the enclosing rcu_read_lock(),
295 * so simply decrement the counter.
296 */
297
298 t->rcu_read_lock_nesting = nesting - 1;
299
300 } else {
301 unsigned long flags;
302
303 /*
304 * Disable local interrupts to prevent the grace-period
305 * detection state machine from seeing us half-done.
306 * NMIs can still occur, of course, and might themselves
307 * contain rcu_read_lock() and rcu_read_unlock().
308 */
309
310 local_irq_save(flags);
311
312 /*
313 * Outermost nesting of rcu_read_unlock(), so we must
314 * decrement the current counter for the current CPU.
315 * This must be done carefully, because NMIs can
316 * occur at any point in this code, and any rcu_read_lock()
317 * and rcu_read_unlock() pairs in the NMI handlers
318 * must interact non-destructively with this code.
319 * Lots of volatile casts, and -very- careful ordering.
320 *
321 * Changes to this code, including this one, must be
322 * inspected, validated, and tested extremely carefully!!!
323 */
324
325 /*
326 * First, pick up the index.
327 */
328
329 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
330
331 /*
332 * Now that we have fetched the counter index, it is
333 * safe to decrement the per-task RCU nesting counter.
334 * After this, any interrupts or NMIs will increment and
335 * decrement the per-CPU counters.
336 */
337 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
338
339 /*
340 * It is now safe to decrement this task's nesting count.
341 * NMIs that occur after this statement will route their
342 * rcu_read_lock() calls through this "else" clause, and
343 * will thus start incrementing the per-CPU counter on
344 * their own. They will also clobber ->rcu_flipctr_idx,
345 * but that is OK, since we have already fetched it.
346 */
347
348 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
349 local_irq_restore(flags);
350 }
351}
352EXPORT_SYMBOL_GPL(__rcu_read_unlock);
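
For reference, these two primitives sit underneath the ordinary reader-side pattern; with CONFIG_PREEMPT_RCU the outermost rcu_read_lock() in a hypothetical lookup like the one below enters __rcu_read_lock() above:

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/rcupdate.h>

struct my_node {                        /* hypothetical RCU-protected element */
        struct list_head list;
        int key, data;
};

static int my_lookup(struct list_head *head, int key, int *data)
{
        struct my_node *n;
        int ret = -ENOENT;

        rcu_read_lock();                /* outermost: bumps this CPU's rcu_flipctr */
        list_for_each_entry_rcu(n, head, list) {
                if (n->key == key) {
                        *data = n->data;        /* use the element inside the section */
                        ret = 0;
                        break;
                }
        }
        rcu_read_unlock();              /* decrements the counter picked at lock time */
        return ret;
}
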
353
354/*
355 * If a global counter flip has occurred since the last time that we
356 * advanced callbacks, advance them. Hardware interrupts must be
357 * disabled when calling this function.
358 */
359static void __rcu_advance_callbacks(struct rcu_data *rdp)
360{
361 int cpu;
362 int i;
363 int wlc = 0;
364
365 if (rdp->completed != rcu_ctrlblk.completed) {
366 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
367 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
368 rdp->donetail = rdp->waittail[GP_STAGES - 1];
369 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
370 }
371 for (i = GP_STAGES - 2; i >= 0; i--) {
372 if (rdp->waitlist[i] != NULL) {
373 rdp->waitlist[i + 1] = rdp->waitlist[i];
374 rdp->waittail[i + 1] = rdp->waittail[i];
375 wlc++;
376 } else {
377 rdp->waitlist[i + 1] = NULL;
378 rdp->waittail[i + 1] =
379 &rdp->waitlist[i + 1];
380 }
381 }
382 if (rdp->nextlist != NULL) {
383 rdp->waitlist[0] = rdp->nextlist;
384 rdp->waittail[0] = rdp->nexttail;
385 wlc++;
386 rdp->nextlist = NULL;
387 rdp->nexttail = &rdp->nextlist;
388 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
389 } else {
390 rdp->waitlist[0] = NULL;
391 rdp->waittail[0] = &rdp->waitlist[0];
392 }
393 rdp->waitlistcount = wlc;
394 rdp->completed = rcu_ctrlblk.completed;
395 }
396
397 /*
398 * Check to see if this CPU needs to report that it has seen
399 * the most recent counter flip, thereby declaring that all
400 * subsequent rcu_read_lock() invocations will respect this flip.
401 */
402
403 cpu = raw_smp_processor_id();
404 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
405 smp_mb(); /* Subsequent counter accesses must see new value */
406 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
407 smp_mb(); /* Subsequent RCU read-side critical sections */
408 /* seen -after- acknowledgement. */
409 }
410}
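To make the stage shifting easier to follow, here is a small user-space model of
the callback pipeline above that tracks only callback counts; GP_STAGES and the
plain integer "lists" are simplifying assumptions for illustration, not the
kernel data structures. It also shows why a callback queued via call_rcu() waits
through at least GP_STAGES counter flips before it becomes invocable.

	#include <stdio.h>

	#define GP_STAGES 2

	static int next_cbs, wait_cbs[GP_STAGES], done_cbs;

	static void advance(void)		/* models one observed counter flip */
	{
		int i;

		done_cbs += wait_cbs[GP_STAGES - 1];
		for (i = GP_STAGES - 2; i >= 0; i--)
			wait_cbs[i + 1] = wait_cbs[i];
		wait_cbs[0] = next_cbs;
		next_cbs = 0;
	}

	int main(void)
	{
		next_cbs = 3;			/* three call_rcu() invocations */
		advance();			/* flip 1: next -> wait[0] */
		advance();			/* flip 2: wait[0] -> wait[1] */
		advance();			/* flip 3: wait[1] -> done */
		printf("done=%d\n", done_cbs);	/* prints done=3 */
		return 0;
	}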
411
412/*
413 * Get here when RCU is idle. Decide whether we need to
414 * move out of idle state, and return non-zero if so.
415 * "Straightforward" approach for the moment, might later
416 * use callback-list lengths, grace-period duration, or
417 * some such to determine when to exit idle state.
418 * Might also need a pre-idle test that does not acquire
419 * the lock, but let's get the simple case working first...
420 */
421
422static int
423rcu_try_flip_idle(void)
424{
425 int cpu;
426
427 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
428 if (!rcu_pending(smp_processor_id())) {
429 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
430 return 0;
431 }
432
433 /*
434 * Do the flip.
435 */
436
437 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
438 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
439
440 /*
441 * Need a memory barrier so that other CPUs see the new
442 * counter value before they see the subsequent change of all
443 * the rcu_flip_flag instances to rcu_flipped.
444 */
445
446 smp_mb(); /* see above block comment. */
447
448 /* Now ask each CPU for acknowledgement of the flip. */
449
450 for_each_cpu_mask(cpu, rcu_cpu_online_map)
451 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
452
453 return 1;
454}
455
456/*
457 * Wait for CPUs to acknowledge the flip.
458 */
459
460static int
461rcu_try_flip_waitack(void)
462{
463 int cpu;
464
465 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
466 for_each_cpu_mask(cpu, rcu_cpu_online_map)
467 if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
468 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
469 return 0;
470 }
471
472 /*
473 * Make sure our checks above don't bleed into subsequent
474 * waiting for the sum of the counters to reach zero.
475 */
476
477 smp_mb(); /* see above block comment. */
478 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
479 return 1;
480}
481
482/*
483 * Wait for collective ``last'' counter to reach zero,
484 * then tell all CPUs to do an end-of-grace-period memory barrier.
485 */
486
487static int
488rcu_try_flip_waitzero(void)
489{
490 int cpu;
491 int lastidx = !(rcu_ctrlblk.completed & 0x1);
492 int sum = 0;
493
494 /* Check to see if the sum of the "last" counters is zero. */
495
496 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
497 for_each_cpu_mask(cpu, rcu_cpu_online_map)
498 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
499 if (sum != 0) {
500 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
501 return 0;
502 }
503
504 /*
505 * This ensures that the other CPUs see the call for
506 * memory barriers -after- the sum to zero has been
507 * detected here
508 */
509 smp_mb(); /* ^^^^^^^^^^^^ */
510
511 /* Call for a memory barrier from each CPU. */
512 for_each_cpu_mask(cpu, rcu_cpu_online_map)
513 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
514
515 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
516 return 1;
517}
518
519/*
520 * Wait for all CPUs to do their end-of-grace-period memory barrier.
521 * Return 1 once all CPUs have done so, and 0 while still waiting.
522 */
523
524static int
525rcu_try_flip_waitmb(void)
526{
527 int cpu;
528
529 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
530 for_each_cpu_mask(cpu, rcu_cpu_online_map)
531 if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
532 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
533 return 0;
534 }
535
536 smp_mb(); /* Ensure that the above checks precede any following flip. */
537 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
538 return 1;
539}
540
541/*
542 * Attempt a single flip of the counters. Remember, a single flip does
543 * -not- constitute a grace period. Instead, the interval between
544 * at least GP_STAGES consecutive flips is a grace period.
545 *
546 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
547 * on a large SMP, they might want to use a hierarchical organization of
548 * the per-CPU-counter pairs.
549 */
550static void rcu_try_flip(void)
551{
552 unsigned long flags;
553
554 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
555 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
556 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
557 return;
558 }
559
560 /*
561 * Take the next transition(s) through the RCU grace-period
562 * flip-counter state machine.
563 */
564
565 switch (rcu_ctrlblk.rcu_try_flip_state) {
566 case rcu_try_flip_idle_state:
567 if (rcu_try_flip_idle())
568 rcu_ctrlblk.rcu_try_flip_state =
569 rcu_try_flip_waitack_state;
570 break;
571 case rcu_try_flip_waitack_state:
572 if (rcu_try_flip_waitack())
573 rcu_ctrlblk.rcu_try_flip_state =
574 rcu_try_flip_waitzero_state;
575 break;
576 case rcu_try_flip_waitzero_state:
577 if (rcu_try_flip_waitzero())
578 rcu_ctrlblk.rcu_try_flip_state =
579 rcu_try_flip_waitmb_state;
580 break;
581 case rcu_try_flip_waitmb_state:
582 if (rcu_try_flip_waitmb())
583 rcu_ctrlblk.rcu_try_flip_state =
584 rcu_try_flip_idle_state;
585 }
586 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
587}
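Putting the helpers above together, the state machine cycles as sketched in the
following illustrative comment; each call into rcu_try_flip() takes at most one
of these transitions, and only when its wait condition already holds.

	/*
	 *   idle     --rcu_try_flip_idle()-----> waitack  (flip counter, request acks)
	 *   waitack  --rcu_try_flip_waitack()--> waitzero (every CPU acknowledged)
	 *   waitzero --rcu_try_flip_waitzero()-> waitmb   (old counters summed to zero)
	 *   waitmb   --rcu_try_flip_waitmb()---> idle     (every CPU did its smp_mb())
	 */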
588
589/*
590 * Check to see if this CPU needs to do a memory barrier in order to
591 * ensure that any prior RCU read-side critical sections have committed
592 * their counter manipulations and critical-section memory references
593 * before declaring the grace period to be completed.
594 */
595static void rcu_check_mb(int cpu)
596{
597 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
598 smp_mb(); /* Ensure RCU read-side accesses are visible. */
599 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
600 }
601}
602
603void rcu_check_callbacks(int cpu, int user)
604{
605 unsigned long flags;
606 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
607
608 rcu_check_mb(cpu);
609 if (rcu_ctrlblk.completed == rdp->completed)
610 rcu_try_flip();
611 spin_lock_irqsave(&rdp->lock, flags);
612 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
613 __rcu_advance_callbacks(rdp);
614 if (rdp->donelist == NULL) {
615 spin_unlock_irqrestore(&rdp->lock, flags);
616 } else {
617 spin_unlock_irqrestore(&rdp->lock, flags);
618 raise_softirq(RCU_SOFTIRQ);
619 }
620}
621
622/*
623 * Needed by dynticks, to make sure all RCU processing has finished
624 * when we go idle:
625 */
626void rcu_advance_callbacks(int cpu, int user)
627{
628 unsigned long flags;
629 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
630
631 if (rcu_ctrlblk.completed == rdp->completed) {
632 rcu_try_flip();
633 if (rcu_ctrlblk.completed == rdp->completed)
634 return;
635 }
636 spin_lock_irqsave(&rdp->lock, flags);
637 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
638 __rcu_advance_callbacks(rdp);
639 spin_unlock_irqrestore(&rdp->lock, flags);
640}
641
642#ifdef CONFIG_HOTPLUG_CPU
643#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
644 *dsttail = srclist; \
645 if (srclist != NULL) { \
646 dsttail = srctail; \
647 srclist = NULL; \
648 srctail = &srclist;\
649 } \
650 } while (0)
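The macro above splices an entire source list onto a destination in O(1) by
tracking each list's tail as a pointer to its terminating link. A stand-alone
user-space sketch of the same idiom, with names invented for illustration:

	#include <stdio.h>

	struct node { struct node *next; };

	static struct node a, b;
	static struct node *src, *dst;
	static struct node **srctail = &src, **dsttail = &dst;

	int main(void)
	{
		/* build src = a -> b */
		*srctail = &a; srctail = &a.next;
		*srctail = &b; srctail = &b.next;

		/* the macro body, spelled out for src -> dst */
		*dsttail = src;
		if (src != NULL) {
			dsttail = srctail;
			src = NULL;
			srctail = &src;
		}

		printf("dst holds a,b: %d\n", dst == &a && dst->next == &b);
		return 0;
	}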
651
652void rcu_offline_cpu(int cpu)
653{
654 int i;
655 struct rcu_head *list = NULL;
656 unsigned long flags;
657 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
658 struct rcu_head **tail = &list;
659
660 /*
661 * Remove all callbacks from the newly dead CPU, retaining order.
662 * Otherwise rcu_barrier() will fail.
663 */
664
665 spin_lock_irqsave(&rdp->lock, flags);
666 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
667 for (i = GP_STAGES - 1; i >= 0; i--)
668 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
669 list, tail);
670 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
671 spin_unlock_irqrestore(&rdp->lock, flags);
672 rdp->waitlistcount = 0;
673
674 /* Disengage the newly dead CPU from the grace-period computation. */
675
676 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
677 rcu_check_mb(cpu);
678 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
679 smp_mb(); /* Subsequent counter accesses must see new value */
680 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
681 smp_mb(); /* Subsequent RCU read-side critical sections */
682 /* seen -after- acknowledgement. */
683 }
684
685 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
686 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
687
688 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
689 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
690
691 cpu_clear(cpu, rcu_cpu_online_map);
692
693 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
694
695 /*
696 * Place the removed callbacks on the current CPU's queue.
697 * Make them all start a new grace period: simple approach,
698 * in theory could starve a given set of callbacks, but
699 * you would need to be doing some serious CPU hotplugging
700 * to make this happen. If this becomes a problem, adding
701 * a synchronize_rcu() to the hotplug path would be a simple
702 * fix.
703 */
704
705 rdp = RCU_DATA_ME();
706 spin_lock_irqsave(&rdp->lock, flags);
707 *rdp->nexttail = list;
708 if (list)
709 rdp->nexttail = tail;
710 spin_unlock_irqrestore(&rdp->lock, flags);
711}
712
713void __devinit rcu_online_cpu(int cpu)
714{
715 unsigned long flags;
716
717 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
718 cpu_set(cpu, rcu_cpu_online_map);
719 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
720}
721
722#else /* #ifdef CONFIG_HOTPLUG_CPU */
723
724void rcu_offline_cpu(int cpu)
725{
726}
727
728void __devinit rcu_online_cpu(int cpu)
729{
730}
731
732#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
733
734static void rcu_process_callbacks(struct softirq_action *unused)
735{
736 unsigned long flags;
737 struct rcu_head *next, *list;
738 struct rcu_data *rdp = RCU_DATA_ME();
739
740 spin_lock_irqsave(&rdp->lock, flags);
741 list = rdp->donelist;
742 if (list == NULL) {
743 spin_unlock_irqrestore(&rdp->lock, flags);
744 return;
745 }
746 rdp->donelist = NULL;
747 rdp->donetail = &rdp->donelist;
748 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
749 spin_unlock_irqrestore(&rdp->lock, flags);
750 while (list) {
751 next = list->next;
752 list->func(list);
753 list = next;
754 RCU_TRACE_ME(rcupreempt_trace_invoke);
755 }
756}
757
758void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
759{
760 unsigned long flags;
761 struct rcu_data *rdp;
762
763 head->func = func;
764 head->next = NULL;
765 local_irq_save(flags);
766 rdp = RCU_DATA_ME();
767 spin_lock(&rdp->lock);
768 __rcu_advance_callbacks(rdp);
769 *rdp->nexttail = head;
770 rdp->nexttail = &head->next;
771 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
772 spin_unlock(&rdp->lock);
773 local_irq_restore(flags);
774}
775EXPORT_SYMBOL_GPL(call_rcu);
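A typical caller looks roughly like the following sketch; the structure and
helper names are hypothetical and only illustrate how an rcu_head is embedded
and handed to call_rcu() so that the kfree() is deferred past a grace period.

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct my_node {
		int key;
		struct rcu_head rcu;
	};

	static void my_node_reclaim(struct rcu_head *head)
	{
		struct my_node *node = container_of(head, struct my_node, rcu);

		kfree(node);		/* runs from rcu_process_callbacks() above */
	}

	static void my_node_retire(struct my_node *node)
	{
		/* readers inside rcu_read_lock()/rcu_read_unlock() may still
		 * reference node, so defer the free past a grace period */
		call_rcu(&node->rcu, my_node_reclaim);
	}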
776
777/*
778 * Wait until all currently running preempt_disable() code segments
779 * (including hardware-irq-disable segments) complete. Note that
780 * in -rt this does -not- necessarily result in all currently executing
781 * interrupt -handlers- having completed.
782 */
783void __synchronize_sched(void)
784{
785 cpumask_t oldmask;
786 int cpu;
787
788 if (sched_getaffinity(0, &oldmask) < 0)
789 oldmask = cpu_possible_map;
790 for_each_online_cpu(cpu) {
791 sched_setaffinity(0, cpumask_of_cpu(cpu));
792 schedule();
793 }
794 sched_setaffinity(0, oldmask);
795}
796EXPORT_SYMBOL_GPL(__synchronize_sched);
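A hedged sketch of the usage this function supports; the data structure is
hypothetical, and a real updater would publish the new pointer with
rcu_assign_pointer(). Readers rely on preempt-disabled regions, and the updater
waits for all such regions to drain before freeing the old data.

	#include <linux/preempt.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct cfg { int threshold; };
	static struct cfg *cur_cfg;

	static int read_threshold(void)
	{
		int val;

		preempt_disable();	/* read-side critical section for sched-RCU */
		val = cur_cfg->threshold;
		preempt_enable();
		return val;
	}

	static void replace_cfg(struct cfg *newcfg)
	{
		struct cfg *old = cur_cfg;

		cur_cfg = newcfg;	/* real code: rcu_assign_pointer() */
		__synchronize_sched();	/* wait out all preempt-disabled readers */
		kfree(old);
	}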
797
798/*
799 * Check to see if any future RCU-related work will need to be done
800 * by the current CPU, even if none need be done immediately, returning
801 * 1 if so. Assumes that notifiers would take care of handling any
802 * outstanding requests from the RCU core.
803 *
804 * This function is part of the RCU implementation; it is -not-
805 * an exported member of the RCU API.
806 */
807int rcu_needs_cpu(int cpu)
808{
809 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
810
811 return (rdp->donelist != NULL ||
812 !!rdp->waitlistcount ||
813 rdp->nextlist != NULL);
814}
815
816int rcu_pending(int cpu)
817{
818 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
819
820 /* The CPU has at least one callback queued somewhere. */
821
822 if (rdp->donelist != NULL ||
823 !!rdp->waitlistcount ||
824 rdp->nextlist != NULL)
825 return 1;
826
827 /* The RCU core needs an acknowledgement from this CPU. */
828
829 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
830 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
831 return 1;
832
833 /* This CPU has fallen behind the global grace-period number. */
834
835 if (rdp->completed != rcu_ctrlblk.completed)
836 return 1;
837
838 /* Nothing needed from this CPU. */
839
840 return 0;
841}
842
843static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
844 unsigned long action, void *hcpu)
845{
846 long cpu = (long)hcpu;
847
848 switch (action) {
849 case CPU_UP_PREPARE:
850 case CPU_UP_PREPARE_FROZEN:
851 rcu_online_cpu(cpu);
852 break;
853 case CPU_UP_CANCELED:
854 case CPU_UP_CANCELED_FROZEN:
855 case CPU_DEAD:
856 case CPU_DEAD_FROZEN:
857 rcu_offline_cpu(cpu);
858 break;
859 default:
860 break;
861 }
862 return NOTIFY_OK;
863}
864
865static struct notifier_block __cpuinitdata rcu_nb = {
866 .notifier_call = rcu_cpu_notify,
867};
868
869void __init __rcu_init(void)
870{
871 int cpu;
872 int i;
873 struct rcu_data *rdp;
874
875 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
876 for_each_possible_cpu(cpu) {
877 rdp = RCU_DATA_CPU(cpu);
878 spin_lock_init(&rdp->lock);
879 rdp->completed = 0;
880 rdp->waitlistcount = 0;
881 rdp->nextlist = NULL;
882 rdp->nexttail = &rdp->nextlist;
883 for (i = 0; i < GP_STAGES; i++) {
884 rdp->waitlist[i] = NULL;
885 rdp->waittail[i] = &rdp->waitlist[i];
886 }
887 rdp->donelist = NULL;
888 rdp->donetail = &rdp->donelist;
889 rdp->rcu_flipctr[0] = 0;
890 rdp->rcu_flipctr[1] = 0;
891 }
892 register_cpu_notifier(&rcu_nb);
893
894 /*
895 * We don't need protection against CPU-Hotplug here
896 * since
897 * a) If a CPU comes online while we are iterating over the
898 * cpu_online_map below, we would only end up making a
899 * duplicate call to rcu_online_cpu() which sets the corresponding
900 * CPU's bit in rcu_cpu_online_map.
901 *
902 * b) A CPU cannot go offline at this point in time since the user
903 * does not have access to the sysfs interface, nor do we
904 * suspend the system.
905 */
906 for_each_online_cpu(cpu)
907 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
908
909 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
910}
911
912/*
913 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
914 */
915void synchronize_kernel(void)
916{
917 synchronize_rcu();
918}
919
920#ifdef CONFIG_RCU_TRACE
921long *rcupreempt_flipctr(int cpu)
922{
923 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
924}
925EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
926
927int rcupreempt_flip_flag(int cpu)
928{
929 return per_cpu(rcu_flip_flag, cpu);
930}
931EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
932
933int rcupreempt_mb_flag(int cpu)
934{
935 return per_cpu(rcu_mb_flag, cpu);
936}
937EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
938
939char *rcupreempt_try_flip_state_name(void)
940{
941 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
942}
943EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
944
945struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
946{
947 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
948
949 return &rdp->trace;
950}
951EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
952
953#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
new file mode 100644
index 000000000000..49ac4947af24
--- /dev/null
+++ b/kernel/rcupreempt_trace.c
@@ -0,0 +1,330 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h>
43#include <linux/mutex.h>
44#include <linux/rcupreempt_trace.h>
45#include <linux/debugfs.h>
46
47static struct mutex rcupreempt_trace_mutex;
48static char *rcupreempt_trace_buf;
49#define RCUPREEMPT_TRACE_BUF_SIZE 4096
50
51void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
52{
53 trace->done_length += trace->wait_length;
54 trace->done_add += trace->wait_length;
55 trace->wait_length = 0;
56}
57void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
58{
59 trace->wait_length += trace->next_length;
60 trace->wait_add += trace->next_length;
61 trace->next_length = 0;
62}
63void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
64{
65 atomic_inc(&trace->rcu_try_flip_1);
66}
67void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
68{
69 atomic_inc(&trace->rcu_try_flip_e1);
70}
71void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
72{
73 trace->rcu_try_flip_i1++;
74}
75void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
76{
77 trace->rcu_try_flip_ie1++;
78}
79void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
80{
81 trace->rcu_try_flip_g1++;
82}
83void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
84{
85 trace->rcu_try_flip_a1++;
86}
87void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
88{
89 trace->rcu_try_flip_ae1++;
90}
91void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
92{
93 trace->rcu_try_flip_a2++;
94}
95void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
96{
97 trace->rcu_try_flip_z1++;
98}
99void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
100{
101 trace->rcu_try_flip_ze1++;
102}
103void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
104{
105 trace->rcu_try_flip_z2++;
106}
107void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
108{
109 trace->rcu_try_flip_m1++;
110}
111void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
112{
113 trace->rcu_try_flip_me1++;
114}
115void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
116{
117 trace->rcu_try_flip_m2++;
118}
119void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
120{
121 trace->rcu_check_callbacks++;
122}
123void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
124{
125 trace->done_remove += trace->done_length;
126 trace->done_length = 0;
127}
128void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
129{
130 atomic_inc(&trace->done_invoked);
131}
132void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
133{
134 trace->next_add++;
135 trace->next_length++;
136}
137
138static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
139{
140 struct rcupreempt_trace *cp;
141 int cpu;
142
143 memset(sp, 0, sizeof(*sp));
144 for_each_possible_cpu(cpu) {
145 cp = rcupreempt_trace_cpu(cpu);
146 sp->next_length += cp->next_length;
147 sp->next_add += cp->next_add;
148 sp->wait_length += cp->wait_length;
149 sp->wait_add += cp->wait_add;
150 sp->done_length += cp->done_length;
151 sp->done_add += cp->done_add;
152 sp->done_remove += cp->done_remove;
153 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
154 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
155 atomic_set(&sp->rcu_try_flip_1,
156 atomic_read(&cp->rcu_try_flip_1));
157 atomic_set(&sp->rcu_try_flip_e1,
158 atomic_read(&cp->rcu_try_flip_e1));
159 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
160 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
161 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
162 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
163 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
164 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
165 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
166 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
167 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
168 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
169 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
170 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
171 }
172}
173
174static ssize_t rcustats_read(struct file *filp, char __user *buffer,
175 size_t count, loff_t *ppos)
176{
177 struct rcupreempt_trace trace;
178 ssize_t bcount;
179 int cnt = 0;
180
181 rcupreempt_trace_sum(&trace);
182 mutex_lock(&rcupreempt_trace_mutex);
183	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
184 "ggp=%ld rcc=%ld\n",
185 rcu_batches_completed(),
186 trace.rcu_check_callbacks);
187 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
188 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
189 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
190 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
191
192 trace.next_add, trace.next_length,
193 trace.wait_add, trace.wait_length,
194 trace.done_add, trace.done_length,
195 trace.done_remove, atomic_read(&trace.done_invoked),
196 atomic_read(&trace.rcu_try_flip_1),
197 atomic_read(&trace.rcu_try_flip_e1),
198 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
199 trace.rcu_try_flip_g1,
200 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
201 trace.rcu_try_flip_a2,
202 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
203 trace.rcu_try_flip_z2,
204 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
205 trace.rcu_try_flip_m2);
206 bcount = simple_read_from_buffer(buffer, count, ppos,
207 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
208 mutex_unlock(&rcupreempt_trace_mutex);
209 return bcount;
210}
211
212static ssize_t rcugp_read(struct file *filp, char __user *buffer,
213 size_t count, loff_t *ppos)
214{
215 long oldgp = rcu_batches_completed();
216 ssize_t bcount;
217
218 mutex_lock(&rcupreempt_trace_mutex);
219 synchronize_rcu();
220 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
221 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
222 bcount = simple_read_from_buffer(buffer, count, ppos,
223 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
224 mutex_unlock(&rcupreempt_trace_mutex);
225 return bcount;
226}
227
228static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
229 size_t count, loff_t *ppos)
230{
231 int cnt = 0;
232 int cpu;
233 int f = rcu_batches_completed() & 0x1;
234 ssize_t bcount;
235
236 mutex_lock(&rcupreempt_trace_mutex);
237
238 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
239 "CPU last cur F M\n");
240 for_each_online_cpu(cpu) {
241 long *flipctr = rcupreempt_flipctr(cpu);
242 cnt += snprintf(&rcupreempt_trace_buf[cnt],
243 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
244 "%3d %4ld %3ld %d %d\n",
245 cpu,
246 flipctr[!f],
247 flipctr[f],
248 rcupreempt_flip_flag(cpu),
249 rcupreempt_mb_flag(cpu));
250 }
251 cnt += snprintf(&rcupreempt_trace_buf[cnt],
252 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
253 "ggp = %ld, state = %s\n",
254 rcu_batches_completed(),
255 rcupreempt_try_flip_state_name());
256 cnt += snprintf(&rcupreempt_trace_buf[cnt],
257 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
258 "\n");
259 bcount = simple_read_from_buffer(buffer, count, ppos,
260 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
261 mutex_unlock(&rcupreempt_trace_mutex);
262 return bcount;
263}
264
265static struct file_operations rcustats_fops = {
266 .owner = THIS_MODULE,
267 .read = rcustats_read,
268};
269
270static struct file_operations rcugp_fops = {
271 .owner = THIS_MODULE,
272 .read = rcugp_read,
273};
274
275static struct file_operations rcuctrs_fops = {
276 .owner = THIS_MODULE,
277 .read = rcuctrs_read,
278};
279
280static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
281static int rcupreempt_debugfs_init(void)
282{
283 rcudir = debugfs_create_dir("rcu", NULL);
284 if (!rcudir)
285 goto out;
286 statdir = debugfs_create_file("rcustats", 0444, rcudir,
287 NULL, &rcustats_fops);
288 if (!statdir)
289 goto free_out;
290
291 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
292 if (!gpdir)
293 goto free_out;
294
295 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
296 NULL, &rcuctrs_fops);
297 if (!ctrsdir)
298 goto free_out;
299 return 0;
300free_out:
301 if (statdir)
302 debugfs_remove(statdir);
303 if (gpdir)
304 debugfs_remove(gpdir);
305 debugfs_remove(rcudir);
306out:
307 return 1;
308}
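With CONFIG_RCU_TRACE enabled, the files created above can be read from user
space. A minimal sketch, assuming debugfs is mounted at the conventional
/sys/kernel/debug location:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/sys/kernel/debug/rcu/rcustats", "r");

		if (!f) {
			perror("rcustats");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}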
309
310static int __init rcupreempt_trace_init(void)
311{
312 mutex_init(&rcupreempt_trace_mutex);
313 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
314 if (!rcupreempt_trace_buf)
315 return 1;
316 return rcupreempt_debugfs_init();
317}
318
319static void __exit rcupreempt_trace_cleanup(void)
320{
321 debugfs_remove(statdir);
322 debugfs_remove(gpdir);
323 debugfs_remove(ctrsdir);
324 debugfs_remove(rcudir);
325 kfree(rcupreempt_trace_buf);
326}
327
328
329module_init(rcupreempt_trace_init);
330module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c3e165c2318f..fd599829e72a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
726 cpumask_t tmp_mask = CPU_MASK_ALL; 726 cpumask_t tmp_mask = CPU_MASK_ALL;
727 int i; 727 int i;
728 728
729 lock_cpu_hotplug(); 729 get_online_cpus();
730 730
731 /* No point in shuffling if there is only one online CPU (ex: UP) */ 731 /* No point in shuffling if there is only one online CPU (ex: UP) */
732 if (num_online_cpus() == 1) { 732 if (num_online_cpus() == 1) {
733 unlock_cpu_hotplug(); 733 put_online_cpus();
734 return; 734 return;
735 } 735 }
736 736
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
762 else 762 else
763 rcu_idle_cpu--; 763 rcu_idle_cpu--;
764 764
765 unlock_cpu_hotplug(); 765 put_online_cpus();
766} 766}
767 767
768/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 768/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index e3055ba69159..092e4c620af9 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
394static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 394static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
395 395
396static struct sysdev_class rttest_sysclass = { 396static struct sysdev_class rttest_sysclass = {
397 set_kset_name("rttest"), 397 .name = "rttest",
398}; 398};
399 399
400static int init_test_thread(int id) 400static int init_test_thread(int id)
diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11ca6df3..ba4c88088f62 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -22,6 +22,8 @@
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
25 */ 27 */
26 28
27#include <linux/mm.h> 29#include <linux/mm.h>
@@ -63,6 +65,7 @@
63#include <linux/reciprocal_div.h> 65#include <linux/reciprocal_div.h>
64#include <linux/unistd.h> 66#include <linux/unistd.h>
65#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
66 69
67#include <asm/tlb.h> 70#include <asm/tlb.h>
68#include <asm/irq_regs.h> 71#include <asm/irq_regs.h>
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
96#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 99#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
97 100
98/* 101/*
99 * Some helpers for converting nanosecond timing to jiffy resolution 102 * Helpers for converting nanosecond timing to jiffy resolution
100 */ 103 */
101#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 104#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
102#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ))
103 105
104#define NICE_0_LOAD SCHED_LOAD_SCALE 106#define NICE_0_LOAD SCHED_LOAD_SCALE
105#define NICE_0_SHIFT SCHED_LOAD_SHIFT 107#define NICE_0_SHIFT SCHED_LOAD_SHIFT
@@ -159,6 +161,8 @@ struct rt_prio_array {
159 161
160struct cfs_rq; 162struct cfs_rq;
161 163
164static LIST_HEAD(task_groups);
165
162/* task group related information */ 166/* task group related information */
163struct task_group { 167struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -168,10 +172,50 @@ struct task_group {
168 struct sched_entity **se; 172 struct sched_entity **se;
169 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
170 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
175
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /*
182 * shares assigned to a task group governs how much of cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more is
184 * the cpu bandwidth allocated to it.
185 *
186 * For example, let's say that there are three task groups, A, B and C which
187 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
188 * cpu bandwidth allocated by the scheduler to task groups A, B and C
189 * should be:
190 *
191 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
192 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
193 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
194 *
195 * The weight assigned to a task group's schedulable entities on every
196 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
197 * group's shares. For example, let's say that task group A has been
198 * assigned shares of 1000 and there are two CPUs in a system. Then,
199 *
200 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
201 *
202 * Note: It's not necessary that each of a task group's schedulable
203 * entities has the same weight on all CPUs. If the group
204 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
205 * better distribution of weight could be:
206 *
207 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
208 * tg_A->se[1]->load.weight = 1/3 * 2000 = 667
209 *
210 * rebalance_shares() is responsible for distributing the shares of a
211 * task group like this among the group's schedulable entities across
212 * cpus.
213 *
214 */
171 unsigned long shares; 215 unsigned long shares;
172 /* spinlock to serialize modification to shares */ 216
173 spinlock_t lock;
174 struct rcu_head rcu; 217 struct rcu_head rcu;
218 struct list_head list;
175}; 219};
176 220
177/* Default task group's sched entity on each cpu */ 221/* Default task group's sched entity on each cpu */
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
179/* Default task group's cfs_rq on each cpu */ 223/* Default task group's cfs_rq on each cpu */
180static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
181 225
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
182static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229static struct sched_entity *init_sched_entity_p[NR_CPUS];
183static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
184 231
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS];
234
235/* task_group_mutex serializes add/remove of task groups and also changes to
236 * a task group's cpu shares.
237 */
238static DEFINE_MUTEX(task_group_mutex);
239
240/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex);
242
243#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task;
246static int load_balance_monitor(void *unused);
247#endif
248
249static void set_se_shares(struct sched_entity *se, unsigned long shares);
250
185/* Default task group. 251/* Default task group.
186 * Every task in the system belongs to this group at bootup. 252
187 */ 253 */
188struct task_group init_task_group = { 254struct task_group init_task_group = {
189 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
190 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257
258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p,
191}; 260};
192 261
193#ifdef CONFIG_FAIR_USER_SCHED 262#ifdef CONFIG_FAIR_USER_SCHED
194# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD 263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
195#else 264#else
196# define INIT_TASK_GRP_LOAD NICE_0_LOAD 265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
197#endif 266#endif
198 267
199static int init_task_group_load = INIT_TASK_GRP_LOAD; 268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
200 271
201/* return group to which a task belongs */ 272/* return group to which a task belongs */
202static inline struct task_group *task_group(struct task_struct *p) 273static inline struct task_group *task_group(struct task_struct *p)
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p)
215} 286}
216 287
217/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
218static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) 289static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
219{ 290{
220 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
221 p->se.parent = task_group(p)->se[cpu]; 292 p->se.parent = task_group(p)->se[cpu];
293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu];
296}
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306}
307
308static inline void lock_doms_cur(void)
309{
310 mutex_lock(&doms_cur_mutex);
311}
312
313static inline void unlock_doms_cur(void)
314{
315 mutex_unlock(&doms_cur_mutex);
222} 316}
223 317
224#else 318#else
225 319
226static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } 320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { }
227 325
228#endif /* CONFIG_FAIR_GROUP_SCHED */ 326#endif /* CONFIG_FAIR_GROUP_SCHED */
229 327
@@ -264,11 +362,57 @@ struct cfs_rq {
264/* Real-Time classes' related field in a runqueue: */ 362/* Real-Time classes' related field in a runqueue: */
265struct rt_rq { 363struct rt_rq {
266 struct rt_prio_array active; 364 struct rt_prio_array active;
267 int rt_load_balance_idx; 365 unsigned long rt_nr_running;
268 struct list_head *rt_load_balance_head, *rt_load_balance_curr; 366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */
368#endif
369#ifdef CONFIG_SMP
370 unsigned long rt_nr_migratory;
371 int overloaded;
372#endif
373 int rt_throttled;
374 u64 rt_time;
375
376#ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq;
378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg;
380 struct sched_rt_entity *rt_se;
381#endif
382};
383
384#ifdef CONFIG_SMP
385
386/*
387 * We add the notion of a root-domain which will be used to define per-domain
388 * variables. Each exclusive cpuset essentially defines an island domain by
389 * fully partitioning the member cpus from any other cpuset. Whenever a new
390 * exclusive cpuset is created, we also create and attach a new root-domain
391 * object.
392 *
393 */
394struct root_domain {
395 atomic_t refcount;
396 cpumask_t span;
397 cpumask_t online;
398
399 /*
400 * The "RT overload" flag: it gets set if a CPU has more than
401 * one runnable RT task.
402 */
403 cpumask_t rto_mask;
404 atomic_t rto_count;
269}; 405};
270 406
271/* 407/*
408 * By default the system creates a single root-domain with all cpus as
409 * members (mimicking the global state we have today).
410 */
411static struct root_domain def_root_domain;
412
413#endif
414
415/*
272 * This is the main, per-CPU runqueue data structure. 416 * This is the main, per-CPU runqueue data structure.
273 * 417 *
274 * Locking rule: those places that want to lock multiple runqueues 418 * Locking rule: those places that want to lock multiple runqueues
@@ -296,11 +440,15 @@ struct rq {
296 u64 nr_switches; 440 u64 nr_switches;
297 441
298 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt;
444 u64 rt_period_expire;
445 int rt_throttled;
446
299#ifdef CONFIG_FAIR_GROUP_SCHED 447#ifdef CONFIG_FAIR_GROUP_SCHED
300 /* list of leaf cfs_rq on this cpu: */ 448 /* list of leaf cfs_rq on this cpu: */
301 struct list_head leaf_cfs_rq_list; 449 struct list_head leaf_cfs_rq_list;
450 struct list_head leaf_rt_rq_list;
302#endif 451#endif
303 struct rt_rq rt;
304 452
305 /* 453 /*
306 * This is part of a global counter where only the total sum 454 * This is part of a global counter where only the total sum
@@ -317,7 +465,7 @@ struct rq {
317 u64 clock, prev_clock_raw; 465 u64 clock, prev_clock_raw;
318 s64 clock_max_delta; 466 s64 clock_max_delta;
319 467
320 unsigned int clock_warps, clock_overflows; 468 unsigned int clock_warps, clock_overflows, clock_underflows;
321 u64 idle_clock; 469 u64 idle_clock;
322 unsigned int clock_deep_idle_events; 470 unsigned int clock_deep_idle_events;
323 u64 tick_timestamp; 471 u64 tick_timestamp;
@@ -325,6 +473,7 @@ struct rq {
325 atomic_t nr_iowait; 473 atomic_t nr_iowait;
326 474
327#ifdef CONFIG_SMP 475#ifdef CONFIG_SMP
476 struct root_domain *rd;
328 struct sched_domain *sd; 477 struct sched_domain *sd;
329 478
330 /* For active balancing */ 479 /* For active balancing */
@@ -337,6 +486,12 @@ struct rq {
337 struct list_head migration_queue; 486 struct list_head migration_queue;
338#endif 487#endif
339 488
489#ifdef CONFIG_SCHED_HRTICK
490 unsigned long hrtick_flags;
491 ktime_t hrtick_expire;
492 struct hrtimer hrtick_timer;
493#endif
494
340#ifdef CONFIG_SCHEDSTATS 495#ifdef CONFIG_SCHEDSTATS
341 /* latency stats */ 496 /* latency stats */
342 struct sched_info rq_sched_info; 497 struct sched_info rq_sched_info;
@@ -363,7 +518,6 @@ struct rq {
363}; 518};
364 519
365static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 520static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
366static DEFINE_MUTEX(sched_hotcpu_mutex);
367 521
368static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 522static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
369{ 523{
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq)
441#define task_rq(p) cpu_rq(task_cpu(p)) 595#define task_rq(p) cpu_rq(task_cpu(p))
442#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 596#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
443 597
598unsigned long rt_needs_cpu(int cpu)
599{
600 struct rq *rq = cpu_rq(cpu);
601 u64 delta;
602
603 if (!rq->rt_throttled)
604 return 0;
605
606 if (rq->clock > rq->rt_period_expire)
607 return 1;
608
609 delta = rq->rt_period_expire - rq->clock;
610 do_div(delta, NSEC_PER_SEC / HZ);
611
612 return (unsigned long)delta;
613}
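The conversion above relies on do_div(), which divides a 64-bit value in place
and returns the remainder. A short kernel-style sketch with invented values,
shown only to illustrate the call's semantics:

	#include <linux/types.h>
	#include <linux/time.h>		/* NSEC_PER_SEC */
	#include <linux/jiffies.h>	/* HZ */
	#include <asm/div64.h>

	static unsigned long ns_to_ticks_example(void)
	{
		u64 delta = 3 * (u64)(NSEC_PER_SEC / HZ) + 123;	/* ~3 ticks of ns */
		u32 rem;

		rem = do_div(delta, NSEC_PER_SEC / HZ);	/* delta becomes 3, rem 123 */
		(void)rem;
		return (unsigned long)delta;
	}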
614
444/* 615/*
445 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 616 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
446 */ 617 */
@@ -459,6 +630,8 @@ enum {
459 SCHED_FEAT_START_DEBIT = 4, 630 SCHED_FEAT_START_DEBIT = 4,
460 SCHED_FEAT_TREE_AVG = 8, 631 SCHED_FEAT_TREE_AVG = 8,
461 SCHED_FEAT_APPROX_AVG = 16, 632 SCHED_FEAT_APPROX_AVG = 16,
633 SCHED_FEAT_HRTICK = 32,
634 SCHED_FEAT_DOUBLE_TICK = 64,
462}; 635};
463 636
464const_debug unsigned int sysctl_sched_features = 637const_debug unsigned int sysctl_sched_features =
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features =
466 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 639 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
467 SCHED_FEAT_START_DEBIT * 1 | 640 SCHED_FEAT_START_DEBIT * 1 |
468 SCHED_FEAT_TREE_AVG * 0 | 641 SCHED_FEAT_TREE_AVG * 0 |
469 SCHED_FEAT_APPROX_AVG * 0; 642 SCHED_FEAT_APPROX_AVG * 0 |
643 SCHED_FEAT_HRTICK * 1 |
644 SCHED_FEAT_DOUBLE_TICK * 0;
470 645
471#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 646#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
472 647
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features =
477const_debug unsigned int sysctl_sched_nr_migrate = 32; 652const_debug unsigned int sysctl_sched_nr_migrate = 32;
478 653
479/* 654/*
655 * period over which we measure -rt task cpu usage in ms.
656 * default: 1s
657 */
658const_debug unsigned int sysctl_sched_rt_period = 1000;
659
660#define SCHED_RT_FRAC_SHIFT 16
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
662
663/*
664 * ratio of time -rt tasks may consume.
665 * default: 95%
666 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259;
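The ratio is a fixed-point fraction of SCHED_RT_FRAC (65536). A quick worked
example of deriving such values; the 80% figure is only an illustration, not a
recommended setting:

	#include <stdio.h>

	#define SCHED_RT_FRAC_SHIFT	16
	#define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)

	int main(void)
	{
		printf("95%% -> %lu\n", SCHED_RT_FRAC * 95 / 100);	/* 62259, the default above */
		printf("80%% -> %lu\n", SCHED_RT_FRAC * 80 / 100);	/* 52428 */
		return 0;
	}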
668
669/*
480 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
481 * clock constructed from sched_clock(): 671 * clock constructed from sched_clock():
482 */ 672 */
@@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
668 struct rq *rq = cpu_rq(smp_processor_id()); 858 struct rq *rq = cpu_rq(smp_processor_id());
669 u64 now = sched_clock(); 859 u64 now = sched_clock();
670 860
671 touch_softlockup_watchdog();
672 rq->idle_clock += delta_ns; 861 rq->idle_clock += delta_ns;
673 /* 862 /*
674 * Override the previous timestamp and ignore all 863 * Override the previous timestamp and ignore all
@@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
680 rq->prev_clock_raw = now; 869 rq->prev_clock_raw = now;
681 rq->clock += delta_ns; 870 rq->clock += delta_ns;
682 spin_unlock(&rq->lock); 871 spin_unlock(&rq->lock);
872 touch_softlockup_watchdog();
683} 873}
684EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 874EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
685 875
876static void __resched_task(struct task_struct *p, int tif_bit);
877
878static inline void resched_task(struct task_struct *p)
879{
880 __resched_task(p, TIF_NEED_RESCHED);
881}
882
883#ifdef CONFIG_SCHED_HRTICK
884/*
885 * Use HR-timers to deliver accurate preemption points.
886 *
887 * It's all a bit involved since we cannot program an hrtimer while holding
888 * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
889 * reschedule event.
890 *
891 * When we get rescheduled we reprogram the hrtick_timer outside of the
892 * rq->lock.
893 */
894static inline void resched_hrt(struct task_struct *p)
895{
896 __resched_task(p, TIF_HRTICK_RESCHED);
897}
898
899static inline void resched_rq(struct rq *rq)
900{
901 unsigned long flags;
902
903 spin_lock_irqsave(&rq->lock, flags);
904 resched_task(rq->curr);
905 spin_unlock_irqrestore(&rq->lock, flags);
906}
907
908enum {
909	HRTICK_SET,		/* re-program hrtick_timer */
910 HRTICK_RESET, /* not a new slice */
911};
912
913/*
914 * Use hrtick when:
915 * - enabled by features
916 * - hrtimer is actually high res
917 */
918static inline int hrtick_enabled(struct rq *rq)
919{
920 if (!sched_feat(HRTICK))
921 return 0;
922 return hrtimer_is_hres_active(&rq->hrtick_timer);
923}
924
925/*
926 * Called to set the hrtick timer state.
927 *
928 * called with rq->lock held and irqs disabled
929 */
930static void hrtick_start(struct rq *rq, u64 delay, int reset)
931{
932 assert_spin_locked(&rq->lock);
933
934 /*
935 * preempt at: now + delay
936 */
937 rq->hrtick_expire =
938 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
939 /*
940 * indicate we need to program the timer
941 */
942 __set_bit(HRTICK_SET, &rq->hrtick_flags);
943 if (reset)
944 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
945
946 /*
947 * New slices are called from the schedule path and don't need a
948 * forced reschedule.
949 */
950 if (reset)
951 resched_hrt(rq->curr);
952}
953
954static void hrtick_clear(struct rq *rq)
955{
956 if (hrtimer_active(&rq->hrtick_timer))
957 hrtimer_cancel(&rq->hrtick_timer);
958}
959
960/*
961 * Update the timer from the possible pending state.
962 */
963static void hrtick_set(struct rq *rq)
964{
965 ktime_t time;
966 int set, reset;
967 unsigned long flags;
968
969 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
970
971 spin_lock_irqsave(&rq->lock, flags);
972 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
973 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
974 time = rq->hrtick_expire;
975 clear_thread_flag(TIF_HRTICK_RESCHED);
976 spin_unlock_irqrestore(&rq->lock, flags);
977
978 if (set) {
979 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
980 if (reset && !hrtimer_active(&rq->hrtick_timer))
981 resched_rq(rq);
982 } else
983 hrtick_clear(rq);
984}
985
986/*
987 * High-resolution timer tick.
988 * Runs from hardirq context with interrupts disabled.
989 */
990static enum hrtimer_restart hrtick(struct hrtimer *timer)
991{
992 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
993
994 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
995
996 spin_lock(&rq->lock);
997 __update_rq_clock(rq);
998 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
999 spin_unlock(&rq->lock);
1000
1001 return HRTIMER_NORESTART;
1002}
1003
1004static inline void init_rq_hrtick(struct rq *rq)
1005{
1006 rq->hrtick_flags = 0;
1007 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1008 rq->hrtick_timer.function = hrtick;
1009 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1010}
1011
1012void hrtick_resched(void)
1013{
1014 struct rq *rq;
1015 unsigned long flags;
1016
1017 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1018 return;
1019
1020 local_irq_save(flags);
1021 rq = cpu_rq(smp_processor_id());
1022 hrtick_set(rq);
1023 local_irq_restore(flags);
1024}
1025#else
1026static inline void hrtick_clear(struct rq *rq)
1027{
1028}
1029
1030static inline void hrtick_set(struct rq *rq)
1031{
1032}
1033
1034static inline void init_rq_hrtick(struct rq *rq)
1035{
1036}
1037
1038void hrtick_resched(void)
1039{
1040}
1041#endif
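For context, the hrtimer pattern used by init_rq_hrtick() and hrtick_set()
looks roughly like this in isolation; a hypothetical driver-style sketch,
assuming delays below one second so that ktime_set(0, ...) is valid:

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static struct hrtimer example_timer;

	static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
	{
		/* runs in hardirq context, like hrtick() above */
		return HRTIMER_NORESTART;
	}

	static void example_timer_arm(unsigned long delay_ns)
	{
		hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		example_timer.function = example_timer_fn;
		hrtimer_start(&example_timer, ktime_set(0, delay_ns), HRTIMER_MODE_REL);
	}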
1042
686/* 1043/*
687 * resched_task - mark a task 'to be rescheduled now'. 1044 * resched_task - mark a task 'to be rescheduled now'.
688 * 1045 *
@@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
696#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1053#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
697#endif 1054#endif
698 1055
699static void resched_task(struct task_struct *p) 1056static void __resched_task(struct task_struct *p, int tif_bit)
700{ 1057{
701 int cpu; 1058 int cpu;
702 1059
703 assert_spin_locked(&task_rq(p)->lock); 1060 assert_spin_locked(&task_rq(p)->lock);
704 1061
705 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1062 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
706 return; 1063 return;
707 1064
708 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1065 set_tsk_thread_flag(p, tif_bit);
709 1066
710 cpu = task_cpu(p); 1067 cpu = task_cpu(p);
711 if (cpu == smp_processor_id()) 1068 if (cpu == smp_processor_id())
@@ -728,10 +1085,10 @@ static void resched_cpu(int cpu)
728 spin_unlock_irqrestore(&rq->lock, flags); 1085 spin_unlock_irqrestore(&rq->lock, flags);
729} 1086}
730#else 1087#else
731static inline void resched_task(struct task_struct *p) 1088static void __resched_task(struct task_struct *p, int tif_bit)
732{ 1089{
733 assert_spin_locked(&task_rq(p)->lock); 1090 assert_spin_locked(&task_rq(p)->lock);
734 set_tsk_need_resched(p); 1091 set_tsk_thread_flag(p, tif_bit);
735} 1092}
736#endif 1093#endif
737 1094
@@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
871static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1228static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
872#endif 1229#endif
873 1230
1231static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1232{
1233 update_load_add(&rq->load, load);
1234}
1235
1236static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1237{
1238 update_load_sub(&rq->load, load);
1239}
1240
1241#ifdef CONFIG_SMP
1242static unsigned long source_load(int cpu, int type);
1243static unsigned long target_load(int cpu, int type);
1244static unsigned long cpu_avg_load_per_task(int cpu);
1245static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1246#endif /* CONFIG_SMP */
1247
874#include "sched_stats.h" 1248#include "sched_stats.h"
875#include "sched_idletask.c" 1249#include "sched_idletask.c"
876#include "sched_fair.c" 1250#include "sched_fair.c"
@@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
881 1255
882#define sched_class_highest (&rt_sched_class) 1256#define sched_class_highest (&rt_sched_class)
883 1257
884/*
885 * Update delta_exec, delta_fair fields for rq.
886 *
887 * delta_fair clock advances at a rate inversely proportional to
888 * total load (rq->load.weight) on the runqueue, while
889 * delta_exec advances at the same rate as wall-clock (provided
890 * cpu is not idle).
891 *
892 * delta_exec / delta_fair is a measure of the (smoothened) load on this
893 * runqueue over any given interval. This (smoothened) load is used
894 * during load balance.
895 *
896 * This function is called /before/ updating rq->load
897 * and when switching tasks.
898 */
899static inline void inc_load(struct rq *rq, const struct task_struct *p)
900{
901 update_load_add(&rq->load, p->se.load.weight);
902}
903
904static inline void dec_load(struct rq *rq, const struct task_struct *p)
905{
906 update_load_sub(&rq->load, p->se.load.weight);
907}
908
909static void inc_nr_running(struct task_struct *p, struct rq *rq) 1258static void inc_nr_running(struct task_struct *p, struct rq *rq)
910{ 1259{
911 rq->nr_running++; 1260 rq->nr_running++;
912 inc_load(rq, p);
913} 1261}
914 1262
915static void dec_nr_running(struct task_struct *p, struct rq *rq) 1263static void dec_nr_running(struct task_struct *p, struct rq *rq)
916{ 1264{
917 rq->nr_running--; 1265 rq->nr_running--;
918 dec_load(rq, p);
919} 1266}
920 1267
921static void set_load_weight(struct task_struct *p) 1268static void set_load_weight(struct task_struct *p)
@@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu)
1039 1386
1040static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1387static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1041{ 1388{
1042 set_task_cfs_rq(p, cpu); 1389 set_task_rq(p, cpu);
1043#ifdef CONFIG_SMP 1390#ifdef CONFIG_SMP
1044 /* 1391 /*
1045 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1392 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1051#endif 1398#endif
1052} 1399}
1053 1400
1401static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1402 const struct sched_class *prev_class,
1403 int oldprio, int running)
1404{
1405 if (prev_class != p->sched_class) {
1406 if (prev_class->switched_from)
1407 prev_class->switched_from(rq, p, running);
1408 p->sched_class->switched_to(rq, p, running);
1409 } else
1410 p->sched_class->prio_changed(rq, p, oldprio, running);
1411}
1412
1054#ifdef CONFIG_SMP 1413#ifdef CONFIG_SMP
1055 1414
1056/* 1415/*
1057 * Is this task likely cache-hot: 1416 * Is this task likely cache-hot:
1058 */ 1417 */
1059static inline int 1418static int
1060task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1419task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1061{ 1420{
1062 s64 delta; 1421 s64 delta;
@@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type)
1281/* 1640/*
1282 * Return the average load per task on the cpu's run queue 1641 * Return the average load per task on the cpu's run queue
1283 */ 1642 */
1284static inline unsigned long cpu_avg_load_per_task(int cpu) 1643static unsigned long cpu_avg_load_per_task(int cpu)
1285{ 1644{
1286 struct rq *rq = cpu_rq(cpu); 1645 struct rq *rq = cpu_rq(cpu);
1287 unsigned long total = weighted_cpuload(cpu); 1646 unsigned long total = weighted_cpuload(cpu);
@@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag)
1438 1797
1439#endif /* CONFIG_SMP */ 1798#endif /* CONFIG_SMP */
1440 1799
1441/*
1442 * wake_idle() will wake a task on an idle cpu if task->cpu is
1443 * not idle and an idle cpu is available. The span of cpus to
1444 * search starts with cpus closest then further out as needed,
1445 * so we always favor a closer, idle cpu.
1446 *
1447 * Returns the CPU we should wake onto.
1448 */
1449#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1450static int wake_idle(int cpu, struct task_struct *p)
1451{
1452 cpumask_t tmp;
1453 struct sched_domain *sd;
1454 int i;
1455
1456 /*
1457 * If it is idle, then it is the best cpu to run this task.
1458 *
1459 * This cpu is also the best, if it has more than one task already.
1460 * Siblings must be also busy(in most cases) as they didn't already
1461 * pickup the extra load from this cpu and hence we need not check
1462 * sibling runqueue info. This will avoid the checks and cache miss
1463 * penalities associated with that.
1464 */
1465 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1466 return cpu;
1467
1468 for_each_domain(cpu, sd) {
1469 if (sd->flags & SD_WAKE_IDLE) {
1470 cpus_and(tmp, sd->span, p->cpus_allowed);
1471 for_each_cpu_mask(i, tmp) {
1472 if (idle_cpu(i)) {
1473 if (i != task_cpu(p)) {
1474 schedstat_inc(p,
1475 se.nr_wakeups_idle);
1476 }
1477 return i;
1478 }
1479 }
1480 } else {
1481 break;
1482 }
1483 }
1484 return cpu;
1485}
1486#else
1487static inline int wake_idle(int cpu, struct task_struct *p)
1488{
1489 return cpu;
1490}
1491#endif
1492
1493/*** 1800/***
1494 * try_to_wake_up - wake up a thread 1801 * try_to_wake_up - wake up a thread
1495 * @p: the to-be-woken-up thread 1802 * @p: the to-be-woken-up thread
@@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1510 unsigned long flags; 1817 unsigned long flags;
1511 long old_state; 1818 long old_state;
1512 struct rq *rq; 1819 struct rq *rq;
1513#ifdef CONFIG_SMP
1514 struct sched_domain *sd, *this_sd = NULL;
1515 unsigned long load, this_load;
1516 int new_cpu;
1517#endif
1518 1820
1519 rq = task_rq_lock(p, &flags); 1821 rq = task_rq_lock(p, &flags);
1520 old_state = p->state; 1822 old_state = p->state;
@@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1532 if (unlikely(task_running(rq, p))) 1834 if (unlikely(task_running(rq, p)))
1533 goto out_activate; 1835 goto out_activate;
1534 1836
1535 new_cpu = cpu; 1837 cpu = p->sched_class->select_task_rq(p, sync);
1536 1838 if (cpu != orig_cpu) {
1537 schedstat_inc(rq, ttwu_count); 1839 set_task_cpu(p, cpu);
1538 if (cpu == this_cpu) {
1539 schedstat_inc(rq, ttwu_local);
1540 goto out_set_cpu;
1541 }
1542
1543 for_each_domain(this_cpu, sd) {
1544 if (cpu_isset(cpu, sd->span)) {
1545 schedstat_inc(sd, ttwu_wake_remote);
1546 this_sd = sd;
1547 break;
1548 }
1549 }
1550
1551 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1552 goto out_set_cpu;
1553
1554 /*
1555 * Check for affine wakeup and passive balancing possibilities.
1556 */
1557 if (this_sd) {
1558 int idx = this_sd->wake_idx;
1559 unsigned int imbalance;
1560
1561 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1562
1563 load = source_load(cpu, idx);
1564 this_load = target_load(this_cpu, idx);
1565
1566 new_cpu = this_cpu; /* Wake to this CPU if we can */
1567
1568 if (this_sd->flags & SD_WAKE_AFFINE) {
1569 unsigned long tl = this_load;
1570 unsigned long tl_per_task;
1571
1572 /*
1573 * Attract cache-cold tasks on sync wakeups:
1574 */
1575 if (sync && !task_hot(p, rq->clock, this_sd))
1576 goto out_set_cpu;
1577
1578 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1579 tl_per_task = cpu_avg_load_per_task(this_cpu);
1580
1581 /*
1582 * If sync wakeup then subtract the (maximum possible)
1583 * effect of the currently running task from the load
1584 * of the current CPU:
1585 */
1586 if (sync)
1587 tl -= current->se.load.weight;
1588
1589 if ((tl <= load &&
1590 tl + target_load(cpu, idx) <= tl_per_task) ||
1591 100*(tl + p->se.load.weight) <= imbalance*load) {
1592 /*
1593 * This domain has SD_WAKE_AFFINE and
1594 * p is cache cold in this domain, and
1595 * there is no bad imbalance.
1596 */
1597 schedstat_inc(this_sd, ttwu_move_affine);
1598 schedstat_inc(p, se.nr_wakeups_affine);
1599 goto out_set_cpu;
1600 }
1601 }
1602
1603 /*
1604 * Start passive balancing when half the imbalance_pct
1605 * limit is reached.
1606 */
1607 if (this_sd->flags & SD_WAKE_BALANCE) {
1608 if (imbalance*this_load <= 100*load) {
1609 schedstat_inc(this_sd, ttwu_move_balance);
1610 schedstat_inc(p, se.nr_wakeups_passive);
1611 goto out_set_cpu;
1612 }
1613 }
1614 }
1615
1616 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1617out_set_cpu:
1618 new_cpu = wake_idle(new_cpu, p);
1619 if (new_cpu != cpu) {
1620 set_task_cpu(p, new_cpu);
1621 task_rq_unlock(rq, &flags); 1840 task_rq_unlock(rq, &flags);
1622 /* might preempt at this point */ 1841 /* might preempt at this point */
1623 rq = task_rq_lock(p, &flags); 1842 rq = task_rq_lock(p, &flags);
@@ -1631,6 +1850,21 @@ out_set_cpu:
1631 cpu = task_cpu(p); 1850 cpu = task_cpu(p);
1632 } 1851 }
1633 1852
1853#ifdef CONFIG_SCHEDSTATS
1854 schedstat_inc(rq, ttwu_count);
1855 if (cpu == this_cpu)
1856 schedstat_inc(rq, ttwu_local);
1857 else {
1858 struct sched_domain *sd;
1859 for_each_domain(this_cpu, sd) {
1860 if (cpu_isset(cpu, sd->span)) {
1861 schedstat_inc(sd, ttwu_wake_remote);
1862 break;
1863 }
1864 }
1865 }
1866#endif
1867
1634out_activate: 1868out_activate:
1635#endif /* CONFIG_SMP */ 1869#endif /* CONFIG_SMP */
1636 schedstat_inc(p, se.nr_wakeups); 1870 schedstat_inc(p, se.nr_wakeups);
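For reference, the affine-wakeup heuristic removed above (task placement is now delegated to the scheduling class through select_task_rq) boils down to plain integer arithmetic on load figures. A minimal userspace sketch of that test follows; it folds source_load()/target_load() of the previous CPU into a single prev_load value, treats the waker's weight as equal to the wakee's for brevity, and uses invented numbers, so every name and figure here is illustrative rather than quoted from the patch:

#include <stdio.h>

/* Toy model of the dropped SD_WAKE_AFFINE test; not kernel code. */
static int wake_affine_ok(unsigned long this_load, unsigned long prev_load,
                          unsigned long task_weight, unsigned long tl_per_task,
                          unsigned int imbalance_pct, int sync)
{
        unsigned long tl = this_load;
        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

        /* on sync wakeups, discount the waker (assumed == task_weight here) */
        if (sync)
                tl -= task_weight;

        return (tl <= prev_load && tl + prev_load <= tl_per_task) ||
               100 * (tl + task_weight) <= imbalance * prev_load;
}

int main(void)
{
        /* waker CPU load 2048, wakee's previous CPU load 3072, task weight 1024 */
        printf("pull to waker's CPU? %d\n",
               wake_affine_ok(2048, 3072, 1024, 2048, 125, 1));
        return 0;
}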
@@ -1649,6 +1883,10 @@ out_activate:
1649 1883
1650out_running: 1884out_running:
1651 p->state = TASK_RUNNING; 1885 p->state = TASK_RUNNING;
1886#ifdef CONFIG_SMP
1887 if (p->sched_class->task_wake_up)
1888 p->sched_class->task_wake_up(rq, p);
1889#endif
1652out: 1890out:
1653 task_rq_unlock(rq, &flags); 1891 task_rq_unlock(rq, &flags);
1654 1892
@@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p)
1691 p->se.wait_max = 0; 1929 p->se.wait_max = 0;
1692#endif 1930#endif
1693 1931
1694 INIT_LIST_HEAD(&p->run_list); 1932 INIT_LIST_HEAD(&p->rt.run_list);
1695 p->se.on_rq = 0; 1933 p->se.on_rq = 0;
1696 1934
1697#ifdef CONFIG_PREEMPT_NOTIFIERS 1935#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1771 inc_nr_running(p, rq); 2009 inc_nr_running(p, rq);
1772 } 2010 }
1773 check_preempt_curr(rq, p); 2011 check_preempt_curr(rq, p);
2012#ifdef CONFIG_SMP
2013 if (p->sched_class->task_wake_up)
2014 p->sched_class->task_wake_up(rq, p);
2015#endif
1774 task_rq_unlock(rq, &flags); 2016 task_rq_unlock(rq, &flags);
1775} 2017}
1776 2018
@@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1891 prev_state = prev->state; 2133 prev_state = prev->state;
1892 finish_arch_switch(prev); 2134 finish_arch_switch(prev);
1893 finish_lock_switch(rq, prev); 2135 finish_lock_switch(rq, prev);
2136#ifdef CONFIG_SMP
2137 if (current->sched_class->post_schedule)
2138 current->sched_class->post_schedule(rq);
2139#endif
2140
1894 fire_sched_in_preempt_notifiers(current); 2141 fire_sched_in_preempt_notifiers(current);
1895 if (mm) 2142 if (mm)
1896 mmdrop(mm); 2143 mmdrop(mm);
@@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2124/* 2371/*
2125 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2372 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2126 */ 2373 */
2127static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2374static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2128 __releases(this_rq->lock) 2375 __releases(this_rq->lock)
2129 __acquires(busiest->lock) 2376 __acquires(busiest->lock)
2130 __acquires(this_rq->lock) 2377 __acquires(this_rq->lock)
2131{ 2378{
2379 int ret = 0;
2380
2132 if (unlikely(!irqs_disabled())) { 2381 if (unlikely(!irqs_disabled())) {
2133 /* printk() doesn't work good under rq->lock */ 2382 /* printk() doesn't work good under rq->lock */
2134 spin_unlock(&this_rq->lock); 2383 spin_unlock(&this_rq->lock);
@@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2139 spin_unlock(&this_rq->lock); 2388 spin_unlock(&this_rq->lock);
2140 spin_lock(&busiest->lock); 2389 spin_lock(&busiest->lock);
2141 spin_lock(&this_rq->lock); 2390 spin_lock(&this_rq->lock);
2391 ret = 1;
2142 } else 2392 } else
2143 spin_lock(&busiest->lock); 2393 spin_lock(&busiest->lock);
2144 } 2394 }
2395 return ret;
2145} 2396}
2146 2397
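double_lock_balance() now reports whether this_rq->lock had to be dropped, so callers can revalidate any state they computed before the call. A self-contained pthread model of the same lock-ordering idea is sketched below; the by-address comparison used to decide which lock to take first sits in the elided part of the hunk, so treat it as an assumption rather than a quote:

#include <pthread.h>

struct rq { pthread_mutex_t lock; };

/* Returns 1 if this_rq's lock was dropped and re-acquired, 0 otherwise. */
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
        int ret = 0;

        if (pthread_mutex_trylock(&busiest->lock) != 0) {
                if (busiest < this_rq) {
                        /* keep a global order to avoid ABBA deadlock */
                        pthread_mutex_unlock(&this_rq->lock);
                        pthread_mutex_lock(&busiest->lock);
                        pthread_mutex_lock(&this_rq->lock);
                        ret = 1;        /* caller must revalidate its state */
                } else {
                        pthread_mutex_lock(&busiest->lock);
                }
        }
        return ret;
}

int main(void)
{
        struct rq a, b;

        pthread_mutex_init(&a.lock, NULL);
        pthread_mutex_init(&b.lock, NULL);
        pthread_mutex_lock(&a.lock);
        double_lock_balance(&a, &b);
        return 0;
}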
2147/* 2398/*
@@ -3485,12 +3736,14 @@ void scheduler_tick(void)
3485 /* 3736 /*
3486 * Let rq->clock advance by at least TICK_NSEC: 3737 * Let rq->clock advance by at least TICK_NSEC:
3487 */ 3738 */
3488 if (unlikely(rq->clock < next_tick)) 3739 if (unlikely(rq->clock < next_tick)) {
3489 rq->clock = next_tick; 3740 rq->clock = next_tick;
3741 rq->clock_underflows++;
3742 }
3490 rq->tick_timestamp = rq->clock; 3743 rq->tick_timestamp = rq->clock;
3491 update_cpu_load(rq); 3744 update_cpu_load(rq);
3492 if (curr != rq->idle) /* FIXME: needed? */ 3745 curr->sched_class->task_tick(rq, curr, 0);
3493 curr->sched_class->task_tick(rq, curr); 3746 update_sched_rt_period(rq);
3494 spin_unlock(&rq->lock); 3747 spin_unlock(&rq->lock);
3495 3748
3496#ifdef CONFIG_SMP 3749#ifdef CONFIG_SMP
@@ -3636,6 +3889,8 @@ need_resched_nonpreemptible:
3636 3889
3637 schedule_debug(prev); 3890 schedule_debug(prev);
3638 3891
3892 hrtick_clear(rq);
3893
3639 /* 3894 /*
3640 * Do the rq-clock update outside the rq lock: 3895 * Do the rq-clock update outside the rq lock:
3641 */ 3896 */
@@ -3654,6 +3909,11 @@ need_resched_nonpreemptible:
3654 switch_count = &prev->nvcsw; 3909 switch_count = &prev->nvcsw;
3655 } 3910 }
3656 3911
3912#ifdef CONFIG_SMP
3913 if (prev->sched_class->pre_schedule)
3914 prev->sched_class->pre_schedule(rq, prev);
3915#endif
3916
3657 if (unlikely(!rq->nr_running)) 3917 if (unlikely(!rq->nr_running))
3658 idle_balance(cpu, rq); 3918 idle_balance(cpu, rq);
3659 3919
@@ -3668,14 +3928,20 @@ need_resched_nonpreemptible:
3668 ++*switch_count; 3928 ++*switch_count;
3669 3929
3670 context_switch(rq, prev, next); /* unlocks the rq */ 3930 context_switch(rq, prev, next); /* unlocks the rq */
3931 /*
3932 * the context switch might have flipped the stack from under
3933 * us, hence refresh the local variables.
3934 */
3935 cpu = smp_processor_id();
3936 rq = cpu_rq(cpu);
3671 } else 3937 } else
3672 spin_unlock_irq(&rq->lock); 3938 spin_unlock_irq(&rq->lock);
3673 3939
3674 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3940 hrtick_set(rq);
3675 cpu = smp_processor_id(); 3941
3676 rq = cpu_rq(cpu); 3942 if (unlikely(reacquire_kernel_lock(current) < 0))
3677 goto need_resched_nonpreemptible; 3943 goto need_resched_nonpreemptible;
3678 } 3944
3679 preempt_enable_no_resched(); 3945 preempt_enable_no_resched();
3680 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3946 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3681 goto need_resched; 3947 goto need_resched;
@@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule);
3691asmlinkage void __sched preempt_schedule(void) 3957asmlinkage void __sched preempt_schedule(void)
3692{ 3958{
3693 struct thread_info *ti = current_thread_info(); 3959 struct thread_info *ti = current_thread_info();
3694#ifdef CONFIG_PREEMPT_BKL
3695 struct task_struct *task = current; 3960 struct task_struct *task = current;
3696 int saved_lock_depth; 3961 int saved_lock_depth;
3697#endif 3962
3698 /* 3963 /*
3699 * If there is a non-zero preempt_count or interrupts are disabled, 3964 * If there is a non-zero preempt_count or interrupts are disabled,
3700 * we do not want to preempt the current task. Just return.. 3965 * we do not want to preempt the current task. Just return..
@@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void)
3710 * clear ->lock_depth so that schedule() doesnt 3975 * clear ->lock_depth so that schedule() doesnt
3711 * auto-release the semaphore: 3976 * auto-release the semaphore:
3712 */ 3977 */
3713#ifdef CONFIG_PREEMPT_BKL
3714 saved_lock_depth = task->lock_depth; 3978 saved_lock_depth = task->lock_depth;
3715 task->lock_depth = -1; 3979 task->lock_depth = -1;
3716#endif
3717 schedule(); 3980 schedule();
3718#ifdef CONFIG_PREEMPT_BKL
3719 task->lock_depth = saved_lock_depth; 3981 task->lock_depth = saved_lock_depth;
3720#endif
3721 sub_preempt_count(PREEMPT_ACTIVE); 3982 sub_preempt_count(PREEMPT_ACTIVE);
3722 3983
3723 /* 3984 /*
@@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule);
3738asmlinkage void __sched preempt_schedule_irq(void) 3999asmlinkage void __sched preempt_schedule_irq(void)
3739{ 4000{
3740 struct thread_info *ti = current_thread_info(); 4001 struct thread_info *ti = current_thread_info();
3741#ifdef CONFIG_PREEMPT_BKL
3742 struct task_struct *task = current; 4002 struct task_struct *task = current;
3743 int saved_lock_depth; 4003 int saved_lock_depth;
3744#endif 4004
3745 /* Catch callers which need to be fixed */ 4005 /* Catch callers which need to be fixed */
3746 BUG_ON(ti->preempt_count || !irqs_disabled()); 4006 BUG_ON(ti->preempt_count || !irqs_disabled());
3747 4007
@@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void)
3753 * clear ->lock_depth so that schedule() doesnt 4013 * clear ->lock_depth so that schedule() doesnt
3754 * auto-release the semaphore: 4014 * auto-release the semaphore:
3755 */ 4015 */
3756#ifdef CONFIG_PREEMPT_BKL
3757 saved_lock_depth = task->lock_depth; 4016 saved_lock_depth = task->lock_depth;
3758 task->lock_depth = -1; 4017 task->lock_depth = -1;
3759#endif
3760 local_irq_enable(); 4018 local_irq_enable();
3761 schedule(); 4019 schedule();
3762 local_irq_disable(); 4020 local_irq_disable();
3763#ifdef CONFIG_PREEMPT_BKL
3764 task->lock_depth = saved_lock_depth; 4021 task->lock_depth = saved_lock_depth;
3765#endif
3766 sub_preempt_count(PREEMPT_ACTIVE); 4022 sub_preempt_count(PREEMPT_ACTIVE);
3767 4023
3768 /* 4024 /*
@@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4019 unsigned long flags; 4275 unsigned long flags;
4020 int oldprio, on_rq, running; 4276 int oldprio, on_rq, running;
4021 struct rq *rq; 4277 struct rq *rq;
4278 const struct sched_class *prev_class = p->sched_class;
4022 4279
4023 BUG_ON(prio < 0 || prio > MAX_PRIO); 4280 BUG_ON(prio < 0 || prio > MAX_PRIO);
4024 4281
@@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4044 if (on_rq) { 4301 if (on_rq) {
4045 if (running) 4302 if (running)
4046 p->sched_class->set_curr_task(rq); 4303 p->sched_class->set_curr_task(rq);
4304
4047 enqueue_task(rq, p, 0); 4305 enqueue_task(rq, p, 0);
4048 /* 4306
4049 * Reschedule if we are currently running on this runqueue and 4307 check_class_changed(rq, p, prev_class, oldprio, running);
4050 * our priority decreased, or if we are not currently running on
4051 * this runqueue and our priority is higher than the current's
4052 */
4053 if (running) {
4054 if (p->prio > oldprio)
4055 resched_task(rq->curr);
4056 } else {
4057 check_preempt_curr(rq, p);
4058 }
4059 } 4308 }
4060 task_rq_unlock(rq, &flags); 4309 task_rq_unlock(rq, &flags);
4061} 4310}
@@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice)
4087 goto out_unlock; 4336 goto out_unlock;
4088 } 4337 }
4089 on_rq = p->se.on_rq; 4338 on_rq = p->se.on_rq;
4090 if (on_rq) { 4339 if (on_rq)
4091 dequeue_task(rq, p, 0); 4340 dequeue_task(rq, p, 0);
4092 dec_load(rq, p);
4093 }
4094 4341
4095 p->static_prio = NICE_TO_PRIO(nice); 4342 p->static_prio = NICE_TO_PRIO(nice);
4096 set_load_weight(p); 4343 set_load_weight(p);
@@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice)
4100 4347
4101 if (on_rq) { 4348 if (on_rq) {
4102 enqueue_task(rq, p, 0); 4349 enqueue_task(rq, p, 0);
4103 inc_load(rq, p);
4104 /* 4350 /*
4105 * If the task increased its priority or is running and 4351 * If the task increased its priority or is running and
4106 * lowered its priority, then reschedule its CPU: 4352 * lowered its priority, then reschedule its CPU:
@@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy,
4258{ 4504{
4259 int retval, oldprio, oldpolicy = -1, on_rq, running; 4505 int retval, oldprio, oldpolicy = -1, on_rq, running;
4260 unsigned long flags; 4506 unsigned long flags;
4507 const struct sched_class *prev_class = p->sched_class;
4261 struct rq *rq; 4508 struct rq *rq;
4262 4509
4263 /* may grab non-irq protected spin_locks */ 4510 /* may grab non-irq protected spin_locks */
@@ -4351,18 +4598,10 @@ recheck:
4351 if (on_rq) { 4598 if (on_rq) {
4352 if (running) 4599 if (running)
4353 p->sched_class->set_curr_task(rq); 4600 p->sched_class->set_curr_task(rq);
4601
4354 activate_task(rq, p, 0); 4602 activate_task(rq, p, 0);
4355 /* 4603
4356 * Reschedule if we are currently running on this runqueue and 4604 check_class_changed(rq, p, prev_class, oldprio, running);
4357 * our priority decreased, or if we are not currently running on
4358 * this runqueue and our priority is higher than the current's
4359 */
4360 if (running) {
4361 if (p->prio > oldprio)
4362 resched_task(rq->curr);
4363 } else {
4364 check_preempt_curr(rq, p);
4365 }
4366 } 4605 }
4367 __task_rq_unlock(rq); 4606 __task_rq_unlock(rq);
4368 spin_unlock_irqrestore(&p->pi_lock, flags); 4607 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4490 struct task_struct *p; 4729 struct task_struct *p;
4491 int retval; 4730 int retval;
4492 4731
4493 mutex_lock(&sched_hotcpu_mutex); 4732 get_online_cpus();
4494 read_lock(&tasklist_lock); 4733 read_lock(&tasklist_lock);
4495 4734
4496 p = find_process_by_pid(pid); 4735 p = find_process_by_pid(pid);
4497 if (!p) { 4736 if (!p) {
4498 read_unlock(&tasklist_lock); 4737 read_unlock(&tasklist_lock);
4499 mutex_unlock(&sched_hotcpu_mutex); 4738 put_online_cpus();
4500 return -ESRCH; 4739 return -ESRCH;
4501 } 4740 }
4502 4741
@@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4536 } 4775 }
4537out_unlock: 4776out_unlock:
4538 put_task_struct(p); 4777 put_task_struct(p);
4539 mutex_unlock(&sched_hotcpu_mutex); 4778 put_online_cpus();
4540 return retval; 4779 return retval;
4541} 4780}
4542 4781
@@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4593 struct task_struct *p; 4832 struct task_struct *p;
4594 int retval; 4833 int retval;
4595 4834
4596 mutex_lock(&sched_hotcpu_mutex); 4835 get_online_cpus();
4597 read_lock(&tasklist_lock); 4836 read_lock(&tasklist_lock);
4598 4837
4599 retval = -ESRCH; 4838 retval = -ESRCH;
@@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4609 4848
4610out_unlock: 4849out_unlock:
4611 read_unlock(&tasklist_lock); 4850 read_unlock(&tasklist_lock);
4612 mutex_unlock(&sched_hotcpu_mutex); 4851 put_online_cpus();
4613 4852
4614 return retval; 4853 return retval;
4615} 4854}
@@ -4683,7 +4922,8 @@ static void __cond_resched(void)
4683 } while (need_resched()); 4922 } while (need_resched());
4684} 4923}
4685 4924
4686int __sched cond_resched(void) 4925#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
4926int __sched _cond_resched(void)
4687{ 4927{
4688 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4928 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4689 system_state == SYSTEM_RUNNING) { 4929 system_state == SYSTEM_RUNNING) {
@@ -4692,7 +4932,8 @@ int __sched cond_resched(void)
4692 } 4932 }
4693 return 0; 4933 return 0;
4694} 4934}
4695EXPORT_SYMBOL(cond_resched); 4935EXPORT_SYMBOL(_cond_resched);
4936#endif
4696 4937
4697/* 4938/*
4698 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4939 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4704,19 +4945,15 @@ EXPORT_SYMBOL(cond_resched);
4704 */ 4945 */
4705int cond_resched_lock(spinlock_t *lock) 4946int cond_resched_lock(spinlock_t *lock)
4706{ 4947{
4948 int resched = need_resched() && system_state == SYSTEM_RUNNING;
4707 int ret = 0; 4949 int ret = 0;
4708 4950
4709 if (need_lockbreak(lock)) { 4951 if (spin_needbreak(lock) || resched) {
4710 spin_unlock(lock); 4952 spin_unlock(lock);
4711 cpu_relax(); 4953 if (resched && need_resched())
4712 ret = 1; 4954 __cond_resched();
4713 spin_lock(lock); 4955 else
4714 } 4956 cpu_relax();
4715 if (need_resched() && system_state == SYSTEM_RUNNING) {
4716 spin_release(&lock->dep_map, 1, _THIS_IP_);
4717 _raw_spin_unlock(lock);
4718 preempt_enable_no_resched();
4719 __cond_resched();
4720 ret = 1; 4957 ret = 1;
4721 spin_lock(lock); 4958 spin_lock(lock);
4722 } 4959 }
@@ -4890,7 +5127,7 @@ out_unlock:
4890 5127
4891static const char stat_nam[] = "RSDTtZX"; 5128static const char stat_nam[] = "RSDTtZX";
4892 5129
4893static void show_task(struct task_struct *p) 5130void sched_show_task(struct task_struct *p)
4894{ 5131{
4895 unsigned long free = 0; 5132 unsigned long free = 0;
4896 unsigned state; 5133 unsigned state;
@@ -4920,8 +5157,7 @@ static void show_task(struct task_struct *p)
4920 printk(KERN_CONT "%5lu %5d %6d\n", free, 5157 printk(KERN_CONT "%5lu %5d %6d\n", free,
4921 task_pid_nr(p), task_pid_nr(p->real_parent)); 5158 task_pid_nr(p), task_pid_nr(p->real_parent));
4922 5159
4923 if (state != TASK_RUNNING) 5160 show_stack(p, NULL);
4924 show_stack(p, NULL);
4925} 5161}
4926 5162
4927void show_state_filter(unsigned long state_filter) 5163void show_state_filter(unsigned long state_filter)
@@ -4943,7 +5179,7 @@ void show_state_filter(unsigned long state_filter)
4943 */ 5179 */
4944 touch_nmi_watchdog(); 5180 touch_nmi_watchdog();
4945 if (!state_filter || (p->state & state_filter)) 5181 if (!state_filter || (p->state & state_filter))
4946 show_task(p); 5182 sched_show_task(p);
4947 } while_each_thread(g, p); 5183 } while_each_thread(g, p);
4948 5184
4949 touch_all_softlockup_watchdogs(); 5185 touch_all_softlockup_watchdogs();
@@ -4992,11 +5228,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4992 spin_unlock_irqrestore(&rq->lock, flags); 5228 spin_unlock_irqrestore(&rq->lock, flags);
4993 5229
4994 /* Set the preempt count _outside_ the spinlocks! */ 5230 /* Set the preempt count _outside_ the spinlocks! */
4995#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4996 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4997#else
4998 task_thread_info(idle)->preempt_count = 0; 5231 task_thread_info(idle)->preempt_count = 0;
4999#endif 5232
5000 /* 5233 /*
5001 * The idle tasks have their own, simple scheduling class: 5234 * The idle tasks have their own, simple scheduling class:
5002 */ 5235 */
@@ -5077,7 +5310,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5077 goto out; 5310 goto out;
5078 } 5311 }
5079 5312
5080 p->cpus_allowed = new_mask; 5313 if (p->sched_class->set_cpus_allowed)
5314 p->sched_class->set_cpus_allowed(p, &new_mask);
5315 else {
5316 p->cpus_allowed = new_mask;
5317 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5318 }
5319
5081 /* Can the task run on the task's current CPU? If so, we're done */ 5320 /* Can the task run on the task's current CPU? If so, we're done */
5082 if (cpu_isset(task_cpu(p), new_mask)) 5321 if (cpu_isset(task_cpu(p), new_mask))
5083 goto out; 5322 goto out;
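The new rt.nr_cpus_allowed field is just the cached population count of the affinity mask, presumably so the RT balancer can test it without walking the cpumask each time. A hypothetical one-word stand-in for cpus_weight():

#include <stdio.h>

/* cpumask_t modelled as one machine word; cpus_weight() is a popcount. */
static unsigned int cpus_weight_model(unsigned long mask)
{
        return (unsigned int)__builtin_popcountl(mask);
}

int main(void)
{
        unsigned long new_mask = 0x0fUL;        /* CPUs 0-3 allowed */

        printf("nr_cpus_allowed = %u\n", cpus_weight_model(new_mask));
        return 0;
}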
@@ -5569,9 +5808,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5569 struct rq *rq; 5808 struct rq *rq;
5570 5809
5571 switch (action) { 5810 switch (action) {
5572 case CPU_LOCK_ACQUIRE:
5573 mutex_lock(&sched_hotcpu_mutex);
5574 break;
5575 5811
5576 case CPU_UP_PREPARE: 5812 case CPU_UP_PREPARE:
5577 case CPU_UP_PREPARE_FROZEN: 5813 case CPU_UP_PREPARE_FROZEN:
@@ -5590,6 +5826,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5590 case CPU_ONLINE_FROZEN: 5826 case CPU_ONLINE_FROZEN:
5591 /* Strictly unnecessary, as first user will wake it. */ 5827 /* Strictly unnecessary, as first user will wake it. */
5592 wake_up_process(cpu_rq(cpu)->migration_thread); 5828 wake_up_process(cpu_rq(cpu)->migration_thread);
5829
5830 /* Update our root-domain */
5831 rq = cpu_rq(cpu);
5832 spin_lock_irqsave(&rq->lock, flags);
5833 if (rq->rd) {
5834 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5835 cpu_set(cpu, rq->rd->online);
5836 }
5837 spin_unlock_irqrestore(&rq->lock, flags);
5593 break; 5838 break;
5594 5839
5595#ifdef CONFIG_HOTPLUG_CPU 5840#ifdef CONFIG_HOTPLUG_CPU
@@ -5640,10 +5885,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5640 } 5885 }
5641 spin_unlock_irq(&rq->lock); 5886 spin_unlock_irq(&rq->lock);
5642 break; 5887 break;
5643#endif 5888
5644 case CPU_LOCK_RELEASE: 5889 case CPU_DOWN_PREPARE:
5645 mutex_unlock(&sched_hotcpu_mutex); 5890 /* Update our root-domain */
5891 rq = cpu_rq(cpu);
5892 spin_lock_irqsave(&rq->lock, flags);
5893 if (rq->rd) {
5894 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5895 cpu_clear(cpu, rq->rd->online);
5896 }
5897 spin_unlock_irqrestore(&rq->lock, flags);
5646 break; 5898 break;
5899#endif
5647 } 5900 }
5648 return NOTIFY_OK; 5901 return NOTIFY_OK;
5649} 5902}
@@ -5831,11 +6084,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5831 return 1; 6084 return 1;
5832} 6085}
5833 6086
6087static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6088{
6089 unsigned long flags;
6090 const struct sched_class *class;
6091
6092 spin_lock_irqsave(&rq->lock, flags);
6093
6094 if (rq->rd) {
6095 struct root_domain *old_rd = rq->rd;
6096
6097 for (class = sched_class_highest; class; class = class->next) {
6098 if (class->leave_domain)
6099 class->leave_domain(rq);
6100 }
6101
6102 cpu_clear(rq->cpu, old_rd->span);
6103 cpu_clear(rq->cpu, old_rd->online);
6104
6105 if (atomic_dec_and_test(&old_rd->refcount))
6106 kfree(old_rd);
6107 }
6108
6109 atomic_inc(&rd->refcount);
6110 rq->rd = rd;
6111
6112 cpu_set(rq->cpu, rd->span);
6113 if (cpu_isset(rq->cpu, cpu_online_map))
6114 cpu_set(rq->cpu, rd->online);
6115
6116 for (class = sched_class_highest; class; class = class->next) {
6117 if (class->join_domain)
6118 class->join_domain(rq);
6119 }
6120
6121 spin_unlock_irqrestore(&rq->lock, flags);
6122}
6123
6124static void init_rootdomain(struct root_domain *rd)
6125{
6126 memset(rd, 0, sizeof(*rd));
6127
6128 cpus_clear(rd->span);
6129 cpus_clear(rd->online);
6130}
6131
6132static void init_defrootdomain(void)
6133{
6134 init_rootdomain(&def_root_domain);
6135 atomic_set(&def_root_domain.refcount, 1);
6136}
6137
6138static struct root_domain *alloc_rootdomain(void)
6139{
6140 struct root_domain *rd;
6141
6142 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6143 if (!rd)
6144 return NULL;
6145
6146 init_rootdomain(rd);
6147
6148 return rd;
6149}
6150
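rq_attach_root() moves a runqueue between reference-counted root domains: the old domain loses the CPU from its span and online masks and is freed once its last reference drops, while the new domain gains a reference and the CPU. A compact userspace model of just the refcounting part is sketched below; the struct layout is inferred from the accessors above (the kernel uses an atomic_t and real cpumasks, and also runs the per-class leave_domain/join_domain hooks, all omitted here):

#include <stdio.h>
#include <stdlib.h>

struct root_domain {
        int refcount;                   /* atomic_t in the kernel */
        unsigned long span, online;     /* cpumasks modelled as one word */
};

struct rq {
        int cpu;
        struct root_domain *rd;
};

static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;

                old_rd->span &= ~(1UL << rq->cpu);
                old_rd->online &= ~(1UL << rq->cpu);
                if (--old_rd->refcount == 0)
                        free(old_rd);
        }
        rd->refcount++;
        rq->rd = rd;
        rd->span |= 1UL << rq->cpu;
        rd->online |= 1UL << rq->cpu;
}

int main(void)
{
        static struct root_domain def_rd = { .refcount = 1 };
        struct root_domain *rd = calloc(1, sizeof(*rd));
        struct rq rq = { .cpu = 0, .rd = &def_rd };

        if (!rd)
                return 1;
        def_rd.refcount++;              /* rq holds a reference */
        rq_attach_root(&rq, rd);
        printf("def_rd.refcount=%d rd->refcount=%d\n",
               def_rd.refcount, rd->refcount);
        return 0;
}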
5834/* 6151/*
5835 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6152 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5836 * hold the hotplug lock. 6153 * hold the hotplug lock.
5837 */ 6154 */
5838static void cpu_attach_domain(struct sched_domain *sd, int cpu) 6155static void
6156cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5839{ 6157{
5840 struct rq *rq = cpu_rq(cpu); 6158 struct rq *rq = cpu_rq(cpu);
5841 struct sched_domain *tmp; 6159 struct sched_domain *tmp;
@@ -5860,6 +6178,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5860 6178
5861 sched_domain_debug(sd, cpu); 6179 sched_domain_debug(sd, cpu);
5862 6180
6181 rq_attach_root(rq, rd);
5863 rcu_assign_pointer(rq->sd, sd); 6182 rcu_assign_pointer(rq->sd, sd);
5864} 6183}
5865 6184
@@ -6228,6 +6547,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6228static int build_sched_domains(const cpumask_t *cpu_map) 6547static int build_sched_domains(const cpumask_t *cpu_map)
6229{ 6548{
6230 int i; 6549 int i;
6550 struct root_domain *rd;
6231#ifdef CONFIG_NUMA 6551#ifdef CONFIG_NUMA
6232 struct sched_group **sched_group_nodes = NULL; 6552 struct sched_group **sched_group_nodes = NULL;
6233 int sd_allnodes = 0; 6553 int sd_allnodes = 0;
@@ -6244,6 +6564,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6244 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6564 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6245#endif 6565#endif
6246 6566
6567 rd = alloc_rootdomain();
6568 if (!rd) {
6569 printk(KERN_WARNING "Cannot alloc root domain\n");
6570 return -ENOMEM;
6571 }
6572
6247 /* 6573 /*
6248 * Set up domains for cpus specified by the cpu_map. 6574 * Set up domains for cpus specified by the cpu_map.
6249 */ 6575 */
@@ -6460,7 +6786,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6460#else 6786#else
6461 sd = &per_cpu(phys_domains, i); 6787 sd = &per_cpu(phys_domains, i);
6462#endif 6788#endif
6463 cpu_attach_domain(sd, i); 6789 cpu_attach_domain(sd, rd, i);
6464 } 6790 }
6465 6791
6466 return 0; 6792 return 0;
@@ -6518,7 +6844,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6518 unregister_sched_domain_sysctl(); 6844 unregister_sched_domain_sysctl();
6519 6845
6520 for_each_cpu_mask(i, *cpu_map) 6846 for_each_cpu_mask(i, *cpu_map)
6521 cpu_attach_domain(NULL, i); 6847 cpu_attach_domain(NULL, &def_root_domain, i);
6522 synchronize_sched(); 6848 synchronize_sched();
6523 arch_destroy_sched_domains(cpu_map); 6849 arch_destroy_sched_domains(cpu_map);
6524} 6850}
@@ -6548,6 +6874,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6548{ 6874{
6549 int i, j; 6875 int i, j;
6550 6876
6877 lock_doms_cur();
6878
6551 /* always unregister in case we don't destroy any domains */ 6879 /* always unregister in case we don't destroy any domains */
6552 unregister_sched_domain_sysctl(); 6880 unregister_sched_domain_sysctl();
6553 6881
@@ -6588,6 +6916,8 @@ match2:
6588 ndoms_cur = ndoms_new; 6916 ndoms_cur = ndoms_new;
6589 6917
6590 register_sched_domain_sysctl(); 6918 register_sched_domain_sysctl();
6919
6920 unlock_doms_cur();
6591} 6921}
6592 6922
6593#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6923#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -6595,10 +6925,10 @@ static int arch_reinit_sched_domains(void)
6595{ 6925{
6596 int err; 6926 int err;
6597 6927
6598 mutex_lock(&sched_hotcpu_mutex); 6928 get_online_cpus();
6599 detach_destroy_domains(&cpu_online_map); 6929 detach_destroy_domains(&cpu_online_map);
6600 err = arch_init_sched_domains(&cpu_online_map); 6930 err = arch_init_sched_domains(&cpu_online_map);
6601 mutex_unlock(&sched_hotcpu_mutex); 6931 put_online_cpus();
6602 6932
6603 return err; 6933 return err;
6604} 6934}
@@ -6709,12 +7039,12 @@ void __init sched_init_smp(void)
6709{ 7039{
6710 cpumask_t non_isolated_cpus; 7040 cpumask_t non_isolated_cpus;
6711 7041
6712 mutex_lock(&sched_hotcpu_mutex); 7042 get_online_cpus();
6713 arch_init_sched_domains(&cpu_online_map); 7043 arch_init_sched_domains(&cpu_online_map);
6714 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7044 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6715 if (cpus_empty(non_isolated_cpus)) 7045 if (cpus_empty(non_isolated_cpus))
6716 cpu_set(smp_processor_id(), non_isolated_cpus); 7046 cpu_set(smp_processor_id(), non_isolated_cpus);
6717 mutex_unlock(&sched_hotcpu_mutex); 7047 put_online_cpus();
6718 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7048 /* XXX: Theoretical race here - CPU may be hotplugged now */
6719 hotcpu_notifier(update_sched_domains, 0); 7049 hotcpu_notifier(update_sched_domains, 0);
6720 7050
@@ -6722,6 +7052,21 @@ void __init sched_init_smp(void)
6722 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7052 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6723 BUG(); 7053 BUG();
6724 sched_init_granularity(); 7054 sched_init_granularity();
7055
7056#ifdef CONFIG_FAIR_GROUP_SCHED
7057 if (nr_cpu_ids == 1)
7058 return;
7059
7060 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7061 "group_balance");
7062 if (!IS_ERR(lb_monitor_task)) {
7063 lb_monitor_task->flags |= PF_NOFREEZE;
7064 wake_up_process(lb_monitor_task);
7065 } else {
7066 printk(KERN_ERR "Could not create load balance monitor thread"
7067 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
7068 }
7069#endif
6725} 7070}
6726#else 7071#else
6727void __init sched_init_smp(void) 7072void __init sched_init_smp(void)
@@ -6746,13 +7091,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6746 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7091 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6747} 7092}
6748 7093
7094static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7095{
7096 struct rt_prio_array *array;
7097 int i;
7098
7099 array = &rt_rq->active;
7100 for (i = 0; i < MAX_RT_PRIO; i++) {
7101 INIT_LIST_HEAD(array->queue + i);
7102 __clear_bit(i, array->bitmap);
7103 }
7104 /* delimiter for bitsearch: */
7105 __set_bit(MAX_RT_PRIO, array->bitmap);
7106
7107#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
7108 rt_rq->highest_prio = MAX_RT_PRIO;
7109#endif
7110#ifdef CONFIG_SMP
7111 rt_rq->rt_nr_migratory = 0;
7112 rt_rq->overloaded = 0;
7113#endif
7114
7115 rt_rq->rt_time = 0;
7116 rt_rq->rt_throttled = 0;
7117
7118#ifdef CONFIG_FAIR_GROUP_SCHED
7119 rt_rq->rq = rq;
7120#endif
7121}
7122
7123#ifdef CONFIG_FAIR_GROUP_SCHED
7124static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7125 struct cfs_rq *cfs_rq, struct sched_entity *se,
7126 int cpu, int add)
7127{
7128 tg->cfs_rq[cpu] = cfs_rq;
7129 init_cfs_rq(cfs_rq, rq);
7130 cfs_rq->tg = tg;
7131 if (add)
7132 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7133
7134 tg->se[cpu] = se;
7135 se->cfs_rq = &rq->cfs;
7136 se->my_q = cfs_rq;
7137 se->load.weight = tg->shares;
7138 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7139 se->parent = NULL;
7140}
7141
7142static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7143 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7144 int cpu, int add)
7145{
7146 tg->rt_rq[cpu] = rt_rq;
7147 init_rt_rq(rt_rq, rq);
7148 rt_rq->tg = tg;
7149 rt_rq->rt_se = rt_se;
7150 if (add)
7151 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7152
7153 tg->rt_se[cpu] = rt_se;
7154 rt_se->rt_rq = &rq->rt;
7155 rt_se->my_q = rt_rq;
7156 rt_se->parent = NULL;
7157 INIT_LIST_HEAD(&rt_se->run_list);
7158}
7159#endif
7160
6749void __init sched_init(void) 7161void __init sched_init(void)
6750{ 7162{
6751 int highest_cpu = 0; 7163 int highest_cpu = 0;
6752 int i, j; 7164 int i, j;
6753 7165
7166#ifdef CONFIG_SMP
7167 init_defrootdomain();
7168#endif
7169
7170#ifdef CONFIG_FAIR_GROUP_SCHED
7171 list_add(&init_task_group.list, &task_groups);
7172#endif
7173
6754 for_each_possible_cpu(i) { 7174 for_each_possible_cpu(i) {
6755 struct rt_prio_array *array;
6756 struct rq *rq; 7175 struct rq *rq;
6757 7176
6758 rq = cpu_rq(i); 7177 rq = cpu_rq(i);
@@ -6761,52 +7180,39 @@ void __init sched_init(void)
6761 rq->nr_running = 0; 7180 rq->nr_running = 0;
6762 rq->clock = 1; 7181 rq->clock = 1;
6763 init_cfs_rq(&rq->cfs, rq); 7182 init_cfs_rq(&rq->cfs, rq);
7183 init_rt_rq(&rq->rt, rq);
6764#ifdef CONFIG_FAIR_GROUP_SCHED 7184#ifdef CONFIG_FAIR_GROUP_SCHED
6765 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6766 {
6767 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6768 struct sched_entity *se =
6769 &per_cpu(init_sched_entity, i);
6770
6771 init_cfs_rq_p[i] = cfs_rq;
6772 init_cfs_rq(cfs_rq, rq);
6773 cfs_rq->tg = &init_task_group;
6774 list_add(&cfs_rq->leaf_cfs_rq_list,
6775 &rq->leaf_cfs_rq_list);
6776
6777 init_sched_entity_p[i] = se;
6778 se->cfs_rq = &rq->cfs;
6779 se->my_q = cfs_rq;
6780 se->load.weight = init_task_group_load;
6781 se->load.inv_weight =
6782 div64_64(1ULL<<32, init_task_group_load);
6783 se->parent = NULL;
6784 }
6785 init_task_group.shares = init_task_group_load; 7185 init_task_group.shares = init_task_group_load;
6786 spin_lock_init(&init_task_group.lock); 7186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7187 init_tg_cfs_entry(rq, &init_task_group,
7188 &per_cpu(init_cfs_rq, i),
7189 &per_cpu(init_sched_entity, i), i, 1);
7190
7191 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7192 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7193 init_tg_rt_entry(rq, &init_task_group,
7194 &per_cpu(init_rt_rq, i),
7195 &per_cpu(init_sched_rt_entity, i), i, 1);
6787#endif 7196#endif
7197 rq->rt_period_expire = 0;
7198 rq->rt_throttled = 0;
6788 7199
6789 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7200 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6790 rq->cpu_load[j] = 0; 7201 rq->cpu_load[j] = 0;
6791#ifdef CONFIG_SMP 7202#ifdef CONFIG_SMP
6792 rq->sd = NULL; 7203 rq->sd = NULL;
7204 rq->rd = NULL;
6793 rq->active_balance = 0; 7205 rq->active_balance = 0;
6794 rq->next_balance = jiffies; 7206 rq->next_balance = jiffies;
6795 rq->push_cpu = 0; 7207 rq->push_cpu = 0;
6796 rq->cpu = i; 7208 rq->cpu = i;
6797 rq->migration_thread = NULL; 7209 rq->migration_thread = NULL;
6798 INIT_LIST_HEAD(&rq->migration_queue); 7210 INIT_LIST_HEAD(&rq->migration_queue);
7211 rq_attach_root(rq, &def_root_domain);
6799#endif 7212#endif
7213 init_rq_hrtick(rq);
6800 atomic_set(&rq->nr_iowait, 0); 7214 atomic_set(&rq->nr_iowait, 0);
6801
6802 array = &rq->rt.active;
6803 for (j = 0; j < MAX_RT_PRIO; j++) {
6804 INIT_LIST_HEAD(array->queue + j);
6805 __clear_bit(j, array->bitmap);
6806 }
6807 highest_cpu = i; 7215 highest_cpu = i;
6808 /* delimiter for bitsearch: */
6809 __set_bit(MAX_RT_PRIO, array->bitmap);
6810 } 7216 }
6811 7217
6812 set_load_weight(&init_task); 7218 set_load_weight(&init_task);
@@ -6975,12 +7381,187 @@ void set_curr_task(int cpu, struct task_struct *p)
6975 7381
6976#ifdef CONFIG_FAIR_GROUP_SCHED 7382#ifdef CONFIG_FAIR_GROUP_SCHED
6977 7383
7384#ifdef CONFIG_SMP
7385/*
7386 * distribute shares of all task groups among their schedulable entities,
7387 * to reflect load distribution across cpus.
7388 */
7389static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7390{
7391 struct cfs_rq *cfs_rq;
7392 struct rq *rq = cpu_rq(this_cpu);
7393 cpumask_t sdspan = sd->span;
7394 int balanced = 1;
7395
7396 /* Walk through all the task groups that we have */
7397 for_each_leaf_cfs_rq(rq, cfs_rq) {
7398 int i;
7399 unsigned long total_load = 0, total_shares;
7400 struct task_group *tg = cfs_rq->tg;
7401
7402 /* Gather total task load of this group across cpus */
7403 for_each_cpu_mask(i, sdspan)
7404 total_load += tg->cfs_rq[i]->load.weight;
7405
7406 /* Nothing to do if this group has no load */
7407 if (!total_load)
7408 continue;
7409
7410 /*
7411 * tg->shares represents the number of cpu shares the task group
7412 * is eligible to hold on a single cpu. On N cpus, it is
7413 * eligible to hold (N * tg->shares) number of cpu shares.
7414 */
7415 total_shares = tg->shares * cpus_weight(sdspan);
7416
7417 /*
7418 * redistribute total_shares across cpus as per the task load
7419 * distribution.
7420 */
7421 for_each_cpu_mask(i, sdspan) {
7422 unsigned long local_load, local_shares;
7423
7424 local_load = tg->cfs_rq[i]->load.weight;
7425 local_shares = (local_load * total_shares) / total_load;
7426 if (!local_shares)
7427 local_shares = MIN_GROUP_SHARES;
7428 if (local_shares == tg->se[i]->load.weight)
7429 continue;
7430
7431 spin_lock_irq(&cpu_rq(i)->lock);
7432 set_se_shares(tg->se[i], local_shares);
7433 spin_unlock_irq(&cpu_rq(i)->lock);
7434 balanced = 0;
7435 }
7436 }
7437
7438 return balanced;
7439}
7440
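The redistribution step above is a straight proportional split: each CPU in the domain receives total_shares scaled by its fraction of the group's total load, with a floor of MIN_GROUP_SHARES. A runnable arithmetic check with invented numbers (two CPUs, tg->shares = 1024; the floor constant's value is a placeholder, not taken from the patch):

#include <stdio.h>

#define MIN_GROUP_SHARES_GUESS 2        /* placeholder; the real constant lives in sched.c */

int main(void)
{
        unsigned long tg_shares = 1024;
        unsigned long load[2] = { 3072, 1024 };         /* per-CPU group load */
        unsigned long total_load = load[0] + load[1];
        unsigned long total_shares = tg_shares * 2;     /* tg->shares * CPUs in the domain */
        int i;

        for (i = 0; i < 2; i++) {
                unsigned long s = load[i] * total_shares / total_load;

                if (!s)
                        s = MIN_GROUP_SHARES_GUESS;
                printf("cpu%d: local_shares = %lu\n", i, s);     /* 1536 and 512 */
        }
        return 0;
}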
7441/*
7442 * How frequently should we rebalance_shares() across cpus?
7443 *
7444 * The more frequently we rebalance shares, the more accurate is the fairness
7445 * of cpu bandwidth distribution between task groups. However higher frequency
7446 * also implies increased scheduling overhead.
7447 *
7448 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7449 * consecutive calls to rebalance_shares() in the same sched domain.
7450 *
7451 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7452 * consecutive calls to rebalance_shares() in the same sched domain.
7453 *
7454 * These settings allow for the appropriate trade-off between accuracy of
7455 * fairness and the associated overhead.
7456 *
7457 */
7458
7459/* default: 8ms, units: milliseconds */
7460const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7461
7462/* default: 128ms, units: milliseconds */
7463const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7464
7465/* kernel thread that runs rebalance_shares() periodically */
7466static int load_balance_monitor(void *unused)
7467{
7468 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7469 struct sched_param schedparm;
7470 int ret;
7471
7472 /*
7473 * We don't want this thread's execution to be limited by the shares
7474 * assigned to the default group (init_task_group). Hence make it run
7475 * as a SCHED_RR RT task at the lowest priority.
7476 */
7477 schedparm.sched_priority = 1;
7478 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7479 if (ret)
7480 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7481 " monitor thread (error = %d) \n", ret);
7482
7483 while (!kthread_should_stop()) {
7484 int i, cpu, balanced = 1;
7485
7486 /* Prevent cpus going down or coming up */
7487 get_online_cpus();
7488 /* lockout changes to doms_cur[] array */
7489 lock_doms_cur();
7490 /*
7491 * Enter a rcu read-side critical section to safely walk rq->sd
7492 * chain on various cpus and to walk task group list
7493 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7494 */
7495 rcu_read_lock();
7496
7497 for (i = 0; i < ndoms_cur; i++) {
7498 cpumask_t cpumap = doms_cur[i];
7499 struct sched_domain *sd = NULL, *sd_prev = NULL;
7500
7501 cpu = first_cpu(cpumap);
7502
7503 /* Find the highest domain at which to balance shares */
7504 for_each_domain(cpu, sd) {
7505 if (!(sd->flags & SD_LOAD_BALANCE))
7506 continue;
7507 sd_prev = sd;
7508 }
7509
7510 sd = sd_prev;
7511 /* sd == NULL? No load balance required in this domain */
7512 if (!sd)
7513 continue;
7514
7515 balanced &= rebalance_shares(sd, cpu);
7516 }
7517
7518 rcu_read_unlock();
7519
7520 unlock_doms_cur();
7521 put_online_cpus();
7522
7523 if (!balanced)
7524 timeout = sysctl_sched_min_bal_int_shares;
7525 else if (timeout < sysctl_sched_max_bal_int_shares)
7526 timeout *= 2;
7527
7528 msleep_interruptible(timeout);
7529 }
7530
7531 return 0;
7532}
7533#endif /* CONFIG_SMP */
7534
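The monitor thread backs off geometrically: any pass that had to adjust shares resets the sleep interval to sysctl_sched_min_bal_int_shares, while every fully balanced pass doubles it up to sysctl_sched_max_bal_int_shares. A small simulation of that interval policy, using the documented 8 ms and 128 ms defaults and an invented sequence of pass outcomes:

#include <stdio.h>

int main(void)
{
        const unsigned int min_int = 8, max_int = 128;  /* ms, documented defaults */
        unsigned int timeout = min_int;
        /* 1 = every group already balanced this pass, 0 = shares were adjusted */
        int balanced[] = { 1, 1, 1, 1, 1, 0, 1, 1 };
        int i;

        for (i = 0; i < 8; i++) {
                if (!balanced[i])
                        timeout = min_int;
                else if (timeout < max_int)
                        timeout *= 2;
                printf("pass %d: sleep %u ms\n", i, timeout);
        }
        return 0;
}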
7535static void free_sched_group(struct task_group *tg)
7536{
7537 int i;
7538
7539 for_each_possible_cpu(i) {
7540 if (tg->cfs_rq)
7541 kfree(tg->cfs_rq[i]);
7542 if (tg->se)
7543 kfree(tg->se[i]);
7544 if (tg->rt_rq)
7545 kfree(tg->rt_rq[i]);
7546 if (tg->rt_se)
7547 kfree(tg->rt_se[i]);
7548 }
7549
7550 kfree(tg->cfs_rq);
7551 kfree(tg->se);
7552 kfree(tg->rt_rq);
7553 kfree(tg->rt_se);
7554 kfree(tg);
7555}
7556
6978/* allocate runqueue etc for a new task group */ 7557/* allocate runqueue etc for a new task group */
6979struct task_group *sched_create_group(void) 7558struct task_group *sched_create_group(void)
6980{ 7559{
6981 struct task_group *tg; 7560 struct task_group *tg;
6982 struct cfs_rq *cfs_rq; 7561 struct cfs_rq *cfs_rq;
6983 struct sched_entity *se; 7562 struct sched_entity *se;
7563 struct rt_rq *rt_rq;
7564 struct sched_rt_entity *rt_se;
6984 struct rq *rq; 7565 struct rq *rq;
6985 int i; 7566 int i;
6986 7567
@@ -6994,97 +7575,89 @@ struct task_group *sched_create_group(void)
6994 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7575 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6995 if (!tg->se) 7576 if (!tg->se)
6996 goto err; 7577 goto err;
7578 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7579 if (!tg->rt_rq)
7580 goto err;
7581 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7582 if (!tg->rt_se)
7583 goto err;
7584
7585 tg->shares = NICE_0_LOAD;
7586 tg->rt_ratio = 0; /* XXX */
6997 7587
6998 for_each_possible_cpu(i) { 7588 for_each_possible_cpu(i) {
6999 rq = cpu_rq(i); 7589 rq = cpu_rq(i);
7000 7590
7001 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, 7591 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
7002 cpu_to_node(i)); 7592 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7003 if (!cfs_rq) 7593 if (!cfs_rq)
7004 goto err; 7594 goto err;
7005 7595
7006 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, 7596 se = kmalloc_node(sizeof(struct sched_entity),
7007 cpu_to_node(i)); 7597 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7008 if (!se) 7598 if (!se)
7009 goto err; 7599 goto err;
7010 7600
7011 memset(cfs_rq, 0, sizeof(struct cfs_rq)); 7601 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7012 memset(se, 0, sizeof(struct sched_entity)); 7602 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7603 if (!rt_rq)
7604 goto err;
7013 7605
7014 tg->cfs_rq[i] = cfs_rq; 7606 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7015 init_cfs_rq(cfs_rq, rq); 7607 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7016 cfs_rq->tg = tg; 7608 if (!rt_se)
7609 goto err;
7017 7610
7018 tg->se[i] = se; 7611 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7019 se->cfs_rq = &rq->cfs; 7612 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7020 se->my_q = cfs_rq;
7021 se->load.weight = NICE_0_LOAD;
7022 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
7023 se->parent = NULL;
7024 } 7613 }
7025 7614
7615 lock_task_group_list();
7026 for_each_possible_cpu(i) { 7616 for_each_possible_cpu(i) {
7027 rq = cpu_rq(i); 7617 rq = cpu_rq(i);
7028 cfs_rq = tg->cfs_rq[i]; 7618 cfs_rq = tg->cfs_rq[i];
7029 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7619 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7620 rt_rq = tg->rt_rq[i];
7621 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7030 } 7622 }
7031 7623 list_add_rcu(&tg->list, &task_groups);
7032 tg->shares = NICE_0_LOAD; 7624 unlock_task_group_list();
7033 spin_lock_init(&tg->lock);
7034 7625
7035 return tg; 7626 return tg;
7036 7627
7037err: 7628err:
7038 for_each_possible_cpu(i) { 7629 free_sched_group(tg);
7039 if (tg->cfs_rq)
7040 kfree(tg->cfs_rq[i]);
7041 if (tg->se)
7042 kfree(tg->se[i]);
7043 }
7044 kfree(tg->cfs_rq);
7045 kfree(tg->se);
7046 kfree(tg);
7047
7048 return ERR_PTR(-ENOMEM); 7630 return ERR_PTR(-ENOMEM);
7049} 7631}
7050 7632
7051/* rcu callback to free various structures associated with a task group */ 7633/* rcu callback to free various structures associated with a task group */
7052static void free_sched_group(struct rcu_head *rhp) 7634static void free_sched_group_rcu(struct rcu_head *rhp)
7053{ 7635{
7054 struct task_group *tg = container_of(rhp, struct task_group, rcu);
7055 struct cfs_rq *cfs_rq;
7056 struct sched_entity *se;
7057 int i;
7058
7059 /* now it should be safe to free those cfs_rqs */ 7636 /* now it should be safe to free those cfs_rqs */
7060 for_each_possible_cpu(i) { 7637 free_sched_group(container_of(rhp, struct task_group, rcu));
7061 cfs_rq = tg->cfs_rq[i];
7062 kfree(cfs_rq);
7063
7064 se = tg->se[i];
7065 kfree(se);
7066 }
7067
7068 kfree(tg->cfs_rq);
7069 kfree(tg->se);
7070 kfree(tg);
7071} 7638}
7072 7639
7073/* Destroy runqueue etc associated with a task group */ 7640/* Destroy runqueue etc associated with a task group */
7074void sched_destroy_group(struct task_group *tg) 7641void sched_destroy_group(struct task_group *tg)
7075{ 7642{
7076 struct cfs_rq *cfs_rq = NULL; 7643 struct cfs_rq *cfs_rq = NULL;
7644 struct rt_rq *rt_rq = NULL;
7077 int i; 7645 int i;
7078 7646
7647 lock_task_group_list();
7079 for_each_possible_cpu(i) { 7648 for_each_possible_cpu(i) {
7080 cfs_rq = tg->cfs_rq[i]; 7649 cfs_rq = tg->cfs_rq[i];
7081 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7650 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7651 rt_rq = tg->rt_rq[i];
7652 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7082 } 7653 }
7654 list_del_rcu(&tg->list);
7655 unlock_task_group_list();
7083 7656
7084 BUG_ON(!cfs_rq); 7657 BUG_ON(!cfs_rq);
7085 7658
7086 /* wait for possible concurrent references to cfs_rqs complete */ 7659 /* wait for possible concurrent references to cfs_rqs complete */
7087 call_rcu(&tg->rcu, free_sched_group); 7660 call_rcu(&tg->rcu, free_sched_group_rcu);
7088} 7661}
7089 7662
7090/* change task's runqueue when it moves between groups. 7663/* change task's runqueue when it moves between groups.
@@ -7100,11 +7673,6 @@ void sched_move_task(struct task_struct *tsk)
7100 7673
7101 rq = task_rq_lock(tsk, &flags); 7674 rq = task_rq_lock(tsk, &flags);
7102 7675
7103 if (tsk->sched_class != &fair_sched_class) {
7104 set_task_cfs_rq(tsk, task_cpu(tsk));
7105 goto done;
7106 }
7107
7108 update_rq_clock(rq); 7676 update_rq_clock(rq);
7109 7677
7110 running = task_current(rq, tsk); 7678 running = task_current(rq, tsk);
@@ -7116,7 +7684,7 @@ void sched_move_task(struct task_struct *tsk)
7116 tsk->sched_class->put_prev_task(rq, tsk); 7684 tsk->sched_class->put_prev_task(rq, tsk);
7117 } 7685 }
7118 7686
7119 set_task_cfs_rq(tsk, task_cpu(tsk)); 7687 set_task_rq(tsk, task_cpu(tsk));
7120 7688
7121 if (on_rq) { 7689 if (on_rq) {
7122 if (unlikely(running)) 7690 if (unlikely(running))
@@ -7124,53 +7692,82 @@ void sched_move_task(struct task_struct *tsk)
7124 enqueue_task(rq, tsk, 0); 7692 enqueue_task(rq, tsk, 0);
7125 } 7693 }
7126 7694
7127done:
7128 task_rq_unlock(rq, &flags); 7695 task_rq_unlock(rq, &flags);
7129} 7696}
7130 7697
7698/* rq->lock to be locked by caller */
7131static void set_se_shares(struct sched_entity *se, unsigned long shares) 7699static void set_se_shares(struct sched_entity *se, unsigned long shares)
7132{ 7700{
7133 struct cfs_rq *cfs_rq = se->cfs_rq; 7701 struct cfs_rq *cfs_rq = se->cfs_rq;
7134 struct rq *rq = cfs_rq->rq; 7702 struct rq *rq = cfs_rq->rq;
7135 int on_rq; 7703 int on_rq;
7136 7704
7137 spin_lock_irq(&rq->lock); 7705 if (!shares)
7706 shares = MIN_GROUP_SHARES;
7138 7707
7139 on_rq = se->on_rq; 7708 on_rq = se->on_rq;
7140 if (on_rq) 7709 if (on_rq) {
7141 dequeue_entity(cfs_rq, se, 0); 7710 dequeue_entity(cfs_rq, se, 0);
7711 dec_cpu_load(rq, se->load.weight);
7712 }
7142 7713
7143 se->load.weight = shares; 7714 se->load.weight = shares;
7144 se->load.inv_weight = div64_64((1ULL<<32), shares); 7715 se->load.inv_weight = div64_64((1ULL<<32), shares);
7145 7716
7146 if (on_rq) 7717 if (on_rq) {
7147 enqueue_entity(cfs_rq, se, 0); 7718 enqueue_entity(cfs_rq, se, 0);
7148 7719 inc_cpu_load(rq, se->load.weight);
7149 spin_unlock_irq(&rq->lock); 7720 }
7150} 7721}
7151 7722
7152int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7723int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7153{ 7724{
7154 int i; 7725 int i;
7726 struct cfs_rq *cfs_rq;
7727 struct rq *rq;
7728
7729 lock_task_group_list();
7730 if (tg->shares == shares)
7731 goto done;
7732
7733 if (shares < MIN_GROUP_SHARES)
7734 shares = MIN_GROUP_SHARES;
7155 7735
7156 /* 7736 /*
7157 * A weight of 0 or 1 can cause arithmetics problems. 7737 * Prevent any load balance activity (rebalance_shares,
7158 * (The default weight is 1024 - so there's no practical 7738 * load_balance_fair) from referring to this group first,
7159 * limitation from this.) 7739 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7160 */ 7740 */
7161 if (shares < 2) 7741 for_each_possible_cpu(i) {
7162 shares = 2; 7742 cfs_rq = tg->cfs_rq[i];
7743 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7744 }
7163 7745
7164 spin_lock(&tg->lock); 7746 /* wait for any ongoing reference to this group to finish */
7165 if (tg->shares == shares) 7747 synchronize_sched();
7166 goto done;
7167 7748
7749 /*
7750 * Now we are free to modify the group's share on each cpu
7751 * w/o tripping rebalance_shares or load_balance_fair.
7752 */
7168 tg->shares = shares; 7753 tg->shares = shares;
7169 for_each_possible_cpu(i) 7754 for_each_possible_cpu(i) {
7755 spin_lock_irq(&cpu_rq(i)->lock);
7170 set_se_shares(tg->se[i], shares); 7756 set_se_shares(tg->se[i], shares);
7757 spin_unlock_irq(&cpu_rq(i)->lock);
7758 }
7171 7759
7760 /*
7761 * Enable load balance activity on this group, by inserting it back on
7762 * each cpu's rq->leaf_cfs_rq_list.
7763 */
7764 for_each_possible_cpu(i) {
7765 rq = cpu_rq(i);
7766 cfs_rq = tg->cfs_rq[i];
7767 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7768 }
7172done: 7769done:
7173 spin_unlock(&tg->lock); 7770 unlock_task_group_list();
7174 return 0; 7771 return 0;
7175} 7772}
7176 7773
@@ -7179,6 +7776,31 @@ unsigned long sched_group_shares(struct task_group *tg)
7179 return tg->shares; 7776 return tg->shares;
7180} 7777}
7181 7778
7779/*
7780 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
7781 */
7782int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7783{
7784 struct task_group *tgi;
7785 unsigned long total = 0;
7786
7787 rcu_read_lock();
7788 list_for_each_entry_rcu(tgi, &task_groups, list)
7789 total += tgi->rt_ratio;
7790 rcu_read_unlock();
7791
7792 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7793 return -EINVAL;
7794
7795 tg->rt_ratio = rt_ratio;
7796 return 0;
7797}
7798
7799unsigned long sched_group_rt_ratio(struct task_group *tg)
7800{
7801 return tg->rt_ratio;
7802}
7803
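sched_group_set_rt_ratio() is an admission test: the sum of all groups' rt_ratio values, with the caller's old value swapped for the requested one, must stay within sysctl_sched_rt_ratio. A runnable sketch of that check with invented ratios and limit:

#include <stdio.h>

/* Returns 0 on success, -1 if the new ratio would oversubscribe the limit. */
static int set_rt_ratio(unsigned long *ratios, int n, int idx,
                        unsigned long new_ratio, unsigned long limit)
{
        unsigned long total = 0;
        int i;

        for (i = 0; i < n; i++)
                total += ratios[i];

        if (total + new_ratio - ratios[idx] > limit)
                return -1;

        ratios[idx] = new_ratio;
        return 0;
}

int main(void)
{
        unsigned long ratios[3] = { 200, 300, 100 };    /* made-up per-group ratios */
        unsigned long limit = 800;                      /* stand-in for sysctl_sched_rt_ratio */

        printf("raise group 2 to 250: %d\n", set_rt_ratio(ratios, 3, 2, 250, limit)); /* ok */
        printf("raise group 2 to 400: %d\n", set_rt_ratio(ratios, 3, 2, 400, limit)); /* rejected */
        return 0;
}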
7182#endif /* CONFIG_FAIR_GROUP_SCHED */ 7804#endif /* CONFIG_FAIR_GROUP_SCHED */
7183 7805
7184#ifdef CONFIG_FAIR_CGROUP_SCHED 7806#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7254,12 +7876,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7254 return (u64) tg->shares; 7876 return (u64) tg->shares;
7255} 7877}
7256 7878
7879static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7880 u64 rt_ratio_val)
7881{
7882 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7883}
7884
7885static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7886{
7887 struct task_group *tg = cgroup_tg(cgrp);
7888
7889 return (u64) tg->rt_ratio;
7890}
7891
7257static struct cftype cpu_files[] = { 7892static struct cftype cpu_files[] = {
7258 { 7893 {
7259 .name = "shares", 7894 .name = "shares",
7260 .read_uint = cpu_shares_read_uint, 7895 .read_uint = cpu_shares_read_uint,
7261 .write_uint = cpu_shares_write_uint, 7896 .write_uint = cpu_shares_write_uint,
7262 }, 7897 },
7898 {
7899 .name = "rt_ratio",
7900 .read_uint = cpu_rt_ratio_read_uint,
7901 .write_uint = cpu_rt_ratio_write_uint,
7902 },
7263}; 7903};
7264 7904
7265static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7905static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 80fbbfc04290..4b5e24cf2f4a 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu)
179 PN(prev_clock_raw); 179 PN(prev_clock_raw);
180 P(clock_warps); 180 P(clock_warps);
181 P(clock_overflows); 181 P(clock_overflows);
182 P(clock_underflows);
182 P(clock_deep_idle_events); 183 P(clock_deep_idle_events);
183 PN(clock_max_delta); 184 PN(clock_max_delta);
184 P(cpu_load[0]); 185 P(cpu_load[0]);
@@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
299 PN(se.exec_max); 300 PN(se.exec_max);
300 PN(se.slice_max); 301 PN(se.slice_max);
301 PN(se.wait_max); 302 PN(se.wait_max);
303 PN(se.wait_sum);
304 P(se.wait_count);
302 P(sched_info.bkl_count); 305 P(sched_info.bkl_count);
303 P(se.nr_migrations); 306 P(se.nr_migrations);
304 P(se.nr_migrations_cold); 307 P(se.nr_migrations_cold);
@@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p)
366{ 369{
367#ifdef CONFIG_SCHEDSTATS 370#ifdef CONFIG_SCHEDSTATS
368 p->se.wait_max = 0; 371 p->se.wait_max = 0;
372 p->se.wait_sum = 0;
373 p->se.wait_count = 0;
369 p->se.sleep_max = 0; 374 p->se.sleep_max = 0;
370 p->se.sum_sleep_runtime = 0; 375 p->se.sum_sleep_runtime = 0;
371 p->se.block_max = 0; 376 p->se.block_max = 0;
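wait_sum and wait_count together give what wait_max alone could not: an average wait per scheduling. Assuming the counters are in nanoseconds like the other se.* statistics, the derived figure is a single division (values below are made up):

#include <stdio.h>

int main(void)
{
        /* invented schedstats for one task */
        unsigned long long wait_sum_ns = 7300000ULL;    /* 7.3 ms total wait */
        unsigned long long wait_count = 25;

        printf("avg wait: %.3f ms\n",
               (double)wait_sum_ns / wait_count / 1e6); /* 0.292 ms */
        return 0;
}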
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index da7c061e7206..72e25c7a3a18 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -20,6 +20,8 @@
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21 */ 21 */
22 22
23#include <linux/latencytop.h>
24
23/* 25/*
24 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
25 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running)
248 unsigned long nr_latency = sched_nr_latency; 250 unsigned long nr_latency = sched_nr_latency;
249 251
250 if (unlikely(nr_running > nr_latency)) { 252 if (unlikely(nr_running > nr_latency)) {
253 period = sysctl_sched_min_granularity;
251 period *= nr_running; 254 period *= nr_running;
252 do_div(period, nr_latency);
253 } 255 }
254 256
255 return period; 257 return period;
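With this change an over-committed runqueue no longer stretches the latency target by nr_running/nr_latency; the period simply becomes sysctl_sched_min_granularity * nr_running, so every task keeps at least the minimum granularity. A quick numeric comparison, assuming the period previously started from a 20 ms latency target, nr_latency = 5, and a tuned 1 ms minimum granularity (all three values are illustrative):

#include <stdio.h>

int main(void)
{
        unsigned long long latency = 20000000ULL;       /* 20 ms in ns (assumed) */
        unsigned long long min_gran = 1000000ULL;       /* 1 ms in ns (tuned, assumed) */
        unsigned long nr_latency = 5, nr_running = 8;
        unsigned long long old_period, new_period;

        /* old behaviour: scale the latency target */
        old_period = latency * nr_running / nr_latency; /* 32 ms */
        /* new behaviour: guarantee min_granularity per runnable task */
        new_period = min_gran * nr_running;             /* 8 ms */

        printf("old=%llu ns new=%llu ns\n", old_period, new_period);
        return 0;
}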
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
383{ 385{
384 schedstat_set(se->wait_max, max(se->wait_max, 386 schedstat_set(se->wait_max, max(se->wait_max,
385 rq_of(cfs_rq)->clock - se->wait_start)); 387 rq_of(cfs_rq)->clock - se->wait_start));
388 schedstat_set(se->wait_count, se->wait_count + 1);
389 schedstat_set(se->wait_sum, se->wait_sum +
390 rq_of(cfs_rq)->clock - se->wait_start);
386 schedstat_set(se->wait_start, 0); 391 schedstat_set(se->wait_start, 0);
387} 392}
388 393
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
434#ifdef CONFIG_SCHEDSTATS 439#ifdef CONFIG_SCHEDSTATS
435 if (se->sleep_start) { 440 if (se->sleep_start) {
436 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 441 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
442 struct task_struct *tsk = task_of(se);
437 443
438 if ((s64)delta < 0) 444 if ((s64)delta < 0)
439 delta = 0; 445 delta = 0;
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
443 449
444 se->sleep_start = 0; 450 se->sleep_start = 0;
445 se->sum_sleep_runtime += delta; 451 se->sum_sleep_runtime += delta;
452
453 account_scheduler_latency(tsk, delta >> 10, 1);
446 } 454 }
447 if (se->block_start) { 455 if (se->block_start) {
448 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 456 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
457 struct task_struct *tsk = task_of(se);
449 458
450 if ((s64)delta < 0) 459 if ((s64)delta < 0)
451 delta = 0; 460 delta = 0;
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
462 * time that the task spent sleeping: 471 * time that the task spent sleeping:
463 */ 472 */
464 if (unlikely(prof_on == SLEEP_PROFILING)) { 473 if (unlikely(prof_on == SLEEP_PROFILING)) {
465 struct task_struct *tsk = task_of(se);
466 474
467 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 475 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
468 delta >> 20); 476 delta >> 20);
469 } 477 }
478 account_scheduler_latency(tsk, delta >> 10, 0);
470 } 479 }
471#endif 480#endif
472} 481}
@@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
642 cfs_rq->curr = NULL; 651 cfs_rq->curr = NULL;
643} 652}
644 653
645static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 654static void
655entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
646{ 656{
647 /* 657 /*
648 * Update run-time statistics of the 'current'. 658 * Update run-time statistics of the 'current'.
649 */ 659 */
650 update_curr(cfs_rq); 660 update_curr(cfs_rq);
651 661
662#ifdef CONFIG_SCHED_HRTICK
663 /*
664 * queued ticks are scheduled to match the slice, so don't bother
665 * validating it and just reschedule.
666 */
667 if (queued)
668 return resched_task(rq_of(cfs_rq)->curr);
669 /*
670 * don't let the period tick interfere with the hrtick preemption
671 */
672 if (!sched_feat(DOUBLE_TICK) &&
673 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
674 return;
675#endif
676
652 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 677 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
653 check_preempt_tick(cfs_rq, curr); 678 check_preempt_tick(cfs_rq, curr);
654} 679}
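
The queued-tick handling above leans on hrtick_start_fair() (added further down in this diff), which arms a high-resolution timer for the remainder of the current entity's slice. A standalone sketch of that remainder arithmetic with hypothetical numbers; only the sign check and the 10000ns floor mirror the patch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical per-entity numbers, all in nanoseconds. */
        int64_t slice = 4000000;        /* sched_slice() for the entity             */
        int64_t ran   = 3995000;        /* runtime used since prev_sum_exec_runtime */
        int64_t delta = slice - ran;
        int requeue   = 1;              /* the entity is rq->curr in this example   */

        if (delta < 0) {
                printf("slice already exhausted: reschedule now\n");
                return 0;
        }
        if (!requeue && delta < 10000)  /* don't arm timers shorter than 10us */
                delta = 10000;
        printf("arm the hrtick to fire in %lld ns\n", (long long)delta);
        return 0;
}
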
@@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
690 715
691/* Iterate thr' all leaf cfs_rq's on a runqueue */ 716/* Iterate thr' all leaf cfs_rq's on a runqueue */
692#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 717#define for_each_leaf_cfs_rq(rq, cfs_rq) \
693 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 718 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
694 719
695/* Do the two (enqueued) entities belong to the same group ? */ 720/* Do the two (enqueued) entities belong to the same group ? */
696static inline int 721static inline int
@@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
707 return se->parent; 732 return se->parent;
708} 733}
709 734
735#define GROUP_IMBALANCE_PCT 20
736
710#else /* CONFIG_FAIR_GROUP_SCHED */ 737#else /* CONFIG_FAIR_GROUP_SCHED */
711 738
712#define for_each_sched_entity(se) \ 739#define for_each_sched_entity(se) \
@@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
752 779
753#endif /* CONFIG_FAIR_GROUP_SCHED */ 780#endif /* CONFIG_FAIR_GROUP_SCHED */
754 781
782#ifdef CONFIG_SCHED_HRTICK
783static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
784{
785 int requeue = rq->curr == p;
786 struct sched_entity *se = &p->se;
787 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788
789 WARN_ON(task_rq(p) != rq);
790
791 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
792 u64 slice = sched_slice(cfs_rq, se);
793 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
794 s64 delta = slice - ran;
795
796 if (delta < 0) {
797 if (rq->curr == p)
798 resched_task(p);
799 return;
800 }
801
802 /*
803 * Don't schedule slices shorter than 10000ns; that just
804 * doesn't make sense. Rely on vruntime for fairness.
805 */
806 if (!requeue)
807 delta = max(10000LL, delta);
808
809 hrtick_start(rq, delta, requeue);
810 }
811}
812#else
813static inline void
814hrtick_start_fair(struct rq *rq, struct task_struct *p)
815{
816}
817#endif
818
755/* 819/*
756 * The enqueue_task method is called before nr_running is 820 * The enqueue_task method is called before nr_running is
757 * increased. Here we update the fair scheduling stats and 821 * increased. Here we update the fair scheduling stats and
@@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
760static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 824static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
761{ 825{
762 struct cfs_rq *cfs_rq; 826 struct cfs_rq *cfs_rq;
763 struct sched_entity *se = &p->se; 827 struct sched_entity *se = &p->se,
828 *topse = NULL; /* Highest schedulable entity */
829 int incload = 1;
764 830
765 for_each_sched_entity(se) { 831 for_each_sched_entity(se) {
766 if (se->on_rq) 832 topse = se;
833 if (se->on_rq) {
834 incload = 0;
767 break; 835 break;
836 }
768 cfs_rq = cfs_rq_of(se); 837 cfs_rq = cfs_rq_of(se);
769 enqueue_entity(cfs_rq, se, wakeup); 838 enqueue_entity(cfs_rq, se, wakeup);
770 wakeup = 1; 839 wakeup = 1;
771 } 840 }
841 /* Increment cpu load if we just enqueued the first task of a group on
842 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
843 * at the highest grouping level.
844 */
845 if (incload)
846 inc_cpu_load(rq, topse->load.weight);
847
848 hrtick_start_fair(rq, rq->curr);
772} 849}
773 850
774/* 851/*
@@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
779static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 856static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
780{ 857{
781 struct cfs_rq *cfs_rq; 858 struct cfs_rq *cfs_rq;
782 struct sched_entity *se = &p->se; 859 struct sched_entity *se = &p->se,
860 *topse = NULL; /* Highest schedulable entity */
861 int decload = 1;
783 862
784 for_each_sched_entity(se) { 863 for_each_sched_entity(se) {
864 topse = se;
785 cfs_rq = cfs_rq_of(se); 865 cfs_rq = cfs_rq_of(se);
786 dequeue_entity(cfs_rq, se, sleep); 866 dequeue_entity(cfs_rq, se, sleep);
787 /* Don't dequeue parent if it has other entities besides us */ 867 /* Don't dequeue parent if it has other entities besides us */
788 if (cfs_rq->load.weight) 868 if (cfs_rq->load.weight) {
869 if (parent_entity(se))
870 decload = 0;
789 break; 871 break;
872 }
790 sleep = 1; 873 sleep = 1;
791 } 874 }
875 /* Decrement cpu load if we just dequeued the last task of a group on
876 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
877 * at the highest grouping level.
878 */
879 if (decload)
880 dec_cpu_load(rq, topse->load.weight);
881
882 hrtick_start_fair(rq, rq->curr);
792} 883}
793 884
794/* 885/*
@@ -836,6 +927,154 @@ static void yield_task_fair(struct rq *rq)
836} 927}
837 928
838/* 929/*
930 * wake_idle() will wake a task on an idle cpu if task->cpu is
931 * not idle and an idle cpu is available. The span of cpus to
932 * search starts with cpus closest then further out as needed,
933 * so we always favor a closer, idle cpu.
934 *
935 * Returns the CPU we should wake onto.
936 */
937#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
938static int wake_idle(int cpu, struct task_struct *p)
939{
940 cpumask_t tmp;
941 struct sched_domain *sd;
942 int i;
943
944 /*
945 * If it is idle, then it is the best cpu to run this task.
946 *
947 * This cpu is also the best, if it has more than one task already.
948 * Siblings must also be busy (in most cases) as they didn't already
949 * pick up the extra load from this cpu and hence we need not check
950 * sibling runqueue info. This will avoid the checks and cache miss
951 * penalties associated with that.
952 */
953 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
954 return cpu;
955
956 for_each_domain(cpu, sd) {
957 if (sd->flags & SD_WAKE_IDLE) {
958 cpus_and(tmp, sd->span, p->cpus_allowed);
959 for_each_cpu_mask(i, tmp) {
960 if (idle_cpu(i)) {
961 if (i != task_cpu(p)) {
962 schedstat_inc(p,
963 se.nr_wakeups_idle);
964 }
965 return i;
966 }
967 }
968 } else {
969 break;
970 }
971 }
972 return cpu;
973}
974#else
975static inline int wake_idle(int cpu, struct task_struct *p)
976{
977 return cpu;
978}
979#endif
980
981#ifdef CONFIG_SMP
982static int select_task_rq_fair(struct task_struct *p, int sync)
983{
984 int cpu, this_cpu;
985 struct rq *rq;
986 struct sched_domain *sd, *this_sd = NULL;
987 int new_cpu;
988
989 cpu = task_cpu(p);
990 rq = task_rq(p);
991 this_cpu = smp_processor_id();
992 new_cpu = cpu;
993
994 if (cpu == this_cpu)
995 goto out_set_cpu;
996
997 for_each_domain(this_cpu, sd) {
998 if (cpu_isset(cpu, sd->span)) {
999 this_sd = sd;
1000 break;
1001 }
1002 }
1003
1004 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu;
1006
1007 /*
1008 * Check for affine wakeup and passive balancing possibilities.
1009 */
1010 if (this_sd) {
1011 int idx = this_sd->wake_idx;
1012 unsigned int imbalance;
1013 unsigned long load, this_load;
1014
1015 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1016
1017 load = source_load(cpu, idx);
1018 this_load = target_load(this_cpu, idx);
1019
1020 new_cpu = this_cpu; /* Wake to this CPU if we can */
1021
1022 if (this_sd->flags & SD_WAKE_AFFINE) {
1023 unsigned long tl = this_load;
1024 unsigned long tl_per_task;
1025
1026 /*
1027 * Attract cache-cold tasks on sync wakeups:
1028 */
1029 if (sync && !task_hot(p, rq->clock, this_sd))
1030 goto out_set_cpu;
1031
1032 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1033 tl_per_task = cpu_avg_load_per_task(this_cpu);
1034
1035 /*
1036 * If sync wakeup then subtract the (maximum possible)
1037 * effect of the currently running task from the load
1038 * of the current CPU:
1039 */
1040 if (sync)
1041 tl -= current->se.load.weight;
1042
1043 if ((tl <= load &&
1044 tl + target_load(cpu, idx) <= tl_per_task) ||
1045 100*(tl + p->se.load.weight) <= imbalance*load) {
1046 /*
1047 * This domain has SD_WAKE_AFFINE and
1048 * p is cache cold in this domain, and
1049 * there is no bad imbalance.
1050 */
1051 schedstat_inc(this_sd, ttwu_move_affine);
1052 schedstat_inc(p, se.nr_wakeups_affine);
1053 goto out_set_cpu;
1054 }
1055 }
1056
1057 /*
1058 * Start passive balancing when half the imbalance_pct
1059 * limit is reached.
1060 */
1061 if (this_sd->flags & SD_WAKE_BALANCE) {
1062 if (imbalance*this_load <= 100*load) {
1063 schedstat_inc(this_sd, ttwu_move_balance);
1064 schedstat_inc(p, se.nr_wakeups_passive);
1065 goto out_set_cpu;
1066 }
1067 }
1068 }
1069
1070 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1071out_set_cpu:
1072 return wake_idle(new_cpu, p);
1073}
1074#endif /* CONFIG_SMP */
1075
1076
1077/*
839 * Preempt the current task with a newly woken task if needed: 1078 * Preempt the current task with a newly woken task if needed:
840 */ 1079 */
841static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1080static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
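
The SD_WAKE_AFFINE block in select_task_rq_fair() above compares scaled loads rather than raw ones before pulling the wakee onto the waking CPU. A standalone sketch of that comparison; the variable names mirror the patch, the load figures are hypothetical:

#include <stdio.h>

int main(void)
{
        /* Hypothetical load figures, in load-weight units. */
        unsigned long load          = 2048;     /* source_load(cpu, idx)      */
        unsigned long this_load     = 1024;     /* target_load(this_cpu, idx) */
        unsigned long target_cpu_ld = 2048;     /* target_load(cpu, idx)      */
        unsigned long task_weight   = 1024;     /* p->se.load.weight          */
        unsigned long tl_per_task   = 1024;     /* cpu_avg_load_per_task()    */
        unsigned int  imbalance_pct = 125;      /* this_sd->imbalance_pct     */

        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;      /* 112 */
        unsigned long tl = this_load;   /* no sync-wakeup adjustment here     */

        if ((tl <= load && tl + target_cpu_ld <= tl_per_task) ||
            100 * (tl + task_weight) <= imbalance * load)
                printf("affine wakeup: run the task on this_cpu\n");
        else
                printf("no affine wakeup: keep the task near its previous cpu\n");
        return 0;
}
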
@@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
876 1115
877static struct task_struct *pick_next_task_fair(struct rq *rq) 1116static struct task_struct *pick_next_task_fair(struct rq *rq)
878{ 1117{
1118 struct task_struct *p;
879 struct cfs_rq *cfs_rq = &rq->cfs; 1119 struct cfs_rq *cfs_rq = &rq->cfs;
880 struct sched_entity *se; 1120 struct sched_entity *se;
881 1121
@@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
887 cfs_rq = group_cfs_rq(se); 1127 cfs_rq = group_cfs_rq(se);
888 } while (cfs_rq); 1128 } while (cfs_rq);
889 1129
890 return task_of(se); 1130 p = task_of(se);
1131 hrtick_start_fair(rq, p);
1132
1133 return p;
891} 1134}
892 1135
893/* 1136/*
@@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
944 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1187 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
945} 1188}
946 1189
947#ifdef CONFIG_FAIR_GROUP_SCHED
948static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
949{
950 struct sched_entity *curr;
951 struct task_struct *p;
952
953 if (!cfs_rq->nr_running)
954 return MAX_PRIO;
955
956 curr = cfs_rq->curr;
957 if (!curr)
958 curr = __pick_next_entity(cfs_rq);
959
960 p = task_of(curr);
961
962 return p->prio;
963}
964#endif
965
966static unsigned long 1190static unsigned long
967load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1191load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
968 unsigned long max_load_move, 1192 unsigned long max_load_move,
@@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
972 struct cfs_rq *busy_cfs_rq; 1196 struct cfs_rq *busy_cfs_rq;
973 long rem_load_move = max_load_move; 1197 long rem_load_move = max_load_move;
974 struct rq_iterator cfs_rq_iterator; 1198 struct rq_iterator cfs_rq_iterator;
1199 unsigned long load_moved;
975 1200
976 cfs_rq_iterator.start = load_balance_start_fair; 1201 cfs_rq_iterator.start = load_balance_start_fair;
977 cfs_rq_iterator.next = load_balance_next_fair; 1202 cfs_rq_iterator.next = load_balance_next_fair;
978 1203
979 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1204 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
980#ifdef CONFIG_FAIR_GROUP_SCHED 1205#ifdef CONFIG_FAIR_GROUP_SCHED
981 struct cfs_rq *this_cfs_rq; 1206 struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
982 long imbalance; 1207 unsigned long maxload, task_load, group_weight;
983 unsigned long maxload; 1208 unsigned long thisload, per_task_load;
1209 struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
1210
1211 task_load = busy_cfs_rq->load.weight;
1212 group_weight = se->load.weight;
984 1213
985 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1214 /*
1215 * 'group_weight' is contributed by tasks of total weight
1216 * 'task_load'. To move 'rem_load_move' worth of weight only,
1217 * we need to move a maximum task load of:
1218 *
1219 * maxload = (remload / group_weight) * task_load;
1220 */
1221 maxload = (rem_load_move * task_load) / group_weight;
986 1222
987 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1223 if (!maxload || !task_load)
988 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
989 if (imbalance <= 0)
990 continue; 1224 continue;
991 1225
992 /* Don't pull more than imbalance/2 */ 1226 per_task_load = task_load / busy_cfs_rq->nr_running;
993 imbalance /= 2; 1227 /*
994 maxload = min(rem_load_move, imbalance); 1228 * balance_tasks will try to forcibly move atleast one task if
1229 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
1230 * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
1231 */
1232 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
1233 continue;
995 1234
996 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1235 /* Disable priority-based load balance */
1236 *this_best_prio = 0;
1237 thisload = this_cfs_rq->load.weight;
997#else 1238#else
998# define maxload rem_load_move 1239# define maxload rem_load_move
999#endif 1240#endif
@@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1002 * load_balance_[start|next]_fair iterators 1243 * load_balance_[start|next]_fair iterators
1003 */ 1244 */
1004 cfs_rq_iterator.arg = busy_cfs_rq; 1245 cfs_rq_iterator.arg = busy_cfs_rq;
1005 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, 1246 load_moved = balance_tasks(this_rq, this_cpu, busiest,
1006 maxload, sd, idle, all_pinned, 1247 maxload, sd, idle, all_pinned,
1007 this_best_prio, 1248 this_best_prio,
1008 &cfs_rq_iterator); 1249 &cfs_rq_iterator);
1009 1250
1251#ifdef CONFIG_FAIR_GROUP_SCHED
1252 /*
1253 * load_moved holds the task load that was moved. The
1254 * effective (group) weight moved would be:
1255 * load_moved_eff = load_moved/task_load * group_weight;
1256 */
1257 load_moved = (group_weight * load_moved) / task_load;
1258
1259 /* Adjust shares on both cpus to reflect load_moved */
1260 group_weight -= load_moved;
1261 set_se_shares(se, group_weight);
1262
1263 se = busy_cfs_rq->tg->se[this_cpu];
1264 if (!thisload)
1265 group_weight = load_moved;
1266 else
1267 group_weight = se->load.weight + load_moved;
1268 set_se_shares(se, group_weight);
1269#endif
1270
1271 rem_load_move -= load_moved;
1272
1010 if (rem_load_move <= 0) 1273 if (rem_load_move <= 0)
1011 break; 1274 break;
1012 } 1275 }
@@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1042/* 1305/*
1043 * scheduler tick hitting a task of our scheduling class: 1306 * scheduler tick hitting a task of our scheduling class:
1044 */ 1307 */
1045static void task_tick_fair(struct rq *rq, struct task_struct *curr) 1308static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1046{ 1309{
1047 struct cfs_rq *cfs_rq; 1310 struct cfs_rq *cfs_rq;
1048 struct sched_entity *se = &curr->se; 1311 struct sched_entity *se = &curr->se;
1049 1312
1050 for_each_sched_entity(se) { 1313 for_each_sched_entity(se) {
1051 cfs_rq = cfs_rq_of(se); 1314 cfs_rq = cfs_rq_of(se);
1052 entity_tick(cfs_rq, se); 1315 entity_tick(cfs_rq, se, queued);
1053 } 1316 }
1054} 1317}
1055 1318
@@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1087 resched_task(rq->curr); 1350 resched_task(rq->curr);
1088} 1351}
1089 1352
1353/*
1354 * Priority of the task has changed. Check to see if we preempt
1355 * the current task.
1356 */
1357static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1358 int oldprio, int running)
1359{
1360 /*
1361 * Reschedule if we are currently running on this runqueue and
1362 * our priority decreased, or if we are not currently running on
1363 * this runqueue and our priority is higher than the current's
1364 */
1365 if (running) {
1366 if (p->prio > oldprio)
1367 resched_task(rq->curr);
1368 } else
1369 check_preempt_curr(rq, p);
1370}
1371
1372/*
1373 * We switched to the sched_fair class.
1374 */
1375static void switched_to_fair(struct rq *rq, struct task_struct *p,
1376 int running)
1377{
1378 /*
1379 * We were most likely switched from sched_rt, so
1380 * kick off the schedule if running, otherwise just see
1381 * if we can still preempt the current task.
1382 */
1383 if (running)
1384 resched_task(rq->curr);
1385 else
1386 check_preempt_curr(rq, p);
1387}
1388
1090/* Account for a task changing its policy or group. 1389/* Account for a task changing its policy or group.
1091 * 1390 *
1092 * This routine is mostly called to set cfs_rq->curr field when a task 1391 * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = {
1108 .enqueue_task = enqueue_task_fair, 1407 .enqueue_task = enqueue_task_fair,
1109 .dequeue_task = dequeue_task_fair, 1408 .dequeue_task = dequeue_task_fair,
1110 .yield_task = yield_task_fair, 1409 .yield_task = yield_task_fair,
1410#ifdef CONFIG_SMP
1411 .select_task_rq = select_task_rq_fair,
1412#endif /* CONFIG_SMP */
1111 1413
1112 .check_preempt_curr = check_preempt_wakeup, 1414 .check_preempt_curr = check_preempt_wakeup,
1113 1415
@@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = {
1122 .set_curr_task = set_curr_task_fair, 1424 .set_curr_task = set_curr_task_fair,
1123 .task_tick = task_tick_fair, 1425 .task_tick = task_tick_fair,
1124 .task_new = task_new_fair, 1426 .task_new = task_new_fair,
1427
1428 .prio_changed = prio_changed_fair,
1429 .switched_to = switched_to_fair,
1125}; 1430};
1126 1431
1127#ifdef CONFIG_SCHED_DEBUG 1432#ifdef CONFIG_SCHED_DEBUG
@@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1132#ifdef CONFIG_FAIR_GROUP_SCHED 1437#ifdef CONFIG_FAIR_GROUP_SCHED
1133 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); 1438 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1134#endif 1439#endif
1440 rcu_read_lock();
1135 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1441 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1136 print_cfs_rq(m, cpu, cfs_rq); 1442 print_cfs_rq(m, cpu, cfs_rq);
1443 rcu_read_unlock();
1137} 1444}
1138#endif 1445#endif
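
The FAIR_GROUP_SCHED branch of load_balance_fair() above scales the task load to pull by the ratio of the group's queued task load to its group weight, and skips groups whose movable share falls below GROUP_IMBALANCE_PCT of a single task's load. A worked sketch of that arithmetic with hypothetical weights (not values from the patch):

#include <stdio.h>

int main(void)
{
        /* Hypothetical group load-balancing figures, in load-weight units. */
        unsigned long rem_load_move = 512;      /* weight still to be moved  */
        unsigned long task_load     = 3072;     /* busy_cfs_rq->load.weight  */
        unsigned long group_weight  = 1024;     /* group se->load.weight     */
        unsigned long nr_running    = 3;        /* tasks on busy_cfs_rq      */
        unsigned long imbalance_pct = 20;       /* GROUP_IMBALANCE_PCT       */

        unsigned long maxload = (rem_load_move * task_load) / group_weight;    /* 1536 */
        unsigned long per_task_load = task_load / nr_running;                  /* 1024 */

        if (!maxload || 100 * maxload < imbalance_pct * per_task_load)
                printf("movable share too small: skip this cfs_rq\n");
        else
                printf("try to move up to %lu of task load\n", maxload);
        return 0;
}
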
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bf9c25c15b8b..2bcafa375633 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -5,6 +5,12 @@
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync)
10{
11 return task_cpu(p); /* IDLE tasks are never migrated */
12}
13#endif /* CONFIG_SMP */
8/* 14/*
9 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
10 */ 16 */
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
55} 61}
56#endif 62#endif
57 63
58static void task_tick_idle(struct rq *rq, struct task_struct *curr) 64static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
59{ 65{
60} 66}
61 67
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq)
63{ 69{
64} 70}
65 71
72static void switched_to_idle(struct rq *rq, struct task_struct *p,
73 int running)
74{
75 /* Can this actually happen?? */
76 if (running)
77 resched_task(rq->curr);
78 else
79 check_preempt_curr(rq, p);
80}
81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
83 int oldprio, int running)
84{
85 /* This can happen for hot plug CPUs */
86
87 /*
88 * Reschedule if we are currently running on this runqueue and
89 * our priority decreased, or if we are not currently running on
90 * this runqueue and our priority is higher than the current's
91 */
92 if (running) {
93 if (p->prio > oldprio)
94 resched_task(rq->curr);
95 } else
96 check_preempt_curr(rq, p);
97}
98
66/* 99/*
67 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
68 */ 101 */
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = {
72 105
73 /* dequeue is not valid, we print a debug message there: */ 106 /* dequeue is not valid, we print a debug message there: */
74 .dequeue_task = dequeue_task_idle, 107 .dequeue_task = dequeue_task_idle,
108#ifdef CONFIG_SMP
109 .select_task_rq = select_task_rq_idle,
110#endif /* CONFIG_SMP */
75 111
76 .check_preempt_curr = check_preempt_curr_idle, 112 .check_preempt_curr = check_preempt_curr_idle,
77 113
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = {
85 121
86 .set_curr_task = set_curr_task_idle, 122 .set_curr_task = set_curr_task_idle,
87 .task_tick = task_tick_idle, 123 .task_tick = task_tick_idle,
124
125 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle,
127
88 /* no .task_new for idle tasks */ 128 /* no .task_new for idle tasks */
89}; 129};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9ba3daa03475..274b40d7bef2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,217 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_SMP
7
8static inline int rt_overloaded(struct rq *rq)
9{
10 return atomic_read(&rq->rd->rto_count);
11}
12
13static inline void rt_set_overload(struct rq *rq)
14{
15 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /*
17 * Make sure the mask is visible before we set
18 * the overload count. That is checked to determine
19 * if we should look at the mask. It would be a shame
20 * if we looked at the mask, but the mask was not
21 * updated yet.
22 */
23 wmb();
24 atomic_inc(&rq->rd->rto_count);
25}
26
27static inline void rt_clear_overload(struct rq *rq)
28{
29 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask);
32}
33
34static void update_rt_migration(struct rq *rq)
35{
36 if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
37 if (!rq->rt.overloaded) {
38 rt_set_overload(rq);
39 rq->rt.overloaded = 1;
40 }
41 } else if (rq->rt.overloaded) {
42 rt_clear_overload(rq);
43 rq->rt.overloaded = 0;
44 }
45}
46#endif /* CONFIG_SMP */
47
48static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
49{
50 return container_of(rt_se, struct task_struct, rt);
51}
52
53static inline int on_rt_rq(struct sched_rt_entity *rt_se)
54{
55 return !list_empty(&rt_se->run_list);
56}
57
58#ifdef CONFIG_FAIR_GROUP_SCHED
59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
61{
62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC;
64
65 return rt_rq->tg->rt_ratio;
66}
67
68#define for_each_leaf_rt_rq(rt_rq, rq) \
69 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
70
71static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
72{
73 return rt_rq->rq;
74}
75
76static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
77{
78 return rt_se->rt_rq;
79}
80
81#define for_each_sched_rt_entity(rt_se) \
82 for (; rt_se; rt_se = rt_se->parent)
83
84static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
85{
86 return rt_se->my_q;
87}
88
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95
96 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
97 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
98
99 enqueue_rt_entity(rt_se);
100 if (rt_rq->highest_prio < curr->prio)
101 resched_task(curr);
102 }
103}
104
105static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
106{
107 struct sched_rt_entity *rt_se = rt_rq->rt_se;
108
109 if (rt_se && on_rt_rq(rt_se))
110 dequeue_rt_entity(rt_se);
111}
112
113#else
114
115static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
116{
117 return sysctl_sched_rt_ratio;
118}
119
120#define for_each_leaf_rt_rq(rt_rq, rq) \
121 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
122
123static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
124{
125 return container_of(rt_rq, struct rq, rt);
126}
127
128static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
129{
130 struct task_struct *p = rt_task_of(rt_se);
131 struct rq *rq = task_rq(p);
132
133 return &rq->rt;
134}
135
136#define for_each_sched_rt_entity(rt_se) \
137 for (; rt_se; rt_se = NULL)
138
139static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
140{
141 return NULL;
142}
143
144static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
145{
146}
147
148static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
149{
150}
151
152#endif
153
154static inline int rt_se_prio(struct sched_rt_entity *rt_se)
155{
156#ifdef CONFIG_FAIR_GROUP_SCHED
157 struct rt_rq *rt_rq = group_rt_rq(rt_se);
158
159 if (rt_rq)
160 return rt_rq->highest_prio;
161#endif
162
163 return rt_task_of(rt_se)->prio;
164}
165
166static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
167{
168 unsigned int rt_ratio = sched_rt_ratio(rt_rq);
169 u64 period, ratio;
170
171 if (rt_ratio == SCHED_RT_FRAC)
172 return 0;
173
174 if (rt_rq->rt_throttled)
175 return 1;
176
177 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
178 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
179
180 if (rt_rq->rt_time > ratio) {
181 struct rq *rq = rq_of_rt_rq(rt_rq);
182
183 rq->rt_throttled = 1;
184 rt_rq->rt_throttled = 1;
185
186 sched_rt_ratio_dequeue(rt_rq);
187 return 1;
188 }
189
190 return 0;
191}
192
193static void update_sched_rt_period(struct rq *rq)
194{
195 struct rt_rq *rt_rq;
196 u64 period;
197
198 while (rq->clock > rq->rt_period_expire) {
199 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
200 rq->rt_period_expire += period;
201
202 for_each_leaf_rt_rq(rt_rq, rq) {
203 unsigned long rt_ratio = sched_rt_ratio(rt_rq);
204 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
205
206 rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
207 if (rt_rq->rt_throttled) {
208 rt_rq->rt_throttled = 0;
209 sched_rt_ratio_enqueue(rt_rq);
210 }
211 }
212
213 rq->rt_throttled = 0;
214 }
215}
216
6/* 217/*
7 * Update the current task's runtime statistics. Skip current tasks that 218 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 219 * are not in our scheduling class.
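
sched_rt_ratio_exceeded() above turns the rt_ratio fraction into an absolute per-period budget before comparing it with the accumulated rt_time. A standalone sketch of that conversion; the values are hypothetical and SCHED_RT_FRAC_SHIFT is assumed to be 16 purely for illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical knobs; SCHED_RT_FRAC_SHIFT assumed to be 16 here. */
        uint64_t sched_rt_period_ms = 1000;             /* sysctl_sched_rt_period   */
        uint64_t rt_ratio           = 62259;            /* roughly 0.95 * (1 << 16) */
        uint64_t rt_time            = 970000000;        /* RT runtime this period   */

        uint64_t period = sched_rt_period_ms * 1000000ULL;      /* ms -> ns        */
        uint64_t ratio  = (period * rt_ratio) >> 16;            /* ~950 ms budget  */

        if (rt_time > ratio)
                printf("throttle: RT used %llu ns of a %llu ns budget\n",
                       (unsigned long long)rt_time, (unsigned long long)ratio);
        else
                printf("RT classes are within their budget\n");
        return 0;
}
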
@@ -10,6 +221,8 @@
10static void update_curr_rt(struct rq *rq) 221static void update_curr_rt(struct rq *rq)
11{ 222{
12 struct task_struct *curr = rq->curr; 223 struct task_struct *curr = rq->curr;
224 struct sched_rt_entity *rt_se = &curr->rt;
225 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
13 u64 delta_exec; 226 u64 delta_exec;
14 227
15 if (!task_has_rt_policy(curr)) 228 if (!task_has_rt_policy(curr))
@@ -24,47 +237,228 @@ static void update_curr_rt(struct rq *rq)
24 curr->se.sum_exec_runtime += delta_exec; 237 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = rq->clock; 238 curr->se.exec_start = rq->clock;
26 cpuacct_charge(curr, delta_exec); 239 cpuacct_charge(curr, delta_exec);
240
241 rt_rq->rt_time += delta_exec;
242 /*
243 * might make it a tad more accurate:
244 *
245 * update_sched_rt_period(rq);
246 */
247 if (sched_rt_ratio_exceeded(rt_rq))
248 resched_task(curr);
27} 249}
28 250
29static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 251static inline
252void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
253{
254 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
255 rt_rq->rt_nr_running++;
256#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
257 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
258 rt_rq->highest_prio = rt_se_prio(rt_se);
259#endif
260#ifdef CONFIG_SMP
261 if (rt_se->nr_cpus_allowed > 1) {
262 struct rq *rq = rq_of_rt_rq(rt_rq);
263 rq->rt.rt_nr_migratory++;
264 }
265
266 update_rt_migration(rq_of_rt_rq(rt_rq));
267#endif
268}
269
270static inline
271void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
272{
273 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
274 WARN_ON(!rt_rq->rt_nr_running);
275 rt_rq->rt_nr_running--;
276#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
277 if (rt_rq->rt_nr_running) {
278 struct rt_prio_array *array;
279
280 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
281 if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
282 /* recalculate */
283 array = &rt_rq->active;
284 rt_rq->highest_prio =
285 sched_find_first_bit(array->bitmap);
286 } /* otherwise leave rt_rq->highest_prio alone */
287 } else
288 rt_rq->highest_prio = MAX_RT_PRIO;
289#endif
290#ifdef CONFIG_SMP
291 if (rt_se->nr_cpus_allowed > 1) {
292 struct rq *rq = rq_of_rt_rq(rt_rq);
293 rq->rt.rt_nr_migratory--;
294 }
295
296 update_rt_migration(rq_of_rt_rq(rt_rq));
297#endif /* CONFIG_SMP */
298}
299
300static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
301{
302 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
303 struct rt_prio_array *array = &rt_rq->active;
304 struct rt_rq *group_rq = group_rt_rq(rt_se);
305
306 if (group_rq && group_rq->rt_throttled)
307 return;
308
309 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
310 __set_bit(rt_se_prio(rt_se), array->bitmap);
311
312 inc_rt_tasks(rt_se, rt_rq);
313}
314
315static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
30{ 316{
31 struct rt_prio_array *array = &rq->rt.active; 317 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
318 struct rt_prio_array *array = &rt_rq->active;
319
320 list_del_init(&rt_se->run_list);
321 if (list_empty(array->queue + rt_se_prio(rt_se)))
322 __clear_bit(rt_se_prio(rt_se), array->bitmap);
32 323
33 list_add_tail(&p->run_list, array->queue + p->prio); 324 dec_rt_tasks(rt_se, rt_rq);
34 __set_bit(p->prio, array->bitmap); 325}
326
327/*
328 * Because the prio of an upper entry depends on the lower
329 * entries, we must remove entries top - down.
330 *
331 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
332 * It doesn't matter much for now, as h=2 for GROUP_SCHED.
333 */
334static void dequeue_rt_stack(struct task_struct *p)
335{
336 struct sched_rt_entity *rt_se, *top_se;
337
338 /*
339 * dequeue all, top - down.
340 */
341 do {
342 rt_se = &p->rt;
343 top_se = NULL;
344 for_each_sched_rt_entity(rt_se) {
345 if (on_rt_rq(rt_se))
346 top_se = rt_se;
347 }
348 if (top_se)
349 dequeue_rt_entity(top_se);
350 } while (top_se);
35} 351}
36 352
37/* 353/*
38 * Adding/removing a task to/from a priority array: 354 * Adding/removing a task to/from a priority array:
39 */ 355 */
356static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
357{
358 struct sched_rt_entity *rt_se = &p->rt;
359
360 if (wakeup)
361 rt_se->timeout = 0;
362
363 dequeue_rt_stack(p);
364
365 /*
366 * enqueue everybody, bottom - up.
367 */
368 for_each_sched_rt_entity(rt_se)
369 enqueue_rt_entity(rt_se);
370
371 inc_cpu_load(rq, p->se.load.weight);
372}
373
40static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 374static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
41{ 375{
42 struct rt_prio_array *array = &rq->rt.active; 376 struct sched_rt_entity *rt_se = &p->rt;
377 struct rt_rq *rt_rq;
43 378
44 update_curr_rt(rq); 379 update_curr_rt(rq);
45 380
46 list_del(&p->run_list); 381 dequeue_rt_stack(p);
47 if (list_empty(array->queue + p->prio)) 382
48 __clear_bit(p->prio, array->bitmap); 383 /*
384 * re-enqueue all non-empty rt_rq entities.
385 */
386 for_each_sched_rt_entity(rt_se) {
387 rt_rq = group_rt_rq(rt_se);
388 if (rt_rq && rt_rq->rt_nr_running)
389 enqueue_rt_entity(rt_se);
390 }
391
392 dec_cpu_load(rq, p->se.load.weight);
49} 393}
50 394
51/* 395/*
52 * Put task to the end of the run list without the overhead of dequeue 396 * Put task to the end of the run list without the overhead of dequeue
53 * followed by enqueue. 397 * followed by enqueue.
54 */ 398 */
399static
400void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
401{
402 struct rt_prio_array *array = &rt_rq->active;
403
404 list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
405}
406
55static void requeue_task_rt(struct rq *rq, struct task_struct *p) 407static void requeue_task_rt(struct rq *rq, struct task_struct *p)
56{ 408{
57 struct rt_prio_array *array = &rq->rt.active; 409 struct sched_rt_entity *rt_se = &p->rt;
410 struct rt_rq *rt_rq;
58 411
59 list_move_tail(&p->run_list, array->queue + p->prio); 412 for_each_sched_rt_entity(rt_se) {
413 rt_rq = rt_rq_of_se(rt_se);
414 requeue_rt_entity(rt_rq, rt_se);
415 }
60} 416}
61 417
62static void 418static void yield_task_rt(struct rq *rq)
63yield_task_rt(struct rq *rq)
64{ 419{
65 requeue_task_rt(rq, rq->curr); 420 requeue_task_rt(rq, rq->curr);
66} 421}
67 422
423#ifdef CONFIG_SMP
424static int find_lowest_rq(struct task_struct *task);
425
426static int select_task_rq_rt(struct task_struct *p, int sync)
427{
428 struct rq *rq = task_rq(p);
429
430 /*
431 * If the current task is an RT task, then
432 * try to see if we can wake this RT task up on another
433 * runqueue. Otherwise simply start this RT task
434 * on its current runqueue.
435 *
436 * We want to avoid overloading runqueues, even if
437 * the RT task is of higher priority than the current RT task.
438 * RT tasks behave differently than other tasks. If
439 * one gets preempted, we try to push it off to another queue.
440 * So trying to keep a preempting RT task on the same
441 * cache hot CPU will force the running RT task to
442 * a cold CPU. So we waste all the cache for the lower
443 * RT task in hopes of saving some of a RT task
444 * that is just being woken and probably will have
445 * cold cache anyway.
446 */
447 if (unlikely(rt_task(rq->curr)) &&
448 (p->rt.nr_cpus_allowed > 1)) {
449 int cpu = find_lowest_rq(p);
450
451 return (cpu == -1) ? task_cpu(p) : cpu;
452 }
453
454 /*
455 * Otherwise, just let it ride on the affined RQ and the
456 * post-schedule router will push the preempted task away
457 */
458 return task_cpu(p);
459}
460#endif /* CONFIG_SMP */
461
68/* 462/*
69 * Preempt the current task with a newly woken task if needed: 463 * Preempt the current task with a newly woken task if needed:
70 */ 464 */
@@ -74,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
74 resched_task(rq->curr); 468 resched_task(rq->curr);
75} 469}
76 470
77static struct task_struct *pick_next_task_rt(struct rq *rq) 471static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
472 struct rt_rq *rt_rq)
78{ 473{
79 struct rt_prio_array *array = &rq->rt.active; 474 struct rt_prio_array *array = &rt_rq->active;
80 struct task_struct *next; 475 struct sched_rt_entity *next = NULL;
81 struct list_head *queue; 476 struct list_head *queue;
82 int idx; 477 int idx;
83 478
84 idx = sched_find_first_bit(array->bitmap); 479 idx = sched_find_first_bit(array->bitmap);
85 if (idx >= MAX_RT_PRIO) 480 BUG_ON(idx >= MAX_RT_PRIO);
86 return NULL;
87 481
88 queue = array->queue + idx; 482 queue = array->queue + idx;
89 next = list_entry(queue->next, struct task_struct, run_list); 483 next = list_entry(queue->next, struct sched_rt_entity, run_list);
90
91 next->se.exec_start = rq->clock;
92 484
93 return next; 485 return next;
94} 486}
95 487
488static struct task_struct *pick_next_task_rt(struct rq *rq)
489{
490 struct sched_rt_entity *rt_se;
491 struct task_struct *p;
492 struct rt_rq *rt_rq;
493
494 rt_rq = &rq->rt;
495
496 if (unlikely(!rt_rq->rt_nr_running))
497 return NULL;
498
499 if (sched_rt_ratio_exceeded(rt_rq))
500 return NULL;
501
502 do {
503 rt_se = pick_next_rt_entity(rq, rt_rq);
504 BUG_ON(!rt_se);
505 rt_rq = group_rt_rq(rt_se);
506 } while (rt_rq);
507
508 p = rt_task_of(rt_se);
509 p->se.exec_start = rq->clock;
510 return p;
511}
512
96static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 513static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
97{ 514{
98 update_curr_rt(rq); 515 update_curr_rt(rq);
@@ -100,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
100} 517}
101 518
102#ifdef CONFIG_SMP 519#ifdef CONFIG_SMP
103/* 520
104 * Load-balancing iterator. Note: while the runqueue stays locked 521/* Only try algorithms three times */
105 * during the whole iteration, the current task might be 522#define RT_MAX_TRIES 3
106 * dequeued so the iterator has to be dequeue-safe. Here we 523
107 * achieve that by always pre-iterating before returning 524static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
108 * the current task: 525static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
109 */ 526
110static struct task_struct *load_balance_start_rt(void *arg) 527static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
111{ 528{
112 struct rq *rq = arg; 529 if (!task_running(rq, p) &&
113 struct rt_prio_array *array = &rq->rt.active; 530 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
114 struct list_head *head, *curr; 531 (p->rt.nr_cpus_allowed > 1))
115 struct task_struct *p; 532 return 1;
533 return 0;
534}
535
536/* Return the second highest RT task, NULL otherwise */
537static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
538{
539 struct task_struct *next = NULL;
540 struct sched_rt_entity *rt_se;
541 struct rt_prio_array *array;
542 struct rt_rq *rt_rq;
116 int idx; 543 int idx;
117 544
118 idx = sched_find_first_bit(array->bitmap); 545 for_each_leaf_rt_rq(rt_rq, rq) {
119 if (idx >= MAX_RT_PRIO) 546 array = &rt_rq->active;
120 return NULL; 547 idx = sched_find_first_bit(array->bitmap);
548 next_idx:
549 if (idx >= MAX_RT_PRIO)
550 continue;
551 if (next && next->prio < idx)
552 continue;
553 list_for_each_entry(rt_se, array->queue + idx, run_list) {
554 struct task_struct *p = rt_task_of(rt_se);
555 if (pick_rt_task(rq, p, cpu)) {
556 next = p;
557 break;
558 }
559 }
560 if (!next) {
561 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
562 goto next_idx;
563 }
564 }
121 565
122 head = array->queue + idx; 566 return next;
123 curr = head->prev; 567}
124 568
125 p = list_entry(curr, struct task_struct, run_list); 569static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
126 570
127 curr = curr->prev; 571static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
572{
573 int lowest_prio = -1;
574 int lowest_cpu = -1;
575 int count = 0;
576 int cpu;
128 577
129 rq->rt.rt_load_balance_idx = idx; 578 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr;
132 579
133 return p; 580 /*
581 * Scan each rq for the lowest prio.
582 */
583 for_each_cpu_mask(cpu, *lowest_mask) {
584 struct rq *rq = cpu_rq(cpu);
585
586 /* We look for lowest RT prio or non-rt CPU */
587 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
588 /*
589 * if we already found a low RT queue
590 * and now we found this non-rt queue
591 * clear the mask and set our bit.
592 * Otherwise just return the queue as is
593 * and the count==1 will cause the algorithm
594 * to use the first bit found.
595 */
596 if (lowest_cpu != -1) {
597 cpus_clear(*lowest_mask);
598 cpu_set(rq->cpu, *lowest_mask);
599 }
600 return 1;
601 }
602
603 /* no locking for now */
604 if ((rq->rt.highest_prio > task->prio)
605 && (rq->rt.highest_prio >= lowest_prio)) {
606 if (rq->rt.highest_prio > lowest_prio) {
607 /* new low - clear old data */
608 lowest_prio = rq->rt.highest_prio;
609 lowest_cpu = cpu;
610 count = 0;
611 }
612 count++;
613 } else
614 cpu_clear(cpu, *lowest_mask);
615 }
616
617 /*
618 * Clear out all the set bits that represent
619 * runqueues that were of higher prio than
620 * the lowest_prio.
621 */
622 if (lowest_cpu > 0) {
623 /*
624 * Perhaps we could add another cpumask op to
625 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
626 * Then that could be optimized to use memset and such.
627 */
628 for_each_cpu_mask(cpu, *lowest_mask) {
629 if (cpu >= lowest_cpu)
630 break;
631 cpu_clear(cpu, *lowest_mask);
632 }
633 }
634
635 return count;
134} 636}
135 637
136static struct task_struct *load_balance_next_rt(void *arg) 638static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
137{ 639{
138 struct rq *rq = arg; 640 int first;
139 struct rt_prio_array *array = &rq->rt.active; 641
140 struct list_head *head, *curr; 642 /* "this_cpu" is cheaper to preempt than a remote processor */
141 struct task_struct *p; 643 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
142 int idx; 644 return this_cpu;
645
646 first = first_cpu(*mask);
647 if (first != NR_CPUS)
648 return first;
649
650 return -1;
651}
652
653static int find_lowest_rq(struct task_struct *task)
654{
655 struct sched_domain *sd;
656 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
657 int this_cpu = smp_processor_id();
658 int cpu = task_cpu(task);
659 int count = find_lowest_cpus(task, lowest_mask);
143 660
144 idx = rq->rt.rt_load_balance_idx; 661 if (!count)
145 head = rq->rt.rt_load_balance_head; 662 return -1; /* No targets found */
146 curr = rq->rt.rt_load_balance_curr;
147 663
148 /* 664 /*
149 * If we arrived back to the head again then 665 * There is no sense in performing an optimal search if only one
150 * iterate to the next queue (if any): 666 * target is found.
151 */ 667 */
152 if (unlikely(head == curr)) { 668 if (count == 1)
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); 669 return first_cpu(*lowest_mask);
154 670
155 if (next_idx >= MAX_RT_PRIO) 671 /*
156 return NULL; 672 * At this point we have built a mask of cpus representing the
673 * lowest priority tasks in the system. Now we want to elect
674 * the best one based on our affinity and topology.
675 *
676 * We prioritize the last cpu that the task executed on since
677 * it is most likely cache-hot in that location.
678 */
679 if (cpu_isset(cpu, *lowest_mask))
680 return cpu;
681
682 /*
683 * Otherwise, we consult the sched_domains span maps to figure
684 * out which cpu is logically closest to our hot cache data.
685 */
686 if (this_cpu == cpu)
687 this_cpu = -1; /* Skip this_cpu opt if the same */
688
689 for_each_domain(cpu, sd) {
690 if (sd->flags & SD_WAKE_AFFINE) {
691 cpumask_t domain_mask;
692 int best_cpu;
157 693
158 idx = next_idx; 694 cpus_and(domain_mask, sd->span, *lowest_mask);
159 head = array->queue + idx;
160 curr = head->prev;
161 695
162 rq->rt.rt_load_balance_idx = idx; 696 best_cpu = pick_optimal_cpu(this_cpu,
163 rq->rt.rt_load_balance_head = head; 697 &domain_mask);
698 if (best_cpu != -1)
699 return best_cpu;
700 }
164 } 701 }
165 702
166 p = list_entry(curr, struct task_struct, run_list); 703 /*
704 * And finally, if there were no matches within the domains
705 * just give the caller *something* to work with from the compatible
706 * locations.
707 */
708 return pick_optimal_cpu(this_cpu, lowest_mask);
709}
167 710
168 curr = curr->prev; 711/* Will lock the rq it finds */
712static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
713{
714 struct rq *lowest_rq = NULL;
715 int tries;
716 int cpu;
169 717
170 rq->rt.rt_load_balance_curr = curr; 718 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
719 cpu = find_lowest_rq(task);
171 720
172 return p; 721 if ((cpu == -1) || (cpu == rq->cpu))
722 break;
723
724 lowest_rq = cpu_rq(cpu);
725
726 /* if the prio of this runqueue changed, try again */
727 if (double_lock_balance(rq, lowest_rq)) {
728 /*
729 * We had to unlock the run queue. In
730 * the meantime, the task could have
731 * migrated already or had its affinity changed.
732 * Also make sure that it wasn't scheduled on its rq.
733 */
734 if (unlikely(task_rq(task) != rq ||
735 !cpu_isset(lowest_rq->cpu,
736 task->cpus_allowed) ||
737 task_running(rq, task) ||
738 !task->se.on_rq)) {
739
740 spin_unlock(&lowest_rq->lock);
741 lowest_rq = NULL;
742 break;
743 }
744 }
745
746 /* If this rq is still suitable use it. */
747 if (lowest_rq->rt.highest_prio > task->prio)
748 break;
749
750 /* try again */
751 spin_unlock(&lowest_rq->lock);
752 lowest_rq = NULL;
753 }
754
755 return lowest_rq;
756}
757
758/*
759 * If the current CPU has more than one RT task, see if the non
760 * running task can migrate over to a CPU that is running a task
761 * of lesser priority.
762 */
763static int push_rt_task(struct rq *rq)
764{
765 struct task_struct *next_task;
766 struct rq *lowest_rq;
767 int ret = 0;
768 int paranoid = RT_MAX_TRIES;
769
770 if (!rq->rt.overloaded)
771 return 0;
772
773 next_task = pick_next_highest_task_rt(rq, -1);
774 if (!next_task)
775 return 0;
776
777 retry:
778 if (unlikely(next_task == rq->curr)) {
779 WARN_ON(1);
780 return 0;
781 }
782
783 /*
784 * It's possible that the next_task slipped in of
785 * higher priority than current. If that's the case
786 * just reschedule current.
787 */
788 if (unlikely(next_task->prio < rq->curr->prio)) {
789 resched_task(rq->curr);
790 return 0;
791 }
792
793 /* We might release rq lock */
794 get_task_struct(next_task);
795
796 /* find_lock_lowest_rq locks the rq if found */
797 lowest_rq = find_lock_lowest_rq(next_task, rq);
798 if (!lowest_rq) {
799 struct task_struct *task;
800 /*
801 * find_lock_lowest_rq releases rq->lock
802 * so it is possible that next_task has changed.
803 * If it has, then try again.
804 */
805 task = pick_next_highest_task_rt(rq, -1);
806 if (unlikely(task != next_task) && task && paranoid--) {
807 put_task_struct(next_task);
808 next_task = task;
809 goto retry;
810 }
811 goto out;
812 }
813
814 deactivate_task(rq, next_task, 0);
815 set_task_cpu(next_task, lowest_rq->cpu);
816 activate_task(lowest_rq, next_task, 0);
817
818 resched_task(lowest_rq->curr);
819
820 spin_unlock(&lowest_rq->lock);
821
822 ret = 1;
823out:
824 put_task_struct(next_task);
825
826 return ret;
827}
828
829/*
830 * TODO: Currently we just use the second highest prio task on
831 * the queue, and stop when it can't migrate (or there's
832 * no more RT tasks). There may be a case where a lower
833 * priority RT task has a different affinity than the
834 * higher RT task. In this case the lower RT task could
835 * possibly be able to migrate whereas the higher priority
836 * RT task could not. We currently ignore this issue.
837 * Enhancements are welcome!
838 */
839static void push_rt_tasks(struct rq *rq)
840{
841 /* push_rt_task will return true if it moved an RT */
842 while (push_rt_task(rq))
843 ;
844}
845
846static int pull_rt_task(struct rq *this_rq)
847{
848 int this_cpu = this_rq->cpu, ret = 0, cpu;
849 struct task_struct *p, *next;
850 struct rq *src_rq;
851
852 if (likely(!rt_overloaded(this_rq)))
853 return 0;
854
855 next = pick_next_task_rt(this_rq);
856
857 for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
858 if (this_cpu == cpu)
859 continue;
860
861 src_rq = cpu_rq(cpu);
862 /*
863 * We can potentially drop this_rq's lock in
864 * double_lock_balance, and another CPU could
865 * steal our next task - hence we must cause
866 * the caller to recalculate the next task
867 * in that case:
868 */
869 if (double_lock_balance(this_rq, src_rq)) {
870 struct task_struct *old_next = next;
871
872 next = pick_next_task_rt(this_rq);
873 if (next != old_next)
874 ret = 1;
875 }
876
877 /*
878 * Are there still pullable RT tasks?
879 */
880 if (src_rq->rt.rt_nr_running <= 1)
881 goto skip;
882
883 p = pick_next_highest_task_rt(src_rq, this_cpu);
884
885 /*
886 * Do we have an RT task that preempts
887 * the to-be-scheduled task?
888 */
889 if (p && (!next || (p->prio < next->prio))) {
890 WARN_ON(p == src_rq->curr);
891 WARN_ON(!p->se.on_rq);
892
893 /*
894 * There's a chance that p is higher in priority
895 * than what's currently running on its cpu.
896 * This is just that p is waking up and hasn't
897 * had a chance to schedule. We only pull
898 * p if it is lower in priority than the
899 * current task on the run queue or
900 * this_rq next task is lower in prio than
901 * the current task on that rq.
902 */
903 if (p->prio < src_rq->curr->prio ||
904 (next && next->prio < src_rq->curr->prio))
905 goto skip;
906
907 ret = 1;
908
909 deactivate_task(src_rq, p, 0);
910 set_task_cpu(p, this_cpu);
911 activate_task(this_rq, p, 0);
912 /*
913 * We continue with the search, just in
914 * case there's an even higher prio task
915 * in another runqueue. (low likelihood
916 * but possible)
917 *
918 * Update next so that we won't pick a task
919 * on another cpu with a priority lower (or equal)
920 * than the one we just picked.
921 */
922 next = p;
923
924 }
925 skip:
926 spin_unlock(&src_rq->lock);
927 }
928
929 return ret;
930}
931
932static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
933{
934 /* Try to pull RT tasks here if we lower this rq's prio */
935 if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
936 pull_rt_task(rq);
937}
938
939static void post_schedule_rt(struct rq *rq)
940{
941 /*
942 * If we have more than one rt_task queued, then
943 * see if we can push the other rt_tasks off to other CPUs.
944 * Note we may release the rq lock, and since
945 * the lock was owned by prev, we need to release it
946 * first via finish_lock_switch and then reacquire it here.
947 */
948 if (unlikely(rq->rt.overloaded)) {
949 spin_lock_irq(&rq->lock);
950 push_rt_tasks(rq);
951 spin_unlock_irq(&rq->lock);
952 }
953}
954
955
956static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
957{
958 if (!task_running(rq, p) &&
959 (p->prio >= rq->rt.highest_prio) &&
960 rq->rt.overloaded)
961 push_rt_tasks(rq);
173} 962}
174 963
175static unsigned long 964static unsigned long
@@ -178,38 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
178 struct sched_domain *sd, enum cpu_idle_type idle, 967 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, int *this_best_prio) 968 int *all_pinned, int *this_best_prio)
180{ 969{
181 struct rq_iterator rt_rq_iterator; 970 /* don't touch RT tasks */
182 971 return 0;
183 rt_rq_iterator.start = load_balance_start_rt;
184 rt_rq_iterator.next = load_balance_next_rt;
185 /* pass 'busiest' rq argument into
186 * load_balance_[start|next]_rt iterators
187 */
188 rt_rq_iterator.arg = busiest;
189
190 return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
191 idle, all_pinned, this_best_prio, &rt_rq_iterator);
192} 972}
193 973
194static int 974static int
195move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 975move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
196 struct sched_domain *sd, enum cpu_idle_type idle) 976 struct sched_domain *sd, enum cpu_idle_type idle)
197{ 977{
198 struct rq_iterator rt_rq_iterator; 978 /* don't touch RT tasks */
979 return 0;
980}
981
982static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
983{
984 int weight = cpus_weight(*new_mask);
985
986 BUG_ON(!rt_task(p));
199 987
200 rt_rq_iterator.start = load_balance_start_rt; 988 /*
201 rt_rq_iterator.next = load_balance_next_rt; 989 * Update the migration status of the RQ if we have an RT task
202 rt_rq_iterator.arg = busiest; 990 * which is running AND changing its weight value.
991 */
992 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
993 struct rq *rq = task_rq(p);
994
995 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
996 rq->rt.rt_nr_migratory++;
997 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
998 BUG_ON(!rq->rt.rt_nr_migratory);
999 rq->rt.rt_nr_migratory--;
1000 }
1001
1002 update_rt_migration(rq);
1003 }
203 1004
204 return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 1005 p->cpus_allowed = *new_mask;
205 &rt_rq_iterator); 1006 p->rt.nr_cpus_allowed = weight;
206} 1007}
207#endif
208 1008
209static void task_tick_rt(struct rq *rq, struct task_struct *p) 1009/* Assumes rq->lock is held */
1010static void join_domain_rt(struct rq *rq)
1011{
1012 if (rq->rt.overloaded)
1013 rt_set_overload(rq);
1014}
1015
1016/* Assumes rq->lock is held */
1017static void leave_domain_rt(struct rq *rq)
1018{
1019 if (rq->rt.overloaded)
1020 rt_clear_overload(rq);
1021}
1022
1023/*
1024 * When switching from the rt queue, we bring ourselves to a position
1025 * where we might want to pull RT tasks from other runqueues.
1026 */
1027static void switched_from_rt(struct rq *rq, struct task_struct *p,
1028 int running)
1029{
1030 /*
1031 * If there are other RT tasks then we will reschedule
1032 * and the scheduling of the other RT tasks will handle
1033 * the balancing. But if we are the last RT task
1034 * we may need to handle the pulling of RT tasks
1035 * now.
1036 */
1037 if (!rq->rt.rt_nr_running)
1038 pull_rt_task(rq);
1039}
1040#endif /* CONFIG_SMP */
1041
1042/*
1043 * When switching a task to RT, we may overload the runqueue
1044 * with RT tasks. In this case we try to push them off to
1045 * other runqueues.
1046 */
1047static void switched_to_rt(struct rq *rq, struct task_struct *p,
1048 int running)
1049{
1050 int check_resched = 1;
1051
1052 /*
1053 * If we are already running, then there's nothing
1054 * that needs to be done. But if we are not running
1055 * we may need to preempt the current running task.
1056 * If that current running task is also an RT task
1057 * then see if we can move to another run queue.
1058 */
1059 if (!running) {
1060#ifdef CONFIG_SMP
1061 if (rq->rt.overloaded && push_rt_task(rq) &&
1062 /* Don't resched if we changed runqueues */
1063 rq != task_rq(p))
1064 check_resched = 0;
1065#endif /* CONFIG_SMP */
1066 if (check_resched && p->prio < rq->curr->prio)
1067 resched_task(rq->curr);
1068 }
1069}
1070
1071/*
1072 * Priority of the task has changed. This may cause
1073 * us to initiate a push or pull.
1074 */
1075static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1076 int oldprio, int running)
1077{
1078 if (running) {
1079#ifdef CONFIG_SMP
1080 /*
1081 * If our priority decreases while running, we
1082 * may need to pull tasks to this runqueue.
1083 */
1084 if (oldprio < p->prio)
1085 pull_rt_task(rq);
1086 /*
1087 * If there's a higher priority task waiting to run
1088 * then reschedule.
1089 */
1090 if (p->prio > rq->rt.highest_prio)
1091 resched_task(p);
1092#else
1093 /* For UP simply resched on drop of prio */
1094 if (oldprio < p->prio)
1095 resched_task(p);
1096#endif /* CONFIG_SMP */
1097 } else {
1098 /*
1099 * This task is not running, but if its priority is
1100 * higher than that of the current running task
1101 * then reschedule.
1102 */
1103 if (p->prio < rq->curr->prio)
1104 resched_task(rq->curr);
1105 }
1106}
1107
1108static void watchdog(struct rq *rq, struct task_struct *p)
1109{
1110 unsigned long soft, hard;
1111
1112 if (!p->signal)
1113 return;
1114
1115 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1116 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1117
1118 if (soft != RLIM_INFINITY) {
1119 unsigned long next;
1120
1121 p->rt.timeout++;
1122 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1123 if (p->rt.timeout > next)
1124 p->it_sched_expires = p->se.sum_exec_runtime;
1125 }
1126}
1127
1128static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
210{ 1129{
211 update_curr_rt(rq); 1130 update_curr_rt(rq);
212 1131
1132 watchdog(rq, p);
1133
213 /* 1134 /*
214 * RR tasks need a special form of timeslice management. 1135 * RR tasks need a special form of timeslice management.
215 * FIFO tasks have no timeslices. 1136 * FIFO tasks have no timeslices.
@@ -217,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
217 if (p->policy != SCHED_RR) 1138 if (p->policy != SCHED_RR)
218 return; 1139 return;
219 1140
220 if (--p->time_slice) 1141 if (--p->rt.time_slice)
221 return; 1142 return;
222 1143
223 p->time_slice = DEF_TIMESLICE; 1144 p->rt.time_slice = DEF_TIMESLICE;
224 1145
225 /* 1146 /*
226 * Requeue to the end of queue if we are not the only element 1147 * Requeue to the end of queue if we are not the only element
227 * on the queue: 1148 * on the queue:
228 */ 1149 */
229 if (p->run_list.prev != p->run_list.next) { 1150 if (p->rt.run_list.prev != p->rt.run_list.next) {
230 requeue_task_rt(rq, p); 1151 requeue_task_rt(rq, p);
231 set_tsk_need_resched(p); 1152 set_tsk_need_resched(p);
232 } 1153 }
@@ -244,6 +1165,9 @@ const struct sched_class rt_sched_class = {
244 .enqueue_task = enqueue_task_rt, 1165 .enqueue_task = enqueue_task_rt,
245 .dequeue_task = dequeue_task_rt, 1166 .dequeue_task = dequeue_task_rt,
246 .yield_task = yield_task_rt, 1167 .yield_task = yield_task_rt,
1168#ifdef CONFIG_SMP
1169 .select_task_rq = select_task_rq_rt,
1170#endif /* CONFIG_SMP */
247 1171
248 .check_preempt_curr = check_preempt_curr_rt, 1172 .check_preempt_curr = check_preempt_curr_rt,
249 1173
@@ -253,8 +1177,18 @@ const struct sched_class rt_sched_class = {
253#ifdef CONFIG_SMP 1177#ifdef CONFIG_SMP
254 .load_balance = load_balance_rt, 1178 .load_balance = load_balance_rt,
255 .move_one_task = move_one_task_rt, 1179 .move_one_task = move_one_task_rt,
1180 .set_cpus_allowed = set_cpus_allowed_rt,
1181 .join_domain = join_domain_rt,
1182 .leave_domain = leave_domain_rt,
1183 .pre_schedule = pre_schedule_rt,
1184 .post_schedule = post_schedule_rt,
1185 .task_wake_up = task_wake_up_rt,
1186 .switched_from = switched_from_rt,
256#endif 1187#endif
257 1188
258 .set_curr_task = set_curr_task_rt, 1189 .set_curr_task = set_curr_task_rt,
259 .task_tick = task_tick_rt, 1190 .task_tick = task_tick_rt,
1191
1192 .prio_changed = prio_changed_rt,
1193 .switched_to = switched_to_rt,
260}; 1194};
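Aside from the push/pull balancing callbacks, the sched_rt.c hunks above add a per-tick watchdog() that enforces RLIMIT_RTTIME for SCHED_FIFO/SCHED_RR tasks by converting the microsecond rlimit into a tick count. The standalone sketch below is not part of the patch; HZ and the limit values are made up for illustration.

#include <stdio.h>

#define HZ            250                     /* assumed tick rate            */
#define USEC_PER_SEC  1000000UL
#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long soft = 950000;          /* hypothetical soft limit (us) */
        unsigned long hard = 1000000;         /* hypothetical hard limit (us) */
        unsigned long limit = soft < hard ? soft : hard;
        unsigned long ticks = DIV_ROUND_UP(limit, USEC_PER_SEC / HZ);

        /*
         * In the patch, p->rt.timeout is incremented once per tick while the
         * task keeps running; once it exceeds 'ticks', it_sched_expires is
         * set so the CPU-time limit code notices the overrun.
         */
        printf("RLIMIT_RTTIME of %lu us -> watchdog fires after %lu ticks\n",
               limit, ticks);
        return 0;
}

With HZ=250 each tick is 4000 us, so the example prints 238 ticks for a 950000 us soft limit.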
diff --git a/kernel/signal.c b/kernel/signal.c
index afa4f781f924..bf49ce6f016b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
733 current->comm, task_pid_nr(current), signr); 733 current->comm, task_pid_nr(current), signr);
734 734
735#if defined(__i386__) && !defined(__arch_um__) 735#if defined(__i386__) && !defined(__arch_um__)
736 printk("code at %08lx: ", regs->eip); 736 printk("code at %08lx: ", regs->ip);
737 { 737 {
738 int i; 738 int i;
739 for (i = 0; i < 16; i++) { 739 for (i = 0; i < 16; i++) {
740 unsigned char insn; 740 unsigned char insn;
741 741
742 __get_user(insn, (unsigned char *)(regs->eip + i)); 742 __get_user(insn, (unsigned char *)(regs->ip + i));
743 printk("%02x ", insn); 743 printk("%02x ", insn);
744 } 744 }
745 } 745 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bd89bc4eb0b9..d7837d45419e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -3,7 +3,9 @@
3 * 3 *
4 * Copyright (C) 1992 Linus Torvalds 4 * Copyright (C) 1992 Linus Torvalds
5 * 5 *
6 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 6 * Distribute under GPLv2.
7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
7 */ 9 */
8 10
9#include <linux/module.h> 11#include <linux/module.h>
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void)
278 */ 280 */
279void irq_enter(void) 281void irq_enter(void)
280{ 282{
283#ifdef CONFIG_NO_HZ
284 int cpu = smp_processor_id();
285 if (idle_cpu(cpu) && !in_interrupt())
286 tick_nohz_stop_idle(cpu);
287#endif
281 __irq_enter(); 288 __irq_enter();
282#ifdef CONFIG_NO_HZ 289#ifdef CONFIG_NO_HZ
283 if (idle_cpu(smp_processor_id())) 290 if (idle_cpu(cpu))
284 tick_nohz_update_jiffies(); 291 tick_nohz_update_jiffies();
285#endif 292#endif
286} 293}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 11df812263c8..c1d76552446e 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/cpu.h> 10#include <linux/cpu.h>
11#include <linux/nmi.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/freezer.h> 14#include <linux/freezer.h>
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp);
23static DEFINE_PER_CPU(unsigned long, print_timestamp); 24static DEFINE_PER_CPU(unsigned long, print_timestamp);
24static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 25static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
25 26
26static int did_panic; 27static int __read_mostly did_panic;
27int softlockup_thresh = 10; 28unsigned long __read_mostly softlockup_thresh = 60;
28 29
29static int 30static int
30softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) 31softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
45 */ 46 */
46static unsigned long get_timestamp(int this_cpu) 47static unsigned long get_timestamp(int this_cpu)
47{ 48{
48 return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ 49 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
49} 50}
50 51
51void touch_softlockup_watchdog(void) 52void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)
100 101
101 now = get_timestamp(this_cpu); 102 now = get_timestamp(this_cpu);
102 103
103 /* Wake up the high-prio watchdog task every second: */ 104 /* Warn about unreasonable delays: */
104 if (now > (touch_timestamp + 1))
105 wake_up_process(per_cpu(watchdog_task, this_cpu));
106
107 /* Warn about unreasonable 10+ seconds delays: */
108 if (now <= (touch_timestamp + softlockup_thresh)) 105 if (now <= (touch_timestamp + softlockup_thresh))
109 return; 106 return;
110 107
@@ -122,11 +119,93 @@ void softlockup_tick(void)
122} 119}
123 120
124/* 121/*
122 * Have a reasonable limit on the number of tasks checked:
123 */
124unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
125
126/*
127 * Zero means infinite timeout - no checking done:
128 */
129unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
130
131unsigned long __read_mostly sysctl_hung_task_warnings = 10;
132
133/*
134 * Only do the hung-tasks check on one CPU:
135 */
136static int check_cpu __read_mostly = -1;
137
138static void check_hung_task(struct task_struct *t, unsigned long now)
139{
140 unsigned long switch_count = t->nvcsw + t->nivcsw;
141
142 if (t->flags & PF_FROZEN)
143 return;
144
145 if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
146 t->last_switch_count = switch_count;
147 t->last_switch_timestamp = now;
148 return;
149 }
150 if ((long)(now - t->last_switch_timestamp) <
151 sysctl_hung_task_timeout_secs)
152 return;
153 if (sysctl_hung_task_warnings < 0)
154 return;
155 sysctl_hung_task_warnings--;
156
157 /*
158 * Ok, the task did not get scheduled for more than 2 minutes,
159 * complain:
160 */
161 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
162 "%ld seconds.\n", t->comm, t->pid,
163 sysctl_hung_task_timeout_secs);
164 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
165 " disables this message.\n");
166 sched_show_task(t);
167 __debug_show_held_locks(t);
168
169 t->last_switch_timestamp = now;
170 touch_nmi_watchdog();
171}
172
173/*
174 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
175 * a really long time (120 seconds). If that happens, print out
176 * a warning.
177 */
178static void check_hung_uninterruptible_tasks(int this_cpu)
179{
180 int max_count = sysctl_hung_task_check_count;
181 unsigned long now = get_timestamp(this_cpu);
182 struct task_struct *g, *t;
183
184 /*
185 * If the system crashed already then all bets are off,
186 * do not report extra hung tasks:
187 */
188 if ((tainted & TAINT_DIE) || did_panic)
189 return;
190
191 read_lock(&tasklist_lock);
192 do_each_thread(g, t) {
193 if (!--max_count)
194 break;
195 if (t->state & TASK_UNINTERRUPTIBLE)
196 check_hung_task(t, now);
197 } while_each_thread(g, t);
198
199 read_unlock(&tasklist_lock);
200}
201
202/*
125 * The watchdog thread - runs every second and touches the timestamp. 203 * The watchdog thread - runs every second and touches the timestamp.
126 */ 204 */
127static int watchdog(void *__bind_cpu) 205static int watchdog(void *__bind_cpu)
128{ 206{
129 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 207 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
208 int this_cpu = (long)__bind_cpu;
130 209
131 sched_setscheduler(current, SCHED_FIFO, &param); 210 sched_setscheduler(current, SCHED_FIFO, &param);
132 211
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu)
135 214
136 /* 215 /*
137 * Run briefly once per second to reset the softlockup timestamp. 216 * Run briefly once per second to reset the softlockup timestamp.
138 * If this gets delayed for more than 10 seconds then the 217 * If this gets delayed for more than 60 seconds then the
139 * debug-printout triggers in softlockup_tick(). 218 * debug-printout triggers in softlockup_tick().
140 */ 219 */
141 while (!kthread_should_stop()) { 220 while (!kthread_should_stop()) {
142 set_current_state(TASK_INTERRUPTIBLE);
143 touch_softlockup_watchdog(); 221 touch_softlockup_watchdog();
144 schedule(); 222 msleep_interruptible(10000);
223
224 if (this_cpu != check_cpu)
225 continue;
226
227 if (sysctl_hung_task_timeout_secs)
228 check_hung_uninterruptible_tasks(this_cpu);
145 } 229 }
146 230
147 return 0; 231 return 0;
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
171 break; 255 break;
172 case CPU_ONLINE: 256 case CPU_ONLINE:
173 case CPU_ONLINE_FROZEN: 257 case CPU_ONLINE_FROZEN:
258 check_cpu = any_online_cpu(cpu_online_map);
174 wake_up_process(per_cpu(watchdog_task, hotcpu)); 259 wake_up_process(per_cpu(watchdog_task, hotcpu));
175 break; 260 break;
176#ifdef CONFIG_HOTPLUG_CPU 261#ifdef CONFIG_HOTPLUG_CPU
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
181 /* Unbind so it can run. Fall thru. */ 266 /* Unbind so it can run. Fall thru. */
182 kthread_bind(per_cpu(watchdog_task, hotcpu), 267 kthread_bind(per_cpu(watchdog_task, hotcpu),
183 any_online_cpu(cpu_online_map)); 268 any_online_cpu(cpu_online_map));
269 case CPU_DOWN_PREPARE:
270 case CPU_DOWN_PREPARE_FROZEN:
271 if (hotcpu == check_cpu) {
272 cpumask_t temp_cpu_online_map = cpu_online_map;
273
274 cpu_clear(hotcpu, temp_cpu_online_map);
275 check_cpu = any_online_cpu(temp_cpu_online_map);
276 }
277 break;
184 case CPU_DEAD: 278 case CPU_DEAD:
185 case CPU_DEAD_FROZEN: 279 case CPU_DEAD_FROZEN:
186 p = per_cpu(watchdog_task, hotcpu); 280 p = per_cpu(watchdog_task, hotcpu);
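The new hung-task detector above samples the voluntary plus involuntary context-switch counters of every TASK_UNINTERRUPTIBLE task once per watchdog pass and complains when they have not moved for hung_task_timeout_secs. A small userspace model of that predicate (not kernel code; plain seconds stand in for cpu_clock() timestamps):

#include <stdio.h>

struct fake_task {
        unsigned long nvcsw, nivcsw;          /* context-switch counters   */
        unsigned long last_switch_count;      /* snapshot from last check  */
        unsigned long last_switch_timestamp;  /* seconds                   */
};

static unsigned long hung_task_timeout_secs = 120;

static int task_is_hung(struct fake_task *t, unsigned long now)
{
        unsigned long switch_count = t->nvcsw + t->nivcsw;

        if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
                /* It ran since the last pass (or was never sampled). */
                t->last_switch_count = switch_count;
                t->last_switch_timestamp = now;
                return 0;
        }
        return (long)(now - t->last_switch_timestamp) >=
               (long)hung_task_timeout_secs;
}

int main(void)
{
        struct fake_task t = { .nvcsw = 10, .nivcsw = 2 };

        task_is_hung(&t, 100);                  /* first pass: take snapshot */
        printf("hung after 119s: %d\n", task_is_hung(&t, 219));  /* 0 */
        printf("hung after 121s: %d\n", task_is_hung(&t, 221));  /* 1 */
        return 0;
}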
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index cd72424c2662..ae28c8245123 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock);
65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
66 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 66 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
67 */ 67 */
68#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ 68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 defined(CONFIG_DEBUG_LOCK_ALLOC)
70 69
71void __lockfunc _read_lock(rwlock_t *lock) 70void __lockfunc _read_lock(rwlock_t *lock)
72{ 71{
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 319821ef78af..51b5ee53571a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
203 int ret; 203 int ret;
204 204
205 /* No CPUs can come up or down during this. */ 205 /* No CPUs can come up or down during this. */
206 lock_cpu_hotplug(); 206 get_online_cpus();
207 p = __stop_machine_run(fn, data, cpu); 207 p = __stop_machine_run(fn, data, cpu);
208 if (!IS_ERR(p)) 208 if (!IS_ERR(p))
209 ret = kthread_stop(p); 209 ret = kthread_stop(p);
210 else 210 else
211 ret = PTR_ERR(p); 211 ret = PTR_ERR(p);
212 unlock_cpu_hotplug(); 212 put_online_cpus();
213 213
214 return ret; 214 return ret;
215} 215}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c68f68dcc605..357b68ba23ec 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -53,6 +53,7 @@
53#ifdef CONFIG_X86 53#ifdef CONFIG_X86
54#include <asm/nmi.h> 54#include <asm/nmi.h>
55#include <asm/stacktrace.h> 55#include <asm/stacktrace.h>
56#include <asm/io.h>
56#endif 57#endif
57 58
58static int deprecated_sysctl_warning(struct __sysctl_args *args); 59static int deprecated_sysctl_warning(struct __sysctl_args *args);
@@ -81,6 +82,7 @@ extern int compat_log;
81extern int maps_protect; 82extern int maps_protect;
82extern int sysctl_stat_interval; 83extern int sysctl_stat_interval;
83extern int audit_argv_kb; 84extern int audit_argv_kb;
85extern int latencytop_enabled;
84 86
85/* Constants used for minimum and maximum */ 87/* Constants used for minimum and maximum */
86#ifdef CONFIG_DETECT_SOFTLOCKUP 88#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -156,8 +158,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
156#endif 158#endif
157 159
158static struct ctl_table root_table[]; 160static struct ctl_table root_table[];
159static struct ctl_table_header root_table_header = 161static struct ctl_table_root sysctl_table_root;
160 { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; 162static struct ctl_table_header root_table_header = {
163 .ctl_table = root_table,
164 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
165 .root = &sysctl_table_root,
166};
167static struct ctl_table_root sysctl_table_root = {
168 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
169 .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
170};
161 171
162static struct ctl_table kern_table[]; 172static struct ctl_table kern_table[];
163static struct ctl_table vm_table[]; 173static struct ctl_table vm_table[];
@@ -191,14 +201,6 @@ static struct ctl_table root_table[] = {
191 .mode = 0555, 201 .mode = 0555,
192 .child = vm_table, 202 .child = vm_table,
193 }, 203 },
194#ifdef CONFIG_NET
195 {
196 .ctl_name = CTL_NET,
197 .procname = "net",
198 .mode = 0555,
199 .child = net_table,
200 },
201#endif
202 { 204 {
203 .ctl_name = CTL_FS, 205 .ctl_name = CTL_FS,
204 .procname = "fs", 206 .procname = "fs",
@@ -306,9 +308,43 @@ static struct ctl_table kern_table[] = {
306 .procname = "sched_nr_migrate", 308 .procname = "sched_nr_migrate",
307 .data = &sysctl_sched_nr_migrate, 309 .data = &sysctl_sched_nr_migrate,
308 .maxlen = sizeof(unsigned int), 310 .maxlen = sizeof(unsigned int),
309 .mode = 644, 311 .mode = 0644,
312 .proc_handler = &proc_dointvec,
313 },
314 {
315 .ctl_name = CTL_UNNUMBERED,
316 .procname = "sched_rt_period_ms",
317 .data = &sysctl_sched_rt_period,
318 .maxlen = sizeof(unsigned int),
319 .mode = 0644,
320 .proc_handler = &proc_dointvec,
321 },
322 {
323 .ctl_name = CTL_UNNUMBERED,
324 .procname = "sched_rt_ratio",
325 .data = &sysctl_sched_rt_ratio,
326 .maxlen = sizeof(unsigned int),
327 .mode = 0644,
310 .proc_handler = &proc_dointvec, 328 .proc_handler = &proc_dointvec,
311 }, 329 },
330#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
331 {
332 .ctl_name = CTL_UNNUMBERED,
333 .procname = "sched_min_bal_int_shares",
334 .data = &sysctl_sched_min_bal_int_shares,
335 .maxlen = sizeof(unsigned int),
336 .mode = 0644,
337 .proc_handler = &proc_dointvec,
338 },
339 {
340 .ctl_name = CTL_UNNUMBERED,
341 .procname = "sched_max_bal_int_shares",
342 .data = &sysctl_sched_max_bal_int_shares,
343 .maxlen = sizeof(unsigned int),
344 .mode = 0644,
345 .proc_handler = &proc_dointvec,
346 },
347#endif
312#endif 348#endif
313 { 349 {
314 .ctl_name = CTL_UNNUMBERED, 350 .ctl_name = CTL_UNNUMBERED,
@@ -382,6 +418,15 @@ static struct ctl_table kern_table[] = {
382 .proc_handler = &proc_dointvec_taint, 418 .proc_handler = &proc_dointvec_taint,
383 }, 419 },
384#endif 420#endif
421#ifdef CONFIG_LATENCYTOP
422 {
423 .procname = "latencytop",
424 .data = &latencytop_enabled,
425 .maxlen = sizeof(int),
426 .mode = 0644,
427 .proc_handler = &proc_dointvec,
428 },
429#endif
385#ifdef CONFIG_SECURITY_CAPABILITIES 430#ifdef CONFIG_SECURITY_CAPABILITIES
386 { 431 {
387 .procname = "cap-bound", 432 .procname = "cap-bound",
@@ -683,6 +728,14 @@ static struct ctl_table kern_table[] = {
683 .mode = 0644, 728 .mode = 0644,
684 .proc_handler = &proc_dointvec, 729 .proc_handler = &proc_dointvec,
685 }, 730 },
731 {
732 .ctl_name = CTL_UNNUMBERED,
733 .procname = "io_delay_type",
734 .data = &io_delay_type,
735 .maxlen = sizeof(int),
736 .mode = 0644,
737 .proc_handler = &proc_dointvec,
738 },
686#endif 739#endif
687#if defined(CONFIG_MMU) 740#if defined(CONFIG_MMU)
688 { 741 {
@@ -728,13 +781,40 @@ static struct ctl_table kern_table[] = {
728 .ctl_name = CTL_UNNUMBERED, 781 .ctl_name = CTL_UNNUMBERED,
729 .procname = "softlockup_thresh", 782 .procname = "softlockup_thresh",
730 .data = &softlockup_thresh, 783 .data = &softlockup_thresh,
731 .maxlen = sizeof(int), 784 .maxlen = sizeof(unsigned long),
732 .mode = 0644, 785 .mode = 0644,
733 .proc_handler = &proc_dointvec_minmax, 786 .proc_handler = &proc_doulongvec_minmax,
734 .strategy = &sysctl_intvec, 787 .strategy = &sysctl_intvec,
735 .extra1 = &one, 788 .extra1 = &one,
736 .extra2 = &sixty, 789 .extra2 = &sixty,
737 }, 790 },
791 {
792 .ctl_name = CTL_UNNUMBERED,
793 .procname = "hung_task_check_count",
794 .data = &sysctl_hung_task_check_count,
795 .maxlen = sizeof(unsigned long),
796 .mode = 0644,
797 .proc_handler = &proc_doulongvec_minmax,
798 .strategy = &sysctl_intvec,
799 },
800 {
801 .ctl_name = CTL_UNNUMBERED,
802 .procname = "hung_task_timeout_secs",
803 .data = &sysctl_hung_task_timeout_secs,
804 .maxlen = sizeof(unsigned long),
805 .mode = 0644,
806 .proc_handler = &proc_doulongvec_minmax,
807 .strategy = &sysctl_intvec,
808 },
809 {
810 .ctl_name = CTL_UNNUMBERED,
811 .procname = "hung_task_warnings",
812 .data = &sysctl_hung_task_warnings,
813 .maxlen = sizeof(unsigned long),
814 .mode = 0644,
815 .proc_handler = &proc_doulongvec_minmax,
816 .strategy = &sysctl_intvec,
817 },
738#endif 818#endif
739#ifdef CONFIG_COMPAT 819#ifdef CONFIG_COMPAT
740 { 820 {
@@ -1300,12 +1380,27 @@ void sysctl_head_finish(struct ctl_table_header *head)
1300 spin_unlock(&sysctl_lock); 1380 spin_unlock(&sysctl_lock);
1301} 1381}
1302 1382
1303struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) 1383static struct list_head *
1384lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1304{ 1385{
1386 struct list_head *header_list;
1387 header_list = &root->header_list;
1388 if (root->lookup)
1389 header_list = root->lookup(root, namespaces);
1390 return header_list;
1391}
1392
1393struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1394 struct ctl_table_header *prev)
1395{
1396 struct ctl_table_root *root;
1397 struct list_head *header_list;
1305 struct ctl_table_header *head; 1398 struct ctl_table_header *head;
1306 struct list_head *tmp; 1399 struct list_head *tmp;
1400
1307 spin_lock(&sysctl_lock); 1401 spin_lock(&sysctl_lock);
1308 if (prev) { 1402 if (prev) {
1403 head = prev;
1309 tmp = &prev->ctl_entry; 1404 tmp = &prev->ctl_entry;
1310 unuse_table(prev); 1405 unuse_table(prev);
1311 goto next; 1406 goto next;
@@ -1319,14 +1414,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1319 spin_unlock(&sysctl_lock); 1414 spin_unlock(&sysctl_lock);
1320 return head; 1415 return head;
1321 next: 1416 next:
1417 root = head->root;
1322 tmp = tmp->next; 1418 tmp = tmp->next;
1323 if (tmp == &root_table_header.ctl_entry) 1419 header_list = lookup_header_list(root, namespaces);
1324 break; 1420 if (tmp != header_list)
1421 continue;
1422
1423 do {
1424 root = list_entry(root->root_list.next,
1425 struct ctl_table_root, root_list);
1426 if (root == &sysctl_table_root)
1427 goto out;
1428 header_list = lookup_header_list(root, namespaces);
1429 } while (list_empty(header_list));
1430 tmp = header_list->next;
1325 } 1431 }
1432out:
1326 spin_unlock(&sysctl_lock); 1433 spin_unlock(&sysctl_lock);
1327 return NULL; 1434 return NULL;
1328} 1435}
1329 1436
1437struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1438{
1439 return __sysctl_head_next(current->nsproxy, prev);
1440}
1441
1442void register_sysctl_root(struct ctl_table_root *root)
1443{
1444 spin_lock(&sysctl_lock);
1445 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1446 spin_unlock(&sysctl_lock);
1447}
1448
1330#ifdef CONFIG_SYSCTL_SYSCALL 1449#ifdef CONFIG_SYSCTL_SYSCALL
1331int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1450int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1332 void __user *newval, size_t newlen) 1451 void __user *newval, size_t newlen)
@@ -1483,18 +1602,21 @@ static __init int sysctl_init(void)
1483{ 1602{
1484 int err; 1603 int err;
1485 sysctl_set_parent(NULL, root_table); 1604 sysctl_set_parent(NULL, root_table);
1486 err = sysctl_check_table(root_table); 1605 err = sysctl_check_table(current->nsproxy, root_table);
1487 return 0; 1606 return 0;
1488} 1607}
1489 1608
1490core_initcall(sysctl_init); 1609core_initcall(sysctl_init);
1491 1610
1492/** 1611/**
1493 * register_sysctl_table - register a sysctl hierarchy 1612 * __register_sysctl_paths - register a sysctl hierarchy
1613 * @root: List of sysctl headers to register on
1614 * @namespaces: Data to compute which lists of sysctl entries are visible
1615 * @path: The path to the directory the sysctl table is in.
1494 * @table: the top-level table structure 1616 * @table: the top-level table structure
1495 * 1617 *
1496 * Register a sysctl table hierarchy. @table should be a filled in ctl_table 1618 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1497 * array. An entry with a ctl_name of 0 terminates the table. 1619 * array. A completely 0 filled entry terminates the table.
1498 * 1620 *
1499 * The members of the &struct ctl_table structure are used as follows: 1621 * The members of the &struct ctl_table structure are used as follows:
1500 * 1622 *
@@ -1557,25 +1679,99 @@ core_initcall(sysctl_init);
1557 * This routine returns %NULL on a failure to register, and a pointer 1679 * This routine returns %NULL on a failure to register, and a pointer
1558 * to the table header on success. 1680 * to the table header on success.
1559 */ 1681 */
1560struct ctl_table_header *register_sysctl_table(struct ctl_table * table) 1682struct ctl_table_header *__register_sysctl_paths(
1683 struct ctl_table_root *root,
1684 struct nsproxy *namespaces,
1685 const struct ctl_path *path, struct ctl_table *table)
1561{ 1686{
1562 struct ctl_table_header *tmp; 1687 struct list_head *header_list;
1563 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); 1688 struct ctl_table_header *header;
1564 if (!tmp) 1689 struct ctl_table *new, **prevp;
1690 unsigned int n, npath;
1691
1692 /* Count the path components */
1693 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
1694 ;
1695
1696 /*
1697 * For each path component, allocate a 2-element ctl_table array.
1698 * The first array element will be filled with the sysctl entry
1699 * for this, the second will be the sentinel (ctl_name == 0).
1700 *
1701 * We allocate everything in one go so that we don't have to
1702 * worry about freeing additional memory in unregister_sysctl_table.
1703 */
1704 header = kzalloc(sizeof(struct ctl_table_header) +
1705 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1706 if (!header)
1565 return NULL; 1707 return NULL;
1566 tmp->ctl_table = table; 1708
1567 INIT_LIST_HEAD(&tmp->ctl_entry); 1709 new = (struct ctl_table *) (header + 1);
1568 tmp->used = 0; 1710
1569 tmp->unregistering = NULL; 1711 /* Now connect the dots */
1570 sysctl_set_parent(NULL, table); 1712 prevp = &header->ctl_table;
1571 if (sysctl_check_table(tmp->ctl_table)) { 1713 for (n = 0; n < npath; ++n, ++path) {
1572 kfree(tmp); 1714 /* Copy the procname */
1715 new->procname = path->procname;
1716 new->ctl_name = path->ctl_name;
1717 new->mode = 0555;
1718
1719 *prevp = new;
1720 prevp = &new->child;
1721
1722 new += 2;
1723 }
1724 *prevp = table;
1725 header->ctl_table_arg = table;
1726
1727 INIT_LIST_HEAD(&header->ctl_entry);
1728 header->used = 0;
1729 header->unregistering = NULL;
1730 header->root = root;
1731 sysctl_set_parent(NULL, header->ctl_table);
1732 if (sysctl_check_table(namespaces, header->ctl_table)) {
1733 kfree(header);
1573 return NULL; 1734 return NULL;
1574 } 1735 }
1575 spin_lock(&sysctl_lock); 1736 spin_lock(&sysctl_lock);
1576 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1737 header_list = lookup_header_list(root, namespaces);
1738 list_add_tail(&header->ctl_entry, header_list);
1577 spin_unlock(&sysctl_lock); 1739 spin_unlock(&sysctl_lock);
1578 return tmp; 1740
1741 return header;
1742}
1743
1744/**
1745 * register_sysctl_table_path - register a sysctl table hierarchy
1746 * @path: The path to the directory the sysctl table is in.
1747 * @table: the top-level table structure
1748 *
1749 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1750 * array. A completely 0 filled entry terminates the table.
1751 *
1752 * See __register_sysctl_paths for more details.
1753 */
1754struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1755 struct ctl_table *table)
1756{
1757 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1758 path, table);
1759}
1760
1761/**
1762 * register_sysctl_table - register a sysctl table hierarchy
1763 * @table: the top-level table structure
1764 *
1765 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1766 * array. A completely 0 filled entry terminates the table.
1767 *
1768 * See register_sysctl_paths for more details.
1769 */
1770struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1771{
1772 static const struct ctl_path null_path[] = { {} };
1773
1774 return register_sysctl_paths(null_path, table);
1579} 1775}
1580 1776
1581/** 1777/**
@@ -1604,6 +1800,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
1604 return NULL; 1800 return NULL;
1605} 1801}
1606 1802
1803struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1804 struct ctl_table *table)
1805{
1806 return NULL;
1807}
1808
1607void unregister_sysctl_table(struct ctl_table_header * table) 1809void unregister_sysctl_table(struct ctl_table_header * table)
1608{ 1810{
1609} 1811}
@@ -2662,6 +2864,7 @@ EXPORT_SYMBOL(proc_dostring);
2662EXPORT_SYMBOL(proc_doulongvec_minmax); 2864EXPORT_SYMBOL(proc_doulongvec_minmax);
2663EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2865EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
2664EXPORT_SYMBOL(register_sysctl_table); 2866EXPORT_SYMBOL(register_sysctl_table);
2867EXPORT_SYMBOL(register_sysctl_paths);
2665EXPORT_SYMBOL(sysctl_intvec); 2868EXPORT_SYMBOL(sysctl_intvec);
2666EXPORT_SYMBOL(sysctl_jiffies); 2869EXPORT_SYMBOL(sysctl_jiffies);
2667EXPORT_SYMBOL(sysctl_ms_jiffies); 2870EXPORT_SYMBOL(sysctl_ms_jiffies);
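For reference, this is roughly how a caller is expected to use the register_sysctl_paths() interface introduced above: the directory part of the hierarchy is described by a ctl_path array instead of hand-built nested ctl_tables. The sketch is not taken from this patch; the module, path and variable names are invented.

#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/init.h>

static int example_value;

static struct ctl_table example_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "example_value",
                .data           = &example_value,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {}      /* a completely zero-filled entry terminates the table */
};

static const struct ctl_path example_path[] = {
        { .procname = "kernel",  .ctl_name = CTL_KERN, },
        { .procname = "example", .ctl_name = CTL_UNNUMBERED, },
        {}
};

static struct ctl_table_header *example_header;

static int __init example_init(void)
{
        /* Creates /proc/sys/kernel/example/example_value */
        example_header = register_sysctl_paths(example_path, example_table);
        if (!example_header)
                return -ENOMEM;
        return 0;
}

static void __exit example_exit(void)
{
        unregister_sysctl_table(example_header);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");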
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index a68425a5cc1d..c3206fa50048 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1,6 +1,5 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../arch/s390/appldata/appldata.h"
4#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
5#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
6#include <linux/string.h> 5#include <linux/string.h>
@@ -1343,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table)
1343 } 1342 }
1344} 1343}
1345 1344
1346static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) 1345static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
1346 struct ctl_table *table)
1347{ 1347{
1348 struct ctl_table_header *head; 1348 struct ctl_table_header *head;
1349 struct ctl_table *ref, *test; 1349 struct ctl_table *ref, *test;
@@ -1351,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
1351 1351
1352 depth = sysctl_depth(table); 1352 depth = sysctl_depth(table);
1353 1353
1354 for (head = sysctl_head_next(NULL); head; 1354 for (head = __sysctl_head_next(namespaces, NULL); head;
1355 head = sysctl_head_next(head)) { 1355 head = __sysctl_head_next(namespaces, head)) {
1356 cur_depth = depth; 1356 cur_depth = depth;
1357 ref = head->ctl_table; 1357 ref = head->ctl_table;
1358repeat: 1358repeat:
@@ -1397,13 +1397,14 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
1397 *fail = str; 1397 *fail = str;
1398} 1398}
1399 1399
1400static int sysctl_check_dir(struct ctl_table *table) 1400static int sysctl_check_dir(struct nsproxy *namespaces,
1401 struct ctl_table *table)
1401{ 1402{
1402 struct ctl_table *ref; 1403 struct ctl_table *ref;
1403 int error; 1404 int error;
1404 1405
1405 error = 0; 1406 error = 0;
1406 ref = sysctl_check_lookup(table); 1407 ref = sysctl_check_lookup(namespaces, table);
1407 if (ref) { 1408 if (ref) {
1408 int match = 0; 1409 int match = 0;
1409 if ((!table->procname && !ref->procname) || 1410 if ((!table->procname && !ref->procname) ||
@@ -1428,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table)
1428 return error; 1429 return error;
1429} 1430}
1430 1431
1431static void sysctl_check_leaf(struct ctl_table *table, const char **fail) 1432static void sysctl_check_leaf(struct nsproxy *namespaces,
1433 struct ctl_table *table, const char **fail)
1432{ 1434{
1433 struct ctl_table *ref; 1435 struct ctl_table *ref;
1434 1436
1435 ref = sysctl_check_lookup(table); 1437 ref = sysctl_check_lookup(namespaces, table);
1436 if (ref && (ref != table)) 1438 if (ref && (ref != table))
1437 set_fail(fail, table, "Sysctl already exists"); 1439 set_fail(fail, table, "Sysctl already exists");
1438} 1440}
@@ -1456,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1456 } 1458 }
1457} 1459}
1458 1460
1459int sysctl_check_table(struct ctl_table *table) 1461int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1460{ 1462{
1461 int error = 0; 1463 int error = 0;
1462 for (; table->ctl_name || table->procname; table++) { 1464 for (; table->ctl_name || table->procname; table++) {
@@ -1486,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table)
1486 set_fail(&fail, table, "Directory with extra1"); 1488 set_fail(&fail, table, "Directory with extra1");
1487 if (table->extra2) 1489 if (table->extra2)
1488 set_fail(&fail, table, "Directory with extra2"); 1490 set_fail(&fail, table, "Directory with extra2");
1489 if (sysctl_check_dir(table)) 1491 if (sysctl_check_dir(namespaces, table))
1490 set_fail(&fail, table, "Inconsistent directory names"); 1492 set_fail(&fail, table, "Inconsistent directory names");
1491 } else { 1493 } else {
1492 if ((table->strategy == sysctl_data) || 1494 if ((table->strategy == sysctl_data) ||
@@ -1535,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table)
1535 if (!table->procname && table->proc_handler) 1537 if (!table->procname && table->proc_handler)
1536 set_fail(&fail, table, "proc_handler without procname"); 1538 set_fail(&fail, table, "proc_handler without procname");
1537#endif 1539#endif
1538 sysctl_check_leaf(table, &fail); 1540 sysctl_check_leaf(namespaces, table, &fail);
1539 } 1541 }
1540 sysctl_check_bin_path(table, &fail); 1542 sysctl_check_bin_path(table, &fail);
1541 if (fail) { 1543 if (fail) {
@@ -1543,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table)
1543 error = -EINVAL; 1545 error = -EINVAL;
1544 } 1546 }
1545 if (table->child) 1547 if (table->child)
1546 error |= sysctl_check_table(table->child); 1548 error |= sysctl_check_table(namespaces, table->child);
1547 } 1549 }
1548 return error; 1550 return error;
1549} 1551}
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
new file mode 100644
index 000000000000..88cdb109e13c
--- /dev/null
+++ b/kernel/test_kprobes.c
@@ -0,0 +1,216 @@
1/*
2 * test_kprobes.c - simple sanity test for *probes
3 *
4 * Copyright IBM Corp. 2008
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it would be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 */
16
17#include <linux/kernel.h>
18#include <linux/kprobes.h>
19#include <linux/random.h>
20
21#define div_factor 3
22
23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests;
25
26static noinline u32 kprobe_target(u32 value)
27{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor);
41}
42
43static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
44{
45 preh_val = (rand1 / div_factor);
46 return 0;
47}
48
49static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
50 unsigned long flags)
51{
52 if (preh_val != (rand1 / div_factor)) {
53 handler_errors++;
54 printk(KERN_ERR "Kprobe smoke test failed: "
55 "incorrect value in post_handler\n");
56 }
57 posth_val = preh_val + div_factor;
58}
59
60static struct kprobe kp = {
61 .symbol_name = "kprobe_target",
62 .pre_handler = kp_pre_handler,
63 .post_handler = kp_post_handler
64};
65
66static int test_kprobe(void)
67{
68 int ret;
69
70 ret = register_kprobe(&kp);
71 if (ret < 0) {
72 printk(KERN_ERR "Kprobe smoke test failed: "
73 "register_kprobe returned %d\n", ret);
74 return ret;
75 }
76
77 ret = kprobe_target(rand1);
78 unregister_kprobe(&kp);
79
80 if (preh_val == 0) {
81 printk(KERN_ERR "Kprobe smoke test failed: "
82 "kprobe pre_handler not called\n");
83 handler_errors++;
84 }
85
86 if (posth_val == 0) {
87 printk(KERN_ERR "Kprobe smoke test failed: "
88 "kprobe post_handler not called\n");
89 handler_errors++;
90 }
91
92 return 0;
93}
94
95static u32 j_kprobe_target(u32 value)
96{
97 if (value != rand1) {
98 handler_errors++;
99 printk(KERN_ERR "Kprobe smoke test failed: "
100 "incorrect value in jprobe handler\n");
101 }
102
103 jph_val = rand1;
104 jprobe_return();
105 return 0;
106}
107
108static struct jprobe jp = {
109 .entry = j_kprobe_target,
110 .kp.symbol_name = "kprobe_target"
111};
112
113static int test_jprobe(void)
114{
115 int ret;
116
117 ret = register_jprobe(&jp);
118 if (ret < 0) {
119 printk(KERN_ERR "Kprobe smoke test failed: "
120 "register_jprobe returned %d\n", ret);
121 return ret;
122 }
123
124 ret = kprobe_target(rand1);
125 unregister_jprobe(&jp);
126 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: "
128 "jprobe handler not called\n");
129 handler_errors++;
130 }
131
132 return 0;
133}
134
135#ifdef CONFIG_KRETPROBES
136static u32 krph_val;
137
138static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
139{
140 unsigned long ret = regs_return_value(regs);
141
142 if (ret != (rand1 / div_factor)) {
143 handler_errors++;
144 printk(KERN_ERR "Kprobe smoke test failed: "
145 "incorrect value in kretprobe handler\n");
146 }
147
148 krph_val = (rand1 / div_factor);
149 return 0;
150}
151
152static struct kretprobe rp = {
153 .handler = return_handler,
154 .kp.symbol_name = "kprobe_target"
155};
156
157static int test_kretprobe(void)
158{
159 int ret;
160
161 ret = register_kretprobe(&rp);
162 if (ret < 0) {
163 printk(KERN_ERR "Kprobe smoke test failed: "
164 "register_kretprobe returned %d\n", ret);
165 return ret;
166 }
167
168 ret = kprobe_target(rand1);
169 unregister_kretprobe(&rp);
170 if (krph_val == 0) {
171 printk(KERN_ERR "Kprobe smoke test failed: "
172 "kretprobe handler not called\n");
173 handler_errors++;
174 }
175
176 return 0;
177}
178#endif /* CONFIG_KRETPROBES */
179
180int init_test_probes(void)
181{
182 int ret;
183
184 do {
185 rand1 = random32();
186 } while (rand1 <= div_factor);
187
188 printk(KERN_INFO "Kprobe smoke test started\n");
189 num_tests++;
190 ret = test_kprobe();
191 if (ret < 0)
192 errors++;
193
194 num_tests++;
195 ret = test_jprobe();
196 if (ret < 0)
197 errors++;
198
199#ifdef CONFIG_KRETPROBES
200 num_tests++;
201 ret = test_kretprobe();
202 if (ret < 0)
203 errors++;
204#endif /* CONFIG_KRETPROBES */
205
206 if (errors)
207 printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
208 "%d tests failed\n", errors, num_tests);
209 else if (handler_errors)
210 printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
211 "running handlers\n", handler_errors);
212 else
213 printk(KERN_INFO "Kprobe smoke test passed successfully\n");
214
215 return 0;
216}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5fb139fef9fa..3e59fce6dd43 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch,
41{ 41{
42 u64 clc = ((u64) latch << evt->shift); 42 u64 clc = ((u64) latch << evt->shift);
43 43
44 if (unlikely(!evt->mult)) {
45 evt->mult = 1;
46 WARN_ON(1);
47 }
48
44 do_div(clc, evt->mult); 49 do_div(clc, evt->mult);
45 if (clc < 1000) 50 if (clc < 1000)
46 clc = 1000; 51 clc = 1000;
@@ -151,6 +156,14 @@ static void clockevents_notify_released(void)
151void clockevents_register_device(struct clock_event_device *dev) 156void clockevents_register_device(struct clock_event_device *dev)
152{ 157{
153 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159 /*
160 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
161 * on it, so fix it up and emit a warning:
162 */
163 if (unlikely(!dev->mult)) {
164 dev->mult = 1;
165 WARN_ON(1);
166 }
154 167
155 spin_lock(&clockevents_lock); 168 spin_lock(&clockevents_lock);
156 169
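The clockevents change guards clockevent_delta2ns() and clockevents_register_device() against a zero mult, which would otherwise cause a divide-by-zero in the latch-to-nanoseconds conversion. A standalone model of that conversion (not kernel code; shift and mult are hypothetical values for a 1 MHz event device):

#include <stdio.h>
#include <stdint.h>

static uint64_t delta2ns(unsigned long latch, uint32_t mult, uint32_t shift)
{
        uint64_t clc = (uint64_t)latch << shift;

        if (mult == 0) {        /* the case the patch now fixes up */
                mult = 1;
                fprintf(stderr, "warning: mult was 0, fixed up\n");
        }
        clc /= mult;            /* do_div(clc, mult) in the kernel */
        return clc < 1000 ? 1000 : clc;         /* clamp to 1000 ns */
}

int main(void)
{
        /* 1 MHz device, shift = 32: mult ~= 2^32 / 1000 */
        printf("%llu ns\n",
               (unsigned long long)delta2ns(123, 4294967, 32));  /* ~123 us */
        return 0;
}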
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c8a9d13874df..6e9259a5d501 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data)
142 } 142 }
143 143
144 if (!list_empty(&watchdog_list)) { 144 if (!list_empty(&watchdog_list)) {
145 __mod_timer(&watchdog_timer, 145 /* Cycle through CPUs to check if the CPUs stay synchronized to
146 watchdog_timer.expires + WATCHDOG_INTERVAL); 146 * each other. */
147 int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
148 if (next_cpu >= NR_CPUS)
149 next_cpu = first_cpu(cpu_online_map);
150 watchdog_timer.expires += WATCHDOG_INTERVAL;
151 add_timer_on(&watchdog_timer, next_cpu);
147 } 152 }
148 spin_unlock(&watchdog_lock); 153 spin_unlock(&watchdog_lock);
149} 154}
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
165 if (!started && watchdog) { 170 if (!started && watchdog) {
166 watchdog_last = watchdog->read(); 171 watchdog_last = watchdog->read();
167 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 172 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
168 add_timer(&watchdog_timer); 173 add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
169 } 174 }
170 } else { 175 } else {
171 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 176 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
175 if (watchdog) 180 if (watchdog)
176 del_timer(&watchdog_timer); 181 del_timer(&watchdog_timer);
177 watchdog = cs; 182 watchdog = cs;
178 init_timer(&watchdog_timer); 183 init_timer_deferrable(&watchdog_timer);
179 watchdog_timer.function = clocksource_watchdog; 184 watchdog_timer.function = clocksource_watchdog;
180 185
181 /* Reset watchdog cycles */ 186 /* Reset watchdog cycles */
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
186 watchdog_last = watchdog->read(); 191 watchdog_last = watchdog->read();
187 watchdog_timer.expires = 192 watchdog_timer.expires =
188 jiffies + WATCHDOG_INTERVAL; 193 jiffies + WATCHDOG_INTERVAL;
189 add_timer(&watchdog_timer); 194 add_timer_on(&watchdog_timer,
195 first_cpu(cpu_online_map));
190 } 196 }
191 } 197 }
192 } 198 }
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
331 spin_unlock_irqrestore(&clocksource_lock, flags); 337 spin_unlock_irqrestore(&clocksource_lock, flags);
332} 338}
333 339
340/**
341 * clocksource_unregister - remove a registered clocksource
342 */
343void clocksource_unregister(struct clocksource *cs)
344{
345 unsigned long flags;
346
347 spin_lock_irqsave(&clocksource_lock, flags);
348 list_del(&cs->list);
349 if (clocksource_override == cs)
350 clocksource_override = NULL;
351 next_clocksource = select_clocksource();
352 spin_unlock_irqrestore(&clocksource_lock, flags);
353}
354
334#ifdef CONFIG_SYSFS 355#ifdef CONFIG_SYSFS
335/** 356/**
336 * sysfs_show_current_clocksources - sysfs interface for current clocksource 357 * sysfs_show_current_clocksources - sysfs interface for current clocksource
@@ -441,7 +462,7 @@ static SYSDEV_ATTR(available_clocksource, 0600,
441 sysfs_show_available_clocksources, NULL); 462 sysfs_show_available_clocksources, NULL);
442 463
443static struct sysdev_class clocksource_sysclass = { 464static struct sysdev_class clocksource_sysclass = {
444 set_kset_name("clocksource"), 465 .name = "clocksource",
445}; 466};
446 467
447static struct sys_device device_clocksource = { 468static struct sys_device device_clocksource = {
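The clocksource watchdog now re-arms its (deferrable) timer on a different online CPU each interval instead of staying on one CPU, so per-CPU drift of a clocksource such as the TSC can still be caught. A plain-C sketch of that rotation (not kernel code; an 8-bit mask stands in for cpumask_t):

#include <stdio.h>

#define NR_CPUS 8

/* Return the next online CPU after 'cpu', wrapping to the first one. */
static int pick_next_cpu(int cpu, unsigned int online_mask)
{
        int next;

        for (next = cpu + 1; next < NR_CPUS; next++)
                if (online_mask & (1u << next))
                        return next;
        for (next = 0; next < NR_CPUS; next++)      /* wrap around */
                if (online_mask & (1u << next))
                        return next;
        return -1;                                   /* nothing online */
}

int main(void)
{
        unsigned int online = 0x0b;     /* CPUs 0, 1 and 3 online */
        int cpu = 1;

        cpu = pick_next_cpu(cpu, online);
        printf("watchdog timer moves to CPU %d\n", cpu);    /* 3 */
        cpu = pick_next_cpu(cpu, online);
        printf("then to CPU %d\n", cpu);                     /* 0 */
        return 0;
}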
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5b86698faa0b..e1bd50cbbf5d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
126/* 126/*
127 * Broadcast the event to the cpus, which are set in the mask 127 * Broadcast the event to the cpus, which are set in the mask
128 */ 128 */
129int tick_do_broadcast(cpumask_t mask) 129static void tick_do_broadcast(cpumask_t mask)
130{ 130{
131 int ret = 0, cpu = smp_processor_id(); 131 int cpu = smp_processor_id();
132 struct tick_device *td; 132 struct tick_device *td;
133 133
134 /* 134 /*
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask)
138 cpu_clear(cpu, mask); 138 cpu_clear(cpu, mask);
139 td = &per_cpu(tick_cpu_device, cpu); 139 td = &per_cpu(tick_cpu_device, cpu);
140 td->evtdev->event_handler(td->evtdev); 140 td->evtdev->event_handler(td->evtdev);
141 ret = 1;
142 } 141 }
143 142
144 if (!cpus_empty(mask)) { 143 if (!cpus_empty(mask)) {
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask)
151 cpu = first_cpu(mask); 150 cpu = first_cpu(mask);
152 td = &per_cpu(tick_cpu_device, cpu); 151 td = &per_cpu(tick_cpu_device, cpu);
153 td->evtdev->broadcast(mask); 152 td->evtdev->broadcast(mask);
154 ret = 1;
155 } 153 }
156 return ret;
157} 154}
158 155
159/* 156/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bb13f2724905..f13f2b7f4fd4 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
70 * Broadcasting support 70 * Broadcasting support
71 */ 71 */
72#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 72#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
73extern int tick_do_broadcast(cpumask_t mask);
74
75extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 73extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
76extern int tick_check_broadcast_device(struct clock_event_device *dev); 74extern int tick_check_broadcast_device(struct clock_event_device *dev);
77extern int tick_is_broadcast_device(struct clock_event_device *dev); 75extern int tick_is_broadcast_device(struct clock_event_device *dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb89fa8db110..63f24b550695 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Started by: Thomas Gleixner and Ingo Molnar 10 * Started by: Thomas Gleixner and Ingo Molnar
11 * 11 *
12 * For licencing details see kernel-base/COPYING 12 * Distribute under GPLv2.
13 */ 13 */
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
@@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void)
143 local_irq_restore(flags); 143 local_irq_restore(flags);
144} 144}
145 145
146void tick_nohz_stop_idle(int cpu)
147{
148 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
149
150 if (ts->idle_active) {
151 ktime_t now, delta;
152 now = ktime_get();
153 delta = ktime_sub(now, ts->idle_entrytime);
154 ts->idle_lastupdate = now;
155 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
156 ts->idle_active = 0;
157 }
158}
159
160static ktime_t tick_nohz_start_idle(int cpu)
161{
162 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
163 ktime_t now, delta;
164
165 now = ktime_get();
166 if (ts->idle_active) {
167 delta = ktime_sub(now, ts->idle_entrytime);
168 ts->idle_lastupdate = now;
169 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
170 }
171 ts->idle_entrytime = now;
172 ts->idle_active = 1;
173 return now;
174}
175
176u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
177{
178 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
179
180 *last_update_time = ktime_to_us(ts->idle_lastupdate);
181 return ktime_to_us(ts->idle_sleeptime);
182}
183
146/** 184/**
147 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 185 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
148 * 186 *
@@ -153,14 +191,16 @@ void tick_nohz_update_jiffies(void)
153void tick_nohz_stop_sched_tick(void) 191void tick_nohz_stop_sched_tick(void)
154{ 192{
155 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
194 unsigned long rt_jiffies;
156 struct tick_sched *ts; 195 struct tick_sched *ts;
157 ktime_t last_update, expires, now, delta; 196 ktime_t last_update, expires, now;
158 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 197 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
159 int cpu; 198 int cpu;
160 199
161 local_irq_save(flags); 200 local_irq_save(flags);
162 201
163 cpu = smp_processor_id(); 202 cpu = smp_processor_id();
203 now = tick_nohz_start_idle(cpu);
164 ts = &per_cpu(tick_cpu_sched, cpu); 204 ts = &per_cpu(tick_cpu_sched, cpu);
165 205
166 /* 206 /*
@@ -192,19 +232,7 @@ void tick_nohz_stop_sched_tick(void)
192 } 232 }
193 } 233 }
194 234
195 now = ktime_get();
196 /*
197 * When called from irq_exit we need to account the idle sleep time
198 * correctly.
199 */
200 if (ts->tick_stopped) {
201 delta = ktime_sub(now, ts->idle_entrytime);
202 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
203 }
204
205 ts->idle_entrytime = now;
206 ts->idle_calls++; 235 ts->idle_calls++;
207
208 /* Read jiffies and the time when jiffies were updated last */ 236 /* Read jiffies and the time when jiffies were updated last */
209 do { 237 do {
210 seq = read_seqbegin(&xtime_lock); 238 seq = read_seqbegin(&xtime_lock);
@@ -216,6 +244,10 @@ void tick_nohz_stop_sched_tick(void)
216 next_jiffies = get_next_timer_interrupt(last_jiffies); 244 next_jiffies = get_next_timer_interrupt(last_jiffies);
217 delta_jiffies = next_jiffies - last_jiffies; 245 delta_jiffies = next_jiffies - last_jiffies;
218 246
247 rt_jiffies = rt_needs_cpu(cpu);
248 if (rt_jiffies && rt_jiffies < delta_jiffies)
249 delta_jiffies = rt_jiffies;
250
219 if (rcu_needs_cpu(cpu)) 251 if (rcu_needs_cpu(cpu))
220 delta_jiffies = 1; 252 delta_jiffies = 1;
221 /* 253 /*
@@ -291,7 +323,7 @@ void tick_nohz_stop_sched_tick(void)
291 /* Check, if the timer was already in the past */ 323 /* Check, if the timer was already in the past */
292 if (hrtimer_active(&ts->sched_timer)) 324 if (hrtimer_active(&ts->sched_timer))
293 goto out; 325 goto out;
294 } else if(!tick_program_event(expires, 0)) 326 } else if (!tick_program_event(expires, 0))
295 goto out; 327 goto out;
296 /* 328 /*
297 * We are past the event already. So we crossed a 329 * We are past the event already. So we crossed a
@@ -332,23 +364,22 @@ void tick_nohz_restart_sched_tick(void)
332 int cpu = smp_processor_id(); 364 int cpu = smp_processor_id();
333 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 365 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
334 unsigned long ticks; 366 unsigned long ticks;
335 ktime_t now, delta; 367 ktime_t now;
336 368
337 if (!ts->tick_stopped) 369 local_irq_disable();
370 tick_nohz_stop_idle(cpu);
371
372 if (!ts->tick_stopped) {
373 local_irq_enable();
338 return; 374 return;
375 }
339 376
340 /* Update jiffies first */ 377 /* Update jiffies first */
341 now = ktime_get();
342
343 local_irq_disable();
344 select_nohz_load_balancer(0); 378 select_nohz_load_balancer(0);
379 now = ktime_get();
345 tick_do_update_jiffies64(now); 380 tick_do_update_jiffies64(now);
346 cpu_clear(cpu, nohz_cpu_mask); 381 cpu_clear(cpu, nohz_cpu_mask);
347 382
348 /* Account the idle time */
349 delta = ktime_sub(now, ts->idle_entrytime);
350 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
351
352 /* 383 /*
353 * We stopped the tick in idle. Update process times would miss the 384 * We stopped the tick in idle. Update process times would miss the
354 * time we slept as update_process_times does only a 1 tick 385 * time we slept as update_process_times does only a 1 tick
@@ -502,14 +533,13 @@ static inline void tick_nohz_switch_to_nohz(void) { }
502 */ 533 */
503#ifdef CONFIG_HIGH_RES_TIMERS 534#ifdef CONFIG_HIGH_RES_TIMERS
504/* 535/*
505 * We rearm the timer until we get disabled by the idle code 536 * We rearm the timer until we get disabled by the idle code.
506 * Called with interrupts disabled and timer->base->cpu_base->lock held. 537 * Called with interrupts disabled and timer->base->cpu_base->lock held.
507 */ 538 */
508static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 539static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
509{ 540{
510 struct tick_sched *ts = 541 struct tick_sched *ts =
511 container_of(timer, struct tick_sched, sched_timer); 542 container_of(timer, struct tick_sched, sched_timer);
512 struct hrtimer_cpu_base *base = timer->base->cpu_base;
513 struct pt_regs *regs = get_irq_regs(); 543 struct pt_regs *regs = get_irq_regs();
514 ktime_t now = ktime_get(); 544 ktime_t now = ktime_get();
515 int cpu = smp_processor_id(); 545 int cpu = smp_processor_id();
@@ -547,15 +577,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
547 touch_softlockup_watchdog(); 577 touch_softlockup_watchdog();
548 ts->idle_jiffies++; 578 ts->idle_jiffies++;
549 } 579 }
550 /*
551 * update_process_times() might take tasklist_lock, hence
552 * drop the base lock. sched-tick hrtimers are per-CPU and
553 * never accessible by userspace APIs, so this is safe to do.
554 */
555 spin_unlock(&base->lock);
556 update_process_times(user_mode(regs)); 580 update_process_times(user_mode(regs));
557 profile_tick(CPU_PROFILING); 581 profile_tick(CPU_PROFILING);
558 spin_lock(&base->lock);
559 } 582 }
560 583
561 /* Do not restart, when we are in the idle loop */ 584 /* Do not restart, when we are in the idle loop */
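tick_nohz_start_idle()/tick_nohz_stop_idle() above pull the idle-sleep accounting out of tick_nohz_stop_sched_tick()/restart_sched_tick() and expose the result through get_cpu_idle_time_us(). A userspace model of that bookkeeping (not kernel code; microsecond integers stand in for ktime_t):

#include <stdio.h>

struct idle_stats {
        unsigned long long entrytime;   /* when the CPU last went idle */
        unsigned long long sleeptime;   /* accumulated idle time       */
        unsigned long long lastupdate;  /* time of last accounting     */
        int active;                     /* currently idle?             */
};

static void start_idle(struct idle_stats *s, unsigned long long now)
{
        if (s->active) {                /* already idle: account so far */
                s->sleeptime += now - s->entrytime;
                s->lastupdate = now;
        }
        s->entrytime = now;
        s->active = 1;
}

static void stop_idle(struct idle_stats *s, unsigned long long now)
{
        if (s->active) {
                s->sleeptime += now - s->entrytime;
                s->lastupdate = now;
                s->active = 0;
        }
}

int main(void)
{
        struct idle_stats s = { 0 };

        start_idle(&s, 1000);   /* CPU enters idle at t=1000 us       */
        stop_idle(&s, 4000);    /* an interrupt wakes it at t=4000 us */
        start_idle(&s, 5000);
        stop_idle(&s, 5500);

        /* get_cpu_idle_time_us() would report sleeptime and lastupdate. */
        printf("idle: %llu us (last update at %llu)\n",
               s.sleeptime, s.lastupdate);      /* 3500 us, 5500 */
        return 0;
}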
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e5e466b27598..092a2366b5a9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void)
82} 82}
83 83
84/** 84/**
85 * __get_realtime_clock_ts - Returns the time of day in a timespec 85 * getnstimeofday - Returns the time of day in a timespec
86 * @ts: pointer to the timespec to be set 86 * @ts: pointer to the timespec to be set
87 * 87 *
88 * Returns the time of day in a timespec. Used by 88 * Returns the time of day in a timespec.
89 * do_gettimeofday() and get_realtime_clock_ts().
90 */ 89 */
91static inline void __get_realtime_clock_ts(struct timespec *ts) 90void getnstimeofday(struct timespec *ts)
92{ 91{
93 unsigned long seq; 92 unsigned long seq;
94 s64 nsecs; 93 s64 nsecs;
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts)
104 timespec_add_ns(ts, nsecs); 103 timespec_add_ns(ts, nsecs);
105} 104}
106 105
107/**
108 * getnstimeofday - Returns the time of day in a timespec
109 * @ts: pointer to the timespec to be set
110 *
111 * Returns the time of day in a timespec.
112 */
113void getnstimeofday(struct timespec *ts)
114{
115 __get_realtime_clock_ts(ts);
116}
117
118EXPORT_SYMBOL(getnstimeofday); 106EXPORT_SYMBOL(getnstimeofday);
119 107
120/** 108/**
121 * do_gettimeofday - Returns the time of day in a timeval 109 * do_gettimeofday - Returns the time of day in a timeval
122 * @tv: pointer to the timeval to be set 110 * @tv: pointer to the timeval to be set
123 * 111 *
124 * NOTE: Users should be converted to using get_realtime_clock_ts() 112 * NOTE: Users should be converted to using getnstimeofday()
125 */ 113 */
126void do_gettimeofday(struct timeval *tv) 114void do_gettimeofday(struct timeval *tv)
127{ 115{
128 struct timespec now; 116 struct timespec now;
129 117
130 __get_realtime_clock_ts(&now); 118 getnstimeofday(&now);
131 tv->tv_sec = now.tv_sec; 119 tv->tv_sec = now.tv_sec;
132 tv->tv_usec = now.tv_nsec/1000; 120 tv->tv_usec = now.tv_nsec/1000;
133} 121}
@@ -198,7 +186,8 @@ static void change_clocksource(void)
198 186
199 clock->error = 0; 187 clock->error = 0;
200 clock->xtime_nsec = 0; 188 clock->xtime_nsec = 0;
201 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 189 clocksource_calculate_interval(clock,
190 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
202 191
203 tick_clock_notify(); 192 tick_clock_notify();
204 193
@@ -255,7 +244,8 @@ void __init timekeeping_init(void)
255 ntp_clear(); 244 ntp_clear();
256 245
257 clock = clocksource_get_next(); 246 clock = clocksource_get_next();
258 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 247 clocksource_calculate_interval(clock,
248 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
259 clock->cycle_last = clocksource_read(clock); 249 clock->cycle_last = clocksource_read(clock);
260 250
261 xtime.tv_sec = sec; 251 xtime.tv_sec = sec;
@@ -335,9 +325,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
335 325
336/* sysfs resume/suspend bits for timekeeping */ 326/* sysfs resume/suspend bits for timekeeping */
337static struct sysdev_class timekeeping_sysclass = { 327static struct sysdev_class timekeeping_sysclass = {
328 .name = "timekeeping",
338 .resume = timekeeping_resume, 329 .resume = timekeeping_resume,
339 .suspend = timekeeping_suspend, 330 .suspend = timekeeping_suspend,
340 set_kset_name("timekeeping"),
341}; 331};
342 332
343static struct sys_device device_timer = { 333static struct sys_device device_timer = {
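The timekeeping hunks now feed the NTP-corrected tick length, current_tick_length() >> TICK_LENGTH_SHIFT, to clocksource_calculate_interval() instead of the fixed NTP_INTERVAL_LENGTH. A simplified model of what that helper computes (not kernel code; the real helper also rounds and clamps cycle_interval to at least 1, and the clocksource values here are hypothetical):

#include <stdio.h>
#include <stdint.h>

struct fake_clocksource {
        uint32_t mult, shift;
        uint64_t cycle_interval;        /* clocksource cycles per tick     */
        uint64_t xtime_interval;        /* cycles * mult, i.e. shifted ns  */
};

static void calculate_interval(struct fake_clocksource *c, unsigned long len_ns)
{
        uint64_t tmp = (uint64_t)len_ns << c->shift;

        tmp /= c->mult;                 /* do_div(tmp, c->mult) */
        c->cycle_interval = tmp;
        c->xtime_interval = tmp * c->mult;
}

int main(void)
{
        /* hypothetical 1 MHz clocksource: 1 cycle = 1000 ns, shift = 22 */
        struct fake_clocksource cs = { .mult = 1000u << 22, .shift = 22 };
        unsigned long tick_len_ns = 1000000;    /* HZ=1000 -> ~1 ms tick */

        calculate_interval(&cs, tick_len_ns);
        printf("%llu cycles per tick\n",
               (unsigned long long)cs.cycle_interval);   /* 1000 */
        return 0;
}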
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c36bb7ed0301..417da8c5bc72 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -26,7 +26,7 @@
26 * the pid and cmdline from the owner process if applicable. 26 * the pid and cmdline from the owner process if applicable.
27 * 27 *
28 * Start/stop data collection: 28 * Start/stop data collection:
29 * # echo 1[0] >/proc/timer_stats 29 * # echo [1|0] >/proc/timer_stats
30 * 30 *
31 * Display the information collected so far: 31 * Display the information collected so far:
32 * # cat /proc/timer_stats 32 * # cat /proc/timer_stats
diff --git a/kernel/timer.c b/kernel/timer.c
index 2a00c22203f3..23f7ead78fae 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64);
58#define TVN_MASK (TVN_SIZE - 1) 58#define TVN_MASK (TVN_SIZE - 1)
59#define TVR_MASK (TVR_SIZE - 1) 59#define TVR_MASK (TVR_SIZE - 1)
60 60
61typedef struct tvec_s { 61struct tvec {
62 struct list_head vec[TVN_SIZE]; 62 struct list_head vec[TVN_SIZE];
63} tvec_t; 63};
64 64
65typedef struct tvec_root_s { 65struct tvec_root {
66 struct list_head vec[TVR_SIZE]; 66 struct list_head vec[TVR_SIZE];
67} tvec_root_t; 67};
68 68
69struct tvec_t_base_s { 69struct tvec_base {
70 spinlock_t lock; 70 spinlock_t lock;
71 struct timer_list *running_timer; 71 struct timer_list *running_timer;
72 unsigned long timer_jiffies; 72 unsigned long timer_jiffies;
73 tvec_root_t tv1; 73 struct tvec_root tv1;
74 tvec_t tv2; 74 struct tvec tv2;
75 tvec_t tv3; 75 struct tvec tv3;
76 tvec_t tv4; 76 struct tvec tv4;
77 tvec_t tv5; 77 struct tvec tv5;
78} ____cacheline_aligned; 78} ____cacheline_aligned;
79 79
80typedef struct tvec_t_base_s tvec_base_t; 80struct tvec_base boot_tvec_bases;
81
82tvec_base_t boot_tvec_bases;
83EXPORT_SYMBOL(boot_tvec_bases); 81EXPORT_SYMBOL(boot_tvec_bases);
84static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 82static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
85 83
86/* 84/*
87 * Note that all tvec_bases is 2 byte aligned and lower bit of 85 * Note that all tvec_bases are 2 byte aligned and lower bit of
88 * base in timer_list is guaranteed to be zero. Use the LSB for 86 * base in timer_list is guaranteed to be zero. Use the LSB for
89 * the new flag to indicate whether the timer is deferrable 87 * the new flag to indicate whether the timer is deferrable
90 */ 88 */
91#define TBASE_DEFERRABLE_FLAG (0x1) 89#define TBASE_DEFERRABLE_FLAG (0x1)
92 90
93/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
94static inline unsigned int tbase_get_deferrable(tvec_base_t *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
95{ 93{
96 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 94 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
97} 95}
98 96
99static inline tvec_base_t *tbase_get_base(tvec_base_t *base) 97static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
100{ 98{
101 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 99 return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
102} 100}
103 101
104static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
105{ 103{
106 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 104 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
107 TBASE_DEFERRABLE_FLAG)); 105 TBASE_DEFERRABLE_FLAG));
108} 106}
109 107
110static inline void 108static inline void
111timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 109timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
112{ 110{
113 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 111 timer->base = (struct tvec_base *)((unsigned long)(new_base) |
114 tbase_get_deferrable(timer->base)); 112 tbase_get_deferrable(timer->base));
115} 113}
116 114
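The hunk above is a pure tvec_base_t -> struct tvec_base rename, but it keeps the existing trick of storing the deferrable flag in the otherwise-unused low bit of the 2-byte-aligned base pointer. A self-contained userspace sketch of that pointer-tagging pattern (stand-in struct, not the real tvec_base):

#include <assert.h>
#include <stdio.h>

#define DEFERRABLE_FLAG 0x1UL

/* Any alignment of 2 or more guarantees bit 0 of the address is free. */
struct base {
	int dummy;
} __attribute__((aligned(2)));

static unsigned long get_deferrable(const struct base *tagged)
{
	return (unsigned long)tagged & DEFERRABLE_FLAG;
}

static struct base *get_base(const struct base *tagged)
{
	return (struct base *)((unsigned long)tagged & ~DEFERRABLE_FLAG);
}

static struct base *set_deferrable(struct base *base)
{
	return (struct base *)((unsigned long)base | DEFERRABLE_FLAG);
}

int main(void)
{
	static struct base b;
	struct base *tagged = set_deferrable(&b);

	assert(get_base(tagged) == &b);
	printf("deferrable flag = %lu\n", get_deferrable(tagged));
	return 0;
}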
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j)
246EXPORT_SYMBOL_GPL(round_jiffies_relative); 244EXPORT_SYMBOL_GPL(round_jiffies_relative);
247 245
248 246
249static inline void set_running_timer(tvec_base_t *base, 247static inline void set_running_timer(struct tvec_base *base,
250 struct timer_list *timer) 248 struct timer_list *timer)
251{ 249{
252#ifdef CONFIG_SMP 250#ifdef CONFIG_SMP
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base,
254#endif 252#endif
255} 253}
256 254
257static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) 255static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
258{ 256{
259 unsigned long expires = timer->expires; 257 unsigned long expires = timer->expires;
260 unsigned long idx = expires - base->timer_jiffies; 258 unsigned long idx = expires - base->timer_jiffies;
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer,
371 * possible to set timer->base = NULL and drop the lock: the timer remains 369 * possible to set timer->base = NULL and drop the lock: the timer remains
372 * locked. 370 * locked.
373 */ 371 */
374static tvec_base_t *lock_timer_base(struct timer_list *timer, 372static struct tvec_base *lock_timer_base(struct timer_list *timer,
375 unsigned long *flags) 373 unsigned long *flags)
376 __acquires(timer->base->lock) 374 __acquires(timer->base->lock)
377{ 375{
378 tvec_base_t *base; 376 struct tvec_base *base;
379 377
380 for (;;) { 378 for (;;) {
381 tvec_base_t *prelock_base = timer->base; 379 struct tvec_base *prelock_base = timer->base;
382 base = tbase_get_base(prelock_base); 380 base = tbase_get_base(prelock_base);
383 if (likely(base != NULL)) { 381 if (likely(base != NULL)) {
384 spin_lock_irqsave(&base->lock, *flags); 382 spin_lock_irqsave(&base->lock, *flags);
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
393 391
394int __mod_timer(struct timer_list *timer, unsigned long expires) 392int __mod_timer(struct timer_list *timer, unsigned long expires)
395{ 393{
396 tvec_base_t *base, *new_base; 394 struct tvec_base *base, *new_base;
397 unsigned long flags; 395 unsigned long flags;
398 int ret = 0; 396 int ret = 0;
399 397
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer);
445 */ 443 */
446void add_timer_on(struct timer_list *timer, int cpu) 444void add_timer_on(struct timer_list *timer, int cpu)
447{ 445{
448 tvec_base_t *base = per_cpu(tvec_bases, cpu); 446 struct tvec_base *base = per_cpu(tvec_bases, cpu);
449 unsigned long flags; 447 unsigned long flags;
450 448
451 timer_stats_timer_set_start_info(timer); 449 timer_stats_timer_set_start_info(timer);
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer);
508 */ 506 */
509int del_timer(struct timer_list *timer) 507int del_timer(struct timer_list *timer)
510{ 508{
511 tvec_base_t *base; 509 struct tvec_base *base;
512 unsigned long flags; 510 unsigned long flags;
513 int ret = 0; 511 int ret = 0;
514 512
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer);
539 */ 537 */
540int try_to_del_timer_sync(struct timer_list *timer) 538int try_to_del_timer_sync(struct timer_list *timer)
541{ 539{
542 tvec_base_t *base; 540 struct tvec_base *base;
543 unsigned long flags; 541 unsigned long flags;
544 int ret = -1; 542 int ret = -1;
545 543
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer)
591EXPORT_SYMBOL(del_timer_sync); 589EXPORT_SYMBOL(del_timer_sync);
592#endif 590#endif
593 591
594static int cascade(tvec_base_t *base, tvec_t *tv, int index) 592static int cascade(struct tvec_base *base, struct tvec *tv, int index)
595{ 593{
596 /* cascade all the timers from tv up one level */ 594 /* cascade all the timers from tv up one level */
597 struct timer_list *timer, *tmp; 595 struct timer_list *timer, *tmp;
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
620 * This function cascades all vectors and executes all expired timer 618 * This function cascades all vectors and executes all expired timer
621 * vectors. 619 * vectors.
622 */ 620 */
623static inline void __run_timers(tvec_base_t *base) 621static inline void __run_timers(struct tvec_base *base)
624{ 622{
625 struct timer_list *timer; 623 struct timer_list *timer;
626 624
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base)
657 int preempt_count = preempt_count(); 655 int preempt_count = preempt_count();
658 fn(data); 656 fn(data);
659 if (preempt_count != preempt_count()) { 657 if (preempt_count != preempt_count()) {
660 printk(KERN_WARNING "huh, entered %p " 658 printk(KERN_ERR "huh, entered %p "
661 "with preempt_count %08x, exited" 659 "with preempt_count %08x, exited"
662 " with %08x?\n", 660 " with %08x?\n",
663 fn, preempt_count, 661 fn, preempt_count,
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base)
678 * is used on S/390 to stop all activity when a cpu is idle. 676
679 * This function needs to be called with interrupts disabled. 677
680 */ 678 */
681static unsigned long __next_timer_interrupt(tvec_base_t *base) 679static unsigned long __next_timer_interrupt(struct tvec_base *base)
682{ 680{
683 unsigned long timer_jiffies = base->timer_jiffies; 681 unsigned long timer_jiffies = base->timer_jiffies;
684 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; 682 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
685 int index, slot, array, found = 0; 683 int index, slot, array, found = 0;
686 struct timer_list *nte; 684 struct timer_list *nte;
687 tvec_t *varray[4]; 685 struct tvec *varray[4];
688 686
689 /* Look for timer events in tv1. */ 687 /* Look for timer events in tv1. */
690 index = slot = timer_jiffies & TVR_MASK; 688 index = slot = timer_jiffies & TVR_MASK;
@@ -716,7 +714,7 @@ cascade:
716 varray[3] = &base->tv5; 714 varray[3] = &base->tv5;
717 715
718 for (array = 0; array < 4; array++) { 716 for (array = 0; array < 4; array++) {
719 tvec_t *varp = varray[array]; 717 struct tvec *varp = varray[array];
720 718
721 index = slot = timer_jiffies & TVN_MASK; 719 index = slot = timer_jiffies & TVN_MASK;
722 do { 720 do {
@@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
795 */ 793 */
796unsigned long get_next_timer_interrupt(unsigned long now) 794unsigned long get_next_timer_interrupt(unsigned long now)
797{ 795{
798 tvec_base_t *base = __get_cpu_var(tvec_bases); 796 struct tvec_base *base = __get_cpu_var(tvec_bases);
799 unsigned long expires; 797 unsigned long expires;
800 798
801 spin_lock(&base->lock); 799 spin_lock(&base->lock);
@@ -894,9 +892,9 @@ static inline void calc_load(unsigned long ticks)
894 */ 892 */
895static void run_timer_softirq(struct softirq_action *h) 893static void run_timer_softirq(struct softirq_action *h)
896{ 894{
897 tvec_base_t *base = __get_cpu_var(tvec_bases); 895 struct tvec_base *base = __get_cpu_var(tvec_bases);
898 896
899 hrtimer_run_queues(); 897 hrtimer_run_pending();
900 898
901 if (time_after_eq(jiffies, base->timer_jiffies)) 899 if (time_after_eq(jiffies, base->timer_jiffies))
902 __run_timers(base); 900 __run_timers(base);
@@ -907,6 +905,7 @@ static void run_timer_softirq(struct softirq_action *h)
907 */ 905 */
908void run_local_timers(void) 906void run_local_timers(void)
909{ 907{
908 hrtimer_run_queues();
910 raise_softirq(TIMER_SOFTIRQ); 909 raise_softirq(TIMER_SOFTIRQ);
911 softlockup_tick(); 910 softlockup_tick();
912} 911}
@@ -1222,7 +1221,7 @@ static struct lock_class_key base_lock_keys[NR_CPUS];
1222static int __cpuinit init_timers_cpu(int cpu) 1221static int __cpuinit init_timers_cpu(int cpu)
1223{ 1222{
1224 int j; 1223 int j;
1225 tvec_base_t *base; 1224 struct tvec_base *base;
1226 static char __cpuinitdata tvec_base_done[NR_CPUS]; 1225 static char __cpuinitdata tvec_base_done[NR_CPUS];
1227 1226
1228 if (!tvec_base_done[cpu]) { 1227 if (!tvec_base_done[cpu]) {
@@ -1277,7 +1276,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1277} 1276}
1278 1277
1279#ifdef CONFIG_HOTPLUG_CPU 1278#ifdef CONFIG_HOTPLUG_CPU
1280static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1279static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1281{ 1280{
1282 struct timer_list *timer; 1281 struct timer_list *timer;
1283 1282
@@ -1291,8 +1290,8 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1291 1290
1292static void __cpuinit migrate_timers(int cpu) 1291static void __cpuinit migrate_timers(int cpu)
1293{ 1292{
1294 tvec_base_t *old_base; 1293 struct tvec_base *old_base;
1295 tvec_base_t *new_base; 1294 struct tvec_base *new_base;
1296 int i; 1295 int i;
1297 1296
1298 BUG_ON(cpu_online(cpu)); 1297 BUG_ON(cpu_online(cpu));
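Most of the timer.c changes above are the struct rename, but the __next_timer_interrupt()/cascade() hunks also show the shape of the timer wheel: a timer is indexed into tv1 by the low bits of its expiry and into tv2..tv5 by successively higher bit groups. A sketch of that bucket selection, assuming the common TVR_BITS=8 / TVN_BITS=6 geometry (those defines are not visible in this diff):

#include <stdio.h>

/* Assumed wheel geometry; the real values are Kconfig-dependent. */
#define TVR_BITS 8
#define TVN_BITS 6
#define TVR_SIZE (1UL << TVR_BITS)
#define TVR_MASK (TVR_SIZE - 1)
#define TVN_MASK ((1UL << TVN_BITS) - 1)

/* Print which vector and slot a timer expiring at 'expires' would land in,
 * mirroring the index arithmetic of internal_add_timer() and
 * __next_timer_interrupt(). */
static void classify(unsigned long timer_jiffies, unsigned long expires)
{
	unsigned long idx = expires - timer_jiffies;

	if (idx < TVR_SIZE)
		printf("tv1 slot %lu\n", expires & TVR_MASK);
	else if (idx < 1UL << (TVR_BITS + TVN_BITS))
		printf("tv2 slot %lu\n", (expires >> TVR_BITS) & TVN_MASK);
	else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS))
		printf("tv3 slot %lu\n",
		       (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK);
	else if (idx < 1UL << (TVR_BITS + 3 * TVN_BITS))
		printf("tv4 slot %lu\n",
		       (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK);
	else
		printf("tv5 slot %lu\n",
		       (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK);
}

int main(void)
{
	classify(1000, 1010);		/* short delay: first-level vector */
	classify(1000, 1000 + 100000);	/* longer delay: cascaded vector */
	return 0;
}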
diff --git a/kernel/user.c b/kernel/user.c
index 8320a87f3e5a..bc1c48d35cb3 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { }
115 115
116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) 116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
117 117
118static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ 118static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
119static DEFINE_MUTEX(uids_mutex); 119static DEFINE_MUTEX(uids_mutex);
120 120
121static inline void uids_mutex_lock(void) 121static inline void uids_mutex_lock(void)
@@ -128,86 +128,83 @@ static inline void uids_mutex_unlock(void)
128 mutex_unlock(&uids_mutex); 128 mutex_unlock(&uids_mutex);
129} 129}
130 130
131/* return cpu shares held by the user */ 131/* uid directory attributes */
132static ssize_t cpu_shares_show(struct kset *kset, char *buffer) 132static ssize_t cpu_shares_show(struct kobject *kobj,
133 struct kobj_attribute *attr,
134 char *buf)
133{ 135{
134 struct user_struct *up = container_of(kset, struct user_struct, kset); 136 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
135 137
136 return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); 138 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
137} 139}
138 140
139/* modify cpu shares held by the user */ 141static ssize_t cpu_shares_store(struct kobject *kobj,
140static ssize_t cpu_shares_store(struct kset *kset, const char *buffer, 142 struct kobj_attribute *attr,
141 size_t size) 143 const char *buf, size_t size)
142{ 144{
143 struct user_struct *up = container_of(kset, struct user_struct, kset); 145 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144 unsigned long shares; 146 unsigned long shares;
145 int rc; 147 int rc;
146 148
147 sscanf(buffer, "%lu", &shares); 149 sscanf(buf, "%lu", &shares);
148 150
149 rc = sched_group_set_shares(up->tg, shares); 151 rc = sched_group_set_shares(up->tg, shares);
150 152
151 return (rc ? rc : size); 153 return (rc ? rc : size);
152} 154}
153 155
154static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) 156static struct kobj_attribute cpu_share_attr =
157 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
158
159/* default attributes per uid directory */
160static struct attribute *uids_attributes[] = {
161 &cpu_share_attr.attr,
162 NULL
163};
164
165/* the lifetime of user_struct is not managed by the core (now) */
166static void uids_release(struct kobject *kobj)
155{ 167{
156 sa->attr.name = name; 168 return;
157 sa->attr.mode = mode;
158 sa->show = cpu_shares_show;
159 sa->store = cpu_shares_store;
160} 169}
161 170
162/* Create "/sys/kernel/uids/<uid>" directory and 171static struct kobj_type uids_ktype = {
163 * "/sys/kernel/uids/<uid>/cpu_share" file for this user. 172 .sysfs_ops = &kobj_sysfs_ops,
164 */ 173 .default_attrs = uids_attributes,
165static int user_kobject_create(struct user_struct *up) 174 .release = uids_release,
175};
176
177/* create /sys/kernel/uids/<uid>/cpu_share file for this user */
178static int uids_user_create(struct user_struct *up)
166{ 179{
167 struct kset *kset = &up->kset; 180 struct kobject *kobj = &up->kobj;
168 struct kobject *kobj = &kset->kobj;
169 int error; 181 int error;
170 182
171 memset(kset, 0, sizeof(struct kset)); 183 memset(kobj, 0, sizeof(struct kobject));
172 kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ 184 kobj->kset = uids_kset;
173 kobject_set_name(kobj, "%d", up->uid); 185 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
174 kset_init(kset); 186 if (error) {
175 user_attr_init(&up->user_attr, "cpu_share", 0644); 187 kobject_put(kobj);
176
177 error = kobject_add(kobj);
178 if (error)
179 goto done; 188 goto done;
180 189 }
181 error = sysfs_create_file(kobj, &up->user_attr.attr);
182 if (error)
183 kobject_del(kobj);
184 190
185 kobject_uevent(kobj, KOBJ_ADD); 191 kobject_uevent(kobj, KOBJ_ADD);
186
187done: 192done:
188 return error; 193 return error;
189} 194}
190 195
191/* create these in sysfs filesystem: 196/* create these entries in sysfs:
192 * "/sys/kernel/uids" directory 197 * "/sys/kernel/uids" directory
193 * "/sys/kernel/uids/0" directory (for root user) 198 * "/sys/kernel/uids/0" directory (for root user)
194 * "/sys/kernel/uids/0/cpu_share" file (for root user) 199 * "/sys/kernel/uids/0/cpu_share" file (for root user)
195 */ 200 */
196int __init uids_kobject_init(void) 201int __init uids_sysfs_init(void)
197{ 202{
198 int error; 203 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
199 204 if (!uids_kset)
200 /* create under /sys/kernel dir */ 205 return -ENOMEM;
201 uids_kobject.parent = &kernel_subsys.kobj;
202 uids_kobject.kset = &kernel_subsys;
203 kobject_set_name(&uids_kobject, "uids");
204 kobject_init(&uids_kobject);
205 206
206 error = kobject_add(&uids_kobject); 207 return uids_user_create(&root_user);
207 if (!error)
208 error = user_kobject_create(&root_user);
209
210 return error;
211} 208}
212 209
213/* work function to remove sysfs directory for a user and free up 210/* work function to remove sysfs directory for a user and free up
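The conversion above moves user.c from the old per-kset subsys_attribute scheme to kobj_attribute show/store callbacks registered under a kset. A hypothetical out-of-tree module using the same kobj_attribute/__ATTR pattern (names here are illustrative, and kobject_create_and_add() is used instead of the embedded-kobject path the patch itself takes):

#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/sysfs.h>

static unsigned long example_value;

static ssize_t value_show(struct kobject *kobj, struct kobj_attribute *attr,
			  char *buf)
{
	return sprintf(buf, "%lu\n", example_value);
}

static ssize_t value_store(struct kobject *kobj, struct kobj_attribute *attr,
			   const char *buf, size_t count)
{
	sscanf(buf, "%lu", &example_value);
	return count;
}

static struct kobj_attribute value_attr =
	__ATTR(value, 0644, value_show, value_store);

static struct kobject *example_kobj;

static int __init example_init(void)
{
	int error;

	/* creates /sys/kernel/kobj_example/value */
	example_kobj = kobject_create_and_add("kobj_example", kernel_kobj);
	if (!example_kobj)
		return -ENOMEM;
	error = sysfs_create_file(example_kobj, &value_attr.attr);
	if (error)
		kobject_put(example_kobj);
	return error;
}

static void __exit example_exit(void)
{
	kobject_put(example_kobj);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");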
@@ -216,7 +213,6 @@ int __init uids_kobject_init(void)
216static void remove_user_sysfs_dir(struct work_struct *w) 213static void remove_user_sysfs_dir(struct work_struct *w)
217{ 214{
218 struct user_struct *up = container_of(w, struct user_struct, work); 215 struct user_struct *up = container_of(w, struct user_struct, work);
219 struct kobject *kobj = &up->kset.kobj;
220 unsigned long flags; 216 unsigned long flags;
221 int remove_user = 0; 217 int remove_user = 0;
222 218
@@ -238,9 +234,9 @@ static void remove_user_sysfs_dir(struct work_struct *w)
238 if (!remove_user) 234 if (!remove_user)
239 goto done; 235 goto done;
240 236
241 sysfs_remove_file(kobj, &up->user_attr.attr); 237 kobject_uevent(&up->kobj, KOBJ_REMOVE);
242 kobject_uevent(kobj, KOBJ_REMOVE); 238 kobject_del(&up->kobj);
243 kobject_del(kobj); 239 kobject_put(&up->kobj);
244 240
245 sched_destroy_user(up); 241 sched_destroy_user(up);
246 key_put(up->uid_keyring); 242 key_put(up->uid_keyring);
@@ -267,7 +263,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
267 263
268#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ 264#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
269 265
270static inline int user_kobject_create(struct user_struct *up) { return 0; } 266int uids_sysfs_init(void) { return 0; }
267static inline int uids_user_create(struct user_struct *up) { return 0; }
271static inline void uids_mutex_lock(void) { } 268static inline void uids_mutex_lock(void) { }
272static inline void uids_mutex_unlock(void) { } 269static inline void uids_mutex_unlock(void) { }
273 270
@@ -322,9 +319,9 @@ void free_uid(struct user_struct *up)
322struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 319struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
323{ 320{
324 struct hlist_head *hashent = uidhashentry(ns, uid); 321 struct hlist_head *hashent = uidhashentry(ns, uid);
325 struct user_struct *up; 322 struct user_struct *up, *new;
326 323
327 /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() 324 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
328 * atomic. 325 * atomic.
329 */ 326 */
330 uids_mutex_lock(); 327 uids_mutex_lock();
@@ -334,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
334 spin_unlock_irq(&uidhash_lock); 331 spin_unlock_irq(&uidhash_lock);
335 332
336 if (!up) { 333 if (!up) {
337 struct user_struct *new;
338
339 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); 334 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
340 if (!new) { 335 if (!new)
341 uids_mutex_unlock(); 336 goto out_unlock;
342 return NULL;
343 }
344 337
345 new->uid = uid; 338 new->uid = uid;
346 atomic_set(&new->__count, 1); 339 atomic_set(&new->__count, 1);
@@ -356,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
356#endif 349#endif
357 new->locked_shm = 0; 350 new->locked_shm = 0;
358 351
359 if (alloc_uid_keyring(new, current) < 0) { 352 if (alloc_uid_keyring(new, current) < 0)
360 kmem_cache_free(uid_cachep, new); 353 goto out_free_user;
361 uids_mutex_unlock();
362 return NULL;
363 }
364 354
365 if (sched_create_user(new) < 0) { 355 if (sched_create_user(new) < 0)
366 key_put(new->uid_keyring); 356 goto out_put_keys;
367 key_put(new->session_keyring);
368 kmem_cache_free(uid_cachep, new);
369 uids_mutex_unlock();
370 return NULL;
371 }
372 357
373 if (user_kobject_create(new)) { 358 if (uids_user_create(new))
 374 if (user_kobject_create(new)) { 358 if (uids_user_create(new))
 374 sched_destroy_user(new); 359 goto out_destroy_sched;
375 key_put(new->uid_keyring);
376 key_put(new->session_keyring);
377 kmem_cache_free(uid_cachep, new);
378 uids_mutex_unlock();
379 return NULL;
380 }
381 360
382 /* 361 /*
383 * Before adding this, check whether we raced 362 * Before adding this, check whether we raced
@@ -405,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
405 uids_mutex_unlock(); 384 uids_mutex_unlock();
406 385
407 return up; 386 return up;
387
 388out_destroy_sched:
389 sched_destroy_user(new);
390out_put_keys:
391 key_put(new->uid_keyring);
392 key_put(new->session_keyring);
393out_free_user:
394 kmem_cache_free(uid_cachep, new);
395out_unlock:
396 uids_mutex_unlock();
397 return NULL;
408} 398}
409 399
410void switch_uid(struct user_struct *new_user) 400void switch_uid(struct user_struct *new_user)
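alloc_uid() above also gains the conventional single-exit error path, where each failure jumps to a label that unwinds only what was already set up. A generic userspace sketch of that idiom (hypothetical resources, nothing user.c-specific):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct widget {
	char *name;
	char *buf;
	FILE *log;
};

/* Acquire resources in order; on failure, fall through labels that undo
 * them in reverse order, as the reworked alloc_uid() does. */
static struct widget *widget_create(const char *name)
{
	struct widget *w = malloc(sizeof(*w));

	if (!w)
		goto out;
	w->name = strdup(name);
	if (!w->name)
		goto out_free_widget;
	w->buf = malloc(4096);
	if (!w->buf)
		goto out_free_name;
	w->log = fopen("/dev/null", "w");
	if (!w->log)
		goto out_free_buf;
	return w;

out_free_buf:
	free(w->buf);
out_free_name:
	free(w->name);
out_free_widget:
	free(w);
out:
	return NULL;
}

static void widget_destroy(struct widget *w)
{
	if (!w)
		return;
	fclose(w->log);
	free(w->buf);
	free(w->name);
	free(w);
}

int main(void)
{
	struct widget *w = widget_create("example");

	printf("widget %screated\n", w ? "" : "not ");
	widget_destroy(w);
	return 0;
}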
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8db0b597509e..52db48e7f6e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -67,9 +67,8 @@ struct workqueue_struct {
67#endif 67#endif
68}; 68};
69 69
70/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 70/* Serializes the accesses to the list of workqueues. */
71 threads to each one as cpus come/go. */ 71static DEFINE_SPINLOCK(workqueue_lock);
72static DEFINE_MUTEX(workqueue_mutex);
73static LIST_HEAD(workqueues); 72static LIST_HEAD(workqueues);
74 73
75static int singlethread_cpu __read_mostly; 74static int singlethread_cpu __read_mostly;
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
592 * Returns zero on success. 591 * Returns zero on success.
593 * Returns -ve errno on failure. 592 * Returns -ve errno on failure.
594 * 593 *
595 * Appears to be racy against CPU hotplug.
596 *
597 * schedule_on_each_cpu() is very slow. 594 * schedule_on_each_cpu() is very slow.
598 */ 595 */
599int schedule_on_each_cpu(work_func_t func) 596int schedule_on_each_cpu(work_func_t func)
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
605 if (!works) 602 if (!works)
606 return -ENOMEM; 603 return -ENOMEM;
607 604
608 preempt_disable(); /* CPU hotplug */ 605 get_online_cpus();
609 for_each_online_cpu(cpu) { 606 for_each_online_cpu(cpu) {
610 struct work_struct *work = per_cpu_ptr(works, cpu); 607 struct work_struct *work = per_cpu_ptr(works, cpu);
611 608
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
613 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 610 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
614 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); 611 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
615 } 612 }
616 preempt_enable();
617 flush_workqueue(keventd_wq); 613 flush_workqueue(keventd_wq);
614 put_online_cpus();
618 free_percpu(works); 615 free_percpu(works);
619 return 0; 616 return 0;
620} 617}
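schedule_on_each_cpu() above now brackets its online-CPU walk with get_online_cpus()/put_online_cpus() instead of disabling preemption, so the set of online CPUs cannot change underneath it. A hypothetical out-of-tree module using the same bracket (illustrative only, not part of the patch):

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int __init online_walk_init(void)
{
	int cpu;

	/* Hold off CPU hotplug while we inspect the online map. */
	get_online_cpus();
	for_each_online_cpu(cpu)
		printk(KERN_INFO "cpu %d is online\n", cpu);
	put_online_cpus();
	return 0;
}

static void __exit online_walk_exit(void)
{
}

module_init(online_walk_init);
module_exit(online_walk_exit);
MODULE_LICENSE("GPL");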
@@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
750 err = create_workqueue_thread(cwq, singlethread_cpu); 747 err = create_workqueue_thread(cwq, singlethread_cpu);
751 start_workqueue_thread(cwq, -1); 748 start_workqueue_thread(cwq, -1);
752 } else { 749 } else {
753 mutex_lock(&workqueue_mutex); 750 get_online_cpus();
751 spin_lock(&workqueue_lock);
754 list_add(&wq->list, &workqueues); 752 list_add(&wq->list, &workqueues);
753 spin_unlock(&workqueue_lock);
755 754
756 for_each_possible_cpu(cpu) { 755 for_each_possible_cpu(cpu) {
757 cwq = init_cpu_workqueue(wq, cpu); 756 cwq = init_cpu_workqueue(wq, cpu);
@@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
760 err = create_workqueue_thread(cwq, cpu); 759 err = create_workqueue_thread(cwq, cpu);
761 start_workqueue_thread(cwq, cpu); 760 start_workqueue_thread(cwq, cpu);
762 } 761 }
763 mutex_unlock(&workqueue_mutex); 762 put_online_cpus();
764 } 763 }
765 764
766 if (err) { 765 if (err) {
@@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
775{ 774{
776 /* 775 /*
777 * Our caller is either destroy_workqueue() or CPU_DEAD, 776 * Our caller is either destroy_workqueue() or CPU_DEAD,
778 * workqueue_mutex protects cwq->thread 777 * get_online_cpus() protects cwq->thread.
779 */ 778 */
780 if (cwq->thread == NULL) 779 if (cwq->thread == NULL)
781 return; 780 return;
@@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
810 struct cpu_workqueue_struct *cwq; 809 struct cpu_workqueue_struct *cwq;
811 int cpu; 810 int cpu;
812 811
813 mutex_lock(&workqueue_mutex); 812 get_online_cpus();
813 spin_lock(&workqueue_lock);
814 list_del(&wq->list); 814 list_del(&wq->list);
815 mutex_unlock(&workqueue_mutex); 815 spin_unlock(&workqueue_lock);
816 put_online_cpus();
816 817
817 for_each_cpu_mask(cpu, *cpu_map) { 818 for_each_cpu_mask(cpu, *cpu_map) {
818 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 819 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
835 action &= ~CPU_TASKS_FROZEN; 836 action &= ~CPU_TASKS_FROZEN;
836 837
837 switch (action) { 838 switch (action) {
838 case CPU_LOCK_ACQUIRE:
839 mutex_lock(&workqueue_mutex);
840 return NOTIFY_OK;
841
842 case CPU_LOCK_RELEASE:
843 mutex_unlock(&workqueue_mutex);
844 return NOTIFY_OK;
845 839
846 case CPU_UP_PREPARE: 840 case CPU_UP_PREPARE:
847 cpu_set(cpu, cpu_populated_map); 841 cpu_set(cpu, cpu_populated_map);
@@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
854 case CPU_UP_PREPARE: 848 case CPU_UP_PREPARE:
855 if (!create_workqueue_thread(cwq, cpu)) 849 if (!create_workqueue_thread(cwq, cpu))
856 break; 850 break;
857 printk(KERN_ERR "workqueue for %i failed\n", cpu); 851 printk(KERN_ERR "workqueue [%s] for %i failed\n",
852 wq->name, cpu);
858 return NOTIFY_BAD; 853 return NOTIFY_BAD;
859 854
860 case CPU_ONLINE: 855 case CPU_ONLINE: