diff options
Diffstat (limited to 'kernel')
65 files changed, 6558 insertions, 1937 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 4af15802ccd4..526128a2e622 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz | |||
@@ -54,3 +54,5 @@ config HZ | |||
54 | default 300 if HZ_300 | 54 | default 300 if HZ_300 |
55 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
56 | 56 | ||
57 | config SCHED_HRTICK | ||
58 | def_bool HIGH_RES_TIMERS && X86 | ||
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c64ce9c14207..0669b70fa6a3 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt | |||
@@ -52,14 +52,13 @@ config PREEMPT | |||
52 | 52 | ||
53 | endchoice | 53 | endchoice |
54 | 54 | ||
55 | config PREEMPT_BKL | 55 | config RCU_TRACE |
56 | bool "Preempt The Big Kernel Lock" | 56 | bool "Enable tracing for RCU - currently stats in debugfs" |
57 | depends on SMP || PREEMPT | 57 | select DEBUG_FS |
58 | default y | 58 | default y |
59 | help | 59 | help |
60 | This option reduces the latency of the kernel by making the | 60 | This option provides tracing in RCU which presents stats |
61 | big kernel lock preemptible. | 61 | in debugfs for debugging RCU implementation. |
62 | 62 | ||
63 | Say Y here if you are building a kernel for a desktop system. | 63 | Say Y here if you want to enable RCU tracing |
64 | Say N if you are unsure. | 64 | Say N if you are unsure. |
65 | |||
diff --git a/kernel/Makefile b/kernel/Makefile index dfa96956dae0..8885627ea021 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o | |||
36 | obj-$(CONFIG_PM) += power/ | 36 | obj-$(CONFIG_PM) += power/ |
37 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 37 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
38 | obj-$(CONFIG_KEXEC) += kexec.o | 38 | obj-$(CONFIG_KEXEC) += kexec.o |
39 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o | ||
39 | obj-$(CONFIG_COMPAT) += compat.o | 40 | obj-$(CONFIG_COMPAT) += compat.o |
40 | obj-$(CONFIG_CGROUPS) += cgroup.o | 41 | obj-$(CONFIG_CGROUPS) += cgroup.o |
41 | obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o | 42 | obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o |
@@ -43,6 +44,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o | |||
43 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o | 44 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o |
44 | obj-$(CONFIG_IKCONFIG) += configs.o | 45 | obj-$(CONFIG_IKCONFIG) += configs.o |
45 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 46 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
47 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | ||
46 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 48 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
47 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 49 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
48 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | 50 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o |
@@ -52,11 +54,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |||
52 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 54 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
53 | obj-$(CONFIG_SECCOMP) += seccomp.o | 55 | obj-$(CONFIG_SECCOMP) += seccomp.o |
54 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 56 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
57 | obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o | ||
58 | obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o | ||
59 | ifeq ($(CONFIG_PREEMPT_RCU),y) | ||
60 | obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o | ||
61 | endif | ||
55 | obj-$(CONFIG_RELAY) += relay.o | 62 | obj-$(CONFIG_RELAY) += relay.o |
56 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 63 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
57 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 64 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
58 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 65 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
59 | obj-$(CONFIG_MARKERS) += marker.o | 66 | obj-$(CONFIG_MARKERS) += marker.o |
67 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | ||
60 | 68 | ||
61 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 69 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
62 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 70 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/acct.c b/kernel/acct.c index cf19547cc9e4..521dfa53cb99 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -482,7 +482,7 @@ static void do_acct_process(struct file *file) | |||
482 | #endif | 482 | #endif |
483 | #if ACCT_VERSION==3 | 483 | #if ACCT_VERSION==3 |
484 | ac.ac_pid = current->tgid; | 484 | ac.ac_pid = current->tgid; |
485 | ac.ac_ppid = current->parent->tgid; | 485 | ac.ac_ppid = current->real_parent->tgid; |
486 | #endif | 486 | #endif |
487 | 487 | ||
488 | spin_lock_irq(¤t->sighand->siglock); | 488 | spin_lock_irq(¤t->sighand->siglock); |
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c new file mode 100644 index 000000000000..d1a7605c5b8f --- /dev/null +++ b/kernel/backtracetest.c | |||
@@ -0,0 +1,48 @@ | |||
1 | /* | ||
2 | * Simple stack backtrace regression test module | ||
3 | * | ||
4 | * (C) Copyright 2008 Intel Corporation | ||
5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/delay.h> | ||
16 | |||
17 | static struct timer_list backtrace_timer; | ||
18 | |||
19 | static void backtrace_test_timer(unsigned long data) | ||
20 | { | ||
21 | printk("Testing a backtrace from irq context.\n"); | ||
22 | printk("The following trace is a kernel self test and not a bug!\n"); | ||
23 | dump_stack(); | ||
24 | } | ||
25 | static int backtrace_regression_test(void) | ||
26 | { | ||
27 | printk("====[ backtrace testing ]===========\n"); | ||
28 | printk("Testing a backtrace from process context.\n"); | ||
29 | printk("The following trace is a kernel self test and not a bug!\n"); | ||
30 | dump_stack(); | ||
31 | |||
32 | init_timer(&backtrace_timer); | ||
33 | backtrace_timer.function = backtrace_test_timer; | ||
34 | mod_timer(&backtrace_timer, jiffies + 10); | ||
35 | |||
36 | msleep(10); | ||
37 | printk("====[ end of backtrace testing ]====\n"); | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | static void exitf(void) | ||
42 | { | ||
43 | } | ||
44 | |||
45 | module_init(backtrace_regression_test); | ||
46 | module_exit(exitf); | ||
47 | MODULE_LICENSE("GPL"); | ||
48 | MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index 6b3a0c15144f..e0d3a4f56ecb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -15,9 +15,8 @@ | |||
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* Serializes the updates to cpu_online_map, cpu_present_map */ |
19 | static DEFINE_MUTEX(cpu_add_remove_lock); | 19 | static DEFINE_MUTEX(cpu_add_remove_lock); |
20 | static DEFINE_MUTEX(cpu_bitmask_lock); | ||
21 | 20 | ||
22 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); |
23 | 22 | ||
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | |||
26 | */ | 25 | */ |
27 | static int cpu_hotplug_disabled; | 26 | static int cpu_hotplug_disabled; |
28 | 27 | ||
29 | #ifdef CONFIG_HOTPLUG_CPU | 28 | static struct { |
29 | struct task_struct *active_writer; | ||
30 | struct mutex lock; /* Synchronizes accesses to refcount, */ | ||
31 | /* | ||
32 | * Also blocks the new readers during | ||
33 | * an ongoing cpu hotplug operation. | ||
34 | */ | ||
35 | int refcount; | ||
36 | wait_queue_head_t writer_queue; | ||
37 | } cpu_hotplug; | ||
30 | 38 | ||
31 | /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ | 39 | #define writer_exists() (cpu_hotplug.active_writer != NULL) |
32 | static struct task_struct *recursive; | ||
33 | static int recursive_depth; | ||
34 | 40 | ||
35 | void lock_cpu_hotplug(void) | 41 | void __init cpu_hotplug_init(void) |
36 | { | 42 | { |
37 | struct task_struct *tsk = current; | 43 | cpu_hotplug.active_writer = NULL; |
38 | 44 | mutex_init(&cpu_hotplug.lock); | |
39 | if (tsk == recursive) { | 45 | cpu_hotplug.refcount = 0; |
40 | static int warnings = 10; | 46 | init_waitqueue_head(&cpu_hotplug.writer_queue); |
41 | if (warnings) { | 47 | } |
42 | printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); | 48 | |
43 | WARN_ON(1); | 49 | #ifdef CONFIG_HOTPLUG_CPU |
44 | warnings--; | 50 | |
45 | } | 51 | void get_online_cpus(void) |
46 | recursive_depth++; | 52 | { |
53 | might_sleep(); | ||
54 | if (cpu_hotplug.active_writer == current) | ||
47 | return; | 55 | return; |
48 | } | 56 | mutex_lock(&cpu_hotplug.lock); |
49 | mutex_lock(&cpu_bitmask_lock); | 57 | cpu_hotplug.refcount++; |
50 | recursive = tsk; | 58 | mutex_unlock(&cpu_hotplug.lock); |
59 | |||
51 | } | 60 | } |
52 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); | 61 | EXPORT_SYMBOL_GPL(get_online_cpus); |
53 | 62 | ||
54 | void unlock_cpu_hotplug(void) | 63 | void put_online_cpus(void) |
55 | { | 64 | { |
56 | WARN_ON(recursive != current); | 65 | if (cpu_hotplug.active_writer == current) |
57 | if (recursive_depth) { | ||
58 | recursive_depth--; | ||
59 | return; | 66 | return; |
60 | } | 67 | mutex_lock(&cpu_hotplug.lock); |
61 | recursive = NULL; | 68 | cpu_hotplug.refcount--; |
62 | mutex_unlock(&cpu_bitmask_lock); | 69 | |
70 | if (unlikely(writer_exists()) && !cpu_hotplug.refcount) | ||
71 | wake_up(&cpu_hotplug.writer_queue); | ||
72 | |||
73 | mutex_unlock(&cpu_hotplug.lock); | ||
74 | |||
63 | } | 75 | } |
64 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 76 | EXPORT_SYMBOL_GPL(put_online_cpus); |
65 | 77 | ||
66 | #endif /* CONFIG_HOTPLUG_CPU */ | 78 | #endif /* CONFIG_HOTPLUG_CPU */ |
67 | 79 | ||
80 | /* | ||
81 | * The following two API's must be used when attempting | ||
82 | * to serialize the updates to cpu_online_map, cpu_present_map. | ||
83 | */ | ||
84 | void cpu_maps_update_begin(void) | ||
85 | { | ||
86 | mutex_lock(&cpu_add_remove_lock); | ||
87 | } | ||
88 | |||
89 | void cpu_maps_update_done(void) | ||
90 | { | ||
91 | mutex_unlock(&cpu_add_remove_lock); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * This ensures that the hotplug operation can begin only when the | ||
96 | * refcount goes to zero. | ||
97 | * | ||
98 | * Note that during a cpu-hotplug operation, the new readers, if any, | ||
99 | * will be blocked by the cpu_hotplug.lock | ||
100 | * | ||
101 | * Since cpu_maps_update_begin is always called after invoking | ||
102 | * cpu_maps_update_begin, we can be sure that only one writer is active. | ||
103 | * | ||
104 | * Note that theoretically, there is a possibility of a livelock: | ||
105 | * - Refcount goes to zero, last reader wakes up the sleeping | ||
106 | * writer. | ||
107 | * - Last reader unlocks the cpu_hotplug.lock. | ||
108 | * - A new reader arrives at this moment, bumps up the refcount. | ||
109 | * - The writer acquires the cpu_hotplug.lock finds the refcount | ||
110 | * non zero and goes to sleep again. | ||
111 | * | ||
112 | * However, this is very difficult to achieve in practice since | ||
113 | * get_online_cpus() not an api which is called all that often. | ||
114 | * | ||
115 | */ | ||
116 | static void cpu_hotplug_begin(void) | ||
117 | { | ||
118 | DECLARE_WAITQUEUE(wait, current); | ||
119 | |||
120 | mutex_lock(&cpu_hotplug.lock); | ||
121 | |||
122 | cpu_hotplug.active_writer = current; | ||
123 | add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); | ||
124 | while (cpu_hotplug.refcount) { | ||
125 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
126 | mutex_unlock(&cpu_hotplug.lock); | ||
127 | schedule(); | ||
128 | mutex_lock(&cpu_hotplug.lock); | ||
129 | } | ||
130 | remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); | ||
131 | } | ||
132 | |||
133 | static void cpu_hotplug_done(void) | ||
134 | { | ||
135 | cpu_hotplug.active_writer = NULL; | ||
136 | mutex_unlock(&cpu_hotplug.lock); | ||
137 | } | ||
68 | /* Need to know about CPUs going up/down? */ | 138 | /* Need to know about CPUs going up/down? */ |
69 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) | 139 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
70 | { | 140 | { |
71 | int ret; | 141 | int ret; |
72 | mutex_lock(&cpu_add_remove_lock); | 142 | cpu_maps_update_begin(); |
73 | ret = raw_notifier_chain_register(&cpu_chain, nb); | 143 | ret = raw_notifier_chain_register(&cpu_chain, nb); |
74 | mutex_unlock(&cpu_add_remove_lock); | 144 | cpu_maps_update_done(); |
75 | return ret; | 145 | return ret; |
76 | } | 146 | } |
77 | 147 | ||
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier); | |||
81 | 151 | ||
82 | void unregister_cpu_notifier(struct notifier_block *nb) | 152 | void unregister_cpu_notifier(struct notifier_block *nb) |
83 | { | 153 | { |
84 | mutex_lock(&cpu_add_remove_lock); | 154 | cpu_maps_update_begin(); |
85 | raw_notifier_chain_unregister(&cpu_chain, nb); | 155 | raw_notifier_chain_unregister(&cpu_chain, nb); |
86 | mutex_unlock(&cpu_add_remove_lock); | 156 | cpu_maps_update_done(); |
87 | } | 157 | } |
88 | EXPORT_SYMBOL(unregister_cpu_notifier); | 158 | EXPORT_SYMBOL(unregister_cpu_notifier); |
89 | 159 | ||
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
147 | if (!cpu_online(cpu)) | 217 | if (!cpu_online(cpu)) |
148 | return -EINVAL; | 218 | return -EINVAL; |
149 | 219 | ||
150 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 220 | cpu_hotplug_begin(); |
151 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, | 221 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, |
152 | hcpu, -1, &nr_calls); | 222 | hcpu, -1, &nr_calls); |
153 | if (err == NOTIFY_BAD) { | 223 | if (err == NOTIFY_BAD) { |
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
166 | cpu_clear(cpu, tmp); | 236 | cpu_clear(cpu, tmp); |
167 | set_cpus_allowed(current, tmp); | 237 | set_cpus_allowed(current, tmp); |
168 | 238 | ||
169 | mutex_lock(&cpu_bitmask_lock); | ||
170 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
171 | mutex_unlock(&cpu_bitmask_lock); | ||
172 | 240 | ||
173 | if (IS_ERR(p) || cpu_online(cpu)) { | 241 | if (IS_ERR(p) || cpu_online(cpu)) { |
174 | /* CPU didn't die: tell everyone. Can't complain. */ | 242 | /* CPU didn't die: tell everyone. Can't complain. */ |
@@ -202,7 +270,7 @@ out_thread: | |||
202 | out_allowed: | 270 | out_allowed: |
203 | set_cpus_allowed(current, old_allowed); | 271 | set_cpus_allowed(current, old_allowed); |
204 | out_release: | 272 | out_release: |
205 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 273 | cpu_hotplug_done(); |
206 | return err; | 274 | return err; |
207 | } | 275 | } |
208 | 276 | ||
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu) | |||
210 | { | 278 | { |
211 | int err = 0; | 279 | int err = 0; |
212 | 280 | ||
213 | mutex_lock(&cpu_add_remove_lock); | 281 | cpu_maps_update_begin(); |
214 | if (cpu_hotplug_disabled) | 282 | if (cpu_hotplug_disabled) |
215 | err = -EBUSY; | 283 | err = -EBUSY; |
216 | else | 284 | else |
217 | err = _cpu_down(cpu, 0); | 285 | err = _cpu_down(cpu, 0); |
218 | 286 | ||
219 | mutex_unlock(&cpu_add_remove_lock); | 287 | cpu_maps_update_done(); |
220 | return err; | 288 | return err; |
221 | } | 289 | } |
222 | #endif /*CONFIG_HOTPLUG_CPU*/ | 290 | #endif /*CONFIG_HOTPLUG_CPU*/ |
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
231 | if (cpu_online(cpu) || !cpu_present(cpu)) | 299 | if (cpu_online(cpu) || !cpu_present(cpu)) |
232 | return -EINVAL; | 300 | return -EINVAL; |
233 | 301 | ||
234 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 302 | cpu_hotplug_begin(); |
235 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, | 303 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, |
236 | -1, &nr_calls); | 304 | -1, &nr_calls); |
237 | if (ret == NOTIFY_BAD) { | 305 | if (ret == NOTIFY_BAD) { |
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
243 | } | 311 | } |
244 | 312 | ||
245 | /* Arch-specific enabling code. */ | 313 | /* Arch-specific enabling code. */ |
246 | mutex_lock(&cpu_bitmask_lock); | ||
247 | ret = __cpu_up(cpu); | 314 | ret = __cpu_up(cpu); |
248 | mutex_unlock(&cpu_bitmask_lock); | ||
249 | if (ret != 0) | 315 | if (ret != 0) |
250 | goto out_notify; | 316 | goto out_notify; |
251 | BUG_ON(!cpu_online(cpu)); | 317 | BUG_ON(!cpu_online(cpu)); |
@@ -257,7 +323,7 @@ out_notify: | |||
257 | if (ret != 0) | 323 | if (ret != 0) |
258 | __raw_notifier_call_chain(&cpu_chain, | 324 | __raw_notifier_call_chain(&cpu_chain, |
259 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 325 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
260 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 326 | cpu_hotplug_done(); |
261 | 327 | ||
262 | return ret; | 328 | return ret; |
263 | } | 329 | } |
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
275 | return -EINVAL; | 341 | return -EINVAL; |
276 | } | 342 | } |
277 | 343 | ||
278 | mutex_lock(&cpu_add_remove_lock); | 344 | cpu_maps_update_begin(); |
279 | if (cpu_hotplug_disabled) | 345 | if (cpu_hotplug_disabled) |
280 | err = -EBUSY; | 346 | err = -EBUSY; |
281 | else | 347 | else |
282 | err = _cpu_up(cpu, 0); | 348 | err = _cpu_up(cpu, 0); |
283 | 349 | ||
284 | mutex_unlock(&cpu_add_remove_lock); | 350 | cpu_maps_update_done(); |
285 | return err; | 351 | return err; |
286 | } | 352 | } |
287 | 353 | ||
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void) | |||
292 | { | 358 | { |
293 | int cpu, first_cpu, error = 0; | 359 | int cpu, first_cpu, error = 0; |
294 | 360 | ||
295 | mutex_lock(&cpu_add_remove_lock); | 361 | cpu_maps_update_begin(); |
296 | first_cpu = first_cpu(cpu_online_map); | 362 | first_cpu = first_cpu(cpu_online_map); |
297 | /* We take down all of the non-boot CPUs in one shot to avoid races | 363 | /* We take down all of the non-boot CPUs in one shot to avoid races |
298 | * with the userspace trying to use the CPU hotplug at the same time | 364 | * with the userspace trying to use the CPU hotplug at the same time |
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void) | |||
319 | } else { | 385 | } else { |
320 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 386 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); |
321 | } | 387 | } |
322 | mutex_unlock(&cpu_add_remove_lock); | 388 | cpu_maps_update_done(); |
323 | return error; | 389 | return error; |
324 | } | 390 | } |
325 | 391 | ||
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void) | |||
328 | int cpu, error; | 394 | int cpu, error; |
329 | 395 | ||
330 | /* Allow everyone to use the CPU hotplug again */ | 396 | /* Allow everyone to use the CPU hotplug again */ |
331 | mutex_lock(&cpu_add_remove_lock); | 397 | cpu_maps_update_begin(); |
332 | cpu_hotplug_disabled = 0; | 398 | cpu_hotplug_disabled = 0; |
333 | if (cpus_empty(frozen_cpus)) | 399 | if (cpus_empty(frozen_cpus)) |
334 | goto out; | 400 | goto out; |
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void) | |||
344 | } | 410 | } |
345 | cpus_clear(frozen_cpus); | 411 | cpus_clear(frozen_cpus); |
346 | out: | 412 | out: |
347 | mutex_unlock(&cpu_add_remove_lock); | 413 | cpu_maps_update_done(); |
348 | } | 414 | } |
349 | #endif /* CONFIG_PM_SLEEP_SMP */ | 415 | #endif /* CONFIG_PM_SLEEP_SMP */ |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 50f5dc463688..cfaf6419d817 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
537 | * | 537 | * |
538 | * Call with cgroup_mutex held. May take callback_mutex during | 538 | * Call with cgroup_mutex held. May take callback_mutex during |
539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | 539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest |
540 | * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 540 | * a call to the get_online_cpus()/put_online_cpus() pair. |
541 | * Must not be called holding callback_mutex, because we must not | 541 | * Must not be called holding callback_mutex, because we must not |
542 | * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere | 542 | * call get_online_cpus() while holding callback_mutex. Elsewhere |
543 | * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. | 543 | * the kernel nests callback_mutex inside get_online_cpus() calls. |
544 | * So the reverse nesting would risk an ABBA deadlock. | 544 | * So the reverse nesting would risk an ABBA deadlock. |
545 | * | 545 | * |
546 | * The three key local variables below are: | 546 | * The three key local variables below are: |
@@ -691,9 +691,9 @@ restart: | |||
691 | 691 | ||
692 | rebuild: | 692 | rebuild: |
693 | /* Have scheduler rebuild sched domains */ | 693 | /* Have scheduler rebuild sched domains */ |
694 | lock_cpu_hotplug(); | 694 | get_online_cpus(); |
695 | partition_sched_domains(ndoms, doms); | 695 | partition_sched_domains(ndoms, doms); |
696 | unlock_cpu_hotplug(); | 696 | put_online_cpus(); |
697 | 697 | ||
698 | done: | 698 | done: |
699 | if (q && !IS_ERR(q)) | 699 | if (q && !IS_ERR(q)) |
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1617 | * | 1617 | * |
1618 | * If the cpuset being removed has its flag 'sched_load_balance' | 1618 | * If the cpuset being removed has its flag 'sched_load_balance' |
1619 | * enabled, then simulate turning sched_load_balance off, which | 1619 | * enabled, then simulate turning sched_load_balance off, which |
1620 | * will call rebuild_sched_domains(). The lock_cpu_hotplug() | 1620 | * will call rebuild_sched_domains(). The get_online_cpus() |
1621 | * call in rebuild_sched_domains() must not be made while holding | 1621 | * call in rebuild_sched_domains() must not be made while holding |
1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | 1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside |
1623 | * lock_cpu_hotplug() calls. So the reverse nesting would risk an | 1623 | * get_online_cpus() calls. So the reverse nesting would risk an |
1624 | * ABBA deadlock. | 1624 | * ABBA deadlock. |
1625 | */ | 1625 | */ |
1626 | 1626 | ||
diff --git a/kernel/extable.c b/kernel/extable.c index 7fe262855317..a26cb2e17023 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr) | |||
46 | addr <= (unsigned long)_etext) | 46 | addr <= (unsigned long)_etext) |
47 | return 1; | 47 | return 1; |
48 | 48 | ||
49 | if (addr >= (unsigned long)_sinittext && | 49 | if (system_state == SYSTEM_BOOTING && |
50 | addr >= (unsigned long)_sinittext && | ||
50 | addr <= (unsigned long)_einittext) | 51 | addr <= (unsigned long)_einittext) |
51 | return 1; | 52 | return 1; |
52 | return 0; | 53 | return 0; |
diff --git a/kernel/fork.c b/kernel/fork.c index 8dd8ff281009..05e0b6f4365b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/random.h> | 51 | #include <linux/random.h> |
52 | #include <linux/tty.h> | 52 | #include <linux/tty.h> |
53 | #include <linux/proc_fs.h> | 53 | #include <linux/proc_fs.h> |
54 | #include <linux/blkdev.h> | ||
54 | 55 | ||
55 | #include <asm/pgtable.h> | 56 | #include <asm/pgtable.h> |
56 | #include <asm/pgalloc.h> | 57 | #include <asm/pgalloc.h> |
@@ -392,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm) | |||
392 | destroy_context(mm); | 393 | destroy_context(mm); |
393 | free_mm(mm); | 394 | free_mm(mm); |
394 | } | 395 | } |
396 | EXPORT_SYMBOL_GPL(__mmdrop); | ||
395 | 397 | ||
396 | /* | 398 | /* |
397 | * Decrement the use count and release all resources for an mm. | 399 | * Decrement the use count and release all resources for an mm. |
@@ -791,6 +793,31 @@ out: | |||
791 | return error; | 793 | return error; |
792 | } | 794 | } |
793 | 795 | ||
796 | static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | ||
797 | { | ||
798 | #ifdef CONFIG_BLOCK | ||
799 | struct io_context *ioc = current->io_context; | ||
800 | |||
801 | if (!ioc) | ||
802 | return 0; | ||
803 | /* | ||
804 | * Share io context with parent, if CLONE_IO is set | ||
805 | */ | ||
806 | if (clone_flags & CLONE_IO) { | ||
807 | tsk->io_context = ioc_task_link(ioc); | ||
808 | if (unlikely(!tsk->io_context)) | ||
809 | return -ENOMEM; | ||
810 | } else if (ioprio_valid(ioc->ioprio)) { | ||
811 | tsk->io_context = alloc_io_context(GFP_KERNEL, -1); | ||
812 | if (unlikely(!tsk->io_context)) | ||
813 | return -ENOMEM; | ||
814 | |||
815 | tsk->io_context->ioprio = ioc->ioprio; | ||
816 | } | ||
817 | #endif | ||
818 | return 0; | ||
819 | } | ||
820 | |||
794 | /* | 821 | /* |
795 | * Helper to unshare the files of the current task. | 822 | * Helper to unshare the files of the current task. |
796 | * We don't want to expose copy_files internals to | 823 | * We don't want to expose copy_files internals to |
@@ -1045,6 +1072,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1045 | copy_flags(clone_flags, p); | 1072 | copy_flags(clone_flags, p); |
1046 | INIT_LIST_HEAD(&p->children); | 1073 | INIT_LIST_HEAD(&p->children); |
1047 | INIT_LIST_HEAD(&p->sibling); | 1074 | INIT_LIST_HEAD(&p->sibling); |
1075 | #ifdef CONFIG_PREEMPT_RCU | ||
1076 | p->rcu_read_lock_nesting = 0; | ||
1077 | p->rcu_flipctr_idx = 0; | ||
1078 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
1048 | p->vfork_done = NULL; | 1079 | p->vfork_done = NULL; |
1049 | spin_lock_init(&p->alloc_lock); | 1080 | spin_lock_init(&p->alloc_lock); |
1050 | 1081 | ||
@@ -1059,6 +1090,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1059 | p->prev_utime = cputime_zero; | 1090 | p->prev_utime = cputime_zero; |
1060 | p->prev_stime = cputime_zero; | 1091 | p->prev_stime = cputime_zero; |
1061 | 1092 | ||
1093 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
1094 | p->last_switch_count = 0; | ||
1095 | p->last_switch_timestamp = 0; | ||
1096 | #endif | ||
1097 | |||
1062 | #ifdef CONFIG_TASK_XACCT | 1098 | #ifdef CONFIG_TASK_XACCT |
1063 | p->rchar = 0; /* I/O counter: bytes read */ | 1099 | p->rchar = 0; /* I/O counter: bytes read */ |
1064 | p->wchar = 0; /* I/O counter: bytes written */ | 1100 | p->wchar = 0; /* I/O counter: bytes written */ |
@@ -1147,15 +1183,17 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1147 | goto bad_fork_cleanup_mm; | 1183 | goto bad_fork_cleanup_mm; |
1148 | if ((retval = copy_namespaces(clone_flags, p))) | 1184 | if ((retval = copy_namespaces(clone_flags, p))) |
1149 | goto bad_fork_cleanup_keys; | 1185 | goto bad_fork_cleanup_keys; |
1186 | if ((retval = copy_io(clone_flags, p))) | ||
1187 | goto bad_fork_cleanup_namespaces; | ||
1150 | retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); | 1188 | retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); |
1151 | if (retval) | 1189 | if (retval) |
1152 | goto bad_fork_cleanup_namespaces; | 1190 | goto bad_fork_cleanup_io; |
1153 | 1191 | ||
1154 | if (pid != &init_struct_pid) { | 1192 | if (pid != &init_struct_pid) { |
1155 | retval = -ENOMEM; | 1193 | retval = -ENOMEM; |
1156 | pid = alloc_pid(task_active_pid_ns(p)); | 1194 | pid = alloc_pid(task_active_pid_ns(p)); |
1157 | if (!pid) | 1195 | if (!pid) |
1158 | goto bad_fork_cleanup_namespaces; | 1196 | goto bad_fork_cleanup_io; |
1159 | 1197 | ||
1160 | if (clone_flags & CLONE_NEWPID) { | 1198 | if (clone_flags & CLONE_NEWPID) { |
1161 | retval = pid_ns_prepare_proc(task_active_pid_ns(p)); | 1199 | retval = pid_ns_prepare_proc(task_active_pid_ns(p)); |
@@ -1196,6 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1196 | #ifdef TIF_SYSCALL_EMU | 1234 | #ifdef TIF_SYSCALL_EMU |
1197 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); | 1235 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
1198 | #endif | 1236 | #endif |
1237 | clear_all_latency_tracing(p); | ||
1199 | 1238 | ||
1200 | /* Our parent execution domain becomes current domain | 1239 | /* Our parent execution domain becomes current domain |
1201 | These must match for thread signalling to apply */ | 1240 | These must match for thread signalling to apply */ |
@@ -1224,9 +1263,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1224 | /* Need tasklist lock for parent etc handling! */ | 1263 | /* Need tasklist lock for parent etc handling! */ |
1225 | write_lock_irq(&tasklist_lock); | 1264 | write_lock_irq(&tasklist_lock); |
1226 | 1265 | ||
1227 | /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ | ||
1228 | p->ioprio = current->ioprio; | ||
1229 | |||
1230 | /* | 1266 | /* |
1231 | * The task hasn't been attached yet, so its cpus_allowed mask will | 1267 | * The task hasn't been attached yet, so its cpus_allowed mask will |
1232 | * not be changed, nor will its assigned CPU. | 1268 | * not be changed, nor will its assigned CPU. |
@@ -1237,6 +1273,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1237 | * parent's CPU). This avoids alot of nasty races. | 1273 | * parent's CPU). This avoids alot of nasty races. |
1238 | */ | 1274 | */ |
1239 | p->cpus_allowed = current->cpus_allowed; | 1275 | p->cpus_allowed = current->cpus_allowed; |
1276 | p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; | ||
1240 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || | 1277 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || |
1241 | !cpu_online(task_cpu(p)))) | 1278 | !cpu_online(task_cpu(p)))) |
1242 | set_task_cpu(p, smp_processor_id()); | 1279 | set_task_cpu(p, smp_processor_id()); |
@@ -1317,6 +1354,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1317 | bad_fork_free_pid: | 1354 | bad_fork_free_pid: |
1318 | if (pid != &init_struct_pid) | 1355 | if (pid != &init_struct_pid) |
1319 | free_pid(pid); | 1356 | free_pid(pid); |
1357 | bad_fork_cleanup_io: | ||
1358 | put_io_context(p->io_context); | ||
1320 | bad_fork_cleanup_namespaces: | 1359 | bad_fork_cleanup_namespaces: |
1321 | exit_task_namespaces(p); | 1360 | exit_task_namespaces(p); |
1322 | bad_fork_cleanup_keys: | 1361 | bad_fork_cleanup_keys: |
diff --git a/kernel/futex.c b/kernel/futex.c index 172a1aeeafdb..db9824de8bf0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -1097,15 +1097,15 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1097 | } | 1097 | } |
1098 | 1098 | ||
1099 | /* | 1099 | /* |
1100 | * Fixup the pi_state owner with current. | 1100 | * Fixup the pi_state owner with the new owner. |
1101 | * | 1101 | * |
1102 | * Must be called with hash bucket lock held and mm->sem held for non | 1102 | * Must be called with hash bucket lock held and mm->sem held for non |
1103 | * private futexes. | 1103 | * private futexes. |
1104 | */ | 1104 | */ |
1105 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | 1105 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, |
1106 | struct task_struct *curr) | 1106 | struct task_struct *newowner) |
1107 | { | 1107 | { |
1108 | u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS; | 1108 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
1109 | struct futex_pi_state *pi_state = q->pi_state; | 1109 | struct futex_pi_state *pi_state = q->pi_state; |
1110 | u32 uval, curval, newval; | 1110 | u32 uval, curval, newval; |
1111 | int ret; | 1111 | int ret; |
@@ -1119,12 +1119,12 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1119 | } else | 1119 | } else |
1120 | newtid |= FUTEX_OWNER_DIED; | 1120 | newtid |= FUTEX_OWNER_DIED; |
1121 | 1121 | ||
1122 | pi_state->owner = curr; | 1122 | pi_state->owner = newowner; |
1123 | 1123 | ||
1124 | spin_lock_irq(&curr->pi_lock); | 1124 | spin_lock_irq(&newowner->pi_lock); |
1125 | WARN_ON(!list_empty(&pi_state->list)); | 1125 | WARN_ON(!list_empty(&pi_state->list)); |
1126 | list_add(&pi_state->list, &curr->pi_state_list); | 1126 | list_add(&pi_state->list, &newowner->pi_state_list); |
1127 | spin_unlock_irq(&curr->pi_lock); | 1127 | spin_unlock_irq(&newowner->pi_lock); |
1128 | 1128 | ||
1129 | /* | 1129 | /* |
1130 | * We own it, so we have to replace the pending owner | 1130 | * We own it, so we have to replace the pending owner |
@@ -1508,9 +1508,40 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1508 | * when we were on the way back before we locked the | 1508 | * when we were on the way back before we locked the |
1509 | * hash bucket. | 1509 | * hash bucket. |
1510 | */ | 1510 | */ |
1511 | if (q.pi_state->owner == curr && | 1511 | if (q.pi_state->owner == curr) { |
1512 | rt_mutex_trylock(&q.pi_state->pi_mutex)) { | 1512 | /* |
1513 | ret = 0; | 1513 | * Try to get the rt_mutex now. This might |
1514 | * fail as some other task acquired the | ||
1515 | * rt_mutex after we removed ourself from the | ||
1516 | * rt_mutex waiters list. | ||
1517 | */ | ||
1518 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
1519 | ret = 0; | ||
1520 | else { | ||
1521 | /* | ||
1522 | * pi_state is incorrect, some other | ||
1523 | * task did a lock steal and we | ||
1524 | * returned due to timeout or signal | ||
1525 | * without taking the rt_mutex. Too | ||
1526 | * late. We can access the | ||
1527 | * rt_mutex_owner without locking, as | ||
1528 | * the other task is now blocked on | ||
1529 | * the hash bucket lock. Fix the state | ||
1530 | * up. | ||
1531 | */ | ||
1532 | struct task_struct *owner; | ||
1533 | int res; | ||
1534 | |||
1535 | owner = rt_mutex_owner(&q.pi_state->pi_mutex); | ||
1536 | res = fixup_pi_state_owner(uaddr, &q, owner); | ||
1537 | |||
1538 | WARN_ON(rt_mutex_owner(&q.pi_state->pi_mutex) != | ||
1539 | owner); | ||
1540 | |||
1541 | /* propagate -EFAULT, if the fixup failed */ | ||
1542 | if (res) | ||
1543 | ret = res; | ||
1544 | } | ||
1514 | } else { | 1545 | } else { |
1515 | /* | 1546 | /* |
1516 | * Paranoia check. If we did not take the lock | 1547 | * Paranoia check. If we did not take the lock |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 22a25142e4cf..bd5d6b5060bc 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div) | |||
325 | } | 325 | } |
326 | #endif /* BITS_PER_LONG >= 64 */ | 326 | #endif /* BITS_PER_LONG >= 64 */ |
327 | 327 | ||
328 | /* | ||
329 | * Check, whether the timer is on the callback pending list | ||
330 | */ | ||
331 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
332 | { | ||
333 | return timer->state & HRTIMER_STATE_PENDING; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Remove a timer from the callback pending list | ||
338 | */ | ||
339 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
340 | { | ||
341 | list_del_init(&timer->cb_entry); | ||
342 | } | ||
343 | |||
328 | /* High resolution timer related functions */ | 344 | /* High resolution timer related functions */ |
329 | #ifdef CONFIG_HIGH_RES_TIMERS | 345 | #ifdef CONFIG_HIGH_RES_TIMERS |
330 | 346 | ||
@@ -494,29 +510,12 @@ void hres_timers_resume(void) | |||
494 | } | 510 | } |
495 | 511 | ||
496 | /* | 512 | /* |
497 | * Check, whether the timer is on the callback pending list | ||
498 | */ | ||
499 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
500 | { | ||
501 | return timer->state & HRTIMER_STATE_PENDING; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * Remove a timer from the callback pending list | ||
506 | */ | ||
507 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
508 | { | ||
509 | list_del_init(&timer->cb_entry); | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * Initialize the high resolution related parts of cpu_base | 513 | * Initialize the high resolution related parts of cpu_base |
514 | */ | 514 | */ |
515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | 515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) |
516 | { | 516 | { |
517 | base->expires_next.tv64 = KTIME_MAX; | 517 | base->expires_next.tv64 = KTIME_MAX; |
518 | base->hres_active = 0; | 518 | base->hres_active = 0; |
519 | INIT_LIST_HEAD(&base->cb_pending); | ||
520 | } | 519 | } |
521 | 520 | ||
522 | /* | 521 | /* |
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
524 | */ | 523 | */ |
525 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | 524 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) |
526 | { | 525 | { |
527 | INIT_LIST_HEAD(&timer->cb_entry); | ||
528 | } | 526 | } |
529 | 527 | ||
530 | /* | 528 | /* |
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
618 | { | 616 | { |
619 | return 0; | 617 | return 0; |
620 | } | 618 | } |
621 | static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } | ||
622 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } | ||
623 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 619 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
624 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | 620 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } |
621 | static inline int hrtimer_reprogram(struct hrtimer *timer, | ||
622 | struct hrtimer_clock_base *base) | ||
623 | { | ||
624 | return 0; | ||
625 | } | ||
625 | 626 | ||
626 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 627 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
627 | 628 | ||
@@ -850,6 +851,14 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
850 | #ifdef CONFIG_TIME_LOW_RES | 851 | #ifdef CONFIG_TIME_LOW_RES |
851 | tim = ktime_add(tim, base->resolution); | 852 | tim = ktime_add(tim, base->resolution); |
852 | #endif | 853 | #endif |
854 | /* | ||
855 | * Careful here: User space might have asked for a | ||
856 | * very long sleep, so the add above might result in a | ||
857 | * negative number, which enqueues the timer in front | ||
858 | * of the queue. | ||
859 | */ | ||
860 | if (tim.tv64 < 0) | ||
861 | tim.tv64 = KTIME_MAX; | ||
853 | } | 862 | } |
854 | timer->expires = tim; | 863 | timer->expires = tim; |
855 | 864 | ||
@@ -993,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
993 | clock_id = CLOCK_MONOTONIC; | 1002 | clock_id = CLOCK_MONOTONIC; |
994 | 1003 | ||
995 | timer->base = &cpu_base->clock_base[clock_id]; | 1004 | timer->base = &cpu_base->clock_base[clock_id]; |
1005 | INIT_LIST_HEAD(&timer->cb_entry); | ||
996 | hrtimer_init_timer_hres(timer); | 1006 | hrtimer_init_timer_hres(timer); |
997 | 1007 | ||
998 | #ifdef CONFIG_TIMER_STATS | 1008 | #ifdef CONFIG_TIMER_STATS |
@@ -1022,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
1022 | } | 1032 | } |
1023 | EXPORT_SYMBOL_GPL(hrtimer_get_res); | 1033 | EXPORT_SYMBOL_GPL(hrtimer_get_res); |
1024 | 1034 | ||
1035 | static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) | ||
1036 | { | ||
1037 | spin_lock_irq(&cpu_base->lock); | ||
1038 | |||
1039 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1040 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1041 | struct hrtimer *timer; | ||
1042 | int restart; | ||
1043 | |||
1044 | timer = list_entry(cpu_base->cb_pending.next, | ||
1045 | struct hrtimer, cb_entry); | ||
1046 | |||
1047 | timer_stats_account_hrtimer(timer); | ||
1048 | |||
1049 | fn = timer->function; | ||
1050 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | ||
1051 | spin_unlock_irq(&cpu_base->lock); | ||
1052 | |||
1053 | restart = fn(timer); | ||
1054 | |||
1055 | spin_lock_irq(&cpu_base->lock); | ||
1056 | |||
1057 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1058 | if (restart == HRTIMER_RESTART) { | ||
1059 | BUG_ON(hrtimer_active(timer)); | ||
1060 | /* | ||
1061 | * Enqueue the timer, allow reprogramming of the event | ||
1062 | * device | ||
1063 | */ | ||
1064 | enqueue_hrtimer(timer, timer->base, 1); | ||
1065 | } else if (hrtimer_active(timer)) { | ||
1066 | /* | ||
1067 | * If the timer was rearmed on another CPU, reprogram | ||
1068 | * the event device. | ||
1069 | */ | ||
1070 | if (timer->base->first == &timer->node) | ||
1071 | hrtimer_reprogram(timer, timer->base); | ||
1072 | } | ||
1073 | } | ||
1074 | spin_unlock_irq(&cpu_base->lock); | ||
1075 | } | ||
1076 | |||
1077 | static void __run_hrtimer(struct hrtimer *timer) | ||
1078 | { | ||
1079 | struct hrtimer_clock_base *base = timer->base; | ||
1080 | struct hrtimer_cpu_base *cpu_base = base->cpu_base; | ||
1081 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1082 | int restart; | ||
1083 | |||
1084 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
1085 | timer_stats_account_hrtimer(timer); | ||
1086 | |||
1087 | fn = timer->function; | ||
1088 | if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { | ||
1089 | /* | ||
1090 | * Used for scheduler timers, avoid lock inversion with | ||
1091 | * rq->lock and tasklist_lock. | ||
1092 | * | ||
1093 | * These timers are required to deal with enqueue expiry | ||
1094 | * themselves and are not allowed to migrate. | ||
1095 | */ | ||
1096 | spin_unlock(&cpu_base->lock); | ||
1097 | restart = fn(timer); | ||
1098 | spin_lock(&cpu_base->lock); | ||
1099 | } else | ||
1100 | restart = fn(timer); | ||
1101 | |||
1102 | /* | ||
1103 | * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid | ||
1104 | * reprogramming of the event hardware. This happens at the end of this | ||
1105 | * function anyway. | ||
1106 | */ | ||
1107 | if (restart != HRTIMER_NORESTART) { | ||
1108 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1109 | enqueue_hrtimer(timer, base, 0); | ||
1110 | } | ||
1111 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1112 | } | ||
1113 | |||
1025 | #ifdef CONFIG_HIGH_RES_TIMERS | 1114 | #ifdef CONFIG_HIGH_RES_TIMERS |
1026 | 1115 | ||
1027 | /* | 1116 | /* |
@@ -1079,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1079 | continue; | 1168 | continue; |
1080 | } | 1169 | } |
1081 | 1170 | ||
1082 | __remove_hrtimer(timer, base, | 1171 | __run_hrtimer(timer); |
1083 | HRTIMER_STATE_CALLBACK, 0); | ||
1084 | timer_stats_account_hrtimer(timer); | ||
1085 | |||
1086 | /* | ||
1087 | * Note: We clear the CALLBACK bit after | ||
1088 | * enqueue_hrtimer to avoid reprogramming of | ||
1089 | * the event hardware. This happens at the end | ||
1090 | * of this function anyway. | ||
1091 | */ | ||
1092 | if (timer->function(timer) != HRTIMER_NORESTART) { | ||
1093 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1094 | enqueue_hrtimer(timer, base, 0); | ||
1095 | } | ||
1096 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1097 | } | 1172 | } |
1098 | spin_unlock(&cpu_base->lock); | 1173 | spin_unlock(&cpu_base->lock); |
1099 | base++; | 1174 | base++; |
@@ -1114,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1114 | 1189 | ||
1115 | static void run_hrtimer_softirq(struct softirq_action *h) | 1190 | static void run_hrtimer_softirq(struct softirq_action *h) |
1116 | { | 1191 | { |
1117 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1192 | run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); |
1118 | 1193 | } | |
1119 | spin_lock_irq(&cpu_base->lock); | ||
1120 | |||
1121 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1122 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1123 | struct hrtimer *timer; | ||
1124 | int restart; | ||
1125 | |||
1126 | timer = list_entry(cpu_base->cb_pending.next, | ||
1127 | struct hrtimer, cb_entry); | ||
1128 | 1194 | ||
1129 | timer_stats_account_hrtimer(timer); | 1195 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
1130 | 1196 | ||
1131 | fn = timer->function; | 1197 | /* |
1132 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | 1198 | * Called from timer softirq every jiffy, expire hrtimers: |
1133 | spin_unlock_irq(&cpu_base->lock); | 1199 | * |
1200 | * For HRT its the fall back code to run the softirq in the timer | ||
1201 | * softirq context in case the hrtimer initialization failed or has | ||
1202 | * not been done yet. | ||
1203 | */ | ||
1204 | void hrtimer_run_pending(void) | ||
1205 | { | ||
1206 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1134 | 1207 | ||
1135 | restart = fn(timer); | 1208 | if (hrtimer_hres_active()) |
1209 | return; | ||
1136 | 1210 | ||
1137 | spin_lock_irq(&cpu_base->lock); | 1211 | /* |
1212 | * This _is_ ugly: We have to check in the softirq context, | ||
1213 | * whether we can switch to highres and / or nohz mode. The | ||
1214 | * clocksource switch happens in the timer interrupt with | ||
1215 | * xtime_lock held. Notification from there only sets the | ||
1216 | * check bit in the tick_oneshot code, otherwise we might | ||
1217 | * deadlock vs. xtime_lock. | ||
1218 | */ | ||
1219 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1220 | hrtimer_switch_to_hres(); | ||
1138 | 1221 | ||
1139 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1222 | run_hrtimer_pending(cpu_base); |
1140 | if (restart == HRTIMER_RESTART) { | ||
1141 | BUG_ON(hrtimer_active(timer)); | ||
1142 | /* | ||
1143 | * Enqueue the timer, allow reprogramming of the event | ||
1144 | * device | ||
1145 | */ | ||
1146 | enqueue_hrtimer(timer, timer->base, 1); | ||
1147 | } else if (hrtimer_active(timer)) { | ||
1148 | /* | ||
1149 | * If the timer was rearmed on another CPU, reprogram | ||
1150 | * the event device. | ||
1151 | */ | ||
1152 | if (timer->base->first == &timer->node) | ||
1153 | hrtimer_reprogram(timer, timer->base); | ||
1154 | } | ||
1155 | } | ||
1156 | spin_unlock_irq(&cpu_base->lock); | ||
1157 | } | 1223 | } |
1158 | 1224 | ||
1159 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
1160 | |||
1161 | /* | 1225 | /* |
1162 | * Expire the per base hrtimer-queue: | 1226 | * Called from hardirq context every jiffy |
1163 | */ | 1227 | */ |
1164 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | 1228 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, |
1165 | int index) | 1229 | int index) |
@@ -1173,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | |||
1173 | if (base->get_softirq_time) | 1237 | if (base->get_softirq_time) |
1174 | base->softirq_time = base->get_softirq_time(); | 1238 | base->softirq_time = base->get_softirq_time(); |
1175 | 1239 | ||
1176 | spin_lock_irq(&cpu_base->lock); | 1240 | spin_lock(&cpu_base->lock); |
1177 | 1241 | ||
1178 | while ((node = base->first)) { | 1242 | while ((node = base->first)) { |
1179 | struct hrtimer *timer; | 1243 | struct hrtimer *timer; |
1180 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1181 | int restart; | ||
1182 | 1244 | ||
1183 | timer = rb_entry(node, struct hrtimer, node); | 1245 | timer = rb_entry(node, struct hrtimer, node); |
1184 | if (base->softirq_time.tv64 <= timer->expires.tv64) | 1246 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
1185 | break; | 1247 | break; |
1186 | 1248 | ||
1187 | #ifdef CONFIG_HIGH_RES_TIMERS | 1249 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { |
1188 | WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); | 1250 | __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); |
1189 | #endif | 1251 | list_add_tail(&timer->cb_entry, |
1190 | timer_stats_account_hrtimer(timer); | 1252 | &base->cpu_base->cb_pending); |
1191 | 1253 | continue; | |
1192 | fn = timer->function; | ||
1193 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
1194 | spin_unlock_irq(&cpu_base->lock); | ||
1195 | |||
1196 | restart = fn(timer); | ||
1197 | |||
1198 | spin_lock_irq(&cpu_base->lock); | ||
1199 | |||
1200 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1201 | if (restart != HRTIMER_NORESTART) { | ||
1202 | BUG_ON(hrtimer_active(timer)); | ||
1203 | enqueue_hrtimer(timer, base, 0); | ||
1204 | } | 1254 | } |
1255 | |||
1256 | __run_hrtimer(timer); | ||
1205 | } | 1257 | } |
1206 | spin_unlock_irq(&cpu_base->lock); | 1258 | spin_unlock(&cpu_base->lock); |
1207 | } | 1259 | } |
1208 | 1260 | ||
1209 | /* | ||
1210 | * Called from timer softirq every jiffy, expire hrtimers: | ||
1211 | * | ||
1212 | * For HRT its the fall back code to run the softirq in the timer | ||
1213 | * softirq context in case the hrtimer initialization failed or has | ||
1214 | * not been done yet. | ||
1215 | */ | ||
1216 | void hrtimer_run_queues(void) | 1261 | void hrtimer_run_queues(void) |
1217 | { | 1262 | { |
1218 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1263 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
@@ -1221,18 +1266,6 @@ void hrtimer_run_queues(void) | |||
1221 | if (hrtimer_hres_active()) | 1266 | if (hrtimer_hres_active()) |
1222 | return; | 1267 | return; |
1223 | 1268 | ||
1224 | /* | ||
1225 | * This _is_ ugly: We have to check in the softirq context, | ||
1226 | * whether we can switch to highres and / or nohz mode. The | ||
1227 | * clocksource switch happens in the timer interrupt with | ||
1228 | * xtime_lock held. Notification from there only sets the | ||
1229 | * check bit in the tick_oneshot code, otherwise we might | ||
1230 | * deadlock vs. xtime_lock. | ||
1231 | */ | ||
1232 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1233 | if (hrtimer_switch_to_hres()) | ||
1234 | return; | ||
1235 | |||
1236 | hrtimer_get_softirq_time(cpu_base); | 1269 | hrtimer_get_softirq_time(cpu_base); |
1237 | 1270 | ||
1238 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1271 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
@@ -1260,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | |||
1260 | sl->timer.function = hrtimer_wakeup; | 1293 | sl->timer.function = hrtimer_wakeup; |
1261 | sl->task = task; | 1294 | sl->task = task; |
1262 | #ifdef CONFIG_HIGH_RES_TIMERS | 1295 | #ifdef CONFIG_HIGH_RES_TIMERS |
1263 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; | 1296 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; |
1264 | #endif | 1297 | #endif |
1265 | } | 1298 | } |
1266 | 1299 | ||
@@ -1271,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
1271 | do { | 1304 | do { |
1272 | set_current_state(TASK_INTERRUPTIBLE); | 1305 | set_current_state(TASK_INTERRUPTIBLE); |
1273 | hrtimer_start(&t->timer, t->timer.expires, mode); | 1306 | hrtimer_start(&t->timer, t->timer.expires, mode); |
1307 | if (!hrtimer_active(&t->timer)) | ||
1308 | t->task = NULL; | ||
1274 | 1309 | ||
1275 | if (likely(t->task)) | 1310 | if (likely(t->task)) |
1276 | schedule(); | 1311 | schedule(); |
@@ -1370,7 +1405,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | |||
1370 | /* | 1405 | /* |
1371 | * Functions related to boot-time initialization: | 1406 | * Functions related to boot-time initialization: |
1372 | */ | 1407 | */ |
1373 | static void __devinit init_hrtimers_cpu(int cpu) | 1408 | static void __cpuinit init_hrtimers_cpu(int cpu) |
1374 | { | 1409 | { |
1375 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | 1410 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
1376 | int i; | 1411 | int i; |
@@ -1381,6 +1416,7 @@ static void __devinit init_hrtimers_cpu(int cpu) | |||
1381 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1416 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
1382 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1417 | cpu_base->clock_base[i].cpu_base = cpu_base; |
1383 | 1418 | ||
1419 | INIT_LIST_HEAD(&cpu_base->cb_pending); | ||
1384 | hrtimer_init_hres(cpu_base); | 1420 | hrtimer_init_hres(cpu_base); |
1385 | } | 1421 | } |
1386 | 1422 | ||
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9b5dff6b3f6a..44019ce30a14 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -297,18 +297,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
297 | 297 | ||
298 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 298 | if (unlikely(desc->status & IRQ_INPROGRESS)) |
299 | goto out_unlock; | 299 | goto out_unlock; |
300 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
300 | kstat_cpu(cpu).irqs[irq]++; | 301 | kstat_cpu(cpu).irqs[irq]++; |
301 | 302 | ||
302 | action = desc->action; | 303 | action = desc->action; |
303 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 304 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) |
304 | if (desc->chip->mask) | ||
305 | desc->chip->mask(irq); | ||
306 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
307 | desc->status |= IRQ_PENDING; | ||
308 | goto out_unlock; | 305 | goto out_unlock; |
309 | } | ||
310 | 306 | ||
311 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); | ||
312 | desc->status |= IRQ_INPROGRESS; | 307 | desc->status |= IRQ_INPROGRESS; |
313 | spin_unlock(&desc->lock); | 308 | spin_unlock(&desc->lock); |
314 | 309 | ||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1f314221d534..438a01464287 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id) | |||
479 | return; | 479 | return; |
480 | } | 480 | } |
481 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); | 481 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); |
482 | #ifdef CONFIG_DEBUG_SHIRQ | ||
483 | dump_stack(); | ||
484 | #endif | ||
482 | spin_unlock_irqrestore(&desc->lock, flags); | 485 | spin_unlock_irqrestore(&desc->lock, flags); |
483 | return; | 486 | return; |
484 | } | 487 | } |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 50b81b98046a..c2f2ccb0549a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
75 | 75 | ||
76 | #endif | 76 | #endif |
77 | 77 | ||
78 | static int irq_spurious_read(char *page, char **start, off_t off, | ||
79 | int count, int *eof, void *data) | ||
80 | { | ||
81 | struct irq_desc *d = &irq_desc[(long) data]; | ||
82 | return sprintf(page, "count %u\n" | ||
83 | "unhandled %u\n" | ||
84 | "last_unhandled %u ms\n", | ||
85 | d->irq_count, | ||
86 | d->irqs_unhandled, | ||
87 | jiffies_to_msecs(d->last_unhandled)); | ||
88 | } | ||
89 | |||
78 | #define MAX_NAMELEN 128 | 90 | #define MAX_NAMELEN 128 |
79 | 91 | ||
80 | static int name_unique(unsigned int irq, struct irqaction *new_action) | 92 | static int name_unique(unsigned int irq, struct irqaction *new_action) |
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
118 | void register_irq_proc(unsigned int irq) | 130 | void register_irq_proc(unsigned int irq) |
119 | { | 131 | { |
120 | char name [MAX_NAMELEN]; | 132 | char name [MAX_NAMELEN]; |
133 | struct proc_dir_entry *entry; | ||
121 | 134 | ||
122 | if (!root_irq_dir || | 135 | if (!root_irq_dir || |
123 | (irq_desc[irq].chip == &no_irq_chip) || | 136 | (irq_desc[irq].chip == &no_irq_chip) || |
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq) | |||
132 | 145 | ||
133 | #ifdef CONFIG_SMP | 146 | #ifdef CONFIG_SMP |
134 | { | 147 | { |
135 | struct proc_dir_entry *entry; | ||
136 | |||
137 | /* create /proc/irq/<irq>/smp_affinity */ | 148 | /* create /proc/irq/<irq>/smp_affinity */ |
138 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); | 149 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); |
139 | 150 | ||
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq) | |||
144 | } | 155 | } |
145 | } | 156 | } |
146 | #endif | 157 | #endif |
158 | |||
159 | entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); | ||
160 | if (entry) { | ||
161 | entry->data = (void *)(long)irq; | ||
162 | entry->read_proc = irq_spurious_read; | ||
163 | } | ||
147 | } | 164 | } |
148 | 165 | ||
149 | #undef MAX_NAMELEN | 166 | #undef MAX_NAMELEN |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32b161972fad..a6b2bc831dd0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/kallsyms.h> | 11 | #include <linux/kallsyms.h> |
12 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
13 | #include <linux/moduleparam.h> | ||
13 | 14 | ||
14 | static int irqfixup __read_mostly; | 15 | static int irqfixup __read_mostly; |
15 | 16 | ||
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str) | |||
225 | } | 226 | } |
226 | 227 | ||
227 | __setup("noirqdebug", noirqdebug_setup); | 228 | __setup("noirqdebug", noirqdebug_setup); |
229 | module_param(noirqdebug, bool, 0644); | ||
230 | MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); | ||
228 | 231 | ||
229 | static int __init irqfixup_setup(char *str) | 232 | static int __init irqfixup_setup(char *str) |
230 | { | 233 | { |
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str) | |||
236 | } | 239 | } |
237 | 240 | ||
238 | __setup("irqfixup", irqfixup_setup); | 241 | __setup("irqfixup", irqfixup_setup); |
242 | module_param(irqfixup, int, 0644); | ||
243 | MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); | ||
239 | 244 | ||
240 | static int __init irqpoll_setup(char *str) | 245 | static int __init irqpoll_setup(char *str) |
241 | { | 246 | { |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 2fc25810509e..7dadc71ce516 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -233,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr, | |||
233 | int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, | 233 | int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, |
234 | unsigned long *offset) | 234 | unsigned long *offset) |
235 | { | 235 | { |
236 | char namebuf[KSYM_NAME_LEN]; | ||
236 | if (is_ksym_addr(addr)) | 237 | if (is_ksym_addr(addr)) |
237 | return !!get_symbol_pos(addr, symbolsize, offset); | 238 | return !!get_symbol_pos(addr, symbolsize, offset); |
238 | 239 | ||
239 | return !!module_address_lookup(addr, symbolsize, offset, NULL); | 240 | return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); |
240 | } | 241 | } |
241 | 242 | ||
242 | /* | 243 | /* |
@@ -251,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr, | |||
251 | unsigned long *offset, | 252 | unsigned long *offset, |
252 | char **modname, char *namebuf) | 253 | char **modname, char *namebuf) |
253 | { | 254 | { |
254 | const char *msym; | ||
255 | |||
256 | namebuf[KSYM_NAME_LEN - 1] = 0; | 255 | namebuf[KSYM_NAME_LEN - 1] = 0; |
257 | namebuf[0] = 0; | 256 | namebuf[0] = 0; |
258 | 257 | ||
@@ -268,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr, | |||
268 | } | 267 | } |
269 | 268 | ||
270 | /* see if it's in a module */ | 269 | /* see if it's in a module */ |
271 | msym = module_address_lookup(addr, symbolsize, offset, modname); | 270 | return module_address_lookup(addr, symbolsize, offset, modname, |
272 | if (msym) | 271 | namebuf); |
273 | return strncpy(namebuf, msym, KSYM_NAME_LEN - 1); | ||
274 | |||
275 | return NULL; | 272 | return NULL; |
276 | } | 273 | } |
277 | 274 | ||
diff --git a/kernel/kexec.c b/kernel/kexec.c index aa74a1ef2da8..9a26eec9eb04 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -1404,6 +1404,7 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1404 | VMCOREINFO_OFFSET(list_head, next); | 1404 | VMCOREINFO_OFFSET(list_head, next); |
1405 | VMCOREINFO_OFFSET(list_head, prev); | 1405 | VMCOREINFO_OFFSET(list_head, prev); |
1406 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | 1406 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); |
1407 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | ||
1407 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | 1408 | VMCOREINFO_NUMBER(NR_FREE_PAGES); |
1408 | 1409 | ||
1409 | arch_crash_save_vmcoreinfo(); | 1410 | arch_crash_save_vmcoreinfo(); |
diff --git a/kernel/kmod.c b/kernel/kmod.c index c6a4f8aebeba..bb7df2a28bd7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -451,13 +451,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, | |||
451 | enum umh_wait wait) | 451 | enum umh_wait wait) |
452 | { | 452 | { |
453 | DECLARE_COMPLETION_ONSTACK(done); | 453 | DECLARE_COMPLETION_ONSTACK(done); |
454 | int retval; | 454 | int retval = 0; |
455 | 455 | ||
456 | helper_lock(); | 456 | helper_lock(); |
457 | if (sub_info->path[0] == '\0') { | 457 | if (sub_info->path[0] == '\0') |
458 | retval = 0; | ||
459 | goto out; | 458 | goto out; |
460 | } | ||
461 | 459 | ||
462 | if (!khelper_wq || usermodehelper_disabled) { | 460 | if (!khelper_wq || usermodehelper_disabled) { |
463 | retval = -EBUSY; | 461 | retval = -EBUSY; |
@@ -468,13 +466,14 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, | |||
468 | sub_info->wait = wait; | 466 | sub_info->wait = wait; |
469 | 467 | ||
470 | queue_work(khelper_wq, &sub_info->work); | 468 | queue_work(khelper_wq, &sub_info->work); |
471 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ | 469 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ |
472 | return 0; | 470 | goto unlock; |
473 | wait_for_completion(&done); | 471 | wait_for_completion(&done); |
474 | retval = sub_info->retval; | 472 | retval = sub_info->retval; |
475 | 473 | ||
476 | out: | 474 | out: |
477 | call_usermodehelper_freeinfo(sub_info); | 475 | call_usermodehelper_freeinfo(sub_info); |
476 | unlock: | ||
478 | helper_unlock(); | 477 | helper_unlock(); |
479 | return retval; | 478 | return retval; |
480 | } | 479 | } |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e3a5d817ac9b..d0493eafea3e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -824,6 +824,8 @@ static int __init init_kprobes(void) | |||
824 | if (!err) | 824 | if (!err) |
825 | err = register_die_notifier(&kprobe_exceptions_nb); | 825 | err = register_die_notifier(&kprobe_exceptions_nb); |
826 | 826 | ||
827 | if (!err) | ||
828 | init_test_probes(); | ||
827 | return err; | 829 | return err; |
828 | } | 830 | } |
829 | 831 | ||
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 65daa5373ca6..e53bc30e9ba5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -17,30 +17,34 @@ | |||
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | 18 | ||
19 | #define KERNEL_ATTR_RO(_name) \ | 19 | #define KERNEL_ATTR_RO(_name) \ |
20 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 20 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) |
21 | 21 | ||
22 | #define KERNEL_ATTR_RW(_name) \ | 22 | #define KERNEL_ATTR_RW(_name) \ |
23 | static struct subsys_attribute _name##_attr = \ | 23 | static struct kobj_attribute _name##_attr = \ |
24 | __ATTR(_name, 0644, _name##_show, _name##_store) | 24 | __ATTR(_name, 0644, _name##_show, _name##_store) |
25 | 25 | ||
26 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 26 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
27 | /* current uevent sequence number */ | 27 | /* current uevent sequence number */ |
28 | static ssize_t uevent_seqnum_show(struct kset *kset, char *page) | 28 | static ssize_t uevent_seqnum_show(struct kobject *kobj, |
29 | struct kobj_attribute *attr, char *buf) | ||
29 | { | 30 | { |
30 | return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); | 31 | return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum); |
31 | } | 32 | } |
32 | KERNEL_ATTR_RO(uevent_seqnum); | 33 | KERNEL_ATTR_RO(uevent_seqnum); |
33 | 34 | ||
34 | /* uevent helper program, used during early boo */ | 35 | /* uevent helper program, used during early boo */ |
35 | static ssize_t uevent_helper_show(struct kset *kset, char *page) | 36 | static ssize_t uevent_helper_show(struct kobject *kobj, |
37 | struct kobj_attribute *attr, char *buf) | ||
36 | { | 38 | { |
37 | return sprintf(page, "%s\n", uevent_helper); | 39 | return sprintf(buf, "%s\n", uevent_helper); |
38 | } | 40 | } |
39 | static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) | 41 | static ssize_t uevent_helper_store(struct kobject *kobj, |
42 | struct kobj_attribute *attr, | ||
43 | const char *buf, size_t count) | ||
40 | { | 44 | { |
41 | if (count+1 > UEVENT_HELPER_PATH_LEN) | 45 | if (count+1 > UEVENT_HELPER_PATH_LEN) |
42 | return -ENOENT; | 46 | return -ENOENT; |
43 | memcpy(uevent_helper, page, count); | 47 | memcpy(uevent_helper, buf, count); |
44 | uevent_helper[count] = '\0'; | 48 | uevent_helper[count] = '\0'; |
45 | if (count && uevent_helper[count-1] == '\n') | 49 | if (count && uevent_helper[count-1] == '\n') |
46 | uevent_helper[count-1] = '\0'; | 50 | uevent_helper[count-1] = '\0'; |
@@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper); | |||
50 | #endif | 54 | #endif |
51 | 55 | ||
52 | #ifdef CONFIG_KEXEC | 56 | #ifdef CONFIG_KEXEC |
53 | static ssize_t kexec_loaded_show(struct kset *kset, char *page) | 57 | static ssize_t kexec_loaded_show(struct kobject *kobj, |
58 | struct kobj_attribute *attr, char *buf) | ||
54 | { | 59 | { |
55 | return sprintf(page, "%d\n", !!kexec_image); | 60 | return sprintf(buf, "%d\n", !!kexec_image); |
56 | } | 61 | } |
57 | KERNEL_ATTR_RO(kexec_loaded); | 62 | KERNEL_ATTR_RO(kexec_loaded); |
58 | 63 | ||
59 | static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) | 64 | static ssize_t kexec_crash_loaded_show(struct kobject *kobj, |
65 | struct kobj_attribute *attr, char *buf) | ||
60 | { | 66 | { |
61 | return sprintf(page, "%d\n", !!kexec_crash_image); | 67 | return sprintf(buf, "%d\n", !!kexec_crash_image); |
62 | } | 68 | } |
63 | KERNEL_ATTR_RO(kexec_crash_loaded); | 69 | KERNEL_ATTR_RO(kexec_crash_loaded); |
64 | 70 | ||
65 | static ssize_t vmcoreinfo_show(struct kset *kset, char *page) | 71 | static ssize_t vmcoreinfo_show(struct kobject *kobj, |
72 | struct kobj_attribute *attr, char *buf) | ||
66 | { | 73 | { |
67 | return sprintf(page, "%lx %x\n", | 74 | return sprintf(buf, "%lx %x\n", |
68 | paddr_vmcoreinfo_note(), | 75 | paddr_vmcoreinfo_note(), |
69 | (unsigned int)vmcoreinfo_max_size); | 76 | (unsigned int)vmcoreinfo_max_size); |
70 | } | 77 | } |
@@ -94,8 +101,8 @@ static struct bin_attribute notes_attr = { | |||
94 | .read = ¬es_read, | 101 | .read = ¬es_read, |
95 | }; | 102 | }; |
96 | 103 | ||
97 | decl_subsys(kernel, NULL, NULL); | 104 | struct kobject *kernel_kobj; |
98 | EXPORT_SYMBOL_GPL(kernel_subsys); | 105 | EXPORT_SYMBOL_GPL(kernel_kobj); |
99 | 106 | ||
100 | static struct attribute * kernel_attrs[] = { | 107 | static struct attribute * kernel_attrs[] = { |
101 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 108 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
@@ -116,24 +123,39 @@ static struct attribute_group kernel_attr_group = { | |||
116 | 123 | ||
117 | static int __init ksysfs_init(void) | 124 | static int __init ksysfs_init(void) |
118 | { | 125 | { |
119 | int error = subsystem_register(&kernel_subsys); | 126 | int error; |
120 | if (!error) | ||
121 | error = sysfs_create_group(&kernel_subsys.kobj, | ||
122 | &kernel_attr_group); | ||
123 | 127 | ||
124 | if (!error && notes_size > 0) { | 128 | kernel_kobj = kobject_create_and_add("kernel", NULL); |
125 | notes_attr.size = notes_size; | 129 | if (!kernel_kobj) { |
126 | error = sysfs_create_bin_file(&kernel_subsys.kobj, | 130 | error = -ENOMEM; |
127 | ¬es_attr); | 131 | goto exit; |
128 | } | 132 | } |
133 | error = sysfs_create_group(kernel_kobj, &kernel_attr_group); | ||
134 | if (error) | ||
135 | goto kset_exit; | ||
129 | 136 | ||
130 | /* | 137 | if (notes_size > 0) { |
131 | * Create "/sys/kernel/uids" directory and corresponding root user's | 138 | notes_attr.size = notes_size; |
132 | * directory under it. | 139 | error = sysfs_create_bin_file(kernel_kobj, ¬es_attr); |
133 | */ | 140 | if (error) |
134 | if (!error) | 141 | goto group_exit; |
135 | error = uids_kobject_init(); | 142 | } |
136 | 143 | ||
144 | /* create the /sys/kernel/uids/ directory */ | ||
145 | error = uids_sysfs_init(); | ||
146 | if (error) | ||
147 | goto notes_exit; | ||
148 | |||
149 | return 0; | ||
150 | |||
151 | notes_exit: | ||
152 | if (notes_size > 0) | ||
153 | sysfs_remove_bin_file(kernel_kobj, ¬es_attr); | ||
154 | group_exit: | ||
155 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); | ||
156 | kset_exit: | ||
157 | kobject_put(kernel_kobj); | ||
158 | exit: | ||
137 | return error; | 159 | return error; |
138 | } | 160 | } |
139 | 161 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index dcfe724300eb..0ac887882f90 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -15,6 +15,8 @@ | |||
15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
16 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
17 | 17 | ||
18 | #define KTHREAD_NICE_LEVEL (-5) | ||
19 | |||
18 | static DEFINE_SPINLOCK(kthread_create_lock); | 20 | static DEFINE_SPINLOCK(kthread_create_lock); |
19 | static LIST_HEAD(kthread_create_list); | 21 | static LIST_HEAD(kthread_create_list); |
20 | struct task_struct *kthreadd_task; | 22 | struct task_struct *kthreadd_task; |
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create) | |||
94 | if (pid < 0) { | 96 | if (pid < 0) { |
95 | create->result = ERR_PTR(pid); | 97 | create->result = ERR_PTR(pid); |
96 | } else { | 98 | } else { |
99 | struct sched_param param = { .sched_priority = 0 }; | ||
97 | wait_for_completion(&create->started); | 100 | wait_for_completion(&create->started); |
98 | read_lock(&tasklist_lock); | 101 | read_lock(&tasklist_lock); |
99 | create->result = find_task_by_pid(pid); | 102 | create->result = find_task_by_pid(pid); |
100 | read_unlock(&tasklist_lock); | 103 | read_unlock(&tasklist_lock); |
104 | /* | ||
105 | * root may have changed our (kthreadd's) priority or CPU mask. | ||
106 | * The kernel thread should not inherit these properties. | ||
107 | */ | ||
108 | sched_setscheduler(create->result, SCHED_NORMAL, ¶m); | ||
109 | set_user_nice(create->result, KTHREAD_NICE_LEVEL); | ||
110 | set_cpus_allowed(create->result, CPU_MASK_ALL); | ||
101 | } | 111 | } |
102 | complete(&create->done); | 112 | complete(&create->done); |
103 | } | 113 | } |
@@ -221,7 +231,7 @@ int kthreadd(void *unused) | |||
221 | /* Setup a clean context for our children to inherit. */ | 231 | /* Setup a clean context for our children to inherit. */ |
222 | set_task_comm(tsk, "kthreadd"); | 232 | set_task_comm(tsk, "kthreadd"); |
223 | ignore_signals(tsk); | 233 | ignore_signals(tsk); |
224 | set_user_nice(tsk, -5); | 234 | set_user_nice(tsk, KTHREAD_NICE_LEVEL); |
225 | set_cpus_allowed(tsk, CPU_MASK_ALL); | 235 | set_cpus_allowed(tsk, CPU_MASK_ALL); |
226 | 236 | ||
227 | current->flags |= PF_NOFREEZE; | 237 | current->flags |= PF_NOFREEZE; |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c new file mode 100644 index 000000000000..b4e3c85abe74 --- /dev/null +++ b/kernel/latencytop.c | |||
@@ -0,0 +1,239 @@ | |||
1 | /* | ||
2 | * latencytop.c: Latency display infrastructure | ||
3 | * | ||
4 | * (C) Copyright 2008 Intel Corporation | ||
5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | #include <linux/latencytop.h> | ||
13 | #include <linux/kallsyms.h> | ||
14 | #include <linux/seq_file.h> | ||
15 | #include <linux/notifier.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/proc_fs.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/list.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/stacktrace.h> | ||
23 | |||
24 | static DEFINE_SPINLOCK(latency_lock); | ||
25 | |||
26 | #define MAXLR 128 | ||
27 | static struct latency_record latency_record[MAXLR]; | ||
28 | |||
29 | int latencytop_enabled; | ||
30 | |||
31 | void clear_all_latency_tracing(struct task_struct *p) | ||
32 | { | ||
33 | unsigned long flags; | ||
34 | |||
35 | if (!latencytop_enabled) | ||
36 | return; | ||
37 | |||
38 | spin_lock_irqsave(&latency_lock, flags); | ||
39 | memset(&p->latency_record, 0, sizeof(p->latency_record)); | ||
40 | p->latency_record_count = 0; | ||
41 | spin_unlock_irqrestore(&latency_lock, flags); | ||
42 | } | ||
43 | |||
44 | static void clear_global_latency_tracing(void) | ||
45 | { | ||
46 | unsigned long flags; | ||
47 | |||
48 | spin_lock_irqsave(&latency_lock, flags); | ||
49 | memset(&latency_record, 0, sizeof(latency_record)); | ||
50 | spin_unlock_irqrestore(&latency_lock, flags); | ||
51 | } | ||
52 | |||
53 | static void __sched | ||
54 | account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) | ||
55 | { | ||
56 | int firstnonnull = MAXLR + 1; | ||
57 | int i; | ||
58 | |||
59 | if (!latencytop_enabled) | ||
60 | return; | ||
61 | |||
62 | /* skip kernel threads for now */ | ||
63 | if (!tsk->mm) | ||
64 | return; | ||
65 | |||
66 | for (i = 0; i < MAXLR; i++) { | ||
67 | int q; | ||
68 | int same = 1; | ||
69 | /* Nothing stored: */ | ||
70 | if (!latency_record[i].backtrace[0]) { | ||
71 | if (firstnonnull > i) | ||
72 | firstnonnull = i; | ||
73 | continue; | ||
74 | } | ||
75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
76 | if (latency_record[i].backtrace[q] != | ||
77 | lat->backtrace[q]) | ||
78 | same = 0; | ||
79 | if (same && lat->backtrace[q] == 0) | ||
80 | break; | ||
81 | if (same && lat->backtrace[q] == ULONG_MAX) | ||
82 | break; | ||
83 | } | ||
84 | if (same) { | ||
85 | latency_record[i].count++; | ||
86 | latency_record[i].time += lat->time; | ||
87 | if (lat->time > latency_record[i].max) | ||
88 | latency_record[i].max = lat->time; | ||
89 | return; | ||
90 | } | ||
91 | } | ||
92 | |||
93 | i = firstnonnull; | ||
94 | if (i >= MAXLR - 1) | ||
95 | return; | ||
96 | |||
97 | /* Allocted a new one: */ | ||
98 | memcpy(&latency_record[i], lat, sizeof(struct latency_record)); | ||
99 | } | ||
100 | |||
101 | static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) | ||
102 | { | ||
103 | struct stack_trace trace; | ||
104 | |||
105 | memset(&trace, 0, sizeof(trace)); | ||
106 | trace.max_entries = LT_BACKTRACEDEPTH; | ||
107 | trace.entries = &lat->backtrace[0]; | ||
108 | trace.skip = 0; | ||
109 | save_stack_trace_tsk(tsk, &trace); | ||
110 | } | ||
111 | |||
112 | void __sched | ||
113 | account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | ||
114 | { | ||
115 | unsigned long flags; | ||
116 | int i, q; | ||
117 | struct latency_record lat; | ||
118 | |||
119 | if (!latencytop_enabled) | ||
120 | return; | ||
121 | |||
122 | /* Long interruptible waits are generally user requested... */ | ||
123 | if (inter && usecs > 5000) | ||
124 | return; | ||
125 | |||
126 | memset(&lat, 0, sizeof(lat)); | ||
127 | lat.count = 1; | ||
128 | lat.time = usecs; | ||
129 | lat.max = usecs; | ||
130 | store_stacktrace(tsk, &lat); | ||
131 | |||
132 | spin_lock_irqsave(&latency_lock, flags); | ||
133 | |||
134 | account_global_scheduler_latency(tsk, &lat); | ||
135 | |||
136 | /* | ||
137 | * short term hack; if we're > 32 we stop; future we recycle: | ||
138 | */ | ||
139 | tsk->latency_record_count++; | ||
140 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
141 | goto out_unlock; | ||
142 | |||
143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | ||
144 | struct latency_record *mylat; | ||
145 | int same = 1; | ||
146 | mylat = &tsk->latency_record[i]; | ||
147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
148 | if (mylat->backtrace[q] != | ||
149 | lat.backtrace[q]) | ||
150 | same = 0; | ||
151 | if (same && lat.backtrace[q] == 0) | ||
152 | break; | ||
153 | if (same && lat.backtrace[q] == ULONG_MAX) | ||
154 | break; | ||
155 | } | ||
156 | if (same) { | ||
157 | mylat->count++; | ||
158 | mylat->time += lat.time; | ||
159 | if (lat.time > mylat->max) | ||
160 | mylat->max = lat.time; | ||
161 | goto out_unlock; | ||
162 | } | ||
163 | } | ||
164 | |||
165 | /* Allocated a new one: */ | ||
166 | i = tsk->latency_record_count; | ||
167 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | ||
168 | |||
169 | out_unlock: | ||
170 | spin_unlock_irqrestore(&latency_lock, flags); | ||
171 | } | ||
172 | |||
173 | static int lstats_show(struct seq_file *m, void *v) | ||
174 | { | ||
175 | int i; | ||
176 | |||
177 | seq_puts(m, "Latency Top version : v0.1\n"); | ||
178 | |||
179 | for (i = 0; i < MAXLR; i++) { | ||
180 | if (latency_record[i].backtrace[0]) { | ||
181 | int q; | ||
182 | seq_printf(m, "%i %li %li ", | ||
183 | latency_record[i].count, | ||
184 | latency_record[i].time, | ||
185 | latency_record[i].max); | ||
186 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | ||
187 | char sym[KSYM_NAME_LEN]; | ||
188 | char *c; | ||
189 | if (!latency_record[i].backtrace[q]) | ||
190 | break; | ||
191 | if (latency_record[i].backtrace[q] == ULONG_MAX) | ||
192 | break; | ||
193 | sprint_symbol(sym, latency_record[i].backtrace[q]); | ||
194 | c = strchr(sym, '+'); | ||
195 | if (c) | ||
196 | *c = 0; | ||
197 | seq_printf(m, "%s ", sym); | ||
198 | } | ||
199 | seq_printf(m, "\n"); | ||
200 | } | ||
201 | } | ||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | static ssize_t | ||
206 | lstats_write(struct file *file, const char __user *buf, size_t count, | ||
207 | loff_t *offs) | ||
208 | { | ||
209 | clear_global_latency_tracing(); | ||
210 | |||
211 | return count; | ||
212 | } | ||
213 | |||
214 | static int lstats_open(struct inode *inode, struct file *filp) | ||
215 | { | ||
216 | return single_open(filp, lstats_show, NULL); | ||
217 | } | ||
218 | |||
219 | static struct file_operations lstats_fops = { | ||
220 | .open = lstats_open, | ||
221 | .read = seq_read, | ||
222 | .write = lstats_write, | ||
223 | .llseek = seq_lseek, | ||
224 | .release = single_release, | ||
225 | }; | ||
226 | |||
227 | static int __init init_lstats_procfs(void) | ||
228 | { | ||
229 | struct proc_dir_entry *pe; | ||
230 | |||
231 | pe = create_proc_entry("latency_stats", 0644, NULL); | ||
232 | if (!pe) | ||
233 | return -ENOMEM; | ||
234 | |||
235 | pe->proc_fops = &lstats_fops; | ||
236 | |||
237 | return 0; | ||
238 | } | ||
239 | __initcall(init_lstats_procfs); | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0f389621bb6b..3574379f4d62 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -2654,10 +2654,15 @@ static void check_flags(unsigned long flags) | |||
2654 | if (!debug_locks) | 2654 | if (!debug_locks) |
2655 | return; | 2655 | return; |
2656 | 2656 | ||
2657 | if (irqs_disabled_flags(flags)) | 2657 | if (irqs_disabled_flags(flags)) { |
2658 | DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled); | 2658 | if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { |
2659 | else | 2659 | printk("possible reason: unannotated irqs-off.\n"); |
2660 | DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled); | 2660 | } |
2661 | } else { | ||
2662 | if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { | ||
2663 | printk("possible reason: unannotated irqs-on.\n"); | ||
2664 | } | ||
2665 | } | ||
2661 | 2666 | ||
2662 | /* | 2667 | /* |
2663 | * We dont accurately track softirq state in e.g. | 2668 | * We dont accurately track softirq state in e.g. |
@@ -2927,7 +2932,7 @@ static void zap_class(struct lock_class *class) | |||
2927 | 2932 | ||
2928 | } | 2933 | } |
2929 | 2934 | ||
2930 | static inline int within(void *addr, void *start, unsigned long size) | 2935 | static inline int within(const void *addr, void *start, unsigned long size) |
2931 | { | 2936 | { |
2932 | return addr >= start && addr < start + size; | 2937 | return addr >= start && addr < start + size; |
2933 | } | 2938 | } |
@@ -2938,9 +2943,10 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
2938 | struct list_head *head; | 2943 | struct list_head *head; |
2939 | unsigned long flags; | 2944 | unsigned long flags; |
2940 | int i; | 2945 | int i; |
2946 | int locked; | ||
2941 | 2947 | ||
2942 | raw_local_irq_save(flags); | 2948 | raw_local_irq_save(flags); |
2943 | graph_lock(); | 2949 | locked = graph_lock(); |
2944 | 2950 | ||
2945 | /* | 2951 | /* |
2946 | * Unhash all classes that were created by this module: | 2952 | * Unhash all classes that were created by this module: |
@@ -2949,12 +2955,16 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
2949 | head = classhash_table + i; | 2955 | head = classhash_table + i; |
2950 | if (list_empty(head)) | 2956 | if (list_empty(head)) |
2951 | continue; | 2957 | continue; |
2952 | list_for_each_entry_safe(class, next, head, hash_entry) | 2958 | list_for_each_entry_safe(class, next, head, hash_entry) { |
2953 | if (within(class->key, start, size)) | 2959 | if (within(class->key, start, size)) |
2954 | zap_class(class); | 2960 | zap_class(class); |
2961 | else if (within(class->name, start, size)) | ||
2962 | zap_class(class); | ||
2963 | } | ||
2955 | } | 2964 | } |
2956 | 2965 | ||
2957 | graph_unlock(); | 2966 | if (locked) |
2967 | graph_unlock(); | ||
2958 | raw_local_irq_restore(flags); | 2968 | raw_local_irq_restore(flags); |
2959 | } | 2969 | } |
2960 | 2970 | ||
@@ -2964,6 +2974,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
2964 | struct list_head *head; | 2974 | struct list_head *head; |
2965 | unsigned long flags; | 2975 | unsigned long flags; |
2966 | int i, j; | 2976 | int i, j; |
2977 | int locked; | ||
2967 | 2978 | ||
2968 | raw_local_irq_save(flags); | 2979 | raw_local_irq_save(flags); |
2969 | 2980 | ||
@@ -2982,7 +2993,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
2982 | * Debug check: in the end all mapped classes should | 2993 | * Debug check: in the end all mapped classes should |
2983 | * be gone. | 2994 | * be gone. |
2984 | */ | 2995 | */ |
2985 | graph_lock(); | 2996 | locked = graph_lock(); |
2986 | for (i = 0; i < CLASSHASH_SIZE; i++) { | 2997 | for (i = 0; i < CLASSHASH_SIZE; i++) { |
2987 | head = classhash_table + i; | 2998 | head = classhash_table + i; |
2988 | if (list_empty(head)) | 2999 | if (list_empty(head)) |
@@ -2995,7 +3006,8 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
2995 | } | 3006 | } |
2996 | } | 3007 | } |
2997 | } | 3008 | } |
2998 | graph_unlock(); | 3009 | if (locked) |
3010 | graph_unlock(); | ||
2999 | 3011 | ||
3000 | out_restore: | 3012 | out_restore: |
3001 | raw_local_irq_restore(flags); | 3013 | raw_local_irq_restore(flags); |
@@ -3194,7 +3206,11 @@ retry: | |||
3194 | 3206 | ||
3195 | EXPORT_SYMBOL_GPL(debug_show_all_locks); | 3207 | EXPORT_SYMBOL_GPL(debug_show_all_locks); |
3196 | 3208 | ||
3197 | void debug_show_held_locks(struct task_struct *task) | 3209 | /* |
3210 | * Careful: only use this function if you are sure that | ||
3211 | * the task cannot run in parallel! | ||
3212 | */ | ||
3213 | void __debug_show_held_locks(struct task_struct *task) | ||
3198 | { | 3214 | { |
3199 | if (unlikely(!debug_locks)) { | 3215 | if (unlikely(!debug_locks)) { |
3200 | printk("INFO: lockdep is turned off.\n"); | 3216 | printk("INFO: lockdep is turned off.\n"); |
@@ -3202,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task) | |||
3202 | } | 3218 | } |
3203 | lockdep_print_held_locks(task); | 3219 | lockdep_print_held_locks(task); |
3204 | } | 3220 | } |
3221 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
3222 | |||
3223 | void debug_show_held_locks(struct task_struct *task) | ||
3224 | { | ||
3225 | __debug_show_held_locks(task); | ||
3226 | } | ||
3205 | 3227 | ||
3206 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3228 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
3207 | 3229 | ||
diff --git a/kernel/module.c b/kernel/module.c index 91fe6958b6e1..bd60278ee703 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -47,8 +47,6 @@ | |||
47 | #include <asm/cacheflush.h> | 47 | #include <asm/cacheflush.h> |
48 | #include <linux/license.h> | 48 | #include <linux/license.h> |
49 | 49 | ||
50 | extern int module_sysfs_initialized; | ||
51 | |||
52 | #if 0 | 50 | #if 0 |
53 | #define DEBUGP printk | 51 | #define DEBUGP printk |
54 | #else | 52 | #else |
@@ -67,6 +65,9 @@ extern int module_sysfs_initialized; | |||
67 | static DEFINE_MUTEX(module_mutex); | 65 | static DEFINE_MUTEX(module_mutex); |
68 | static LIST_HEAD(modules); | 66 | static LIST_HEAD(modules); |
69 | 67 | ||
68 | /* Waiting for a module to finish initializing? */ | ||
69 | static DECLARE_WAIT_QUEUE_HEAD(module_wq); | ||
70 | |||
70 | static BLOCKING_NOTIFIER_HEAD(module_notify_list); | 71 | static BLOCKING_NOTIFIER_HEAD(module_notify_list); |
71 | 72 | ||
72 | int register_module_notifier(struct notifier_block * nb) | 73 | int register_module_notifier(struct notifier_block * nb) |
@@ -86,8 +87,11 @@ EXPORT_SYMBOL(unregister_module_notifier); | |||
86 | static inline int strong_try_module_get(struct module *mod) | 87 | static inline int strong_try_module_get(struct module *mod) |
87 | { | 88 | { |
88 | if (mod && mod->state == MODULE_STATE_COMING) | 89 | if (mod && mod->state == MODULE_STATE_COMING) |
90 | return -EBUSY; | ||
91 | if (try_module_get(mod)) | ||
89 | return 0; | 92 | return 0; |
90 | return try_module_get(mod); | 93 | else |
94 | return -ENOENT; | ||
91 | } | 95 | } |
92 | 96 | ||
93 | static inline void add_taint_module(struct module *mod, unsigned flag) | 97 | static inline void add_taint_module(struct module *mod, unsigned flag) |
@@ -426,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, | |||
426 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); | 430 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); |
427 | } | 431 | } |
428 | 432 | ||
433 | static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) | ||
434 | { | ||
435 | int cpu; | ||
436 | |||
437 | for_each_possible_cpu(cpu) | ||
438 | memcpy(pcpudest + per_cpu_offset(cpu), from, size); | ||
439 | } | ||
440 | |||
429 | static int percpu_modinit(void) | 441 | static int percpu_modinit(void) |
430 | { | 442 | { |
431 | pcpu_num_used = 2; | 443 | pcpu_num_used = 2; |
@@ -498,6 +510,8 @@ static struct module_attribute modinfo_##field = { \ | |||
498 | MODINFO_ATTR(version); | 510 | MODINFO_ATTR(version); |
499 | MODINFO_ATTR(srcversion); | 511 | MODINFO_ATTR(srcversion); |
500 | 512 | ||
513 | static char last_unloaded_module[MODULE_NAME_LEN+1]; | ||
514 | |||
501 | #ifdef CONFIG_MODULE_UNLOAD | 515 | #ifdef CONFIG_MODULE_UNLOAD |
502 | /* Init the unload section of the module. */ | 516 | /* Init the unload section of the module. */ |
503 | static void module_unload_init(struct module *mod) | 517 | static void module_unload_init(struct module *mod) |
@@ -539,11 +553,21 @@ static int already_uses(struct module *a, struct module *b) | |||
539 | static int use_module(struct module *a, struct module *b) | 553 | static int use_module(struct module *a, struct module *b) |
540 | { | 554 | { |
541 | struct module_use *use; | 555 | struct module_use *use; |
542 | int no_warn; | 556 | int no_warn, err; |
543 | 557 | ||
544 | if (b == NULL || already_uses(a, b)) return 1; | 558 | if (b == NULL || already_uses(a, b)) return 1; |
545 | 559 | ||
546 | if (!strong_try_module_get(b)) | 560 | /* If we're interrupted or time out, we fail. */ |
561 | if (wait_event_interruptible_timeout( | ||
562 | module_wq, (err = strong_try_module_get(b)) != -EBUSY, | ||
563 | 30 * HZ) <= 0) { | ||
564 | printk("%s: gave up waiting for init of module %s.\n", | ||
565 | a->name, b->name); | ||
566 | return 0; | ||
567 | } | ||
568 | |||
569 | /* If strong_try_module_get() returned a different error, we fail. */ | ||
570 | if (err) | ||
547 | return 0; | 571 | return 0; |
548 | 572 | ||
549 | DEBUGP("Allocating new usage for %s.\n", a->name); | 573 | DEBUGP("Allocating new usage for %s.\n", a->name); |
@@ -721,6 +745,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
721 | mod->exit(); | 745 | mod->exit(); |
722 | mutex_lock(&module_mutex); | 746 | mutex_lock(&module_mutex); |
723 | } | 747 | } |
748 | /* Store the name of the last unloaded module for diagnostic purposes */ | ||
749 | strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); | ||
724 | free_module(mod); | 750 | free_module(mod); |
725 | 751 | ||
726 | out: | 752 | out: |
@@ -814,7 +840,7 @@ static inline void module_unload_free(struct module *mod) | |||
814 | 840 | ||
815 | static inline int use_module(struct module *a, struct module *b) | 841 | static inline int use_module(struct module *a, struct module *b) |
816 | { | 842 | { |
817 | return strong_try_module_get(b); | 843 | return strong_try_module_get(b) == 0; |
818 | } | 844 | } |
819 | 845 | ||
820 | static inline void module_unload_init(struct module *mod) | 846 | static inline void module_unload_init(struct module *mod) |
@@ -1122,7 +1148,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1122 | ++loaded; | 1148 | ++loaded; |
1123 | } | 1149 | } |
1124 | 1150 | ||
1125 | notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes"); | 1151 | notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); |
1126 | if (!notes_attrs->dir) | 1152 | if (!notes_attrs->dir) |
1127 | goto out; | 1153 | goto out; |
1128 | 1154 | ||
@@ -1212,6 +1238,7 @@ void module_remove_modinfo_attrs(struct module *mod) | |||
1212 | int mod_sysfs_init(struct module *mod) | 1238 | int mod_sysfs_init(struct module *mod) |
1213 | { | 1239 | { |
1214 | int err; | 1240 | int err; |
1241 | struct kobject *kobj; | ||
1215 | 1242 | ||
1216 | if (!module_sysfs_initialized) { | 1243 | if (!module_sysfs_initialized) { |
1217 | printk(KERN_ERR "%s: module sysfs not initialized\n", | 1244 | printk(KERN_ERR "%s: module sysfs not initialized\n", |
@@ -1219,15 +1246,25 @@ int mod_sysfs_init(struct module *mod) | |||
1219 | err = -EINVAL; | 1246 | err = -EINVAL; |
1220 | goto out; | 1247 | goto out; |
1221 | } | 1248 | } |
1222 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); | 1249 | |
1223 | err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); | 1250 | kobj = kset_find_obj(module_kset, mod->name); |
1224 | if (err) | 1251 | if (kobj) { |
1252 | printk(KERN_ERR "%s: module is already loaded\n", mod->name); | ||
1253 | kobject_put(kobj); | ||
1254 | err = -EINVAL; | ||
1225 | goto out; | 1255 | goto out; |
1226 | kobj_set_kset_s(&mod->mkobj, module_subsys); | 1256 | } |
1257 | |||
1227 | mod->mkobj.mod = mod; | 1258 | mod->mkobj.mod = mod; |
1228 | 1259 | ||
1229 | kobject_init(&mod->mkobj.kobj); | 1260 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); |
1261 | mod->mkobj.kobj.kset = module_kset; | ||
1262 | err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, | ||
1263 | "%s", mod->name); | ||
1264 | if (err) | ||
1265 | kobject_put(&mod->mkobj.kobj); | ||
1230 | 1266 | ||
1267 | /* delay uevent until full sysfs population */ | ||
1231 | out: | 1268 | out: |
1232 | return err; | 1269 | return err; |
1233 | } | 1270 | } |
@@ -1238,12 +1275,7 @@ int mod_sysfs_setup(struct module *mod, | |||
1238 | { | 1275 | { |
1239 | int err; | 1276 | int err; |
1240 | 1277 | ||
1241 | /* delay uevent until full sysfs population */ | 1278 | mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); |
1242 | err = kobject_add(&mod->mkobj.kobj); | ||
1243 | if (err) | ||
1244 | goto out; | ||
1245 | |||
1246 | mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders"); | ||
1247 | if (!mod->holders_dir) { | 1279 | if (!mod->holders_dir) { |
1248 | err = -ENOMEM; | 1280 | err = -ENOMEM; |
1249 | goto out_unreg; | 1281 | goto out_unreg; |
@@ -1263,11 +1295,9 @@ int mod_sysfs_setup(struct module *mod, | |||
1263 | out_unreg_param: | 1295 | out_unreg_param: |
1264 | module_param_sysfs_remove(mod); | 1296 | module_param_sysfs_remove(mod); |
1265 | out_unreg_holders: | 1297 | out_unreg_holders: |
1266 | kobject_unregister(mod->holders_dir); | 1298 | kobject_put(mod->holders_dir); |
1267 | out_unreg: | 1299 | out_unreg: |
1268 | kobject_del(&mod->mkobj.kobj); | ||
1269 | kobject_put(&mod->mkobj.kobj); | 1300 | kobject_put(&mod->mkobj.kobj); |
1270 | out: | ||
1271 | return err; | 1301 | return err; |
1272 | } | 1302 | } |
1273 | #endif | 1303 | #endif |
@@ -1276,9 +1306,20 @@ static void mod_kobject_remove(struct module *mod) | |||
1276 | { | 1306 | { |
1277 | module_remove_modinfo_attrs(mod); | 1307 | module_remove_modinfo_attrs(mod); |
1278 | module_param_sysfs_remove(mod); | 1308 | module_param_sysfs_remove(mod); |
1279 | kobject_unregister(mod->mkobj.drivers_dir); | 1309 | kobject_put(mod->mkobj.drivers_dir); |
1280 | kobject_unregister(mod->holders_dir); | 1310 | kobject_put(mod->holders_dir); |
1281 | kobject_unregister(&mod->mkobj.kobj); | 1311 | kobject_put(&mod->mkobj.kobj); |
1312 | } | ||
1313 | |||
1314 | /* | ||
1315 | * link the module with the whole machine is stopped with interrupts off | ||
1316 | * - this defends against kallsyms not taking locks | ||
1317 | */ | ||
1318 | static int __link_module(void *_mod) | ||
1319 | { | ||
1320 | struct module *mod = _mod; | ||
1321 | list_add(&mod->list, &modules); | ||
1322 | return 0; | ||
1282 | } | 1323 | } |
1283 | 1324 | ||
1284 | /* | 1325 | /* |
@@ -1330,7 +1371,7 @@ void *__symbol_get(const char *symbol) | |||
1330 | 1371 | ||
1331 | preempt_disable(); | 1372 | preempt_disable(); |
1332 | value = __find_symbol(symbol, &owner, &crc, 1); | 1373 | value = __find_symbol(symbol, &owner, &crc, 1); |
1333 | if (value && !strong_try_module_get(owner)) | 1374 | if (value && strong_try_module_get(owner) != 0) |
1334 | value = 0; | 1375 | value = 0; |
1335 | preempt_enable(); | 1376 | preempt_enable(); |
1336 | 1377 | ||
@@ -1884,16 +1925,16 @@ static struct module *load_module(void __user *umod, | |||
1884 | /* Now we've moved module, initialize linked lists, etc. */ | 1925 | /* Now we've moved module, initialize linked lists, etc. */ |
1885 | module_unload_init(mod); | 1926 | module_unload_init(mod); |
1886 | 1927 | ||
1887 | /* Initialize kobject, so we can reference it. */ | 1928 | /* add kobject, so we can reference it. */ |
1888 | err = mod_sysfs_init(mod); | 1929 | err = mod_sysfs_init(mod); |
1889 | if (err) | 1930 | if (err) |
1890 | goto cleanup; | 1931 | goto free_unload; |
1891 | 1932 | ||
1892 | /* Set up license info based on the info section */ | 1933 | /* Set up license info based on the info section */ |
1893 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1934 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1894 | 1935 | ||
1895 | if (strcmp(mod->name, "ndiswrapper") == 0) | 1936 | if (strcmp(mod->name, "ndiswrapper") == 0) |
1896 | add_taint(TAINT_PROPRIETARY_MODULE); | 1937 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1897 | if (strcmp(mod->name, "driverloader") == 0) | 1938 | if (strcmp(mod->name, "driverloader") == 0) |
1898 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 1939 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1899 | 1940 | ||
@@ -2023,6 +2064,11 @@ static struct module *load_module(void __user *umod, | |||
2023 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", | 2064 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", |
2024 | mod->name); | 2065 | mod->name); |
2025 | 2066 | ||
2067 | /* Now sew it into the lists so we can get lockdep and oops | ||
2068 | * info during argument parsing. Noone should access us, since | ||
2069 | * strong_try_module_get() will fail. */ | ||
2070 | stop_machine_run(__link_module, mod, NR_CPUS); | ||
2071 | |||
2026 | /* Size of section 0 is 0, so this works well if no params */ | 2072 | /* Size of section 0 is 0, so this works well if no params */ |
2027 | err = parse_args(mod->name, mod->args, | 2073 | err = parse_args(mod->name, mod->args, |
2028 | (struct kernel_param *) | 2074 | (struct kernel_param *) |
@@ -2031,7 +2077,7 @@ static struct module *load_module(void __user *umod, | |||
2031 | / sizeof(struct kernel_param), | 2077 | / sizeof(struct kernel_param), |
2032 | NULL); | 2078 | NULL); |
2033 | if (err < 0) | 2079 | if (err < 0) |
2034 | goto arch_cleanup; | 2080 | goto unlink; |
2035 | 2081 | ||
2036 | err = mod_sysfs_setup(mod, | 2082 | err = mod_sysfs_setup(mod, |
2037 | (struct kernel_param *) | 2083 | (struct kernel_param *) |
@@ -2039,7 +2085,7 @@ static struct module *load_module(void __user *umod, | |||
2039 | sechdrs[setupindex].sh_size | 2085 | sechdrs[setupindex].sh_size |
2040 | / sizeof(struct kernel_param)); | 2086 | / sizeof(struct kernel_param)); |
2041 | if (err < 0) | 2087 | if (err < 0) |
2042 | goto arch_cleanup; | 2088 | goto unlink; |
2043 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2089 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
2044 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2090 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
2045 | 2091 | ||
@@ -2054,9 +2100,13 @@ static struct module *load_module(void __user *umod, | |||
2054 | /* Done! */ | 2100 | /* Done! */ |
2055 | return mod; | 2101 | return mod; |
2056 | 2102 | ||
2057 | arch_cleanup: | 2103 | unlink: |
2104 | stop_machine_run(__unlink_module, mod, NR_CPUS); | ||
2058 | module_arch_cleanup(mod); | 2105 | module_arch_cleanup(mod); |
2059 | cleanup: | 2106 | cleanup: |
2107 | kobject_del(&mod->mkobj.kobj); | ||
2108 | kobject_put(&mod->mkobj.kobj); | ||
2109 | free_unload: | ||
2060 | module_unload_free(mod); | 2110 | module_unload_free(mod); |
2061 | module_free(mod, mod->module_init); | 2111 | module_free(mod, mod->module_init); |
2062 | free_core: | 2112 | free_core: |
@@ -2076,17 +2126,6 @@ static struct module *load_module(void __user *umod, | |||
2076 | goto free_hdr; | 2126 | goto free_hdr; |
2077 | } | 2127 | } |
2078 | 2128 | ||
2079 | /* | ||
2080 | * link the module with the whole machine is stopped with interrupts off | ||
2081 | * - this defends against kallsyms not taking locks | ||
2082 | */ | ||
2083 | static int __link_module(void *_mod) | ||
2084 | { | ||
2085 | struct module *mod = _mod; | ||
2086 | list_add(&mod->list, &modules); | ||
2087 | return 0; | ||
2088 | } | ||
2089 | |||
2090 | /* This is where the real work happens */ | 2129 | /* This is where the real work happens */ |
2091 | asmlinkage long | 2130 | asmlinkage long |
2092 | sys_init_module(void __user *umod, | 2131 | sys_init_module(void __user *umod, |
@@ -2111,10 +2150,6 @@ sys_init_module(void __user *umod, | |||
2111 | return PTR_ERR(mod); | 2150 | return PTR_ERR(mod); |
2112 | } | 2151 | } |
2113 | 2152 | ||
2114 | /* Now sew it into the lists. They won't access us, since | ||
2115 | strong_try_module_get() will fail. */ | ||
2116 | stop_machine_run(__link_module, mod, NR_CPUS); | ||
2117 | |||
2118 | /* Drop lock so they can recurse */ | 2153 | /* Drop lock so they can recurse */ |
2119 | mutex_unlock(&module_mutex); | 2154 | mutex_unlock(&module_mutex); |
2120 | 2155 | ||
@@ -2133,6 +2168,7 @@ sys_init_module(void __user *umod, | |||
2133 | mutex_lock(&module_mutex); | 2168 | mutex_lock(&module_mutex); |
2134 | free_module(mod); | 2169 | free_module(mod); |
2135 | mutex_unlock(&module_mutex); | 2170 | mutex_unlock(&module_mutex); |
2171 | wake_up(&module_wq); | ||
2136 | return ret; | 2172 | return ret; |
2137 | } | 2173 | } |
2138 | 2174 | ||
@@ -2147,6 +2183,7 @@ sys_init_module(void __user *umod, | |||
2147 | mod->init_size = 0; | 2183 | mod->init_size = 0; |
2148 | mod->init_text_size = 0; | 2184 | mod->init_text_size = 0; |
2149 | mutex_unlock(&module_mutex); | 2185 | mutex_unlock(&module_mutex); |
2186 | wake_up(&module_wq); | ||
2150 | 2187 | ||
2151 | return 0; | 2188 | return 0; |
2152 | } | 2189 | } |
@@ -2211,32 +2248,41 @@ static const char *get_ksymbol(struct module *mod, | |||
2211 | return mod->strtab + mod->symtab[best].st_name; | 2248 | return mod->strtab + mod->symtab[best].st_name; |
2212 | } | 2249 | } |
2213 | 2250 | ||
2214 | /* For kallsyms to ask for address resolution. NULL means not found. | 2251 | /* For kallsyms to ask for address resolution. NULL means not found. Careful |
2215 | We don't lock, as this is used for oops resolution and races are a | 2252 | * not to lock to avoid deadlock on oopses, simply disable preemption. */ |
2216 | lesser concern. */ | 2253 | char *module_address_lookup(unsigned long addr, |
2217 | const char *module_address_lookup(unsigned long addr, | 2254 | unsigned long *size, |
2218 | unsigned long *size, | 2255 | unsigned long *offset, |
2219 | unsigned long *offset, | 2256 | char **modname, |
2220 | char **modname) | 2257 | char *namebuf) |
2221 | { | 2258 | { |
2222 | struct module *mod; | 2259 | struct module *mod; |
2260 | const char *ret = NULL; | ||
2223 | 2261 | ||
2262 | preempt_disable(); | ||
2224 | list_for_each_entry(mod, &modules, list) { | 2263 | list_for_each_entry(mod, &modules, list) { |
2225 | if (within(addr, mod->module_init, mod->init_size) | 2264 | if (within(addr, mod->module_init, mod->init_size) |
2226 | || within(addr, mod->module_core, mod->core_size)) { | 2265 | || within(addr, mod->module_core, mod->core_size)) { |
2227 | if (modname) | 2266 | if (modname) |
2228 | *modname = mod->name; | 2267 | *modname = mod->name; |
2229 | return get_ksymbol(mod, addr, size, offset); | 2268 | ret = get_ksymbol(mod, addr, size, offset); |
2269 | break; | ||
2230 | } | 2270 | } |
2231 | } | 2271 | } |
2232 | return NULL; | 2272 | /* Make a copy in here where it's safe */ |
2273 | if (ret) { | ||
2274 | strncpy(namebuf, ret, KSYM_NAME_LEN - 1); | ||
2275 | ret = namebuf; | ||
2276 | } | ||
2277 | preempt_enable(); | ||
2278 | return (char *)ret; | ||
2233 | } | 2279 | } |
2234 | 2280 | ||
2235 | int lookup_module_symbol_name(unsigned long addr, char *symname) | 2281 | int lookup_module_symbol_name(unsigned long addr, char *symname) |
2236 | { | 2282 | { |
2237 | struct module *mod; | 2283 | struct module *mod; |
2238 | 2284 | ||
2239 | mutex_lock(&module_mutex); | 2285 | preempt_disable(); |
2240 | list_for_each_entry(mod, &modules, list) { | 2286 | list_for_each_entry(mod, &modules, list) { |
2241 | if (within(addr, mod->module_init, mod->init_size) || | 2287 | if (within(addr, mod->module_init, mod->init_size) || |
2242 | within(addr, mod->module_core, mod->core_size)) { | 2288 | within(addr, mod->module_core, mod->core_size)) { |
@@ -2246,12 +2292,12 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) | |||
2246 | if (!sym) | 2292 | if (!sym) |
2247 | goto out; | 2293 | goto out; |
2248 | strlcpy(symname, sym, KSYM_NAME_LEN); | 2294 | strlcpy(symname, sym, KSYM_NAME_LEN); |
2249 | mutex_unlock(&module_mutex); | 2295 | preempt_enable(); |
2250 | return 0; | 2296 | return 0; |
2251 | } | 2297 | } |
2252 | } | 2298 | } |
2253 | out: | 2299 | out: |
2254 | mutex_unlock(&module_mutex); | 2300 | preempt_enable(); |
2255 | return -ERANGE; | 2301 | return -ERANGE; |
2256 | } | 2302 | } |
2257 | 2303 | ||
@@ -2260,7 +2306,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
2260 | { | 2306 | { |
2261 | struct module *mod; | 2307 | struct module *mod; |
2262 | 2308 | ||
2263 | mutex_lock(&module_mutex); | 2309 | preempt_disable(); |
2264 | list_for_each_entry(mod, &modules, list) { | 2310 | list_for_each_entry(mod, &modules, list) { |
2265 | if (within(addr, mod->module_init, mod->init_size) || | 2311 | if (within(addr, mod->module_init, mod->init_size) || |
2266 | within(addr, mod->module_core, mod->core_size)) { | 2312 | within(addr, mod->module_core, mod->core_size)) { |
@@ -2273,12 +2319,12 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
2273 | strlcpy(modname, mod->name, MODULE_NAME_LEN); | 2319 | strlcpy(modname, mod->name, MODULE_NAME_LEN); |
2274 | if (name) | 2320 | if (name) |
2275 | strlcpy(name, sym, KSYM_NAME_LEN); | 2321 | strlcpy(name, sym, KSYM_NAME_LEN); |
2276 | mutex_unlock(&module_mutex); | 2322 | preempt_enable(); |
2277 | return 0; | 2323 | return 0; |
2278 | } | 2324 | } |
2279 | } | 2325 | } |
2280 | out: | 2326 | out: |
2281 | mutex_unlock(&module_mutex); | 2327 | preempt_enable(); |
2282 | return -ERANGE; | 2328 | return -ERANGE; |
2283 | } | 2329 | } |
2284 | 2330 | ||
@@ -2287,7 +2333,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | |||
2287 | { | 2333 | { |
2288 | struct module *mod; | 2334 | struct module *mod; |
2289 | 2335 | ||
2290 | mutex_lock(&module_mutex); | 2336 | preempt_disable(); |
2291 | list_for_each_entry(mod, &modules, list) { | 2337 | list_for_each_entry(mod, &modules, list) { |
2292 | if (symnum < mod->num_symtab) { | 2338 | if (symnum < mod->num_symtab) { |
2293 | *value = mod->symtab[symnum].st_value; | 2339 | *value = mod->symtab[symnum].st_value; |
@@ -2296,12 +2342,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | |||
2296 | KSYM_NAME_LEN); | 2342 | KSYM_NAME_LEN); |
2297 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); | 2343 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); |
2298 | *exported = is_exported(name, mod); | 2344 | *exported = is_exported(name, mod); |
2299 | mutex_unlock(&module_mutex); | 2345 | preempt_enable(); |
2300 | return 0; | 2346 | return 0; |
2301 | } | 2347 | } |
2302 | symnum -= mod->num_symtab; | 2348 | symnum -= mod->num_symtab; |
2303 | } | 2349 | } |
2304 | mutex_unlock(&module_mutex); | 2350 | preempt_enable(); |
2305 | return -ERANGE; | 2351 | return -ERANGE; |
2306 | } | 2352 | } |
2307 | 2353 | ||
@@ -2324,6 +2370,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) | |||
2324 | unsigned long ret = 0; | 2370 | unsigned long ret = 0; |
2325 | 2371 | ||
2326 | /* Don't lock: we're in enough trouble already. */ | 2372 | /* Don't lock: we're in enough trouble already. */ |
2373 | preempt_disable(); | ||
2327 | if ((colon = strchr(name, ':')) != NULL) { | 2374 | if ((colon = strchr(name, ':')) != NULL) { |
2328 | *colon = '\0'; | 2375 | *colon = '\0'; |
2329 | if ((mod = find_module(name)) != NULL) | 2376 | if ((mod = find_module(name)) != NULL) |
@@ -2334,6 +2381,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) | |||
2334 | if ((ret = mod_find_symname(mod, name)) != 0) | 2381 | if ((ret = mod_find_symname(mod, name)) != 0) |
2335 | break; | 2382 | break; |
2336 | } | 2383 | } |
2384 | preempt_enable(); | ||
2337 | return ret; | 2385 | return ret; |
2338 | } | 2386 | } |
2339 | #endif /* CONFIG_KALLSYMS */ | 2387 | #endif /* CONFIG_KALLSYMS */ |
@@ -2355,21 +2403,30 @@ static void m_stop(struct seq_file *m, void *p) | |||
2355 | mutex_unlock(&module_mutex); | 2403 | mutex_unlock(&module_mutex); |
2356 | } | 2404 | } |
2357 | 2405 | ||
2358 | static char *taint_flags(unsigned int taints, char *buf) | 2406 | static char *module_flags(struct module *mod, char *buf) |
2359 | { | 2407 | { |
2360 | int bx = 0; | 2408 | int bx = 0; |
2361 | 2409 | ||
2362 | if (taints) { | 2410 | if (mod->taints || |
2411 | mod->state == MODULE_STATE_GOING || | ||
2412 | mod->state == MODULE_STATE_COMING) { | ||
2363 | buf[bx++] = '('; | 2413 | buf[bx++] = '('; |
2364 | if (taints & TAINT_PROPRIETARY_MODULE) | 2414 | if (mod->taints & TAINT_PROPRIETARY_MODULE) |
2365 | buf[bx++] = 'P'; | 2415 | buf[bx++] = 'P'; |
2366 | if (taints & TAINT_FORCED_MODULE) | 2416 | if (mod->taints & TAINT_FORCED_MODULE) |
2367 | buf[bx++] = 'F'; | 2417 | buf[bx++] = 'F'; |
2368 | /* | 2418 | /* |
2369 | * TAINT_FORCED_RMMOD: could be added. | 2419 | * TAINT_FORCED_RMMOD: could be added. |
2370 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | 2420 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't |
2371 | * apply to modules. | 2421 | * apply to modules. |
2372 | */ | 2422 | */ |
2423 | |||
2424 | /* Show a - for module-is-being-unloaded */ | ||
2425 | if (mod->state == MODULE_STATE_GOING) | ||
2426 | buf[bx++] = '-'; | ||
2427 | /* Show a + for module-is-being-loaded */ | ||
2428 | if (mod->state == MODULE_STATE_COMING) | ||
2429 | buf[bx++] = '+'; | ||
2373 | buf[bx++] = ')'; | 2430 | buf[bx++] = ')'; |
2374 | } | 2431 | } |
2375 | buf[bx] = '\0'; | 2432 | buf[bx] = '\0'; |
@@ -2396,7 +2453,7 @@ static int m_show(struct seq_file *m, void *p) | |||
2396 | 2453 | ||
2397 | /* Taints info */ | 2454 | /* Taints info */ |
2398 | if (mod->taints) | 2455 | if (mod->taints) |
2399 | seq_printf(m, " %s", taint_flags(mod->taints, buf)); | 2456 | seq_printf(m, " %s", module_flags(mod, buf)); |
2400 | 2457 | ||
2401 | seq_printf(m, "\n"); | 2458 | seq_printf(m, "\n"); |
2402 | return 0; | 2459 | return 0; |
@@ -2491,97 +2548,12 @@ void print_modules(void) | |||
2491 | 2548 | ||
2492 | printk("Modules linked in:"); | 2549 | printk("Modules linked in:"); |
2493 | list_for_each_entry(mod, &modules, list) | 2550 | list_for_each_entry(mod, &modules, list) |
2494 | printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); | 2551 | printk(" %s%s", mod->name, module_flags(mod, buf)); |
2552 | if (last_unloaded_module[0]) | ||
2553 | printk(" [last unloaded: %s]", last_unloaded_module); | ||
2495 | printk("\n"); | 2554 | printk("\n"); |
2496 | } | 2555 | } |
2497 | 2556 | ||
2498 | #ifdef CONFIG_SYSFS | ||
2499 | static char *make_driver_name(struct device_driver *drv) | ||
2500 | { | ||
2501 | char *driver_name; | ||
2502 | |||
2503 | driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2, | ||
2504 | GFP_KERNEL); | ||
2505 | if (!driver_name) | ||
2506 | return NULL; | ||
2507 | |||
2508 | sprintf(driver_name, "%s:%s", drv->bus->name, drv->name); | ||
2509 | return driver_name; | ||
2510 | } | ||
2511 | |||
2512 | static void module_create_drivers_dir(struct module_kobject *mk) | ||
2513 | { | ||
2514 | if (!mk || mk->drivers_dir) | ||
2515 | return; | ||
2516 | |||
2517 | mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers"); | ||
2518 | } | ||
2519 | |||
2520 | void module_add_driver(struct module *mod, struct device_driver *drv) | ||
2521 | { | ||
2522 | char *driver_name; | ||
2523 | int no_warn; | ||
2524 | struct module_kobject *mk = NULL; | ||
2525 | |||
2526 | if (!drv) | ||
2527 | return; | ||
2528 | |||
2529 | if (mod) | ||
2530 | mk = &mod->mkobj; | ||
2531 | else if (drv->mod_name) { | ||
2532 | struct kobject *mkobj; | ||
2533 | |||
2534 | /* Lookup built-in module entry in /sys/modules */ | ||
2535 | mkobj = kset_find_obj(&module_subsys, drv->mod_name); | ||
2536 | if (mkobj) { | ||
2537 | mk = container_of(mkobj, struct module_kobject, kobj); | ||
2538 | /* remember our module structure */ | ||
2539 | drv->mkobj = mk; | ||
2540 | /* kset_find_obj took a reference */ | ||
2541 | kobject_put(mkobj); | ||
2542 | } | ||
2543 | } | ||
2544 | |||
2545 | if (!mk) | ||
2546 | return; | ||
2547 | |||
2548 | /* Don't check return codes; these calls are idempotent */ | ||
2549 | no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module"); | ||
2550 | driver_name = make_driver_name(drv); | ||
2551 | if (driver_name) { | ||
2552 | module_create_drivers_dir(mk); | ||
2553 | no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj, | ||
2554 | driver_name); | ||
2555 | kfree(driver_name); | ||
2556 | } | ||
2557 | } | ||
2558 | EXPORT_SYMBOL(module_add_driver); | ||
2559 | |||
2560 | void module_remove_driver(struct device_driver *drv) | ||
2561 | { | ||
2562 | struct module_kobject *mk = NULL; | ||
2563 | char *driver_name; | ||
2564 | |||
2565 | if (!drv) | ||
2566 | return; | ||
2567 | |||
2568 | sysfs_remove_link(&drv->kobj, "module"); | ||
2569 | |||
2570 | if (drv->owner) | ||
2571 | mk = &drv->owner->mkobj; | ||
2572 | else if (drv->mkobj) | ||
2573 | mk = drv->mkobj; | ||
2574 | if (mk && mk->drivers_dir) { | ||
2575 | driver_name = make_driver_name(drv); | ||
2576 | if (driver_name) { | ||
2577 | sysfs_remove_link(mk->drivers_dir, driver_name); | ||
2578 | kfree(driver_name); | ||
2579 | } | ||
2580 | } | ||
2581 | } | ||
2582 | EXPORT_SYMBOL(module_remove_driver); | ||
2583 | #endif | ||
2584 | |||
2585 | #ifdef CONFIG_MODVERSIONS | 2557 | #ifdef CONFIG_MODVERSIONS |
2586 | /* Generate the signature for struct module here, too, for modversions. */ | 2558 | /* Generate the signature for struct module here, too, for modversions. */ |
2587 | void struct_module(struct module *mod) { return; } | 2559 | void struct_module(struct module *mod) { return; } |
diff --git a/kernel/panic.c b/kernel/panic.c index 6f6e03e91595..d9e90cfe3298 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -19,6 +19,8 @@ | |||
19 | #include <linux/nmi.h> | 19 | #include <linux/nmi.h> |
20 | #include <linux/kexec.h> | 20 | #include <linux/kexec.h> |
21 | #include <linux/debug_locks.h> | 21 | #include <linux/debug_locks.h> |
22 | #include <linux/random.h> | ||
23 | #include <linux/kallsyms.h> | ||
22 | 24 | ||
23 | int panic_on_oops; | 25 | int panic_on_oops; |
24 | int tainted; | 26 | int tainted; |
@@ -266,13 +268,52 @@ void oops_enter(void) | |||
266 | } | 268 | } |
267 | 269 | ||
268 | /* | 270 | /* |
271 | * 64-bit random ID for oopses: | ||
272 | */ | ||
273 | static u64 oops_id; | ||
274 | |||
275 | static int init_oops_id(void) | ||
276 | { | ||
277 | if (!oops_id) | ||
278 | get_random_bytes(&oops_id, sizeof(oops_id)); | ||
279 | |||
280 | return 0; | ||
281 | } | ||
282 | late_initcall(init_oops_id); | ||
283 | |||
284 | static void print_oops_end_marker(void) | ||
285 | { | ||
286 | init_oops_id(); | ||
287 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", | ||
288 | (unsigned long long)oops_id); | ||
289 | } | ||
290 | |||
291 | /* | ||
269 | * Called when the architecture exits its oops handler, after printing | 292 | * Called when the architecture exits its oops handler, after printing |
270 | * everything. | 293 | * everything. |
271 | */ | 294 | */ |
272 | void oops_exit(void) | 295 | void oops_exit(void) |
273 | { | 296 | { |
274 | do_oops_enter_exit(); | 297 | do_oops_enter_exit(); |
298 | print_oops_end_marker(); | ||
299 | } | ||
300 | |||
301 | #ifdef WANT_WARN_ON_SLOWPATH | ||
302 | void warn_on_slowpath(const char *file, int line) | ||
303 | { | ||
304 | char function[KSYM_SYMBOL_LEN]; | ||
305 | unsigned long caller = (unsigned long) __builtin_return_address(0); | ||
306 | sprint_symbol(function, caller); | ||
307 | |||
308 | printk(KERN_WARNING "------------[ cut here ]------------\n"); | ||
309 | printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, | ||
310 | line, function); | ||
311 | print_modules(); | ||
312 | dump_stack(); | ||
313 | print_oops_end_marker(); | ||
275 | } | 314 | } |
315 | EXPORT_SYMBOL(warn_on_slowpath); | ||
316 | #endif | ||
276 | 317 | ||
277 | #ifdef CONFIG_CC_STACKPROTECTOR | 318 | #ifdef CONFIG_CC_STACKPROTECTOR |
278 | /* | 319 | /* |
diff --git a/kernel/params.c b/kernel/params.c index 2a4c51487e72..42fe5e6126c0 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp) | |||
376 | 376 | ||
377 | extern struct kernel_param __start___param[], __stop___param[]; | 377 | extern struct kernel_param __start___param[], __stop___param[]; |
378 | 378 | ||
379 | #define MAX_KBUILD_MODNAME KOBJ_NAME_LEN | ||
380 | |||
381 | struct param_attribute | 379 | struct param_attribute |
382 | { | 380 | { |
383 | struct module_attribute mattr; | 381 | struct module_attribute mattr; |
@@ -472,7 +470,7 @@ param_sysfs_setup(struct module_kobject *mk, | |||
472 | sizeof(mp->grp.attrs[0])); | 470 | sizeof(mp->grp.attrs[0])); |
473 | size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); | 471 | size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); |
474 | 472 | ||
475 | mp = kmalloc(size[0] + size[1], GFP_KERNEL); | 473 | mp = kzalloc(size[0] + size[1], GFP_KERNEL); |
476 | if (!mp) | 474 | if (!mp) |
477 | return ERR_PTR(-ENOMEM); | 475 | return ERR_PTR(-ENOMEM); |
478 | 476 | ||
@@ -560,11 +558,10 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
560 | BUG_ON(!mk); | 558 | BUG_ON(!mk); |
561 | 559 | ||
562 | mk->mod = THIS_MODULE; | 560 | mk->mod = THIS_MODULE; |
563 | kobj_set_kset_s(mk, module_subsys); | 561 | mk->kobj.kset = module_kset; |
564 | kobject_set_name(&mk->kobj, name); | 562 | ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); |
565 | kobject_init(&mk->kobj); | ||
566 | ret = kobject_add(&mk->kobj); | ||
567 | if (ret) { | 563 | if (ret) { |
564 | kobject_put(&mk->kobj); | ||
568 | printk(KERN_ERR "Module '%s' failed to be added to sysfs, " | 565 | printk(KERN_ERR "Module '%s' failed to be added to sysfs, " |
569 | "error number %d\n", name, ret); | 566 | "error number %d\n", name, ret); |
570 | printk(KERN_ERR "The system will be unstable now.\n"); | 567 | printk(KERN_ERR "The system will be unstable now.\n"); |
@@ -588,7 +585,7 @@ static void __init param_sysfs_builtin(void) | |||
588 | { | 585 | { |
589 | struct kernel_param *kp, *kp_begin = NULL; | 586 | struct kernel_param *kp, *kp_begin = NULL; |
590 | unsigned int i, name_len, count = 0; | 587 | unsigned int i, name_len, count = 0; |
591 | char modname[MAX_KBUILD_MODNAME + 1] = ""; | 588 | char modname[MODULE_NAME_LEN + 1] = ""; |
592 | 589 | ||
593 | for (i=0; i < __stop___param - __start___param; i++) { | 590 | for (i=0; i < __stop___param - __start___param; i++) { |
594 | char *dot; | 591 | char *dot; |
@@ -596,12 +593,12 @@ static void __init param_sysfs_builtin(void) | |||
596 | 593 | ||
597 | kp = &__start___param[i]; | 594 | kp = &__start___param[i]; |
598 | max_name_len = | 595 | max_name_len = |
599 | min_t(size_t, MAX_KBUILD_MODNAME, strlen(kp->name)); | 596 | min_t(size_t, MODULE_NAME_LEN, strlen(kp->name)); |
600 | 597 | ||
601 | dot = memchr(kp->name, '.', max_name_len); | 598 | dot = memchr(kp->name, '.', max_name_len); |
602 | if (!dot) { | 599 | if (!dot) { |
603 | DEBUGP("couldn't find period in first %d characters " | 600 | DEBUGP("couldn't find period in first %d characters " |
604 | "of %s\n", MAX_KBUILD_MODNAME, kp->name); | 601 | "of %s\n", MODULE_NAME_LEN, kp->name); |
605 | continue; | 602 | continue; |
606 | } | 603 | } |
607 | name_len = dot - kp->name; | 604 | name_len = dot - kp->name; |
@@ -679,8 +676,6 @@ static struct sysfs_ops module_sysfs_ops = { | |||
679 | .store = module_attr_store, | 676 | .store = module_attr_store, |
680 | }; | 677 | }; |
681 | 678 | ||
682 | static struct kobj_type module_ktype; | ||
683 | |||
684 | static int uevent_filter(struct kset *kset, struct kobject *kobj) | 679 | static int uevent_filter(struct kset *kset, struct kobject *kobj) |
685 | { | 680 | { |
686 | struct kobj_type *ktype = get_ktype(kobj); | 681 | struct kobj_type *ktype = get_ktype(kobj); |
@@ -694,10 +689,10 @@ static struct kset_uevent_ops module_uevent_ops = { | |||
694 | .filter = uevent_filter, | 689 | .filter = uevent_filter, |
695 | }; | 690 | }; |
696 | 691 | ||
697 | decl_subsys(module, &module_ktype, &module_uevent_ops); | 692 | struct kset *module_kset; |
698 | int module_sysfs_initialized; | 693 | int module_sysfs_initialized; |
699 | 694 | ||
700 | static struct kobj_type module_ktype = { | 695 | struct kobj_type module_ktype = { |
701 | .sysfs_ops = &module_sysfs_ops, | 696 | .sysfs_ops = &module_sysfs_ops, |
702 | }; | 697 | }; |
703 | 698 | ||
@@ -706,13 +701,11 @@ static struct kobj_type module_ktype = { | |||
706 | */ | 701 | */ |
707 | static int __init param_sysfs_init(void) | 702 | static int __init param_sysfs_init(void) |
708 | { | 703 | { |
709 | int ret; | 704 | module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); |
710 | 705 | if (!module_kset) { | |
711 | ret = subsystem_register(&module_subsys); | 706 | printk(KERN_WARNING "%s (%d): error creating kset\n", |
712 | if (ret < 0) { | 707 | __FILE__, __LINE__); |
713 | printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", | 708 | return -ENOMEM; |
714 | __FILE__, __LINE__, ret); | ||
715 | return ret; | ||
716 | } | 709 | } |
717 | module_sysfs_initialized = 1; | 710 | module_sysfs_initialized = 1; |
718 | 711 | ||
@@ -722,14 +715,7 @@ static int __init param_sysfs_init(void) | |||
722 | } | 715 | } |
723 | subsys_initcall(param_sysfs_init); | 716 | subsys_initcall(param_sysfs_init); |
724 | 717 | ||
725 | #else | 718 | #endif /* CONFIG_SYSFS */ |
726 | #if 0 | ||
727 | static struct sysfs_ops module_sysfs_ops = { | ||
728 | .show = NULL, | ||
729 | .store = NULL, | ||
730 | }; | ||
731 | #endif | ||
732 | #endif | ||
733 | 719 | ||
734 | EXPORT_SYMBOL(param_set_byte); | 720 | EXPORT_SYMBOL(param_set_byte); |
735 | EXPORT_SYMBOL(param_get_byte); | 721 | EXPORT_SYMBOL(param_get_byte); |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 68c96376e84a..0b7c82ac467e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
967 | { | 967 | { |
968 | int maxfire; | 968 | int maxfire; |
969 | struct list_head *timers = tsk->cpu_timers; | 969 | struct list_head *timers = tsk->cpu_timers; |
970 | struct signal_struct *const sig = tsk->signal; | ||
970 | 971 | ||
971 | maxfire = 20; | 972 | maxfire = 20; |
972 | tsk->it_prof_expires = cputime_zero; | 973 | tsk->it_prof_expires = cputime_zero; |
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1011 | t->firing = 1; | 1012 | t->firing = 1; |
1012 | list_move_tail(&t->entry, firing); | 1013 | list_move_tail(&t->entry, firing); |
1013 | } | 1014 | } |
1015 | |||
1016 | /* | ||
1017 | * Check for the special case thread timers. | ||
1018 | */ | ||
1019 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { | ||
1020 | unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; | ||
1021 | unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; | ||
1022 | |||
1023 | if (hard != RLIM_INFINITY && | ||
1024 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | ||
1025 | /* | ||
1026 | * At the hard limit, we just die. | ||
1027 | * No need to calculate anything else now. | ||
1028 | */ | ||
1029 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | ||
1030 | return; | ||
1031 | } | ||
1032 | if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { | ||
1033 | /* | ||
1034 | * At the soft limit, send a SIGXCPU every second. | ||
1035 | */ | ||
1036 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur | ||
1037 | < sig->rlim[RLIMIT_RTTIME].rlim_max) { | ||
1038 | sig->rlim[RLIMIT_RTTIME].rlim_cur += | ||
1039 | USEC_PER_SEC; | ||
1040 | } | ||
1041 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | ||
1042 | } | ||
1043 | } | ||
1014 | } | 1044 | } |
1015 | 1045 | ||
1016 | /* | 1046 | /* |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 05b64790fe83..b138b431e271 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -567,7 +567,8 @@ static const char * const hibernation_modes[] = { | |||
567 | * supports it (as determined by having hibernation_ops). | 567 | * supports it (as determined by having hibernation_ops). |
568 | */ | 568 | */ |
569 | 569 | ||
570 | static ssize_t disk_show(struct kset *kset, char *buf) | 570 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, |
571 | char *buf) | ||
571 | { | 572 | { |
572 | int i; | 573 | int i; |
573 | char *start = buf; | 574 | char *start = buf; |
@@ -597,7 +598,8 @@ static ssize_t disk_show(struct kset *kset, char *buf) | |||
597 | } | 598 | } |
598 | 599 | ||
599 | 600 | ||
600 | static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) | 601 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, |
602 | const char *buf, size_t n) | ||
601 | { | 603 | { |
602 | int error = 0; | 604 | int error = 0; |
603 | int i; | 605 | int i; |
@@ -642,13 +644,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) | |||
642 | 644 | ||
643 | power_attr(disk); | 645 | power_attr(disk); |
644 | 646 | ||
645 | static ssize_t resume_show(struct kset *kset, char *buf) | 647 | static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, |
648 | char *buf) | ||
646 | { | 649 | { |
647 | return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), | 650 | return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), |
648 | MINOR(swsusp_resume_device)); | 651 | MINOR(swsusp_resume_device)); |
649 | } | 652 | } |
650 | 653 | ||
651 | static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) | 654 | static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, |
655 | const char *buf, size_t n) | ||
652 | { | 656 | { |
653 | unsigned int maj, min; | 657 | unsigned int maj, min; |
654 | dev_t res; | 658 | dev_t res; |
@@ -674,12 +678,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) | |||
674 | 678 | ||
675 | power_attr(resume); | 679 | power_attr(resume); |
676 | 680 | ||
677 | static ssize_t image_size_show(struct kset *kset, char *buf) | 681 | static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, |
682 | char *buf) | ||
678 | { | 683 | { |
679 | return sprintf(buf, "%lu\n", image_size); | 684 | return sprintf(buf, "%lu\n", image_size); |
680 | } | 685 | } |
681 | 686 | ||
682 | static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) | 687 | static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, |
688 | const char *buf, size_t n) | ||
683 | { | 689 | { |
684 | unsigned long size; | 690 | unsigned long size; |
685 | 691 | ||
@@ -708,7 +714,7 @@ static struct attribute_group attr_group = { | |||
708 | 714 | ||
709 | static int __init pm_disk_init(void) | 715 | static int __init pm_disk_init(void) |
710 | { | 716 | { |
711 | return sysfs_create_group(&power_subsys.kobj, &attr_group); | 717 | return sysfs_create_group(power_kobj, &attr_group); |
712 | } | 718 | } |
713 | 719 | ||
714 | core_initcall(pm_disk_init); | 720 | core_initcall(pm_disk_init); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 3cdf95b1dc92..efc08360e627 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -28,6 +28,9 @@ BLOCKING_NOTIFIER_HEAD(pm_chain_head); | |||
28 | 28 | ||
29 | DEFINE_MUTEX(pm_mutex); | 29 | DEFINE_MUTEX(pm_mutex); |
30 | 30 | ||
31 | unsigned int pm_flags; | ||
32 | EXPORT_SYMBOL(pm_flags); | ||
33 | |||
31 | #ifdef CONFIG_SUSPEND | 34 | #ifdef CONFIG_SUSPEND |
32 | 35 | ||
33 | /* This is just an arbitrary number */ | 36 | /* This is just an arbitrary number */ |
@@ -273,8 +276,7 @@ EXPORT_SYMBOL(pm_suspend); | |||
273 | 276 | ||
274 | #endif /* CONFIG_SUSPEND */ | 277 | #endif /* CONFIG_SUSPEND */ |
275 | 278 | ||
276 | decl_subsys(power,NULL,NULL); | 279 | struct kobject *power_kobj; |
277 | |||
278 | 280 | ||
279 | /** | 281 | /** |
280 | * state - control system power state. | 282 | * state - control system power state. |
@@ -287,7 +289,8 @@ decl_subsys(power,NULL,NULL); | |||
287 | * proper enumerated value, and initiates a suspend transition. | 289 | * proper enumerated value, and initiates a suspend transition. |
288 | */ | 290 | */ |
289 | 291 | ||
290 | static ssize_t state_show(struct kset *kset, char *buf) | 292 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, |
293 | char *buf) | ||
291 | { | 294 | { |
292 | char *s = buf; | 295 | char *s = buf; |
293 | #ifdef CONFIG_SUSPEND | 296 | #ifdef CONFIG_SUSPEND |
@@ -308,7 +311,8 @@ static ssize_t state_show(struct kset *kset, char *buf) | |||
308 | return (s - buf); | 311 | return (s - buf); |
309 | } | 312 | } |
310 | 313 | ||
311 | static ssize_t state_store(struct kset *kset, const char *buf, size_t n) | 314 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, |
315 | const char *buf, size_t n) | ||
312 | { | 316 | { |
313 | #ifdef CONFIG_SUSPEND | 317 | #ifdef CONFIG_SUSPEND |
314 | suspend_state_t state = PM_SUSPEND_STANDBY; | 318 | suspend_state_t state = PM_SUSPEND_STANDBY; |
@@ -345,13 +349,15 @@ power_attr(state); | |||
345 | #ifdef CONFIG_PM_TRACE | 349 | #ifdef CONFIG_PM_TRACE |
346 | int pm_trace_enabled; | 350 | int pm_trace_enabled; |
347 | 351 | ||
348 | static ssize_t pm_trace_show(struct kset *kset, char *buf) | 352 | static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, |
353 | char *buf) | ||
349 | { | 354 | { |
350 | return sprintf(buf, "%d\n", pm_trace_enabled); | 355 | return sprintf(buf, "%d\n", pm_trace_enabled); |
351 | } | 356 | } |
352 | 357 | ||
353 | static ssize_t | 358 | static ssize_t |
354 | pm_trace_store(struct kset *kset, const char *buf, size_t n) | 359 | pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, |
360 | const char *buf, size_t n) | ||
355 | { | 361 | { |
356 | int val; | 362 | int val; |
357 | 363 | ||
@@ -383,10 +389,10 @@ static struct attribute_group attr_group = { | |||
383 | 389 | ||
384 | static int __init pm_init(void) | 390 | static int __init pm_init(void) |
385 | { | 391 | { |
386 | int error = subsystem_register(&power_subsys); | 392 | power_kobj = kobject_create_and_add("power", NULL); |
387 | if (!error) | 393 | if (!power_kobj) |
388 | error = sysfs_create_group(&power_subsys.kobj,&attr_group); | 394 | return -ENOMEM; |
389 | return error; | 395 | return sysfs_create_group(power_kobj, &attr_group); |
390 | } | 396 | } |
391 | 397 | ||
392 | core_initcall(pm_init); | 398 | core_initcall(pm_init); |
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index c50d15266c10..60c73fa670d5 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
@@ -27,8 +27,6 @@ | |||
27 | #include <linux/interrupt.h> | 27 | #include <linux/interrupt.h> |
28 | #include <linux/mutex.h> | 28 | #include <linux/mutex.h> |
29 | 29 | ||
30 | int pm_active; | ||
31 | |||
32 | /* | 30 | /* |
33 | * Locking notes: | 31 | * Locking notes: |
34 | * pm_devs_lock can be a semaphore providing pm ops are not called | 32 | * pm_devs_lock can be a semaphore providing pm ops are not called |
@@ -204,6 +202,4 @@ int pm_send_all(pm_request_t rqst, void *data) | |||
204 | 202 | ||
205 | EXPORT_SYMBOL(pm_register); | 203 | EXPORT_SYMBOL(pm_register); |
206 | EXPORT_SYMBOL(pm_send_all); | 204 | EXPORT_SYMBOL(pm_send_all); |
207 | EXPORT_SYMBOL(pm_active); | ||
208 | |||
209 | 205 | ||
diff --git a/kernel/power/power.h b/kernel/power/power.h index 195dc4611764..2093c3a9a994 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -54,7 +54,7 @@ extern int pfn_is_nosave(unsigned long); | |||
54 | extern struct mutex pm_mutex; | 54 | extern struct mutex pm_mutex; |
55 | 55 | ||
56 | #define power_attr(_name) \ | 56 | #define power_attr(_name) \ |
57 | static struct subsys_attribute _name##_attr = { \ | 57 | static struct kobj_attribute _name##_attr = { \ |
58 | .attr = { \ | 58 | .attr = { \ |
59 | .name = __stringify(_name), \ | 59 | .name = __stringify(_name), \ |
60 | .mode = 0644, \ | 60 | .mode = 0644, \ |
@@ -63,8 +63,6 @@ static struct subsys_attribute _name##_attr = { \ | |||
63 | .store = _name##_store, \ | 63 | .store = _name##_store, \ |
64 | } | 64 | } |
65 | 65 | ||
66 | extern struct kset power_subsys; | ||
67 | |||
68 | /* Preferred image size in bytes (default 500 MB) */ | 66 | /* Preferred image size in bytes (default 500 MB) */ |
69 | extern unsigned long image_size; | 67 | extern unsigned long image_size; |
70 | extern int in_suspend; | 68 | extern int in_suspend; |
diff --git a/kernel/printk.c b/kernel/printk.c index a30fe33de395..29ae1e99cde0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -36,6 +36,13 @@ | |||
36 | 36 | ||
37 | #include <asm/uaccess.h> | 37 | #include <asm/uaccess.h> |
38 | 38 | ||
39 | /* | ||
40 | * Architectures can override it: | ||
41 | */ | ||
42 | void __attribute__((weak)) early_printk(const char *fmt, ...) | ||
43 | { | ||
44 | } | ||
45 | |||
39 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 46 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
40 | 47 | ||
41 | /* printk's without a loglevel use this.. */ | 48 | /* printk's without a loglevel use this.. */ |
@@ -448,10 +455,10 @@ static int __init ignore_loglevel_setup(char *str) | |||
448 | ignore_loglevel = 1; | 455 | ignore_loglevel = 1; |
449 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | 456 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); |
450 | 457 | ||
451 | return 1; | 458 | return 0; |
452 | } | 459 | } |
453 | 460 | ||
454 | __setup("ignore_loglevel", ignore_loglevel_setup); | 461 | early_param("ignore_loglevel", ignore_loglevel_setup); |
455 | 462 | ||
456 | /* | 463 | /* |
457 | * Write out chars from start to end - 1 inclusive | 464 | * Write out chars from start to end - 1 inclusive |
@@ -573,11 +580,6 @@ static int __init printk_time_setup(char *str) | |||
573 | 580 | ||
574 | __setup("time", printk_time_setup); | 581 | __setup("time", printk_time_setup); |
575 | 582 | ||
576 | __attribute__((weak)) unsigned long long printk_clock(void) | ||
577 | { | ||
578 | return sched_clock(); | ||
579 | } | ||
580 | |||
581 | /* Check if we have any console registered that can be called early in boot. */ | 583 | /* Check if we have any console registered that can be called early in boot. */ |
582 | static int have_callable_console(void) | 584 | static int have_callable_console(void) |
583 | { | 585 | { |
@@ -628,30 +630,57 @@ asmlinkage int printk(const char *fmt, ...) | |||
628 | /* cpu currently holding logbuf_lock */ | 630 | /* cpu currently holding logbuf_lock */ |
629 | static volatile unsigned int printk_cpu = UINT_MAX; | 631 | static volatile unsigned int printk_cpu = UINT_MAX; |
630 | 632 | ||
633 | const char printk_recursion_bug_msg [] = | ||
634 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
635 | static int printk_recursion_bug; | ||
636 | |||
631 | asmlinkage int vprintk(const char *fmt, va_list args) | 637 | asmlinkage int vprintk(const char *fmt, va_list args) |
632 | { | 638 | { |
639 | static int log_level_unknown = 1; | ||
640 | static char printk_buf[1024]; | ||
641 | |||
633 | unsigned long flags; | 642 | unsigned long flags; |
634 | int printed_len; | 643 | int printed_len = 0; |
644 | int this_cpu; | ||
635 | char *p; | 645 | char *p; |
636 | static char printk_buf[1024]; | ||
637 | static int log_level_unknown = 1; | ||
638 | 646 | ||
639 | boot_delay_msec(); | 647 | boot_delay_msec(); |
640 | 648 | ||
641 | preempt_disable(); | 649 | preempt_disable(); |
642 | if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) | ||
643 | /* If a crash is occurring during printk() on this CPU, | ||
644 | * make sure we can't deadlock */ | ||
645 | zap_locks(); | ||
646 | |||
647 | /* This stops the holder of console_sem just where we want him */ | 650 | /* This stops the holder of console_sem just where we want him */ |
648 | raw_local_irq_save(flags); | 651 | raw_local_irq_save(flags); |
652 | this_cpu = smp_processor_id(); | ||
653 | |||
654 | /* | ||
655 | * Ouch, printk recursed into itself! | ||
656 | */ | ||
657 | if (unlikely(printk_cpu == this_cpu)) { | ||
658 | /* | ||
659 | * If a crash is occurring during printk() on this CPU, | ||
660 | * then try to get the crash message out but make sure | ||
661 | * we can't deadlock. Otherwise just return to avoid the | ||
662 | * recursion and return - but flag the recursion so that | ||
663 | * it can be printed at the next appropriate moment: | ||
664 | */ | ||
665 | if (!oops_in_progress) { | ||
666 | printk_recursion_bug = 1; | ||
667 | goto out_restore_irqs; | ||
668 | } | ||
669 | zap_locks(); | ||
670 | } | ||
671 | |||
649 | lockdep_off(); | 672 | lockdep_off(); |
650 | spin_lock(&logbuf_lock); | 673 | spin_lock(&logbuf_lock); |
651 | printk_cpu = smp_processor_id(); | 674 | printk_cpu = this_cpu; |
652 | 675 | ||
676 | if (printk_recursion_bug) { | ||
677 | printk_recursion_bug = 0; | ||
678 | strcpy(printk_buf, printk_recursion_bug_msg); | ||
679 | printed_len = sizeof(printk_recursion_bug_msg); | ||
680 | } | ||
653 | /* Emit the output into the temporary buffer */ | 681 | /* Emit the output into the temporary buffer */ |
654 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); | 682 | printed_len += vscnprintf(printk_buf + printed_len, |
683 | sizeof(printk_buf), fmt, args); | ||
655 | 684 | ||
656 | /* | 685 | /* |
657 | * Copy the output into log_buf. If the caller didn't provide | 686 | * Copy the output into log_buf. If the caller didn't provide |
@@ -680,7 +709,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
680 | loglev_char = default_message_loglevel | 709 | loglev_char = default_message_loglevel |
681 | + '0'; | 710 | + '0'; |
682 | } | 711 | } |
683 | t = printk_clock(); | 712 | t = cpu_clock(printk_cpu); |
684 | nanosec_rem = do_div(t, 1000000000); | 713 | nanosec_rem = do_div(t, 1000000000); |
685 | tlen = sprintf(tbuf, | 714 | tlen = sprintf(tbuf, |
686 | "<%c>[%5lu.%06lu] ", | 715 | "<%c>[%5lu.%06lu] ", |
@@ -744,6 +773,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
744 | printk_cpu = UINT_MAX; | 773 | printk_cpu = UINT_MAX; |
745 | spin_unlock(&logbuf_lock); | 774 | spin_unlock(&logbuf_lock); |
746 | lockdep_on(); | 775 | lockdep_on(); |
776 | out_restore_irqs: | ||
747 | raw_local_irq_restore(flags); | 777 | raw_local_irq_restore(flags); |
748 | } | 778 | } |
749 | 779 | ||
@@ -817,7 +847,7 @@ __setup("console=", console_setup); | |||
817 | * commonly to provide a default console (ie from PROM variables) when | 847 | * commonly to provide a default console (ie from PROM variables) when |
818 | * the user has not supplied one. | 848 | * the user has not supplied one. |
819 | */ | 849 | */ |
820 | int __init add_preferred_console(char *name, int idx, char *options) | 850 | int add_preferred_console(char *name, int idx, char *options) |
821 | { | 851 | { |
822 | struct console_cmdline *c; | 852 | struct console_cmdline *c; |
823 | int i; | 853 | int i; |
diff --git a/kernel/profile.c b/kernel/profile.c index 5e95330e5120..e64c2da11c0f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); | |||
52 | static DEFINE_MUTEX(profile_flip_mutex); | 52 | static DEFINE_MUTEX(profile_flip_mutex); |
53 | #endif /* CONFIG_SMP */ | 53 | #endif /* CONFIG_SMP */ |
54 | 54 | ||
55 | static int __init profile_setup(char * str) | 55 | static int __init profile_setup(char *str) |
56 | { | 56 | { |
57 | static char __initdata schedstr[] = "schedule"; | 57 | static char __initdata schedstr[] = "schedule"; |
58 | static char __initdata sleepstr[] = "sleep"; | 58 | static char __initdata sleepstr[] = "sleep"; |
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup); | |||
104 | 104 | ||
105 | void __init profile_init(void) | 105 | void __init profile_init(void) |
106 | { | 106 | { |
107 | if (!prof_on) | 107 | if (!prof_on) |
108 | return; | 108 | return; |
109 | 109 | ||
110 | /* only text is profiled */ | 110 | /* only text is profiled */ |
111 | prof_len = (_etext - _stext) >> prof_shift; | 111 | prof_len = (_etext - _stext) >> prof_shift; |
112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); | 112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); |
113 | } | 113 | } |
114 | 114 | ||
115 | /* Profile event notifications */ | 115 | /* Profile event notifications */ |
116 | 116 | ||
117 | #ifdef CONFIG_PROFILING | 117 | #ifdef CONFIG_PROFILING |
118 | 118 | ||
119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); | 119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); |
120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); | 120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); |
121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); | 121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); |
122 | 122 | ||
123 | void profile_task_exit(struct task_struct * task) | 123 | void profile_task_exit(struct task_struct *task) |
124 | { | 124 | { |
125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); | 125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); |
126 | } | 126 | } |
127 | 127 | ||
128 | int profile_handoff_task(struct task_struct * task) | 128 | int profile_handoff_task(struct task_struct *task) |
129 | { | 129 | { |
130 | int ret; | 130 | int ret; |
131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); | 131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); |
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr) | |||
137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); | 137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); |
138 | } | 138 | } |
139 | 139 | ||
140 | int task_handoff_register(struct notifier_block * n) | 140 | int task_handoff_register(struct notifier_block *n) |
141 | { | 141 | { |
142 | return atomic_notifier_chain_register(&task_free_notifier, n); | 142 | return atomic_notifier_chain_register(&task_free_notifier, n); |
143 | } | 143 | } |
144 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
144 | 145 | ||
145 | int task_handoff_unregister(struct notifier_block * n) | 146 | int task_handoff_unregister(struct notifier_block *n) |
146 | { | 147 | { |
147 | return atomic_notifier_chain_unregister(&task_free_notifier, n); | 148 | return atomic_notifier_chain_unregister(&task_free_notifier, n); |
148 | } | 149 | } |
150 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
149 | 151 | ||
150 | int profile_event_register(enum profile_type type, struct notifier_block * n) | 152 | int profile_event_register(enum profile_type type, struct notifier_block *n) |
151 | { | 153 | { |
152 | int err = -EINVAL; | 154 | int err = -EINVAL; |
153 | 155 | ||
154 | switch (type) { | 156 | switch (type) { |
155 | case PROFILE_TASK_EXIT: | 157 | case PROFILE_TASK_EXIT: |
156 | err = blocking_notifier_chain_register( | 158 | err = blocking_notifier_chain_register( |
157 | &task_exit_notifier, n); | 159 | &task_exit_notifier, n); |
158 | break; | 160 | break; |
159 | case PROFILE_MUNMAP: | 161 | case PROFILE_MUNMAP: |
160 | err = blocking_notifier_chain_register( | 162 | err = blocking_notifier_chain_register( |
161 | &munmap_notifier, n); | 163 | &munmap_notifier, n); |
162 | break; | 164 | break; |
163 | } | 165 | } |
164 | 166 | ||
165 | return err; | 167 | return err; |
166 | } | 168 | } |
169 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
167 | 170 | ||
168 | 171 | int profile_event_unregister(enum profile_type type, struct notifier_block *n) | |
169 | int profile_event_unregister(enum profile_type type, struct notifier_block * n) | ||
170 | { | 172 | { |
171 | int err = -EINVAL; | 173 | int err = -EINVAL; |
172 | 174 | ||
173 | switch (type) { | 175 | switch (type) { |
174 | case PROFILE_TASK_EXIT: | 176 | case PROFILE_TASK_EXIT: |
175 | err = blocking_notifier_chain_unregister( | 177 | err = blocking_notifier_chain_unregister( |
176 | &task_exit_notifier, n); | 178 | &task_exit_notifier, n); |
177 | break; | 179 | break; |
178 | case PROFILE_MUNMAP: | 180 | case PROFILE_MUNMAP: |
179 | err = blocking_notifier_chain_unregister( | 181 | err = blocking_notifier_chain_unregister( |
180 | &munmap_notifier, n); | 182 | &munmap_notifier, n); |
181 | break; | 183 | break; |
182 | } | 184 | } |
183 | 185 | ||
184 | return err; | 186 | return err; |
185 | } | 187 | } |
188 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
186 | 189 | ||
187 | int register_timer_hook(int (*hook)(struct pt_regs *)) | 190 | int register_timer_hook(int (*hook)(struct pt_regs *)) |
188 | { | 191 | { |
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *)) | |||
191 | timer_hook = hook; | 194 | timer_hook = hook; |
192 | return 0; | 195 | return 0; |
193 | } | 196 | } |
197 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
194 | 198 | ||
195 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) | 199 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) |
196 | { | 200 | { |
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *)) | |||
199 | /* make sure all CPUs see the NULL hook */ | 203 | /* make sure all CPUs see the NULL hook */ |
200 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ | 204 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ |
201 | } | 205 | } |
202 | |||
203 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
204 | EXPORT_SYMBOL_GPL(unregister_timer_hook); | 206 | EXPORT_SYMBOL_GPL(unregister_timer_hook); |
205 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
206 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
207 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
208 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
209 | 207 | ||
210 | #endif /* CONFIG_PROFILING */ | 208 | #endif /* CONFIG_PROFILING */ |
211 | 209 | ||
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
366 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | 364 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); |
367 | } | 365 | } |
368 | break; | 366 | break; |
369 | out_free: | 367 | out_free: |
370 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | 368 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); |
371 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | 369 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; |
372 | __free_page(page); | 370 | __free_page(page); |
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits) | |||
409 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); | 407 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
410 | } | 408 | } |
411 | #endif /* !CONFIG_SMP */ | 409 | #endif /* !CONFIG_SMP */ |
412 | |||
413 | EXPORT_SYMBOL_GPL(profile_hits); | 410 | EXPORT_SYMBOL_GPL(profile_hits); |
414 | 411 | ||
415 | void profile_tick(int type) | 412 | void profile_tick(int type) |
@@ -427,7 +424,7 @@ void profile_tick(int type) | |||
427 | #include <asm/uaccess.h> | 424 | #include <asm/uaccess.h> |
428 | #include <asm/ptrace.h> | 425 | #include <asm/ptrace.h> |
429 | 426 | ||
430 | static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | 427 | static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, |
431 | int count, int *eof, void *data) | 428 | int count, int *eof, void *data) |
432 | { | 429 | { |
433 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); | 430 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); |
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | |||
437 | return len; | 434 | return len; |
438 | } | 435 | } |
439 | 436 | ||
440 | static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, | 437 | static int prof_cpu_mask_write_proc(struct file *file, |
441 | unsigned long count, void *data) | 438 | const char __user *buffer, unsigned long count, void *data) |
442 | { | 439 | { |
443 | cpumask_t *mask = (cpumask_t *)data; | 440 | cpumask_t *mask = (cpumask_t *)data; |
444 | unsigned long full_count = count, err; | 441 | unsigned long full_count = count, err; |
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) | |||
457 | struct proc_dir_entry *entry; | 454 | struct proc_dir_entry *entry; |
458 | 455 | ||
459 | /* create /proc/irq/prof_cpu_mask */ | 456 | /* create /proc/irq/prof_cpu_mask */ |
460 | if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) | 457 | entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); |
458 | if (!entry) | ||
461 | return; | 459 | return; |
462 | entry->data = (void *)&prof_cpu_mask; | 460 | entry->data = (void *)&prof_cpu_mask; |
463 | entry->read_proc = prof_cpu_mask_read_proc; | 461 | entry->read_proc = prof_cpu_mask_read_proc; |
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
475 | { | 473 | { |
476 | unsigned long p = *ppos; | 474 | unsigned long p = *ppos; |
477 | ssize_t read; | 475 | ssize_t read; |
478 | char * pnt; | 476 | char *pnt; |
479 | unsigned int sample_step = 1 << prof_shift; | 477 | unsigned int sample_step = 1 << prof_shift; |
480 | 478 | ||
481 | profile_flip_buffers(); | 479 | profile_flip_buffers(); |
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
486 | read = 0; | 484 | read = 0; |
487 | 485 | ||
488 | while (p < sizeof(unsigned int) && count > 0) { | 486 | while (p < sizeof(unsigned int) && count > 0) { |
489 | if (put_user(*((char *)(&sample_step)+p),buf)) | 487 | if (put_user(*((char *)(&sample_step)+p), buf)) |
490 | return -EFAULT; | 488 | return -EFAULT; |
491 | buf++; p++; count--; read++; | 489 | buf++; p++; count--; read++; |
492 | } | 490 | } |
493 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); | 491 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); |
494 | if (copy_to_user(buf,(void *)pnt,count)) | 492 | if (copy_to_user(buf, (void *)pnt, count)) |
495 | return -EFAULT; | 493 | return -EFAULT; |
496 | read += count; | 494 | read += count; |
497 | *ppos += read; | 495 | *ppos += read; |
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
508 | size_t count, loff_t *ppos) | 506 | size_t count, loff_t *ppos) |
509 | { | 507 | { |
510 | #ifdef CONFIG_SMP | 508 | #ifdef CONFIG_SMP |
511 | extern int setup_profiling_timer (unsigned int multiplier); | 509 | extern int setup_profiling_timer(unsigned int multiplier); |
512 | 510 | ||
513 | if (count == sizeof(int)) { | 511 | if (count == sizeof(int)) { |
514 | unsigned int multiplier; | 512 | unsigned int multiplier; |
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void) | |||
591 | return 0; | 589 | return 0; |
592 | if (create_hash_tables()) | 590 | if (create_hash_tables()) |
593 | return -1; | 591 | return -1; |
594 | if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) | 592 | entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); |
593 | if (!entry) | ||
595 | return 0; | 594 | return 0; |
596 | entry->proc_fops = &proc_profile_operations; | 595 | entry->proc_fops = &proc_profile_operations; |
597 | entry->size = (1+prof_len) * sizeof(atomic_t); | 596 | entry->size = (1+prof_len) * sizeof(atomic_t); |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 26f9923baddc..b0d4ab4dfd3d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -120,7 +120,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
120 | return ret; | 120 | return ret; |
121 | } | 121 | } |
122 | 122 | ||
123 | static int may_attach(struct task_struct *task) | 123 | int __ptrace_may_attach(struct task_struct *task) |
124 | { | 124 | { |
125 | /* May we inspect the given task? | 125 | /* May we inspect the given task? |
126 | * This check is used both for attaching with ptrace | 126 | * This check is used both for attaching with ptrace |
@@ -154,7 +154,7 @@ int ptrace_may_attach(struct task_struct *task) | |||
154 | { | 154 | { |
155 | int err; | 155 | int err; |
156 | task_lock(task); | 156 | task_lock(task); |
157 | err = may_attach(task); | 157 | err = __ptrace_may_attach(task); |
158 | task_unlock(task); | 158 | task_unlock(task); |
159 | return !err; | 159 | return !err; |
160 | } | 160 | } |
@@ -196,7 +196,7 @@ repeat: | |||
196 | /* the same process cannot be attached many times */ | 196 | /* the same process cannot be attached many times */ |
197 | if (task->ptrace & PT_PTRACED) | 197 | if (task->ptrace & PT_PTRACED) |
198 | goto bad; | 198 | goto bad; |
199 | retval = may_attach(task); | 199 | retval = __ptrace_may_attach(task); |
200 | if (retval) | 200 | if (retval) |
201 | goto bad; | 201 | goto bad; |
202 | 202 | ||
@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) | |||
366 | return error; | 366 | return error; |
367 | } | 367 | } |
368 | 368 | ||
369 | |||
370 | #ifdef PTRACE_SINGLESTEP | ||
371 | #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) | ||
372 | #else | ||
373 | #define is_singlestep(request) 0 | ||
374 | #endif | ||
375 | |||
376 | #ifdef PTRACE_SINGLEBLOCK | ||
377 | #define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) | ||
378 | #else | ||
379 | #define is_singleblock(request) 0 | ||
380 | #endif | ||
381 | |||
382 | #ifdef PTRACE_SYSEMU | ||
383 | #define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP) | ||
384 | #else | ||
385 | #define is_sysemu_singlestep(request) 0 | ||
386 | #endif | ||
387 | |||
388 | static int ptrace_resume(struct task_struct *child, long request, long data) | ||
389 | { | ||
390 | if (!valid_signal(data)) | ||
391 | return -EIO; | ||
392 | |||
393 | if (request == PTRACE_SYSCALL) | ||
394 | set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
395 | else | ||
396 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
397 | |||
398 | #ifdef TIF_SYSCALL_EMU | ||
399 | if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) | ||
400 | set_tsk_thread_flag(child, TIF_SYSCALL_EMU); | ||
401 | else | ||
402 | clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); | ||
403 | #endif | ||
404 | |||
405 | if (is_singleblock(request)) { | ||
406 | if (unlikely(!arch_has_block_step())) | ||
407 | return -EIO; | ||
408 | user_enable_block_step(child); | ||
409 | } else if (is_singlestep(request) || is_sysemu_singlestep(request)) { | ||
410 | if (unlikely(!arch_has_single_step())) | ||
411 | return -EIO; | ||
412 | user_enable_single_step(child); | ||
413 | } | ||
414 | else | ||
415 | user_disable_single_step(child); | ||
416 | |||
417 | child->exit_code = data; | ||
418 | wake_up_process(child); | ||
419 | |||
420 | return 0; | ||
421 | } | ||
422 | |||
369 | int ptrace_request(struct task_struct *child, long request, | 423 | int ptrace_request(struct task_struct *child, long request, |
370 | long addr, long data) | 424 | long addr, long data) |
371 | { | 425 | { |
372 | int ret = -EIO; | 426 | int ret = -EIO; |
373 | 427 | ||
374 | switch (request) { | 428 | switch (request) { |
429 | case PTRACE_PEEKTEXT: | ||
430 | case PTRACE_PEEKDATA: | ||
431 | return generic_ptrace_peekdata(child, addr, data); | ||
432 | case PTRACE_POKETEXT: | ||
433 | case PTRACE_POKEDATA: | ||
434 | return generic_ptrace_pokedata(child, addr, data); | ||
435 | |||
375 | #ifdef PTRACE_OLDSETOPTIONS | 436 | #ifdef PTRACE_OLDSETOPTIONS |
376 | case PTRACE_OLDSETOPTIONS: | 437 | case PTRACE_OLDSETOPTIONS: |
377 | #endif | 438 | #endif |
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request, | |||
390 | case PTRACE_DETACH: /* detach a process that was attached. */ | 451 | case PTRACE_DETACH: /* detach a process that was attached. */ |
391 | ret = ptrace_detach(child, data); | 452 | ret = ptrace_detach(child, data); |
392 | break; | 453 | break; |
454 | |||
455 | #ifdef PTRACE_SINGLESTEP | ||
456 | case PTRACE_SINGLESTEP: | ||
457 | #endif | ||
458 | #ifdef PTRACE_SINGLEBLOCK | ||
459 | case PTRACE_SINGLEBLOCK: | ||
460 | #endif | ||
461 | #ifdef PTRACE_SYSEMU | ||
462 | case PTRACE_SYSEMU: | ||
463 | case PTRACE_SYSEMU_SINGLESTEP: | ||
464 | #endif | ||
465 | case PTRACE_SYSCALL: | ||
466 | case PTRACE_CONT: | ||
467 | return ptrace_resume(child, request, data); | ||
468 | |||
469 | case PTRACE_KILL: | ||
470 | if (child->exit_state) /* already dead */ | ||
471 | return 0; | ||
472 | return ptrace_resume(child, request, SIGKILL); | ||
473 | |||
393 | default: | 474 | default: |
394 | break; | 475 | break; |
395 | } | 476 | } |
@@ -470,6 +551,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | |||
470 | lock_kernel(); | 551 | lock_kernel(); |
471 | if (request == PTRACE_TRACEME) { | 552 | if (request == PTRACE_TRACEME) { |
472 | ret = ptrace_traceme(); | 553 | ret = ptrace_traceme(); |
554 | if (!ret) | ||
555 | arch_ptrace_attach(current); | ||
473 | goto out; | 556 | goto out; |
474 | } | 557 | } |
475 | 558 | ||
@@ -524,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | |||
524 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); | 607 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); |
525 | return (copied == sizeof(data)) ? 0 : -EIO; | 608 | return (copied == sizeof(data)) ? 0 : -EIO; |
526 | } | 609 | } |
610 | |||
611 | #ifdef CONFIG_COMPAT | ||
612 | #include <linux/compat.h> | ||
613 | |||
614 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, | ||
615 | compat_ulong_t addr, compat_ulong_t data) | ||
616 | { | ||
617 | compat_ulong_t __user *datap = compat_ptr(data); | ||
618 | compat_ulong_t word; | ||
619 | int ret; | ||
620 | |||
621 | switch (request) { | ||
622 | case PTRACE_PEEKTEXT: | ||
623 | case PTRACE_PEEKDATA: | ||
624 | ret = access_process_vm(child, addr, &word, sizeof(word), 0); | ||
625 | if (ret != sizeof(word)) | ||
626 | ret = -EIO; | ||
627 | else | ||
628 | ret = put_user(word, datap); | ||
629 | break; | ||
630 | |||
631 | case PTRACE_POKETEXT: | ||
632 | case PTRACE_POKEDATA: | ||
633 | ret = access_process_vm(child, addr, &data, sizeof(data), 1); | ||
634 | ret = (ret != sizeof(data) ? -EIO : 0); | ||
635 | break; | ||
636 | |||
637 | case PTRACE_GETEVENTMSG: | ||
638 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); | ||
639 | break; | ||
640 | |||
641 | default: | ||
642 | ret = ptrace_request(child, request, addr, data); | ||
643 | } | ||
644 | |||
645 | return ret; | ||
646 | } | ||
647 | |||
648 | #ifdef __ARCH_WANT_COMPAT_SYS_PTRACE | ||
649 | asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | ||
650 | compat_long_t addr, compat_long_t data) | ||
651 | { | ||
652 | struct task_struct *child; | ||
653 | long ret; | ||
654 | |||
655 | /* | ||
656 | * This lock_kernel fixes a subtle race with suid exec | ||
657 | */ | ||
658 | lock_kernel(); | ||
659 | if (request == PTRACE_TRACEME) { | ||
660 | ret = ptrace_traceme(); | ||
661 | goto out; | ||
662 | } | ||
663 | |||
664 | child = ptrace_get_task_struct(pid); | ||
665 | if (IS_ERR(child)) { | ||
666 | ret = PTR_ERR(child); | ||
667 | goto out; | ||
668 | } | ||
669 | |||
670 | if (request == PTRACE_ATTACH) { | ||
671 | ret = ptrace_attach(child); | ||
672 | /* | ||
673 | * Some architectures need to do book-keeping after | ||
674 | * a ptrace attach. | ||
675 | */ | ||
676 | if (!ret) | ||
677 | arch_ptrace_attach(child); | ||
678 | goto out_put_task_struct; | ||
679 | } | ||
680 | |||
681 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
682 | if (!ret) | ||
683 | ret = compat_arch_ptrace(child, request, addr, data); | ||
684 | |||
685 | out_put_task_struct: | ||
686 | put_task_struct(child); | ||
687 | out: | ||
688 | unlock_kernel(); | ||
689 | return ret; | ||
690 | } | ||
691 | #endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ | ||
692 | |||
693 | #endif /* CONFIG_COMPAT */ | ||
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c new file mode 100644 index 000000000000..f4ffbd0f306f --- /dev/null +++ b/kernel/rcuclassic.c | |||
@@ -0,0 +1,575 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2001 | ||
19 | * | ||
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | ||
21 | * Manfred Spraul <manfred@colorfullife.com> | ||
22 | * | ||
23 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
25 | * Papers: | ||
26 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
27 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
28 | * | ||
29 | * For detailed explanation of Read-Copy Update mechanism see - | ||
30 | * Documentation/RCU | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/types.h> | ||
34 | #include <linux/kernel.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/spinlock.h> | ||
37 | #include <linux/smp.h> | ||
38 | #include <linux/rcupdate.h> | ||
39 | #include <linux/interrupt.h> | ||
40 | #include <linux/sched.h> | ||
41 | #include <asm/atomic.h> | ||
42 | #include <linux/bitops.h> | ||
43 | #include <linux/module.h> | ||
44 | #include <linux/completion.h> | ||
45 | #include <linux/moduleparam.h> | ||
46 | #include <linux/percpu.h> | ||
47 | #include <linux/notifier.h> | ||
48 | #include <linux/cpu.h> | ||
49 | #include <linux/mutex.h> | ||
50 | |||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
52 | static struct lock_class_key rcu_lock_key; | ||
53 | struct lockdep_map rcu_lock_map = | ||
54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
55 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
56 | #endif | ||
57 | |||
58 | |||
59 | /* Definition for rcupdate control block. */ | ||
60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
61 | .cur = -300, | ||
62 | .completed = -300, | ||
63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
64 | .cpumask = CPU_MASK_NONE, | ||
65 | }; | ||
66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
67 | .cur = -300, | ||
68 | .completed = -300, | ||
69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
70 | .cpumask = CPU_MASK_NONE, | ||
71 | }; | ||
72 | |||
73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | ||
74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
75 | |||
76 | static int blimit = 10; | ||
77 | static int qhimark = 10000; | ||
78 | static int qlowmark = 100; | ||
79 | |||
80 | #ifdef CONFIG_SMP | ||
81 | static void force_quiescent_state(struct rcu_data *rdp, | ||
82 | struct rcu_ctrlblk *rcp) | ||
83 | { | ||
84 | int cpu; | ||
85 | cpumask_t cpumask; | ||
86 | set_need_resched(); | ||
87 | if (unlikely(!rcp->signaled)) { | ||
88 | rcp->signaled = 1; | ||
89 | /* | ||
90 | * Don't send IPI to itself. With irqs disabled, | ||
91 | * rdp->cpu is the current cpu. | ||
92 | */ | ||
93 | cpumask = rcp->cpumask; | ||
94 | cpu_clear(rdp->cpu, cpumask); | ||
95 | for_each_cpu_mask(cpu, cpumask) | ||
96 | smp_send_reschedule(cpu); | ||
97 | } | ||
98 | } | ||
99 | #else | ||
100 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
101 | struct rcu_ctrlblk *rcp) | ||
102 | { | ||
103 | set_need_resched(); | ||
104 | } | ||
105 | #endif | ||
106 | |||
107 | /** | ||
108 | * call_rcu - Queue an RCU callback for invocation after a grace period. | ||
109 | * @head: structure to be used for queueing the RCU updates. | ||
110 | * @func: actual update function to be invoked after the grace period | ||
111 | * | ||
112 | * The update function will be invoked some time after a full grace | ||
113 | * period elapses, in other words after all currently executing RCU | ||
114 | * read-side critical sections have completed. RCU read-side critical | ||
115 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
116 | * and may be nested. | ||
117 | */ | ||
118 | void call_rcu(struct rcu_head *head, | ||
119 | void (*func)(struct rcu_head *rcu)) | ||
120 | { | ||
121 | unsigned long flags; | ||
122 | struct rcu_data *rdp; | ||
123 | |||
124 | head->func = func; | ||
125 | head->next = NULL; | ||
126 | local_irq_save(flags); | ||
127 | rdp = &__get_cpu_var(rcu_data); | ||
128 | *rdp->nxttail = head; | ||
129 | rdp->nxttail = &head->next; | ||
130 | if (unlikely(++rdp->qlen > qhimark)) { | ||
131 | rdp->blimit = INT_MAX; | ||
132 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
133 | } | ||
134 | local_irq_restore(flags); | ||
135 | } | ||
136 | EXPORT_SYMBOL_GPL(call_rcu); | ||
137 | |||
138 | /** | ||
139 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | ||
140 | * @head: structure to be used for queueing the RCU updates. | ||
141 | * @func: actual update function to be invoked after the grace period | ||
142 | * | ||
143 | * The update function will be invoked some time after a full grace | ||
144 | * period elapses, in other words after all currently executing RCU | ||
145 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
146 | * that the read-side critical sections end on completion of a softirq | ||
147 | * handler. This means that read-side critical sections in process | ||
148 | * context must not be interrupted by softirqs. This interface is to be | ||
149 | * used when most of the read-side critical sections are in softirq context. | ||
150 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
151 | * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() | ||
152 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
153 | */ | ||
154 | void call_rcu_bh(struct rcu_head *head, | ||
155 | void (*func)(struct rcu_head *rcu)) | ||
156 | { | ||
157 | unsigned long flags; | ||
158 | struct rcu_data *rdp; | ||
159 | |||
160 | head->func = func; | ||
161 | head->next = NULL; | ||
162 | local_irq_save(flags); | ||
163 | rdp = &__get_cpu_var(rcu_bh_data); | ||
164 | *rdp->nxttail = head; | ||
165 | rdp->nxttail = &head->next; | ||
166 | |||
167 | if (unlikely(++rdp->qlen > qhimark)) { | ||
168 | rdp->blimit = INT_MAX; | ||
169 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
170 | } | ||
171 | |||
172 | local_irq_restore(flags); | ||
173 | } | ||
174 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
175 | |||
176 | /* | ||
177 | * Return the number of RCU batches processed thus far. Useful | ||
178 | * for debug and statistics. | ||
179 | */ | ||
180 | long rcu_batches_completed(void) | ||
181 | { | ||
182 | return rcu_ctrlblk.completed; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
185 | |||
186 | /* | ||
187 | * Return the number of RCU batches processed thus far. Useful | ||
188 | * for debug and statistics. | ||
189 | */ | ||
190 | long rcu_batches_completed_bh(void) | ||
191 | { | ||
192 | return rcu_bh_ctrlblk.completed; | ||
193 | } | ||
194 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
195 | |||
196 | /* Raises the softirq for processing rcu_callbacks. */ | ||
197 | static inline void raise_rcu_softirq(void) | ||
198 | { | ||
199 | raise_softirq(RCU_SOFTIRQ); | ||
200 | /* | ||
201 | * The smp_mb() here is required to ensure that this cpu's | ||
202 | * __rcu_process_callbacks() reads the most recently updated | ||
203 | * value of rcu->cur. | ||
204 | */ | ||
205 | smp_mb(); | ||
206 | } | ||
207 | |||
208 | /* | ||
209 | * Invoke the completed RCU callbacks. They are expected to be in | ||
210 | * a per-cpu list. | ||
211 | */ | ||
212 | static void rcu_do_batch(struct rcu_data *rdp) | ||
213 | { | ||
214 | struct rcu_head *next, *list; | ||
215 | int count = 0; | ||
216 | |||
217 | list = rdp->donelist; | ||
218 | while (list) { | ||
219 | next = list->next; | ||
220 | prefetch(next); | ||
221 | list->func(list); | ||
222 | list = next; | ||
223 | if (++count >= rdp->blimit) | ||
224 | break; | ||
225 | } | ||
226 | rdp->donelist = list; | ||
227 | |||
228 | local_irq_disable(); | ||
229 | rdp->qlen -= count; | ||
230 | local_irq_enable(); | ||
231 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
232 | rdp->blimit = blimit; | ||
233 | |||
234 | if (!rdp->donelist) | ||
235 | rdp->donetail = &rdp->donelist; | ||
236 | else | ||
237 | raise_rcu_softirq(); | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * Grace period handling: | ||
242 | * The grace period handling consists out of two steps: | ||
243 | * - A new grace period is started. | ||
244 | * This is done by rcu_start_batch. The start is not broadcasted to | ||
245 | * all cpus, they must pick this up by comparing rcp->cur with | ||
246 | * rdp->quiescbatch. All cpus are recorded in the | ||
247 | * rcu_ctrlblk.cpumask bitmap. | ||
248 | * - All cpus must go through a quiescent state. | ||
249 | * Since the start of the grace period is not broadcasted, at least two | ||
250 | * calls to rcu_check_quiescent_state are required: | ||
251 | * The first call just notices that a new grace period is running. The | ||
252 | * following calls check if there was a quiescent state since the beginning | ||
253 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
254 | * the bitmap is empty, then the grace period is completed. | ||
255 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | ||
256 | * period (if necessary). | ||
257 | */ | ||
258 | /* | ||
259 | * Register a new batch of callbacks, and start it up if there is currently no | ||
260 | * active batch and the batch to be registered has not already occurred. | ||
261 | * Caller must hold rcu_ctrlblk.lock. | ||
262 | */ | ||
263 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
264 | { | ||
265 | if (rcp->next_pending && | ||
266 | rcp->completed == rcp->cur) { | ||
267 | rcp->next_pending = 0; | ||
268 | /* | ||
269 | * next_pending == 0 must be visible in | ||
270 | * __rcu_process_callbacks() before it can see new value of cur. | ||
271 | */ | ||
272 | smp_wmb(); | ||
273 | rcp->cur++; | ||
274 | |||
275 | /* | ||
276 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
277 | * Barrier Otherwise it can cause tickless idle CPUs to be | ||
278 | * included in rcp->cpumask, which will extend graceperiods | ||
279 | * unnecessarily. | ||
280 | */ | ||
281 | smp_mb(); | ||
282 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
283 | |||
284 | rcp->signaled = 0; | ||
285 | } | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * cpu went through a quiescent state since the beginning of the grace period. | ||
290 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
291 | * cpu. Start another grace period if someone has further entries pending | ||
292 | */ | ||
293 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
294 | { | ||
295 | cpu_clear(cpu, rcp->cpumask); | ||
296 | if (cpus_empty(rcp->cpumask)) { | ||
297 | /* batch completed ! */ | ||
298 | rcp->completed = rcp->cur; | ||
299 | rcu_start_batch(rcp); | ||
300 | } | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Check if the cpu has gone through a quiescent state (say context | ||
305 | * switch). If so and if it already hasn't done so in this RCU | ||
306 | * quiescent cycle, then indicate that it has done so. | ||
307 | */ | ||
308 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
309 | struct rcu_data *rdp) | ||
310 | { | ||
311 | if (rdp->quiescbatch != rcp->cur) { | ||
312 | /* start new grace period: */ | ||
313 | rdp->qs_pending = 1; | ||
314 | rdp->passed_quiesc = 0; | ||
315 | rdp->quiescbatch = rcp->cur; | ||
316 | return; | ||
317 | } | ||
318 | |||
319 | /* Grace period already completed for this cpu? | ||
320 | * qs_pending is checked instead of the actual bitmap to avoid | ||
321 | * cacheline trashing. | ||
322 | */ | ||
323 | if (!rdp->qs_pending) | ||
324 | return; | ||
325 | |||
326 | /* | ||
327 | * Was there a quiescent state since the beginning of the grace | ||
328 | * period? If no, then exit and wait for the next call. | ||
329 | */ | ||
330 | if (!rdp->passed_quiesc) | ||
331 | return; | ||
332 | rdp->qs_pending = 0; | ||
333 | |||
334 | spin_lock(&rcp->lock); | ||
335 | /* | ||
336 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
337 | * during cpu startup. Ignore the quiescent state. | ||
338 | */ | ||
339 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
340 | cpu_quiet(rdp->cpu, rcp); | ||
341 | |||
342 | spin_unlock(&rcp->lock); | ||
343 | } | ||
344 | |||
345 | |||
346 | #ifdef CONFIG_HOTPLUG_CPU | ||
347 | |||
348 | /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing | ||
349 | * locking requirements, the list it's pulling from has to belong to a cpu | ||
350 | * which is dead and hence not processing interrupts. | ||
351 | */ | ||
352 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
353 | struct rcu_head **tail) | ||
354 | { | ||
355 | local_irq_disable(); | ||
356 | *this_rdp->nxttail = list; | ||
357 | if (list) | ||
358 | this_rdp->nxttail = tail; | ||
359 | local_irq_enable(); | ||
360 | } | ||
361 | |||
362 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
363 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
364 | { | ||
365 | /* if the cpu going offline owns the grace period | ||
366 | * we can block indefinitely waiting for it, so flush | ||
367 | * it here | ||
368 | */ | ||
369 | spin_lock_bh(&rcp->lock); | ||
370 | if (rcp->cur != rcp->completed) | ||
371 | cpu_quiet(rdp->cpu, rcp); | ||
372 | spin_unlock_bh(&rcp->lock); | ||
373 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
374 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
375 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
376 | } | ||
377 | |||
378 | static void rcu_offline_cpu(int cpu) | ||
379 | { | ||
380 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
381 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
382 | |||
383 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
384 | &per_cpu(rcu_data, cpu)); | ||
385 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
386 | &per_cpu(rcu_bh_data, cpu)); | ||
387 | put_cpu_var(rcu_data); | ||
388 | put_cpu_var(rcu_bh_data); | ||
389 | } | ||
390 | |||
391 | #else | ||
392 | |||
393 | static void rcu_offline_cpu(int cpu) | ||
394 | { | ||
395 | } | ||
396 | |||
397 | #endif | ||
398 | |||
399 | /* | ||
400 | * This does the RCU processing work from softirq context. | ||
401 | */ | ||
402 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
403 | struct rcu_data *rdp) | ||
404 | { | ||
405 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
406 | *rdp->donetail = rdp->curlist; | ||
407 | rdp->donetail = rdp->curtail; | ||
408 | rdp->curlist = NULL; | ||
409 | rdp->curtail = &rdp->curlist; | ||
410 | } | ||
411 | |||
412 | if (rdp->nxtlist && !rdp->curlist) { | ||
413 | local_irq_disable(); | ||
414 | rdp->curlist = rdp->nxtlist; | ||
415 | rdp->curtail = rdp->nxttail; | ||
416 | rdp->nxtlist = NULL; | ||
417 | rdp->nxttail = &rdp->nxtlist; | ||
418 | local_irq_enable(); | ||
419 | |||
420 | /* | ||
421 | * start the next batch of callbacks | ||
422 | */ | ||
423 | |||
424 | /* determine batch number */ | ||
425 | rdp->batch = rcp->cur + 1; | ||
426 | /* see the comment and corresponding wmb() in | ||
427 | * the rcu_start_batch() | ||
428 | */ | ||
429 | smp_rmb(); | ||
430 | |||
431 | if (!rcp->next_pending) { | ||
432 | /* and start it/schedule start if it's a new batch */ | ||
433 | spin_lock(&rcp->lock); | ||
434 | rcp->next_pending = 1; | ||
435 | rcu_start_batch(rcp); | ||
436 | spin_unlock(&rcp->lock); | ||
437 | } | ||
438 | } | ||
439 | |||
440 | rcu_check_quiescent_state(rcp, rdp); | ||
441 | if (rdp->donelist) | ||
442 | rcu_do_batch(rdp); | ||
443 | } | ||
444 | |||
445 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
446 | { | ||
447 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
448 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
449 | } | ||
450 | |||
451 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
452 | { | ||
453 | /* This cpu has pending rcu entries and the grace period | ||
454 | * for them has completed. | ||
455 | */ | ||
456 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
457 | return 1; | ||
458 | |||
459 | /* This cpu has no pending entries, but there are new entries */ | ||
460 | if (!rdp->curlist && rdp->nxtlist) | ||
461 | return 1; | ||
462 | |||
463 | /* This cpu has finished callbacks to invoke */ | ||
464 | if (rdp->donelist) | ||
465 | return 1; | ||
466 | |||
467 | /* The rcu core waits for a quiescent state from the cpu */ | ||
468 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
469 | return 1; | ||
470 | |||
471 | /* nothing to do */ | ||
472 | return 0; | ||
473 | } | ||
474 | |||
475 | /* | ||
476 | * Check to see if there is any immediate RCU-related work to be done | ||
477 | * by the current CPU, returning 1 if so. This function is part of the | ||
478 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
479 | */ | ||
480 | int rcu_pending(int cpu) | ||
481 | { | ||
482 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
483 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Check to see if any future RCU-related work will need to be done | ||
488 | * by the current CPU, even if none need be done immediately, returning | ||
489 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
490 | * an exported member of the RCU API. | ||
491 | */ | ||
492 | int rcu_needs_cpu(int cpu) | ||
493 | { | ||
494 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
495 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
496 | |||
497 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
498 | } | ||
499 | |||
500 | void rcu_check_callbacks(int cpu, int user) | ||
501 | { | ||
502 | if (user || | ||
503 | (idle_cpu(cpu) && !in_softirq() && | ||
504 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
505 | rcu_qsctr_inc(cpu); | ||
506 | rcu_bh_qsctr_inc(cpu); | ||
507 | } else if (!in_softirq()) | ||
508 | rcu_bh_qsctr_inc(cpu); | ||
509 | raise_rcu_softirq(); | ||
510 | } | ||
511 | |||
512 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
513 | struct rcu_data *rdp) | ||
514 | { | ||
515 | memset(rdp, 0, sizeof(*rdp)); | ||
516 | rdp->curtail = &rdp->curlist; | ||
517 | rdp->nxttail = &rdp->nxtlist; | ||
518 | rdp->donetail = &rdp->donelist; | ||
519 | rdp->quiescbatch = rcp->completed; | ||
520 | rdp->qs_pending = 0; | ||
521 | rdp->cpu = cpu; | ||
522 | rdp->blimit = blimit; | ||
523 | } | ||
524 | |||
525 | static void __cpuinit rcu_online_cpu(int cpu) | ||
526 | { | ||
527 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
528 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
529 | |||
530 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
531 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
532 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
533 | } | ||
534 | |||
535 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
536 | unsigned long action, void *hcpu) | ||
537 | { | ||
538 | long cpu = (long)hcpu; | ||
539 | |||
540 | switch (action) { | ||
541 | case CPU_UP_PREPARE: | ||
542 | case CPU_UP_PREPARE_FROZEN: | ||
543 | rcu_online_cpu(cpu); | ||
544 | break; | ||
545 | case CPU_DEAD: | ||
546 | case CPU_DEAD_FROZEN: | ||
547 | rcu_offline_cpu(cpu); | ||
548 | break; | ||
549 | default: | ||
550 | break; | ||
551 | } | ||
552 | return NOTIFY_OK; | ||
553 | } | ||
554 | |||
555 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
556 | .notifier_call = rcu_cpu_notify, | ||
557 | }; | ||
558 | |||
559 | /* | ||
560 | * Initializes rcu mechanism. Assumed to be called early. | ||
561 | * That is before local timer(SMP) or jiffie timer (uniproc) is setup. | ||
562 | * Note that rcu_qsctr and friends are implicitly | ||
563 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
564 | */ | ||
565 | void __init __rcu_init(void) | ||
566 | { | ||
567 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | ||
568 | (void *)(long)smp_processor_id()); | ||
569 | /* Register notifier for non-boot CPUs */ | ||
570 | register_cpu_notifier(&rcu_nb); | ||
571 | } | ||
572 | |||
573 | module_param(blimit, int, 0); | ||
574 | module_param(qhimark, int, 0); | ||
575 | module_param(qlowmark, int, 0); | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a66d4d1615f7..760dfc233a00 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -15,7 +15,7 @@ | |||
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2001 | 18 | * Copyright IBM Corporation, 2001 |
19 | * | 19 | * |
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> |
21 | * Manfred Spraul <manfred@colorfullife.com> | 21 | * Manfred Spraul <manfred@colorfullife.com> |
@@ -35,165 +35,57 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
38 | #include <linux/rcupdate.h> | ||
39 | #include <linux/interrupt.h> | 38 | #include <linux/interrupt.h> |
40 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
41 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
42 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
43 | #include <linux/module.h> | ||
44 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
45 | #include <linux/moduleparam.h> | ||
46 | #include <linux/percpu.h> | 43 | #include <linux/percpu.h> |
47 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
48 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
49 | #include <linux/mutex.h> | 46 | #include <linux/mutex.h> |
47 | #include <linux/module.h> | ||
50 | 48 | ||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 49 | struct rcu_synchronize { |
52 | static struct lock_class_key rcu_lock_key; | 50 | struct rcu_head head; |
53 | struct lockdep_map rcu_lock_map = | 51 | struct completion completion; |
54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
55 | |||
56 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
57 | #endif | ||
58 | |||
59 | /* Definition for rcupdate control block. */ | ||
60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
61 | .cur = -300, | ||
62 | .completed = -300, | ||
63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
64 | .cpumask = CPU_MASK_NONE, | ||
65 | }; | ||
66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
67 | .cur = -300, | ||
68 | .completed = -300, | ||
69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
70 | .cpumask = CPU_MASK_NONE, | ||
71 | }; | 52 | }; |
72 | 53 | ||
73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | 54 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
75 | |||
76 | /* Fake initialization required by compiler */ | ||
77 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | ||
78 | static int blimit = 10; | ||
79 | static int qhimark = 10000; | ||
80 | static int qlowmark = 100; | ||
81 | |||
82 | static atomic_t rcu_barrier_cpu_count; | 55 | static atomic_t rcu_barrier_cpu_count; |
83 | static DEFINE_MUTEX(rcu_barrier_mutex); | 56 | static DEFINE_MUTEX(rcu_barrier_mutex); |
84 | static struct completion rcu_barrier_completion; | 57 | static struct completion rcu_barrier_completion; |
85 | 58 | ||
86 | #ifdef CONFIG_SMP | 59 | /* Because of FASTCALL declaration of complete, we use this wrapper */ |
87 | static void force_quiescent_state(struct rcu_data *rdp, | 60 | static void wakeme_after_rcu(struct rcu_head *head) |
88 | struct rcu_ctrlblk *rcp) | ||
89 | { | ||
90 | int cpu; | ||
91 | cpumask_t cpumask; | ||
92 | set_need_resched(); | ||
93 | if (unlikely(!rcp->signaled)) { | ||
94 | rcp->signaled = 1; | ||
95 | /* | ||
96 | * Don't send IPI to itself. With irqs disabled, | ||
97 | * rdp->cpu is the current cpu. | ||
98 | */ | ||
99 | cpumask = rcp->cpumask; | ||
100 | cpu_clear(rdp->cpu, cpumask); | ||
101 | for_each_cpu_mask(cpu, cpumask) | ||
102 | smp_send_reschedule(cpu); | ||
103 | } | ||
104 | } | ||
105 | #else | ||
106 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
107 | struct rcu_ctrlblk *rcp) | ||
108 | { | 61 | { |
109 | set_need_resched(); | 62 | struct rcu_synchronize *rcu; |
63 | |||
64 | rcu = container_of(head, struct rcu_synchronize, head); | ||
65 | complete(&rcu->completion); | ||
110 | } | 66 | } |
111 | #endif | ||
112 | 67 | ||
113 | /** | 68 | /** |
114 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 69 | * synchronize_rcu - wait until a grace period has elapsed. |
115 | * @head: structure to be used for queueing the RCU updates. | ||
116 | * @func: actual update function to be invoked after the grace period | ||
117 | * | 70 | * |
118 | * The update function will be invoked some time after a full grace | 71 | * Control will return to the caller some time after a full grace |
119 | * period elapses, in other words after all currently executing RCU | 72 | * period has elapsed, in other words after all currently executing RCU |
120 | * read-side critical sections have completed. RCU read-side critical | 73 | * read-side critical sections have completed. RCU read-side critical |
121 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 74 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), |
122 | * and may be nested. | 75 | * and may be nested. |
123 | */ | 76 | */ |
124 | void fastcall call_rcu(struct rcu_head *head, | 77 | void synchronize_rcu(void) |
125 | void (*func)(struct rcu_head *rcu)) | ||
126 | { | ||
127 | unsigned long flags; | ||
128 | struct rcu_data *rdp; | ||
129 | |||
130 | head->func = func; | ||
131 | head->next = NULL; | ||
132 | local_irq_save(flags); | ||
133 | rdp = &__get_cpu_var(rcu_data); | ||
134 | *rdp->nxttail = head; | ||
135 | rdp->nxttail = &head->next; | ||
136 | if (unlikely(++rdp->qlen > qhimark)) { | ||
137 | rdp->blimit = INT_MAX; | ||
138 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
139 | } | ||
140 | local_irq_restore(flags); | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | ||
145 | * @head: structure to be used for queueing the RCU updates. | ||
146 | * @func: actual update function to be invoked after the grace period | ||
147 | * | ||
148 | * The update function will be invoked some time after a full grace | ||
149 | * period elapses, in other words after all currently executing RCU | ||
150 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
151 | * that the read-side critical sections end on completion of a softirq | ||
152 | * handler. This means that read-side critical sections in process | ||
153 | * context must not be interrupted by softirqs. This interface is to be | ||
154 | * used when most of the read-side critical sections are in softirq context. | ||
155 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
156 | * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() | ||
157 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
158 | */ | ||
159 | void fastcall call_rcu_bh(struct rcu_head *head, | ||
160 | void (*func)(struct rcu_head *rcu)) | ||
161 | { | 78 | { |
162 | unsigned long flags; | 79 | struct rcu_synchronize rcu; |
163 | struct rcu_data *rdp; | ||
164 | |||
165 | head->func = func; | ||
166 | head->next = NULL; | ||
167 | local_irq_save(flags); | ||
168 | rdp = &__get_cpu_var(rcu_bh_data); | ||
169 | *rdp->nxttail = head; | ||
170 | rdp->nxttail = &head->next; | ||
171 | |||
172 | if (unlikely(++rdp->qlen > qhimark)) { | ||
173 | rdp->blimit = INT_MAX; | ||
174 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
175 | } | ||
176 | |||
177 | local_irq_restore(flags); | ||
178 | } | ||
179 | 80 | ||
180 | /* | 81 | init_completion(&rcu.completion); |
181 | * Return the number of RCU batches processed thus far. Useful | 82 | /* Will wake me after RCU finished */ |
182 | * for debug and statistics. | 83 | call_rcu(&rcu.head, wakeme_after_rcu); |
183 | */ | ||
184 | long rcu_batches_completed(void) | ||
185 | { | ||
186 | return rcu_ctrlblk.completed; | ||
187 | } | ||
188 | 84 | ||
189 | /* | 85 | /* Wait for it */ |
190 | * Return the number of RCU batches processed thus far. Useful | 86 | wait_for_completion(&rcu.completion); |
191 | * for debug and statistics. | ||
192 | */ | ||
193 | long rcu_batches_completed_bh(void) | ||
194 | { | ||
195 | return rcu_bh_ctrlblk.completed; | ||
196 | } | 87 | } |
88 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
197 | 89 | ||
198 | static void rcu_barrier_callback(struct rcu_head *notused) | 90 | static void rcu_barrier_callback(struct rcu_head *notused) |
199 | { | 91 | { |
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused) | |||
207 | static void rcu_barrier_func(void *notused) | 99 | static void rcu_barrier_func(void *notused) |
208 | { | 100 | { |
209 | int cpu = smp_processor_id(); | 101 | int cpu = smp_processor_id(); |
210 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | 102 | struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); |
211 | struct rcu_head *head; | ||
212 | 103 | ||
213 | head = &rdp->barrier; | ||
214 | atomic_inc(&rcu_barrier_cpu_count); | 104 | atomic_inc(&rcu_barrier_cpu_count); |
215 | call_rcu(head, rcu_barrier_callback); | 105 | call_rcu(head, rcu_barrier_callback); |
216 | } | 106 | } |
@@ -225,420 +115,24 @@ void rcu_barrier(void) | |||
225 | mutex_lock(&rcu_barrier_mutex); | 115 | mutex_lock(&rcu_barrier_mutex); |
226 | init_completion(&rcu_barrier_completion); | 116 | init_completion(&rcu_barrier_completion); |
227 | atomic_set(&rcu_barrier_cpu_count, 0); | 117 | atomic_set(&rcu_barrier_cpu_count, 0); |
118 | /* | ||
119 | * The queueing of callbacks in all CPUs must be atomic with | ||
120 | * respect to RCU, otherwise one CPU may queue a callback, | ||
121 | * wait for a grace period, decrement barrier count and call | ||
122 | * complete(), while other CPUs have not yet queued anything. | ||
123 | * So, we need to make sure that grace periods cannot complete | ||
124 | * until all the callbacks are queued. | ||
125 | */ | ||
126 | rcu_read_lock(); | ||
228 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); | 127 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); |
128 | rcu_read_unlock(); | ||
229 | wait_for_completion(&rcu_barrier_completion); | 129 | wait_for_completion(&rcu_barrier_completion); |
230 | mutex_unlock(&rcu_barrier_mutex); | 130 | mutex_unlock(&rcu_barrier_mutex); |
231 | } | 131 | } |
232 | EXPORT_SYMBOL_GPL(rcu_barrier); | 132 | EXPORT_SYMBOL_GPL(rcu_barrier); |
233 | 133 | ||
234 | /* | ||
235 | * Invoke the completed RCU callbacks. They are expected to be in | ||
236 | * a per-cpu list. | ||
237 | */ | ||
238 | static void rcu_do_batch(struct rcu_data *rdp) | ||
239 | { | ||
240 | struct rcu_head *next, *list; | ||
241 | int count = 0; | ||
242 | |||
243 | list = rdp->donelist; | ||
244 | while (list) { | ||
245 | next = list->next; | ||
246 | prefetch(next); | ||
247 | list->func(list); | ||
248 | list = next; | ||
249 | if (++count >= rdp->blimit) | ||
250 | break; | ||
251 | } | ||
252 | rdp->donelist = list; | ||
253 | |||
254 | local_irq_disable(); | ||
255 | rdp->qlen -= count; | ||
256 | local_irq_enable(); | ||
257 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
258 | rdp->blimit = blimit; | ||
259 | |||
260 | if (!rdp->donelist) | ||
261 | rdp->donetail = &rdp->donelist; | ||
262 | else | ||
263 | tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); | ||
264 | } | ||
265 | |||
266 | /* | ||
267 | * Grace period handling: | ||
268 | * The grace period handling consists out of two steps: | ||
269 | * - A new grace period is started. | ||
270 | * This is done by rcu_start_batch. The start is not broadcasted to | ||
271 | * all cpus, they must pick this up by comparing rcp->cur with | ||
272 | * rdp->quiescbatch. All cpus are recorded in the | ||
273 | * rcu_ctrlblk.cpumask bitmap. | ||
274 | * - All cpus must go through a quiescent state. | ||
275 | * Since the start of the grace period is not broadcasted, at least two | ||
276 | * calls to rcu_check_quiescent_state are required: | ||
277 | * The first call just notices that a new grace period is running. The | ||
278 | * following calls check if there was a quiescent state since the beginning | ||
279 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
280 | * the bitmap is empty, then the grace period is completed. | ||
281 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | ||
282 | * period (if necessary). | ||
283 | */ | ||
284 | /* | ||
285 | * Register a new batch of callbacks, and start it up if there is currently no | ||
286 | * active batch and the batch to be registered has not already occurred. | ||
287 | * Caller must hold rcu_ctrlblk.lock. | ||
288 | */ | ||
289 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
290 | { | ||
291 | if (rcp->next_pending && | ||
292 | rcp->completed == rcp->cur) { | ||
293 | rcp->next_pending = 0; | ||
294 | /* | ||
295 | * next_pending == 0 must be visible in | ||
296 | * __rcu_process_callbacks() before it can see new value of cur. | ||
297 | */ | ||
298 | smp_wmb(); | ||
299 | rcp->cur++; | ||
300 | |||
301 | /* | ||
302 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
303 | * Barrier Otherwise it can cause tickless idle CPUs to be | ||
304 | * included in rcp->cpumask, which will extend graceperiods | ||
305 | * unnecessarily. | ||
306 | */ | ||
307 | smp_mb(); | ||
308 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
309 | |||
310 | rcp->signaled = 0; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * cpu went through a quiescent state since the beginning of the grace period. | ||
316 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
317 | * cpu. Start another grace period if someone has further entries pending | ||
318 | */ | ||
319 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
320 | { | ||
321 | cpu_clear(cpu, rcp->cpumask); | ||
322 | if (cpus_empty(rcp->cpumask)) { | ||
323 | /* batch completed ! */ | ||
324 | rcp->completed = rcp->cur; | ||
325 | rcu_start_batch(rcp); | ||
326 | } | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * Check if the cpu has gone through a quiescent state (say context | ||
331 | * switch). If so and if it already hasn't done so in this RCU | ||
332 | * quiescent cycle, then indicate that it has done so. | ||
333 | */ | ||
334 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
335 | struct rcu_data *rdp) | ||
336 | { | ||
337 | if (rdp->quiescbatch != rcp->cur) { | ||
338 | /* start new grace period: */ | ||
339 | rdp->qs_pending = 1; | ||
340 | rdp->passed_quiesc = 0; | ||
341 | rdp->quiescbatch = rcp->cur; | ||
342 | return; | ||
343 | } | ||
344 | |||
345 | /* Grace period already completed for this cpu? | ||
346 | * qs_pending is checked instead of the actual bitmap to avoid | ||
347 | * cacheline trashing. | ||
348 | */ | ||
349 | if (!rdp->qs_pending) | ||
350 | return; | ||
351 | |||
352 | /* | ||
353 | * Was there a quiescent state since the beginning of the grace | ||
354 | * period? If no, then exit and wait for the next call. | ||
355 | */ | ||
356 | if (!rdp->passed_quiesc) | ||
357 | return; | ||
358 | rdp->qs_pending = 0; | ||
359 | |||
360 | spin_lock(&rcp->lock); | ||
361 | /* | ||
362 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
363 | * during cpu startup. Ignore the quiescent state. | ||
364 | */ | ||
365 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
366 | cpu_quiet(rdp->cpu, rcp); | ||
367 | |||
368 | spin_unlock(&rcp->lock); | ||
369 | } | ||
370 | |||
371 | |||
372 | #ifdef CONFIG_HOTPLUG_CPU | ||
373 | |||
374 | /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing | ||
375 | * locking requirements, the list it's pulling from has to belong to a cpu | ||
376 | * which is dead and hence not processing interrupts. | ||
377 | */ | ||
378 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
379 | struct rcu_head **tail) | ||
380 | { | ||
381 | local_irq_disable(); | ||
382 | *this_rdp->nxttail = list; | ||
383 | if (list) | ||
384 | this_rdp->nxttail = tail; | ||
385 | local_irq_enable(); | ||
386 | } | ||
387 | |||
388 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
389 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
390 | { | ||
391 | /* if the cpu going offline owns the grace period | ||
392 | * we can block indefinitely waiting for it, so flush | ||
393 | * it here | ||
394 | */ | ||
395 | spin_lock_bh(&rcp->lock); | ||
396 | if (rcp->cur != rcp->completed) | ||
397 | cpu_quiet(rdp->cpu, rcp); | ||
398 | spin_unlock_bh(&rcp->lock); | ||
399 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
400 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
401 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
402 | } | ||
403 | |||
404 | static void rcu_offline_cpu(int cpu) | ||
405 | { | ||
406 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
407 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
408 | |||
409 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
410 | &per_cpu(rcu_data, cpu)); | ||
411 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
412 | &per_cpu(rcu_bh_data, cpu)); | ||
413 | put_cpu_var(rcu_data); | ||
414 | put_cpu_var(rcu_bh_data); | ||
415 | tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); | ||
416 | } | ||
417 | |||
418 | #else | ||
419 | |||
420 | static void rcu_offline_cpu(int cpu) | ||
421 | { | ||
422 | } | ||
423 | |||
424 | #endif | ||
425 | |||
426 | /* | ||
427 | * This does the RCU processing work from tasklet context. | ||
428 | */ | ||
429 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
430 | struct rcu_data *rdp) | ||
431 | { | ||
432 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
433 | *rdp->donetail = rdp->curlist; | ||
434 | rdp->donetail = rdp->curtail; | ||
435 | rdp->curlist = NULL; | ||
436 | rdp->curtail = &rdp->curlist; | ||
437 | } | ||
438 | |||
439 | if (rdp->nxtlist && !rdp->curlist) { | ||
440 | local_irq_disable(); | ||
441 | rdp->curlist = rdp->nxtlist; | ||
442 | rdp->curtail = rdp->nxttail; | ||
443 | rdp->nxtlist = NULL; | ||
444 | rdp->nxttail = &rdp->nxtlist; | ||
445 | local_irq_enable(); | ||
446 | |||
447 | /* | ||
448 | * start the next batch of callbacks | ||
449 | */ | ||
450 | |||
451 | /* determine batch number */ | ||
452 | rdp->batch = rcp->cur + 1; | ||
453 | /* see the comment and corresponding wmb() in | ||
454 | * the rcu_start_batch() | ||
455 | */ | ||
456 | smp_rmb(); | ||
457 | |||
458 | if (!rcp->next_pending) { | ||
459 | /* and start it/schedule start if it's a new batch */ | ||
460 | spin_lock(&rcp->lock); | ||
461 | rcp->next_pending = 1; | ||
462 | rcu_start_batch(rcp); | ||
463 | spin_unlock(&rcp->lock); | ||
464 | } | ||
465 | } | ||
466 | |||
467 | rcu_check_quiescent_state(rcp, rdp); | ||
468 | if (rdp->donelist) | ||
469 | rcu_do_batch(rdp); | ||
470 | } | ||
471 | |||
472 | static void rcu_process_callbacks(unsigned long unused) | ||
473 | { | ||
474 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
475 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
476 | } | ||
477 | |||
478 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
479 | { | ||
480 | /* This cpu has pending rcu entries and the grace period | ||
481 | * for them has completed. | ||
482 | */ | ||
483 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
484 | return 1; | ||
485 | |||
486 | /* This cpu has no pending entries, but there are new entries */ | ||
487 | if (!rdp->curlist && rdp->nxtlist) | ||
488 | return 1; | ||
489 | |||
490 | /* This cpu has finished callbacks to invoke */ | ||
491 | if (rdp->donelist) | ||
492 | return 1; | ||
493 | |||
494 | /* The rcu core waits for a quiescent state from the cpu */ | ||
495 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
496 | return 1; | ||
497 | |||
498 | /* nothing to do */ | ||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Check to see if there is any immediate RCU-related work to be done | ||
504 | * by the current CPU, returning 1 if so. This function is part of the | ||
505 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
506 | */ | ||
507 | int rcu_pending(int cpu) | ||
508 | { | ||
509 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
510 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * Check to see if any future RCU-related work will need to be done | ||
515 | * by the current CPU, even if none need be done immediately, returning | ||
516 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
517 | * an exported member of the RCU API. | ||
518 | */ | ||
519 | int rcu_needs_cpu(int cpu) | ||
520 | { | ||
521 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
522 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
523 | |||
524 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
525 | } | ||
526 | |||
527 | void rcu_check_callbacks(int cpu, int user) | ||
528 | { | ||
529 | if (user || | ||
530 | (idle_cpu(cpu) && !in_softirq() && | ||
531 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
532 | rcu_qsctr_inc(cpu); | ||
533 | rcu_bh_qsctr_inc(cpu); | ||
534 | } else if (!in_softirq()) | ||
535 | rcu_bh_qsctr_inc(cpu); | ||
536 | tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); | ||
537 | } | ||
538 | |||
539 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
540 | struct rcu_data *rdp) | ||
541 | { | ||
542 | memset(rdp, 0, sizeof(*rdp)); | ||
543 | rdp->curtail = &rdp->curlist; | ||
544 | rdp->nxttail = &rdp->nxtlist; | ||
545 | rdp->donetail = &rdp->donelist; | ||
546 | rdp->quiescbatch = rcp->completed; | ||
547 | rdp->qs_pending = 0; | ||
548 | rdp->cpu = cpu; | ||
549 | rdp->blimit = blimit; | ||
550 | } | ||
551 | |||
552 | static void __devinit rcu_online_cpu(int cpu) | ||
553 | { | ||
554 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
555 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
556 | |||
557 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
558 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
559 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | ||
560 | } | ||
561 | |||
562 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
563 | unsigned long action, void *hcpu) | ||
564 | { | ||
565 | long cpu = (long)hcpu; | ||
566 | switch (action) { | ||
567 | case CPU_UP_PREPARE: | ||
568 | case CPU_UP_PREPARE_FROZEN: | ||
569 | rcu_online_cpu(cpu); | ||
570 | break; | ||
571 | case CPU_DEAD: | ||
572 | case CPU_DEAD_FROZEN: | ||
573 | rcu_offline_cpu(cpu); | ||
574 | break; | ||
575 | default: | ||
576 | break; | ||
577 | } | ||
578 | return NOTIFY_OK; | ||
579 | } | ||
580 | |||
581 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
582 | .notifier_call = rcu_cpu_notify, | ||
583 | }; | ||
584 | |||
585 | /* | ||
586 | * Initializes rcu mechanism. Assumed to be called early. | ||
587 | * That is before local timer(SMP) or jiffie timer (uniproc) is setup. | ||
588 | * Note that rcu_qsctr and friends are implicitly | ||
589 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
590 | */ | ||
591 | void __init rcu_init(void) | 134 | void __init rcu_init(void) |
592 | { | 135 | { |
593 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | 136 | __rcu_init(); |
594 | (void *)(long)smp_processor_id()); | ||
595 | /* Register notifier for non-boot CPUs */ | ||
596 | register_cpu_notifier(&rcu_nb); | ||
597 | } | ||
598 | |||
599 | struct rcu_synchronize { | ||
600 | struct rcu_head head; | ||
601 | struct completion completion; | ||
602 | }; | ||
603 | |||
604 | /* Because of FASTCALL declaration of complete, we use this wrapper */ | ||
605 | static void wakeme_after_rcu(struct rcu_head *head) | ||
606 | { | ||
607 | struct rcu_synchronize *rcu; | ||
608 | |||
609 | rcu = container_of(head, struct rcu_synchronize, head); | ||
610 | complete(&rcu->completion); | ||
611 | } | 137 | } |
612 | 138 | ||
613 | /** | ||
614 | * synchronize_rcu - wait until a grace period has elapsed. | ||
615 | * | ||
616 | * Control will return to the caller some time after a full grace | ||
617 | * period has elapsed, in other words after all currently executing RCU | ||
618 | * read-side critical sections have completed. RCU read-side critical | ||
619 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
620 | * and may be nested. | ||
621 | * | ||
622 | * If your read-side code is not protected by rcu_read_lock(), do -not- | ||
623 | * use synchronize_rcu(). | ||
624 | */ | ||
625 | void synchronize_rcu(void) | ||
626 | { | ||
627 | struct rcu_synchronize rcu; | ||
628 | |||
629 | init_completion(&rcu.completion); | ||
630 | /* Will wake me after RCU finished */ | ||
631 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
632 | |||
633 | /* Wait for it */ | ||
634 | wait_for_completion(&rcu.completion); | ||
635 | } | ||
636 | |||
637 | module_param(blimit, int, 0); | ||
638 | module_param(qhimark, int, 0); | ||
639 | module_param(qlowmark, int, 0); | ||
640 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
641 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
642 | EXPORT_SYMBOL_GPL(call_rcu); | ||
643 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
644 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c new file mode 100644 index 000000000000..987cfb7ade89 --- /dev/null +++ b/kernel/rcupreempt.c | |||
@@ -0,0 +1,953 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion, realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar | ||
22 | * for pushing me away from locks and towards counters, and | ||
23 | * to Suparna Bhattacharya for pushing me completely away | ||
24 | * from atomic instructions on the read side. | ||
25 | * | ||
26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
27 | * | ||
28 | * Design Document: http://lwn.net/Articles/253651/ | ||
29 | * | ||
30 | * For detailed explanation of Read-Copy Update mechanism see - | ||
31 | * Documentation/RCU/ *.txt | ||
32 | * | ||
33 | */ | ||
34 | #include <linux/types.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/init.h> | ||
37 | #include <linux/spinlock.h> | ||
38 | #include <linux/smp.h> | ||
39 | #include <linux/rcupdate.h> | ||
40 | #include <linux/interrupt.h> | ||
41 | #include <linux/sched.h> | ||
42 | #include <asm/atomic.h> | ||
43 | #include <linux/bitops.h> | ||
44 | #include <linux/module.h> | ||
45 | #include <linux/completion.h> | ||
46 | #include <linux/moduleparam.h> | ||
47 | #include <linux/percpu.h> | ||
48 | #include <linux/notifier.h> | ||
49 | #include <linux/rcupdate.h> | ||
50 | #include <linux/cpu.h> | ||
51 | #include <linux/random.h> | ||
52 | #include <linux/delay.h> | ||
53 | #include <linux/byteorder/swabb.h> | ||
54 | #include <linux/cpumask.h> | ||
55 | #include <linux/rcupreempt_trace.h> | ||
56 | |||
57 | /* | ||
58 | * Macro that prevents the compiler from reordering accesses, but does | ||
59 | * absolutely -nothing- to prevent CPUs from reordering. This is used | ||
60 | * only to mediate communication between mainline code and hardware | ||
61 | * interrupt and NMI handlers. | ||
62 | */ | ||
63 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) | ||
64 | |||
65 | /* | ||
66 | * PREEMPT_RCU data structures. | ||
67 | */ | ||
68 | |||
69 | /* | ||
70 | * GP_STAGES specifies the number of times the state machine has | ||
71 | * to go through the all the rcu_try_flip_states (see below) | ||
72 | * in a single Grace Period. | ||
73 | * | ||
74 | * GP in GP_STAGES stands for Grace Period ;) | ||
75 | */ | ||
76 | #define GP_STAGES 2 | ||
77 | struct rcu_data { | ||
78 | spinlock_t lock; /* Protect rcu_data fields. */ | ||
79 | long completed; /* Number of last completed batch. */ | ||
80 | int waitlistcount; | ||
81 | struct tasklet_struct rcu_tasklet; | ||
82 | struct rcu_head *nextlist; | ||
83 | struct rcu_head **nexttail; | ||
84 | struct rcu_head *waitlist[GP_STAGES]; | ||
85 | struct rcu_head **waittail[GP_STAGES]; | ||
86 | struct rcu_head *donelist; | ||
87 | struct rcu_head **donetail; | ||
88 | long rcu_flipctr[2]; | ||
89 | #ifdef CONFIG_RCU_TRACE | ||
90 | struct rcupreempt_trace trace; | ||
91 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
92 | }; | ||
93 | |||
94 | /* | ||
95 | * States for rcu_try_flip() and friends. | ||
96 | */ | ||
97 | |||
98 | enum rcu_try_flip_states { | ||
99 | |||
100 | /* | ||
101 | * Stay here if nothing is happening. Flip the counter if somthing | ||
102 | * starts happening. Denoted by "I" | ||
103 | */ | ||
104 | rcu_try_flip_idle_state, | ||
105 | |||
106 | /* | ||
107 | * Wait here for all CPUs to notice that the counter has flipped. This | ||
108 | * prevents the old set of counters from ever being incremented once | ||
109 | * we leave this state, which in turn is necessary because we cannot | ||
110 | * test any individual counter for zero -- we can only check the sum. | ||
111 | * Denoted by "A". | ||
112 | */ | ||
113 | rcu_try_flip_waitack_state, | ||
114 | |||
115 | /* | ||
116 | * Wait here for the sum of the old per-CPU counters to reach zero. | ||
117 | * Denoted by "Z". | ||
118 | */ | ||
119 | rcu_try_flip_waitzero_state, | ||
120 | |||
121 | /* | ||
122 | * Wait here for each of the other CPUs to execute a memory barrier. | ||
123 | * This is necessary to ensure that these other CPUs really have | ||
124 | * completed executing their RCU read-side critical sections, despite | ||
125 | * their CPUs wildly reordering memory. Denoted by "M". | ||
126 | */ | ||
127 | rcu_try_flip_waitmb_state, | ||
128 | }; | ||
129 | |||
130 | struct rcu_ctrlblk { | ||
131 | spinlock_t fliplock; /* Protect state-machine transitions. */ | ||
132 | long completed; /* Number of last completed batch. */ | ||
133 | enum rcu_try_flip_states rcu_try_flip_state; /* The current state of | ||
134 | the rcu state machine */ | ||
135 | }; | ||
136 | |||
137 | static DEFINE_PER_CPU(struct rcu_data, rcu_data); | ||
138 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
139 | .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), | ||
140 | .completed = 0, | ||
141 | .rcu_try_flip_state = rcu_try_flip_idle_state, | ||
142 | }; | ||
143 | |||
144 | |||
145 | #ifdef CONFIG_RCU_TRACE | ||
146 | static char *rcu_try_flip_state_names[] = | ||
147 | { "idle", "waitack", "waitzero", "waitmb" }; | ||
148 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
149 | |||
150 | static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; | ||
151 | |||
152 | /* | ||
153 | * Enum and per-CPU flag to determine when each CPU has seen | ||
154 | * the most recent counter flip. | ||
155 | */ | ||
156 | |||
157 | enum rcu_flip_flag_values { | ||
158 | rcu_flip_seen, /* Steady/initial state, last flip seen. */ | ||
159 | /* Only GP detector can update. */ | ||
160 | rcu_flipped /* Flip just completed, need confirmation. */ | ||
161 | /* Only corresponding CPU can update. */ | ||
162 | }; | ||
163 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) | ||
164 | = rcu_flip_seen; | ||
165 | |||
166 | /* | ||
167 | * Enum and per-CPU flag to determine when each CPU has executed the | ||
168 | * needed memory barrier to fence in memory references from its last RCU | ||
169 | * read-side critical section in the just-completed grace period. | ||
170 | */ | ||
171 | |||
172 | enum rcu_mb_flag_values { | ||
173 | rcu_mb_done, /* Steady/initial state, no mb()s required. */ | ||
174 | /* Only GP detector can update. */ | ||
175 | rcu_mb_needed /* Flip just completed, need an mb(). */ | ||
176 | /* Only corresponding CPU can update. */ | ||
177 | }; | ||
178 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) | ||
179 | = rcu_mb_done; | ||
180 | |||
181 | /* | ||
182 | * RCU_DATA_ME: find the current CPU's rcu_data structure. | ||
183 | * RCU_DATA_CPU: find the specified CPU's rcu_data structure. | ||
184 | */ | ||
185 | #define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) | ||
186 | #define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) | ||
187 | |||
188 | /* | ||
189 | * Helper macro for tracing when the appropriate rcu_data is not | ||
190 | * cached in a local variable, but where the CPU number is so cached. | ||
191 | */ | ||
192 | #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace)); | ||
193 | |||
194 | /* | ||
195 | * Helper macro for tracing when the appropriate rcu_data is not | ||
196 | * cached in a local variable. | ||
197 | */ | ||
198 | #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace)); | ||
199 | |||
200 | /* | ||
201 | * Helper macro for tracing when the appropriate rcu_data is pointed | ||
202 | * to by a local variable. | ||
203 | */ | ||
204 | #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); | ||
205 | |||
206 | /* | ||
207 | * Return the number of RCU batches processed thus far. Useful | ||
208 | * for debug and statistics. | ||
209 | */ | ||
210 | long rcu_batches_completed(void) | ||
211 | { | ||
212 | return rcu_ctrlblk.completed; | ||
213 | } | ||
214 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
215 | |||
216 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
217 | |||
218 | void __rcu_read_lock(void) | ||
219 | { | ||
220 | int idx; | ||
221 | struct task_struct *t = current; | ||
222 | int nesting; | ||
223 | |||
224 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
225 | if (nesting != 0) { | ||
226 | |||
227 | /* An earlier rcu_read_lock() covers us, just count it. */ | ||
228 | |||
229 | t->rcu_read_lock_nesting = nesting + 1; | ||
230 | |||
231 | } else { | ||
232 | unsigned long flags; | ||
233 | |||
234 | /* | ||
235 | * We disable interrupts for the following reasons: | ||
236 | * - If we get scheduling clock interrupt here, and we | ||
237 | * end up acking the counter flip, it's like a promise | ||
238 | * that we will never increment the old counter again. | ||
239 | * Thus we will break that promise if that | ||
240 | * scheduling clock interrupt happens between the time | ||
241 | * we pick the .completed field and the time that we | ||
242 | * increment our counter. | ||
243 | * | ||
244 | * - We don't want to be preempted out here. | ||
245 | * | ||
246 | * NMIs can still occur, of course, and might themselves | ||
247 | * contain rcu_read_lock(). | ||
248 | */ | ||
249 | |||
250 | local_irq_save(flags); | ||
251 | |||
252 | /* | ||
253 | * Outermost nesting of rcu_read_lock(), so increment | ||
254 | * the current counter for the current CPU. Use volatile | ||
255 | * casts to prevent the compiler from reordering. | ||
256 | */ | ||
257 | |||
258 | idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; | ||
259 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; | ||
260 | |||
261 | /* | ||
262 | * Now that the per-CPU counter has been incremented, we | ||
263 | * are protected from races with rcu_read_lock() invoked | ||
264 | * from NMI handlers on this CPU. We can therefore safely | ||
265 | * increment the nesting counter, relieving further NMIs | ||
266 | * of the need to increment the per-CPU counter. | ||
267 | */ | ||
268 | |||
269 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; | ||
270 | |||
271 | /* | ||
272 | * Now that we have preventing any NMIs from storing | ||
273 | * to the ->rcu_flipctr_idx, we can safely use it to | ||
274 | * remember which counter to decrement in the matching | ||
275 | * rcu_read_unlock(). | ||
276 | */ | ||
277 | |||
278 | ACCESS_ONCE(t->rcu_flipctr_idx) = idx; | ||
279 | local_irq_restore(flags); | ||
280 | } | ||
281 | } | ||
282 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
283 | |||
284 | void __rcu_read_unlock(void) | ||
285 | { | ||
286 | int idx; | ||
287 | struct task_struct *t = current; | ||
288 | int nesting; | ||
289 | |||
290 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
291 | if (nesting > 1) { | ||
292 | |||
293 | /* | ||
294 | * We are still protected by the enclosing rcu_read_lock(), | ||
295 | * so simply decrement the counter. | ||
296 | */ | ||
297 | |||
298 | t->rcu_read_lock_nesting = nesting - 1; | ||
299 | |||
300 | } else { | ||
301 | unsigned long flags; | ||
302 | |||
303 | /* | ||
304 | * Disable local interrupts to prevent the grace-period | ||
305 | * detection state machine from seeing us half-done. | ||
306 | * NMIs can still occur, of course, and might themselves | ||
307 | * contain rcu_read_lock() and rcu_read_unlock(). | ||
308 | */ | ||
309 | |||
310 | local_irq_save(flags); | ||
311 | |||
312 | /* | ||
313 | * Outermost nesting of rcu_read_unlock(), so we must | ||
314 | * decrement the current counter for the current CPU. | ||
315 | * This must be done carefully, because NMIs can | ||
316 | * occur at any point in this code, and any rcu_read_lock() | ||
317 | * and rcu_read_unlock() pairs in the NMI handlers | ||
318 | * must interact non-destructively with this code. | ||
319 | * Lots of volatile casts, and -very- careful ordering. | ||
320 | * | ||
321 | * Changes to this code, including this one, must be | ||
322 | * inspected, validated, and tested extremely carefully!!! | ||
323 | */ | ||
324 | |||
325 | /* | ||
326 | * First, pick up the index. | ||
327 | */ | ||
328 | |||
329 | idx = ACCESS_ONCE(t->rcu_flipctr_idx); | ||
330 | |||
331 | /* | ||
332 | * Now that we have fetched the counter index, it is | ||
333 | * safe to decrement the per-task RCU nesting counter. | ||
334 | * After this, any interrupts or NMIs will increment and | ||
335 | * decrement the per-CPU counters. | ||
336 | */ | ||
337 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; | ||
338 | |||
339 | /* | ||
340 | * It is now safe to decrement this task's nesting count. | ||
341 | * NMIs that occur after this statement will route their | ||
342 | * rcu_read_lock() calls through this "else" clause, and | ||
343 | * will thus start incrementing the per-CPU counter on | ||
344 | * their own. They will also clobber ->rcu_flipctr_idx, | ||
345 | * but that is OK, since we have already fetched it. | ||
346 | */ | ||
347 | |||
348 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; | ||
349 | local_irq_restore(flags); | ||
350 | } | ||
351 | } | ||
352 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
353 | |||
354 | /* | ||
355 | * If a global counter flip has occurred since the last time that we | ||
356 | * advanced callbacks, advance them. Hardware interrupts must be | ||
357 | * disabled when calling this function. | ||
358 | */ | ||
/*
 * If a global counter flip has occurred since the last time that we
 * advanced callbacks, advance them.  Hardware interrupts must be
 * disabled when calling this function.
 */
static void __rcu_advance_callbacks(struct rcu_data *rdp)
{
	int cpu;
	int i;
	int wlc = 0;	/* number of non-empty wait lists after rotation */

	/* Only rotate if the global grace-period counter has moved on. */
	if (rdp->completed != rcu_ctrlblk.completed) {
		/* Oldest wait stage graduates onto the done list. */
		if (rdp->waitlist[GP_STAGES - 1] != NULL) {
			*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
			rdp->donetail = rdp->waittail[GP_STAGES - 1];
			RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
		}
		/* Shift every remaining wait stage up by one. */
		for (i = GP_STAGES - 2; i >= 0; i--) {
			if (rdp->waitlist[i] != NULL) {
				rdp->waitlist[i + 1] = rdp->waitlist[i];
				rdp->waittail[i + 1] = rdp->waittail[i];
				wlc++;
			} else {
				/* Empty stage: reset tail to self-pointer. */
				rdp->waitlist[i + 1] = NULL;
				rdp->waittail[i + 1] =
					&rdp->waitlist[i + 1];
			}
		}
		/* Newly queued callbacks enter the first wait stage. */
		if (rdp->nextlist != NULL) {
			rdp->waitlist[0] = rdp->nextlist;
			rdp->waittail[0] = rdp->nexttail;
			wlc++;
			rdp->nextlist = NULL;
			rdp->nexttail = &rdp->nextlist;
			RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
		} else {
			rdp->waitlist[0] = NULL;
			rdp->waittail[0] = &rdp->waitlist[0];
		}
		rdp->waitlistcount = wlc;
		rdp->completed = rcu_ctrlblk.completed;
	}

	/*
	 * Check to see if this CPU needs to report that it has seen
	 * the most recent counter flip, thereby declaring that all
	 * subsequent rcu_read_lock() invocations will respect this flip.
	 */

	cpu = raw_smp_processor_id();
	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
		smp_mb();	/* Subsequent counter accesses must see new value */
		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
		smp_mb();	/* Subsequent RCU read-side critical sections */
				/*  seen -after- acknowledgement. */
	}
}
411 | |||
412 | /* | ||
413 | * Get here when RCU is idle. Decide whether we need to | ||
414 | * move out of idle state, and return non-zero if so. | ||
415 | * "Straightforward" approach for the moment, might later | ||
416 | * use callback-list lengths, grace-period duration, or | ||
417 | * some such to determine when to exit idle state. | ||
418 | * Might also need a pre-idle test that does not acquire | ||
419 | * the lock, but let's get the simple case working first... | ||
420 | */ | ||
421 | |||
/*
 * Grace-period state machine, idle state: flip the counters if there
 * is any RCU-related work pending on this CPU, then ask every online
 * CPU to acknowledge the flip.  Returns non-zero if a flip occurred
 * (i.e. the state machine should advance to the wait-for-ack state).
 */
static int
rcu_try_flip_idle(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
	/* Stay idle if no callbacks are queued and no flags are owed. */
	if (!rcu_pending(smp_processor_id())) {
		RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
		return 0;
	}

	/*
	 * Do the flip.
	 */

	RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
	rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */

	/*
	 * Need a memory barrier so that other CPUs see the new
	 * counter value before they see the subsequent change of all
	 * the rcu_flip_flag instances to rcu_flipped.
	 */

	smp_mb();	/* see above block comment. */

	/* Now ask each CPU for acknowledgement of the flip. */

	for_each_cpu_mask(cpu, rcu_cpu_online_map)
		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;

	return 1;
}
455 | |||
456 | /* | ||
457 | * Wait for CPUs to acknowledge the flip. | ||
458 | */ | ||
459 | |||
/*
 * Wait for CPUs to acknowledge the flip.  Returns non-zero once every
 * online CPU has set its rcu_flip_flag back to rcu_flip_seen.
 */
static int
rcu_try_flip_waitack(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
	for_each_cpu_mask(cpu, rcu_cpu_online_map)
		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
			/* At least one CPU has yet to acknowledge. */
			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
			return 0;
		}

	/*
	 * Make sure our checks above don't bleed into subsequent
	 * waiting for the sum of the counters to reach zero.
	 */

	smp_mb();	/* see above block comment. */
	RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
	return 1;
}
481 | |||
482 | /* | ||
483 | * Wait for collective ``last'' counter to reach zero, | ||
484 | * then tell all CPUs to do an end-of-grace-period memory barrier. | ||
485 | */ | ||
486 | |||
/*
 * Wait for the collective ``last'' counter to reach zero, then tell
 * all CPUs to do an end-of-grace-period memory barrier.  Returns
 * non-zero once the sum of the old-epoch counters is zero.
 */
static int
rcu_try_flip_waitzero(void)
{
	int cpu;
	/* Index of the counter epoch *before* the most recent flip. */
	int lastidx = !(rcu_ctrlblk.completed & 0x1);
	int sum = 0;

	/* Check to see if the sum of the "last" counters is zero. */

	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
	for_each_cpu_mask(cpu, rcu_cpu_online_map)
		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
	if (sum != 0) {
		/* Readers still active against the old counter epoch. */
		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
		return 0;
	}

	/*
	 * This ensures that the other CPUs see the call for
	 * memory barriers -after- the sum to zero has been
	 * detected here
	 */
	smp_mb();  /*  ^^^^^^^^^^^^ */

	/* Call for a memory barrier from each CPU. */
	for_each_cpu_mask(cpu, rcu_cpu_online_map)
		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
	return 1;
}
518 | |||
519 | /* | ||
520 | * Wait for all CPUs to do their end-of-grace-period memory barrier. | ||
521 | * Return 0 once all CPUs have done so. | ||
522 | */ | ||
523 | |||
/*
 * Wait for all CPUs to do their end-of-grace-period memory barrier.
 * Returns non-zero once all CPUs have done so.
 */
static int
rcu_try_flip_waitmb(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
	for_each_cpu_mask(cpu, rcu_cpu_online_map)
		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
			/* This CPU still owes a memory barrier. */
			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
			return 0;
		}

	smp_mb(); /* Ensure that the above checks precede any following flip. */
	RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
	return 1;
}
540 | |||
541 | /* | ||
542 | * Attempt a single flip of the counters. Remember, a single flip does | ||
543 | * -not- constitute a grace period. Instead, the interval between | ||
544 | * at least GP_STAGES consecutive flips is a grace period. | ||
545 | * | ||
546 | * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation | ||
547 | * on a large SMP, they might want to use a hierarchical organization of | ||
548 | * the per-CPU-counter pairs. | ||
549 | */ | ||
/*
 * Attempt a single flip of the counters.  Remember, a single flip does
 * -not- constitute a grace period.  Instead, the interval between
 * at least GP_STAGES consecutive flips is a grace period.
 *
 * Advances the grace-period flip-counter state machine by at most one
 * state per call.  Uses a trylock so that concurrent callers simply
 * back off rather than contend.
 */
static void rcu_try_flip(void)
{
	unsigned long flags;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
	/* Someone else is already driving the state machine; let them. */
	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
		RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
		return;
	}

	/*
	 * Take the next transition(s) through the RCU grace-period
	 * flip-counter state machine.  Each helper returns non-zero
	 * when its condition is met and the machine may advance.
	 */

	switch (rcu_ctrlblk.rcu_try_flip_state) {
	case rcu_try_flip_idle_state:
		if (rcu_try_flip_idle())
			rcu_ctrlblk.rcu_try_flip_state =
				rcu_try_flip_waitack_state;
		break;
	case rcu_try_flip_waitack_state:
		if (rcu_try_flip_waitack())
			rcu_ctrlblk.rcu_try_flip_state =
				rcu_try_flip_waitzero_state;
		break;
	case rcu_try_flip_waitzero_state:
		if (rcu_try_flip_waitzero())
			rcu_ctrlblk.rcu_try_flip_state =
				rcu_try_flip_waitmb_state;
		break;
	case rcu_try_flip_waitmb_state:
		if (rcu_try_flip_waitmb())
			rcu_ctrlblk.rcu_try_flip_state =
				rcu_try_flip_idle_state;
	}
	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
}
588 | |||
589 | /* | ||
590 | * Check to see if this CPU needs to do a memory barrier in order to | ||
591 | * ensure that any prior RCU read-side critical sections have committed | ||
592 | * their counter manipulations and critical-section memory references | ||
593 | * before declaring the grace period to be completed. | ||
594 | */ | ||
595 | static void rcu_check_mb(int cpu) | ||
596 | { | ||
597 | if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { | ||
598 | smp_mb(); /* Ensure RCU read-side accesses are visible. */ | ||
599 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; | ||
600 | } | ||
601 | } | ||
602 | |||
603 | void rcu_check_callbacks(int cpu, int user) | ||
604 | { | ||
605 | unsigned long flags; | ||
606 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
607 | |||
608 | rcu_check_mb(cpu); | ||
609 | if (rcu_ctrlblk.completed == rdp->completed) | ||
610 | rcu_try_flip(); | ||
611 | spin_lock_irqsave(&rdp->lock, flags); | ||
612 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
613 | __rcu_advance_callbacks(rdp); | ||
614 | if (rdp->donelist == NULL) { | ||
615 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
616 | } else { | ||
617 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
618 | raise_softirq(RCU_SOFTIRQ); | ||
619 | } | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * Needed by dynticks, to make sure all RCU processing has finished | ||
624 | * when we go idle: | ||
625 | */ | ||
/*
 * Needed by dynticks, to make sure all RCU processing has finished
 * when we go idle: advance this CPU's callbacks, first nudging the
 * grace-period state machine if this CPU appears to be caught up.
 */
void rcu_advance_callbacks(int cpu, int user)
{
	unsigned long flags;
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	if (rcu_ctrlblk.completed == rdp->completed) {
		rcu_try_flip();
		/* Still caught up after the attempt: nothing to advance. */
		if (rcu_ctrlblk.completed == rdp->completed)
			return;
	}
	spin_lock_irqsave(&rdp->lock, flags);
	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
	__rcu_advance_callbacks(rdp);
	spin_unlock_irqrestore(&rdp->lock, flags);
}
641 | |||
642 | #ifdef CONFIG_HOTPLUG_CPU | ||
/*
 * Append the source callback list onto the tail of the destination
 * list and reinitialize the source to empty.  A macro (rather than a
 * function) because it is applied to differently-named list/tail
 * field pairs.  Arguments are evaluated more than once, so pass only
 * simple lvalues.
 */
#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
		*dsttail = srclist; \
		if (srclist != NULL) { \
			dsttail = srctail; \
			srclist = NULL; \
			srctail = &srclist;\
		} \
	} while (0)
651 | |||
/*
 * Handle a CPU going offline: drain its callbacks (in order), fold
 * its flip counters into the current CPU's, and remove it from the
 * set of CPUs participating in grace-period computation.
 */
void rcu_offline_cpu(int cpu)
{
	int i;
	struct rcu_head *list = NULL;	/* all callbacks drained from @cpu */
	unsigned long flags;
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
	struct rcu_head **tail = &list;

	/*
	 * Remove all callbacks from the newly dead CPU, retaining order.
	 * Otherwise rcu_barrier() will fail
	 */

	spin_lock_irqsave(&rdp->lock, flags);
	rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
	for (i = GP_STAGES - 1; i >= 0; i--)
		rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
					list, tail);
	rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
	spin_unlock_irqrestore(&rdp->lock, flags);
	rdp->waitlistcount = 0;

	/* Disengage the newly dead CPU from the grace-period computation. */

	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
	rcu_check_mb(cpu);
	/* Acknowledge any outstanding flip on the dead CPU's behalf. */
	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
		smp_mb();	/* Subsequent counter accesses must see new value */
		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
		smp_mb();	/* Subsequent RCU read-side critical sections */
				/*  seen -after- acknowledgement. */
	}

	/* Fold the dead CPU's read-side counters into this CPU's. */
	RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
	RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];

	RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
	RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;

	cpu_clear(cpu, rcu_cpu_online_map);

	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);

	/*
	 * Place the removed callbacks on the current CPU's queue.
	 * Make them all start a new grace period: simple approach,
	 * in theory could starve a given set of callbacks, but
	 * you would need to be doing some serious CPU hotplugging
	 * to make this happen.  If this becomes a problem, adding
	 * a synchronize_rcu() to the hotplug path would be a simple
	 * fix.
	 */

	rdp = RCU_DATA_ME();
	spin_lock_irqsave(&rdp->lock, flags);
	*rdp->nexttail = list;
	if (list)
		rdp->nexttail = tail;
	spin_unlock_irqrestore(&rdp->lock, flags);
}
712 | |||
/*
 * Handle a CPU coming online: add it to the set of CPUs participating
 * in grace-period computation, under the flip lock.
 */
void __devinit rcu_online_cpu(int cpu)
{
	unsigned long flags;

	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
	cpu_set(cpu, rcu_cpu_online_map);
	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
}
721 | |||
722 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
723 | |||
/* No CPU hotplug: offlining is a no-op. */
void rcu_offline_cpu(int cpu)
{
}
727 | |||
/* No CPU hotplug: onlining is a no-op. */
void __devinit rcu_online_cpu(int cpu)
{
}
731 | |||
732 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
733 | |||
734 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
735 | { | ||
736 | unsigned long flags; | ||
737 | struct rcu_head *next, *list; | ||
738 | struct rcu_data *rdp = RCU_DATA_ME(); | ||
739 | |||
740 | spin_lock_irqsave(&rdp->lock, flags); | ||
741 | list = rdp->donelist; | ||
742 | if (list == NULL) { | ||
743 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
744 | return; | ||
745 | } | ||
746 | rdp->donelist = NULL; | ||
747 | rdp->donetail = &rdp->donelist; | ||
748 | RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); | ||
749 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
750 | while (list) { | ||
751 | next = list->next; | ||
752 | list->func(list); | ||
753 | list = next; | ||
754 | RCU_TRACE_ME(rcupreempt_trace_invoke); | ||
755 | } | ||
756 | } | ||
757 | |||
/*
 * Queue an RCU callback to be invoked after a grace period elapses.
 * The callback is appended to this CPU's "next" list; interrupts are
 * disabled across the per-CPU lookup so the rdp pointer stays valid,
 * with the per-CPU lock taken separately inside that window.
 */
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
	unsigned long flags;
	struct rcu_data *rdp;

	head->func = func;
	head->next = NULL;
	local_irq_save(flags);
	rdp = RCU_DATA_ME();
	spin_lock(&rdp->lock);
	/* Rotate stale lists first so this callback sees a full GP. */
	__rcu_advance_callbacks(rdp);
	*rdp->nexttail = head;
	rdp->nexttail = &head->next;
	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
	spin_unlock(&rdp->lock);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
776 | |||
777 | /* | ||
778 | * Wait until all currently running preempt_disable() code segments | ||
779 | * (including hardware-irq-disable segments) complete. Note that | ||
780 | * in -rt this does -not- necessarily result in all currently executing | ||
781 | * interrupt -handlers- having completed. | ||
782 | */ | ||
void __synchronize_sched(void)
{
	cpumask_t oldmask;
	int cpu;

	/* Remember the caller's affinity so it can be restored below. */
	if (sched_getaffinity(0, &oldmask) < 0)
		oldmask = cpu_possible_map;
	/*
	 * Migrate to each online CPU in turn; running on a CPU means
	 * any preempt_disable() section previously executing there has
	 * completed.
	 */
	for_each_online_cpu(cpu) {
		sched_setaffinity(0, cpumask_of_cpu(cpu));
		schedule();
	}
	sched_setaffinity(0, oldmask);
}
EXPORT_SYMBOL_GPL(__synchronize_sched);
797 | |||
798 | /* | ||
799 | * Check to see if any future RCU-related work will need to be done | ||
800 | * by the current CPU, even if none need be done immediately, returning | ||
801 | * 1 if so. Assumes that notifiers would take care of handling any | ||
802 | * outstanding requests from the RCU core. | ||
803 | * | ||
804 | * This function is part of the RCU implementation; it is -not- | ||
805 | * an exported member of the RCU API. | ||
806 | */ | ||
807 | int rcu_needs_cpu(int cpu) | ||
808 | { | ||
809 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
810 | |||
811 | return (rdp->donelist != NULL || | ||
812 | !!rdp->waitlistcount || | ||
813 | rdp->nextlist != NULL); | ||
814 | } | ||
815 | |||
816 | int rcu_pending(int cpu) | ||
817 | { | ||
818 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
819 | |||
820 | /* The CPU has at least one callback queued somewhere. */ | ||
821 | |||
822 | if (rdp->donelist != NULL || | ||
823 | !!rdp->waitlistcount || | ||
824 | rdp->nextlist != NULL) | ||
825 | return 1; | ||
826 | |||
827 | /* The RCU core needs an acknowledgement from this CPU. */ | ||
828 | |||
829 | if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || | ||
830 | (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) | ||
831 | return 1; | ||
832 | |||
833 | /* This CPU has fallen behind the global grace-period number. */ | ||
834 | |||
835 | if (rdp->completed != rcu_ctrlblk.completed) | ||
836 | return 1; | ||
837 | |||
838 | /* Nothing needed from this CPU. */ | ||
839 | |||
840 | return 0; | ||
841 | } | ||
842 | |||
843 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
844 | unsigned long action, void *hcpu) | ||
845 | { | ||
846 | long cpu = (long)hcpu; | ||
847 | |||
848 | switch (action) { | ||
849 | case CPU_UP_PREPARE: | ||
850 | case CPU_UP_PREPARE_FROZEN: | ||
851 | rcu_online_cpu(cpu); | ||
852 | break; | ||
853 | case CPU_UP_CANCELED: | ||
854 | case CPU_UP_CANCELED_FROZEN: | ||
855 | case CPU_DEAD: | ||
856 | case CPU_DEAD_FROZEN: | ||
857 | rcu_offline_cpu(cpu); | ||
858 | break; | ||
859 | default: | ||
860 | break; | ||
861 | } | ||
862 | return NOTIFY_OK; | ||
863 | } | ||
864 | |||
/* Hotplug notifier keeping rcu_cpu_online_map current. */
static struct notifier_block __cpuinitdata rcu_nb = {
	.notifier_call = rcu_cpu_notify,
};
868 | |||
/*
 * Boot-time initialization: set every possible CPU's rcu_data to the
 * "all lists empty" state, register the hotplug notifier, mark the
 * already-online CPUs, and wire up the RCU softirq.
 */
void __init __rcu_init(void)
{
	int cpu;
	int i;
	struct rcu_data *rdp;

	printk(KERN_NOTICE "Preemptible RCU implementation.\n");
	for_each_possible_cpu(cpu) {
		rdp = RCU_DATA_CPU(cpu);
		spin_lock_init(&rdp->lock);
		rdp->completed = 0;
		rdp->waitlistcount = 0;
		/* Empty lists have their tail pointing at the list head. */
		rdp->nextlist = NULL;
		rdp->nexttail = &rdp->nextlist;
		for (i = 0; i < GP_STAGES; i++) {
			rdp->waitlist[i] = NULL;
			rdp->waittail[i] = &rdp->waitlist[i];
		}
		rdp->donelist = NULL;
		rdp->donetail = &rdp->donelist;
		rdp->rcu_flipctr[0] = 0;
		rdp->rcu_flipctr[1] = 0;
	}
	register_cpu_notifier(&rcu_nb);

	/*
	 * We don't need protection against CPU-Hotplug here
	 * since
	 * a) If a CPU comes online while we are iterating over the
	 *    cpu_online_map below, we would only end up making a
	 *    duplicate call to rcu_online_cpu() which sets the corresponding
	 *    CPU's mask in the rcu_cpu_online_map.
	 *
	 * b) A CPU cannot go offline at this point in time since the user
	 *    does not have access to the sysfs interface, nor do we
	 *    suspend the system.
	 */
	for_each_online_cpu(cpu)
		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);

	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
}
911 | |||
912 | /* | ||
913 | * Deprecated, use synchronize_rcu() or synchronize_sched() instead. | ||
914 | */ | ||
/* Deprecated compatibility wrapper; simply waits for a grace period. */
void synchronize_kernel(void)
{
	synchronize_rcu();
}
919 | |||
920 | #ifdef CONFIG_RCU_TRACE | ||
/*
 * Accessors exported for the CONFIG_RCU_TRACE debugfs code
 * (kernel/rcupreempt_trace.c), which cannot reach the static per-CPU
 * state in this file directly.
 */

/* Pointer to the given CPU's pair of flip counters. */
long *rcupreempt_flipctr(int cpu)
{
	return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
}
EXPORT_SYMBOL_GPL(rcupreempt_flipctr);

/* Current flip-acknowledgement flag for the given CPU. */
int rcupreempt_flip_flag(int cpu)
{
	return per_cpu(rcu_flip_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);

/* Current memory-barrier-request flag for the given CPU. */
int rcupreempt_mb_flag(int cpu)
{
	return per_cpu(rcu_mb_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);

/* Human-readable name of the current grace-period machine state. */
char *rcupreempt_try_flip_state_name(void)
{
	return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
}
EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);

/* Per-CPU trace-counter structure for the given CPU. */
struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
{
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	return &rdp->trace;
}
EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
952 | |||
#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c new file mode 100644 index 000000000000..49ac4947af24 --- /dev/null +++ b/kernel/rcupreempt_trace.c | |||
@@ -0,0 +1,330 @@ | |||
1 | /* | ||
2 | * Read-Copy Update tracing for realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
21 | * | ||
22 | * For detailed explanation of Read-Copy Update mechanism see - | ||
23 | * Documentation/RCU/ *.txt | ||
24 | * | ||
25 | */ | ||
26 | #include <linux/types.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/init.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/smp.h> | ||
31 | #include <linux/rcupdate.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <asm/atomic.h> | ||
35 | #include <linux/bitops.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/completion.h> | ||
38 | #include <linux/moduleparam.h> | ||
39 | #include <linux/percpu.h> | ||
40 | #include <linux/notifier.h> | ||
41 | #include <linux/rcupdate.h> | ||
42 | #include <linux/cpu.h> | ||
43 | #include <linux/mutex.h> | ||
44 | #include <linux/rcupreempt_trace.h> | ||
45 | #include <linux/debugfs.h> | ||
46 | |||
static struct mutex rcupreempt_trace_mutex;	/* serializes debugfs reads */
static char *rcupreempt_trace_buf;		/* shared formatting buffer */
#define RCUPREEMPT_TRACE_BUF_SIZE 4096

/*
 * Per-event trace-counter updaters, called from kernel/rcupreempt.c
 * via the RCU_TRACE_* macros.  The "length" fields track how many
 * callbacks currently sit on each list; the "add" fields are
 * monotonically increasing totals.  Counters that can be bumped
 * concurrently use atomic_t; the rest are updated with interrupts
 * disabled or under the per-CPU lock by the caller.
 */

void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
{
	trace->done_length += trace->wait_length;
	trace->done_add += trace->wait_length;
	trace->wait_length = 0;
}
void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
{
	trace->wait_length += trace->next_length;
	trace->wait_add += trace->next_length;
	trace->next_length = 0;
}
void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
{
	atomic_inc(&trace->rcu_try_flip_1);
}
void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
{
	atomic_inc(&trace->rcu_try_flip_e1);
}
void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_i1++;
}
void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_ie1++;
}
void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_g1++;
}
void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_a1++;
}
void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_ae1++;
}
void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_a2++;
}
void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_z1++;
}
void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_ze1++;
}
void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_z2++;
}
void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_m1++;
}
void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_me1++;
}
void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
{
	trace->rcu_try_flip_m2++;
}
void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
{
	trace->rcu_check_callbacks++;
}
void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
{
	trace->done_remove += trace->done_length;
	trace->done_length = 0;
}
void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
{
	atomic_inc(&trace->done_invoked);
}
void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
{
	trace->next_add++;
	trace->next_length++;
}
137 | |||
138 | static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) | ||
139 | { | ||
140 | struct rcupreempt_trace *cp; | ||
141 | int cpu; | ||
142 | |||
143 | memset(sp, 0, sizeof(*sp)); | ||
144 | for_each_possible_cpu(cpu) { | ||
145 | cp = rcupreempt_trace_cpu(cpu); | ||
146 | sp->next_length += cp->next_length; | ||
147 | sp->next_add += cp->next_add; | ||
148 | sp->wait_length += cp->wait_length; | ||
149 | sp->wait_add += cp->wait_add; | ||
150 | sp->done_length += cp->done_length; | ||
151 | sp->done_add += cp->done_add; | ||
152 | sp->done_remove += cp->done_remove; | ||
153 | atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); | ||
154 | sp->rcu_check_callbacks += cp->rcu_check_callbacks; | ||
155 | atomic_set(&sp->rcu_try_flip_1, | ||
156 | atomic_read(&cp->rcu_try_flip_1)); | ||
157 | atomic_set(&sp->rcu_try_flip_e1, | ||
158 | atomic_read(&cp->rcu_try_flip_e1)); | ||
159 | sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; | ||
160 | sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; | ||
161 | sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; | ||
162 | sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; | ||
163 | sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; | ||
164 | sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; | ||
165 | sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; | ||
166 | sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; | ||
167 | sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; | ||
168 | sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; | ||
169 | sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; | ||
170 | sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | static ssize_t rcustats_read(struct file *filp, char __user *buffer, | ||
175 | size_t count, loff_t *ppos) | ||
176 | { | ||
177 | struct rcupreempt_trace trace; | ||
178 | ssize_t bcount; | ||
179 | int cnt = 0; | ||
180 | |||
181 | rcupreempt_trace_sum(&trace); | ||
182 | mutex_lock(&rcupreempt_trace_mutex); | ||
183 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
184 | "ggp=%ld rcc=%ld\n", | ||
185 | rcu_batches_completed(), | ||
186 | trace.rcu_check_callbacks); | ||
187 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
188 | "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" | ||
189 | "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" | ||
190 | "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", | ||
191 | |||
192 | trace.next_add, trace.next_length, | ||
193 | trace.wait_add, trace.wait_length, | ||
194 | trace.done_add, trace.done_length, | ||
195 | trace.done_remove, atomic_read(&trace.done_invoked), | ||
196 | atomic_read(&trace.rcu_try_flip_1), | ||
197 | atomic_read(&trace.rcu_try_flip_e1), | ||
198 | trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, | ||
199 | trace.rcu_try_flip_g1, | ||
200 | trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, | ||
201 | trace.rcu_try_flip_a2, | ||
202 | trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, | ||
203 | trace.rcu_try_flip_z2, | ||
204 | trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, | ||
205 | trace.rcu_try_flip_m2); | ||
206 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
207 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
208 | mutex_unlock(&rcupreempt_trace_mutex); | ||
209 | return bcount; | ||
210 | } | ||
211 | |||
/*
 * debugfs read for "rcugp": report the grace-period count before and
 * after forcing one grace period.  Note that this read blocks in
 * synchronize_rcu() while holding the trace mutex.
 */
static ssize_t rcugp_read(struct file *filp, char __user *buffer,
				size_t count, loff_t *ppos)
{
	long oldgp = rcu_batches_completed();
	ssize_t bcount;

	mutex_lock(&rcupreempt_trace_mutex);
	synchronize_rcu();	/* force at least one grace period */
	snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
		"oldggp=%ld  newggp=%ld\n", oldgp, rcu_batches_completed());
	bcount = simple_read_from_buffer(buffer, count, ppos,
			rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
	mutex_unlock(&rcupreempt_trace_mutex);
	return bcount;
}
227 | |||
228 | static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, | ||
229 | size_t count, loff_t *ppos) | ||
230 | { | ||
231 | int cnt = 0; | ||
232 | int cpu; | ||
233 | int f = rcu_batches_completed() & 0x1; | ||
234 | ssize_t bcount; | ||
235 | |||
236 | mutex_lock(&rcupreempt_trace_mutex); | ||
237 | |||
238 | cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, | ||
239 | "CPU last cur F M\n"); | ||
240 | for_each_online_cpu(cpu) { | ||
241 | long *flipctr = rcupreempt_flipctr(cpu); | ||
242 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
243 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
244 | "%3d %4ld %3ld %d %d\n", | ||
245 | cpu, | ||
246 | flipctr[!f], | ||
247 | flipctr[f], | ||
248 | rcupreempt_flip_flag(cpu), | ||
249 | rcupreempt_mb_flag(cpu)); | ||
250 | } | ||
251 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
252 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
253 | "ggp = %ld, state = %s\n", | ||
254 | rcu_batches_completed(), | ||
255 | rcupreempt_try_flip_state_name()); | ||
256 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
257 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
258 | "\n"); | ||
259 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
260 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
261 | mutex_unlock(&rcupreempt_trace_mutex); | ||
262 | return bcount; | ||
263 | } | ||
264 | |||
/* debugfs file operations: all three trace files are read-only. */
static struct file_operations rcustats_fops = {
	.owner = THIS_MODULE,
	.read = rcustats_read,
};

static struct file_operations rcugp_fops = {
	.owner = THIS_MODULE,
	.read = rcugp_read,
};

static struct file_operations rcuctrs_fops = {
	.owner = THIS_MODULE,
	.read = rcuctrs_read,
};
279 | |||
/* Dentries kept so rcupreempt_trace_cleanup() can remove them. */
static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;

/*
 * Create the debugfs "rcu" directory with its rcustats, rcugp and
 * rcuctrs files.  Returns 0 on success, 1 on failure after removing
 * anything already created.  NOTE(review): the callers feed this to
 * module_init(), which conventionally expects 0 or a negative errno;
 * confirm before changing the error value.
 */
static int rcupreempt_debugfs_init(void)
{
	rcudir = debugfs_create_dir("rcu", NULL);
	if (!rcudir)
		goto out;
	statdir = debugfs_create_file("rcustats", 0444, rcudir,
						NULL, &rcustats_fops);
	if (!statdir)
		goto free_out;

	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
	if (!gpdir)
		goto free_out;

	ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
						NULL, &rcuctrs_fops);
	if (!ctrsdir)
		goto free_out;
	return 0;
free_out:
	/* Remove whatever was created before the failure, then the dir. */
	if (statdir)
		debugfs_remove(statdir);
	if (gpdir)
		debugfs_remove(gpdir);
	debugfs_remove(rcudir);
out:
	return 1;
}
309 | |||
/*
 * Module init: allocate the shared formatting buffer and create the
 * debugfs files.  Returns non-zero on failure (see the NOTE on
 * rcupreempt_debugfs_init() regarding the error-value convention).
 */
static int __init rcupreempt_trace_init(void)
{
	mutex_init(&rcupreempt_trace_mutex);
	rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
	if (!rcupreempt_trace_buf)
		return 1;
	return rcupreempt_debugfs_init();
}
318 | |||
/*
 * Module exit: remove the debugfs files (children before the "rcu"
 * directory) and free the formatting buffer.
 */
static void __exit rcupreempt_trace_cleanup(void)
{
	debugfs_remove(statdir);
	debugfs_remove(gpdir);
	debugfs_remove(ctrsdir);
	debugfs_remove(rcudir);
	kfree(rcupreempt_trace_buf);
}
327 | |||
328 | |||
329 | module_init(rcupreempt_trace_init); | ||
330 | module_exit(rcupreempt_trace_cleanup); | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c3e165c2318f..fd599829e72a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void) | |||
726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 726 | cpumask_t tmp_mask = CPU_MASK_ALL; |
727 | int i; | 727 | int i; |
728 | 728 | ||
729 | lock_cpu_hotplug(); | 729 | get_online_cpus(); |
730 | 730 | ||
731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
732 | if (num_online_cpus() == 1) { | 732 | if (num_online_cpus() == 1) { |
733 | unlock_cpu_hotplug(); | 733 | put_online_cpus(); |
734 | return; | 734 | return; |
735 | } | 735 | } |
736 | 736 | ||
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void) | |||
762 | else | 762 | else |
763 | rcu_idle_cpu--; | 763 | rcu_idle_cpu--; |
764 | 764 | ||
765 | unlock_cpu_hotplug(); | 765 | put_online_cpus(); |
766 | } | 766 | } |
767 | 767 | ||
768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | 768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index e3055ba69159..092e4c620af9 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | |||
394 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | 394 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); |
395 | 395 | ||
396 | static struct sysdev_class rttest_sysclass = { | 396 | static struct sysdev_class rttest_sysclass = { |
397 | set_kset_name("rttest"), | 397 | .name = "rttest", |
398 | }; | 398 | }; |
399 | 399 | ||
400 | static int init_test_thread(int id) | 400 | static int init_test_thread(int id) |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 1ec620c03064..cae050b05f5e 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -6,6 +6,7 @@ | |||
6 | 6 | ||
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
9 | #include <linux/sched.h> | ||
9 | #include <linux/module.h> | 10 | #include <linux/module.h> |
10 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
11 | 12 | ||
@@ -15,7 +16,7 @@ | |||
15 | /* | 16 | /* |
16 | * lock for reading | 17 | * lock for reading |
17 | */ | 18 | */ |
18 | void down_read(struct rw_semaphore *sem) | 19 | void __sched down_read(struct rw_semaphore *sem) |
19 | { | 20 | { |
20 | might_sleep(); | 21 | might_sleep(); |
21 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | 22 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
@@ -42,7 +43,7 @@ EXPORT_SYMBOL(down_read_trylock); | |||
42 | /* | 43 | /* |
43 | * lock for writing | 44 | * lock for writing |
44 | */ | 45 | */ |
45 | void down_write(struct rw_semaphore *sem) | 46 | void __sched down_write(struct rw_semaphore *sem) |
46 | { | 47 | { |
47 | might_sleep(); | 48 | might_sleep(); |
48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 49 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
diff --git a/kernel/sched.c b/kernel/sched.c index d2f77fab0f46..9474b23c28bf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -22,6 +22,8 @@ | |||
22 | * by Peter Williams | 22 | * by Peter Williams |
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
26 | * Thomas Gleixner, Mike Kravetz | ||
25 | */ | 27 | */ |
26 | 28 | ||
27 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
@@ -63,6 +65,7 @@ | |||
63 | #include <linux/reciprocal_div.h> | 65 | #include <linux/reciprocal_div.h> |
64 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
65 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | ||
66 | 69 | ||
67 | #include <asm/tlb.h> | 70 | #include <asm/tlb.h> |
68 | #include <asm/irq_regs.h> | 71 | #include <asm/irq_regs.h> |
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
96 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 99 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
97 | 100 | ||
98 | /* | 101 | /* |
99 | * Some helpers for converting nanosecond timing to jiffy resolution | 102 | * Helpers for converting nanosecond timing to jiffy resolution |
100 | */ | 103 | */ |
101 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 104 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
102 | #define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) | ||
103 | 105 | ||
104 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 106 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
105 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 107 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
@@ -159,6 +161,8 @@ struct rt_prio_array { | |||
159 | 161 | ||
160 | struct cfs_rq; | 162 | struct cfs_rq; |
161 | 163 | ||
164 | static LIST_HEAD(task_groups); | ||
165 | |||
162 | /* task group related information */ | 166 | /* task group related information */ |
163 | struct task_group { | 167 | struct task_group { |
164 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 168 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -168,10 +172,50 @@ struct task_group { | |||
168 | struct sched_entity **se; | 172 | struct sched_entity **se; |
169 | /* runqueue "owned" by this group on each cpu */ | 173 | /* runqueue "owned" by this group on each cpu */ |
170 | struct cfs_rq **cfs_rq; | 174 | struct cfs_rq **cfs_rq; |
175 | |||
176 | struct sched_rt_entity **rt_se; | ||
177 | struct rt_rq **rt_rq; | ||
178 | |||
179 | unsigned int rt_ratio; | ||
180 | |||
181 | /* | ||
182 | * shares assigned to a task group governs how much of cpu bandwidth | ||
183 | * is allocated to the group. The more shares a group has, the more is | ||
184 | * the cpu bandwidth allocated to it. | ||
185 | * | ||
186 | * For ex, lets say that there are three task groups, A, B and C which | ||
187 | * have been assigned shares 1000, 2000 and 3000 respectively. Then, | ||
188 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | ||
189 | * should be: | ||
190 | * | ||
191 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | ||
192 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | ||
193 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | ||
194 | * | ||
195 | * The weight assigned to a task group's schedulable entities on every | ||
196 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | ||
197 | * group's shares. For ex: lets say that task group A has been | ||
198 | * assigned shares of 1000 and there are two CPUs in a system. Then, | ||
199 | * | ||
200 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | ||
201 | * | ||
202 | * Note: It's not necessary that each of a task's group schedulable | ||
203 | * entity have the same weight on all CPUs. If the group | ||
204 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | ||
205 | * better distribution of weight could be: | ||
206 | * | ||
207 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | ||
208 | * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 | ||
209 | * | ||
210 | * rebalance_shares() is responsible for distributing the shares of a | ||
211 | * task groups like this among the group's schedulable entities across | ||
212 | * cpus. | ||
213 | * | ||
214 | */ | ||
171 | unsigned long shares; | 215 | unsigned long shares; |
172 | /* spinlock to serialize modification to shares */ | 216 | |
173 | spinlock_t lock; | ||
174 | struct rcu_head rcu; | 217 | struct rcu_head rcu; |
218 | struct list_head list; | ||
175 | }; | 219 | }; |
176 | 220 | ||
177 | /* Default task group's sched entity on each cpu */ | 221 | /* Default task group's sched entity on each cpu */ |
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | |||
179 | /* Default task group's cfs_rq on each cpu */ | 223 | /* Default task group's cfs_rq on each cpu */ |
180 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
181 | 225 | ||
226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | ||
228 | |||
182 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | 229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; |
183 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | 230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; |
184 | 231 | ||
232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | ||
233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | ||
234 | |||
235 | /* task_group_mutex serializes add/remove of task groups and also changes to | ||
236 | * a task group's cpu shares. | ||
237 | */ | ||
238 | static DEFINE_MUTEX(task_group_mutex); | ||
239 | |||
240 | /* doms_cur_mutex serializes access to doms_cur[] array */ | ||
241 | static DEFINE_MUTEX(doms_cur_mutex); | ||
242 | |||
243 | #ifdef CONFIG_SMP | ||
244 | /* kernel thread that runs rebalance_shares() periodically */ | ||
245 | static struct task_struct *lb_monitor_task; | ||
246 | static int load_balance_monitor(void *unused); | ||
247 | #endif | ||
248 | |||
249 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | ||
250 | |||
185 | /* Default task group. | 251 | /* Default task group. |
186 | * Every task in system belong to this group at bootup. | 252 | * Every task in system belong to this group at bootup. |
187 | */ | 253 | */ |
188 | struct task_group init_task_group = { | 254 | struct task_group init_task_group = { |
189 | .se = init_sched_entity_p, | 255 | .se = init_sched_entity_p, |
190 | .cfs_rq = init_cfs_rq_p, | 256 | .cfs_rq = init_cfs_rq_p, |
257 | |||
258 | .rt_se = init_sched_rt_entity_p, | ||
259 | .rt_rq = init_rt_rq_p, | ||
191 | }; | 260 | }; |
192 | 261 | ||
193 | #ifdef CONFIG_FAIR_USER_SCHED | 262 | #ifdef CONFIG_FAIR_USER_SCHED |
194 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | 263 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
195 | #else | 264 | #else |
196 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | 265 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
197 | #endif | 266 | #endif |
198 | 267 | ||
199 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | 268 | #define MIN_GROUP_SHARES 2 |
269 | |||
270 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | ||
200 | 271 | ||
201 | /* return group to which a task belongs */ | 272 | /* return group to which a task belongs */ |
202 | static inline struct task_group *task_group(struct task_struct *p) | 273 | static inline struct task_group *task_group(struct task_struct *p) |
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
215 | } | 286 | } |
216 | 287 | ||
217 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
218 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) | 289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
219 | { | 290 | { |
220 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
221 | p->se.parent = task_group(p)->se[cpu]; | 292 | p->se.parent = task_group(p)->se[cpu]; |
293 | |||
294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
295 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
296 | } | ||
297 | |||
298 | static inline void lock_task_group_list(void) | ||
299 | { | ||
300 | mutex_lock(&task_group_mutex); | ||
301 | } | ||
302 | |||
303 | static inline void unlock_task_group_list(void) | ||
304 | { | ||
305 | mutex_unlock(&task_group_mutex); | ||
306 | } | ||
307 | |||
308 | static inline void lock_doms_cur(void) | ||
309 | { | ||
310 | mutex_lock(&doms_cur_mutex); | ||
311 | } | ||
312 | |||
313 | static inline void unlock_doms_cur(void) | ||
314 | { | ||
315 | mutex_unlock(&doms_cur_mutex); | ||
222 | } | 316 | } |
223 | 317 | ||
224 | #else | 318 | #else |
225 | 319 | ||
226 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } | 320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
321 | static inline void lock_task_group_list(void) { } | ||
322 | static inline void unlock_task_group_list(void) { } | ||
323 | static inline void lock_doms_cur(void) { } | ||
324 | static inline void unlock_doms_cur(void) { } | ||
227 | 325 | ||
228 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 326 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
229 | 327 | ||
@@ -264,11 +362,57 @@ struct cfs_rq { | |||
264 | /* Real-Time classes' related field in a runqueue: */ | 362 | /* Real-Time classes' related field in a runqueue: */ |
265 | struct rt_rq { | 363 | struct rt_rq { |
266 | struct rt_prio_array active; | 364 | struct rt_prio_array active; |
267 | int rt_load_balance_idx; | 365 | unsigned long rt_nr_running; |
268 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; | 366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED |
367 | int highest_prio; /* highest queued rt task prio */ | ||
368 | #endif | ||
369 | #ifdef CONFIG_SMP | ||
370 | unsigned long rt_nr_migratory; | ||
371 | int overloaded; | ||
372 | #endif | ||
373 | int rt_throttled; | ||
374 | u64 rt_time; | ||
375 | |||
376 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
377 | struct rq *rq; | ||
378 | struct list_head leaf_rt_rq_list; | ||
379 | struct task_group *tg; | ||
380 | struct sched_rt_entity *rt_se; | ||
381 | #endif | ||
382 | }; | ||
383 | |||
384 | #ifdef CONFIG_SMP | ||
385 | |||
386 | /* | ||
387 | * We add the notion of a root-domain which will be used to define per-domain | ||
388 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
389 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
390 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
391 | * object. | ||
392 | * | ||
393 | */ | ||
394 | struct root_domain { | ||
395 | atomic_t refcount; | ||
396 | cpumask_t span; | ||
397 | cpumask_t online; | ||
398 | |||
399 | /* | ||
400 | * The "RT overload" flag: it gets set if a CPU has more than | ||
401 | * one runnable RT task. | ||
402 | */ | ||
403 | cpumask_t rto_mask; | ||
404 | atomic_t rto_count; | ||
269 | }; | 405 | }; |
270 | 406 | ||
271 | /* | 407 | /* |
408 | * By default the system creates a single root-domain with all cpus as | ||
409 | * members (mimicking the global state we have today). | ||
410 | */ | ||
411 | static struct root_domain def_root_domain; | ||
412 | |||
413 | #endif | ||
414 | |||
415 | /* | ||
272 | * This is the main, per-CPU runqueue data structure. | 416 | * This is the main, per-CPU runqueue data structure. |
273 | * | 417 | * |
274 | * Locking rule: those places that want to lock multiple runqueues | 418 | * Locking rule: those places that want to lock multiple runqueues |
@@ -296,11 +440,15 @@ struct rq { | |||
296 | u64 nr_switches; | 440 | u64 nr_switches; |
297 | 441 | ||
298 | struct cfs_rq cfs; | 442 | struct cfs_rq cfs; |
443 | struct rt_rq rt; | ||
444 | u64 rt_period_expire; | ||
445 | int rt_throttled; | ||
446 | |||
299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 447 | #ifdef CONFIG_FAIR_GROUP_SCHED |
300 | /* list of leaf cfs_rq on this cpu: */ | 448 | /* list of leaf cfs_rq on this cpu: */ |
301 | struct list_head leaf_cfs_rq_list; | 449 | struct list_head leaf_cfs_rq_list; |
450 | struct list_head leaf_rt_rq_list; | ||
302 | #endif | 451 | #endif |
303 | struct rt_rq rt; | ||
304 | 452 | ||
305 | /* | 453 | /* |
306 | * This is part of a global counter where only the total sum | 454 | * This is part of a global counter where only the total sum |
@@ -317,7 +465,7 @@ struct rq { | |||
317 | u64 clock, prev_clock_raw; | 465 | u64 clock, prev_clock_raw; |
318 | s64 clock_max_delta; | 466 | s64 clock_max_delta; |
319 | 467 | ||
320 | unsigned int clock_warps, clock_overflows; | 468 | unsigned int clock_warps, clock_overflows, clock_underflows; |
321 | u64 idle_clock; | 469 | u64 idle_clock; |
322 | unsigned int clock_deep_idle_events; | 470 | unsigned int clock_deep_idle_events; |
323 | u64 tick_timestamp; | 471 | u64 tick_timestamp; |
@@ -325,6 +473,7 @@ struct rq { | |||
325 | atomic_t nr_iowait; | 473 | atomic_t nr_iowait; |
326 | 474 | ||
327 | #ifdef CONFIG_SMP | 475 | #ifdef CONFIG_SMP |
476 | struct root_domain *rd; | ||
328 | struct sched_domain *sd; | 477 | struct sched_domain *sd; |
329 | 478 | ||
330 | /* For active balancing */ | 479 | /* For active balancing */ |
@@ -337,6 +486,12 @@ struct rq { | |||
337 | struct list_head migration_queue; | 486 | struct list_head migration_queue; |
338 | #endif | 487 | #endif |
339 | 488 | ||
489 | #ifdef CONFIG_SCHED_HRTICK | ||
490 | unsigned long hrtick_flags; | ||
491 | ktime_t hrtick_expire; | ||
492 | struct hrtimer hrtick_timer; | ||
493 | #endif | ||
494 | |||
340 | #ifdef CONFIG_SCHEDSTATS | 495 | #ifdef CONFIG_SCHEDSTATS |
341 | /* latency stats */ | 496 | /* latency stats */ |
342 | struct sched_info rq_sched_info; | 497 | struct sched_info rq_sched_info; |
@@ -363,7 +518,6 @@ struct rq { | |||
363 | }; | 518 | }; |
364 | 519 | ||
365 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 520 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
366 | static DEFINE_MUTEX(sched_hotcpu_mutex); | ||
367 | 521 | ||
368 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 522 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
369 | { | 523 | { |
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq) | |||
441 | #define task_rq(p) cpu_rq(task_cpu(p)) | 595 | #define task_rq(p) cpu_rq(task_cpu(p)) |
442 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 596 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
443 | 597 | ||
598 | unsigned long rt_needs_cpu(int cpu) | ||
599 | { | ||
600 | struct rq *rq = cpu_rq(cpu); | ||
601 | u64 delta; | ||
602 | |||
603 | if (!rq->rt_throttled) | ||
604 | return 0; | ||
605 | |||
606 | if (rq->clock > rq->rt_period_expire) | ||
607 | return 1; | ||
608 | |||
609 | delta = rq->rt_period_expire - rq->clock; | ||
610 | do_div(delta, NSEC_PER_SEC / HZ); | ||
611 | |||
612 | return (unsigned long)delta; | ||
613 | } | ||
614 | |||
444 | /* | 615 | /* |
445 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 616 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
446 | */ | 617 | */ |
@@ -459,6 +630,8 @@ enum { | |||
459 | SCHED_FEAT_START_DEBIT = 4, | 630 | SCHED_FEAT_START_DEBIT = 4, |
460 | SCHED_FEAT_TREE_AVG = 8, | 631 | SCHED_FEAT_TREE_AVG = 8, |
461 | SCHED_FEAT_APPROX_AVG = 16, | 632 | SCHED_FEAT_APPROX_AVG = 16, |
633 | SCHED_FEAT_HRTICK = 32, | ||
634 | SCHED_FEAT_DOUBLE_TICK = 64, | ||
462 | }; | 635 | }; |
463 | 636 | ||
464 | const_debug unsigned int sysctl_sched_features = | 637 | const_debug unsigned int sysctl_sched_features = |
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features = | |||
466 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 639 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | |
467 | SCHED_FEAT_START_DEBIT * 1 | | 640 | SCHED_FEAT_START_DEBIT * 1 | |
468 | SCHED_FEAT_TREE_AVG * 0 | | 641 | SCHED_FEAT_TREE_AVG * 0 | |
469 | SCHED_FEAT_APPROX_AVG * 0; | 642 | SCHED_FEAT_APPROX_AVG * 0 | |
643 | SCHED_FEAT_HRTICK * 1 | | ||
644 | SCHED_FEAT_DOUBLE_TICK * 0; | ||
470 | 645 | ||
471 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 646 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) |
472 | 647 | ||
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features = | |||
477 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 652 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
478 | 653 | ||
479 | /* | 654 | /* |
655 | * period over which we measure -rt task cpu usage in ms. | ||
656 | * default: 1s | ||
657 | */ | ||
658 | const_debug unsigned int sysctl_sched_rt_period = 1000; | ||
659 | |||
660 | #define SCHED_RT_FRAC_SHIFT 16 | ||
661 | #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) | ||
662 | |||
663 | /* | ||
664 | * ratio of time -rt tasks may consume. | ||
665 | * default: 95% | ||
666 | */ | ||
667 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; | ||
668 | |||
669 | /* | ||
480 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 670 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
481 | * clock constructed from sched_clock(): | 671 | * clock constructed from sched_clock(): |
482 | */ | 672 | */ |
@@ -488,7 +678,12 @@ unsigned long long cpu_clock(int cpu) | |||
488 | 678 | ||
489 | local_irq_save(flags); | 679 | local_irq_save(flags); |
490 | rq = cpu_rq(cpu); | 680 | rq = cpu_rq(cpu); |
491 | update_rq_clock(rq); | 681 | /* |
682 | * Only call sched_clock() if the scheduler has already been | ||
683 | * initialized (some code might call cpu_clock() very early): | ||
684 | */ | ||
685 | if (rq->idle) | ||
686 | update_rq_clock(rq); | ||
492 | now = rq->clock; | 687 | now = rq->clock; |
493 | local_irq_restore(flags); | 688 | local_irq_restore(flags); |
494 | 689 | ||
@@ -503,10 +698,15 @@ EXPORT_SYMBOL_GPL(cpu_clock); | |||
503 | # define finish_arch_switch(prev) do { } while (0) | 698 | # define finish_arch_switch(prev) do { } while (0) |
504 | #endif | 699 | #endif |
505 | 700 | ||
701 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
702 | { | ||
703 | return rq->curr == p; | ||
704 | } | ||
705 | |||
506 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 706 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
507 | static inline int task_running(struct rq *rq, struct task_struct *p) | 707 | static inline int task_running(struct rq *rq, struct task_struct *p) |
508 | { | 708 | { |
509 | return rq->curr == p; | 709 | return task_current(rq, p); |
510 | } | 710 | } |
511 | 711 | ||
512 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 712 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
@@ -535,7 +735,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
535 | #ifdef CONFIG_SMP | 735 | #ifdef CONFIG_SMP |
536 | return p->oncpu; | 736 | return p->oncpu; |
537 | #else | 737 | #else |
538 | return rq->curr == p; | 738 | return task_current(rq, p); |
539 | #endif | 739 | #endif |
540 | } | 740 | } |
541 | 741 | ||
@@ -669,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
669 | rq->prev_clock_raw = now; | 869 | rq->prev_clock_raw = now; |
670 | rq->clock += delta_ns; | 870 | rq->clock += delta_ns; |
671 | spin_unlock(&rq->lock); | 871 | spin_unlock(&rq->lock); |
872 | touch_softlockup_watchdog(); | ||
672 | } | 873 | } |
673 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 874 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
674 | 875 | ||
876 | static void __resched_task(struct task_struct *p, int tif_bit); | ||
877 | |||
878 | static inline void resched_task(struct task_struct *p) | ||
879 | { | ||
880 | __resched_task(p, TIF_NEED_RESCHED); | ||
881 | } | ||
882 | |||
883 | #ifdef CONFIG_SCHED_HRTICK | ||
884 | /* | ||
885 | * Use HR-timers to deliver accurate preemption points. | ||
886 | * | ||
887 | * Its all a bit involved since we cannot program an hrt while holding the | ||
888 | * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a | ||
889 | * reschedule event. | ||
890 | * | ||
891 | * When we get rescheduled we reprogram the hrtick_timer outside of the | ||
892 | * rq->lock. | ||
893 | */ | ||
894 | static inline void resched_hrt(struct task_struct *p) | ||
895 | { | ||
896 | __resched_task(p, TIF_HRTICK_RESCHED); | ||
897 | } | ||
898 | |||
899 | static inline void resched_rq(struct rq *rq) | ||
900 | { | ||
901 | unsigned long flags; | ||
902 | |||
903 | spin_lock_irqsave(&rq->lock, flags); | ||
904 | resched_task(rq->curr); | ||
905 | spin_unlock_irqrestore(&rq->lock, flags); | ||
906 | } | ||
907 | |||
908 | enum { | ||
909 | HRTICK_SET, /* re-programm hrtick_timer */ | ||
910 | HRTICK_RESET, /* not a new slice */ | ||
911 | }; | ||
912 | |||
913 | /* | ||
914 | * Use hrtick when: | ||
915 | * - enabled by features | ||
916 | * - hrtimer is actually high res | ||
917 | */ | ||
918 | static inline int hrtick_enabled(struct rq *rq) | ||
919 | { | ||
920 | if (!sched_feat(HRTICK)) | ||
921 | return 0; | ||
922 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * Called to set the hrtick timer state. | ||
927 | * | ||
928 | * called with rq->lock held and irqs disabled | ||
929 | */ | ||
930 | static void hrtick_start(struct rq *rq, u64 delay, int reset) | ||
931 | { | ||
932 | assert_spin_locked(&rq->lock); | ||
933 | |||
934 | /* | ||
935 | * preempt at: now + delay | ||
936 | */ | ||
937 | rq->hrtick_expire = | ||
938 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); | ||
939 | /* | ||
940 | * indicate we need to program the timer | ||
941 | */ | ||
942 | __set_bit(HRTICK_SET, &rq->hrtick_flags); | ||
943 | if (reset) | ||
944 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
945 | |||
946 | /* | ||
947 | * New slices are called from the schedule path and don't need a | ||
948 | * forced reschedule. | ||
949 | */ | ||
950 | if (reset) | ||
951 | resched_hrt(rq->curr); | ||
952 | } | ||
953 | |||
954 | static void hrtick_clear(struct rq *rq) | ||
955 | { | ||
956 | if (hrtimer_active(&rq->hrtick_timer)) | ||
957 | hrtimer_cancel(&rq->hrtick_timer); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Update the timer from the possible pending state. | ||
962 | */ | ||
963 | static void hrtick_set(struct rq *rq) | ||
964 | { | ||
965 | ktime_t time; | ||
966 | int set, reset; | ||
967 | unsigned long flags; | ||
968 | |||
969 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
970 | |||
971 | spin_lock_irqsave(&rq->lock, flags); | ||
972 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); | ||
973 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
974 | time = rq->hrtick_expire; | ||
975 | clear_thread_flag(TIF_HRTICK_RESCHED); | ||
976 | spin_unlock_irqrestore(&rq->lock, flags); | ||
977 | |||
978 | if (set) { | ||
979 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); | ||
980 | if (reset && !hrtimer_active(&rq->hrtick_timer)) | ||
981 | resched_rq(rq); | ||
982 | } else | ||
983 | hrtick_clear(rq); | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * High-resolution timer tick. | ||
988 | * Runs from hardirq context with interrupts disabled. | ||
989 | */ | ||
990 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | ||
991 | { | ||
992 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | ||
993 | |||
994 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
995 | |||
996 | spin_lock(&rq->lock); | ||
997 | __update_rq_clock(rq); | ||
998 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | ||
999 | spin_unlock(&rq->lock); | ||
1000 | |||
1001 | return HRTIMER_NORESTART; | ||
1002 | } | ||
1003 | |||
1004 | static inline void init_rq_hrtick(struct rq *rq) | ||
1005 | { | ||
1006 | rq->hrtick_flags = 0; | ||
1007 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1008 | rq->hrtick_timer.function = hrtick; | ||
1009 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
1010 | } | ||
1011 | |||
1012 | void hrtick_resched(void) | ||
1013 | { | ||
1014 | struct rq *rq; | ||
1015 | unsigned long flags; | ||
1016 | |||
1017 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) | ||
1018 | return; | ||
1019 | |||
1020 | local_irq_save(flags); | ||
1021 | rq = cpu_rq(smp_processor_id()); | ||
1022 | hrtick_set(rq); | ||
1023 | local_irq_restore(flags); | ||
1024 | } | ||
1025 | #else | ||
1026 | static inline void hrtick_clear(struct rq *rq) | ||
1027 | { | ||
1028 | } | ||
1029 | |||
1030 | static inline void hrtick_set(struct rq *rq) | ||
1031 | { | ||
1032 | } | ||
1033 | |||
1034 | static inline void init_rq_hrtick(struct rq *rq) | ||
1035 | { | ||
1036 | } | ||
1037 | |||
1038 | void hrtick_resched(void) | ||
1039 | { | ||
1040 | } | ||
1041 | #endif | ||
1042 | |||
675 | /* | 1043 | /* |
676 | * resched_task - mark a task 'to be rescheduled now'. | 1044 | * resched_task - mark a task 'to be rescheduled now'. |
677 | * | 1045 | * |
@@ -685,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
685 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1053 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
686 | #endif | 1054 | #endif |
687 | 1055 | ||
688 | static void resched_task(struct task_struct *p) | 1056 | static void __resched_task(struct task_struct *p, int tif_bit) |
689 | { | 1057 | { |
690 | int cpu; | 1058 | int cpu; |
691 | 1059 | ||
692 | assert_spin_locked(&task_rq(p)->lock); | 1060 | assert_spin_locked(&task_rq(p)->lock); |
693 | 1061 | ||
694 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 1062 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) |
695 | return; | 1063 | return; |
696 | 1064 | ||
697 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 1065 | set_tsk_thread_flag(p, tif_bit); |
698 | 1066 | ||
699 | cpu = task_cpu(p); | 1067 | cpu = task_cpu(p); |
700 | if (cpu == smp_processor_id()) | 1068 | if (cpu == smp_processor_id()) |
@@ -717,10 +1085,10 @@ static void resched_cpu(int cpu) | |||
717 | spin_unlock_irqrestore(&rq->lock, flags); | 1085 | spin_unlock_irqrestore(&rq->lock, flags); |
718 | } | 1086 | } |
719 | #else | 1087 | #else |
720 | static inline void resched_task(struct task_struct *p) | 1088 | static void __resched_task(struct task_struct *p, int tif_bit) |
721 | { | 1089 | { |
722 | assert_spin_locked(&task_rq(p)->lock); | 1090 | assert_spin_locked(&task_rq(p)->lock); |
723 | set_tsk_need_resched(p); | 1091 | set_tsk_thread_flag(p, tif_bit); |
724 | } | 1092 | } |
725 | #endif | 1093 | #endif |
726 | 1094 | ||
@@ -860,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
860 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1228 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
861 | #endif | 1229 | #endif |
862 | 1230 | ||
1231 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1232 | { | ||
1233 | update_load_add(&rq->load, load); | ||
1234 | } | ||
1235 | |||
1236 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1237 | { | ||
1238 | update_load_sub(&rq->load, load); | ||
1239 | } | ||
1240 | |||
1241 | #ifdef CONFIG_SMP | ||
1242 | static unsigned long source_load(int cpu, int type); | ||
1243 | static unsigned long target_load(int cpu, int type); | ||
1244 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1245 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1246 | #endif /* CONFIG_SMP */ | ||
1247 | |||
863 | #include "sched_stats.h" | 1248 | #include "sched_stats.h" |
864 | #include "sched_idletask.c" | 1249 | #include "sched_idletask.c" |
865 | #include "sched_fair.c" | 1250 | #include "sched_fair.c" |
@@ -870,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | |||
870 | 1255 | ||
871 | #define sched_class_highest (&rt_sched_class) | 1256 | #define sched_class_highest (&rt_sched_class) |
872 | 1257 | ||
873 | /* | 1258 | static void inc_nr_running(struct rq *rq) |
874 | * Update delta_exec, delta_fair fields for rq. | ||
875 | * | ||
876 | * delta_fair clock advances at a rate inversely proportional to | ||
877 | * total load (rq->load.weight) on the runqueue, while | ||
878 | * delta_exec advances at the same rate as wall-clock (provided | ||
879 | * cpu is not idle). | ||
880 | * | ||
881 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | ||
882 | * runqueue over any given interval. This (smoothened) load is used | ||
883 | * during load balance. | ||
884 | * | ||
885 | * This function is called /before/ updating rq->load | ||
886 | * and when switching tasks. | ||
887 | */ | ||
888 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | ||
889 | { | ||
890 | update_load_add(&rq->load, p->se.load.weight); | ||
891 | } | ||
892 | |||
893 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
894 | { | ||
895 | update_load_sub(&rq->load, p->se.load.weight); | ||
896 | } | ||
897 | |||
898 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
899 | { | 1259 | { |
900 | rq->nr_running++; | 1260 | rq->nr_running++; |
901 | inc_load(rq, p); | ||
902 | } | 1261 | } |
903 | 1262 | ||
904 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1263 | static void dec_nr_running(struct rq *rq) |
905 | { | 1264 | { |
906 | rq->nr_running--; | 1265 | rq->nr_running--; |
907 | dec_load(rq, p); | ||
908 | } | 1266 | } |
909 | 1267 | ||
910 | static void set_load_weight(struct task_struct *p) | 1268 | static void set_load_weight(struct task_struct *p) |
@@ -996,7 +1354,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
996 | rq->nr_uninterruptible--; | 1354 | rq->nr_uninterruptible--; |
997 | 1355 | ||
998 | enqueue_task(rq, p, wakeup); | 1356 | enqueue_task(rq, p, wakeup); |
999 | inc_nr_running(p, rq); | 1357 | inc_nr_running(rq); |
1000 | } | 1358 | } |
1001 | 1359 | ||
1002 | /* | 1360 | /* |
@@ -1008,7 +1366,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1008 | rq->nr_uninterruptible++; | 1366 | rq->nr_uninterruptible++; |
1009 | 1367 | ||
1010 | dequeue_task(rq, p, sleep); | 1368 | dequeue_task(rq, p, sleep); |
1011 | dec_nr_running(p, rq); | 1369 | dec_nr_running(rq); |
1012 | } | 1370 | } |
1013 | 1371 | ||
1014 | /** | 1372 | /** |
@@ -1028,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu) | |||
1028 | 1386 | ||
1029 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1387 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1030 | { | 1388 | { |
1031 | set_task_cfs_rq(p, cpu); | 1389 | set_task_rq(p, cpu); |
1032 | #ifdef CONFIG_SMP | 1390 | #ifdef CONFIG_SMP |
1033 | /* | 1391 | /* |
1034 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1392 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
@@ -1040,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1040 | #endif | 1398 | #endif |
1041 | } | 1399 | } |
1042 | 1400 | ||
1401 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | ||
1402 | const struct sched_class *prev_class, | ||
1403 | int oldprio, int running) | ||
1404 | { | ||
1405 | if (prev_class != p->sched_class) { | ||
1406 | if (prev_class->switched_from) | ||
1407 | prev_class->switched_from(rq, p, running); | ||
1408 | p->sched_class->switched_to(rq, p, running); | ||
1409 | } else | ||
1410 | p->sched_class->prio_changed(rq, p, oldprio, running); | ||
1411 | } | ||
1412 | |||
1043 | #ifdef CONFIG_SMP | 1413 | #ifdef CONFIG_SMP |
1044 | 1414 | ||
1045 | /* | 1415 | /* |
1046 | * Is this task likely cache-hot: | 1416 | * Is this task likely cache-hot: |
1047 | */ | 1417 | */ |
1048 | static inline int | 1418 | static int |
1049 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 1419 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
1050 | { | 1420 | { |
1051 | s64 delta; | 1421 | s64 delta; |
@@ -1270,7 +1640,7 @@ static unsigned long target_load(int cpu, int type) | |||
1270 | /* | 1640 | /* |
1271 | * Return the average load per task on the cpu's run queue | 1641 | * Return the average load per task on the cpu's run queue |
1272 | */ | 1642 | */ |
1273 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1643 | static unsigned long cpu_avg_load_per_task(int cpu) |
1274 | { | 1644 | { |
1275 | struct rq *rq = cpu_rq(cpu); | 1645 | struct rq *rq = cpu_rq(cpu); |
1276 | unsigned long total = weighted_cpuload(cpu); | 1646 | unsigned long total = weighted_cpuload(cpu); |
@@ -1427,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag) | |||
1427 | 1797 | ||
1428 | #endif /* CONFIG_SMP */ | 1798 | #endif /* CONFIG_SMP */ |
1429 | 1799 | ||
1430 | /* | ||
1431 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
1432 | * not idle and an idle cpu is available. The span of cpus to | ||
1433 | * search starts with cpus closest then further out as needed, | ||
1434 | * so we always favor a closer, idle cpu. | ||
1435 | * | ||
1436 | * Returns the CPU we should wake onto. | ||
1437 | */ | ||
1438 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
1439 | static int wake_idle(int cpu, struct task_struct *p) | ||
1440 | { | ||
1441 | cpumask_t tmp; | ||
1442 | struct sched_domain *sd; | ||
1443 | int i; | ||
1444 | |||
1445 | /* | ||
1446 | * If it is idle, then it is the best cpu to run this task. | ||
1447 | * | ||
1448 | * This cpu is also the best, if it has more than one task already. | ||
1449 | * Siblings must be also busy(in most cases) as they didn't already | ||
1450 | * pickup the extra load from this cpu and hence we need not check | ||
1451 | * sibling runqueue info. This will avoid the checks and cache miss | ||
1452 | * penalities associated with that. | ||
1453 | */ | ||
1454 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
1455 | return cpu; | ||
1456 | |||
1457 | for_each_domain(cpu, sd) { | ||
1458 | if (sd->flags & SD_WAKE_IDLE) { | ||
1459 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
1460 | for_each_cpu_mask(i, tmp) { | ||
1461 | if (idle_cpu(i)) { | ||
1462 | if (i != task_cpu(p)) { | ||
1463 | schedstat_inc(p, | ||
1464 | se.nr_wakeups_idle); | ||
1465 | } | ||
1466 | return i; | ||
1467 | } | ||
1468 | } | ||
1469 | } else { | ||
1470 | break; | ||
1471 | } | ||
1472 | } | ||
1473 | return cpu; | ||
1474 | } | ||
1475 | #else | ||
1476 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
1477 | { | ||
1478 | return cpu; | ||
1479 | } | ||
1480 | #endif | ||
1481 | |||
1482 | /*** | 1800 | /*** |
1483 | * try_to_wake_up - wake up a thread | 1801 | * try_to_wake_up - wake up a thread |
1484 | * @p: the to-be-woken-up thread | 1802 | * @p: the to-be-woken-up thread |
@@ -1499,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1499 | unsigned long flags; | 1817 | unsigned long flags; |
1500 | long old_state; | 1818 | long old_state; |
1501 | struct rq *rq; | 1819 | struct rq *rq; |
1502 | #ifdef CONFIG_SMP | ||
1503 | struct sched_domain *sd, *this_sd = NULL; | ||
1504 | unsigned long load, this_load; | ||
1505 | int new_cpu; | ||
1506 | #endif | ||
1507 | 1820 | ||
1508 | rq = task_rq_lock(p, &flags); | 1821 | rq = task_rq_lock(p, &flags); |
1509 | old_state = p->state; | 1822 | old_state = p->state; |
@@ -1521,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1521 | if (unlikely(task_running(rq, p))) | 1834 | if (unlikely(task_running(rq, p))) |
1522 | goto out_activate; | 1835 | goto out_activate; |
1523 | 1836 | ||
1524 | new_cpu = cpu; | 1837 | cpu = p->sched_class->select_task_rq(p, sync); |
1525 | 1838 | if (cpu != orig_cpu) { | |
1526 | schedstat_inc(rq, ttwu_count); | 1839 | set_task_cpu(p, cpu); |
1527 | if (cpu == this_cpu) { | ||
1528 | schedstat_inc(rq, ttwu_local); | ||
1529 | goto out_set_cpu; | ||
1530 | } | ||
1531 | |||
1532 | for_each_domain(this_cpu, sd) { | ||
1533 | if (cpu_isset(cpu, sd->span)) { | ||
1534 | schedstat_inc(sd, ttwu_wake_remote); | ||
1535 | this_sd = sd; | ||
1536 | break; | ||
1537 | } | ||
1538 | } | ||
1539 | |||
1540 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1541 | goto out_set_cpu; | ||
1542 | |||
1543 | /* | ||
1544 | * Check for affine wakeup and passive balancing possibilities. | ||
1545 | */ | ||
1546 | if (this_sd) { | ||
1547 | int idx = this_sd->wake_idx; | ||
1548 | unsigned int imbalance; | ||
1549 | |||
1550 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1551 | |||
1552 | load = source_load(cpu, idx); | ||
1553 | this_load = target_load(this_cpu, idx); | ||
1554 | |||
1555 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1556 | |||
1557 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1558 | unsigned long tl = this_load; | ||
1559 | unsigned long tl_per_task; | ||
1560 | |||
1561 | /* | ||
1562 | * Attract cache-cold tasks on sync wakeups: | ||
1563 | */ | ||
1564 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1565 | goto out_set_cpu; | ||
1566 | |||
1567 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1568 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1569 | |||
1570 | /* | ||
1571 | * If sync wakeup then subtract the (maximum possible) | ||
1572 | * effect of the currently running task from the load | ||
1573 | * of the current CPU: | ||
1574 | */ | ||
1575 | if (sync) | ||
1576 | tl -= current->se.load.weight; | ||
1577 | |||
1578 | if ((tl <= load && | ||
1579 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1580 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1581 | /* | ||
1582 | * This domain has SD_WAKE_AFFINE and | ||
1583 | * p is cache cold in this domain, and | ||
1584 | * there is no bad imbalance. | ||
1585 | */ | ||
1586 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1587 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1588 | goto out_set_cpu; | ||
1589 | } | ||
1590 | } | ||
1591 | |||
1592 | /* | ||
1593 | * Start passive balancing when half the imbalance_pct | ||
1594 | * limit is reached. | ||
1595 | */ | ||
1596 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1597 | if (imbalance*this_load <= 100*load) { | ||
1598 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1599 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1600 | goto out_set_cpu; | ||
1601 | } | ||
1602 | } | ||
1603 | } | ||
1604 | |||
1605 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1606 | out_set_cpu: | ||
1607 | new_cpu = wake_idle(new_cpu, p); | ||
1608 | if (new_cpu != cpu) { | ||
1609 | set_task_cpu(p, new_cpu); | ||
1610 | task_rq_unlock(rq, &flags); | 1840 | task_rq_unlock(rq, &flags); |
1611 | /* might preempt at this point */ | 1841 | /* might preempt at this point */ |
1612 | rq = task_rq_lock(p, &flags); | 1842 | rq = task_rq_lock(p, &flags); |
@@ -1620,6 +1850,21 @@ out_set_cpu: | |||
1620 | cpu = task_cpu(p); | 1850 | cpu = task_cpu(p); |
1621 | } | 1851 | } |
1622 | 1852 | ||
1853 | #ifdef CONFIG_SCHEDSTATS | ||
1854 | schedstat_inc(rq, ttwu_count); | ||
1855 | if (cpu == this_cpu) | ||
1856 | schedstat_inc(rq, ttwu_local); | ||
1857 | else { | ||
1858 | struct sched_domain *sd; | ||
1859 | for_each_domain(this_cpu, sd) { | ||
1860 | if (cpu_isset(cpu, sd->span)) { | ||
1861 | schedstat_inc(sd, ttwu_wake_remote); | ||
1862 | break; | ||
1863 | } | ||
1864 | } | ||
1865 | } | ||
1866 | #endif | ||
1867 | |||
1623 | out_activate: | 1868 | out_activate: |
1624 | #endif /* CONFIG_SMP */ | 1869 | #endif /* CONFIG_SMP */ |
1625 | schedstat_inc(p, se.nr_wakeups); | 1870 | schedstat_inc(p, se.nr_wakeups); |
@@ -1638,6 +1883,10 @@ out_activate: | |||
1638 | 1883 | ||
1639 | out_running: | 1884 | out_running: |
1640 | p->state = TASK_RUNNING; | 1885 | p->state = TASK_RUNNING; |
1886 | #ifdef CONFIG_SMP | ||
1887 | if (p->sched_class->task_wake_up) | ||
1888 | p->sched_class->task_wake_up(rq, p); | ||
1889 | #endif | ||
1641 | out: | 1890 | out: |
1642 | task_rq_unlock(rq, &flags); | 1891 | task_rq_unlock(rq, &flags); |
1643 | 1892 | ||
@@ -1679,7 +1928,7 @@ static void __sched_fork(struct task_struct *p) | |||
1679 | p->se.wait_max = 0; | 1928 | p->se.wait_max = 0; |
1680 | #endif | 1929 | #endif |
1681 | 1930 | ||
1682 | INIT_LIST_HEAD(&p->run_list); | 1931 | INIT_LIST_HEAD(&p->rt.run_list); |
1683 | p->se.on_rq = 0; | 1932 | p->se.on_rq = 0; |
1684 | 1933 | ||
1685 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1934 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1756,9 +2005,13 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1756 | * management (if any): | 2005 | * management (if any): |
1757 | */ | 2006 | */ |
1758 | p->sched_class->task_new(rq, p); | 2007 | p->sched_class->task_new(rq, p); |
1759 | inc_nr_running(p, rq); | 2008 | inc_nr_running(rq); |
1760 | } | 2009 | } |
1761 | check_preempt_curr(rq, p); | 2010 | check_preempt_curr(rq, p); |
2011 | #ifdef CONFIG_SMP | ||
2012 | if (p->sched_class->task_wake_up) | ||
2013 | p->sched_class->task_wake_up(rq, p); | ||
2014 | #endif | ||
1762 | task_rq_unlock(rq, &flags); | 2015 | task_rq_unlock(rq, &flags); |
1763 | } | 2016 | } |
1764 | 2017 | ||
@@ -1879,6 +2132,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1879 | prev_state = prev->state; | 2132 | prev_state = prev->state; |
1880 | finish_arch_switch(prev); | 2133 | finish_arch_switch(prev); |
1881 | finish_lock_switch(rq, prev); | 2134 | finish_lock_switch(rq, prev); |
2135 | #ifdef CONFIG_SMP | ||
2136 | if (current->sched_class->post_schedule) | ||
2137 | current->sched_class->post_schedule(rq); | ||
2138 | #endif | ||
2139 | |||
1882 | fire_sched_in_preempt_notifiers(current); | 2140 | fire_sched_in_preempt_notifiers(current); |
1883 | if (mm) | 2141 | if (mm) |
1884 | mmdrop(mm); | 2142 | mmdrop(mm); |
@@ -2112,11 +2370,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
2112 | /* | 2370 | /* |
2113 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2371 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
2114 | */ | 2372 | */ |
2115 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2373 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
2116 | __releases(this_rq->lock) | 2374 | __releases(this_rq->lock) |
2117 | __acquires(busiest->lock) | 2375 | __acquires(busiest->lock) |
2118 | __acquires(this_rq->lock) | 2376 | __acquires(this_rq->lock) |
2119 | { | 2377 | { |
2378 | int ret = 0; | ||
2379 | |||
2120 | if (unlikely(!irqs_disabled())) { | 2380 | if (unlikely(!irqs_disabled())) { |
2121 | /* printk() doesn't work good under rq->lock */ | 2381 | /* printk() doesn't work good under rq->lock */ |
2122 | spin_unlock(&this_rq->lock); | 2382 | spin_unlock(&this_rq->lock); |
@@ -2127,9 +2387,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
2127 | spin_unlock(&this_rq->lock); | 2387 | spin_unlock(&this_rq->lock); |
2128 | spin_lock(&busiest->lock); | 2388 | spin_lock(&busiest->lock); |
2129 | spin_lock(&this_rq->lock); | 2389 | spin_lock(&this_rq->lock); |
2390 | ret = 1; | ||
2130 | } else | 2391 | } else |
2131 | spin_lock(&busiest->lock); | 2392 | spin_lock(&busiest->lock); |
2132 | } | 2393 | } |
2394 | return ret; | ||
2133 | } | 2395 | } |
2134 | 2396 | ||
2135 | /* | 2397 | /* |
@@ -3328,7 +3590,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3328 | 3590 | ||
3329 | rq = task_rq_lock(p, &flags); | 3591 | rq = task_rq_lock(p, &flags); |
3330 | ns = p->se.sum_exec_runtime; | 3592 | ns = p->se.sum_exec_runtime; |
3331 | if (rq->curr == p) { | 3593 | if (task_current(rq, p)) { |
3332 | update_rq_clock(rq); | 3594 | update_rq_clock(rq); |
3333 | delta_exec = rq->clock - p->se.exec_start; | 3595 | delta_exec = rq->clock - p->se.exec_start; |
3334 | if ((s64)delta_exec > 0) | 3596 | if ((s64)delta_exec > 0) |
@@ -3473,12 +3735,14 @@ void scheduler_tick(void) | |||
3473 | /* | 3735 | /* |
3474 | * Let rq->clock advance by at least TICK_NSEC: | 3736 | * Let rq->clock advance by at least TICK_NSEC: |
3475 | */ | 3737 | */ |
3476 | if (unlikely(rq->clock < next_tick)) | 3738 | if (unlikely(rq->clock < next_tick)) { |
3477 | rq->clock = next_tick; | 3739 | rq->clock = next_tick; |
3740 | rq->clock_underflows++; | ||
3741 | } | ||
3478 | rq->tick_timestamp = rq->clock; | 3742 | rq->tick_timestamp = rq->clock; |
3479 | update_cpu_load(rq); | 3743 | update_cpu_load(rq); |
3480 | if (curr != rq->idle) /* FIXME: needed? */ | 3744 | curr->sched_class->task_tick(rq, curr, 0); |
3481 | curr->sched_class->task_tick(rq, curr); | 3745 | update_sched_rt_period(rq); |
3482 | spin_unlock(&rq->lock); | 3746 | spin_unlock(&rq->lock); |
3483 | 3747 | ||
3484 | #ifdef CONFIG_SMP | 3748 | #ifdef CONFIG_SMP |
@@ -3624,6 +3888,8 @@ need_resched_nonpreemptible: | |||
3624 | 3888 | ||
3625 | schedule_debug(prev); | 3889 | schedule_debug(prev); |
3626 | 3890 | ||
3891 | hrtick_clear(rq); | ||
3892 | |||
3627 | /* | 3893 | /* |
3628 | * Do the rq-clock update outside the rq lock: | 3894 | * Do the rq-clock update outside the rq lock: |
3629 | */ | 3895 | */ |
@@ -3642,6 +3908,11 @@ need_resched_nonpreemptible: | |||
3642 | switch_count = &prev->nvcsw; | 3908 | switch_count = &prev->nvcsw; |
3643 | } | 3909 | } |
3644 | 3910 | ||
3911 | #ifdef CONFIG_SMP | ||
3912 | if (prev->sched_class->pre_schedule) | ||
3913 | prev->sched_class->pre_schedule(rq, prev); | ||
3914 | #endif | ||
3915 | |||
3645 | if (unlikely(!rq->nr_running)) | 3916 | if (unlikely(!rq->nr_running)) |
3646 | idle_balance(cpu, rq); | 3917 | idle_balance(cpu, rq); |
3647 | 3918 | ||
@@ -3656,14 +3927,20 @@ need_resched_nonpreemptible: | |||
3656 | ++*switch_count; | 3927 | ++*switch_count; |
3657 | 3928 | ||
3658 | context_switch(rq, prev, next); /* unlocks the rq */ | 3929 | context_switch(rq, prev, next); /* unlocks the rq */ |
3930 | /* | ||
3931 | * the context switch might have flipped the stack from under | ||
3932 | * us, hence refresh the local variables. | ||
3933 | */ | ||
3934 | cpu = smp_processor_id(); | ||
3935 | rq = cpu_rq(cpu); | ||
3659 | } else | 3936 | } else |
3660 | spin_unlock_irq(&rq->lock); | 3937 | spin_unlock_irq(&rq->lock); |
3661 | 3938 | ||
3662 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3939 | hrtick_set(rq); |
3663 | cpu = smp_processor_id(); | 3940 | |
3664 | rq = cpu_rq(cpu); | 3941 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
3665 | goto need_resched_nonpreemptible; | 3942 | goto need_resched_nonpreemptible; |
3666 | } | 3943 | |
3667 | preempt_enable_no_resched(); | 3944 | preempt_enable_no_resched(); |
3668 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3945 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3669 | goto need_resched; | 3946 | goto need_resched; |
@@ -3679,10 +3956,9 @@ EXPORT_SYMBOL(schedule); | |||
3679 | asmlinkage void __sched preempt_schedule(void) | 3956 | asmlinkage void __sched preempt_schedule(void) |
3680 | { | 3957 | { |
3681 | struct thread_info *ti = current_thread_info(); | 3958 | struct thread_info *ti = current_thread_info(); |
3682 | #ifdef CONFIG_PREEMPT_BKL | ||
3683 | struct task_struct *task = current; | 3959 | struct task_struct *task = current; |
3684 | int saved_lock_depth; | 3960 | int saved_lock_depth; |
3685 | #endif | 3961 | |
3686 | /* | 3962 | /* |
3687 | * If there is a non-zero preempt_count or interrupts are disabled, | 3963 | * If there is a non-zero preempt_count or interrupts are disabled, |
3688 | * we do not want to preempt the current task. Just return.. | 3964 | * we do not want to preempt the current task. Just return.. |
@@ -3698,14 +3974,10 @@ asmlinkage void __sched preempt_schedule(void) | |||
3698 | * clear ->lock_depth so that schedule() doesnt | 3974 | * clear ->lock_depth so that schedule() doesnt |
3699 | * auto-release the semaphore: | 3975 | * auto-release the semaphore: |
3700 | */ | 3976 | */ |
3701 | #ifdef CONFIG_PREEMPT_BKL | ||
3702 | saved_lock_depth = task->lock_depth; | 3977 | saved_lock_depth = task->lock_depth; |
3703 | task->lock_depth = -1; | 3978 | task->lock_depth = -1; |
3704 | #endif | ||
3705 | schedule(); | 3979 | schedule(); |
3706 | #ifdef CONFIG_PREEMPT_BKL | ||
3707 | task->lock_depth = saved_lock_depth; | 3980 | task->lock_depth = saved_lock_depth; |
3708 | #endif | ||
3709 | sub_preempt_count(PREEMPT_ACTIVE); | 3981 | sub_preempt_count(PREEMPT_ACTIVE); |
3710 | 3982 | ||
3711 | /* | 3983 | /* |
@@ -3726,10 +3998,9 @@ EXPORT_SYMBOL(preempt_schedule); | |||
3726 | asmlinkage void __sched preempt_schedule_irq(void) | 3998 | asmlinkage void __sched preempt_schedule_irq(void) |
3727 | { | 3999 | { |
3728 | struct thread_info *ti = current_thread_info(); | 4000 | struct thread_info *ti = current_thread_info(); |
3729 | #ifdef CONFIG_PREEMPT_BKL | ||
3730 | struct task_struct *task = current; | 4001 | struct task_struct *task = current; |
3731 | int saved_lock_depth; | 4002 | int saved_lock_depth; |
3732 | #endif | 4003 | |
3733 | /* Catch callers which need to be fixed */ | 4004 | /* Catch callers which need to be fixed */ |
3734 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 4005 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3735 | 4006 | ||
@@ -3741,16 +4012,12 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3741 | * clear ->lock_depth so that schedule() doesnt | 4012 | * clear ->lock_depth so that schedule() doesnt |
3742 | * auto-release the semaphore: | 4013 | * auto-release the semaphore: |
3743 | */ | 4014 | */ |
3744 | #ifdef CONFIG_PREEMPT_BKL | ||
3745 | saved_lock_depth = task->lock_depth; | 4015 | saved_lock_depth = task->lock_depth; |
3746 | task->lock_depth = -1; | 4016 | task->lock_depth = -1; |
3747 | #endif | ||
3748 | local_irq_enable(); | 4017 | local_irq_enable(); |
3749 | schedule(); | 4018 | schedule(); |
3750 | local_irq_disable(); | 4019 | local_irq_disable(); |
3751 | #ifdef CONFIG_PREEMPT_BKL | ||
3752 | task->lock_depth = saved_lock_depth; | 4020 | task->lock_depth = saved_lock_depth; |
3753 | #endif | ||
3754 | sub_preempt_count(PREEMPT_ACTIVE); | 4021 | sub_preempt_count(PREEMPT_ACTIVE); |
3755 | 4022 | ||
3756 | /* | 4023 | /* |
@@ -4016,6 +4283,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4016 | unsigned long flags; | 4283 | unsigned long flags; |
4017 | int oldprio, on_rq, running; | 4284 | int oldprio, on_rq, running; |
4018 | struct rq *rq; | 4285 | struct rq *rq; |
4286 | const struct sched_class *prev_class = p->sched_class; | ||
4019 | 4287 | ||
4020 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4288 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4021 | 4289 | ||
@@ -4024,7 +4292,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4024 | 4292 | ||
4025 | oldprio = p->prio; | 4293 | oldprio = p->prio; |
4026 | on_rq = p->se.on_rq; | 4294 | on_rq = p->se.on_rq; |
4027 | running = task_running(rq, p); | 4295 | running = task_current(rq, p); |
4028 | if (on_rq) { | 4296 | if (on_rq) { |
4029 | dequeue_task(rq, p, 0); | 4297 | dequeue_task(rq, p, 0); |
4030 | if (running) | 4298 | if (running) |
@@ -4041,18 +4309,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4041 | if (on_rq) { | 4309 | if (on_rq) { |
4042 | if (running) | 4310 | if (running) |
4043 | p->sched_class->set_curr_task(rq); | 4311 | p->sched_class->set_curr_task(rq); |
4312 | |||
4044 | enqueue_task(rq, p, 0); | 4313 | enqueue_task(rq, p, 0); |
4045 | /* | 4314 | |
4046 | * Reschedule if we are currently running on this runqueue and | 4315 | check_class_changed(rq, p, prev_class, oldprio, running); |
4047 | * our priority decreased, or if we are not currently running on | ||
4048 | * this runqueue and our priority is higher than the current's | ||
4049 | */ | ||
4050 | if (running) { | ||
4051 | if (p->prio > oldprio) | ||
4052 | resched_task(rq->curr); | ||
4053 | } else { | ||
4054 | check_preempt_curr(rq, p); | ||
4055 | } | ||
4056 | } | 4316 | } |
4057 | task_rq_unlock(rq, &flags); | 4317 | task_rq_unlock(rq, &flags); |
4058 | } | 4318 | } |
@@ -4084,10 +4344,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4084 | goto out_unlock; | 4344 | goto out_unlock; |
4085 | } | 4345 | } |
4086 | on_rq = p->se.on_rq; | 4346 | on_rq = p->se.on_rq; |
4087 | if (on_rq) { | 4347 | if (on_rq) |
4088 | dequeue_task(rq, p, 0); | 4348 | dequeue_task(rq, p, 0); |
4089 | dec_load(rq, p); | ||
4090 | } | ||
4091 | 4349 | ||
4092 | p->static_prio = NICE_TO_PRIO(nice); | 4350 | p->static_prio = NICE_TO_PRIO(nice); |
4093 | set_load_weight(p); | 4351 | set_load_weight(p); |
@@ -4097,7 +4355,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4097 | 4355 | ||
4098 | if (on_rq) { | 4356 | if (on_rq) { |
4099 | enqueue_task(rq, p, 0); | 4357 | enqueue_task(rq, p, 0); |
4100 | inc_load(rq, p); | ||
4101 | /* | 4358 | /* |
4102 | * If the task increased its priority or is running and | 4359 | * If the task increased its priority or is running and |
4103 | * lowered its priority, then reschedule its CPU: | 4360 | * lowered its priority, then reschedule its CPU: |
@@ -4255,6 +4512,7 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
4255 | { | 4512 | { |
4256 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4513 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4257 | unsigned long flags; | 4514 | unsigned long flags; |
4515 | const struct sched_class *prev_class = p->sched_class; | ||
4258 | struct rq *rq; | 4516 | struct rq *rq; |
4259 | 4517 | ||
4260 | /* may grab non-irq protected spin_locks */ | 4518 | /* may grab non-irq protected spin_locks */ |
@@ -4335,7 +4593,7 @@ recheck: | |||
4335 | } | 4593 | } |
4336 | update_rq_clock(rq); | 4594 | update_rq_clock(rq); |
4337 | on_rq = p->se.on_rq; | 4595 | on_rq = p->se.on_rq; |
4338 | running = task_running(rq, p); | 4596 | running = task_current(rq, p); |
4339 | if (on_rq) { | 4597 | if (on_rq) { |
4340 | deactivate_task(rq, p, 0); | 4598 | deactivate_task(rq, p, 0); |
4341 | if (running) | 4599 | if (running) |
@@ -4348,18 +4606,10 @@ recheck: | |||
4348 | if (on_rq) { | 4606 | if (on_rq) { |
4349 | if (running) | 4607 | if (running) |
4350 | p->sched_class->set_curr_task(rq); | 4608 | p->sched_class->set_curr_task(rq); |
4609 | |||
4351 | activate_task(rq, p, 0); | 4610 | activate_task(rq, p, 0); |
4352 | /* | 4611 | |
4353 | * Reschedule if we are currently running on this runqueue and | 4612 | check_class_changed(rq, p, prev_class, oldprio, running); |
4354 | * our priority decreased, or if we are not currently running on | ||
4355 | * this runqueue and our priority is higher than the current's | ||
4356 | */ | ||
4357 | if (running) { | ||
4358 | if (p->prio > oldprio) | ||
4359 | resched_task(rq->curr); | ||
4360 | } else { | ||
4361 | check_preempt_curr(rq, p); | ||
4362 | } | ||
4363 | } | 4613 | } |
4364 | __task_rq_unlock(rq); | 4614 | __task_rq_unlock(rq); |
4365 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4615 | spin_unlock_irqrestore(&p->pi_lock, flags); |
@@ -4487,13 +4737,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4487 | struct task_struct *p; | 4737 | struct task_struct *p; |
4488 | int retval; | 4738 | int retval; |
4489 | 4739 | ||
4490 | mutex_lock(&sched_hotcpu_mutex); | 4740 | get_online_cpus(); |
4491 | read_lock(&tasklist_lock); | 4741 | read_lock(&tasklist_lock); |
4492 | 4742 | ||
4493 | p = find_process_by_pid(pid); | 4743 | p = find_process_by_pid(pid); |
4494 | if (!p) { | 4744 | if (!p) { |
4495 | read_unlock(&tasklist_lock); | 4745 | read_unlock(&tasklist_lock); |
4496 | mutex_unlock(&sched_hotcpu_mutex); | 4746 | put_online_cpus(); |
4497 | return -ESRCH; | 4747 | return -ESRCH; |
4498 | } | 4748 | } |
4499 | 4749 | ||
@@ -4533,7 +4783,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4533 | } | 4783 | } |
4534 | out_unlock: | 4784 | out_unlock: |
4535 | put_task_struct(p); | 4785 | put_task_struct(p); |
4536 | mutex_unlock(&sched_hotcpu_mutex); | 4786 | put_online_cpus(); |
4537 | return retval; | 4787 | return retval; |
4538 | } | 4788 | } |
4539 | 4789 | ||
@@ -4590,7 +4840,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4590 | struct task_struct *p; | 4840 | struct task_struct *p; |
4591 | int retval; | 4841 | int retval; |
4592 | 4842 | ||
4593 | mutex_lock(&sched_hotcpu_mutex); | 4843 | get_online_cpus(); |
4594 | read_lock(&tasklist_lock); | 4844 | read_lock(&tasklist_lock); |
4595 | 4845 | ||
4596 | retval = -ESRCH; | 4846 | retval = -ESRCH; |
@@ -4606,7 +4856,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4606 | 4856 | ||
4607 | out_unlock: | 4857 | out_unlock: |
4608 | read_unlock(&tasklist_lock); | 4858 | read_unlock(&tasklist_lock); |
4609 | mutex_unlock(&sched_hotcpu_mutex); | 4859 | put_online_cpus(); |
4610 | 4860 | ||
4611 | return retval; | 4861 | return retval; |
4612 | } | 4862 | } |
@@ -4680,7 +4930,8 @@ static void __cond_resched(void) | |||
4680 | } while (need_resched()); | 4930 | } while (need_resched()); |
4681 | } | 4931 | } |
4682 | 4932 | ||
4683 | int __sched cond_resched(void) | 4933 | #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) |
4934 | int __sched _cond_resched(void) | ||
4684 | { | 4935 | { |
4685 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4936 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
4686 | system_state == SYSTEM_RUNNING) { | 4937 | system_state == SYSTEM_RUNNING) { |
@@ -4689,7 +4940,8 @@ int __sched cond_resched(void) | |||
4689 | } | 4940 | } |
4690 | return 0; | 4941 | return 0; |
4691 | } | 4942 | } |
4692 | EXPORT_SYMBOL(cond_resched); | 4943 | EXPORT_SYMBOL(_cond_resched); |
4944 | #endif | ||
4693 | 4945 | ||
4694 | /* | 4946 | /* |
4695 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4947 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
@@ -4701,19 +4953,15 @@ EXPORT_SYMBOL(cond_resched); | |||
4701 | */ | 4953 | */ |
4702 | int cond_resched_lock(spinlock_t *lock) | 4954 | int cond_resched_lock(spinlock_t *lock) |
4703 | { | 4955 | { |
4956 | int resched = need_resched() && system_state == SYSTEM_RUNNING; | ||
4704 | int ret = 0; | 4957 | int ret = 0; |
4705 | 4958 | ||
4706 | if (need_lockbreak(lock)) { | 4959 | if (spin_needbreak(lock) || resched) { |
4707 | spin_unlock(lock); | 4960 | spin_unlock(lock); |
4708 | cpu_relax(); | 4961 | if (resched && need_resched()) |
4709 | ret = 1; | 4962 | __cond_resched(); |
4710 | spin_lock(lock); | 4963 | else |
4711 | } | 4964 | cpu_relax(); |
4712 | if (need_resched() && system_state == SYSTEM_RUNNING) { | ||
4713 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
4714 | _raw_spin_unlock(lock); | ||
4715 | preempt_enable_no_resched(); | ||
4716 | __cond_resched(); | ||
4717 | ret = 1; | 4965 | ret = 1; |
4718 | spin_lock(lock); | 4966 | spin_lock(lock); |
4719 | } | 4967 | } |
@@ -4887,7 +5135,7 @@ out_unlock: | |||
4887 | 5135 | ||
4888 | static const char stat_nam[] = "RSDTtZX"; | 5136 | static const char stat_nam[] = "RSDTtZX"; |
4889 | 5137 | ||
4890 | static void show_task(struct task_struct *p) | 5138 | void sched_show_task(struct task_struct *p) |
4891 | { | 5139 | { |
4892 | unsigned long free = 0; | 5140 | unsigned long free = 0; |
4893 | unsigned state; | 5141 | unsigned state; |
@@ -4915,10 +5163,9 @@ static void show_task(struct task_struct *p) | |||
4915 | } | 5163 | } |
4916 | #endif | 5164 | #endif |
4917 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 5165 | printk(KERN_CONT "%5lu %5d %6d\n", free, |
4918 | task_pid_nr(p), task_pid_nr(p->parent)); | 5166 | task_pid_nr(p), task_pid_nr(p->real_parent)); |
4919 | 5167 | ||
4920 | if (state != TASK_RUNNING) | 5168 | show_stack(p, NULL); |
4921 | show_stack(p, NULL); | ||
4922 | } | 5169 | } |
4923 | 5170 | ||
4924 | void show_state_filter(unsigned long state_filter) | 5171 | void show_state_filter(unsigned long state_filter) |
@@ -4940,7 +5187,7 @@ void show_state_filter(unsigned long state_filter) | |||
4940 | */ | 5187 | */ |
4941 | touch_nmi_watchdog(); | 5188 | touch_nmi_watchdog(); |
4942 | if (!state_filter || (p->state & state_filter)) | 5189 | if (!state_filter || (p->state & state_filter)) |
4943 | show_task(p); | 5190 | sched_show_task(p); |
4944 | } while_each_thread(g, p); | 5191 | } while_each_thread(g, p); |
4945 | 5192 | ||
4946 | touch_all_softlockup_watchdogs(); | 5193 | touch_all_softlockup_watchdogs(); |
@@ -4989,11 +5236,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4989 | spin_unlock_irqrestore(&rq->lock, flags); | 5236 | spin_unlock_irqrestore(&rq->lock, flags); |
4990 | 5237 | ||
4991 | /* Set the preempt count _outside_ the spinlocks! */ | 5238 | /* Set the preempt count _outside_ the spinlocks! */ |
4992 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | ||
4993 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
4994 | #else | ||
4995 | task_thread_info(idle)->preempt_count = 0; | 5239 | task_thread_info(idle)->preempt_count = 0; |
4996 | #endif | 5240 | |
4997 | /* | 5241 | /* |
4998 | * The idle tasks have their own, simple scheduling class: | 5242 | * The idle tasks have their own, simple scheduling class: |
4999 | */ | 5243 | */ |
@@ -5074,7 +5318,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
5074 | goto out; | 5318 | goto out; |
5075 | } | 5319 | } |
5076 | 5320 | ||
5077 | p->cpus_allowed = new_mask; | 5321 | if (p->sched_class->set_cpus_allowed) |
5322 | p->sched_class->set_cpus_allowed(p, &new_mask); | ||
5323 | else { | ||
5324 | p->cpus_allowed = new_mask; | ||
5325 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | ||
5326 | } | ||
5327 | |||
5078 | /* Can the task run on the task's current CPU? If so, we're done */ | 5328 | /* Can the task run on the task's current CPU? If so, we're done */ |
5079 | if (cpu_isset(task_cpu(p), new_mask)) | 5329 | if (cpu_isset(task_cpu(p), new_mask)) |
5080 | goto out; | 5330 | goto out; |
@@ -5566,9 +5816,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5566 | struct rq *rq; | 5816 | struct rq *rq; |
5567 | 5817 | ||
5568 | switch (action) { | 5818 | switch (action) { |
5569 | case CPU_LOCK_ACQUIRE: | ||
5570 | mutex_lock(&sched_hotcpu_mutex); | ||
5571 | break; | ||
5572 | 5819 | ||
5573 | case CPU_UP_PREPARE: | 5820 | case CPU_UP_PREPARE: |
5574 | case CPU_UP_PREPARE_FROZEN: | 5821 | case CPU_UP_PREPARE_FROZEN: |
@@ -5587,6 +5834,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5587 | case CPU_ONLINE_FROZEN: | 5834 | case CPU_ONLINE_FROZEN: |
5588 | /* Strictly unnecessary, as first user will wake it. */ | 5835 | /* Strictly unnecessary, as first user will wake it. */ |
5589 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5836 | wake_up_process(cpu_rq(cpu)->migration_thread); |
5837 | |||
5838 | /* Update our root-domain */ | ||
5839 | rq = cpu_rq(cpu); | ||
5840 | spin_lock_irqsave(&rq->lock, flags); | ||
5841 | if (rq->rd) { | ||
5842 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5843 | cpu_set(cpu, rq->rd->online); | ||
5844 | } | ||
5845 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5590 | break; | 5846 | break; |
5591 | 5847 | ||
5592 | #ifdef CONFIG_HOTPLUG_CPU | 5848 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -5637,10 +5893,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5637 | } | 5893 | } |
5638 | spin_unlock_irq(&rq->lock); | 5894 | spin_unlock_irq(&rq->lock); |
5639 | break; | 5895 | break; |
5640 | #endif | 5896 | |
5641 | case CPU_LOCK_RELEASE: | 5897 | case CPU_DOWN_PREPARE: |
5642 | mutex_unlock(&sched_hotcpu_mutex); | 5898 | /* Update our root-domain */ |
5899 | rq = cpu_rq(cpu); | ||
5900 | spin_lock_irqsave(&rq->lock, flags); | ||
5901 | if (rq->rd) { | ||
5902 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5903 | cpu_clear(cpu, rq->rd->online); | ||
5904 | } | ||
5905 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5643 | break; | 5906 | break; |
5907 | #endif | ||
5644 | } | 5908 | } |
5645 | return NOTIFY_OK; | 5909 | return NOTIFY_OK; |
5646 | } | 5910 | } |
@@ -5828,11 +6092,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5828 | return 1; | 6092 | return 1; |
5829 | } | 6093 | } |
5830 | 6094 | ||
6095 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
6096 | { | ||
6097 | unsigned long flags; | ||
6098 | const struct sched_class *class; | ||
6099 | |||
6100 | spin_lock_irqsave(&rq->lock, flags); | ||
6101 | |||
6102 | if (rq->rd) { | ||
6103 | struct root_domain *old_rd = rq->rd; | ||
6104 | |||
6105 | for (class = sched_class_highest; class; class = class->next) { | ||
6106 | if (class->leave_domain) | ||
6107 | class->leave_domain(rq); | ||
6108 | } | ||
6109 | |||
6110 | cpu_clear(rq->cpu, old_rd->span); | ||
6111 | cpu_clear(rq->cpu, old_rd->online); | ||
6112 | |||
6113 | if (atomic_dec_and_test(&old_rd->refcount)) | ||
6114 | kfree(old_rd); | ||
6115 | } | ||
6116 | |||
6117 | atomic_inc(&rd->refcount); | ||
6118 | rq->rd = rd; | ||
6119 | |||
6120 | cpu_set(rq->cpu, rd->span); | ||
6121 | if (cpu_isset(rq->cpu, cpu_online_map)) | ||
6122 | cpu_set(rq->cpu, rd->online); | ||
6123 | |||
6124 | for (class = sched_class_highest; class; class = class->next) { | ||
6125 | if (class->join_domain) | ||
6126 | class->join_domain(rq); | ||
6127 | } | ||
6128 | |||
6129 | spin_unlock_irqrestore(&rq->lock, flags); | ||
6130 | } | ||
6131 | |||
6132 | static void init_rootdomain(struct root_domain *rd) | ||
6133 | { | ||
6134 | memset(rd, 0, sizeof(*rd)); | ||
6135 | |||
6136 | cpus_clear(rd->span); | ||
6137 | cpus_clear(rd->online); | ||
6138 | } | ||
6139 | |||
6140 | static void init_defrootdomain(void) | ||
6141 | { | ||
6142 | init_rootdomain(&def_root_domain); | ||
6143 | atomic_set(&def_root_domain.refcount, 1); | ||
6144 | } | ||
6145 | |||
6146 | static struct root_domain *alloc_rootdomain(void) | ||
6147 | { | ||
6148 | struct root_domain *rd; | ||
6149 | |||
6150 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
6151 | if (!rd) | ||
6152 | return NULL; | ||
6153 | |||
6154 | init_rootdomain(rd); | ||
6155 | |||
6156 | return rd; | ||
6157 | } | ||
6158 | |||
5831 | /* | 6159 | /* |
5832 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6160 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
5833 | * hold the hotplug lock. | 6161 | * hold the hotplug lock. |
5834 | */ | 6162 | */ |
5835 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 6163 | static void |
6164 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
5836 | { | 6165 | { |
5837 | struct rq *rq = cpu_rq(cpu); | 6166 | struct rq *rq = cpu_rq(cpu); |
5838 | struct sched_domain *tmp; | 6167 | struct sched_domain *tmp; |
@@ -5857,6 +6186,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
5857 | 6186 | ||
5858 | sched_domain_debug(sd, cpu); | 6187 | sched_domain_debug(sd, cpu); |
5859 | 6188 | ||
6189 | rq_attach_root(rq, rd); | ||
5860 | rcu_assign_pointer(rq->sd, sd); | 6190 | rcu_assign_pointer(rq->sd, sd); |
5861 | } | 6191 | } |
5862 | 6192 | ||
@@ -6225,6 +6555,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6225 | static int build_sched_domains(const cpumask_t *cpu_map) | 6555 | static int build_sched_domains(const cpumask_t *cpu_map) |
6226 | { | 6556 | { |
6227 | int i; | 6557 | int i; |
6558 | struct root_domain *rd; | ||
6228 | #ifdef CONFIG_NUMA | 6559 | #ifdef CONFIG_NUMA |
6229 | struct sched_group **sched_group_nodes = NULL; | 6560 | struct sched_group **sched_group_nodes = NULL; |
6230 | int sd_allnodes = 0; | 6561 | int sd_allnodes = 0; |
@@ -6241,6 +6572,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6241 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6572 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
6242 | #endif | 6573 | #endif |
6243 | 6574 | ||
6575 | rd = alloc_rootdomain(); | ||
6576 | if (!rd) { | ||
6577 | printk(KERN_WARNING "Cannot alloc root domain\n"); | ||
6578 | return -ENOMEM; | ||
6579 | } | ||
6580 | |||
6244 | /* | 6581 | /* |
6245 | * Set up domains for cpus specified by the cpu_map. | 6582 | * Set up domains for cpus specified by the cpu_map. |
6246 | */ | 6583 | */ |
@@ -6457,7 +6794,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6457 | #else | 6794 | #else |
6458 | sd = &per_cpu(phys_domains, i); | 6795 | sd = &per_cpu(phys_domains, i); |
6459 | #endif | 6796 | #endif |
6460 | cpu_attach_domain(sd, i); | 6797 | cpu_attach_domain(sd, rd, i); |
6461 | } | 6798 | } |
6462 | 6799 | ||
6463 | return 0; | 6800 | return 0; |
@@ -6515,7 +6852,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6515 | unregister_sched_domain_sysctl(); | 6852 | unregister_sched_domain_sysctl(); |
6516 | 6853 | ||
6517 | for_each_cpu_mask(i, *cpu_map) | 6854 | for_each_cpu_mask(i, *cpu_map) |
6518 | cpu_attach_domain(NULL, i); | 6855 | cpu_attach_domain(NULL, &def_root_domain, i); |
6519 | synchronize_sched(); | 6856 | synchronize_sched(); |
6520 | arch_destroy_sched_domains(cpu_map); | 6857 | arch_destroy_sched_domains(cpu_map); |
6521 | } | 6858 | } |
@@ -6545,6 +6882,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
6545 | { | 6882 | { |
6546 | int i, j; | 6883 | int i, j; |
6547 | 6884 | ||
6885 | lock_doms_cur(); | ||
6886 | |||
6548 | /* always unregister in case we don't destroy any domains */ | 6887 | /* always unregister in case we don't destroy any domains */ |
6549 | unregister_sched_domain_sysctl(); | 6888 | unregister_sched_domain_sysctl(); |
6550 | 6889 | ||
@@ -6585,6 +6924,8 @@ match2: | |||
6585 | ndoms_cur = ndoms_new; | 6924 | ndoms_cur = ndoms_new; |
6586 | 6925 | ||
6587 | register_sched_domain_sysctl(); | 6926 | register_sched_domain_sysctl(); |
6927 | |||
6928 | unlock_doms_cur(); | ||
6588 | } | 6929 | } |
6589 | 6930 | ||
6590 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6931 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
@@ -6592,10 +6933,10 @@ static int arch_reinit_sched_domains(void) | |||
6592 | { | 6933 | { |
6593 | int err; | 6934 | int err; |
6594 | 6935 | ||
6595 | mutex_lock(&sched_hotcpu_mutex); | 6936 | get_online_cpus(); |
6596 | detach_destroy_domains(&cpu_online_map); | 6937 | detach_destroy_domains(&cpu_online_map); |
6597 | err = arch_init_sched_domains(&cpu_online_map); | 6938 | err = arch_init_sched_domains(&cpu_online_map); |
6598 | mutex_unlock(&sched_hotcpu_mutex); | 6939 | put_online_cpus(); |
6599 | 6940 | ||
6600 | return err; | 6941 | return err; |
6601 | } | 6942 | } |
@@ -6706,12 +7047,12 @@ void __init sched_init_smp(void) | |||
6706 | { | 7047 | { |
6707 | cpumask_t non_isolated_cpus; | 7048 | cpumask_t non_isolated_cpus; |
6708 | 7049 | ||
6709 | mutex_lock(&sched_hotcpu_mutex); | 7050 | get_online_cpus(); |
6710 | arch_init_sched_domains(&cpu_online_map); | 7051 | arch_init_sched_domains(&cpu_online_map); |
6711 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7052 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
6712 | if (cpus_empty(non_isolated_cpus)) | 7053 | if (cpus_empty(non_isolated_cpus)) |
6713 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7054 | cpu_set(smp_processor_id(), non_isolated_cpus); |
6714 | mutex_unlock(&sched_hotcpu_mutex); | 7055 | put_online_cpus(); |
6715 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7056 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6716 | hotcpu_notifier(update_sched_domains, 0); | 7057 | hotcpu_notifier(update_sched_domains, 0); |
6717 | 7058 | ||
@@ -6719,6 +7060,21 @@ void __init sched_init_smp(void) | |||
6719 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7060 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6720 | BUG(); | 7061 | BUG(); |
6721 | sched_init_granularity(); | 7062 | sched_init_granularity(); |
7063 | |||
7064 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7065 | if (nr_cpu_ids == 1) | ||
7066 | return; | ||
7067 | |||
7068 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | ||
7069 | "group_balance"); | ||
7070 | if (!IS_ERR(lb_monitor_task)) { | ||
7071 | lb_monitor_task->flags |= PF_NOFREEZE; | ||
7072 | wake_up_process(lb_monitor_task); | ||
7073 | } else { | ||
7074 | printk(KERN_ERR "Could not create load balance monitor thread" | ||
7075 | "(error = %ld) \n", PTR_ERR(lb_monitor_task)); | ||
7076 | } | ||
7077 | #endif | ||
6722 | } | 7078 | } |
6723 | #else | 7079 | #else |
6724 | void __init sched_init_smp(void) | 7080 | void __init sched_init_smp(void) |
@@ -6743,13 +7099,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
6743 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7099 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
6744 | } | 7100 | } |
6745 | 7101 | ||
7102 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
7103 | { | ||
7104 | struct rt_prio_array *array; | ||
7105 | int i; | ||
7106 | |||
7107 | array = &rt_rq->active; | ||
7108 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
7109 | INIT_LIST_HEAD(array->queue + i); | ||
7110 | __clear_bit(i, array->bitmap); | ||
7111 | } | ||
7112 | /* delimiter for bitsearch: */ | ||
7113 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
7114 | |||
7115 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
7116 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
7117 | #endif | ||
7118 | #ifdef CONFIG_SMP | ||
7119 | rt_rq->rt_nr_migratory = 0; | ||
7120 | rt_rq->overloaded = 0; | ||
7121 | #endif | ||
7122 | |||
7123 | rt_rq->rt_time = 0; | ||
7124 | rt_rq->rt_throttled = 0; | ||
7125 | |||
7126 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7127 | rt_rq->rq = rq; | ||
7128 | #endif | ||
7129 | } | ||
7130 | |||
7131 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7132 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | ||
7133 | struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
7134 | int cpu, int add) | ||
7135 | { | ||
7136 | tg->cfs_rq[cpu] = cfs_rq; | ||
7137 | init_cfs_rq(cfs_rq, rq); | ||
7138 | cfs_rq->tg = tg; | ||
7139 | if (add) | ||
7140 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7141 | |||
7142 | tg->se[cpu] = se; | ||
7143 | se->cfs_rq = &rq->cfs; | ||
7144 | se->my_q = cfs_rq; | ||
7145 | se->load.weight = tg->shares; | ||
7146 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | ||
7147 | se->parent = NULL; | ||
7148 | } | ||
7149 | |||
7150 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | ||
7151 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | ||
7152 | int cpu, int add) | ||
7153 | { | ||
7154 | tg->rt_rq[cpu] = rt_rq; | ||
7155 | init_rt_rq(rt_rq, rq); | ||
7156 | rt_rq->tg = tg; | ||
7157 | rt_rq->rt_se = rt_se; | ||
7158 | if (add) | ||
7159 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7160 | |||
7161 | tg->rt_se[cpu] = rt_se; | ||
7162 | rt_se->rt_rq = &rq->rt; | ||
7163 | rt_se->my_q = rt_rq; | ||
7164 | rt_se->parent = NULL; | ||
7165 | INIT_LIST_HEAD(&rt_se->run_list); | ||
7166 | } | ||
7167 | #endif | ||
7168 | |||
6746 | void __init sched_init(void) | 7169 | void __init sched_init(void) |
6747 | { | 7170 | { |
6748 | int highest_cpu = 0; | 7171 | int highest_cpu = 0; |
6749 | int i, j; | 7172 | int i, j; |
6750 | 7173 | ||
7174 | #ifdef CONFIG_SMP | ||
7175 | init_defrootdomain(); | ||
7176 | #endif | ||
7177 | |||
7178 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7179 | list_add(&init_task_group.list, &task_groups); | ||
7180 | #endif | ||
7181 | |||
6751 | for_each_possible_cpu(i) { | 7182 | for_each_possible_cpu(i) { |
6752 | struct rt_prio_array *array; | ||
6753 | struct rq *rq; | 7183 | struct rq *rq; |
6754 | 7184 | ||
6755 | rq = cpu_rq(i); | 7185 | rq = cpu_rq(i); |
@@ -6758,52 +7188,39 @@ void __init sched_init(void) | |||
6758 | rq->nr_running = 0; | 7188 | rq->nr_running = 0; |
6759 | rq->clock = 1; | 7189 | rq->clock = 1; |
6760 | init_cfs_rq(&rq->cfs, rq); | 7190 | init_cfs_rq(&rq->cfs, rq); |
7191 | init_rt_rq(&rq->rt, rq); | ||
6761 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7192 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6762 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
6763 | { | ||
6764 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
6765 | struct sched_entity *se = | ||
6766 | &per_cpu(init_sched_entity, i); | ||
6767 | |||
6768 | init_cfs_rq_p[i] = cfs_rq; | ||
6769 | init_cfs_rq(cfs_rq, rq); | ||
6770 | cfs_rq->tg = &init_task_group; | ||
6771 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
6772 | &rq->leaf_cfs_rq_list); | ||
6773 | |||
6774 | init_sched_entity_p[i] = se; | ||
6775 | se->cfs_rq = &rq->cfs; | ||
6776 | se->my_q = cfs_rq; | ||
6777 | se->load.weight = init_task_group_load; | ||
6778 | se->load.inv_weight = | ||
6779 | div64_64(1ULL<<32, init_task_group_load); | ||
6780 | se->parent = NULL; | ||
6781 | } | ||
6782 | init_task_group.shares = init_task_group_load; | 7193 | init_task_group.shares = init_task_group_load; |
6783 | spin_lock_init(&init_task_group.lock); | 7194 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7195 | init_tg_cfs_entry(rq, &init_task_group, | ||
7196 | &per_cpu(init_cfs_rq, i), | ||
7197 | &per_cpu(init_sched_entity, i), i, 1); | ||
7198 | |||
7199 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ | ||
7200 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | ||
7201 | init_tg_rt_entry(rq, &init_task_group, | ||
7202 | &per_cpu(init_rt_rq, i), | ||
7203 | &per_cpu(init_sched_rt_entity, i), i, 1); | ||
6784 | #endif | 7204 | #endif |
7205 | rq->rt_period_expire = 0; | ||
7206 | rq->rt_throttled = 0; | ||
6785 | 7207 | ||
6786 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7208 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
6787 | rq->cpu_load[j] = 0; | 7209 | rq->cpu_load[j] = 0; |
6788 | #ifdef CONFIG_SMP | 7210 | #ifdef CONFIG_SMP |
6789 | rq->sd = NULL; | 7211 | rq->sd = NULL; |
7212 | rq->rd = NULL; | ||
6790 | rq->active_balance = 0; | 7213 | rq->active_balance = 0; |
6791 | rq->next_balance = jiffies; | 7214 | rq->next_balance = jiffies; |
6792 | rq->push_cpu = 0; | 7215 | rq->push_cpu = 0; |
6793 | rq->cpu = i; | 7216 | rq->cpu = i; |
6794 | rq->migration_thread = NULL; | 7217 | rq->migration_thread = NULL; |
6795 | INIT_LIST_HEAD(&rq->migration_queue); | 7218 | INIT_LIST_HEAD(&rq->migration_queue); |
7219 | rq_attach_root(rq, &def_root_domain); | ||
6796 | #endif | 7220 | #endif |
7221 | init_rq_hrtick(rq); | ||
6797 | atomic_set(&rq->nr_iowait, 0); | 7222 | atomic_set(&rq->nr_iowait, 0); |
6798 | |||
6799 | array = &rq->rt.active; | ||
6800 | for (j = 0; j < MAX_RT_PRIO; j++) { | ||
6801 | INIT_LIST_HEAD(array->queue + j); | ||
6802 | __clear_bit(j, array->bitmap); | ||
6803 | } | ||
6804 | highest_cpu = i; | 7223 | highest_cpu = i; |
6805 | /* delimiter for bitsearch: */ | ||
6806 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
6807 | } | 7224 | } |
6808 | 7225 | ||
6809 | set_load_weight(&init_task); | 7226 | set_load_weight(&init_task); |
@@ -6972,12 +7389,187 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
6972 | 7389 | ||
6973 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7390 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6974 | 7391 | ||
7392 | #ifdef CONFIG_SMP | ||
7393 | /* | ||
7394 | * distribute shares of all task groups among their schedulable entities, | ||
7395 | * to reflect load distribution across cpus. | ||
7396 | */ | ||
7397 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | ||
7398 | { | ||
7399 | struct cfs_rq *cfs_rq; | ||
7400 | struct rq *rq = cpu_rq(this_cpu); | ||
7401 | cpumask_t sdspan = sd->span; | ||
7402 | int balanced = 1; | ||
7403 | |||
7404 | /* Walk thr' all the task groups that we have */ | ||
7405 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
7406 | int i; | ||
7407 | unsigned long total_load = 0, total_shares; | ||
7408 | struct task_group *tg = cfs_rq->tg; | ||
7409 | |||
7410 | /* Gather total task load of this group across cpus */ | ||
7411 | for_each_cpu_mask(i, sdspan) | ||
7412 | total_load += tg->cfs_rq[i]->load.weight; | ||
7413 | |||
7414 | /* Nothing to do if this group has no load */ | ||
7415 | if (!total_load) | ||
7416 | continue; | ||
7417 | |||
7418 | /* | ||
7419 | * tg->shares represents the number of cpu shares the task group | ||
7420 | * is eligible to hold on a single cpu. On N cpus, it is | ||
7421 | * eligible to hold (N * tg->shares) number of cpu shares. | ||
7422 | */ | ||
7423 | total_shares = tg->shares * cpus_weight(sdspan); | ||
7424 | |||
7425 | /* | ||
7426 | * redistribute total_shares across cpus as per the task load | ||
7427 | * distribution. | ||
7428 | */ | ||
7429 | for_each_cpu_mask(i, sdspan) { | ||
7430 | unsigned long local_load, local_shares; | ||
7431 | |||
7432 | local_load = tg->cfs_rq[i]->load.weight; | ||
7433 | local_shares = (local_load * total_shares) / total_load; | ||
7434 | if (!local_shares) | ||
7435 | local_shares = MIN_GROUP_SHARES; | ||
7436 | if (local_shares == tg->se[i]->load.weight) | ||
7437 | continue; | ||
7438 | |||
7439 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7440 | set_se_shares(tg->se[i], local_shares); | ||
7441 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7442 | balanced = 0; | ||
7443 | } | ||
7444 | } | ||
7445 | |||
7446 | return balanced; | ||
7447 | } | ||
7448 | |||
7449 | /* | ||
7450 | * How frequently should we rebalance_shares() across cpus? | ||
7451 | * | ||
7452 | * The more frequently we rebalance shares, the more accurate is the fairness | ||
7453 | * of cpu bandwidth distribution between task groups. However higher frequency | ||
7454 | * also implies increased scheduling overhead. | ||
7455 | * | ||
7456 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | ||
7457 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7458 | * | ||
7459 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | ||
7460 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7461 | * | ||
7462 | * These settings allows for the appropriate trade-off between accuracy of | ||
7463 | * fairness and the associated overhead. | ||
7464 | * | ||
7465 | */ | ||
7466 | |||
7467 | /* default: 8ms, units: milliseconds */ | ||
7468 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | ||
7469 | |||
7470 | /* default: 128ms, units: milliseconds */ | ||
7471 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | ||
7472 | |||
7473 | /* kernel thread that runs rebalance_shares() periodically */ | ||
7474 | static int load_balance_monitor(void *unused) | ||
7475 | { | ||
7476 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | ||
7477 | struct sched_param schedparm; | ||
7478 | int ret; | ||
7479 | |||
7480 | /* | ||
7481 | * We don't want this thread's execution to be limited by the shares | ||
7482 | * assigned to default group (init_task_group). Hence make it run | ||
7483 | * as a SCHED_RR RT task at the lowest priority. | ||
7484 | */ | ||
7485 | schedparm.sched_priority = 1; | ||
7486 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | ||
7487 | if (ret) | ||
7488 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | ||
7489 | " monitor thread (error = %d) \n", ret); | ||
7490 | |||
7491 | while (!kthread_should_stop()) { | ||
7492 | int i, cpu, balanced = 1; | ||
7493 | |||
7494 | /* Prevent cpus going down or coming up */ | ||
7495 | get_online_cpus(); | ||
7496 | /* lockout changes to doms_cur[] array */ | ||
7497 | lock_doms_cur(); | ||
7498 | /* | ||
7499 | * Enter a rcu read-side critical section to safely walk rq->sd | ||
7500 | * chain on various cpus and to walk task group list | ||
7501 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | ||
7502 | */ | ||
7503 | rcu_read_lock(); | ||
7504 | |||
7505 | for (i = 0; i < ndoms_cur; i++) { | ||
7506 | cpumask_t cpumap = doms_cur[i]; | ||
7507 | struct sched_domain *sd = NULL, *sd_prev = NULL; | ||
7508 | |||
7509 | cpu = first_cpu(cpumap); | ||
7510 | |||
7511 | /* Find the highest domain at which to balance shares */ | ||
7512 | for_each_domain(cpu, sd) { | ||
7513 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
7514 | continue; | ||
7515 | sd_prev = sd; | ||
7516 | } | ||
7517 | |||
7518 | sd = sd_prev; | ||
7519 | /* sd == NULL? No load balance reqd in this domain */ | ||
7520 | if (!sd) | ||
7521 | continue; | ||
7522 | |||
7523 | balanced &= rebalance_shares(sd, cpu); | ||
7524 | } | ||
7525 | |||
7526 | rcu_read_unlock(); | ||
7527 | |||
7528 | unlock_doms_cur(); | ||
7529 | put_online_cpus(); | ||
7530 | |||
7531 | if (!balanced) | ||
7532 | timeout = sysctl_sched_min_bal_int_shares; | ||
7533 | else if (timeout < sysctl_sched_max_bal_int_shares) | ||
7534 | timeout *= 2; | ||
7535 | |||
7536 | msleep_interruptible(timeout); | ||
7537 | } | ||
7538 | |||
7539 | return 0; | ||
7540 | } | ||
7541 | #endif /* CONFIG_SMP */ | ||
7542 | |||
7543 | static void free_sched_group(struct task_group *tg) | ||
7544 | { | ||
7545 | int i; | ||
7546 | |||
7547 | for_each_possible_cpu(i) { | ||
7548 | if (tg->cfs_rq) | ||
7549 | kfree(tg->cfs_rq[i]); | ||
7550 | if (tg->se) | ||
7551 | kfree(tg->se[i]); | ||
7552 | if (tg->rt_rq) | ||
7553 | kfree(tg->rt_rq[i]); | ||
7554 | if (tg->rt_se) | ||
7555 | kfree(tg->rt_se[i]); | ||
7556 | } | ||
7557 | |||
7558 | kfree(tg->cfs_rq); | ||
7559 | kfree(tg->se); | ||
7560 | kfree(tg->rt_rq); | ||
7561 | kfree(tg->rt_se); | ||
7562 | kfree(tg); | ||
7563 | } | ||
7564 | |||
6975 | /* allocate runqueue etc for a new task group */ | 7565 | /* allocate runqueue etc for a new task group */ |
6976 | struct task_group *sched_create_group(void) | 7566 | struct task_group *sched_create_group(void) |
6977 | { | 7567 | { |
6978 | struct task_group *tg; | 7568 | struct task_group *tg; |
6979 | struct cfs_rq *cfs_rq; | 7569 | struct cfs_rq *cfs_rq; |
6980 | struct sched_entity *se; | 7570 | struct sched_entity *se; |
7571 | struct rt_rq *rt_rq; | ||
7572 | struct sched_rt_entity *rt_se; | ||
6981 | struct rq *rq; | 7573 | struct rq *rq; |
6982 | int i; | 7574 | int i; |
6983 | 7575 | ||
@@ -6991,97 +7583,89 @@ struct task_group *sched_create_group(void) | |||
6991 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 7583 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); |
6992 | if (!tg->se) | 7584 | if (!tg->se) |
6993 | goto err; | 7585 | goto err; |
7586 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | ||
7587 | if (!tg->rt_rq) | ||
7588 | goto err; | ||
7589 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | ||
7590 | if (!tg->rt_se) | ||
7591 | goto err; | ||
7592 | |||
7593 | tg->shares = NICE_0_LOAD; | ||
7594 | tg->rt_ratio = 0; /* XXX */ | ||
6994 | 7595 | ||
6995 | for_each_possible_cpu(i) { | 7596 | for_each_possible_cpu(i) { |
6996 | rq = cpu_rq(i); | 7597 | rq = cpu_rq(i); |
6997 | 7598 | ||
6998 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | 7599 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), |
6999 | cpu_to_node(i)); | 7600 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7000 | if (!cfs_rq) | 7601 | if (!cfs_rq) |
7001 | goto err; | 7602 | goto err; |
7002 | 7603 | ||
7003 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | 7604 | se = kmalloc_node(sizeof(struct sched_entity), |
7004 | cpu_to_node(i)); | 7605 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7005 | if (!se) | 7606 | if (!se) |
7006 | goto err; | 7607 | goto err; |
7007 | 7608 | ||
7008 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | 7609 | rt_rq = kmalloc_node(sizeof(struct rt_rq), |
7009 | memset(se, 0, sizeof(struct sched_entity)); | 7610 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7611 | if (!rt_rq) | ||
7612 | goto err; | ||
7010 | 7613 | ||
7011 | tg->cfs_rq[i] = cfs_rq; | 7614 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), |
7012 | init_cfs_rq(cfs_rq, rq); | 7615 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7013 | cfs_rq->tg = tg; | 7616 | if (!rt_se) |
7617 | goto err; | ||
7014 | 7618 | ||
7015 | tg->se[i] = se; | 7619 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); |
7016 | se->cfs_rq = &rq->cfs; | 7620 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); |
7017 | se->my_q = cfs_rq; | ||
7018 | se->load.weight = NICE_0_LOAD; | ||
7019 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
7020 | se->parent = NULL; | ||
7021 | } | 7621 | } |
7022 | 7622 | ||
7623 | lock_task_group_list(); | ||
7023 | for_each_possible_cpu(i) { | 7624 | for_each_possible_cpu(i) { |
7024 | rq = cpu_rq(i); | 7625 | rq = cpu_rq(i); |
7025 | cfs_rq = tg->cfs_rq[i]; | 7626 | cfs_rq = tg->cfs_rq[i]; |
7026 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7627 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7628 | rt_rq = tg->rt_rq[i]; | ||
7629 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7027 | } | 7630 | } |
7028 | 7631 | list_add_rcu(&tg->list, &task_groups); | |
7029 | tg->shares = NICE_0_LOAD; | 7632 | unlock_task_group_list(); |
7030 | spin_lock_init(&tg->lock); | ||
7031 | 7633 | ||
7032 | return tg; | 7634 | return tg; |
7033 | 7635 | ||
7034 | err: | 7636 | err: |
7035 | for_each_possible_cpu(i) { | 7637 | free_sched_group(tg); |
7036 | if (tg->cfs_rq) | ||
7037 | kfree(tg->cfs_rq[i]); | ||
7038 | if (tg->se) | ||
7039 | kfree(tg->se[i]); | ||
7040 | } | ||
7041 | kfree(tg->cfs_rq); | ||
7042 | kfree(tg->se); | ||
7043 | kfree(tg); | ||
7044 | |||
7045 | return ERR_PTR(-ENOMEM); | 7638 | return ERR_PTR(-ENOMEM); |
7046 | } | 7639 | } |
7047 | 7640 | ||
7048 | /* rcu callback to free various structures associated with a task group */ | 7641 | /* rcu callback to free various structures associated with a task group */ |
7049 | static void free_sched_group(struct rcu_head *rhp) | 7642 | static void free_sched_group_rcu(struct rcu_head *rhp) |
7050 | { | 7643 | { |
7051 | struct task_group *tg = container_of(rhp, struct task_group, rcu); | ||
7052 | struct cfs_rq *cfs_rq; | ||
7053 | struct sched_entity *se; | ||
7054 | int i; | ||
7055 | |||
7056 | /* now it should be safe to free those cfs_rqs */ | 7644 | /* now it should be safe to free those cfs_rqs */ |
7057 | for_each_possible_cpu(i) { | 7645 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
7058 | cfs_rq = tg->cfs_rq[i]; | ||
7059 | kfree(cfs_rq); | ||
7060 | |||
7061 | se = tg->se[i]; | ||
7062 | kfree(se); | ||
7063 | } | ||
7064 | |||
7065 | kfree(tg->cfs_rq); | ||
7066 | kfree(tg->se); | ||
7067 | kfree(tg); | ||
7068 | } | 7646 | } |
7069 | 7647 | ||
7070 | /* Destroy runqueue etc associated with a task group */ | 7648 | /* Destroy runqueue etc associated with a task group */ |
7071 | void sched_destroy_group(struct task_group *tg) | 7649 | void sched_destroy_group(struct task_group *tg) |
7072 | { | 7650 | { |
7073 | struct cfs_rq *cfs_rq = NULL; | 7651 | struct cfs_rq *cfs_rq = NULL; |
7652 | struct rt_rq *rt_rq = NULL; | ||
7074 | int i; | 7653 | int i; |
7075 | 7654 | ||
7655 | lock_task_group_list(); | ||
7076 | for_each_possible_cpu(i) { | 7656 | for_each_possible_cpu(i) { |
7077 | cfs_rq = tg->cfs_rq[i]; | 7657 | cfs_rq = tg->cfs_rq[i]; |
7078 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7658 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
7659 | rt_rq = tg->rt_rq[i]; | ||
7660 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
7079 | } | 7661 | } |
7662 | list_del_rcu(&tg->list); | ||
7663 | unlock_task_group_list(); | ||
7080 | 7664 | ||
7081 | BUG_ON(!cfs_rq); | 7665 | BUG_ON(!cfs_rq); |
7082 | 7666 | ||
7083 | /* wait for possible concurrent references to cfs_rqs complete */ | 7667 | /* wait for possible concurrent references to cfs_rqs complete */ |
7084 | call_rcu(&tg->rcu, free_sched_group); | 7668 | call_rcu(&tg->rcu, free_sched_group_rcu); |
7085 | } | 7669 | } |
7086 | 7670 | ||
7087 | /* change task's runqueue when it moves between groups. | 7671 | /* change task's runqueue when it moves between groups. |
@@ -7097,14 +7681,9 @@ void sched_move_task(struct task_struct *tsk) | |||
7097 | 7681 | ||
7098 | rq = task_rq_lock(tsk, &flags); | 7682 | rq = task_rq_lock(tsk, &flags); |
7099 | 7683 | ||
7100 | if (tsk->sched_class != &fair_sched_class) { | ||
7101 | set_task_cfs_rq(tsk, task_cpu(tsk)); | ||
7102 | goto done; | ||
7103 | } | ||
7104 | |||
7105 | update_rq_clock(rq); | 7684 | update_rq_clock(rq); |
7106 | 7685 | ||
7107 | running = task_running(rq, tsk); | 7686 | running = task_current(rq, tsk); |
7108 | on_rq = tsk->se.on_rq; | 7687 | on_rq = tsk->se.on_rq; |
7109 | 7688 | ||
7110 | if (on_rq) { | 7689 | if (on_rq) { |
@@ -7113,7 +7692,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7113 | tsk->sched_class->put_prev_task(rq, tsk); | 7692 | tsk->sched_class->put_prev_task(rq, tsk); |
7114 | } | 7693 | } |
7115 | 7694 | ||
7116 | set_task_cfs_rq(tsk, task_cpu(tsk)); | 7695 | set_task_rq(tsk, task_cpu(tsk)); |
7117 | 7696 | ||
7118 | if (on_rq) { | 7697 | if (on_rq) { |
7119 | if (unlikely(running)) | 7698 | if (unlikely(running)) |
@@ -7121,45 +7700,82 @@ void sched_move_task(struct task_struct *tsk) | |||
7121 | enqueue_task(rq, tsk, 0); | 7700 | enqueue_task(rq, tsk, 0); |
7122 | } | 7701 | } |
7123 | 7702 | ||
7124 | done: | ||
7125 | task_rq_unlock(rq, &flags); | 7703 | task_rq_unlock(rq, &flags); |
7126 | } | 7704 | } |
7127 | 7705 | ||
7706 | /* rq->lock to be locked by caller */ | ||
7128 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7707 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
7129 | { | 7708 | { |
7130 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7709 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7131 | struct rq *rq = cfs_rq->rq; | 7710 | struct rq *rq = cfs_rq->rq; |
7132 | int on_rq; | 7711 | int on_rq; |
7133 | 7712 | ||
7134 | spin_lock_irq(&rq->lock); | 7713 | if (!shares) |
7714 | shares = MIN_GROUP_SHARES; | ||
7135 | 7715 | ||
7136 | on_rq = se->on_rq; | 7716 | on_rq = se->on_rq; |
7137 | if (on_rq) | 7717 | if (on_rq) { |
7138 | dequeue_entity(cfs_rq, se, 0); | 7718 | dequeue_entity(cfs_rq, se, 0); |
7719 | dec_cpu_load(rq, se->load.weight); | ||
7720 | } | ||
7139 | 7721 | ||
7140 | se->load.weight = shares; | 7722 | se->load.weight = shares; |
7141 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7723 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
7142 | 7724 | ||
7143 | if (on_rq) | 7725 | if (on_rq) { |
7144 | enqueue_entity(cfs_rq, se, 0); | 7726 | enqueue_entity(cfs_rq, se, 0); |
7145 | 7727 | inc_cpu_load(rq, se->load.weight); | |
7146 | spin_unlock_irq(&rq->lock); | 7728 | } |
7147 | } | 7729 | } |
7148 | 7730 | ||
7149 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 7731 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
7150 | { | 7732 | { |
7151 | int i; | 7733 | int i; |
7734 | struct cfs_rq *cfs_rq; | ||
7735 | struct rq *rq; | ||
7152 | 7736 | ||
7153 | spin_lock(&tg->lock); | 7737 | lock_task_group_list(); |
7154 | if (tg->shares == shares) | 7738 | if (tg->shares == shares) |
7155 | goto done; | 7739 | goto done; |
7156 | 7740 | ||
7741 | if (shares < MIN_GROUP_SHARES) | ||
7742 | shares = MIN_GROUP_SHARES; | ||
7743 | |||
7744 | /* | ||
7745 | * Prevent any load balance activity (rebalance_shares, | ||
7746 | * load_balance_fair) from referring to this group first, | ||
7747 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. | ||
7748 | */ | ||
7749 | for_each_possible_cpu(i) { | ||
7750 | cfs_rq = tg->cfs_rq[i]; | ||
7751 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
7752 | } | ||
7753 | |||
7754 | /* wait for any ongoing reference to this group to finish */ | ||
7755 | synchronize_sched(); | ||
7756 | |||
7757 | /* | ||
7758 | * Now we are free to modify the group's share on each cpu | ||
7759 | * w/o tripping rebalance_share or load_balance_fair. | ||
7760 | */ | ||
7157 | tg->shares = shares; | 7761 | tg->shares = shares; |
7158 | for_each_possible_cpu(i) | 7762 | for_each_possible_cpu(i) { |
7763 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7159 | set_se_shares(tg->se[i], shares); | 7764 | set_se_shares(tg->se[i], shares); |
7765 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7766 | } | ||
7160 | 7767 | ||
7768 | /* | ||
7769 | * Enable load balance activity on this group, by inserting it back on | ||
7770 | * each cpu's rq->leaf_cfs_rq_list. | ||
7771 | */ | ||
7772 | for_each_possible_cpu(i) { | ||
7773 | rq = cpu_rq(i); | ||
7774 | cfs_rq = tg->cfs_rq[i]; | ||
7775 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7776 | } | ||
7161 | done: | 7777 | done: |
7162 | spin_unlock(&tg->lock); | 7778 | unlock_task_group_list(); |
7163 | return 0; | 7779 | return 0; |
7164 | } | 7780 | } |
7165 | 7781 | ||
@@ -7168,6 +7784,31 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
7168 | return tg->shares; | 7784 | return tg->shares; |
7169 | } | 7785 | } |
7170 | 7786 | ||
7787 | /* | ||
7788 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio | ||
7789 | */ | ||
7790 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) | ||
7791 | { | ||
7792 | struct task_group *tgi; | ||
7793 | unsigned long total = 0; | ||
7794 | |||
7795 | rcu_read_lock(); | ||
7796 | list_for_each_entry_rcu(tgi, &task_groups, list) | ||
7797 | total += tgi->rt_ratio; | ||
7798 | rcu_read_unlock(); | ||
7799 | |||
7800 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) | ||
7801 | return -EINVAL; | ||
7802 | |||
7803 | tg->rt_ratio = rt_ratio; | ||
7804 | return 0; | ||
7805 | } | ||
7806 | |||
7807 | unsigned long sched_group_rt_ratio(struct task_group *tg) | ||
7808 | { | ||
7809 | return tg->rt_ratio; | ||
7810 | } | ||
7811 | |||
7171 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7812 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7172 | 7813 | ||
7173 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 7814 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -7243,12 +7884,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
7243 | return (u64) tg->shares; | 7884 | return (u64) tg->shares; |
7244 | } | 7885 | } |
7245 | 7886 | ||
7887 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
7888 | u64 rt_ratio_val) | ||
7889 | { | ||
7890 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); | ||
7891 | } | ||
7892 | |||
7893 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
7894 | { | ||
7895 | struct task_group *tg = cgroup_tg(cgrp); | ||
7896 | |||
7897 | return (u64) tg->rt_ratio; | ||
7898 | } | ||
7899 | |||
7246 | static struct cftype cpu_files[] = { | 7900 | static struct cftype cpu_files[] = { |
7247 | { | 7901 | { |
7248 | .name = "shares", | 7902 | .name = "shares", |
7249 | .read_uint = cpu_shares_read_uint, | 7903 | .read_uint = cpu_shares_read_uint, |
7250 | .write_uint = cpu_shares_write_uint, | 7904 | .write_uint = cpu_shares_write_uint, |
7251 | }, | 7905 | }, |
7906 | { | ||
7907 | .name = "rt_ratio", | ||
7908 | .read_uint = cpu_rt_ratio_read_uint, | ||
7909 | .write_uint = cpu_rt_ratio_write_uint, | ||
7910 | }, | ||
7252 | }; | 7911 | }; |
7253 | 7912 | ||
7254 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 7913 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index d30467b47ddd..4b5e24cf2f4a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -31,9 +31,9 @@ | |||
31 | /* | 31 | /* |
32 | * Ease the printing of nsec fields: | 32 | * Ease the printing of nsec fields: |
33 | */ | 33 | */ |
34 | static long long nsec_high(long long nsec) | 34 | static long long nsec_high(unsigned long long nsec) |
35 | { | 35 | { |
36 | if (nsec < 0) { | 36 | if ((long long)nsec < 0) { |
37 | nsec = -nsec; | 37 | nsec = -nsec; |
38 | do_div(nsec, 1000000); | 38 | do_div(nsec, 1000000); |
39 | return -nsec; | 39 | return -nsec; |
@@ -43,9 +43,9 @@ static long long nsec_high(long long nsec) | |||
43 | return nsec; | 43 | return nsec; |
44 | } | 44 | } |
45 | 45 | ||
46 | static unsigned long nsec_low(long long nsec) | 46 | static unsigned long nsec_low(unsigned long long nsec) |
47 | { | 47 | { |
48 | if (nsec < 0) | 48 | if ((long long)nsec < 0) |
49 | nsec = -nsec; | 49 | nsec = -nsec; |
50 | 50 | ||
51 | return do_div(nsec, 1000000); | 51 | return do_div(nsec, 1000000); |
@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
179 | PN(prev_clock_raw); | 179 | PN(prev_clock_raw); |
180 | P(clock_warps); | 180 | P(clock_warps); |
181 | P(clock_overflows); | 181 | P(clock_overflows); |
182 | P(clock_underflows); | ||
182 | P(clock_deep_idle_events); | 183 | P(clock_deep_idle_events); |
183 | PN(clock_max_delta); | 184 | PN(clock_max_delta); |
184 | P(cpu_load[0]); | 185 | P(cpu_load[0]); |
@@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
299 | PN(se.exec_max); | 300 | PN(se.exec_max); |
300 | PN(se.slice_max); | 301 | PN(se.slice_max); |
301 | PN(se.wait_max); | 302 | PN(se.wait_max); |
303 | PN(se.wait_sum); | ||
304 | P(se.wait_count); | ||
302 | P(sched_info.bkl_count); | 305 | P(sched_info.bkl_count); |
303 | P(se.nr_migrations); | 306 | P(se.nr_migrations); |
304 | P(se.nr_migrations_cold); | 307 | P(se.nr_migrations_cold); |
@@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p) | |||
366 | { | 369 | { |
367 | #ifdef CONFIG_SCHEDSTATS | 370 | #ifdef CONFIG_SCHEDSTATS |
368 | p->se.wait_max = 0; | 371 | p->se.wait_max = 0; |
372 | p->se.wait_sum = 0; | ||
373 | p->se.wait_count = 0; | ||
369 | p->se.sleep_max = 0; | 374 | p->se.sleep_max = 0; |
370 | p->se.sum_sleep_runtime = 0; | 375 | p->se.sum_sleep_runtime = 0; |
371 | p->se.block_max = 0; | 376 | p->se.block_max = 0; |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c33f0ceb3de9..6c091d6e159d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -20,6 +20,8 @@ | |||
20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/latencytop.h> | ||
24 | |||
23 | /* | 25 | /* |
24 | * Targeted preemption latency for CPU-bound tasks: | 26 | * Targeted preemption latency for CPU-bound tasks: |
25 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) | 27 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) |
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running) | |||
248 | unsigned long nr_latency = sched_nr_latency; | 250 | unsigned long nr_latency = sched_nr_latency; |
249 | 251 | ||
250 | if (unlikely(nr_running > nr_latency)) { | 252 | if (unlikely(nr_running > nr_latency)) { |
253 | period = sysctl_sched_min_granularity; | ||
251 | period *= nr_running; | 254 | period *= nr_running; |
252 | do_div(period, nr_latency); | ||
253 | } | 255 | } |
254 | 256 | ||
255 | return period; | 257 | return period; |
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
383 | { | 385 | { |
384 | schedstat_set(se->wait_max, max(se->wait_max, | 386 | schedstat_set(se->wait_max, max(se->wait_max, |
385 | rq_of(cfs_rq)->clock - se->wait_start)); | 387 | rq_of(cfs_rq)->clock - se->wait_start)); |
388 | schedstat_set(se->wait_count, se->wait_count + 1); | ||
389 | schedstat_set(se->wait_sum, se->wait_sum + | ||
390 | rq_of(cfs_rq)->clock - se->wait_start); | ||
386 | schedstat_set(se->wait_start, 0); | 391 | schedstat_set(se->wait_start, 0); |
387 | } | 392 | } |
388 | 393 | ||
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
434 | #ifdef CONFIG_SCHEDSTATS | 439 | #ifdef CONFIG_SCHEDSTATS |
435 | if (se->sleep_start) { | 440 | if (se->sleep_start) { |
436 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 441 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
442 | struct task_struct *tsk = task_of(se); | ||
437 | 443 | ||
438 | if ((s64)delta < 0) | 444 | if ((s64)delta < 0) |
439 | delta = 0; | 445 | delta = 0; |
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
443 | 449 | ||
444 | se->sleep_start = 0; | 450 | se->sleep_start = 0; |
445 | se->sum_sleep_runtime += delta; | 451 | se->sum_sleep_runtime += delta; |
452 | |||
453 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
446 | } | 454 | } |
447 | if (se->block_start) { | 455 | if (se->block_start) { |
448 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; | 456 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; |
457 | struct task_struct *tsk = task_of(se); | ||
449 | 458 | ||
450 | if ((s64)delta < 0) | 459 | if ((s64)delta < 0) |
451 | delta = 0; | 460 | delta = 0; |
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
462 | * time that the task spent sleeping: | 471 | * time that the task spent sleeping: |
463 | */ | 472 | */ |
464 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 473 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
465 | struct task_struct *tsk = task_of(se); | ||
466 | 474 | ||
467 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 475 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
468 | delta >> 20); | 476 | delta >> 20); |
469 | } | 477 | } |
478 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
470 | } | 479 | } |
471 | #endif | 480 | #endif |
472 | } | 481 | } |
@@ -511,8 +520,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
511 | 520 | ||
512 | if (!initial) { | 521 | if (!initial) { |
513 | /* sleeps upto a single latency don't count. */ | 522 | /* sleeps upto a single latency don't count. */ |
514 | if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && | 523 | if (sched_feat(NEW_FAIR_SLEEPERS)) |
515 | task_of(se)->policy != SCHED_BATCH) | ||
516 | vruntime -= sysctl_sched_latency; | 524 | vruntime -= sysctl_sched_latency; |
517 | 525 | ||
518 | /* ensure we never gain time by being placed backwards. */ | 526 | /* ensure we never gain time by being placed backwards. */ |
@@ -643,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
643 | cfs_rq->curr = NULL; | 651 | cfs_rq->curr = NULL; |
644 | } | 652 | } |
645 | 653 | ||
646 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 654 | static void |
655 | entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | ||
647 | { | 656 | { |
648 | /* | 657 | /* |
649 | * Update run-time statistics of the 'current'. | 658 | * Update run-time statistics of the 'current'. |
650 | */ | 659 | */ |
651 | update_curr(cfs_rq); | 660 | update_curr(cfs_rq); |
652 | 661 | ||
662 | #ifdef CONFIG_SCHED_HRTICK | ||
663 | /* | ||
664 | * queued ticks are scheduled to match the slice, so don't bother | ||
665 | * validating it and just reschedule. | ||
666 | */ | ||
667 | if (queued) | ||
668 | return resched_task(rq_of(cfs_rq)->curr); | ||
669 | /* | ||
670 | * don't let the period tick interfere with the hrtick preemption | ||
671 | */ | ||
672 | if (!sched_feat(DOUBLE_TICK) && | ||
673 | hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) | ||
674 | return; | ||
675 | #endif | ||
676 | |||
653 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 677 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
654 | check_preempt_tick(cfs_rq, curr); | 678 | check_preempt_tick(cfs_rq, curr); |
655 | } | 679 | } |
@@ -691,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
691 | 715 | ||
692 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 716 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
693 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 717 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
694 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 718 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
695 | 719 | ||
696 | /* Do the two (enqueued) entities belong to the same group ? */ | 720 | /* Do the two (enqueued) entities belong to the same group ? */ |
697 | static inline int | 721 | static inline int |
@@ -708,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
708 | return se->parent; | 732 | return se->parent; |
709 | } | 733 | } |
710 | 734 | ||
735 | #define GROUP_IMBALANCE_PCT 20 | ||
736 | |||
711 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 737 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
712 | 738 | ||
713 | #define for_each_sched_entity(se) \ | 739 | #define for_each_sched_entity(se) \ |
@@ -753,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
753 | 779 | ||
754 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 780 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
755 | 781 | ||
782 | #ifdef CONFIG_SCHED_HRTICK | ||
783 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
784 | { | ||
785 | int requeue = rq->curr == p; | ||
786 | struct sched_entity *se = &p->se; | ||
787 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
788 | |||
789 | WARN_ON(task_rq(p) != rq); | ||
790 | |||
791 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | ||
792 | u64 slice = sched_slice(cfs_rq, se); | ||
793 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | ||
794 | s64 delta = slice - ran; | ||
795 | |||
796 | if (delta < 0) { | ||
797 | if (rq->curr == p) | ||
798 | resched_task(p); | ||
799 | return; | ||
800 | } | ||
801 | |||
802 | /* | ||
803 | * Don't schedule slices shorter than 10000ns, that just | ||
804 | * doesn't make sense. Rely on vruntime for fairness. | ||
805 | */ | ||
806 | if (!requeue) | ||
807 | delta = max(10000LL, delta); | ||
808 | |||
809 | hrtick_start(rq, delta, requeue); | ||
810 | } | ||
811 | } | ||
812 | #else | ||
813 | static inline void | ||
814 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
815 | { | ||
816 | } | ||
817 | #endif | ||
818 | |||
756 | /* | 819 | /* |
757 | * The enqueue_task method is called before nr_running is | 820 | * The enqueue_task method is called before nr_running is |
758 | * increased. Here we update the fair scheduling stats and | 821 | * increased. Here we update the fair scheduling stats and |
@@ -761,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
761 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 824 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) |
762 | { | 825 | { |
763 | struct cfs_rq *cfs_rq; | 826 | struct cfs_rq *cfs_rq; |
764 | struct sched_entity *se = &p->se; | 827 | struct sched_entity *se = &p->se, |
828 | *topse = NULL; /* Highest schedulable entity */ | ||
829 | int incload = 1; | ||
765 | 830 | ||
766 | for_each_sched_entity(se) { | 831 | for_each_sched_entity(se) { |
767 | if (se->on_rq) | 832 | topse = se; |
833 | if (se->on_rq) { | ||
834 | incload = 0; | ||
768 | break; | 835 | break; |
836 | } | ||
769 | cfs_rq = cfs_rq_of(se); | 837 | cfs_rq = cfs_rq_of(se); |
770 | enqueue_entity(cfs_rq, se, wakeup); | 838 | enqueue_entity(cfs_rq, se, wakeup); |
771 | wakeup = 1; | 839 | wakeup = 1; |
772 | } | 840 | } |
841 | /* Increment cpu load if we just enqueued the first task of a group on | ||
842 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
843 | * at the highest grouping level. | ||
844 | */ | ||
845 | if (incload) | ||
846 | inc_cpu_load(rq, topse->load.weight); | ||
847 | |||
848 | hrtick_start_fair(rq, rq->curr); | ||
773 | } | 849 | } |
774 | 850 | ||
775 | /* | 851 | /* |
@@ -780,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
780 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | 856 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) |
781 | { | 857 | { |
782 | struct cfs_rq *cfs_rq; | 858 | struct cfs_rq *cfs_rq; |
783 | struct sched_entity *se = &p->se; | 859 | struct sched_entity *se = &p->se, |
860 | *topse = NULL; /* Highest schedulable entity */ | ||
861 | int decload = 1; | ||
784 | 862 | ||
785 | for_each_sched_entity(se) { | 863 | for_each_sched_entity(se) { |
864 | topse = se; | ||
786 | cfs_rq = cfs_rq_of(se); | 865 | cfs_rq = cfs_rq_of(se); |
787 | dequeue_entity(cfs_rq, se, sleep); | 866 | dequeue_entity(cfs_rq, se, sleep); |
788 | /* Don't dequeue parent if it has other entities besides us */ | 867 | /* Don't dequeue parent if it has other entities besides us */ |
789 | if (cfs_rq->load.weight) | 868 | if (cfs_rq->load.weight) { |
869 | if (parent_entity(se)) | ||
870 | decload = 0; | ||
790 | break; | 871 | break; |
872 | } | ||
791 | sleep = 1; | 873 | sleep = 1; |
792 | } | 874 | } |
875 | /* Decrement cpu load if we just dequeued the last task of a group on | ||
876 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
877 | * at the highest grouping level. | ||
878 | */ | ||
879 | if (decload) | ||
880 | dec_cpu_load(rq, topse->load.weight); | ||
881 | |||
882 | hrtick_start_fair(rq, rq->curr); | ||
793 | } | 883 | } |
794 | 884 | ||
795 | /* | 885 | /* |
@@ -837,6 +927,154 @@ static void yield_task_fair(struct rq *rq) | |||
837 | } | 927 | } |
838 | 928 | ||
839 | /* | 929 | /* |
930 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
931 | * not idle and an idle cpu is available. The span of cpus to | ||
932 | * search starts with cpus closest then further out as needed, | ||
933 | * so we always favor a closer, idle cpu. | ||
934 | * | ||
935 | * Returns the CPU we should wake onto. | ||
936 | */ | ||
937 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
938 | static int wake_idle(int cpu, struct task_struct *p) | ||
939 | { | ||
940 | cpumask_t tmp; | ||
941 | struct sched_domain *sd; | ||
942 | int i; | ||
943 | |||
944 | /* | ||
945 | * If it is idle, then it is the best cpu to run this task. | ||
946 | * | ||
947 | * This cpu is also the best, if it has more than one task already. | ||
948 | * Siblings must be also busy(in most cases) as they didn't already | ||
949 | * pickup the extra load from this cpu and hence we need not check | ||
950 | * sibling runqueue info. This will avoid the checks and cache miss | ||
951 | * penalities associated with that. | ||
952 | */ | ||
953 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
954 | return cpu; | ||
955 | |||
956 | for_each_domain(cpu, sd) { | ||
957 | if (sd->flags & SD_WAKE_IDLE) { | ||
958 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
959 | for_each_cpu_mask(i, tmp) { | ||
960 | if (idle_cpu(i)) { | ||
961 | if (i != task_cpu(p)) { | ||
962 | schedstat_inc(p, | ||
963 | se.nr_wakeups_idle); | ||
964 | } | ||
965 | return i; | ||
966 | } | ||
967 | } | ||
968 | } else { | ||
969 | break; | ||
970 | } | ||
971 | } | ||
972 | return cpu; | ||
973 | } | ||
974 | #else | ||
975 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
976 | { | ||
977 | return cpu; | ||
978 | } | ||
979 | #endif | ||
980 | |||
981 | #ifdef CONFIG_SMP | ||
982 | static int select_task_rq_fair(struct task_struct *p, int sync) | ||
983 | { | ||
984 | int cpu, this_cpu; | ||
985 | struct rq *rq; | ||
986 | struct sched_domain *sd, *this_sd = NULL; | ||
987 | int new_cpu; | ||
988 | |||
989 | cpu = task_cpu(p); | ||
990 | rq = task_rq(p); | ||
991 | this_cpu = smp_processor_id(); | ||
992 | new_cpu = cpu; | ||
993 | |||
994 | if (cpu == this_cpu) | ||
995 | goto out_set_cpu; | ||
996 | |||
997 | for_each_domain(this_cpu, sd) { | ||
998 | if (cpu_isset(cpu, sd->span)) { | ||
999 | this_sd = sd; | ||
1000 | break; | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1005 | goto out_set_cpu; | ||
1006 | |||
1007 | /* | ||
1008 | * Check for affine wakeup and passive balancing possibilities. | ||
1009 | */ | ||
1010 | if (this_sd) { | ||
1011 | int idx = this_sd->wake_idx; | ||
1012 | unsigned int imbalance; | ||
1013 | unsigned long load, this_load; | ||
1014 | |||
1015 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1016 | |||
1017 | load = source_load(cpu, idx); | ||
1018 | this_load = target_load(this_cpu, idx); | ||
1019 | |||
1020 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1021 | |||
1022 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1023 | unsigned long tl = this_load; | ||
1024 | unsigned long tl_per_task; | ||
1025 | |||
1026 | /* | ||
1027 | * Attract cache-cold tasks on sync wakeups: | ||
1028 | */ | ||
1029 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1030 | goto out_set_cpu; | ||
1031 | |||
1032 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1033 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1034 | |||
1035 | /* | ||
1036 | * If sync wakeup then subtract the (maximum possible) | ||
1037 | * effect of the currently running task from the load | ||
1038 | * of the current CPU: | ||
1039 | */ | ||
1040 | if (sync) | ||
1041 | tl -= current->se.load.weight; | ||
1042 | |||
1043 | if ((tl <= load && | ||
1044 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1045 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1046 | /* | ||
1047 | * This domain has SD_WAKE_AFFINE and | ||
1048 | * p is cache cold in this domain, and | ||
1049 | * there is no bad imbalance. | ||
1050 | */ | ||
1051 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1052 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1053 | goto out_set_cpu; | ||
1054 | } | ||
1055 | } | ||
1056 | |||
1057 | /* | ||
1058 | * Start passive balancing when half the imbalance_pct | ||
1059 | * limit is reached. | ||
1060 | */ | ||
1061 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1062 | if (imbalance*this_load <= 100*load) { | ||
1063 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1064 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1065 | goto out_set_cpu; | ||
1066 | } | ||
1067 | } | ||
1068 | } | ||
1069 | |||
1070 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1071 | out_set_cpu: | ||
1072 | return wake_idle(new_cpu, p); | ||
1073 | } | ||
1074 | #endif /* CONFIG_SMP */ | ||
1075 | |||
1076 | |||
1077 | /* | ||
840 | * Preempt the current task with a newly woken task if needed: | 1078 | * Preempt the current task with a newly woken task if needed: |
841 | */ | 1079 | */ |
842 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | 1080 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
@@ -868,7 +1106,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
868 | } | 1106 | } |
869 | 1107 | ||
870 | gran = sysctl_sched_wakeup_granularity; | 1108 | gran = sysctl_sched_wakeup_granularity; |
871 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 1109 | /* |
1110 | * More easily preempt - nice tasks, while not making | ||
1111 | * it harder for + nice tasks. | ||
1112 | */ | ||
1113 | if (unlikely(se->load.weight > NICE_0_LOAD)) | ||
872 | gran = calc_delta_fair(gran, &se->load); | 1114 | gran = calc_delta_fair(gran, &se->load); |
873 | 1115 | ||
874 | if (pse->vruntime + gran < se->vruntime) | 1116 | if (pse->vruntime + gran < se->vruntime) |
@@ -877,6 +1119,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
877 | 1119 | ||
878 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 1120 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
879 | { | 1121 | { |
1122 | struct task_struct *p; | ||
880 | struct cfs_rq *cfs_rq = &rq->cfs; | 1123 | struct cfs_rq *cfs_rq = &rq->cfs; |
881 | struct sched_entity *se; | 1124 | struct sched_entity *se; |
882 | 1125 | ||
@@ -888,7 +1131,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
888 | cfs_rq = group_cfs_rq(se); | 1131 | cfs_rq = group_cfs_rq(se); |
889 | } while (cfs_rq); | 1132 | } while (cfs_rq); |
890 | 1133 | ||
891 | return task_of(se); | 1134 | p = task_of(se); |
1135 | hrtick_start_fair(rq, p); | ||
1136 | |||
1137 | return p; | ||
892 | } | 1138 | } |
893 | 1139 | ||
894 | /* | 1140 | /* |
@@ -945,25 +1191,6 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
945 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1191 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); |
946 | } | 1192 | } |
947 | 1193 | ||
948 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
949 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | ||
950 | { | ||
951 | struct sched_entity *curr; | ||
952 | struct task_struct *p; | ||
953 | |||
954 | if (!cfs_rq->nr_running) | ||
955 | return MAX_PRIO; | ||
956 | |||
957 | curr = cfs_rq->curr; | ||
958 | if (!curr) | ||
959 | curr = __pick_next_entity(cfs_rq); | ||
960 | |||
961 | p = task_of(curr); | ||
962 | |||
963 | return p->prio; | ||
964 | } | ||
965 | #endif | ||
966 | |||
967 | static unsigned long | 1194 | static unsigned long |
968 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1195 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
969 | unsigned long max_load_move, | 1196 | unsigned long max_load_move, |
@@ -973,28 +1200,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
973 | struct cfs_rq *busy_cfs_rq; | 1200 | struct cfs_rq *busy_cfs_rq; |
974 | long rem_load_move = max_load_move; | 1201 | long rem_load_move = max_load_move; |
975 | struct rq_iterator cfs_rq_iterator; | 1202 | struct rq_iterator cfs_rq_iterator; |
1203 | unsigned long load_moved; | ||
976 | 1204 | ||
977 | cfs_rq_iterator.start = load_balance_start_fair; | 1205 | cfs_rq_iterator.start = load_balance_start_fair; |
978 | cfs_rq_iterator.next = load_balance_next_fair; | 1206 | cfs_rq_iterator.next = load_balance_next_fair; |
979 | 1207 | ||
980 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1208 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
981 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1209 | #ifdef CONFIG_FAIR_GROUP_SCHED |
982 | struct cfs_rq *this_cfs_rq; | 1210 | struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; |
983 | long imbalance; | 1211 | unsigned long maxload, task_load, group_weight; |
984 | unsigned long maxload; | 1212 | unsigned long thisload, per_task_load; |
1213 | struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; | ||
1214 | |||
1215 | task_load = busy_cfs_rq->load.weight; | ||
1216 | group_weight = se->load.weight; | ||
985 | 1217 | ||
986 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1218 | /* |
1219 | * 'group_weight' is contributed by tasks of total weight | ||
1220 | * 'task_load'. To move 'rem_load_move' worth of weight only, | ||
1221 | * we need to move a maximum task load of: | ||
1222 | * | ||
1223 | * maxload = (remload / group_weight) * task_load; | ||
1224 | */ | ||
1225 | maxload = (rem_load_move * task_load) / group_weight; | ||
987 | 1226 | ||
988 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1227 | if (!maxload || !task_load) |
989 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | ||
990 | if (imbalance <= 0) | ||
991 | continue; | 1228 | continue; |
992 | 1229 | ||
993 | /* Don't pull more than imbalance/2 */ | 1230 | per_task_load = task_load / busy_cfs_rq->nr_running; |
994 | imbalance /= 2; | 1231 | /* |
995 | maxload = min(rem_load_move, imbalance); | 1232 | * balance_tasks will try to forcibly move atleast one task if |
1233 | * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if | ||
1234 | * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. | ||
1235 | */ | ||
1236 | if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) | ||
1237 | continue; | ||
996 | 1238 | ||
997 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1239 | /* Disable priority-based load balance */ |
1240 | *this_best_prio = 0; | ||
1241 | thisload = this_cfs_rq->load.weight; | ||
998 | #else | 1242 | #else |
999 | # define maxload rem_load_move | 1243 | # define maxload rem_load_move |
1000 | #endif | 1244 | #endif |
@@ -1003,11 +1247,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1003 | * load_balance_[start|next]_fair iterators | 1247 | * load_balance_[start|next]_fair iterators |
1004 | */ | 1248 | */ |
1005 | cfs_rq_iterator.arg = busy_cfs_rq; | 1249 | cfs_rq_iterator.arg = busy_cfs_rq; |
1006 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | 1250 | load_moved = balance_tasks(this_rq, this_cpu, busiest, |
1007 | maxload, sd, idle, all_pinned, | 1251 | maxload, sd, idle, all_pinned, |
1008 | this_best_prio, | 1252 | this_best_prio, |
1009 | &cfs_rq_iterator); | 1253 | &cfs_rq_iterator); |
1010 | 1254 | ||
1255 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1256 | /* | ||
1257 | * load_moved holds the task load that was moved. The | ||
1258 | * effective (group) weight moved would be: | ||
1259 | * load_moved_eff = load_moved/task_load * group_weight; | ||
1260 | */ | ||
1261 | load_moved = (group_weight * load_moved) / task_load; | ||
1262 | |||
1263 | /* Adjust shares on both cpus to reflect load_moved */ | ||
1264 | group_weight -= load_moved; | ||
1265 | set_se_shares(se, group_weight); | ||
1266 | |||
1267 | se = busy_cfs_rq->tg->se[this_cpu]; | ||
1268 | if (!thisload) | ||
1269 | group_weight = load_moved; | ||
1270 | else | ||
1271 | group_weight = se->load.weight + load_moved; | ||
1272 | set_se_shares(se, group_weight); | ||
1273 | #endif | ||
1274 | |||
1275 | rem_load_move -= load_moved; | ||
1276 | |||
1011 | if (rem_load_move <= 0) | 1277 | if (rem_load_move <= 0) |
1012 | break; | 1278 | break; |
1013 | } | 1279 | } |
@@ -1043,14 +1309,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1043 | /* | 1309 | /* |
1044 | * scheduler tick hitting a task of our scheduling class: | 1310 | * scheduler tick hitting a task of our scheduling class: |
1045 | */ | 1311 | */ |
1046 | static void task_tick_fair(struct rq *rq, struct task_struct *curr) | 1312 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
1047 | { | 1313 | { |
1048 | struct cfs_rq *cfs_rq; | 1314 | struct cfs_rq *cfs_rq; |
1049 | struct sched_entity *se = &curr->se; | 1315 | struct sched_entity *se = &curr->se; |
1050 | 1316 | ||
1051 | for_each_sched_entity(se) { | 1317 | for_each_sched_entity(se) { |
1052 | cfs_rq = cfs_rq_of(se); | 1318 | cfs_rq = cfs_rq_of(se); |
1053 | entity_tick(cfs_rq, se); | 1319 | entity_tick(cfs_rq, se, queued); |
1054 | } | 1320 | } |
1055 | } | 1321 | } |
1056 | 1322 | ||
@@ -1088,6 +1354,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1088 | resched_task(rq->curr); | 1354 | resched_task(rq->curr); |
1089 | } | 1355 | } |
1090 | 1356 | ||
1357 | /* | ||
1358 | * Priority of the task has changed. Check to see if we preempt | ||
1359 | * the current task. | ||
1360 | */ | ||
1361 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | ||
1362 | int oldprio, int running) | ||
1363 | { | ||
1364 | /* | ||
1365 | * Reschedule if we are currently running on this runqueue and | ||
1366 | * our priority decreased, or if we are not currently running on | ||
1367 | * this runqueue and our priority is higher than the current's | ||
1368 | */ | ||
1369 | if (running) { | ||
1370 | if (p->prio > oldprio) | ||
1371 | resched_task(rq->curr); | ||
1372 | } else | ||
1373 | check_preempt_curr(rq, p); | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * We switched to the sched_fair class. | ||
1378 | */ | ||
1379 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | ||
1380 | int running) | ||
1381 | { | ||
1382 | /* | ||
1383 | * We were most likely switched from sched_rt, so | ||
1384 | * kick off the schedule if running, otherwise just see | ||
1385 | * if we can still preempt the current task. | ||
1386 | */ | ||
1387 | if (running) | ||
1388 | resched_task(rq->curr); | ||
1389 | else | ||
1390 | check_preempt_curr(rq, p); | ||
1391 | } | ||
1392 | |||
1091 | /* Account for a task changing its policy or group. | 1393 | /* Account for a task changing its policy or group. |
1092 | * | 1394 | * |
1093 | * This routine is mostly called to set cfs_rq->curr field when a task | 1395 | * This routine is mostly called to set cfs_rq->curr field when a task |
@@ -1109,6 +1411,9 @@ static const struct sched_class fair_sched_class = { | |||
1109 | .enqueue_task = enqueue_task_fair, | 1411 | .enqueue_task = enqueue_task_fair, |
1110 | .dequeue_task = dequeue_task_fair, | 1412 | .dequeue_task = dequeue_task_fair, |
1111 | .yield_task = yield_task_fair, | 1413 | .yield_task = yield_task_fair, |
1414 | #ifdef CONFIG_SMP | ||
1415 | .select_task_rq = select_task_rq_fair, | ||
1416 | #endif /* CONFIG_SMP */ | ||
1112 | 1417 | ||
1113 | .check_preempt_curr = check_preempt_wakeup, | 1418 | .check_preempt_curr = check_preempt_wakeup, |
1114 | 1419 | ||
@@ -1123,6 +1428,9 @@ static const struct sched_class fair_sched_class = { | |||
1123 | .set_curr_task = set_curr_task_fair, | 1428 | .set_curr_task = set_curr_task_fair, |
1124 | .task_tick = task_tick_fair, | 1429 | .task_tick = task_tick_fair, |
1125 | .task_new = task_new_fair, | 1430 | .task_new = task_new_fair, |
1431 | |||
1432 | .prio_changed = prio_changed_fair, | ||
1433 | .switched_to = switched_to_fair, | ||
1126 | }; | 1434 | }; |
1127 | 1435 | ||
1128 | #ifdef CONFIG_SCHED_DEBUG | 1436 | #ifdef CONFIG_SCHED_DEBUG |
@@ -1133,7 +1441,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
1133 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1441 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1134 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | 1442 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); |
1135 | #endif | 1443 | #endif |
1444 | rcu_read_lock(); | ||
1136 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1445 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1137 | print_cfs_rq(m, cpu, cfs_rq); | 1446 | print_cfs_rq(m, cpu, cfs_rq); |
1447 | rcu_read_unlock(); | ||
1138 | } | 1448 | } |
1139 | #endif | 1449 | #endif |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index bf9c25c15b8b..2bcafa375633 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -5,6 +5,12 @@ | |||
5 | * handled in sched_fair.c) | 5 | * handled in sched_fair.c) |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #ifdef CONFIG_SMP | ||
9 | static int select_task_rq_idle(struct task_struct *p, int sync) | ||
10 | { | ||
11 | return task_cpu(p); /* IDLE tasks as never migrated */ | ||
12 | } | ||
13 | #endif /* CONFIG_SMP */ | ||
8 | /* | 14 | /* |
9 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
10 | */ | 16 | */ |
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
55 | } | 61 | } |
56 | #endif | 62 | #endif |
57 | 63 | ||
58 | static void task_tick_idle(struct rq *rq, struct task_struct *curr) | 64 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
59 | { | 65 | { |
60 | } | 66 | } |
61 | 67 | ||
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq) | |||
63 | { | 69 | { |
64 | } | 70 | } |
65 | 71 | ||
72 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | ||
73 | int running) | ||
74 | { | ||
75 | /* Can this actually happen?? */ | ||
76 | if (running) | ||
77 | resched_task(rq->curr); | ||
78 | else | ||
79 | check_preempt_curr(rq, p); | ||
80 | } | ||
81 | |||
82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | ||
83 | int oldprio, int running) | ||
84 | { | ||
85 | /* This can happen for hot plug CPUS */ | ||
86 | |||
87 | /* | ||
88 | * Reschedule if we are currently running on this runqueue and | ||
89 | * our priority decreased, or if we are not currently running on | ||
90 | * this runqueue and our priority is higher than the current's | ||
91 | */ | ||
92 | if (running) { | ||
93 | if (p->prio > oldprio) | ||
94 | resched_task(rq->curr); | ||
95 | } else | ||
96 | check_preempt_curr(rq, p); | ||
97 | } | ||
98 | |||
66 | /* | 99 | /* |
67 | * Simple, special scheduling class for the per-CPU idle tasks: | 100 | * Simple, special scheduling class for the per-CPU idle tasks: |
68 | */ | 101 | */ |
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = { | |||
72 | 105 | ||
73 | /* dequeue is not valid, we print a debug message there: */ | 106 | /* dequeue is not valid, we print a debug message there: */ |
74 | .dequeue_task = dequeue_task_idle, | 107 | .dequeue_task = dequeue_task_idle, |
108 | #ifdef CONFIG_SMP | ||
109 | .select_task_rq = select_task_rq_idle, | ||
110 | #endif /* CONFIG_SMP */ | ||
75 | 111 | ||
76 | .check_preempt_curr = check_preempt_curr_idle, | 112 | .check_preempt_curr = check_preempt_curr_idle, |
77 | 113 | ||
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = { | |||
85 | 121 | ||
86 | .set_curr_task = set_curr_task_idle, | 122 | .set_curr_task = set_curr_task_idle, |
87 | .task_tick = task_tick_idle, | 123 | .task_tick = task_tick_idle, |
124 | |||
125 | .prio_changed = prio_changed_idle, | ||
126 | .switched_to = switched_to_idle, | ||
127 | |||
88 | /* no .task_new for idle tasks */ | 128 | /* no .task_new for idle tasks */ |
89 | }; | 129 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ee9c8b6529e9..274b40d7bef2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -3,6 +3,217 @@ | |||
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #ifdef CONFIG_SMP | ||
7 | |||
8 | static inline int rt_overloaded(struct rq *rq) | ||
9 | { | ||
10 | return atomic_read(&rq->rd->rto_count); | ||
11 | } | ||
12 | |||
13 | static inline void rt_set_overload(struct rq *rq) | ||
14 | { | ||
15 | cpu_set(rq->cpu, rq->rd->rto_mask); | ||
16 | /* | ||
17 | * Make sure the mask is visible before we set | ||
18 | * the overload count. That is checked to determine | ||
19 | * if we should look at the mask. It would be a shame | ||
20 | * if we looked at the mask, but the mask was not | ||
21 | * updated yet. | ||
22 | */ | ||
23 | wmb(); | ||
24 | atomic_inc(&rq->rd->rto_count); | ||
25 | } | ||
26 | |||
27 | static inline void rt_clear_overload(struct rq *rq) | ||
28 | { | ||
29 | /* the order here really doesn't matter */ | ||
30 | atomic_dec(&rq->rd->rto_count); | ||
31 | cpu_clear(rq->cpu, rq->rd->rto_mask); | ||
32 | } | ||
33 | |||
34 | static void update_rt_migration(struct rq *rq) | ||
35 | { | ||
36 | if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { | ||
37 | if (!rq->rt.overloaded) { | ||
38 | rt_set_overload(rq); | ||
39 | rq->rt.overloaded = 1; | ||
40 | } | ||
41 | } else if (rq->rt.overloaded) { | ||
42 | rt_clear_overload(rq); | ||
43 | rq->rt.overloaded = 0; | ||
44 | } | ||
45 | } | ||
46 | #endif /* CONFIG_SMP */ | ||
47 | |||
48 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | ||
49 | { | ||
50 | return container_of(rt_se, struct task_struct, rt); | ||
51 | } | ||
52 | |||
53 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | ||
54 | { | ||
55 | return !list_empty(&rt_se->run_list); | ||
56 | } | ||
57 | |||
58 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
59 | |||
60 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
61 | { | ||
62 | if (!rt_rq->tg) | ||
63 | return SCHED_RT_FRAC; | ||
64 | |||
65 | return rt_rq->tg->rt_ratio; | ||
66 | } | ||
67 | |||
68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
69 | list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | ||
70 | |||
71 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
72 | { | ||
73 | return rt_rq->rq; | ||
74 | } | ||
75 | |||
76 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
77 | { | ||
78 | return rt_se->rt_rq; | ||
79 | } | ||
80 | |||
81 | #define for_each_sched_rt_entity(rt_se) \ | ||
82 | for (; rt_se; rt_se = rt_se->parent) | ||
83 | |||
84 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
85 | { | ||
86 | return rt_se->my_q; | ||
87 | } | ||
88 | |||
89 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se); | ||
90 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | ||
91 | |||
92 | static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
93 | { | ||
94 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
95 | |||
96 | if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { | ||
97 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | ||
98 | |||
99 | enqueue_rt_entity(rt_se); | ||
100 | if (rt_rq->highest_prio < curr->prio) | ||
101 | resched_task(curr); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
106 | { | ||
107 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
108 | |||
109 | if (rt_se && on_rt_rq(rt_se)) | ||
110 | dequeue_rt_entity(rt_se); | ||
111 | } | ||
112 | |||
113 | #else | ||
114 | |||
115 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
116 | { | ||
117 | return sysctl_sched_rt_ratio; | ||
118 | } | ||
119 | |||
120 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
121 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
122 | |||
123 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
124 | { | ||
125 | return container_of(rt_rq, struct rq, rt); | ||
126 | } | ||
127 | |||
128 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
129 | { | ||
130 | struct task_struct *p = rt_task_of(rt_se); | ||
131 | struct rq *rq = task_rq(p); | ||
132 | |||
133 | return &rq->rt; | ||
134 | } | ||
135 | |||
136 | #define for_each_sched_rt_entity(rt_se) \ | ||
137 | for (; rt_se; rt_se = NULL) | ||
138 | |||
139 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
140 | { | ||
141 | return NULL; | ||
142 | } | ||
143 | |||
144 | static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
145 | { | ||
146 | } | ||
147 | |||
148 | static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
149 | { | ||
150 | } | ||
151 | |||
152 | #endif | ||
153 | |||
154 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | ||
155 | { | ||
156 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
157 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | ||
158 | |||
159 | if (rt_rq) | ||
160 | return rt_rq->highest_prio; | ||
161 | #endif | ||
162 | |||
163 | return rt_task_of(rt_se)->prio; | ||
164 | } | ||
165 | |||
166 | static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) | ||
167 | { | ||
168 | unsigned int rt_ratio = sched_rt_ratio(rt_rq); | ||
169 | u64 period, ratio; | ||
170 | |||
171 | if (rt_ratio == SCHED_RT_FRAC) | ||
172 | return 0; | ||
173 | |||
174 | if (rt_rq->rt_throttled) | ||
175 | return 1; | ||
176 | |||
177 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
178 | ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
179 | |||
180 | if (rt_rq->rt_time > ratio) { | ||
181 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
182 | |||
183 | rq->rt_throttled = 1; | ||
184 | rt_rq->rt_throttled = 1; | ||
185 | |||
186 | sched_rt_ratio_dequeue(rt_rq); | ||
187 | return 1; | ||
188 | } | ||
189 | |||
190 | return 0; | ||
191 | } | ||
192 | |||
193 | static void update_sched_rt_period(struct rq *rq) | ||
194 | { | ||
195 | struct rt_rq *rt_rq; | ||
196 | u64 period; | ||
197 | |||
198 | while (rq->clock > rq->rt_period_expire) { | ||
199 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
200 | rq->rt_period_expire += period; | ||
201 | |||
202 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
203 | unsigned long rt_ratio = sched_rt_ratio(rt_rq); | ||
204 | u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
205 | |||
206 | rt_rq->rt_time -= min(rt_rq->rt_time, ratio); | ||
207 | if (rt_rq->rt_throttled) { | ||
208 | rt_rq->rt_throttled = 0; | ||
209 | sched_rt_ratio_enqueue(rt_rq); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | rq->rt_throttled = 0; | ||
214 | } | ||
215 | } | ||
216 | |||
6 | /* | 217 | /* |
7 | * Update the current task's runtime statistics. Skip current tasks that | 218 | * Update the current task's runtime statistics. Skip current tasks that |
8 | * are not in our scheduling class. | 219 | * are not in our scheduling class. |
@@ -10,6 +221,8 @@ | |||
10 | static void update_curr_rt(struct rq *rq) | 221 | static void update_curr_rt(struct rq *rq) |
11 | { | 222 | { |
12 | struct task_struct *curr = rq->curr; | 223 | struct task_struct *curr = rq->curr; |
224 | struct sched_rt_entity *rt_se = &curr->rt; | ||
225 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
13 | u64 delta_exec; | 226 | u64 delta_exec; |
14 | 227 | ||
15 | if (!task_has_rt_policy(curr)) | 228 | if (!task_has_rt_policy(curr)) |
@@ -24,47 +237,228 @@ static void update_curr_rt(struct rq *rq) | |||
24 | curr->se.sum_exec_runtime += delta_exec; | 237 | curr->se.sum_exec_runtime += delta_exec; |
25 | curr->se.exec_start = rq->clock; | 238 | curr->se.exec_start = rq->clock; |
26 | cpuacct_charge(curr, delta_exec); | 239 | cpuacct_charge(curr, delta_exec); |
240 | |||
241 | rt_rq->rt_time += delta_exec; | ||
242 | /* | ||
243 | * might make it a tad more accurate: | ||
244 | * | ||
245 | * update_sched_rt_period(rq); | ||
246 | */ | ||
247 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
248 | resched_task(curr); | ||
27 | } | 249 | } |
28 | 250 | ||
29 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | 251 | static inline |
252 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
253 | { | ||
254 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
255 | rt_rq->rt_nr_running++; | ||
256 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
257 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) | ||
258 | rt_rq->highest_prio = rt_se_prio(rt_se); | ||
259 | #endif | ||
260 | #ifdef CONFIG_SMP | ||
261 | if (rt_se->nr_cpus_allowed > 1) { | ||
262 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
263 | rq->rt.rt_nr_migratory++; | ||
264 | } | ||
265 | |||
266 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
267 | #endif | ||
268 | } | ||
269 | |||
270 | static inline | ||
271 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
272 | { | ||
273 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
274 | WARN_ON(!rt_rq->rt_nr_running); | ||
275 | rt_rq->rt_nr_running--; | ||
276 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
277 | if (rt_rq->rt_nr_running) { | ||
278 | struct rt_prio_array *array; | ||
279 | |||
280 | WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); | ||
281 | if (rt_se_prio(rt_se) == rt_rq->highest_prio) { | ||
282 | /* recalculate */ | ||
283 | array = &rt_rq->active; | ||
284 | rt_rq->highest_prio = | ||
285 | sched_find_first_bit(array->bitmap); | ||
286 | } /* otherwise leave rq->highest prio alone */ | ||
287 | } else | ||
288 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
289 | #endif | ||
290 | #ifdef CONFIG_SMP | ||
291 | if (rt_se->nr_cpus_allowed > 1) { | ||
292 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
293 | rq->rt.rt_nr_migratory--; | ||
294 | } | ||
295 | |||
296 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
297 | #endif /* CONFIG_SMP */ | ||
298 | } | ||
299 | |||
300 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se) | ||
301 | { | ||
302 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
303 | struct rt_prio_array *array = &rt_rq->active; | ||
304 | struct rt_rq *group_rq = group_rt_rq(rt_se); | ||
305 | |||
306 | if (group_rq && group_rq->rt_throttled) | ||
307 | return; | ||
308 | |||
309 | list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
310 | __set_bit(rt_se_prio(rt_se), array->bitmap); | ||
311 | |||
312 | inc_rt_tasks(rt_se, rt_rq); | ||
313 | } | ||
314 | |||
315 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | ||
316 | { | ||
317 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
318 | struct rt_prio_array *array = &rt_rq->active; | ||
319 | |||
320 | list_del_init(&rt_se->run_list); | ||
321 | if (list_empty(array->queue + rt_se_prio(rt_se))) | ||
322 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | ||
323 | |||
324 | dec_rt_tasks(rt_se, rt_rq); | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * Because the prio of an upper entry depends on the lower | ||
329 | * entries, we must remove entries top - down. | ||
330 | * | ||
331 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
332 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
333 | */ | ||
334 | static void dequeue_rt_stack(struct task_struct *p) | ||
30 | { | 335 | { |
31 | struct rt_prio_array *array = &rq->rt.active; | 336 | struct sched_rt_entity *rt_se, *top_se; |
32 | 337 | ||
33 | list_add_tail(&p->run_list, array->queue + p->prio); | 338 | /* |
34 | __set_bit(p->prio, array->bitmap); | 339 | * dequeue all, top - down. |
340 | */ | ||
341 | do { | ||
342 | rt_se = &p->rt; | ||
343 | top_se = NULL; | ||
344 | for_each_sched_rt_entity(rt_se) { | ||
345 | if (on_rt_rq(rt_se)) | ||
346 | top_se = rt_se; | ||
347 | } | ||
348 | if (top_se) | ||
349 | dequeue_rt_entity(top_se); | ||
350 | } while (top_se); | ||
35 | } | 351 | } |
36 | 352 | ||
37 | /* | 353 | /* |
38 | * Adding/removing a task to/from a priority array: | 354 | * Adding/removing a task to/from a priority array: |
39 | */ | 355 | */ |
356 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | ||
357 | { | ||
358 | struct sched_rt_entity *rt_se = &p->rt; | ||
359 | |||
360 | if (wakeup) | ||
361 | rt_se->timeout = 0; | ||
362 | |||
363 | dequeue_rt_stack(p); | ||
364 | |||
365 | /* | ||
366 | * enqueue everybody, bottom - up. | ||
367 | */ | ||
368 | for_each_sched_rt_entity(rt_se) | ||
369 | enqueue_rt_entity(rt_se); | ||
370 | |||
371 | inc_cpu_load(rq, p->se.load.weight); | ||
372 | } | ||
373 | |||
40 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 374 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
41 | { | 375 | { |
42 | struct rt_prio_array *array = &rq->rt.active; | 376 | struct sched_rt_entity *rt_se = &p->rt; |
377 | struct rt_rq *rt_rq; | ||
43 | 378 | ||
44 | update_curr_rt(rq); | 379 | update_curr_rt(rq); |
45 | 380 | ||
46 | list_del(&p->run_list); | 381 | dequeue_rt_stack(p); |
47 | if (list_empty(array->queue + p->prio)) | 382 | |
48 | __clear_bit(p->prio, array->bitmap); | 383 | /* |
384 | * re-enqueue all non-empty rt_rq entities. | ||
385 | */ | ||
386 | for_each_sched_rt_entity(rt_se) { | ||
387 | rt_rq = group_rt_rq(rt_se); | ||
388 | if (rt_rq && rt_rq->rt_nr_running) | ||
389 | enqueue_rt_entity(rt_se); | ||
390 | } | ||
391 | |||
392 | dec_cpu_load(rq, p->se.load.weight); | ||
49 | } | 393 | } |
50 | 394 | ||
51 | /* | 395 | /* |
52 | * Put task to the end of the run list without the overhead of dequeue | 396 | * Put task to the end of the run list without the overhead of dequeue |
53 | * followed by enqueue. | 397 | * followed by enqueue. |
54 | */ | 398 | */ |
399 | static | ||
400 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) | ||
401 | { | ||
402 | struct rt_prio_array *array = &rt_rq->active; | ||
403 | |||
404 | list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
405 | } | ||
406 | |||
55 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | 407 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) |
56 | { | 408 | { |
57 | struct rt_prio_array *array = &rq->rt.active; | 409 | struct sched_rt_entity *rt_se = &p->rt; |
410 | struct rt_rq *rt_rq; | ||
58 | 411 | ||
59 | list_move_tail(&p->run_list, array->queue + p->prio); | 412 | for_each_sched_rt_entity(rt_se) { |
413 | rt_rq = rt_rq_of_se(rt_se); | ||
414 | requeue_rt_entity(rt_rq, rt_se); | ||
415 | } | ||
60 | } | 416 | } |
61 | 417 | ||
62 | static void | 418 | static void yield_task_rt(struct rq *rq) |
63 | yield_task_rt(struct rq *rq) | ||
64 | { | 419 | { |
65 | requeue_task_rt(rq, rq->curr); | 420 | requeue_task_rt(rq, rq->curr); |
66 | } | 421 | } |
67 | 422 | ||
423 | #ifdef CONFIG_SMP | ||
424 | static int find_lowest_rq(struct task_struct *task); | ||
425 | |||
426 | static int select_task_rq_rt(struct task_struct *p, int sync) | ||
427 | { | ||
428 | struct rq *rq = task_rq(p); | ||
429 | |||
430 | /* | ||
431 | * If the current task is an RT task, then | ||
432 | * try to see if we can wake this RT task up on another | ||
433 | * runqueue. Otherwise simply start this RT task | ||
434 | * on its current runqueue. | ||
435 | * | ||
436 | * We want to avoid overloading runqueues. Even if | ||
437 | * the RT task is of higher priority than the current RT task. | ||
438 | * RT tasks behave differently than other tasks. If | ||
439 | * one gets preempted, we try to push it off to another queue. | ||
440 | * So trying to keep a preempting RT task on the same | ||
441 | * cache hot CPU will force the running RT task to | ||
442 | * a cold CPU. So we waste all the cache for the lower | ||
443 | * RT task in hopes of saving some of a RT task | ||
444 | * that is just being woken and probably will have | ||
445 | * cold cache anyway. | ||
446 | */ | ||
447 | if (unlikely(rt_task(rq->curr)) && | ||
448 | (p->rt.nr_cpus_allowed > 1)) { | ||
449 | int cpu = find_lowest_rq(p); | ||
450 | |||
451 | return (cpu == -1) ? task_cpu(p) : cpu; | ||
452 | } | ||
453 | |||
454 | /* | ||
455 | * Otherwise, just let it ride on the affined RQ and the | ||
456 | * post-schedule router will push the preempted task away | ||
457 | */ | ||
458 | return task_cpu(p); | ||
459 | } | ||
460 | #endif /* CONFIG_SMP */ | ||
461 | |||
68 | /* | 462 | /* |
69 | * Preempt the current task with a newly woken task if needed: | 463 | * Preempt the current task with a newly woken task if needed: |
70 | */ | 464 | */ |
@@ -74,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | |||
74 | resched_task(rq->curr); | 468 | resched_task(rq->curr); |
75 | } | 469 | } |
76 | 470 | ||
77 | static struct task_struct *pick_next_task_rt(struct rq *rq) | 471 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, |
472 | struct rt_rq *rt_rq) | ||
78 | { | 473 | { |
79 | struct rt_prio_array *array = &rq->rt.active; | 474 | struct rt_prio_array *array = &rt_rq->active; |
80 | struct task_struct *next; | 475 | struct sched_rt_entity *next = NULL; |
81 | struct list_head *queue; | 476 | struct list_head *queue; |
82 | int idx; | 477 | int idx; |
83 | 478 | ||
84 | idx = sched_find_first_bit(array->bitmap); | 479 | idx = sched_find_first_bit(array->bitmap); |
85 | if (idx >= MAX_RT_PRIO) | 480 | BUG_ON(idx >= MAX_RT_PRIO); |
86 | return NULL; | ||
87 | 481 | ||
88 | queue = array->queue + idx; | 482 | queue = array->queue + idx; |
89 | next = list_entry(queue->next, struct task_struct, run_list); | 483 | next = list_entry(queue->next, struct sched_rt_entity, run_list); |
90 | |||
91 | next->se.exec_start = rq->clock; | ||
92 | 484 | ||
93 | return next; | 485 | return next; |
94 | } | 486 | } |
95 | 487 | ||
488 | static struct task_struct *pick_next_task_rt(struct rq *rq) | ||
489 | { | ||
490 | struct sched_rt_entity *rt_se; | ||
491 | struct task_struct *p; | ||
492 | struct rt_rq *rt_rq; | ||
493 | |||
494 | rt_rq = &rq->rt; | ||
495 | |||
496 | if (unlikely(!rt_rq->rt_nr_running)) | ||
497 | return NULL; | ||
498 | |||
499 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
500 | return NULL; | ||
501 | |||
502 | do { | ||
503 | rt_se = pick_next_rt_entity(rq, rt_rq); | ||
504 | BUG_ON(!rt_se); | ||
505 | rt_rq = group_rt_rq(rt_se); | ||
506 | } while (rt_rq); | ||
507 | |||
508 | p = rt_task_of(rt_se); | ||
509 | p->se.exec_start = rq->clock; | ||
510 | return p; | ||
511 | } | ||
512 | |||
96 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 513 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
97 | { | 514 | { |
98 | update_curr_rt(rq); | 515 | update_curr_rt(rq); |
@@ -100,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
100 | } | 517 | } |
101 | 518 | ||
102 | #ifdef CONFIG_SMP | 519 | #ifdef CONFIG_SMP |
103 | /* | 520 | |
104 | * Load-balancing iterator. Note: while the runqueue stays locked | 521 | /* Only try algorithms three times */ |
105 | * during the whole iteration, the current task might be | 522 | #define RT_MAX_TRIES 3 |
106 | * dequeued so the iterator has to be dequeue-safe. Here we | 523 | |
107 | * achieve that by always pre-iterating before returning | 524 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest); |
108 | * the current task: | 525 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); |
109 | */ | 526 | |
110 | static struct task_struct *load_balance_start_rt(void *arg) | 527 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
111 | { | 528 | { |
112 | struct rq *rq = arg; | 529 | if (!task_running(rq, p) && |
113 | struct rt_prio_array *array = &rq->rt.active; | 530 | (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && |
114 | struct list_head *head, *curr; | 531 | (p->rt.nr_cpus_allowed > 1)) |
115 | struct task_struct *p; | 532 | return 1; |
533 | return 0; | ||
534 | } | ||
535 | |||
536 | /* Return the second highest RT task, NULL otherwise */ | ||
537 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | ||
538 | { | ||
539 | struct task_struct *next = NULL; | ||
540 | struct sched_rt_entity *rt_se; | ||
541 | struct rt_prio_array *array; | ||
542 | struct rt_rq *rt_rq; | ||
116 | int idx; | 543 | int idx; |
117 | 544 | ||
118 | idx = sched_find_first_bit(array->bitmap); | 545 | for_each_leaf_rt_rq(rt_rq, rq) { |
119 | if (idx >= MAX_RT_PRIO) | 546 | array = &rt_rq->active; |
120 | return NULL; | 547 | idx = sched_find_first_bit(array->bitmap); |
548 | next_idx: | ||
549 | if (idx >= MAX_RT_PRIO) | ||
550 | continue; | ||
551 | if (next && next->prio < idx) | ||
552 | continue; | ||
553 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | ||
554 | struct task_struct *p = rt_task_of(rt_se); | ||
555 | if (pick_rt_task(rq, p, cpu)) { | ||
556 | next = p; | ||
557 | break; | ||
558 | } | ||
559 | } | ||
560 | if (!next) { | ||
561 | idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | ||
562 | goto next_idx; | ||
563 | } | ||
564 | } | ||
565 | |||
566 | return next; | ||
567 | } | ||
121 | 568 | ||
122 | head = array->queue + idx; | 569 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); |
123 | curr = head->prev; | ||
124 | 570 | ||
125 | p = list_entry(curr, struct task_struct, run_list); | 571 | static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) |
572 | { | ||
573 | int lowest_prio = -1; | ||
574 | int lowest_cpu = -1; | ||
575 | int count = 0; | ||
576 | int cpu; | ||
126 | 577 | ||
127 | curr = curr->prev; | 578 | cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); |
128 | 579 | ||
129 | rq->rt.rt_load_balance_idx = idx; | 580 | /* |
130 | rq->rt.rt_load_balance_head = head; | 581 | * Scan each rq for the lowest prio. |
131 | rq->rt.rt_load_balance_curr = curr; | 582 | */ |
583 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
584 | struct rq *rq = cpu_rq(cpu); | ||
132 | 585 | ||
133 | return p; | 586 | /* We look for lowest RT prio or non-rt CPU */ |
587 | if (rq->rt.highest_prio >= MAX_RT_PRIO) { | ||
588 | /* | ||
589 | * if we already found a low RT queue | ||
590 | * and now we found this non-rt queue | ||
591 | * clear the mask and set our bit. | ||
592 | * Otherwise just return the queue as is | ||
593 | * and the count==1 will cause the algorithm | ||
594 | * to use the first bit found. | ||
595 | */ | ||
596 | if (lowest_cpu != -1) { | ||
597 | cpus_clear(*lowest_mask); | ||
598 | cpu_set(rq->cpu, *lowest_mask); | ||
599 | } | ||
600 | return 1; | ||
601 | } | ||
602 | |||
603 | /* no locking for now */ | ||
604 | if ((rq->rt.highest_prio > task->prio) | ||
605 | && (rq->rt.highest_prio >= lowest_prio)) { | ||
606 | if (rq->rt.highest_prio > lowest_prio) { | ||
607 | /* new low - clear old data */ | ||
608 | lowest_prio = rq->rt.highest_prio; | ||
609 | lowest_cpu = cpu; | ||
610 | count = 0; | ||
611 | } | ||
612 | count++; | ||
613 | } else | ||
614 | cpu_clear(cpu, *lowest_mask); | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * Clear out all the set bits that represent | ||
619 | * runqueues that were of higher prio than | ||
620 | * the lowest_prio. | ||
621 | */ | ||
622 | if (lowest_cpu > 0) { | ||
623 | /* | ||
624 | * Perhaps we could add another cpumask op to | ||
625 | * zero out bits. Like cpu_zero_bits(cpumask, nrbits); | ||
626 | * Then that could be optimized to use memset and such. | ||
627 | */ | ||
628 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
629 | if (cpu >= lowest_cpu) | ||
630 | break; | ||
631 | cpu_clear(cpu, *lowest_mask); | ||
632 | } | ||
633 | } | ||
634 | |||
635 | return count; | ||
134 | } | 636 | } |
135 | 637 | ||
136 | static struct task_struct *load_balance_next_rt(void *arg) | 638 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) |
137 | { | 639 | { |
138 | struct rq *rq = arg; | 640 | int first; |
139 | struct rt_prio_array *array = &rq->rt.active; | 641 | |
140 | struct list_head *head, *curr; | 642 | /* "this_cpu" is cheaper to preempt than a remote processor */ |
141 | struct task_struct *p; | 643 | if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) |
142 | int idx; | 644 | return this_cpu; |
645 | |||
646 | first = first_cpu(*mask); | ||
647 | if (first != NR_CPUS) | ||
648 | return first; | ||
649 | |||
650 | return -1; | ||
651 | } | ||
652 | |||
653 | static int find_lowest_rq(struct task_struct *task) | ||
654 | { | ||
655 | struct sched_domain *sd; | ||
656 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); | ||
657 | int this_cpu = smp_processor_id(); | ||
658 | int cpu = task_cpu(task); | ||
659 | int count = find_lowest_cpus(task, lowest_mask); | ||
660 | |||
661 | if (!count) | ||
662 | return -1; /* No targets found */ | ||
143 | 663 | ||
144 | idx = rq->rt.rt_load_balance_idx; | 664 | /* |
145 | head = rq->rt.rt_load_balance_head; | 665 | * There is no sense in performing an optimal search if only one |
146 | curr = rq->rt.rt_load_balance_curr; | 666 | * target is found. |
667 | */ | ||
668 | if (count == 1) | ||
669 | return first_cpu(*lowest_mask); | ||
670 | |||
671 | /* | ||
672 | * At this point we have built a mask of cpus representing the | ||
673 | * lowest priority tasks in the system. Now we want to elect | ||
674 | * the best one based on our affinity and topology. | ||
675 | * | ||
676 | * We prioritize the last cpu that the task executed on since | ||
677 | * it is most likely cache-hot in that location. | ||
678 | */ | ||
679 | if (cpu_isset(cpu, *lowest_mask)) | ||
680 | return cpu; | ||
147 | 681 | ||
148 | /* | 682 | /* |
149 | * If we arrived back to the head again then | 683 | * Otherwise, we consult the sched_domains span maps to figure |
150 | * iterate to the next queue (if any): | 684 | * out which cpu is logically closest to our hot cache data. |
151 | */ | 685 | */ |
152 | if (unlikely(head == curr)) { | 686 | if (this_cpu == cpu) |
153 | int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | 687 | this_cpu = -1; /* Skip this_cpu opt if the same */ |
154 | 688 | ||
155 | if (next_idx >= MAX_RT_PRIO) | 689 | for_each_domain(cpu, sd) { |
156 | return NULL; | 690 | if (sd->flags & SD_WAKE_AFFINE) { |
691 | cpumask_t domain_mask; | ||
692 | int best_cpu; | ||
157 | 693 | ||
158 | idx = next_idx; | 694 | cpus_and(domain_mask, sd->span, *lowest_mask); |
159 | head = array->queue + idx; | ||
160 | curr = head->prev; | ||
161 | 695 | ||
162 | rq->rt.rt_load_balance_idx = idx; | 696 | best_cpu = pick_optimal_cpu(this_cpu, |
163 | rq->rt.rt_load_balance_head = head; | 697 | &domain_mask); |
698 | if (best_cpu != -1) | ||
699 | return best_cpu; | ||
700 | } | ||
164 | } | 701 | } |
165 | 702 | ||
166 | p = list_entry(curr, struct task_struct, run_list); | 703 | /* |
704 | * And finally, if there were no matches within the domains | ||
705 | * just give the caller *something* to work with from the compatible | ||
706 | * locations. | ||
707 | */ | ||
708 | return pick_optimal_cpu(this_cpu, lowest_mask); | ||
709 | } | ||
167 | 710 | ||
168 | curr = curr->prev; | 711 | /* Will lock the rq it finds */ |
712 | static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | ||
713 | { | ||
714 | struct rq *lowest_rq = NULL; | ||
715 | int tries; | ||
716 | int cpu; | ||
169 | 717 | ||
170 | rq->rt.rt_load_balance_curr = curr; | 718 | for (tries = 0; tries < RT_MAX_TRIES; tries++) { |
719 | cpu = find_lowest_rq(task); | ||
171 | 720 | ||
172 | return p; | 721 | if ((cpu == -1) || (cpu == rq->cpu)) |
722 | break; | ||
723 | |||
724 | lowest_rq = cpu_rq(cpu); | ||
725 | |||
726 | /* if the prio of this runqueue changed, try again */ | ||
727 | if (double_lock_balance(rq, lowest_rq)) { | ||
728 | /* | ||
729 | * We had to unlock the run queue. In | ||
730 | * the mean time, task could have | ||
731 | * migrated already or had its affinity changed. | ||
732 | * Also make sure that it wasn't scheduled on its rq. | ||
733 | */ | ||
734 | if (unlikely(task_rq(task) != rq || | ||
735 | !cpu_isset(lowest_rq->cpu, | ||
736 | task->cpus_allowed) || | ||
737 | task_running(rq, task) || | ||
738 | !task->se.on_rq)) { | ||
739 | |||
740 | spin_unlock(&lowest_rq->lock); | ||
741 | lowest_rq = NULL; | ||
742 | break; | ||
743 | } | ||
744 | } | ||
745 | |||
746 | /* If this rq is still suitable use it. */ | ||
747 | if (lowest_rq->rt.highest_prio > task->prio) | ||
748 | break; | ||
749 | |||
750 | /* try again */ | ||
751 | spin_unlock(&lowest_rq->lock); | ||
752 | lowest_rq = NULL; | ||
753 | } | ||
754 | |||
755 | return lowest_rq; | ||
756 | } | ||
757 | |||
758 | /* | ||
759 | * If the current CPU has more than one RT task, see if the non | ||
760 | * running task can migrate over to a CPU that is running a task | ||
761 | * of lesser priority. | ||
762 | */ | ||
763 | static int push_rt_task(struct rq *rq) | ||
764 | { | ||
765 | struct task_struct *next_task; | ||
766 | struct rq *lowest_rq; | ||
767 | int ret = 0; | ||
768 | int paranoid = RT_MAX_TRIES; | ||
769 | |||
770 | if (!rq->rt.overloaded) | ||
771 | return 0; | ||
772 | |||
773 | next_task = pick_next_highest_task_rt(rq, -1); | ||
774 | if (!next_task) | ||
775 | return 0; | ||
776 | |||
777 | retry: | ||
778 | if (unlikely(next_task == rq->curr)) { | ||
779 | WARN_ON(1); | ||
780 | return 0; | ||
781 | } | ||
782 | |||
783 | /* | ||
784 | * It's possible that the next_task slipped in of | ||
785 | * higher priority than current. If that's the case | ||
786 | * just reschedule current. | ||
787 | */ | ||
788 | if (unlikely(next_task->prio < rq->curr->prio)) { | ||
789 | resched_task(rq->curr); | ||
790 | return 0; | ||
791 | } | ||
792 | |||
793 | /* We might release rq lock */ | ||
794 | get_task_struct(next_task); | ||
795 | |||
796 | /* find_lock_lowest_rq locks the rq if found */ | ||
797 | lowest_rq = find_lock_lowest_rq(next_task, rq); | ||
798 | if (!lowest_rq) { | ||
799 | struct task_struct *task; | ||
800 | /* | ||
801 | * find lock_lowest_rq releases rq->lock | ||
802 | * so it is possible that next_task has changed. | ||
803 | * If it has, then try again. | ||
804 | */ | ||
805 | task = pick_next_highest_task_rt(rq, -1); | ||
806 | if (unlikely(task != next_task) && task && paranoid--) { | ||
807 | put_task_struct(next_task); | ||
808 | next_task = task; | ||
809 | goto retry; | ||
810 | } | ||
811 | goto out; | ||
812 | } | ||
813 | |||
814 | deactivate_task(rq, next_task, 0); | ||
815 | set_task_cpu(next_task, lowest_rq->cpu); | ||
816 | activate_task(lowest_rq, next_task, 0); | ||
817 | |||
818 | resched_task(lowest_rq->curr); | ||
819 | |||
820 | spin_unlock(&lowest_rq->lock); | ||
821 | |||
822 | ret = 1; | ||
823 | out: | ||
824 | put_task_struct(next_task); | ||
825 | |||
826 | return ret; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * TODO: Currently we just use the second highest prio task on | ||
831 | * the queue, and stop when it can't migrate (or there's | ||
832 | * no more RT tasks). There may be a case where a lower | ||
833 | * priority RT task has a different affinity than the | ||
834 | * higher RT task. In this case the lower RT task could | ||
835 | * possibly be able to migrate where as the higher priority | ||
836 | * RT task could not. We currently ignore this issue. | ||
837 | * Enhancements are welcome! | ||
838 | */ | ||
839 | static void push_rt_tasks(struct rq *rq) | ||
840 | { | ||
841 | /* push_rt_task will return true if it moved an RT */ | ||
842 | while (push_rt_task(rq)) | ||
843 | ; | ||
844 | } | ||
845 | |||
846 | static int pull_rt_task(struct rq *this_rq) | ||
847 | { | ||
848 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
849 | struct task_struct *p, *next; | ||
850 | struct rq *src_rq; | ||
851 | |||
852 | if (likely(!rt_overloaded(this_rq))) | ||
853 | return 0; | ||
854 | |||
855 | next = pick_next_task_rt(this_rq); | ||
856 | |||
857 | for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { | ||
858 | if (this_cpu == cpu) | ||
859 | continue; | ||
860 | |||
861 | src_rq = cpu_rq(cpu); | ||
862 | /* | ||
863 | * We can potentially drop this_rq's lock in | ||
864 | * double_lock_balance, and another CPU could | ||
865 | * steal our next task - hence we must cause | ||
866 | * the caller to recalculate the next task | ||
867 | * in that case: | ||
868 | */ | ||
869 | if (double_lock_balance(this_rq, src_rq)) { | ||
870 | struct task_struct *old_next = next; | ||
871 | |||
872 | next = pick_next_task_rt(this_rq); | ||
873 | if (next != old_next) | ||
874 | ret = 1; | ||
875 | } | ||
876 | |||
877 | /* | ||
878 | * Are there still pullable RT tasks? | ||
879 | */ | ||
880 | if (src_rq->rt.rt_nr_running <= 1) | ||
881 | goto skip; | ||
882 | |||
883 | p = pick_next_highest_task_rt(src_rq, this_cpu); | ||
884 | |||
885 | /* | ||
886 | * Do we have an RT task that preempts | ||
887 | * the to-be-scheduled task? | ||
888 | */ | ||
889 | if (p && (!next || (p->prio < next->prio))) { | ||
890 | WARN_ON(p == src_rq->curr); | ||
891 | WARN_ON(!p->se.on_rq); | ||
892 | |||
893 | /* | ||
894 | * There's a chance that p is higher in priority | ||
895 | * than what's currently running on its cpu. | ||
896 | * This is just that p is wakeing up and hasn't | ||
897 | * had a chance to schedule. We only pull | ||
898 | * p if it is lower in priority than the | ||
899 | * current task on the run queue or | ||
900 | * this_rq next task is lower in prio than | ||
901 | * the current task on that rq. | ||
902 | */ | ||
903 | if (p->prio < src_rq->curr->prio || | ||
904 | (next && next->prio < src_rq->curr->prio)) | ||
905 | goto skip; | ||
906 | |||
907 | ret = 1; | ||
908 | |||
909 | deactivate_task(src_rq, p, 0); | ||
910 | set_task_cpu(p, this_cpu); | ||
911 | activate_task(this_rq, p, 0); | ||
912 | /* | ||
913 | * We continue with the search, just in | ||
914 | * case there's an even higher prio task | ||
915 | * in another runqueue. (low likelyhood | ||
916 | * but possible) | ||
917 | * | ||
918 | * Update next so that we won't pick a task | ||
919 | * on another cpu with a priority lower (or equal) | ||
920 | * than the one we just picked. | ||
921 | */ | ||
922 | next = p; | ||
923 | |||
924 | } | ||
925 | skip: | ||
926 | spin_unlock(&src_rq->lock); | ||
927 | } | ||
928 | |||
929 | return ret; | ||
930 | } | ||
931 | |||
932 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | ||
933 | { | ||
934 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
935 | if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) | ||
936 | pull_rt_task(rq); | ||
937 | } | ||
938 | |||
939 | static void post_schedule_rt(struct rq *rq) | ||
940 | { | ||
941 | /* | ||
942 | * If we have more than one rt_task queued, then | ||
943 | * see if we can push the other rt_tasks off to other CPUS. | ||
944 | * Note we may release the rq lock, and since | ||
945 | * the lock was owned by prev, we need to release it | ||
946 | * first via finish_lock_switch and then reaquire it here. | ||
947 | */ | ||
948 | if (unlikely(rq->rt.overloaded)) { | ||
949 | spin_lock_irq(&rq->lock); | ||
950 | push_rt_tasks(rq); | ||
951 | spin_unlock_irq(&rq->lock); | ||
952 | } | ||
953 | } | ||
954 | |||
955 | |||
956 | static void task_wake_up_rt(struct rq *rq, struct task_struct *p) | ||
957 | { | ||
958 | if (!task_running(rq, p) && | ||
959 | (p->prio >= rq->rt.highest_prio) && | ||
960 | rq->rt.overloaded) | ||
961 | push_rt_tasks(rq); | ||
173 | } | 962 | } |
174 | 963 | ||
175 | static unsigned long | 964 | static unsigned long |
@@ -178,36 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
178 | struct sched_domain *sd, enum cpu_idle_type idle, | 967 | struct sched_domain *sd, enum cpu_idle_type idle, |
179 | int *all_pinned, int *this_best_prio) | 968 | int *all_pinned, int *this_best_prio) |
180 | { | 969 | { |
181 | struct rq_iterator rt_rq_iterator; | 970 | /* don't touch RT tasks */ |
182 | 971 | return 0; | |
183 | rt_rq_iterator.start = load_balance_start_rt; | ||
184 | rt_rq_iterator.next = load_balance_next_rt; | ||
185 | /* pass 'busiest' rq argument into | ||
186 | * load_balance_[start|next]_rt iterators | ||
187 | */ | ||
188 | rt_rq_iterator.arg = busiest; | ||
189 | |||
190 | return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, | ||
191 | idle, all_pinned, this_best_prio, &rt_rq_iterator); | ||
192 | } | 972 | } |
193 | 973 | ||
194 | static int | 974 | static int |
195 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | 975 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, |
196 | struct sched_domain *sd, enum cpu_idle_type idle) | 976 | struct sched_domain *sd, enum cpu_idle_type idle) |
197 | { | 977 | { |
198 | struct rq_iterator rt_rq_iterator; | 978 | /* don't touch RT tasks */ |
979 | return 0; | ||
980 | } | ||
981 | |||
982 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | ||
983 | { | ||
984 | int weight = cpus_weight(*new_mask); | ||
199 | 985 | ||
200 | rt_rq_iterator.start = load_balance_start_rt; | 986 | BUG_ON(!rt_task(p)); |
201 | rt_rq_iterator.next = load_balance_next_rt; | 987 | |
202 | rt_rq_iterator.arg = busiest; | 988 | /* |
989 | * Update the migration status of the RQ if we have an RT task | ||
990 | * which is running AND changing its weight value. | ||
991 | */ | ||
992 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
993 | struct rq *rq = task_rq(p); | ||
203 | 994 | ||
204 | return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, | 995 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { |
205 | &rt_rq_iterator); | 996 | rq->rt.rt_nr_migratory++; |
997 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
998 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
999 | rq->rt.rt_nr_migratory--; | ||
1000 | } | ||
1001 | |||
1002 | update_rt_migration(rq); | ||
1003 | } | ||
1004 | |||
1005 | p->cpus_allowed = *new_mask; | ||
1006 | p->rt.nr_cpus_allowed = weight; | ||
206 | } | 1007 | } |
207 | #endif | ||
208 | 1008 | ||
209 | static void task_tick_rt(struct rq *rq, struct task_struct *p) | 1009 | /* Assumes rq->lock is held */ |
1010 | static void join_domain_rt(struct rq *rq) | ||
210 | { | 1011 | { |
1012 | if (rq->rt.overloaded) | ||
1013 | rt_set_overload(rq); | ||
1014 | } | ||
1015 | |||
1016 | /* Assumes rq->lock is held */ | ||
1017 | static void leave_domain_rt(struct rq *rq) | ||
1018 | { | ||
1019 | if (rq->rt.overloaded) | ||
1020 | rt_clear_overload(rq); | ||
1021 | } | ||
1022 | |||
1023 | /* | ||
1024 | * When switch from the rt queue, we bring ourselves to a position | ||
1025 | * that we might want to pull RT tasks from other runqueues. | ||
1026 | */ | ||
1027 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | ||
1028 | int running) | ||
1029 | { | ||
1030 | /* | ||
1031 | * If there are other RT tasks then we will reschedule | ||
1032 | * and the scheduling of the other RT tasks will handle | ||
1033 | * the balancing. But if we are the last RT task | ||
1034 | * we may need to handle the pulling of RT tasks | ||
1035 | * now. | ||
1036 | */ | ||
1037 | if (!rq->rt.rt_nr_running) | ||
1038 | pull_rt_task(rq); | ||
1039 | } | ||
1040 | #endif /* CONFIG_SMP */ | ||
1041 | |||
1042 | /* | ||
1043 | * When switching a task to RT, we may overload the runqueue | ||
1044 | * with RT tasks. In this case we try to push them off to | ||
1045 | * other runqueues. | ||
1046 | */ | ||
1047 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | ||
1048 | int running) | ||
1049 | { | ||
1050 | int check_resched = 1; | ||
1051 | |||
1052 | /* | ||
1053 | * If we are already running, then there's nothing | ||
1054 | * that needs to be done. But if we are not running | ||
1055 | * we may need to preempt the current running task. | ||
1056 | * If that current running task is also an RT task | ||
1057 | * then see if we can move to another run queue. | ||
1058 | */ | ||
1059 | if (!running) { | ||
1060 | #ifdef CONFIG_SMP | ||
1061 | if (rq->rt.overloaded && push_rt_task(rq) && | ||
1062 | /* Don't resched if we changed runqueues */ | ||
1063 | rq != task_rq(p)) | ||
1064 | check_resched = 0; | ||
1065 | #endif /* CONFIG_SMP */ | ||
1066 | if (check_resched && p->prio < rq->curr->prio) | ||
1067 | resched_task(rq->curr); | ||
1068 | } | ||
1069 | } | ||
1070 | |||
1071 | /* | ||
1072 | * Priority of the task has changed. This may cause | ||
1073 | * us to initiate a push or pull. | ||
1074 | */ | ||
1075 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | ||
1076 | int oldprio, int running) | ||
1077 | { | ||
1078 | if (running) { | ||
1079 | #ifdef CONFIG_SMP | ||
1080 | /* | ||
1081 | * If our priority decreases while running, we | ||
1082 | * may need to pull tasks to this runqueue. | ||
1083 | */ | ||
1084 | if (oldprio < p->prio) | ||
1085 | pull_rt_task(rq); | ||
1086 | /* | ||
1087 | * If there's a higher priority task waiting to run | ||
1088 | * then reschedule. | ||
1089 | */ | ||
1090 | if (p->prio > rq->rt.highest_prio) | ||
1091 | resched_task(p); | ||
1092 | #else | ||
1093 | /* For UP simply resched on drop of prio */ | ||
1094 | if (oldprio < p->prio) | ||
1095 | resched_task(p); | ||
1096 | #endif /* CONFIG_SMP */ | ||
1097 | } else { | ||
1098 | /* | ||
1099 | * This task is not running, but if it is | ||
1100 | * greater than the current running task | ||
1101 | * then reschedule. | ||
1102 | */ | ||
1103 | if (p->prio < rq->curr->prio) | ||
1104 | resched_task(rq->curr); | ||
1105 | } | ||
1106 | } | ||
1107 | |||
1108 | static void watchdog(struct rq *rq, struct task_struct *p) | ||
1109 | { | ||
1110 | unsigned long soft, hard; | ||
1111 | |||
1112 | if (!p->signal) | ||
1113 | return; | ||
1114 | |||
1115 | soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; | ||
1116 | hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; | ||
1117 | |||
1118 | if (soft != RLIM_INFINITY) { | ||
1119 | unsigned long next; | ||
1120 | |||
1121 | p->rt.timeout++; | ||
1122 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); | ||
1123 | if (p->rt.timeout > next) | ||
1124 | p->it_sched_expires = p->se.sum_exec_runtime; | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | ||
1129 | { | ||
1130 | update_curr_rt(rq); | ||
1131 | |||
1132 | watchdog(rq, p); | ||
1133 | |||
211 | /* | 1134 | /* |
212 | * RR tasks need a special form of timeslice management. | 1135 | * RR tasks need a special form of timeslice management. |
213 | * FIFO tasks have no timeslices. | 1136 | * FIFO tasks have no timeslices. |
@@ -215,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
215 | if (p->policy != SCHED_RR) | 1138 | if (p->policy != SCHED_RR) |
216 | return; | 1139 | return; |
217 | 1140 | ||
218 | if (--p->time_slice) | 1141 | if (--p->rt.time_slice) |
219 | return; | 1142 | return; |
220 | 1143 | ||
221 | p->time_slice = DEF_TIMESLICE; | 1144 | p->rt.time_slice = DEF_TIMESLICE; |
222 | 1145 | ||
223 | /* | 1146 | /* |
224 | * Requeue to the end of queue if we are not the only element | 1147 | * Requeue to the end of queue if we are not the only element |
225 | * on the queue: | 1148 | * on the queue: |
226 | */ | 1149 | */ |
227 | if (p->run_list.prev != p->run_list.next) { | 1150 | if (p->rt.run_list.prev != p->rt.run_list.next) { |
228 | requeue_task_rt(rq, p); | 1151 | requeue_task_rt(rq, p); |
229 | set_tsk_need_resched(p); | 1152 | set_tsk_need_resched(p); |
230 | } | 1153 | } |
@@ -242,6 +1165,9 @@ const struct sched_class rt_sched_class = { | |||
242 | .enqueue_task = enqueue_task_rt, | 1165 | .enqueue_task = enqueue_task_rt, |
243 | .dequeue_task = dequeue_task_rt, | 1166 | .dequeue_task = dequeue_task_rt, |
244 | .yield_task = yield_task_rt, | 1167 | .yield_task = yield_task_rt, |
1168 | #ifdef CONFIG_SMP | ||
1169 | .select_task_rq = select_task_rq_rt, | ||
1170 | #endif /* CONFIG_SMP */ | ||
245 | 1171 | ||
246 | .check_preempt_curr = check_preempt_curr_rt, | 1172 | .check_preempt_curr = check_preempt_curr_rt, |
247 | 1173 | ||
@@ -251,8 +1177,18 @@ const struct sched_class rt_sched_class = { | |||
251 | #ifdef CONFIG_SMP | 1177 | #ifdef CONFIG_SMP |
252 | .load_balance = load_balance_rt, | 1178 | .load_balance = load_balance_rt, |
253 | .move_one_task = move_one_task_rt, | 1179 | .move_one_task = move_one_task_rt, |
1180 | .set_cpus_allowed = set_cpus_allowed_rt, | ||
1181 | .join_domain = join_domain_rt, | ||
1182 | .leave_domain = leave_domain_rt, | ||
1183 | .pre_schedule = pre_schedule_rt, | ||
1184 | .post_schedule = post_schedule_rt, | ||
1185 | .task_wake_up = task_wake_up_rt, | ||
1186 | .switched_from = switched_from_rt, | ||
254 | #endif | 1187 | #endif |
255 | 1188 | ||
256 | .set_curr_task = set_curr_task_rt, | 1189 | .set_curr_task = set_curr_task_rt, |
257 | .task_tick = task_tick_rt, | 1190 | .task_tick = task_tick_rt, |
1191 | |||
1192 | .prio_changed = prio_changed_rt, | ||
1193 | .switched_to = switched_to_rt, | ||
258 | }; | 1194 | }; |
diff --git a/kernel/signal.c b/kernel/signal.c index 657aa16d97cb..8054dd4e2d76 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) | |||
733 | current->comm, task_pid_nr(current), signr); | 733 | current->comm, task_pid_nr(current), signr); |
734 | 734 | ||
735 | #if defined(__i386__) && !defined(__arch_um__) | 735 | #if defined(__i386__) && !defined(__arch_um__) |
736 | printk("code at %08lx: ", regs->eip); | 736 | printk("code at %08lx: ", regs->ip); |
737 | { | 737 | { |
738 | int i; | 738 | int i; |
739 | for (i = 0; i < 16; i++) { | 739 | for (i = 0; i < 16; i++) { |
740 | unsigned char insn; | 740 | unsigned char insn; |
741 | 741 | ||
742 | __get_user(insn, (unsigned char *)(regs->eip + i)); | 742 | __get_user(insn, (unsigned char *)(regs->ip + i)); |
743 | printk("%02x ", insn); | 743 | printk("%02x ", insn); |
744 | } | 744 | } |
745 | } | 745 | } |
diff --git a/kernel/softirq.c b/kernel/softirq.c index bd89bc4eb0b9..d7837d45419e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -3,7 +3,9 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 1992 Linus Torvalds | 4 | * Copyright (C) 1992 Linus Torvalds |
5 | * | 5 | * |
6 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | 6 | * Distribute under GPLv2. |
7 | * | ||
8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | ||
7 | */ | 9 | */ |
8 | 10 | ||
9 | #include <linux/module.h> | 11 | #include <linux/module.h> |
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void) | |||
278 | */ | 280 | */ |
279 | void irq_enter(void) | 281 | void irq_enter(void) |
280 | { | 282 | { |
283 | #ifdef CONFIG_NO_HZ | ||
284 | int cpu = smp_processor_id(); | ||
285 | if (idle_cpu(cpu) && !in_interrupt()) | ||
286 | tick_nohz_stop_idle(cpu); | ||
287 | #endif | ||
281 | __irq_enter(); | 288 | __irq_enter(); |
282 | #ifdef CONFIG_NO_HZ | 289 | #ifdef CONFIG_NO_HZ |
283 | if (idle_cpu(smp_processor_id())) | 290 | if (idle_cpu(cpu)) |
284 | tick_nohz_update_jiffies(); | 291 | tick_nohz_update_jiffies(); |
285 | #endif | 292 | #endif |
286 | } | 293 | } |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 11df812263c8..c1d76552446e 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/cpu.h> | 10 | #include <linux/cpu.h> |
11 | #include <linux/nmi.h> | ||
11 | #include <linux/init.h> | 12 | #include <linux/init.h> |
12 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
13 | #include <linux/freezer.h> | 14 | #include <linux/freezer.h> |
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp); | |||
23 | static DEFINE_PER_CPU(unsigned long, print_timestamp); | 24 | static DEFINE_PER_CPU(unsigned long, print_timestamp); |
24 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | 25 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); |
25 | 26 | ||
26 | static int did_panic; | 27 | static int __read_mostly did_panic; |
27 | int softlockup_thresh = 10; | 28 | unsigned long __read_mostly softlockup_thresh = 60; |
28 | 29 | ||
29 | static int | 30 | static int |
30 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | 31 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) |
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = { | |||
45 | */ | 46 | */ |
46 | static unsigned long get_timestamp(int this_cpu) | 47 | static unsigned long get_timestamp(int this_cpu) |
47 | { | 48 | { |
48 | return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ | 49 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ |
49 | } | 50 | } |
50 | 51 | ||
51 | void touch_softlockup_watchdog(void) | 52 | void touch_softlockup_watchdog(void) |
@@ -100,11 +101,7 @@ void softlockup_tick(void) | |||
100 | 101 | ||
101 | now = get_timestamp(this_cpu); | 102 | now = get_timestamp(this_cpu); |
102 | 103 | ||
103 | /* Wake up the high-prio watchdog task every second: */ | 104 | /* Warn about unreasonable delays: */ |
104 | if (now > (touch_timestamp + 1)) | ||
105 | wake_up_process(per_cpu(watchdog_task, this_cpu)); | ||
106 | |||
107 | /* Warn about unreasonable 10+ seconds delays: */ | ||
108 | if (now <= (touch_timestamp + softlockup_thresh)) | 105 | if (now <= (touch_timestamp + softlockup_thresh)) |
109 | return; | 106 | return; |
110 | 107 | ||
@@ -122,11 +119,93 @@ void softlockup_tick(void) | |||
122 | } | 119 | } |
123 | 120 | ||
124 | /* | 121 | /* |
122 | * Have a reasonable limit on the number of tasks checked: | ||
123 | */ | ||
124 | unsigned long __read_mostly sysctl_hung_task_check_count = 1024; | ||
125 | |||
126 | /* | ||
127 | * Zero means infinite timeout - no checking done: | ||
128 | */ | ||
129 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | ||
130 | |||
131 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | ||
132 | |||
133 | /* | ||
134 | * Only do the hung-tasks check on one CPU: | ||
135 | */ | ||
136 | static int check_cpu __read_mostly = -1; | ||
137 | |||
138 | static void check_hung_task(struct task_struct *t, unsigned long now) | ||
139 | { | ||
140 | unsigned long switch_count = t->nvcsw + t->nivcsw; | ||
141 | |||
142 | if (t->flags & PF_FROZEN) | ||
143 | return; | ||
144 | |||
145 | if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { | ||
146 | t->last_switch_count = switch_count; | ||
147 | t->last_switch_timestamp = now; | ||
148 | return; | ||
149 | } | ||
150 | if ((long)(now - t->last_switch_timestamp) < | ||
151 | sysctl_hung_task_timeout_secs) | ||
152 | return; | ||
153 | if (sysctl_hung_task_warnings < 0) | ||
154 | return; | ||
155 | sysctl_hung_task_warnings--; | ||
156 | |||
157 | /* | ||
158 | * Ok, the task did not get scheduled for more than 2 minutes, | ||
159 | * complain: | ||
160 | */ | ||
161 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | ||
162 | "%ld seconds.\n", t->comm, t->pid, | ||
163 | sysctl_hung_task_timeout_secs); | ||
164 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
165 | " disables this message.\n"); | ||
166 | sched_show_task(t); | ||
167 | __debug_show_held_locks(t); | ||
168 | |||
169 | t->last_switch_timestamp = now; | ||
170 | touch_nmi_watchdog(); | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for | ||
175 | * a really long time (120 seconds). If that happens, print out | ||
176 | * a warning. | ||
177 | */ | ||
178 | static void check_hung_uninterruptible_tasks(int this_cpu) | ||
179 | { | ||
180 | int max_count = sysctl_hung_task_check_count; | ||
181 | unsigned long now = get_timestamp(this_cpu); | ||
182 | struct task_struct *g, *t; | ||
183 | |||
184 | /* | ||
185 | * If the system crashed already then all bets are off, | ||
186 | * do not report extra hung tasks: | ||
187 | */ | ||
188 | if ((tainted & TAINT_DIE) || did_panic) | ||
189 | return; | ||
190 | |||
191 | read_lock(&tasklist_lock); | ||
192 | do_each_thread(g, t) { | ||
193 | if (!--max_count) | ||
194 | break; | ||
195 | if (t->state & TASK_UNINTERRUPTIBLE) | ||
196 | check_hung_task(t, now); | ||
197 | } while_each_thread(g, t); | ||
198 | |||
199 | read_unlock(&tasklist_lock); | ||
200 | } | ||
201 | |||
202 | /* | ||
125 | * The watchdog thread - runs every second and touches the timestamp. | 203 | * The watchdog thread - runs every second and touches the timestamp. |
126 | */ | 204 | */ |
127 | static int watchdog(void *__bind_cpu) | 205 | static int watchdog(void *__bind_cpu) |
128 | { | 206 | { |
129 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 207 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
208 | int this_cpu = (long)__bind_cpu; | ||
130 | 209 | ||
131 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 210 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
132 | 211 | ||
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu) | |||
135 | 214 | ||
136 | /* | 215 | /* |
137 | * Run briefly once per second to reset the softlockup timestamp. | 216 | * Run briefly once per second to reset the softlockup timestamp. |
138 | * If this gets delayed for more than 10 seconds then the | 217 | * If this gets delayed for more than 60 seconds then the |
139 | * debug-printout triggers in softlockup_tick(). | 218 | * debug-printout triggers in softlockup_tick(). |
140 | */ | 219 | */ |
141 | while (!kthread_should_stop()) { | 220 | while (!kthread_should_stop()) { |
142 | set_current_state(TASK_INTERRUPTIBLE); | ||
143 | touch_softlockup_watchdog(); | 221 | touch_softlockup_watchdog(); |
144 | schedule(); | 222 | msleep_interruptible(10000); |
223 | |||
224 | if (this_cpu != check_cpu) | ||
225 | continue; | ||
226 | |||
227 | if (sysctl_hung_task_timeout_secs) | ||
228 | check_hung_uninterruptible_tasks(this_cpu); | ||
145 | } | 229 | } |
146 | 230 | ||
147 | return 0; | 231 | return 0; |
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
171 | break; | 255 | break; |
172 | case CPU_ONLINE: | 256 | case CPU_ONLINE: |
173 | case CPU_ONLINE_FROZEN: | 257 | case CPU_ONLINE_FROZEN: |
258 | check_cpu = any_online_cpu(cpu_online_map); | ||
174 | wake_up_process(per_cpu(watchdog_task, hotcpu)); | 259 | wake_up_process(per_cpu(watchdog_task, hotcpu)); |
175 | break; | 260 | break; |
176 | #ifdef CONFIG_HOTPLUG_CPU | 261 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
181 | /* Unbind so it can run. Fall thru. */ | 266 | /* Unbind so it can run. Fall thru. */ |
182 | kthread_bind(per_cpu(watchdog_task, hotcpu), | 267 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
183 | any_online_cpu(cpu_online_map)); | 268 | any_online_cpu(cpu_online_map)); |
269 | case CPU_DOWN_PREPARE: | ||
270 | case CPU_DOWN_PREPARE_FROZEN: | ||
271 | if (hotcpu == check_cpu) { | ||
272 | cpumask_t temp_cpu_online_map = cpu_online_map; | ||
273 | |||
274 | cpu_clear(hotcpu, temp_cpu_online_map); | ||
275 | check_cpu = any_online_cpu(temp_cpu_online_map); | ||
276 | } | ||
277 | break; | ||
184 | case CPU_DEAD: | 278 | case CPU_DEAD: |
185 | case CPU_DEAD_FROZEN: | 279 | case CPU_DEAD_FROZEN: |
186 | p = per_cpu(watchdog_task, hotcpu); | 280 | p = per_cpu(watchdog_task, hotcpu); |
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index cd72424c2662..ae28c8245123 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock); | |||
65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are | 65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are |
66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): | 66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): |
67 | */ | 67 | */ |
68 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ | 68 | #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) |
69 | defined(CONFIG_DEBUG_LOCK_ALLOC) | ||
70 | 69 | ||
71 | void __lockfunc _read_lock(rwlock_t *lock) | 70 | void __lockfunc _read_lock(rwlock_t *lock) |
72 | { | 71 | { |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 319821ef78af..51b5ee53571a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) | |||
203 | int ret; | 203 | int ret; |
204 | 204 | ||
205 | /* No CPUs can come up or down during this. */ | 205 | /* No CPUs can come up or down during this. */ |
206 | lock_cpu_hotplug(); | 206 | get_online_cpus(); |
207 | p = __stop_machine_run(fn, data, cpu); | 207 | p = __stop_machine_run(fn, data, cpu); |
208 | if (!IS_ERR(p)) | 208 | if (!IS_ERR(p)) |
209 | ret = kthread_stop(p); | 209 | ret = kthread_stop(p); |
210 | else | 210 | else |
211 | ret = PTR_ERR(p); | 211 | ret = PTR_ERR(p); |
212 | unlock_cpu_hotplug(); | 212 | put_online_cpus(); |
213 | 213 | ||
214 | return ret; | 214 | return ret; |
215 | } | 215 | } |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 56cb009a4b35..beee5b3b68a2 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -131,6 +131,7 @@ cond_syscall(sys32_sysctl); | |||
131 | cond_syscall(ppc_rtas); | 131 | cond_syscall(ppc_rtas); |
132 | cond_syscall(sys_spu_run); | 132 | cond_syscall(sys_spu_run); |
133 | cond_syscall(sys_spu_create); | 133 | cond_syscall(sys_spu_create); |
134 | cond_syscall(sys_subpage_prot); | ||
134 | 135 | ||
135 | /* mmu depending weak syscall entries */ | 136 | /* mmu depending weak syscall entries */ |
136 | cond_syscall(sys_mprotect); | 137 | cond_syscall(sys_mprotect); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8ac51714b08c..357b68ba23ec 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #ifdef CONFIG_X86 | 53 | #ifdef CONFIG_X86 |
54 | #include <asm/nmi.h> | 54 | #include <asm/nmi.h> |
55 | #include <asm/stacktrace.h> | 55 | #include <asm/stacktrace.h> |
56 | #include <asm/io.h> | ||
56 | #endif | 57 | #endif |
57 | 58 | ||
58 | static int deprecated_sysctl_warning(struct __sysctl_args *args); | 59 | static int deprecated_sysctl_warning(struct __sysctl_args *args); |
@@ -81,6 +82,7 @@ extern int compat_log; | |||
81 | extern int maps_protect; | 82 | extern int maps_protect; |
82 | extern int sysctl_stat_interval; | 83 | extern int sysctl_stat_interval; |
83 | extern int audit_argv_kb; | 84 | extern int audit_argv_kb; |
85 | extern int latencytop_enabled; | ||
84 | 86 | ||
85 | /* Constants used for minimum and maximum */ | 87 | /* Constants used for minimum and maximum */ |
86 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 88 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
@@ -156,8 +158,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * | |||
156 | #endif | 158 | #endif |
157 | 159 | ||
158 | static struct ctl_table root_table[]; | 160 | static struct ctl_table root_table[]; |
159 | static struct ctl_table_header root_table_header = | 161 | static struct ctl_table_root sysctl_table_root; |
160 | { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; | 162 | static struct ctl_table_header root_table_header = { |
163 | .ctl_table = root_table, | ||
164 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), | ||
165 | .root = &sysctl_table_root, | ||
166 | }; | ||
167 | static struct ctl_table_root sysctl_table_root = { | ||
168 | .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), | ||
169 | .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), | ||
170 | }; | ||
161 | 171 | ||
162 | static struct ctl_table kern_table[]; | 172 | static struct ctl_table kern_table[]; |
163 | static struct ctl_table vm_table[]; | 173 | static struct ctl_table vm_table[]; |
@@ -191,14 +201,6 @@ static struct ctl_table root_table[] = { | |||
191 | .mode = 0555, | 201 | .mode = 0555, |
192 | .child = vm_table, | 202 | .child = vm_table, |
193 | }, | 203 | }, |
194 | #ifdef CONFIG_NET | ||
195 | { | ||
196 | .ctl_name = CTL_NET, | ||
197 | .procname = "net", | ||
198 | .mode = 0555, | ||
199 | .child = net_table, | ||
200 | }, | ||
201 | #endif | ||
202 | { | 204 | { |
203 | .ctl_name = CTL_FS, | 205 | .ctl_name = CTL_FS, |
204 | .procname = "fs", | 206 | .procname = "fs", |
@@ -225,10 +227,10 @@ static struct ctl_table root_table[] = { | |||
225 | }; | 227 | }; |
226 | 228 | ||
227 | #ifdef CONFIG_SCHED_DEBUG | 229 | #ifdef CONFIG_SCHED_DEBUG |
228 | static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ | 230 | static int min_sched_granularity_ns = 100000; /* 100 usecs */ |
229 | static unsigned long max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 231 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
230 | static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ | 232 | static int min_wakeup_granularity_ns; /* 0 usecs */ |
231 | static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 233 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
232 | #endif | 234 | #endif |
233 | 235 | ||
234 | static struct ctl_table kern_table[] = { | 236 | static struct ctl_table kern_table[] = { |
@@ -306,9 +308,43 @@ static struct ctl_table kern_table[] = { | |||
306 | .procname = "sched_nr_migrate", | 308 | .procname = "sched_nr_migrate", |
307 | .data = &sysctl_sched_nr_migrate, | 309 | .data = &sysctl_sched_nr_migrate, |
308 | .maxlen = sizeof(unsigned int), | 310 | .maxlen = sizeof(unsigned int), |
309 | .mode = 644, | 311 | .mode = 0644, |
312 | .proc_handler = &proc_dointvec, | ||
313 | }, | ||
314 | { | ||
315 | .ctl_name = CTL_UNNUMBERED, | ||
316 | .procname = "sched_rt_period_ms", | ||
317 | .data = &sysctl_sched_rt_period, | ||
318 | .maxlen = sizeof(unsigned int), | ||
319 | .mode = 0644, | ||
310 | .proc_handler = &proc_dointvec, | 320 | .proc_handler = &proc_dointvec, |
311 | }, | 321 | }, |
322 | { | ||
323 | .ctl_name = CTL_UNNUMBERED, | ||
324 | .procname = "sched_rt_ratio", | ||
325 | .data = &sysctl_sched_rt_ratio, | ||
326 | .maxlen = sizeof(unsigned int), | ||
327 | .mode = 0644, | ||
328 | .proc_handler = &proc_dointvec, | ||
329 | }, | ||
330 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
331 | { | ||
332 | .ctl_name = CTL_UNNUMBERED, | ||
333 | .procname = "sched_min_bal_int_shares", | ||
334 | .data = &sysctl_sched_min_bal_int_shares, | ||
335 | .maxlen = sizeof(unsigned int), | ||
336 | .mode = 0644, | ||
337 | .proc_handler = &proc_dointvec, | ||
338 | }, | ||
339 | { | ||
340 | .ctl_name = CTL_UNNUMBERED, | ||
341 | .procname = "sched_max_bal_int_shares", | ||
342 | .data = &sysctl_sched_max_bal_int_shares, | ||
343 | .maxlen = sizeof(unsigned int), | ||
344 | .mode = 0644, | ||
345 | .proc_handler = &proc_dointvec, | ||
346 | }, | ||
347 | #endif | ||
312 | #endif | 348 | #endif |
313 | { | 349 | { |
314 | .ctl_name = CTL_UNNUMBERED, | 350 | .ctl_name = CTL_UNNUMBERED, |
@@ -382,6 +418,15 @@ static struct ctl_table kern_table[] = { | |||
382 | .proc_handler = &proc_dointvec_taint, | 418 | .proc_handler = &proc_dointvec_taint, |
383 | }, | 419 | }, |
384 | #endif | 420 | #endif |
421 | #ifdef CONFIG_LATENCYTOP | ||
422 | { | ||
423 | .procname = "latencytop", | ||
424 | .data = &latencytop_enabled, | ||
425 | .maxlen = sizeof(int), | ||
426 | .mode = 0644, | ||
427 | .proc_handler = &proc_dointvec, | ||
428 | }, | ||
429 | #endif | ||
385 | #ifdef CONFIG_SECURITY_CAPABILITIES | 430 | #ifdef CONFIG_SECURITY_CAPABILITIES |
386 | { | 431 | { |
387 | .procname = "cap-bound", | 432 | .procname = "cap-bound", |
@@ -683,6 +728,14 @@ static struct ctl_table kern_table[] = { | |||
683 | .mode = 0644, | 728 | .mode = 0644, |
684 | .proc_handler = &proc_dointvec, | 729 | .proc_handler = &proc_dointvec, |
685 | }, | 730 | }, |
731 | { | ||
732 | .ctl_name = CTL_UNNUMBERED, | ||
733 | .procname = "io_delay_type", | ||
734 | .data = &io_delay_type, | ||
735 | .maxlen = sizeof(int), | ||
736 | .mode = 0644, | ||
737 | .proc_handler = &proc_dointvec, | ||
738 | }, | ||
686 | #endif | 739 | #endif |
687 | #if defined(CONFIG_MMU) | 740 | #if defined(CONFIG_MMU) |
688 | { | 741 | { |
@@ -728,13 +781,40 @@ static struct ctl_table kern_table[] = { | |||
728 | .ctl_name = CTL_UNNUMBERED, | 781 | .ctl_name = CTL_UNNUMBERED, |
729 | .procname = "softlockup_thresh", | 782 | .procname = "softlockup_thresh", |
730 | .data = &softlockup_thresh, | 783 | .data = &softlockup_thresh, |
731 | .maxlen = sizeof(int), | 784 | .maxlen = sizeof(unsigned long), |
732 | .mode = 0644, | 785 | .mode = 0644, |
733 | .proc_handler = &proc_dointvec_minmax, | 786 | .proc_handler = &proc_doulongvec_minmax, |
734 | .strategy = &sysctl_intvec, | 787 | .strategy = &sysctl_intvec, |
735 | .extra1 = &one, | 788 | .extra1 = &one, |
736 | .extra2 = &sixty, | 789 | .extra2 = &sixty, |
737 | }, | 790 | }, |
791 | { | ||
792 | .ctl_name = CTL_UNNUMBERED, | ||
793 | .procname = "hung_task_check_count", | ||
794 | .data = &sysctl_hung_task_check_count, | ||
795 | .maxlen = sizeof(unsigned long), | ||
796 | .mode = 0644, | ||
797 | .proc_handler = &proc_doulongvec_minmax, | ||
798 | .strategy = &sysctl_intvec, | ||
799 | }, | ||
800 | { | ||
801 | .ctl_name = CTL_UNNUMBERED, | ||
802 | .procname = "hung_task_timeout_secs", | ||
803 | .data = &sysctl_hung_task_timeout_secs, | ||
804 | .maxlen = sizeof(unsigned long), | ||
805 | .mode = 0644, | ||
806 | .proc_handler = &proc_doulongvec_minmax, | ||
807 | .strategy = &sysctl_intvec, | ||
808 | }, | ||
809 | { | ||
810 | .ctl_name = CTL_UNNUMBERED, | ||
811 | .procname = "hung_task_warnings", | ||
812 | .data = &sysctl_hung_task_warnings, | ||
813 | .maxlen = sizeof(unsigned long), | ||
814 | .mode = 0644, | ||
815 | .proc_handler = &proc_doulongvec_minmax, | ||
816 | .strategy = &sysctl_intvec, | ||
817 | }, | ||
738 | #endif | 818 | #endif |
739 | #ifdef CONFIG_COMPAT | 819 | #ifdef CONFIG_COMPAT |
740 | { | 820 | { |
@@ -906,11 +986,11 @@ static struct ctl_table vm_table[] = { | |||
906 | }, | 986 | }, |
907 | { | 987 | { |
908 | .ctl_name = CTL_UNNUMBERED, | 988 | .ctl_name = CTL_UNNUMBERED, |
909 | .procname = "hugetlb_dynamic_pool", | 989 | .procname = "nr_overcommit_hugepages", |
910 | .data = &hugetlb_dynamic_pool, | 990 | .data = &nr_overcommit_huge_pages, |
911 | .maxlen = sizeof(hugetlb_dynamic_pool), | 991 | .maxlen = sizeof(nr_overcommit_huge_pages), |
912 | .mode = 0644, | 992 | .mode = 0644, |
913 | .proc_handler = &proc_dointvec, | 993 | .proc_handler = &proc_doulongvec_minmax, |
914 | }, | 994 | }, |
915 | #endif | 995 | #endif |
916 | { | 996 | { |
@@ -1300,12 +1380,27 @@ void sysctl_head_finish(struct ctl_table_header *head) | |||
1300 | spin_unlock(&sysctl_lock); | 1380 | spin_unlock(&sysctl_lock); |
1301 | } | 1381 | } |
1302 | 1382 | ||
1303 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | 1383 | static struct list_head * |
1384 | lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) | ||
1385 | { | ||
1386 | struct list_head *header_list; | ||
1387 | header_list = &root->header_list; | ||
1388 | if (root->lookup) | ||
1389 | header_list = root->lookup(root, namespaces); | ||
1390 | return header_list; | ||
1391 | } | ||
1392 | |||
1393 | struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, | ||
1394 | struct ctl_table_header *prev) | ||
1304 | { | 1395 | { |
1396 | struct ctl_table_root *root; | ||
1397 | struct list_head *header_list; | ||
1305 | struct ctl_table_header *head; | 1398 | struct ctl_table_header *head; |
1306 | struct list_head *tmp; | 1399 | struct list_head *tmp; |
1400 | |||
1307 | spin_lock(&sysctl_lock); | 1401 | spin_lock(&sysctl_lock); |
1308 | if (prev) { | 1402 | if (prev) { |
1403 | head = prev; | ||
1309 | tmp = &prev->ctl_entry; | 1404 | tmp = &prev->ctl_entry; |
1310 | unuse_table(prev); | 1405 | unuse_table(prev); |
1311 | goto next; | 1406 | goto next; |
@@ -1319,14 +1414,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | |||
1319 | spin_unlock(&sysctl_lock); | 1414 | spin_unlock(&sysctl_lock); |
1320 | return head; | 1415 | return head; |
1321 | next: | 1416 | next: |
1417 | root = head->root; | ||
1322 | tmp = tmp->next; | 1418 | tmp = tmp->next; |
1323 | if (tmp == &root_table_header.ctl_entry) | 1419 | header_list = lookup_header_list(root, namespaces); |
1324 | break; | 1420 | if (tmp != header_list) |
1421 | continue; | ||
1422 | |||
1423 | do { | ||
1424 | root = list_entry(root->root_list.next, | ||
1425 | struct ctl_table_root, root_list); | ||
1426 | if (root == &sysctl_table_root) | ||
1427 | goto out; | ||
1428 | header_list = lookup_header_list(root, namespaces); | ||
1429 | } while (list_empty(header_list)); | ||
1430 | tmp = header_list->next; | ||
1325 | } | 1431 | } |
1432 | out: | ||
1326 | spin_unlock(&sysctl_lock); | 1433 | spin_unlock(&sysctl_lock); |
1327 | return NULL; | 1434 | return NULL; |
1328 | } | 1435 | } |
1329 | 1436 | ||
1437 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | ||
1438 | { | ||
1439 | return __sysctl_head_next(current->nsproxy, prev); | ||
1440 | } | ||
1441 | |||
1442 | void register_sysctl_root(struct ctl_table_root *root) | ||
1443 | { | ||
1444 | spin_lock(&sysctl_lock); | ||
1445 | list_add_tail(&root->root_list, &sysctl_table_root.root_list); | ||
1446 | spin_unlock(&sysctl_lock); | ||
1447 | } | ||
1448 | |||
1330 | #ifdef CONFIG_SYSCTL_SYSCALL | 1449 | #ifdef CONFIG_SYSCTL_SYSCALL |
1331 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | 1450 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, |
1332 | void __user *newval, size_t newlen) | 1451 | void __user *newval, size_t newlen) |
@@ -1483,18 +1602,21 @@ static __init int sysctl_init(void) | |||
1483 | { | 1602 | { |
1484 | int err; | 1603 | int err; |
1485 | sysctl_set_parent(NULL, root_table); | 1604 | sysctl_set_parent(NULL, root_table); |
1486 | err = sysctl_check_table(root_table); | 1605 | err = sysctl_check_table(current->nsproxy, root_table); |
1487 | return 0; | 1606 | return 0; |
1488 | } | 1607 | } |
1489 | 1608 | ||
1490 | core_initcall(sysctl_init); | 1609 | core_initcall(sysctl_init); |
1491 | 1610 | ||
1492 | /** | 1611 | /** |
1493 | * register_sysctl_table - register a sysctl hierarchy | 1612 | * __register_sysctl_paths - register a sysctl hierarchy |
1613 | * @root: List of sysctl headers to register on | ||
1614 | * @namespaces: Data to compute which lists of sysctl entries are visible | ||
1615 | * @path: The path to the directory the sysctl table is in. | ||
1494 | * @table: the top-level table structure | 1616 | * @table: the top-level table structure |
1495 | * | 1617 | * |
1496 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | 1618 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table |
1497 | * array. An entry with a ctl_name of 0 terminates the table. | 1619 | * array. A completely 0 filled entry terminates the table. |
1498 | * | 1620 | * |
1499 | * The members of the &struct ctl_table structure are used as follows: | 1621 | * The members of the &struct ctl_table structure are used as follows: |
1500 | * | 1622 | * |
@@ -1557,25 +1679,99 @@ core_initcall(sysctl_init); | |||
1557 | * This routine returns %NULL on a failure to register, and a pointer | 1679 | * This routine returns %NULL on a failure to register, and a pointer |
1558 | * to the table header on success. | 1680 | * to the table header on success. |
1559 | */ | 1681 | */ |
1560 | struct ctl_table_header *register_sysctl_table(struct ctl_table * table) | 1682 | struct ctl_table_header *__register_sysctl_paths( |
1683 | struct ctl_table_root *root, | ||
1684 | struct nsproxy *namespaces, | ||
1685 | const struct ctl_path *path, struct ctl_table *table) | ||
1561 | { | 1686 | { |
1562 | struct ctl_table_header *tmp; | 1687 | struct list_head *header_list; |
1563 | tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); | 1688 | struct ctl_table_header *header; |
1564 | if (!tmp) | 1689 | struct ctl_table *new, **prevp; |
1690 | unsigned int n, npath; | ||
1691 | |||
1692 | /* Count the path components */ | ||
1693 | for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) | ||
1694 | ; | ||
1695 | |||
1696 | /* | ||
1697 | * For each path component, allocate a 2-element ctl_table array. | ||
1698 | * The first array element will be filled with the sysctl entry | ||
1699 | * for this, the second will be the sentinel (ctl_name == 0). | ||
1700 | * | ||
1701 | * We allocate everything in one go so that we don't have to | ||
1702 | * worry about freeing additional memory in unregister_sysctl_table. | ||
1703 | */ | ||
1704 | header = kzalloc(sizeof(struct ctl_table_header) + | ||
1705 | (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); | ||
1706 | if (!header) | ||
1565 | return NULL; | 1707 | return NULL; |
1566 | tmp->ctl_table = table; | 1708 | |
1567 | INIT_LIST_HEAD(&tmp->ctl_entry); | 1709 | new = (struct ctl_table *) (header + 1); |
1568 | tmp->used = 0; | 1710 | |
1569 | tmp->unregistering = NULL; | 1711 | /* Now connect the dots */ |
1570 | sysctl_set_parent(NULL, table); | 1712 | prevp = &header->ctl_table; |
1571 | if (sysctl_check_table(tmp->ctl_table)) { | 1713 | for (n = 0; n < npath; ++n, ++path) { |
1572 | kfree(tmp); | 1714 | /* Copy the procname */ |
1715 | new->procname = path->procname; | ||
1716 | new->ctl_name = path->ctl_name; | ||
1717 | new->mode = 0555; | ||
1718 | |||
1719 | *prevp = new; | ||
1720 | prevp = &new->child; | ||
1721 | |||
1722 | new += 2; | ||
1723 | } | ||
1724 | *prevp = table; | ||
1725 | header->ctl_table_arg = table; | ||
1726 | |||
1727 | INIT_LIST_HEAD(&header->ctl_entry); | ||
1728 | header->used = 0; | ||
1729 | header->unregistering = NULL; | ||
1730 | header->root = root; | ||
1731 | sysctl_set_parent(NULL, header->ctl_table); | ||
1732 | if (sysctl_check_table(namespaces, header->ctl_table)) { | ||
1733 | kfree(header); | ||
1573 | return NULL; | 1734 | return NULL; |
1574 | } | 1735 | } |
1575 | spin_lock(&sysctl_lock); | 1736 | spin_lock(&sysctl_lock); |
1576 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | 1737 | header_list = lookup_header_list(root, namespaces); |
1738 | list_add_tail(&header->ctl_entry, header_list); | ||
1577 | spin_unlock(&sysctl_lock); | 1739 | spin_unlock(&sysctl_lock); |
1578 | return tmp; | 1740 | |
1741 | return header; | ||
1742 | } | ||
1743 | |||
1744 | /** | ||
1745 | * register_sysctl_table_path - register a sysctl table hierarchy | ||
1746 | * @path: The path to the directory the sysctl table is in. | ||
1747 | * @table: the top-level table structure | ||
1748 | * | ||
1749 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1750 | * array. A completely 0 filled entry terminates the table. | ||
1751 | * | ||
1752 | * See __register_sysctl_paths for more details. | ||
1753 | */ | ||
1754 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | ||
1755 | struct ctl_table *table) | ||
1756 | { | ||
1757 | return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, | ||
1758 | path, table); | ||
1759 | } | ||
1760 | |||
1761 | /** | ||
1762 | * register_sysctl_table - register a sysctl table hierarchy | ||
1763 | * @table: the top-level table structure | ||
1764 | * | ||
1765 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1766 | * array. A completely 0 filled entry terminates the table. | ||
1767 | * | ||
1768 | * See register_sysctl_paths for more details. | ||
1769 | */ | ||
1770 | struct ctl_table_header *register_sysctl_table(struct ctl_table *table) | ||
1771 | { | ||
1772 | static const struct ctl_path null_path[] = { {} }; | ||
1773 | |||
1774 | return register_sysctl_paths(null_path, table); | ||
1579 | } | 1775 | } |
1580 | 1776 | ||
1581 | /** | 1777 | /** |
@@ -1604,6 +1800,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table) | |||
1604 | return NULL; | 1800 | return NULL; |
1605 | } | 1801 | } |
1606 | 1802 | ||
1803 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | ||
1804 | struct ctl_table *table) | ||
1805 | { | ||
1806 | return NULL; | ||
1807 | } | ||
1808 | |||
1607 | void unregister_sysctl_table(struct ctl_table_header * table) | 1809 | void unregister_sysctl_table(struct ctl_table_header * table) |
1608 | { | 1810 | { |
1609 | } | 1811 | } |
@@ -2662,6 +2864,7 @@ EXPORT_SYMBOL(proc_dostring); | |||
2662 | EXPORT_SYMBOL(proc_doulongvec_minmax); | 2864 | EXPORT_SYMBOL(proc_doulongvec_minmax); |
2663 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); | 2865 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); |
2664 | EXPORT_SYMBOL(register_sysctl_table); | 2866 | EXPORT_SYMBOL(register_sysctl_table); |
2867 | EXPORT_SYMBOL(register_sysctl_paths); | ||
2665 | EXPORT_SYMBOL(sysctl_intvec); | 2868 | EXPORT_SYMBOL(sysctl_intvec); |
2666 | EXPORT_SYMBOL(sysctl_jiffies); | 2869 | EXPORT_SYMBOL(sysctl_jiffies); |
2667 | EXPORT_SYMBOL(sysctl_ms_jiffies); | 2870 | EXPORT_SYMBOL(sysctl_ms_jiffies); |
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index bed939f82c31..c3206fa50048 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include <linux/stat.h> | 1 | #include <linux/stat.h> |
2 | #include <linux/sysctl.h> | 2 | #include <linux/sysctl.h> |
3 | #include "../arch/s390/appldata/appldata.h" | ||
4 | #include "../fs/xfs/linux-2.6/xfs_sysctl.h" | 3 | #include "../fs/xfs/linux-2.6/xfs_sysctl.h" |
5 | #include <linux/sunrpc/debug.h> | 4 | #include <linux/sunrpc/debug.h> |
6 | #include <linux/string.h> | 5 | #include <linux/string.h> |
@@ -428,7 +427,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = { | |||
428 | {} | 427 | {} |
429 | }; | 428 | }; |
430 | 429 | ||
431 | static struct trans_ctl_table trans_net_ax25_table[] = { | 430 | static struct trans_ctl_table trans_net_ax25_param_table[] = { |
432 | { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, | 431 | { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, |
433 | { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, | 432 | { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, |
434 | { NET_AX25_BACKOFF_TYPE, "backoff_type" }, | 433 | { NET_AX25_BACKOFF_TYPE, "backoff_type" }, |
@@ -446,6 +445,11 @@ static struct trans_ctl_table trans_net_ax25_table[] = { | |||
446 | {} | 445 | {} |
447 | }; | 446 | }; |
448 | 447 | ||
448 | static struct trans_ctl_table trans_net_ax25_table[] = { | ||
449 | { 0, NULL, trans_net_ax25_param_table }, | ||
450 | {} | ||
451 | }; | ||
452 | |||
449 | static struct trans_ctl_table trans_net_bridge_table[] = { | 453 | static struct trans_ctl_table trans_net_bridge_table[] = { |
450 | { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, | 454 | { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, |
451 | { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, | 455 | { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, |
@@ -1338,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table) | |||
1338 | } | 1342 | } |
1339 | } | 1343 | } |
1340 | 1344 | ||
1341 | static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) | 1345 | static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, |
1346 | struct ctl_table *table) | ||
1342 | { | 1347 | { |
1343 | struct ctl_table_header *head; | 1348 | struct ctl_table_header *head; |
1344 | struct ctl_table *ref, *test; | 1349 | struct ctl_table *ref, *test; |
@@ -1346,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) | |||
1346 | 1351 | ||
1347 | depth = sysctl_depth(table); | 1352 | depth = sysctl_depth(table); |
1348 | 1353 | ||
1349 | for (head = sysctl_head_next(NULL); head; | 1354 | for (head = __sysctl_head_next(namespaces, NULL); head; |
1350 | head = sysctl_head_next(head)) { | 1355 | head = __sysctl_head_next(namespaces, head)) { |
1351 | cur_depth = depth; | 1356 | cur_depth = depth; |
1352 | ref = head->ctl_table; | 1357 | ref = head->ctl_table; |
1353 | repeat: | 1358 | repeat: |
@@ -1392,13 +1397,14 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str | |||
1392 | *fail = str; | 1397 | *fail = str; |
1393 | } | 1398 | } |
1394 | 1399 | ||
1395 | static int sysctl_check_dir(struct ctl_table *table) | 1400 | static int sysctl_check_dir(struct nsproxy *namespaces, |
1401 | struct ctl_table *table) | ||
1396 | { | 1402 | { |
1397 | struct ctl_table *ref; | 1403 | struct ctl_table *ref; |
1398 | int error; | 1404 | int error; |
1399 | 1405 | ||
1400 | error = 0; | 1406 | error = 0; |
1401 | ref = sysctl_check_lookup(table); | 1407 | ref = sysctl_check_lookup(namespaces, table); |
1402 | if (ref) { | 1408 | if (ref) { |
1403 | int match = 0; | 1409 | int match = 0; |
1404 | if ((!table->procname && !ref->procname) || | 1410 | if ((!table->procname && !ref->procname) || |
@@ -1423,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table) | |||
1423 | return error; | 1429 | return error; |
1424 | } | 1430 | } |
1425 | 1431 | ||
1426 | static void sysctl_check_leaf(struct ctl_table *table, const char **fail) | 1432 | static void sysctl_check_leaf(struct nsproxy *namespaces, |
1433 | struct ctl_table *table, const char **fail) | ||
1427 | { | 1434 | { |
1428 | struct ctl_table *ref; | 1435 | struct ctl_table *ref; |
1429 | 1436 | ||
1430 | ref = sysctl_check_lookup(table); | 1437 | ref = sysctl_check_lookup(namespaces, table); |
1431 | if (ref && (ref != table)) | 1438 | if (ref && (ref != table)) |
1432 | set_fail(fail, table, "Sysctl already exists"); | 1439 | set_fail(fail, table, "Sysctl already exists"); |
1433 | } | 1440 | } |
@@ -1451,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) | |||
1451 | } | 1458 | } |
1452 | } | 1459 | } |
1453 | 1460 | ||
1454 | int sysctl_check_table(struct ctl_table *table) | 1461 | int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) |
1455 | { | 1462 | { |
1456 | int error = 0; | 1463 | int error = 0; |
1457 | for (; table->ctl_name || table->procname; table++) { | 1464 | for (; table->ctl_name || table->procname; table++) { |
@@ -1481,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1481 | set_fail(&fail, table, "Directory with extra1"); | 1488 | set_fail(&fail, table, "Directory with extra1"); |
1482 | if (table->extra2) | 1489 | if (table->extra2) |
1483 | set_fail(&fail, table, "Directory with extra2"); | 1490 | set_fail(&fail, table, "Directory with extra2"); |
1484 | if (sysctl_check_dir(table)) | 1491 | if (sysctl_check_dir(namespaces, table)) |
1485 | set_fail(&fail, table, "Inconsistent directory names"); | 1492 | set_fail(&fail, table, "Inconsistent directory names"); |
1486 | } else { | 1493 | } else { |
1487 | if ((table->strategy == sysctl_data) || | 1494 | if ((table->strategy == sysctl_data) || |
@@ -1530,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1530 | if (!table->procname && table->proc_handler) | 1537 | if (!table->procname && table->proc_handler) |
1531 | set_fail(&fail, table, "proc_handler without procname"); | 1538 | set_fail(&fail, table, "proc_handler without procname"); |
1532 | #endif | 1539 | #endif |
1533 | sysctl_check_leaf(table, &fail); | 1540 | sysctl_check_leaf(namespaces, table, &fail); |
1534 | } | 1541 | } |
1535 | sysctl_check_bin_path(table, &fail); | 1542 | sysctl_check_bin_path(table, &fail); |
1536 | if (fail) { | 1543 | if (fail) { |
@@ -1538,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1538 | error = -EINVAL; | 1545 | error = -EINVAL; |
1539 | } | 1546 | } |
1540 | if (table->child) | 1547 | if (table->child) |
1541 | error |= sysctl_check_table(table->child); | 1548 | error |= sysctl_check_table(namespaces, table->child); |
1542 | } | 1549 | } |
1543 | return error; | 1550 | return error; |
1544 | } | 1551 | } |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c new file mode 100644 index 000000000000..88cdb109e13c --- /dev/null +++ b/kernel/test_kprobes.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * test_kprobes.c - simple sanity test for *probes | ||
3 | * | ||
4 | * Copyright IBM Corp. 2008 | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it would be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
14 | * the GNU General Public License for more details. | ||
15 | */ | ||
16 | |||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/kprobes.h> | ||
19 | #include <linux/random.h> | ||
20 | |||
21 | #define div_factor 3 | ||
22 | |||
23 | static u32 rand1, preh_val, posth_val, jph_val; | ||
24 | static int errors, handler_errors, num_tests; | ||
25 | |||
26 | static noinline u32 kprobe_target(u32 value) | ||
27 | { | ||
28 | /* | ||
29 | * gcc ignores noinline on some architectures unless we stuff | ||
30 | * sufficient lard into the function. The get_kprobe() here is | ||
31 | * just for that. | ||
32 | * | ||
33 | * NOTE: We aren't concerned about the correctness of get_kprobe() | ||
34 | * here; hence, this call is neither under !preempt nor with the | ||
35 | * kprobe_mutex held. This is fine(tm) | ||
36 | */ | ||
37 | if (get_kprobe((void *)0xdeadbeef)) | ||
38 | printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); | ||
39 | |||
40 | return (value / div_factor); | ||
41 | } | ||
42 | |||
43 | static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
44 | { | ||
45 | preh_val = (rand1 / div_factor); | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, | ||
50 | unsigned long flags) | ||
51 | { | ||
52 | if (preh_val != (rand1 / div_factor)) { | ||
53 | handler_errors++; | ||
54 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
55 | "incorrect value in post_handler\n"); | ||
56 | } | ||
57 | posth_val = preh_val + div_factor; | ||
58 | } | ||
59 | |||
60 | static struct kprobe kp = { | ||
61 | .symbol_name = "kprobe_target", | ||
62 | .pre_handler = kp_pre_handler, | ||
63 | .post_handler = kp_post_handler | ||
64 | }; | ||
65 | |||
66 | static int test_kprobe(void) | ||
67 | { | ||
68 | int ret; | ||
69 | |||
70 | ret = register_kprobe(&kp); | ||
71 | if (ret < 0) { | ||
72 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
73 | "register_kprobe returned %d\n", ret); | ||
74 | return ret; | ||
75 | } | ||
76 | |||
77 | ret = kprobe_target(rand1); | ||
78 | unregister_kprobe(&kp); | ||
79 | |||
80 | if (preh_val == 0) { | ||
81 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
82 | "kprobe pre_handler not called\n"); | ||
83 | handler_errors++; | ||
84 | } | ||
85 | |||
86 | if (posth_val == 0) { | ||
87 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
88 | "kprobe post_handler not called\n"); | ||
89 | handler_errors++; | ||
90 | } | ||
91 | |||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | static u32 j_kprobe_target(u32 value) | ||
96 | { | ||
97 | if (value != rand1) { | ||
98 | handler_errors++; | ||
99 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
100 | "incorrect value in jprobe handler\n"); | ||
101 | } | ||
102 | |||
103 | jph_val = rand1; | ||
104 | jprobe_return(); | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | static struct jprobe jp = { | ||
109 | .entry = j_kprobe_target, | ||
110 | .kp.symbol_name = "kprobe_target" | ||
111 | }; | ||
112 | |||
113 | static int test_jprobe(void) | ||
114 | { | ||
115 | int ret; | ||
116 | |||
117 | ret = register_jprobe(&jp); | ||
118 | if (ret < 0) { | ||
119 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
120 | "register_jprobe returned %d\n", ret); | ||
121 | return ret; | ||
122 | } | ||
123 | |||
124 | ret = kprobe_target(rand1); | ||
125 | unregister_jprobe(&jp); | ||
126 | if (jph_val == 0) { | ||
127 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
128 | "jprobe handler not called\n"); | ||
129 | handler_errors++; | ||
130 | } | ||
131 | |||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | #ifdef CONFIG_KRETPROBES | ||
136 | static u32 krph_val; | ||
137 | |||
138 | static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) | ||
139 | { | ||
140 | unsigned long ret = regs_return_value(regs); | ||
141 | |||
142 | if (ret != (rand1 / div_factor)) { | ||
143 | handler_errors++; | ||
144 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
145 | "incorrect value in kretprobe handler\n"); | ||
146 | } | ||
147 | |||
148 | krph_val = (rand1 / div_factor); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | static struct kretprobe rp = { | ||
153 | .handler = return_handler, | ||
154 | .kp.symbol_name = "kprobe_target" | ||
155 | }; | ||
156 | |||
157 | static int test_kretprobe(void) | ||
158 | { | ||
159 | int ret; | ||
160 | |||
161 | ret = register_kretprobe(&rp); | ||
162 | if (ret < 0) { | ||
163 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
164 | "register_kretprobe returned %d\n", ret); | ||
165 | return ret; | ||
166 | } | ||
167 | |||
168 | ret = kprobe_target(rand1); | ||
169 | unregister_kretprobe(&rp); | ||
170 | if (krph_val == 0) { | ||
171 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
172 | "kretprobe handler not called\n"); | ||
173 | handler_errors++; | ||
174 | } | ||
175 | |||
176 | return 0; | ||
177 | } | ||
178 | #endif /* CONFIG_KRETPROBES */ | ||
179 | |||
180 | int init_test_probes(void) | ||
181 | { | ||
182 | int ret; | ||
183 | |||
184 | do { | ||
185 | rand1 = random32(); | ||
186 | } while (rand1 <= div_factor); | ||
187 | |||
188 | printk(KERN_INFO "Kprobe smoke test started\n"); | ||
189 | num_tests++; | ||
190 | ret = test_kprobe(); | ||
191 | if (ret < 0) | ||
192 | errors++; | ||
193 | |||
194 | num_tests++; | ||
195 | ret = test_jprobe(); | ||
196 | if (ret < 0) | ||
197 | errors++; | ||
198 | |||
199 | #ifdef CONFIG_KRETPROBES | ||
200 | num_tests++; | ||
201 | ret = test_kretprobe(); | ||
202 | if (ret < 0) | ||
203 | errors++; | ||
204 | #endif /* CONFIG_KRETPROBES */ | ||
205 | |||
206 | if (errors) | ||
207 | printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " | ||
208 | "%d tests failed\n", errors, num_tests); | ||
209 | else if (handler_errors) | ||
210 | printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " | ||
211 | "running handlers\n", handler_errors); | ||
212 | else | ||
213 | printk(KERN_INFO "Kprobe smoke test passed successfully\n"); | ||
214 | |||
215 | return 0; | ||
216 | } | ||
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 822beebe664a..3e59fce6dd43 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch, | |||
41 | { | 41 | { |
42 | u64 clc = ((u64) latch << evt->shift); | 42 | u64 clc = ((u64) latch << evt->shift); |
43 | 43 | ||
44 | if (unlikely(!evt->mult)) { | ||
45 | evt->mult = 1; | ||
46 | WARN_ON(1); | ||
47 | } | ||
48 | |||
44 | do_div(clc, evt->mult); | 49 | do_div(clc, evt->mult); |
45 | if (clc < 1000) | 50 | if (clc < 1000) |
46 | clc = 1000; | 51 | clc = 1000; |
@@ -78,6 +83,11 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | |||
78 | unsigned long long clc; | 83 | unsigned long long clc; |
79 | int64_t delta; | 84 | int64_t delta; |
80 | 85 | ||
86 | if (unlikely(expires.tv64 < 0)) { | ||
87 | WARN_ON_ONCE(1); | ||
88 | return -ETIME; | ||
89 | } | ||
90 | |||
81 | delta = ktime_to_ns(ktime_sub(expires, now)); | 91 | delta = ktime_to_ns(ktime_sub(expires, now)); |
82 | 92 | ||
83 | if (delta <= 0) | 93 | if (delta <= 0) |
@@ -146,6 +156,14 @@ static void clockevents_notify_released(void) | |||
146 | void clockevents_register_device(struct clock_event_device *dev) | 156 | void clockevents_register_device(struct clock_event_device *dev) |
147 | { | 157 | { |
148 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 158 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); |
159 | /* | ||
160 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | ||
161 | * on it, so fix it up and emit a warning: | ||
162 | */ | ||
163 | if (unlikely(!dev->mult)) { | ||
164 | dev->mult = 1; | ||
165 | WARN_ON(1); | ||
166 | } | ||
149 | 167 | ||
150 | spin_lock(&clockevents_lock); | 168 | spin_lock(&clockevents_lock); |
151 | 169 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c8a9d13874df..6e9259a5d501 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data) | |||
142 | } | 142 | } |
143 | 143 | ||
144 | if (!list_empty(&watchdog_list)) { | 144 | if (!list_empty(&watchdog_list)) { |
145 | __mod_timer(&watchdog_timer, | 145 | /* Cycle through CPUs to check if the CPUs stay synchronized to |
146 | watchdog_timer.expires + WATCHDOG_INTERVAL); | 146 | * each other. */ |
147 | int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); | ||
148 | if (next_cpu >= NR_CPUS) | ||
149 | next_cpu = first_cpu(cpu_online_map); | ||
150 | watchdog_timer.expires += WATCHDOG_INTERVAL; | ||
151 | add_timer_on(&watchdog_timer, next_cpu); | ||
147 | } | 152 | } |
148 | spin_unlock(&watchdog_lock); | 153 | spin_unlock(&watchdog_lock); |
149 | } | 154 | } |
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
165 | if (!started && watchdog) { | 170 | if (!started && watchdog) { |
166 | watchdog_last = watchdog->read(); | 171 | watchdog_last = watchdog->read(); |
167 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 172 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
168 | add_timer(&watchdog_timer); | 173 | add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); |
169 | } | 174 | } |
170 | } else { | 175 | } else { |
171 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) | 176 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) |
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
175 | if (watchdog) | 180 | if (watchdog) |
176 | del_timer(&watchdog_timer); | 181 | del_timer(&watchdog_timer); |
177 | watchdog = cs; | 182 | watchdog = cs; |
178 | init_timer(&watchdog_timer); | 183 | init_timer_deferrable(&watchdog_timer); |
179 | watchdog_timer.function = clocksource_watchdog; | 184 | watchdog_timer.function = clocksource_watchdog; |
180 | 185 | ||
181 | /* Reset watchdog cycles */ | 186 | /* Reset watchdog cycles */ |
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
186 | watchdog_last = watchdog->read(); | 191 | watchdog_last = watchdog->read(); |
187 | watchdog_timer.expires = | 192 | watchdog_timer.expires = |
188 | jiffies + WATCHDOG_INTERVAL; | 193 | jiffies + WATCHDOG_INTERVAL; |
189 | add_timer(&watchdog_timer); | 194 | add_timer_on(&watchdog_timer, |
195 | first_cpu(cpu_online_map)); | ||
190 | } | 196 | } |
191 | } | 197 | } |
192 | } | 198 | } |
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating) | |||
331 | spin_unlock_irqrestore(&clocksource_lock, flags); | 337 | spin_unlock_irqrestore(&clocksource_lock, flags); |
332 | } | 338 | } |
333 | 339 | ||
340 | /** | ||
341 | * clocksource_unregister - remove a registered clocksource | ||
342 | */ | ||
343 | void clocksource_unregister(struct clocksource *cs) | ||
344 | { | ||
345 | unsigned long flags; | ||
346 | |||
347 | spin_lock_irqsave(&clocksource_lock, flags); | ||
348 | list_del(&cs->list); | ||
349 | if (clocksource_override == cs) | ||
350 | clocksource_override = NULL; | ||
351 | next_clocksource = select_clocksource(); | ||
352 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
353 | } | ||
354 | |||
334 | #ifdef CONFIG_SYSFS | 355 | #ifdef CONFIG_SYSFS |
335 | /** | 356 | /** |
336 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 357 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
@@ -441,7 +462,7 @@ static SYSDEV_ATTR(available_clocksource, 0600, | |||
441 | sysfs_show_available_clocksources, NULL); | 462 | sysfs_show_available_clocksources, NULL); |
442 | 463 | ||
443 | static struct sysdev_class clocksource_sysclass = { | 464 | static struct sysdev_class clocksource_sysclass = { |
444 | set_kset_name("clocksource"), | 465 | .name = "clocksource", |
445 | }; | 466 | }; |
446 | 467 | ||
447 | static struct sys_device device_clocksource = { | 468 | static struct sys_device device_clocksource = { |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index aa82d7bf478a..e1bd50cbbf5d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
126 | /* | 126 | /* |
127 | * Broadcast the event to the cpus, which are set in the mask | 127 | * Broadcast the event to the cpus, which are set in the mask |
128 | */ | 128 | */ |
129 | int tick_do_broadcast(cpumask_t mask) | 129 | static void tick_do_broadcast(cpumask_t mask) |
130 | { | 130 | { |
131 | int ret = 0, cpu = smp_processor_id(); | 131 | int cpu = smp_processor_id(); |
132 | struct tick_device *td; | 132 | struct tick_device *td; |
133 | 133 | ||
134 | /* | 134 | /* |
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask) | |||
138 | cpu_clear(cpu, mask); | 138 | cpu_clear(cpu, mask); |
139 | td = &per_cpu(tick_cpu_device, cpu); | 139 | td = &per_cpu(tick_cpu_device, cpu); |
140 | td->evtdev->event_handler(td->evtdev); | 140 | td->evtdev->event_handler(td->evtdev); |
141 | ret = 1; | ||
142 | } | 141 | } |
143 | 142 | ||
144 | if (!cpus_empty(mask)) { | 143 | if (!cpus_empty(mask)) { |
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask) | |||
151 | cpu = first_cpu(mask); | 150 | cpu = first_cpu(mask); |
152 | td = &per_cpu(tick_cpu_device, cpu); | 151 | td = &per_cpu(tick_cpu_device, cpu); |
153 | td->evtdev->broadcast(mask); | 152 | td->evtdev->broadcast(mask); |
154 | ret = 1; | ||
155 | } | 153 | } |
156 | return ret; | ||
157 | } | 154 | } |
158 | 155 | ||
159 | /* | 156 | /* |
@@ -384,45 +381,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
384 | } | 381 | } |
385 | 382 | ||
386 | /* | 383 | /* |
387 | * Reprogram the broadcast device: | ||
388 | * | ||
389 | * Called with tick_broadcast_lock held and interrupts disabled. | ||
390 | */ | ||
391 | static int tick_broadcast_reprogram(void) | ||
392 | { | ||
393 | ktime_t expires = { .tv64 = KTIME_MAX }; | ||
394 | struct tick_device *td; | ||
395 | int cpu; | ||
396 | |||
397 | /* | ||
398 | * Find the event which expires next: | ||
399 | */ | ||
400 | for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; | ||
401 | cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { | ||
402 | td = &per_cpu(tick_cpu_device, cpu); | ||
403 | if (td->evtdev->next_event.tv64 < expires.tv64) | ||
404 | expires = td->evtdev->next_event; | ||
405 | } | ||
406 | |||
407 | if (expires.tv64 == KTIME_MAX) | ||
408 | return 0; | ||
409 | |||
410 | return tick_broadcast_set_event(expires, 0); | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Handle oneshot mode broadcasting | 384 | * Handle oneshot mode broadcasting |
415 | */ | 385 | */ |
416 | static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) | 386 | static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) |
417 | { | 387 | { |
418 | struct tick_device *td; | 388 | struct tick_device *td; |
419 | cpumask_t mask; | 389 | cpumask_t mask; |
420 | ktime_t now; | 390 | ktime_t now, next_event; |
421 | int cpu; | 391 | int cpu; |
422 | 392 | ||
423 | spin_lock(&tick_broadcast_lock); | 393 | spin_lock(&tick_broadcast_lock); |
424 | again: | 394 | again: |
425 | dev->next_event.tv64 = KTIME_MAX; | 395 | dev->next_event.tv64 = KTIME_MAX; |
396 | next_event.tv64 = KTIME_MAX; | ||
426 | mask = CPU_MASK_NONE; | 397 | mask = CPU_MASK_NONE; |
427 | now = ktime_get(); | 398 | now = ktime_get(); |
428 | /* Find all expired events */ | 399 | /* Find all expired events */ |
@@ -431,19 +402,31 @@ again: | |||
431 | td = &per_cpu(tick_cpu_device, cpu); | 402 | td = &per_cpu(tick_cpu_device, cpu); |
432 | if (td->evtdev->next_event.tv64 <= now.tv64) | 403 | if (td->evtdev->next_event.tv64 <= now.tv64) |
433 | cpu_set(cpu, mask); | 404 | cpu_set(cpu, mask); |
405 | else if (td->evtdev->next_event.tv64 < next_event.tv64) | ||
406 | next_event.tv64 = td->evtdev->next_event.tv64; | ||
434 | } | 407 | } |
435 | 408 | ||
436 | /* | 409 | /* |
437 | * Wakeup the cpus which have an expired event. The broadcast | 410 | * Wakeup the cpus which have an expired event. |
438 | * device is reprogrammed in the return from idle code. | 411 | */ |
412 | tick_do_broadcast(mask); | ||
413 | |||
414 | /* | ||
415 | * Two reasons for reprogram: | ||
416 | * | ||
417 | * - The global event did not expire any CPU local | ||
418 | * events. This happens in dyntick mode, as the maximum PIT | ||
419 | * delta is quite small. | ||
420 | * | ||
421 | * - There are pending events on sleeping CPUs which were not | ||
422 | * in the event mask | ||
439 | */ | 423 | */ |
440 | if (!tick_do_broadcast(mask)) { | 424 | if (next_event.tv64 != KTIME_MAX) { |
441 | /* | 425 | /* |
442 | * The global event did not expire any CPU local | 426 | * Rearm the broadcast device. If event expired, |
443 | * events. This happens in dyntick mode, as the | 427 | * repeat the above |
444 | * maximum PIT delta is quite small. | ||
445 | */ | 428 | */ |
446 | if (tick_broadcast_reprogram()) | 429 | if (tick_broadcast_set_event(next_event, 0)) |
447 | goto again; | 430 | goto again; |
448 | } | 431 | } |
449 | spin_unlock(&tick_broadcast_lock); | 432 | spin_unlock(&tick_broadcast_lock); |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bb13f2724905..f13f2b7f4fd4 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
70 | * Broadcasting support | 70 | * Broadcasting support |
71 | */ | 71 | */ |
72 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 72 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST |
73 | extern int tick_do_broadcast(cpumask_t mask); | ||
74 | |||
75 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | 73 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); |
76 | extern int tick_check_broadcast_device(struct clock_event_device *dev); | 74 | extern int tick_check_broadcast_device(struct clock_event_device *dev); |
77 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | 75 | extern int tick_is_broadcast_device(struct clock_event_device *dev); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb89fa8db110..63f24b550695 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * Started by: Thomas Gleixner and Ingo Molnar | 10 | * Started by: Thomas Gleixner and Ingo Molnar |
11 | * | 11 | * |
12 | * For licencing details see kernel-base/COPYING | 12 | * Distribute under GPLv2. |
13 | */ | 13 | */ |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
@@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void) | |||
143 | local_irq_restore(flags); | 143 | local_irq_restore(flags); |
144 | } | 144 | } |
145 | 145 | ||
146 | void tick_nohz_stop_idle(int cpu) | ||
147 | { | ||
148 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
149 | |||
150 | if (ts->idle_active) { | ||
151 | ktime_t now, delta; | ||
152 | now = ktime_get(); | ||
153 | delta = ktime_sub(now, ts->idle_entrytime); | ||
154 | ts->idle_lastupdate = now; | ||
155 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
156 | ts->idle_active = 0; | ||
157 | } | ||
158 | } | ||
159 | |||
160 | static ktime_t tick_nohz_start_idle(int cpu) | ||
161 | { | ||
162 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
163 | ktime_t now, delta; | ||
164 | |||
165 | now = ktime_get(); | ||
166 | if (ts->idle_active) { | ||
167 | delta = ktime_sub(now, ts->idle_entrytime); | ||
168 | ts->idle_lastupdate = now; | ||
169 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
170 | } | ||
171 | ts->idle_entrytime = now; | ||
172 | ts->idle_active = 1; | ||
173 | return now; | ||
174 | } | ||
175 | |||
176 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | ||
177 | { | ||
178 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
179 | |||
180 | *last_update_time = ktime_to_us(ts->idle_lastupdate); | ||
181 | return ktime_to_us(ts->idle_sleeptime); | ||
182 | } | ||
183 | |||
146 | /** | 184 | /** |
147 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | 185 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task |
148 | * | 186 | * |
@@ -153,14 +191,16 @@ void tick_nohz_update_jiffies(void) | |||
153 | void tick_nohz_stop_sched_tick(void) | 191 | void tick_nohz_stop_sched_tick(void) |
154 | { | 192 | { |
155 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 193 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
194 | unsigned long rt_jiffies; | ||
156 | struct tick_sched *ts; | 195 | struct tick_sched *ts; |
157 | ktime_t last_update, expires, now, delta; | 196 | ktime_t last_update, expires, now; |
158 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 197 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
159 | int cpu; | 198 | int cpu; |
160 | 199 | ||
161 | local_irq_save(flags); | 200 | local_irq_save(flags); |
162 | 201 | ||
163 | cpu = smp_processor_id(); | 202 | cpu = smp_processor_id(); |
203 | now = tick_nohz_start_idle(cpu); | ||
164 | ts = &per_cpu(tick_cpu_sched, cpu); | 204 | ts = &per_cpu(tick_cpu_sched, cpu); |
165 | 205 | ||
166 | /* | 206 | /* |
@@ -192,19 +232,7 @@ void tick_nohz_stop_sched_tick(void) | |||
192 | } | 232 | } |
193 | } | 233 | } |
194 | 234 | ||
195 | now = ktime_get(); | ||
196 | /* | ||
197 | * When called from irq_exit we need to account the idle sleep time | ||
198 | * correctly. | ||
199 | */ | ||
200 | if (ts->tick_stopped) { | ||
201 | delta = ktime_sub(now, ts->idle_entrytime); | ||
202 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
203 | } | ||
204 | |||
205 | ts->idle_entrytime = now; | ||
206 | ts->idle_calls++; | 235 | ts->idle_calls++; |
207 | |||
208 | /* Read jiffies and the time when jiffies were updated last */ | 236 | /* Read jiffies and the time when jiffies were updated last */ |
209 | do { | 237 | do { |
210 | seq = read_seqbegin(&xtime_lock); | 238 | seq = read_seqbegin(&xtime_lock); |
@@ -216,6 +244,10 @@ void tick_nohz_stop_sched_tick(void) | |||
216 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 244 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
217 | delta_jiffies = next_jiffies - last_jiffies; | 245 | delta_jiffies = next_jiffies - last_jiffies; |
218 | 246 | ||
247 | rt_jiffies = rt_needs_cpu(cpu); | ||
248 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
249 | delta_jiffies = rt_jiffies; | ||
250 | |||
219 | if (rcu_needs_cpu(cpu)) | 251 | if (rcu_needs_cpu(cpu)) |
220 | delta_jiffies = 1; | 252 | delta_jiffies = 1; |
221 | /* | 253 | /* |
@@ -291,7 +323,7 @@ void tick_nohz_stop_sched_tick(void) | |||
291 | /* Check, if the timer was already in the past */ | 323 | /* Check, if the timer was already in the past */ |
292 | if (hrtimer_active(&ts->sched_timer)) | 324 | if (hrtimer_active(&ts->sched_timer)) |
293 | goto out; | 325 | goto out; |
294 | } else if(!tick_program_event(expires, 0)) | 326 | } else if (!tick_program_event(expires, 0)) |
295 | goto out; | 327 | goto out; |
296 | /* | 328 | /* |
297 | * We are past the event already. So we crossed a | 329 | * We are past the event already. So we crossed a |
@@ -332,23 +364,22 @@ void tick_nohz_restart_sched_tick(void) | |||
332 | int cpu = smp_processor_id(); | 364 | int cpu = smp_processor_id(); |
333 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 365 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
334 | unsigned long ticks; | 366 | unsigned long ticks; |
335 | ktime_t now, delta; | 367 | ktime_t now; |
336 | 368 | ||
337 | if (!ts->tick_stopped) | 369 | local_irq_disable(); |
370 | tick_nohz_stop_idle(cpu); | ||
371 | |||
372 | if (!ts->tick_stopped) { | ||
373 | local_irq_enable(); | ||
338 | return; | 374 | return; |
375 | } | ||
339 | 376 | ||
340 | /* Update jiffies first */ | 377 | /* Update jiffies first */ |
341 | now = ktime_get(); | ||
342 | |||
343 | local_irq_disable(); | ||
344 | select_nohz_load_balancer(0); | 378 | select_nohz_load_balancer(0); |
379 | now = ktime_get(); | ||
345 | tick_do_update_jiffies64(now); | 380 | tick_do_update_jiffies64(now); |
346 | cpu_clear(cpu, nohz_cpu_mask); | 381 | cpu_clear(cpu, nohz_cpu_mask); |
347 | 382 | ||
348 | /* Account the idle time */ | ||
349 | delta = ktime_sub(now, ts->idle_entrytime); | ||
350 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
351 | |||
352 | /* | 383 | /* |
353 | * We stopped the tick in idle. Update process times would miss the | 384 | * We stopped the tick in idle. Update process times would miss the |
354 | * time we slept as update_process_times does only a 1 tick | 385 | * time we slept as update_process_times does only a 1 tick |
@@ -502,14 +533,13 @@ static inline void tick_nohz_switch_to_nohz(void) { } | |||
502 | */ | 533 | */ |
503 | #ifdef CONFIG_HIGH_RES_TIMERS | 534 | #ifdef CONFIG_HIGH_RES_TIMERS |
504 | /* | 535 | /* |
505 | * We rearm the timer until we get disabled by the idle code | 536 | * We rearm the timer until we get disabled by the idle code. |
506 | * Called with interrupts disabled and timer->base->cpu_base->lock held. | 537 | * Called with interrupts disabled and timer->base->cpu_base->lock held. |
507 | */ | 538 | */ |
508 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | 539 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) |
509 | { | 540 | { |
510 | struct tick_sched *ts = | 541 | struct tick_sched *ts = |
511 | container_of(timer, struct tick_sched, sched_timer); | 542 | container_of(timer, struct tick_sched, sched_timer); |
512 | struct hrtimer_cpu_base *base = timer->base->cpu_base; | ||
513 | struct pt_regs *regs = get_irq_regs(); | 543 | struct pt_regs *regs = get_irq_regs(); |
514 | ktime_t now = ktime_get(); | 544 | ktime_t now = ktime_get(); |
515 | int cpu = smp_processor_id(); | 545 | int cpu = smp_processor_id(); |
@@ -547,15 +577,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
547 | touch_softlockup_watchdog(); | 577 | touch_softlockup_watchdog(); |
548 | ts->idle_jiffies++; | 578 | ts->idle_jiffies++; |
549 | } | 579 | } |
550 | /* | ||
551 | * update_process_times() might take tasklist_lock, hence | ||
552 | * drop the base lock. sched-tick hrtimers are per-CPU and | ||
553 | * never accessible by userspace APIs, so this is safe to do. | ||
554 | */ | ||
555 | spin_unlock(&base->lock); | ||
556 | update_process_times(user_mode(regs)); | 580 | update_process_times(user_mode(regs)); |
557 | profile_tick(CPU_PROFILING); | 581 | profile_tick(CPU_PROFILING); |
558 | spin_lock(&base->lock); | ||
559 | } | 582 | } |
560 | 583 | ||
561 | /* Do not restart, when we are in the idle loop */ | 584 | /* Do not restart, when we are in the idle loop */ |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e5e466b27598..092a2366b5a9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void) | |||
82 | } | 82 | } |
83 | 83 | ||
84 | /** | 84 | /** |
85 | * __get_realtime_clock_ts - Returns the time of day in a timespec | 85 | * getnstimeofday - Returns the time of day in a timespec |
86 | * @ts: pointer to the timespec to be set | 86 | * @ts: pointer to the timespec to be set |
87 | * | 87 | * |
88 | * Returns the time of day in a timespec. Used by | 88 | * Returns the time of day in a timespec. |
89 | * do_gettimeofday() and get_realtime_clock_ts(). | ||
90 | */ | 89 | */ |
91 | static inline void __get_realtime_clock_ts(struct timespec *ts) | 90 | void getnstimeofday(struct timespec *ts) |
92 | { | 91 | { |
93 | unsigned long seq; | 92 | unsigned long seq; |
94 | s64 nsecs; | 93 | s64 nsecs; |
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts) | |||
104 | timespec_add_ns(ts, nsecs); | 103 | timespec_add_ns(ts, nsecs); |
105 | } | 104 | } |
106 | 105 | ||
107 | /** | ||
108 | * getnstimeofday - Returns the time of day in a timespec | ||
109 | * @ts: pointer to the timespec to be set | ||
110 | * | ||
111 | * Returns the time of day in a timespec. | ||
112 | */ | ||
113 | void getnstimeofday(struct timespec *ts) | ||
114 | { | ||
115 | __get_realtime_clock_ts(ts); | ||
116 | } | ||
117 | |||
118 | EXPORT_SYMBOL(getnstimeofday); | 106 | EXPORT_SYMBOL(getnstimeofday); |
119 | 107 | ||
120 | /** | 108 | /** |
121 | * do_gettimeofday - Returns the time of day in a timeval | 109 | * do_gettimeofday - Returns the time of day in a timeval |
122 | * @tv: pointer to the timeval to be set | 110 | * @tv: pointer to the timeval to be set |
123 | * | 111 | * |
124 | * NOTE: Users should be converted to using get_realtime_clock_ts() | 112 | * NOTE: Users should be converted to using getnstimeofday() |
125 | */ | 113 | */ |
126 | void do_gettimeofday(struct timeval *tv) | 114 | void do_gettimeofday(struct timeval *tv) |
127 | { | 115 | { |
128 | struct timespec now; | 116 | struct timespec now; |
129 | 117 | ||
130 | __get_realtime_clock_ts(&now); | 118 | getnstimeofday(&now); |
131 | tv->tv_sec = now.tv_sec; | 119 | tv->tv_sec = now.tv_sec; |
132 | tv->tv_usec = now.tv_nsec/1000; | 120 | tv->tv_usec = now.tv_nsec/1000; |
133 | } | 121 | } |
@@ -198,7 +186,8 @@ static void change_clocksource(void) | |||
198 | 186 | ||
199 | clock->error = 0; | 187 | clock->error = 0; |
200 | clock->xtime_nsec = 0; | 188 | clock->xtime_nsec = 0; |
201 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | 189 | clocksource_calculate_interval(clock, |
190 | (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); | ||
202 | 191 | ||
203 | tick_clock_notify(); | 192 | tick_clock_notify(); |
204 | 193 | ||
@@ -255,7 +244,8 @@ void __init timekeeping_init(void) | |||
255 | ntp_clear(); | 244 | ntp_clear(); |
256 | 245 | ||
257 | clock = clocksource_get_next(); | 246 | clock = clocksource_get_next(); |
258 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | 247 | clocksource_calculate_interval(clock, |
248 | (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); | ||
259 | clock->cycle_last = clocksource_read(clock); | 249 | clock->cycle_last = clocksource_read(clock); |
260 | 250 | ||
261 | xtime.tv_sec = sec; | 251 | xtime.tv_sec = sec; |
@@ -335,9 +325,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
335 | 325 | ||
336 | /* sysfs resume/suspend bits for timekeeping */ | 326 | /* sysfs resume/suspend bits for timekeeping */ |
337 | static struct sysdev_class timekeeping_sysclass = { | 327 | static struct sysdev_class timekeeping_sysclass = { |
328 | .name = "timekeeping", | ||
338 | .resume = timekeeping_resume, | 329 | .resume = timekeeping_resume, |
339 | .suspend = timekeeping_suspend, | 330 | .suspend = timekeeping_suspend, |
340 | set_kset_name("timekeeping"), | ||
341 | }; | 331 | }; |
342 | 332 | ||
343 | static struct sys_device device_timer = { | 333 | static struct sys_device device_timer = { |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c36bb7ed0301..417da8c5bc72 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -26,7 +26,7 @@ | |||
26 | * the pid and cmdline from the owner process if applicable. | 26 | * the pid and cmdline from the owner process if applicable. |
27 | * | 27 | * |
28 | * Start/stop data collection: | 28 | * Start/stop data collection: |
29 | * # echo 1[0] >/proc/timer_stats | 29 | * # echo [1|0] >/proc/timer_stats |
30 | * | 30 | * |
31 | * Display the information collected so far: | 31 | * Display the information collected so far: |
32 | * # cat /proc/timer_stats | 32 | * # cat /proc/timer_stats |
diff --git a/kernel/timer.c b/kernel/timer.c index 66d7d8bca1a3..9fbb472b8cf0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64); | |||
58 | #define TVN_MASK (TVN_SIZE - 1) | 58 | #define TVN_MASK (TVN_SIZE - 1) |
59 | #define TVR_MASK (TVR_SIZE - 1) | 59 | #define TVR_MASK (TVR_SIZE - 1) |
60 | 60 | ||
61 | typedef struct tvec_s { | 61 | struct tvec { |
62 | struct list_head vec[TVN_SIZE]; | 62 | struct list_head vec[TVN_SIZE]; |
63 | } tvec_t; | 63 | }; |
64 | 64 | ||
65 | typedef struct tvec_root_s { | 65 | struct tvec_root { |
66 | struct list_head vec[TVR_SIZE]; | 66 | struct list_head vec[TVR_SIZE]; |
67 | } tvec_root_t; | 67 | }; |
68 | 68 | ||
69 | struct tvec_t_base_s { | 69 | struct tvec_base { |
70 | spinlock_t lock; | 70 | spinlock_t lock; |
71 | struct timer_list *running_timer; | 71 | struct timer_list *running_timer; |
72 | unsigned long timer_jiffies; | 72 | unsigned long timer_jiffies; |
73 | tvec_root_t tv1; | 73 | struct tvec_root tv1; |
74 | tvec_t tv2; | 74 | struct tvec tv2; |
75 | tvec_t tv3; | 75 | struct tvec tv3; |
76 | tvec_t tv4; | 76 | struct tvec tv4; |
77 | tvec_t tv5; | 77 | struct tvec tv5; |
78 | } ____cacheline_aligned; | 78 | } ____cacheline_aligned; |
79 | 79 | ||
80 | typedef struct tvec_t_base_s tvec_base_t; | 80 | struct tvec_base boot_tvec_bases; |
81 | |||
82 | tvec_base_t boot_tvec_bases; | ||
83 | EXPORT_SYMBOL(boot_tvec_bases); | 81 | EXPORT_SYMBOL(boot_tvec_bases); |
84 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | 82 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
85 | 83 | ||
86 | /* | 84 | /* |
87 | * Note that all tvec_bases is 2 byte aligned and lower bit of | 85 | * Note that all tvec_bases are 2 byte aligned and lower bit of |
88 | * base in timer_list is guaranteed to be zero. Use the LSB for | 86 | * base in timer_list is guaranteed to be zero. Use the LSB for |
89 | * the new flag to indicate whether the timer is deferrable | 87 | * the new flag to indicate whether the timer is deferrable |
90 | */ | 88 | */ |
91 | #define TBASE_DEFERRABLE_FLAG (0x1) | 89 | #define TBASE_DEFERRABLE_FLAG (0x1) |
92 | 90 | ||
93 | /* Functions below help us manage 'deferrable' flag */ | 91 | /* Functions below help us manage 'deferrable' flag */ |
94 | static inline unsigned int tbase_get_deferrable(tvec_base_t *base) | 92 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
95 | { | 93 | { |
96 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); | 94 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); |
97 | } | 95 | } |
98 | 96 | ||
99 | static inline tvec_base_t *tbase_get_base(tvec_base_t *base) | 97 | static inline struct tvec_base *tbase_get_base(struct tvec_base *base) |
100 | { | 98 | { |
101 | return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); | 99 | return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); |
102 | } | 100 | } |
103 | 101 | ||
104 | static inline void timer_set_deferrable(struct timer_list *timer) | 102 | static inline void timer_set_deferrable(struct timer_list *timer) |
105 | { | 103 | { |
106 | timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | | 104 | timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | |
107 | TBASE_DEFERRABLE_FLAG)); | 105 | TBASE_DEFERRABLE_FLAG)); |
108 | } | 106 | } |
109 | 107 | ||
110 | static inline void | 108 | static inline void |
111 | timer_set_base(struct timer_list *timer, tvec_base_t *new_base) | 109 | timer_set_base(struct timer_list *timer, struct tvec_base *new_base) |
112 | { | 110 | { |
113 | timer->base = (tvec_base_t *)((unsigned long)(new_base) | | 111 | timer->base = (struct tvec_base *)((unsigned long)(new_base) | |
114 | tbase_get_deferrable(timer->base)); | 112 | tbase_get_deferrable(timer->base)); |
115 | } | 113 | } |
116 | 114 | ||
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j) | |||
246 | EXPORT_SYMBOL_GPL(round_jiffies_relative); | 244 | EXPORT_SYMBOL_GPL(round_jiffies_relative); |
247 | 245 | ||
248 | 246 | ||
249 | static inline void set_running_timer(tvec_base_t *base, | 247 | static inline void set_running_timer(struct tvec_base *base, |
250 | struct timer_list *timer) | 248 | struct timer_list *timer) |
251 | { | 249 | { |
252 | #ifdef CONFIG_SMP | 250 | #ifdef CONFIG_SMP |
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base, | |||
254 | #endif | 252 | #endif |
255 | } | 253 | } |
256 | 254 | ||
257 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 255 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) |
258 | { | 256 | { |
259 | unsigned long expires = timer->expires; | 257 | unsigned long expires = timer->expires; |
260 | unsigned long idx = expires - base->timer_jiffies; | 258 | unsigned long idx = expires - base->timer_jiffies; |
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer, | |||
371 | * possible to set timer->base = NULL and drop the lock: the timer remains | 369 | * possible to set timer->base = NULL and drop the lock: the timer remains |
372 | * locked. | 370 | * locked. |
373 | */ | 371 | */ |
374 | static tvec_base_t *lock_timer_base(struct timer_list *timer, | 372 | static struct tvec_base *lock_timer_base(struct timer_list *timer, |
375 | unsigned long *flags) | 373 | unsigned long *flags) |
376 | __acquires(timer->base->lock) | 374 | __acquires(timer->base->lock) |
377 | { | 375 | { |
378 | tvec_base_t *base; | 376 | struct tvec_base *base; |
379 | 377 | ||
380 | for (;;) { | 378 | for (;;) { |
381 | tvec_base_t *prelock_base = timer->base; | 379 | struct tvec_base *prelock_base = timer->base; |
382 | base = tbase_get_base(prelock_base); | 380 | base = tbase_get_base(prelock_base); |
383 | if (likely(base != NULL)) { | 381 | if (likely(base != NULL)) { |
384 | spin_lock_irqsave(&base->lock, *flags); | 382 | spin_lock_irqsave(&base->lock, *flags); |
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, | |||
393 | 391 | ||
394 | int __mod_timer(struct timer_list *timer, unsigned long expires) | 392 | int __mod_timer(struct timer_list *timer, unsigned long expires) |
395 | { | 393 | { |
396 | tvec_base_t *base, *new_base; | 394 | struct tvec_base *base, *new_base; |
397 | unsigned long flags; | 395 | unsigned long flags; |
398 | int ret = 0; | 396 | int ret = 0; |
399 | 397 | ||
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer); | |||
445 | */ | 443 | */ |
446 | void add_timer_on(struct timer_list *timer, int cpu) | 444 | void add_timer_on(struct timer_list *timer, int cpu) |
447 | { | 445 | { |
448 | tvec_base_t *base = per_cpu(tvec_bases, cpu); | 446 | struct tvec_base *base = per_cpu(tvec_bases, cpu); |
449 | unsigned long flags; | 447 | unsigned long flags; |
450 | 448 | ||
451 | timer_stats_timer_set_start_info(timer); | 449 | timer_stats_timer_set_start_info(timer); |
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer); | |||
508 | */ | 506 | */ |
509 | int del_timer(struct timer_list *timer) | 507 | int del_timer(struct timer_list *timer) |
510 | { | 508 | { |
511 | tvec_base_t *base; | 509 | struct tvec_base *base; |
512 | unsigned long flags; | 510 | unsigned long flags; |
513 | int ret = 0; | 511 | int ret = 0; |
514 | 512 | ||
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer); | |||
539 | */ | 537 | */ |
540 | int try_to_del_timer_sync(struct timer_list *timer) | 538 | int try_to_del_timer_sync(struct timer_list *timer) |
541 | { | 539 | { |
542 | tvec_base_t *base; | 540 | struct tvec_base *base; |
543 | unsigned long flags; | 541 | unsigned long flags; |
544 | int ret = -1; | 542 | int ret = -1; |
545 | 543 | ||
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer) | |||
591 | EXPORT_SYMBOL(del_timer_sync); | 589 | EXPORT_SYMBOL(del_timer_sync); |
592 | #endif | 590 | #endif |
593 | 591 | ||
594 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 592 | static int cascade(struct tvec_base *base, struct tvec *tv, int index) |
595 | { | 593 | { |
596 | /* cascade all the timers from tv up one level */ | 594 | /* cascade all the timers from tv up one level */ |
597 | struct timer_list *timer, *tmp; | 595 | struct timer_list *timer, *tmp; |
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) | |||
620 | * This function cascades all vectors and executes all expired timer | 618 | * This function cascades all vectors and executes all expired timer |
621 | * vectors. | 619 | * vectors. |
622 | */ | 620 | */ |
623 | static inline void __run_timers(tvec_base_t *base) | 621 | static inline void __run_timers(struct tvec_base *base) |
624 | { | 622 | { |
625 | struct timer_list *timer; | 623 | struct timer_list *timer; |
626 | 624 | ||
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
657 | int preempt_count = preempt_count(); | 655 | int preempt_count = preempt_count(); |
658 | fn(data); | 656 | fn(data); |
659 | if (preempt_count != preempt_count()) { | 657 | if (preempt_count != preempt_count()) { |
660 | printk(KERN_WARNING "huh, entered %p " | 658 | printk(KERN_ERR "huh, entered %p " |
661 | "with preempt_count %08x, exited" | 659 | "with preempt_count %08x, exited" |
662 | " with %08x?\n", | 660 | " with %08x?\n", |
663 | fn, preempt_count, | 661 | fn, preempt_count, |
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base) | |||
678 | * is used on S/390 to stop all activity when a cpus is idle. | 676 | * is used on S/390 to stop all activity when a cpus is idle. |
679 | * This functions needs to be called disabled. | 677 | * This functions needs to be called disabled. |
680 | */ | 678 | */ |
681 | static unsigned long __next_timer_interrupt(tvec_base_t *base) | 679 | static unsigned long __next_timer_interrupt(struct tvec_base *base) |
682 | { | 680 | { |
683 | unsigned long timer_jiffies = base->timer_jiffies; | 681 | unsigned long timer_jiffies = base->timer_jiffies; |
684 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; | 682 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; |
685 | int index, slot, array, found = 0; | 683 | int index, slot, array, found = 0; |
686 | struct timer_list *nte; | 684 | struct timer_list *nte; |
687 | tvec_t *varray[4]; | 685 | struct tvec *varray[4]; |
688 | 686 | ||
689 | /* Look for timer events in tv1. */ | 687 | /* Look for timer events in tv1. */ |
690 | index = slot = timer_jiffies & TVR_MASK; | 688 | index = slot = timer_jiffies & TVR_MASK; |
@@ -716,7 +714,7 @@ cascade: | |||
716 | varray[3] = &base->tv5; | 714 | varray[3] = &base->tv5; |
717 | 715 | ||
718 | for (array = 0; array < 4; array++) { | 716 | for (array = 0; array < 4; array++) { |
719 | tvec_t *varp = varray[array]; | 717 | struct tvec *varp = varray[array]; |
720 | 718 | ||
721 | index = slot = timer_jiffies & TVN_MASK; | 719 | index = slot = timer_jiffies & TVN_MASK; |
722 | do { | 720 | do { |
@@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
795 | */ | 793 | */ |
796 | unsigned long get_next_timer_interrupt(unsigned long now) | 794 | unsigned long get_next_timer_interrupt(unsigned long now) |
797 | { | 795 | { |
798 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 796 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
799 | unsigned long expires; | 797 | unsigned long expires; |
800 | 798 | ||
801 | spin_lock(&base->lock); | 799 | spin_lock(&base->lock); |
@@ -894,9 +892,9 @@ static inline void calc_load(unsigned long ticks) | |||
894 | */ | 892 | */ |
895 | static void run_timer_softirq(struct softirq_action *h) | 893 | static void run_timer_softirq(struct softirq_action *h) |
896 | { | 894 | { |
897 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 895 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
898 | 896 | ||
899 | hrtimer_run_queues(); | 897 | hrtimer_run_pending(); |
900 | 898 | ||
901 | if (time_after_eq(jiffies, base->timer_jiffies)) | 899 | if (time_after_eq(jiffies, base->timer_jiffies)) |
902 | __run_timers(base); | 900 | __run_timers(base); |
@@ -907,6 +905,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
907 | */ | 905 | */ |
908 | void run_local_timers(void) | 906 | void run_local_timers(void) |
909 | { | 907 | { |
908 | hrtimer_run_queues(); | ||
910 | raise_softirq(TIMER_SOFTIRQ); | 909 | raise_softirq(TIMER_SOFTIRQ); |
911 | softlockup_tick(); | 910 | softlockup_tick(); |
912 | } | 911 | } |
@@ -978,7 +977,7 @@ asmlinkage long sys_getppid(void) | |||
978 | int pid; | 977 | int pid; |
979 | 978 | ||
980 | rcu_read_lock(); | 979 | rcu_read_lock(); |
981 | pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns); | 980 | pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns); |
982 | rcu_read_unlock(); | 981 | rcu_read_unlock(); |
983 | 982 | ||
984 | return pid; | 983 | return pid; |
@@ -1226,11 +1225,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
1226 | */ | 1225 | */ |
1227 | static struct lock_class_key base_lock_keys[NR_CPUS]; | 1226 | static struct lock_class_key base_lock_keys[NR_CPUS]; |
1228 | 1227 | ||
1229 | static int __devinit init_timers_cpu(int cpu) | 1228 | static int __cpuinit init_timers_cpu(int cpu) |
1230 | { | 1229 | { |
1231 | int j; | 1230 | int j; |
1232 | tvec_base_t *base; | 1231 | struct tvec_base *base; |
1233 | static char __devinitdata tvec_base_done[NR_CPUS]; | 1232 | static char __cpuinitdata tvec_base_done[NR_CPUS]; |
1234 | 1233 | ||
1235 | if (!tvec_base_done[cpu]) { | 1234 | if (!tvec_base_done[cpu]) { |
1236 | static char boot_done; | 1235 | static char boot_done; |
@@ -1284,7 +1283,7 @@ static int __devinit init_timers_cpu(int cpu) | |||
1284 | } | 1283 | } |
1285 | 1284 | ||
1286 | #ifdef CONFIG_HOTPLUG_CPU | 1285 | #ifdef CONFIG_HOTPLUG_CPU |
1287 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | 1286 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) |
1288 | { | 1287 | { |
1289 | struct timer_list *timer; | 1288 | struct timer_list *timer; |
1290 | 1289 | ||
@@ -1296,10 +1295,10 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | |||
1296 | } | 1295 | } |
1297 | } | 1296 | } |
1298 | 1297 | ||
1299 | static void __devinit migrate_timers(int cpu) | 1298 | static void __cpuinit migrate_timers(int cpu) |
1300 | { | 1299 | { |
1301 | tvec_base_t *old_base; | 1300 | struct tvec_base *old_base; |
1302 | tvec_base_t *new_base; | 1301 | struct tvec_base *new_base; |
1303 | int i; | 1302 | int i; |
1304 | 1303 | ||
1305 | BUG_ON(cpu_online(cpu)); | 1304 | BUG_ON(cpu_online(cpu)); |
diff --git a/kernel/user.c b/kernel/user.c index 8320a87f3e5a..bc1c48d35cb3 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { } | |||
115 | 115 | ||
116 | #if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) | 116 | #if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) |
117 | 117 | ||
118 | static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ | 118 | static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ |
119 | static DEFINE_MUTEX(uids_mutex); | 119 | static DEFINE_MUTEX(uids_mutex); |
120 | 120 | ||
121 | static inline void uids_mutex_lock(void) | 121 | static inline void uids_mutex_lock(void) |
@@ -128,86 +128,83 @@ static inline void uids_mutex_unlock(void) | |||
128 | mutex_unlock(&uids_mutex); | 128 | mutex_unlock(&uids_mutex); |
129 | } | 129 | } |
130 | 130 | ||
131 | /* return cpu shares held by the user */ | 131 | /* uid directory attributes */ |
132 | static ssize_t cpu_shares_show(struct kset *kset, char *buffer) | 132 | static ssize_t cpu_shares_show(struct kobject *kobj, |
133 | struct kobj_attribute *attr, | ||
134 | char *buf) | ||
133 | { | 135 | { |
134 | struct user_struct *up = container_of(kset, struct user_struct, kset); | 136 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); |
135 | 137 | ||
136 | return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); | 138 | return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); |
137 | } | 139 | } |
138 | 140 | ||
139 | /* modify cpu shares held by the user */ | 141 | static ssize_t cpu_shares_store(struct kobject *kobj, |
140 | static ssize_t cpu_shares_store(struct kset *kset, const char *buffer, | 142 | struct kobj_attribute *attr, |
141 | size_t size) | 143 | const char *buf, size_t size) |
142 | { | 144 | { |
143 | struct user_struct *up = container_of(kset, struct user_struct, kset); | 145 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); |
144 | unsigned long shares; | 146 | unsigned long shares; |
145 | int rc; | 147 | int rc; |
146 | 148 | ||
147 | sscanf(buffer, "%lu", &shares); | 149 | sscanf(buf, "%lu", &shares); |
148 | 150 | ||
149 | rc = sched_group_set_shares(up->tg, shares); | 151 | rc = sched_group_set_shares(up->tg, shares); |
150 | 152 | ||
151 | return (rc ? rc : size); | 153 | return (rc ? rc : size); |
152 | } | 154 | } |
153 | 155 | ||
154 | static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) | 156 | static struct kobj_attribute cpu_share_attr = |
157 | __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); | ||
158 | |||
159 | /* default attributes per uid directory */ | ||
160 | static struct attribute *uids_attributes[] = { | ||
161 | &cpu_share_attr.attr, | ||
162 | NULL | ||
163 | }; | ||
164 | |||
165 | /* the lifetime of user_struct is not managed by the core (now) */ | ||
166 | static void uids_release(struct kobject *kobj) | ||
155 | { | 167 | { |
156 | sa->attr.name = name; | 168 | return; |
157 | sa->attr.mode = mode; | ||
158 | sa->show = cpu_shares_show; | ||
159 | sa->store = cpu_shares_store; | ||
160 | } | 169 | } |
161 | 170 | ||
162 | /* Create "/sys/kernel/uids/<uid>" directory and | 171 | static struct kobj_type uids_ktype = { |
163 | * "/sys/kernel/uids/<uid>/cpu_share" file for this user. | 172 | .sysfs_ops = &kobj_sysfs_ops, |
164 | */ | 173 | .default_attrs = uids_attributes, |
165 | static int user_kobject_create(struct user_struct *up) | 174 | .release = uids_release, |
175 | }; | ||
176 | |||
177 | /* create /sys/kernel/uids/<uid>/cpu_share file for this user */ | ||
178 | static int uids_user_create(struct user_struct *up) | ||
166 | { | 179 | { |
167 | struct kset *kset = &up->kset; | 180 | struct kobject *kobj = &up->kobj; |
168 | struct kobject *kobj = &kset->kobj; | ||
169 | int error; | 181 | int error; |
170 | 182 | ||
171 | memset(kset, 0, sizeof(struct kset)); | 183 | memset(kobj, 0, sizeof(struct kobject)); |
172 | kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ | 184 | kobj->kset = uids_kset; |
173 | kobject_set_name(kobj, "%d", up->uid); | 185 | error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); |
174 | kset_init(kset); | 186 | if (error) { |
175 | user_attr_init(&up->user_attr, "cpu_share", 0644); | 187 | kobject_put(kobj); |
176 | |||
177 | error = kobject_add(kobj); | ||
178 | if (error) | ||
179 | goto done; | 188 | goto done; |
180 | 189 | } | |
181 | error = sysfs_create_file(kobj, &up->user_attr.attr); | ||
182 | if (error) | ||
183 | kobject_del(kobj); | ||
184 | 190 | ||
185 | kobject_uevent(kobj, KOBJ_ADD); | 191 | kobject_uevent(kobj, KOBJ_ADD); |
186 | |||
187 | done: | 192 | done: |
188 | return error; | 193 | return error; |
189 | } | 194 | } |
190 | 195 | ||
191 | /* create these in sysfs filesystem: | 196 | /* create these entries in sysfs: |
192 | * "/sys/kernel/uids" directory | 197 | * "/sys/kernel/uids" directory |
193 | * "/sys/kernel/uids/0" directory (for root user) | 198 | * "/sys/kernel/uids/0" directory (for root user) |
194 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | 199 | * "/sys/kernel/uids/0/cpu_share" file (for root user) |
195 | */ | 200 | */ |
196 | int __init uids_kobject_init(void) | 201 | int __init uids_sysfs_init(void) |
197 | { | 202 | { |
198 | int error; | 203 | uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); |
199 | 204 | if (!uids_kset) | |
200 | /* create under /sys/kernel dir */ | 205 | return -ENOMEM; |
201 | uids_kobject.parent = &kernel_subsys.kobj; | ||
202 | uids_kobject.kset = &kernel_subsys; | ||
203 | kobject_set_name(&uids_kobject, "uids"); | ||
204 | kobject_init(&uids_kobject); | ||
205 | 206 | ||
206 | error = kobject_add(&uids_kobject); | 207 | return uids_user_create(&root_user); |
207 | if (!error) | ||
208 | error = user_kobject_create(&root_user); | ||
209 | |||
210 | return error; | ||
211 | } | 208 | } |
212 | 209 | ||
213 | /* work function to remove sysfs directory for a user and free up | 210 | /* work function to remove sysfs directory for a user and free up |
@@ -216,7 +213,6 @@ int __init uids_kobject_init(void) | |||
216 | static void remove_user_sysfs_dir(struct work_struct *w) | 213 | static void remove_user_sysfs_dir(struct work_struct *w) |
217 | { | 214 | { |
218 | struct user_struct *up = container_of(w, struct user_struct, work); | 215 | struct user_struct *up = container_of(w, struct user_struct, work); |
219 | struct kobject *kobj = &up->kset.kobj; | ||
220 | unsigned long flags; | 216 | unsigned long flags; |
221 | int remove_user = 0; | 217 | int remove_user = 0; |
222 | 218 | ||
@@ -238,9 +234,9 @@ static void remove_user_sysfs_dir(struct work_struct *w) | |||
238 | if (!remove_user) | 234 | if (!remove_user) |
239 | goto done; | 235 | goto done; |
240 | 236 | ||
241 | sysfs_remove_file(kobj, &up->user_attr.attr); | 237 | kobject_uevent(&up->kobj, KOBJ_REMOVE); |
242 | kobject_uevent(kobj, KOBJ_REMOVE); | 238 | kobject_del(&up->kobj); |
243 | kobject_del(kobj); | 239 | kobject_put(&up->kobj); |
244 | 240 | ||
245 | sched_destroy_user(up); | 241 | sched_destroy_user(up); |
246 | key_put(up->uid_keyring); | 242 | key_put(up->uid_keyring); |
@@ -267,7 +263,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags) | |||
267 | 263 | ||
268 | #else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ | 264 | #else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ |
269 | 265 | ||
270 | static inline int user_kobject_create(struct user_struct *up) { return 0; } | 266 | int uids_sysfs_init(void) { return 0; } |
267 | static inline int uids_user_create(struct user_struct *up) { return 0; } | ||
271 | static inline void uids_mutex_lock(void) { } | 268 | static inline void uids_mutex_lock(void) { } |
272 | static inline void uids_mutex_unlock(void) { } | 269 | static inline void uids_mutex_unlock(void) { } |
273 | 270 | ||
@@ -322,9 +319,9 @@ void free_uid(struct user_struct *up) | |||
322 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 319 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
323 | { | 320 | { |
324 | struct hlist_head *hashent = uidhashentry(ns, uid); | 321 | struct hlist_head *hashent = uidhashentry(ns, uid); |
325 | struct user_struct *up; | 322 | struct user_struct *up, *new; |
326 | 323 | ||
327 | /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() | 324 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() |
328 | * atomic. | 325 | * atomic. |
329 | */ | 326 | */ |
330 | uids_mutex_lock(); | 327 | uids_mutex_lock(); |
@@ -334,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
334 | spin_unlock_irq(&uidhash_lock); | 331 | spin_unlock_irq(&uidhash_lock); |
335 | 332 | ||
336 | if (!up) { | 333 | if (!up) { |
337 | struct user_struct *new; | ||
338 | |||
339 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); | 334 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); |
340 | if (!new) { | 335 | if (!new) |
341 | uids_mutex_unlock(); | 336 | goto out_unlock; |
342 | return NULL; | ||
343 | } | ||
344 | 337 | ||
345 | new->uid = uid; | 338 | new->uid = uid; |
346 | atomic_set(&new->__count, 1); | 339 | atomic_set(&new->__count, 1); |
@@ -356,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
356 | #endif | 349 | #endif |
357 | new->locked_shm = 0; | 350 | new->locked_shm = 0; |
358 | 351 | ||
359 | if (alloc_uid_keyring(new, current) < 0) { | 352 | if (alloc_uid_keyring(new, current) < 0) |
360 | kmem_cache_free(uid_cachep, new); | 353 | goto out_free_user; |
361 | uids_mutex_unlock(); | ||
362 | return NULL; | ||
363 | } | ||
364 | 354 | ||
365 | if (sched_create_user(new) < 0) { | 355 | if (sched_create_user(new) < 0) |
366 | key_put(new->uid_keyring); | 356 | goto out_put_keys; |
367 | key_put(new->session_keyring); | ||
368 | kmem_cache_free(uid_cachep, new); | ||
369 | uids_mutex_unlock(); | ||
370 | return NULL; | ||
371 | } | ||
372 | 357 | ||
373 | if (user_kobject_create(new)) { | 358 | if (uids_user_create(new)) |
374 | sched_destroy_user(new); | 359 | goto out_destoy_sched; |
375 | key_put(new->uid_keyring); | ||
376 | key_put(new->session_keyring); | ||
377 | kmem_cache_free(uid_cachep, new); | ||
378 | uids_mutex_unlock(); | ||
379 | return NULL; | ||
380 | } | ||
381 | 360 | ||
382 | /* | 361 | /* |
383 | * Before adding this, check whether we raced | 362 | * Before adding this, check whether we raced |
@@ -405,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
405 | uids_mutex_unlock(); | 384 | uids_mutex_unlock(); |
406 | 385 | ||
407 | return up; | 386 | return up; |
387 | |||
388 | out_destoy_sched: | ||
389 | sched_destroy_user(new); | ||
390 | out_put_keys: | ||
391 | key_put(new->uid_keyring); | ||
392 | key_put(new->session_keyring); | ||
393 | out_free_user: | ||
394 | kmem_cache_free(uid_cachep, new); | ||
395 | out_unlock: | ||
396 | uids_mutex_unlock(); | ||
397 | return NULL; | ||
408 | } | 398 | } |
409 | 399 | ||
410 | void switch_uid(struct user_struct *new_user) | 400 | void switch_uid(struct user_struct *new_user) |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 52d5e7c9a8e6..52db48e7f6e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -67,9 +67,8 @@ struct workqueue_struct { | |||
67 | #endif | 67 | #endif |
68 | }; | 68 | }; |
69 | 69 | ||
70 | /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove | 70 | /* Serializes the accesses to the list of workqueues. */ |
71 | threads to each one as cpus come/go. */ | 71 | static DEFINE_SPINLOCK(workqueue_lock); |
72 | static DEFINE_MUTEX(workqueue_mutex); | ||
73 | static LIST_HEAD(workqueues); | 72 | static LIST_HEAD(workqueues); |
74 | 73 | ||
75 | static int singlethread_cpu __read_mostly; | 74 | static int singlethread_cpu __read_mostly; |
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
592 | * Returns zero on success. | 591 | * Returns zero on success. |
593 | * Returns -ve errno on failure. | 592 | * Returns -ve errno on failure. |
594 | * | 593 | * |
595 | * Appears to be racy against CPU hotplug. | ||
596 | * | ||
597 | * schedule_on_each_cpu() is very slow. | 594 | * schedule_on_each_cpu() is very slow. |
598 | */ | 595 | */ |
599 | int schedule_on_each_cpu(work_func_t func) | 596 | int schedule_on_each_cpu(work_func_t func) |
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func) | |||
605 | if (!works) | 602 | if (!works) |
606 | return -ENOMEM; | 603 | return -ENOMEM; |
607 | 604 | ||
608 | preempt_disable(); /* CPU hotplug */ | 605 | get_online_cpus(); |
609 | for_each_online_cpu(cpu) { | 606 | for_each_online_cpu(cpu) { |
610 | struct work_struct *work = per_cpu_ptr(works, cpu); | 607 | struct work_struct *work = per_cpu_ptr(works, cpu); |
611 | 608 | ||
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func) | |||
613 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); | 610 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); |
614 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); | 611 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); |
615 | } | 612 | } |
616 | preempt_enable(); | ||
617 | flush_workqueue(keventd_wq); | 613 | flush_workqueue(keventd_wq); |
614 | put_online_cpus(); | ||
618 | free_percpu(works); | 615 | free_percpu(works); |
619 | return 0; | 616 | return 0; |
620 | } | 617 | } |
@@ -722,7 +719,8 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
722 | struct workqueue_struct *__create_workqueue_key(const char *name, | 719 | struct workqueue_struct *__create_workqueue_key(const char *name, |
723 | int singlethread, | 720 | int singlethread, |
724 | int freezeable, | 721 | int freezeable, |
725 | struct lock_class_key *key) | 722 | struct lock_class_key *key, |
723 | const char *lock_name) | ||
726 | { | 724 | { |
727 | struct workqueue_struct *wq; | 725 | struct workqueue_struct *wq; |
728 | struct cpu_workqueue_struct *cwq; | 726 | struct cpu_workqueue_struct *cwq; |
@@ -739,7 +737,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
739 | } | 737 | } |
740 | 738 | ||
741 | wq->name = name; | 739 | wq->name = name; |
742 | lockdep_init_map(&wq->lockdep_map, name, key, 0); | 740 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
743 | wq->singlethread = singlethread; | 741 | wq->singlethread = singlethread; |
744 | wq->freezeable = freezeable; | 742 | wq->freezeable = freezeable; |
745 | INIT_LIST_HEAD(&wq->list); | 743 | INIT_LIST_HEAD(&wq->list); |
@@ -749,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
749 | err = create_workqueue_thread(cwq, singlethread_cpu); | 747 | err = create_workqueue_thread(cwq, singlethread_cpu); |
750 | start_workqueue_thread(cwq, -1); | 748 | start_workqueue_thread(cwq, -1); |
751 | } else { | 749 | } else { |
752 | mutex_lock(&workqueue_mutex); | 750 | get_online_cpus(); |
751 | spin_lock(&workqueue_lock); | ||
753 | list_add(&wq->list, &workqueues); | 752 | list_add(&wq->list, &workqueues); |
753 | spin_unlock(&workqueue_lock); | ||
754 | 754 | ||
755 | for_each_possible_cpu(cpu) { | 755 | for_each_possible_cpu(cpu) { |
756 | cwq = init_cpu_workqueue(wq, cpu); | 756 | cwq = init_cpu_workqueue(wq, cpu); |
@@ -759,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
759 | err = create_workqueue_thread(cwq, cpu); | 759 | err = create_workqueue_thread(cwq, cpu); |
760 | start_workqueue_thread(cwq, cpu); | 760 | start_workqueue_thread(cwq, cpu); |
761 | } | 761 | } |
762 | mutex_unlock(&workqueue_mutex); | 762 | put_online_cpus(); |
763 | } | 763 | } |
764 | 764 | ||
765 | if (err) { | 765 | if (err) { |
@@ -774,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
774 | { | 774 | { |
775 | /* | 775 | /* |
776 | * Our caller is either destroy_workqueue() or CPU_DEAD, | 776 | * Our caller is either destroy_workqueue() or CPU_DEAD, |
777 | * workqueue_mutex protects cwq->thread | 777 | * get_online_cpus() protects cwq->thread. |
778 | */ | 778 | */ |
779 | if (cwq->thread == NULL) | 779 | if (cwq->thread == NULL) |
780 | return; | 780 | return; |
@@ -809,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
809 | struct cpu_workqueue_struct *cwq; | 809 | struct cpu_workqueue_struct *cwq; |
810 | int cpu; | 810 | int cpu; |
811 | 811 | ||
812 | mutex_lock(&workqueue_mutex); | 812 | get_online_cpus(); |
813 | spin_lock(&workqueue_lock); | ||
813 | list_del(&wq->list); | 814 | list_del(&wq->list); |
814 | mutex_unlock(&workqueue_mutex); | 815 | spin_unlock(&workqueue_lock); |
816 | put_online_cpus(); | ||
815 | 817 | ||
816 | for_each_cpu_mask(cpu, *cpu_map) { | 818 | for_each_cpu_mask(cpu, *cpu_map) { |
817 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 819 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
@@ -834,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
834 | action &= ~CPU_TASKS_FROZEN; | 836 | action &= ~CPU_TASKS_FROZEN; |
835 | 837 | ||
836 | switch (action) { | 838 | switch (action) { |
837 | case CPU_LOCK_ACQUIRE: | ||
838 | mutex_lock(&workqueue_mutex); | ||
839 | return NOTIFY_OK; | ||
840 | |||
841 | case CPU_LOCK_RELEASE: | ||
842 | mutex_unlock(&workqueue_mutex); | ||
843 | return NOTIFY_OK; | ||
844 | 839 | ||
845 | case CPU_UP_PREPARE: | 840 | case CPU_UP_PREPARE: |
846 | cpu_set(cpu, cpu_populated_map); | 841 | cpu_set(cpu, cpu_populated_map); |
@@ -853,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
853 | case CPU_UP_PREPARE: | 848 | case CPU_UP_PREPARE: |
854 | if (!create_workqueue_thread(cwq, cpu)) | 849 | if (!create_workqueue_thread(cwq, cpu)) |
855 | break; | 850 | break; |
856 | printk(KERN_ERR "workqueue for %i failed\n", cpu); | 851 | printk(KERN_ERR "workqueue [%s] for %i failed\n", |
852 | wq->name, cpu); | ||
857 | return NOTIFY_BAD; | 853 | return NOTIFY_BAD; |
858 | 854 | ||
859 | case CPU_ONLINE: | 855 | case CPU_ONLINE: |