diff options
Diffstat (limited to 'kernel')
57 files changed, 6354 insertions, 1833 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 4af15802ccd4..526128a2e622 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz | |||
@@ -54,3 +54,5 @@ config HZ | |||
54 | default 300 if HZ_300 | 54 | default 300 if HZ_300 |
55 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
56 | 56 | ||
57 | config SCHED_HRTICK | ||
58 | def_bool HIGH_RES_TIMERS && X86 | ||
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c64ce9c14207..0669b70fa6a3 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt | |||
@@ -52,14 +52,13 @@ config PREEMPT | |||
52 | 52 | ||
53 | endchoice | 53 | endchoice |
54 | 54 | ||
55 | config PREEMPT_BKL | 55 | config RCU_TRACE |
56 | bool "Preempt The Big Kernel Lock" | 56 | bool "Enable tracing for RCU - currently stats in debugfs" |
57 | depends on SMP || PREEMPT | 57 | select DEBUG_FS |
58 | default y | 58 | default y |
59 | help | 59 | help |
60 | This option reduces the latency of the kernel by making the | 60 | This option provides tracing in RCU which presents stats |
61 | big kernel lock preemptible. | 61 | in debugfs for debugging RCU implementation. |
62 | 62 | ||
63 | Say Y here if you are building a kernel for a desktop system. | 63 | Say Y here if you want to enable RCU tracing |
64 | Say N if you are unsure. | 64 | Say N if you are unsure. |
65 | |||
diff --git a/kernel/Makefile b/kernel/Makefile index dfa96956dae0..8885627ea021 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o | |||
36 | obj-$(CONFIG_PM) += power/ | 36 | obj-$(CONFIG_PM) += power/ |
37 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 37 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
38 | obj-$(CONFIG_KEXEC) += kexec.o | 38 | obj-$(CONFIG_KEXEC) += kexec.o |
39 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o | ||
39 | obj-$(CONFIG_COMPAT) += compat.o | 40 | obj-$(CONFIG_COMPAT) += compat.o |
40 | obj-$(CONFIG_CGROUPS) += cgroup.o | 41 | obj-$(CONFIG_CGROUPS) += cgroup.o |
41 | obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o | 42 | obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o |
@@ -43,6 +44,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o | |||
43 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o | 44 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o |
44 | obj-$(CONFIG_IKCONFIG) += configs.o | 45 | obj-$(CONFIG_IKCONFIG) += configs.o |
45 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 46 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
47 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | ||
46 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 48 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
47 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 49 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
48 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | 50 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o |
@@ -52,11 +54,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |||
52 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 54 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
53 | obj-$(CONFIG_SECCOMP) += seccomp.o | 55 | obj-$(CONFIG_SECCOMP) += seccomp.o |
54 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 56 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
57 | obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o | ||
58 | obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o | ||
59 | ifeq ($(CONFIG_PREEMPT_RCU),y) | ||
60 | obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o | ||
61 | endif | ||
55 | obj-$(CONFIG_RELAY) += relay.o | 62 | obj-$(CONFIG_RELAY) += relay.o |
56 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 63 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
57 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 64 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
58 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 65 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
59 | obj-$(CONFIG_MARKERS) += marker.o | 66 | obj-$(CONFIG_MARKERS) += marker.o |
67 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | ||
60 | 68 | ||
61 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 69 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
62 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 70 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c new file mode 100644 index 000000000000..d1a7605c5b8f --- /dev/null +++ b/kernel/backtracetest.c | |||
@@ -0,0 +1,48 @@ | |||
1 | /* | ||
2 | * Simple stack backtrace regression test module | ||
3 | * | ||
4 | * (C) Copyright 2008 Intel Corporation | ||
5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/delay.h> | ||
16 | |||
17 | static struct timer_list backtrace_timer; | ||
18 | |||
19 | static void backtrace_test_timer(unsigned long data) | ||
20 | { | ||
21 | printk("Testing a backtrace from irq context.\n"); | ||
22 | printk("The following trace is a kernel self test and not a bug!\n"); | ||
23 | dump_stack(); | ||
24 | } | ||
25 | static int backtrace_regression_test(void) | ||
26 | { | ||
27 | printk("====[ backtrace testing ]===========\n"); | ||
28 | printk("Testing a backtrace from process context.\n"); | ||
29 | printk("The following trace is a kernel self test and not a bug!\n"); | ||
30 | dump_stack(); | ||
31 | |||
32 | init_timer(&backtrace_timer); | ||
33 | backtrace_timer.function = backtrace_test_timer; | ||
34 | mod_timer(&backtrace_timer, jiffies + 10); | ||
35 | |||
36 | msleep(10); | ||
37 | printk("====[ end of backtrace testing ]====\n"); | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | static void exitf(void) | ||
42 | { | ||
43 | } | ||
44 | |||
45 | module_init(backtrace_regression_test); | ||
46 | module_exit(exitf); | ||
47 | MODULE_LICENSE("GPL"); | ||
48 | MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index 6b3a0c15144f..e0d3a4f56ecb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -15,9 +15,8 @@ | |||
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* Serializes the updates to cpu_online_map, cpu_present_map */ |
19 | static DEFINE_MUTEX(cpu_add_remove_lock); | 19 | static DEFINE_MUTEX(cpu_add_remove_lock); |
20 | static DEFINE_MUTEX(cpu_bitmask_lock); | ||
21 | 20 | ||
22 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); |
23 | 22 | ||
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | |||
26 | */ | 25 | */ |
27 | static int cpu_hotplug_disabled; | 26 | static int cpu_hotplug_disabled; |
28 | 27 | ||
29 | #ifdef CONFIG_HOTPLUG_CPU | 28 | static struct { |
29 | struct task_struct *active_writer; | ||
30 | struct mutex lock; /* Synchronizes accesses to refcount, */ | ||
31 | /* | ||
32 | * Also blocks the new readers during | ||
33 | * an ongoing cpu hotplug operation. | ||
34 | */ | ||
35 | int refcount; | ||
36 | wait_queue_head_t writer_queue; | ||
37 | } cpu_hotplug; | ||
30 | 38 | ||
31 | /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ | 39 | #define writer_exists() (cpu_hotplug.active_writer != NULL) |
32 | static struct task_struct *recursive; | ||
33 | static int recursive_depth; | ||
34 | 40 | ||
35 | void lock_cpu_hotplug(void) | 41 | void __init cpu_hotplug_init(void) |
36 | { | 42 | { |
37 | struct task_struct *tsk = current; | 43 | cpu_hotplug.active_writer = NULL; |
38 | 44 | mutex_init(&cpu_hotplug.lock); | |
39 | if (tsk == recursive) { | 45 | cpu_hotplug.refcount = 0; |
40 | static int warnings = 10; | 46 | init_waitqueue_head(&cpu_hotplug.writer_queue); |
41 | if (warnings) { | 47 | } |
42 | printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); | 48 | |
43 | WARN_ON(1); | 49 | #ifdef CONFIG_HOTPLUG_CPU |
44 | warnings--; | 50 | |
45 | } | 51 | void get_online_cpus(void) |
46 | recursive_depth++; | 52 | { |
53 | might_sleep(); | ||
54 | if (cpu_hotplug.active_writer == current) | ||
47 | return; | 55 | return; |
48 | } | 56 | mutex_lock(&cpu_hotplug.lock); |
49 | mutex_lock(&cpu_bitmask_lock); | 57 | cpu_hotplug.refcount++; |
50 | recursive = tsk; | 58 | mutex_unlock(&cpu_hotplug.lock); |
59 | |||
51 | } | 60 | } |
52 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); | 61 | EXPORT_SYMBOL_GPL(get_online_cpus); |
53 | 62 | ||
54 | void unlock_cpu_hotplug(void) | 63 | void put_online_cpus(void) |
55 | { | 64 | { |
56 | WARN_ON(recursive != current); | 65 | if (cpu_hotplug.active_writer == current) |
57 | if (recursive_depth) { | ||
58 | recursive_depth--; | ||
59 | return; | 66 | return; |
60 | } | 67 | mutex_lock(&cpu_hotplug.lock); |
61 | recursive = NULL; | 68 | cpu_hotplug.refcount--; |
62 | mutex_unlock(&cpu_bitmask_lock); | 69 | |
70 | if (unlikely(writer_exists()) && !cpu_hotplug.refcount) | ||
71 | wake_up(&cpu_hotplug.writer_queue); | ||
72 | |||
73 | mutex_unlock(&cpu_hotplug.lock); | ||
74 | |||
63 | } | 75 | } |
64 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 76 | EXPORT_SYMBOL_GPL(put_online_cpus); |
65 | 77 | ||
66 | #endif /* CONFIG_HOTPLUG_CPU */ | 78 | #endif /* CONFIG_HOTPLUG_CPU */ |
67 | 79 | ||
80 | /* | ||
81 | * The following two APIs must be used when attempting | ||
82 | * to serialize the updates to cpu_online_map, cpu_present_map. | ||
83 | */ | ||
84 | void cpu_maps_update_begin(void) | ||
85 | { | ||
86 | mutex_lock(&cpu_add_remove_lock); | ||
87 | } | ||
88 | |||
89 | void cpu_maps_update_done(void) | ||
90 | { | ||
91 | mutex_unlock(&cpu_add_remove_lock); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * This ensures that the hotplug operation can begin only when the | ||
96 | * refcount goes to zero. | ||
97 | * | ||
98 | * Note that during a cpu-hotplug operation, the new readers, if any, | ||
99 | * will be blocked by the cpu_hotplug.lock | ||
100 | * | ||
101 | * Since cpu_hotplug_begin is always called after invoking | ||
102 | * cpu_maps_update_begin, we can be sure that only one writer is active. | ||
103 | * | ||
104 | * Note that theoretically, there is a possibility of a livelock: | ||
105 | * - Refcount goes to zero, last reader wakes up the sleeping | ||
106 | * writer. | ||
107 | * - Last reader unlocks the cpu_hotplug.lock. | ||
108 | * - A new reader arrives at this moment, bumps up the refcount. | ||
109 | * - The writer acquires the cpu_hotplug.lock, finds the refcount | ||
110 | * non-zero and goes to sleep again. | ||
111 | * | ||
112 | * However, this is very difficult to achieve in practice since | ||
113 | * get_online_cpus() is not an API which is called all that often. | ||
114 | * | ||
115 | */ | ||
116 | static void cpu_hotplug_begin(void) | ||
117 | { | ||
118 | DECLARE_WAITQUEUE(wait, current); | ||
119 | |||
120 | mutex_lock(&cpu_hotplug.lock); | ||
121 | |||
122 | cpu_hotplug.active_writer = current; | ||
123 | add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); | ||
124 | while (cpu_hotplug.refcount) { | ||
125 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
126 | mutex_unlock(&cpu_hotplug.lock); | ||
127 | schedule(); | ||
128 | mutex_lock(&cpu_hotplug.lock); | ||
129 | } | ||
130 | remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); | ||
131 | } | ||
132 | |||
133 | static void cpu_hotplug_done(void) | ||
134 | { | ||
135 | cpu_hotplug.active_writer = NULL; | ||
136 | mutex_unlock(&cpu_hotplug.lock); | ||
137 | } | ||
68 | /* Need to know about CPUs going up/down? */ | 138 | /* Need to know about CPUs going up/down? */ |
69 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) | 139 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
70 | { | 140 | { |
71 | int ret; | 141 | int ret; |
72 | mutex_lock(&cpu_add_remove_lock); | 142 | cpu_maps_update_begin(); |
73 | ret = raw_notifier_chain_register(&cpu_chain, nb); | 143 | ret = raw_notifier_chain_register(&cpu_chain, nb); |
74 | mutex_unlock(&cpu_add_remove_lock); | 144 | cpu_maps_update_done(); |
75 | return ret; | 145 | return ret; |
76 | } | 146 | } |
77 | 147 | ||
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier); | |||
81 | 151 | ||
82 | void unregister_cpu_notifier(struct notifier_block *nb) | 152 | void unregister_cpu_notifier(struct notifier_block *nb) |
83 | { | 153 | { |
84 | mutex_lock(&cpu_add_remove_lock); | 154 | cpu_maps_update_begin(); |
85 | raw_notifier_chain_unregister(&cpu_chain, nb); | 155 | raw_notifier_chain_unregister(&cpu_chain, nb); |
86 | mutex_unlock(&cpu_add_remove_lock); | 156 | cpu_maps_update_done(); |
87 | } | 157 | } |
88 | EXPORT_SYMBOL(unregister_cpu_notifier); | 158 | EXPORT_SYMBOL(unregister_cpu_notifier); |
89 | 159 | ||
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
147 | if (!cpu_online(cpu)) | 217 | if (!cpu_online(cpu)) |
148 | return -EINVAL; | 218 | return -EINVAL; |
149 | 219 | ||
150 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 220 | cpu_hotplug_begin(); |
151 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, | 221 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, |
152 | hcpu, -1, &nr_calls); | 222 | hcpu, -1, &nr_calls); |
153 | if (err == NOTIFY_BAD) { | 223 | if (err == NOTIFY_BAD) { |
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
166 | cpu_clear(cpu, tmp); | 236 | cpu_clear(cpu, tmp); |
167 | set_cpus_allowed(current, tmp); | 237 | set_cpus_allowed(current, tmp); |
168 | 238 | ||
169 | mutex_lock(&cpu_bitmask_lock); | ||
170 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
171 | mutex_unlock(&cpu_bitmask_lock); | ||
172 | 240 | ||
173 | if (IS_ERR(p) || cpu_online(cpu)) { | 241 | if (IS_ERR(p) || cpu_online(cpu)) { |
174 | /* CPU didn't die: tell everyone. Can't complain. */ | 242 | /* CPU didn't die: tell everyone. Can't complain. */ |
@@ -202,7 +270,7 @@ out_thread: | |||
202 | out_allowed: | 270 | out_allowed: |
203 | set_cpus_allowed(current, old_allowed); | 271 | set_cpus_allowed(current, old_allowed); |
204 | out_release: | 272 | out_release: |
205 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 273 | cpu_hotplug_done(); |
206 | return err; | 274 | return err; |
207 | } | 275 | } |
208 | 276 | ||
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu) | |||
210 | { | 278 | { |
211 | int err = 0; | 279 | int err = 0; |
212 | 280 | ||
213 | mutex_lock(&cpu_add_remove_lock); | 281 | cpu_maps_update_begin(); |
214 | if (cpu_hotplug_disabled) | 282 | if (cpu_hotplug_disabled) |
215 | err = -EBUSY; | 283 | err = -EBUSY; |
216 | else | 284 | else |
217 | err = _cpu_down(cpu, 0); | 285 | err = _cpu_down(cpu, 0); |
218 | 286 | ||
219 | mutex_unlock(&cpu_add_remove_lock); | 287 | cpu_maps_update_done(); |
220 | return err; | 288 | return err; |
221 | } | 289 | } |
222 | #endif /*CONFIG_HOTPLUG_CPU*/ | 290 | #endif /*CONFIG_HOTPLUG_CPU*/ |
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
231 | if (cpu_online(cpu) || !cpu_present(cpu)) | 299 | if (cpu_online(cpu) || !cpu_present(cpu)) |
232 | return -EINVAL; | 300 | return -EINVAL; |
233 | 301 | ||
234 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 302 | cpu_hotplug_begin(); |
235 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, | 303 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, |
236 | -1, &nr_calls); | 304 | -1, &nr_calls); |
237 | if (ret == NOTIFY_BAD) { | 305 | if (ret == NOTIFY_BAD) { |
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
243 | } | 311 | } |
244 | 312 | ||
245 | /* Arch-specific enabling code. */ | 313 | /* Arch-specific enabling code. */ |
246 | mutex_lock(&cpu_bitmask_lock); | ||
247 | ret = __cpu_up(cpu); | 314 | ret = __cpu_up(cpu); |
248 | mutex_unlock(&cpu_bitmask_lock); | ||
249 | if (ret != 0) | 315 | if (ret != 0) |
250 | goto out_notify; | 316 | goto out_notify; |
251 | BUG_ON(!cpu_online(cpu)); | 317 | BUG_ON(!cpu_online(cpu)); |
@@ -257,7 +323,7 @@ out_notify: | |||
257 | if (ret != 0) | 323 | if (ret != 0) |
258 | __raw_notifier_call_chain(&cpu_chain, | 324 | __raw_notifier_call_chain(&cpu_chain, |
259 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 325 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
260 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 326 | cpu_hotplug_done(); |
261 | 327 | ||
262 | return ret; | 328 | return ret; |
263 | } | 329 | } |
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
275 | return -EINVAL; | 341 | return -EINVAL; |
276 | } | 342 | } |
277 | 343 | ||
278 | mutex_lock(&cpu_add_remove_lock); | 344 | cpu_maps_update_begin(); |
279 | if (cpu_hotplug_disabled) | 345 | if (cpu_hotplug_disabled) |
280 | err = -EBUSY; | 346 | err = -EBUSY; |
281 | else | 347 | else |
282 | err = _cpu_up(cpu, 0); | 348 | err = _cpu_up(cpu, 0); |
283 | 349 | ||
284 | mutex_unlock(&cpu_add_remove_lock); | 350 | cpu_maps_update_done(); |
285 | return err; | 351 | return err; |
286 | } | 352 | } |
287 | 353 | ||
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void) | |||
292 | { | 358 | { |
293 | int cpu, first_cpu, error = 0; | 359 | int cpu, first_cpu, error = 0; |
294 | 360 | ||
295 | mutex_lock(&cpu_add_remove_lock); | 361 | cpu_maps_update_begin(); |
296 | first_cpu = first_cpu(cpu_online_map); | 362 | first_cpu = first_cpu(cpu_online_map); |
297 | /* We take down all of the non-boot CPUs in one shot to avoid races | 363 | /* We take down all of the non-boot CPUs in one shot to avoid races |
298 | * with the userspace trying to use the CPU hotplug at the same time | 364 | * with the userspace trying to use the CPU hotplug at the same time |
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void) | |||
319 | } else { | 385 | } else { |
320 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 386 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); |
321 | } | 387 | } |
322 | mutex_unlock(&cpu_add_remove_lock); | 388 | cpu_maps_update_done(); |
323 | return error; | 389 | return error; |
324 | } | 390 | } |
325 | 391 | ||
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void) | |||
328 | int cpu, error; | 394 | int cpu, error; |
329 | 395 | ||
330 | /* Allow everyone to use the CPU hotplug again */ | 396 | /* Allow everyone to use the CPU hotplug again */ |
331 | mutex_lock(&cpu_add_remove_lock); | 397 | cpu_maps_update_begin(); |
332 | cpu_hotplug_disabled = 0; | 398 | cpu_hotplug_disabled = 0; |
333 | if (cpus_empty(frozen_cpus)) | 399 | if (cpus_empty(frozen_cpus)) |
334 | goto out; | 400 | goto out; |
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void) | |||
344 | } | 410 | } |
345 | cpus_clear(frozen_cpus); | 411 | cpus_clear(frozen_cpus); |
346 | out: | 412 | out: |
347 | mutex_unlock(&cpu_add_remove_lock); | 413 | cpu_maps_update_done(); |
348 | } | 414 | } |
349 | #endif /* CONFIG_PM_SLEEP_SMP */ | 415 | #endif /* CONFIG_PM_SLEEP_SMP */ |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 50f5dc463688..cfaf6419d817 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
537 | * | 537 | * |
538 | * Call with cgroup_mutex held. May take callback_mutex during | 538 | * Call with cgroup_mutex held. May take callback_mutex during |
539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | 539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest |
540 | * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 540 | * a call to the get_online_cpus()/put_online_cpus() pair. |
541 | * Must not be called holding callback_mutex, because we must not | 541 | * Must not be called holding callback_mutex, because we must not |
542 | * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere | 542 | * call get_online_cpus() while holding callback_mutex. Elsewhere |
543 | * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. | 543 | * the kernel nests callback_mutex inside get_online_cpus() calls. |
544 | * So the reverse nesting would risk an ABBA deadlock. | 544 | * So the reverse nesting would risk an ABBA deadlock. |
545 | * | 545 | * |
546 | * The three key local variables below are: | 546 | * The three key local variables below are: |
@@ -691,9 +691,9 @@ restart: | |||
691 | 691 | ||
692 | rebuild: | 692 | rebuild: |
693 | /* Have scheduler rebuild sched domains */ | 693 | /* Have scheduler rebuild sched domains */ |
694 | lock_cpu_hotplug(); | 694 | get_online_cpus(); |
695 | partition_sched_domains(ndoms, doms); | 695 | partition_sched_domains(ndoms, doms); |
696 | unlock_cpu_hotplug(); | 696 | put_online_cpus(); |
697 | 697 | ||
698 | done: | 698 | done: |
699 | if (q && !IS_ERR(q)) | 699 | if (q && !IS_ERR(q)) |
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1617 | * | 1617 | * |
1618 | * If the cpuset being removed has its flag 'sched_load_balance' | 1618 | * If the cpuset being removed has its flag 'sched_load_balance' |
1619 | * enabled, then simulate turning sched_load_balance off, which | 1619 | * enabled, then simulate turning sched_load_balance off, which |
1620 | * will call rebuild_sched_domains(). The lock_cpu_hotplug() | 1620 | * will call rebuild_sched_domains(). The get_online_cpus() |
1621 | * call in rebuild_sched_domains() must not be made while holding | 1621 | * call in rebuild_sched_domains() must not be made while holding |
1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | 1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside |
1623 | * lock_cpu_hotplug() calls. So the reverse nesting would risk an | 1623 | * get_online_cpus() calls. So the reverse nesting would risk an |
1624 | * ABBA deadlock. | 1624 | * ABBA deadlock. |
1625 | */ | 1625 | */ |
1626 | 1626 | ||
diff --git a/kernel/extable.c b/kernel/extable.c index 7fe262855317..a26cb2e17023 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr) | |||
46 | addr <= (unsigned long)_etext) | 46 | addr <= (unsigned long)_etext) |
47 | return 1; | 47 | return 1; |
48 | 48 | ||
49 | if (addr >= (unsigned long)_sinittext && | 49 | if (system_state == SYSTEM_BOOTING && |
50 | addr >= (unsigned long)_sinittext && | ||
50 | addr <= (unsigned long)_einittext) | 51 | addr <= (unsigned long)_einittext) |
51 | return 1; | 52 | return 1; |
52 | return 0; | 53 | return 0; |
diff --git a/kernel/fork.c b/kernel/fork.c index 8dd8ff281009..05e0b6f4365b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/random.h> | 51 | #include <linux/random.h> |
52 | #include <linux/tty.h> | 52 | #include <linux/tty.h> |
53 | #include <linux/proc_fs.h> | 53 | #include <linux/proc_fs.h> |
54 | #include <linux/blkdev.h> | ||
54 | 55 | ||
55 | #include <asm/pgtable.h> | 56 | #include <asm/pgtable.h> |
56 | #include <asm/pgalloc.h> | 57 | #include <asm/pgalloc.h> |
@@ -392,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm) | |||
392 | destroy_context(mm); | 393 | destroy_context(mm); |
393 | free_mm(mm); | 394 | free_mm(mm); |
394 | } | 395 | } |
396 | EXPORT_SYMBOL_GPL(__mmdrop); | ||
395 | 397 | ||
396 | /* | 398 | /* |
397 | * Decrement the use count and release all resources for an mm. | 399 | * Decrement the use count and release all resources for an mm. |
@@ -791,6 +793,31 @@ out: | |||
791 | return error; | 793 | return error; |
792 | } | 794 | } |
793 | 795 | ||
796 | static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | ||
797 | { | ||
798 | #ifdef CONFIG_BLOCK | ||
799 | struct io_context *ioc = current->io_context; | ||
800 | |||
801 | if (!ioc) | ||
802 | return 0; | ||
803 | /* | ||
804 | * Share io context with parent, if CLONE_IO is set | ||
805 | */ | ||
806 | if (clone_flags & CLONE_IO) { | ||
807 | tsk->io_context = ioc_task_link(ioc); | ||
808 | if (unlikely(!tsk->io_context)) | ||
809 | return -ENOMEM; | ||
810 | } else if (ioprio_valid(ioc->ioprio)) { | ||
811 | tsk->io_context = alloc_io_context(GFP_KERNEL, -1); | ||
812 | if (unlikely(!tsk->io_context)) | ||
813 | return -ENOMEM; | ||
814 | |||
815 | tsk->io_context->ioprio = ioc->ioprio; | ||
816 | } | ||
817 | #endif | ||
818 | return 0; | ||
819 | } | ||
820 | |||
794 | /* | 821 | /* |
795 | * Helper to unshare the files of the current task. | 822 | * Helper to unshare the files of the current task. |
796 | * We don't want to expose copy_files internals to | 823 | * We don't want to expose copy_files internals to |
@@ -1045,6 +1072,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1045 | copy_flags(clone_flags, p); | 1072 | copy_flags(clone_flags, p); |
1046 | INIT_LIST_HEAD(&p->children); | 1073 | INIT_LIST_HEAD(&p->children); |
1047 | INIT_LIST_HEAD(&p->sibling); | 1074 | INIT_LIST_HEAD(&p->sibling); |
1075 | #ifdef CONFIG_PREEMPT_RCU | ||
1076 | p->rcu_read_lock_nesting = 0; | ||
1077 | p->rcu_flipctr_idx = 0; | ||
1078 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
1048 | p->vfork_done = NULL; | 1079 | p->vfork_done = NULL; |
1049 | spin_lock_init(&p->alloc_lock); | 1080 | spin_lock_init(&p->alloc_lock); |
1050 | 1081 | ||
@@ -1059,6 +1090,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1059 | p->prev_utime = cputime_zero; | 1090 | p->prev_utime = cputime_zero; |
1060 | p->prev_stime = cputime_zero; | 1091 | p->prev_stime = cputime_zero; |
1061 | 1092 | ||
1093 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
1094 | p->last_switch_count = 0; | ||
1095 | p->last_switch_timestamp = 0; | ||
1096 | #endif | ||
1097 | |||
1062 | #ifdef CONFIG_TASK_XACCT | 1098 | #ifdef CONFIG_TASK_XACCT |
1063 | p->rchar = 0; /* I/O counter: bytes read */ | 1099 | p->rchar = 0; /* I/O counter: bytes read */ |
1064 | p->wchar = 0; /* I/O counter: bytes written */ | 1100 | p->wchar = 0; /* I/O counter: bytes written */ |
@@ -1147,15 +1183,17 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1147 | goto bad_fork_cleanup_mm; | 1183 | goto bad_fork_cleanup_mm; |
1148 | if ((retval = copy_namespaces(clone_flags, p))) | 1184 | if ((retval = copy_namespaces(clone_flags, p))) |
1149 | goto bad_fork_cleanup_keys; | 1185 | goto bad_fork_cleanup_keys; |
1186 | if ((retval = copy_io(clone_flags, p))) | ||
1187 | goto bad_fork_cleanup_namespaces; | ||
1150 | retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); | 1188 | retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); |
1151 | if (retval) | 1189 | if (retval) |
1152 | goto bad_fork_cleanup_namespaces; | 1190 | goto bad_fork_cleanup_io; |
1153 | 1191 | ||
1154 | if (pid != &init_struct_pid) { | 1192 | if (pid != &init_struct_pid) { |
1155 | retval = -ENOMEM; | 1193 | retval = -ENOMEM; |
1156 | pid = alloc_pid(task_active_pid_ns(p)); | 1194 | pid = alloc_pid(task_active_pid_ns(p)); |
1157 | if (!pid) | 1195 | if (!pid) |
1158 | goto bad_fork_cleanup_namespaces; | 1196 | goto bad_fork_cleanup_io; |
1159 | 1197 | ||
1160 | if (clone_flags & CLONE_NEWPID) { | 1198 | if (clone_flags & CLONE_NEWPID) { |
1161 | retval = pid_ns_prepare_proc(task_active_pid_ns(p)); | 1199 | retval = pid_ns_prepare_proc(task_active_pid_ns(p)); |
@@ -1196,6 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1196 | #ifdef TIF_SYSCALL_EMU | 1234 | #ifdef TIF_SYSCALL_EMU |
1197 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); | 1235 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
1198 | #endif | 1236 | #endif |
1237 | clear_all_latency_tracing(p); | ||
1199 | 1238 | ||
1200 | /* Our parent execution domain becomes current domain | 1239 | /* Our parent execution domain becomes current domain |
1201 | These must match for thread signalling to apply */ | 1240 | These must match for thread signalling to apply */ |
@@ -1224,9 +1263,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1224 | /* Need tasklist lock for parent etc handling! */ | 1263 | /* Need tasklist lock for parent etc handling! */ |
1225 | write_lock_irq(&tasklist_lock); | 1264 | write_lock_irq(&tasklist_lock); |
1226 | 1265 | ||
1227 | /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ | ||
1228 | p->ioprio = current->ioprio; | ||
1229 | |||
1230 | /* | 1266 | /* |
1231 | * The task hasn't been attached yet, so its cpus_allowed mask will | 1267 | * The task hasn't been attached yet, so its cpus_allowed mask will |
1232 | * not be changed, nor will its assigned CPU. | 1268 | * not be changed, nor will its assigned CPU. |
@@ -1237,6 +1273,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1237 | * parent's CPU). This avoids a lot of nasty races. | 1273 | * parent's CPU). This avoids a lot of nasty races. |
1238 | */ | 1274 | */ |
1239 | p->cpus_allowed = current->cpus_allowed; | 1275 | p->cpus_allowed = current->cpus_allowed; |
1276 | p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; | ||
1240 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || | 1277 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || |
1241 | !cpu_online(task_cpu(p)))) | 1278 | !cpu_online(task_cpu(p)))) |
1242 | set_task_cpu(p, smp_processor_id()); | 1279 | set_task_cpu(p, smp_processor_id()); |
@@ -1317,6 +1354,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1317 | bad_fork_free_pid: | 1354 | bad_fork_free_pid: |
1318 | if (pid != &init_struct_pid) | 1355 | if (pid != &init_struct_pid) |
1319 | free_pid(pid); | 1356 | free_pid(pid); |
1357 | bad_fork_cleanup_io: | ||
1358 | put_io_context(p->io_context); | ||
1320 | bad_fork_cleanup_namespaces: | 1359 | bad_fork_cleanup_namespaces: |
1321 | exit_task_namespaces(p); | 1360 | exit_task_namespaces(p); |
1322 | bad_fork_cleanup_keys: | 1361 | bad_fork_cleanup_keys: |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f994bb8065e6..bd5d6b5060bc 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div) | |||
325 | } | 325 | } |
326 | #endif /* BITS_PER_LONG >= 64 */ | 326 | #endif /* BITS_PER_LONG >= 64 */ |
327 | 327 | ||
328 | /* | ||
329 | * Check, whether the timer is on the callback pending list | ||
330 | */ | ||
331 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
332 | { | ||
333 | return timer->state & HRTIMER_STATE_PENDING; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Remove a timer from the callback pending list | ||
338 | */ | ||
339 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
340 | { | ||
341 | list_del_init(&timer->cb_entry); | ||
342 | } | ||
343 | |||
328 | /* High resolution timer related functions */ | 344 | /* High resolution timer related functions */ |
329 | #ifdef CONFIG_HIGH_RES_TIMERS | 345 | #ifdef CONFIG_HIGH_RES_TIMERS |
330 | 346 | ||
@@ -494,29 +510,12 @@ void hres_timers_resume(void) | |||
494 | } | 510 | } |
495 | 511 | ||
496 | /* | 512 | /* |
497 | * Check, whether the timer is on the callback pending list | ||
498 | */ | ||
499 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
500 | { | ||
501 | return timer->state & HRTIMER_STATE_PENDING; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * Remove a timer from the callback pending list | ||
506 | */ | ||
507 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
508 | { | ||
509 | list_del_init(&timer->cb_entry); | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * Initialize the high resolution related parts of cpu_base | 513 | * Initialize the high resolution related parts of cpu_base |
514 | */ | 514 | */ |
515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | 515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) |
516 | { | 516 | { |
517 | base->expires_next.tv64 = KTIME_MAX; | 517 | base->expires_next.tv64 = KTIME_MAX; |
518 | base->hres_active = 0; | 518 | base->hres_active = 0; |
519 | INIT_LIST_HEAD(&base->cb_pending); | ||
520 | } | 519 | } |
521 | 520 | ||
522 | /* | 521 | /* |
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
524 | */ | 523 | */ |
525 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | 524 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) |
526 | { | 525 | { |
527 | INIT_LIST_HEAD(&timer->cb_entry); | ||
528 | } | 526 | } |
529 | 527 | ||
530 | /* | 528 | /* |
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
618 | { | 616 | { |
619 | return 0; | 617 | return 0; |
620 | } | 618 | } |
621 | static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } | ||
622 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } | ||
623 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 619 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
624 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | 620 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } |
621 | static inline int hrtimer_reprogram(struct hrtimer *timer, | ||
622 | struct hrtimer_clock_base *base) | ||
623 | { | ||
624 | return 0; | ||
625 | } | ||
625 | 626 | ||
626 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 627 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
627 | 628 | ||
@@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1001 | clock_id = CLOCK_MONOTONIC; | 1002 | clock_id = CLOCK_MONOTONIC; |
1002 | 1003 | ||
1003 | timer->base = &cpu_base->clock_base[clock_id]; | 1004 | timer->base = &cpu_base->clock_base[clock_id]; |
1005 | INIT_LIST_HEAD(&timer->cb_entry); | ||
1004 | hrtimer_init_timer_hres(timer); | 1006 | hrtimer_init_timer_hres(timer); |
1005 | 1007 | ||
1006 | #ifdef CONFIG_TIMER_STATS | 1008 | #ifdef CONFIG_TIMER_STATS |
@@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
1030 | } | 1032 | } |
1031 | EXPORT_SYMBOL_GPL(hrtimer_get_res); | 1033 | EXPORT_SYMBOL_GPL(hrtimer_get_res); |
1032 | 1034 | ||
1035 | static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) | ||
1036 | { | ||
1037 | spin_lock_irq(&cpu_base->lock); | ||
1038 | |||
1039 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1040 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1041 | struct hrtimer *timer; | ||
1042 | int restart; | ||
1043 | |||
1044 | timer = list_entry(cpu_base->cb_pending.next, | ||
1045 | struct hrtimer, cb_entry); | ||
1046 | |||
1047 | timer_stats_account_hrtimer(timer); | ||
1048 | |||
1049 | fn = timer->function; | ||
1050 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | ||
1051 | spin_unlock_irq(&cpu_base->lock); | ||
1052 | |||
1053 | restart = fn(timer); | ||
1054 | |||
1055 | spin_lock_irq(&cpu_base->lock); | ||
1056 | |||
1057 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1058 | if (restart == HRTIMER_RESTART) { | ||
1059 | BUG_ON(hrtimer_active(timer)); | ||
1060 | /* | ||
1061 | * Enqueue the timer, allow reprogramming of the event | ||
1062 | * device | ||
1063 | */ | ||
1064 | enqueue_hrtimer(timer, timer->base, 1); | ||
1065 | } else if (hrtimer_active(timer)) { | ||
1066 | /* | ||
1067 | * If the timer was rearmed on another CPU, reprogram | ||
1068 | * the event device. | ||
1069 | */ | ||
1070 | if (timer->base->first == &timer->node) | ||
1071 | hrtimer_reprogram(timer, timer->base); | ||
1072 | } | ||
1073 | } | ||
1074 | spin_unlock_irq(&cpu_base->lock); | ||
1075 | } | ||
1076 | |||
1077 | static void __run_hrtimer(struct hrtimer *timer) | ||
1078 | { | ||
1079 | struct hrtimer_clock_base *base = timer->base; | ||
1080 | struct hrtimer_cpu_base *cpu_base = base->cpu_base; | ||
1081 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1082 | int restart; | ||
1083 | |||
1084 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
1085 | timer_stats_account_hrtimer(timer); | ||
1086 | |||
1087 | fn = timer->function; | ||
1088 | if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { | ||
1089 | /* | ||
1090 | * Used for scheduler timers, avoid lock inversion with | ||
1091 | * rq->lock and tasklist_lock. | ||
1092 | * | ||
1093 | * These timers are required to deal with enqueue expiry | ||
1094 | * themselves and are not allowed to migrate. | ||
1095 | */ | ||
1096 | spin_unlock(&cpu_base->lock); | ||
1097 | restart = fn(timer); | ||
1098 | spin_lock(&cpu_base->lock); | ||
1099 | } else | ||
1100 | restart = fn(timer); | ||
1101 | |||
1102 | /* | ||
1103 | * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid | ||
1104 | * reprogramming of the event hardware. This happens at the end of this | ||
1105 | * function anyway. | ||
1106 | */ | ||
1107 | if (restart != HRTIMER_NORESTART) { | ||
1108 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1109 | enqueue_hrtimer(timer, base, 0); | ||
1110 | } | ||
1111 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1112 | } | ||
1113 | |||
1033 | #ifdef CONFIG_HIGH_RES_TIMERS | 1114 | #ifdef CONFIG_HIGH_RES_TIMERS |
1034 | 1115 | ||
1035 | /* | 1116 | /* |
@@ -1087,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1087 | continue; | 1168 | continue; |
1088 | } | 1169 | } |
1089 | 1170 | ||
1090 | __remove_hrtimer(timer, base, | 1171 | __run_hrtimer(timer); |
1091 | HRTIMER_STATE_CALLBACK, 0); | ||
1092 | timer_stats_account_hrtimer(timer); | ||
1093 | |||
1094 | /* | ||
1095 | * Note: We clear the CALLBACK bit after | ||
1096 | * enqueue_hrtimer to avoid reprogramming of | ||
1097 | * the event hardware. This happens at the end | ||
1098 | * of this function anyway. | ||
1099 | */ | ||
1100 | if (timer->function(timer) != HRTIMER_NORESTART) { | ||
1101 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1102 | enqueue_hrtimer(timer, base, 0); | ||
1103 | } | ||
1104 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1105 | } | 1172 | } |
1106 | spin_unlock(&cpu_base->lock); | 1173 | spin_unlock(&cpu_base->lock); |
1107 | base++; | 1174 | base++; |
@@ -1122,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1122 | 1189 | ||
1123 | static void run_hrtimer_softirq(struct softirq_action *h) | 1190 | static void run_hrtimer_softirq(struct softirq_action *h) |
1124 | { | 1191 | { |
1125 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1192 | run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); |
1126 | 1193 | } | |
1127 | spin_lock_irq(&cpu_base->lock); | ||
1128 | |||
1129 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1130 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1131 | struct hrtimer *timer; | ||
1132 | int restart; | ||
1133 | |||
1134 | timer = list_entry(cpu_base->cb_pending.next, | ||
1135 | struct hrtimer, cb_entry); | ||
1136 | 1194 | ||
1137 | timer_stats_account_hrtimer(timer); | 1195 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
1138 | 1196 | ||
1139 | fn = timer->function; | 1197 | /* |
1140 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | 1198 | * Called from timer softirq every jiffy, expire hrtimers: |
1141 | spin_unlock_irq(&cpu_base->lock); | 1199 | * |
1200 | * For HRT its the fall back code to run the softirq in the timer | ||
1201 | * softirq context in case the hrtimer initialization failed or has | ||
1202 | * not been done yet. | ||
1203 | */ | ||
1204 | void hrtimer_run_pending(void) | ||
1205 | { | ||
1206 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1142 | 1207 | ||
1143 | restart = fn(timer); | 1208 | if (hrtimer_hres_active()) |
1209 | return; | ||
1144 | 1210 | ||
1145 | spin_lock_irq(&cpu_base->lock); | 1211 | /* |
1212 | * This _is_ ugly: We have to check in the softirq context, | ||
1213 | * whether we can switch to highres and / or nohz mode. The | ||
1214 | * clocksource switch happens in the timer interrupt with | ||
1215 | * xtime_lock held. Notification from there only sets the | ||
1216 | * check bit in the tick_oneshot code, otherwise we might | ||
1217 | * deadlock vs. xtime_lock. | ||
1218 | */ | ||
1219 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1220 | hrtimer_switch_to_hres(); | ||
1146 | 1221 | ||
1147 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1222 | run_hrtimer_pending(cpu_base); |
1148 | if (restart == HRTIMER_RESTART) { | ||
1149 | BUG_ON(hrtimer_active(timer)); | ||
1150 | /* | ||
1151 | * Enqueue the timer, allow reprogramming of the event | ||
1152 | * device | ||
1153 | */ | ||
1154 | enqueue_hrtimer(timer, timer->base, 1); | ||
1155 | } else if (hrtimer_active(timer)) { | ||
1156 | /* | ||
1157 | * If the timer was rearmed on another CPU, reprogram | ||
1158 | * the event device. | ||
1159 | */ | ||
1160 | if (timer->base->first == &timer->node) | ||
1161 | hrtimer_reprogram(timer, timer->base); | ||
1162 | } | ||
1163 | } | ||
1164 | spin_unlock_irq(&cpu_base->lock); | ||
1165 | } | 1223 | } |
1166 | 1224 | ||
1167 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
1168 | |||
1169 | /* | 1225 | /* |
1170 | * Expire the per base hrtimer-queue: | 1226 | * Called from hardirq context every jiffy |
1171 | */ | 1227 | */ |
1172 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | 1228 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, |
1173 | int index) | 1229 | int index) |
@@ -1181,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | |||
1181 | if (base->get_softirq_time) | 1237 | if (base->get_softirq_time) |
1182 | base->softirq_time = base->get_softirq_time(); | 1238 | base->softirq_time = base->get_softirq_time(); |
1183 | 1239 | ||
1184 | spin_lock_irq(&cpu_base->lock); | 1240 | spin_lock(&cpu_base->lock); |
1185 | 1241 | ||
1186 | while ((node = base->first)) { | 1242 | while ((node = base->first)) { |
1187 | struct hrtimer *timer; | 1243 | struct hrtimer *timer; |
1188 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1189 | int restart; | ||
1190 | 1244 | ||
1191 | timer = rb_entry(node, struct hrtimer, node); | 1245 | timer = rb_entry(node, struct hrtimer, node); |
1192 | if (base->softirq_time.tv64 <= timer->expires.tv64) | 1246 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
1193 | break; | 1247 | break; |
1194 | 1248 | ||
1195 | #ifdef CONFIG_HIGH_RES_TIMERS | 1249 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { |
1196 | WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); | 1250 | __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); |
1197 | #endif | 1251 | list_add_tail(&timer->cb_entry, |
1198 | timer_stats_account_hrtimer(timer); | 1252 | &base->cpu_base->cb_pending); |
1199 | 1253 | continue; | |
1200 | fn = timer->function; | ||
1201 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
1202 | spin_unlock_irq(&cpu_base->lock); | ||
1203 | |||
1204 | restart = fn(timer); | ||
1205 | |||
1206 | spin_lock_irq(&cpu_base->lock); | ||
1207 | |||
1208 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1209 | if (restart != HRTIMER_NORESTART) { | ||
1210 | BUG_ON(hrtimer_active(timer)); | ||
1211 | enqueue_hrtimer(timer, base, 0); | ||
1212 | } | 1254 | } |
1255 | |||
1256 | __run_hrtimer(timer); | ||
1213 | } | 1257 | } |
1214 | spin_unlock_irq(&cpu_base->lock); | 1258 | spin_unlock(&cpu_base->lock); |
1215 | } | 1259 | } |
1216 | 1260 | ||
1217 | /* | ||
1218 | * Called from timer softirq every jiffy, expire hrtimers: | ||
1219 | * | ||
1220 | * For HRT its the fall back code to run the softirq in the timer | ||
1221 | * softirq context in case the hrtimer initialization failed or has | ||
1222 | * not been done yet. | ||
1223 | */ | ||
1224 | void hrtimer_run_queues(void) | 1261 | void hrtimer_run_queues(void) |
1225 | { | 1262 | { |
1226 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1263 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
@@ -1229,18 +1266,6 @@ void hrtimer_run_queues(void) | |||
1229 | if (hrtimer_hres_active()) | 1266 | if (hrtimer_hres_active()) |
1230 | return; | 1267 | return; |
1231 | 1268 | ||
1232 | /* | ||
1233 | * This _is_ ugly: We have to check in the softirq context, | ||
1234 | * whether we can switch to highres and / or nohz mode. The | ||
1235 | * clocksource switch happens in the timer interrupt with | ||
1236 | * xtime_lock held. Notification from there only sets the | ||
1237 | * check bit in the tick_oneshot code, otherwise we might | ||
1238 | * deadlock vs. xtime_lock. | ||
1239 | */ | ||
1240 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1241 | if (hrtimer_switch_to_hres()) | ||
1242 | return; | ||
1243 | |||
1244 | hrtimer_get_softirq_time(cpu_base); | 1269 | hrtimer_get_softirq_time(cpu_base); |
1245 | 1270 | ||
1246 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1271 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
@@ -1268,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | |||
1268 | sl->timer.function = hrtimer_wakeup; | 1293 | sl->timer.function = hrtimer_wakeup; |
1269 | sl->task = task; | 1294 | sl->task = task; |
1270 | #ifdef CONFIG_HIGH_RES_TIMERS | 1295 | #ifdef CONFIG_HIGH_RES_TIMERS |
1271 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; | 1296 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; |
1272 | #endif | 1297 | #endif |
1273 | } | 1298 | } |
1274 | 1299 | ||
@@ -1279,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
1279 | do { | 1304 | do { |
1280 | set_current_state(TASK_INTERRUPTIBLE); | 1305 | set_current_state(TASK_INTERRUPTIBLE); |
1281 | hrtimer_start(&t->timer, t->timer.expires, mode); | 1306 | hrtimer_start(&t->timer, t->timer.expires, mode); |
1307 | if (!hrtimer_active(&t->timer)) | ||
1308 | t->task = NULL; | ||
1282 | 1309 | ||
1283 | if (likely(t->task)) | 1310 | if (likely(t->task)) |
1284 | schedule(); | 1311 | schedule(); |
@@ -1389,6 +1416,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
1389 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1416 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
1390 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1417 | cpu_base->clock_base[i].cpu_base = cpu_base; |
1391 | 1418 | ||
1419 | INIT_LIST_HEAD(&cpu_base->cb_pending); | ||
1392 | hrtimer_init_hres(cpu_base); | 1420 | hrtimer_init_hres(cpu_base); |
1393 | } | 1421 | } |
1394 | 1422 | ||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1f314221d534..438a01464287 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id) | |||
479 | return; | 479 | return; |
480 | } | 480 | } |
481 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); | 481 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); |
482 | #ifdef CONFIG_DEBUG_SHIRQ | ||
483 | dump_stack(); | ||
484 | #endif | ||
482 | spin_unlock_irqrestore(&desc->lock, flags); | 485 | spin_unlock_irqrestore(&desc->lock, flags); |
483 | return; | 486 | return; |
484 | } | 487 | } |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 50b81b98046a..c2f2ccb0549a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
75 | 75 | ||
76 | #endif | 76 | #endif |
77 | 77 | ||
78 | static int irq_spurious_read(char *page, char **start, off_t off, | ||
79 | int count, int *eof, void *data) | ||
80 | { | ||
81 | struct irq_desc *d = &irq_desc[(long) data]; | ||
82 | return sprintf(page, "count %u\n" | ||
83 | "unhandled %u\n" | ||
84 | "last_unhandled %u ms\n", | ||
85 | d->irq_count, | ||
86 | d->irqs_unhandled, | ||
87 | jiffies_to_msecs(d->last_unhandled)); | ||
88 | } | ||
89 | |||
78 | #define MAX_NAMELEN 128 | 90 | #define MAX_NAMELEN 128 |
79 | 91 | ||
80 | static int name_unique(unsigned int irq, struct irqaction *new_action) | 92 | static int name_unique(unsigned int irq, struct irqaction *new_action) |
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
118 | void register_irq_proc(unsigned int irq) | 130 | void register_irq_proc(unsigned int irq) |
119 | { | 131 | { |
120 | char name [MAX_NAMELEN]; | 132 | char name [MAX_NAMELEN]; |
133 | struct proc_dir_entry *entry; | ||
121 | 134 | ||
122 | if (!root_irq_dir || | 135 | if (!root_irq_dir || |
123 | (irq_desc[irq].chip == &no_irq_chip) || | 136 | (irq_desc[irq].chip == &no_irq_chip) || |
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq) | |||
132 | 145 | ||
133 | #ifdef CONFIG_SMP | 146 | #ifdef CONFIG_SMP |
134 | { | 147 | { |
135 | struct proc_dir_entry *entry; | ||
136 | |||
137 | /* create /proc/irq/<irq>/smp_affinity */ | 148 | /* create /proc/irq/<irq>/smp_affinity */ |
138 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); | 149 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); |
139 | 150 | ||
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq) | |||
144 | } | 155 | } |
145 | } | 156 | } |
146 | #endif | 157 | #endif |
158 | |||
159 | entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); | ||
160 | if (entry) { | ||
161 | entry->data = (void *)(long)irq; | ||
162 | entry->read_proc = irq_spurious_read; | ||
163 | } | ||
147 | } | 164 | } |
148 | 165 | ||
149 | #undef MAX_NAMELEN | 166 | #undef MAX_NAMELEN |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32b161972fad..a6b2bc831dd0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/kallsyms.h> | 11 | #include <linux/kallsyms.h> |
12 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
13 | #include <linux/moduleparam.h> | ||
13 | 14 | ||
14 | static int irqfixup __read_mostly; | 15 | static int irqfixup __read_mostly; |
15 | 16 | ||
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str) | |||
225 | } | 226 | } |
226 | 227 | ||
227 | __setup("noirqdebug", noirqdebug_setup); | 228 | __setup("noirqdebug", noirqdebug_setup); |
229 | module_param(noirqdebug, bool, 0644); | ||
230 | MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); | ||
228 | 231 | ||
229 | static int __init irqfixup_setup(char *str) | 232 | static int __init irqfixup_setup(char *str) |
230 | { | 233 | { |
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str) | |||
236 | } | 239 | } |
237 | 240 | ||
238 | __setup("irqfixup", irqfixup_setup); | 241 | __setup("irqfixup", irqfixup_setup); |
242 | module_param(irqfixup, int, 0644); | ||
243 | MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); | ||
239 | 244 | ||
240 | static int __init irqpoll_setup(char *str) | 245 | static int __init irqpoll_setup(char *str) |
241 | { | 246 | { |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 2fc25810509e..7dadc71ce516 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -233,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr, | |||
233 | int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, | 233 | int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, |
234 | unsigned long *offset) | 234 | unsigned long *offset) |
235 | { | 235 | { |
236 | char namebuf[KSYM_NAME_LEN]; | ||
236 | if (is_ksym_addr(addr)) | 237 | if (is_ksym_addr(addr)) |
237 | return !!get_symbol_pos(addr, symbolsize, offset); | 238 | return !!get_symbol_pos(addr, symbolsize, offset); |
238 | 239 | ||
239 | return !!module_address_lookup(addr, symbolsize, offset, NULL); | 240 | return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); |
240 | } | 241 | } |
241 | 242 | ||
242 | /* | 243 | /* |
@@ -251,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr, | |||
251 | unsigned long *offset, | 252 | unsigned long *offset, |
252 | char **modname, char *namebuf) | 253 | char **modname, char *namebuf) |
253 | { | 254 | { |
254 | const char *msym; | ||
255 | |||
256 | namebuf[KSYM_NAME_LEN - 1] = 0; | 255 | namebuf[KSYM_NAME_LEN - 1] = 0; |
257 | namebuf[0] = 0; | 256 | namebuf[0] = 0; |
258 | 257 | ||
@@ -268,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr, | |||
268 | } | 267 | } |
269 | 268 | ||
270 | /* see if it's in a module */ | 269 | /* see if it's in a module */ |
271 | msym = module_address_lookup(addr, symbolsize, offset, modname); | 270 | return module_address_lookup(addr, symbolsize, offset, modname, |
272 | if (msym) | 271 | namebuf); |
273 | return strncpy(namebuf, msym, KSYM_NAME_LEN - 1); | ||
274 | |||
275 | return NULL; | 272 | return NULL; |
276 | } | 273 | } |
277 | 274 | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e3a5d817ac9b..d0493eafea3e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -824,6 +824,8 @@ static int __init init_kprobes(void) | |||
824 | if (!err) | 824 | if (!err) |
825 | err = register_die_notifier(&kprobe_exceptions_nb); | 825 | err = register_die_notifier(&kprobe_exceptions_nb); |
826 | 826 | ||
827 | if (!err) | ||
828 | init_test_probes(); | ||
827 | return err; | 829 | return err; |
828 | } | 830 | } |
829 | 831 | ||
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 65daa5373ca6..e53bc30e9ba5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -17,30 +17,34 @@ | |||
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | 18 | ||
19 | #define KERNEL_ATTR_RO(_name) \ | 19 | #define KERNEL_ATTR_RO(_name) \ |
20 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 20 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) |
21 | 21 | ||
22 | #define KERNEL_ATTR_RW(_name) \ | 22 | #define KERNEL_ATTR_RW(_name) \ |
23 | static struct subsys_attribute _name##_attr = \ | 23 | static struct kobj_attribute _name##_attr = \ |
24 | __ATTR(_name, 0644, _name##_show, _name##_store) | 24 | __ATTR(_name, 0644, _name##_show, _name##_store) |
25 | 25 | ||
26 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 26 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
27 | /* current uevent sequence number */ | 27 | /* current uevent sequence number */ |
28 | static ssize_t uevent_seqnum_show(struct kset *kset, char *page) | 28 | static ssize_t uevent_seqnum_show(struct kobject *kobj, |
29 | struct kobj_attribute *attr, char *buf) | ||
29 | { | 30 | { |
30 | return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); | 31 | return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum); |
31 | } | 32 | } |
32 | KERNEL_ATTR_RO(uevent_seqnum); | 33 | KERNEL_ATTR_RO(uevent_seqnum); |
33 | 34 | ||
34 | /* uevent helper program, used during early boo */ | 35 | /* uevent helper program, used during early boo */ |
35 | static ssize_t uevent_helper_show(struct kset *kset, char *page) | 36 | static ssize_t uevent_helper_show(struct kobject *kobj, |
37 | struct kobj_attribute *attr, char *buf) | ||
36 | { | 38 | { |
37 | return sprintf(page, "%s\n", uevent_helper); | 39 | return sprintf(buf, "%s\n", uevent_helper); |
38 | } | 40 | } |
39 | static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) | 41 | static ssize_t uevent_helper_store(struct kobject *kobj, |
42 | struct kobj_attribute *attr, | ||
43 | const char *buf, size_t count) | ||
40 | { | 44 | { |
41 | if (count+1 > UEVENT_HELPER_PATH_LEN) | 45 | if (count+1 > UEVENT_HELPER_PATH_LEN) |
42 | return -ENOENT; | 46 | return -ENOENT; |
43 | memcpy(uevent_helper, page, count); | 47 | memcpy(uevent_helper, buf, count); |
44 | uevent_helper[count] = '\0'; | 48 | uevent_helper[count] = '\0'; |
45 | if (count && uevent_helper[count-1] == '\n') | 49 | if (count && uevent_helper[count-1] == '\n') |
46 | uevent_helper[count-1] = '\0'; | 50 | uevent_helper[count-1] = '\0'; |
@@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper); | |||
50 | #endif | 54 | #endif |
51 | 55 | ||
52 | #ifdef CONFIG_KEXEC | 56 | #ifdef CONFIG_KEXEC |
53 | static ssize_t kexec_loaded_show(struct kset *kset, char *page) | 57 | static ssize_t kexec_loaded_show(struct kobject *kobj, |
58 | struct kobj_attribute *attr, char *buf) | ||
54 | { | 59 | { |
55 | return sprintf(page, "%d\n", !!kexec_image); | 60 | return sprintf(buf, "%d\n", !!kexec_image); |
56 | } | 61 | } |
57 | KERNEL_ATTR_RO(kexec_loaded); | 62 | KERNEL_ATTR_RO(kexec_loaded); |
58 | 63 | ||
59 | static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) | 64 | static ssize_t kexec_crash_loaded_show(struct kobject *kobj, |
65 | struct kobj_attribute *attr, char *buf) | ||
60 | { | 66 | { |
61 | return sprintf(page, "%d\n", !!kexec_crash_image); | 67 | return sprintf(buf, "%d\n", !!kexec_crash_image); |
62 | } | 68 | } |
63 | KERNEL_ATTR_RO(kexec_crash_loaded); | 69 | KERNEL_ATTR_RO(kexec_crash_loaded); |
64 | 70 | ||
65 | static ssize_t vmcoreinfo_show(struct kset *kset, char *page) | 71 | static ssize_t vmcoreinfo_show(struct kobject *kobj, |
72 | struct kobj_attribute *attr, char *buf) | ||
66 | { | 73 | { |
67 | return sprintf(page, "%lx %x\n", | 74 | return sprintf(buf, "%lx %x\n", |
68 | paddr_vmcoreinfo_note(), | 75 | paddr_vmcoreinfo_note(), |
69 | (unsigned int)vmcoreinfo_max_size); | 76 | (unsigned int)vmcoreinfo_max_size); |
70 | } | 77 | } |
@@ -94,8 +101,8 @@ static struct bin_attribute notes_attr = { | |||
94 | .read = &notes_read, | 101 | .read = &notes_read, |
95 | }; | 102 | }; |
96 | 103 | ||
97 | decl_subsys(kernel, NULL, NULL); | 104 | struct kobject *kernel_kobj; |
98 | EXPORT_SYMBOL_GPL(kernel_subsys); | 105 | EXPORT_SYMBOL_GPL(kernel_kobj); |
99 | 106 | ||
100 | static struct attribute * kernel_attrs[] = { | 107 | static struct attribute * kernel_attrs[] = { |
101 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 108 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
@@ -116,24 +123,39 @@ static struct attribute_group kernel_attr_group = { | |||
116 | 123 | ||
117 | static int __init ksysfs_init(void) | 124 | static int __init ksysfs_init(void) |
118 | { | 125 | { |
119 | int error = subsystem_register(&kernel_subsys); | 126 | int error; |
120 | if (!error) | ||
121 | error = sysfs_create_group(&kernel_subsys.kobj, | ||
122 | &kernel_attr_group); | ||
123 | 127 | ||
124 | if (!error && notes_size > 0) { | 128 | kernel_kobj = kobject_create_and_add("kernel", NULL); |
125 | notes_attr.size = notes_size; | 129 | if (!kernel_kobj) { |
126 | error = sysfs_create_bin_file(&kernel_subsys.kobj, | 130 | error = -ENOMEM; |
127 | &notes_attr); | 131 | goto exit; |
128 | } | 132 | } |
133 | error = sysfs_create_group(kernel_kobj, &kernel_attr_group); | ||
134 | if (error) | ||
135 | goto kset_exit; | ||
129 | 136 | ||
130 | /* | 137 | if (notes_size > 0) { |
131 | * Create "/sys/kernel/uids" directory and corresponding root user's | 138 | notes_attr.size = notes_size; |
132 | * directory under it. | 139 | error = sysfs_create_bin_file(kernel_kobj, &notes_attr); |
133 | */ | 140 | if (error) |
134 | if (!error) | 141 | goto group_exit; |
135 | error = uids_kobject_init(); | 142 | } |
136 | 143 | ||
144 | /* create the /sys/kernel/uids/ directory */ | ||
145 | error = uids_sysfs_init(); | ||
146 | if (error) | ||
147 | goto notes_exit; | ||
148 | |||
149 | return 0; | ||
150 | |||
151 | notes_exit: | ||
152 | if (notes_size > 0) | ||
153 | sysfs_remove_bin_file(kernel_kobj, &notes_attr); | ||
154 | group_exit: | ||
155 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); | ||
156 | kset_exit: | ||
157 | kobject_put(kernel_kobj); | ||
158 | exit: | ||
137 | return error; | 159 | return error; |
138 | } | 160 | } |
139 | 161 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index dcfe724300eb..0ac887882f90 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -15,6 +15,8 @@ | |||
15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
16 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
17 | 17 | ||
18 | #define KTHREAD_NICE_LEVEL (-5) | ||
19 | |||
18 | static DEFINE_SPINLOCK(kthread_create_lock); | 20 | static DEFINE_SPINLOCK(kthread_create_lock); |
19 | static LIST_HEAD(kthread_create_list); | 21 | static LIST_HEAD(kthread_create_list); |
20 | struct task_struct *kthreadd_task; | 22 | struct task_struct *kthreadd_task; |
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create) | |||
94 | if (pid < 0) { | 96 | if (pid < 0) { |
95 | create->result = ERR_PTR(pid); | 97 | create->result = ERR_PTR(pid); |
96 | } else { | 98 | } else { |
99 | struct sched_param param = { .sched_priority = 0 }; | ||
97 | wait_for_completion(&create->started); | 100 | wait_for_completion(&create->started); |
98 | read_lock(&tasklist_lock); | 101 | read_lock(&tasklist_lock); |
99 | create->result = find_task_by_pid(pid); | 102 | create->result = find_task_by_pid(pid); |
100 | read_unlock(&tasklist_lock); | 103 | read_unlock(&tasklist_lock); |
104 | /* | ||
105 | * root may have changed our (kthreadd's) priority or CPU mask. | ||
106 | * The kernel thread should not inherit these properties. | ||
107 | */ | ||
108 | sched_setscheduler(create->result, SCHED_NORMAL, &param); | ||
109 | set_user_nice(create->result, KTHREAD_NICE_LEVEL); | ||
110 | set_cpus_allowed(create->result, CPU_MASK_ALL); | ||
101 | } | 111 | } |
102 | complete(&create->done); | 112 | complete(&create->done); |
103 | } | 113 | } |
@@ -221,7 +231,7 @@ int kthreadd(void *unused) | |||
221 | /* Setup a clean context for our children to inherit. */ | 231 | /* Setup a clean context for our children to inherit. */ |
222 | set_task_comm(tsk, "kthreadd"); | 232 | set_task_comm(tsk, "kthreadd"); |
223 | ignore_signals(tsk); | 233 | ignore_signals(tsk); |
224 | set_user_nice(tsk, -5); | 234 | set_user_nice(tsk, KTHREAD_NICE_LEVEL); |
225 | set_cpus_allowed(tsk, CPU_MASK_ALL); | 235 | set_cpus_allowed(tsk, CPU_MASK_ALL); |
226 | 236 | ||
227 | current->flags |= PF_NOFREEZE; | 237 | current->flags |= PF_NOFREEZE; |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c new file mode 100644 index 000000000000..b4e3c85abe74 --- /dev/null +++ b/kernel/latencytop.c | |||
@@ -0,0 +1,239 @@ | |||
1 | /* | ||
2 | * latencytop.c: Latency display infrastructure | ||
3 | * | ||
4 | * (C) Copyright 2008 Intel Corporation | ||
5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | #include <linux/latencytop.h> | ||
13 | #include <linux/kallsyms.h> | ||
14 | #include <linux/seq_file.h> | ||
15 | #include <linux/notifier.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/proc_fs.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/list.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/stacktrace.h> | ||
23 | |||
24 | static DEFINE_SPINLOCK(latency_lock); | ||
25 | |||
26 | #define MAXLR 128 | ||
27 | static struct latency_record latency_record[MAXLR]; | ||
28 | |||
29 | int latencytop_enabled; | ||
30 | |||
31 | void clear_all_latency_tracing(struct task_struct *p) | ||
32 | { | ||
33 | unsigned long flags; | ||
34 | |||
35 | if (!latencytop_enabled) | ||
36 | return; | ||
37 | |||
38 | spin_lock_irqsave(&latency_lock, flags); | ||
39 | memset(&p->latency_record, 0, sizeof(p->latency_record)); | ||
40 | p->latency_record_count = 0; | ||
41 | spin_unlock_irqrestore(&latency_lock, flags); | ||
42 | } | ||
43 | |||
44 | static void clear_global_latency_tracing(void) | ||
45 | { | ||
46 | unsigned long flags; | ||
47 | |||
48 | spin_lock_irqsave(&latency_lock, flags); | ||
49 | memset(&latency_record, 0, sizeof(latency_record)); | ||
50 | spin_unlock_irqrestore(&latency_lock, flags); | ||
51 | } | ||
52 | |||
53 | static void __sched | ||
54 | account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) | ||
55 | { | ||
56 | int firstnonnull = MAXLR + 1; | ||
57 | int i; | ||
58 | |||
59 | if (!latencytop_enabled) | ||
60 | return; | ||
61 | |||
62 | /* skip kernel threads for now */ | ||
63 | if (!tsk->mm) | ||
64 | return; | ||
65 | |||
66 | for (i = 0; i < MAXLR; i++) { | ||
67 | int q; | ||
68 | int same = 1; | ||
69 | /* Nothing stored: */ | ||
70 | if (!latency_record[i].backtrace[0]) { | ||
71 | if (firstnonnull > i) | ||
72 | firstnonnull = i; | ||
73 | continue; | ||
74 | } | ||
75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
76 | if (latency_record[i].backtrace[q] != | ||
77 | lat->backtrace[q]) | ||
78 | same = 0; | ||
79 | if (same && lat->backtrace[q] == 0) | ||
80 | break; | ||
81 | if (same && lat->backtrace[q] == ULONG_MAX) | ||
82 | break; | ||
83 | } | ||
84 | if (same) { | ||
85 | latency_record[i].count++; | ||
86 | latency_record[i].time += lat->time; | ||
87 | if (lat->time > latency_record[i].max) | ||
88 | latency_record[i].max = lat->time; | ||
89 | return; | ||
90 | } | ||
91 | } | ||
92 | |||
93 | i = firstnonnull; | ||
94 | if (i >= MAXLR - 1) | ||
95 | return; | ||
96 | |||
97 | /* Allocated a new one: */ | ||
98 | memcpy(&latency_record[i], lat, sizeof(struct latency_record)); | ||
99 | } | ||
100 | |||
101 | static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) | ||
102 | { | ||
103 | struct stack_trace trace; | ||
104 | |||
105 | memset(&trace, 0, sizeof(trace)); | ||
106 | trace.max_entries = LT_BACKTRACEDEPTH; | ||
107 | trace.entries = &lat->backtrace[0]; | ||
108 | trace.skip = 0; | ||
109 | save_stack_trace_tsk(tsk, &trace); | ||
110 | } | ||
111 | |||
112 | void __sched | ||
113 | account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | ||
114 | { | ||
115 | unsigned long flags; | ||
116 | int i, q; | ||
117 | struct latency_record lat; | ||
118 | |||
119 | if (!latencytop_enabled) | ||
120 | return; | ||
121 | |||
122 | /* Long interruptible waits are generally user requested... */ | ||
123 | if (inter && usecs > 5000) | ||
124 | return; | ||
125 | |||
126 | memset(&lat, 0, sizeof(lat)); | ||
127 | lat.count = 1; | ||
128 | lat.time = usecs; | ||
129 | lat.max = usecs; | ||
130 | store_stacktrace(tsk, &lat); | ||
131 | |||
132 | spin_lock_irqsave(&latency_lock, flags); | ||
133 | |||
134 | account_global_scheduler_latency(tsk, &lat); | ||
135 | |||
136 | /* | ||
137 | * short term hack; if we're > 32 we stop; future we recycle: | ||
138 | */ | ||
139 | tsk->latency_record_count++; | ||
140 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
141 | goto out_unlock; | ||
142 | |||
143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | ||
144 | struct latency_record *mylat; | ||
145 | int same = 1; | ||
146 | mylat = &tsk->latency_record[i]; | ||
147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
148 | if (mylat->backtrace[q] != | ||
149 | lat.backtrace[q]) | ||
150 | same = 0; | ||
151 | if (same && lat.backtrace[q] == 0) | ||
152 | break; | ||
153 | if (same && lat.backtrace[q] == ULONG_MAX) | ||
154 | break; | ||
155 | } | ||
156 | if (same) { | ||
157 | mylat->count++; | ||
158 | mylat->time += lat.time; | ||
159 | if (lat.time > mylat->max) | ||
160 | mylat->max = lat.time; | ||
161 | goto out_unlock; | ||
162 | } | ||
163 | } | ||
164 | |||
165 | /* Allocated a new one: */ | ||
166 | i = tsk->latency_record_count; | ||
167 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | ||
168 | |||
169 | out_unlock: | ||
170 | spin_unlock_irqrestore(&latency_lock, flags); | ||
171 | } | ||
172 | |||
173 | static int lstats_show(struct seq_file *m, void *v) | ||
174 | { | ||
175 | int i; | ||
176 | |||
177 | seq_puts(m, "Latency Top version : v0.1\n"); | ||
178 | |||
179 | for (i = 0; i < MAXLR; i++) { | ||
180 | if (latency_record[i].backtrace[0]) { | ||
181 | int q; | ||
182 | seq_printf(m, "%i %li %li ", | ||
183 | latency_record[i].count, | ||
184 | latency_record[i].time, | ||
185 | latency_record[i].max); | ||
186 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | ||
187 | char sym[KSYM_NAME_LEN]; | ||
188 | char *c; | ||
189 | if (!latency_record[i].backtrace[q]) | ||
190 | break; | ||
191 | if (latency_record[i].backtrace[q] == ULONG_MAX) | ||
192 | break; | ||
193 | sprint_symbol(sym, latency_record[i].backtrace[q]); | ||
194 | c = strchr(sym, '+'); | ||
195 | if (c) | ||
196 | *c = 0; | ||
197 | seq_printf(m, "%s ", sym); | ||
198 | } | ||
199 | seq_printf(m, "\n"); | ||
200 | } | ||
201 | } | ||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | static ssize_t | ||
206 | lstats_write(struct file *file, const char __user *buf, size_t count, | ||
207 | loff_t *offs) | ||
208 | { | ||
209 | clear_global_latency_tracing(); | ||
210 | |||
211 | return count; | ||
212 | } | ||
213 | |||
214 | static int lstats_open(struct inode *inode, struct file *filp) | ||
215 | { | ||
216 | return single_open(filp, lstats_show, NULL); | ||
217 | } | ||
218 | |||
219 | static struct file_operations lstats_fops = { | ||
220 | .open = lstats_open, | ||
221 | .read = seq_read, | ||
222 | .write = lstats_write, | ||
223 | .llseek = seq_lseek, | ||
224 | .release = single_release, | ||
225 | }; | ||
226 | |||
227 | static int __init init_lstats_procfs(void) | ||
228 | { | ||
229 | struct proc_dir_entry *pe; | ||
230 | |||
231 | pe = create_proc_entry("latency_stats", 0644, NULL); | ||
232 | if (!pe) | ||
233 | return -ENOMEM; | ||
234 | |||
235 | pe->proc_fops = &lstats_fops; | ||
236 | |||
237 | return 0; | ||
238 | } | ||
239 | __initcall(init_lstats_procfs); | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 4335f12a27c6..3574379f4d62 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -2932,7 +2932,7 @@ static void zap_class(struct lock_class *class) | |||
2932 | 2932 | ||
2933 | } | 2933 | } |
2934 | 2934 | ||
2935 | static inline int within(void *addr, void *start, unsigned long size) | 2935 | static inline int within(const void *addr, void *start, unsigned long size) |
2936 | { | 2936 | { |
2937 | return addr >= start && addr < start + size; | 2937 | return addr >= start && addr < start + size; |
2938 | } | 2938 | } |
@@ -2955,9 +2955,12 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
2955 | head = classhash_table + i; | 2955 | head = classhash_table + i; |
2956 | if (list_empty(head)) | 2956 | if (list_empty(head)) |
2957 | continue; | 2957 | continue; |
2958 | list_for_each_entry_safe(class, next, head, hash_entry) | 2958 | list_for_each_entry_safe(class, next, head, hash_entry) { |
2959 | if (within(class->key, start, size)) | 2959 | if (within(class->key, start, size)) |
2960 | zap_class(class); | 2960 | zap_class(class); |
2961 | else if (within(class->name, start, size)) | ||
2962 | zap_class(class); | ||
2963 | } | ||
2961 | } | 2964 | } |
2962 | 2965 | ||
2963 | if (locked) | 2966 | if (locked) |
@@ -3203,7 +3206,11 @@ retry: | |||
3203 | 3206 | ||
3204 | EXPORT_SYMBOL_GPL(debug_show_all_locks); | 3207 | EXPORT_SYMBOL_GPL(debug_show_all_locks); |
3205 | 3208 | ||
3206 | void debug_show_held_locks(struct task_struct *task) | 3209 | /* |
3210 | * Careful: only use this function if you are sure that | ||
3211 | * the task cannot run in parallel! | ||
3212 | */ | ||
3213 | void __debug_show_held_locks(struct task_struct *task) | ||
3207 | { | 3214 | { |
3208 | if (unlikely(!debug_locks)) { | 3215 | if (unlikely(!debug_locks)) { |
3209 | printk("INFO: lockdep is turned off.\n"); | 3216 | printk("INFO: lockdep is turned off.\n"); |
@@ -3211,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task) | |||
3211 | } | 3218 | } |
3212 | lockdep_print_held_locks(task); | 3219 | lockdep_print_held_locks(task); |
3213 | } | 3220 | } |
3221 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
3222 | |||
3223 | void debug_show_held_locks(struct task_struct *task) | ||
3224 | { | ||
3225 | __debug_show_held_locks(task); | ||
3226 | } | ||
3214 | 3227 | ||
3215 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3228 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
3216 | 3229 | ||
diff --git a/kernel/module.c b/kernel/module.c index c2e3e2e98801..bd60278ee703 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -47,8 +47,6 @@ | |||
47 | #include <asm/cacheflush.h> | 47 | #include <asm/cacheflush.h> |
48 | #include <linux/license.h> | 48 | #include <linux/license.h> |
49 | 49 | ||
50 | extern int module_sysfs_initialized; | ||
51 | |||
52 | #if 0 | 50 | #if 0 |
53 | #define DEBUGP printk | 51 | #define DEBUGP printk |
54 | #else | 52 | #else |
@@ -67,6 +65,9 @@ extern int module_sysfs_initialized; | |||
67 | static DEFINE_MUTEX(module_mutex); | 65 | static DEFINE_MUTEX(module_mutex); |
68 | static LIST_HEAD(modules); | 66 | static LIST_HEAD(modules); |
69 | 67 | ||
68 | /* Waiting for a module to finish initializing? */ | ||
69 | static DECLARE_WAIT_QUEUE_HEAD(module_wq); | ||
70 | |||
70 | static BLOCKING_NOTIFIER_HEAD(module_notify_list); | 71 | static BLOCKING_NOTIFIER_HEAD(module_notify_list); |
71 | 72 | ||
72 | int register_module_notifier(struct notifier_block * nb) | 73 | int register_module_notifier(struct notifier_block * nb) |
@@ -86,8 +87,11 @@ EXPORT_SYMBOL(unregister_module_notifier); | |||
86 | static inline int strong_try_module_get(struct module *mod) | 87 | static inline int strong_try_module_get(struct module *mod) |
87 | { | 88 | { |
88 | if (mod && mod->state == MODULE_STATE_COMING) | 89 | if (mod && mod->state == MODULE_STATE_COMING) |
90 | return -EBUSY; | ||
91 | if (try_module_get(mod)) | ||
89 | return 0; | 92 | return 0; |
90 | return try_module_get(mod); | 93 | else |
94 | return -ENOENT; | ||
91 | } | 95 | } |
92 | 96 | ||
93 | static inline void add_taint_module(struct module *mod, unsigned flag) | 97 | static inline void add_taint_module(struct module *mod, unsigned flag) |
@@ -426,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, | |||
426 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); | 430 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); |
427 | } | 431 | } |
428 | 432 | ||
433 | static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) | ||
434 | { | ||
435 | int cpu; | ||
436 | |||
437 | for_each_possible_cpu(cpu) | ||
438 | memcpy(pcpudest + per_cpu_offset(cpu), from, size); | ||
439 | } | ||
440 | |||
429 | static int percpu_modinit(void) | 441 | static int percpu_modinit(void) |
430 | { | 442 | { |
431 | pcpu_num_used = 2; | 443 | pcpu_num_used = 2; |
@@ -498,6 +510,8 @@ static struct module_attribute modinfo_##field = { \ | |||
498 | MODINFO_ATTR(version); | 510 | MODINFO_ATTR(version); |
499 | MODINFO_ATTR(srcversion); | 511 | MODINFO_ATTR(srcversion); |
500 | 512 | ||
513 | static char last_unloaded_module[MODULE_NAME_LEN+1]; | ||
514 | |||
501 | #ifdef CONFIG_MODULE_UNLOAD | 515 | #ifdef CONFIG_MODULE_UNLOAD |
502 | /* Init the unload section of the module. */ | 516 | /* Init the unload section of the module. */ |
503 | static void module_unload_init(struct module *mod) | 517 | static void module_unload_init(struct module *mod) |
@@ -539,11 +553,21 @@ static int already_uses(struct module *a, struct module *b) | |||
539 | static int use_module(struct module *a, struct module *b) | 553 | static int use_module(struct module *a, struct module *b) |
540 | { | 554 | { |
541 | struct module_use *use; | 555 | struct module_use *use; |
542 | int no_warn; | 556 | int no_warn, err; |
543 | 557 | ||
544 | if (b == NULL || already_uses(a, b)) return 1; | 558 | if (b == NULL || already_uses(a, b)) return 1; |
545 | 559 | ||
546 | if (!strong_try_module_get(b)) | 560 | /* If we're interrupted or time out, we fail. */ |
561 | if (wait_event_interruptible_timeout( | ||
562 | module_wq, (err = strong_try_module_get(b)) != -EBUSY, | ||
563 | 30 * HZ) <= 0) { | ||
564 | printk("%s: gave up waiting for init of module %s.\n", | ||
565 | a->name, b->name); | ||
566 | return 0; | ||
567 | } | ||
568 | |||
569 | /* If strong_try_module_get() returned a different error, we fail. */ | ||
570 | if (err) | ||
547 | return 0; | 571 | return 0; |
548 | 572 | ||
549 | DEBUGP("Allocating new usage for %s.\n", a->name); | 573 | DEBUGP("Allocating new usage for %s.\n", a->name); |
@@ -721,6 +745,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
721 | mod->exit(); | 745 | mod->exit(); |
722 | mutex_lock(&module_mutex); | 746 | mutex_lock(&module_mutex); |
723 | } | 747 | } |
748 | /* Store the name of the last unloaded module for diagnostic purposes */ | ||
749 | strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); | ||
724 | free_module(mod); | 750 | free_module(mod); |
725 | 751 | ||
726 | out: | 752 | out: |
@@ -814,7 +840,7 @@ static inline void module_unload_free(struct module *mod) | |||
814 | 840 | ||
815 | static inline int use_module(struct module *a, struct module *b) | 841 | static inline int use_module(struct module *a, struct module *b) |
816 | { | 842 | { |
817 | return strong_try_module_get(b); | 843 | return strong_try_module_get(b) == 0; |
818 | } | 844 | } |
819 | 845 | ||
820 | static inline void module_unload_init(struct module *mod) | 846 | static inline void module_unload_init(struct module *mod) |
@@ -1122,7 +1148,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1122 | ++loaded; | 1148 | ++loaded; |
1123 | } | 1149 | } |
1124 | 1150 | ||
1125 | notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes"); | 1151 | notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); |
1126 | if (!notes_attrs->dir) | 1152 | if (!notes_attrs->dir) |
1127 | goto out; | 1153 | goto out; |
1128 | 1154 | ||
@@ -1212,6 +1238,7 @@ void module_remove_modinfo_attrs(struct module *mod) | |||
1212 | int mod_sysfs_init(struct module *mod) | 1238 | int mod_sysfs_init(struct module *mod) |
1213 | { | 1239 | { |
1214 | int err; | 1240 | int err; |
1241 | struct kobject *kobj; | ||
1215 | 1242 | ||
1216 | if (!module_sysfs_initialized) { | 1243 | if (!module_sysfs_initialized) { |
1217 | printk(KERN_ERR "%s: module sysfs not initialized\n", | 1244 | printk(KERN_ERR "%s: module sysfs not initialized\n", |
@@ -1219,15 +1246,25 @@ int mod_sysfs_init(struct module *mod) | |||
1219 | err = -EINVAL; | 1246 | err = -EINVAL; |
1220 | goto out; | 1247 | goto out; |
1221 | } | 1248 | } |
1222 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); | 1249 | |
1223 | err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); | 1250 | kobj = kset_find_obj(module_kset, mod->name); |
1224 | if (err) | 1251 | if (kobj) { |
1252 | printk(KERN_ERR "%s: module is already loaded\n", mod->name); | ||
1253 | kobject_put(kobj); | ||
1254 | err = -EINVAL; | ||
1225 | goto out; | 1255 | goto out; |
1226 | kobj_set_kset_s(&mod->mkobj, module_subsys); | 1256 | } |
1257 | |||
1227 | mod->mkobj.mod = mod; | 1258 | mod->mkobj.mod = mod; |
1228 | 1259 | ||
1229 | kobject_init(&mod->mkobj.kobj); | 1260 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); |
1261 | mod->mkobj.kobj.kset = module_kset; | ||
1262 | err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, | ||
1263 | "%s", mod->name); | ||
1264 | if (err) | ||
1265 | kobject_put(&mod->mkobj.kobj); | ||
1230 | 1266 | ||
1267 | /* delay uevent until full sysfs population */ | ||
1231 | out: | 1268 | out: |
1232 | return err; | 1269 | return err; |
1233 | } | 1270 | } |
@@ -1238,12 +1275,7 @@ int mod_sysfs_setup(struct module *mod, | |||
1238 | { | 1275 | { |
1239 | int err; | 1276 | int err; |
1240 | 1277 | ||
1241 | /* delay uevent until full sysfs population */ | 1278 | mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); |
1242 | err = kobject_add(&mod->mkobj.kobj); | ||
1243 | if (err) | ||
1244 | goto out; | ||
1245 | |||
1246 | mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders"); | ||
1247 | if (!mod->holders_dir) { | 1279 | if (!mod->holders_dir) { |
1248 | err = -ENOMEM; | 1280 | err = -ENOMEM; |
1249 | goto out_unreg; | 1281 | goto out_unreg; |
@@ -1263,11 +1295,9 @@ int mod_sysfs_setup(struct module *mod, | |||
1263 | out_unreg_param: | 1295 | out_unreg_param: |
1264 | module_param_sysfs_remove(mod); | 1296 | module_param_sysfs_remove(mod); |
1265 | out_unreg_holders: | 1297 | out_unreg_holders: |
1266 | kobject_unregister(mod->holders_dir); | 1298 | kobject_put(mod->holders_dir); |
1267 | out_unreg: | 1299 | out_unreg: |
1268 | kobject_del(&mod->mkobj.kobj); | ||
1269 | kobject_put(&mod->mkobj.kobj); | 1300 | kobject_put(&mod->mkobj.kobj); |
1270 | out: | ||
1271 | return err; | 1301 | return err; |
1272 | } | 1302 | } |
1273 | #endif | 1303 | #endif |
@@ -1276,9 +1306,20 @@ static void mod_kobject_remove(struct module *mod) | |||
1276 | { | 1306 | { |
1277 | module_remove_modinfo_attrs(mod); | 1307 | module_remove_modinfo_attrs(mod); |
1278 | module_param_sysfs_remove(mod); | 1308 | module_param_sysfs_remove(mod); |
1279 | kobject_unregister(mod->mkobj.drivers_dir); | 1309 | kobject_put(mod->mkobj.drivers_dir); |
1280 | kobject_unregister(mod->holders_dir); | 1310 | kobject_put(mod->holders_dir); |
1281 | kobject_unregister(&mod->mkobj.kobj); | 1311 | kobject_put(&mod->mkobj.kobj); |
1312 | } | ||
1313 | |||
1314 | /* | ||
1315 | * link the module while the whole machine is stopped with interrupts off | ||
1316 | * - this defends against kallsyms not taking locks | ||
1317 | */ | ||
1318 | static int __link_module(void *_mod) | ||
1319 | { | ||
1320 | struct module *mod = _mod; | ||
1321 | list_add(&mod->list, &modules); | ||
1322 | return 0; | ||
1282 | } | 1323 | } |
1283 | 1324 | ||
1284 | /* | 1325 | /* |
@@ -1330,7 +1371,7 @@ void *__symbol_get(const char *symbol) | |||
1330 | 1371 | ||
1331 | preempt_disable(); | 1372 | preempt_disable(); |
1332 | value = __find_symbol(symbol, &owner, &crc, 1); | 1373 | value = __find_symbol(symbol, &owner, &crc, 1); |
1333 | if (value && !strong_try_module_get(owner)) | 1374 | if (value && strong_try_module_get(owner) != 0) |
1334 | value = 0; | 1375 | value = 0; |
1335 | preempt_enable(); | 1376 | preempt_enable(); |
1336 | 1377 | ||
@@ -1884,16 +1925,16 @@ static struct module *load_module(void __user *umod, | |||
1884 | /* Now we've moved module, initialize linked lists, etc. */ | 1925 | /* Now we've moved module, initialize linked lists, etc. */ |
1885 | module_unload_init(mod); | 1926 | module_unload_init(mod); |
1886 | 1927 | ||
1887 | /* Initialize kobject, so we can reference it. */ | 1928 | /* add kobject, so we can reference it. */ |
1888 | err = mod_sysfs_init(mod); | 1929 | err = mod_sysfs_init(mod); |
1889 | if (err) | 1930 | if (err) |
1890 | goto cleanup; | 1931 | goto free_unload; |
1891 | 1932 | ||
1892 | /* Set up license info based on the info section */ | 1933 | /* Set up license info based on the info section */ |
1893 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1934 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1894 | 1935 | ||
1895 | if (strcmp(mod->name, "ndiswrapper") == 0) | 1936 | if (strcmp(mod->name, "ndiswrapper") == 0) |
1896 | add_taint(TAINT_PROPRIETARY_MODULE); | 1937 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1897 | if (strcmp(mod->name, "driverloader") == 0) | 1938 | if (strcmp(mod->name, "driverloader") == 0) |
1898 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 1939 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1899 | 1940 | ||
@@ -2023,6 +2064,11 @@ static struct module *load_module(void __user *umod, | |||
2023 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", | 2064 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", |
2024 | mod->name); | 2065 | mod->name); |
2025 | 2066 | ||
2067 | /* Now sew it into the lists so we can get lockdep and oops | ||
2068 | * info during argument parsing. No one should access us, since | ||
2069 | * strong_try_module_get() will fail. */ | ||
2070 | stop_machine_run(__link_module, mod, NR_CPUS); | ||
2071 | |||
2026 | /* Size of section 0 is 0, so this works well if no params */ | 2072 | /* Size of section 0 is 0, so this works well if no params */ |
2027 | err = parse_args(mod->name, mod->args, | 2073 | err = parse_args(mod->name, mod->args, |
2028 | (struct kernel_param *) | 2074 | (struct kernel_param *) |
@@ -2031,7 +2077,7 @@ static struct module *load_module(void __user *umod, | |||
2031 | / sizeof(struct kernel_param), | 2077 | / sizeof(struct kernel_param), |
2032 | NULL); | 2078 | NULL); |
2033 | if (err < 0) | 2079 | if (err < 0) |
2034 | goto arch_cleanup; | 2080 | goto unlink; |
2035 | 2081 | ||
2036 | err = mod_sysfs_setup(mod, | 2082 | err = mod_sysfs_setup(mod, |
2037 | (struct kernel_param *) | 2083 | (struct kernel_param *) |
@@ -2039,7 +2085,7 @@ static struct module *load_module(void __user *umod, | |||
2039 | sechdrs[setupindex].sh_size | 2085 | sechdrs[setupindex].sh_size |
2040 | / sizeof(struct kernel_param)); | 2086 | / sizeof(struct kernel_param)); |
2041 | if (err < 0) | 2087 | if (err < 0) |
2042 | goto arch_cleanup; | 2088 | goto unlink; |
2043 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2089 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
2044 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2090 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
2045 | 2091 | ||
@@ -2054,9 +2100,13 @@ static struct module *load_module(void __user *umod, | |||
2054 | /* Done! */ | 2100 | /* Done! */ |
2055 | return mod; | 2101 | return mod; |
2056 | 2102 | ||
2057 | arch_cleanup: | 2103 | unlink: |
2104 | stop_machine_run(__unlink_module, mod, NR_CPUS); | ||
2058 | module_arch_cleanup(mod); | 2105 | module_arch_cleanup(mod); |
2059 | cleanup: | 2106 | cleanup: |
2107 | kobject_del(&mod->mkobj.kobj); | ||
2108 | kobject_put(&mod->mkobj.kobj); | ||
2109 | free_unload: | ||
2060 | module_unload_free(mod); | 2110 | module_unload_free(mod); |
2061 | module_free(mod, mod->module_init); | 2111 | module_free(mod, mod->module_init); |
2062 | free_core: | 2112 | free_core: |
@@ -2076,17 +2126,6 @@ static struct module *load_module(void __user *umod, | |||
2076 | goto free_hdr; | 2126 | goto free_hdr; |
2077 | } | 2127 | } |
2078 | 2128 | ||
2079 | /* | ||
2080 | * link the module while the whole machine is stopped with interrupts off | ||
2081 | * - this defends against kallsyms not taking locks | ||
2082 | */ | ||
2083 | static int __link_module(void *_mod) | ||
2084 | { | ||
2085 | struct module *mod = _mod; | ||
2086 | list_add(&mod->list, &modules); | ||
2087 | return 0; | ||
2088 | } | ||
2089 | |||
2090 | /* This is where the real work happens */ | 2129 | /* This is where the real work happens */ |
2091 | asmlinkage long | 2130 | asmlinkage long |
2092 | sys_init_module(void __user *umod, | 2131 | sys_init_module(void __user *umod, |
@@ -2111,10 +2150,6 @@ sys_init_module(void __user *umod, | |||
2111 | return PTR_ERR(mod); | 2150 | return PTR_ERR(mod); |
2112 | } | 2151 | } |
2113 | 2152 | ||
2114 | /* Now sew it into the lists. They won't access us, since | ||
2115 | strong_try_module_get() will fail. */ | ||
2116 | stop_machine_run(__link_module, mod, NR_CPUS); | ||
2117 | |||
2118 | /* Drop lock so they can recurse */ | 2153 | /* Drop lock so they can recurse */ |
2119 | mutex_unlock(&module_mutex); | 2154 | mutex_unlock(&module_mutex); |
2120 | 2155 | ||
@@ -2133,6 +2168,7 @@ sys_init_module(void __user *umod, | |||
2133 | mutex_lock(&module_mutex); | 2168 | mutex_lock(&module_mutex); |
2134 | free_module(mod); | 2169 | free_module(mod); |
2135 | mutex_unlock(&module_mutex); | 2170 | mutex_unlock(&module_mutex); |
2171 | wake_up(&module_wq); | ||
2136 | return ret; | 2172 | return ret; |
2137 | } | 2173 | } |
2138 | 2174 | ||
@@ -2147,6 +2183,7 @@ sys_init_module(void __user *umod, | |||
2147 | mod->init_size = 0; | 2183 | mod->init_size = 0; |
2148 | mod->init_text_size = 0; | 2184 | mod->init_text_size = 0; |
2149 | mutex_unlock(&module_mutex); | 2185 | mutex_unlock(&module_mutex); |
2186 | wake_up(&module_wq); | ||
2150 | 2187 | ||
2151 | return 0; | 2188 | return 0; |
2152 | } | 2189 | } |
@@ -2211,14 +2248,13 @@ static const char *get_ksymbol(struct module *mod, | |||
2211 | return mod->strtab + mod->symtab[best].st_name; | 2248 | return mod->strtab + mod->symtab[best].st_name; |
2212 | } | 2249 | } |
2213 | 2250 | ||
2214 | /* For kallsyms to ask for address resolution. NULL means not found. | 2251 | /* For kallsyms to ask for address resolution. NULL means not found. Careful |
2215 | We don't lock, as this is used for oops resolution and races are a | 2252 | * not to lock to avoid deadlock on oopses, simply disable preemption. */ |
2216 | lesser concern. */ | 2253 | char *module_address_lookup(unsigned long addr, |
2217 | /* FIXME: Risky: returns a pointer into a module w/o lock */ | 2254 | unsigned long *size, |
2218 | const char *module_address_lookup(unsigned long addr, | 2255 | unsigned long *offset, |
2219 | unsigned long *size, | 2256 | char **modname, |
2220 | unsigned long *offset, | 2257 | char *namebuf) |
2221 | char **modname) | ||
2222 | { | 2258 | { |
2223 | struct module *mod; | 2259 | struct module *mod; |
2224 | const char *ret = NULL; | 2260 | const char *ret = NULL; |
@@ -2233,8 +2269,13 @@ const char *module_address_lookup(unsigned long addr, | |||
2233 | break; | 2269 | break; |
2234 | } | 2270 | } |
2235 | } | 2271 | } |
2272 | /* Make a copy in here where it's safe */ | ||
2273 | if (ret) { | ||
2274 | strncpy(namebuf, ret, KSYM_NAME_LEN - 1); | ||
2275 | ret = namebuf; | ||
2276 | } | ||
2236 | preempt_enable(); | 2277 | preempt_enable(); |
2237 | return ret; | 2278 | return (char *)ret; |
2238 | } | 2279 | } |
2239 | 2280 | ||
2240 | int lookup_module_symbol_name(unsigned long addr, char *symname) | 2281 | int lookup_module_symbol_name(unsigned long addr, char *symname) |
@@ -2362,21 +2403,30 @@ static void m_stop(struct seq_file *m, void *p) | |||
2362 | mutex_unlock(&module_mutex); | 2403 | mutex_unlock(&module_mutex); |
2363 | } | 2404 | } |
2364 | 2405 | ||
2365 | static char *taint_flags(unsigned int taints, char *buf) | 2406 | static char *module_flags(struct module *mod, char *buf) |
2366 | { | 2407 | { |
2367 | int bx = 0; | 2408 | int bx = 0; |
2368 | 2409 | ||
2369 | if (taints) { | 2410 | if (mod->taints || |
2411 | mod->state == MODULE_STATE_GOING || | ||
2412 | mod->state == MODULE_STATE_COMING) { | ||
2370 | buf[bx++] = '('; | 2413 | buf[bx++] = '('; |
2371 | if (taints & TAINT_PROPRIETARY_MODULE) | 2414 | if (mod->taints & TAINT_PROPRIETARY_MODULE) |
2372 | buf[bx++] = 'P'; | 2415 | buf[bx++] = 'P'; |
2373 | if (taints & TAINT_FORCED_MODULE) | 2416 | if (mod->taints & TAINT_FORCED_MODULE) |
2374 | buf[bx++] = 'F'; | 2417 | buf[bx++] = 'F'; |
2375 | /* | 2418 | /* |
2376 | * TAINT_FORCED_RMMOD: could be added. | 2419 | * TAINT_FORCED_RMMOD: could be added. |
2377 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | 2420 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't |
2378 | * apply to modules. | 2421 | * apply to modules. |
2379 | */ | 2422 | */ |
2423 | |||
2424 | /* Show a - for module-is-being-unloaded */ | ||
2425 | if (mod->state == MODULE_STATE_GOING) | ||
2426 | buf[bx++] = '-'; | ||
2427 | /* Show a + for module-is-being-loaded */ | ||
2428 | if (mod->state == MODULE_STATE_COMING) | ||
2429 | buf[bx++] = '+'; | ||
2380 | buf[bx++] = ')'; | 2430 | buf[bx++] = ')'; |
2381 | } | 2431 | } |
2382 | buf[bx] = '\0'; | 2432 | buf[bx] = '\0'; |
@@ -2403,7 +2453,7 @@ static int m_show(struct seq_file *m, void *p) | |||
2403 | 2453 | ||
2404 | /* Taints info */ | 2454 | /* Taints info */ |
2405 | if (mod->taints) | 2455 | if (mod->taints) |
2406 | seq_printf(m, " %s", taint_flags(mod->taints, buf)); | 2456 | seq_printf(m, " %s", module_flags(mod, buf)); |
2407 | 2457 | ||
2408 | seq_printf(m, "\n"); | 2458 | seq_printf(m, "\n"); |
2409 | return 0; | 2459 | return 0; |
@@ -2498,97 +2548,12 @@ void print_modules(void) | |||
2498 | 2548 | ||
2499 | printk("Modules linked in:"); | 2549 | printk("Modules linked in:"); |
2500 | list_for_each_entry(mod, &modules, list) | 2550 | list_for_each_entry(mod, &modules, list) |
2501 | printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); | 2551 | printk(" %s%s", mod->name, module_flags(mod, buf)); |
2552 | if (last_unloaded_module[0]) | ||
2553 | printk(" [last unloaded: %s]", last_unloaded_module); | ||
2502 | printk("\n"); | 2554 | printk("\n"); |
2503 | } | 2555 | } |
2504 | 2556 | ||
2505 | #ifdef CONFIG_SYSFS | ||
2506 | static char *make_driver_name(struct device_driver *drv) | ||
2507 | { | ||
2508 | char *driver_name; | ||
2509 | |||
2510 | driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2, | ||
2511 | GFP_KERNEL); | ||
2512 | if (!driver_name) | ||
2513 | return NULL; | ||
2514 | |||
2515 | sprintf(driver_name, "%s:%s", drv->bus->name, drv->name); | ||
2516 | return driver_name; | ||
2517 | } | ||
2518 | |||
2519 | static void module_create_drivers_dir(struct module_kobject *mk) | ||
2520 | { | ||
2521 | if (!mk || mk->drivers_dir) | ||
2522 | return; | ||
2523 | |||
2524 | mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers"); | ||
2525 | } | ||
2526 | |||
2527 | void module_add_driver(struct module *mod, struct device_driver *drv) | ||
2528 | { | ||
2529 | char *driver_name; | ||
2530 | int no_warn; | ||
2531 | struct module_kobject *mk = NULL; | ||
2532 | |||
2533 | if (!drv) | ||
2534 | return; | ||
2535 | |||
2536 | if (mod) | ||
2537 | mk = &mod->mkobj; | ||
2538 | else if (drv->mod_name) { | ||
2539 | struct kobject *mkobj; | ||
2540 | |||
2541 | /* Lookup built-in module entry in /sys/modules */ | ||
2542 | mkobj = kset_find_obj(&module_subsys, drv->mod_name); | ||
2543 | if (mkobj) { | ||
2544 | mk = container_of(mkobj, struct module_kobject, kobj); | ||
2545 | /* remember our module structure */ | ||
2546 | drv->mkobj = mk; | ||
2547 | /* kset_find_obj took a reference */ | ||
2548 | kobject_put(mkobj); | ||
2549 | } | ||
2550 | } | ||
2551 | |||
2552 | if (!mk) | ||
2553 | return; | ||
2554 | |||
2555 | /* Don't check return codes; these calls are idempotent */ | ||
2556 | no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module"); | ||
2557 | driver_name = make_driver_name(drv); | ||
2558 | if (driver_name) { | ||
2559 | module_create_drivers_dir(mk); | ||
2560 | no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj, | ||
2561 | driver_name); | ||
2562 | kfree(driver_name); | ||
2563 | } | ||
2564 | } | ||
2565 | EXPORT_SYMBOL(module_add_driver); | ||
2566 | |||
2567 | void module_remove_driver(struct device_driver *drv) | ||
2568 | { | ||
2569 | struct module_kobject *mk = NULL; | ||
2570 | char *driver_name; | ||
2571 | |||
2572 | if (!drv) | ||
2573 | return; | ||
2574 | |||
2575 | sysfs_remove_link(&drv->kobj, "module"); | ||
2576 | |||
2577 | if (drv->owner) | ||
2578 | mk = &drv->owner->mkobj; | ||
2579 | else if (drv->mkobj) | ||
2580 | mk = drv->mkobj; | ||
2581 | if (mk && mk->drivers_dir) { | ||
2582 | driver_name = make_driver_name(drv); | ||
2583 | if (driver_name) { | ||
2584 | sysfs_remove_link(mk->drivers_dir, driver_name); | ||
2585 | kfree(driver_name); | ||
2586 | } | ||
2587 | } | ||
2588 | } | ||
2589 | EXPORT_SYMBOL(module_remove_driver); | ||
2590 | #endif | ||
2591 | |||
2592 | #ifdef CONFIG_MODVERSIONS | 2557 | #ifdef CONFIG_MODVERSIONS |
2593 | /* Generate the signature for struct module here, too, for modversions. */ | 2558 | /* Generate the signature for struct module here, too, for modversions. */ |
2594 | void struct_module(struct module *mod) { return; } | 2559 | void struct_module(struct module *mod) { return; } |
diff --git a/kernel/panic.c b/kernel/panic.c index da4d6bac270e..d9e90cfe3298 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/kexec.h> | 20 | #include <linux/kexec.h> |
21 | #include <linux/debug_locks.h> | 21 | #include <linux/debug_locks.h> |
22 | #include <linux/random.h> | 22 | #include <linux/random.h> |
23 | #include <linux/kallsyms.h> | ||
23 | 24 | ||
24 | int panic_on_oops; | 25 | int panic_on_oops; |
25 | int tainted; | 26 | int tainted; |
@@ -280,6 +281,13 @@ static int init_oops_id(void) | |||
280 | } | 281 | } |
281 | late_initcall(init_oops_id); | 282 | late_initcall(init_oops_id); |
282 | 283 | ||
284 | static void print_oops_end_marker(void) | ||
285 | { | ||
286 | init_oops_id(); | ||
287 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", | ||
288 | (unsigned long long)oops_id); | ||
289 | } | ||
290 | |||
283 | /* | 291 | /* |
284 | * Called when the architecture exits its oops handler, after printing | 292 | * Called when the architecture exits its oops handler, after printing |
285 | * everything. | 293 | * everything. |
@@ -287,11 +295,26 @@ late_initcall(init_oops_id); | |||
287 | void oops_exit(void) | 295 | void oops_exit(void) |
288 | { | 296 | { |
289 | do_oops_enter_exit(); | 297 | do_oops_enter_exit(); |
290 | init_oops_id(); | 298 | print_oops_end_marker(); |
291 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", | ||
292 | (unsigned long long)oops_id); | ||
293 | } | 299 | } |
294 | 300 | ||
301 | #ifdef WANT_WARN_ON_SLOWPATH | ||
302 | void warn_on_slowpath(const char *file, int line) | ||
303 | { | ||
304 | char function[KSYM_SYMBOL_LEN]; | ||
305 | unsigned long caller = (unsigned long) __builtin_return_address(0); | ||
306 | sprint_symbol(function, caller); | ||
307 | |||
308 | printk(KERN_WARNING "------------[ cut here ]------------\n"); | ||
309 | printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, | ||
310 | line, function); | ||
311 | print_modules(); | ||
312 | dump_stack(); | ||
313 | print_oops_end_marker(); | ||
314 | } | ||
315 | EXPORT_SYMBOL(warn_on_slowpath); | ||
316 | #endif | ||
317 | |||
295 | #ifdef CONFIG_CC_STACKPROTECTOR | 318 | #ifdef CONFIG_CC_STACKPROTECTOR |
296 | /* | 319 | /* |
297 | * Called when gcc's -fstack-protector feature is used, and | 320 | * Called when gcc's -fstack-protector feature is used, and |
diff --git a/kernel/params.c b/kernel/params.c index 7686417ee00e..42fe5e6126c0 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp) | |||
376 | 376 | ||
377 | extern struct kernel_param __start___param[], __stop___param[]; | 377 | extern struct kernel_param __start___param[], __stop___param[]; |
378 | 378 | ||
379 | #define MAX_KBUILD_MODNAME KOBJ_NAME_LEN | ||
380 | |||
381 | struct param_attribute | 379 | struct param_attribute |
382 | { | 380 | { |
383 | struct module_attribute mattr; | 381 | struct module_attribute mattr; |
@@ -472,7 +470,7 @@ param_sysfs_setup(struct module_kobject *mk, | |||
472 | sizeof(mp->grp.attrs[0])); | 470 | sizeof(mp->grp.attrs[0])); |
473 | size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); | 471 | size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); |
474 | 472 | ||
475 | mp = kmalloc(size[0] + size[1], GFP_KERNEL); | 473 | mp = kzalloc(size[0] + size[1], GFP_KERNEL); |
476 | if (!mp) | 474 | if (!mp) |
477 | return ERR_PTR(-ENOMEM); | 475 | return ERR_PTR(-ENOMEM); |
478 | 476 | ||
@@ -560,11 +558,10 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
560 | BUG_ON(!mk); | 558 | BUG_ON(!mk); |
561 | 559 | ||
562 | mk->mod = THIS_MODULE; | 560 | mk->mod = THIS_MODULE; |
563 | kobj_set_kset_s(mk, module_subsys); | 561 | mk->kobj.kset = module_kset; |
564 | kobject_set_name(&mk->kobj, name); | 562 | ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); |
565 | kobject_init(&mk->kobj); | ||
566 | ret = kobject_add(&mk->kobj); | ||
567 | if (ret) { | 563 | if (ret) { |
564 | kobject_put(&mk->kobj); | ||
568 | printk(KERN_ERR "Module '%s' failed to be added to sysfs, " | 565 | printk(KERN_ERR "Module '%s' failed to be added to sysfs, " |
569 | "error number %d\n", name, ret); | 566 | "error number %d\n", name, ret); |
570 | printk(KERN_ERR "The system will be unstable now.\n"); | 567 | printk(KERN_ERR "The system will be unstable now.\n"); |
@@ -588,7 +585,7 @@ static void __init param_sysfs_builtin(void) | |||
588 | { | 585 | { |
589 | struct kernel_param *kp, *kp_begin = NULL; | 586 | struct kernel_param *kp, *kp_begin = NULL; |
590 | unsigned int i, name_len, count = 0; | 587 | unsigned int i, name_len, count = 0; |
591 | char modname[MAX_KBUILD_MODNAME + 1] = ""; | 588 | char modname[MODULE_NAME_LEN + 1] = ""; |
592 | 589 | ||
593 | for (i=0; i < __stop___param - __start___param; i++) { | 590 | for (i=0; i < __stop___param - __start___param; i++) { |
594 | char *dot; | 591 | char *dot; |
@@ -596,12 +593,12 @@ static void __init param_sysfs_builtin(void) | |||
596 | 593 | ||
597 | kp = &__start___param[i]; | 594 | kp = &__start___param[i]; |
598 | max_name_len = | 595 | max_name_len = |
599 | min_t(size_t, MAX_KBUILD_MODNAME, strlen(kp->name)); | 596 | min_t(size_t, MODULE_NAME_LEN, strlen(kp->name)); |
600 | 597 | ||
601 | dot = memchr(kp->name, '.', max_name_len); | 598 | dot = memchr(kp->name, '.', max_name_len); |
602 | if (!dot) { | 599 | if (!dot) { |
603 | DEBUGP("couldn't find period in first %d characters " | 600 | DEBUGP("couldn't find period in first %d characters " |
604 | "of %s\n", MAX_KBUILD_MODNAME, kp->name); | 601 | "of %s\n", MODULE_NAME_LEN, kp->name); |
605 | continue; | 602 | continue; |
606 | } | 603 | } |
607 | name_len = dot - kp->name; | 604 | name_len = dot - kp->name; |
@@ -679,8 +676,6 @@ static struct sysfs_ops module_sysfs_ops = { | |||
679 | .store = module_attr_store, | 676 | .store = module_attr_store, |
680 | }; | 677 | }; |
681 | 678 | ||
682 | static struct kobj_type module_ktype; | ||
683 | |||
684 | static int uevent_filter(struct kset *kset, struct kobject *kobj) | 679 | static int uevent_filter(struct kset *kset, struct kobject *kobj) |
685 | { | 680 | { |
686 | struct kobj_type *ktype = get_ktype(kobj); | 681 | struct kobj_type *ktype = get_ktype(kobj); |
@@ -694,21 +689,11 @@ static struct kset_uevent_ops module_uevent_ops = { | |||
694 | .filter = uevent_filter, | 689 | .filter = uevent_filter, |
695 | }; | 690 | }; |
696 | 691 | ||
697 | decl_subsys(module, &module_ktype, &module_uevent_ops); | 692 | struct kset *module_kset; |
698 | int module_sysfs_initialized; | 693 | int module_sysfs_initialized; |
699 | 694 | ||
700 | static void module_release(struct kobject *kobj) | 695 | struct kobj_type module_ktype = { |
701 | { | ||
702 | /* | ||
703 | * Stupid empty release function to allow the memory for the kobject to | ||
704 | * be properly cleaned up. This will not need to be present for 2.6.25 | ||
705 | * with the upcoming kobject core rework. | ||
706 | */ | ||
707 | } | ||
708 | |||
709 | static struct kobj_type module_ktype = { | ||
710 | .sysfs_ops = &module_sysfs_ops, | 696 | .sysfs_ops = &module_sysfs_ops, |
711 | .release = module_release, | ||
712 | }; | 697 | }; |
713 | 698 | ||
714 | /* | 699 | /* |
@@ -716,13 +701,11 @@ static struct kobj_type module_ktype = { | |||
716 | */ | 701 | */ |
717 | static int __init param_sysfs_init(void) | 702 | static int __init param_sysfs_init(void) |
718 | { | 703 | { |
719 | int ret; | 704 | module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); |
720 | 705 | if (!module_kset) { | |
721 | ret = subsystem_register(&module_subsys); | 706 | printk(KERN_WARNING "%s (%d): error creating kset\n", |
722 | if (ret < 0) { | 707 | __FILE__, __LINE__); |
723 | printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", | 708 | return -ENOMEM; |
724 | __FILE__, __LINE__, ret); | ||
725 | return ret; | ||
726 | } | 709 | } |
727 | module_sysfs_initialized = 1; | 710 | module_sysfs_initialized = 1; |
728 | 711 | ||
@@ -732,14 +715,7 @@ static int __init param_sysfs_init(void) | |||
732 | } | 715 | } |
733 | subsys_initcall(param_sysfs_init); | 716 | subsys_initcall(param_sysfs_init); |
734 | 717 | ||
735 | #else | 718 | #endif /* CONFIG_SYSFS */ |
736 | #if 0 | ||
737 | static struct sysfs_ops module_sysfs_ops = { | ||
738 | .show = NULL, | ||
739 | .store = NULL, | ||
740 | }; | ||
741 | #endif | ||
742 | #endif | ||
743 | 719 | ||
744 | EXPORT_SYMBOL(param_set_byte); | 720 | EXPORT_SYMBOL(param_set_byte); |
745 | EXPORT_SYMBOL(param_get_byte); | 721 | EXPORT_SYMBOL(param_get_byte); |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 68c96376e84a..0b7c82ac467e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
967 | { | 967 | { |
968 | int maxfire; | 968 | int maxfire; |
969 | struct list_head *timers = tsk->cpu_timers; | 969 | struct list_head *timers = tsk->cpu_timers; |
970 | struct signal_struct *const sig = tsk->signal; | ||
970 | 971 | ||
971 | maxfire = 20; | 972 | maxfire = 20; |
972 | tsk->it_prof_expires = cputime_zero; | 973 | tsk->it_prof_expires = cputime_zero; |
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1011 | t->firing = 1; | 1012 | t->firing = 1; |
1012 | list_move_tail(&t->entry, firing); | 1013 | list_move_tail(&t->entry, firing); |
1013 | } | 1014 | } |
1015 | |||
1016 | /* | ||
1017 | * Check for the special case thread timers. | ||
1018 | */ | ||
1019 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { | ||
1020 | unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; | ||
1021 | unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; | ||
1022 | |||
1023 | if (hard != RLIM_INFINITY && | ||
1024 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | ||
1025 | /* | ||
1026 | * At the hard limit, we just die. | ||
1027 | * No need to calculate anything else now. | ||
1028 | */ | ||
1029 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | ||
1030 | return; | ||
1031 | } | ||
1032 | if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { | ||
1033 | /* | ||
1034 | * At the soft limit, send a SIGXCPU every second. | ||
1035 | */ | ||
1036 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur | ||
1037 | < sig->rlim[RLIMIT_RTTIME].rlim_max) { | ||
1038 | sig->rlim[RLIMIT_RTTIME].rlim_cur += | ||
1039 | USEC_PER_SEC; | ||
1040 | } | ||
1041 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | ||
1042 | } | ||
1043 | } | ||
1014 | } | 1044 | } |
1015 | 1045 | ||
1016 | /* | 1046 | /* |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 05b64790fe83..b138b431e271 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -567,7 +567,8 @@ static const char * const hibernation_modes[] = { | |||
567 | * supports it (as determined by having hibernation_ops). | 567 | * supports it (as determined by having hibernation_ops). |
568 | */ | 568 | */ |
569 | 569 | ||
570 | static ssize_t disk_show(struct kset *kset, char *buf) | 570 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, |
571 | char *buf) | ||
571 | { | 572 | { |
572 | int i; | 573 | int i; |
573 | char *start = buf; | 574 | char *start = buf; |
@@ -597,7 +598,8 @@ static ssize_t disk_show(struct kset *kset, char *buf) | |||
597 | } | 598 | } |
598 | 599 | ||
599 | 600 | ||
600 | static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) | 601 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, |
602 | const char *buf, size_t n) | ||
601 | { | 603 | { |
602 | int error = 0; | 604 | int error = 0; |
603 | int i; | 605 | int i; |
@@ -642,13 +644,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) | |||
642 | 644 | ||
643 | power_attr(disk); | 645 | power_attr(disk); |
644 | 646 | ||
645 | static ssize_t resume_show(struct kset *kset, char *buf) | 647 | static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, |
648 | char *buf) | ||
646 | { | 649 | { |
647 | return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), | 650 | return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), |
648 | MINOR(swsusp_resume_device)); | 651 | MINOR(swsusp_resume_device)); |
649 | } | 652 | } |
650 | 653 | ||
651 | static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) | 654 | static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, |
655 | const char *buf, size_t n) | ||
652 | { | 656 | { |
653 | unsigned int maj, min; | 657 | unsigned int maj, min; |
654 | dev_t res; | 658 | dev_t res; |
@@ -674,12 +678,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) | |||
674 | 678 | ||
675 | power_attr(resume); | 679 | power_attr(resume); |
676 | 680 | ||
677 | static ssize_t image_size_show(struct kset *kset, char *buf) | 681 | static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, |
682 | char *buf) | ||
678 | { | 683 | { |
679 | return sprintf(buf, "%lu\n", image_size); | 684 | return sprintf(buf, "%lu\n", image_size); |
680 | } | 685 | } |
681 | 686 | ||
682 | static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) | 687 | static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, |
688 | const char *buf, size_t n) | ||
683 | { | 689 | { |
684 | unsigned long size; | 690 | unsigned long size; |
685 | 691 | ||
@@ -708,7 +714,7 @@ static struct attribute_group attr_group = { | |||
708 | 714 | ||
709 | static int __init pm_disk_init(void) | 715 | static int __init pm_disk_init(void) |
710 | { | 716 | { |
711 | return sysfs_create_group(&power_subsys.kobj, &attr_group); | 717 | return sysfs_create_group(power_kobj, &attr_group); |
712 | } | 718 | } |
713 | 719 | ||
714 | core_initcall(pm_disk_init); | 720 | core_initcall(pm_disk_init); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index f71c9504a5c5..efc08360e627 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -276,8 +276,7 @@ EXPORT_SYMBOL(pm_suspend); | |||
276 | 276 | ||
277 | #endif /* CONFIG_SUSPEND */ | 277 | #endif /* CONFIG_SUSPEND */ |
278 | 278 | ||
279 | decl_subsys(power,NULL,NULL); | 279 | struct kobject *power_kobj; |
280 | |||
281 | 280 | ||
282 | /** | 281 | /** |
283 | * state - control system power state. | 282 | * state - control system power state. |
@@ -290,7 +289,8 @@ decl_subsys(power,NULL,NULL); | |||
290 | * proper enumerated value, and initiates a suspend transition. | 289 | * proper enumerated value, and initiates a suspend transition. |
291 | */ | 290 | */ |
292 | 291 | ||
293 | static ssize_t state_show(struct kset *kset, char *buf) | 292 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, |
293 | char *buf) | ||
294 | { | 294 | { |
295 | char *s = buf; | 295 | char *s = buf; |
296 | #ifdef CONFIG_SUSPEND | 296 | #ifdef CONFIG_SUSPEND |
@@ -311,7 +311,8 @@ static ssize_t state_show(struct kset *kset, char *buf) | |||
311 | return (s - buf); | 311 | return (s - buf); |
312 | } | 312 | } |
313 | 313 | ||
314 | static ssize_t state_store(struct kset *kset, const char *buf, size_t n) | 314 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, |
315 | const char *buf, size_t n) | ||
315 | { | 316 | { |
316 | #ifdef CONFIG_SUSPEND | 317 | #ifdef CONFIG_SUSPEND |
317 | suspend_state_t state = PM_SUSPEND_STANDBY; | 318 | suspend_state_t state = PM_SUSPEND_STANDBY; |
@@ -348,13 +349,15 @@ power_attr(state); | |||
348 | #ifdef CONFIG_PM_TRACE | 349 | #ifdef CONFIG_PM_TRACE |
349 | int pm_trace_enabled; | 350 | int pm_trace_enabled; |
350 | 351 | ||
351 | static ssize_t pm_trace_show(struct kset *kset, char *buf) | 352 | static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, |
353 | char *buf) | ||
352 | { | 354 | { |
353 | return sprintf(buf, "%d\n", pm_trace_enabled); | 355 | return sprintf(buf, "%d\n", pm_trace_enabled); |
354 | } | 356 | } |
355 | 357 | ||
356 | static ssize_t | 358 | static ssize_t |
357 | pm_trace_store(struct kset *kset, const char *buf, size_t n) | 359 | pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, |
360 | const char *buf, size_t n) | ||
358 | { | 361 | { |
359 | int val; | 362 | int val; |
360 | 363 | ||
@@ -386,10 +389,10 @@ static struct attribute_group attr_group = { | |||
386 | 389 | ||
387 | static int __init pm_init(void) | 390 | static int __init pm_init(void) |
388 | { | 391 | { |
389 | int error = subsystem_register(&power_subsys); | 392 | power_kobj = kobject_create_and_add("power", NULL); |
390 | if (!error) | 393 | if (!power_kobj) |
391 | error = sysfs_create_group(&power_subsys.kobj,&attr_group); | 394 | return -ENOMEM; |
392 | return error; | 395 | return sysfs_create_group(power_kobj, &attr_group); |
393 | } | 396 | } |
394 | 397 | ||
395 | core_initcall(pm_init); | 398 | core_initcall(pm_init); |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 195dc4611764..2093c3a9a994 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -54,7 +54,7 @@ extern int pfn_is_nosave(unsigned long); | |||
54 | extern struct mutex pm_mutex; | 54 | extern struct mutex pm_mutex; |
55 | 55 | ||
56 | #define power_attr(_name) \ | 56 | #define power_attr(_name) \ |
57 | static struct subsys_attribute _name##_attr = { \ | 57 | static struct kobj_attribute _name##_attr = { \ |
58 | .attr = { \ | 58 | .attr = { \ |
59 | .name = __stringify(_name), \ | 59 | .name = __stringify(_name), \ |
60 | .mode = 0644, \ | 60 | .mode = 0644, \ |
@@ -63,8 +63,6 @@ static struct subsys_attribute _name##_attr = { \ | |||
63 | .store = _name##_store, \ | 63 | .store = _name##_store, \ |
64 | } | 64 | } |
65 | 65 | ||
66 | extern struct kset power_subsys; | ||
67 | |||
68 | /* Preferred image size in bytes (default 500 MB) */ | 66 | /* Preferred image size in bytes (default 500 MB) */ |
69 | extern unsigned long image_size; | 67 | extern unsigned long image_size; |
70 | extern int in_suspend; | 68 | extern int in_suspend; |
diff --git a/kernel/printk.c b/kernel/printk.c index 89011bf8c106..58bbec684119 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -36,6 +36,13 @@ | |||
36 | 36 | ||
37 | #include <asm/uaccess.h> | 37 | #include <asm/uaccess.h> |
38 | 38 | ||
39 | /* | ||
40 | * Weak default implementation; architectures can override it: | ||
41 | */ | ||
42 | void __attribute__((weak)) early_printk(const char *fmt, ...) | ||
43 | { | ||
44 | } | ||
45 | |||
39 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 46 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
40 | 47 | ||
41 | /* printk's without a loglevel use this.. */ | 48 | /* printk's without a loglevel use this.. */ |
@@ -573,11 +580,6 @@ static int __init printk_time_setup(char *str) | |||
573 | 580 | ||
574 | __setup("time", printk_time_setup); | 581 | __setup("time", printk_time_setup); |
575 | 582 | ||
576 | __attribute__((weak)) unsigned long long printk_clock(void) | ||
577 | { | ||
578 | return sched_clock(); | ||
579 | } | ||
580 | |||
581 | /* Check if we have any console registered that can be called early in boot. */ | 583 | /* Check if we have any console registered that can be called early in boot. */ |
582 | static int have_callable_console(void) | 584 | static int have_callable_console(void) |
583 | { | 585 | { |
@@ -628,30 +630,57 @@ asmlinkage int printk(const char *fmt, ...) | |||
628 | /* cpu currently holding logbuf_lock */ | 630 | /* cpu currently holding logbuf_lock */ |
629 | static volatile unsigned int printk_cpu = UINT_MAX; | 631 | static volatile unsigned int printk_cpu = UINT_MAX; |
630 | 632 | ||
633 | const char printk_recursion_bug_msg [] = | ||
634 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
635 | static int printk_recursion_bug; | ||
636 | |||
631 | asmlinkage int vprintk(const char *fmt, va_list args) | 637 | asmlinkage int vprintk(const char *fmt, va_list args) |
632 | { | 638 | { |
639 | static int log_level_unknown = 1; | ||
640 | static char printk_buf[1024]; | ||
641 | |||
633 | unsigned long flags; | 642 | unsigned long flags; |
634 | int printed_len; | 643 | int printed_len = 0; |
644 | int this_cpu; | ||
635 | char *p; | 645 | char *p; |
636 | static char printk_buf[1024]; | ||
637 | static int log_level_unknown = 1; | ||
638 | 646 | ||
639 | boot_delay_msec(); | 647 | boot_delay_msec(); |
640 | 648 | ||
641 | preempt_disable(); | 649 | preempt_disable(); |
642 | if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) | ||
643 | /* If a crash is occurring during printk() on this CPU, | ||
644 | * make sure we can't deadlock */ | ||
645 | zap_locks(); | ||
646 | |||
647 | /* This stops the holder of console_sem just where we want him */ | 650 | /* This stops the holder of console_sem just where we want him */ |
648 | raw_local_irq_save(flags); | 651 | raw_local_irq_save(flags); |
652 | this_cpu = smp_processor_id(); | ||
653 | |||
654 | /* | ||
655 | * Ouch, printk recursed into itself! | ||
656 | */ | ||
657 | if (unlikely(printk_cpu == this_cpu)) { | ||
658 | /* | ||
659 | * If a crash is occurring during printk() on this CPU, | ||
660 | * then try to get the crash message out but make sure | ||
661 | * we can't deadlock. Otherwise just return to avoid the | ||
662 | * recursion - but flag the recursion so that it can be | ||
663 | * reported at the next appropriate moment: | ||
664 | */ | ||
665 | if (!oops_in_progress) { | ||
666 | printk_recursion_bug = 1; | ||
667 | goto out_restore_irqs; | ||
668 | } | ||
669 | zap_locks(); | ||
670 | } | ||
671 | |||
649 | lockdep_off(); | 672 | lockdep_off(); |
650 | spin_lock(&logbuf_lock); | 673 | spin_lock(&logbuf_lock); |
651 | printk_cpu = smp_processor_id(); | 674 | printk_cpu = this_cpu; |
652 | 675 | ||
676 | if (printk_recursion_bug) { | ||
677 | printk_recursion_bug = 0; | ||
678 | strcpy(printk_buf, printk_recursion_bug_msg); | ||
679 | printed_len = sizeof(printk_recursion_bug_msg); | ||
680 | } | ||
653 | /* Emit the output into the temporary buffer */ | 681 | /* Emit the output into the temporary buffer */ |
654 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); | 682 | printed_len += vscnprintf(printk_buf + printed_len, |
683 | sizeof(printk_buf), fmt, args); | ||
655 | 684 | ||
656 | /* | 685 | /* |
657 | * Copy the output into log_buf. If the caller didn't provide | 686 | * Copy the output into log_buf. If the caller didn't provide |
@@ -680,7 +709,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
680 | loglev_char = default_message_loglevel | 709 | loglev_char = default_message_loglevel |
681 | + '0'; | 710 | + '0'; |
682 | } | 711 | } |
683 | t = printk_clock(); | 712 | t = cpu_clock(printk_cpu); |
684 | nanosec_rem = do_div(t, 1000000000); | 713 | nanosec_rem = do_div(t, 1000000000); |
685 | tlen = sprintf(tbuf, | 714 | tlen = sprintf(tbuf, |
686 | "<%c>[%5lu.%06lu] ", | 715 | "<%c>[%5lu.%06lu] ", |
@@ -744,6 +773,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
744 | printk_cpu = UINT_MAX; | 773 | printk_cpu = UINT_MAX; |
745 | spin_unlock(&logbuf_lock); | 774 | spin_unlock(&logbuf_lock); |
746 | lockdep_on(); | 775 | lockdep_on(); |
776 | out_restore_irqs: | ||
747 | raw_local_irq_restore(flags); | 777 | raw_local_irq_restore(flags); |
748 | } | 778 | } |
749 | 779 | ||
diff --git a/kernel/profile.c b/kernel/profile.c index 5e95330e5120..e64c2da11c0f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); | |||
52 | static DEFINE_MUTEX(profile_flip_mutex); | 52 | static DEFINE_MUTEX(profile_flip_mutex); |
53 | #endif /* CONFIG_SMP */ | 53 | #endif /* CONFIG_SMP */ |
54 | 54 | ||
55 | static int __init profile_setup(char * str) | 55 | static int __init profile_setup(char *str) |
56 | { | 56 | { |
57 | static char __initdata schedstr[] = "schedule"; | 57 | static char __initdata schedstr[] = "schedule"; |
58 | static char __initdata sleepstr[] = "sleep"; | 58 | static char __initdata sleepstr[] = "sleep"; |
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup); | |||
104 | 104 | ||
105 | void __init profile_init(void) | 105 | void __init profile_init(void) |
106 | { | 106 | { |
107 | if (!prof_on) | 107 | if (!prof_on) |
108 | return; | 108 | return; |
109 | 109 | ||
110 | /* only text is profiled */ | 110 | /* only text is profiled */ |
111 | prof_len = (_etext - _stext) >> prof_shift; | 111 | prof_len = (_etext - _stext) >> prof_shift; |
112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); | 112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); |
113 | } | 113 | } |
114 | 114 | ||
115 | /* Profile event notifications */ | 115 | /* Profile event notifications */ |
116 | 116 | ||
117 | #ifdef CONFIG_PROFILING | 117 | #ifdef CONFIG_PROFILING |
118 | 118 | ||
119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); | 119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); |
120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); | 120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); |
121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); | 121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); |
122 | 122 | ||
123 | void profile_task_exit(struct task_struct * task) | 123 | void profile_task_exit(struct task_struct *task) |
124 | { | 124 | { |
125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); | 125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); |
126 | } | 126 | } |
127 | 127 | ||
128 | int profile_handoff_task(struct task_struct * task) | 128 | int profile_handoff_task(struct task_struct *task) |
129 | { | 129 | { |
130 | int ret; | 130 | int ret; |
131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); | 131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); |
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr) | |||
137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); | 137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); |
138 | } | 138 | } |
139 | 139 | ||
140 | int task_handoff_register(struct notifier_block * n) | 140 | int task_handoff_register(struct notifier_block *n) |
141 | { | 141 | { |
142 | return atomic_notifier_chain_register(&task_free_notifier, n); | 142 | return atomic_notifier_chain_register(&task_free_notifier, n); |
143 | } | 143 | } |
144 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
144 | 145 | ||
145 | int task_handoff_unregister(struct notifier_block * n) | 146 | int task_handoff_unregister(struct notifier_block *n) |
146 | { | 147 | { |
147 | return atomic_notifier_chain_unregister(&task_free_notifier, n); | 148 | return atomic_notifier_chain_unregister(&task_free_notifier, n); |
148 | } | 149 | } |
150 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
149 | 151 | ||
150 | int profile_event_register(enum profile_type type, struct notifier_block * n) | 152 | int profile_event_register(enum profile_type type, struct notifier_block *n) |
151 | { | 153 | { |
152 | int err = -EINVAL; | 154 | int err = -EINVAL; |
153 | 155 | ||
154 | switch (type) { | 156 | switch (type) { |
155 | case PROFILE_TASK_EXIT: | 157 | case PROFILE_TASK_EXIT: |
156 | err = blocking_notifier_chain_register( | 158 | err = blocking_notifier_chain_register( |
157 | &task_exit_notifier, n); | 159 | &task_exit_notifier, n); |
158 | break; | 160 | break; |
159 | case PROFILE_MUNMAP: | 161 | case PROFILE_MUNMAP: |
160 | err = blocking_notifier_chain_register( | 162 | err = blocking_notifier_chain_register( |
161 | &munmap_notifier, n); | 163 | &munmap_notifier, n); |
162 | break; | 164 | break; |
163 | } | 165 | } |
164 | 166 | ||
165 | return err; | 167 | return err; |
166 | } | 168 | } |
169 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
167 | 170 | ||
168 | 171 | int profile_event_unregister(enum profile_type type, struct notifier_block *n) | |
169 | int profile_event_unregister(enum profile_type type, struct notifier_block * n) | ||
170 | { | 172 | { |
171 | int err = -EINVAL; | 173 | int err = -EINVAL; |
172 | 174 | ||
173 | switch (type) { | 175 | switch (type) { |
174 | case PROFILE_TASK_EXIT: | 176 | case PROFILE_TASK_EXIT: |
175 | err = blocking_notifier_chain_unregister( | 177 | err = blocking_notifier_chain_unregister( |
176 | &task_exit_notifier, n); | 178 | &task_exit_notifier, n); |
177 | break; | 179 | break; |
178 | case PROFILE_MUNMAP: | 180 | case PROFILE_MUNMAP: |
179 | err = blocking_notifier_chain_unregister( | 181 | err = blocking_notifier_chain_unregister( |
180 | &munmap_notifier, n); | 182 | &munmap_notifier, n); |
181 | break; | 183 | break; |
182 | } | 184 | } |
183 | 185 | ||
184 | return err; | 186 | return err; |
185 | } | 187 | } |
188 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
186 | 189 | ||
187 | int register_timer_hook(int (*hook)(struct pt_regs *)) | 190 | int register_timer_hook(int (*hook)(struct pt_regs *)) |
188 | { | 191 | { |
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *)) | |||
191 | timer_hook = hook; | 194 | timer_hook = hook; |
192 | return 0; | 195 | return 0; |
193 | } | 196 | } |
197 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
194 | 198 | ||
195 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) | 199 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) |
196 | { | 200 | { |
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *)) | |||
199 | /* make sure all CPUs see the NULL hook */ | 203 | /* make sure all CPUs see the NULL hook */ |
200 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ | 204 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ |
201 | } | 205 | } |
202 | |||
203 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
204 | EXPORT_SYMBOL_GPL(unregister_timer_hook); | 206 | EXPORT_SYMBOL_GPL(unregister_timer_hook); |
205 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
206 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
207 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
208 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
209 | 207 | ||
210 | #endif /* CONFIG_PROFILING */ | 208 | #endif /* CONFIG_PROFILING */ |
211 | 209 | ||
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
366 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | 364 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); |
367 | } | 365 | } |
368 | break; | 366 | break; |
369 | out_free: | 367 | out_free: |
370 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | 368 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); |
371 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | 369 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; |
372 | __free_page(page); | 370 | __free_page(page); |
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits) | |||
409 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); | 407 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
410 | } | 408 | } |
411 | #endif /* !CONFIG_SMP */ | 409 | #endif /* !CONFIG_SMP */ |
412 | |||
413 | EXPORT_SYMBOL_GPL(profile_hits); | 410 | EXPORT_SYMBOL_GPL(profile_hits); |
414 | 411 | ||
415 | void profile_tick(int type) | 412 | void profile_tick(int type) |
@@ -427,7 +424,7 @@ void profile_tick(int type) | |||
427 | #include <asm/uaccess.h> | 424 | #include <asm/uaccess.h> |
428 | #include <asm/ptrace.h> | 425 | #include <asm/ptrace.h> |
429 | 426 | ||
430 | static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | 427 | static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, |
431 | int count, int *eof, void *data) | 428 | int count, int *eof, void *data) |
432 | { | 429 | { |
433 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); | 430 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); |
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | |||
437 | return len; | 434 | return len; |
438 | } | 435 | } |
439 | 436 | ||
440 | static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, | 437 | static int prof_cpu_mask_write_proc(struct file *file, |
441 | unsigned long count, void *data) | 438 | const char __user *buffer, unsigned long count, void *data) |
442 | { | 439 | { |
443 | cpumask_t *mask = (cpumask_t *)data; | 440 | cpumask_t *mask = (cpumask_t *)data; |
444 | unsigned long full_count = count, err; | 441 | unsigned long full_count = count, err; |
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) | |||
457 | struct proc_dir_entry *entry; | 454 | struct proc_dir_entry *entry; |
458 | 455 | ||
459 | /* create /proc/irq/prof_cpu_mask */ | 456 | /* create /proc/irq/prof_cpu_mask */ |
460 | if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) | 457 | entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); |
458 | if (!entry) | ||
461 | return; | 459 | return; |
462 | entry->data = (void *)&prof_cpu_mask; | 460 | entry->data = (void *)&prof_cpu_mask; |
463 | entry->read_proc = prof_cpu_mask_read_proc; | 461 | entry->read_proc = prof_cpu_mask_read_proc; |
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
475 | { | 473 | { |
476 | unsigned long p = *ppos; | 474 | unsigned long p = *ppos; |
477 | ssize_t read; | 475 | ssize_t read; |
478 | char * pnt; | 476 | char *pnt; |
479 | unsigned int sample_step = 1 << prof_shift; | 477 | unsigned int sample_step = 1 << prof_shift; |
480 | 478 | ||
481 | profile_flip_buffers(); | 479 | profile_flip_buffers(); |
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
486 | read = 0; | 484 | read = 0; |
487 | 485 | ||
488 | while (p < sizeof(unsigned int) && count > 0) { | 486 | while (p < sizeof(unsigned int) && count > 0) { |
489 | if (put_user(*((char *)(&sample_step)+p),buf)) | 487 | if (put_user(*((char *)(&sample_step)+p), buf)) |
490 | return -EFAULT; | 488 | return -EFAULT; |
491 | buf++; p++; count--; read++; | 489 | buf++; p++; count--; read++; |
492 | } | 490 | } |
493 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); | 491 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); |
494 | if (copy_to_user(buf,(void *)pnt,count)) | 492 | if (copy_to_user(buf, (void *)pnt, count)) |
495 | return -EFAULT; | 493 | return -EFAULT; |
496 | read += count; | 494 | read += count; |
497 | *ppos += read; | 495 | *ppos += read; |
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
508 | size_t count, loff_t *ppos) | 506 | size_t count, loff_t *ppos) |
509 | { | 507 | { |
510 | #ifdef CONFIG_SMP | 508 | #ifdef CONFIG_SMP |
511 | extern int setup_profiling_timer (unsigned int multiplier); | 509 | extern int setup_profiling_timer(unsigned int multiplier); |
512 | 510 | ||
513 | if (count == sizeof(int)) { | 511 | if (count == sizeof(int)) { |
514 | unsigned int multiplier; | 512 | unsigned int multiplier; |
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void) | |||
591 | return 0; | 589 | return 0; |
592 | if (create_hash_tables()) | 590 | if (create_hash_tables()) |
593 | return -1; | 591 | return -1; |
594 | if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) | 592 | entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); |
593 | if (!entry) | ||
595 | return 0; | 594 | return 0; |
596 | entry->proc_fops = &proc_profile_operations; | 595 | entry->proc_fops = &proc_profile_operations; |
597 | entry->size = (1+prof_len) * sizeof(atomic_t); | 596 | entry->size = (1+prof_len) * sizeof(atomic_t); |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c25db863081d..e6e9b8be4b05 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) | |||
366 | return error; | 366 | return error; |
367 | } | 367 | } |
368 | 368 | ||
369 | |||
/*
 * Request classifiers used by ptrace_resume().  Each one compares the
 * request against the matching PTRACE_* code when that code is defined
 * and collapses to the constant 0 otherwise, so the caller needs no
 * #ifdefs of its own.
 */
#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
#else
#define is_singlestep(request) 0
#endif

#ifdef PTRACE_SINGLEBLOCK
#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
#else
#define is_singleblock(request) 0
#endif

/*
 * NOTE(review): the guard tests PTRACE_SYSEMU while the macro body uses
 * PTRACE_SYSEMU_SINGLESTEP; this assumes the two are always defined
 * together -- confirm for every architecture.
 */
#ifdef PTRACE_SYSEMU
#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP)
#else
#define is_sysemu_singlestep(request) 0
#endif
387 | |||
388 | static int ptrace_resume(struct task_struct *child, long request, long data) | ||
389 | { | ||
390 | if (!valid_signal(data)) | ||
391 | return -EIO; | ||
392 | |||
393 | if (request == PTRACE_SYSCALL) | ||
394 | set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
395 | else | ||
396 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
397 | |||
398 | #ifdef TIF_SYSCALL_EMU | ||
399 | if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) | ||
400 | set_tsk_thread_flag(child, TIF_SYSCALL_EMU); | ||
401 | else | ||
402 | clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); | ||
403 | #endif | ||
404 | |||
405 | if (is_singleblock(request)) { | ||
406 | if (unlikely(!arch_has_block_step())) | ||
407 | return -EIO; | ||
408 | user_enable_block_step(child); | ||
409 | } else if (is_singlestep(request) || is_sysemu_singlestep(request)) { | ||
410 | if (unlikely(!arch_has_single_step())) | ||
411 | return -EIO; | ||
412 | user_enable_single_step(child); | ||
413 | } | ||
414 | else | ||
415 | user_disable_single_step(child); | ||
416 | |||
417 | child->exit_code = data; | ||
418 | wake_up_process(child); | ||
419 | |||
420 | return 0; | ||
421 | } | ||
422 | |||
369 | int ptrace_request(struct task_struct *child, long request, | 423 | int ptrace_request(struct task_struct *child, long request, |
370 | long addr, long data) | 424 | long addr, long data) |
371 | { | 425 | { |
372 | int ret = -EIO; | 426 | int ret = -EIO; |
373 | 427 | ||
374 | switch (request) { | 428 | switch (request) { |
429 | case PTRACE_PEEKTEXT: | ||
430 | case PTRACE_PEEKDATA: | ||
431 | return generic_ptrace_peekdata(child, addr, data); | ||
432 | case PTRACE_POKETEXT: | ||
433 | case PTRACE_POKEDATA: | ||
434 | return generic_ptrace_pokedata(child, addr, data); | ||
435 | |||
375 | #ifdef PTRACE_OLDSETOPTIONS | 436 | #ifdef PTRACE_OLDSETOPTIONS |
376 | case PTRACE_OLDSETOPTIONS: | 437 | case PTRACE_OLDSETOPTIONS: |
377 | #endif | 438 | #endif |
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request, | |||
390 | case PTRACE_DETACH: /* detach a process that was attached. */ | 451 | case PTRACE_DETACH: /* detach a process that was attached. */ |
391 | ret = ptrace_detach(child, data); | 452 | ret = ptrace_detach(child, data); |
392 | break; | 453 | break; |
454 | |||
455 | #ifdef PTRACE_SINGLESTEP | ||
456 | case PTRACE_SINGLESTEP: | ||
457 | #endif | ||
458 | #ifdef PTRACE_SINGLEBLOCK | ||
459 | case PTRACE_SINGLEBLOCK: | ||
460 | #endif | ||
461 | #ifdef PTRACE_SYSEMU | ||
462 | case PTRACE_SYSEMU: | ||
463 | case PTRACE_SYSEMU_SINGLESTEP: | ||
464 | #endif | ||
465 | case PTRACE_SYSCALL: | ||
466 | case PTRACE_CONT: | ||
467 | return ptrace_resume(child, request, data); | ||
468 | |||
469 | case PTRACE_KILL: | ||
470 | if (child->exit_state) /* already dead */ | ||
471 | return 0; | ||
472 | return ptrace_resume(child, request, SIGKILL); | ||
473 | |||
393 | default: | 474 | default: |
394 | break; | 475 | break; |
395 | } | 476 | } |
@@ -470,6 +551,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | |||
470 | lock_kernel(); | 551 | lock_kernel(); |
471 | if (request == PTRACE_TRACEME) { | 552 | if (request == PTRACE_TRACEME) { |
472 | ret = ptrace_traceme(); | 553 | ret = ptrace_traceme(); |
554 | if (!ret) | ||
555 | arch_ptrace_attach(current); | ||
473 | goto out; | 556 | goto out; |
474 | } | 557 | } |
475 | 558 | ||
@@ -524,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | |||
524 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); | 607 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); |
525 | return (copied == sizeof(data)) ? 0 : -EIO; | 608 | return (copied == sizeof(data)) ? 0 : -EIO; |
526 | } | 609 | } |
610 | |||
611 | #ifdef CONFIG_COMPAT | ||
612 | #include <linux/compat.h> | ||
613 | |||
614 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, | ||
615 | compat_ulong_t addr, compat_ulong_t data) | ||
616 | { | ||
617 | compat_ulong_t __user *datap = compat_ptr(data); | ||
618 | compat_ulong_t word; | ||
619 | int ret; | ||
620 | |||
621 | switch (request) { | ||
622 | case PTRACE_PEEKTEXT: | ||
623 | case PTRACE_PEEKDATA: | ||
624 | ret = access_process_vm(child, addr, &word, sizeof(word), 0); | ||
625 | if (ret != sizeof(word)) | ||
626 | ret = -EIO; | ||
627 | else | ||
628 | ret = put_user(word, datap); | ||
629 | break; | ||
630 | |||
631 | case PTRACE_POKETEXT: | ||
632 | case PTRACE_POKEDATA: | ||
633 | ret = access_process_vm(child, addr, &data, sizeof(data), 1); | ||
634 | ret = (ret != sizeof(data) ? -EIO : 0); | ||
635 | break; | ||
636 | |||
637 | case PTRACE_GETEVENTMSG: | ||
638 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); | ||
639 | break; | ||
640 | |||
641 | default: | ||
642 | ret = ptrace_request(child, request, addr, data); | ||
643 | } | ||
644 | |||
645 | return ret; | ||
646 | } | ||
647 | |||
648 | #ifdef __ARCH_WANT_COMPAT_SYS_PTRACE | ||
649 | asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | ||
650 | compat_long_t addr, compat_long_t data) | ||
651 | { | ||
652 | struct task_struct *child; | ||
653 | long ret; | ||
654 | |||
655 | /* | ||
656 | * This lock_kernel fixes a subtle race with suid exec | ||
657 | */ | ||
658 | lock_kernel(); | ||
659 | if (request == PTRACE_TRACEME) { | ||
660 | ret = ptrace_traceme(); | ||
661 | goto out; | ||
662 | } | ||
663 | |||
664 | child = ptrace_get_task_struct(pid); | ||
665 | if (IS_ERR(child)) { | ||
666 | ret = PTR_ERR(child); | ||
667 | goto out; | ||
668 | } | ||
669 | |||
670 | if (request == PTRACE_ATTACH) { | ||
671 | ret = ptrace_attach(child); | ||
672 | /* | ||
673 | * Some architectures need to do book-keeping after | ||
674 | * a ptrace attach. | ||
675 | */ | ||
676 | if (!ret) | ||
677 | arch_ptrace_attach(child); | ||
678 | goto out_put_task_struct; | ||
679 | } | ||
680 | |||
681 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
682 | if (!ret) | ||
683 | ret = compat_arch_ptrace(child, request, addr, data); | ||
684 | |||
685 | out_put_task_struct: | ||
686 | put_task_struct(child); | ||
687 | out: | ||
688 | unlock_kernel(); | ||
689 | return ret; | ||
690 | } | ||
691 | #endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ | ||
692 | |||
693 | #endif /* CONFIG_COMPAT */ | ||
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c new file mode 100644 index 000000000000..f4ffbd0f306f --- /dev/null +++ b/kernel/rcuclassic.c | |||
@@ -0,0 +1,575 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2001 | ||
19 | * | ||
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | ||
21 | * Manfred Spraul <manfred@colorfullife.com> | ||
22 | * | ||
23 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
25 | * Papers: | ||
26 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
27 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
28 | * | ||
29 | * For detailed explanation of Read-Copy Update mechanism see - | ||
30 | * Documentation/RCU | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/types.h> | ||
34 | #include <linux/kernel.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/spinlock.h> | ||
37 | #include <linux/smp.h> | ||
38 | #include <linux/rcupdate.h> | ||
39 | #include <linux/interrupt.h> | ||
40 | #include <linux/sched.h> | ||
41 | #include <asm/atomic.h> | ||
42 | #include <linux/bitops.h> | ||
43 | #include <linux/module.h> | ||
44 | #include <linux/completion.h> | ||
45 | #include <linux/moduleparam.h> | ||
46 | #include <linux/percpu.h> | ||
47 | #include <linux/notifier.h> | ||
48 | #include <linux/cpu.h> | ||
49 | #include <linux/mutex.h> | ||
50 | |||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* Lockdep map named "rcu_read_lock", exported for GPL modules. */
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);
#endif


/*
 * Definition for rcupdate control block.
 * Both batch counters start at the same value, so no grace period is in
 * progress initially (rcu_start_batch() treats cur == completed as idle).
 * NOTE(review): the choice of -300 is not explained here; presumably it
 * makes the counters wrap through zero early -- confirm.
 */
static struct rcu_ctrlblk rcu_ctrlblk = {
	.cur = -300,
	.completed = -300,
	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
	.cpumask = CPU_MASK_NONE,
};
/* Control block for the _bh flavour; initialised the same way. */
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
	.cur = -300,
	.completed = -300,
	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
	.cpumask = CPU_MASK_NONE,
};

/* Per-CPU callback queues and quiescent-state tracking, one per flavour. */
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };

/*
 * Tunables:
 *  blimit   - default cap on callbacks invoked per rcu_do_batch() pass.
 *  qhimark  - once a CPU's queue length exceeds this, call_rcu() and
 *             call_rcu_bh() lift the cap and force a quiescent state.
 *  qlowmark - once the queue length drops to this, rcu_do_batch()
 *             restores the default cap.
 */
static int blimit = 10;
static int qhimark = 10000;
static int qlowmark = 100;
79 | |||
#ifdef CONFIG_SMP
/*
 * Push the system towards a quiescent state: mark the current CPU as
 * needing a reschedule and, the first time this is requested for the
 * current grace period (rcp->signaled still clear), send a reschedule
 * IPI to every other CPU recorded in rcp->cpumask.
 */
static void force_quiescent_state(struct rcu_data *rdp,
			struct rcu_ctrlblk *rcp)
{
	cpumask_t others;
	int target;

	set_need_resched();
	if (likely(rcp->signaled))
		return;

	rcp->signaled = 1;
	/*
	 * Don't send IPI to itself. With irqs disabled,
	 * rdp->cpu is the current cpu.
	 */
	others = rcp->cpumask;
	cpu_clear(rdp->cpu, others);
	for_each_cpu_mask(target, others)
		smp_send_reschedule(target);
}
#else
/* Uniprocessor variant: only the local reschedule request is needed. */
static inline void force_quiescent_state(struct rcu_data *rdp,
			struct rcu_ctrlblk *rcp)
{
	set_need_resched();
}
#endif
106 | |||
107 | /** | ||
108 | * call_rcu - Queue an RCU callback for invocation after a grace period. | ||
109 | * @head: structure to be used for queueing the RCU updates. | ||
110 | * @func: actual update function to be invoked after the grace period | ||
111 | * | ||
112 | * The update function will be invoked some time after a full grace | ||
113 | * period elapses, in other words after all currently executing RCU | ||
114 | * read-side critical sections have completed. RCU read-side critical | ||
115 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
116 | * and may be nested. | ||
117 | */ | ||
118 | void call_rcu(struct rcu_head *head, | ||
119 | void (*func)(struct rcu_head *rcu)) | ||
120 | { | ||
121 | unsigned long flags; | ||
122 | struct rcu_data *rdp; | ||
123 | |||
124 | head->func = func; | ||
125 | head->next = NULL; | ||
126 | local_irq_save(flags); | ||
127 | rdp = &__get_cpu_var(rcu_data); | ||
128 | *rdp->nxttail = head; | ||
129 | rdp->nxttail = &head->next; | ||
130 | if (unlikely(++rdp->qlen > qhimark)) { | ||
131 | rdp->blimit = INT_MAX; | ||
132 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
133 | } | ||
134 | local_irq_restore(flags); | ||
135 | } | ||
136 | EXPORT_SYMBOL_GPL(call_rcu); | ||
137 | |||
138 | /** | ||
139 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | ||
140 | * @head: structure to be used for queueing the RCU updates. | ||
141 | * @func: actual update function to be invoked after the grace period | ||
142 | * | ||
143 | * The update function will be invoked some time after a full grace | ||
144 | * period elapses, in other words after all currently executing RCU | ||
145 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
146 | * that the read-side critical sections end on completion of a softirq | ||
147 | * handler. This means that read-side critical sections in process | ||
148 | * context must not be interrupted by softirqs. This interface is to be | ||
149 | * used when most of the read-side critical sections are in softirq context. | ||
150 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
151 | * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() | ||
152 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
153 | */ | ||
154 | void call_rcu_bh(struct rcu_head *head, | ||
155 | void (*func)(struct rcu_head *rcu)) | ||
156 | { | ||
157 | unsigned long flags; | ||
158 | struct rcu_data *rdp; | ||
159 | |||
160 | head->func = func; | ||
161 | head->next = NULL; | ||
162 | local_irq_save(flags); | ||
163 | rdp = &__get_cpu_var(rcu_bh_data); | ||
164 | *rdp->nxttail = head; | ||
165 | rdp->nxttail = &head->next; | ||
166 | |||
167 | if (unlikely(++rdp->qlen > qhimark)) { | ||
168 | rdp->blimit = INT_MAX; | ||
169 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
170 | } | ||
171 | |||
172 | local_irq_restore(flags); | ||
173 | } | ||
174 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
175 | |||
/*
 * Return the number of RCU batches processed thus far, i.e. the
 * "completed" counter of the regular control block.  Useful for debug
 * and statistics.  The value is read without taking rcu_ctrlblk.lock.
 */
long rcu_batches_completed(void)
{
	return rcu_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);
185 | |||
/*
 * Return the number of RCU _bh batches processed thus far, i.e. the
 * "completed" counter of the _bh control block.  Useful for debug and
 * statistics.  The value is read without taking rcu_bh_ctrlblk.lock.
 */
long rcu_batches_completed_bh(void)
{
	return rcu_bh_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
195 | |||
/* Raises the softirq for processing rcu_callbacks. */
static inline void raise_rcu_softirq(void)
{
	raise_softirq(RCU_SOFTIRQ);
	/*
	 * The smp_mb() here is required to ensure that this cpu's
	 * __rcu_process_callbacks() reads the most recently updated
	 * value of rcp->cur.
	 */
	smp_mb();
}
207 | |||
208 | /* | ||
209 | * Invoke the completed RCU callbacks. They are expected to be in | ||
210 | * a per-cpu list. | ||
211 | */ | ||
212 | static void rcu_do_batch(struct rcu_data *rdp) | ||
213 | { | ||
214 | struct rcu_head *next, *list; | ||
215 | int count = 0; | ||
216 | |||
217 | list = rdp->donelist; | ||
218 | while (list) { | ||
219 | next = list->next; | ||
220 | prefetch(next); | ||
221 | list->func(list); | ||
222 | list = next; | ||
223 | if (++count >= rdp->blimit) | ||
224 | break; | ||
225 | } | ||
226 | rdp->donelist = list; | ||
227 | |||
228 | local_irq_disable(); | ||
229 | rdp->qlen -= count; | ||
230 | local_irq_enable(); | ||
231 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
232 | rdp->blimit = blimit; | ||
233 | |||
234 | if (!rdp->donelist) | ||
235 | rdp->donetail = &rdp->donelist; | ||
236 | else | ||
237 | raise_rcu_softirq(); | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * Grace period handling: | ||
242 | * The grace period handling consists of two steps: | ||
243 | * - A new grace period is started. | ||
244 | * This is done by rcu_start_batch. The start is not broadcasted to | ||
245 | * all cpus, they must pick this up by comparing rcp->cur with | ||
246 | * rdp->quiescbatch. All cpus are recorded in the | ||
247 | * rcu_ctrlblk.cpumask bitmap. | ||
248 | * - All cpus must go through a quiescent state. | ||
249 | * Since the start of the grace period is not broadcasted, at least two | ||
250 | * calls to rcu_check_quiescent_state are required: | ||
251 | * The first call just notices that a new grace period is running. The | ||
252 | * following calls check if there was a quiescent state since the beginning | ||
253 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
254 | * the bitmap is empty, then the grace period is completed. | ||
255 | * rcu_check_quiescent_state (through cpu_quiet) calls rcu_start_batch() to | ||
256 | * start the next grace period (if necessary). | ||
257 | */ | ||
/*
 * Register a new batch of callbacks, and start it up if there is currently no
 * active batch and the batch to be registered has not already occurred.
 * Caller must hold rcu_ctrlblk.lock.
 *
 * A batch is started by incrementing rcp->cur and recording in
 * rcp->cpumask the CPUs that still have to report a quiescent state.
 */
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
{
	if (rcp->next_pending &&
			rcp->completed == rcp->cur) {
		rcp->next_pending = 0;
		/*
		 * next_pending == 0 must be visible in
		 * __rcu_process_callbacks() before it can see new value of cur.
		 */
		smp_wmb();
		rcp->cur++;

		/*
		 * nohz_cpu_mask must only be read after rcp->cur has been
		 * incremented, hence the full barrier.  Otherwise tickless
		 * idle CPUs could be included in rcp->cpumask, which would
		 * extend grace periods unnecessarily.
		 */
		smp_mb();
		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);

		/* Allow force_quiescent_state() to send IPIs for this batch. */
		rcp->signaled = 0;
	}
}
287 | |||
288 | /* | ||
289 | * cpu went through a quiescent state since the beginning of the grace period. | ||
290 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
291 | * cpu. Start another grace period if someone has further entries pending | ||
292 | */ | ||
293 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
294 | { | ||
295 | cpu_clear(cpu, rcp->cpumask); | ||
296 | if (cpus_empty(rcp->cpumask)) { | ||
297 | /* batch completed ! */ | ||
298 | rcp->completed = rcp->cur; | ||
299 | rcu_start_batch(rcp); | ||
300 | } | ||
301 | } | ||
302 | |||
/*
 * Check if the cpu has gone through a quiescent state (say context
 * switch). If so and if it already hasn't done so in this RCU
 * quiescent cycle, then indicate that it has done so.
 */
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
					struct rcu_data *rdp)
{
	if (rdp->quiescbatch != rcp->cur) {
		/* start new grace period: */
		rdp->qs_pending = 1;
		rdp->passed_quiesc = 0;
		rdp->quiescbatch = rcp->cur;
		return;
	}

	/* Grace period already completed for this cpu?
	 * qs_pending is checked instead of the actual bitmap to avoid
	 * cacheline thrashing.
	 */
	if (!rdp->qs_pending)
		return;

	/*
	 * Was there a quiescent state since the beginning of the grace
	 * period? If no, then exit and wait for the next call.
	 */
	if (!rdp->passed_quiesc)
		return;
	rdp->qs_pending = 0;

	spin_lock(&rcp->lock);
	/*
	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
	 * during cpu startup. Ignore the quiescent state.
	 */
	if (likely(rdp->quiescbatch == rcp->cur))
		cpu_quiet(rdp->cpu, rcp);

	spin_unlock(&rcp->lock);
}
344 | |||
345 | |||
346 | #ifdef CONFIG_HOTPLUG_CPU | ||
347 | |||
/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
 * locking requirements, the list it's pulling from has to belong to a cpu
 * which is dead and hence not processing interrupts.
 *
 * Appends the chain starting at @list (whose last ->next pointer is @tail)
 * to this_rdp's "next" list.  An empty @list leaves the tail untouched.
 */
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
				struct rcu_head **tail)
{
	/* call_rcu() updates nxttail under disabled interrupts as well */
	local_irq_disable();
	*this_rdp->nxttail = list;
	if (list)
		this_rdp->nxttail = tail;
	local_irq_enable();
}
361 | |||
/*
 * Take over the RCU state of a dead cpu for one RCU flavour.
 *
 * @this_rdp: per-cpu data of the cpu running this code (receives callbacks)
 * @rcp:      control block of the flavour being handled
 * @rdp:      per-cpu data of the cpu that went offline
 */
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
	/* if the cpu going offline owns the grace period
	 * we can block indefinitely waiting for it, so flush
	 * it here
	 */
	spin_lock_bh(&rcp->lock);
	if (rcp->cur != rcp->completed)
		cpu_quiet(rdp->cpu, rcp);
	spin_unlock_bh(&rcp->lock);
	/* Requeue all of the dead cpu's callbacks on this cpu's next list. */
	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
}
377 | |||
/*
 * Handle @cpu going offline: adopt its pending callbacks for both the
 * regular and the _bh flavour onto the cpu executing this function.
 */
static void rcu_offline_cpu(int cpu)
{
	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);

	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
					&per_cpu(rcu_data, cpu));
	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
					&per_cpu(rcu_bh_data, cpu));
	/* one put for each get_cpu_var() above */
	put_cpu_var(rcu_data);
	put_cpu_var(rcu_bh_data);
}
390 | |||
391 | #else | ||
392 | |||
/* Without CONFIG_HOTPLUG_CPU there is nothing to take over. */
static void rcu_offline_cpu(int cpu)
{
}
396 | |||
397 | #endif | ||
398 | |||
/*
 * This does the RCU processing work from softirq context.
 *
 * For one flavour (@rcp) and this cpu's data (@rdp) it:
 *  1. moves the current batch to the done list once rcp->completed is
 *     no longer before rdp->batch,
 *  2. promotes the next list to be the current batch and requests a new
 *     grace period for it,
 *  3. reports this cpu's quiescent state, and
 *  4. invokes callbacks that are on the done list.
 */
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
					struct rcu_data *rdp)
{
	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
		*rdp->donetail = rdp->curlist;
		rdp->donetail = rdp->curtail;
		rdp->curlist = NULL;
		rdp->curtail = &rdp->curlist;
	}

	if (rdp->nxtlist && !rdp->curlist) {
		/* the next list is appended to with interrupts disabled */
		local_irq_disable();
		rdp->curlist = rdp->nxtlist;
		rdp->curtail = rdp->nxttail;
		rdp->nxtlist = NULL;
		rdp->nxttail = &rdp->nxtlist;
		local_irq_enable();

		/*
		 * start the next batch of callbacks
		 */

		/* determine batch number */
		rdp->batch = rcp->cur + 1;
		/* see the comment and corresponding wmb() in
		 * the rcu_start_batch()
		 */
		smp_rmb();

		if (!rcp->next_pending) {
			/* and start it/schedule start if it's a new batch */
			spin_lock(&rcp->lock);
			rcp->next_pending = 1;
			rcu_start_batch(rcp);
			spin_unlock(&rcp->lock);
		}
	}

	rcu_check_quiescent_state(rcp, rdp);
	if (rdp->donelist)
		rcu_do_batch(rdp);
}
444 | |||
/*
 * RCU_SOFTIRQ handler: run the per-flavour processing for this cpu,
 * first for the regular flavour and then for the _bh flavour.
 */
static void rcu_process_callbacks(struct softirq_action *unused)
{
	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
}
450 | |||
451 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
452 | { | ||
453 | /* This cpu has pending rcu entries and the grace period | ||
454 | * for them has completed. | ||
455 | */ | ||
456 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
457 | return 1; | ||
458 | |||
459 | /* This cpu has no pending entries, but there are new entries */ | ||
460 | if (!rdp->curlist && rdp->nxtlist) | ||
461 | return 1; | ||
462 | |||
463 | /* This cpu has finished callbacks to invoke */ | ||
464 | if (rdp->donelist) | ||
465 | return 1; | ||
466 | |||
467 | /* The rcu core waits for a quiescent state from the cpu */ | ||
468 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
469 | return 1; | ||
470 | |||
471 | /* nothing to do */ | ||
472 | return 0; | ||
473 | } | ||
474 | |||
/*
 * Check to see if there is any immediate RCU-related work to be done
 * by the current CPU, returning 1 if so.  This function is part of the
 * RCU implementation; it is -not- an exported member of the RCU API.
 *
 * Both flavours are consulted; the _bh flavour is only examined when the
 * regular one reports no work.
 */
int rcu_pending(int cpu)
{
	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
}
485 | |||
486 | /* | ||
487 | * Check to see if any future RCU-related work will need to be done | ||
488 | * by the current CPU, even if none need be done immediately, returning | ||
489 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
490 | * an exported member of the RCU API. | ||
491 | */ | ||
492 | int rcu_needs_cpu(int cpu) | ||
493 | { | ||
494 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
495 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
496 | |||
497 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
498 | } | ||
499 | |||
500 | void rcu_check_callbacks(int cpu, int user) | ||
501 | { | ||
502 | if (user || | ||
503 | (idle_cpu(cpu) && !in_softirq() && | ||
504 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
505 | rcu_qsctr_inc(cpu); | ||
506 | rcu_bh_qsctr_inc(cpu); | ||
507 | } else if (!in_softirq()) | ||
508 | rcu_bh_qsctr_inc(cpu); | ||
509 | raise_rcu_softirq(); | ||
510 | } | ||
511 | |||
512 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
513 | struct rcu_data *rdp) | ||
514 | { | ||
515 | memset(rdp, 0, sizeof(*rdp)); | ||
516 | rdp->curtail = &rdp->curlist; | ||
517 | rdp->nxttail = &rdp->nxtlist; | ||
518 | rdp->donetail = &rdp->donelist; | ||
519 | rdp->quiescbatch = rcp->completed; | ||
520 | rdp->qs_pending = 0; | ||
521 | rdp->cpu = cpu; | ||
522 | rdp->blimit = blimit; | ||
523 | } | ||
524 | |||
/*
 * Prepare RCU for @cpu coming online: reset its per-cpu data for both
 * flavours and register the RCU softirq handler.  Note that the handler
 * registration is repeated on every call.
 */
static void __cpuinit rcu_online_cpu(int cpu)
{
	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);

	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
}
534 | |||
535 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
536 | unsigned long action, void *hcpu) | ||
537 | { | ||
538 | long cpu = (long)hcpu; | ||
539 | |||
540 | switch (action) { | ||
541 | case CPU_UP_PREPARE: | ||
542 | case CPU_UP_PREPARE_FROZEN: | ||
543 | rcu_online_cpu(cpu); | ||
544 | break; | ||
545 | case CPU_DEAD: | ||
546 | case CPU_DEAD_FROZEN: | ||
547 | rcu_offline_cpu(cpu); | ||
548 | break; | ||
549 | default: | ||
550 | break; | ||
551 | } | ||
552 | return NOTIFY_OK; | ||
553 | } | ||
554 | |||
555 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
556 | .notifier_call = rcu_cpu_notify, | ||
557 | }; | ||
558 | |||
/*
 * Initializes rcu mechanism.  Assumed to be called early.
 * That is before local timer(SMP) or jiffies timer (uniproc) is setup.
 * Note that rcu_qsctr and friends are implicitly
 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
 */
void __init __rcu_init(void)
{
	/* Set up the cpu running this code by faking its CPU_UP_PREPARE event */
	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	/* Register notifier for non-boot CPUs */
	register_cpu_notifier(&rcu_nb);
}
572 | |||
/* Expose the batching tunables as int parameters (permission argument 0). */
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f2c1a04e9b18..760dfc233a00 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -15,7 +15,7 @@ | |||
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2001 | 18 | * Copyright IBM Corporation, 2001 |
19 | * | 19 | * |
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> |
21 | * Manfred Spraul <manfred@colorfullife.com> | 21 | * Manfred Spraul <manfred@colorfullife.com> |
@@ -35,165 +35,57 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
38 | #include <linux/rcupdate.h> | ||
39 | #include <linux/interrupt.h> | 38 | #include <linux/interrupt.h> |
40 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
41 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
42 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
43 | #include <linux/module.h> | ||
44 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
45 | #include <linux/moduleparam.h> | ||
46 | #include <linux/percpu.h> | 43 | #include <linux/percpu.h> |
47 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
48 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
49 | #include <linux/mutex.h> | 46 | #include <linux/mutex.h> |
47 | #include <linux/module.h> | ||
50 | 48 | ||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 49 | struct rcu_synchronize { |
52 | static struct lock_class_key rcu_lock_key; | 50 | struct rcu_head head; |
53 | struct lockdep_map rcu_lock_map = | 51 | struct completion completion; |
54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
55 | |||
56 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
57 | #endif | ||
58 | |||
59 | /* Definition for rcupdate control block. */ | ||
60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
61 | .cur = -300, | ||
62 | .completed = -300, | ||
63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
64 | .cpumask = CPU_MASK_NONE, | ||
65 | }; | ||
66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
67 | .cur = -300, | ||
68 | .completed = -300, | ||
69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
70 | .cpumask = CPU_MASK_NONE, | ||
71 | }; | 52 | }; |
72 | 53 | ||
73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | 54 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
75 | |||
76 | /* Fake initialization required by compiler */ | ||
77 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | ||
78 | static int blimit = 10; | ||
79 | static int qhimark = 10000; | ||
80 | static int qlowmark = 100; | ||
81 | |||
82 | static atomic_t rcu_barrier_cpu_count; | 55 | static atomic_t rcu_barrier_cpu_count; |
83 | static DEFINE_MUTEX(rcu_barrier_mutex); | 56 | static DEFINE_MUTEX(rcu_barrier_mutex); |
84 | static struct completion rcu_barrier_completion; | 57 | static struct completion rcu_barrier_completion; |
85 | 58 | ||
86 | #ifdef CONFIG_SMP | 59 | /* Because of FASTCALL declaration of complete, we use this wrapper */ |
87 | static void force_quiescent_state(struct rcu_data *rdp, | 60 | static void wakeme_after_rcu(struct rcu_head *head) |
88 | struct rcu_ctrlblk *rcp) | ||
89 | { | ||
90 | int cpu; | ||
91 | cpumask_t cpumask; | ||
92 | set_need_resched(); | ||
93 | if (unlikely(!rcp->signaled)) { | ||
94 | rcp->signaled = 1; | ||
95 | /* | ||
96 | * Don't send IPI to itself. With irqs disabled, | ||
97 | * rdp->cpu is the current cpu. | ||
98 | */ | ||
99 | cpumask = rcp->cpumask; | ||
100 | cpu_clear(rdp->cpu, cpumask); | ||
101 | for_each_cpu_mask(cpu, cpumask) | ||
102 | smp_send_reschedule(cpu); | ||
103 | } | ||
104 | } | ||
105 | #else | ||
106 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
107 | struct rcu_ctrlblk *rcp) | ||
108 | { | 61 | { |
109 | set_need_resched(); | 62 | struct rcu_synchronize *rcu; |
63 | |||
64 | rcu = container_of(head, struct rcu_synchronize, head); | ||
65 | complete(&rcu->completion); | ||
110 | } | 66 | } |
111 | #endif | ||
112 | 67 | ||
113 | /** | 68 | /** |
114 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 69 | * synchronize_rcu - wait until a grace period has elapsed. |
115 | * @head: structure to be used for queueing the RCU updates. | ||
116 | * @func: actual update function to be invoked after the grace period | ||
117 | * | 70 | * |
118 | * The update function will be invoked some time after a full grace | 71 | * Control will return to the caller some time after a full grace |
119 | * period elapses, in other words after all currently executing RCU | 72 | * period has elapsed, in other words after all currently executing RCU |
120 | * read-side critical sections have completed. RCU read-side critical | 73 | * read-side critical sections have completed. RCU read-side critical |
121 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 74 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), |
122 | * and may be nested. | 75 | * and may be nested. |
123 | */ | 76 | */ |
124 | void fastcall call_rcu(struct rcu_head *head, | 77 | void synchronize_rcu(void) |
125 | void (*func)(struct rcu_head *rcu)) | ||
126 | { | ||
127 | unsigned long flags; | ||
128 | struct rcu_data *rdp; | ||
129 | |||
130 | head->func = func; | ||
131 | head->next = NULL; | ||
132 | local_irq_save(flags); | ||
133 | rdp = &__get_cpu_var(rcu_data); | ||
134 | *rdp->nxttail = head; | ||
135 | rdp->nxttail = &head->next; | ||
136 | if (unlikely(++rdp->qlen > qhimark)) { | ||
137 | rdp->blimit = INT_MAX; | ||
138 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
139 | } | ||
140 | local_irq_restore(flags); | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | ||
145 | * @head: structure to be used for queueing the RCU updates. | ||
146 | * @func: actual update function to be invoked after the grace period | ||
147 | * | ||
148 | * The update function will be invoked some time after a full grace | ||
149 | * period elapses, in other words after all currently executing RCU | ||
150 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
151 | * that the read-side critical sections end on completion of a softirq | ||
152 | * handler. This means that read-side critical sections in process | ||
153 | * context must not be interrupted by softirqs. This interface is to be | ||
154 | * used when most of the read-side critical sections are in softirq context. | ||
155 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
156 | * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() | ||
157 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
158 | */ | ||
159 | void fastcall call_rcu_bh(struct rcu_head *head, | ||
160 | void (*func)(struct rcu_head *rcu)) | ||
161 | { | 78 | { |
162 | unsigned long flags; | 79 | struct rcu_synchronize rcu; |
163 | struct rcu_data *rdp; | ||
164 | |||
165 | head->func = func; | ||
166 | head->next = NULL; | ||
167 | local_irq_save(flags); | ||
168 | rdp = &__get_cpu_var(rcu_bh_data); | ||
169 | *rdp->nxttail = head; | ||
170 | rdp->nxttail = &head->next; | ||
171 | |||
172 | if (unlikely(++rdp->qlen > qhimark)) { | ||
173 | rdp->blimit = INT_MAX; | ||
174 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
175 | } | ||
176 | |||
177 | local_irq_restore(flags); | ||
178 | } | ||
179 | 80 | ||
180 | /* | 81 | init_completion(&rcu.completion); |
181 | * Return the number of RCU batches processed thus far. Useful | 82 | /* Will wake me after RCU finished */ |
182 | * for debug and statistics. | 83 | call_rcu(&rcu.head, wakeme_after_rcu); |
183 | */ | ||
184 | long rcu_batches_completed(void) | ||
185 | { | ||
186 | return rcu_ctrlblk.completed; | ||
187 | } | ||
188 | 84 | ||
189 | /* | 85 | /* Wait for it */ |
190 | * Return the number of RCU batches processed thus far. Useful | 86 | wait_for_completion(&rcu.completion); |
191 | * for debug and statistics. | ||
192 | */ | ||
193 | long rcu_batches_completed_bh(void) | ||
194 | { | ||
195 | return rcu_bh_ctrlblk.completed; | ||
196 | } | 87 | } |
88 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
197 | 89 | ||
198 | static void rcu_barrier_callback(struct rcu_head *notused) | 90 | static void rcu_barrier_callback(struct rcu_head *notused) |
199 | { | 91 | { |
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused) | |||
207 | static void rcu_barrier_func(void *notused) | 99 | static void rcu_barrier_func(void *notused) |
208 | { | 100 | { |
209 | int cpu = smp_processor_id(); | 101 | int cpu = smp_processor_id(); |
210 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | 102 | struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); |
211 | struct rcu_head *head; | ||
212 | 103 | ||
213 | head = &rdp->barrier; | ||
214 | atomic_inc(&rcu_barrier_cpu_count); | 104 | atomic_inc(&rcu_barrier_cpu_count); |
215 | call_rcu(head, rcu_barrier_callback); | 105 | call_rcu(head, rcu_barrier_callback); |
216 | } | 106 | } |
@@ -225,420 +115,24 @@ void rcu_barrier(void) | |||
225 | mutex_lock(&rcu_barrier_mutex); | 115 | mutex_lock(&rcu_barrier_mutex); |
226 | init_completion(&rcu_barrier_completion); | 116 | init_completion(&rcu_barrier_completion); |
227 | atomic_set(&rcu_barrier_cpu_count, 0); | 117 | atomic_set(&rcu_barrier_cpu_count, 0); |
118 | /* | ||
119 | * The queueing of callbacks in all CPUs must be atomic with | ||
120 | * respect to RCU, otherwise one CPU may queue a callback, | ||
121 | * wait for a grace period, decrement barrier count and call | ||
122 | * complete(), while other CPUs have not yet queued anything. | ||
123 | * So, we need to make sure that grace periods cannot complete | ||
124 | * until all the callbacks are queued. | ||
125 | */ | ||
126 | rcu_read_lock(); | ||
228 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); | 127 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); |
128 | rcu_read_unlock(); | ||
229 | wait_for_completion(&rcu_barrier_completion); | 129 | wait_for_completion(&rcu_barrier_completion); |
230 | mutex_unlock(&rcu_barrier_mutex); | 130 | mutex_unlock(&rcu_barrier_mutex); |
231 | } | 131 | } |
232 | EXPORT_SYMBOL_GPL(rcu_barrier); | 132 | EXPORT_SYMBOL_GPL(rcu_barrier); |
233 | 133 | ||
234 | /* | ||
235 | * Invoke the completed RCU callbacks. They are expected to be in | ||
236 | * a per-cpu list. | ||
237 | */ | ||
238 | static void rcu_do_batch(struct rcu_data *rdp) | ||
239 | { | ||
240 | struct rcu_head *next, *list; | ||
241 | int count = 0; | ||
242 | |||
243 | list = rdp->donelist; | ||
244 | while (list) { | ||
245 | next = list->next; | ||
246 | prefetch(next); | ||
247 | list->func(list); | ||
248 | list = next; | ||
249 | if (++count >= rdp->blimit) | ||
250 | break; | ||
251 | } | ||
252 | rdp->donelist = list; | ||
253 | |||
254 | local_irq_disable(); | ||
255 | rdp->qlen -= count; | ||
256 | local_irq_enable(); | ||
257 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
258 | rdp->blimit = blimit; | ||
259 | |||
260 | if (!rdp->donelist) | ||
261 | rdp->donetail = &rdp->donelist; | ||
262 | else | ||
263 | tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); | ||
264 | } | ||
265 | |||
266 | /* | ||
267 | * Grace period handling: | ||
268 | * The grace period handling consists out of two steps: | ||
269 | * - A new grace period is started. | ||
270 | * This is done by rcu_start_batch. The start is not broadcasted to | ||
271 | * all cpus, they must pick this up by comparing rcp->cur with | ||
272 | * rdp->quiescbatch. All cpus are recorded in the | ||
273 | * rcu_ctrlblk.cpumask bitmap. | ||
274 | * - All cpus must go through a quiescent state. | ||
275 | * Since the start of the grace period is not broadcasted, at least two | ||
276 | * calls to rcu_check_quiescent_state are required: | ||
277 | * The first call just notices that a new grace period is running. The | ||
278 | * following calls check if there was a quiescent state since the beginning | ||
279 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
280 | * the bitmap is empty, then the grace period is completed. | ||
281 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | ||
282 | * period (if necessary). | ||
283 | */ | ||
284 | /* | ||
285 | * Register a new batch of callbacks, and start it up if there is currently no | ||
286 | * active batch and the batch to be registered has not already occurred. | ||
287 | * Caller must hold rcu_ctrlblk.lock. | ||
288 | */ | ||
289 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
290 | { | ||
291 | if (rcp->next_pending && | ||
292 | rcp->completed == rcp->cur) { | ||
293 | rcp->next_pending = 0; | ||
294 | /* | ||
295 | * next_pending == 0 must be visible in | ||
296 | * __rcu_process_callbacks() before it can see new value of cur. | ||
297 | */ | ||
298 | smp_wmb(); | ||
299 | rcp->cur++; | ||
300 | |||
301 | /* | ||
302 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
303 | * Barrier Otherwise it can cause tickless idle CPUs to be | ||
304 | * included in rcp->cpumask, which will extend graceperiods | ||
305 | * unnecessarily. | ||
306 | */ | ||
307 | smp_mb(); | ||
308 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
309 | |||
310 | rcp->signaled = 0; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * cpu went through a quiescent state since the beginning of the grace period. | ||
316 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
317 | * cpu. Start another grace period if someone has further entries pending | ||
318 | */ | ||
319 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
320 | { | ||
321 | cpu_clear(cpu, rcp->cpumask); | ||
322 | if (cpus_empty(rcp->cpumask)) { | ||
323 | /* batch completed ! */ | ||
324 | rcp->completed = rcp->cur; | ||
325 | rcu_start_batch(rcp); | ||
326 | } | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * Check if the cpu has gone through a quiescent state (say context | ||
331 | * switch). If so and if it already hasn't done so in this RCU | ||
332 | * quiescent cycle, then indicate that it has done so. | ||
333 | */ | ||
334 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
335 | struct rcu_data *rdp) | ||
336 | { | ||
337 | if (rdp->quiescbatch != rcp->cur) { | ||
338 | /* start new grace period: */ | ||
339 | rdp->qs_pending = 1; | ||
340 | rdp->passed_quiesc = 0; | ||
341 | rdp->quiescbatch = rcp->cur; | ||
342 | return; | ||
343 | } | ||
344 | |||
345 | /* Grace period already completed for this cpu? | ||
346 | * qs_pending is checked instead of the actual bitmap to avoid | ||
347 | * cacheline trashing. | ||
348 | */ | ||
349 | if (!rdp->qs_pending) | ||
350 | return; | ||
351 | |||
352 | /* | ||
353 | * Was there a quiescent state since the beginning of the grace | ||
354 | * period? If no, then exit and wait for the next call. | ||
355 | */ | ||
356 | if (!rdp->passed_quiesc) | ||
357 | return; | ||
358 | rdp->qs_pending = 0; | ||
359 | |||
360 | spin_lock(&rcp->lock); | ||
361 | /* | ||
362 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
363 | * during cpu startup. Ignore the quiescent state. | ||
364 | */ | ||
365 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
366 | cpu_quiet(rdp->cpu, rcp); | ||
367 | |||
368 | spin_unlock(&rcp->lock); | ||
369 | } | ||
370 | |||
371 | |||
372 | #ifdef CONFIG_HOTPLUG_CPU | ||
373 | |||
374 | /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing | ||
375 | * locking requirements, the list it's pulling from has to belong to a cpu | ||
376 | * which is dead and hence not processing interrupts. | ||
377 | */ | ||
378 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
379 | struct rcu_head **tail) | ||
380 | { | ||
381 | local_irq_disable(); | ||
382 | *this_rdp->nxttail = list; | ||
383 | if (list) | ||
384 | this_rdp->nxttail = tail; | ||
385 | local_irq_enable(); | ||
386 | } | ||
387 | |||
388 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
389 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
390 | { | ||
391 | /* if the cpu going offline owns the grace period | ||
392 | * we can block indefinitely waiting for it, so flush | ||
393 | * it here | ||
394 | */ | ||
395 | spin_lock_bh(&rcp->lock); | ||
396 | if (rcp->cur != rcp->completed) | ||
397 | cpu_quiet(rdp->cpu, rcp); | ||
398 | spin_unlock_bh(&rcp->lock); | ||
399 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
400 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
401 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
402 | } | ||
403 | |||
404 | static void rcu_offline_cpu(int cpu) | ||
405 | { | ||
406 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
407 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
408 | |||
409 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
410 | &per_cpu(rcu_data, cpu)); | ||
411 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
412 | &per_cpu(rcu_bh_data, cpu)); | ||
413 | put_cpu_var(rcu_data); | ||
414 | put_cpu_var(rcu_bh_data); | ||
415 | tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); | ||
416 | } | ||
417 | |||
418 | #else | ||
419 | |||
420 | static void rcu_offline_cpu(int cpu) | ||
421 | { | ||
422 | } | ||
423 | |||
424 | #endif | ||
425 | |||
426 | /* | ||
427 | * This does the RCU processing work from tasklet context. | ||
428 | */ | ||
429 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
430 | struct rcu_data *rdp) | ||
431 | { | ||
432 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
433 | *rdp->donetail = rdp->curlist; | ||
434 | rdp->donetail = rdp->curtail; | ||
435 | rdp->curlist = NULL; | ||
436 | rdp->curtail = &rdp->curlist; | ||
437 | } | ||
438 | |||
439 | if (rdp->nxtlist && !rdp->curlist) { | ||
440 | local_irq_disable(); | ||
441 | rdp->curlist = rdp->nxtlist; | ||
442 | rdp->curtail = rdp->nxttail; | ||
443 | rdp->nxtlist = NULL; | ||
444 | rdp->nxttail = &rdp->nxtlist; | ||
445 | local_irq_enable(); | ||
446 | |||
447 | /* | ||
448 | * start the next batch of callbacks | ||
449 | */ | ||
450 | |||
451 | /* determine batch number */ | ||
452 | rdp->batch = rcp->cur + 1; | ||
453 | /* see the comment and corresponding wmb() in | ||
454 | * the rcu_start_batch() | ||
455 | */ | ||
456 | smp_rmb(); | ||
457 | |||
458 | if (!rcp->next_pending) { | ||
459 | /* and start it/schedule start if it's a new batch */ | ||
460 | spin_lock(&rcp->lock); | ||
461 | rcp->next_pending = 1; | ||
462 | rcu_start_batch(rcp); | ||
463 | spin_unlock(&rcp->lock); | ||
464 | } | ||
465 | } | ||
466 | |||
467 | rcu_check_quiescent_state(rcp, rdp); | ||
468 | if (rdp->donelist) | ||
469 | rcu_do_batch(rdp); | ||
470 | } | ||
471 | |||
472 | static void rcu_process_callbacks(unsigned long unused) | ||
473 | { | ||
474 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
475 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
476 | } | ||
477 | |||
478 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
479 | { | ||
480 | /* This cpu has pending rcu entries and the grace period | ||
481 | * for them has completed. | ||
482 | */ | ||
483 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
484 | return 1; | ||
485 | |||
486 | /* This cpu has no pending entries, but there are new entries */ | ||
487 | if (!rdp->curlist && rdp->nxtlist) | ||
488 | return 1; | ||
489 | |||
490 | /* This cpu has finished callbacks to invoke */ | ||
491 | if (rdp->donelist) | ||
492 | return 1; | ||
493 | |||
494 | /* The rcu core waits for a quiescent state from the cpu */ | ||
495 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
496 | return 1; | ||
497 | |||
498 | /* nothing to do */ | ||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Check to see if there is any immediate RCU-related work to be done | ||
504 | * by the current CPU, returning 1 if so. This function is part of the | ||
505 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
506 | */ | ||
507 | int rcu_pending(int cpu) | ||
508 | { | ||
509 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
510 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * Check to see if any future RCU-related work will need to be done | ||
515 | * by the current CPU, even if none need be done immediately, returning | ||
516 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
517 | * an exported member of the RCU API. | ||
518 | */ | ||
519 | int rcu_needs_cpu(int cpu) | ||
520 | { | ||
521 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
522 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
523 | |||
524 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
525 | } | ||
526 | |||
527 | void rcu_check_callbacks(int cpu, int user) | ||
528 | { | ||
529 | if (user || | ||
530 | (idle_cpu(cpu) && !in_softirq() && | ||
531 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
532 | rcu_qsctr_inc(cpu); | ||
533 | rcu_bh_qsctr_inc(cpu); | ||
534 | } else if (!in_softirq()) | ||
535 | rcu_bh_qsctr_inc(cpu); | ||
536 | tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); | ||
537 | } | ||
538 | |||
539 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
540 | struct rcu_data *rdp) | ||
541 | { | ||
542 | memset(rdp, 0, sizeof(*rdp)); | ||
543 | rdp->curtail = &rdp->curlist; | ||
544 | rdp->nxttail = &rdp->nxtlist; | ||
545 | rdp->donetail = &rdp->donelist; | ||
546 | rdp->quiescbatch = rcp->completed; | ||
547 | rdp->qs_pending = 0; | ||
548 | rdp->cpu = cpu; | ||
549 | rdp->blimit = blimit; | ||
550 | } | ||
551 | |||
552 | static void __cpuinit rcu_online_cpu(int cpu) | ||
553 | { | ||
554 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
555 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
556 | |||
557 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
558 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
559 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | ||
560 | } | ||
561 | |||
562 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
563 | unsigned long action, void *hcpu) | ||
564 | { | ||
565 | long cpu = (long)hcpu; | ||
566 | switch (action) { | ||
567 | case CPU_UP_PREPARE: | ||
568 | case CPU_UP_PREPARE_FROZEN: | ||
569 | rcu_online_cpu(cpu); | ||
570 | break; | ||
571 | case CPU_DEAD: | ||
572 | case CPU_DEAD_FROZEN: | ||
573 | rcu_offline_cpu(cpu); | ||
574 | break; | ||
575 | default: | ||
576 | break; | ||
577 | } | ||
578 | return NOTIFY_OK; | ||
579 | } | ||
580 | |||
581 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
582 | .notifier_call = rcu_cpu_notify, | ||
583 | }; | ||
584 | |||
585 | /* | ||
586 | * Initializes rcu mechanism. Assumed to be called early. | ||
587 | * That is before local timer(SMP) or jiffie timer (uniproc) is setup. | ||
588 | * Note that rcu_qsctr and friends are implicitly | ||
589 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
590 | */ | ||
591 | void __init rcu_init(void) | 134 | void __init rcu_init(void) |
592 | { | 135 | { |
593 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | 136 | __rcu_init(); |
594 | (void *)(long)smp_processor_id()); | ||
595 | /* Register notifier for non-boot CPUs */ | ||
596 | register_cpu_notifier(&rcu_nb); | ||
597 | } | ||
598 | |||
599 | struct rcu_synchronize { | ||
600 | struct rcu_head head; | ||
601 | struct completion completion; | ||
602 | }; | ||
603 | |||
604 | /* Because of FASTCALL declaration of complete, we use this wrapper */ | ||
605 | static void wakeme_after_rcu(struct rcu_head *head) | ||
606 | { | ||
607 | struct rcu_synchronize *rcu; | ||
608 | |||
609 | rcu = container_of(head, struct rcu_synchronize, head); | ||
610 | complete(&rcu->completion); | ||
611 | } | 137 | } |
612 | 138 | ||
613 | /** | ||
614 | * synchronize_rcu - wait until a grace period has elapsed. | ||
615 | * | ||
616 | * Control will return to the caller some time after a full grace | ||
617 | * period has elapsed, in other words after all currently executing RCU | ||
618 | * read-side critical sections have completed. RCU read-side critical | ||
619 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
620 | * and may be nested. | ||
621 | * | ||
622 | * If your read-side code is not protected by rcu_read_lock(), do -not- | ||
623 | * use synchronize_rcu(). | ||
624 | */ | ||
625 | void synchronize_rcu(void) | ||
626 | { | ||
627 | struct rcu_synchronize rcu; | ||
628 | |||
629 | init_completion(&rcu.completion); | ||
630 | /* Will wake me after RCU finished */ | ||
631 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
632 | |||
633 | /* Wait for it */ | ||
634 | wait_for_completion(&rcu.completion); | ||
635 | } | ||
636 | |||
637 | module_param(blimit, int, 0); | ||
638 | module_param(qhimark, int, 0); | ||
639 | module_param(qlowmark, int, 0); | ||
640 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
641 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
642 | EXPORT_SYMBOL_GPL(call_rcu); | ||
643 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
644 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c new file mode 100644 index 000000000000..987cfb7ade89 --- /dev/null +++ b/kernel/rcupreempt.c | |||
@@ -0,0 +1,953 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion, realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar | ||
22 | * for pushing me away from locks and towards counters, and | ||
23 | * to Suparna Bhattacharya for pushing me completely away | ||
24 | * from atomic instructions on the read side. | ||
25 | * | ||
26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
27 | * | ||
28 | * Design Document: http://lwn.net/Articles/253651/ | ||
29 | * | ||
30 | * For detailed explanation of Read-Copy Update mechanism see - | ||
31 | * Documentation/RCU/ *.txt | ||
32 | * | ||
33 | */ | ||
34 | #include <linux/types.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/init.h> | ||
37 | #include <linux/spinlock.h> | ||
38 | #include <linux/smp.h> | ||
39 | #include <linux/rcupdate.h> | ||
40 | #include <linux/interrupt.h> | ||
41 | #include <linux/sched.h> | ||
42 | #include <asm/atomic.h> | ||
43 | #include <linux/bitops.h> | ||
44 | #include <linux/module.h> | ||
45 | #include <linux/completion.h> | ||
46 | #include <linux/moduleparam.h> | ||
47 | #include <linux/percpu.h> | ||
48 | #include <linux/notifier.h> | ||
49 | #include <linux/rcupdate.h> | ||
50 | #include <linux/cpu.h> | ||
51 | #include <linux/random.h> | ||
52 | #include <linux/delay.h> | ||
53 | #include <linux/byteorder/swabb.h> | ||
54 | #include <linux/cpumask.h> | ||
55 | #include <linux/rcupreempt_trace.h> | ||
56 | |||
57 | /* | ||
58 | * Macro that prevents the compiler from reordering accesses, but does | ||
59 | * absolutely -nothing- to prevent CPUs from reordering. This is used | ||
60 | * only to mediate communication between mainline code and hardware | ||
61 | * interrupt and NMI handlers. | ||
62 | */ | ||
63 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) | ||
64 | |||
65 | /* | ||
66 | * PREEMPT_RCU data structures. | ||
67 | */ | ||
68 | |||
69 | /* | ||
70 | * GP_STAGES specifies the number of times the state machine has | ||
71 | * to go through all the rcu_try_flip_states (see below) | ||
72 | * in a single Grace Period. | ||
73 | * | ||
74 | * GP in GP_STAGES stands for Grace Period ;) | ||
75 | */ | ||
76 | #define GP_STAGES 2 | ||
77 | struct rcu_data { | ||
78 | spinlock_t lock; /* Protect rcu_data fields. */ | ||
79 | long completed; /* Number of last completed batch. */ | ||
80 | int waitlistcount; | ||
81 | struct tasklet_struct rcu_tasklet; | ||
82 | struct rcu_head *nextlist; | ||
83 | struct rcu_head **nexttail; | ||
84 | struct rcu_head *waitlist[GP_STAGES]; | ||
85 | struct rcu_head **waittail[GP_STAGES]; | ||
86 | struct rcu_head *donelist; | ||
87 | struct rcu_head **donetail; | ||
88 | long rcu_flipctr[2]; | ||
89 | #ifdef CONFIG_RCU_TRACE | ||
90 | struct rcupreempt_trace trace; | ||
91 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
92 | }; | ||
93 | |||
94 | /* | ||
95 | * States for rcu_try_flip() and friends. | ||
96 | */ | ||
97 | |||
98 | enum rcu_try_flip_states { | ||
99 | |||
100 | /* | ||
101 | * Stay here if nothing is happening. Flip the counter if something | ||
102 | * starts happening. Denoted by "I" | ||
103 | */ | ||
104 | rcu_try_flip_idle_state, | ||
105 | |||
106 | /* | ||
107 | * Wait here for all CPUs to notice that the counter has flipped. This | ||
108 | * prevents the old set of counters from ever being incremented once | ||
109 | * we leave this state, which in turn is necessary because we cannot | ||
110 | * test any individual counter for zero -- we can only check the sum. | ||
111 | * Denoted by "A". | ||
112 | */ | ||
113 | rcu_try_flip_waitack_state, | ||
114 | |||
115 | /* | ||
116 | * Wait here for the sum of the old per-CPU counters to reach zero. | ||
117 | * Denoted by "Z". | ||
118 | */ | ||
119 | rcu_try_flip_waitzero_state, | ||
120 | |||
121 | /* | ||
122 | * Wait here for each of the other CPUs to execute a memory barrier. | ||
123 | * This is necessary to ensure that these other CPUs really have | ||
124 | * completed executing their RCU read-side critical sections, despite | ||
125 | * their CPUs wildly reordering memory. Denoted by "M". | ||
126 | */ | ||
127 | rcu_try_flip_waitmb_state, | ||
128 | }; | ||
129 | |||
130 | struct rcu_ctrlblk { | ||
131 | spinlock_t fliplock; /* Protect state-machine transitions. */ | ||
132 | long completed; /* Number of last completed batch. */ | ||
133 | enum rcu_try_flip_states rcu_try_flip_state; /* The current state of | ||
134 | the rcu state machine */ | ||
135 | }; | ||
136 | |||
137 | static DEFINE_PER_CPU(struct rcu_data, rcu_data); | ||
138 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
139 | .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), | ||
140 | .completed = 0, | ||
141 | .rcu_try_flip_state = rcu_try_flip_idle_state, | ||
142 | }; | ||
143 | |||
144 | |||
145 | #ifdef CONFIG_RCU_TRACE | ||
146 | static char *rcu_try_flip_state_names[] = | ||
147 | { "idle", "waitack", "waitzero", "waitmb" }; | ||
148 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
149 | |||
150 | static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; | ||
151 | |||
/*
 * Enum and per-CPU flag to determine when each CPU has seen
 * the most recent counter flip.
 *
 * The "only ... can update" notes say who may move the flag OUT of the
 * given state: the grace-period detector sets rcu_flipped (see
 * rcu_try_flip_idle()), and the CPU itself sets rcu_flip_seen again
 * (see __rcu_advance_callbacks(); rcu_offline_cpu() does so on behalf
 * of a dead CPU).
 */

enum rcu_flip_flag_values {
	rcu_flip_seen,		/* Steady/initial state, last flip seen. */
				/* Only GP detector can update. */
	rcu_flipped		/* Flip just completed, need confirmation. */
				/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
								= rcu_flip_seen;
165 | |||
/*
 * Enum and per-CPU flag to determine when each CPU has executed the
 * needed memory barrier to fence in memory references from its last RCU
 * read-side critical section in the just-completed grace period.
 *
 * The "only ... can update" notes say who may move the flag OUT of the
 * given state: rcu_try_flip_waitzero() sets rcu_mb_needed, and
 * rcu_check_mb() sets rcu_mb_done after executing the barrier.
 */

enum rcu_mb_flag_values {
	rcu_mb_done,		/* Steady/initial state, no mb()s required. */
				/* Only GP detector can update. */
	rcu_mb_needed		/* Flip just completed, need an mb(). */
				/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
								= rcu_mb_done;
180 | |||
/*
 * RCU_DATA_ME: find the current CPU's rcu_data structure.
 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
 */
#define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
#define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))

/*
 * The RCU_TRACE_*() helpers below deliberately do NOT end in a
 * semicolon: the caller supplies it, so each use reads and behaves like
 * a single statement (and stays safe in an unbraced if/else).
 * RCU_TRACE() itself is defined outside this file.
 */

/*
 * Helper macro for tracing when the appropriate rcu_data is not
 * cached in a local variable, but where the CPU number is so cached.
 */
#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace))

/*
 * Helper macro for tracing when the appropriate rcu_data is not
 * cached in a local variable.
 */
#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace))

/*
 * Helper macro for tracing when the appropriate rcu_data is pointed
 * to by a local variable.
 */
#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace))
205 | |||
/*
 * Return the number of RCU batches processed thus far.  Useful
 * for debug and statistics.  The value is read without taking
 * rcu_ctrlblk.fliplock.
 */
long rcu_batches_completed(void)
{
	return rcu_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);

/*
 * NOTE(review): rcu_batches_completed_bh is not defined in the visible
 * part of this file; presumably a header aliases it to
 * rcu_batches_completed -- confirm.
 */
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
217 | |||
/*
 * Enter an RCU read-side critical section for the current task.
 *
 * A nested call only bumps the task's ->rcu_read_lock_nesting.  The
 * outermost call also increments this CPU's rcu_flipctr[] element
 * selected by the low bit of rcu_ctrlblk.completed, and records that
 * index in ->rcu_flipctr_idx for the matching __rcu_read_unlock().
 * The order of the stores below matters; see the inline comments.
 */
void __rcu_read_lock(void)
{
	int idx;
	struct task_struct *t = current;
	int nesting;

	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
	if (nesting != 0) {

		/* An earlier rcu_read_lock() covers us, just count it. */

		t->rcu_read_lock_nesting = nesting + 1;

	} else {
		unsigned long flags;

		/*
		 * We disable interrupts for the following reasons:
		 * - If we get scheduling clock interrupt here, and we
		 *   end up acking the counter flip, it's like a promise
		 *   that we will never increment the old counter again.
		 *   Thus we will break that promise if that
		 *   scheduling clock interrupt happens between the time
		 *   we pick the .completed field and the time that we
		 *   increment our counter.
		 *
		 * - We don't want to be preempted out here.
		 *
		 * NMIs can still occur, of course, and might themselves
		 * contain rcu_read_lock().
		 */

		local_irq_save(flags);

		/*
		 * Outermost nesting of rcu_read_lock(), so increment
		 * the current counter for the current CPU.  Use volatile
		 * casts to prevent the compiler from reordering.
		 */

		idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;

		/*
		 * Now that the per-CPU counter has been incremented, we
		 * are protected from races with rcu_read_lock() invoked
		 * from NMI handlers on this CPU.  We can therefore safely
		 * increment the nesting counter, relieving further NMIs
		 * of the need to increment the per-CPU counter.
		 */

		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;

		/*
		 * Now that we have prevented any NMIs from storing
		 * to ->rcu_flipctr_idx, we can safely use it to
		 * remember which counter to decrement in the matching
		 * rcu_read_unlock().
		 */

		ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
283 | |||
/*
 * Leave an RCU read-side critical section for the current task.
 *
 * A nested call (nesting > 1) only decrements the task's
 * ->rcu_read_lock_nesting.  Otherwise the function fetches
 * ->rcu_flipctr_idx, stores nesting - 1, and then decrements this CPU's
 * rcu_flipctr[] element for that index -- in exactly that order.
 */
void __rcu_read_unlock(void)
{
	int idx;
	struct task_struct *t = current;
	int nesting;

	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
	if (nesting > 1) {

		/*
		 * We are still protected by the enclosing rcu_read_lock(),
		 * so simply decrement the counter.
		 */

		t->rcu_read_lock_nesting = nesting - 1;

	} else {
		unsigned long flags;

		/*
		 * Disable local interrupts to prevent the grace-period
		 * detection state machine from seeing us half-done.
		 * NMIs can still occur, of course, and might themselves
		 * contain rcu_read_lock() and rcu_read_unlock().
		 */

		local_irq_save(flags);

		/*
		 * Outermost nesting of rcu_read_unlock(), so we must
		 * decrement the current counter for the current CPU.
		 * This must be done carefully, because NMIs can
		 * occur at any point in this code, and any rcu_read_lock()
		 * and rcu_read_unlock() pairs in the NMI handlers
		 * must interact non-destructively with this code.
		 * Lots of volatile casts, and -very- careful ordering.
		 *
		 * Changes to this code, including this one, must be
		 * inspected, validated, and tested extremely carefully!!!
		 */

		/*
		 * First, pick up the index.
		 */

		idx = ACCESS_ONCE(t->rcu_flipctr_idx);

		/*
		 * Now that we have fetched the counter index, it is
		 * safe to decrement the per-task RCU nesting counter.
		 * After this, any interrupts or NMIs will increment and
		 * decrement the per-CPU counters.
		 */
		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;

		/*
		 * It is now safe to decrement the per-CPU counter.
		 * NMIs that occur after the store above will route their
		 * rcu_read_lock() calls through the "else" clause of
		 * __rcu_read_lock(), and will thus start incrementing the
		 * per-CPU counter on their own.  They will also clobber
		 * ->rcu_flipctr_idx, but that is OK, since we have already
		 * fetched it.
		 */

		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
353 | |||
354 | /* | ||
355 | * If a global counter flip has occurred since the last time that we | ||
356 | * advanced callbacks, advance them. Hardware interrupts must be | ||
357 | * disabled when calling this function. | ||
358 | */ | ||
359 | static void __rcu_advance_callbacks(struct rcu_data *rdp) | ||
360 | { | ||
361 | int cpu; | ||
362 | int i; | ||
363 | int wlc = 0; | ||
364 | |||
365 | if (rdp->completed != rcu_ctrlblk.completed) { | ||
366 | if (rdp->waitlist[GP_STAGES - 1] != NULL) { | ||
367 | *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; | ||
368 | rdp->donetail = rdp->waittail[GP_STAGES - 1]; | ||
369 | RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); | ||
370 | } | ||
371 | for (i = GP_STAGES - 2; i >= 0; i--) { | ||
372 | if (rdp->waitlist[i] != NULL) { | ||
373 | rdp->waitlist[i + 1] = rdp->waitlist[i]; | ||
374 | rdp->waittail[i + 1] = rdp->waittail[i]; | ||
375 | wlc++; | ||
376 | } else { | ||
377 | rdp->waitlist[i + 1] = NULL; | ||
378 | rdp->waittail[i + 1] = | ||
379 | &rdp->waitlist[i + 1]; | ||
380 | } | ||
381 | } | ||
382 | if (rdp->nextlist != NULL) { | ||
383 | rdp->waitlist[0] = rdp->nextlist; | ||
384 | rdp->waittail[0] = rdp->nexttail; | ||
385 | wlc++; | ||
386 | rdp->nextlist = NULL; | ||
387 | rdp->nexttail = &rdp->nextlist; | ||
388 | RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); | ||
389 | } else { | ||
390 | rdp->waitlist[0] = NULL; | ||
391 | rdp->waittail[0] = &rdp->waitlist[0]; | ||
392 | } | ||
393 | rdp->waitlistcount = wlc; | ||
394 | rdp->completed = rcu_ctrlblk.completed; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * Check to see if this CPU needs to report that it has seen | ||
399 | * the most recent counter flip, thereby declaring that all | ||
400 | * subsequent rcu_read_lock() invocations will respect this flip. | ||
401 | */ | ||
402 | |||
403 | cpu = raw_smp_processor_id(); | ||
404 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
405 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
406 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
407 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
408 | /* seen -after- acknowledgement. */ | ||
409 | } | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * Get here when RCU is idle. Decide whether we need to | ||
414 | * move out of idle state, and return non-zero if so. | ||
415 | * "Straightforward" approach for the moment, might later | ||
416 | * use callback-list lengths, grace-period duration, or | ||
417 | * some such to determine when to exit idle state. | ||
418 | * Might also need a pre-idle test that does not acquire | ||
419 | * the lock, but let's get the simple case working first... | ||
420 | */ | ||
421 | |||
422 | static int | ||
423 | rcu_try_flip_idle(void) | ||
424 | { | ||
425 | int cpu; | ||
426 | |||
427 | RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); | ||
428 | if (!rcu_pending(smp_processor_id())) { | ||
429 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); | ||
430 | return 0; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Do the flip. | ||
435 | */ | ||
436 | |||
437 | RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); | ||
438 | rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ | ||
439 | |||
440 | /* | ||
441 | * Need a memory barrier so that other CPUs see the new | ||
442 | * counter value before they see the subsequent change of all | ||
443 | * the rcu_flip_flag instances to rcu_flipped. | ||
444 | */ | ||
445 | |||
446 | smp_mb(); /* see above block comment. */ | ||
447 | |||
448 | /* Now ask each CPU for acknowledgement of the flip. */ | ||
449 | |||
450 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
451 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; | ||
452 | |||
453 | return 1; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * Wait for CPUs to acknowledge the flip. | ||
458 | */ | ||
459 | |||
460 | static int | ||
461 | rcu_try_flip_waitack(void) | ||
462 | { | ||
463 | int cpu; | ||
464 | |||
465 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); | ||
466 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
467 | if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | ||
468 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); | ||
469 | return 0; | ||
470 | } | ||
471 | |||
472 | /* | ||
473 | * Make sure our checks above don't bleed into subsequent | ||
474 | * waiting for the sum of the counters to reach zero. | ||
475 | */ | ||
476 | |||
477 | smp_mb(); /* see above block comment. */ | ||
478 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); | ||
479 | return 1; | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * Wait for collective ``last'' counter to reach zero, | ||
484 | * then tell all CPUs to do an end-of-grace-period memory barrier. | ||
485 | */ | ||
486 | |||
487 | static int | ||
488 | rcu_try_flip_waitzero(void) | ||
489 | { | ||
490 | int cpu; | ||
491 | int lastidx = !(rcu_ctrlblk.completed & 0x1); | ||
492 | int sum = 0; | ||
493 | |||
494 | /* Check to see if the sum of the "last" counters is zero. */ | ||
495 | |||
496 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); | ||
497 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
498 | sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; | ||
499 | if (sum != 0) { | ||
500 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); | ||
501 | return 0; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * This ensures that the other CPUs see the call for | ||
506 | * memory barriers -after- the sum to zero has been | ||
507 | * detected here | ||
508 | */ | ||
509 | smp_mb(); /* ^^^^^^^^^^^^ */ | ||
510 | |||
511 | /* Call for a memory barrier from each CPU. */ | ||
512 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
513 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | ||
514 | |||
515 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | ||
516 | return 1; | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * Wait for all CPUs to do their end-of-grace-period memory barrier. | ||
521 | * Return 0 once all CPUs have done so. | ||
522 | */ | ||
523 | |||
524 | static int | ||
525 | rcu_try_flip_waitmb(void) | ||
526 | { | ||
527 | int cpu; | ||
528 | |||
529 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); | ||
530 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
531 | if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | ||
532 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); | ||
533 | return 0; | ||
534 | } | ||
535 | |||
536 | smp_mb(); /* Ensure that the above checks precede any following flip. */ | ||
537 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); | ||
538 | return 1; | ||
539 | } | ||
540 | |||
541 | /* | ||
542 | * Attempt a single flip of the counters. Remember, a single flip does | ||
543 | * -not- constitute a grace period. Instead, the interval between | ||
544 | * at least GP_STAGES consecutive flips is a grace period. | ||
545 | * | ||
546 | * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation | ||
547 | * on a large SMP, they might want to use a hierarchical organization of | ||
548 | * the per-CPU-counter pairs. | ||
549 | */ | ||
550 | static void rcu_try_flip(void) | ||
551 | { | ||
552 | unsigned long flags; | ||
553 | |||
554 | RCU_TRACE_ME(rcupreempt_trace_try_flip_1); | ||
555 | if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { | ||
556 | RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); | ||
557 | return; | ||
558 | } | ||
559 | |||
560 | /* | ||
561 | * Take the next transition(s) through the RCU grace-period | ||
562 | * flip-counter state machine. | ||
563 | */ | ||
564 | |||
565 | switch (rcu_ctrlblk.rcu_try_flip_state) { | ||
566 | case rcu_try_flip_idle_state: | ||
567 | if (rcu_try_flip_idle()) | ||
568 | rcu_ctrlblk.rcu_try_flip_state = | ||
569 | rcu_try_flip_waitack_state; | ||
570 | break; | ||
571 | case rcu_try_flip_waitack_state: | ||
572 | if (rcu_try_flip_waitack()) | ||
573 | rcu_ctrlblk.rcu_try_flip_state = | ||
574 | rcu_try_flip_waitzero_state; | ||
575 | break; | ||
576 | case rcu_try_flip_waitzero_state: | ||
577 | if (rcu_try_flip_waitzero()) | ||
578 | rcu_ctrlblk.rcu_try_flip_state = | ||
579 | rcu_try_flip_waitmb_state; | ||
580 | break; | ||
581 | case rcu_try_flip_waitmb_state: | ||
582 | if (rcu_try_flip_waitmb()) | ||
583 | rcu_ctrlblk.rcu_try_flip_state = | ||
584 | rcu_try_flip_idle_state; | ||
585 | } | ||
586 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Check to see if this CPU needs to do a memory barrier in order to | ||
591 | * ensure that any prior RCU read-side critical sections have committed | ||
592 | * their counter manipulations and critical-section memory references | ||
593 | * before declaring the grace period to be completed. | ||
594 | */ | ||
595 | static void rcu_check_mb(int cpu) | ||
596 | { | ||
597 | if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { | ||
598 | smp_mb(); /* Ensure RCU read-side accesses are visible. */ | ||
599 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; | ||
600 | } | ||
601 | } | ||
602 | |||
603 | void rcu_check_callbacks(int cpu, int user) | ||
604 | { | ||
605 | unsigned long flags; | ||
606 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
607 | |||
608 | rcu_check_mb(cpu); | ||
609 | if (rcu_ctrlblk.completed == rdp->completed) | ||
610 | rcu_try_flip(); | ||
611 | spin_lock_irqsave(&rdp->lock, flags); | ||
612 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
613 | __rcu_advance_callbacks(rdp); | ||
614 | if (rdp->donelist == NULL) { | ||
615 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
616 | } else { | ||
617 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
618 | raise_softirq(RCU_SOFTIRQ); | ||
619 | } | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * Needed by dynticks, to make sure all RCU processing has finished | ||
624 | * when we go idle: | ||
625 | */ | ||
626 | void rcu_advance_callbacks(int cpu, int user) | ||
627 | { | ||
628 | unsigned long flags; | ||
629 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
630 | |||
631 | if (rcu_ctrlblk.completed == rdp->completed) { | ||
632 | rcu_try_flip(); | ||
633 | if (rcu_ctrlblk.completed == rdp->completed) | ||
634 | return; | ||
635 | } | ||
636 | spin_lock_irqsave(&rdp->lock, flags); | ||
637 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
638 | __rcu_advance_callbacks(rdp); | ||
639 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
640 | } | ||
641 | |||
642 | #ifdef CONFIG_HOTPLUG_CPU | ||
/*
 * Append the whole list (srclist, srctail) to the list whose tail
 * pointer is dsttail, then reset the source to empty.  All arguments
 * must be lvalues and are evaluated more than once.  dstlist is
 * accepted for symmetry but not referenced.
 */
#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
		*(dsttail) = (srclist); \
		if ((srclist) != NULL) { \
			(dsttail) = (srctail); \
			(srclist) = NULL; \
			(srctail) = &(srclist); \
		} \
	} while (0)
651 | |||
652 | void rcu_offline_cpu(int cpu) | ||
653 | { | ||
654 | int i; | ||
655 | struct rcu_head *list = NULL; | ||
656 | unsigned long flags; | ||
657 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
658 | struct rcu_head **tail = &list; | ||
659 | |||
660 | /* | ||
661 | * Remove all callbacks from the newly dead CPU, retaining order. | ||
662 | * Otherwise rcu_barrier() will fail | ||
663 | */ | ||
664 | |||
665 | spin_lock_irqsave(&rdp->lock, flags); | ||
666 | rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); | ||
667 | for (i = GP_STAGES - 1; i >= 0; i--) | ||
668 | rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], | ||
669 | list, tail); | ||
670 | rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); | ||
671 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
672 | rdp->waitlistcount = 0; | ||
673 | |||
674 | /* Disengage the newly dead CPU from the grace-period computation. */ | ||
675 | |||
676 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
677 | rcu_check_mb(cpu); | ||
678 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
679 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
680 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
681 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
682 | /* seen -after- acknowledgement. */ | ||
683 | } | ||
684 | |||
685 | RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
686 | RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; | ||
687 | |||
688 | RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; | ||
689 | RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; | ||
690 | |||
691 | cpu_clear(cpu, rcu_cpu_online_map); | ||
692 | |||
693 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
694 | |||
695 | /* | ||
696 | * Place the removed callbacks on the current CPU's queue. | ||
697 | * Make them all start a new grace period: simple approach, | ||
698 | * in theory could starve a given set of callbacks, but | ||
699 | * you would need to be doing some serious CPU hotplugging | ||
700 | * to make this happen. If this becomes a problem, adding | ||
701 | * a synchronize_rcu() to the hotplug path would be a simple | ||
702 | * fix. | ||
703 | */ | ||
704 | |||
705 | rdp = RCU_DATA_ME(); | ||
706 | spin_lock_irqsave(&rdp->lock, flags); | ||
707 | *rdp->nexttail = list; | ||
708 | if (list) | ||
709 | rdp->nexttail = tail; | ||
710 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
711 | } | ||
712 | |||
713 | void __devinit rcu_online_cpu(int cpu) | ||
714 | { | ||
715 | unsigned long flags; | ||
716 | |||
717 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
718 | cpu_set(cpu, rcu_cpu_online_map); | ||
719 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
720 | } | ||
721 | |||
722 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
723 | |||
724 | void rcu_offline_cpu(int cpu) | ||
725 | { | ||
726 | } | ||
727 | |||
728 | void __devinit rcu_online_cpu(int cpu) | ||
729 | { | ||
730 | } | ||
731 | |||
732 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
733 | |||
734 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
735 | { | ||
736 | unsigned long flags; | ||
737 | struct rcu_head *next, *list; | ||
738 | struct rcu_data *rdp = RCU_DATA_ME(); | ||
739 | |||
740 | spin_lock_irqsave(&rdp->lock, flags); | ||
741 | list = rdp->donelist; | ||
742 | if (list == NULL) { | ||
743 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
744 | return; | ||
745 | } | ||
746 | rdp->donelist = NULL; | ||
747 | rdp->donetail = &rdp->donelist; | ||
748 | RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); | ||
749 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
750 | while (list) { | ||
751 | next = list->next; | ||
752 | list->func(list); | ||
753 | list = next; | ||
754 | RCU_TRACE_ME(rcupreempt_trace_invoke); | ||
755 | } | ||
756 | } | ||
757 | |||
758 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
759 | { | ||
760 | unsigned long flags; | ||
761 | struct rcu_data *rdp; | ||
762 | |||
763 | head->func = func; | ||
764 | head->next = NULL; | ||
765 | local_irq_save(flags); | ||
766 | rdp = RCU_DATA_ME(); | ||
767 | spin_lock(&rdp->lock); | ||
768 | __rcu_advance_callbacks(rdp); | ||
769 | *rdp->nexttail = head; | ||
770 | rdp->nexttail = &head->next; | ||
771 | RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); | ||
772 | spin_unlock(&rdp->lock); | ||
773 | local_irq_restore(flags); | ||
774 | } | ||
775 | EXPORT_SYMBOL_GPL(call_rcu); | ||
776 | |||
777 | /* | ||
778 | * Wait until all currently running preempt_disable() code segments | ||
779 | * (including hardware-irq-disable segments) complete. Note that | ||
780 | * in -rt this does -not- necessarily result in all currently executing | ||
781 | * interrupt -handlers- having completed. | ||
782 | */ | ||
783 | void __synchronize_sched(void) | ||
784 | { | ||
785 | cpumask_t oldmask; | ||
786 | int cpu; | ||
787 | |||
788 | if (sched_getaffinity(0, &oldmask) < 0) | ||
789 | oldmask = cpu_possible_map; | ||
790 | for_each_online_cpu(cpu) { | ||
791 | sched_setaffinity(0, cpumask_of_cpu(cpu)); | ||
792 | schedule(); | ||
793 | } | ||
794 | sched_setaffinity(0, oldmask); | ||
795 | } | ||
796 | EXPORT_SYMBOL_GPL(__synchronize_sched); | ||
797 | |||
798 | /* | ||
799 | * Check to see if any future RCU-related work will need to be done | ||
800 | * by the current CPU, even if none need be done immediately, returning | ||
801 | * 1 if so. Assumes that notifiers would take care of handling any | ||
802 | * outstanding requests from the RCU core. | ||
803 | * | ||
804 | * This function is part of the RCU implementation; it is -not- | ||
805 | * an exported member of the RCU API. | ||
806 | */ | ||
807 | int rcu_needs_cpu(int cpu) | ||
808 | { | ||
809 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
810 | |||
811 | return (rdp->donelist != NULL || | ||
812 | !!rdp->waitlistcount || | ||
813 | rdp->nextlist != NULL); | ||
814 | } | ||
815 | |||
816 | int rcu_pending(int cpu) | ||
817 | { | ||
818 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
819 | |||
820 | /* The CPU has at least one callback queued somewhere. */ | ||
821 | |||
822 | if (rdp->donelist != NULL || | ||
823 | !!rdp->waitlistcount || | ||
824 | rdp->nextlist != NULL) | ||
825 | return 1; | ||
826 | |||
827 | /* The RCU core needs an acknowledgement from this CPU. */ | ||
828 | |||
829 | if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || | ||
830 | (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) | ||
831 | return 1; | ||
832 | |||
833 | /* This CPU has fallen behind the global grace-period number. */ | ||
834 | |||
835 | if (rdp->completed != rcu_ctrlblk.completed) | ||
836 | return 1; | ||
837 | |||
838 | /* Nothing needed from this CPU. */ | ||
839 | |||
840 | return 0; | ||
841 | } | ||
842 | |||
843 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
844 | unsigned long action, void *hcpu) | ||
845 | { | ||
846 | long cpu = (long)hcpu; | ||
847 | |||
848 | switch (action) { | ||
849 | case CPU_UP_PREPARE: | ||
850 | case CPU_UP_PREPARE_FROZEN: | ||
851 | rcu_online_cpu(cpu); | ||
852 | break; | ||
853 | case CPU_UP_CANCELED: | ||
854 | case CPU_UP_CANCELED_FROZEN: | ||
855 | case CPU_DEAD: | ||
856 | case CPU_DEAD_FROZEN: | ||
857 | rcu_offline_cpu(cpu); | ||
858 | break; | ||
859 | default: | ||
860 | break; | ||
861 | } | ||
862 | return NOTIFY_OK; | ||
863 | } | ||
864 | |||
/* Notifier block that __rcu_init() registers to route CPU events to rcu_cpu_notify(). */
static struct notifier_block __cpuinitdata rcu_nb = {
	.notifier_call = rcu_cpu_notify,
};
868 | |||
869 | void __init __rcu_init(void) | ||
870 | { | ||
871 | int cpu; | ||
872 | int i; | ||
873 | struct rcu_data *rdp; | ||
874 | |||
875 | printk(KERN_NOTICE "Preemptible RCU implementation.\n"); | ||
876 | for_each_possible_cpu(cpu) { | ||
877 | rdp = RCU_DATA_CPU(cpu); | ||
878 | spin_lock_init(&rdp->lock); | ||
879 | rdp->completed = 0; | ||
880 | rdp->waitlistcount = 0; | ||
881 | rdp->nextlist = NULL; | ||
882 | rdp->nexttail = &rdp->nextlist; | ||
883 | for (i = 0; i < GP_STAGES; i++) { | ||
884 | rdp->waitlist[i] = NULL; | ||
885 | rdp->waittail[i] = &rdp->waitlist[i]; | ||
886 | } | ||
887 | rdp->donelist = NULL; | ||
888 | rdp->donetail = &rdp->donelist; | ||
889 | rdp->rcu_flipctr[0] = 0; | ||
890 | rdp->rcu_flipctr[1] = 0; | ||
891 | } | ||
892 | register_cpu_notifier(&rcu_nb); | ||
893 | |||
894 | /* | ||
895 | * We don't need protection against CPU-Hotplug here | ||
896 | * since | ||
897 | * a) If a CPU comes online while we are iterating over the | ||
898 | * cpu_online_map below, we would only end up making a | ||
899 | * duplicate call to rcu_online_cpu() which sets the corresponding | ||
900 | * CPU's mask in the rcu_cpu_online_map. | ||
901 | * | ||
902 | * b) A CPU cannot go offline at this point in time since the user | ||
903 | * does not have access to the sysfs interface, nor do we | ||
904 | * suspend the system. | ||
905 | */ | ||
906 | for_each_online_cpu(cpu) | ||
907 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); | ||
908 | |||
909 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
910 | } | ||
911 | |||
/*
 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
 * Kept as a thin wrapper that simply calls synchronize_rcu().
 */
void synchronize_kernel(void)
{
	synchronize_rcu();
}
919 | |||
#ifdef CONFIG_RCU_TRACE
/*
 * Read-only accessors that expose this file's static state to tracing
 * code.  None of them takes a lock.
 */

/* Return a pointer to the first element of @cpu's rcu_flipctr[] array. */
long *rcupreempt_flipctr(int cpu)
{
	return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
}
EXPORT_SYMBOL_GPL(rcupreempt_flipctr);

/* Return @cpu's rcu_flip_flag (an enum rcu_flip_flag_values) as an int. */
int rcupreempt_flip_flag(int cpu)
{
	return per_cpu(rcu_flip_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);

/* Return @cpu's rcu_mb_flag (an enum rcu_mb_flag_values) as an int. */
int rcupreempt_mb_flag(int cpu)
{
	return per_cpu(rcu_mb_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);

/* Return the printable name of the state machine's current state. */
char *rcupreempt_try_flip_state_name(void)
{
	return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
}
EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);

/* Return a pointer to the trace record embedded in @cpu's rcu_data. */
struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
{
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	return &rdp->trace;
}
EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);

#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c new file mode 100644 index 000000000000..49ac4947af24 --- /dev/null +++ b/kernel/rcupreempt_trace.c | |||
@@ -0,0 +1,330 @@ | |||
1 | /* | ||
2 | * Read-Copy Update tracing for realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
21 | * | ||
22 | * For detailed explanation of Read-Copy Update mechanism see - | ||
23 | * Documentation/RCU/ *.txt | ||
24 | * | ||
25 | */ | ||
26 | #include <linux/types.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/init.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/smp.h> | ||
31 | #include <linux/rcupdate.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <asm/atomic.h> | ||
35 | #include <linux/bitops.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/completion.h> | ||
38 | #include <linux/moduleparam.h> | ||
39 | #include <linux/percpu.h> | ||
40 | #include <linux/notifier.h> | ||
41 | #include <linux/rcupdate.h> | ||
42 | #include <linux/cpu.h> | ||
43 | #include <linux/mutex.h> | ||
44 | #include <linux/rcupreempt_trace.h> | ||
45 | #include <linux/debugfs.h> | ||
46 | |||
47 | static struct mutex rcupreempt_trace_mutex; | ||
48 | static char *rcupreempt_trace_buf; | ||
49 | #define RCUPREEMPT_TRACE_BUF_SIZE 4096 | ||
50 | |||
51 | void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) | ||
52 | { | ||
53 | trace->done_length += trace->wait_length; | ||
54 | trace->done_add += trace->wait_length; | ||
55 | trace->wait_length = 0; | ||
56 | } | ||
57 | void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) | ||
58 | { | ||
59 | trace->wait_length += trace->next_length; | ||
60 | trace->wait_add += trace->next_length; | ||
61 | trace->next_length = 0; | ||
62 | } | ||
63 | void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) | ||
64 | { | ||
65 | atomic_inc(&trace->rcu_try_flip_1); | ||
66 | } | ||
67 | void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) | ||
68 | { | ||
69 | atomic_inc(&trace->rcu_try_flip_e1); | ||
70 | } | ||
71 | void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) | ||
72 | { | ||
73 | trace->rcu_try_flip_i1++; | ||
74 | } | ||
75 | void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) | ||
76 | { | ||
77 | trace->rcu_try_flip_ie1++; | ||
78 | } | ||
79 | void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) | ||
80 | { | ||
81 | trace->rcu_try_flip_g1++; | ||
82 | } | ||
83 | void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) | ||
84 | { | ||
85 | trace->rcu_try_flip_a1++; | ||
86 | } | ||
87 | void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) | ||
88 | { | ||
89 | trace->rcu_try_flip_ae1++; | ||
90 | } | ||
91 | void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) | ||
92 | { | ||
93 | trace->rcu_try_flip_a2++; | ||
94 | } | ||
95 | void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) | ||
96 | { | ||
97 | trace->rcu_try_flip_z1++; | ||
98 | } | ||
99 | void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) | ||
100 | { | ||
101 | trace->rcu_try_flip_ze1++; | ||
102 | } | ||
103 | void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) | ||
104 | { | ||
105 | trace->rcu_try_flip_z2++; | ||
106 | } | ||
107 | void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) | ||
108 | { | ||
109 | trace->rcu_try_flip_m1++; | ||
110 | } | ||
111 | void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) | ||
112 | { | ||
113 | trace->rcu_try_flip_me1++; | ||
114 | } | ||
115 | void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) | ||
116 | { | ||
117 | trace->rcu_try_flip_m2++; | ||
118 | } | ||
119 | void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) | ||
120 | { | ||
121 | trace->rcu_check_callbacks++; | ||
122 | } | ||
123 | void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) | ||
124 | { | ||
125 | trace->done_remove += trace->done_length; | ||
126 | trace->done_length = 0; | ||
127 | } | ||
128 | void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) | ||
129 | { | ||
130 | atomic_inc(&trace->done_invoked); | ||
131 | } | ||
132 | void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) | ||
133 | { | ||
134 | trace->next_add++; | ||
135 | trace->next_length++; | ||
136 | } | ||
137 | |||
138 | static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) | ||
139 | { | ||
140 | struct rcupreempt_trace *cp; | ||
141 | int cpu; | ||
142 | |||
143 | memset(sp, 0, sizeof(*sp)); | ||
144 | for_each_possible_cpu(cpu) { | ||
145 | cp = rcupreempt_trace_cpu(cpu); | ||
146 | sp->next_length += cp->next_length; | ||
147 | sp->next_add += cp->next_add; | ||
148 | sp->wait_length += cp->wait_length; | ||
149 | sp->wait_add += cp->wait_add; | ||
150 | sp->done_length += cp->done_length; | ||
151 | sp->done_add += cp->done_add; | ||
152 | sp->done_remove += cp->done_remove; | ||
153 | atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); | ||
154 | sp->rcu_check_callbacks += cp->rcu_check_callbacks; | ||
155 | atomic_set(&sp->rcu_try_flip_1, | ||
156 | atomic_read(&cp->rcu_try_flip_1)); | ||
157 | atomic_set(&sp->rcu_try_flip_e1, | ||
158 | atomic_read(&cp->rcu_try_flip_e1)); | ||
159 | sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; | ||
160 | sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; | ||
161 | sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; | ||
162 | sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; | ||
163 | sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; | ||
164 | sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; | ||
165 | sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; | ||
166 | sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; | ||
167 | sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; | ||
168 | sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; | ||
169 | sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; | ||
170 | sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | static ssize_t rcustats_read(struct file *filp, char __user *buffer, | ||
175 | size_t count, loff_t *ppos) | ||
176 | { | ||
177 | struct rcupreempt_trace trace; | ||
178 | ssize_t bcount; | ||
179 | int cnt = 0; | ||
180 | |||
181 | rcupreempt_trace_sum(&trace); | ||
182 | mutex_lock(&rcupreempt_trace_mutex); | ||
183 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
184 | "ggp=%ld rcc=%ld\n", | ||
185 | rcu_batches_completed(), | ||
186 | trace.rcu_check_callbacks); | ||
187 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
188 | "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" | ||
189 | "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" | ||
190 | "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", | ||
191 | |||
192 | trace.next_add, trace.next_length, | ||
193 | trace.wait_add, trace.wait_length, | ||
194 | trace.done_add, trace.done_length, | ||
195 | trace.done_remove, atomic_read(&trace.done_invoked), | ||
196 | atomic_read(&trace.rcu_try_flip_1), | ||
197 | atomic_read(&trace.rcu_try_flip_e1), | ||
198 | trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, | ||
199 | trace.rcu_try_flip_g1, | ||
200 | trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, | ||
201 | trace.rcu_try_flip_a2, | ||
202 | trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, | ||
203 | trace.rcu_try_flip_z2, | ||
204 | trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, | ||
205 | trace.rcu_try_flip_m2); | ||
206 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
207 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
208 | mutex_unlock(&rcupreempt_trace_mutex); | ||
209 | return bcount; | ||
210 | } | ||
211 | |||
212 | static ssize_t rcugp_read(struct file *filp, char __user *buffer, | ||
213 | size_t count, loff_t *ppos) | ||
214 | { | ||
215 | long oldgp = rcu_batches_completed(); | ||
216 | ssize_t bcount; | ||
217 | |||
218 | mutex_lock(&rcupreempt_trace_mutex); | ||
219 | synchronize_rcu(); | ||
220 | snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, | ||
221 | "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); | ||
222 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
223 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
224 | mutex_unlock(&rcupreempt_trace_mutex); | ||
225 | return bcount; | ||
226 | } | ||
227 | |||
228 | static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, | ||
229 | size_t count, loff_t *ppos) | ||
230 | { | ||
231 | int cnt = 0; | ||
232 | int cpu; | ||
233 | int f = rcu_batches_completed() & 0x1; | ||
234 | ssize_t bcount; | ||
235 | |||
236 | mutex_lock(&rcupreempt_trace_mutex); | ||
237 | |||
238 | cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, | ||
239 | "CPU last cur F M\n"); | ||
240 | for_each_online_cpu(cpu) { | ||
241 | long *flipctr = rcupreempt_flipctr(cpu); | ||
242 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
243 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
244 | "%3d %4ld %3ld %d %d\n", | ||
245 | cpu, | ||
246 | flipctr[!f], | ||
247 | flipctr[f], | ||
248 | rcupreempt_flip_flag(cpu), | ||
249 | rcupreempt_mb_flag(cpu)); | ||
250 | } | ||
251 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
252 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
253 | "ggp = %ld, state = %s\n", | ||
254 | rcu_batches_completed(), | ||
255 | rcupreempt_try_flip_state_name()); | ||
256 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
257 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
258 | "\n"); | ||
259 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
260 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
261 | mutex_unlock(&rcupreempt_trace_mutex); | ||
262 | return bcount; | ||
263 | } | ||
264 | |||
265 | static struct file_operations rcustats_fops = { | ||
266 | .owner = THIS_MODULE, | ||
267 | .read = rcustats_read, | ||
268 | }; | ||
269 | |||
270 | static struct file_operations rcugp_fops = { | ||
271 | .owner = THIS_MODULE, | ||
272 | .read = rcugp_read, | ||
273 | }; | ||
274 | |||
275 | static struct file_operations rcuctrs_fops = { | ||
276 | .owner = THIS_MODULE, | ||
277 | .read = rcuctrs_read, | ||
278 | }; | ||
279 | |||
280 | static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; | ||
281 | static int rcupreempt_debugfs_init(void) | ||
282 | { | ||
283 | rcudir = debugfs_create_dir("rcu", NULL); | ||
284 | if (!rcudir) | ||
285 | goto out; | ||
286 | statdir = debugfs_create_file("rcustats", 0444, rcudir, | ||
287 | NULL, &rcustats_fops); | ||
288 | if (!statdir) | ||
289 | goto free_out; | ||
290 | |||
291 | gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | ||
292 | if (!gpdir) | ||
293 | goto free_out; | ||
294 | |||
295 | ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, | ||
296 | NULL, &rcuctrs_fops); | ||
297 | if (!ctrsdir) | ||
298 | goto free_out; | ||
299 | return 0; | ||
300 | free_out: | ||
301 | if (statdir) | ||
302 | debugfs_remove(statdir); | ||
303 | if (gpdir) | ||
304 | debugfs_remove(gpdir); | ||
305 | debugfs_remove(rcudir); | ||
306 | out: | ||
307 | return 1; | ||
308 | } | ||
309 | |||
310 | static int __init rcupreempt_trace_init(void) | ||
311 | { | ||
312 | mutex_init(&rcupreempt_trace_mutex); | ||
313 | rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); | ||
314 | if (!rcupreempt_trace_buf) | ||
315 | return 1; | ||
316 | return rcupreempt_debugfs_init(); | ||
317 | } | ||
318 | |||
319 | static void __exit rcupreempt_trace_cleanup(void) | ||
320 | { | ||
321 | debugfs_remove(statdir); | ||
322 | debugfs_remove(gpdir); | ||
323 | debugfs_remove(ctrsdir); | ||
324 | debugfs_remove(rcudir); | ||
325 | kfree(rcupreempt_trace_buf); | ||
326 | } | ||
327 | |||
328 | |||
329 | module_init(rcupreempt_trace_init); | ||
330 | module_exit(rcupreempt_trace_cleanup); | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c3e165c2318f..fd599829e72a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void) | |||
726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 726 | cpumask_t tmp_mask = CPU_MASK_ALL; |
727 | int i; | 727 | int i; |
728 | 728 | ||
729 | lock_cpu_hotplug(); | 729 | get_online_cpus(); |
730 | 730 | ||
731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
732 | if (num_online_cpus() == 1) { | 732 | if (num_online_cpus() == 1) { |
733 | unlock_cpu_hotplug(); | 733 | put_online_cpus(); |
734 | return; | 734 | return; |
735 | } | 735 | } |
736 | 736 | ||
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void) | |||
762 | else | 762 | else |
763 | rcu_idle_cpu--; | 763 | rcu_idle_cpu--; |
764 | 764 | ||
765 | unlock_cpu_hotplug(); | 765 | put_online_cpus(); |
766 | } | 766 | } |
767 | 767 | ||
768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | 768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index e3055ba69159..092e4c620af9 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | |||
394 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | 394 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); |
395 | 395 | ||
396 | static struct sysdev_class rttest_sysclass = { | 396 | static struct sysdev_class rttest_sysclass = { |
397 | set_kset_name("rttest"), | 397 | .name = "rttest", |
398 | }; | 398 | }; |
399 | 399 | ||
400 | static int init_test_thread(int id) | 400 | static int init_test_thread(int id) |
diff --git a/kernel/sched.c b/kernel/sched.c index e76b11ca6df3..ba4c88088f62 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -22,6 +22,8 @@ | |||
22 | * by Peter Williams | 22 | * by Peter Williams |
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
26 | * Thomas Gleixner, Mike Kravetz | ||
25 | */ | 27 | */ |
26 | 28 | ||
27 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
@@ -63,6 +65,7 @@ | |||
63 | #include <linux/reciprocal_div.h> | 65 | #include <linux/reciprocal_div.h> |
64 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
65 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | ||
66 | 69 | ||
67 | #include <asm/tlb.h> | 70 | #include <asm/tlb.h> |
68 | #include <asm/irq_regs.h> | 71 | #include <asm/irq_regs.h> |
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
96 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 99 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
97 | 100 | ||
98 | /* | 101 | /* |
99 | * Some helpers for converting nanosecond timing to jiffy resolution | 102 | * Helpers for converting nanosecond timing to jiffy resolution |
100 | */ | 103 | */ |
101 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 104 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
102 | #define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) | ||
103 | 105 | ||
104 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 106 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
105 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 107 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
@@ -159,6 +161,8 @@ struct rt_prio_array { | |||
159 | 161 | ||
160 | struct cfs_rq; | 162 | struct cfs_rq; |
161 | 163 | ||
164 | static LIST_HEAD(task_groups); | ||
165 | |||
162 | /* task group related information */ | 166 | /* task group related information */ |
163 | struct task_group { | 167 | struct task_group { |
164 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 168 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -168,10 +172,50 @@ struct task_group { | |||
168 | struct sched_entity **se; | 172 | struct sched_entity **se; |
169 | /* runqueue "owned" by this group on each cpu */ | 173 | /* runqueue "owned" by this group on each cpu */ |
170 | struct cfs_rq **cfs_rq; | 174 | struct cfs_rq **cfs_rq; |
175 | |||
176 | struct sched_rt_entity **rt_se; | ||
177 | struct rt_rq **rt_rq; | ||
178 | |||
179 | unsigned int rt_ratio; | ||
180 | |||
181 | /* | ||
182 | * shares assigned to a task group governs how much of cpu bandwidth | ||
183 | * is allocated to the group. The more shares a group has, the more is | ||
184 | * the cpu bandwidth allocated to it. | ||
185 | * | ||
186 | * For ex, lets say that there are three task groups, A, B and C which | ||
187 | * have been assigned shares 1000, 2000 and 3000 respectively. Then, | ||
188 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | ||
189 | * should be: | ||
190 | * | ||
191 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | ||
192 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | ||
193 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | ||
194 | * | ||
195 | * The weight assigned to a task group's schedulable entities on every | ||
196 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | ||
197 | * group's shares. For ex: lets say that task group A has been | ||
198 | * assigned shares of 1000 and there are two CPUs in a system. Then, | ||
199 | * | ||
200 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | ||
201 | * | ||
202 | * Note: It's not necessary that each of a task's group schedulable | ||
203 | * entity have the same weight on all CPUs. If the group | ||
204 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | ||
205 | * better distribution of weight could be: | ||
206 | * | ||
207 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | ||
208 | * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 | ||
209 | * | ||
210 | * rebalance_shares() is responsible for distributing the shares of a | ||
211 | * task group like this among the group's schedulable entities across | ||
212 | * cpus. | ||
213 | * | ||
214 | */ | ||
171 | unsigned long shares; | 215 | unsigned long shares; |
172 | /* spinlock to serialize modification to shares */ | 216 | |
173 | spinlock_t lock; | ||
174 | struct rcu_head rcu; | 217 | struct rcu_head rcu; |
218 | struct list_head list; | ||
175 | }; | 219 | }; |
176 | 220 | ||
177 | /* Default task group's sched entity on each cpu */ | 221 | /* Default task group's sched entity on each cpu */ |
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | |||
179 | /* Default task group's cfs_rq on each cpu */ | 223 | /* Default task group's cfs_rq on each cpu */ |
180 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
181 | 225 | ||
226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | ||
228 | |||
182 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | 229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; |
183 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | 230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; |
184 | 231 | ||
232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | ||
233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | ||
234 | |||
235 | /* task_group_mutex serializes add/remove of task groups and also changes to | ||
236 | * a task group's cpu shares. | ||
237 | */ | ||
238 | static DEFINE_MUTEX(task_group_mutex); | ||
239 | |||
240 | /* doms_cur_mutex serializes access to doms_cur[] array */ | ||
241 | static DEFINE_MUTEX(doms_cur_mutex); | ||
242 | |||
243 | #ifdef CONFIG_SMP | ||
244 | /* kernel thread that runs rebalance_shares() periodically */ | ||
245 | static struct task_struct *lb_monitor_task; | ||
246 | static int load_balance_monitor(void *unused); | ||
247 | #endif | ||
248 | |||
249 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | ||
250 | |||
185 | /* Default task group. | 251 | /* Default task group. |
186 | * Every task in system belong to this group at bootup. | 252 | * Every task in system belong to this group at bootup. |
187 | */ | 253 | */ |
188 | struct task_group init_task_group = { | 254 | struct task_group init_task_group = { |
189 | .se = init_sched_entity_p, | 255 | .se = init_sched_entity_p, |
190 | .cfs_rq = init_cfs_rq_p, | 256 | .cfs_rq = init_cfs_rq_p, |
257 | |||
258 | .rt_se = init_sched_rt_entity_p, | ||
259 | .rt_rq = init_rt_rq_p, | ||
191 | }; | 260 | }; |
192 | 261 | ||
193 | #ifdef CONFIG_FAIR_USER_SCHED | 262 | #ifdef CONFIG_FAIR_USER_SCHED |
194 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | 263 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
195 | #else | 264 | #else |
196 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | 265 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
197 | #endif | 266 | #endif |
198 | 267 | ||
199 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | 268 | #define MIN_GROUP_SHARES 2 |
269 | |||
270 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | ||
200 | 271 | ||
201 | /* return group to which a task belongs */ | 272 | /* return group to which a task belongs */ |
202 | static inline struct task_group *task_group(struct task_struct *p) | 273 | static inline struct task_group *task_group(struct task_struct *p) |
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
215 | } | 286 | } |
216 | 287 | ||
217 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
218 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) | 289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
219 | { | 290 | { |
220 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
221 | p->se.parent = task_group(p)->se[cpu]; | 292 | p->se.parent = task_group(p)->se[cpu]; |
293 | |||
294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
295 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
296 | } | ||
297 | |||
298 | static inline void lock_task_group_list(void) | ||
299 | { | ||
300 | mutex_lock(&task_group_mutex); | ||
301 | } | ||
302 | |||
303 | static inline void unlock_task_group_list(void) | ||
304 | { | ||
305 | mutex_unlock(&task_group_mutex); | ||
306 | } | ||
307 | |||
308 | static inline void lock_doms_cur(void) | ||
309 | { | ||
310 | mutex_lock(&doms_cur_mutex); | ||
311 | } | ||
312 | |||
313 | static inline void unlock_doms_cur(void) | ||
314 | { | ||
315 | mutex_unlock(&doms_cur_mutex); | ||
222 | } | 316 | } |
223 | 317 | ||
224 | #else | 318 | #else |
225 | 319 | ||
226 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } | 320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
321 | static inline void lock_task_group_list(void) { } | ||
322 | static inline void unlock_task_group_list(void) { } | ||
323 | static inline void lock_doms_cur(void) { } | ||
324 | static inline void unlock_doms_cur(void) { } | ||
227 | 325 | ||
228 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 326 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
229 | 327 | ||
@@ -264,11 +362,57 @@ struct cfs_rq { | |||
264 | /* Real-Time classes' related field in a runqueue: */ | 362 | /* Real-Time classes' related field in a runqueue: */ |
265 | struct rt_rq { | 363 | struct rt_rq { |
266 | struct rt_prio_array active; | 364 | struct rt_prio_array active; |
267 | int rt_load_balance_idx; | 365 | unsigned long rt_nr_running; |
268 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; | 366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED |
367 | int highest_prio; /* highest queued rt task prio */ | ||
368 | #endif | ||
369 | #ifdef CONFIG_SMP | ||
370 | unsigned long rt_nr_migratory; | ||
371 | int overloaded; | ||
372 | #endif | ||
373 | int rt_throttled; | ||
374 | u64 rt_time; | ||
375 | |||
376 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
377 | struct rq *rq; | ||
378 | struct list_head leaf_rt_rq_list; | ||
379 | struct task_group *tg; | ||
380 | struct sched_rt_entity *rt_se; | ||
381 | #endif | ||
382 | }; | ||
383 | |||
384 | #ifdef CONFIG_SMP | ||
385 | |||
386 | /* | ||
387 | * We add the notion of a root-domain which will be used to define per-domain | ||
388 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
389 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
390 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
391 | * object. | ||
392 | * | ||
393 | */ | ||
394 | struct root_domain { | ||
395 | atomic_t refcount; | ||
396 | cpumask_t span; | ||
397 | cpumask_t online; | ||
398 | |||
399 | /* | ||
400 | * The "RT overload" flag: it gets set if a CPU has more than | ||
401 | * one runnable RT task. | ||
402 | */ | ||
403 | cpumask_t rto_mask; | ||
404 | atomic_t rto_count; | ||
269 | }; | 405 | }; |
270 | 406 | ||
271 | /* | 407 | /* |
408 | * By default the system creates a single root-domain with all cpus as | ||
409 | * members (mimicking the global state we have today). | ||
410 | */ | ||
411 | static struct root_domain def_root_domain; | ||
412 | |||
413 | #endif | ||
414 | |||
415 | /* | ||
272 | * This is the main, per-CPU runqueue data structure. | 416 | * This is the main, per-CPU runqueue data structure. |
273 | * | 417 | * |
274 | * Locking rule: those places that want to lock multiple runqueues | 418 | * Locking rule: those places that want to lock multiple runqueues |
@@ -296,11 +440,15 @@ struct rq { | |||
296 | u64 nr_switches; | 440 | u64 nr_switches; |
297 | 441 | ||
298 | struct cfs_rq cfs; | 442 | struct cfs_rq cfs; |
443 | struct rt_rq rt; | ||
444 | u64 rt_period_expire; | ||
445 | int rt_throttled; | ||
446 | |||
299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 447 | #ifdef CONFIG_FAIR_GROUP_SCHED |
300 | /* list of leaf cfs_rq on this cpu: */ | 448 | /* list of leaf cfs_rq on this cpu: */ |
301 | struct list_head leaf_cfs_rq_list; | 449 | struct list_head leaf_cfs_rq_list; |
450 | struct list_head leaf_rt_rq_list; | ||
302 | #endif | 451 | #endif |
303 | struct rt_rq rt; | ||
304 | 452 | ||
305 | /* | 453 | /* |
306 | * This is part of a global counter where only the total sum | 454 | * This is part of a global counter where only the total sum |
@@ -317,7 +465,7 @@ struct rq { | |||
317 | u64 clock, prev_clock_raw; | 465 | u64 clock, prev_clock_raw; |
318 | s64 clock_max_delta; | 466 | s64 clock_max_delta; |
319 | 467 | ||
320 | unsigned int clock_warps, clock_overflows; | 468 | unsigned int clock_warps, clock_overflows, clock_underflows; |
321 | u64 idle_clock; | 469 | u64 idle_clock; |
322 | unsigned int clock_deep_idle_events; | 470 | unsigned int clock_deep_idle_events; |
323 | u64 tick_timestamp; | 471 | u64 tick_timestamp; |
@@ -325,6 +473,7 @@ struct rq { | |||
325 | atomic_t nr_iowait; | 473 | atomic_t nr_iowait; |
326 | 474 | ||
327 | #ifdef CONFIG_SMP | 475 | #ifdef CONFIG_SMP |
476 | struct root_domain *rd; | ||
328 | struct sched_domain *sd; | 477 | struct sched_domain *sd; |
329 | 478 | ||
330 | /* For active balancing */ | 479 | /* For active balancing */ |
@@ -337,6 +486,12 @@ struct rq { | |||
337 | struct list_head migration_queue; | 486 | struct list_head migration_queue; |
338 | #endif | 487 | #endif |
339 | 488 | ||
489 | #ifdef CONFIG_SCHED_HRTICK | ||
490 | unsigned long hrtick_flags; | ||
491 | ktime_t hrtick_expire; | ||
492 | struct hrtimer hrtick_timer; | ||
493 | #endif | ||
494 | |||
340 | #ifdef CONFIG_SCHEDSTATS | 495 | #ifdef CONFIG_SCHEDSTATS |
341 | /* latency stats */ | 496 | /* latency stats */ |
342 | struct sched_info rq_sched_info; | 497 | struct sched_info rq_sched_info; |
@@ -363,7 +518,6 @@ struct rq { | |||
363 | }; | 518 | }; |
364 | 519 | ||
365 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 520 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
366 | static DEFINE_MUTEX(sched_hotcpu_mutex); | ||
367 | 521 | ||
368 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 522 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
369 | { | 523 | { |
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq) | |||
441 | #define task_rq(p) cpu_rq(task_cpu(p)) | 595 | #define task_rq(p) cpu_rq(task_cpu(p)) |
442 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 596 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
443 | 597 | ||
598 | unsigned long rt_needs_cpu(int cpu) | ||
599 | { | ||
600 | struct rq *rq = cpu_rq(cpu); | ||
601 | u64 delta; | ||
602 | |||
603 | if (!rq->rt_throttled) | ||
604 | return 0; | ||
605 | |||
606 | if (rq->clock > rq->rt_period_expire) | ||
607 | return 1; | ||
608 | |||
609 | delta = rq->rt_period_expire - rq->clock; | ||
610 | do_div(delta, NSEC_PER_SEC / HZ); | ||
611 | |||
612 | return (unsigned long)delta; | ||
613 | } | ||
614 | |||
444 | /* | 615 | /* |
445 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 616 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
446 | */ | 617 | */ |
@@ -459,6 +630,8 @@ enum { | |||
459 | SCHED_FEAT_START_DEBIT = 4, | 630 | SCHED_FEAT_START_DEBIT = 4, |
460 | SCHED_FEAT_TREE_AVG = 8, | 631 | SCHED_FEAT_TREE_AVG = 8, |
461 | SCHED_FEAT_APPROX_AVG = 16, | 632 | SCHED_FEAT_APPROX_AVG = 16, |
633 | SCHED_FEAT_HRTICK = 32, | ||
634 | SCHED_FEAT_DOUBLE_TICK = 64, | ||
462 | }; | 635 | }; |
463 | 636 | ||
464 | const_debug unsigned int sysctl_sched_features = | 637 | const_debug unsigned int sysctl_sched_features = |
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features = | |||
466 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 639 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | |
467 | SCHED_FEAT_START_DEBIT * 1 | | 640 | SCHED_FEAT_START_DEBIT * 1 | |
468 | SCHED_FEAT_TREE_AVG * 0 | | 641 | SCHED_FEAT_TREE_AVG * 0 | |
469 | SCHED_FEAT_APPROX_AVG * 0; | 642 | SCHED_FEAT_APPROX_AVG * 0 | |
643 | SCHED_FEAT_HRTICK * 1 | | ||
644 | SCHED_FEAT_DOUBLE_TICK * 0; | ||
470 | 645 | ||
471 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 646 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) |
472 | 647 | ||
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features = | |||
477 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 652 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
478 | 653 | ||
479 | /* | 654 | /* |
655 | * period over which we measure -rt task cpu usage in ms. | ||
656 | * default: 1s | ||
657 | */ | ||
658 | const_debug unsigned int sysctl_sched_rt_period = 1000; | ||
659 | |||
660 | #define SCHED_RT_FRAC_SHIFT 16 | ||
661 | #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) | ||
662 | |||
663 | /* | ||
664 | * ratio of time -rt tasks may consume. | ||
665 | * default: 95% | ||
666 | */ | ||
667 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; | ||
668 | |||
669 | /* | ||
480 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 670 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
481 | * clock constructed from sched_clock(): | 671 | * clock constructed from sched_clock(): |
482 | */ | 672 | */ |
@@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
668 | struct rq *rq = cpu_rq(smp_processor_id()); | 858 | struct rq *rq = cpu_rq(smp_processor_id()); |
669 | u64 now = sched_clock(); | 859 | u64 now = sched_clock(); |
670 | 860 | ||
671 | touch_softlockup_watchdog(); | ||
672 | rq->idle_clock += delta_ns; | 861 | rq->idle_clock += delta_ns; |
673 | /* | 862 | /* |
674 | * Override the previous timestamp and ignore all | 863 | * Override the previous timestamp and ignore all |
@@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
680 | rq->prev_clock_raw = now; | 869 | rq->prev_clock_raw = now; |
681 | rq->clock += delta_ns; | 870 | rq->clock += delta_ns; |
682 | spin_unlock(&rq->lock); | 871 | spin_unlock(&rq->lock); |
872 | touch_softlockup_watchdog(); | ||
683 | } | 873 | } |
684 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 874 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
685 | 875 | ||
876 | static void __resched_task(struct task_struct *p, int tif_bit); | ||
877 | |||
878 | static inline void resched_task(struct task_struct *p) | ||
879 | { | ||
880 | __resched_task(p, TIF_NEED_RESCHED); | ||
881 | } | ||
882 | |||
883 | #ifdef CONFIG_SCHED_HRTICK | ||
884 | /* | ||
885 | * Use HR-timers to deliver accurate preemption points. | ||
886 | * | ||
887 | * It's all a bit involved since we cannot program an hrt while holding the | ||
888 | * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a | ||
889 | * reschedule event. | ||
890 | * | ||
891 | * When we get rescheduled we reprogram the hrtick_timer outside of the | ||
892 | * rq->lock. | ||
893 | */ | ||
894 | static inline void resched_hrt(struct task_struct *p) | ||
895 | { | ||
896 | __resched_task(p, TIF_HRTICK_RESCHED); | ||
897 | } | ||
898 | |||
899 | static inline void resched_rq(struct rq *rq) | ||
900 | { | ||
901 | unsigned long flags; | ||
902 | |||
903 | spin_lock_irqsave(&rq->lock, flags); | ||
904 | resched_task(rq->curr); | ||
905 | spin_unlock_irqrestore(&rq->lock, flags); | ||
906 | } | ||
907 | |||
908 | enum { | ||
909 | HRTICK_SET, /* re-program hrtick_timer */ | ||
910 | HRTICK_RESET, /* not a new slice */ | ||
911 | }; | ||
912 | |||
913 | /* | ||
914 | * Use hrtick when: | ||
915 | * - enabled by features | ||
916 | * - hrtimer is actually high res | ||
917 | */ | ||
918 | static inline int hrtick_enabled(struct rq *rq) | ||
919 | { | ||
920 | if (!sched_feat(HRTICK)) | ||
921 | return 0; | ||
922 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * Called to set the hrtick timer state. | ||
927 | * | ||
928 | * called with rq->lock held and irqs disabled | ||
929 | */ | ||
930 | static void hrtick_start(struct rq *rq, u64 delay, int reset) | ||
931 | { | ||
932 | assert_spin_locked(&rq->lock); | ||
933 | |||
934 | /* | ||
935 | * preempt at: now + delay | ||
936 | */ | ||
937 | rq->hrtick_expire = | ||
938 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); | ||
939 | /* | ||
940 | * indicate we need to program the timer | ||
941 | */ | ||
942 | __set_bit(HRTICK_SET, &rq->hrtick_flags); | ||
943 | if (reset) | ||
944 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
945 | |||
946 | /* | ||
947 | * New slices are called from the schedule path and don't need a | ||
948 | * forced reschedule. | ||
949 | */ | ||
950 | if (reset) | ||
951 | resched_hrt(rq->curr); | ||
952 | } | ||
953 | |||
954 | static void hrtick_clear(struct rq *rq) | ||
955 | { | ||
956 | if (hrtimer_active(&rq->hrtick_timer)) | ||
957 | hrtimer_cancel(&rq->hrtick_timer); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Update the timer from the possible pending state. | ||
962 | */ | ||
963 | static void hrtick_set(struct rq *rq) | ||
964 | { | ||
965 | ktime_t time; | ||
966 | int set, reset; | ||
967 | unsigned long flags; | ||
968 | |||
969 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
970 | |||
971 | spin_lock_irqsave(&rq->lock, flags); | ||
972 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); | ||
973 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
974 | time = rq->hrtick_expire; | ||
975 | clear_thread_flag(TIF_HRTICK_RESCHED); | ||
976 | spin_unlock_irqrestore(&rq->lock, flags); | ||
977 | |||
978 | if (set) { | ||
979 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); | ||
980 | if (reset && !hrtimer_active(&rq->hrtick_timer)) | ||
981 | resched_rq(rq); | ||
982 | } else | ||
983 | hrtick_clear(rq); | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * High-resolution timer tick. | ||
988 | * Runs from hardirq context with interrupts disabled. | ||
989 | */ | ||
990 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | ||
991 | { | ||
992 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | ||
993 | |||
994 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
995 | |||
996 | spin_lock(&rq->lock); | ||
997 | __update_rq_clock(rq); | ||
998 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | ||
999 | spin_unlock(&rq->lock); | ||
1000 | |||
1001 | return HRTIMER_NORESTART; | ||
1002 | } | ||
1003 | |||
1004 | static inline void init_rq_hrtick(struct rq *rq) | ||
1005 | { | ||
1006 | rq->hrtick_flags = 0; | ||
1007 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1008 | rq->hrtick_timer.function = hrtick; | ||
1009 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
1010 | } | ||
1011 | |||
1012 | void hrtick_resched(void) | ||
1013 | { | ||
1014 | struct rq *rq; | ||
1015 | unsigned long flags; | ||
1016 | |||
1017 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) | ||
1018 | return; | ||
1019 | |||
1020 | local_irq_save(flags); | ||
1021 | rq = cpu_rq(smp_processor_id()); | ||
1022 | hrtick_set(rq); | ||
1023 | local_irq_restore(flags); | ||
1024 | } | ||
1025 | #else | ||
1026 | static inline void hrtick_clear(struct rq *rq) | ||
1027 | { | ||
1028 | } | ||
1029 | |||
1030 | static inline void hrtick_set(struct rq *rq) | ||
1031 | { | ||
1032 | } | ||
1033 | |||
1034 | static inline void init_rq_hrtick(struct rq *rq) | ||
1035 | { | ||
1036 | } | ||
1037 | |||
1038 | void hrtick_resched(void) | ||
1039 | { | ||
1040 | } | ||
1041 | #endif | ||
1042 | |||
686 | /* | 1043 | /* |
687 | * resched_task - mark a task 'to be rescheduled now'. | 1044 | * resched_task - mark a task 'to be rescheduled now'. |
688 | * | 1045 | * |
@@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
696 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1053 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
697 | #endif | 1054 | #endif |
698 | 1055 | ||
699 | static void resched_task(struct task_struct *p) | 1056 | static void __resched_task(struct task_struct *p, int tif_bit) |
700 | { | 1057 | { |
701 | int cpu; | 1058 | int cpu; |
702 | 1059 | ||
703 | assert_spin_locked(&task_rq(p)->lock); | 1060 | assert_spin_locked(&task_rq(p)->lock); |
704 | 1061 | ||
705 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 1062 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) |
706 | return; | 1063 | return; |
707 | 1064 | ||
708 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 1065 | set_tsk_thread_flag(p, tif_bit); |
709 | 1066 | ||
710 | cpu = task_cpu(p); | 1067 | cpu = task_cpu(p); |
711 | if (cpu == smp_processor_id()) | 1068 | if (cpu == smp_processor_id()) |
@@ -728,10 +1085,10 @@ static void resched_cpu(int cpu) | |||
728 | spin_unlock_irqrestore(&rq->lock, flags); | 1085 | spin_unlock_irqrestore(&rq->lock, flags); |
729 | } | 1086 | } |
730 | #else | 1087 | #else |
731 | static inline void resched_task(struct task_struct *p) | 1088 | static void __resched_task(struct task_struct *p, int tif_bit) |
732 | { | 1089 | { |
733 | assert_spin_locked(&task_rq(p)->lock); | 1090 | assert_spin_locked(&task_rq(p)->lock); |
734 | set_tsk_need_resched(p); | 1091 | set_tsk_thread_flag(p, tif_bit); |
735 | } | 1092 | } |
736 | #endif | 1093 | #endif |
737 | 1094 | ||
@@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
871 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1228 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
872 | #endif | 1229 | #endif |
873 | 1230 | ||
1231 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1232 | { | ||
1233 | update_load_add(&rq->load, load); | ||
1234 | } | ||
1235 | |||
1236 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1237 | { | ||
1238 | update_load_sub(&rq->load, load); | ||
1239 | } | ||
1240 | |||
1241 | #ifdef CONFIG_SMP | ||
1242 | static unsigned long source_load(int cpu, int type); | ||
1243 | static unsigned long target_load(int cpu, int type); | ||
1244 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1245 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1246 | #endif /* CONFIG_SMP */ | ||
1247 | |||
874 | #include "sched_stats.h" | 1248 | #include "sched_stats.h" |
875 | #include "sched_idletask.c" | 1249 | #include "sched_idletask.c" |
876 | #include "sched_fair.c" | 1250 | #include "sched_fair.c" |
@@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | |||
881 | 1255 | ||
882 | #define sched_class_highest (&rt_sched_class) | 1256 | #define sched_class_highest (&rt_sched_class) |
883 | 1257 | ||
884 | /* | ||
885 | * Update delta_exec, delta_fair fields for rq. | ||
886 | * | ||
887 | * delta_fair clock advances at a rate inversely proportional to | ||
888 | * total load (rq->load.weight) on the runqueue, while | ||
889 | * delta_exec advances at the same rate as wall-clock (provided | ||
890 | * cpu is not idle). | ||
891 | * | ||
892 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | ||
893 | * runqueue over any given interval. This (smoothened) load is used | ||
894 | * during load balance. | ||
895 | * | ||
896 | * This function is called /before/ updating rq->load | ||
897 | * and when switching tasks. | ||
898 | */ | ||
899 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | ||
900 | { | ||
901 | update_load_add(&rq->load, p->se.load.weight); | ||
902 | } | ||
903 | |||
904 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
905 | { | ||
906 | update_load_sub(&rq->load, p->se.load.weight); | ||
907 | } | ||
908 | |||
909 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 1258 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
910 | { | 1259 | { |
911 | rq->nr_running++; | 1260 | rq->nr_running++; |
912 | inc_load(rq, p); | ||
913 | } | 1261 | } |
914 | 1262 | ||
915 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1263 | static void dec_nr_running(struct task_struct *p, struct rq *rq) |
916 | { | 1264 | { |
917 | rq->nr_running--; | 1265 | rq->nr_running--; |
918 | dec_load(rq, p); | ||
919 | } | 1266 | } |
920 | 1267 | ||
921 | static void set_load_weight(struct task_struct *p) | 1268 | static void set_load_weight(struct task_struct *p) |
@@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu) | |||
1039 | 1386 | ||
1040 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1387 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1041 | { | 1388 | { |
1042 | set_task_cfs_rq(p, cpu); | 1389 | set_task_rq(p, cpu); |
1043 | #ifdef CONFIG_SMP | 1390 | #ifdef CONFIG_SMP |
1044 | /* | 1391 | /* |
1045 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1392 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
@@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1051 | #endif | 1398 | #endif |
1052 | } | 1399 | } |
1053 | 1400 | ||
1401 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | ||
1402 | const struct sched_class *prev_class, | ||
1403 | int oldprio, int running) | ||
1404 | { | ||
1405 | if (prev_class != p->sched_class) { | ||
1406 | if (prev_class->switched_from) | ||
1407 | prev_class->switched_from(rq, p, running); | ||
1408 | p->sched_class->switched_to(rq, p, running); | ||
1409 | } else | ||
1410 | p->sched_class->prio_changed(rq, p, oldprio, running); | ||
1411 | } | ||
1412 | |||
1054 | #ifdef CONFIG_SMP | 1413 | #ifdef CONFIG_SMP |
1055 | 1414 | ||
1056 | /* | 1415 | /* |
1057 | * Is this task likely cache-hot: | 1416 | * Is this task likely cache-hot: |
1058 | */ | 1417 | */ |
1059 | static inline int | 1418 | static int |
1060 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 1419 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
1061 | { | 1420 | { |
1062 | s64 delta; | 1421 | s64 delta; |
@@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type) | |||
1281 | /* | 1640 | /* |
1282 | * Return the average load per task on the cpu's run queue | 1641 | * Return the average load per task on the cpu's run queue |
1283 | */ | 1642 | */ |
1284 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1643 | static unsigned long cpu_avg_load_per_task(int cpu) |
1285 | { | 1644 | { |
1286 | struct rq *rq = cpu_rq(cpu); | 1645 | struct rq *rq = cpu_rq(cpu); |
1287 | unsigned long total = weighted_cpuload(cpu); | 1646 | unsigned long total = weighted_cpuload(cpu); |
@@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag) | |||
1438 | 1797 | ||
1439 | #endif /* CONFIG_SMP */ | 1798 | #endif /* CONFIG_SMP */ |
1440 | 1799 | ||
1441 | /* | ||
1442 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
1443 | * not idle and an idle cpu is available. The span of cpus to | ||
1444 | * search starts with cpus closest then further out as needed, | ||
1445 | * so we always favor a closer, idle cpu. | ||
1446 | * | ||
1447 | * Returns the CPU we should wake onto. | ||
1448 | */ | ||
1449 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
1450 | static int wake_idle(int cpu, struct task_struct *p) | ||
1451 | { | ||
1452 | cpumask_t tmp; | ||
1453 | struct sched_domain *sd; | ||
1454 | int i; | ||
1455 | |||
1456 | /* | ||
1457 | * If it is idle, then it is the best cpu to run this task. | ||
1458 | * | ||
1459 | * This cpu is also the best, if it has more than one task already. | ||
1460 | * Siblings must be also busy(in most cases) as they didn't already | ||
1461 | * pickup the extra load from this cpu and hence we need not check | ||
1462 | * sibling runqueue info. This will avoid the checks and cache miss | ||
1463 | * penalities associated with that. | ||
1464 | */ | ||
1465 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
1466 | return cpu; | ||
1467 | |||
1468 | for_each_domain(cpu, sd) { | ||
1469 | if (sd->flags & SD_WAKE_IDLE) { | ||
1470 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
1471 | for_each_cpu_mask(i, tmp) { | ||
1472 | if (idle_cpu(i)) { | ||
1473 | if (i != task_cpu(p)) { | ||
1474 | schedstat_inc(p, | ||
1475 | se.nr_wakeups_idle); | ||
1476 | } | ||
1477 | return i; | ||
1478 | } | ||
1479 | } | ||
1480 | } else { | ||
1481 | break; | ||
1482 | } | ||
1483 | } | ||
1484 | return cpu; | ||
1485 | } | ||
1486 | #else | ||
1487 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
1488 | { | ||
1489 | return cpu; | ||
1490 | } | ||
1491 | #endif | ||
1492 | |||
1493 | /*** | 1800 | /*** |
1494 | * try_to_wake_up - wake up a thread | 1801 | * try_to_wake_up - wake up a thread |
1495 | * @p: the to-be-woken-up thread | 1802 | * @p: the to-be-woken-up thread |
@@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1510 | unsigned long flags; | 1817 | unsigned long flags; |
1511 | long old_state; | 1818 | long old_state; |
1512 | struct rq *rq; | 1819 | struct rq *rq; |
1513 | #ifdef CONFIG_SMP | ||
1514 | struct sched_domain *sd, *this_sd = NULL; | ||
1515 | unsigned long load, this_load; | ||
1516 | int new_cpu; | ||
1517 | #endif | ||
1518 | 1820 | ||
1519 | rq = task_rq_lock(p, &flags); | 1821 | rq = task_rq_lock(p, &flags); |
1520 | old_state = p->state; | 1822 | old_state = p->state; |
@@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1532 | if (unlikely(task_running(rq, p))) | 1834 | if (unlikely(task_running(rq, p))) |
1533 | goto out_activate; | 1835 | goto out_activate; |
1534 | 1836 | ||
1535 | new_cpu = cpu; | 1837 | cpu = p->sched_class->select_task_rq(p, sync); |
1536 | 1838 | if (cpu != orig_cpu) { | |
1537 | schedstat_inc(rq, ttwu_count); | 1839 | set_task_cpu(p, cpu); |
1538 | if (cpu == this_cpu) { | ||
1539 | schedstat_inc(rq, ttwu_local); | ||
1540 | goto out_set_cpu; | ||
1541 | } | ||
1542 | |||
1543 | for_each_domain(this_cpu, sd) { | ||
1544 | if (cpu_isset(cpu, sd->span)) { | ||
1545 | schedstat_inc(sd, ttwu_wake_remote); | ||
1546 | this_sd = sd; | ||
1547 | break; | ||
1548 | } | ||
1549 | } | ||
1550 | |||
1551 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1552 | goto out_set_cpu; | ||
1553 | |||
1554 | /* | ||
1555 | * Check for affine wakeup and passive balancing possibilities. | ||
1556 | */ | ||
1557 | if (this_sd) { | ||
1558 | int idx = this_sd->wake_idx; | ||
1559 | unsigned int imbalance; | ||
1560 | |||
1561 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1562 | |||
1563 | load = source_load(cpu, idx); | ||
1564 | this_load = target_load(this_cpu, idx); | ||
1565 | |||
1566 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1567 | |||
1568 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1569 | unsigned long tl = this_load; | ||
1570 | unsigned long tl_per_task; | ||
1571 | |||
1572 | /* | ||
1573 | * Attract cache-cold tasks on sync wakeups: | ||
1574 | */ | ||
1575 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1576 | goto out_set_cpu; | ||
1577 | |||
1578 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1579 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1580 | |||
1581 | /* | ||
1582 | * If sync wakeup then subtract the (maximum possible) | ||
1583 | * effect of the currently running task from the load | ||
1584 | * of the current CPU: | ||
1585 | */ | ||
1586 | if (sync) | ||
1587 | tl -= current->se.load.weight; | ||
1588 | |||
1589 | if ((tl <= load && | ||
1590 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1591 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1592 | /* | ||
1593 | * This domain has SD_WAKE_AFFINE and | ||
1594 | * p is cache cold in this domain, and | ||
1595 | * there is no bad imbalance. | ||
1596 | */ | ||
1597 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1598 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1599 | goto out_set_cpu; | ||
1600 | } | ||
1601 | } | ||
1602 | |||
1603 | /* | ||
1604 | * Start passive balancing when half the imbalance_pct | ||
1605 | * limit is reached. | ||
1606 | */ | ||
1607 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1608 | if (imbalance*this_load <= 100*load) { | ||
1609 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1610 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1611 | goto out_set_cpu; | ||
1612 | } | ||
1613 | } | ||
1614 | } | ||
1615 | |||
1616 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1617 | out_set_cpu: | ||
1618 | new_cpu = wake_idle(new_cpu, p); | ||
1619 | if (new_cpu != cpu) { | ||
1620 | set_task_cpu(p, new_cpu); | ||
1621 | task_rq_unlock(rq, &flags); | 1840 | task_rq_unlock(rq, &flags); |
1622 | /* might preempt at this point */ | 1841 | /* might preempt at this point */ |
1623 | rq = task_rq_lock(p, &flags); | 1842 | rq = task_rq_lock(p, &flags); |
@@ -1631,6 +1850,21 @@ out_set_cpu: | |||
1631 | cpu = task_cpu(p); | 1850 | cpu = task_cpu(p); |
1632 | } | 1851 | } |
1633 | 1852 | ||
1853 | #ifdef CONFIG_SCHEDSTATS | ||
1854 | schedstat_inc(rq, ttwu_count); | ||
1855 | if (cpu == this_cpu) | ||
1856 | schedstat_inc(rq, ttwu_local); | ||
1857 | else { | ||
1858 | struct sched_domain *sd; | ||
1859 | for_each_domain(this_cpu, sd) { | ||
1860 | if (cpu_isset(cpu, sd->span)) { | ||
1861 | schedstat_inc(sd, ttwu_wake_remote); | ||
1862 | break; | ||
1863 | } | ||
1864 | } | ||
1865 | } | ||
1866 | #endif | ||
1867 | |||
1634 | out_activate: | 1868 | out_activate: |
1635 | #endif /* CONFIG_SMP */ | 1869 | #endif /* CONFIG_SMP */ |
1636 | schedstat_inc(p, se.nr_wakeups); | 1870 | schedstat_inc(p, se.nr_wakeups); |
@@ -1649,6 +1883,10 @@ out_activate: | |||
1649 | 1883 | ||
1650 | out_running: | 1884 | out_running: |
1651 | p->state = TASK_RUNNING; | 1885 | p->state = TASK_RUNNING; |
1886 | #ifdef CONFIG_SMP | ||
1887 | if (p->sched_class->task_wake_up) | ||
1888 | p->sched_class->task_wake_up(rq, p); | ||
1889 | #endif | ||
1652 | out: | 1890 | out: |
1653 | task_rq_unlock(rq, &flags); | 1891 | task_rq_unlock(rq, &flags); |
1654 | 1892 | ||
@@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p) | |||
1691 | p->se.wait_max = 0; | 1929 | p->se.wait_max = 0; |
1692 | #endif | 1930 | #endif |
1693 | 1931 | ||
1694 | INIT_LIST_HEAD(&p->run_list); | 1932 | INIT_LIST_HEAD(&p->rt.run_list); |
1695 | p->se.on_rq = 0; | 1933 | p->se.on_rq = 0; |
1696 | 1934 | ||
1697 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1935 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1771 | inc_nr_running(p, rq); | 2009 | inc_nr_running(p, rq); |
1772 | } | 2010 | } |
1773 | check_preempt_curr(rq, p); | 2011 | check_preempt_curr(rq, p); |
2012 | #ifdef CONFIG_SMP | ||
2013 | if (p->sched_class->task_wake_up) | ||
2014 | p->sched_class->task_wake_up(rq, p); | ||
2015 | #endif | ||
1774 | task_rq_unlock(rq, &flags); | 2016 | task_rq_unlock(rq, &flags); |
1775 | } | 2017 | } |
1776 | 2018 | ||
@@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1891 | prev_state = prev->state; | 2133 | prev_state = prev->state; |
1892 | finish_arch_switch(prev); | 2134 | finish_arch_switch(prev); |
1893 | finish_lock_switch(rq, prev); | 2135 | finish_lock_switch(rq, prev); |
2136 | #ifdef CONFIG_SMP | ||
2137 | if (current->sched_class->post_schedule) | ||
2138 | current->sched_class->post_schedule(rq); | ||
2139 | #endif | ||
2140 | |||
1894 | fire_sched_in_preempt_notifiers(current); | 2141 | fire_sched_in_preempt_notifiers(current); |
1895 | if (mm) | 2142 | if (mm) |
1896 | mmdrop(mm); | 2143 | mmdrop(mm); |
@@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
2124 | /* | 2371 | /* |
2125 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2372 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
2126 | */ | 2373 | */ |
2127 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2374 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
2128 | __releases(this_rq->lock) | 2375 | __releases(this_rq->lock) |
2129 | __acquires(busiest->lock) | 2376 | __acquires(busiest->lock) |
2130 | __acquires(this_rq->lock) | 2377 | __acquires(this_rq->lock) |
2131 | { | 2378 | { |
2379 | int ret = 0; | ||
2380 | |||
2132 | if (unlikely(!irqs_disabled())) { | 2381 | if (unlikely(!irqs_disabled())) { |
2133 | /* printk() doesn't work well under rq->lock */ | 2382 | /* printk() doesn't work well under rq->lock */ |
2134 | spin_unlock(&this_rq->lock); | 2383 | spin_unlock(&this_rq->lock); |
@@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
2139 | spin_unlock(&this_rq->lock); | 2388 | spin_unlock(&this_rq->lock); |
2140 | spin_lock(&busiest->lock); | 2389 | spin_lock(&busiest->lock); |
2141 | spin_lock(&this_rq->lock); | 2390 | spin_lock(&this_rq->lock); |
2391 | ret = 1; | ||
2142 | } else | 2392 | } else |
2143 | spin_lock(&busiest->lock); | 2393 | spin_lock(&busiest->lock); |
2144 | } | 2394 | } |
2395 | return ret; | ||
2145 | } | 2396 | } |
2146 | 2397 | ||
2147 | /* | 2398 | /* |
@@ -3485,12 +3736,14 @@ void scheduler_tick(void) | |||
3485 | /* | 3736 | /* |
3486 | * Let rq->clock advance by at least TICK_NSEC: | 3737 | * Let rq->clock advance by at least TICK_NSEC: |
3487 | */ | 3738 | */ |
3488 | if (unlikely(rq->clock < next_tick)) | 3739 | if (unlikely(rq->clock < next_tick)) { |
3489 | rq->clock = next_tick; | 3740 | rq->clock = next_tick; |
3741 | rq->clock_underflows++; | ||
3742 | } | ||
3490 | rq->tick_timestamp = rq->clock; | 3743 | rq->tick_timestamp = rq->clock; |
3491 | update_cpu_load(rq); | 3744 | update_cpu_load(rq); |
3492 | if (curr != rq->idle) /* FIXME: needed? */ | 3745 | curr->sched_class->task_tick(rq, curr, 0); |
3493 | curr->sched_class->task_tick(rq, curr); | 3746 | update_sched_rt_period(rq); |
3494 | spin_unlock(&rq->lock); | 3747 | spin_unlock(&rq->lock); |
3495 | 3748 | ||
3496 | #ifdef CONFIG_SMP | 3749 | #ifdef CONFIG_SMP |
@@ -3636,6 +3889,8 @@ need_resched_nonpreemptible: | |||
3636 | 3889 | ||
3637 | schedule_debug(prev); | 3890 | schedule_debug(prev); |
3638 | 3891 | ||
3892 | hrtick_clear(rq); | ||
3893 | |||
3639 | /* | 3894 | /* |
3640 | * Do the rq-clock update outside the rq lock: | 3895 | * Do the rq-clock update outside the rq lock: |
3641 | */ | 3896 | */ |
@@ -3654,6 +3909,11 @@ need_resched_nonpreemptible: | |||
3654 | switch_count = &prev->nvcsw; | 3909 | switch_count = &prev->nvcsw; |
3655 | } | 3910 | } |
3656 | 3911 | ||
3912 | #ifdef CONFIG_SMP | ||
3913 | if (prev->sched_class->pre_schedule) | ||
3914 | prev->sched_class->pre_schedule(rq, prev); | ||
3915 | #endif | ||
3916 | |||
3657 | if (unlikely(!rq->nr_running)) | 3917 | if (unlikely(!rq->nr_running)) |
3658 | idle_balance(cpu, rq); | 3918 | idle_balance(cpu, rq); |
3659 | 3919 | ||
@@ -3668,14 +3928,20 @@ need_resched_nonpreemptible: | |||
3668 | ++*switch_count; | 3928 | ++*switch_count; |
3669 | 3929 | ||
3670 | context_switch(rq, prev, next); /* unlocks the rq */ | 3930 | context_switch(rq, prev, next); /* unlocks the rq */ |
3931 | /* | ||
3932 | * the context switch might have flipped the stack from under | ||
3933 | * us, hence refresh the local variables. | ||
3934 | */ | ||
3935 | cpu = smp_processor_id(); | ||
3936 | rq = cpu_rq(cpu); | ||
3671 | } else | 3937 | } else |
3672 | spin_unlock_irq(&rq->lock); | 3938 | spin_unlock_irq(&rq->lock); |
3673 | 3939 | ||
3674 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3940 | hrtick_set(rq); |
3675 | cpu = smp_processor_id(); | 3941 | |
3676 | rq = cpu_rq(cpu); | 3942 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
3677 | goto need_resched_nonpreemptible; | 3943 | goto need_resched_nonpreemptible; |
3678 | } | 3944 | |
3679 | preempt_enable_no_resched(); | 3945 | preempt_enable_no_resched(); |
3680 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3946 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3681 | goto need_resched; | 3947 | goto need_resched; |
@@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule); | |||
3691 | asmlinkage void __sched preempt_schedule(void) | 3957 | asmlinkage void __sched preempt_schedule(void) |
3692 | { | 3958 | { |
3693 | struct thread_info *ti = current_thread_info(); | 3959 | struct thread_info *ti = current_thread_info(); |
3694 | #ifdef CONFIG_PREEMPT_BKL | ||
3695 | struct task_struct *task = current; | 3960 | struct task_struct *task = current; |
3696 | int saved_lock_depth; | 3961 | int saved_lock_depth; |
3697 | #endif | 3962 | |
3698 | /* | 3963 | /* |
3699 | * If there is a non-zero preempt_count or interrupts are disabled, | 3964 | * If there is a non-zero preempt_count or interrupts are disabled, |
3700 | * we do not want to preempt the current task. Just return.. | 3965 | * we do not want to preempt the current task. Just return.. |
@@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void) | |||
3710 | * clear ->lock_depth so that schedule() doesn't | 3975 | * clear ->lock_depth so that schedule() doesn't |
3711 | * auto-release the semaphore: | 3976 | * auto-release the semaphore: |
3712 | */ | 3977 | */ |
3713 | #ifdef CONFIG_PREEMPT_BKL | ||
3714 | saved_lock_depth = task->lock_depth; | 3978 | saved_lock_depth = task->lock_depth; |
3715 | task->lock_depth = -1; | 3979 | task->lock_depth = -1; |
3716 | #endif | ||
3717 | schedule(); | 3980 | schedule(); |
3718 | #ifdef CONFIG_PREEMPT_BKL | ||
3719 | task->lock_depth = saved_lock_depth; | 3981 | task->lock_depth = saved_lock_depth; |
3720 | #endif | ||
3721 | sub_preempt_count(PREEMPT_ACTIVE); | 3982 | sub_preempt_count(PREEMPT_ACTIVE); |
3722 | 3983 | ||
3723 | /* | 3984 | /* |
@@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule); | |||
3738 | asmlinkage void __sched preempt_schedule_irq(void) | 3999 | asmlinkage void __sched preempt_schedule_irq(void) |
3739 | { | 4000 | { |
3740 | struct thread_info *ti = current_thread_info(); | 4001 | struct thread_info *ti = current_thread_info(); |
3741 | #ifdef CONFIG_PREEMPT_BKL | ||
3742 | struct task_struct *task = current; | 4002 | struct task_struct *task = current; |
3743 | int saved_lock_depth; | 4003 | int saved_lock_depth; |
3744 | #endif | 4004 | |
3745 | /* Catch callers which need to be fixed */ | 4005 | /* Catch callers which need to be fixed */ |
3746 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 4006 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3747 | 4007 | ||
@@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3753 | * clear ->lock_depth so that schedule() doesn't | 4013 | * clear ->lock_depth so that schedule() doesn't |
3754 | * auto-release the semaphore: | 4014 | * auto-release the semaphore: |
3755 | */ | 4015 | */ |
3756 | #ifdef CONFIG_PREEMPT_BKL | ||
3757 | saved_lock_depth = task->lock_depth; | 4016 | saved_lock_depth = task->lock_depth; |
3758 | task->lock_depth = -1; | 4017 | task->lock_depth = -1; |
3759 | #endif | ||
3760 | local_irq_enable(); | 4018 | local_irq_enable(); |
3761 | schedule(); | 4019 | schedule(); |
3762 | local_irq_disable(); | 4020 | local_irq_disable(); |
3763 | #ifdef CONFIG_PREEMPT_BKL | ||
3764 | task->lock_depth = saved_lock_depth; | 4021 | task->lock_depth = saved_lock_depth; |
3765 | #endif | ||
3766 | sub_preempt_count(PREEMPT_ACTIVE); | 4022 | sub_preempt_count(PREEMPT_ACTIVE); |
3767 | 4023 | ||
3768 | /* | 4024 | /* |
@@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4019 | unsigned long flags; | 4275 | unsigned long flags; |
4020 | int oldprio, on_rq, running; | 4276 | int oldprio, on_rq, running; |
4021 | struct rq *rq; | 4277 | struct rq *rq; |
4278 | const struct sched_class *prev_class = p->sched_class; | ||
4022 | 4279 | ||
4023 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4280 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4024 | 4281 | ||
@@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4044 | if (on_rq) { | 4301 | if (on_rq) { |
4045 | if (running) | 4302 | if (running) |
4046 | p->sched_class->set_curr_task(rq); | 4303 | p->sched_class->set_curr_task(rq); |
4304 | |||
4047 | enqueue_task(rq, p, 0); | 4305 | enqueue_task(rq, p, 0); |
4048 | /* | 4306 | |
4049 | * Reschedule if we are currently running on this runqueue and | 4307 | check_class_changed(rq, p, prev_class, oldprio, running); |
4050 | * our priority decreased, or if we are not currently running on | ||
4051 | * this runqueue and our priority is higher than the current's | ||
4052 | */ | ||
4053 | if (running) { | ||
4054 | if (p->prio > oldprio) | ||
4055 | resched_task(rq->curr); | ||
4056 | } else { | ||
4057 | check_preempt_curr(rq, p); | ||
4058 | } | ||
4059 | } | 4308 | } |
4060 | task_rq_unlock(rq, &flags); | 4309 | task_rq_unlock(rq, &flags); |
4061 | } | 4310 | } |
@@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4087 | goto out_unlock; | 4336 | goto out_unlock; |
4088 | } | 4337 | } |
4089 | on_rq = p->se.on_rq; | 4338 | on_rq = p->se.on_rq; |
4090 | if (on_rq) { | 4339 | if (on_rq) |
4091 | dequeue_task(rq, p, 0); | 4340 | dequeue_task(rq, p, 0); |
4092 | dec_load(rq, p); | ||
4093 | } | ||
4094 | 4341 | ||
4095 | p->static_prio = NICE_TO_PRIO(nice); | 4342 | p->static_prio = NICE_TO_PRIO(nice); |
4096 | set_load_weight(p); | 4343 | set_load_weight(p); |
@@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4100 | 4347 | ||
4101 | if (on_rq) { | 4348 | if (on_rq) { |
4102 | enqueue_task(rq, p, 0); | 4349 | enqueue_task(rq, p, 0); |
4103 | inc_load(rq, p); | ||
4104 | /* | 4350 | /* |
4105 | * If the task increased its priority or is running and | 4351 | * If the task increased its priority or is running and |
4106 | * lowered its priority, then reschedule its CPU: | 4352 | * lowered its priority, then reschedule its CPU: |
@@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
4258 | { | 4504 | { |
4259 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4505 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4260 | unsigned long flags; | 4506 | unsigned long flags; |
4507 | const struct sched_class *prev_class = p->sched_class; | ||
4261 | struct rq *rq; | 4508 | struct rq *rq; |
4262 | 4509 | ||
4263 | /* may grab non-irq protected spin_locks */ | 4510 | /* may grab non-irq protected spin_locks */ |
@@ -4351,18 +4598,10 @@ recheck: | |||
4351 | if (on_rq) { | 4598 | if (on_rq) { |
4352 | if (running) | 4599 | if (running) |
4353 | p->sched_class->set_curr_task(rq); | 4600 | p->sched_class->set_curr_task(rq); |
4601 | |||
4354 | activate_task(rq, p, 0); | 4602 | activate_task(rq, p, 0); |
4355 | /* | 4603 | |
4356 | * Reschedule if we are currently running on this runqueue and | 4604 | check_class_changed(rq, p, prev_class, oldprio, running); |
4357 | * our priority decreased, or if we are not currently running on | ||
4358 | * this runqueue and our priority is higher than the current's | ||
4359 | */ | ||
4360 | if (running) { | ||
4361 | if (p->prio > oldprio) | ||
4362 | resched_task(rq->curr); | ||
4363 | } else { | ||
4364 | check_preempt_curr(rq, p); | ||
4365 | } | ||
4366 | } | 4605 | } |
4367 | __task_rq_unlock(rq); | 4606 | __task_rq_unlock(rq); |
4368 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4607 | spin_unlock_irqrestore(&p->pi_lock, flags); |
@@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4490 | struct task_struct *p; | 4729 | struct task_struct *p; |
4491 | int retval; | 4730 | int retval; |
4492 | 4731 | ||
4493 | mutex_lock(&sched_hotcpu_mutex); | 4732 | get_online_cpus(); |
4494 | read_lock(&tasklist_lock); | 4733 | read_lock(&tasklist_lock); |
4495 | 4734 | ||
4496 | p = find_process_by_pid(pid); | 4735 | p = find_process_by_pid(pid); |
4497 | if (!p) { | 4736 | if (!p) { |
4498 | read_unlock(&tasklist_lock); | 4737 | read_unlock(&tasklist_lock); |
4499 | mutex_unlock(&sched_hotcpu_mutex); | 4738 | put_online_cpus(); |
4500 | return -ESRCH; | 4739 | return -ESRCH; |
4501 | } | 4740 | } |
4502 | 4741 | ||
@@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4536 | } | 4775 | } |
4537 | out_unlock: | 4776 | out_unlock: |
4538 | put_task_struct(p); | 4777 | put_task_struct(p); |
4539 | mutex_unlock(&sched_hotcpu_mutex); | 4778 | put_online_cpus(); |
4540 | return retval; | 4779 | return retval; |
4541 | } | 4780 | } |
4542 | 4781 | ||
@@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4593 | struct task_struct *p; | 4832 | struct task_struct *p; |
4594 | int retval; | 4833 | int retval; |
4595 | 4834 | ||
4596 | mutex_lock(&sched_hotcpu_mutex); | 4835 | get_online_cpus(); |
4597 | read_lock(&tasklist_lock); | 4836 | read_lock(&tasklist_lock); |
4598 | 4837 | ||
4599 | retval = -ESRCH; | 4838 | retval = -ESRCH; |
@@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4609 | 4848 | ||
4610 | out_unlock: | 4849 | out_unlock: |
4611 | read_unlock(&tasklist_lock); | 4850 | read_unlock(&tasklist_lock); |
4612 | mutex_unlock(&sched_hotcpu_mutex); | 4851 | put_online_cpus(); |
4613 | 4852 | ||
4614 | return retval; | 4853 | return retval; |
4615 | } | 4854 | } |
@@ -4683,7 +4922,8 @@ static void __cond_resched(void) | |||
4683 | } while (need_resched()); | 4922 | } while (need_resched()); |
4684 | } | 4923 | } |
4685 | 4924 | ||
4686 | int __sched cond_resched(void) | 4925 | #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) |
4926 | int __sched _cond_resched(void) | ||
4687 | { | 4927 | { |
4688 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4928 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
4689 | system_state == SYSTEM_RUNNING) { | 4929 | system_state == SYSTEM_RUNNING) { |
@@ -4692,7 +4932,8 @@ int __sched cond_resched(void) | |||
4692 | } | 4932 | } |
4693 | return 0; | 4933 | return 0; |
4694 | } | 4934 | } |
4695 | EXPORT_SYMBOL(cond_resched); | 4935 | EXPORT_SYMBOL(_cond_resched); |
4936 | #endif | ||
4696 | 4937 | ||
4697 | /* | 4938 | /* |
4698 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4939 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
@@ -4704,19 +4945,15 @@ EXPORT_SYMBOL(cond_resched); | |||
4704 | */ | 4945 | */ |
4705 | int cond_resched_lock(spinlock_t *lock) | 4946 | int cond_resched_lock(spinlock_t *lock) |
4706 | { | 4947 | { |
4948 | int resched = need_resched() && system_state == SYSTEM_RUNNING; | ||
4707 | int ret = 0; | 4949 | int ret = 0; |
4708 | 4950 | ||
4709 | if (need_lockbreak(lock)) { | 4951 | if (spin_needbreak(lock) || resched) { |
4710 | spin_unlock(lock); | 4952 | spin_unlock(lock); |
4711 | cpu_relax(); | 4953 | if (resched && need_resched()) |
4712 | ret = 1; | 4954 | __cond_resched(); |
4713 | spin_lock(lock); | 4955 | else |
4714 | } | 4956 | cpu_relax(); |
4715 | if (need_resched() && system_state == SYSTEM_RUNNING) { | ||
4716 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
4717 | _raw_spin_unlock(lock); | ||
4718 | preempt_enable_no_resched(); | ||
4719 | __cond_resched(); | ||
4720 | ret = 1; | 4957 | ret = 1; |
4721 | spin_lock(lock); | 4958 | spin_lock(lock); |
4722 | } | 4959 | } |
@@ -4890,7 +5127,7 @@ out_unlock: | |||
4890 | 5127 | ||
4891 | static const char stat_nam[] = "RSDTtZX"; | 5128 | static const char stat_nam[] = "RSDTtZX"; |
4892 | 5129 | ||
4893 | static void show_task(struct task_struct *p) | 5130 | void sched_show_task(struct task_struct *p) |
4894 | { | 5131 | { |
4895 | unsigned long free = 0; | 5132 | unsigned long free = 0; |
4896 | unsigned state; | 5133 | unsigned state; |
@@ -4920,8 +5157,7 @@ static void show_task(struct task_struct *p) | |||
4920 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 5157 | printk(KERN_CONT "%5lu %5d %6d\n", free, |
4921 | task_pid_nr(p), task_pid_nr(p->real_parent)); | 5158 | task_pid_nr(p), task_pid_nr(p->real_parent)); |
4922 | 5159 | ||
4923 | if (state != TASK_RUNNING) | 5160 | show_stack(p, NULL); |
4924 | show_stack(p, NULL); | ||
4925 | } | 5161 | } |
4926 | 5162 | ||
4927 | void show_state_filter(unsigned long state_filter) | 5163 | void show_state_filter(unsigned long state_filter) |
@@ -4943,7 +5179,7 @@ void show_state_filter(unsigned long state_filter) | |||
4943 | */ | 5179 | */ |
4944 | touch_nmi_watchdog(); | 5180 | touch_nmi_watchdog(); |
4945 | if (!state_filter || (p->state & state_filter)) | 5181 | if (!state_filter || (p->state & state_filter)) |
4946 | show_task(p); | 5182 | sched_show_task(p); |
4947 | } while_each_thread(g, p); | 5183 | } while_each_thread(g, p); |
4948 | 5184 | ||
4949 | touch_all_softlockup_watchdogs(); | 5185 | touch_all_softlockup_watchdogs(); |
@@ -4992,11 +5228,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4992 | spin_unlock_irqrestore(&rq->lock, flags); | 5228 | spin_unlock_irqrestore(&rq->lock, flags); |
4993 | 5229 | ||
4994 | /* Set the preempt count _outside_ the spinlocks! */ | 5230 | /* Set the preempt count _outside_ the spinlocks! */ |
4995 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | ||
4996 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
4997 | #else | ||
4998 | task_thread_info(idle)->preempt_count = 0; | 5231 | task_thread_info(idle)->preempt_count = 0; |
4999 | #endif | 5232 | |
5000 | /* | 5233 | /* |
5001 | * The idle tasks have their own, simple scheduling class: | 5234 | * The idle tasks have their own, simple scheduling class: |
5002 | */ | 5235 | */ |
@@ -5077,7 +5310,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
5077 | goto out; | 5310 | goto out; |
5078 | } | 5311 | } |
5079 | 5312 | ||
5080 | p->cpus_allowed = new_mask; | 5313 | if (p->sched_class->set_cpus_allowed) |
5314 | p->sched_class->set_cpus_allowed(p, &new_mask); | ||
5315 | else { | ||
5316 | p->cpus_allowed = new_mask; | ||
5317 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | ||
5318 | } | ||
5319 | |||
5081 | /* Can the task run on the task's current CPU? If so, we're done */ | 5320 | /* Can the task run on the task's current CPU? If so, we're done */ |
5082 | if (cpu_isset(task_cpu(p), new_mask)) | 5321 | if (cpu_isset(task_cpu(p), new_mask)) |
5083 | goto out; | 5322 | goto out; |
@@ -5569,9 +5808,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5569 | struct rq *rq; | 5808 | struct rq *rq; |
5570 | 5809 | ||
5571 | switch (action) { | 5810 | switch (action) { |
5572 | case CPU_LOCK_ACQUIRE: | ||
5573 | mutex_lock(&sched_hotcpu_mutex); | ||
5574 | break; | ||
5575 | 5811 | ||
5576 | case CPU_UP_PREPARE: | 5812 | case CPU_UP_PREPARE: |
5577 | case CPU_UP_PREPARE_FROZEN: | 5813 | case CPU_UP_PREPARE_FROZEN: |
@@ -5590,6 +5826,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5590 | case CPU_ONLINE_FROZEN: | 5826 | case CPU_ONLINE_FROZEN: |
5591 | /* Strictly unnecessary, as first user will wake it. */ | 5827 | /* Strictly unnecessary, as first user will wake it. */ |
5592 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5828 | wake_up_process(cpu_rq(cpu)->migration_thread); |
5829 | |||
5830 | /* Update our root-domain */ | ||
5831 | rq = cpu_rq(cpu); | ||
5832 | spin_lock_irqsave(&rq->lock, flags); | ||
5833 | if (rq->rd) { | ||
5834 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5835 | cpu_set(cpu, rq->rd->online); | ||
5836 | } | ||
5837 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5593 | break; | 5838 | break; |
5594 | 5839 | ||
5595 | #ifdef CONFIG_HOTPLUG_CPU | 5840 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -5640,10 +5885,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5640 | } | 5885 | } |
5641 | spin_unlock_irq(&rq->lock); | 5886 | spin_unlock_irq(&rq->lock); |
5642 | break; | 5887 | break; |
5643 | #endif | 5888 | |
5644 | case CPU_LOCK_RELEASE: | 5889 | case CPU_DOWN_PREPARE: |
5645 | mutex_unlock(&sched_hotcpu_mutex); | 5890 | /* Update our root-domain */ |
5891 | rq = cpu_rq(cpu); | ||
5892 | spin_lock_irqsave(&rq->lock, flags); | ||
5893 | if (rq->rd) { | ||
5894 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5895 | cpu_clear(cpu, rq->rd->online); | ||
5896 | } | ||
5897 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5646 | break; | 5898 | break; |
5899 | #endif | ||
5647 | } | 5900 | } |
5648 | return NOTIFY_OK; | 5901 | return NOTIFY_OK; |
5649 | } | 5902 | } |
@@ -5831,11 +6084,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5831 | return 1; | 6084 | return 1; |
5832 | } | 6085 | } |
5833 | 6086 | ||
6087 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
6088 | { | ||
6089 | unsigned long flags; | ||
6090 | const struct sched_class *class; | ||
6091 | |||
6092 | spin_lock_irqsave(&rq->lock, flags); | ||
6093 | |||
6094 | if (rq->rd) { | ||
6095 | struct root_domain *old_rd = rq->rd; | ||
6096 | |||
6097 | for (class = sched_class_highest; class; class = class->next) { | ||
6098 | if (class->leave_domain) | ||
6099 | class->leave_domain(rq); | ||
6100 | } | ||
6101 | |||
6102 | cpu_clear(rq->cpu, old_rd->span); | ||
6103 | cpu_clear(rq->cpu, old_rd->online); | ||
6104 | |||
6105 | if (atomic_dec_and_test(&old_rd->refcount)) | ||
6106 | kfree(old_rd); | ||
6107 | } | ||
6108 | |||
6109 | atomic_inc(&rd->refcount); | ||
6110 | rq->rd = rd; | ||
6111 | |||
6112 | cpu_set(rq->cpu, rd->span); | ||
6113 | if (cpu_isset(rq->cpu, cpu_online_map)) | ||
6114 | cpu_set(rq->cpu, rd->online); | ||
6115 | |||
6116 | for (class = sched_class_highest; class; class = class->next) { | ||
6117 | if (class->join_domain) | ||
6118 | class->join_domain(rq); | ||
6119 | } | ||
6120 | |||
6121 | spin_unlock_irqrestore(&rq->lock, flags); | ||
6122 | } | ||
6123 | |||
6124 | static void init_rootdomain(struct root_domain *rd) | ||
6125 | { | ||
6126 | memset(rd, 0, sizeof(*rd)); | ||
6127 | |||
6128 | cpus_clear(rd->span); | ||
6129 | cpus_clear(rd->online); | ||
6130 | } | ||
6131 | |||
6132 | static void init_defrootdomain(void) | ||
6133 | { | ||
6134 | init_rootdomain(&def_root_domain); | ||
6135 | atomic_set(&def_root_domain.refcount, 1); | ||
6136 | } | ||
6137 | |||
6138 | static struct root_domain *alloc_rootdomain(void) | ||
6139 | { | ||
6140 | struct root_domain *rd; | ||
6141 | |||
6142 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
6143 | if (!rd) | ||
6144 | return NULL; | ||
6145 | |||
6146 | init_rootdomain(rd); | ||
6147 | |||
6148 | return rd; | ||
6149 | } | ||
6150 | |||
5834 | /* | 6151 | /* |
5835 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6152 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
5836 | * hold the hotplug lock. | 6153 | * hold the hotplug lock. |
5837 | */ | 6154 | */ |
5838 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 6155 | static void |
6156 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
5839 | { | 6157 | { |
5840 | struct rq *rq = cpu_rq(cpu); | 6158 | struct rq *rq = cpu_rq(cpu); |
5841 | struct sched_domain *tmp; | 6159 | struct sched_domain *tmp; |
@@ -5860,6 +6178,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
5860 | 6178 | ||
5861 | sched_domain_debug(sd, cpu); | 6179 | sched_domain_debug(sd, cpu); |
5862 | 6180 | ||
6181 | rq_attach_root(rq, rd); | ||
5863 | rcu_assign_pointer(rq->sd, sd); | 6182 | rcu_assign_pointer(rq->sd, sd); |
5864 | } | 6183 | } |
5865 | 6184 | ||
@@ -6228,6 +6547,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6228 | static int build_sched_domains(const cpumask_t *cpu_map) | 6547 | static int build_sched_domains(const cpumask_t *cpu_map) |
6229 | { | 6548 | { |
6230 | int i; | 6549 | int i; |
6550 | struct root_domain *rd; | ||
6231 | #ifdef CONFIG_NUMA | 6551 | #ifdef CONFIG_NUMA |
6232 | struct sched_group **sched_group_nodes = NULL; | 6552 | struct sched_group **sched_group_nodes = NULL; |
6233 | int sd_allnodes = 0; | 6553 | int sd_allnodes = 0; |
@@ -6244,6 +6564,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6244 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6564 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
6245 | #endif | 6565 | #endif |
6246 | 6566 | ||
6567 | rd = alloc_rootdomain(); | ||
6568 | if (!rd) { | ||
6569 | printk(KERN_WARNING "Cannot alloc root domain\n"); | ||
6570 | return -ENOMEM; | ||
6571 | } | ||
6572 | |||
6247 | /* | 6573 | /* |
6248 | * Set up domains for cpus specified by the cpu_map. | 6574 | * Set up domains for cpus specified by the cpu_map. |
6249 | */ | 6575 | */ |
@@ -6460,7 +6786,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6460 | #else | 6786 | #else |
6461 | sd = &per_cpu(phys_domains, i); | 6787 | sd = &per_cpu(phys_domains, i); |
6462 | #endif | 6788 | #endif |
6463 | cpu_attach_domain(sd, i); | 6789 | cpu_attach_domain(sd, rd, i); |
6464 | } | 6790 | } |
6465 | 6791 | ||
6466 | return 0; | 6792 | return 0; |
@@ -6518,7 +6844,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6518 | unregister_sched_domain_sysctl(); | 6844 | unregister_sched_domain_sysctl(); |
6519 | 6845 | ||
6520 | for_each_cpu_mask(i, *cpu_map) | 6846 | for_each_cpu_mask(i, *cpu_map) |
6521 | cpu_attach_domain(NULL, i); | 6847 | cpu_attach_domain(NULL, &def_root_domain, i); |
6522 | synchronize_sched(); | 6848 | synchronize_sched(); |
6523 | arch_destroy_sched_domains(cpu_map); | 6849 | arch_destroy_sched_domains(cpu_map); |
6524 | } | 6850 | } |
@@ -6548,6 +6874,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
6548 | { | 6874 | { |
6549 | int i, j; | 6875 | int i, j; |
6550 | 6876 | ||
6877 | lock_doms_cur(); | ||
6878 | |||
6551 | /* always unregister in case we don't destroy any domains */ | 6879 | /* always unregister in case we don't destroy any domains */ |
6552 | unregister_sched_domain_sysctl(); | 6880 | unregister_sched_domain_sysctl(); |
6553 | 6881 | ||
@@ -6588,6 +6916,8 @@ match2: | |||
6588 | ndoms_cur = ndoms_new; | 6916 | ndoms_cur = ndoms_new; |
6589 | 6917 | ||
6590 | register_sched_domain_sysctl(); | 6918 | register_sched_domain_sysctl(); |
6919 | |||
6920 | unlock_doms_cur(); | ||
6591 | } | 6921 | } |
6592 | 6922 | ||
6593 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6923 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
@@ -6595,10 +6925,10 @@ static int arch_reinit_sched_domains(void) | |||
6595 | { | 6925 | { |
6596 | int err; | 6926 | int err; |
6597 | 6927 | ||
6598 | mutex_lock(&sched_hotcpu_mutex); | 6928 | get_online_cpus(); |
6599 | detach_destroy_domains(&cpu_online_map); | 6929 | detach_destroy_domains(&cpu_online_map); |
6600 | err = arch_init_sched_domains(&cpu_online_map); | 6930 | err = arch_init_sched_domains(&cpu_online_map); |
6601 | mutex_unlock(&sched_hotcpu_mutex); | 6931 | put_online_cpus(); |
6602 | 6932 | ||
6603 | return err; | 6933 | return err; |
6604 | } | 6934 | } |
@@ -6709,12 +7039,12 @@ void __init sched_init_smp(void) | |||
6709 | { | 7039 | { |
6710 | cpumask_t non_isolated_cpus; | 7040 | cpumask_t non_isolated_cpus; |
6711 | 7041 | ||
6712 | mutex_lock(&sched_hotcpu_mutex); | 7042 | get_online_cpus(); |
6713 | arch_init_sched_domains(&cpu_online_map); | 7043 | arch_init_sched_domains(&cpu_online_map); |
6714 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7044 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
6715 | if (cpus_empty(non_isolated_cpus)) | 7045 | if (cpus_empty(non_isolated_cpus)) |
6716 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7046 | cpu_set(smp_processor_id(), non_isolated_cpus); |
6717 | mutex_unlock(&sched_hotcpu_mutex); | 7047 | put_online_cpus(); |
6718 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7048 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6719 | hotcpu_notifier(update_sched_domains, 0); | 7049 | hotcpu_notifier(update_sched_domains, 0); |
6720 | 7050 | ||
@@ -6722,6 +7052,21 @@ void __init sched_init_smp(void) | |||
6722 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7052 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6723 | BUG(); | 7053 | BUG(); |
6724 | sched_init_granularity(); | 7054 | sched_init_granularity(); |
7055 | |||
7056 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7057 | if (nr_cpu_ids == 1) | ||
7058 | return; | ||
7059 | |||
7060 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | ||
7061 | "group_balance"); | ||
7062 | if (!IS_ERR(lb_monitor_task)) { | ||
7063 | lb_monitor_task->flags |= PF_NOFREEZE; | ||
7064 | wake_up_process(lb_monitor_task); | ||
7065 | } else { | ||
7066 | printk(KERN_ERR "Could not create load balance monitor thread" | ||
7067 | "(error = %ld) \n", PTR_ERR(lb_monitor_task)); | ||
7068 | } | ||
7069 | #endif | ||
6725 | } | 7070 | } |
6726 | #else | 7071 | #else |
6727 | void __init sched_init_smp(void) | 7072 | void __init sched_init_smp(void) |
@@ -6746,13 +7091,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
6746 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7091 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
6747 | } | 7092 | } |
6748 | 7093 | ||
7094 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
7095 | { | ||
7096 | struct rt_prio_array *array; | ||
7097 | int i; | ||
7098 | |||
7099 | array = &rt_rq->active; | ||
7100 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
7101 | INIT_LIST_HEAD(array->queue + i); | ||
7102 | __clear_bit(i, array->bitmap); | ||
7103 | } | ||
7104 | /* delimiter for bitsearch: */ | ||
7105 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
7106 | |||
7107 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
7108 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
7109 | #endif | ||
7110 | #ifdef CONFIG_SMP | ||
7111 | rt_rq->rt_nr_migratory = 0; | ||
7112 | rt_rq->overloaded = 0; | ||
7113 | #endif | ||
7114 | |||
7115 | rt_rq->rt_time = 0; | ||
7116 | rt_rq->rt_throttled = 0; | ||
7117 | |||
7118 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7119 | rt_rq->rq = rq; | ||
7120 | #endif | ||
7121 | } | ||
7122 | |||
7123 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7124 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | ||
7125 | struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
7126 | int cpu, int add) | ||
7127 | { | ||
7128 | tg->cfs_rq[cpu] = cfs_rq; | ||
7129 | init_cfs_rq(cfs_rq, rq); | ||
7130 | cfs_rq->tg = tg; | ||
7131 | if (add) | ||
7132 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7133 | |||
7134 | tg->se[cpu] = se; | ||
7135 | se->cfs_rq = &rq->cfs; | ||
7136 | se->my_q = cfs_rq; | ||
7137 | se->load.weight = tg->shares; | ||
7138 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | ||
7139 | se->parent = NULL; | ||
7140 | } | ||
7141 | |||
7142 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | ||
7143 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | ||
7144 | int cpu, int add) | ||
7145 | { | ||
7146 | tg->rt_rq[cpu] = rt_rq; | ||
7147 | init_rt_rq(rt_rq, rq); | ||
7148 | rt_rq->tg = tg; | ||
7149 | rt_rq->rt_se = rt_se; | ||
7150 | if (add) | ||
7151 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7152 | |||
7153 | tg->rt_se[cpu] = rt_se; | ||
7154 | rt_se->rt_rq = &rq->rt; | ||
7155 | rt_se->my_q = rt_rq; | ||
7156 | rt_se->parent = NULL; | ||
7157 | INIT_LIST_HEAD(&rt_se->run_list); | ||
7158 | } | ||
7159 | #endif | ||
7160 | |||
6749 | void __init sched_init(void) | 7161 | void __init sched_init(void) |
6750 | { | 7162 | { |
6751 | int highest_cpu = 0; | 7163 | int highest_cpu = 0; |
6752 | int i, j; | 7164 | int i, j; |
6753 | 7165 | ||
7166 | #ifdef CONFIG_SMP | ||
7167 | init_defrootdomain(); | ||
7168 | #endif | ||
7169 | |||
7170 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7171 | list_add(&init_task_group.list, &task_groups); | ||
7172 | #endif | ||
7173 | |||
6754 | for_each_possible_cpu(i) { | 7174 | for_each_possible_cpu(i) { |
6755 | struct rt_prio_array *array; | ||
6756 | struct rq *rq; | 7175 | struct rq *rq; |
6757 | 7176 | ||
6758 | rq = cpu_rq(i); | 7177 | rq = cpu_rq(i); |
@@ -6761,52 +7180,39 @@ void __init sched_init(void) | |||
6761 | rq->nr_running = 0; | 7180 | rq->nr_running = 0; |
6762 | rq->clock = 1; | 7181 | rq->clock = 1; |
6763 | init_cfs_rq(&rq->cfs, rq); | 7182 | init_cfs_rq(&rq->cfs, rq); |
7183 | init_rt_rq(&rq->rt, rq); | ||
6764 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7184 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6765 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
6766 | { | ||
6767 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
6768 | struct sched_entity *se = | ||
6769 | &per_cpu(init_sched_entity, i); | ||
6770 | |||
6771 | init_cfs_rq_p[i] = cfs_rq; | ||
6772 | init_cfs_rq(cfs_rq, rq); | ||
6773 | cfs_rq->tg = &init_task_group; | ||
6774 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
6775 | &rq->leaf_cfs_rq_list); | ||
6776 | |||
6777 | init_sched_entity_p[i] = se; | ||
6778 | se->cfs_rq = &rq->cfs; | ||
6779 | se->my_q = cfs_rq; | ||
6780 | se->load.weight = init_task_group_load; | ||
6781 | se->load.inv_weight = | ||
6782 | div64_64(1ULL<<32, init_task_group_load); | ||
6783 | se->parent = NULL; | ||
6784 | } | ||
6785 | init_task_group.shares = init_task_group_load; | 7185 | init_task_group.shares = init_task_group_load; |
6786 | spin_lock_init(&init_task_group.lock); | 7186 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7187 | init_tg_cfs_entry(rq, &init_task_group, | ||
7188 | &per_cpu(init_cfs_rq, i), | ||
7189 | &per_cpu(init_sched_entity, i), i, 1); | ||
7190 | |||
7191 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ | ||
7192 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | ||
7193 | init_tg_rt_entry(rq, &init_task_group, | ||
7194 | &per_cpu(init_rt_rq, i), | ||
7195 | &per_cpu(init_sched_rt_entity, i), i, 1); | ||
6787 | #endif | 7196 | #endif |
7197 | rq->rt_period_expire = 0; | ||
7198 | rq->rt_throttled = 0; | ||
6788 | 7199 | ||
6789 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7200 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
6790 | rq->cpu_load[j] = 0; | 7201 | rq->cpu_load[j] = 0; |
6791 | #ifdef CONFIG_SMP | 7202 | #ifdef CONFIG_SMP |
6792 | rq->sd = NULL; | 7203 | rq->sd = NULL; |
7204 | rq->rd = NULL; | ||
6793 | rq->active_balance = 0; | 7205 | rq->active_balance = 0; |
6794 | rq->next_balance = jiffies; | 7206 | rq->next_balance = jiffies; |
6795 | rq->push_cpu = 0; | 7207 | rq->push_cpu = 0; |
6796 | rq->cpu = i; | 7208 | rq->cpu = i; |
6797 | rq->migration_thread = NULL; | 7209 | rq->migration_thread = NULL; |
6798 | INIT_LIST_HEAD(&rq->migration_queue); | 7210 | INIT_LIST_HEAD(&rq->migration_queue); |
7211 | rq_attach_root(rq, &def_root_domain); | ||
6799 | #endif | 7212 | #endif |
7213 | init_rq_hrtick(rq); | ||
6800 | atomic_set(&rq->nr_iowait, 0); | 7214 | atomic_set(&rq->nr_iowait, 0); |
6801 | |||
6802 | array = &rq->rt.active; | ||
6803 | for (j = 0; j < MAX_RT_PRIO; j++) { | ||
6804 | INIT_LIST_HEAD(array->queue + j); | ||
6805 | __clear_bit(j, array->bitmap); | ||
6806 | } | ||
6807 | highest_cpu = i; | 7215 | highest_cpu = i; |
6808 | /* delimiter for bitsearch: */ | ||
6809 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
6810 | } | 7216 | } |
6811 | 7217 | ||
6812 | set_load_weight(&init_task); | 7218 | set_load_weight(&init_task); |
@@ -6975,12 +7381,187 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
6975 | 7381 | ||
6976 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7382 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6977 | 7383 | ||
7384 | #ifdef CONFIG_SMP | ||
7385 | /* | ||
7386 | * distribute shares of all task groups among their schedulable entities, | ||
7387 | * to reflect load distribution across cpus. | ||
7388 | */ | ||
7389 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | ||
7390 | { | ||
7391 | struct cfs_rq *cfs_rq; | ||
7392 | struct rq *rq = cpu_rq(this_cpu); | ||
7393 | cpumask_t sdspan = sd->span; | ||
7394 | int balanced = 1; | ||
7395 | |||
7396 | /* Walk thr' all the task groups that we have */ | ||
7397 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
7398 | int i; | ||
7399 | unsigned long total_load = 0, total_shares; | ||
7400 | struct task_group *tg = cfs_rq->tg; | ||
7401 | |||
7402 | /* Gather total task load of this group across cpus */ | ||
7403 | for_each_cpu_mask(i, sdspan) | ||
7404 | total_load += tg->cfs_rq[i]->load.weight; | ||
7405 | |||
7406 | /* Nothing to do if this group has no load */ | ||
7407 | if (!total_load) | ||
7408 | continue; | ||
7409 | |||
7410 | /* | ||
7411 | * tg->shares represents the number of cpu shares the task group | ||
7412 | * is eligible to hold on a single cpu. On N cpus, it is | ||
7413 | * eligible to hold (N * tg->shares) number of cpu shares. | ||
7414 | */ | ||
7415 | total_shares = tg->shares * cpus_weight(sdspan); | ||
7416 | |||
7417 | /* | ||
7418 | * redistribute total_shares across cpus as per the task load | ||
7419 | * distribution. | ||
7420 | */ | ||
7421 | for_each_cpu_mask(i, sdspan) { | ||
7422 | unsigned long local_load, local_shares; | ||
7423 | |||
7424 | local_load = tg->cfs_rq[i]->load.weight; | ||
7425 | local_shares = (local_load * total_shares) / total_load; | ||
7426 | if (!local_shares) | ||
7427 | local_shares = MIN_GROUP_SHARES; | ||
7428 | if (local_shares == tg->se[i]->load.weight) | ||
7429 | continue; | ||
7430 | |||
7431 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7432 | set_se_shares(tg->se[i], local_shares); | ||
7433 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7434 | balanced = 0; | ||
7435 | } | ||
7436 | } | ||
7437 | |||
7438 | return balanced; | ||
7439 | } | ||
7440 | |||
7441 | /* | ||
7442 | * How frequently should we rebalance_shares() across cpus? | ||
7443 | * | ||
7444 | * The more frequently we rebalance shares, the more accurate is the fairness | ||
7445 | * of cpu bandwidth distribution between task groups. However higher frequency | ||
7446 | * also implies increased scheduling overhead. | ||
7447 | * | ||
7448 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | ||
7449 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7450 | * | ||
7451 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | ||
7452 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7453 | * | ||
7454 | * These settings allows for the appropriate trade-off between accuracy of | ||
7455 | * fairness and the associated overhead. | ||
7456 | * | ||
7457 | */ | ||
7458 | |||
7459 | /* default: 8ms, units: milliseconds */ | ||
7460 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | ||
7461 | |||
7462 | /* default: 128ms, units: milliseconds */ | ||
7463 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | ||
7464 | |||
7465 | /* kernel thread that runs rebalance_shares() periodically */ | ||
7466 | static int load_balance_monitor(void *unused) | ||
7467 | { | ||
7468 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | ||
7469 | struct sched_param schedparm; | ||
7470 | int ret; | ||
7471 | |||
7472 | /* | ||
7473 | * We don't want this thread's execution to be limited by the shares | ||
7474 | * assigned to default group (init_task_group). Hence make it run | ||
7475 | * as a SCHED_RR RT task at the lowest priority. | ||
7476 | */ | ||
7477 | schedparm.sched_priority = 1; | ||
7478 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | ||
7479 | if (ret) | ||
7480 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | ||
7481 | " monitor thread (error = %d) \n", ret); | ||
7482 | |||
7483 | while (!kthread_should_stop()) { | ||
7484 | int i, cpu, balanced = 1; | ||
7485 | |||
7486 | /* Prevent cpus going down or coming up */ | ||
7487 | get_online_cpus(); | ||
7488 | /* lockout changes to doms_cur[] array */ | ||
7489 | lock_doms_cur(); | ||
7490 | /* | ||
7491 | * Enter a rcu read-side critical section to safely walk rq->sd | ||
7492 | * chain on various cpus and to walk task group list | ||
7493 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | ||
7494 | */ | ||
7495 | rcu_read_lock(); | ||
7496 | |||
7497 | for (i = 0; i < ndoms_cur; i++) { | ||
7498 | cpumask_t cpumap = doms_cur[i]; | ||
7499 | struct sched_domain *sd = NULL, *sd_prev = NULL; | ||
7500 | |||
7501 | cpu = first_cpu(cpumap); | ||
7502 | |||
7503 | /* Find the highest domain at which to balance shares */ | ||
7504 | for_each_domain(cpu, sd) { | ||
7505 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
7506 | continue; | ||
7507 | sd_prev = sd; | ||
7508 | } | ||
7509 | |||
7510 | sd = sd_prev; | ||
7511 | /* sd == NULL? No load balance reqd in this domain */ | ||
7512 | if (!sd) | ||
7513 | continue; | ||
7514 | |||
7515 | balanced &= rebalance_shares(sd, cpu); | ||
7516 | } | ||
7517 | |||
7518 | rcu_read_unlock(); | ||
7519 | |||
7520 | unlock_doms_cur(); | ||
7521 | put_online_cpus(); | ||
7522 | |||
7523 | if (!balanced) | ||
7524 | timeout = sysctl_sched_min_bal_int_shares; | ||
7525 | else if (timeout < sysctl_sched_max_bal_int_shares) | ||
7526 | timeout *= 2; | ||
7527 | |||
7528 | msleep_interruptible(timeout); | ||
7529 | } | ||
7530 | |||
7531 | return 0; | ||
7532 | } | ||
7533 | #endif /* CONFIG_SMP */ | ||
7534 | |||
7535 | static void free_sched_group(struct task_group *tg) | ||
7536 | { | ||
7537 | int i; | ||
7538 | |||
7539 | for_each_possible_cpu(i) { | ||
7540 | if (tg->cfs_rq) | ||
7541 | kfree(tg->cfs_rq[i]); | ||
7542 | if (tg->se) | ||
7543 | kfree(tg->se[i]); | ||
7544 | if (tg->rt_rq) | ||
7545 | kfree(tg->rt_rq[i]); | ||
7546 | if (tg->rt_se) | ||
7547 | kfree(tg->rt_se[i]); | ||
7548 | } | ||
7549 | |||
7550 | kfree(tg->cfs_rq); | ||
7551 | kfree(tg->se); | ||
7552 | kfree(tg->rt_rq); | ||
7553 | kfree(tg->rt_se); | ||
7554 | kfree(tg); | ||
7555 | } | ||
7556 | |||
6978 | /* allocate runqueue etc for a new task group */ | 7557 | /* allocate runqueue etc for a new task group */ |
6979 | struct task_group *sched_create_group(void) | 7558 | struct task_group *sched_create_group(void) |
6980 | { | 7559 | { |
6981 | struct task_group *tg; | 7560 | struct task_group *tg; |
6982 | struct cfs_rq *cfs_rq; | 7561 | struct cfs_rq *cfs_rq; |
6983 | struct sched_entity *se; | 7562 | struct sched_entity *se; |
7563 | struct rt_rq *rt_rq; | ||
7564 | struct sched_rt_entity *rt_se; | ||
6984 | struct rq *rq; | 7565 | struct rq *rq; |
6985 | int i; | 7566 | int i; |
6986 | 7567 | ||
@@ -6994,97 +7575,89 @@ struct task_group *sched_create_group(void) | |||
6994 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 7575 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); |
6995 | if (!tg->se) | 7576 | if (!tg->se) |
6996 | goto err; | 7577 | goto err; |
7578 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | ||
7579 | if (!tg->rt_rq) | ||
7580 | goto err; | ||
7581 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | ||
7582 | if (!tg->rt_se) | ||
7583 | goto err; | ||
7584 | |||
7585 | tg->shares = NICE_0_LOAD; | ||
7586 | tg->rt_ratio = 0; /* XXX */ | ||
6997 | 7587 | ||
6998 | for_each_possible_cpu(i) { | 7588 | for_each_possible_cpu(i) { |
6999 | rq = cpu_rq(i); | 7589 | rq = cpu_rq(i); |
7000 | 7590 | ||
7001 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | 7591 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), |
7002 | cpu_to_node(i)); | 7592 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7003 | if (!cfs_rq) | 7593 | if (!cfs_rq) |
7004 | goto err; | 7594 | goto err; |
7005 | 7595 | ||
7006 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | 7596 | se = kmalloc_node(sizeof(struct sched_entity), |
7007 | cpu_to_node(i)); | 7597 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7008 | if (!se) | 7598 | if (!se) |
7009 | goto err; | 7599 | goto err; |
7010 | 7600 | ||
7011 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | 7601 | rt_rq = kmalloc_node(sizeof(struct rt_rq), |
7012 | memset(se, 0, sizeof(struct sched_entity)); | 7602 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7603 | if (!rt_rq) | ||
7604 | goto err; | ||
7013 | 7605 | ||
7014 | tg->cfs_rq[i] = cfs_rq; | 7606 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), |
7015 | init_cfs_rq(cfs_rq, rq); | 7607 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7016 | cfs_rq->tg = tg; | 7608 | if (!rt_se) |
7609 | goto err; | ||
7017 | 7610 | ||
7018 | tg->se[i] = se; | 7611 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); |
7019 | se->cfs_rq = &rq->cfs; | 7612 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); |
7020 | se->my_q = cfs_rq; | ||
7021 | se->load.weight = NICE_0_LOAD; | ||
7022 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
7023 | se->parent = NULL; | ||
7024 | } | 7613 | } |
7025 | 7614 | ||
7615 | lock_task_group_list(); | ||
7026 | for_each_possible_cpu(i) { | 7616 | for_each_possible_cpu(i) { |
7027 | rq = cpu_rq(i); | 7617 | rq = cpu_rq(i); |
7028 | cfs_rq = tg->cfs_rq[i]; | 7618 | cfs_rq = tg->cfs_rq[i]; |
7029 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7619 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7620 | rt_rq = tg->rt_rq[i]; | ||
7621 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7030 | } | 7622 | } |
7031 | 7623 | list_add_rcu(&tg->list, &task_groups); | |
7032 | tg->shares = NICE_0_LOAD; | 7624 | unlock_task_group_list(); |
7033 | spin_lock_init(&tg->lock); | ||
7034 | 7625 | ||
7035 | return tg; | 7626 | return tg; |
7036 | 7627 | ||
7037 | err: | 7628 | err: |
7038 | for_each_possible_cpu(i) { | 7629 | free_sched_group(tg); |
7039 | if (tg->cfs_rq) | ||
7040 | kfree(tg->cfs_rq[i]); | ||
7041 | if (tg->se) | ||
7042 | kfree(tg->se[i]); | ||
7043 | } | ||
7044 | kfree(tg->cfs_rq); | ||
7045 | kfree(tg->se); | ||
7046 | kfree(tg); | ||
7047 | |||
7048 | return ERR_PTR(-ENOMEM); | 7630 | return ERR_PTR(-ENOMEM); |
7049 | } | 7631 | } |
7050 | 7632 | ||
7051 | /* rcu callback to free various structures associated with a task group */ | 7633 | /* rcu callback to free various structures associated with a task group */ |
7052 | static void free_sched_group(struct rcu_head *rhp) | 7634 | static void free_sched_group_rcu(struct rcu_head *rhp) |
7053 | { | 7635 | { |
7054 | struct task_group *tg = container_of(rhp, struct task_group, rcu); | ||
7055 | struct cfs_rq *cfs_rq; | ||
7056 | struct sched_entity *se; | ||
7057 | int i; | ||
7058 | |||
7059 | /* now it should be safe to free those cfs_rqs */ | 7636 | /* now it should be safe to free those cfs_rqs */ |
7060 | for_each_possible_cpu(i) { | 7637 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
7061 | cfs_rq = tg->cfs_rq[i]; | ||
7062 | kfree(cfs_rq); | ||
7063 | |||
7064 | se = tg->se[i]; | ||
7065 | kfree(se); | ||
7066 | } | ||
7067 | |||
7068 | kfree(tg->cfs_rq); | ||
7069 | kfree(tg->se); | ||
7070 | kfree(tg); | ||
7071 | } | 7638 | } |
7072 | 7639 | ||
7073 | /* Destroy runqueue etc associated with a task group */ | 7640 | /* Destroy runqueue etc associated with a task group */ |
7074 | void sched_destroy_group(struct task_group *tg) | 7641 | void sched_destroy_group(struct task_group *tg) |
7075 | { | 7642 | { |
7076 | struct cfs_rq *cfs_rq = NULL; | 7643 | struct cfs_rq *cfs_rq = NULL; |
7644 | struct rt_rq *rt_rq = NULL; | ||
7077 | int i; | 7645 | int i; |
7078 | 7646 | ||
7647 | lock_task_group_list(); | ||
7079 | for_each_possible_cpu(i) { | 7648 | for_each_possible_cpu(i) { |
7080 | cfs_rq = tg->cfs_rq[i]; | 7649 | cfs_rq = tg->cfs_rq[i]; |
7081 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7650 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
7651 | rt_rq = tg->rt_rq[i]; | ||
7652 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
7082 | } | 7653 | } |
7654 | list_del_rcu(&tg->list); | ||
7655 | unlock_task_group_list(); | ||
7083 | 7656 | ||
7084 | BUG_ON(!cfs_rq); | 7657 | BUG_ON(!cfs_rq); |
7085 | 7658 | ||
7086 | /* wait for possible concurrent references to cfs_rqs complete */ | 7659 | /* wait for possible concurrent references to cfs_rqs complete */ |
7087 | call_rcu(&tg->rcu, free_sched_group); | 7660 | call_rcu(&tg->rcu, free_sched_group_rcu); |
7088 | } | 7661 | } |
7089 | 7662 | ||
7090 | /* change task's runqueue when it moves between groups. | 7663 | /* change task's runqueue when it moves between groups. |
@@ -7100,11 +7673,6 @@ void sched_move_task(struct task_struct *tsk) | |||
7100 | 7673 | ||
7101 | rq = task_rq_lock(tsk, &flags); | 7674 | rq = task_rq_lock(tsk, &flags); |
7102 | 7675 | ||
7103 | if (tsk->sched_class != &fair_sched_class) { | ||
7104 | set_task_cfs_rq(tsk, task_cpu(tsk)); | ||
7105 | goto done; | ||
7106 | } | ||
7107 | |||
7108 | update_rq_clock(rq); | 7676 | update_rq_clock(rq); |
7109 | 7677 | ||
7110 | running = task_current(rq, tsk); | 7678 | running = task_current(rq, tsk); |
@@ -7116,7 +7684,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7116 | tsk->sched_class->put_prev_task(rq, tsk); | 7684 | tsk->sched_class->put_prev_task(rq, tsk); |
7117 | } | 7685 | } |
7118 | 7686 | ||
7119 | set_task_cfs_rq(tsk, task_cpu(tsk)); | 7687 | set_task_rq(tsk, task_cpu(tsk)); |
7120 | 7688 | ||
7121 | if (on_rq) { | 7689 | if (on_rq) { |
7122 | if (unlikely(running)) | 7690 | if (unlikely(running)) |
@@ -7124,53 +7692,82 @@ void sched_move_task(struct task_struct *tsk) | |||
7124 | enqueue_task(rq, tsk, 0); | 7692 | enqueue_task(rq, tsk, 0); |
7125 | } | 7693 | } |
7126 | 7694 | ||
7127 | done: | ||
7128 | task_rq_unlock(rq, &flags); | 7695 | task_rq_unlock(rq, &flags); |
7129 | } | 7696 | } |
7130 | 7697 | ||
7698 | /* rq->lock to be locked by caller */ | ||
7131 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7699 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
7132 | { | 7700 | { |
7133 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7701 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7134 | struct rq *rq = cfs_rq->rq; | 7702 | struct rq *rq = cfs_rq->rq; |
7135 | int on_rq; | 7703 | int on_rq; |
7136 | 7704 | ||
7137 | spin_lock_irq(&rq->lock); | 7705 | if (!shares) |
7706 | shares = MIN_GROUP_SHARES; | ||
7138 | 7707 | ||
7139 | on_rq = se->on_rq; | 7708 | on_rq = se->on_rq; |
7140 | if (on_rq) | 7709 | if (on_rq) { |
7141 | dequeue_entity(cfs_rq, se, 0); | 7710 | dequeue_entity(cfs_rq, se, 0); |
7711 | dec_cpu_load(rq, se->load.weight); | ||
7712 | } | ||
7142 | 7713 | ||
7143 | se->load.weight = shares; | 7714 | se->load.weight = shares; |
7144 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7715 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
7145 | 7716 | ||
7146 | if (on_rq) | 7717 | if (on_rq) { |
7147 | enqueue_entity(cfs_rq, se, 0); | 7718 | enqueue_entity(cfs_rq, se, 0); |
7148 | 7719 | inc_cpu_load(rq, se->load.weight); | |
7149 | spin_unlock_irq(&rq->lock); | 7720 | } |
7150 | } | 7721 | } |
7151 | 7722 | ||
7152 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 7723 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
7153 | { | 7724 | { |
7154 | int i; | 7725 | int i; |
7726 | struct cfs_rq *cfs_rq; | ||
7727 | struct rq *rq; | ||
7728 | |||
7729 | lock_task_group_list(); | ||
7730 | if (tg->shares == shares) | ||
7731 | goto done; | ||
7732 | |||
7733 | if (shares < MIN_GROUP_SHARES) | ||
7734 | shares = MIN_GROUP_SHARES; | ||
7155 | 7735 | ||
7156 | /* | 7736 | /* |
7157 | * A weight of 0 or 1 can cause arithmetics problems. | 7737 | * Prevent any load balance activity (rebalance_shares, |
7158 | * (The default weight is 1024 - so there's no practical | 7738 | * load_balance_fair) from referring to this group first, |
7159 | * limitation from this.) | 7739 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. |
7160 | */ | 7740 | */ |
7161 | if (shares < 2) | 7741 | for_each_possible_cpu(i) { |
7162 | shares = 2; | 7742 | cfs_rq = tg->cfs_rq[i]; |
7743 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
7744 | } | ||
7163 | 7745 | ||
7164 | spin_lock(&tg->lock); | 7746 | /* wait for any ongoing reference to this group to finish */ |
7165 | if (tg->shares == shares) | 7747 | synchronize_sched(); |
7166 | goto done; | ||
7167 | 7748 | ||
7749 | /* | ||
7750 | * Now we are free to modify the group's share on each cpu | ||
7751 | * w/o tripping rebalance_share or load_balance_fair. | ||
7752 | */ | ||
7168 | tg->shares = shares; | 7753 | tg->shares = shares; |
7169 | for_each_possible_cpu(i) | 7754 | for_each_possible_cpu(i) { |
7755 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7170 | set_se_shares(tg->se[i], shares); | 7756 | set_se_shares(tg->se[i], shares); |
7757 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7758 | } | ||
7171 | 7759 | ||
7760 | /* | ||
7761 | * Enable load balance activity on this group, by inserting it back on | ||
7762 | * each cpu's rq->leaf_cfs_rq_list. | ||
7763 | */ | ||
7764 | for_each_possible_cpu(i) { | ||
7765 | rq = cpu_rq(i); | ||
7766 | cfs_rq = tg->cfs_rq[i]; | ||
7767 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7768 | } | ||
7172 | done: | 7769 | done: |
7173 | spin_unlock(&tg->lock); | 7770 | unlock_task_group_list(); |
7174 | return 0; | 7771 | return 0; |
7175 | } | 7772 | } |
7176 | 7773 | ||
@@ -7179,6 +7776,31 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
7179 | return tg->shares; | 7776 | return tg->shares; |
7180 | } | 7777 | } |
7181 | 7778 | ||
7779 | /* | ||
7780 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio | ||
7781 | */ | ||
7782 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) | ||
7783 | { | ||
7784 | struct task_group *tgi; | ||
7785 | unsigned long total = 0; | ||
7786 | |||
7787 | rcu_read_lock(); | ||
7788 | list_for_each_entry_rcu(tgi, &task_groups, list) | ||
7789 | total += tgi->rt_ratio; | ||
7790 | rcu_read_unlock(); | ||
7791 | |||
7792 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) | ||
7793 | return -EINVAL; | ||
7794 | |||
7795 | tg->rt_ratio = rt_ratio; | ||
7796 | return 0; | ||
7797 | } | ||
7798 | |||
7799 | unsigned long sched_group_rt_ratio(struct task_group *tg) | ||
7800 | { | ||
7801 | return tg->rt_ratio; | ||
7802 | } | ||
7803 | |||
7182 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7804 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7183 | 7805 | ||
7184 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 7806 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -7254,12 +7876,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
7254 | return (u64) tg->shares; | 7876 | return (u64) tg->shares; |
7255 | } | 7877 | } |
7256 | 7878 | ||
7879 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
7880 | u64 rt_ratio_val) | ||
7881 | { | ||
7882 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); | ||
7883 | } | ||
7884 | |||
7885 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
7886 | { | ||
7887 | struct task_group *tg = cgroup_tg(cgrp); | ||
7888 | |||
7889 | return (u64) tg->rt_ratio; | ||
7890 | } | ||
7891 | |||
7257 | static struct cftype cpu_files[] = { | 7892 | static struct cftype cpu_files[] = { |
7258 | { | 7893 | { |
7259 | .name = "shares", | 7894 | .name = "shares", |
7260 | .read_uint = cpu_shares_read_uint, | 7895 | .read_uint = cpu_shares_read_uint, |
7261 | .write_uint = cpu_shares_write_uint, | 7896 | .write_uint = cpu_shares_write_uint, |
7262 | }, | 7897 | }, |
7898 | { | ||
7899 | .name = "rt_ratio", | ||
7900 | .read_uint = cpu_rt_ratio_read_uint, | ||
7901 | .write_uint = cpu_rt_ratio_write_uint, | ||
7902 | }, | ||
7263 | }; | 7903 | }; |
7264 | 7904 | ||
7265 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 7905 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 80fbbfc04290..4b5e24cf2f4a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
179 | PN(prev_clock_raw); | 179 | PN(prev_clock_raw); |
180 | P(clock_warps); | 180 | P(clock_warps); |
181 | P(clock_overflows); | 181 | P(clock_overflows); |
182 | P(clock_underflows); | ||
182 | P(clock_deep_idle_events); | 183 | P(clock_deep_idle_events); |
183 | PN(clock_max_delta); | 184 | PN(clock_max_delta); |
184 | P(cpu_load[0]); | 185 | P(cpu_load[0]); |
@@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
299 | PN(se.exec_max); | 300 | PN(se.exec_max); |
300 | PN(se.slice_max); | 301 | PN(se.slice_max); |
301 | PN(se.wait_max); | 302 | PN(se.wait_max); |
303 | PN(se.wait_sum); | ||
304 | P(se.wait_count); | ||
302 | P(sched_info.bkl_count); | 305 | P(sched_info.bkl_count); |
303 | P(se.nr_migrations); | 306 | P(se.nr_migrations); |
304 | P(se.nr_migrations_cold); | 307 | P(se.nr_migrations_cold); |
@@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p) | |||
366 | { | 369 | { |
367 | #ifdef CONFIG_SCHEDSTATS | 370 | #ifdef CONFIG_SCHEDSTATS |
368 | p->se.wait_max = 0; | 371 | p->se.wait_max = 0; |
372 | p->se.wait_sum = 0; | ||
373 | p->se.wait_count = 0; | ||
369 | p->se.sleep_max = 0; | 374 | p->se.sleep_max = 0; |
370 | p->se.sum_sleep_runtime = 0; | 375 | p->se.sum_sleep_runtime = 0; |
371 | p->se.block_max = 0; | 376 | p->se.block_max = 0; |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index da7c061e7206..72e25c7a3a18 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -20,6 +20,8 @@ | |||
20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/latencytop.h> | ||
24 | |||
23 | /* | 25 | /* |
24 | * Targeted preemption latency for CPU-bound tasks: | 26 | * Targeted preemption latency for CPU-bound tasks: |
25 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) | 27 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) |
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running) | |||
248 | unsigned long nr_latency = sched_nr_latency; | 250 | unsigned long nr_latency = sched_nr_latency; |
249 | 251 | ||
250 | if (unlikely(nr_running > nr_latency)) { | 252 | if (unlikely(nr_running > nr_latency)) { |
253 | period = sysctl_sched_min_granularity; | ||
251 | period *= nr_running; | 254 | period *= nr_running; |
252 | do_div(period, nr_latency); | ||
253 | } | 255 | } |
254 | 256 | ||
255 | return period; | 257 | return period; |
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
383 | { | 385 | { |
384 | schedstat_set(se->wait_max, max(se->wait_max, | 386 | schedstat_set(se->wait_max, max(se->wait_max, |
385 | rq_of(cfs_rq)->clock - se->wait_start)); | 387 | rq_of(cfs_rq)->clock - se->wait_start)); |
388 | schedstat_set(se->wait_count, se->wait_count + 1); | ||
389 | schedstat_set(se->wait_sum, se->wait_sum + | ||
390 | rq_of(cfs_rq)->clock - se->wait_start); | ||
386 | schedstat_set(se->wait_start, 0); | 391 | schedstat_set(se->wait_start, 0); |
387 | } | 392 | } |
388 | 393 | ||
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
434 | #ifdef CONFIG_SCHEDSTATS | 439 | #ifdef CONFIG_SCHEDSTATS |
435 | if (se->sleep_start) { | 440 | if (se->sleep_start) { |
436 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 441 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
442 | struct task_struct *tsk = task_of(se); | ||
437 | 443 | ||
438 | if ((s64)delta < 0) | 444 | if ((s64)delta < 0) |
439 | delta = 0; | 445 | delta = 0; |
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
443 | 449 | ||
444 | se->sleep_start = 0; | 450 | se->sleep_start = 0; |
445 | se->sum_sleep_runtime += delta; | 451 | se->sum_sleep_runtime += delta; |
452 | |||
453 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
446 | } | 454 | } |
447 | if (se->block_start) { | 455 | if (se->block_start) { |
448 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; | 456 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; |
457 | struct task_struct *tsk = task_of(se); | ||
449 | 458 | ||
450 | if ((s64)delta < 0) | 459 | if ((s64)delta < 0) |
451 | delta = 0; | 460 | delta = 0; |
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
462 | * time that the task spent sleeping: | 471 | * time that the task spent sleeping: |
463 | */ | 472 | */ |
464 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 473 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
465 | struct task_struct *tsk = task_of(se); | ||
466 | 474 | ||
467 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 475 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
468 | delta >> 20); | 476 | delta >> 20); |
469 | } | 477 | } |
478 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
470 | } | 479 | } |
471 | #endif | 480 | #endif |
472 | } | 481 | } |
@@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
642 | cfs_rq->curr = NULL; | 651 | cfs_rq->curr = NULL; |
643 | } | 652 | } |
644 | 653 | ||
645 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 654 | static void |
655 | entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | ||
646 | { | 656 | { |
647 | /* | 657 | /* |
648 | * Update run-time statistics of the 'current'. | 658 | * Update run-time statistics of the 'current'. |
649 | */ | 659 | */ |
650 | update_curr(cfs_rq); | 660 | update_curr(cfs_rq); |
651 | 661 | ||
662 | #ifdef CONFIG_SCHED_HRTICK | ||
663 | /* | ||
664 | * queued ticks are scheduled to match the slice, so don't bother | ||
665 | * validating it and just reschedule. | ||
666 | */ | ||
667 | if (queued) | ||
668 | return resched_task(rq_of(cfs_rq)->curr); | ||
669 | /* | ||
670 | * don't let the period tick interfere with the hrtick preemption | ||
671 | */ | ||
672 | if (!sched_feat(DOUBLE_TICK) && | ||
673 | hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) | ||
674 | return; | ||
675 | #endif | ||
676 | |||
652 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 677 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
653 | check_preempt_tick(cfs_rq, curr); | 678 | check_preempt_tick(cfs_rq, curr); |
654 | } | 679 | } |
@@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
690 | 715 | ||
691 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 716 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
692 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 717 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
693 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 718 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
694 | 719 | ||
695 | /* Do the two (enqueued) entities belong to the same group ? */ | 720 | /* Do the two (enqueued) entities belong to the same group ? */ |
696 | static inline int | 721 | static inline int |
@@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
707 | return se->parent; | 732 | return se->parent; |
708 | } | 733 | } |
709 | 734 | ||
735 | #define GROUP_IMBALANCE_PCT 20 | ||
736 | |||
710 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 737 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
711 | 738 | ||
712 | #define for_each_sched_entity(se) \ | 739 | #define for_each_sched_entity(se) \ |
@@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
752 | 779 | ||
753 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 780 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
754 | 781 | ||
782 | #ifdef CONFIG_SCHED_HRTICK | ||
783 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
784 | { | ||
785 | int requeue = rq->curr == p; | ||
786 | struct sched_entity *se = &p->se; | ||
787 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
788 | |||
789 | WARN_ON(task_rq(p) != rq); | ||
790 | |||
791 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | ||
792 | u64 slice = sched_slice(cfs_rq, se); | ||
793 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | ||
794 | s64 delta = slice - ran; | ||
795 | |||
796 | if (delta < 0) { | ||
797 | if (rq->curr == p) | ||
798 | resched_task(p); | ||
799 | return; | ||
800 | } | ||
801 | |||
802 | /* | ||
803 | * Don't schedule slices shorter than 10000ns, that just | ||
804 | * doesn't make sense. Rely on vruntime for fairness. | ||
805 | */ | ||
806 | if (!requeue) | ||
807 | delta = max(10000LL, delta); | ||
808 | |||
809 | hrtick_start(rq, delta, requeue); | ||
810 | } | ||
811 | } | ||
812 | #else | ||
813 | static inline void | ||
814 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
815 | { | ||
816 | } | ||
817 | #endif | ||
818 | |||
755 | /* | 819 | /* |
756 | * The enqueue_task method is called before nr_running is | 820 | * The enqueue_task method is called before nr_running is |
757 | * increased. Here we update the fair scheduling stats and | 821 | * increased. Here we update the fair scheduling stats and |
@@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
760 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 824 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) |
761 | { | 825 | { |
762 | struct cfs_rq *cfs_rq; | 826 | struct cfs_rq *cfs_rq; |
763 | struct sched_entity *se = &p->se; | 827 | struct sched_entity *se = &p->se, |
828 | *topse = NULL; /* Highest schedulable entity */ | ||
829 | int incload = 1; | ||
764 | 830 | ||
765 | for_each_sched_entity(se) { | 831 | for_each_sched_entity(se) { |
766 | if (se->on_rq) | 832 | topse = se; |
833 | if (se->on_rq) { | ||
834 | incload = 0; | ||
767 | break; | 835 | break; |
836 | } | ||
768 | cfs_rq = cfs_rq_of(se); | 837 | cfs_rq = cfs_rq_of(se); |
769 | enqueue_entity(cfs_rq, se, wakeup); | 838 | enqueue_entity(cfs_rq, se, wakeup); |
770 | wakeup = 1; | 839 | wakeup = 1; |
771 | } | 840 | } |
841 | /* Increment cpu load if we just enqueued the first task of a group on | ||
842 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
843 | * at the highest grouping level. | ||
844 | */ | ||
845 | if (incload) | ||
846 | inc_cpu_load(rq, topse->load.weight); | ||
847 | |||
848 | hrtick_start_fair(rq, rq->curr); | ||
772 | } | 849 | } |
773 | 850 | ||
774 | /* | 851 | /* |
@@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
779 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | 856 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) |
780 | { | 857 | { |
781 | struct cfs_rq *cfs_rq; | 858 | struct cfs_rq *cfs_rq; |
782 | struct sched_entity *se = &p->se; | 859 | struct sched_entity *se = &p->se, |
860 | *topse = NULL; /* Highest schedulable entity */ | ||
861 | int decload = 1; | ||
783 | 862 | ||
784 | for_each_sched_entity(se) { | 863 | for_each_sched_entity(se) { |
864 | topse = se; | ||
785 | cfs_rq = cfs_rq_of(se); | 865 | cfs_rq = cfs_rq_of(se); |
786 | dequeue_entity(cfs_rq, se, sleep); | 866 | dequeue_entity(cfs_rq, se, sleep); |
787 | /* Don't dequeue parent if it has other entities besides us */ | 867 | /* Don't dequeue parent if it has other entities besides us */ |
788 | if (cfs_rq->load.weight) | 868 | if (cfs_rq->load.weight) { |
869 | if (parent_entity(se)) | ||
870 | decload = 0; | ||
789 | break; | 871 | break; |
872 | } | ||
790 | sleep = 1; | 873 | sleep = 1; |
791 | } | 874 | } |
875 | /* Decrement cpu load if we just dequeued the last task of a group on | ||
876 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
877 | * at the highest grouping level. | ||
878 | */ | ||
879 | if (decload) | ||
880 | dec_cpu_load(rq, topse->load.weight); | ||
881 | |||
882 | hrtick_start_fair(rq, rq->curr); | ||
792 | } | 883 | } |
793 | 884 | ||
794 | /* | 885 | /* |
@@ -836,6 +927,154 @@ static void yield_task_fair(struct rq *rq) | |||
836 | } | 927 | } |
837 | 928 | ||
838 | /* | 929 | /* |
930 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
931 | * not idle and an idle cpu is available. The span of cpus to | ||
932 | * search starts with cpus closest then further out as needed, | ||
933 | * so we always favor a closer, idle cpu. | ||
934 | * | ||
935 | * Returns the CPU we should wake onto. | ||
936 | */ | ||
937 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
938 | static int wake_idle(int cpu, struct task_struct *p) | ||
939 | { | ||
940 | cpumask_t tmp; | ||
941 | struct sched_domain *sd; | ||
942 | int i; | ||
943 | |||
944 | /* | ||
945 | * If it is idle, then it is the best cpu to run this task. | ||
946 | * | ||
947 | * This cpu is also the best, if it has more than one task already. | ||
948 | * Siblings must be also busy(in most cases) as they didn't already | ||
949 | * pickup the extra load from this cpu and hence we need not check | ||
950 | * sibling runqueue info. This will avoid the checks and cache miss | ||
951 | * penalities associated with that. | ||
952 | */ | ||
953 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
954 | return cpu; | ||
955 | |||
956 | for_each_domain(cpu, sd) { | ||
957 | if (sd->flags & SD_WAKE_IDLE) { | ||
958 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
959 | for_each_cpu_mask(i, tmp) { | ||
960 | if (idle_cpu(i)) { | ||
961 | if (i != task_cpu(p)) { | ||
962 | schedstat_inc(p, | ||
963 | se.nr_wakeups_idle); | ||
964 | } | ||
965 | return i; | ||
966 | } | ||
967 | } | ||
968 | } else { | ||
969 | break; | ||
970 | } | ||
971 | } | ||
972 | return cpu; | ||
973 | } | ||
974 | #else | ||
975 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
976 | { | ||
977 | return cpu; | ||
978 | } | ||
979 | #endif | ||
980 | |||
981 | #ifdef CONFIG_SMP | ||
982 | static int select_task_rq_fair(struct task_struct *p, int sync) | ||
983 | { | ||
984 | int cpu, this_cpu; | ||
985 | struct rq *rq; | ||
986 | struct sched_domain *sd, *this_sd = NULL; | ||
987 | int new_cpu; | ||
988 | |||
989 | cpu = task_cpu(p); | ||
990 | rq = task_rq(p); | ||
991 | this_cpu = smp_processor_id(); | ||
992 | new_cpu = cpu; | ||
993 | |||
994 | if (cpu == this_cpu) | ||
995 | goto out_set_cpu; | ||
996 | |||
997 | for_each_domain(this_cpu, sd) { | ||
998 | if (cpu_isset(cpu, sd->span)) { | ||
999 | this_sd = sd; | ||
1000 | break; | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1005 | goto out_set_cpu; | ||
1006 | |||
1007 | /* | ||
1008 | * Check for affine wakeup and passive balancing possibilities. | ||
1009 | */ | ||
1010 | if (this_sd) { | ||
1011 | int idx = this_sd->wake_idx; | ||
1012 | unsigned int imbalance; | ||
1013 | unsigned long load, this_load; | ||
1014 | |||
1015 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1016 | |||
1017 | load = source_load(cpu, idx); | ||
1018 | this_load = target_load(this_cpu, idx); | ||
1019 | |||
1020 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1021 | |||
1022 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1023 | unsigned long tl = this_load; | ||
1024 | unsigned long tl_per_task; | ||
1025 | |||
1026 | /* | ||
1027 | * Attract cache-cold tasks on sync wakeups: | ||
1028 | */ | ||
1029 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1030 | goto out_set_cpu; | ||
1031 | |||
1032 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1033 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1034 | |||
1035 | /* | ||
1036 | * If sync wakeup then subtract the (maximum possible) | ||
1037 | * effect of the currently running task from the load | ||
1038 | * of the current CPU: | ||
1039 | */ | ||
1040 | if (sync) | ||
1041 | tl -= current->se.load.weight; | ||
1042 | |||
1043 | if ((tl <= load && | ||
1044 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1045 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1046 | /* | ||
1047 | * This domain has SD_WAKE_AFFINE and | ||
1048 | * p is cache cold in this domain, and | ||
1049 | * there is no bad imbalance. | ||
1050 | */ | ||
1051 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1052 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1053 | goto out_set_cpu; | ||
1054 | } | ||
1055 | } | ||
1056 | |||
1057 | /* | ||
1058 | * Start passive balancing when half the imbalance_pct | ||
1059 | * limit is reached. | ||
1060 | */ | ||
1061 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1062 | if (imbalance*this_load <= 100*load) { | ||
1063 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1064 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1065 | goto out_set_cpu; | ||
1066 | } | ||
1067 | } | ||
1068 | } | ||
1069 | |||
1070 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1071 | out_set_cpu: | ||
1072 | return wake_idle(new_cpu, p); | ||
1073 | } | ||
1074 | #endif /* CONFIG_SMP */ | ||
1075 | |||
1076 | |||
1077 | /* | ||
839 | * Preempt the current task with a newly woken task if needed: | 1078 | * Preempt the current task with a newly woken task if needed: |
840 | */ | 1079 | */ |
841 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | 1080 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
@@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
876 | 1115 | ||
877 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 1116 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
878 | { | 1117 | { |
1118 | struct task_struct *p; | ||
879 | struct cfs_rq *cfs_rq = &rq->cfs; | 1119 | struct cfs_rq *cfs_rq = &rq->cfs; |
880 | struct sched_entity *se; | 1120 | struct sched_entity *se; |
881 | 1121 | ||
@@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
887 | cfs_rq = group_cfs_rq(se); | 1127 | cfs_rq = group_cfs_rq(se); |
888 | } while (cfs_rq); | 1128 | } while (cfs_rq); |
889 | 1129 | ||
890 | return task_of(se); | 1130 | p = task_of(se); |
1131 | hrtick_start_fair(rq, p); | ||
1132 | |||
1133 | return p; | ||
891 | } | 1134 | } |
892 | 1135 | ||
893 | /* | 1136 | /* |
@@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
944 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1187 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); |
945 | } | 1188 | } |
946 | 1189 | ||
947 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
948 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | ||
949 | { | ||
950 | struct sched_entity *curr; | ||
951 | struct task_struct *p; | ||
952 | |||
953 | if (!cfs_rq->nr_running) | ||
954 | return MAX_PRIO; | ||
955 | |||
956 | curr = cfs_rq->curr; | ||
957 | if (!curr) | ||
958 | curr = __pick_next_entity(cfs_rq); | ||
959 | |||
960 | p = task_of(curr); | ||
961 | |||
962 | return p->prio; | ||
963 | } | ||
964 | #endif | ||
965 | |||
966 | static unsigned long | 1190 | static unsigned long |
967 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1191 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
968 | unsigned long max_load_move, | 1192 | unsigned long max_load_move, |
@@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
972 | struct cfs_rq *busy_cfs_rq; | 1196 | struct cfs_rq *busy_cfs_rq; |
973 | long rem_load_move = max_load_move; | 1197 | long rem_load_move = max_load_move; |
974 | struct rq_iterator cfs_rq_iterator; | 1198 | struct rq_iterator cfs_rq_iterator; |
1199 | unsigned long load_moved; | ||
975 | 1200 | ||
976 | cfs_rq_iterator.start = load_balance_start_fair; | 1201 | cfs_rq_iterator.start = load_balance_start_fair; |
977 | cfs_rq_iterator.next = load_balance_next_fair; | 1202 | cfs_rq_iterator.next = load_balance_next_fair; |
978 | 1203 | ||
979 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1204 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
980 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1205 | #ifdef CONFIG_FAIR_GROUP_SCHED |
981 | struct cfs_rq *this_cfs_rq; | 1206 | struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; |
982 | long imbalance; | 1207 | unsigned long maxload, task_load, group_weight; |
983 | unsigned long maxload; | 1208 | unsigned long thisload, per_task_load; |
1209 | struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; | ||
1210 | |||
1211 | task_load = busy_cfs_rq->load.weight; | ||
1212 | group_weight = se->load.weight; | ||
984 | 1213 | ||
985 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1214 | /* |
1215 | * 'group_weight' is contributed by tasks of total weight | ||
1216 | * 'task_load'. To move 'rem_load_move' worth of weight only, | ||
1217 | * we need to move a maximum task load of: | ||
1218 | * | ||
1219 | * maxload = (remload / group_weight) * task_load; | ||
1220 | */ | ||
1221 | maxload = (rem_load_move * task_load) / group_weight; | ||
986 | 1222 | ||
987 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1223 | if (!maxload || !task_load) |
988 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | ||
989 | if (imbalance <= 0) | ||
990 | continue; | 1224 | continue; |
991 | 1225 | ||
992 | /* Don't pull more than imbalance/2 */ | 1226 | per_task_load = task_load / busy_cfs_rq->nr_running; |
993 | imbalance /= 2; | 1227 | /* |
994 | maxload = min(rem_load_move, imbalance); | 1228 | * balance_tasks will try to forcibly move at least one task if |
1229 | * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if | ||
1230 | * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load. | ||
1231 | */ | ||
1232 | if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) | ||
1233 | continue; | ||
995 | 1234 | ||
996 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1235 | /* Disable priority-based load balance */ |
1236 | *this_best_prio = 0; | ||
1237 | thisload = this_cfs_rq->load.weight; | ||
997 | #else | 1238 | #else |
998 | # define maxload rem_load_move | 1239 | # define maxload rem_load_move |
999 | #endif | 1240 | #endif |
@@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1002 | * load_balance_[start|next]_fair iterators | 1243 | * load_balance_[start|next]_fair iterators |
1003 | */ | 1244 | */ |
1004 | cfs_rq_iterator.arg = busy_cfs_rq; | 1245 | cfs_rq_iterator.arg = busy_cfs_rq; |
1005 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | 1246 | load_moved = balance_tasks(this_rq, this_cpu, busiest, |
1006 | maxload, sd, idle, all_pinned, | 1247 | maxload, sd, idle, all_pinned, |
1007 | this_best_prio, | 1248 | this_best_prio, |
1008 | &cfs_rq_iterator); | 1249 | &cfs_rq_iterator); |
1009 | 1250 | ||
1251 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1252 | /* | ||
1253 | * load_moved holds the task load that was moved. The | ||
1254 | * effective (group) weight moved would be: | ||
1255 | * load_moved_eff = load_moved/task_load * group_weight; | ||
1256 | */ | ||
1257 | load_moved = (group_weight * load_moved) / task_load; | ||
1258 | |||
1259 | /* Adjust shares on both cpus to reflect load_moved */ | ||
1260 | group_weight -= load_moved; | ||
1261 | set_se_shares(se, group_weight); | ||
1262 | |||
1263 | se = busy_cfs_rq->tg->se[this_cpu]; | ||
1264 | if (!thisload) | ||
1265 | group_weight = load_moved; | ||
1266 | else | ||
1267 | group_weight = se->load.weight + load_moved; | ||
1268 | set_se_shares(se, group_weight); | ||
1269 | #endif | ||
1270 | |||
1271 | rem_load_move -= load_moved; | ||
1272 | |||
1010 | if (rem_load_move <= 0) | 1273 | if (rem_load_move <= 0) |
1011 | break; | 1274 | break; |
1012 | } | 1275 | } |
@@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1042 | /* | 1305 | /* |
1043 | * scheduler tick hitting a task of our scheduling class: | 1306 | * scheduler tick hitting a task of our scheduling class: |
1044 | */ | 1307 | */ |
1045 | static void task_tick_fair(struct rq *rq, struct task_struct *curr) | 1308 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
1046 | { | 1309 | { |
1047 | struct cfs_rq *cfs_rq; | 1310 | struct cfs_rq *cfs_rq; |
1048 | struct sched_entity *se = &curr->se; | 1311 | struct sched_entity *se = &curr->se; |
1049 | 1312 | ||
1050 | for_each_sched_entity(se) { | 1313 | for_each_sched_entity(se) { |
1051 | cfs_rq = cfs_rq_of(se); | 1314 | cfs_rq = cfs_rq_of(se); |
1052 | entity_tick(cfs_rq, se); | 1315 | entity_tick(cfs_rq, se, queued); |
1053 | } | 1316 | } |
1054 | } | 1317 | } |
1055 | 1318 | ||
@@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1087 | resched_task(rq->curr); | 1350 | resched_task(rq->curr); |
1088 | } | 1351 | } |
1089 | 1352 | ||
1353 | /* | ||
1354 | * Priority of the task has changed. Check to see if we preempt | ||
1355 | * the current task. | ||
1356 | */ | ||
1357 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | ||
1358 | int oldprio, int running) | ||
1359 | { | ||
1360 | /* | ||
1361 | * Reschedule if we are currently running on this runqueue and | ||
1362 | * our priority decreased, or if we are not currently running on | ||
1363 | * this runqueue and our priority is higher than the current's | ||
1364 | */ | ||
1365 | if (running) { | ||
1366 | if (p->prio > oldprio) | ||
1367 | resched_task(rq->curr); | ||
1368 | } else | ||
1369 | check_preempt_curr(rq, p); | ||
1370 | } | ||
1371 | |||
1372 | /* | ||
1373 | * We switched to the sched_fair class. | ||
1374 | */ | ||
1375 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | ||
1376 | int running) | ||
1377 | { | ||
1378 | /* | ||
1379 | * We were most likely switched from sched_rt, so | ||
1380 | * kick off the schedule if running, otherwise just see | ||
1381 | * if we can still preempt the current task. | ||
1382 | */ | ||
1383 | if (running) | ||
1384 | resched_task(rq->curr); | ||
1385 | else | ||
1386 | check_preempt_curr(rq, p); | ||
1387 | } | ||
1388 | |||
1090 | /* Account for a task changing its policy or group. | 1389 | /* Account for a task changing its policy or group. |
1091 | * | 1390 | * |
1092 | * This routine is mostly called to set cfs_rq->curr field when a task | 1391 | * This routine is mostly called to set cfs_rq->curr field when a task |
@@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = { | |||
1108 | .enqueue_task = enqueue_task_fair, | 1407 | .enqueue_task = enqueue_task_fair, |
1109 | .dequeue_task = dequeue_task_fair, | 1408 | .dequeue_task = dequeue_task_fair, |
1110 | .yield_task = yield_task_fair, | 1409 | .yield_task = yield_task_fair, |
1410 | #ifdef CONFIG_SMP | ||
1411 | .select_task_rq = select_task_rq_fair, | ||
1412 | #endif /* CONFIG_SMP */ | ||
1111 | 1413 | ||
1112 | .check_preempt_curr = check_preempt_wakeup, | 1414 | .check_preempt_curr = check_preempt_wakeup, |
1113 | 1415 | ||
@@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = { | |||
1122 | .set_curr_task = set_curr_task_fair, | 1424 | .set_curr_task = set_curr_task_fair, |
1123 | .task_tick = task_tick_fair, | 1425 | .task_tick = task_tick_fair, |
1124 | .task_new = task_new_fair, | 1426 | .task_new = task_new_fair, |
1427 | |||
1428 | .prio_changed = prio_changed_fair, | ||
1429 | .switched_to = switched_to_fair, | ||
1125 | }; | 1430 | }; |
1126 | 1431 | ||
1127 | #ifdef CONFIG_SCHED_DEBUG | 1432 | #ifdef CONFIG_SCHED_DEBUG |
@@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
1132 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1437 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1133 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | 1438 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); |
1134 | #endif | 1439 | #endif |
1440 | rcu_read_lock(); | ||
1135 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1441 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1136 | print_cfs_rq(m, cpu, cfs_rq); | 1442 | print_cfs_rq(m, cpu, cfs_rq); |
1443 | rcu_read_unlock(); | ||
1137 | } | 1444 | } |
1138 | #endif | 1445 | #endif |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index bf9c25c15b8b..2bcafa375633 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -5,6 +5,12 @@ | |||
5 | * handled in sched_fair.c) | 5 | * handled in sched_fair.c) |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #ifdef CONFIG_SMP | ||
9 | static int select_task_rq_idle(struct task_struct *p, int sync) | ||
10 | { | ||
11 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
12 | } | ||
13 | #endif /* CONFIG_SMP */ | ||
8 | /* | 14 | /* |
9 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
10 | */ | 16 | */ |
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
55 | } | 61 | } |
56 | #endif | 62 | #endif |
57 | 63 | ||
58 | static void task_tick_idle(struct rq *rq, struct task_struct *curr) | 64 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
59 | { | 65 | { |
60 | } | 66 | } |
61 | 67 | ||
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq) | |||
63 | { | 69 | { |
64 | } | 70 | } |
65 | 71 | ||
72 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | ||
73 | int running) | ||
74 | { | ||
75 | /* Can this actually happen?? */ | ||
76 | if (running) | ||
77 | resched_task(rq->curr); | ||
78 | else | ||
79 | check_preempt_curr(rq, p); | ||
80 | } | ||
81 | |||
82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | ||
83 | int oldprio, int running) | ||
84 | { | ||
85 | /* This can happen for hot plug CPUS */ | ||
86 | |||
87 | /* | ||
88 | * Reschedule if we are currently running on this runqueue and | ||
89 | * our priority decreased, or if we are not currently running on | ||
90 | * this runqueue and our priority is higher than the current's | ||
91 | */ | ||
92 | if (running) { | ||
93 | if (p->prio > oldprio) | ||
94 | resched_task(rq->curr); | ||
95 | } else | ||
96 | check_preempt_curr(rq, p); | ||
97 | } | ||
98 | |||
66 | /* | 99 | /* |
67 | * Simple, special scheduling class for the per-CPU idle tasks: | 100 | * Simple, special scheduling class for the per-CPU idle tasks: |
68 | */ | 101 | */ |
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = { | |||
72 | 105 | ||
73 | /* dequeue is not valid, we print a debug message there: */ | 106 | /* dequeue is not valid, we print a debug message there: */ |
74 | .dequeue_task = dequeue_task_idle, | 107 | .dequeue_task = dequeue_task_idle, |
108 | #ifdef CONFIG_SMP | ||
109 | .select_task_rq = select_task_rq_idle, | ||
110 | #endif /* CONFIG_SMP */ | ||
75 | 111 | ||
76 | .check_preempt_curr = check_preempt_curr_idle, | 112 | .check_preempt_curr = check_preempt_curr_idle, |
77 | 113 | ||
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = { | |||
85 | 121 | ||
86 | .set_curr_task = set_curr_task_idle, | 122 | .set_curr_task = set_curr_task_idle, |
87 | .task_tick = task_tick_idle, | 123 | .task_tick = task_tick_idle, |
124 | |||
125 | .prio_changed = prio_changed_idle, | ||
126 | .switched_to = switched_to_idle, | ||
127 | |||
88 | /* no .task_new for idle tasks */ | 128 | /* no .task_new for idle tasks */ |
89 | }; | 129 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9ba3daa03475..274b40d7bef2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -3,6 +3,217 @@ | |||
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #ifdef CONFIG_SMP | ||
7 | |||
8 | static inline int rt_overloaded(struct rq *rq) | ||
9 | { | ||
10 | return atomic_read(&rq->rd->rto_count); | ||
11 | } | ||
12 | |||
13 | static inline void rt_set_overload(struct rq *rq) | ||
14 | { | ||
15 | cpu_set(rq->cpu, rq->rd->rto_mask); | ||
16 | /* | ||
17 | * Make sure the mask is visible before we set | ||
18 | * the overload count. That is checked to determine | ||
19 | * if we should look at the mask. It would be a shame | ||
20 | * if we looked at the mask, but the mask was not | ||
21 | * updated yet. | ||
22 | */ | ||
23 | wmb(); | ||
24 | atomic_inc(&rq->rd->rto_count); | ||
25 | } | ||
26 | |||
27 | static inline void rt_clear_overload(struct rq *rq) | ||
28 | { | ||
29 | /* the order here really doesn't matter */ | ||
30 | atomic_dec(&rq->rd->rto_count); | ||
31 | cpu_clear(rq->cpu, rq->rd->rto_mask); | ||
32 | } | ||
33 | |||
34 | static void update_rt_migration(struct rq *rq) | ||
35 | { | ||
36 | if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { | ||
37 | if (!rq->rt.overloaded) { | ||
38 | rt_set_overload(rq); | ||
39 | rq->rt.overloaded = 1; | ||
40 | } | ||
41 | } else if (rq->rt.overloaded) { | ||
42 | rt_clear_overload(rq); | ||
43 | rq->rt.overloaded = 0; | ||
44 | } | ||
45 | } | ||
46 | #endif /* CONFIG_SMP */ | ||
47 | |||
48 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | ||
49 | { | ||
50 | return container_of(rt_se, struct task_struct, rt); | ||
51 | } | ||
52 | |||
53 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | ||
54 | { | ||
55 | return !list_empty(&rt_se->run_list); | ||
56 | } | ||
57 | |||
58 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
59 | |||
60 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
61 | { | ||
62 | if (!rt_rq->tg) | ||
63 | return SCHED_RT_FRAC; | ||
64 | |||
65 | return rt_rq->tg->rt_ratio; | ||
66 | } | ||
67 | |||
68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
69 | list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | ||
70 | |||
71 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
72 | { | ||
73 | return rt_rq->rq; | ||
74 | } | ||
75 | |||
76 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
77 | { | ||
78 | return rt_se->rt_rq; | ||
79 | } | ||
80 | |||
81 | #define for_each_sched_rt_entity(rt_se) \ | ||
82 | for (; rt_se; rt_se = rt_se->parent) | ||
83 | |||
84 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
85 | { | ||
86 | return rt_se->my_q; | ||
87 | } | ||
88 | |||
89 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se); | ||
90 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | ||
91 | |||
92 | static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
93 | { | ||
94 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
95 | |||
96 | if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { | ||
97 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | ||
98 | |||
99 | enqueue_rt_entity(rt_se); | ||
100 | if (rt_rq->highest_prio < curr->prio) | ||
101 | resched_task(curr); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
106 | { | ||
107 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
108 | |||
109 | if (rt_se && on_rt_rq(rt_se)) | ||
110 | dequeue_rt_entity(rt_se); | ||
111 | } | ||
112 | |||
113 | #else | ||
114 | |||
115 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
116 | { | ||
117 | return sysctl_sched_rt_ratio; | ||
118 | } | ||
119 | |||
120 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
121 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
122 | |||
123 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
124 | { | ||
125 | return container_of(rt_rq, struct rq, rt); | ||
126 | } | ||
127 | |||
128 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
129 | { | ||
130 | struct task_struct *p = rt_task_of(rt_se); | ||
131 | struct rq *rq = task_rq(p); | ||
132 | |||
133 | return &rq->rt; | ||
134 | } | ||
135 | |||
136 | #define for_each_sched_rt_entity(rt_se) \ | ||
137 | for (; rt_se; rt_se = NULL) | ||
138 | |||
139 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
140 | { | ||
141 | return NULL; | ||
142 | } | ||
143 | |||
144 | static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
145 | { | ||
146 | } | ||
147 | |||
148 | static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
149 | { | ||
150 | } | ||
151 | |||
152 | #endif | ||
153 | |||
154 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | ||
155 | { | ||
156 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
157 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | ||
158 | |||
159 | if (rt_rq) | ||
160 | return rt_rq->highest_prio; | ||
161 | #endif | ||
162 | |||
163 | return rt_task_of(rt_se)->prio; | ||
164 | } | ||
165 | |||
166 | static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) | ||
167 | { | ||
168 | unsigned int rt_ratio = sched_rt_ratio(rt_rq); | ||
169 | u64 period, ratio; | ||
170 | |||
171 | if (rt_ratio == SCHED_RT_FRAC) | ||
172 | return 0; | ||
173 | |||
174 | if (rt_rq->rt_throttled) | ||
175 | return 1; | ||
176 | |||
177 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
178 | ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
179 | |||
180 | if (rt_rq->rt_time > ratio) { | ||
181 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
182 | |||
183 | rq->rt_throttled = 1; | ||
184 | rt_rq->rt_throttled = 1; | ||
185 | |||
186 | sched_rt_ratio_dequeue(rt_rq); | ||
187 | return 1; | ||
188 | } | ||
189 | |||
190 | return 0; | ||
191 | } | ||
192 | |||
193 | static void update_sched_rt_period(struct rq *rq) | ||
194 | { | ||
195 | struct rt_rq *rt_rq; | ||
196 | u64 period; | ||
197 | |||
198 | while (rq->clock > rq->rt_period_expire) { | ||
199 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
200 | rq->rt_period_expire += period; | ||
201 | |||
202 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
203 | unsigned long rt_ratio = sched_rt_ratio(rt_rq); | ||
204 | u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
205 | |||
206 | rt_rq->rt_time -= min(rt_rq->rt_time, ratio); | ||
207 | if (rt_rq->rt_throttled) { | ||
208 | rt_rq->rt_throttled = 0; | ||
209 | sched_rt_ratio_enqueue(rt_rq); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | rq->rt_throttled = 0; | ||
214 | } | ||
215 | } | ||
216 | |||
6 | /* | 217 | /* |
7 | * Update the current task's runtime statistics. Skip current tasks that | 218 | * Update the current task's runtime statistics. Skip current tasks that |
8 | * are not in our scheduling class. | 219 | * are not in our scheduling class. |
@@ -10,6 +221,8 @@ | |||
10 | static void update_curr_rt(struct rq *rq) | 221 | static void update_curr_rt(struct rq *rq) |
11 | { | 222 | { |
12 | struct task_struct *curr = rq->curr; | 223 | struct task_struct *curr = rq->curr; |
224 | struct sched_rt_entity *rt_se = &curr->rt; | ||
225 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
13 | u64 delta_exec; | 226 | u64 delta_exec; |
14 | 227 | ||
15 | if (!task_has_rt_policy(curr)) | 228 | if (!task_has_rt_policy(curr)) |
@@ -24,47 +237,228 @@ static void update_curr_rt(struct rq *rq) | |||
24 | curr->se.sum_exec_runtime += delta_exec; | 237 | curr->se.sum_exec_runtime += delta_exec; |
25 | curr->se.exec_start = rq->clock; | 238 | curr->se.exec_start = rq->clock; |
26 | cpuacct_charge(curr, delta_exec); | 239 | cpuacct_charge(curr, delta_exec); |
240 | |||
241 | rt_rq->rt_time += delta_exec; | ||
242 | /* | ||
243 | * might make it a tad more accurate: | ||
244 | * | ||
245 | * update_sched_rt_period(rq); | ||
246 | */ | ||
247 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
248 | resched_task(curr); | ||
27 | } | 249 | } |
28 | 250 | ||
29 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | 251 | static inline |
252 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
253 | { | ||
254 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
255 | rt_rq->rt_nr_running++; | ||
256 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
257 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) | ||
258 | rt_rq->highest_prio = rt_se_prio(rt_se); | ||
259 | #endif | ||
260 | #ifdef CONFIG_SMP | ||
261 | if (rt_se->nr_cpus_allowed > 1) { | ||
262 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
263 | rq->rt.rt_nr_migratory++; | ||
264 | } | ||
265 | |||
266 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
267 | #endif | ||
268 | } | ||
269 | |||
270 | static inline | ||
271 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
272 | { | ||
273 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
274 | WARN_ON(!rt_rq->rt_nr_running); | ||
275 | rt_rq->rt_nr_running--; | ||
276 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
277 | if (rt_rq->rt_nr_running) { | ||
278 | struct rt_prio_array *array; | ||
279 | |||
280 | WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); | ||
281 | if (rt_se_prio(rt_se) == rt_rq->highest_prio) { | ||
282 | /* recalculate */ | ||
283 | array = &rt_rq->active; | ||
284 | rt_rq->highest_prio = | ||
285 | sched_find_first_bit(array->bitmap); | ||
286 | } /* otherwise leave rq->highest prio alone */ | ||
287 | } else | ||
288 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
289 | #endif | ||
290 | #ifdef CONFIG_SMP | ||
291 | if (rt_se->nr_cpus_allowed > 1) { | ||
292 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
293 | rq->rt.rt_nr_migratory--; | ||
294 | } | ||
295 | |||
296 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
297 | #endif /* CONFIG_SMP */ | ||
298 | } | ||
299 | |||
300 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se) | ||
301 | { | ||
302 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
303 | struct rt_prio_array *array = &rt_rq->active; | ||
304 | struct rt_rq *group_rq = group_rt_rq(rt_se); | ||
305 | |||
306 | if (group_rq && group_rq->rt_throttled) | ||
307 | return; | ||
308 | |||
309 | list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
310 | __set_bit(rt_se_prio(rt_se), array->bitmap); | ||
311 | |||
312 | inc_rt_tasks(rt_se, rt_rq); | ||
313 | } | ||
314 | |||
315 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | ||
30 | { | 316 | { |
31 | struct rt_prio_array *array = &rq->rt.active; | 317 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
318 | struct rt_prio_array *array = &rt_rq->active; | ||
319 | |||
320 | list_del_init(&rt_se->run_list); | ||
321 | if (list_empty(array->queue + rt_se_prio(rt_se))) | ||
322 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | ||
32 | 323 | ||
33 | list_add_tail(&p->run_list, array->queue + p->prio); | 324 | dec_rt_tasks(rt_se, rt_rq); |
34 | __set_bit(p->prio, array->bitmap); | 325 | } |
326 | |||
327 | /* | ||
328 | * Because the prio of an upper entry depends on the lower | ||
329 | * entries, we must remove entries top - down. | ||
330 | * | ||
331 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
332 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
333 | */ | ||
334 | static void dequeue_rt_stack(struct task_struct *p) | ||
335 | { | ||
336 | struct sched_rt_entity *rt_se, *top_se; | ||
337 | |||
338 | /* | ||
339 | * dequeue all, top - down. | ||
340 | */ | ||
341 | do { | ||
342 | rt_se = &p->rt; | ||
343 | top_se = NULL; | ||
344 | for_each_sched_rt_entity(rt_se) { | ||
345 | if (on_rt_rq(rt_se)) | ||
346 | top_se = rt_se; | ||
347 | } | ||
348 | if (top_se) | ||
349 | dequeue_rt_entity(top_se); | ||
350 | } while (top_se); | ||
35 | } | 351 | } |
36 | 352 | ||
37 | /* | 353 | /* |
38 | * Adding/removing a task to/from a priority array: | 354 | * Adding/removing a task to/from a priority array: |
39 | */ | 355 | */ |
356 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | ||
357 | { | ||
358 | struct sched_rt_entity *rt_se = &p->rt; | ||
359 | |||
360 | if (wakeup) | ||
361 | rt_se->timeout = 0; | ||
362 | |||
363 | dequeue_rt_stack(p); | ||
364 | |||
365 | /* | ||
366 | * enqueue everybody, bottom - up. | ||
367 | */ | ||
368 | for_each_sched_rt_entity(rt_se) | ||
369 | enqueue_rt_entity(rt_se); | ||
370 | |||
371 | inc_cpu_load(rq, p->se.load.weight); | ||
372 | } | ||
373 | |||
40 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 374 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
41 | { | 375 | { |
42 | struct rt_prio_array *array = &rq->rt.active; | 376 | struct sched_rt_entity *rt_se = &p->rt; |
377 | struct rt_rq *rt_rq; | ||
43 | 378 | ||
44 | update_curr_rt(rq); | 379 | update_curr_rt(rq); |
45 | 380 | ||
46 | list_del(&p->run_list); | 381 | dequeue_rt_stack(p); |
47 | if (list_empty(array->queue + p->prio)) | 382 | |
48 | __clear_bit(p->prio, array->bitmap); | 383 | /* |
384 | * re-enqueue all non-empty rt_rq entities. | ||
385 | */ | ||
386 | for_each_sched_rt_entity(rt_se) { | ||
387 | rt_rq = group_rt_rq(rt_se); | ||
388 | if (rt_rq && rt_rq->rt_nr_running) | ||
389 | enqueue_rt_entity(rt_se); | ||
390 | } | ||
391 | |||
392 | dec_cpu_load(rq, p->se.load.weight); | ||
49 | } | 393 | } |
50 | 394 | ||
51 | /* | 395 | /* |
52 | * Put task to the end of the run list without the overhead of dequeue | 396 | * Put task to the end of the run list without the overhead of dequeue |
53 | * followed by enqueue. | 397 | * followed by enqueue. |
54 | */ | 398 | */ |
399 | static | ||
400 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) | ||
401 | { | ||
402 | struct rt_prio_array *array = &rt_rq->active; | ||
403 | |||
404 | list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
405 | } | ||
406 | |||
55 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | 407 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) |
56 | { | 408 | { |
57 | struct rt_prio_array *array = &rq->rt.active; | 409 | struct sched_rt_entity *rt_se = &p->rt; |
410 | struct rt_rq *rt_rq; | ||
58 | 411 | ||
59 | list_move_tail(&p->run_list, array->queue + p->prio); | 412 | for_each_sched_rt_entity(rt_se) { |
413 | rt_rq = rt_rq_of_se(rt_se); | ||
414 | requeue_rt_entity(rt_rq, rt_se); | ||
415 | } | ||
60 | } | 416 | } |
61 | 417 | ||
62 | static void | 418 | static void yield_task_rt(struct rq *rq) |
63 | yield_task_rt(struct rq *rq) | ||
64 | { | 419 | { |
65 | requeue_task_rt(rq, rq->curr); | 420 | requeue_task_rt(rq, rq->curr); |
66 | } | 421 | } |
67 | 422 | ||
423 | #ifdef CONFIG_SMP | ||
424 | static int find_lowest_rq(struct task_struct *task); | ||
425 | |||
426 | static int select_task_rq_rt(struct task_struct *p, int sync) | ||
427 | { | ||
428 | struct rq *rq = task_rq(p); | ||
429 | |||
430 | /* | ||
431 | * If the current task is an RT task, then | ||
432 | * try to see if we can wake this RT task up on another | ||
433 | * runqueue. Otherwise simply start this RT task | ||
434 | * on its current runqueue. | ||
435 | * | ||
436 | * We want to avoid overloading runqueues. Even if | ||
437 | * the RT task is of higher priority than the current RT task. | ||
438 | * RT tasks behave differently than other tasks. If | ||
439 | * one gets preempted, we try to push it off to another queue. | ||
440 | * So trying to keep a preempting RT task on the same | ||
441 | * cache hot CPU will force the running RT task to | ||
442 | * a cold CPU. So we waste all the cache for the lower | ||
443 | * RT task in hopes of saving some of a RT task | ||
444 | * that is just being woken and probably will have | ||
445 | * cold cache anyway. | ||
446 | */ | ||
447 | if (unlikely(rt_task(rq->curr)) && | ||
448 | (p->rt.nr_cpus_allowed > 1)) { | ||
449 | int cpu = find_lowest_rq(p); | ||
450 | |||
451 | return (cpu == -1) ? task_cpu(p) : cpu; | ||
452 | } | ||
453 | |||
454 | /* | ||
455 | * Otherwise, just let it ride on the affined RQ and the | ||
456 | * post-schedule router will push the preempted task away | ||
457 | */ | ||
458 | return task_cpu(p); | ||
459 | } | ||
460 | #endif /* CONFIG_SMP */ | ||
461 | |||
68 | /* | 462 | /* |
69 | * Preempt the current task with a newly woken task if needed: | 463 | * Preempt the current task with a newly woken task if needed: |
70 | */ | 464 | */ |
@@ -74,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | |||
74 | resched_task(rq->curr); | 468 | resched_task(rq->curr); |
75 | } | 469 | } |
76 | 470 | ||
77 | static struct task_struct *pick_next_task_rt(struct rq *rq) | 471 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, |
472 | struct rt_rq *rt_rq) | ||
78 | { | 473 | { |
79 | struct rt_prio_array *array = &rq->rt.active; | 474 | struct rt_prio_array *array = &rt_rq->active; |
80 | struct task_struct *next; | 475 | struct sched_rt_entity *next = NULL; |
81 | struct list_head *queue; | 476 | struct list_head *queue; |
82 | int idx; | 477 | int idx; |
83 | 478 | ||
84 | idx = sched_find_first_bit(array->bitmap); | 479 | idx = sched_find_first_bit(array->bitmap); |
85 | if (idx >= MAX_RT_PRIO) | 480 | BUG_ON(idx >= MAX_RT_PRIO); |
86 | return NULL; | ||
87 | 481 | ||
88 | queue = array->queue + idx; | 482 | queue = array->queue + idx; |
89 | next = list_entry(queue->next, struct task_struct, run_list); | 483 | next = list_entry(queue->next, struct sched_rt_entity, run_list); |
90 | |||
91 | next->se.exec_start = rq->clock; | ||
92 | 484 | ||
93 | return next; | 485 | return next; |
94 | } | 486 | } |
95 | 487 | ||
488 | static struct task_struct *pick_next_task_rt(struct rq *rq) | ||
489 | { | ||
490 | struct sched_rt_entity *rt_se; | ||
491 | struct task_struct *p; | ||
492 | struct rt_rq *rt_rq; | ||
493 | |||
494 | rt_rq = &rq->rt; | ||
495 | |||
496 | if (unlikely(!rt_rq->rt_nr_running)) | ||
497 | return NULL; | ||
498 | |||
499 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
500 | return NULL; | ||
501 | |||
502 | do { | ||
503 | rt_se = pick_next_rt_entity(rq, rt_rq); | ||
504 | BUG_ON(!rt_se); | ||
505 | rt_rq = group_rt_rq(rt_se); | ||
506 | } while (rt_rq); | ||
507 | |||
508 | p = rt_task_of(rt_se); | ||
509 | p->se.exec_start = rq->clock; | ||
510 | return p; | ||
511 | } | ||
512 | |||
96 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 513 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
97 | { | 514 | { |
98 | update_curr_rt(rq); | 515 | update_curr_rt(rq); |
@@ -100,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
100 | } | 517 | } |
101 | 518 | ||
102 | #ifdef CONFIG_SMP | 519 | #ifdef CONFIG_SMP |
103 | /* | 520 | |
104 | * Load-balancing iterator. Note: while the runqueue stays locked | 521 | /* Only try algorithms three times */ |
105 | * during the whole iteration, the current task might be | 522 | #define RT_MAX_TRIES 3 |
106 | * dequeued so the iterator has to be dequeue-safe. Here we | 523 | |
107 | * achieve that by always pre-iterating before returning | 524 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest); |
108 | * the current task: | 525 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); |
109 | */ | 526 | |
110 | static struct task_struct *load_balance_start_rt(void *arg) | 527 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
111 | { | 528 | { |
112 | struct rq *rq = arg; | 529 | if (!task_running(rq, p) && |
113 | struct rt_prio_array *array = &rq->rt.active; | 530 | (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && |
114 | struct list_head *head, *curr; | 531 | (p->rt.nr_cpus_allowed > 1)) |
115 | struct task_struct *p; | 532 | return 1; |
533 | return 0; | ||
534 | } | ||
535 | |||
536 | /* Return the second highest RT task, NULL otherwise */ | ||
537 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | ||
538 | { | ||
539 | struct task_struct *next = NULL; | ||
540 | struct sched_rt_entity *rt_se; | ||
541 | struct rt_prio_array *array; | ||
542 | struct rt_rq *rt_rq; | ||
116 | int idx; | 543 | int idx; |
117 | 544 | ||
118 | idx = sched_find_first_bit(array->bitmap); | 545 | for_each_leaf_rt_rq(rt_rq, rq) { |
119 | if (idx >= MAX_RT_PRIO) | 546 | array = &rt_rq->active; |
120 | return NULL; | 547 | idx = sched_find_first_bit(array->bitmap); |
548 | next_idx: | ||
549 | if (idx >= MAX_RT_PRIO) | ||
550 | continue; | ||
551 | if (next && next->prio < idx) | ||
552 | continue; | ||
553 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | ||
554 | struct task_struct *p = rt_task_of(rt_se); | ||
555 | if (pick_rt_task(rq, p, cpu)) { | ||
556 | next = p; | ||
557 | break; | ||
558 | } | ||
559 | } | ||
560 | if (!next) { | ||
561 | idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | ||
562 | goto next_idx; | ||
563 | } | ||
564 | } | ||
121 | 565 | ||
122 | head = array->queue + idx; | 566 | return next; |
123 | curr = head->prev; | 567 | } |
124 | 568 | ||
125 | p = list_entry(curr, struct task_struct, run_list); | 569 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); |
126 | 570 | ||
127 | curr = curr->prev; | 571 | static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) |
572 | { | ||
573 | int lowest_prio = -1; | ||
574 | int lowest_cpu = -1; | ||
575 | int count = 0; | ||
576 | int cpu; | ||
128 | 577 | ||
129 | rq->rt.rt_load_balance_idx = idx; | 578 | cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); |
130 | rq->rt.rt_load_balance_head = head; | ||
131 | rq->rt.rt_load_balance_curr = curr; | ||
132 | 579 | ||
133 | return p; | 580 | /* |
581 | * Scan each rq for the lowest prio. | ||
582 | */ | ||
583 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
584 | struct rq *rq = cpu_rq(cpu); | ||
585 | |||
586 | /* We look for lowest RT prio or non-rt CPU */ | ||
587 | if (rq->rt.highest_prio >= MAX_RT_PRIO) { | ||
588 | /* | ||
589 | * if we already found a low RT queue | ||
590 | * and now we found this non-rt queue | ||
591 | * clear the mask and set our bit. | ||
592 | * Otherwise just return the queue as is | ||
593 | * and the count==1 will cause the algorithm | ||
594 | * to use the first bit found. | ||
595 | */ | ||
596 | if (lowest_cpu != -1) { | ||
597 | cpus_clear(*lowest_mask); | ||
598 | cpu_set(rq->cpu, *lowest_mask); | ||
599 | } | ||
600 | return 1; | ||
601 | } | ||
602 | |||
603 | /* no locking for now */ | ||
604 | if ((rq->rt.highest_prio > task->prio) | ||
605 | && (rq->rt.highest_prio >= lowest_prio)) { | ||
606 | if (rq->rt.highest_prio > lowest_prio) { | ||
607 | /* new low - clear old data */ | ||
608 | lowest_prio = rq->rt.highest_prio; | ||
609 | lowest_cpu = cpu; | ||
610 | count = 0; | ||
611 | } | ||
612 | count++; | ||
613 | } else | ||
614 | cpu_clear(cpu, *lowest_mask); | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * Clear out all the set bits that represent | ||
619 | * runqueues that were of higher prio than | ||
620 | * the lowest_prio. | ||
621 | */ | ||
622 | if (lowest_cpu > 0) { | ||
623 | /* | ||
624 | * Perhaps we could add another cpumask op to | ||
625 | * zero out bits. Like cpu_zero_bits(cpumask, nrbits); | ||
626 | * Then that could be optimized to use memset and such. | ||
627 | */ | ||
628 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
629 | if (cpu >= lowest_cpu) | ||
630 | break; | ||
631 | cpu_clear(cpu, *lowest_mask); | ||
632 | } | ||
633 | } | ||
634 | |||
635 | return count; | ||
134 | } | 636 | } |
135 | 637 | ||
136 | static struct task_struct *load_balance_next_rt(void *arg) | 638 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) |
137 | { | 639 | { |
138 | struct rq *rq = arg; | 640 | int first; |
139 | struct rt_prio_array *array = &rq->rt.active; | 641 | |
140 | struct list_head *head, *curr; | 642 | /* "this_cpu" is cheaper to preempt than a remote processor */ |
141 | struct task_struct *p; | 643 | if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) |
142 | int idx; | 644 | return this_cpu; |
645 | |||
646 | first = first_cpu(*mask); | ||
647 | if (first != NR_CPUS) | ||
648 | return first; | ||
649 | |||
650 | return -1; | ||
651 | } | ||
652 | |||
653 | static int find_lowest_rq(struct task_struct *task) | ||
654 | { | ||
655 | struct sched_domain *sd; | ||
656 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); | ||
657 | int this_cpu = smp_processor_id(); | ||
658 | int cpu = task_cpu(task); | ||
659 | int count = find_lowest_cpus(task, lowest_mask); | ||
143 | 660 | ||
144 | idx = rq->rt.rt_load_balance_idx; | 661 | if (!count) |
145 | head = rq->rt.rt_load_balance_head; | 662 | return -1; /* No targets found */ |
146 | curr = rq->rt.rt_load_balance_curr; | ||
147 | 663 | ||
148 | /* | 664 | /* |
149 | * If we arrived back to the head again then | 665 | * There is no sense in performing an optimal search if only one |
150 | * iterate to the next queue (if any): | 666 | * target is found. |
151 | */ | 667 | */ |
152 | if (unlikely(head == curr)) { | 668 | if (count == 1) |
153 | int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | 669 | return first_cpu(*lowest_mask); |
154 | 670 | ||
155 | if (next_idx >= MAX_RT_PRIO) | 671 | /* |
156 | return NULL; | 672 | * At this point we have built a mask of cpus representing the |
673 | * lowest priority tasks in the system. Now we want to elect | ||
674 | * the best one based on our affinity and topology. | ||
675 | * | ||
676 | * We prioritize the last cpu that the task executed on since | ||
677 | * it is most likely cache-hot in that location. | ||
678 | */ | ||
679 | if (cpu_isset(cpu, *lowest_mask)) | ||
680 | return cpu; | ||
681 | |||
682 | /* | ||
683 | * Otherwise, we consult the sched_domains span maps to figure | ||
684 | * out which cpu is logically closest to our hot cache data. | ||
685 | */ | ||
686 | if (this_cpu == cpu) | ||
687 | this_cpu = -1; /* Skip this_cpu opt if the same */ | ||
688 | |||
689 | for_each_domain(cpu, sd) { | ||
690 | if (sd->flags & SD_WAKE_AFFINE) { | ||
691 | cpumask_t domain_mask; | ||
692 | int best_cpu; | ||
157 | 693 | ||
158 | idx = next_idx; | 694 | cpus_and(domain_mask, sd->span, *lowest_mask); |
159 | head = array->queue + idx; | ||
160 | curr = head->prev; | ||
161 | 695 | ||
162 | rq->rt.rt_load_balance_idx = idx; | 696 | best_cpu = pick_optimal_cpu(this_cpu, |
163 | rq->rt.rt_load_balance_head = head; | 697 | &domain_mask); |
698 | if (best_cpu != -1) | ||
699 | return best_cpu; | ||
700 | } | ||
164 | } | 701 | } |
165 | 702 | ||
166 | p = list_entry(curr, struct task_struct, run_list); | 703 | /* |
704 | * And finally, if there were no matches within the domains | ||
705 | * just give the caller *something* to work with from the compatible | ||
706 | * locations. | ||
707 | */ | ||
708 | return pick_optimal_cpu(this_cpu, lowest_mask); | ||
709 | } | ||
167 | 710 | ||
168 | curr = curr->prev; | 711 | /* Will lock the rq it finds */ |
712 | static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | ||
713 | { | ||
714 | struct rq *lowest_rq = NULL; | ||
715 | int tries; | ||
716 | int cpu; | ||
169 | 717 | ||
170 | rq->rt.rt_load_balance_curr = curr; | 718 | for (tries = 0; tries < RT_MAX_TRIES; tries++) { |
719 | cpu = find_lowest_rq(task); | ||
171 | 720 | ||
172 | return p; | 721 | if ((cpu == -1) || (cpu == rq->cpu)) |
722 | break; | ||
723 | |||
724 | lowest_rq = cpu_rq(cpu); | ||
725 | |||
726 | /* if the prio of this runqueue changed, try again */ | ||
727 | if (double_lock_balance(rq, lowest_rq)) { | ||
728 | /* | ||
729 | * We had to unlock the run queue. In | ||
730 | * the meantime, task could have | ||
731 | * migrated already or had its affinity changed. | ||
732 | * Also make sure that it wasn't scheduled on its rq. | ||
733 | */ | ||
734 | if (unlikely(task_rq(task) != rq || | ||
735 | !cpu_isset(lowest_rq->cpu, | ||
736 | task->cpus_allowed) || | ||
737 | task_running(rq, task) || | ||
738 | !task->se.on_rq)) { | ||
739 | |||
740 | spin_unlock(&lowest_rq->lock); | ||
741 | lowest_rq = NULL; | ||
742 | break; | ||
743 | } | ||
744 | } | ||
745 | |||
746 | /* If this rq is still suitable use it. */ | ||
747 | if (lowest_rq->rt.highest_prio > task->prio) | ||
748 | break; | ||
749 | |||
750 | /* try again */ | ||
751 | spin_unlock(&lowest_rq->lock); | ||
752 | lowest_rq = NULL; | ||
753 | } | ||
754 | |||
755 | return lowest_rq; | ||
756 | } | ||
757 | |||
758 | /* | ||
759 | * If the current CPU has more than one RT task, see if the non | ||
760 | * running task can migrate over to a CPU that is running a task | ||
761 | * of lesser priority. | ||
762 | */ | ||
763 | static int push_rt_task(struct rq *rq) | ||
764 | { | ||
765 | struct task_struct *next_task; | ||
766 | struct rq *lowest_rq; | ||
767 | int ret = 0; | ||
768 | int paranoid = RT_MAX_TRIES; | ||
769 | |||
770 | if (!rq->rt.overloaded) | ||
771 | return 0; | ||
772 | |||
773 | next_task = pick_next_highest_task_rt(rq, -1); | ||
774 | if (!next_task) | ||
775 | return 0; | ||
776 | |||
777 | retry: | ||
778 | if (unlikely(next_task == rq->curr)) { | ||
779 | WARN_ON(1); | ||
780 | return 0; | ||
781 | } | ||
782 | |||
783 | /* | ||
784 | * It's possible that the next_task slipped in of | ||
785 | * higher priority than current. If that's the case | ||
786 | * just reschedule current. | ||
787 | */ | ||
788 | if (unlikely(next_task->prio < rq->curr->prio)) { | ||
789 | resched_task(rq->curr); | ||
790 | return 0; | ||
791 | } | ||
792 | |||
793 | /* We might release rq lock */ | ||
794 | get_task_struct(next_task); | ||
795 | |||
796 | /* find_lock_lowest_rq locks the rq if found */ | ||
797 | lowest_rq = find_lock_lowest_rq(next_task, rq); | ||
798 | if (!lowest_rq) { | ||
799 | struct task_struct *task; | ||
800 | /* | ||
801 | * find_lock_lowest_rq releases rq->lock | ||
802 | * so it is possible that next_task has changed. | ||
803 | * If it has, then try again. | ||
804 | */ | ||
805 | task = pick_next_highest_task_rt(rq, -1); | ||
806 | if (unlikely(task != next_task) && task && paranoid--) { | ||
807 | put_task_struct(next_task); | ||
808 | next_task = task; | ||
809 | goto retry; | ||
810 | } | ||
811 | goto out; | ||
812 | } | ||
813 | |||
814 | deactivate_task(rq, next_task, 0); | ||
815 | set_task_cpu(next_task, lowest_rq->cpu); | ||
816 | activate_task(lowest_rq, next_task, 0); | ||
817 | |||
818 | resched_task(lowest_rq->curr); | ||
819 | |||
820 | spin_unlock(&lowest_rq->lock); | ||
821 | |||
822 | ret = 1; | ||
823 | out: | ||
824 | put_task_struct(next_task); | ||
825 | |||
826 | return ret; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * TODO: Currently we just use the second highest prio task on | ||
831 | * the queue, and stop when it can't migrate (or there's | ||
832 | * no more RT tasks). There may be a case where a lower | ||
833 | * priority RT task has a different affinity than the | ||
834 | * higher RT task. In this case the lower RT task could | ||
835 | * possibly be able to migrate whereas the higher priority | ||
836 | * RT task could not. We currently ignore this issue. | ||
837 | * Enhancements are welcome! | ||
838 | */ | ||
839 | static void push_rt_tasks(struct rq *rq) | ||
840 | { | ||
841 | /* push_rt_task will return true if it moved an RT */ | ||
842 | while (push_rt_task(rq)) | ||
843 | ; | ||
844 | } | ||
845 | |||
846 | static int pull_rt_task(struct rq *this_rq) | ||
847 | { | ||
848 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
849 | struct task_struct *p, *next; | ||
850 | struct rq *src_rq; | ||
851 | |||
852 | if (likely(!rt_overloaded(this_rq))) | ||
853 | return 0; | ||
854 | |||
855 | next = pick_next_task_rt(this_rq); | ||
856 | |||
857 | for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { | ||
858 | if (this_cpu == cpu) | ||
859 | continue; | ||
860 | |||
861 | src_rq = cpu_rq(cpu); | ||
862 | /* | ||
863 | * We can potentially drop this_rq's lock in | ||
864 | * double_lock_balance, and another CPU could | ||
865 | * steal our next task - hence we must cause | ||
866 | * the caller to recalculate the next task | ||
867 | * in that case: | ||
868 | */ | ||
869 | if (double_lock_balance(this_rq, src_rq)) { | ||
870 | struct task_struct *old_next = next; | ||
871 | |||
872 | next = pick_next_task_rt(this_rq); | ||
873 | if (next != old_next) | ||
874 | ret = 1; | ||
875 | } | ||
876 | |||
877 | /* | ||
878 | * Are there still pullable RT tasks? | ||
879 | */ | ||
880 | if (src_rq->rt.rt_nr_running <= 1) | ||
881 | goto skip; | ||
882 | |||
883 | p = pick_next_highest_task_rt(src_rq, this_cpu); | ||
884 | |||
885 | /* | ||
886 | * Do we have an RT task that preempts | ||
887 | * the to-be-scheduled task? | ||
888 | */ | ||
889 | if (p && (!next || (p->prio < next->prio))) { | ||
890 | WARN_ON(p == src_rq->curr); | ||
891 | WARN_ON(!p->se.on_rq); | ||
892 | |||
893 | /* | ||
894 | * There's a chance that p is higher in priority | ||
895 | * than what's currently running on its cpu. | ||
896 | * This is just that p is waking up and hasn't | ||
897 | * had a chance to schedule. We only pull | ||
898 | * p if it is lower in priority than the | ||
899 | * current task on the run queue or | ||
900 | * this_rq next task is lower in prio than | ||
901 | * the current task on that rq. | ||
902 | */ | ||
903 | if (p->prio < src_rq->curr->prio || | ||
904 | (next && next->prio < src_rq->curr->prio)) | ||
905 | goto skip; | ||
906 | |||
907 | ret = 1; | ||
908 | |||
909 | deactivate_task(src_rq, p, 0); | ||
910 | set_task_cpu(p, this_cpu); | ||
911 | activate_task(this_rq, p, 0); | ||
912 | /* | ||
913 | * We continue with the search, just in | ||
914 | * case there's an even higher prio task | ||
915 | * in another runqueue. (low likelihood | ||
916 | * but possible) | ||
917 | * | ||
918 | * Update next so that we won't pick a task | ||
919 | * on another cpu with a priority lower (or equal) | ||
920 | * than the one we just picked. | ||
921 | */ | ||
922 | next = p; | ||
923 | |||
924 | } | ||
925 | skip: | ||
926 | spin_unlock(&src_rq->lock); | ||
927 | } | ||
928 | |||
929 | return ret; | ||
930 | } | ||
931 | |||
932 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | ||
933 | { | ||
934 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
935 | if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) | ||
936 | pull_rt_task(rq); | ||
937 | } | ||
938 | |||
939 | static void post_schedule_rt(struct rq *rq) | ||
940 | { | ||
941 | /* | ||
942 | * If we have more than one rt_task queued, then | ||
943 | * see if we can push the other rt_tasks off to other CPUS. | ||
944 | * Note we may release the rq lock, and since | ||
945 | * the lock was owned by prev, we need to release it | ||
946 | * first via finish_lock_switch and then reacquire it here. | ||
947 | */ | ||
948 | if (unlikely(rq->rt.overloaded)) { | ||
949 | spin_lock_irq(&rq->lock); | ||
950 | push_rt_tasks(rq); | ||
951 | spin_unlock_irq(&rq->lock); | ||
952 | } | ||
953 | } | ||
954 | |||
955 | |||
956 | static void task_wake_up_rt(struct rq *rq, struct task_struct *p) | ||
957 | { | ||
958 | if (!task_running(rq, p) && | ||
959 | (p->prio >= rq->rt.highest_prio) && | ||
960 | rq->rt.overloaded) | ||
961 | push_rt_tasks(rq); | ||
173 | } | 962 | } |
174 | 963 | ||
175 | static unsigned long | 964 | static unsigned long |
@@ -178,38 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
178 | struct sched_domain *sd, enum cpu_idle_type idle, | 967 | struct sched_domain *sd, enum cpu_idle_type idle, |
179 | int *all_pinned, int *this_best_prio) | 968 | int *all_pinned, int *this_best_prio) |
180 | { | 969 | { |
181 | struct rq_iterator rt_rq_iterator; | 970 | /* don't touch RT tasks */ |
182 | 971 | return 0; | |
183 | rt_rq_iterator.start = load_balance_start_rt; | ||
184 | rt_rq_iterator.next = load_balance_next_rt; | ||
185 | /* pass 'busiest' rq argument into | ||
186 | * load_balance_[start|next]_rt iterators | ||
187 | */ | ||
188 | rt_rq_iterator.arg = busiest; | ||
189 | |||
190 | return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, | ||
191 | idle, all_pinned, this_best_prio, &rt_rq_iterator); | ||
192 | } | 972 | } |
193 | 973 | ||
194 | static int | 974 | static int |
195 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | 975 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, |
196 | struct sched_domain *sd, enum cpu_idle_type idle) | 976 | struct sched_domain *sd, enum cpu_idle_type idle) |
197 | { | 977 | { |
198 | struct rq_iterator rt_rq_iterator; | 978 | /* don't touch RT tasks */ |
979 | return 0; | ||
980 | } | ||
981 | |||
982 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | ||
983 | { | ||
984 | int weight = cpus_weight(*new_mask); | ||
985 | |||
986 | BUG_ON(!rt_task(p)); | ||
199 | 987 | ||
200 | rt_rq_iterator.start = load_balance_start_rt; | 988 | /* |
201 | rt_rq_iterator.next = load_balance_next_rt; | 989 | * Update the migration status of the RQ if we have an RT task |
202 | rt_rq_iterator.arg = busiest; | 990 | * which is running AND changing its weight value. |
991 | */ | ||
992 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
993 | struct rq *rq = task_rq(p); | ||
994 | |||
995 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | ||
996 | rq->rt.rt_nr_migratory++; | ||
997 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
998 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
999 | rq->rt.rt_nr_migratory--; | ||
1000 | } | ||
1001 | |||
1002 | update_rt_migration(rq); | ||
1003 | } | ||
203 | 1004 | ||
204 | return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, | 1005 | p->cpus_allowed = *new_mask; |
205 | &rt_rq_iterator); | 1006 | p->rt.nr_cpus_allowed = weight; |
206 | } | 1007 | } |
207 | #endif | ||
208 | 1008 | ||
209 | static void task_tick_rt(struct rq *rq, struct task_struct *p) | 1009 | /* Assumes rq->lock is held */ |
1010 | static void join_domain_rt(struct rq *rq) | ||
1011 | { | ||
1012 | if (rq->rt.overloaded) | ||
1013 | rt_set_overload(rq); | ||
1014 | } | ||
1015 | |||
1016 | /* Assumes rq->lock is held */ | ||
1017 | static void leave_domain_rt(struct rq *rq) | ||
1018 | { | ||
1019 | if (rq->rt.overloaded) | ||
1020 | rt_clear_overload(rq); | ||
1021 | } | ||
1022 | |||
1023 | /* | ||
1024 | * When switching from the rt queue, we put ourselves in a position | ||
1025 | * where we might want to pull RT tasks from other runqueues. | ||
1026 | */ | ||
1027 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | ||
1028 | int running) | ||
1029 | { | ||
1030 | /* | ||
1031 | * If there are other RT tasks then we will reschedule | ||
1032 | * and the scheduling of the other RT tasks will handle | ||
1033 | * the balancing. But if we are the last RT task | ||
1034 | * we may need to handle the pulling of RT tasks | ||
1035 | * now. | ||
1036 | */ | ||
1037 | if (!rq->rt.rt_nr_running) | ||
1038 | pull_rt_task(rq); | ||
1039 | } | ||
1040 | #endif /* CONFIG_SMP */ | ||
1041 | |||
1042 | /* | ||
1043 | * When switching a task to RT, we may overload the runqueue | ||
1044 | * with RT tasks. In this case we try to push them off to | ||
1045 | * other runqueues. | ||
1046 | */ | ||
1047 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | ||
1048 | int running) | ||
1049 | { | ||
1050 | int check_resched = 1; | ||
1051 | |||
1052 | /* | ||
1053 | * If we are already running, then there's nothing | ||
1054 | * that needs to be done. But if we are not running | ||
1055 | * we may need to preempt the current running task. | ||
1056 | * If that current running task is also an RT task | ||
1057 | * then see if we can move to another run queue. | ||
1058 | */ | ||
1059 | if (!running) { | ||
1060 | #ifdef CONFIG_SMP | ||
1061 | if (rq->rt.overloaded && push_rt_task(rq) && | ||
1062 | /* Don't resched if we changed runqueues */ | ||
1063 | rq != task_rq(p)) | ||
1064 | check_resched = 0; | ||
1065 | #endif /* CONFIG_SMP */ | ||
1066 | if (check_resched && p->prio < rq->curr->prio) | ||
1067 | resched_task(rq->curr); | ||
1068 | } | ||
1069 | } | ||
1070 | |||
1071 | /* | ||
1072 | * Priority of the task has changed. This may cause | ||
1073 | * us to initiate a push or pull. | ||
1074 | */ | ||
1075 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | ||
1076 | int oldprio, int running) | ||
1077 | { | ||
1078 | if (running) { | ||
1079 | #ifdef CONFIG_SMP | ||
1080 | /* | ||
1081 | * If our priority decreases while running, we | ||
1082 | * may need to pull tasks to this runqueue. | ||
1083 | */ | ||
1084 | if (oldprio < p->prio) | ||
1085 | pull_rt_task(rq); | ||
1086 | /* | ||
1087 | * If there's a higher priority task waiting to run | ||
1088 | * then reschedule. | ||
1089 | */ | ||
1090 | if (p->prio > rq->rt.highest_prio) | ||
1091 | resched_task(p); | ||
1092 | #else | ||
1093 | /* For UP simply resched on drop of prio */ | ||
1094 | if (oldprio < p->prio) | ||
1095 | resched_task(p); | ||
1096 | #endif /* CONFIG_SMP */ | ||
1097 | } else { | ||
1098 | /* | ||
1099 | * This task is not running, but if it has a | ||
1100 | * higher priority than the currently running task | ||
1101 | * then reschedule. | ||
1102 | */ | ||
1103 | if (p->prio < rq->curr->prio) | ||
1104 | resched_task(rq->curr); | ||
1105 | } | ||
1106 | } | ||
1107 | |||
1108 | static void watchdog(struct rq *rq, struct task_struct *p) | ||
1109 | { | ||
1110 | unsigned long soft, hard; | ||
1111 | |||
1112 | if (!p->signal) | ||
1113 | return; | ||
1114 | |||
1115 | soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; | ||
1116 | hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; | ||
1117 | |||
1118 | if (soft != RLIM_INFINITY) { | ||
1119 | unsigned long next; | ||
1120 | |||
1121 | p->rt.timeout++; | ||
1122 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); | ||
1123 | if (p->rt.timeout > next) | ||
1124 | p->it_sched_expires = p->se.sum_exec_runtime; | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | ||
210 | { | 1129 | { |
211 | update_curr_rt(rq); | 1130 | update_curr_rt(rq); |
212 | 1131 | ||
1132 | watchdog(rq, p); | ||
1133 | |||
213 | /* | 1134 | /* |
214 | * RR tasks need a special form of timeslice management. | 1135 | * RR tasks need a special form of timeslice management. |
215 | * FIFO tasks have no timeslices. | 1136 | * FIFO tasks have no timeslices. |
@@ -217,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
217 | if (p->policy != SCHED_RR) | 1138 | if (p->policy != SCHED_RR) |
218 | return; | 1139 | return; |
219 | 1140 | ||
220 | if (--p->time_slice) | 1141 | if (--p->rt.time_slice) |
221 | return; | 1142 | return; |
222 | 1143 | ||
223 | p->time_slice = DEF_TIMESLICE; | 1144 | p->rt.time_slice = DEF_TIMESLICE; |
224 | 1145 | ||
225 | /* | 1146 | /* |
226 | * Requeue to the end of queue if we are not the only element | 1147 | * Requeue to the end of queue if we are not the only element |
227 | * on the queue: | 1148 | * on the queue: |
228 | */ | 1149 | */ |
229 | if (p->run_list.prev != p->run_list.next) { | 1150 | if (p->rt.run_list.prev != p->rt.run_list.next) { |
230 | requeue_task_rt(rq, p); | 1151 | requeue_task_rt(rq, p); |
231 | set_tsk_need_resched(p); | 1152 | set_tsk_need_resched(p); |
232 | } | 1153 | } |
@@ -244,6 +1165,9 @@ const struct sched_class rt_sched_class = { | |||
244 | .enqueue_task = enqueue_task_rt, | 1165 | .enqueue_task = enqueue_task_rt, |
245 | .dequeue_task = dequeue_task_rt, | 1166 | .dequeue_task = dequeue_task_rt, |
246 | .yield_task = yield_task_rt, | 1167 | .yield_task = yield_task_rt, |
1168 | #ifdef CONFIG_SMP | ||
1169 | .select_task_rq = select_task_rq_rt, | ||
1170 | #endif /* CONFIG_SMP */ | ||
247 | 1171 | ||
248 | .check_preempt_curr = check_preempt_curr_rt, | 1172 | .check_preempt_curr = check_preempt_curr_rt, |
249 | 1173 | ||
@@ -253,8 +1177,18 @@ const struct sched_class rt_sched_class = { | |||
253 | #ifdef CONFIG_SMP | 1177 | #ifdef CONFIG_SMP |
254 | .load_balance = load_balance_rt, | 1178 | .load_balance = load_balance_rt, |
255 | .move_one_task = move_one_task_rt, | 1179 | .move_one_task = move_one_task_rt, |
1180 | .set_cpus_allowed = set_cpus_allowed_rt, | ||
1181 | .join_domain = join_domain_rt, | ||
1182 | .leave_domain = leave_domain_rt, | ||
1183 | .pre_schedule = pre_schedule_rt, | ||
1184 | .post_schedule = post_schedule_rt, | ||
1185 | .task_wake_up = task_wake_up_rt, | ||
1186 | .switched_from = switched_from_rt, | ||
256 | #endif | 1187 | #endif |
257 | 1188 | ||
258 | .set_curr_task = set_curr_task_rt, | 1189 | .set_curr_task = set_curr_task_rt, |
259 | .task_tick = task_tick_rt, | 1190 | .task_tick = task_tick_rt, |
1191 | |||
1192 | .prio_changed = prio_changed_rt, | ||
1193 | .switched_to = switched_to_rt, | ||
260 | }; | 1194 | }; |
diff --git a/kernel/signal.c b/kernel/signal.c index afa4f781f924..bf49ce6f016b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) | |||
733 | current->comm, task_pid_nr(current), signr); | 733 | current->comm, task_pid_nr(current), signr); |
734 | 734 | ||
735 | #if defined(__i386__) && !defined(__arch_um__) | 735 | #if defined(__i386__) && !defined(__arch_um__) |
736 | printk("code at %08lx: ", regs->eip); | 736 | printk("code at %08lx: ", regs->ip); |
737 | { | 737 | { |
738 | int i; | 738 | int i; |
739 | for (i = 0; i < 16; i++) { | 739 | for (i = 0; i < 16; i++) { |
740 | unsigned char insn; | 740 | unsigned char insn; |
741 | 741 | ||
742 | __get_user(insn, (unsigned char *)(regs->eip + i)); | 742 | __get_user(insn, (unsigned char *)(regs->ip + i)); |
743 | printk("%02x ", insn); | 743 | printk("%02x ", insn); |
744 | } | 744 | } |
745 | } | 745 | } |
diff --git a/kernel/softirq.c b/kernel/softirq.c index bd89bc4eb0b9..d7837d45419e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -3,7 +3,9 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 1992 Linus Torvalds | 4 | * Copyright (C) 1992 Linus Torvalds |
5 | * | 5 | * |
6 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | 6 | * Distribute under GPLv2. |
7 | * | ||
8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | ||
7 | */ | 9 | */ |
8 | 10 | ||
9 | #include <linux/module.h> | 11 | #include <linux/module.h> |
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void) | |||
278 | */ | 280 | */ |
279 | void irq_enter(void) | 281 | void irq_enter(void) |
280 | { | 282 | { |
283 | #ifdef CONFIG_NO_HZ | ||
284 | int cpu = smp_processor_id(); | ||
285 | if (idle_cpu(cpu) && !in_interrupt()) | ||
286 | tick_nohz_stop_idle(cpu); | ||
287 | #endif | ||
281 | __irq_enter(); | 288 | __irq_enter(); |
282 | #ifdef CONFIG_NO_HZ | 289 | #ifdef CONFIG_NO_HZ |
283 | if (idle_cpu(smp_processor_id())) | 290 | if (idle_cpu(cpu)) |
284 | tick_nohz_update_jiffies(); | 291 | tick_nohz_update_jiffies(); |
285 | #endif | 292 | #endif |
286 | } | 293 | } |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 11df812263c8..c1d76552446e 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/cpu.h> | 10 | #include <linux/cpu.h> |
11 | #include <linux/nmi.h> | ||
11 | #include <linux/init.h> | 12 | #include <linux/init.h> |
12 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
13 | #include <linux/freezer.h> | 14 | #include <linux/freezer.h> |
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp); | |||
23 | static DEFINE_PER_CPU(unsigned long, print_timestamp); | 24 | static DEFINE_PER_CPU(unsigned long, print_timestamp); |
24 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | 25 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); |
25 | 26 | ||
26 | static int did_panic; | 27 | static int __read_mostly did_panic; |
27 | int softlockup_thresh = 10; | 28 | unsigned long __read_mostly softlockup_thresh = 60; |
28 | 29 | ||
29 | static int | 30 | static int |
30 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | 31 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) |
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = { | |||
45 | */ | 46 | */ |
46 | static unsigned long get_timestamp(int this_cpu) | 47 | static unsigned long get_timestamp(int this_cpu) |
47 | { | 48 | { |
48 | return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ | 49 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ |
49 | } | 50 | } |
50 | 51 | ||
51 | void touch_softlockup_watchdog(void) | 52 | void touch_softlockup_watchdog(void) |
@@ -100,11 +101,7 @@ void softlockup_tick(void) | |||
100 | 101 | ||
101 | now = get_timestamp(this_cpu); | 102 | now = get_timestamp(this_cpu); |
102 | 103 | ||
103 | /* Wake up the high-prio watchdog task every second: */ | 104 | /* Warn about unreasonable delays: */ |
104 | if (now > (touch_timestamp + 1)) | ||
105 | wake_up_process(per_cpu(watchdog_task, this_cpu)); | ||
106 | |||
107 | /* Warn about unreasonable 10+ seconds delays: */ | ||
108 | if (now <= (touch_timestamp + softlockup_thresh)) | 105 | if (now <= (touch_timestamp + softlockup_thresh)) |
109 | return; | 106 | return; |
110 | 107 | ||
@@ -122,11 +119,93 @@ void softlockup_tick(void) | |||
122 | } | 119 | } |
123 | 120 | ||
124 | /* | 121 | /* |
122 | * Have a reasonable limit on the number of tasks checked: | ||
123 | */ | ||
124 | unsigned long __read_mostly sysctl_hung_task_check_count = 1024; | ||
125 | |||
126 | /* | ||
127 | * Zero means infinite timeout - no checking done: | ||
128 | */ | ||
129 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | ||
130 | |||
131 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | ||
132 | |||
133 | /* | ||
134 | * Only do the hung-tasks check on one CPU: | ||
135 | */ | ||
136 | static int check_cpu __read_mostly = -1; | ||
137 | |||
138 | static void check_hung_task(struct task_struct *t, unsigned long now) | ||
139 | { | ||
140 | unsigned long switch_count = t->nvcsw + t->nivcsw; | ||
141 | |||
142 | if (t->flags & PF_FROZEN) | ||
143 | return; | ||
144 | |||
145 | if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { | ||
146 | t->last_switch_count = switch_count; | ||
147 | t->last_switch_timestamp = now; | ||
148 | return; | ||
149 | } | ||
150 | if ((long)(now - t->last_switch_timestamp) < | ||
151 | sysctl_hung_task_timeout_secs) | ||
152 | return; | ||
153 | if (sysctl_hung_task_warnings < 0) | ||
154 | return; | ||
155 | sysctl_hung_task_warnings--; | ||
156 | |||
157 | /* | ||
158 | * Ok, the task did not get scheduled for longer than the configured | ||
159 | * timeout (sysctl_hung_task_timeout_secs, 120 seconds by default), complain: | ||
160 | */ | ||
161 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | ||
162 | "%ld seconds.\n", t->comm, t->pid, | ||
163 | sysctl_hung_task_timeout_secs); | ||
164 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
165 | " disables this message.\n"); | ||
166 | sched_show_task(t); | ||
167 | __debug_show_held_locks(t); | ||
168 | |||
169 | t->last_switch_timestamp = now; | ||
170 | touch_nmi_watchdog(); | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for | ||
175 | * a really long time (sysctl_hung_task_timeout_secs, 120 seconds by | ||
176 | * default). If that happens, print out a warning. | ||
177 | */ | ||
178 | static void check_hung_uninterruptible_tasks(int this_cpu) | ||
179 | { | ||
180 | int max_count = sysctl_hung_task_check_count; | ||
181 | unsigned long now = get_timestamp(this_cpu); | ||
182 | struct task_struct *g, *t; | ||
183 | |||
184 | /* | ||
185 | * If the system crashed already then all bets are off, | ||
186 | * do not report extra hung tasks: | ||
187 | */ | ||
188 | if ((tainted & TAINT_DIE) || did_panic) | ||
189 | return; | ||
190 | |||
191 | read_lock(&tasklist_lock); | ||
192 | do_each_thread(g, t) { | ||
193 | if (!--max_count) | ||
194 | break; | ||
195 | if (t->state & TASK_UNINTERRUPTIBLE) | ||
196 | check_hung_task(t, now); | ||
197 | } while_each_thread(g, t); | ||
198 | |||
199 | read_unlock(&tasklist_lock); | ||
200 | } | ||
201 | |||
202 | /* | ||
125 | * The watchdog thread - runs every second and touches the timestamp. | 203 | * The watchdog thread - runs every second and touches the timestamp. |
126 | */ | 204 | */ |
127 | static int watchdog(void *__bind_cpu) | 205 | static int watchdog(void *__bind_cpu) |
128 | { | 206 | { |
129 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 207 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
208 | int this_cpu = (long)__bind_cpu; | ||
130 | 209 | ||
131 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 210 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
132 | 211 | ||
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu) | |||
135 | 214 | ||
136 | /* | 215 | /* |
137 | * Run briefly once per second to reset the softlockup timestamp. | 216 | * Run briefly once per second to reset the softlockup timestamp. |
138 | * If this gets delayed for more than 10 seconds then the | 217 | * If this gets delayed for more than 60 seconds then the |
139 | * debug-printout triggers in softlockup_tick(). | 218 | * debug-printout triggers in softlockup_tick(). |
140 | */ | 219 | */ |
141 | while (!kthread_should_stop()) { | 220 | while (!kthread_should_stop()) { |
142 | set_current_state(TASK_INTERRUPTIBLE); | ||
143 | touch_softlockup_watchdog(); | 221 | touch_softlockup_watchdog(); |
144 | schedule(); | 222 | msleep_interruptible(10000); |
223 | |||
224 | if (this_cpu != check_cpu) | ||
225 | continue; | ||
226 | |||
227 | if (sysctl_hung_task_timeout_secs) | ||
228 | check_hung_uninterruptible_tasks(this_cpu); | ||
145 | } | 229 | } |
146 | 230 | ||
147 | return 0; | 231 | return 0; |
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
171 | break; | 255 | break; |
172 | case CPU_ONLINE: | 256 | case CPU_ONLINE: |
173 | case CPU_ONLINE_FROZEN: | 257 | case CPU_ONLINE_FROZEN: |
258 | check_cpu = any_online_cpu(cpu_online_map); | ||
174 | wake_up_process(per_cpu(watchdog_task, hotcpu)); | 259 | wake_up_process(per_cpu(watchdog_task, hotcpu)); |
175 | break; | 260 | break; |
176 | #ifdef CONFIG_HOTPLUG_CPU | 261 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
181 | /* Unbind so it can run. Fall thru. */ | 266 | /* Unbind so it can run. Fall thru. */ |
182 | kthread_bind(per_cpu(watchdog_task, hotcpu), | 267 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
183 | any_online_cpu(cpu_online_map)); | 268 | any_online_cpu(cpu_online_map)); |
269 | case CPU_DOWN_PREPARE: | ||
270 | case CPU_DOWN_PREPARE_FROZEN: | ||
271 | if (hotcpu == check_cpu) { | ||
272 | cpumask_t temp_cpu_online_map = cpu_online_map; | ||
273 | |||
274 | cpu_clear(hotcpu, temp_cpu_online_map); | ||
275 | check_cpu = any_online_cpu(temp_cpu_online_map); | ||
276 | } | ||
277 | break; | ||
184 | case CPU_DEAD: | 278 | case CPU_DEAD: |
185 | case CPU_DEAD_FROZEN: | 279 | case CPU_DEAD_FROZEN: |
186 | p = per_cpu(watchdog_task, hotcpu); | 280 | p = per_cpu(watchdog_task, hotcpu); |
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index cd72424c2662..ae28c8245123 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock); | |||
65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are | 65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are |
66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): | 66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): |
67 | */ | 67 | */ |
68 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ | 68 | #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) |
69 | defined(CONFIG_DEBUG_LOCK_ALLOC) | ||
70 | 69 | ||
71 | void __lockfunc _read_lock(rwlock_t *lock) | 70 | void __lockfunc _read_lock(rwlock_t *lock) |
72 | { | 71 | { |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 319821ef78af..51b5ee53571a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) | |||
203 | int ret; | 203 | int ret; |
204 | 204 | ||
205 | /* No CPUs can come up or down during this. */ | 205 | /* No CPUs can come up or down during this. */ |
206 | lock_cpu_hotplug(); | 206 | get_online_cpus(); |
207 | p = __stop_machine_run(fn, data, cpu); | 207 | p = __stop_machine_run(fn, data, cpu); |
208 | if (!IS_ERR(p)) | 208 | if (!IS_ERR(p)) |
209 | ret = kthread_stop(p); | 209 | ret = kthread_stop(p); |
210 | else | 210 | else |
211 | ret = PTR_ERR(p); | 211 | ret = PTR_ERR(p); |
212 | unlock_cpu_hotplug(); | 212 | put_online_cpus(); |
213 | 213 | ||
214 | return ret; | 214 | return ret; |
215 | } | 215 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c68f68dcc605..357b68ba23ec 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #ifdef CONFIG_X86 | 53 | #ifdef CONFIG_X86 |
54 | #include <asm/nmi.h> | 54 | #include <asm/nmi.h> |
55 | #include <asm/stacktrace.h> | 55 | #include <asm/stacktrace.h> |
56 | #include <asm/io.h> | ||
56 | #endif | 57 | #endif |
57 | 58 | ||
58 | static int deprecated_sysctl_warning(struct __sysctl_args *args); | 59 | static int deprecated_sysctl_warning(struct __sysctl_args *args); |
@@ -81,6 +82,7 @@ extern int compat_log; | |||
81 | extern int maps_protect; | 82 | extern int maps_protect; |
82 | extern int sysctl_stat_interval; | 83 | extern int sysctl_stat_interval; |
83 | extern int audit_argv_kb; | 84 | extern int audit_argv_kb; |
85 | extern int latencytop_enabled; | ||
84 | 86 | ||
85 | /* Constants used for minimum and maximum */ | 87 | /* Constants used for minimum and maximum */ |
86 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 88 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
@@ -156,8 +158,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * | |||
156 | #endif | 158 | #endif |
157 | 159 | ||
158 | static struct ctl_table root_table[]; | 160 | static struct ctl_table root_table[]; |
159 | static struct ctl_table_header root_table_header = | 161 | static struct ctl_table_root sysctl_table_root; |
160 | { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; | 162 | static struct ctl_table_header root_table_header = { |
163 | .ctl_table = root_table, | ||
164 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), | ||
165 | .root = &sysctl_table_root, | ||
166 | }; | ||
167 | static struct ctl_table_root sysctl_table_root = { | ||
168 | .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), | ||
169 | .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), | ||
170 | }; | ||
161 | 171 | ||
162 | static struct ctl_table kern_table[]; | 172 | static struct ctl_table kern_table[]; |
163 | static struct ctl_table vm_table[]; | 173 | static struct ctl_table vm_table[]; |
@@ -191,14 +201,6 @@ static struct ctl_table root_table[] = { | |||
191 | .mode = 0555, | 201 | .mode = 0555, |
192 | .child = vm_table, | 202 | .child = vm_table, |
193 | }, | 203 | }, |
194 | #ifdef CONFIG_NET | ||
195 | { | ||
196 | .ctl_name = CTL_NET, | ||
197 | .procname = "net", | ||
198 | .mode = 0555, | ||
199 | .child = net_table, | ||
200 | }, | ||
201 | #endif | ||
202 | { | 204 | { |
203 | .ctl_name = CTL_FS, | 205 | .ctl_name = CTL_FS, |
204 | .procname = "fs", | 206 | .procname = "fs", |
@@ -306,9 +308,43 @@ static struct ctl_table kern_table[] = { | |||
306 | .procname = "sched_nr_migrate", | 308 | .procname = "sched_nr_migrate", |
307 | .data = &sysctl_sched_nr_migrate, | 309 | .data = &sysctl_sched_nr_migrate, |
308 | .maxlen = sizeof(unsigned int), | 310 | .maxlen = sizeof(unsigned int), |
309 | .mode = 644, | 311 | .mode = 0644, |
312 | .proc_handler = &proc_dointvec, | ||
313 | }, | ||
314 | { | ||
315 | .ctl_name = CTL_UNNUMBERED, | ||
316 | .procname = "sched_rt_period_ms", | ||
317 | .data = &sysctl_sched_rt_period, | ||
318 | .maxlen = sizeof(unsigned int), | ||
319 | .mode = 0644, | ||
320 | .proc_handler = &proc_dointvec, | ||
321 | }, | ||
322 | { | ||
323 | .ctl_name = CTL_UNNUMBERED, | ||
324 | .procname = "sched_rt_ratio", | ||
325 | .data = &sysctl_sched_rt_ratio, | ||
326 | .maxlen = sizeof(unsigned int), | ||
327 | .mode = 0644, | ||
310 | .proc_handler = &proc_dointvec, | 328 | .proc_handler = &proc_dointvec, |
311 | }, | 329 | }, |
330 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
331 | { | ||
332 | .ctl_name = CTL_UNNUMBERED, | ||
333 | .procname = "sched_min_bal_int_shares", | ||
334 | .data = &sysctl_sched_min_bal_int_shares, | ||
335 | .maxlen = sizeof(unsigned int), | ||
336 | .mode = 0644, | ||
337 | .proc_handler = &proc_dointvec, | ||
338 | }, | ||
339 | { | ||
340 | .ctl_name = CTL_UNNUMBERED, | ||
341 | .procname = "sched_max_bal_int_shares", | ||
342 | .data = &sysctl_sched_max_bal_int_shares, | ||
343 | .maxlen = sizeof(unsigned int), | ||
344 | .mode = 0644, | ||
345 | .proc_handler = &proc_dointvec, | ||
346 | }, | ||
347 | #endif | ||
312 | #endif | 348 | #endif |
313 | { | 349 | { |
314 | .ctl_name = CTL_UNNUMBERED, | 350 | .ctl_name = CTL_UNNUMBERED, |
@@ -382,6 +418,15 @@ static struct ctl_table kern_table[] = { | |||
382 | .proc_handler = &proc_dointvec_taint, | 418 | .proc_handler = &proc_dointvec_taint, |
383 | }, | 419 | }, |
384 | #endif | 420 | #endif |
421 | #ifdef CONFIG_LATENCYTOP | ||
422 | { | ||
423 | .procname = "latencytop", | ||
424 | .data = &latencytop_enabled, | ||
425 | .maxlen = sizeof(int), | ||
426 | .mode = 0644, | ||
427 | .proc_handler = &proc_dointvec, | ||
428 | }, | ||
429 | #endif | ||
385 | #ifdef CONFIG_SECURITY_CAPABILITIES | 430 | #ifdef CONFIG_SECURITY_CAPABILITIES |
386 | { | 431 | { |
387 | .procname = "cap-bound", | 432 | .procname = "cap-bound", |
@@ -683,6 +728,14 @@ static struct ctl_table kern_table[] = { | |||
683 | .mode = 0644, | 728 | .mode = 0644, |
684 | .proc_handler = &proc_dointvec, | 729 | .proc_handler = &proc_dointvec, |
685 | }, | 730 | }, |
731 | { | ||
732 | .ctl_name = CTL_UNNUMBERED, | ||
733 | .procname = "io_delay_type", | ||
734 | .data = &io_delay_type, | ||
735 | .maxlen = sizeof(int), | ||
736 | .mode = 0644, | ||
737 | .proc_handler = &proc_dointvec, | ||
738 | }, | ||
686 | #endif | 739 | #endif |
687 | #if defined(CONFIG_MMU) | 740 | #if defined(CONFIG_MMU) |
688 | { | 741 | { |
@@ -728,13 +781,40 @@ static struct ctl_table kern_table[] = { | |||
728 | .ctl_name = CTL_UNNUMBERED, | 781 | .ctl_name = CTL_UNNUMBERED, |
729 | .procname = "softlockup_thresh", | 782 | .procname = "softlockup_thresh", |
730 | .data = &softlockup_thresh, | 783 | .data = &softlockup_thresh, |
731 | .maxlen = sizeof(int), | 784 | .maxlen = sizeof(unsigned long), |
732 | .mode = 0644, | 785 | .mode = 0644, |
733 | .proc_handler = &proc_dointvec_minmax, | 786 | .proc_handler = &proc_doulongvec_minmax, |
734 | .strategy = &sysctl_intvec, | 787 | .strategy = &sysctl_intvec, |
735 | .extra1 = &one, | 788 | .extra1 = &one, |
736 | .extra2 = &sixty, | 789 | .extra2 = &sixty, |
737 | }, | 790 | }, |
791 | { | ||
792 | .ctl_name = CTL_UNNUMBERED, | ||
793 | .procname = "hung_task_check_count", | ||
794 | .data = &sysctl_hung_task_check_count, | ||
795 | .maxlen = sizeof(unsigned long), | ||
796 | .mode = 0644, | ||
797 | .proc_handler = &proc_doulongvec_minmax, | ||
798 | .strategy = &sysctl_intvec, | ||
799 | }, | ||
800 | { | ||
801 | .ctl_name = CTL_UNNUMBERED, | ||
802 | .procname = "hung_task_timeout_secs", | ||
803 | .data = &sysctl_hung_task_timeout_secs, | ||
804 | .maxlen = sizeof(unsigned long), | ||
805 | .mode = 0644, | ||
806 | .proc_handler = &proc_doulongvec_minmax, | ||
807 | .strategy = &sysctl_intvec, | ||
808 | }, | ||
809 | { | ||
810 | .ctl_name = CTL_UNNUMBERED, | ||
811 | .procname = "hung_task_warnings", | ||
812 | .data = &sysctl_hung_task_warnings, | ||
813 | .maxlen = sizeof(unsigned long), | ||
814 | .mode = 0644, | ||
815 | .proc_handler = &proc_doulongvec_minmax, | ||
816 | .strategy = &sysctl_intvec, | ||
817 | }, | ||
738 | #endif | 818 | #endif |
739 | #ifdef CONFIG_COMPAT | 819 | #ifdef CONFIG_COMPAT |
740 | { | 820 | { |
@@ -1300,12 +1380,27 @@ void sysctl_head_finish(struct ctl_table_header *head) | |||
1300 | spin_unlock(&sysctl_lock); | 1380 | spin_unlock(&sysctl_lock); |
1301 | } | 1381 | } |
1302 | 1382 | ||
1303 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | 1383 | static struct list_head * |
1384 | lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) | ||
1304 | { | 1385 | { |
1386 | struct list_head *header_list; | ||
1387 | header_list = &root->header_list; | ||
1388 | if (root->lookup) | ||
1389 | header_list = root->lookup(root, namespaces); | ||
1390 | return header_list; | ||
1391 | } | ||
1392 | |||
1393 | struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, | ||
1394 | struct ctl_table_header *prev) | ||
1395 | { | ||
1396 | struct ctl_table_root *root; | ||
1397 | struct list_head *header_list; | ||
1305 | struct ctl_table_header *head; | 1398 | struct ctl_table_header *head; |
1306 | struct list_head *tmp; | 1399 | struct list_head *tmp; |
1400 | |||
1307 | spin_lock(&sysctl_lock); | 1401 | spin_lock(&sysctl_lock); |
1308 | if (prev) { | 1402 | if (prev) { |
1403 | head = prev; | ||
1309 | tmp = &prev->ctl_entry; | 1404 | tmp = &prev->ctl_entry; |
1310 | unuse_table(prev); | 1405 | unuse_table(prev); |
1311 | goto next; | 1406 | goto next; |
@@ -1319,14 +1414,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | |||
1319 | spin_unlock(&sysctl_lock); | 1414 | spin_unlock(&sysctl_lock); |
1320 | return head; | 1415 | return head; |
1321 | next: | 1416 | next: |
1417 | root = head->root; | ||
1322 | tmp = tmp->next; | 1418 | tmp = tmp->next; |
1323 | if (tmp == &root_table_header.ctl_entry) | 1419 | header_list = lookup_header_list(root, namespaces); |
1324 | break; | 1420 | if (tmp != header_list) |
1421 | continue; | ||
1422 | |||
1423 | do { | ||
1424 | root = list_entry(root->root_list.next, | ||
1425 | struct ctl_table_root, root_list); | ||
1426 | if (root == &sysctl_table_root) | ||
1427 | goto out; | ||
1428 | header_list = lookup_header_list(root, namespaces); | ||
1429 | } while (list_empty(header_list)); | ||
1430 | tmp = header_list->next; | ||
1325 | } | 1431 | } |
1432 | out: | ||
1326 | spin_unlock(&sysctl_lock); | 1433 | spin_unlock(&sysctl_lock); |
1327 | return NULL; | 1434 | return NULL; |
1328 | } | 1435 | } |
1329 | 1436 | ||
1437 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | ||
1438 | { | ||
1439 | return __sysctl_head_next(current->nsproxy, prev); | ||
1440 | } | ||
1441 | |||
1442 | void register_sysctl_root(struct ctl_table_root *root) | ||
1443 | { | ||
1444 | spin_lock(&sysctl_lock); | ||
1445 | list_add_tail(&root->root_list, &sysctl_table_root.root_list); | ||
1446 | spin_unlock(&sysctl_lock); | ||
1447 | } | ||
1448 | |||
1330 | #ifdef CONFIG_SYSCTL_SYSCALL | 1449 | #ifdef CONFIG_SYSCTL_SYSCALL |
1331 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | 1450 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, |
1332 | void __user *newval, size_t newlen) | 1451 | void __user *newval, size_t newlen) |
@@ -1483,18 +1602,21 @@ static __init int sysctl_init(void) | |||
1483 | { | 1602 | { |
1484 | int err; | 1603 | int err; |
1485 | sysctl_set_parent(NULL, root_table); | 1604 | sysctl_set_parent(NULL, root_table); |
1486 | err = sysctl_check_table(root_table); | 1605 | err = sysctl_check_table(current->nsproxy, root_table); |
1487 | return 0; | 1606 | return 0; |
1488 | } | 1607 | } |
1489 | 1608 | ||
1490 | core_initcall(sysctl_init); | 1609 | core_initcall(sysctl_init); |
1491 | 1610 | ||
1492 | /** | 1611 | /** |
1493 | * register_sysctl_table - register a sysctl hierarchy | 1612 | * __register_sysctl_paths - register a sysctl hierarchy |
1613 | * @root: List of sysctl headers to register on | ||
1614 | * @namespaces: Data to compute which lists of sysctl entries are visible | ||
1615 | * @path: The path to the directory the sysctl table is in. | ||
1494 | * @table: the top-level table structure | 1616 | * @table: the top-level table structure |
1495 | * | 1617 | * |
1496 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | 1618 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table |
1497 | * array. An entry with a ctl_name of 0 terminates the table. | 1619 | * array. A completely 0 filled entry terminates the table. |
1498 | * | 1620 | * |
1499 | * The members of the &struct ctl_table structure are used as follows: | 1621 | * The members of the &struct ctl_table structure are used as follows: |
1500 | * | 1622 | * |
@@ -1557,25 +1679,99 @@ core_initcall(sysctl_init); | |||
1557 | * This routine returns %NULL on a failure to register, and a pointer | 1679 | * This routine returns %NULL on a failure to register, and a pointer |
1558 | * to the table header on success. | 1680 | * to the table header on success. |
1559 | */ | 1681 | */ |
1560 | struct ctl_table_header *register_sysctl_table(struct ctl_table * table) | 1682 | struct ctl_table_header *__register_sysctl_paths( |
1683 | struct ctl_table_root *root, | ||
1684 | struct nsproxy *namespaces, | ||
1685 | const struct ctl_path *path, struct ctl_table *table) | ||
1561 | { | 1686 | { |
1562 | struct ctl_table_header *tmp; | 1687 | struct list_head *header_list; |
1563 | tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); | 1688 | struct ctl_table_header *header; |
1564 | if (!tmp) | 1689 | struct ctl_table *new, **prevp; |
1690 | unsigned int n, npath; | ||
1691 | |||
1692 | /* Count the path components */ | ||
1693 | for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) | ||
1694 | ; | ||
1695 | |||
1696 | /* | ||
1697 | * For each path component, allocate a 2-element ctl_table array. | ||
1698 | * The first array element will be filled with the sysctl entry for | ||
1699 | * this path component, the second stays zero-filled as the sentinel. | ||
1700 | * | ||
1701 | * We allocate everything in one go so that we don't have to | ||
1702 | * worry about freeing additional memory in unregister_sysctl_table. | ||
1703 | */ | ||
1704 | header = kzalloc(sizeof(struct ctl_table_header) + | ||
1705 | (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); | ||
1706 | if (!header) | ||
1565 | return NULL; | 1707 | return NULL; |
1566 | tmp->ctl_table = table; | 1708 | |
1567 | INIT_LIST_HEAD(&tmp->ctl_entry); | 1709 | new = (struct ctl_table *) (header + 1); |
1568 | tmp->used = 0; | 1710 | |
1569 | tmp->unregistering = NULL; | 1711 | /* Now connect the dots */ |
1570 | sysctl_set_parent(NULL, table); | 1712 | prevp = &header->ctl_table; |
1571 | if (sysctl_check_table(tmp->ctl_table)) { | 1713 | for (n = 0; n < npath; ++n, ++path) { |
1572 | kfree(tmp); | 1714 | /* Copy the procname */ |
1715 | new->procname = path->procname; | ||
1716 | new->ctl_name = path->ctl_name; | ||
1717 | new->mode = 0555; | ||
1718 | |||
1719 | *prevp = new; | ||
1720 | prevp = &new->child; | ||
1721 | |||
1722 | new += 2; | ||
1723 | } | ||
1724 | *prevp = table; | ||
1725 | header->ctl_table_arg = table; | ||
1726 | |||
1727 | INIT_LIST_HEAD(&header->ctl_entry); | ||
1728 | header->used = 0; | ||
1729 | header->unregistering = NULL; | ||
1730 | header->root = root; | ||
1731 | sysctl_set_parent(NULL, header->ctl_table); | ||
1732 | if (sysctl_check_table(namespaces, header->ctl_table)) { | ||
1733 | kfree(header); | ||
1573 | return NULL; | 1734 | return NULL; |
1574 | } | 1735 | } |
1575 | spin_lock(&sysctl_lock); | 1736 | spin_lock(&sysctl_lock); |
1576 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | 1737 | header_list = lookup_header_list(root, namespaces); |
1738 | list_add_tail(&header->ctl_entry, header_list); | ||
1577 | spin_unlock(&sysctl_lock); | 1739 | spin_unlock(&sysctl_lock); |
1578 | return tmp; | 1740 | |
1741 | return header; | ||
1742 | } | ||
1743 | |||
1744 | /** | ||
1745 | * register_sysctl_paths - register a sysctl table hierarchy | ||
1746 | * @path: The path to the directory the sysctl table is in. | ||
1747 | * @table: the top-level table structure | ||
1748 | * | ||
1749 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1750 | * array. A completely 0 filled entry terminates the table. | ||
1751 | * | ||
1752 | * See __register_sysctl_paths for more details. | ||
1753 | */ | ||
1754 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | ||
1755 | struct ctl_table *table) | ||
1756 | { | ||
1757 | return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, | ||
1758 | path, table); | ||
1759 | } | ||
1760 | |||
1761 | /** | ||
1762 | * register_sysctl_table - register a sysctl table hierarchy | ||
1763 | * @table: the top-level table structure | ||
1764 | * | ||
1765 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1766 | * array. A completely 0 filled entry terminates the table. | ||
1767 | * | ||
1768 | * See register_sysctl_paths for more details. | ||
1769 | */ | ||
1770 | struct ctl_table_header *register_sysctl_table(struct ctl_table *table) | ||
1771 | { | ||
1772 | static const struct ctl_path null_path[] = { {} }; | ||
1773 | |||
1774 | return register_sysctl_paths(null_path, table); | ||
1579 | } | 1775 | } |
1580 | 1776 | ||
1581 | /** | 1777 | /** |
@@ -1604,6 +1800,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table) | |||
1604 | return NULL; | 1800 | return NULL; |
1605 | } | 1801 | } |
1606 | 1802 | ||
1803 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | ||
1804 | struct ctl_table *table) | ||
1805 | { | ||
1806 | return NULL; | ||
1807 | } | ||
1808 | |||
1607 | void unregister_sysctl_table(struct ctl_table_header * table) | 1809 | void unregister_sysctl_table(struct ctl_table_header * table) |
1608 | { | 1810 | { |
1609 | } | 1811 | } |
@@ -2662,6 +2864,7 @@ EXPORT_SYMBOL(proc_dostring); | |||
2662 | EXPORT_SYMBOL(proc_doulongvec_minmax); | 2864 | EXPORT_SYMBOL(proc_doulongvec_minmax); |
2663 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); | 2865 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); |
2664 | EXPORT_SYMBOL(register_sysctl_table); | 2866 | EXPORT_SYMBOL(register_sysctl_table); |
2867 | EXPORT_SYMBOL(register_sysctl_paths); | ||
2665 | EXPORT_SYMBOL(sysctl_intvec); | 2868 | EXPORT_SYMBOL(sysctl_intvec); |
2666 | EXPORT_SYMBOL(sysctl_jiffies); | 2869 | EXPORT_SYMBOL(sysctl_jiffies); |
2667 | EXPORT_SYMBOL(sysctl_ms_jiffies); | 2870 | EXPORT_SYMBOL(sysctl_ms_jiffies); |
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index a68425a5cc1d..c3206fa50048 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include <linux/stat.h> | 1 | #include <linux/stat.h> |
2 | #include <linux/sysctl.h> | 2 | #include <linux/sysctl.h> |
3 | #include "../arch/s390/appldata/appldata.h" | ||
4 | #include "../fs/xfs/linux-2.6/xfs_sysctl.h" | 3 | #include "../fs/xfs/linux-2.6/xfs_sysctl.h" |
5 | #include <linux/sunrpc/debug.h> | 4 | #include <linux/sunrpc/debug.h> |
6 | #include <linux/string.h> | 5 | #include <linux/string.h> |
@@ -1343,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table) | |||
1343 | } | 1342 | } |
1344 | } | 1343 | } |
1345 | 1344 | ||
1346 | static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) | 1345 | static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, |
1346 | struct ctl_table *table) | ||
1347 | { | 1347 | { |
1348 | struct ctl_table_header *head; | 1348 | struct ctl_table_header *head; |
1349 | struct ctl_table *ref, *test; | 1349 | struct ctl_table *ref, *test; |
@@ -1351,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) | |||
1351 | 1351 | ||
1352 | depth = sysctl_depth(table); | 1352 | depth = sysctl_depth(table); |
1353 | 1353 | ||
1354 | for (head = sysctl_head_next(NULL); head; | 1354 | for (head = __sysctl_head_next(namespaces, NULL); head; |
1355 | head = sysctl_head_next(head)) { | 1355 | head = __sysctl_head_next(namespaces, head)) { |
1356 | cur_depth = depth; | 1356 | cur_depth = depth; |
1357 | ref = head->ctl_table; | 1357 | ref = head->ctl_table; |
1358 | repeat: | 1358 | repeat: |
@@ -1397,13 +1397,14 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str | |||
1397 | *fail = str; | 1397 | *fail = str; |
1398 | } | 1398 | } |
1399 | 1399 | ||
1400 | static int sysctl_check_dir(struct ctl_table *table) | 1400 | static int sysctl_check_dir(struct nsproxy *namespaces, |
1401 | struct ctl_table *table) | ||
1401 | { | 1402 | { |
1402 | struct ctl_table *ref; | 1403 | struct ctl_table *ref; |
1403 | int error; | 1404 | int error; |
1404 | 1405 | ||
1405 | error = 0; | 1406 | error = 0; |
1406 | ref = sysctl_check_lookup(table); | 1407 | ref = sysctl_check_lookup(namespaces, table); |
1407 | if (ref) { | 1408 | if (ref) { |
1408 | int match = 0; | 1409 | int match = 0; |
1409 | if ((!table->procname && !ref->procname) || | 1410 | if ((!table->procname && !ref->procname) || |
@@ -1428,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table) | |||
1428 | return error; | 1429 | return error; |
1429 | } | 1430 | } |
1430 | 1431 | ||
1431 | static void sysctl_check_leaf(struct ctl_table *table, const char **fail) | 1432 | static void sysctl_check_leaf(struct nsproxy *namespaces, |
1433 | struct ctl_table *table, const char **fail) | ||
1432 | { | 1434 | { |
1433 | struct ctl_table *ref; | 1435 | struct ctl_table *ref; |
1434 | 1436 | ||
1435 | ref = sysctl_check_lookup(table); | 1437 | ref = sysctl_check_lookup(namespaces, table); |
1436 | if (ref && (ref != table)) | 1438 | if (ref && (ref != table)) |
1437 | set_fail(fail, table, "Sysctl already exists"); | 1439 | set_fail(fail, table, "Sysctl already exists"); |
1438 | } | 1440 | } |
@@ -1456,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) | |||
1456 | } | 1458 | } |
1457 | } | 1459 | } |
1458 | 1460 | ||
1459 | int sysctl_check_table(struct ctl_table *table) | 1461 | int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) |
1460 | { | 1462 | { |
1461 | int error = 0; | 1463 | int error = 0; |
1462 | for (; table->ctl_name || table->procname; table++) { | 1464 | for (; table->ctl_name || table->procname; table++) { |
@@ -1486,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1486 | set_fail(&fail, table, "Directory with extra1"); | 1488 | set_fail(&fail, table, "Directory with extra1"); |
1487 | if (table->extra2) | 1489 | if (table->extra2) |
1488 | set_fail(&fail, table, "Directory with extra2"); | 1490 | set_fail(&fail, table, "Directory with extra2"); |
1489 | if (sysctl_check_dir(table)) | 1491 | if (sysctl_check_dir(namespaces, table)) |
1490 | set_fail(&fail, table, "Inconsistent directory names"); | 1492 | set_fail(&fail, table, "Inconsistent directory names"); |
1491 | } else { | 1493 | } else { |
1492 | if ((table->strategy == sysctl_data) || | 1494 | if ((table->strategy == sysctl_data) || |
@@ -1535,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1535 | if (!table->procname && table->proc_handler) | 1537 | if (!table->procname && table->proc_handler) |
1536 | set_fail(&fail, table, "proc_handler without procname"); | 1538 | set_fail(&fail, table, "proc_handler without procname"); |
1537 | #endif | 1539 | #endif |
1538 | sysctl_check_leaf(table, &fail); | 1540 | sysctl_check_leaf(namespaces, table, &fail); |
1539 | } | 1541 | } |
1540 | sysctl_check_bin_path(table, &fail); | 1542 | sysctl_check_bin_path(table, &fail); |
1541 | if (fail) { | 1543 | if (fail) { |
@@ -1543,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1543 | error = -EINVAL; | 1545 | error = -EINVAL; |
1544 | } | 1546 | } |
1545 | if (table->child) | 1547 | if (table->child) |
1546 | error |= sysctl_check_table(table->child); | 1548 | error |= sysctl_check_table(namespaces, table->child); |
1547 | } | 1549 | } |
1548 | return error; | 1550 | return error; |
1549 | } | 1551 | } |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c new file mode 100644 index 000000000000..88cdb109e13c --- /dev/null +++ b/kernel/test_kprobes.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * test_kprobes.c - simple sanity test for *probes | ||
3 | * | ||
4 | * Copyright IBM Corp. 2008 | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it would be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
14 | * the GNU General Public License for more details. | ||
15 | */ | ||
16 | |||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/kprobes.h> | ||
19 | #include <linux/random.h> | ||
20 | |||
21 | #define div_factor 3 | ||
22 | |||
23 | static u32 rand1, preh_val, posth_val, jph_val; | ||
24 | static int errors, handler_errors, num_tests; | ||
25 | |||
26 | static noinline u32 kprobe_target(u32 value) | ||
27 | { | ||
28 | /* | ||
29 | * gcc ignores noinline on some architectures unless we stuff | ||
30 | * sufficient lard into the function. The get_kprobe() here is | ||
31 | * just for that. | ||
32 | * | ||
33 | * NOTE: We aren't concerned about the correctness of get_kprobe() | ||
34 | * here; hence, this call is neither under !preempt nor with the | ||
35 | * kprobe_mutex held. This is fine(tm) | ||
36 | */ | ||
37 | if (get_kprobe((void *)0xdeadbeef)) | ||
38 | printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); | ||
39 | |||
40 | return (value / div_factor); | ||
41 | } | ||
42 | |||
43 | static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
44 | { | ||
45 | preh_val = (rand1 / div_factor); | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, | ||
50 | unsigned long flags) | ||
51 | { | ||
52 | if (preh_val != (rand1 / div_factor)) { | ||
53 | handler_errors++; | ||
54 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
55 | "incorrect value in post_handler\n"); | ||
56 | } | ||
57 | posth_val = preh_val + div_factor; | ||
58 | } | ||
59 | |||
60 | static struct kprobe kp = { | ||
61 | .symbol_name = "kprobe_target", | ||
62 | .pre_handler = kp_pre_handler, | ||
63 | .post_handler = kp_post_handler | ||
64 | }; | ||
65 | |||
66 | static int test_kprobe(void) | ||
67 | { | ||
68 | int ret; | ||
69 | |||
70 | ret = register_kprobe(&kp); | ||
71 | if (ret < 0) { | ||
72 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
73 | "register_kprobe returned %d\n", ret); | ||
74 | return ret; | ||
75 | } | ||
76 | |||
77 | ret = kprobe_target(rand1); | ||
78 | unregister_kprobe(&kp); | ||
79 | |||
80 | if (preh_val == 0) { | ||
81 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
82 | "kprobe pre_handler not called\n"); | ||
83 | handler_errors++; | ||
84 | } | ||
85 | |||
86 | if (posth_val == 0) { | ||
87 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
88 | "kprobe post_handler not called\n"); | ||
89 | handler_errors++; | ||
90 | } | ||
91 | |||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | static u32 j_kprobe_target(u32 value) | ||
96 | { | ||
97 | if (value != rand1) { | ||
98 | handler_errors++; | ||
99 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
100 | "incorrect value in jprobe handler\n"); | ||
101 | } | ||
102 | |||
103 | jph_val = rand1; | ||
104 | jprobe_return(); | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | static struct jprobe jp = { | ||
109 | .entry = j_kprobe_target, | ||
110 | .kp.symbol_name = "kprobe_target" | ||
111 | }; | ||
112 | |||
113 | static int test_jprobe(void) | ||
114 | { | ||
115 | int ret; | ||
116 | |||
117 | ret = register_jprobe(&jp); | ||
118 | if (ret < 0) { | ||
119 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
120 | "register_jprobe returned %d\n", ret); | ||
121 | return ret; | ||
122 | } | ||
123 | |||
124 | ret = kprobe_target(rand1); | ||
125 | unregister_jprobe(&jp); | ||
126 | if (jph_val == 0) { | ||
127 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
128 | "jprobe handler not called\n"); | ||
129 | handler_errors++; | ||
130 | } | ||
131 | |||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | #ifdef CONFIG_KRETPROBES | ||
136 | static u32 krph_val; | ||
137 | |||
138 | static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) | ||
139 | { | ||
140 | unsigned long ret = regs_return_value(regs); | ||
141 | |||
142 | if (ret != (rand1 / div_factor)) { | ||
143 | handler_errors++; | ||
144 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
145 | "incorrect value in kretprobe handler\n"); | ||
146 | } | ||
147 | |||
148 | krph_val = (rand1 / div_factor); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | static struct kretprobe rp = { | ||
153 | .handler = return_handler, | ||
154 | .kp.symbol_name = "kprobe_target" | ||
155 | }; | ||
156 | |||
157 | static int test_kretprobe(void) | ||
158 | { | ||
159 | int ret; | ||
160 | |||
161 | ret = register_kretprobe(&rp); | ||
162 | if (ret < 0) { | ||
163 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
164 | "register_kretprobe returned %d\n", ret); | ||
165 | return ret; | ||
166 | } | ||
167 | |||
168 | ret = kprobe_target(rand1); | ||
169 | unregister_kretprobe(&rp); | ||
170 | if (krph_val == 0) { | ||
171 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
172 | "kretprobe handler not called\n"); | ||
173 | handler_errors++; | ||
174 | } | ||
175 | |||
176 | return 0; | ||
177 | } | ||
178 | #endif /* CONFIG_KRETPROBES */ | ||
179 | |||
180 | int init_test_probes(void) | ||
181 | { | ||
182 | int ret; | ||
183 | |||
184 | do { | ||
185 | rand1 = random32(); | ||
186 | } while (rand1 <= div_factor); | ||
187 | |||
188 | printk(KERN_INFO "Kprobe smoke test started\n"); | ||
189 | num_tests++; | ||
190 | ret = test_kprobe(); | ||
191 | if (ret < 0) | ||
192 | errors++; | ||
193 | |||
194 | num_tests++; | ||
195 | ret = test_jprobe(); | ||
196 | if (ret < 0) | ||
197 | errors++; | ||
198 | |||
199 | #ifdef CONFIG_KRETPROBES | ||
200 | num_tests++; | ||
201 | ret = test_kretprobe(); | ||
202 | if (ret < 0) | ||
203 | errors++; | ||
204 | #endif /* CONFIG_KRETPROBES */ | ||
205 | |||
206 | if (errors) | ||
207 | printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " | ||
208 | "%d tests failed\n", errors, num_tests); | ||
209 | else if (handler_errors) | ||
210 | printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " | ||
211 | "running handlers\n", handler_errors); | ||
212 | else | ||
213 | printk(KERN_INFO "Kprobe smoke test passed successfully\n"); | ||
214 | |||
215 | return 0; | ||
216 | } | ||
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5fb139fef9fa..3e59fce6dd43 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch, | |||
41 | { | 41 | { |
42 | u64 clc = ((u64) latch << evt->shift); | 42 | u64 clc = ((u64) latch << evt->shift); |
43 | 43 | ||
44 | if (unlikely(!evt->mult)) { | ||
45 | evt->mult = 1; | ||
46 | WARN_ON(1); | ||
47 | } | ||
48 | |||
44 | do_div(clc, evt->mult); | 49 | do_div(clc, evt->mult); |
45 | if (clc < 1000) | 50 | if (clc < 1000) |
46 | clc = 1000; | 51 | clc = 1000; |
@@ -151,6 +156,14 @@ static void clockevents_notify_released(void) | |||
151 | void clockevents_register_device(struct clock_event_device *dev) | 156 | void clockevents_register_device(struct clock_event_device *dev) |
152 | { | 157 | { |
153 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 158 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); |
159 | /* | ||
160 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | ||
161 | * on it, so fix it up and emit a warning: | ||
162 | */ | ||
163 | if (unlikely(!dev->mult)) { | ||
164 | dev->mult = 1; | ||
165 | WARN_ON(1); | ||
166 | } | ||
154 | 167 | ||
155 | spin_lock(&clockevents_lock); | 168 | spin_lock(&clockevents_lock); |
156 | 169 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c8a9d13874df..6e9259a5d501 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data) | |||
142 | } | 142 | } |
143 | 143 | ||
144 | if (!list_empty(&watchdog_list)) { | 144 | if (!list_empty(&watchdog_list)) { |
145 | __mod_timer(&watchdog_timer, | 145 | /* Cycle through CPUs to check if the CPUs stay synchronized to |
146 | watchdog_timer.expires + WATCHDOG_INTERVAL); | 146 | * each other. */ |
147 | int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); | ||
148 | if (next_cpu >= NR_CPUS) | ||
149 | next_cpu = first_cpu(cpu_online_map); | ||
150 | watchdog_timer.expires += WATCHDOG_INTERVAL; | ||
151 | add_timer_on(&watchdog_timer, next_cpu); | ||
147 | } | 152 | } |
148 | spin_unlock(&watchdog_lock); | 153 | spin_unlock(&watchdog_lock); |
149 | } | 154 | } |
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
165 | if (!started && watchdog) { | 170 | if (!started && watchdog) { |
166 | watchdog_last = watchdog->read(); | 171 | watchdog_last = watchdog->read(); |
167 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 172 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
168 | add_timer(&watchdog_timer); | 173 | add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); |
169 | } | 174 | } |
170 | } else { | 175 | } else { |
171 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) | 176 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) |
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
175 | if (watchdog) | 180 | if (watchdog) |
176 | del_timer(&watchdog_timer); | 181 | del_timer(&watchdog_timer); |
177 | watchdog = cs; | 182 | watchdog = cs; |
178 | init_timer(&watchdog_timer); | 183 | init_timer_deferrable(&watchdog_timer); |
179 | watchdog_timer.function = clocksource_watchdog; | 184 | watchdog_timer.function = clocksource_watchdog; |
180 | 185 | ||
181 | /* Reset watchdog cycles */ | 186 | /* Reset watchdog cycles */ |
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
186 | watchdog_last = watchdog->read(); | 191 | watchdog_last = watchdog->read(); |
187 | watchdog_timer.expires = | 192 | watchdog_timer.expires = |
188 | jiffies + WATCHDOG_INTERVAL; | 193 | jiffies + WATCHDOG_INTERVAL; |
189 | add_timer(&watchdog_timer); | 194 | add_timer_on(&watchdog_timer, |
195 | first_cpu(cpu_online_map)); | ||
190 | } | 196 | } |
191 | } | 197 | } |
192 | } | 198 | } |
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating) | |||
331 | spin_unlock_irqrestore(&clocksource_lock, flags); | 337 | spin_unlock_irqrestore(&clocksource_lock, flags); |
332 | } | 338 | } |
333 | 339 | ||
340 | /** | ||
341 | * clocksource_unregister - remove a registered clocksource | ||
342 | */ | ||
343 | void clocksource_unregister(struct clocksource *cs) | ||
344 | { | ||
345 | unsigned long flags; | ||
346 | |||
347 | spin_lock_irqsave(&clocksource_lock, flags); | ||
348 | list_del(&cs->list); | ||
349 | if (clocksource_override == cs) | ||
350 | clocksource_override = NULL; | ||
351 | next_clocksource = select_clocksource(); | ||
352 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
353 | } | ||
354 | |||
334 | #ifdef CONFIG_SYSFS | 355 | #ifdef CONFIG_SYSFS |
335 | /** | 356 | /** |
336 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 357 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
@@ -441,7 +462,7 @@ static SYSDEV_ATTR(available_clocksource, 0600, | |||
441 | sysfs_show_available_clocksources, NULL); | 462 | sysfs_show_available_clocksources, NULL); |
442 | 463 | ||
443 | static struct sysdev_class clocksource_sysclass = { | 464 | static struct sysdev_class clocksource_sysclass = { |
444 | set_kset_name("clocksource"), | 465 | .name = "clocksource", |
445 | }; | 466 | }; |
446 | 467 | ||
447 | static struct sys_device device_clocksource = { | 468 | static struct sys_device device_clocksource = { |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 5b86698faa0b..e1bd50cbbf5d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
126 | /* | 126 | /* |
127 | * Broadcast the event to the cpus, which are set in the mask | 127 | * Broadcast the event to the cpus, which are set in the mask |
128 | */ | 128 | */ |
129 | int tick_do_broadcast(cpumask_t mask) | 129 | static void tick_do_broadcast(cpumask_t mask) |
130 | { | 130 | { |
131 | int ret = 0, cpu = smp_processor_id(); | 131 | int cpu = smp_processor_id(); |
132 | struct tick_device *td; | 132 | struct tick_device *td; |
133 | 133 | ||
134 | /* | 134 | /* |
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask) | |||
138 | cpu_clear(cpu, mask); | 138 | cpu_clear(cpu, mask); |
139 | td = &per_cpu(tick_cpu_device, cpu); | 139 | td = &per_cpu(tick_cpu_device, cpu); |
140 | td->evtdev->event_handler(td->evtdev); | 140 | td->evtdev->event_handler(td->evtdev); |
141 | ret = 1; | ||
142 | } | 141 | } |
143 | 142 | ||
144 | if (!cpus_empty(mask)) { | 143 | if (!cpus_empty(mask)) { |
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask) | |||
151 | cpu = first_cpu(mask); | 150 | cpu = first_cpu(mask); |
152 | td = &per_cpu(tick_cpu_device, cpu); | 151 | td = &per_cpu(tick_cpu_device, cpu); |
153 | td->evtdev->broadcast(mask); | 152 | td->evtdev->broadcast(mask); |
154 | ret = 1; | ||
155 | } | 153 | } |
156 | return ret; | ||
157 | } | 154 | } |
158 | 155 | ||
159 | /* | 156 | /* |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bb13f2724905..f13f2b7f4fd4 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
70 | * Broadcasting support | 70 | * Broadcasting support |
71 | */ | 71 | */ |
72 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 72 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST |
73 | extern int tick_do_broadcast(cpumask_t mask); | ||
74 | |||
75 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | 73 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); |
76 | extern int tick_check_broadcast_device(struct clock_event_device *dev); | 74 | extern int tick_check_broadcast_device(struct clock_event_device *dev); |
77 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | 75 | extern int tick_is_broadcast_device(struct clock_event_device *dev); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb89fa8db110..63f24b550695 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * Started by: Thomas Gleixner and Ingo Molnar | 10 | * Started by: Thomas Gleixner and Ingo Molnar |
11 | * | 11 | * |
12 | * For licencing details see kernel-base/COPYING | 12 | * Distribute under GPLv2. |
13 | */ | 13 | */ |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
@@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void) | |||
143 | local_irq_restore(flags); | 143 | local_irq_restore(flags); |
144 | } | 144 | } |
145 | 145 | ||
146 | void tick_nohz_stop_idle(int cpu) | ||
147 | { | ||
148 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
149 | |||
150 | if (ts->idle_active) { | ||
151 | ktime_t now, delta; | ||
152 | now = ktime_get(); | ||
153 | delta = ktime_sub(now, ts->idle_entrytime); | ||
154 | ts->idle_lastupdate = now; | ||
155 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
156 | ts->idle_active = 0; | ||
157 | } | ||
158 | } | ||
159 | |||
160 | static ktime_t tick_nohz_start_idle(int cpu) | ||
161 | { | ||
162 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
163 | ktime_t now, delta; | ||
164 | |||
165 | now = ktime_get(); | ||
166 | if (ts->idle_active) { | ||
167 | delta = ktime_sub(now, ts->idle_entrytime); | ||
168 | ts->idle_lastupdate = now; | ||
169 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
170 | } | ||
171 | ts->idle_entrytime = now; | ||
172 | ts->idle_active = 1; | ||
173 | return now; | ||
174 | } | ||
175 | |||
176 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | ||
177 | { | ||
178 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
179 | |||
180 | *last_update_time = ktime_to_us(ts->idle_lastupdate); | ||
181 | return ktime_to_us(ts->idle_sleeptime); | ||
182 | } | ||
183 | |||
146 | /** | 184 | /** |
147 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | 185 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task |
148 | * | 186 | * |
@@ -153,14 +191,16 @@ void tick_nohz_update_jiffies(void) | |||
153 | void tick_nohz_stop_sched_tick(void) | 191 | void tick_nohz_stop_sched_tick(void) |
154 | { | 192 | { |
155 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 193 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
194 | unsigned long rt_jiffies; | ||
156 | struct tick_sched *ts; | 195 | struct tick_sched *ts; |
157 | ktime_t last_update, expires, now, delta; | 196 | ktime_t last_update, expires, now; |
158 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 197 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
159 | int cpu; | 198 | int cpu; |
160 | 199 | ||
161 | local_irq_save(flags); | 200 | local_irq_save(flags); |
162 | 201 | ||
163 | cpu = smp_processor_id(); | 202 | cpu = smp_processor_id(); |
203 | now = tick_nohz_start_idle(cpu); | ||
164 | ts = &per_cpu(tick_cpu_sched, cpu); | 204 | ts = &per_cpu(tick_cpu_sched, cpu); |
165 | 205 | ||
166 | /* | 206 | /* |
@@ -192,19 +232,7 @@ void tick_nohz_stop_sched_tick(void) | |||
192 | } | 232 | } |
193 | } | 233 | } |
194 | 234 | ||
195 | now = ktime_get(); | ||
196 | /* | ||
197 | * When called from irq_exit we need to account the idle sleep time | ||
198 | * correctly. | ||
199 | */ | ||
200 | if (ts->tick_stopped) { | ||
201 | delta = ktime_sub(now, ts->idle_entrytime); | ||
202 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
203 | } | ||
204 | |||
205 | ts->idle_entrytime = now; | ||
206 | ts->idle_calls++; | 235 | ts->idle_calls++; |
207 | |||
208 | /* Read jiffies and the time when jiffies were updated last */ | 236 | /* Read jiffies and the time when jiffies were updated last */ |
209 | do { | 237 | do { |
210 | seq = read_seqbegin(&xtime_lock); | 238 | seq = read_seqbegin(&xtime_lock); |
@@ -216,6 +244,10 @@ void tick_nohz_stop_sched_tick(void) | |||
216 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 244 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
217 | delta_jiffies = next_jiffies - last_jiffies; | 245 | delta_jiffies = next_jiffies - last_jiffies; |
218 | 246 | ||
247 | rt_jiffies = rt_needs_cpu(cpu); | ||
248 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
249 | delta_jiffies = rt_jiffies; | ||
250 | |||
219 | if (rcu_needs_cpu(cpu)) | 251 | if (rcu_needs_cpu(cpu)) |
220 | delta_jiffies = 1; | 252 | delta_jiffies = 1; |
221 | /* | 253 | /* |
@@ -291,7 +323,7 @@ void tick_nohz_stop_sched_tick(void) | |||
291 | /* Check, if the timer was already in the past */ | 323 | /* Check, if the timer was already in the past */ |
292 | if (hrtimer_active(&ts->sched_timer)) | 324 | if (hrtimer_active(&ts->sched_timer)) |
293 | goto out; | 325 | goto out; |
294 | } else if(!tick_program_event(expires, 0)) | 326 | } else if (!tick_program_event(expires, 0)) |
295 | goto out; | 327 | goto out; |
296 | /* | 328 | /* |
297 | * We are past the event already. So we crossed a | 329 | * We are past the event already. So we crossed a |
@@ -332,23 +364,22 @@ void tick_nohz_restart_sched_tick(void) | |||
332 | int cpu = smp_processor_id(); | 364 | int cpu = smp_processor_id(); |
333 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 365 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
334 | unsigned long ticks; | 366 | unsigned long ticks; |
335 | ktime_t now, delta; | 367 | ktime_t now; |
336 | 368 | ||
337 | if (!ts->tick_stopped) | 369 | local_irq_disable(); |
370 | tick_nohz_stop_idle(cpu); | ||
371 | |||
372 | if (!ts->tick_stopped) { | ||
373 | local_irq_enable(); | ||
338 | return; | 374 | return; |
375 | } | ||
339 | 376 | ||
340 | /* Update jiffies first */ | 377 | /* Update jiffies first */ |
341 | now = ktime_get(); | ||
342 | |||
343 | local_irq_disable(); | ||
344 | select_nohz_load_balancer(0); | 378 | select_nohz_load_balancer(0); |
379 | now = ktime_get(); | ||
345 | tick_do_update_jiffies64(now); | 380 | tick_do_update_jiffies64(now); |
346 | cpu_clear(cpu, nohz_cpu_mask); | 381 | cpu_clear(cpu, nohz_cpu_mask); |
347 | 382 | ||
348 | /* Account the idle time */ | ||
349 | delta = ktime_sub(now, ts->idle_entrytime); | ||
350 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
351 | |||
352 | /* | 383 | /* |
353 | * We stopped the tick in idle. Update process times would miss the | 384 | * We stopped the tick in idle. Update process times would miss the |
354 | * time we slept as update_process_times does only a 1 tick | 385 | * time we slept as update_process_times does only a 1 tick |
@@ -502,14 +533,13 @@ static inline void tick_nohz_switch_to_nohz(void) { } | |||
502 | */ | 533 | */ |
503 | #ifdef CONFIG_HIGH_RES_TIMERS | 534 | #ifdef CONFIG_HIGH_RES_TIMERS |
504 | /* | 535 | /* |
505 | * We rearm the timer until we get disabled by the idle code | 536 | * We rearm the timer until we get disabled by the idle code. |
506 | * Called with interrupts disabled and timer->base->cpu_base->lock held. | 537 | * Called with interrupts disabled and timer->base->cpu_base->lock held. |
507 | */ | 538 | */ |
508 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | 539 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) |
509 | { | 540 | { |
510 | struct tick_sched *ts = | 541 | struct tick_sched *ts = |
511 | container_of(timer, struct tick_sched, sched_timer); | 542 | container_of(timer, struct tick_sched, sched_timer); |
512 | struct hrtimer_cpu_base *base = timer->base->cpu_base; | ||
513 | struct pt_regs *regs = get_irq_regs(); | 543 | struct pt_regs *regs = get_irq_regs(); |
514 | ktime_t now = ktime_get(); | 544 | ktime_t now = ktime_get(); |
515 | int cpu = smp_processor_id(); | 545 | int cpu = smp_processor_id(); |
@@ -547,15 +577,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
547 | touch_softlockup_watchdog(); | 577 | touch_softlockup_watchdog(); |
548 | ts->idle_jiffies++; | 578 | ts->idle_jiffies++; |
549 | } | 579 | } |
550 | /* | ||
551 | * update_process_times() might take tasklist_lock, hence | ||
552 | * drop the base lock. sched-tick hrtimers are per-CPU and | ||
553 | * never accessible by userspace APIs, so this is safe to do. | ||
554 | */ | ||
555 | spin_unlock(&base->lock); | ||
556 | update_process_times(user_mode(regs)); | 580 | update_process_times(user_mode(regs)); |
557 | profile_tick(CPU_PROFILING); | 581 | profile_tick(CPU_PROFILING); |
558 | spin_lock(&base->lock); | ||
559 | } | 582 | } |
560 | 583 | ||
561 | /* Do not restart, when we are in the idle loop */ | 584 | /* Do not restart, when we are in the idle loop */ |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e5e466b27598..092a2366b5a9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void) | |||
82 | } | 82 | } |
83 | 83 | ||
84 | /** | 84 | /** |
85 | * __get_realtime_clock_ts - Returns the time of day in a timespec | 85 | * getnstimeofday - Returns the time of day in a timespec |
86 | * @ts: pointer to the timespec to be set | 86 | * @ts: pointer to the timespec to be set |
87 | * | 87 | * |
88 | * Returns the time of day in a timespec. Used by | 88 | * Returns the time of day in a timespec. |
89 | * do_gettimeofday() and get_realtime_clock_ts(). | ||
90 | */ | 89 | */ |
91 | static inline void __get_realtime_clock_ts(struct timespec *ts) | 90 | void getnstimeofday(struct timespec *ts) |
92 | { | 91 | { |
93 | unsigned long seq; | 92 | unsigned long seq; |
94 | s64 nsecs; | 93 | s64 nsecs; |
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts) | |||
104 | timespec_add_ns(ts, nsecs); | 103 | timespec_add_ns(ts, nsecs); |
105 | } | 104 | } |
106 | 105 | ||
107 | /** | ||
108 | * getnstimeofday - Returns the time of day in a timespec | ||
109 | * @ts: pointer to the timespec to be set | ||
110 | * | ||
111 | * Returns the time of day in a timespec. | ||
112 | */ | ||
113 | void getnstimeofday(struct timespec *ts) | ||
114 | { | ||
115 | __get_realtime_clock_ts(ts); | ||
116 | } | ||
117 | |||
118 | EXPORT_SYMBOL(getnstimeofday); | 106 | EXPORT_SYMBOL(getnstimeofday); |
119 | 107 | ||
120 | /** | 108 | /** |
121 | * do_gettimeofday - Returns the time of day in a timeval | 109 | * do_gettimeofday - Returns the time of day in a timeval |
122 | * @tv: pointer to the timeval to be set | 110 | * @tv: pointer to the timeval to be set |
123 | * | 111 | * |
124 | * NOTE: Users should be converted to using get_realtime_clock_ts() | 112 | * NOTE: Users should be converted to using getnstimeofday() |
125 | */ | 113 | */ |
126 | void do_gettimeofday(struct timeval *tv) | 114 | void do_gettimeofday(struct timeval *tv) |
127 | { | 115 | { |
128 | struct timespec now; | 116 | struct timespec now; |
129 | 117 | ||
130 | __get_realtime_clock_ts(&now); | 118 | getnstimeofday(&now); |
131 | tv->tv_sec = now.tv_sec; | 119 | tv->tv_sec = now.tv_sec; |
132 | tv->tv_usec = now.tv_nsec/1000; | 120 | tv->tv_usec = now.tv_nsec/1000; |
133 | } | 121 | } |
@@ -198,7 +186,8 @@ static void change_clocksource(void) | |||
198 | 186 | ||
199 | clock->error = 0; | 187 | clock->error = 0; |
200 | clock->xtime_nsec = 0; | 188 | clock->xtime_nsec = 0; |
201 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | 189 | clocksource_calculate_interval(clock, |
190 | (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); | ||
202 | 191 | ||
203 | tick_clock_notify(); | 192 | tick_clock_notify(); |
204 | 193 | ||
@@ -255,7 +244,8 @@ void __init timekeeping_init(void) | |||
255 | ntp_clear(); | 244 | ntp_clear(); |
256 | 245 | ||
257 | clock = clocksource_get_next(); | 246 | clock = clocksource_get_next(); |
258 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | 247 | clocksource_calculate_interval(clock, |
248 | (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); | ||
259 | clock->cycle_last = clocksource_read(clock); | 249 | clock->cycle_last = clocksource_read(clock); |
260 | 250 | ||
261 | xtime.tv_sec = sec; | 251 | xtime.tv_sec = sec; |
@@ -335,9 +325,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
335 | 325 | ||
336 | /* sysfs resume/suspend bits for timekeeping */ | 326 | /* sysfs resume/suspend bits for timekeeping */ |
337 | static struct sysdev_class timekeeping_sysclass = { | 327 | static struct sysdev_class timekeeping_sysclass = { |
328 | .name = "timekeeping", | ||
338 | .resume = timekeeping_resume, | 329 | .resume = timekeeping_resume, |
339 | .suspend = timekeeping_suspend, | 330 | .suspend = timekeeping_suspend, |
340 | set_kset_name("timekeeping"), | ||
341 | }; | 331 | }; |
342 | 332 | ||
343 | static struct sys_device device_timer = { | 333 | static struct sys_device device_timer = { |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c36bb7ed0301..417da8c5bc72 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -26,7 +26,7 @@ | |||
26 | * the pid and cmdline from the owner process if applicable. | 26 | * the pid and cmdline from the owner process if applicable. |
27 | * | 27 | * |
28 | * Start/stop data collection: | 28 | * Start/stop data collection: |
29 | * # echo 1[0] >/proc/timer_stats | 29 | * # echo [1|0] >/proc/timer_stats |
30 | * | 30 | * |
31 | * Display the information collected so far: | 31 | * Display the information collected so far: |
32 | * # cat /proc/timer_stats | 32 | * # cat /proc/timer_stats |
diff --git a/kernel/timer.c b/kernel/timer.c index 2a00c22203f3..23f7ead78fae 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64); | |||
58 | #define TVN_MASK (TVN_SIZE - 1) | 58 | #define TVN_MASK (TVN_SIZE - 1) |
59 | #define TVR_MASK (TVR_SIZE - 1) | 59 | #define TVR_MASK (TVR_SIZE - 1) |
60 | 60 | ||
61 | typedef struct tvec_s { | 61 | struct tvec { |
62 | struct list_head vec[TVN_SIZE]; | 62 | struct list_head vec[TVN_SIZE]; |
63 | } tvec_t; | 63 | }; |
64 | 64 | ||
65 | typedef struct tvec_root_s { | 65 | struct tvec_root { |
66 | struct list_head vec[TVR_SIZE]; | 66 | struct list_head vec[TVR_SIZE]; |
67 | } tvec_root_t; | 67 | }; |
68 | 68 | ||
69 | struct tvec_t_base_s { | 69 | struct tvec_base { |
70 | spinlock_t lock; | 70 | spinlock_t lock; |
71 | struct timer_list *running_timer; | 71 | struct timer_list *running_timer; |
72 | unsigned long timer_jiffies; | 72 | unsigned long timer_jiffies; |
73 | tvec_root_t tv1; | 73 | struct tvec_root tv1; |
74 | tvec_t tv2; | 74 | struct tvec tv2; |
75 | tvec_t tv3; | 75 | struct tvec tv3; |
76 | tvec_t tv4; | 76 | struct tvec tv4; |
77 | tvec_t tv5; | 77 | struct tvec tv5; |
78 | } ____cacheline_aligned; | 78 | } ____cacheline_aligned; |
79 | 79 | ||
80 | typedef struct tvec_t_base_s tvec_base_t; | 80 | struct tvec_base boot_tvec_bases; |
81 | |||
82 | tvec_base_t boot_tvec_bases; | ||
83 | EXPORT_SYMBOL(boot_tvec_bases); | 81 | EXPORT_SYMBOL(boot_tvec_bases); |
84 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | 82 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
85 | 83 | ||
86 | /* | 84 | /* |
87 | * Note that all tvec_bases is 2 byte aligned and lower bit of | 85 | * Note that all tvec_bases are 2 byte aligned and lower bit of |
88 | * base in timer_list is guaranteed to be zero. Use the LSB for | 86 | * base in timer_list is guaranteed to be zero. Use the LSB for |
89 | * the new flag to indicate whether the timer is deferrable | 87 | * the new flag to indicate whether the timer is deferrable |
90 | */ | 88 | */ |
91 | #define TBASE_DEFERRABLE_FLAG (0x1) | 89 | #define TBASE_DEFERRABLE_FLAG (0x1) |
92 | 90 | ||
93 | /* Functions below help us manage 'deferrable' flag */ | 91 | /* Functions below help us manage 'deferrable' flag */ |
94 | static inline unsigned int tbase_get_deferrable(tvec_base_t *base) | 92 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
95 | { | 93 | { |
96 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); | 94 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); |
97 | } | 95 | } |
98 | 96 | ||
99 | static inline tvec_base_t *tbase_get_base(tvec_base_t *base) | 97 | static inline struct tvec_base *tbase_get_base(struct tvec_base *base) |
100 | { | 98 | { |
101 | return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); | 99 | return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); |
102 | } | 100 | } |
103 | 101 | ||
104 | static inline void timer_set_deferrable(struct timer_list *timer) | 102 | static inline void timer_set_deferrable(struct timer_list *timer) |
105 | { | 103 | { |
106 | timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | | 104 | timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | |
107 | TBASE_DEFERRABLE_FLAG)); | 105 | TBASE_DEFERRABLE_FLAG)); |
108 | } | 106 | } |
109 | 107 | ||
110 | static inline void | 108 | static inline void |
111 | timer_set_base(struct timer_list *timer, tvec_base_t *new_base) | 109 | timer_set_base(struct timer_list *timer, struct tvec_base *new_base) |
112 | { | 110 | { |
113 | timer->base = (tvec_base_t *)((unsigned long)(new_base) | | 111 | timer->base = (struct tvec_base *)((unsigned long)(new_base) | |
114 | tbase_get_deferrable(timer->base)); | 112 | tbase_get_deferrable(timer->base)); |
115 | } | 113 | } |
116 | 114 | ||
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j) | |||
246 | EXPORT_SYMBOL_GPL(round_jiffies_relative); | 244 | EXPORT_SYMBOL_GPL(round_jiffies_relative); |
247 | 245 | ||
248 | 246 | ||
249 | static inline void set_running_timer(tvec_base_t *base, | 247 | static inline void set_running_timer(struct tvec_base *base, |
250 | struct timer_list *timer) | 248 | struct timer_list *timer) |
251 | { | 249 | { |
252 | #ifdef CONFIG_SMP | 250 | #ifdef CONFIG_SMP |
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base, | |||
254 | #endif | 252 | #endif |
255 | } | 253 | } |
256 | 254 | ||
257 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 255 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) |
258 | { | 256 | { |
259 | unsigned long expires = timer->expires; | 257 | unsigned long expires = timer->expires; |
260 | unsigned long idx = expires - base->timer_jiffies; | 258 | unsigned long idx = expires - base->timer_jiffies; |
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer, | |||
371 | * possible to set timer->base = NULL and drop the lock: the timer remains | 369 | * possible to set timer->base = NULL and drop the lock: the timer remains |
372 | * locked. | 370 | * locked. |
373 | */ | 371 | */ |
374 | static tvec_base_t *lock_timer_base(struct timer_list *timer, | 372 | static struct tvec_base *lock_timer_base(struct timer_list *timer, |
375 | unsigned long *flags) | 373 | unsigned long *flags) |
376 | __acquires(timer->base->lock) | 374 | __acquires(timer->base->lock) |
377 | { | 375 | { |
378 | tvec_base_t *base; | 376 | struct tvec_base *base; |
379 | 377 | ||
380 | for (;;) { | 378 | for (;;) { |
381 | tvec_base_t *prelock_base = timer->base; | 379 | struct tvec_base *prelock_base = timer->base; |
382 | base = tbase_get_base(prelock_base); | 380 | base = tbase_get_base(prelock_base); |
383 | if (likely(base != NULL)) { | 381 | if (likely(base != NULL)) { |
384 | spin_lock_irqsave(&base->lock, *flags); | 382 | spin_lock_irqsave(&base->lock, *flags); |
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, | |||
393 | 391 | ||
394 | int __mod_timer(struct timer_list *timer, unsigned long expires) | 392 | int __mod_timer(struct timer_list *timer, unsigned long expires) |
395 | { | 393 | { |
396 | tvec_base_t *base, *new_base; | 394 | struct tvec_base *base, *new_base; |
397 | unsigned long flags; | 395 | unsigned long flags; |
398 | int ret = 0; | 396 | int ret = 0; |
399 | 397 | ||
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer); | |||
445 | */ | 443 | */ |
446 | void add_timer_on(struct timer_list *timer, int cpu) | 444 | void add_timer_on(struct timer_list *timer, int cpu) |
447 | { | 445 | { |
448 | tvec_base_t *base = per_cpu(tvec_bases, cpu); | 446 | struct tvec_base *base = per_cpu(tvec_bases, cpu); |
449 | unsigned long flags; | 447 | unsigned long flags; |
450 | 448 | ||
451 | timer_stats_timer_set_start_info(timer); | 449 | timer_stats_timer_set_start_info(timer); |
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer); | |||
508 | */ | 506 | */ |
509 | int del_timer(struct timer_list *timer) | 507 | int del_timer(struct timer_list *timer) |
510 | { | 508 | { |
511 | tvec_base_t *base; | 509 | struct tvec_base *base; |
512 | unsigned long flags; | 510 | unsigned long flags; |
513 | int ret = 0; | 511 | int ret = 0; |
514 | 512 | ||
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer); | |||
539 | */ | 537 | */ |
540 | int try_to_del_timer_sync(struct timer_list *timer) | 538 | int try_to_del_timer_sync(struct timer_list *timer) |
541 | { | 539 | { |
542 | tvec_base_t *base; | 540 | struct tvec_base *base; |
543 | unsigned long flags; | 541 | unsigned long flags; |
544 | int ret = -1; | 542 | int ret = -1; |
545 | 543 | ||
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer) | |||
591 | EXPORT_SYMBOL(del_timer_sync); | 589 | EXPORT_SYMBOL(del_timer_sync); |
592 | #endif | 590 | #endif |
593 | 591 | ||
594 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 592 | static int cascade(struct tvec_base *base, struct tvec *tv, int index) |
595 | { | 593 | { |
596 | /* cascade all the timers from tv up one level */ | 594 | /* cascade all the timers from tv up one level */ |
597 | struct timer_list *timer, *tmp; | 595 | struct timer_list *timer, *tmp; |
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) | |||
620 | * This function cascades all vectors and executes all expired timer | 618 | * This function cascades all vectors and executes all expired timer |
621 | * vectors. | 619 | * vectors. |
622 | */ | 620 | */ |
623 | static inline void __run_timers(tvec_base_t *base) | 621 | static inline void __run_timers(struct tvec_base *base) |
624 | { | 622 | { |
625 | struct timer_list *timer; | 623 | struct timer_list *timer; |
626 | 624 | ||
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
657 | int preempt_count = preempt_count(); | 655 | int preempt_count = preempt_count(); |
658 | fn(data); | 656 | fn(data); |
659 | if (preempt_count != preempt_count()) { | 657 | if (preempt_count != preempt_count()) { |
660 | printk(KERN_WARNING "huh, entered %p " | 658 | printk(KERN_ERR "huh, entered %p " |
661 | "with preempt_count %08x, exited" | 659 | "with preempt_count %08x, exited" |
662 | " with %08x?\n", | 660 | " with %08x?\n", |
663 | fn, preempt_count, | 661 | fn, preempt_count, |
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base) | |||
678 | * is used on S/390 to stop all activity when a cpus is idle. | 676 | * is used on S/390 to stop all activity when a cpus is idle. |
679 | * This functions needs to be called disabled. | 677 | * This functions needs to be called disabled. |
680 | */ | 678 | */ |
681 | static unsigned long __next_timer_interrupt(tvec_base_t *base) | 679 | static unsigned long __next_timer_interrupt(struct tvec_base *base) |
682 | { | 680 | { |
683 | unsigned long timer_jiffies = base->timer_jiffies; | 681 | unsigned long timer_jiffies = base->timer_jiffies; |
684 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; | 682 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; |
685 | int index, slot, array, found = 0; | 683 | int index, slot, array, found = 0; |
686 | struct timer_list *nte; | 684 | struct timer_list *nte; |
687 | tvec_t *varray[4]; | 685 | struct tvec *varray[4]; |
688 | 686 | ||
689 | /* Look for timer events in tv1. */ | 687 | /* Look for timer events in tv1. */ |
690 | index = slot = timer_jiffies & TVR_MASK; | 688 | index = slot = timer_jiffies & TVR_MASK; |
@@ -716,7 +714,7 @@ cascade: | |||
716 | varray[3] = &base->tv5; | 714 | varray[3] = &base->tv5; |
717 | 715 | ||
718 | for (array = 0; array < 4; array++) { | 716 | for (array = 0; array < 4; array++) { |
719 | tvec_t *varp = varray[array]; | 717 | struct tvec *varp = varray[array]; |
720 | 718 | ||
721 | index = slot = timer_jiffies & TVN_MASK; | 719 | index = slot = timer_jiffies & TVN_MASK; |
722 | do { | 720 | do { |
@@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
795 | */ | 793 | */ |
796 | unsigned long get_next_timer_interrupt(unsigned long now) | 794 | unsigned long get_next_timer_interrupt(unsigned long now) |
797 | { | 795 | { |
798 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 796 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
799 | unsigned long expires; | 797 | unsigned long expires; |
800 | 798 | ||
801 | spin_lock(&base->lock); | 799 | spin_lock(&base->lock); |
@@ -894,9 +892,9 @@ static inline void calc_load(unsigned long ticks) | |||
894 | */ | 892 | */ |
895 | static void run_timer_softirq(struct softirq_action *h) | 893 | static void run_timer_softirq(struct softirq_action *h) |
896 | { | 894 | { |
897 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 895 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
898 | 896 | ||
899 | hrtimer_run_queues(); | 897 | hrtimer_run_pending(); |
900 | 898 | ||
901 | if (time_after_eq(jiffies, base->timer_jiffies)) | 899 | if (time_after_eq(jiffies, base->timer_jiffies)) |
902 | __run_timers(base); | 900 | __run_timers(base); |
@@ -907,6 +905,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
907 | */ | 905 | */ |
908 | void run_local_timers(void) | 906 | void run_local_timers(void) |
909 | { | 907 | { |
908 | hrtimer_run_queues(); | ||
910 | raise_softirq(TIMER_SOFTIRQ); | 909 | raise_softirq(TIMER_SOFTIRQ); |
911 | softlockup_tick(); | 910 | softlockup_tick(); |
912 | } | 911 | } |
@@ -1222,7 +1221,7 @@ static struct lock_class_key base_lock_keys[NR_CPUS]; | |||
1222 | static int __cpuinit init_timers_cpu(int cpu) | 1221 | static int __cpuinit init_timers_cpu(int cpu) |
1223 | { | 1222 | { |
1224 | int j; | 1223 | int j; |
1225 | tvec_base_t *base; | 1224 | struct tvec_base *base; |
1226 | static char __cpuinitdata tvec_base_done[NR_CPUS]; | 1225 | static char __cpuinitdata tvec_base_done[NR_CPUS]; |
1227 | 1226 | ||
1228 | if (!tvec_base_done[cpu]) { | 1227 | if (!tvec_base_done[cpu]) { |
@@ -1277,7 +1276,7 @@ static int __cpuinit init_timers_cpu(int cpu) | |||
1277 | } | 1276 | } |
1278 | 1277 | ||
1279 | #ifdef CONFIG_HOTPLUG_CPU | 1278 | #ifdef CONFIG_HOTPLUG_CPU |
1280 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | 1279 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) |
1281 | { | 1280 | { |
1282 | struct timer_list *timer; | 1281 | struct timer_list *timer; |
1283 | 1282 | ||
@@ -1291,8 +1290,8 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | |||
1291 | 1290 | ||
1292 | static void __cpuinit migrate_timers(int cpu) | 1291 | static void __cpuinit migrate_timers(int cpu) |
1293 | { | 1292 | { |
1294 | tvec_base_t *old_base; | 1293 | struct tvec_base *old_base; |
1295 | tvec_base_t *new_base; | 1294 | struct tvec_base *new_base; |
1296 | int i; | 1295 | int i; |
1297 | 1296 | ||
1298 | BUG_ON(cpu_online(cpu)); | 1297 | BUG_ON(cpu_online(cpu)); |
diff --git a/kernel/user.c b/kernel/user.c index 8320a87f3e5a..bc1c48d35cb3 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { } | |||
115 | 115 | ||
116 | #if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) | 116 | #if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) |
117 | 117 | ||
118 | static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ | 118 | static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ |
119 | static DEFINE_MUTEX(uids_mutex); | 119 | static DEFINE_MUTEX(uids_mutex); |
120 | 120 | ||
121 | static inline void uids_mutex_lock(void) | 121 | static inline void uids_mutex_lock(void) |
@@ -128,86 +128,83 @@ static inline void uids_mutex_unlock(void) | |||
128 | mutex_unlock(&uids_mutex); | 128 | mutex_unlock(&uids_mutex); |
129 | } | 129 | } |
130 | 130 | ||
131 | /* return cpu shares held by the user */ | 131 | /* uid directory attributes */ |
132 | static ssize_t cpu_shares_show(struct kset *kset, char *buffer) | 132 | static ssize_t cpu_shares_show(struct kobject *kobj, |
133 | struct kobj_attribute *attr, | ||
134 | char *buf) | ||
133 | { | 135 | { |
134 | struct user_struct *up = container_of(kset, struct user_struct, kset); | 136 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); |
135 | 137 | ||
136 | return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); | 138 | return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); |
137 | } | 139 | } |
138 | 140 | ||
139 | /* modify cpu shares held by the user */ | 141 | static ssize_t cpu_shares_store(struct kobject *kobj, |
140 | static ssize_t cpu_shares_store(struct kset *kset, const char *buffer, | 142 | struct kobj_attribute *attr, |
141 | size_t size) | 143 | const char *buf, size_t size) |
142 | { | 144 | { |
143 | struct user_struct *up = container_of(kset, struct user_struct, kset); | 145 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); |
144 | unsigned long shares; | 146 | unsigned long shares; |
145 | int rc; | 147 | int rc; |
146 | 148 | ||
147 | sscanf(buffer, "%lu", &shares); | 149 | sscanf(buf, "%lu", &shares); |
148 | 150 | ||
149 | rc = sched_group_set_shares(up->tg, shares); | 151 | rc = sched_group_set_shares(up->tg, shares); |
150 | 152 | ||
151 | return (rc ? rc : size); | 153 | return (rc ? rc : size); |
152 | } | 154 | } |
153 | 155 | ||
154 | static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) | 156 | static struct kobj_attribute cpu_share_attr = |
157 | __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); | ||
158 | |||
159 | /* default attributes per uid directory */ | ||
160 | static struct attribute *uids_attributes[] = { | ||
161 | &cpu_share_attr.attr, | ||
162 | NULL | ||
163 | }; | ||
164 | |||
165 | /* the lifetime of user_struct is not managed by the core (now) */ | ||
166 | static void uids_release(struct kobject *kobj) | ||
155 | { | 167 | { |
156 | sa->attr.name = name; | 168 | return; |
157 | sa->attr.mode = mode; | ||
158 | sa->show = cpu_shares_show; | ||
159 | sa->store = cpu_shares_store; | ||
160 | } | 169 | } |
161 | 170 | ||
162 | /* Create "/sys/kernel/uids/<uid>" directory and | 171 | static struct kobj_type uids_ktype = { |
163 | * "/sys/kernel/uids/<uid>/cpu_share" file for this user. | 172 | .sysfs_ops = &kobj_sysfs_ops, |
164 | */ | 173 | .default_attrs = uids_attributes, |
165 | static int user_kobject_create(struct user_struct *up) | 174 | .release = uids_release, |
175 | }; | ||
176 | |||
177 | /* create /sys/kernel/uids/<uid>/cpu_share file for this user */ | ||
178 | static int uids_user_create(struct user_struct *up) | ||
166 | { | 179 | { |
167 | struct kset *kset = &up->kset; | 180 | struct kobject *kobj = &up->kobj; |
168 | struct kobject *kobj = &kset->kobj; | ||
169 | int error; | 181 | int error; |
170 | 182 | ||
171 | memset(kset, 0, sizeof(struct kset)); | 183 | memset(kobj, 0, sizeof(struct kobject)); |
172 | kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ | 184 | kobj->kset = uids_kset; |
173 | kobject_set_name(kobj, "%d", up->uid); | 185 | error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); |
174 | kset_init(kset); | 186 | if (error) { |
175 | user_attr_init(&up->user_attr, "cpu_share", 0644); | 187 | kobject_put(kobj); |
176 | |||
177 | error = kobject_add(kobj); | ||
178 | if (error) | ||
179 | goto done; | 188 | goto done; |
180 | 189 | } | |
181 | error = sysfs_create_file(kobj, &up->user_attr.attr); | ||
182 | if (error) | ||
183 | kobject_del(kobj); | ||
184 | 190 | ||
185 | kobject_uevent(kobj, KOBJ_ADD); | 191 | kobject_uevent(kobj, KOBJ_ADD); |
186 | |||
187 | done: | 192 | done: |
188 | return error; | 193 | return error; |
189 | } | 194 | } |
190 | 195 | ||
191 | /* create these in sysfs filesystem: | 196 | /* create these entries in sysfs: |
192 | * "/sys/kernel/uids" directory | 197 | * "/sys/kernel/uids" directory |
193 | * "/sys/kernel/uids/0" directory (for root user) | 198 | * "/sys/kernel/uids/0" directory (for root user) |
194 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | 199 | * "/sys/kernel/uids/0/cpu_share" file (for root user) |
195 | */ | 200 | */ |
196 | int __init uids_kobject_init(void) | 201 | int __init uids_sysfs_init(void) |
197 | { | 202 | { |
198 | int error; | 203 | uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); |
199 | 204 | if (!uids_kset) | |
200 | /* create under /sys/kernel dir */ | 205 | return -ENOMEM; |
201 | uids_kobject.parent = &kernel_subsys.kobj; | ||
202 | uids_kobject.kset = &kernel_subsys; | ||
203 | kobject_set_name(&uids_kobject, "uids"); | ||
204 | kobject_init(&uids_kobject); | ||
205 | 206 | ||
206 | error = kobject_add(&uids_kobject); | 207 | return uids_user_create(&root_user); |
207 | if (!error) | ||
208 | error = user_kobject_create(&root_user); | ||
209 | |||
210 | return error; | ||
211 | } | 208 | } |
212 | 209 | ||
213 | /* work function to remove sysfs directory for a user and free up | 210 | /* work function to remove sysfs directory for a user and free up |
@@ -216,7 +213,6 @@ int __init uids_kobject_init(void) | |||
216 | static void remove_user_sysfs_dir(struct work_struct *w) | 213 | static void remove_user_sysfs_dir(struct work_struct *w) |
217 | { | 214 | { |
218 | struct user_struct *up = container_of(w, struct user_struct, work); | 215 | struct user_struct *up = container_of(w, struct user_struct, work); |
219 | struct kobject *kobj = &up->kset.kobj; | ||
220 | unsigned long flags; | 216 | unsigned long flags; |
221 | int remove_user = 0; | 217 | int remove_user = 0; |
222 | 218 | ||
@@ -238,9 +234,9 @@ static void remove_user_sysfs_dir(struct work_struct *w) | |||
238 | if (!remove_user) | 234 | if (!remove_user) |
239 | goto done; | 235 | goto done; |
240 | 236 | ||
241 | sysfs_remove_file(kobj, &up->user_attr.attr); | 237 | kobject_uevent(&up->kobj, KOBJ_REMOVE); |
242 | kobject_uevent(kobj, KOBJ_REMOVE); | 238 | kobject_del(&up->kobj); |
243 | kobject_del(kobj); | 239 | kobject_put(&up->kobj); |
244 | 240 | ||
245 | sched_destroy_user(up); | 241 | sched_destroy_user(up); |
246 | key_put(up->uid_keyring); | 242 | key_put(up->uid_keyring); |
@@ -267,7 +263,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags) | |||
267 | 263 | ||
268 | #else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ | 264 | #else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ |
269 | 265 | ||
270 | static inline int user_kobject_create(struct user_struct *up) { return 0; } | 266 | int uids_sysfs_init(void) { return 0; } |
267 | static inline int uids_user_create(struct user_struct *up) { return 0; } | ||
271 | static inline void uids_mutex_lock(void) { } | 268 | static inline void uids_mutex_lock(void) { } |
272 | static inline void uids_mutex_unlock(void) { } | 269 | static inline void uids_mutex_unlock(void) { } |
273 | 270 | ||
@@ -322,9 +319,9 @@ void free_uid(struct user_struct *up) | |||
322 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 319 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
323 | { | 320 | { |
324 | struct hlist_head *hashent = uidhashentry(ns, uid); | 321 | struct hlist_head *hashent = uidhashentry(ns, uid); |
325 | struct user_struct *up; | 322 | struct user_struct *up, *new; |
326 | 323 | ||
327 | /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() | 324 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() |
328 | * atomic. | 325 | * atomic. |
329 | */ | 326 | */ |
330 | uids_mutex_lock(); | 327 | uids_mutex_lock(); |
@@ -334,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
334 | spin_unlock_irq(&uidhash_lock); | 331 | spin_unlock_irq(&uidhash_lock); |
335 | 332 | ||
336 | if (!up) { | 333 | if (!up) { |
337 | struct user_struct *new; | ||
338 | |||
339 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); | 334 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); |
340 | if (!new) { | 335 | if (!new) |
341 | uids_mutex_unlock(); | 336 | goto out_unlock; |
342 | return NULL; | ||
343 | } | ||
344 | 337 | ||
345 | new->uid = uid; | 338 | new->uid = uid; |
346 | atomic_set(&new->__count, 1); | 339 | atomic_set(&new->__count, 1); |
@@ -356,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
356 | #endif | 349 | #endif |
357 | new->locked_shm = 0; | 350 | new->locked_shm = 0; |
358 | 351 | ||
359 | if (alloc_uid_keyring(new, current) < 0) { | 352 | if (alloc_uid_keyring(new, current) < 0) |
360 | kmem_cache_free(uid_cachep, new); | 353 | goto out_free_user; |
361 | uids_mutex_unlock(); | ||
362 | return NULL; | ||
363 | } | ||
364 | 354 | ||
365 | if (sched_create_user(new) < 0) { | 355 | if (sched_create_user(new) < 0) |
366 | key_put(new->uid_keyring); | 356 | goto out_put_keys; |
367 | key_put(new->session_keyring); | ||
368 | kmem_cache_free(uid_cachep, new); | ||
369 | uids_mutex_unlock(); | ||
370 | return NULL; | ||
371 | } | ||
372 | 357 | ||
373 | if (user_kobject_create(new)) { | 358 | if (uids_user_create(new)) |
374 | sched_destroy_user(new); | 359 | goto out_destoy_sched; |
375 | key_put(new->uid_keyring); | ||
376 | key_put(new->session_keyring); | ||
377 | kmem_cache_free(uid_cachep, new); | ||
378 | uids_mutex_unlock(); | ||
379 | return NULL; | ||
380 | } | ||
381 | 360 | ||
382 | /* | 361 | /* |
383 | * Before adding this, check whether we raced | 362 | * Before adding this, check whether we raced |
@@ -405,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
405 | uids_mutex_unlock(); | 384 | uids_mutex_unlock(); |
406 | 385 | ||
407 | return up; | 386 | return up; |
387 | |||
388 | out_destoy_sched: | ||
389 | sched_destroy_user(new); | ||
390 | out_put_keys: | ||
391 | key_put(new->uid_keyring); | ||
392 | key_put(new->session_keyring); | ||
393 | out_free_user: | ||
394 | kmem_cache_free(uid_cachep, new); | ||
395 | out_unlock: | ||
396 | uids_mutex_unlock(); | ||
397 | return NULL; | ||
408 | } | 398 | } |
409 | 399 | ||
410 | void switch_uid(struct user_struct *new_user) | 400 | void switch_uid(struct user_struct *new_user) |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8db0b597509e..52db48e7f6e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -67,9 +67,8 @@ struct workqueue_struct { | |||
67 | #endif | 67 | #endif |
68 | }; | 68 | }; |
69 | 69 | ||
70 | /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove | 70 | /* Serializes the accesses to the list of workqueues. */ |
71 | threads to each one as cpus come/go. */ | 71 | static DEFINE_SPINLOCK(workqueue_lock); |
72 | static DEFINE_MUTEX(workqueue_mutex); | ||
73 | static LIST_HEAD(workqueues); | 72 | static LIST_HEAD(workqueues); |
74 | 73 | ||
75 | static int singlethread_cpu __read_mostly; | 74 | static int singlethread_cpu __read_mostly; |
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
592 | * Returns zero on success. | 591 | * Returns zero on success. |
593 | * Returns -ve errno on failure. | 592 | * Returns -ve errno on failure. |
594 | * | 593 | * |
595 | * Appears to be racy against CPU hotplug. | ||
596 | * | ||
597 | * schedule_on_each_cpu() is very slow. | 594 | * schedule_on_each_cpu() is very slow. |
598 | */ | 595 | */ |
599 | int schedule_on_each_cpu(work_func_t func) | 596 | int schedule_on_each_cpu(work_func_t func) |
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func) | |||
605 | if (!works) | 602 | if (!works) |
606 | return -ENOMEM; | 603 | return -ENOMEM; |
607 | 604 | ||
608 | preempt_disable(); /* CPU hotplug */ | 605 | get_online_cpus(); |
609 | for_each_online_cpu(cpu) { | 606 | for_each_online_cpu(cpu) { |
610 | struct work_struct *work = per_cpu_ptr(works, cpu); | 607 | struct work_struct *work = per_cpu_ptr(works, cpu); |
611 | 608 | ||
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func) | |||
613 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); | 610 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); |
614 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); | 611 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); |
615 | } | 612 | } |
616 | preempt_enable(); | ||
617 | flush_workqueue(keventd_wq); | 613 | flush_workqueue(keventd_wq); |
614 | put_online_cpus(); | ||
618 | free_percpu(works); | 615 | free_percpu(works); |
619 | return 0; | 616 | return 0; |
620 | } | 617 | } |
@@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
750 | err = create_workqueue_thread(cwq, singlethread_cpu); | 747 | err = create_workqueue_thread(cwq, singlethread_cpu); |
751 | start_workqueue_thread(cwq, -1); | 748 | start_workqueue_thread(cwq, -1); |
752 | } else { | 749 | } else { |
753 | mutex_lock(&workqueue_mutex); | 750 | get_online_cpus(); |
751 | spin_lock(&workqueue_lock); | ||
754 | list_add(&wq->list, &workqueues); | 752 | list_add(&wq->list, &workqueues); |
753 | spin_unlock(&workqueue_lock); | ||
755 | 754 | ||
756 | for_each_possible_cpu(cpu) { | 755 | for_each_possible_cpu(cpu) { |
757 | cwq = init_cpu_workqueue(wq, cpu); | 756 | cwq = init_cpu_workqueue(wq, cpu); |
@@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
760 | err = create_workqueue_thread(cwq, cpu); | 759 | err = create_workqueue_thread(cwq, cpu); |
761 | start_workqueue_thread(cwq, cpu); | 760 | start_workqueue_thread(cwq, cpu); |
762 | } | 761 | } |
763 | mutex_unlock(&workqueue_mutex); | 762 | put_online_cpus(); |
764 | } | 763 | } |
765 | 764 | ||
766 | if (err) { | 765 | if (err) { |
@@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
775 | { | 774 | { |
776 | /* | 775 | /* |
777 | * Our caller is either destroy_workqueue() or CPU_DEAD, | 776 | * Our caller is either destroy_workqueue() or CPU_DEAD, |
778 | * workqueue_mutex protects cwq->thread | 777 | * get_online_cpus() protects cwq->thread. |
779 | */ | 778 | */ |
780 | if (cwq->thread == NULL) | 779 | if (cwq->thread == NULL) |
781 | return; | 780 | return; |
@@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
810 | struct cpu_workqueue_struct *cwq; | 809 | struct cpu_workqueue_struct *cwq; |
811 | int cpu; | 810 | int cpu; |
812 | 811 | ||
813 | mutex_lock(&workqueue_mutex); | 812 | get_online_cpus(); |
813 | spin_lock(&workqueue_lock); | ||
814 | list_del(&wq->list); | 814 | list_del(&wq->list); |
815 | mutex_unlock(&workqueue_mutex); | 815 | spin_unlock(&workqueue_lock); |
816 | put_online_cpus(); | ||
816 | 817 | ||
817 | for_each_cpu_mask(cpu, *cpu_map) { | 818 | for_each_cpu_mask(cpu, *cpu_map) { |
818 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 819 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
@@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
835 | action &= ~CPU_TASKS_FROZEN; | 836 | action &= ~CPU_TASKS_FROZEN; |
836 | 837 | ||
837 | switch (action) { | 838 | switch (action) { |
838 | case CPU_LOCK_ACQUIRE: | ||
839 | mutex_lock(&workqueue_mutex); | ||
840 | return NOTIFY_OK; | ||
841 | |||
842 | case CPU_LOCK_RELEASE: | ||
843 | mutex_unlock(&workqueue_mutex); | ||
844 | return NOTIFY_OK; | ||
845 | 839 | ||
846 | case CPU_UP_PREPARE: | 840 | case CPU_UP_PREPARE: |
847 | cpu_set(cpu, cpu_populated_map); | 841 | cpu_set(cpu, cpu_populated_map); |
@@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
854 | case CPU_UP_PREPARE: | 848 | case CPU_UP_PREPARE: |
855 | if (!create_workqueue_thread(cwq, cpu)) | 849 | if (!create_workqueue_thread(cwq, cpu)) |
856 | break; | 850 | break; |
857 | printk(KERN_ERR "workqueue for %i failed\n", cpu); | 851 | printk(KERN_ERR "workqueue [%s] for %i failed\n", |
852 | wq->name, cpu); | ||
858 | return NOTIFY_BAD; | 853 | return NOTIFY_BAD; |
859 | 854 | ||
860 | case CPU_ONLINE: | 855 | case CPU_ONLINE: |