diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.hz | 2 | ||||
-rw-r--r-- | kernel/Kconfig.preempt | 13 | ||||
-rw-r--r-- | kernel/Makefile | 6 | ||||
-rw-r--r-- | kernel/cpu.c | 164 | ||||
-rw-r--r-- | kernel/cpuset.c | 14 | ||||
-rw-r--r-- | kernel/fork.c | 11 | ||||
-rw-r--r-- | kernel/hrtimer.c | 256 | ||||
-rw-r--r-- | kernel/kthread.c | 12 | ||||
-rw-r--r-- | kernel/latencytop.c | 239 | ||||
-rw-r--r-- | kernel/lockdep.c | 12 | ||||
-rw-r--r-- | kernel/module.c | 27 | ||||
-rw-r--r-- | kernel/posix-cpu-timers.c | 30 | ||||
-rw-r--r-- | kernel/printk.c | 57 | ||||
-rw-r--r-- | kernel/profile.c | 99 | ||||
-rw-r--r-- | kernel/rcuclassic.c | 575 | ||||
-rw-r--r-- | kernel/rcupdate.c | 576 | ||||
-rw-r--r-- | kernel/rcupreempt.c | 953 | ||||
-rw-r--r-- | kernel/rcupreempt_trace.c | 330 | ||||
-rw-r--r-- | kernel/rcutorture.c | 6 | ||||
-rw-r--r-- | kernel/sched.c | 1384 | ||||
-rw-r--r-- | kernel/sched_debug.c | 5 | ||||
-rw-r--r-- | kernel/sched_fair.c | 391 | ||||
-rw-r--r-- | kernel/sched_idletask.c | 42 | ||||
-rw-r--r-- | kernel/sched_rt.c | 1112 | ||||
-rw-r--r-- | kernel/softlockup.c | 116 | ||||
-rw-r--r-- | kernel/stop_machine.c | 4 | ||||
-rw-r--r-- | kernel/sysctl.c | 77 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 13 | ||||
-rw-r--r-- | kernel/timer.c | 3 | ||||
-rw-r--r-- | kernel/user.c | 47 | ||||
-rw-r--r-- | kernel/workqueue.c | 35 |
31 files changed, 5242 insertions, 1369 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 4af15802ccd4..526128a2e622 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz | |||
@@ -54,3 +54,5 @@ config HZ | |||
54 | default 300 if HZ_300 | 54 | default 300 if HZ_300 |
55 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
56 | 56 | ||
57 | config SCHED_HRTICK | ||
58 | def_bool HIGH_RES_TIMERS && X86 | ||
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c64ce9c14207..0669b70fa6a3 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt | |||
@@ -52,14 +52,13 @@ config PREEMPT | |||
52 | 52 | ||
53 | endchoice | 53 | endchoice |
54 | 54 | ||
55 | config PREEMPT_BKL | 55 | config RCU_TRACE |
56 | bool "Preempt The Big Kernel Lock" | 56 | bool "Enable tracing for RCU - currently stats in debugfs" |
57 | depends on SMP || PREEMPT | 57 | select DEBUG_FS |
58 | default y | 58 | default y |
59 | help | 59 | help |
60 | This option reduces the latency of the kernel by making the | 60 | This option provides tracing in RCU which presents stats |
61 | big kernel lock preemptible. | 61 | in debugfs for debugging RCU implementation. |
62 | 62 | ||
63 | Say Y here if you are building a kernel for a desktop system. | 63 | Say Y here if you want to enable RCU tracing |
64 | Say N if you are unsure. | 64 | Say N if you are unsure. |
65 | |||
diff --git a/kernel/Makefile b/kernel/Makefile index dfa96956dae0..390d42146267 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -52,11 +52,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |||
52 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 52 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
53 | obj-$(CONFIG_SECCOMP) += seccomp.o | 53 | obj-$(CONFIG_SECCOMP) += seccomp.o |
54 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 54 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
55 | obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o | ||
56 | obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o | ||
57 | ifeq ($(CONFIG_PREEMPT_RCU),y) | ||
58 | obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o | ||
59 | endif | ||
55 | obj-$(CONFIG_RELAY) += relay.o | 60 | obj-$(CONFIG_RELAY) += relay.o |
56 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 61 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
57 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 62 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
58 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 63 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
59 | obj-$(CONFIG_MARKERS) += marker.o | 64 | obj-$(CONFIG_MARKERS) += marker.o |
65 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | ||
60 | 66 | ||
61 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 67 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
62 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 68 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 6b3a0c15144f..e0d3a4f56ecb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -15,9 +15,8 @@ | |||
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* Serializes the updates to cpu_online_map, cpu_present_map */ |
19 | static DEFINE_MUTEX(cpu_add_remove_lock); | 19 | static DEFINE_MUTEX(cpu_add_remove_lock); |
20 | static DEFINE_MUTEX(cpu_bitmask_lock); | ||
21 | 20 | ||
22 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); |
23 | 22 | ||
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | |||
26 | */ | 25 | */ |
27 | static int cpu_hotplug_disabled; | 26 | static int cpu_hotplug_disabled; |
28 | 27 | ||
29 | #ifdef CONFIG_HOTPLUG_CPU | 28 | static struct { |
29 | struct task_struct *active_writer; | ||
30 | struct mutex lock; /* Synchronizes accesses to refcount, */ | ||
31 | /* | ||
32 | * Also blocks the new readers during | ||
33 | * an ongoing cpu hotplug operation. | ||
34 | */ | ||
35 | int refcount; | ||
36 | wait_queue_head_t writer_queue; | ||
37 | } cpu_hotplug; | ||
30 | 38 | ||
31 | /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ | 39 | #define writer_exists() (cpu_hotplug.active_writer != NULL) |
32 | static struct task_struct *recursive; | ||
33 | static int recursive_depth; | ||
34 | 40 | ||
35 | void lock_cpu_hotplug(void) | 41 | void __init cpu_hotplug_init(void) |
36 | { | 42 | { |
37 | struct task_struct *tsk = current; | 43 | cpu_hotplug.active_writer = NULL; |
38 | 44 | mutex_init(&cpu_hotplug.lock); | |
39 | if (tsk == recursive) { | 45 | cpu_hotplug.refcount = 0; |
40 | static int warnings = 10; | 46 | init_waitqueue_head(&cpu_hotplug.writer_queue); |
41 | if (warnings) { | 47 | } |
42 | printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); | 48 | |
43 | WARN_ON(1); | 49 | #ifdef CONFIG_HOTPLUG_CPU |
44 | warnings--; | 50 | |
45 | } | 51 | void get_online_cpus(void) |
46 | recursive_depth++; | 52 | { |
53 | might_sleep(); | ||
54 | if (cpu_hotplug.active_writer == current) | ||
47 | return; | 55 | return; |
48 | } | 56 | mutex_lock(&cpu_hotplug.lock); |
49 | mutex_lock(&cpu_bitmask_lock); | 57 | cpu_hotplug.refcount++; |
50 | recursive = tsk; | 58 | mutex_unlock(&cpu_hotplug.lock); |
59 | |||
51 | } | 60 | } |
52 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); | 61 | EXPORT_SYMBOL_GPL(get_online_cpus); |
53 | 62 | ||
54 | void unlock_cpu_hotplug(void) | 63 | void put_online_cpus(void) |
55 | { | 64 | { |
56 | WARN_ON(recursive != current); | 65 | if (cpu_hotplug.active_writer == current) |
57 | if (recursive_depth) { | ||
58 | recursive_depth--; | ||
59 | return; | 66 | return; |
60 | } | 67 | mutex_lock(&cpu_hotplug.lock); |
61 | recursive = NULL; | 68 | cpu_hotplug.refcount--; |
62 | mutex_unlock(&cpu_bitmask_lock); | 69 | |
70 | if (unlikely(writer_exists()) && !cpu_hotplug.refcount) | ||
71 | wake_up(&cpu_hotplug.writer_queue); | ||
72 | |||
73 | mutex_unlock(&cpu_hotplug.lock); | ||
74 | |||
63 | } | 75 | } |
64 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 76 | EXPORT_SYMBOL_GPL(put_online_cpus); |
65 | 77 | ||
66 | #endif /* CONFIG_HOTPLUG_CPU */ | 78 | #endif /* CONFIG_HOTPLUG_CPU */ |
67 | 79 | ||
80 | /* | ||
81 | * The following two API's must be used when attempting | ||
82 | * to serialize the updates to cpu_online_map, cpu_present_map. | ||
83 | */ | ||
84 | void cpu_maps_update_begin(void) | ||
85 | { | ||
86 | mutex_lock(&cpu_add_remove_lock); | ||
87 | } | ||
88 | |||
89 | void cpu_maps_update_done(void) | ||
90 | { | ||
91 | mutex_unlock(&cpu_add_remove_lock); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * This ensures that the hotplug operation can begin only when the | ||
96 | * refcount goes to zero. | ||
97 | * | ||
98 | * Note that during a cpu-hotplug operation, the new readers, if any, | ||
99 | * will be blocked by the cpu_hotplug.lock | ||
100 | * | ||
101 | * Since cpu_maps_update_begin is always called after invoking | ||
102 | * cpu_maps_update_begin, we can be sure that only one writer is active. | ||
103 | * | ||
104 | * Note that theoretically, there is a possibility of a livelock: | ||
105 | * - Refcount goes to zero, last reader wakes up the sleeping | ||
106 | * writer. | ||
107 | * - Last reader unlocks the cpu_hotplug.lock. | ||
108 | * - A new reader arrives at this moment, bumps up the refcount. | ||
109 | * - The writer acquires the cpu_hotplug.lock finds the refcount | ||
110 | * non zero and goes to sleep again. | ||
111 | * | ||
112 | * However, this is very difficult to achieve in practice since | ||
113 | * get_online_cpus() is not an API which is called all that often. | ||
114 | * | ||
115 | */ | ||
116 | static void cpu_hotplug_begin(void) | ||
117 | { | ||
118 | DECLARE_WAITQUEUE(wait, current); | ||
119 | |||
120 | mutex_lock(&cpu_hotplug.lock); | ||
121 | |||
122 | cpu_hotplug.active_writer = current; | ||
123 | add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); | ||
124 | while (cpu_hotplug.refcount) { | ||
125 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
126 | mutex_unlock(&cpu_hotplug.lock); | ||
127 | schedule(); | ||
128 | mutex_lock(&cpu_hotplug.lock); | ||
129 | } | ||
130 | remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); | ||
131 | } | ||
132 | |||
133 | static void cpu_hotplug_done(void) | ||
134 | { | ||
135 | cpu_hotplug.active_writer = NULL; | ||
136 | mutex_unlock(&cpu_hotplug.lock); | ||
137 | } | ||
68 | /* Need to know about CPUs going up/down? */ | 138 | /* Need to know about CPUs going up/down? */ |
69 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) | 139 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
70 | { | 140 | { |
71 | int ret; | 141 | int ret; |
72 | mutex_lock(&cpu_add_remove_lock); | 142 | cpu_maps_update_begin(); |
73 | ret = raw_notifier_chain_register(&cpu_chain, nb); | 143 | ret = raw_notifier_chain_register(&cpu_chain, nb); |
74 | mutex_unlock(&cpu_add_remove_lock); | 144 | cpu_maps_update_done(); |
75 | return ret; | 145 | return ret; |
76 | } | 146 | } |
77 | 147 | ||
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier); | |||
81 | 151 | ||
82 | void unregister_cpu_notifier(struct notifier_block *nb) | 152 | void unregister_cpu_notifier(struct notifier_block *nb) |
83 | { | 153 | { |
84 | mutex_lock(&cpu_add_remove_lock); | 154 | cpu_maps_update_begin(); |
85 | raw_notifier_chain_unregister(&cpu_chain, nb); | 155 | raw_notifier_chain_unregister(&cpu_chain, nb); |
86 | mutex_unlock(&cpu_add_remove_lock); | 156 | cpu_maps_update_done(); |
87 | } | 157 | } |
88 | EXPORT_SYMBOL(unregister_cpu_notifier); | 158 | EXPORT_SYMBOL(unregister_cpu_notifier); |
89 | 159 | ||
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
147 | if (!cpu_online(cpu)) | 217 | if (!cpu_online(cpu)) |
148 | return -EINVAL; | 218 | return -EINVAL; |
149 | 219 | ||
150 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 220 | cpu_hotplug_begin(); |
151 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, | 221 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, |
152 | hcpu, -1, &nr_calls); | 222 | hcpu, -1, &nr_calls); |
153 | if (err == NOTIFY_BAD) { | 223 | if (err == NOTIFY_BAD) { |
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
166 | cpu_clear(cpu, tmp); | 236 | cpu_clear(cpu, tmp); |
167 | set_cpus_allowed(current, tmp); | 237 | set_cpus_allowed(current, tmp); |
168 | 238 | ||
169 | mutex_lock(&cpu_bitmask_lock); | ||
170 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
171 | mutex_unlock(&cpu_bitmask_lock); | ||
172 | 240 | ||
173 | if (IS_ERR(p) || cpu_online(cpu)) { | 241 | if (IS_ERR(p) || cpu_online(cpu)) { |
174 | /* CPU didn't die: tell everyone. Can't complain. */ | 242 | /* CPU didn't die: tell everyone. Can't complain. */ |
@@ -202,7 +270,7 @@ out_thread: | |||
202 | out_allowed: | 270 | out_allowed: |
203 | set_cpus_allowed(current, old_allowed); | 271 | set_cpus_allowed(current, old_allowed); |
204 | out_release: | 272 | out_release: |
205 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 273 | cpu_hotplug_done(); |
206 | return err; | 274 | return err; |
207 | } | 275 | } |
208 | 276 | ||
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu) | |||
210 | { | 278 | { |
211 | int err = 0; | 279 | int err = 0; |
212 | 280 | ||
213 | mutex_lock(&cpu_add_remove_lock); | 281 | cpu_maps_update_begin(); |
214 | if (cpu_hotplug_disabled) | 282 | if (cpu_hotplug_disabled) |
215 | err = -EBUSY; | 283 | err = -EBUSY; |
216 | else | 284 | else |
217 | err = _cpu_down(cpu, 0); | 285 | err = _cpu_down(cpu, 0); |
218 | 286 | ||
219 | mutex_unlock(&cpu_add_remove_lock); | 287 | cpu_maps_update_done(); |
220 | return err; | 288 | return err; |
221 | } | 289 | } |
222 | #endif /*CONFIG_HOTPLUG_CPU*/ | 290 | #endif /*CONFIG_HOTPLUG_CPU*/ |
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
231 | if (cpu_online(cpu) || !cpu_present(cpu)) | 299 | if (cpu_online(cpu) || !cpu_present(cpu)) |
232 | return -EINVAL; | 300 | return -EINVAL; |
233 | 301 | ||
234 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 302 | cpu_hotplug_begin(); |
235 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, | 303 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, |
236 | -1, &nr_calls); | 304 | -1, &nr_calls); |
237 | if (ret == NOTIFY_BAD) { | 305 | if (ret == NOTIFY_BAD) { |
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
243 | } | 311 | } |
244 | 312 | ||
245 | /* Arch-specific enabling code. */ | 313 | /* Arch-specific enabling code. */ |
246 | mutex_lock(&cpu_bitmask_lock); | ||
247 | ret = __cpu_up(cpu); | 314 | ret = __cpu_up(cpu); |
248 | mutex_unlock(&cpu_bitmask_lock); | ||
249 | if (ret != 0) | 315 | if (ret != 0) |
250 | goto out_notify; | 316 | goto out_notify; |
251 | BUG_ON(!cpu_online(cpu)); | 317 | BUG_ON(!cpu_online(cpu)); |
@@ -257,7 +323,7 @@ out_notify: | |||
257 | if (ret != 0) | 323 | if (ret != 0) |
258 | __raw_notifier_call_chain(&cpu_chain, | 324 | __raw_notifier_call_chain(&cpu_chain, |
259 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 325 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
260 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 326 | cpu_hotplug_done(); |
261 | 327 | ||
262 | return ret; | 328 | return ret; |
263 | } | 329 | } |
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
275 | return -EINVAL; | 341 | return -EINVAL; |
276 | } | 342 | } |
277 | 343 | ||
278 | mutex_lock(&cpu_add_remove_lock); | 344 | cpu_maps_update_begin(); |
279 | if (cpu_hotplug_disabled) | 345 | if (cpu_hotplug_disabled) |
280 | err = -EBUSY; | 346 | err = -EBUSY; |
281 | else | 347 | else |
282 | err = _cpu_up(cpu, 0); | 348 | err = _cpu_up(cpu, 0); |
283 | 349 | ||
284 | mutex_unlock(&cpu_add_remove_lock); | 350 | cpu_maps_update_done(); |
285 | return err; | 351 | return err; |
286 | } | 352 | } |
287 | 353 | ||
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void) | |||
292 | { | 358 | { |
293 | int cpu, first_cpu, error = 0; | 359 | int cpu, first_cpu, error = 0; |
294 | 360 | ||
295 | mutex_lock(&cpu_add_remove_lock); | 361 | cpu_maps_update_begin(); |
296 | first_cpu = first_cpu(cpu_online_map); | 362 | first_cpu = first_cpu(cpu_online_map); |
297 | /* We take down all of the non-boot CPUs in one shot to avoid races | 363 | /* We take down all of the non-boot CPUs in one shot to avoid races |
298 | * with the userspace trying to use the CPU hotplug at the same time | 364 | * with the userspace trying to use the CPU hotplug at the same time |
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void) | |||
319 | } else { | 385 | } else { |
320 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 386 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); |
321 | } | 387 | } |
322 | mutex_unlock(&cpu_add_remove_lock); | 388 | cpu_maps_update_done(); |
323 | return error; | 389 | return error; |
324 | } | 390 | } |
325 | 391 | ||
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void) | |||
328 | int cpu, error; | 394 | int cpu, error; |
329 | 395 | ||
330 | /* Allow everyone to use the CPU hotplug again */ | 396 | /* Allow everyone to use the CPU hotplug again */ |
331 | mutex_lock(&cpu_add_remove_lock); | 397 | cpu_maps_update_begin(); |
332 | cpu_hotplug_disabled = 0; | 398 | cpu_hotplug_disabled = 0; |
333 | if (cpus_empty(frozen_cpus)) | 399 | if (cpus_empty(frozen_cpus)) |
334 | goto out; | 400 | goto out; |
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void) | |||
344 | } | 410 | } |
345 | cpus_clear(frozen_cpus); | 411 | cpus_clear(frozen_cpus); |
346 | out: | 412 | out: |
347 | mutex_unlock(&cpu_add_remove_lock); | 413 | cpu_maps_update_done(); |
348 | } | 414 | } |
349 | #endif /* CONFIG_PM_SLEEP_SMP */ | 415 | #endif /* CONFIG_PM_SLEEP_SMP */ |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 50f5dc463688..cfaf6419d817 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
537 | * | 537 | * |
538 | * Call with cgroup_mutex held. May take callback_mutex during | 538 | * Call with cgroup_mutex held. May take callback_mutex during |
539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | 539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest |
540 | * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 540 | * a call to the get_online_cpus()/put_online_cpus() pair. |
541 | * Must not be called holding callback_mutex, because we must not | 541 | * Must not be called holding callback_mutex, because we must not |
542 | * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere | 542 | * call get_online_cpus() while holding callback_mutex. Elsewhere |
543 | * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. | 543 | * the kernel nests callback_mutex inside get_online_cpus() calls. |
544 | * So the reverse nesting would risk an ABBA deadlock. | 544 | * So the reverse nesting would risk an ABBA deadlock. |
545 | * | 545 | * |
546 | * The three key local variables below are: | 546 | * The three key local variables below are: |
@@ -691,9 +691,9 @@ restart: | |||
691 | 691 | ||
692 | rebuild: | 692 | rebuild: |
693 | /* Have scheduler rebuild sched domains */ | 693 | /* Have scheduler rebuild sched domains */ |
694 | lock_cpu_hotplug(); | 694 | get_online_cpus(); |
695 | partition_sched_domains(ndoms, doms); | 695 | partition_sched_domains(ndoms, doms); |
696 | unlock_cpu_hotplug(); | 696 | put_online_cpus(); |
697 | 697 | ||
698 | done: | 698 | done: |
699 | if (q && !IS_ERR(q)) | 699 | if (q && !IS_ERR(q)) |
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1617 | * | 1617 | * |
1618 | * If the cpuset being removed has its flag 'sched_load_balance' | 1618 | * If the cpuset being removed has its flag 'sched_load_balance' |
1619 | * enabled, then simulate turning sched_load_balance off, which | 1619 | * enabled, then simulate turning sched_load_balance off, which |
1620 | * will call rebuild_sched_domains(). The lock_cpu_hotplug() | 1620 | * will call rebuild_sched_domains(). The get_online_cpus() |
1621 | * call in rebuild_sched_domains() must not be made while holding | 1621 | * call in rebuild_sched_domains() must not be made while holding |
1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | 1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside |
1623 | * lock_cpu_hotplug() calls. So the reverse nesting would risk an | 1623 | * get_online_cpus() calls. So the reverse nesting would risk an |
1624 | * ABBA deadlock. | 1624 | * ABBA deadlock. |
1625 | */ | 1625 | */ |
1626 | 1626 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 8dd8ff281009..39d22b3357de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1045,6 +1045,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1045 | copy_flags(clone_flags, p); | 1045 | copy_flags(clone_flags, p); |
1046 | INIT_LIST_HEAD(&p->children); | 1046 | INIT_LIST_HEAD(&p->children); |
1047 | INIT_LIST_HEAD(&p->sibling); | 1047 | INIT_LIST_HEAD(&p->sibling); |
1048 | #ifdef CONFIG_PREEMPT_RCU | ||
1049 | p->rcu_read_lock_nesting = 0; | ||
1050 | p->rcu_flipctr_idx = 0; | ||
1051 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
1048 | p->vfork_done = NULL; | 1052 | p->vfork_done = NULL; |
1049 | spin_lock_init(&p->alloc_lock); | 1053 | spin_lock_init(&p->alloc_lock); |
1050 | 1054 | ||
@@ -1059,6 +1063,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1059 | p->prev_utime = cputime_zero; | 1063 | p->prev_utime = cputime_zero; |
1060 | p->prev_stime = cputime_zero; | 1064 | p->prev_stime = cputime_zero; |
1061 | 1065 | ||
1066 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
1067 | p->last_switch_count = 0; | ||
1068 | p->last_switch_timestamp = 0; | ||
1069 | #endif | ||
1070 | |||
1062 | #ifdef CONFIG_TASK_XACCT | 1071 | #ifdef CONFIG_TASK_XACCT |
1063 | p->rchar = 0; /* I/O counter: bytes read */ | 1072 | p->rchar = 0; /* I/O counter: bytes read */ |
1064 | p->wchar = 0; /* I/O counter: bytes written */ | 1073 | p->wchar = 0; /* I/O counter: bytes written */ |
@@ -1196,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1196 | #ifdef TIF_SYSCALL_EMU | 1205 | #ifdef TIF_SYSCALL_EMU |
1197 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); | 1206 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
1198 | #endif | 1207 | #endif |
1208 | clear_all_latency_tracing(p); | ||
1199 | 1209 | ||
1200 | /* Our parent execution domain becomes current domain | 1210 | /* Our parent execution domain becomes current domain |
1201 | These must match for thread signalling to apply */ | 1211 | These must match for thread signalling to apply */ |
@@ -1237,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1237 | * parent's CPU). This avoids a lot of nasty races. | 1247 | * parent's CPU). This avoids a lot of nasty races. |
1238 | */ | 1248 | */ |
1239 | p->cpus_allowed = current->cpus_allowed; | 1249 | p->cpus_allowed = current->cpus_allowed; |
1250 | p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; | ||
1240 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || | 1251 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || |
1241 | !cpu_online(task_cpu(p)))) | 1252 | !cpu_online(task_cpu(p)))) |
1242 | set_task_cpu(p, smp_processor_id()); | 1253 | set_task_cpu(p, smp_processor_id()); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f994bb8065e6..bd5d6b5060bc 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div) | |||
325 | } | 325 | } |
326 | #endif /* BITS_PER_LONG >= 64 */ | 326 | #endif /* BITS_PER_LONG >= 64 */ |
327 | 327 | ||
328 | /* | ||
329 | * Check, whether the timer is on the callback pending list | ||
330 | */ | ||
331 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
332 | { | ||
333 | return timer->state & HRTIMER_STATE_PENDING; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Remove a timer from the callback pending list | ||
338 | */ | ||
339 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
340 | { | ||
341 | list_del_init(&timer->cb_entry); | ||
342 | } | ||
343 | |||
328 | /* High resolution timer related functions */ | 344 | /* High resolution timer related functions */ |
329 | #ifdef CONFIG_HIGH_RES_TIMERS | 345 | #ifdef CONFIG_HIGH_RES_TIMERS |
330 | 346 | ||
@@ -494,29 +510,12 @@ void hres_timers_resume(void) | |||
494 | } | 510 | } |
495 | 511 | ||
496 | /* | 512 | /* |
497 | * Check, whether the timer is on the callback pending list | ||
498 | */ | ||
499 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
500 | { | ||
501 | return timer->state & HRTIMER_STATE_PENDING; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * Remove a timer from the callback pending list | ||
506 | */ | ||
507 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
508 | { | ||
509 | list_del_init(&timer->cb_entry); | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * Initialize the high resolution related parts of cpu_base | 513 | * Initialize the high resolution related parts of cpu_base |
514 | */ | 514 | */ |
515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | 515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) |
516 | { | 516 | { |
517 | base->expires_next.tv64 = KTIME_MAX; | 517 | base->expires_next.tv64 = KTIME_MAX; |
518 | base->hres_active = 0; | 518 | base->hres_active = 0; |
519 | INIT_LIST_HEAD(&base->cb_pending); | ||
520 | } | 519 | } |
521 | 520 | ||
522 | /* | 521 | /* |
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
524 | */ | 523 | */ |
525 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | 524 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) |
526 | { | 525 | { |
527 | INIT_LIST_HEAD(&timer->cb_entry); | ||
528 | } | 526 | } |
529 | 527 | ||
530 | /* | 528 | /* |
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
618 | { | 616 | { |
619 | return 0; | 617 | return 0; |
620 | } | 618 | } |
621 | static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } | ||
622 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } | ||
623 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 619 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
624 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | 620 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } |
621 | static inline int hrtimer_reprogram(struct hrtimer *timer, | ||
622 | struct hrtimer_clock_base *base) | ||
623 | { | ||
624 | return 0; | ||
625 | } | ||
625 | 626 | ||
626 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 627 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
627 | 628 | ||
@@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1001 | clock_id = CLOCK_MONOTONIC; | 1002 | clock_id = CLOCK_MONOTONIC; |
1002 | 1003 | ||
1003 | timer->base = &cpu_base->clock_base[clock_id]; | 1004 | timer->base = &cpu_base->clock_base[clock_id]; |
1005 | INIT_LIST_HEAD(&timer->cb_entry); | ||
1004 | hrtimer_init_timer_hres(timer); | 1006 | hrtimer_init_timer_hres(timer); |
1005 | 1007 | ||
1006 | #ifdef CONFIG_TIMER_STATS | 1008 | #ifdef CONFIG_TIMER_STATS |
@@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
1030 | } | 1032 | } |
1031 | EXPORT_SYMBOL_GPL(hrtimer_get_res); | 1033 | EXPORT_SYMBOL_GPL(hrtimer_get_res); |
1032 | 1034 | ||
1035 | static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) | ||
1036 | { | ||
1037 | spin_lock_irq(&cpu_base->lock); | ||
1038 | |||
1039 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1040 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1041 | struct hrtimer *timer; | ||
1042 | int restart; | ||
1043 | |||
1044 | timer = list_entry(cpu_base->cb_pending.next, | ||
1045 | struct hrtimer, cb_entry); | ||
1046 | |||
1047 | timer_stats_account_hrtimer(timer); | ||
1048 | |||
1049 | fn = timer->function; | ||
1050 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | ||
1051 | spin_unlock_irq(&cpu_base->lock); | ||
1052 | |||
1053 | restart = fn(timer); | ||
1054 | |||
1055 | spin_lock_irq(&cpu_base->lock); | ||
1056 | |||
1057 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1058 | if (restart == HRTIMER_RESTART) { | ||
1059 | BUG_ON(hrtimer_active(timer)); | ||
1060 | /* | ||
1061 | * Enqueue the timer, allow reprogramming of the event | ||
1062 | * device | ||
1063 | */ | ||
1064 | enqueue_hrtimer(timer, timer->base, 1); | ||
1065 | } else if (hrtimer_active(timer)) { | ||
1066 | /* | ||
1067 | * If the timer was rearmed on another CPU, reprogram | ||
1068 | * the event device. | ||
1069 | */ | ||
1070 | if (timer->base->first == &timer->node) | ||
1071 | hrtimer_reprogram(timer, timer->base); | ||
1072 | } | ||
1073 | } | ||
1074 | spin_unlock_irq(&cpu_base->lock); | ||
1075 | } | ||
1076 | |||
1077 | static void __run_hrtimer(struct hrtimer *timer) | ||
1078 | { | ||
1079 | struct hrtimer_clock_base *base = timer->base; | ||
1080 | struct hrtimer_cpu_base *cpu_base = base->cpu_base; | ||
1081 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1082 | int restart; | ||
1083 | |||
1084 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
1085 | timer_stats_account_hrtimer(timer); | ||
1086 | |||
1087 | fn = timer->function; | ||
1088 | if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { | ||
1089 | /* | ||
1090 | * Used for scheduler timers, avoid lock inversion with | ||
1091 | * rq->lock and tasklist_lock. | ||
1092 | * | ||
1093 | * These timers are required to deal with enqueue expiry | ||
1094 | * themselves and are not allowed to migrate. | ||
1095 | */ | ||
1096 | spin_unlock(&cpu_base->lock); | ||
1097 | restart = fn(timer); | ||
1098 | spin_lock(&cpu_base->lock); | ||
1099 | } else | ||
1100 | restart = fn(timer); | ||
1101 | |||
1102 | /* | ||
1103 | * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid | ||
1104 | * reprogramming of the event hardware. This happens at the end of this | ||
1105 | * function anyway. | ||
1106 | */ | ||
1107 | if (restart != HRTIMER_NORESTART) { | ||
1108 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1109 | enqueue_hrtimer(timer, base, 0); | ||
1110 | } | ||
1111 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1112 | } | ||
1113 | |||
1033 | #ifdef CONFIG_HIGH_RES_TIMERS | 1114 | #ifdef CONFIG_HIGH_RES_TIMERS |
1034 | 1115 | ||
1035 | /* | 1116 | /* |
@@ -1087,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1087 | continue; | 1168 | continue; |
1088 | } | 1169 | } |
1089 | 1170 | ||
1090 | __remove_hrtimer(timer, base, | 1171 | __run_hrtimer(timer); |
1091 | HRTIMER_STATE_CALLBACK, 0); | ||
1092 | timer_stats_account_hrtimer(timer); | ||
1093 | |||
1094 | /* | ||
1095 | * Note: We clear the CALLBACK bit after | ||
1096 | * enqueue_hrtimer to avoid reprogramming of | ||
1097 | * the event hardware. This happens at the end | ||
1098 | * of this function anyway. | ||
1099 | */ | ||
1100 | if (timer->function(timer) != HRTIMER_NORESTART) { | ||
1101 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1102 | enqueue_hrtimer(timer, base, 0); | ||
1103 | } | ||
1104 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1105 | } | 1172 | } |
1106 | spin_unlock(&cpu_base->lock); | 1173 | spin_unlock(&cpu_base->lock); |
1107 | base++; | 1174 | base++; |
@@ -1122,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1122 | 1189 | ||
1123 | static void run_hrtimer_softirq(struct softirq_action *h) | 1190 | static void run_hrtimer_softirq(struct softirq_action *h) |
1124 | { | 1191 | { |
1125 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1192 | run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); |
1126 | 1193 | } | |
1127 | spin_lock_irq(&cpu_base->lock); | ||
1128 | |||
1129 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1130 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1131 | struct hrtimer *timer; | ||
1132 | int restart; | ||
1133 | |||
1134 | timer = list_entry(cpu_base->cb_pending.next, | ||
1135 | struct hrtimer, cb_entry); | ||
1136 | 1194 | ||
1137 | timer_stats_account_hrtimer(timer); | 1195 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
1138 | 1196 | ||
1139 | fn = timer->function; | 1197 | /* |
1140 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | 1198 | * Called from timer softirq every jiffy, expire hrtimers: |
1141 | spin_unlock_irq(&cpu_base->lock); | 1199 | * |
1200 | * For HRT its the fall back code to run the softirq in the timer | ||
1201 | * softirq context in case the hrtimer initialization failed or has | ||
1202 | * not been done yet. | ||
1203 | */ | ||
1204 | void hrtimer_run_pending(void) | ||
1205 | { | ||
1206 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1142 | 1207 | ||
1143 | restart = fn(timer); | 1208 | if (hrtimer_hres_active()) |
1209 | return; | ||
1144 | 1210 | ||
1145 | spin_lock_irq(&cpu_base->lock); | 1211 | /* |
1212 | * This _is_ ugly: We have to check in the softirq context, | ||
1213 | * whether we can switch to highres and / or nohz mode. The | ||
1214 | * clocksource switch happens in the timer interrupt with | ||
1215 | * xtime_lock held. Notification from there only sets the | ||
1216 | * check bit in the tick_oneshot code, otherwise we might | ||
1217 | * deadlock vs. xtime_lock. | ||
1218 | */ | ||
1219 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1220 | hrtimer_switch_to_hres(); | ||
1146 | 1221 | ||
1147 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1222 | run_hrtimer_pending(cpu_base); |
1148 | if (restart == HRTIMER_RESTART) { | ||
1149 | BUG_ON(hrtimer_active(timer)); | ||
1150 | /* | ||
1151 | * Enqueue the timer, allow reprogramming of the event | ||
1152 | * device | ||
1153 | */ | ||
1154 | enqueue_hrtimer(timer, timer->base, 1); | ||
1155 | } else if (hrtimer_active(timer)) { | ||
1156 | /* | ||
1157 | * If the timer was rearmed on another CPU, reprogram | ||
1158 | * the event device. | ||
1159 | */ | ||
1160 | if (timer->base->first == &timer->node) | ||
1161 | hrtimer_reprogram(timer, timer->base); | ||
1162 | } | ||
1163 | } | ||
1164 | spin_unlock_irq(&cpu_base->lock); | ||
1165 | } | 1223 | } |
1166 | 1224 | ||
1167 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
1168 | |||
1169 | /* | 1225 | /* |
1170 | * Expire the per base hrtimer-queue: | 1226 | * Called from hardirq context every jiffy |
1171 | */ | 1227 | */ |
1172 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | 1228 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, |
1173 | int index) | 1229 | int index) |
@@ -1181,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | |||
1181 | if (base->get_softirq_time) | 1237 | if (base->get_softirq_time) |
1182 | base->softirq_time = base->get_softirq_time(); | 1238 | base->softirq_time = base->get_softirq_time(); |
1183 | 1239 | ||
1184 | spin_lock_irq(&cpu_base->lock); | 1240 | spin_lock(&cpu_base->lock); |
1185 | 1241 | ||
1186 | while ((node = base->first)) { | 1242 | while ((node = base->first)) { |
1187 | struct hrtimer *timer; | 1243 | struct hrtimer *timer; |
1188 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1189 | int restart; | ||
1190 | 1244 | ||
1191 | timer = rb_entry(node, struct hrtimer, node); | 1245 | timer = rb_entry(node, struct hrtimer, node); |
1192 | if (base->softirq_time.tv64 <= timer->expires.tv64) | 1246 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
1193 | break; | 1247 | break; |
1194 | 1248 | ||
1195 | #ifdef CONFIG_HIGH_RES_TIMERS | 1249 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { |
1196 | WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); | 1250 | __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); |
1197 | #endif | 1251 | list_add_tail(&timer->cb_entry, |
1198 | timer_stats_account_hrtimer(timer); | 1252 | &base->cpu_base->cb_pending); |
1199 | 1253 | continue; | |
1200 | fn = timer->function; | ||
1201 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
1202 | spin_unlock_irq(&cpu_base->lock); | ||
1203 | |||
1204 | restart = fn(timer); | ||
1205 | |||
1206 | spin_lock_irq(&cpu_base->lock); | ||
1207 | |||
1208 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1209 | if (restart != HRTIMER_NORESTART) { | ||
1210 | BUG_ON(hrtimer_active(timer)); | ||
1211 | enqueue_hrtimer(timer, base, 0); | ||
1212 | } | 1254 | } |
1255 | |||
1256 | __run_hrtimer(timer); | ||
1213 | } | 1257 | } |
1214 | spin_unlock_irq(&cpu_base->lock); | 1258 | spin_unlock(&cpu_base->lock); |
1215 | } | 1259 | } |
1216 | 1260 | ||
1217 | /* | ||
1218 | * Called from timer softirq every jiffy, expire hrtimers: | ||
1219 | * | ||
1220 | * For HRT its the fall back code to run the softirq in the timer | ||
1221 | * softirq context in case the hrtimer initialization failed or has | ||
1222 | * not been done yet. | ||
1223 | */ | ||
1224 | void hrtimer_run_queues(void) | 1261 | void hrtimer_run_queues(void) |
1225 | { | 1262 | { |
1226 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1263 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
@@ -1229,18 +1266,6 @@ void hrtimer_run_queues(void) | |||
1229 | if (hrtimer_hres_active()) | 1266 | if (hrtimer_hres_active()) |
1230 | return; | 1267 | return; |
1231 | 1268 | ||
1232 | /* | ||
1233 | * This _is_ ugly: We have to check in the softirq context, | ||
1234 | * whether we can switch to highres and / or nohz mode. The | ||
1235 | * clocksource switch happens in the timer interrupt with | ||
1236 | * xtime_lock held. Notification from there only sets the | ||
1237 | * check bit in the tick_oneshot code, otherwise we might | ||
1238 | * deadlock vs. xtime_lock. | ||
1239 | */ | ||
1240 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1241 | if (hrtimer_switch_to_hres()) | ||
1242 | return; | ||
1243 | |||
1244 | hrtimer_get_softirq_time(cpu_base); | 1269 | hrtimer_get_softirq_time(cpu_base); |
1245 | 1270 | ||
1246 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1271 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
@@ -1268,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | |||
1268 | sl->timer.function = hrtimer_wakeup; | 1293 | sl->timer.function = hrtimer_wakeup; |
1269 | sl->task = task; | 1294 | sl->task = task; |
1270 | #ifdef CONFIG_HIGH_RES_TIMERS | 1295 | #ifdef CONFIG_HIGH_RES_TIMERS |
1271 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; | 1296 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; |
1272 | #endif | 1297 | #endif |
1273 | } | 1298 | } |
1274 | 1299 | ||
@@ -1279,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
1279 | do { | 1304 | do { |
1280 | set_current_state(TASK_INTERRUPTIBLE); | 1305 | set_current_state(TASK_INTERRUPTIBLE); |
1281 | hrtimer_start(&t->timer, t->timer.expires, mode); | 1306 | hrtimer_start(&t->timer, t->timer.expires, mode); |
1307 | if (!hrtimer_active(&t->timer)) | ||
1308 | t->task = NULL; | ||
1282 | 1309 | ||
1283 | if (likely(t->task)) | 1310 | if (likely(t->task)) |
1284 | schedule(); | 1311 | schedule(); |
@@ -1389,6 +1416,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
1389 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1416 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
1390 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1417 | cpu_base->clock_base[i].cpu_base = cpu_base; |
1391 | 1418 | ||
1419 | INIT_LIST_HEAD(&cpu_base->cb_pending); | ||
1392 | hrtimer_init_hres(cpu_base); | 1420 | hrtimer_init_hres(cpu_base); |
1393 | } | 1421 | } |
1394 | 1422 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index dcfe724300eb..0ac887882f90 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -15,6 +15,8 @@ | |||
15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
16 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
17 | 17 | ||
18 | #define KTHREAD_NICE_LEVEL (-5) | ||
19 | |||
18 | static DEFINE_SPINLOCK(kthread_create_lock); | 20 | static DEFINE_SPINLOCK(kthread_create_lock); |
19 | static LIST_HEAD(kthread_create_list); | 21 | static LIST_HEAD(kthread_create_list); |
20 | struct task_struct *kthreadd_task; | 22 | struct task_struct *kthreadd_task; |
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create) | |||
94 | if (pid < 0) { | 96 | if (pid < 0) { |
95 | create->result = ERR_PTR(pid); | 97 | create->result = ERR_PTR(pid); |
96 | } else { | 98 | } else { |
99 | struct sched_param param = { .sched_priority = 0 }; | ||
97 | wait_for_completion(&create->started); | 100 | wait_for_completion(&create->started); |
98 | read_lock(&tasklist_lock); | 101 | read_lock(&tasklist_lock); |
99 | create->result = find_task_by_pid(pid); | 102 | create->result = find_task_by_pid(pid); |
100 | read_unlock(&tasklist_lock); | 103 | read_unlock(&tasklist_lock); |
104 | /* | ||
105 | * root may have changed our (kthreadd's) priority or CPU mask. | ||
106 | * The kernel thread should not inherit these properties. | ||
107 | */ | ||
108 | sched_setscheduler(create->result, SCHED_NORMAL, ¶m); | ||
109 | set_user_nice(create->result, KTHREAD_NICE_LEVEL); | ||
110 | set_cpus_allowed(create->result, CPU_MASK_ALL); | ||
101 | } | 111 | } |
102 | complete(&create->done); | 112 | complete(&create->done); |
103 | } | 113 | } |
@@ -221,7 +231,7 @@ int kthreadd(void *unused) | |||
221 | /* Setup a clean context for our children to inherit. */ | 231 | /* Setup a clean context for our children to inherit. */ |
222 | set_task_comm(tsk, "kthreadd"); | 232 | set_task_comm(tsk, "kthreadd"); |
223 | ignore_signals(tsk); | 233 | ignore_signals(tsk); |
224 | set_user_nice(tsk, -5); | 234 | set_user_nice(tsk, KTHREAD_NICE_LEVEL); |
225 | set_cpus_allowed(tsk, CPU_MASK_ALL); | 235 | set_cpus_allowed(tsk, CPU_MASK_ALL); |
226 | 236 | ||
227 | current->flags |= PF_NOFREEZE; | 237 | current->flags |= PF_NOFREEZE; |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c new file mode 100644 index 000000000000..b4e3c85abe74 --- /dev/null +++ b/kernel/latencytop.c | |||
@@ -0,0 +1,239 @@ | |||
1 | /* | ||
2 | * latencytop.c: Latency display infrastructure | ||
3 | * | ||
4 | * (C) Copyright 2008 Intel Corporation | ||
5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | #include <linux/latencytop.h> | ||
13 | #include <linux/kallsyms.h> | ||
14 | #include <linux/seq_file.h> | ||
15 | #include <linux/notifier.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/proc_fs.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/list.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/stacktrace.h> | ||
23 | |||
24 | static DEFINE_SPINLOCK(latency_lock); | ||
25 | |||
26 | #define MAXLR 128 | ||
27 | static struct latency_record latency_record[MAXLR]; | ||
28 | |||
29 | int latencytop_enabled; | ||
30 | |||
31 | void clear_all_latency_tracing(struct task_struct *p) | ||
32 | { | ||
33 | unsigned long flags; | ||
34 | |||
35 | if (!latencytop_enabled) | ||
36 | return; | ||
37 | |||
38 | spin_lock_irqsave(&latency_lock, flags); | ||
39 | memset(&p->latency_record, 0, sizeof(p->latency_record)); | ||
40 | p->latency_record_count = 0; | ||
41 | spin_unlock_irqrestore(&latency_lock, flags); | ||
42 | } | ||
43 | |||
44 | static void clear_global_latency_tracing(void) | ||
45 | { | ||
46 | unsigned long flags; | ||
47 | |||
48 | spin_lock_irqsave(&latency_lock, flags); | ||
49 | memset(&latency_record, 0, sizeof(latency_record)); | ||
50 | spin_unlock_irqrestore(&latency_lock, flags); | ||
51 | } | ||
52 | |||
53 | static void __sched | ||
54 | account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) | ||
55 | { | ||
56 | int firstnonnull = MAXLR + 1; | ||
57 | int i; | ||
58 | |||
59 | if (!latencytop_enabled) | ||
60 | return; | ||
61 | |||
62 | /* skip kernel threads for now */ | ||
63 | if (!tsk->mm) | ||
64 | return; | ||
65 | |||
66 | for (i = 0; i < MAXLR; i++) { | ||
67 | int q; | ||
68 | int same = 1; | ||
69 | /* Nothing stored: */ | ||
70 | if (!latency_record[i].backtrace[0]) { | ||
71 | if (firstnonnull > i) | ||
72 | firstnonnull = i; | ||
73 | continue; | ||
74 | } | ||
75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
76 | if (latency_record[i].backtrace[q] != | ||
77 | lat->backtrace[q]) | ||
78 | same = 0; | ||
79 | if (same && lat->backtrace[q] == 0) | ||
80 | break; | ||
81 | if (same && lat->backtrace[q] == ULONG_MAX) | ||
82 | break; | ||
83 | } | ||
84 | if (same) { | ||
85 | latency_record[i].count++; | ||
86 | latency_record[i].time += lat->time; | ||
87 | if (lat->time > latency_record[i].max) | ||
88 | latency_record[i].max = lat->time; | ||
89 | return; | ||
90 | } | ||
91 | } | ||
92 | |||
93 | i = firstnonnull; | ||
94 | if (i >= MAXLR - 1) | ||
95 | return; | ||
96 | |||
97 | /* Allocted a new one: */ | ||
98 | memcpy(&latency_record[i], lat, sizeof(struct latency_record)); | ||
99 | } | ||
100 | |||
101 | static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) | ||
102 | { | ||
103 | struct stack_trace trace; | ||
104 | |||
105 | memset(&trace, 0, sizeof(trace)); | ||
106 | trace.max_entries = LT_BACKTRACEDEPTH; | ||
107 | trace.entries = &lat->backtrace[0]; | ||
108 | trace.skip = 0; | ||
109 | save_stack_trace_tsk(tsk, &trace); | ||
110 | } | ||
111 | |||
112 | void __sched | ||
113 | account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | ||
114 | { | ||
115 | unsigned long flags; | ||
116 | int i, q; | ||
117 | struct latency_record lat; | ||
118 | |||
119 | if (!latencytop_enabled) | ||
120 | return; | ||
121 | |||
122 | /* Long interruptible waits are generally user requested... */ | ||
123 | if (inter && usecs > 5000) | ||
124 | return; | ||
125 | |||
126 | memset(&lat, 0, sizeof(lat)); | ||
127 | lat.count = 1; | ||
128 | lat.time = usecs; | ||
129 | lat.max = usecs; | ||
130 | store_stacktrace(tsk, &lat); | ||
131 | |||
132 | spin_lock_irqsave(&latency_lock, flags); | ||
133 | |||
134 | account_global_scheduler_latency(tsk, &lat); | ||
135 | |||
136 | /* | ||
137 | * short term hack; if we're > 32 we stop; future we recycle: | ||
138 | */ | ||
139 | tsk->latency_record_count++; | ||
140 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
141 | goto out_unlock; | ||
142 | |||
143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | ||
144 | struct latency_record *mylat; | ||
145 | int same = 1; | ||
146 | mylat = &tsk->latency_record[i]; | ||
147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
148 | if (mylat->backtrace[q] != | ||
149 | lat.backtrace[q]) | ||
150 | same = 0; | ||
151 | if (same && lat.backtrace[q] == 0) | ||
152 | break; | ||
153 | if (same && lat.backtrace[q] == ULONG_MAX) | ||
154 | break; | ||
155 | } | ||
156 | if (same) { | ||
157 | mylat->count++; | ||
158 | mylat->time += lat.time; | ||
159 | if (lat.time > mylat->max) | ||
160 | mylat->max = lat.time; | ||
161 | goto out_unlock; | ||
162 | } | ||
163 | } | ||
164 | |||
165 | /* Allocated a new one: */ | ||
166 | i = tsk->latency_record_count; | ||
167 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | ||
168 | |||
169 | out_unlock: | ||
170 | spin_unlock_irqrestore(&latency_lock, flags); | ||
171 | } | ||
172 | |||
173 | static int lstats_show(struct seq_file *m, void *v) | ||
174 | { | ||
175 | int i; | ||
176 | |||
177 | seq_puts(m, "Latency Top version : v0.1\n"); | ||
178 | |||
179 | for (i = 0; i < MAXLR; i++) { | ||
180 | if (latency_record[i].backtrace[0]) { | ||
181 | int q; | ||
182 | seq_printf(m, "%i %li %li ", | ||
183 | latency_record[i].count, | ||
184 | latency_record[i].time, | ||
185 | latency_record[i].max); | ||
186 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | ||
187 | char sym[KSYM_NAME_LEN]; | ||
188 | char *c; | ||
189 | if (!latency_record[i].backtrace[q]) | ||
190 | break; | ||
191 | if (latency_record[i].backtrace[q] == ULONG_MAX) | ||
192 | break; | ||
193 | sprint_symbol(sym, latency_record[i].backtrace[q]); | ||
194 | c = strchr(sym, '+'); | ||
195 | if (c) | ||
196 | *c = 0; | ||
197 | seq_printf(m, "%s ", sym); | ||
198 | } | ||
199 | seq_printf(m, "\n"); | ||
200 | } | ||
201 | } | ||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | static ssize_t | ||
206 | lstats_write(struct file *file, const char __user *buf, size_t count, | ||
207 | loff_t *offs) | ||
208 | { | ||
209 | clear_global_latency_tracing(); | ||
210 | |||
211 | return count; | ||
212 | } | ||
213 | |||
214 | static int lstats_open(struct inode *inode, struct file *filp) | ||
215 | { | ||
216 | return single_open(filp, lstats_show, NULL); | ||
217 | } | ||
218 | |||
219 | static struct file_operations lstats_fops = { | ||
220 | .open = lstats_open, | ||
221 | .read = seq_read, | ||
222 | .write = lstats_write, | ||
223 | .llseek = seq_lseek, | ||
224 | .release = single_release, | ||
225 | }; | ||
226 | |||
227 | static int __init init_lstats_procfs(void) | ||
228 | { | ||
229 | struct proc_dir_entry *pe; | ||
230 | |||
231 | pe = create_proc_entry("latency_stats", 0644, NULL); | ||
232 | if (!pe) | ||
233 | return -ENOMEM; | ||
234 | |||
235 | pe->proc_fops = &lstats_fops; | ||
236 | |||
237 | return 0; | ||
238 | } | ||
239 | __initcall(init_lstats_procfs); | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e2c07ece367d..3574379f4d62 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -3206,7 +3206,11 @@ retry: | |||
3206 | 3206 | ||
3207 | EXPORT_SYMBOL_GPL(debug_show_all_locks); | 3207 | EXPORT_SYMBOL_GPL(debug_show_all_locks); |
3208 | 3208 | ||
3209 | void debug_show_held_locks(struct task_struct *task) | 3209 | /* |
3210 | * Careful: only use this function if you are sure that | ||
3211 | * the task cannot run in parallel! | ||
3212 | */ | ||
3213 | void __debug_show_held_locks(struct task_struct *task) | ||
3210 | { | 3214 | { |
3211 | if (unlikely(!debug_locks)) { | 3215 | if (unlikely(!debug_locks)) { |
3212 | printk("INFO: lockdep is turned off.\n"); | 3216 | printk("INFO: lockdep is turned off.\n"); |
@@ -3214,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task) | |||
3214 | } | 3218 | } |
3215 | lockdep_print_held_locks(task); | 3219 | lockdep_print_held_locks(task); |
3216 | } | 3220 | } |
3221 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
3222 | |||
3223 | void debug_show_held_locks(struct task_struct *task) | ||
3224 | { | ||
3225 | __debug_show_held_locks(task); | ||
3226 | } | ||
3217 | 3227 | ||
3218 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3228 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
3219 | 3229 | ||
diff --git a/kernel/module.c b/kernel/module.c index dcb8a2cbf75e..1bb4c5e0d56e 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -496,6 +496,8 @@ static struct module_attribute modinfo_##field = { \ | |||
496 | MODINFO_ATTR(version); | 496 | MODINFO_ATTR(version); |
497 | MODINFO_ATTR(srcversion); | 497 | MODINFO_ATTR(srcversion); |
498 | 498 | ||
499 | static char last_unloaded_module[MODULE_NAME_LEN+1]; | ||
500 | |||
499 | #ifdef CONFIG_MODULE_UNLOAD | 501 | #ifdef CONFIG_MODULE_UNLOAD |
500 | /* Init the unload section of the module. */ | 502 | /* Init the unload section of the module. */ |
501 | static void module_unload_init(struct module *mod) | 503 | static void module_unload_init(struct module *mod) |
@@ -719,6 +721,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
719 | mod->exit(); | 721 | mod->exit(); |
720 | mutex_lock(&module_mutex); | 722 | mutex_lock(&module_mutex); |
721 | } | 723 | } |
724 | /* Store the name of the last unloaded module for diagnostic purposes */ | ||
725 | sprintf(last_unloaded_module, mod->name); | ||
722 | free_module(mod); | 726 | free_module(mod); |
723 | 727 | ||
724 | out: | 728 | out: |
@@ -2357,21 +2361,30 @@ static void m_stop(struct seq_file *m, void *p) | |||
2357 | mutex_unlock(&module_mutex); | 2361 | mutex_unlock(&module_mutex); |
2358 | } | 2362 | } |
2359 | 2363 | ||
2360 | static char *taint_flags(unsigned int taints, char *buf) | 2364 | static char *module_flags(struct module *mod, char *buf) |
2361 | { | 2365 | { |
2362 | int bx = 0; | 2366 | int bx = 0; |
2363 | 2367 | ||
2364 | if (taints) { | 2368 | if (mod->taints || |
2369 | mod->state == MODULE_STATE_GOING || | ||
2370 | mod->state == MODULE_STATE_COMING) { | ||
2365 | buf[bx++] = '('; | 2371 | buf[bx++] = '('; |
2366 | if (taints & TAINT_PROPRIETARY_MODULE) | 2372 | if (mod->taints & TAINT_PROPRIETARY_MODULE) |
2367 | buf[bx++] = 'P'; | 2373 | buf[bx++] = 'P'; |
2368 | if (taints & TAINT_FORCED_MODULE) | 2374 | if (mod->taints & TAINT_FORCED_MODULE) |
2369 | buf[bx++] = 'F'; | 2375 | buf[bx++] = 'F'; |
2370 | /* | 2376 | /* |
2371 | * TAINT_FORCED_RMMOD: could be added. | 2377 | * TAINT_FORCED_RMMOD: could be added. |
2372 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | 2378 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't |
2373 | * apply to modules. | 2379 | * apply to modules. |
2374 | */ | 2380 | */ |
2381 | |||
2382 | /* Show a - for module-is-being-unloaded */ | ||
2383 | if (mod->state == MODULE_STATE_GOING) | ||
2384 | buf[bx++] = '-'; | ||
2385 | /* Show a + for module-is-being-loaded */ | ||
2386 | if (mod->state == MODULE_STATE_COMING) | ||
2387 | buf[bx++] = '+'; | ||
2375 | buf[bx++] = ')'; | 2388 | buf[bx++] = ')'; |
2376 | } | 2389 | } |
2377 | buf[bx] = '\0'; | 2390 | buf[bx] = '\0'; |
@@ -2398,7 +2411,7 @@ static int m_show(struct seq_file *m, void *p) | |||
2398 | 2411 | ||
2399 | /* Taints info */ | 2412 | /* Taints info */ |
2400 | if (mod->taints) | 2413 | if (mod->taints) |
2401 | seq_printf(m, " %s", taint_flags(mod->taints, buf)); | 2414 | seq_printf(m, " %s", module_flags(mod, buf)); |
2402 | 2415 | ||
2403 | seq_printf(m, "\n"); | 2416 | seq_printf(m, "\n"); |
2404 | return 0; | 2417 | return 0; |
@@ -2493,7 +2506,9 @@ void print_modules(void) | |||
2493 | 2506 | ||
2494 | printk("Modules linked in:"); | 2507 | printk("Modules linked in:"); |
2495 | list_for_each_entry(mod, &modules, list) | 2508 | list_for_each_entry(mod, &modules, list) |
2496 | printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); | 2509 | printk(" %s%s", mod->name, module_flags(mod, buf)); |
2510 | if (last_unloaded_module[0]) | ||
2511 | printk(" [last unloaded: %s]", last_unloaded_module); | ||
2497 | printk("\n"); | 2512 | printk("\n"); |
2498 | } | 2513 | } |
2499 | 2514 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 68c96376e84a..0b7c82ac467e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
967 | { | 967 | { |
968 | int maxfire; | 968 | int maxfire; |
969 | struct list_head *timers = tsk->cpu_timers; | 969 | struct list_head *timers = tsk->cpu_timers; |
970 | struct signal_struct *const sig = tsk->signal; | ||
970 | 971 | ||
971 | maxfire = 20; | 972 | maxfire = 20; |
972 | tsk->it_prof_expires = cputime_zero; | 973 | tsk->it_prof_expires = cputime_zero; |
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1011 | t->firing = 1; | 1012 | t->firing = 1; |
1012 | list_move_tail(&t->entry, firing); | 1013 | list_move_tail(&t->entry, firing); |
1013 | } | 1014 | } |
1015 | |||
1016 | /* | ||
1017 | * Check for the special case thread timers. | ||
1018 | */ | ||
1019 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { | ||
1020 | unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; | ||
1021 | unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; | ||
1022 | |||
1023 | if (hard != RLIM_INFINITY && | ||
1024 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | ||
1025 | /* | ||
1026 | * At the hard limit, we just die. | ||
1027 | * No need to calculate anything else now. | ||
1028 | */ | ||
1029 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | ||
1030 | return; | ||
1031 | } | ||
1032 | if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { | ||
1033 | /* | ||
1034 | * At the soft limit, send a SIGXCPU every second. | ||
1035 | */ | ||
1036 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur | ||
1037 | < sig->rlim[RLIMIT_RTTIME].rlim_max) { | ||
1038 | sig->rlim[RLIMIT_RTTIME].rlim_cur += | ||
1039 | USEC_PER_SEC; | ||
1040 | } | ||
1041 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | ||
1042 | } | ||
1043 | } | ||
1014 | } | 1044 | } |
1015 | 1045 | ||
1016 | /* | 1046 | /* |
diff --git a/kernel/printk.c b/kernel/printk.c index 89011bf8c106..423a8c765a57 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -573,11 +573,6 @@ static int __init printk_time_setup(char *str) | |||
573 | 573 | ||
574 | __setup("time", printk_time_setup); | 574 | __setup("time", printk_time_setup); |
575 | 575 | ||
576 | __attribute__((weak)) unsigned long long printk_clock(void) | ||
577 | { | ||
578 | return sched_clock(); | ||
579 | } | ||
580 | |||
581 | /* Check if we have any console registered that can be called early in boot. */ | 576 | /* Check if we have any console registered that can be called early in boot. */ |
582 | static int have_callable_console(void) | 577 | static int have_callable_console(void) |
583 | { | 578 | { |
@@ -628,30 +623,57 @@ asmlinkage int printk(const char *fmt, ...) | |||
628 | /* cpu currently holding logbuf_lock */ | 623 | /* cpu currently holding logbuf_lock */ |
629 | static volatile unsigned int printk_cpu = UINT_MAX; | 624 | static volatile unsigned int printk_cpu = UINT_MAX; |
630 | 625 | ||
626 | const char printk_recursion_bug_msg [] = | ||
627 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
628 | static int printk_recursion_bug; | ||
629 | |||
631 | asmlinkage int vprintk(const char *fmt, va_list args) | 630 | asmlinkage int vprintk(const char *fmt, va_list args) |
632 | { | 631 | { |
632 | static int log_level_unknown = 1; | ||
633 | static char printk_buf[1024]; | ||
634 | |||
633 | unsigned long flags; | 635 | unsigned long flags; |
634 | int printed_len; | 636 | int printed_len = 0; |
637 | int this_cpu; | ||
635 | char *p; | 638 | char *p; |
636 | static char printk_buf[1024]; | ||
637 | static int log_level_unknown = 1; | ||
638 | 639 | ||
639 | boot_delay_msec(); | 640 | boot_delay_msec(); |
640 | 641 | ||
641 | preempt_disable(); | 642 | preempt_disable(); |
642 | if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) | ||
643 | /* If a crash is occurring during printk() on this CPU, | ||
644 | * make sure we can't deadlock */ | ||
645 | zap_locks(); | ||
646 | |||
647 | /* This stops the holder of console_sem just where we want him */ | 643 | /* This stops the holder of console_sem just where we want him */ |
648 | raw_local_irq_save(flags); | 644 | raw_local_irq_save(flags); |
645 | this_cpu = smp_processor_id(); | ||
646 | |||
647 | /* | ||
648 | * Ouch, printk recursed into itself! | ||
649 | */ | ||
650 | if (unlikely(printk_cpu == this_cpu)) { | ||
651 | /* | ||
652 | * If a crash is occurring during printk() on this CPU, | ||
653 | * then try to get the crash message out but make sure | ||
654 | * we can't deadlock. Otherwise just return to avoid the | ||
655 | * recursion and return - but flag the recursion so that | ||
656 | * it can be printed at the next appropriate moment: | ||
657 | */ | ||
658 | if (!oops_in_progress) { | ||
659 | printk_recursion_bug = 1; | ||
660 | goto out_restore_irqs; | ||
661 | } | ||
662 | zap_locks(); | ||
663 | } | ||
664 | |||
649 | lockdep_off(); | 665 | lockdep_off(); |
650 | spin_lock(&logbuf_lock); | 666 | spin_lock(&logbuf_lock); |
651 | printk_cpu = smp_processor_id(); | 667 | printk_cpu = this_cpu; |
652 | 668 | ||
669 | if (printk_recursion_bug) { | ||
670 | printk_recursion_bug = 0; | ||
671 | strcpy(printk_buf, printk_recursion_bug_msg); | ||
672 | printed_len = sizeof(printk_recursion_bug_msg); | ||
673 | } | ||
653 | /* Emit the output into the temporary buffer */ | 674 | /* Emit the output into the temporary buffer */ |
654 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); | 675 | printed_len += vscnprintf(printk_buf + printed_len, |
676 | sizeof(printk_buf), fmt, args); | ||
655 | 677 | ||
656 | /* | 678 | /* |
657 | * Copy the output into log_buf. If the caller didn't provide | 679 | * Copy the output into log_buf. If the caller didn't provide |
@@ -680,7 +702,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
680 | loglev_char = default_message_loglevel | 702 | loglev_char = default_message_loglevel |
681 | + '0'; | 703 | + '0'; |
682 | } | 704 | } |
683 | t = printk_clock(); | 705 | t = 0; |
706 | if (system_state != SYSTEM_BOOTING) | ||
707 | t = ktime_to_ns(ktime_get()); | ||
684 | nanosec_rem = do_div(t, 1000000000); | 708 | nanosec_rem = do_div(t, 1000000000); |
685 | tlen = sprintf(tbuf, | 709 | tlen = sprintf(tbuf, |
686 | "<%c>[%5lu.%06lu] ", | 710 | "<%c>[%5lu.%06lu] ", |
@@ -744,6 +768,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
744 | printk_cpu = UINT_MAX; | 768 | printk_cpu = UINT_MAX; |
745 | spin_unlock(&logbuf_lock); | 769 | spin_unlock(&logbuf_lock); |
746 | lockdep_on(); | 770 | lockdep_on(); |
771 | out_restore_irqs: | ||
747 | raw_local_irq_restore(flags); | 772 | raw_local_irq_restore(flags); |
748 | } | 773 | } |
749 | 774 | ||
diff --git a/kernel/profile.c b/kernel/profile.c index 5e95330e5120..e64c2da11c0f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); | |||
52 | static DEFINE_MUTEX(profile_flip_mutex); | 52 | static DEFINE_MUTEX(profile_flip_mutex); |
53 | #endif /* CONFIG_SMP */ | 53 | #endif /* CONFIG_SMP */ |
54 | 54 | ||
55 | static int __init profile_setup(char * str) | 55 | static int __init profile_setup(char *str) |
56 | { | 56 | { |
57 | static char __initdata schedstr[] = "schedule"; | 57 | static char __initdata schedstr[] = "schedule"; |
58 | static char __initdata sleepstr[] = "sleep"; | 58 | static char __initdata sleepstr[] = "sleep"; |
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup); | |||
104 | 104 | ||
105 | void __init profile_init(void) | 105 | void __init profile_init(void) |
106 | { | 106 | { |
107 | if (!prof_on) | 107 | if (!prof_on) |
108 | return; | 108 | return; |
109 | 109 | ||
110 | /* only text is profiled */ | 110 | /* only text is profiled */ |
111 | prof_len = (_etext - _stext) >> prof_shift; | 111 | prof_len = (_etext - _stext) >> prof_shift; |
112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); | 112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); |
113 | } | 113 | } |
114 | 114 | ||
115 | /* Profile event notifications */ | 115 | /* Profile event notifications */ |
116 | 116 | ||
117 | #ifdef CONFIG_PROFILING | 117 | #ifdef CONFIG_PROFILING |
118 | 118 | ||
119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); | 119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); |
120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); | 120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); |
121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); | 121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); |
122 | 122 | ||
123 | void profile_task_exit(struct task_struct * task) | 123 | void profile_task_exit(struct task_struct *task) |
124 | { | 124 | { |
125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); | 125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); |
126 | } | 126 | } |
127 | 127 | ||
128 | int profile_handoff_task(struct task_struct * task) | 128 | int profile_handoff_task(struct task_struct *task) |
129 | { | 129 | { |
130 | int ret; | 130 | int ret; |
131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); | 131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); |
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr) | |||
137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); | 137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); |
138 | } | 138 | } |
139 | 139 | ||
140 | int task_handoff_register(struct notifier_block * n) | 140 | int task_handoff_register(struct notifier_block *n) |
141 | { | 141 | { |
142 | return atomic_notifier_chain_register(&task_free_notifier, n); | 142 | return atomic_notifier_chain_register(&task_free_notifier, n); |
143 | } | 143 | } |
144 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
144 | 145 | ||
145 | int task_handoff_unregister(struct notifier_block * n) | 146 | int task_handoff_unregister(struct notifier_block *n) |
146 | { | 147 | { |
147 | return atomic_notifier_chain_unregister(&task_free_notifier, n); | 148 | return atomic_notifier_chain_unregister(&task_free_notifier, n); |
148 | } | 149 | } |
150 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
149 | 151 | ||
150 | int profile_event_register(enum profile_type type, struct notifier_block * n) | 152 | int profile_event_register(enum profile_type type, struct notifier_block *n) |
151 | { | 153 | { |
152 | int err = -EINVAL; | 154 | int err = -EINVAL; |
153 | 155 | ||
154 | switch (type) { | 156 | switch (type) { |
155 | case PROFILE_TASK_EXIT: | 157 | case PROFILE_TASK_EXIT: |
156 | err = blocking_notifier_chain_register( | 158 | err = blocking_notifier_chain_register( |
157 | &task_exit_notifier, n); | 159 | &task_exit_notifier, n); |
158 | break; | 160 | break; |
159 | case PROFILE_MUNMAP: | 161 | case PROFILE_MUNMAP: |
160 | err = blocking_notifier_chain_register( | 162 | err = blocking_notifier_chain_register( |
161 | &munmap_notifier, n); | 163 | &munmap_notifier, n); |
162 | break; | 164 | break; |
163 | } | 165 | } |
164 | 166 | ||
165 | return err; | 167 | return err; |
166 | } | 168 | } |
169 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
167 | 170 | ||
168 | 171 | int profile_event_unregister(enum profile_type type, struct notifier_block *n) | |
169 | int profile_event_unregister(enum profile_type type, struct notifier_block * n) | ||
170 | { | 172 | { |
171 | int err = -EINVAL; | 173 | int err = -EINVAL; |
172 | 174 | ||
173 | switch (type) { | 175 | switch (type) { |
174 | case PROFILE_TASK_EXIT: | 176 | case PROFILE_TASK_EXIT: |
175 | err = blocking_notifier_chain_unregister( | 177 | err = blocking_notifier_chain_unregister( |
176 | &task_exit_notifier, n); | 178 | &task_exit_notifier, n); |
177 | break; | 179 | break; |
178 | case PROFILE_MUNMAP: | 180 | case PROFILE_MUNMAP: |
179 | err = blocking_notifier_chain_unregister( | 181 | err = blocking_notifier_chain_unregister( |
180 | &munmap_notifier, n); | 182 | &munmap_notifier, n); |
181 | break; | 183 | break; |
182 | } | 184 | } |
183 | 185 | ||
184 | return err; | 186 | return err; |
185 | } | 187 | } |
188 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
186 | 189 | ||
187 | int register_timer_hook(int (*hook)(struct pt_regs *)) | 190 | int register_timer_hook(int (*hook)(struct pt_regs *)) |
188 | { | 191 | { |
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *)) | |||
191 | timer_hook = hook; | 194 | timer_hook = hook; |
192 | return 0; | 195 | return 0; |
193 | } | 196 | } |
197 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
194 | 198 | ||
195 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) | 199 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) |
196 | { | 200 | { |
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *)) | |||
199 | /* make sure all CPUs see the NULL hook */ | 203 | /* make sure all CPUs see the NULL hook */ |
200 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ | 204 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ |
201 | } | 205 | } |
202 | |||
203 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
204 | EXPORT_SYMBOL_GPL(unregister_timer_hook); | 206 | EXPORT_SYMBOL_GPL(unregister_timer_hook); |
205 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
206 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
207 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
208 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
209 | 207 | ||
210 | #endif /* CONFIG_PROFILING */ | 208 | #endif /* CONFIG_PROFILING */ |
211 | 209 | ||
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
366 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | 364 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); |
367 | } | 365 | } |
368 | break; | 366 | break; |
369 | out_free: | 367 | out_free: |
370 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | 368 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); |
371 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | 369 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; |
372 | __free_page(page); | 370 | __free_page(page); |
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits) | |||
409 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); | 407 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
410 | } | 408 | } |
411 | #endif /* !CONFIG_SMP */ | 409 | #endif /* !CONFIG_SMP */ |
412 | |||
413 | EXPORT_SYMBOL_GPL(profile_hits); | 410 | EXPORT_SYMBOL_GPL(profile_hits); |
414 | 411 | ||
415 | void profile_tick(int type) | 412 | void profile_tick(int type) |
@@ -427,7 +424,7 @@ void profile_tick(int type) | |||
427 | #include <asm/uaccess.h> | 424 | #include <asm/uaccess.h> |
428 | #include <asm/ptrace.h> | 425 | #include <asm/ptrace.h> |
429 | 426 | ||
430 | static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | 427 | static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, |
431 | int count, int *eof, void *data) | 428 | int count, int *eof, void *data) |
432 | { | 429 | { |
433 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); | 430 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); |
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | |||
437 | return len; | 434 | return len; |
438 | } | 435 | } |
439 | 436 | ||
440 | static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, | 437 | static int prof_cpu_mask_write_proc(struct file *file, |
441 | unsigned long count, void *data) | 438 | const char __user *buffer, unsigned long count, void *data) |
442 | { | 439 | { |
443 | cpumask_t *mask = (cpumask_t *)data; | 440 | cpumask_t *mask = (cpumask_t *)data; |
444 | unsigned long full_count = count, err; | 441 | unsigned long full_count = count, err; |
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) | |||
457 | struct proc_dir_entry *entry; | 454 | struct proc_dir_entry *entry; |
458 | 455 | ||
459 | /* create /proc/irq/prof_cpu_mask */ | 456 | /* create /proc/irq/prof_cpu_mask */ |
460 | if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) | 457 | entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); |
458 | if (!entry) | ||
461 | return; | 459 | return; |
462 | entry->data = (void *)&prof_cpu_mask; | 460 | entry->data = (void *)&prof_cpu_mask; |
463 | entry->read_proc = prof_cpu_mask_read_proc; | 461 | entry->read_proc = prof_cpu_mask_read_proc; |
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
475 | { | 473 | { |
476 | unsigned long p = *ppos; | 474 | unsigned long p = *ppos; |
477 | ssize_t read; | 475 | ssize_t read; |
478 | char * pnt; | 476 | char *pnt; |
479 | unsigned int sample_step = 1 << prof_shift; | 477 | unsigned int sample_step = 1 << prof_shift; |
480 | 478 | ||
481 | profile_flip_buffers(); | 479 | profile_flip_buffers(); |
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
486 | read = 0; | 484 | read = 0; |
487 | 485 | ||
488 | while (p < sizeof(unsigned int) && count > 0) { | 486 | while (p < sizeof(unsigned int) && count > 0) { |
489 | if (put_user(*((char *)(&sample_step)+p),buf)) | 487 | if (put_user(*((char *)(&sample_step)+p), buf)) |
490 | return -EFAULT; | 488 | return -EFAULT; |
491 | buf++; p++; count--; read++; | 489 | buf++; p++; count--; read++; |
492 | } | 490 | } |
493 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); | 491 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); |
494 | if (copy_to_user(buf,(void *)pnt,count)) | 492 | if (copy_to_user(buf, (void *)pnt, count)) |
495 | return -EFAULT; | 493 | return -EFAULT; |
496 | read += count; | 494 | read += count; |
497 | *ppos += read; | 495 | *ppos += read; |
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
508 | size_t count, loff_t *ppos) | 506 | size_t count, loff_t *ppos) |
509 | { | 507 | { |
510 | #ifdef CONFIG_SMP | 508 | #ifdef CONFIG_SMP |
511 | extern int setup_profiling_timer (unsigned int multiplier); | 509 | extern int setup_profiling_timer(unsigned int multiplier); |
512 | 510 | ||
513 | if (count == sizeof(int)) { | 511 | if (count == sizeof(int)) { |
514 | unsigned int multiplier; | 512 | unsigned int multiplier; |
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void) | |||
591 | return 0; | 589 | return 0; |
592 | if (create_hash_tables()) | 590 | if (create_hash_tables()) |
593 | return -1; | 591 | return -1; |
594 | if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) | 592 | entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); |
593 | if (!entry) | ||
595 | return 0; | 594 | return 0; |
596 | entry->proc_fops = &proc_profile_operations; | 595 | entry->proc_fops = &proc_profile_operations; |
597 | entry->size = (1+prof_len) * sizeof(atomic_t); | 596 | entry->size = (1+prof_len) * sizeof(atomic_t); |
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c new file mode 100644 index 000000000000..f4ffbd0f306f --- /dev/null +++ b/kernel/rcuclassic.c | |||
@@ -0,0 +1,575 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2001 | ||
19 | * | ||
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | ||
21 | * Manfred Spraul <manfred@colorfullife.com> | ||
22 | * | ||
23 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
25 | * Papers: | ||
26 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
27 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
28 | * | ||
29 | * For detailed explanation of Read-Copy Update mechanism see - | ||
30 | * Documentation/RCU | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/types.h> | ||
34 | #include <linux/kernel.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/spinlock.h> | ||
37 | #include <linux/smp.h> | ||
38 | #include <linux/rcupdate.h> | ||
39 | #include <linux/interrupt.h> | ||
40 | #include <linux/sched.h> | ||
41 | #include <asm/atomic.h> | ||
42 | #include <linux/bitops.h> | ||
43 | #include <linux/module.h> | ||
44 | #include <linux/completion.h> | ||
45 | #include <linux/moduleparam.h> | ||
46 | #include <linux/percpu.h> | ||
47 | #include <linux/notifier.h> | ||
48 | #include <linux/cpu.h> | ||
49 | #include <linux/mutex.h> | ||
50 | |||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
52 | static struct lock_class_key rcu_lock_key; | ||
53 | struct lockdep_map rcu_lock_map = | ||
54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
55 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
56 | #endif | ||
57 | |||
58 | |||
59 | /* Definition for rcupdate control block. */ | ||
60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
61 | .cur = -300, | ||
62 | .completed = -300, | ||
63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
64 | .cpumask = CPU_MASK_NONE, | ||
65 | }; | ||
66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
67 | .cur = -300, | ||
68 | .completed = -300, | ||
69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
70 | .cpumask = CPU_MASK_NONE, | ||
71 | }; | ||
72 | |||
73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | ||
74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
75 | |||
76 | static int blimit = 10; | ||
77 | static int qhimark = 10000; | ||
78 | static int qlowmark = 100; | ||
79 | |||
80 | #ifdef CONFIG_SMP | ||
81 | static void force_quiescent_state(struct rcu_data *rdp, | ||
82 | struct rcu_ctrlblk *rcp) | ||
83 | { | ||
84 | int cpu; | ||
85 | cpumask_t cpumask; | ||
86 | set_need_resched(); | ||
87 | if (unlikely(!rcp->signaled)) { | ||
88 | rcp->signaled = 1; | ||
89 | /* | ||
90 | * Don't send IPI to itself. With irqs disabled, | ||
91 | * rdp->cpu is the current cpu. | ||
92 | */ | ||
93 | cpumask = rcp->cpumask; | ||
94 | cpu_clear(rdp->cpu, cpumask); | ||
95 | for_each_cpu_mask(cpu, cpumask) | ||
96 | smp_send_reschedule(cpu); | ||
97 | } | ||
98 | } | ||
99 | #else | ||
100 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
101 | struct rcu_ctrlblk *rcp) | ||
102 | { | ||
103 | set_need_resched(); | ||
104 | } | ||
105 | #endif | ||
106 | |||
107 | /** | ||
108 | * call_rcu - Queue an RCU callback for invocation after a grace period. | ||
109 | * @head: structure to be used for queueing the RCU updates. | ||
110 | * @func: actual update function to be invoked after the grace period | ||
111 | * | ||
112 | * The update function will be invoked some time after a full grace | ||
113 | * period elapses, in other words after all currently executing RCU | ||
114 | * read-side critical sections have completed. RCU read-side critical | ||
115 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
116 | * and may be nested. | ||
117 | */ | ||
118 | void call_rcu(struct rcu_head *head, | ||
119 | void (*func)(struct rcu_head *rcu)) | ||
120 | { | ||
121 | unsigned long flags; | ||
122 | struct rcu_data *rdp; | ||
123 | |||
124 | head->func = func; | ||
125 | head->next = NULL; | ||
126 | local_irq_save(flags); | ||
127 | rdp = &__get_cpu_var(rcu_data); | ||
128 | *rdp->nxttail = head; | ||
129 | rdp->nxttail = &head->next; | ||
130 | if (unlikely(++rdp->qlen > qhimark)) { | ||
131 | rdp->blimit = INT_MAX; | ||
132 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
133 | } | ||
134 | local_irq_restore(flags); | ||
135 | } | ||
136 | EXPORT_SYMBOL_GPL(call_rcu); | ||
137 | |||
138 | /** | ||
139 | * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period. | ||
140 | * @head: structure to be used for queueing the RCU updates. | ||
141 | * @func: actual update function to be invoked after the grace period | ||
142 | * | ||
143 | * The update function will be invoked some time after a full grace | ||
144 | * period elapses, in other words after all currently executing RCU | ||
145 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
146 | * that the read-side critical sections end on completion of a softirq | ||
147 | * handler. This means that read-side critical sections in process | ||
148 | * context must not be interrupted by softirqs. This interface is to be | ||
149 | * used when most of the read-side critical sections are in softirq context. | ||
150 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
151 | * rcu_read_unlock(), if in interrupt context or rcu_read_lock_bh() | ||
152 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
153 | */ | ||
154 | void call_rcu_bh(struct rcu_head *head, | ||
155 | void (*func)(struct rcu_head *rcu)) | ||
156 | { | ||
157 | unsigned long flags; | ||
158 | struct rcu_data *rdp; | ||
159 | |||
160 | head->func = func; | ||
161 | head->next = NULL; | ||
162 | local_irq_save(flags); | ||
163 | rdp = &__get_cpu_var(rcu_bh_data); | ||
164 | *rdp->nxttail = head; | ||
165 | rdp->nxttail = &head->next; | ||
166 | |||
167 | if (unlikely(++rdp->qlen > qhimark)) { | ||
168 | rdp->blimit = INT_MAX; | ||
169 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
170 | } | ||
171 | |||
172 | local_irq_restore(flags); | ||
173 | } | ||
174 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
175 | |||
176 | /* | ||
177 | * Return the number of RCU batches processed thus far. Useful | ||
178 | * for debug and statistics. | ||
179 | */ | ||
180 | long rcu_batches_completed(void) | ||
181 | { | ||
182 | return rcu_ctrlblk.completed; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
185 | |||
186 | /* | ||
187 | * Return the number of RCU bh batches processed thus far. Useful | ||
188 | * for debug and statistics. | ||
189 | */ | ||
190 | long rcu_batches_completed_bh(void) | ||
191 | { | ||
192 | return rcu_bh_ctrlblk.completed; | ||
193 | } | ||
194 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
195 | |||
196 | /* Raises the softirq for processing rcu_callbacks. */ | ||
197 | static inline void raise_rcu_softirq(void) | ||
198 | { | ||
199 | raise_softirq(RCU_SOFTIRQ); | ||
200 | /* | ||
201 | * The smp_mb() here is required to ensure that this cpu's | ||
202 | * __rcu_process_callbacks() reads the most recently updated | ||
203 | * value of rcu->cur. | ||
204 | */ | ||
205 | smp_mb(); | ||
206 | } | ||
207 | |||
208 | /* | ||
209 | * Invoke the completed RCU callbacks. They are expected to be in | ||
210 | * a per-cpu list. | ||
211 | */ | ||
212 | static void rcu_do_batch(struct rcu_data *rdp) | ||
213 | { | ||
214 | struct rcu_head *next, *list; | ||
215 | int count = 0; | ||
216 | |||
217 | list = rdp->donelist; | ||
218 | while (list) { | ||
219 | next = list->next; | ||
220 | prefetch(next); | ||
221 | list->func(list); | ||
222 | list = next; | ||
223 | if (++count >= rdp->blimit) | ||
224 | break; | ||
225 | } | ||
226 | rdp->donelist = list; | ||
227 | |||
228 | local_irq_disable(); | ||
229 | rdp->qlen -= count; | ||
230 | local_irq_enable(); | ||
231 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
232 | rdp->blimit = blimit; | ||
233 | |||
234 | if (!rdp->donelist) | ||
235 | rdp->donetail = &rdp->donelist; | ||
236 | else | ||
237 | raise_rcu_softirq(); | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * Grace period handling: | ||
242 | * The grace period handling consists of two steps: | ||
243 | * - A new grace period is started. | ||
244 | * This is done by rcu_start_batch. The start is not broadcast to | ||
245 | * all cpus, they must pick this up by comparing rcp->cur with | ||
246 | * rdp->quiescbatch. All cpus are recorded in the | ||
247 | * rcu_ctrlblk.cpumask bitmap. | ||
248 | * - All cpus must go through a quiescent state. | ||
249 | * Since the start of the grace period is not broadcast, at least two | ||
250 | * calls to rcu_check_quiescent_state are required: | ||
251 | * The first call just notices that a new grace period is running. The | ||
252 | * following calls check if there was a quiescent state since the beginning | ||
253 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
254 | * the bitmap is empty, then the grace period is completed. | ||
255 | * rcu_check_quiescent_state calls rcu_start_batch (via cpu_quiet) to start the next grace | ||
256 | * period (if necessary). | ||
257 | */ | ||
258 | /* | ||
259 | * Register a new batch of callbacks, and start it up if there is currently no | ||
260 | * active batch and the batch to be registered has not already occurred. | ||
261 | * Caller must hold rcu_ctrlblk.lock. | ||
262 | */ | ||
263 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
264 | { | ||
265 | if (rcp->next_pending && | ||
266 | rcp->completed == rcp->cur) { | ||
267 | rcp->next_pending = 0; | ||
268 | /* | ||
269 | * next_pending == 0 must be visible in | ||
270 | * __rcu_process_callbacks() before it can see new value of cur. | ||
271 | */ | ||
272 | smp_wmb(); | ||
273 | rcp->cur++; | ||
274 | |||
275 | /* | ||
276 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
277 | * barrier. Otherwise it can cause tickless idle CPUs to be | ||
278 | * included in rcp->cpumask, which will extend grace periods | ||
279 | * unnecessarily. | ||
280 | */ | ||
281 | smp_mb(); | ||
282 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
283 | |||
284 | rcp->signaled = 0; | ||
285 | } | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * cpu went through a quiescent state since the beginning of the grace period. | ||
290 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
291 | * cpu. Start another grace period if someone has further entries pending | ||
292 | */ | ||
293 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
294 | { | ||
295 | cpu_clear(cpu, rcp->cpumask); | ||
296 | if (cpus_empty(rcp->cpumask)) { | ||
297 | /* batch completed ! */ | ||
298 | rcp->completed = rcp->cur; | ||
299 | rcu_start_batch(rcp); | ||
300 | } | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Check if the cpu has gone through a quiescent state (say context | ||
305 | * switch). If so and if it already hasn't done so in this RCU | ||
306 | * quiescent cycle, then indicate that it has done so. | ||
307 | */ | ||
308 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
309 | struct rcu_data *rdp) | ||
310 | { | ||
311 | if (rdp->quiescbatch != rcp->cur) { | ||
312 | /* start new grace period: */ | ||
313 | rdp->qs_pending = 1; | ||
314 | rdp->passed_quiesc = 0; | ||
315 | rdp->quiescbatch = rcp->cur; | ||
316 | return; | ||
317 | } | ||
318 | |||
319 | /* Grace period already completed for this cpu? | ||
320 | * qs_pending is checked instead of the actual bitmap to avoid | ||
321 | * cacheline thrashing. | ||
322 | */ | ||
323 | if (!rdp->qs_pending) | ||
324 | return; | ||
325 | |||
326 | /* | ||
327 | * Was there a quiescent state since the beginning of the grace | ||
328 | * period? If no, then exit and wait for the next call. | ||
329 | */ | ||
330 | if (!rdp->passed_quiesc) | ||
331 | return; | ||
332 | rdp->qs_pending = 0; | ||
333 | |||
334 | spin_lock(&rcp->lock); | ||
335 | /* | ||
336 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
337 | * during cpu startup. Ignore the quiescent state. | ||
338 | */ | ||
339 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
340 | cpu_quiet(rdp->cpu, rcp); | ||
341 | |||
342 | spin_unlock(&rcp->lock); | ||
343 | } | ||
344 | |||
345 | |||
346 | #ifdef CONFIG_HOTPLUG_CPU | ||
347 | |||
348 | /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing | ||
349 | * locking requirements, the list it's pulling from has to belong to a cpu | ||
350 | * which is dead and hence not processing interrupts. | ||
351 | */ | ||
352 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
353 | struct rcu_head **tail) | ||
354 | { | ||
355 | local_irq_disable(); | ||
356 | *this_rdp->nxttail = list; | ||
357 | if (list) | ||
358 | this_rdp->nxttail = tail; | ||
359 | local_irq_enable(); | ||
360 | } | ||
361 | |||
362 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
363 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
364 | { | ||
365 | /* if the cpu going offline owns the grace period | ||
366 | * we can block indefinitely waiting for it, so flush | ||
367 | * it here | ||
368 | */ | ||
369 | spin_lock_bh(&rcp->lock); | ||
370 | if (rcp->cur != rcp->completed) | ||
371 | cpu_quiet(rdp->cpu, rcp); | ||
372 | spin_unlock_bh(&rcp->lock); | ||
373 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
374 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
375 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
376 | } | ||
377 | |||
378 | static void rcu_offline_cpu(int cpu) | ||
379 | { | ||
380 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
381 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
382 | |||
383 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
384 | &per_cpu(rcu_data, cpu)); | ||
385 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
386 | &per_cpu(rcu_bh_data, cpu)); | ||
387 | put_cpu_var(rcu_data); | ||
388 | put_cpu_var(rcu_bh_data); | ||
389 | } | ||
390 | |||
391 | #else | ||
392 | |||
393 | static void rcu_offline_cpu(int cpu) | ||
394 | { | ||
395 | } | ||
396 | |||
397 | #endif | ||
398 | |||
399 | /* | ||
400 | * This does the RCU processing work from softirq context. | ||
401 | */ | ||
402 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
403 | struct rcu_data *rdp) | ||
404 | { | ||
405 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
406 | *rdp->donetail = rdp->curlist; | ||
407 | rdp->donetail = rdp->curtail; | ||
408 | rdp->curlist = NULL; | ||
409 | rdp->curtail = &rdp->curlist; | ||
410 | } | ||
411 | |||
412 | if (rdp->nxtlist && !rdp->curlist) { | ||
413 | local_irq_disable(); | ||
414 | rdp->curlist = rdp->nxtlist; | ||
415 | rdp->curtail = rdp->nxttail; | ||
416 | rdp->nxtlist = NULL; | ||
417 | rdp->nxttail = &rdp->nxtlist; | ||
418 | local_irq_enable(); | ||
419 | |||
420 | /* | ||
421 | * start the next batch of callbacks | ||
422 | */ | ||
423 | |||
424 | /* determine batch number */ | ||
425 | rdp->batch = rcp->cur + 1; | ||
426 | /* see the comment and corresponding wmb() in | ||
427 | * the rcu_start_batch() | ||
428 | */ | ||
429 | smp_rmb(); | ||
430 | |||
431 | if (!rcp->next_pending) { | ||
432 | /* and start it/schedule start if it's a new batch */ | ||
433 | spin_lock(&rcp->lock); | ||
434 | rcp->next_pending = 1; | ||
435 | rcu_start_batch(rcp); | ||
436 | spin_unlock(&rcp->lock); | ||
437 | } | ||
438 | } | ||
439 | |||
440 | rcu_check_quiescent_state(rcp, rdp); | ||
441 | if (rdp->donelist) | ||
442 | rcu_do_batch(rdp); | ||
443 | } | ||
444 | |||
445 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
446 | { | ||
447 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
448 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
449 | } | ||
450 | |||
451 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
452 | { | ||
453 | /* This cpu has pending rcu entries and the grace period | ||
454 | * for them has completed. | ||
455 | */ | ||
456 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
457 | return 1; | ||
458 | |||
459 | /* This cpu has no pending entries, but there are new entries */ | ||
460 | if (!rdp->curlist && rdp->nxtlist) | ||
461 | return 1; | ||
462 | |||
463 | /* This cpu has finished callbacks to invoke */ | ||
464 | if (rdp->donelist) | ||
465 | return 1; | ||
466 | |||
467 | /* The rcu core waits for a quiescent state from the cpu */ | ||
468 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
469 | return 1; | ||
470 | |||
471 | /* nothing to do */ | ||
472 | return 0; | ||
473 | } | ||
474 | |||
475 | /* | ||
476 | * Check to see if there is any immediate RCU-related work to be done | ||
477 | * by the current CPU, returning 1 if so. This function is part of the | ||
478 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
479 | */ | ||
480 | int rcu_pending(int cpu) | ||
481 | { | ||
482 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
483 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Check to see if any future RCU-related work will need to be done | ||
488 | * by the current CPU, even if none need be done immediately, returning | ||
489 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
490 | * an exported member of the RCU API. | ||
491 | */ | ||
492 | int rcu_needs_cpu(int cpu) | ||
493 | { | ||
494 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
495 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
496 | |||
497 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
498 | } | ||
499 | |||
500 | void rcu_check_callbacks(int cpu, int user) | ||
501 | { | ||
502 | if (user || | ||
503 | (idle_cpu(cpu) && !in_softirq() && | ||
504 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
505 | rcu_qsctr_inc(cpu); | ||
506 | rcu_bh_qsctr_inc(cpu); | ||
507 | } else if (!in_softirq()) | ||
508 | rcu_bh_qsctr_inc(cpu); | ||
509 | raise_rcu_softirq(); | ||
510 | } | ||
511 | |||
512 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
513 | struct rcu_data *rdp) | ||
514 | { | ||
515 | memset(rdp, 0, sizeof(*rdp)); | ||
516 | rdp->curtail = &rdp->curlist; | ||
517 | rdp->nxttail = &rdp->nxtlist; | ||
518 | rdp->donetail = &rdp->donelist; | ||
519 | rdp->quiescbatch = rcp->completed; | ||
520 | rdp->qs_pending = 0; | ||
521 | rdp->cpu = cpu; | ||
522 | rdp->blimit = blimit; | ||
523 | } | ||
524 | |||
525 | static void __cpuinit rcu_online_cpu(int cpu) | ||
526 | { | ||
527 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
528 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
529 | |||
530 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
531 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
532 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
533 | } | ||
534 | |||
535 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
536 | unsigned long action, void *hcpu) | ||
537 | { | ||
538 | long cpu = (long)hcpu; | ||
539 | |||
540 | switch (action) { | ||
541 | case CPU_UP_PREPARE: | ||
542 | case CPU_UP_PREPARE_FROZEN: | ||
543 | rcu_online_cpu(cpu); | ||
544 | break; | ||
545 | case CPU_DEAD: | ||
546 | case CPU_DEAD_FROZEN: | ||
547 | rcu_offline_cpu(cpu); | ||
548 | break; | ||
549 | default: | ||
550 | break; | ||
551 | } | ||
552 | return NOTIFY_OK; | ||
553 | } | ||
554 | |||
555 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
556 | .notifier_call = rcu_cpu_notify, | ||
557 | }; | ||
558 | |||
559 | /* | ||
560 | * Initializes rcu mechanism. Assumed to be called early. | ||
561 | * That is before local timer(SMP) or jiffie timer (uniproc) is setup. | ||
562 | * Note that rcu_qsctr and friends are implicitly | ||
563 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
564 | */ | ||
565 | void __init __rcu_init(void) | ||
566 | { | ||
567 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | ||
568 | (void *)(long)smp_processor_id()); | ||
569 | /* Register notifier for non-boot CPUs */ | ||
570 | register_cpu_notifier(&rcu_nb); | ||
571 | } | ||
572 | |||
573 | module_param(blimit, int, 0); | ||
574 | module_param(qhimark, int, 0); | ||
575 | module_param(qlowmark, int, 0); | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f2c1a04e9b18..760dfc233a00 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -15,7 +15,7 @@ | |||
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2001 | 18 | * Copyright IBM Corporation, 2001 |
19 | * | 19 | * |
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> |
21 | * Manfred Spraul <manfred@colorfullife.com> | 21 | * Manfred Spraul <manfred@colorfullife.com> |
@@ -35,165 +35,57 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
38 | #include <linux/rcupdate.h> | ||
39 | #include <linux/interrupt.h> | 38 | #include <linux/interrupt.h> |
40 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
41 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
42 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
43 | #include <linux/module.h> | ||
44 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
45 | #include <linux/moduleparam.h> | ||
46 | #include <linux/percpu.h> | 43 | #include <linux/percpu.h> |
47 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
48 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
49 | #include <linux/mutex.h> | 46 | #include <linux/mutex.h> |
47 | #include <linux/module.h> | ||
50 | 48 | ||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 49 | struct rcu_synchronize { |
52 | static struct lock_class_key rcu_lock_key; | 50 | struct rcu_head head; |
53 | struct lockdep_map rcu_lock_map = | 51 | struct completion completion; |
54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
55 | |||
56 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
57 | #endif | ||
58 | |||
59 | /* Definition for rcupdate control block. */ | ||
60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
61 | .cur = -300, | ||
62 | .completed = -300, | ||
63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
64 | .cpumask = CPU_MASK_NONE, | ||
65 | }; | ||
66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
67 | .cur = -300, | ||
68 | .completed = -300, | ||
69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
70 | .cpumask = CPU_MASK_NONE, | ||
71 | }; | 52 | }; |
72 | 53 | ||
73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | 54 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
75 | |||
76 | /* Fake initialization required by compiler */ | ||
77 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | ||
78 | static int blimit = 10; | ||
79 | static int qhimark = 10000; | ||
80 | static int qlowmark = 100; | ||
81 | |||
82 | static atomic_t rcu_barrier_cpu_count; | 55 | static atomic_t rcu_barrier_cpu_count; |
83 | static DEFINE_MUTEX(rcu_barrier_mutex); | 56 | static DEFINE_MUTEX(rcu_barrier_mutex); |
84 | static struct completion rcu_barrier_completion; | 57 | static struct completion rcu_barrier_completion; |
85 | 58 | ||
86 | #ifdef CONFIG_SMP | 59 | /* Because of FASTCALL declaration of complete, we use this wrapper */ |
87 | static void force_quiescent_state(struct rcu_data *rdp, | 60 | static void wakeme_after_rcu(struct rcu_head *head) |
88 | struct rcu_ctrlblk *rcp) | ||
89 | { | ||
90 | int cpu; | ||
91 | cpumask_t cpumask; | ||
92 | set_need_resched(); | ||
93 | if (unlikely(!rcp->signaled)) { | ||
94 | rcp->signaled = 1; | ||
95 | /* | ||
96 | * Don't send IPI to itself. With irqs disabled, | ||
97 | * rdp->cpu is the current cpu. | ||
98 | */ | ||
99 | cpumask = rcp->cpumask; | ||
100 | cpu_clear(rdp->cpu, cpumask); | ||
101 | for_each_cpu_mask(cpu, cpumask) | ||
102 | smp_send_reschedule(cpu); | ||
103 | } | ||
104 | } | ||
105 | #else | ||
106 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
107 | struct rcu_ctrlblk *rcp) | ||
108 | { | 61 | { |
109 | set_need_resched(); | 62 | struct rcu_synchronize *rcu; |
63 | |||
64 | rcu = container_of(head, struct rcu_synchronize, head); | ||
65 | complete(&rcu->completion); | ||
110 | } | 66 | } |
111 | #endif | ||
112 | 67 | ||
113 | /** | 68 | /** |
114 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 69 | * synchronize_rcu - wait until a grace period has elapsed. |
115 | * @head: structure to be used for queueing the RCU updates. | ||
116 | * @func: actual update function to be invoked after the grace period | ||
117 | * | 70 | * |
118 | * The update function will be invoked some time after a full grace | 71 | * Control will return to the caller some time after a full grace |
119 | * period elapses, in other words after all currently executing RCU | 72 | * period has elapsed, in other words after all currently executing RCU |
120 | * read-side critical sections have completed. RCU read-side critical | 73 | * read-side critical sections have completed. RCU read-side critical |
121 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 74 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), |
122 | * and may be nested. | 75 | * and may be nested. |
123 | */ | 76 | */ |
124 | void fastcall call_rcu(struct rcu_head *head, | 77 | void synchronize_rcu(void) |
125 | void (*func)(struct rcu_head *rcu)) | ||
126 | { | ||
127 | unsigned long flags; | ||
128 | struct rcu_data *rdp; | ||
129 | |||
130 | head->func = func; | ||
131 | head->next = NULL; | ||
132 | local_irq_save(flags); | ||
133 | rdp = &__get_cpu_var(rcu_data); | ||
134 | *rdp->nxttail = head; | ||
135 | rdp->nxttail = &head->next; | ||
136 | if (unlikely(++rdp->qlen > qhimark)) { | ||
137 | rdp->blimit = INT_MAX; | ||
138 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
139 | } | ||
140 | local_irq_restore(flags); | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | ||
145 | * @head: structure to be used for queueing the RCU updates. | ||
146 | * @func: actual update function to be invoked after the grace period | ||
147 | * | ||
148 | * The update function will be invoked some time after a full grace | ||
149 | * period elapses, in other words after all currently executing RCU | ||
150 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
151 | * that the read-side critical sections end on completion of a softirq | ||
152 | * handler. This means that read-side critical sections in process | ||
153 | * context must not be interrupted by softirqs. This interface is to be | ||
154 | * used when most of the read-side critical sections are in softirq context. | ||
155 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
156 | * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() | ||
157 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
158 | */ | ||
159 | void fastcall call_rcu_bh(struct rcu_head *head, | ||
160 | void (*func)(struct rcu_head *rcu)) | ||
161 | { | 78 | { |
162 | unsigned long flags; | 79 | struct rcu_synchronize rcu; |
163 | struct rcu_data *rdp; | ||
164 | |||
165 | head->func = func; | ||
166 | head->next = NULL; | ||
167 | local_irq_save(flags); | ||
168 | rdp = &__get_cpu_var(rcu_bh_data); | ||
169 | *rdp->nxttail = head; | ||
170 | rdp->nxttail = &head->next; | ||
171 | |||
172 | if (unlikely(++rdp->qlen > qhimark)) { | ||
173 | rdp->blimit = INT_MAX; | ||
174 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
175 | } | ||
176 | |||
177 | local_irq_restore(flags); | ||
178 | } | ||
179 | 80 | ||
180 | /* | 81 | init_completion(&rcu.completion); |
181 | * Return the number of RCU batches processed thus far. Useful | 82 | /* Will wake me after RCU finished */ |
182 | * for debug and statistics. | 83 | call_rcu(&rcu.head, wakeme_after_rcu); |
183 | */ | ||
184 | long rcu_batches_completed(void) | ||
185 | { | ||
186 | return rcu_ctrlblk.completed; | ||
187 | } | ||
188 | 84 | ||
189 | /* | 85 | /* Wait for it */ |
190 | * Return the number of RCU batches processed thus far. Useful | 86 | wait_for_completion(&rcu.completion); |
191 | * for debug and statistics. | ||
192 | */ | ||
193 | long rcu_batches_completed_bh(void) | ||
194 | { | ||
195 | return rcu_bh_ctrlblk.completed; | ||
196 | } | 87 | } |
88 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
197 | 89 | ||
198 | static void rcu_barrier_callback(struct rcu_head *notused) | 90 | static void rcu_barrier_callback(struct rcu_head *notused) |
199 | { | 91 | { |
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused) | |||
207 | static void rcu_barrier_func(void *notused) | 99 | static void rcu_barrier_func(void *notused) |
208 | { | 100 | { |
209 | int cpu = smp_processor_id(); | 101 | int cpu = smp_processor_id(); |
210 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | 102 | struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); |
211 | struct rcu_head *head; | ||
212 | 103 | ||
213 | head = &rdp->barrier; | ||
214 | atomic_inc(&rcu_barrier_cpu_count); | 104 | atomic_inc(&rcu_barrier_cpu_count); |
215 | call_rcu(head, rcu_barrier_callback); | 105 | call_rcu(head, rcu_barrier_callback); |
216 | } | 106 | } |
@@ -225,420 +115,24 @@ void rcu_barrier(void) | |||
225 | mutex_lock(&rcu_barrier_mutex); | 115 | mutex_lock(&rcu_barrier_mutex); |
226 | init_completion(&rcu_barrier_completion); | 116 | init_completion(&rcu_barrier_completion); |
227 | atomic_set(&rcu_barrier_cpu_count, 0); | 117 | atomic_set(&rcu_barrier_cpu_count, 0); |
118 | /* | ||
119 | * The queueing of callbacks in all CPUs must be atomic with | ||
120 | * respect to RCU, otherwise one CPU may queue a callback, | ||
121 | * wait for a grace period, decrement barrier count and call | ||
122 | * complete(), while other CPUs have not yet queued anything. | ||
123 | * So, we need to make sure that grace periods cannot complete | ||
124 | * until all the callbacks are queued. | ||
125 | */ | ||
126 | rcu_read_lock(); | ||
228 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); | 127 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); |
128 | rcu_read_unlock(); | ||
229 | wait_for_completion(&rcu_barrier_completion); | 129 | wait_for_completion(&rcu_barrier_completion); |
230 | mutex_unlock(&rcu_barrier_mutex); | 130 | mutex_unlock(&rcu_barrier_mutex); |
231 | } | 131 | } |
232 | EXPORT_SYMBOL_GPL(rcu_barrier); | 132 | EXPORT_SYMBOL_GPL(rcu_barrier); |
233 | 133 | ||
234 | /* | ||
235 | * Invoke the completed RCU callbacks. They are expected to be in | ||
236 | * a per-cpu list. | ||
237 | */ | ||
238 | static void rcu_do_batch(struct rcu_data *rdp) | ||
239 | { | ||
240 | struct rcu_head *next, *list; | ||
241 | int count = 0; | ||
242 | |||
243 | list = rdp->donelist; | ||
244 | while (list) { | ||
245 | next = list->next; | ||
246 | prefetch(next); | ||
247 | list->func(list); | ||
248 | list = next; | ||
249 | if (++count >= rdp->blimit) | ||
250 | break; | ||
251 | } | ||
252 | rdp->donelist = list; | ||
253 | |||
254 | local_irq_disable(); | ||
255 | rdp->qlen -= count; | ||
256 | local_irq_enable(); | ||
257 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
258 | rdp->blimit = blimit; | ||
259 | |||
260 | if (!rdp->donelist) | ||
261 | rdp->donetail = &rdp->donelist; | ||
262 | else | ||
263 | tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); | ||
264 | } | ||
265 | |||
266 | /* | ||
267 | * Grace period handling: | ||
268 | * The grace period handling consists out of two steps: | ||
269 | * - A new grace period is started. | ||
270 | * This is done by rcu_start_batch. The start is not broadcasted to | ||
271 | * all cpus, they must pick this up by comparing rcp->cur with | ||
272 | * rdp->quiescbatch. All cpus are recorded in the | ||
273 | * rcu_ctrlblk.cpumask bitmap. | ||
274 | * - All cpus must go through a quiescent state. | ||
275 | * Since the start of the grace period is not broadcasted, at least two | ||
276 | * calls to rcu_check_quiescent_state are required: | ||
277 | * The first call just notices that a new grace period is running. The | ||
278 | * following calls check if there was a quiescent state since the beginning | ||
279 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
280 | * the bitmap is empty, then the grace period is completed. | ||
281 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | ||
282 | * period (if necessary). | ||
283 | */ | ||
284 | /* | ||
285 | * Register a new batch of callbacks, and start it up if there is currently no | ||
286 | * active batch and the batch to be registered has not already occurred. | ||
287 | * Caller must hold rcu_ctrlblk.lock. | ||
288 | */ | ||
289 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
290 | { | ||
291 | if (rcp->next_pending && | ||
292 | rcp->completed == rcp->cur) { | ||
293 | rcp->next_pending = 0; | ||
294 | /* | ||
295 | * next_pending == 0 must be visible in | ||
296 | * __rcu_process_callbacks() before it can see new value of cur. | ||
297 | */ | ||
298 | smp_wmb(); | ||
299 | rcp->cur++; | ||
300 | |||
301 | /* | ||
302 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
303 | * Barrier Otherwise it can cause tickless idle CPUs to be | ||
304 | * included in rcp->cpumask, which will extend graceperiods | ||
305 | * unnecessarily. | ||
306 | */ | ||
307 | smp_mb(); | ||
308 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
309 | |||
310 | rcp->signaled = 0; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * cpu went through a quiescent state since the beginning of the grace period. | ||
316 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
317 | * cpu. Start another grace period if someone has further entries pending | ||
318 | */ | ||
319 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
320 | { | ||
321 | cpu_clear(cpu, rcp->cpumask); | ||
322 | if (cpus_empty(rcp->cpumask)) { | ||
323 | /* batch completed ! */ | ||
324 | rcp->completed = rcp->cur; | ||
325 | rcu_start_batch(rcp); | ||
326 | } | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * Check if the cpu has gone through a quiescent state (say context | ||
331 | * switch). If so and if it already hasn't done so in this RCU | ||
332 | * quiescent cycle, then indicate that it has done so. | ||
333 | */ | ||
334 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
335 | struct rcu_data *rdp) | ||
336 | { | ||
337 | if (rdp->quiescbatch != rcp->cur) { | ||
338 | /* start new grace period: */ | ||
339 | rdp->qs_pending = 1; | ||
340 | rdp->passed_quiesc = 0; | ||
341 | rdp->quiescbatch = rcp->cur; | ||
342 | return; | ||
343 | } | ||
344 | |||
345 | /* Grace period already completed for this cpu? | ||
346 | * qs_pending is checked instead of the actual bitmap to avoid | ||
347 | * cacheline trashing. | ||
348 | */ | ||
349 | if (!rdp->qs_pending) | ||
350 | return; | ||
351 | |||
352 | /* | ||
353 | * Was there a quiescent state since the beginning of the grace | ||
354 | * period? If no, then exit and wait for the next call. | ||
355 | */ | ||
356 | if (!rdp->passed_quiesc) | ||
357 | return; | ||
358 | rdp->qs_pending = 0; | ||
359 | |||
360 | spin_lock(&rcp->lock); | ||
361 | /* | ||
362 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
363 | * during cpu startup. Ignore the quiescent state. | ||
364 | */ | ||
365 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
366 | cpu_quiet(rdp->cpu, rcp); | ||
367 | |||
368 | spin_unlock(&rcp->lock); | ||
369 | } | ||
370 | |||
371 | |||
372 | #ifdef CONFIG_HOTPLUG_CPU | ||
373 | |||
374 | /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing | ||
375 | * locking requirements, the list it's pulling from has to belong to a cpu | ||
376 | * which is dead and hence not processing interrupts. | ||
377 | */ | ||
378 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
379 | struct rcu_head **tail) | ||
380 | { | ||
381 | local_irq_disable(); | ||
382 | *this_rdp->nxttail = list; | ||
383 | if (list) | ||
384 | this_rdp->nxttail = tail; | ||
385 | local_irq_enable(); | ||
386 | } | ||
387 | |||
388 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
389 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
390 | { | ||
391 | /* if the cpu going offline owns the grace period | ||
392 | * we can block indefinitely waiting for it, so flush | ||
393 | * it here | ||
394 | */ | ||
395 | spin_lock_bh(&rcp->lock); | ||
396 | if (rcp->cur != rcp->completed) | ||
397 | cpu_quiet(rdp->cpu, rcp); | ||
398 | spin_unlock_bh(&rcp->lock); | ||
399 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
400 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
401 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
402 | } | ||
403 | |||
404 | static void rcu_offline_cpu(int cpu) | ||
405 | { | ||
406 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
407 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
408 | |||
409 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
410 | &per_cpu(rcu_data, cpu)); | ||
411 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
412 | &per_cpu(rcu_bh_data, cpu)); | ||
413 | put_cpu_var(rcu_data); | ||
414 | put_cpu_var(rcu_bh_data); | ||
415 | tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); | ||
416 | } | ||
417 | |||
418 | #else | ||
419 | |||
420 | static void rcu_offline_cpu(int cpu) | ||
421 | { | ||
422 | } | ||
423 | |||
424 | #endif | ||
425 | |||
426 | /* | ||
427 | * This does the RCU processing work from tasklet context. | ||
428 | */ | ||
429 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
430 | struct rcu_data *rdp) | ||
431 | { | ||
432 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
433 | *rdp->donetail = rdp->curlist; | ||
434 | rdp->donetail = rdp->curtail; | ||
435 | rdp->curlist = NULL; | ||
436 | rdp->curtail = &rdp->curlist; | ||
437 | } | ||
438 | |||
439 | if (rdp->nxtlist && !rdp->curlist) { | ||
440 | local_irq_disable(); | ||
441 | rdp->curlist = rdp->nxtlist; | ||
442 | rdp->curtail = rdp->nxttail; | ||
443 | rdp->nxtlist = NULL; | ||
444 | rdp->nxttail = &rdp->nxtlist; | ||
445 | local_irq_enable(); | ||
446 | |||
447 | /* | ||
448 | * start the next batch of callbacks | ||
449 | */ | ||
450 | |||
451 | /* determine batch number */ | ||
452 | rdp->batch = rcp->cur + 1; | ||
453 | /* see the comment and corresponding wmb() in | ||
454 | * the rcu_start_batch() | ||
455 | */ | ||
456 | smp_rmb(); | ||
457 | |||
458 | if (!rcp->next_pending) { | ||
459 | /* and start it/schedule start if it's a new batch */ | ||
460 | spin_lock(&rcp->lock); | ||
461 | rcp->next_pending = 1; | ||
462 | rcu_start_batch(rcp); | ||
463 | spin_unlock(&rcp->lock); | ||
464 | } | ||
465 | } | ||
466 | |||
467 | rcu_check_quiescent_state(rcp, rdp); | ||
468 | if (rdp->donelist) | ||
469 | rcu_do_batch(rdp); | ||
470 | } | ||
471 | |||
472 | static void rcu_process_callbacks(unsigned long unused) | ||
473 | { | ||
474 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
475 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
476 | } | ||
477 | |||
478 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
479 | { | ||
480 | /* This cpu has pending rcu entries and the grace period | ||
481 | * for them has completed. | ||
482 | */ | ||
483 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
484 | return 1; | ||
485 | |||
486 | /* This cpu has no pending entries, but there are new entries */ | ||
487 | if (!rdp->curlist && rdp->nxtlist) | ||
488 | return 1; | ||
489 | |||
490 | /* This cpu has finished callbacks to invoke */ | ||
491 | if (rdp->donelist) | ||
492 | return 1; | ||
493 | |||
494 | /* The rcu core waits for a quiescent state from the cpu */ | ||
495 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
496 | return 1; | ||
497 | |||
498 | /* nothing to do */ | ||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Check to see if there is any immediate RCU-related work to be done | ||
504 | * by the current CPU, returning 1 if so. This function is part of the | ||
505 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
506 | */ | ||
507 | int rcu_pending(int cpu) | ||
508 | { | ||
509 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
510 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * Check to see if any future RCU-related work will need to be done | ||
515 | * by the current CPU, even if none need be done immediately, returning | ||
516 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
517 | * an exported member of the RCU API. | ||
518 | */ | ||
519 | int rcu_needs_cpu(int cpu) | ||
520 | { | ||
521 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
522 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
523 | |||
524 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
525 | } | ||
526 | |||
527 | void rcu_check_callbacks(int cpu, int user) | ||
528 | { | ||
529 | if (user || | ||
530 | (idle_cpu(cpu) && !in_softirq() && | ||
531 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
532 | rcu_qsctr_inc(cpu); | ||
533 | rcu_bh_qsctr_inc(cpu); | ||
534 | } else if (!in_softirq()) | ||
535 | rcu_bh_qsctr_inc(cpu); | ||
536 | tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); | ||
537 | } | ||
538 | |||
539 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
540 | struct rcu_data *rdp) | ||
541 | { | ||
542 | memset(rdp, 0, sizeof(*rdp)); | ||
543 | rdp->curtail = &rdp->curlist; | ||
544 | rdp->nxttail = &rdp->nxtlist; | ||
545 | rdp->donetail = &rdp->donelist; | ||
546 | rdp->quiescbatch = rcp->completed; | ||
547 | rdp->qs_pending = 0; | ||
548 | rdp->cpu = cpu; | ||
549 | rdp->blimit = blimit; | ||
550 | } | ||
551 | |||
552 | static void __cpuinit rcu_online_cpu(int cpu) | ||
553 | { | ||
554 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
555 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
556 | |||
557 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
558 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
559 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | ||
560 | } | ||
561 | |||
562 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
563 | unsigned long action, void *hcpu) | ||
564 | { | ||
565 | long cpu = (long)hcpu; | ||
566 | switch (action) { | ||
567 | case CPU_UP_PREPARE: | ||
568 | case CPU_UP_PREPARE_FROZEN: | ||
569 | rcu_online_cpu(cpu); | ||
570 | break; | ||
571 | case CPU_DEAD: | ||
572 | case CPU_DEAD_FROZEN: | ||
573 | rcu_offline_cpu(cpu); | ||
574 | break; | ||
575 | default: | ||
576 | break; | ||
577 | } | ||
578 | return NOTIFY_OK; | ||
579 | } | ||
580 | |||
581 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
582 | .notifier_call = rcu_cpu_notify, | ||
583 | }; | ||
584 | |||
585 | /* | ||
586 | * Initializes rcu mechanism. Assumed to be called early. | ||
587 | * That is before local timer(SMP) or jiffie timer (uniproc) is setup. | ||
588 | * Note that rcu_qsctr and friends are implicitly | ||
589 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
590 | */ | ||
591 | void __init rcu_init(void) | 134 | void __init rcu_init(void) |
592 | { | 135 | { |
593 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | 136 | __rcu_init(); |
594 | (void *)(long)smp_processor_id()); | ||
595 | /* Register notifier for non-boot CPUs */ | ||
596 | register_cpu_notifier(&rcu_nb); | ||
597 | } | ||
598 | |||
599 | struct rcu_synchronize { | ||
600 | struct rcu_head head; | ||
601 | struct completion completion; | ||
602 | }; | ||
603 | |||
604 | /* Because of FASTCALL declaration of complete, we use this wrapper */ | ||
605 | static void wakeme_after_rcu(struct rcu_head *head) | ||
606 | { | ||
607 | struct rcu_synchronize *rcu; | ||
608 | |||
609 | rcu = container_of(head, struct rcu_synchronize, head); | ||
610 | complete(&rcu->completion); | ||
611 | } | 137 | } |
612 | 138 | ||
613 | /** | ||
614 | * synchronize_rcu - wait until a grace period has elapsed. | ||
615 | * | ||
616 | * Control will return to the caller some time after a full grace | ||
617 | * period has elapsed, in other words after all currently executing RCU | ||
618 | * read-side critical sections have completed. RCU read-side critical | ||
619 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
620 | * and may be nested. | ||
621 | * | ||
622 | * If your read-side code is not protected by rcu_read_lock(), do -not- | ||
623 | * use synchronize_rcu(). | ||
624 | */ | ||
625 | void synchronize_rcu(void) | ||
626 | { | ||
627 | struct rcu_synchronize rcu; | ||
628 | |||
629 | init_completion(&rcu.completion); | ||
630 | /* Will wake me after RCU finished */ | ||
631 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
632 | |||
633 | /* Wait for it */ | ||
634 | wait_for_completion(&rcu.completion); | ||
635 | } | ||
636 | |||
637 | module_param(blimit, int, 0); | ||
638 | module_param(qhimark, int, 0); | ||
639 | module_param(qlowmark, int, 0); | ||
640 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
641 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
642 | EXPORT_SYMBOL_GPL(call_rcu); | ||
643 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
644 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c new file mode 100644 index 000000000000..987cfb7ade89 --- /dev/null +++ b/kernel/rcupreempt.c | |||
@@ -0,0 +1,953 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion, realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar | ||
22 | * for pushing me away from locks and towards counters, and | ||
23 | * to Suparna Bhattacharya for pushing me completely away | ||
24 | * from atomic instructions on the read side. | ||
25 | * | ||
26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
27 | * | ||
28 | * Design Document: http://lwn.net/Articles/253651/ | ||
29 | * | ||
30 | * For detailed explanation of Read-Copy Update mechanism see - | ||
31 | * Documentation/RCU/ *.txt | ||
32 | * | ||
33 | */ | ||
34 | #include <linux/types.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/init.h> | ||
37 | #include <linux/spinlock.h> | ||
38 | #include <linux/smp.h> | ||
39 | #include <linux/rcupdate.h> | ||
40 | #include <linux/interrupt.h> | ||
41 | #include <linux/sched.h> | ||
42 | #include <asm/atomic.h> | ||
43 | #include <linux/bitops.h> | ||
44 | #include <linux/module.h> | ||
45 | #include <linux/completion.h> | ||
46 | #include <linux/moduleparam.h> | ||
47 | #include <linux/percpu.h> | ||
48 | #include <linux/notifier.h> | ||
49 | #include <linux/rcupdate.h> | ||
50 | #include <linux/cpu.h> | ||
51 | #include <linux/random.h> | ||
52 | #include <linux/delay.h> | ||
53 | #include <linux/byteorder/swabb.h> | ||
54 | #include <linux/cpumask.h> | ||
55 | #include <linux/rcupreempt_trace.h> | ||
56 | |||
57 | /* | ||
58 | * Macro that prevents the compiler from reordering accesses, but does | ||
59 | * absolutely -nothing- to prevent CPUs from reordering. This is used | ||
60 | * only to mediate communication between mainline code and hardware | ||
61 | * interrupt and NMI handlers. | ||
62 | */ | ||
63 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) | ||
64 | |||
65 | /* | ||
66 | * PREEMPT_RCU data structures. | ||
67 | */ | ||
68 | |||
69 | /* | ||
70 | * GP_STAGES specifies the number of times the state machine has | ||
71 | * to go through all the rcu_try_flip_states (see below) | ||
72 | * in a single Grace Period. | ||
73 | * | ||
74 | * GP in GP_STAGES stands for Grace Period ;) | ||
75 | */ | ||
76 | #define GP_STAGES 2 | ||
77 | struct rcu_data { | ||
78 | spinlock_t lock; /* Protect rcu_data fields. */ | ||
79 | long completed; /* Number of last completed batch. */ | ||
80 | int waitlistcount; | ||
81 | struct tasklet_struct rcu_tasklet; | ||
82 | struct rcu_head *nextlist; | ||
83 | struct rcu_head **nexttail; | ||
84 | struct rcu_head *waitlist[GP_STAGES]; | ||
85 | struct rcu_head **waittail[GP_STAGES]; | ||
86 | struct rcu_head *donelist; | ||
87 | struct rcu_head **donetail; | ||
88 | long rcu_flipctr[2]; | ||
89 | #ifdef CONFIG_RCU_TRACE | ||
90 | struct rcupreempt_trace trace; | ||
91 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
92 | }; | ||
93 | |||
94 | /* | ||
95 | * States for rcu_try_flip() and friends. | ||
96 | */ | ||
97 | |||
98 | enum rcu_try_flip_states { | ||
99 | |||
100 | /* | ||
101 | * Stay here if nothing is happening. Flip the counter if something | ||
102 | * starts happening. Denoted by "I" | ||
103 | */ | ||
104 | rcu_try_flip_idle_state, | ||
105 | |||
106 | /* | ||
107 | * Wait here for all CPUs to notice that the counter has flipped. This | ||
108 | * prevents the old set of counters from ever being incremented once | ||
109 | * we leave this state, which in turn is necessary because we cannot | ||
110 | * test any individual counter for zero -- we can only check the sum. | ||
111 | * Denoted by "A". | ||
112 | */ | ||
113 | rcu_try_flip_waitack_state, | ||
114 | |||
115 | /* | ||
116 | * Wait here for the sum of the old per-CPU counters to reach zero. | ||
117 | * Denoted by "Z". | ||
118 | */ | ||
119 | rcu_try_flip_waitzero_state, | ||
120 | |||
121 | /* | ||
122 | * Wait here for each of the other CPUs to execute a memory barrier. | ||
123 | * This is necessary to ensure that these other CPUs really have | ||
124 | * completed executing their RCU read-side critical sections, despite | ||
125 | * their CPUs wildly reordering memory. Denoted by "M". | ||
126 | */ | ||
127 | rcu_try_flip_waitmb_state, | ||
128 | }; | ||
129 | |||
130 | struct rcu_ctrlblk { | ||
131 | spinlock_t fliplock; /* Protect state-machine transitions. */ | ||
132 | long completed; /* Number of last completed batch. */ | ||
133 | enum rcu_try_flip_states rcu_try_flip_state; /* The current state of | ||
134 | the rcu state machine */ | ||
135 | }; | ||
136 | |||
137 | static DEFINE_PER_CPU(struct rcu_data, rcu_data); | ||
138 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
139 | .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), | ||
140 | .completed = 0, | ||
141 | .rcu_try_flip_state = rcu_try_flip_idle_state, | ||
142 | }; | ||
143 | |||
144 | |||
145 | #ifdef CONFIG_RCU_TRACE | ||
146 | static char *rcu_try_flip_state_names[] = | ||
147 | { "idle", "waitack", "waitzero", "waitmb" }; | ||
148 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
149 | |||
150 | static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; | ||
151 | |||
152 | /* | ||
153 | * Enum and per-CPU flag to determine when each CPU has seen | ||
154 | * the most recent counter flip. | ||
155 | */ | ||
156 | |||
157 | enum rcu_flip_flag_values { | ||
158 | rcu_flip_seen, /* Steady/initial state, last flip seen. */ | ||
159 | /* Only GP detector can update. */ | ||
160 | rcu_flipped /* Flip just completed, need confirmation. */ | ||
161 | /* Only corresponding CPU can update. */ | ||
162 | }; | ||
163 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) | ||
164 | = rcu_flip_seen; | ||
165 | |||
/*
 * Enum and per-CPU flag to determine when each CPU has executed the
 * needed memory barrier to fence in memory references from its last RCU
 * read-side critical section in the just-completed grace period.
 *
 * rcu_try_flip_waitzero() sets every tracked CPU's flag to rcu_mb_needed
 * once the "last" counters sum to zero; rcu_check_mb() executes the
 * barrier and resets the flag to rcu_mb_done; rcu_try_flip_waitmb()
 * waits until every flag is back at rcu_mb_done.
 */

enum rcu_mb_flag_values {
	rcu_mb_done,		/* Steady/initial state, no mb()s required. */
				/* Only GP detector can update. */
	rcu_mb_needed		/* Old counters summed to zero, need an mb(). */
				/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
								= rcu_mb_done;
180 | |||
/*
 * RCU_DATA_ME: find the current CPU's rcu_data structure.
 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
 *
 * Both evaluate to a pointer to struct rcu_data.  RCU_DATA_ME() resolves
 * "current CPU" at the point of use, so the result is only meaningful
 * while the caller cannot move to another CPU; the callers in this file
 * that modify state through it first disable interrupts.
 */
#define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
#define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))
187 | |||
/*
 * Tracing helpers.  Each one forwards to RCU_TRACE(), passing the address
 * of the ->trace member of the selected rcu_data structure.
 *
 * The wrappers add no semicolon of their own: every call site supplies
 * one, so an invocation reads and parses like an ordinary statement and
 * does not leave a stray empty statement behind (which would break an
 * unbraced if/else around the call).
 */

/*
 * Helper macro for tracing when the appropriate rcu_data is not
 * cached in a local variable, but where the CPU number is so cached.
 */
#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace))

/*
 * Helper macro for tracing when the appropriate rcu_data is not
 * cached in a local variable.
 */
#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace))

/*
 * Helper macro for tracing when the appropriate rcu_data is pointed
 * to by a local variable.
 */
#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace))
205 | |||
206 | /* | ||
207 | * Return the number of RCU batches processed thus far. Useful | ||
208 | * for debug and statistics. | ||
209 | */ | ||
210 | long rcu_batches_completed(void) | ||
211 | { | ||
212 | return rcu_ctrlblk.completed; | ||
213 | } | ||
214 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
215 | |||
216 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
217 | |||
/*
 * Enter an RCU read-side critical section on behalf of the current task.
 *
 * A nested call only bumps ->rcu_read_lock_nesting.  The outermost call
 * additionally increments the running CPU's rcu_flipctr[] element chosen
 * by the low-order bit of rcu_ctrlblk.completed, and remembers that index
 * in ->rcu_flipctr_idx so that the matching __rcu_read_unlock() knows
 * which element to decrement.
 */
void __rcu_read_lock(void)
{
	int idx;
	struct task_struct *t = current;
	int nesting;

	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
	if (nesting != 0) {

		/* An earlier rcu_read_lock() covers us, just count it. */

		t->rcu_read_lock_nesting = nesting + 1;

	} else {
		unsigned long flags;

		/*
		 * We disable interrupts for the following reasons:
		 * - If we get a scheduling-clock interrupt here, and we
		 *   end up acking the counter flip, it's like a promise
		 *   that we will never increment the old counter again.
		 *   Thus we will break that promise if that
		 *   scheduling-clock interrupt happens between the time
		 *   we pick the .completed field and the time that we
		 *   increment our counter.
		 *
		 * - We don't want to be preempted out here.
		 *
		 * NMIs can still occur, of course, and might themselves
		 * contain rcu_read_lock().
		 */

		local_irq_save(flags);

		/*
		 * Outermost nesting of rcu_read_lock(), so increment
		 * the current counter for the current CPU.  Use volatile
		 * casts to prevent the compiler from reordering.
		 */

		idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;

		/*
		 * Now that the per-CPU counter has been incremented, we
		 * are protected from races with rcu_read_lock() invoked
		 * from NMI handlers on this CPU.  We can therefore safely
		 * increment the nesting counter, relieving further NMIs
		 * of the need to increment the per-CPU counter.
		 */

		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;

		/*
		 * Now that we have prevented any NMIs from storing
		 * to the ->rcu_flipctr_idx, we can safely use it to
		 * remember which counter to decrement in the matching
		 * rcu_read_unlock().
		 */

		ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
283 | |||
/*
 * Leave an RCU read-side critical section on behalf of the current task.
 *
 * While the nesting count is above one, only the count is decremented.
 * Otherwise this is treated as the outermost unlock: the nesting count is
 * decremented and the rcu_flipctr[] element whose index was remembered in
 * ->rcu_flipctr_idx is decremented on the CPU we are running on now.
 */
void __rcu_read_unlock(void)
{
	int idx;
	struct task_struct *t = current;
	int nesting;

	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
	if (nesting > 1) {

		/*
		 * We are still protected by the enclosing rcu_read_lock(),
		 * so simply decrement the counter.
		 */

		t->rcu_read_lock_nesting = nesting - 1;

	} else {
		unsigned long flags;

		/*
		 * Disable local interrupts to prevent the grace-period
		 * detection state machine from seeing us half-done.
		 * NMIs can still occur, of course, and might themselves
		 * contain rcu_read_lock() and rcu_read_unlock().
		 */

		local_irq_save(flags);

		/*
		 * Outermost nesting of rcu_read_unlock(), so we must
		 * decrement the current counter for the current CPU.
		 * This must be done carefully, because NMIs can
		 * occur at any point in this code, and any rcu_read_lock()
		 * and rcu_read_unlock() pairs in the NMI handlers
		 * must interact non-destructively with this code.
		 * Lots of volatile casts, and -very- careful ordering.
		 *
		 * Changes to this code, including this one, must be
		 * inspected, validated, and tested extremely carefully!!!
		 */

		/*
		 * First, pick up the index.
		 */

		idx = ACCESS_ONCE(t->rcu_flipctr_idx);

		/*
		 * Now that we have fetched the counter index, it is
		 * safe to decrement the per-task RCU nesting counter.
		 * After this, any interrupts or NMIs will increment and
		 * decrement the per-CPU counters.
		 */
		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;

		/*
		 * The nesting count has now been decremented.  NMIs that
		 * occur from here on will route their rcu_read_lock()
		 * calls through the outermost ("else") path of
		 * __rcu_read_lock(), and will thus start incrementing
		 * the per-CPU counter on their own.  They will also
		 * clobber ->rcu_flipctr_idx, but that is OK, since we
		 * have already fetched it.
		 *
		 * Finally, decrement the per-CPU counter selected by the
		 * index we fetched above.
		 */

		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
353 | |||
/*
 * If a global counter flip has occurred since the last time that we
 * advanced callbacks, advance them.  Hardware interrupts must be
 * disabled when calling this function.
 *
 * Callbacks move nextlist -> waitlist[0] -> ... -> waitlist[GP_STAGES - 1]
 * -> donelist, one stage per call in which rdp->completed differs from
 * rcu_ctrlblk.completed (rdp->completed is then set to the global value,
 * however far apart the two were).  rdp->waitlistcount ends up as the
 * number of non-empty waitlist[] stages.
 *
 * Independently of the list handling, the function also acknowledges a
 * pending counter flip.  Every caller in this file holds rdp->lock.
 */
static void __rcu_advance_callbacks(struct rcu_data *rdp)
{
	int cpu;
	int i;
	int wlc = 0;

	if (rdp->completed != rcu_ctrlblk.completed) {
		/* The oldest waiting stage has finished: append it to donelist. */
		if (rdp->waitlist[GP_STAGES - 1] != NULL) {
			*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
			rdp->donetail = rdp->waittail[GP_STAGES - 1];
			RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
		}
		/*
		 * Shift every remaining stage up by one, oldest first so
		 * that nothing is overwritten before it has been moved.
		 * An empty stage must have its tail pointer re-aimed at
		 * its own (new) list head.
		 */
		for (i = GP_STAGES - 2; i >= 0; i--) {
			if (rdp->waitlist[i] != NULL) {
				rdp->waitlist[i + 1] = rdp->waitlist[i];
				rdp->waittail[i + 1] = rdp->waittail[i];
				wlc++;
			} else {
				rdp->waitlist[i + 1] = NULL;
				rdp->waittail[i + 1] =
					&rdp->waitlist[i + 1];
			}
		}
		/* Newly queued callbacks enter the first waiting stage. */
		if (rdp->nextlist != NULL) {
			rdp->waitlist[0] = rdp->nextlist;
			rdp->waittail[0] = rdp->nexttail;
			wlc++;
			rdp->nextlist = NULL;
			rdp->nexttail = &rdp->nextlist;
			RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
		} else {
			rdp->waitlist[0] = NULL;
			rdp->waittail[0] = &rdp->waitlist[0];
		}
		rdp->waitlistcount = wlc;
		rdp->completed = rcu_ctrlblk.completed;
	}

	/*
	 * Check to see if this CPU needs to report that it has seen
	 * the most recent counter flip, thereby declaring that all
	 * subsequent rcu_read_lock() invocations will respect this flip.
	 *
	 * Note that the flag examined is that of the CPU we are running
	 * on, which is looked up here rather than derived from rdp.
	 */

	cpu = raw_smp_processor_id();
	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
		smp_mb();  /* Subsequent counter accesses must see new value */
		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
		smp_mb();  /* Subsequent RCU read-side critical sections */
			   /*  seen -after- acknowledgement. */
	}
}
411 | |||
/*
 * Get here when RCU is idle.  Decide whether we need to
 * move out of idle state, and return non-zero if so.
 * "Straightforward" approach for the moment, might later
 * use callback-list lengths, grace-period duration, or
 * some such to determine when to exit idle state.
 * Might also need a pre-idle test that does not acquire
 * the lock, but let's get the simple case working first...
 *
 * Called from rcu_try_flip() with rcu_ctrlblk.fliplock held.  Returns 0
 * (and changes nothing) when rcu_pending() reports no work for the
 * running CPU; otherwise performs the counter flip and returns 1.
 */

static int
rcu_try_flip_idle(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
	if (!rcu_pending(smp_processor_id())) {
		RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
		return 0;
	}

	/*
	 * Do the flip.  Incrementing ->completed toggles its low-order
	 * bit, which is what selects the rcu_flipctr[] element used by
	 * new outermost readers.
	 */

	RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
	rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */

	/*
	 * Need a memory barrier so that other CPUs see the new
	 * counter value before they see the subsequent change of all
	 * the rcu_flip_flag instances to rcu_flipped.
	 */

	smp_mb();  /* see above block comment. */

	/* Now ask each CPU for acknowledgement of the flip. */

	for_each_cpu_mask(cpu, rcu_cpu_online_map)
		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;

	return 1;
}
455 | |||
/*
 * Wait for CPUs to acknowledge the flip.
 *
 * Called from rcu_try_flip() with rcu_ctrlblk.fliplock held.  Does not
 * block: returns 0 as soon as it finds a CPU in rcu_cpu_online_map whose
 * rcu_flip_flag is not yet rcu_flip_seen, and 1 once all of them are.
 */

static int
rcu_try_flip_waitack(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
	for_each_cpu_mask(cpu, rcu_cpu_online_map)
		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
			return 0;
		}

	/*
	 * Make sure our checks above don't bleed into subsequent
	 * waiting for the sum of the counters to reach zero.
	 */

	smp_mb();  /* see above block comment. */
	RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
	return 1;
}
481 | |||
482 | /* | ||
483 | * Wait for collective ``last'' counter to reach zero, | ||
484 | * then tell all CPUs to do an end-of-grace-period memory barrier. | ||
485 | */ | ||
486 | |||
487 | static int | ||
488 | rcu_try_flip_waitzero(void) | ||
489 | { | ||
490 | int cpu; | ||
491 | int lastidx = !(rcu_ctrlblk.completed & 0x1); | ||
492 | int sum = 0; | ||
493 | |||
494 | /* Check to see if the sum of the "last" counters is zero. */ | ||
495 | |||
496 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); | ||
497 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
498 | sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; | ||
499 | if (sum != 0) { | ||
500 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); | ||
501 | return 0; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * This ensures that the other CPUs see the call for | ||
506 | * memory barriers -after- the sum to zero has been | ||
507 | * detected here | ||
508 | */ | ||
509 | smp_mb(); /* ^^^^^^^^^^^^ */ | ||
510 | |||
511 | /* Call for a memory barrier from each CPU. */ | ||
512 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
513 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | ||
514 | |||
515 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | ||
516 | return 1; | ||
517 | } | ||
518 | |||
/*
 * Wait for all CPUs to do their end-of-grace-period memory barrier.
 * Return 1 once all CPUs have done so, and 0 as long as at least one
 * CPU in rcu_cpu_online_map still has its rcu_mb_flag set to something
 * other than rcu_mb_done.
 *
 * Called from rcu_try_flip() with rcu_ctrlblk.fliplock held; does not
 * block.
 */

static int
rcu_try_flip_waitmb(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
	for_each_cpu_mask(cpu, rcu_cpu_online_map)
		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
			return 0;
		}

	smp_mb(); /* Ensure that the above checks precede any following flip. */
	RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
	return 1;
}
540 | |||
541 | /* | ||
542 | * Attempt a single flip of the counters. Remember, a single flip does | ||
543 | * -not- constitute a grace period. Instead, the interval between | ||
544 | * at least GP_STAGES consecutive flips is a grace period. | ||
545 | * | ||
546 | * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation | ||
547 | * on a large SMP, they might want to use a hierarchical organization of | ||
548 | * the per-CPU-counter pairs. | ||
549 | */ | ||
550 | static void rcu_try_flip(void) | ||
551 | { | ||
552 | unsigned long flags; | ||
553 | |||
554 | RCU_TRACE_ME(rcupreempt_trace_try_flip_1); | ||
555 | if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { | ||
556 | RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); | ||
557 | return; | ||
558 | } | ||
559 | |||
560 | /* | ||
561 | * Take the next transition(s) through the RCU grace-period | ||
562 | * flip-counter state machine. | ||
563 | */ | ||
564 | |||
565 | switch (rcu_ctrlblk.rcu_try_flip_state) { | ||
566 | case rcu_try_flip_idle_state: | ||
567 | if (rcu_try_flip_idle()) | ||
568 | rcu_ctrlblk.rcu_try_flip_state = | ||
569 | rcu_try_flip_waitack_state; | ||
570 | break; | ||
571 | case rcu_try_flip_waitack_state: | ||
572 | if (rcu_try_flip_waitack()) | ||
573 | rcu_ctrlblk.rcu_try_flip_state = | ||
574 | rcu_try_flip_waitzero_state; | ||
575 | break; | ||
576 | case rcu_try_flip_waitzero_state: | ||
577 | if (rcu_try_flip_waitzero()) | ||
578 | rcu_ctrlblk.rcu_try_flip_state = | ||
579 | rcu_try_flip_waitmb_state; | ||
580 | break; | ||
581 | case rcu_try_flip_waitmb_state: | ||
582 | if (rcu_try_flip_waitmb()) | ||
583 | rcu_ctrlblk.rcu_try_flip_state = | ||
584 | rcu_try_flip_idle_state; | ||
585 | } | ||
586 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Check to see if this CPU needs to do a memory barrier in order to | ||
591 | * ensure that any prior RCU read-side critical sections have committed | ||
592 | * their counter manipulations and critical-section memory references | ||
593 | * before declaring the grace period to be completed. | ||
594 | */ | ||
595 | static void rcu_check_mb(int cpu) | ||
596 | { | ||
597 | if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { | ||
598 | smp_mb(); /* Ensure RCU read-side accesses are visible. */ | ||
599 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; | ||
600 | } | ||
601 | } | ||
602 | |||
603 | void rcu_check_callbacks(int cpu, int user) | ||
604 | { | ||
605 | unsigned long flags; | ||
606 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
607 | |||
608 | rcu_check_mb(cpu); | ||
609 | if (rcu_ctrlblk.completed == rdp->completed) | ||
610 | rcu_try_flip(); | ||
611 | spin_lock_irqsave(&rdp->lock, flags); | ||
612 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
613 | __rcu_advance_callbacks(rdp); | ||
614 | if (rdp->donelist == NULL) { | ||
615 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
616 | } else { | ||
617 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
618 | raise_softirq(RCU_SOFTIRQ); | ||
619 | } | ||
620 | } | ||
621 | |||
/*
 * Needed by dynticks, to make sure all RCU processing has finished
 * when we go idle:
 *
 * If @cpu is already caught up with the global batch number, try one
 * state-machine step; if it is still caught up afterwards there is
 * nothing to advance and the function returns without taking the lock.
 * Otherwise the callback lists are advanced under rdp->lock.  Unlike
 * rcu_check_callbacks(), no softirq is raised here.
 *
 * @user is not used by this implementation.
 */
void rcu_advance_callbacks(int cpu, int user)
{
	unsigned long flags;
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	if (rcu_ctrlblk.completed == rdp->completed) {
		rcu_try_flip();
		if (rcu_ctrlblk.completed == rdp->completed)
			return;
	}
	spin_lock_irqsave(&rdp->lock, flags);
	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
	__rcu_advance_callbacks(rdp);
	spin_unlock_irqrestore(&rdp->lock, flags);
}
641 | |||
642 | #ifdef CONFIG_HOTPLUG_CPU | ||
/*
 * Move every element of the list (srclist, srctail) to the end of the
 * list whose tail pointer is dsttail, then leave the source list empty.
 * An empty source list still NULL-terminates the destination.
 *
 * The dstlist argument is not referenced by the expansion.  The other
 * arguments are assigned to and expanded more than once without
 * parentheses, so they must be plain lvalues without side effects.
 */
#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
		*dsttail = srclist; \
		if (srclist != NULL) { \
			dsttail = srctail; \
			srclist = NULL; \
			srctail = &srclist;\
		} \
	} while (0)
651 | |||
652 | void rcu_offline_cpu(int cpu) | ||
653 | { | ||
654 | int i; | ||
655 | struct rcu_head *list = NULL; | ||
656 | unsigned long flags; | ||
657 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
658 | struct rcu_head **tail = &list; | ||
659 | |||
660 | /* | ||
661 | * Remove all callbacks from the newly dead CPU, retaining order. | ||
662 | * Otherwise rcu_barrier() will fail | ||
663 | */ | ||
664 | |||
665 | spin_lock_irqsave(&rdp->lock, flags); | ||
666 | rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); | ||
667 | for (i = GP_STAGES - 1; i >= 0; i--) | ||
668 | rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], | ||
669 | list, tail); | ||
670 | rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); | ||
671 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
672 | rdp->waitlistcount = 0; | ||
673 | |||
674 | /* Disengage the newly dead CPU from the grace-period computation. */ | ||
675 | |||
676 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
677 | rcu_check_mb(cpu); | ||
678 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
679 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
680 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
681 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
682 | /* seen -after- acknowledgement. */ | ||
683 | } | ||
684 | |||
685 | RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
686 | RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; | ||
687 | |||
688 | RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; | ||
689 | RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; | ||
690 | |||
691 | cpu_clear(cpu, rcu_cpu_online_map); | ||
692 | |||
693 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
694 | |||
695 | /* | ||
696 | * Place the removed callbacks on the current CPU's queue. | ||
697 | * Make them all start a new grace period: simple approach, | ||
698 | * in theory could starve a given set of callbacks, but | ||
699 | * you would need to be doing some serious CPU hotplugging | ||
700 | * to make this happen. If this becomes a problem, adding | ||
701 | * a synchronize_rcu() to the hotplug path would be a simple | ||
702 | * fix. | ||
703 | */ | ||
704 | |||
705 | rdp = RCU_DATA_ME(); | ||
706 | spin_lock_irqsave(&rdp->lock, flags); | ||
707 | *rdp->nexttail = list; | ||
708 | if (list) | ||
709 | rdp->nexttail = tail; | ||
710 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
711 | } | ||
712 | |||
/*
 * Add @cpu to rcu_cpu_online_map, the set of CPUs that the grace-period
 * state machine polls.  The update is made under rcu_ctrlblk.fliplock,
 * the lock held while the state machine walks the mask.  Setting an
 * already-set bit is harmless.
 */
void __devinit rcu_online_cpu(int cpu)
{
	unsigned long flags;

	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
	cpu_set(cpu, rcu_cpu_online_map);
	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
}
721 | |||
722 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
723 | |||
724 | void rcu_offline_cpu(int cpu) | ||
725 | { | ||
726 | } | ||
727 | |||
728 | void __devinit rcu_online_cpu(int cpu) | ||
729 | { | ||
730 | } | ||
731 | |||
732 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
733 | |||
734 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
735 | { | ||
736 | unsigned long flags; | ||
737 | struct rcu_head *next, *list; | ||
738 | struct rcu_data *rdp = RCU_DATA_ME(); | ||
739 | |||
740 | spin_lock_irqsave(&rdp->lock, flags); | ||
741 | list = rdp->donelist; | ||
742 | if (list == NULL) { | ||
743 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
744 | return; | ||
745 | } | ||
746 | rdp->donelist = NULL; | ||
747 | rdp->donetail = &rdp->donelist; | ||
748 | RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); | ||
749 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
750 | while (list) { | ||
751 | next = list->next; | ||
752 | list->func(list); | ||
753 | list = next; | ||
754 | RCU_TRACE_ME(rcupreempt_trace_invoke); | ||
755 | } | ||
756 | } | ||
757 | |||
/*
 * Queue @head on the current CPU's nextlist so that @func is eventually
 * invoked on it from rcu_process_callbacks(), after the callback has
 * worked its way through the waitlist[] stages.
 *
 * @head: caller-provided rcu_head; its ->func and ->next are overwritten.
 * @func: function to call, with @head as its argument.
 */
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
	unsigned long flags;
	struct rcu_data *rdp;

	head->func = func;
	head->next = NULL;
	/*
	 * Interrupts go off before the per-CPU lookup, so the rcu_data we
	 * pick is still the running CPU's by the time we lock and use it.
	 */
	local_irq_save(flags);
	rdp = RCU_DATA_ME();
	spin_lock(&rdp->lock);
	/* Move older callbacks along first if a flip has happened. */
	__rcu_advance_callbacks(rdp);
	*rdp->nexttail = head;
	rdp->nexttail = &head->next;
	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
	spin_unlock(&rdp->lock);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
776 | |||
/*
 * Wait until all currently running preempt_disable() code segments
 * (including hardware-irq-disable segments) complete.  Note that
 * in -rt this does -not- necessarily result in all currently executing
 * interrupt -handlers- having completed.
 *
 * Implemented by binding the calling task to each online CPU in turn
 * and calling schedule() there, then restoring the original affinity.
 *
 * NOTE(review): the return values of the sched_setaffinity() calls are
 * not checked; if one fails, the corresponding CPU is presumably not
 * visited -- confirm whether that can happen in practice.
 */
void __synchronize_sched(void)
{
	cpumask_t oldmask;
	int cpu;

	/* Fall back to "any CPU" if the current mask cannot be read. */
	if (sched_getaffinity(0, &oldmask) < 0)
		oldmask = cpu_possible_map;
	for_each_online_cpu(cpu) {
		sched_setaffinity(0, cpumask_of_cpu(cpu));
		schedule();
	}
	sched_setaffinity(0, oldmask);
}
EXPORT_SYMBOL_GPL(__synchronize_sched);
797 | |||
798 | /* | ||
799 | * Check to see if any future RCU-related work will need to be done | ||
800 | * by the current CPU, even if none need be done immediately, returning | ||
801 | * 1 if so. Assumes that notifiers would take care of handling any | ||
802 | * outstanding requests from the RCU core. | ||
803 | * | ||
804 | * This function is part of the RCU implementation; it is -not- | ||
805 | * an exported member of the RCU API. | ||
806 | */ | ||
807 | int rcu_needs_cpu(int cpu) | ||
808 | { | ||
809 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
810 | |||
811 | return (rdp->donelist != NULL || | ||
812 | !!rdp->waitlistcount || | ||
813 | rdp->nextlist != NULL); | ||
814 | } | ||
815 | |||
816 | int rcu_pending(int cpu) | ||
817 | { | ||
818 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
819 | |||
820 | /* The CPU has at least one callback queued somewhere. */ | ||
821 | |||
822 | if (rdp->donelist != NULL || | ||
823 | !!rdp->waitlistcount || | ||
824 | rdp->nextlist != NULL) | ||
825 | return 1; | ||
826 | |||
827 | /* The RCU core needs an acknowledgement from this CPU. */ | ||
828 | |||
829 | if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || | ||
830 | (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) | ||
831 | return 1; | ||
832 | |||
833 | /* This CPU has fallen behind the global grace-period number. */ | ||
834 | |||
835 | if (rdp->completed != rcu_ctrlblk.completed) | ||
836 | return 1; | ||
837 | |||
838 | /* Nothing needed from this CPU. */ | ||
839 | |||
840 | return 0; | ||
841 | } | ||
842 | |||
843 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
844 | unsigned long action, void *hcpu) | ||
845 | { | ||
846 | long cpu = (long)hcpu; | ||
847 | |||
848 | switch (action) { | ||
849 | case CPU_UP_PREPARE: | ||
850 | case CPU_UP_PREPARE_FROZEN: | ||
851 | rcu_online_cpu(cpu); | ||
852 | break; | ||
853 | case CPU_UP_CANCELED: | ||
854 | case CPU_UP_CANCELED_FROZEN: | ||
855 | case CPU_DEAD: | ||
856 | case CPU_DEAD_FROZEN: | ||
857 | rcu_offline_cpu(cpu); | ||
858 | break; | ||
859 | default: | ||
860 | break; | ||
861 | } | ||
862 | return NOTIFY_OK; | ||
863 | } | ||
864 | |||
865 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
866 | .notifier_call = rcu_cpu_notify, | ||
867 | }; | ||
868 | |||
869 | void __init __rcu_init(void) | ||
870 | { | ||
871 | int cpu; | ||
872 | int i; | ||
873 | struct rcu_data *rdp; | ||
874 | |||
875 | printk(KERN_NOTICE "Preemptible RCU implementation.\n"); | ||
876 | for_each_possible_cpu(cpu) { | ||
877 | rdp = RCU_DATA_CPU(cpu); | ||
878 | spin_lock_init(&rdp->lock); | ||
879 | rdp->completed = 0; | ||
880 | rdp->waitlistcount = 0; | ||
881 | rdp->nextlist = NULL; | ||
882 | rdp->nexttail = &rdp->nextlist; | ||
883 | for (i = 0; i < GP_STAGES; i++) { | ||
884 | rdp->waitlist[i] = NULL; | ||
885 | rdp->waittail[i] = &rdp->waitlist[i]; | ||
886 | } | ||
887 | rdp->donelist = NULL; | ||
888 | rdp->donetail = &rdp->donelist; | ||
889 | rdp->rcu_flipctr[0] = 0; | ||
890 | rdp->rcu_flipctr[1] = 0; | ||
891 | } | ||
892 | register_cpu_notifier(&rcu_nb); | ||
893 | |||
894 | /* | ||
895 | * We don't need protection against CPU-Hotplug here | ||
896 | * since | ||
897 | * a) If a CPU comes online while we are iterating over the | ||
898 | * cpu_online_map below, we would only end up making a | ||
899 | * duplicate call to rcu_online_cpu() which sets the corresponding | ||
900 | * CPU's mask in the rcu_cpu_online_map. | ||
901 | * | ||
902 | * b) A CPU cannot go offline at this point in time since the user | ||
903 | * does not have access to the sysfs interface, nor do we | ||
904 | * suspend the system. | ||
905 | */ | ||
906 | for_each_online_cpu(cpu) | ||
907 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); | ||
908 | |||
909 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
910 | } | ||
911 | |||
/*
 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
 * Kept as a plain wrapper around synchronize_rcu().
 */
void synchronize_kernel(void)
{
	synchronize_rcu();
}
919 | |||
920 | #ifdef CONFIG_RCU_TRACE | ||
921 | long *rcupreempt_flipctr(int cpu) | ||
922 | { | ||
923 | return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
924 | } | ||
925 | EXPORT_SYMBOL_GPL(rcupreempt_flipctr); | ||
926 | |||
/*
 * Tracing accessor: return @cpu's rcu_flip_flag, an
 * enum rcu_flip_flag_values value, as an int.
 */
int rcupreempt_flip_flag(int cpu)
{
	return per_cpu(rcu_flip_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
932 | |||
/*
 * Tracing accessor: return @cpu's rcu_mb_flag, an
 * enum rcu_mb_flag_values value, as an int.
 */
int rcupreempt_mb_flag(int cpu)
{
	return per_cpu(rcu_mb_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
938 | |||
/*
 * Tracing accessor: return the printable name of the state the flip
 * state machine is currently in.  The state is read without taking
 * rcu_ctrlblk.fliplock and is used directly as the array index.
 */
char *rcupreempt_try_flip_state_name(void)
{
	return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
}
EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
944 | |||
945 | struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) | ||
946 | { | ||
947 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
948 | |||
949 | return &rdp->trace; | ||
950 | } | ||
951 | EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); | ||
952 | |||
953 | #endif /* #ifdef RCU_TRACE */ | ||
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c new file mode 100644 index 000000000000..49ac4947af24 --- /dev/null +++ b/kernel/rcupreempt_trace.c | |||
@@ -0,0 +1,330 @@ | |||
1 | /* | ||
2 | * Read-Copy Update tracing for realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
21 | * | ||
22 | * For detailed explanation of Read-Copy Update mechanism see - | ||
23 | * Documentation/RCU/ *.txt | ||
24 | * | ||
25 | */ | ||
26 | #include <linux/types.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/init.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/smp.h> | ||
31 | #include <linux/rcupdate.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <asm/atomic.h> | ||
35 | #include <linux/bitops.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/completion.h> | ||
38 | #include <linux/moduleparam.h> | ||
39 | #include <linux/percpu.h> | ||
40 | #include <linux/notifier.h> | ||
41 | #include <linux/rcupdate.h> | ||
42 | #include <linux/cpu.h> | ||
43 | #include <linux/mutex.h> | ||
44 | #include <linux/rcupreempt_trace.h> | ||
45 | #include <linux/debugfs.h> | ||
46 | |||
47 | static struct mutex rcupreempt_trace_mutex; | ||
48 | static char *rcupreempt_trace_buf; | ||
49 | #define RCUPREEMPT_TRACE_BUF_SIZE 4096 | ||
50 | |||
51 | void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) | ||
52 | { | ||
53 | trace->done_length += trace->wait_length; | ||
54 | trace->done_add += trace->wait_length; | ||
55 | trace->wait_length = 0; | ||
56 | } | ||
57 | void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) | ||
58 | { | ||
59 | trace->wait_length += trace->next_length; | ||
60 | trace->wait_add += trace->next_length; | ||
61 | trace->next_length = 0; | ||
62 | } | ||
63 | void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) | ||
64 | { | ||
65 | atomic_inc(&trace->rcu_try_flip_1); | ||
66 | } | ||
67 | void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) | ||
68 | { | ||
69 | atomic_inc(&trace->rcu_try_flip_e1); | ||
70 | } | ||
71 | void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) | ||
72 | { | ||
73 | trace->rcu_try_flip_i1++; | ||
74 | } | ||
75 | void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) | ||
76 | { | ||
77 | trace->rcu_try_flip_ie1++; | ||
78 | } | ||
79 | void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) | ||
80 | { | ||
81 | trace->rcu_try_flip_g1++; | ||
82 | } | ||
83 | void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) | ||
84 | { | ||
85 | trace->rcu_try_flip_a1++; | ||
86 | } | ||
87 | void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) | ||
88 | { | ||
89 | trace->rcu_try_flip_ae1++; | ||
90 | } | ||
91 | void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) | ||
92 | { | ||
93 | trace->rcu_try_flip_a2++; | ||
94 | } | ||
95 | void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) | ||
96 | { | ||
97 | trace->rcu_try_flip_z1++; | ||
98 | } | ||
99 | void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) | ||
100 | { | ||
101 | trace->rcu_try_flip_ze1++; | ||
102 | } | ||
103 | void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) | ||
104 | { | ||
105 | trace->rcu_try_flip_z2++; | ||
106 | } | ||
107 | void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) | ||
108 | { | ||
109 | trace->rcu_try_flip_m1++; | ||
110 | } | ||
111 | void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) | ||
112 | { | ||
113 | trace->rcu_try_flip_me1++; | ||
114 | } | ||
115 | void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) | ||
116 | { | ||
117 | trace->rcu_try_flip_m2++; | ||
118 | } | ||
119 | void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) | ||
120 | { | ||
121 | trace->rcu_check_callbacks++; | ||
122 | } | ||
123 | void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) | ||
124 | { | ||
125 | trace->done_remove += trace->done_length; | ||
126 | trace->done_length = 0; | ||
127 | } | ||
128 | void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) | ||
129 | { | ||
130 | atomic_inc(&trace->done_invoked); | ||
131 | } | ||
132 | void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) | ||
133 | { | ||
134 | trace->next_add++; | ||
135 | trace->next_length++; | ||
136 | } | ||
137 | |||
138 | static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) | ||
139 | { | ||
140 | struct rcupreempt_trace *cp; | ||
141 | int cpu; | ||
142 | |||
143 | memset(sp, 0, sizeof(*sp)); | ||
144 | for_each_possible_cpu(cpu) { | ||
145 | cp = rcupreempt_trace_cpu(cpu); | ||
146 | sp->next_length += cp->next_length; | ||
147 | sp->next_add += cp->next_add; | ||
148 | sp->wait_length += cp->wait_length; | ||
149 | sp->wait_add += cp->wait_add; | ||
150 | sp->done_length += cp->done_length; | ||
151 | sp->done_add += cp->done_add; | ||
152 | sp->done_remove += cp->done_remove; | ||
153 | atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); | ||
154 | sp->rcu_check_callbacks += cp->rcu_check_callbacks; | ||
155 | atomic_set(&sp->rcu_try_flip_1, | ||
156 | atomic_read(&cp->rcu_try_flip_1)); | ||
157 | atomic_set(&sp->rcu_try_flip_e1, | ||
158 | atomic_read(&cp->rcu_try_flip_e1)); | ||
159 | sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; | ||
160 | sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; | ||
161 | sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; | ||
162 | sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; | ||
163 | sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; | ||
164 | sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; | ||
165 | sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; | ||
166 | sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; | ||
167 | sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; | ||
168 | sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; | ||
169 | sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; | ||
170 | sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | static ssize_t rcustats_read(struct file *filp, char __user *buffer, | ||
175 | size_t count, loff_t *ppos) | ||
176 | { | ||
177 | struct rcupreempt_trace trace; | ||
178 | ssize_t bcount; | ||
179 | int cnt = 0; | ||
180 | |||
181 | rcupreempt_trace_sum(&trace); | ||
182 | mutex_lock(&rcupreempt_trace_mutex); | ||
183 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
184 | "ggp=%ld rcc=%ld\n", | ||
185 | rcu_batches_completed(), | ||
186 | trace.rcu_check_callbacks); | ||
187 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
188 | "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" | ||
189 | "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" | ||
190 | "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", | ||
191 | |||
192 | trace.next_add, trace.next_length, | ||
193 | trace.wait_add, trace.wait_length, | ||
194 | trace.done_add, trace.done_length, | ||
195 | trace.done_remove, atomic_read(&trace.done_invoked), | ||
196 | atomic_read(&trace.rcu_try_flip_1), | ||
197 | atomic_read(&trace.rcu_try_flip_e1), | ||
198 | trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, | ||
199 | trace.rcu_try_flip_g1, | ||
200 | trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, | ||
201 | trace.rcu_try_flip_a2, | ||
202 | trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, | ||
203 | trace.rcu_try_flip_z2, | ||
204 | trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, | ||
205 | trace.rcu_try_flip_m2); | ||
206 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
207 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
208 | mutex_unlock(&rcupreempt_trace_mutex); | ||
209 | return bcount; | ||
210 | } | ||
211 | |||
212 | static ssize_t rcugp_read(struct file *filp, char __user *buffer, | ||
213 | size_t count, loff_t *ppos) | ||
214 | { | ||
215 | long oldgp = rcu_batches_completed(); | ||
216 | ssize_t bcount; | ||
217 | |||
218 | mutex_lock(&rcupreempt_trace_mutex); | ||
219 | synchronize_rcu(); | ||
220 | snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, | ||
221 | "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); | ||
222 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
223 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
224 | mutex_unlock(&rcupreempt_trace_mutex); | ||
225 | return bcount; | ||
226 | } | ||
227 | |||
228 | static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, | ||
229 | size_t count, loff_t *ppos) | ||
230 | { | ||
231 | int cnt = 0; | ||
232 | int cpu; | ||
233 | int f = rcu_batches_completed() & 0x1; | ||
234 | ssize_t bcount; | ||
235 | |||
236 | mutex_lock(&rcupreempt_trace_mutex); | ||
237 | |||
238 | cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, | ||
239 | "CPU last cur F M\n"); | ||
240 | for_each_online_cpu(cpu) { | ||
241 | long *flipctr = rcupreempt_flipctr(cpu); | ||
242 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
243 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
244 | "%3d %4ld %3ld %d %d\n", | ||
245 | cpu, | ||
246 | flipctr[!f], | ||
247 | flipctr[f], | ||
248 | rcupreempt_flip_flag(cpu), | ||
249 | rcupreempt_mb_flag(cpu)); | ||
250 | } | ||
251 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
252 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
253 | "ggp = %ld, state = %s\n", | ||
254 | rcu_batches_completed(), | ||
255 | rcupreempt_try_flip_state_name()); | ||
256 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
257 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
258 | "\n"); | ||
259 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
260 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
261 | mutex_unlock(&rcupreempt_trace_mutex); | ||
262 | return bcount; | ||
263 | } | ||
264 | |||
265 | static struct file_operations rcustats_fops = { | ||
266 | .owner = THIS_MODULE, | ||
267 | .read = rcustats_read, | ||
268 | }; | ||
269 | |||
270 | static struct file_operations rcugp_fops = { | ||
271 | .owner = THIS_MODULE, | ||
272 | .read = rcugp_read, | ||
273 | }; | ||
274 | |||
275 | static struct file_operations rcuctrs_fops = { | ||
276 | .owner = THIS_MODULE, | ||
277 | .read = rcuctrs_read, | ||
278 | }; | ||
279 | |||
280 | static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; | ||
281 | static int rcupreempt_debugfs_init(void) | ||
282 | { | ||
283 | rcudir = debugfs_create_dir("rcu", NULL); | ||
284 | if (!rcudir) | ||
285 | goto out; | ||
286 | statdir = debugfs_create_file("rcustats", 0444, rcudir, | ||
287 | NULL, &rcustats_fops); | ||
288 | if (!statdir) | ||
289 | goto free_out; | ||
290 | |||
291 | gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | ||
292 | if (!gpdir) | ||
293 | goto free_out; | ||
294 | |||
295 | ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, | ||
296 | NULL, &rcuctrs_fops); | ||
297 | if (!ctrsdir) | ||
298 | goto free_out; | ||
299 | return 0; | ||
300 | free_out: | ||
301 | if (statdir) | ||
302 | debugfs_remove(statdir); | ||
303 | if (gpdir) | ||
304 | debugfs_remove(gpdir); | ||
305 | debugfs_remove(rcudir); | ||
306 | out: | ||
307 | return 1; | ||
308 | } | ||
309 | |||
310 | static int __init rcupreempt_trace_init(void) | ||
311 | { | ||
312 | mutex_init(&rcupreempt_trace_mutex); | ||
313 | rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); | ||
314 | if (!rcupreempt_trace_buf) | ||
315 | return 1; | ||
316 | return rcupreempt_debugfs_init(); | ||
317 | } | ||
318 | |||
319 | static void __exit rcupreempt_trace_cleanup(void) | ||
320 | { | ||
321 | debugfs_remove(statdir); | ||
322 | debugfs_remove(gpdir); | ||
323 | debugfs_remove(ctrsdir); | ||
324 | debugfs_remove(rcudir); | ||
325 | kfree(rcupreempt_trace_buf); | ||
326 | } | ||
327 | |||
328 | |||
329 | module_init(rcupreempt_trace_init); | ||
330 | module_exit(rcupreempt_trace_cleanup); | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c3e165c2318f..fd599829e72a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void) | |||
726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 726 | cpumask_t tmp_mask = CPU_MASK_ALL; |
727 | int i; | 727 | int i; |
728 | 728 | ||
729 | lock_cpu_hotplug(); | 729 | get_online_cpus(); |
730 | 730 | ||
731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
732 | if (num_online_cpus() == 1) { | 732 | if (num_online_cpus() == 1) { |
733 | unlock_cpu_hotplug(); | 733 | put_online_cpus(); |
734 | return; | 734 | return; |
735 | } | 735 | } |
736 | 736 | ||
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void) | |||
762 | else | 762 | else |
763 | rcu_idle_cpu--; | 763 | rcu_idle_cpu--; |
764 | 764 | ||
765 | unlock_cpu_hotplug(); | 765 | put_online_cpus(); |
766 | } | 766 | } |
767 | 767 | ||
768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | 768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the |
diff --git a/kernel/sched.c b/kernel/sched.c index e76b11ca6df3..524285e46fa7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -22,6 +22,8 @@ | |||
22 | * by Peter Williams | 22 | * by Peter Williams |
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
26 | * Thomas Gleixner, Mike Kravetz | ||
25 | */ | 27 | */ |
26 | 28 | ||
27 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
@@ -63,6 +65,7 @@ | |||
63 | #include <linux/reciprocal_div.h> | 65 | #include <linux/reciprocal_div.h> |
64 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
65 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | ||
66 | 69 | ||
67 | #include <asm/tlb.h> | 70 | #include <asm/tlb.h> |
68 | #include <asm/irq_regs.h> | 71 | #include <asm/irq_regs.h> |
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
96 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 99 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
97 | 100 | ||
98 | /* | 101 | /* |
99 | * Some helpers for converting nanosecond timing to jiffy resolution | 102 | * Helpers for converting nanosecond timing to jiffy resolution |
100 | */ | 103 | */ |
101 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 104 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
102 | #define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) | ||
103 | 105 | ||
104 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 106 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
105 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 107 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
@@ -159,6 +161,8 @@ struct rt_prio_array { | |||
159 | 161 | ||
160 | struct cfs_rq; | 162 | struct cfs_rq; |
161 | 163 | ||
164 | static LIST_HEAD(task_groups); | ||
165 | |||
162 | /* task group related information */ | 166 | /* task group related information */ |
163 | struct task_group { | 167 | struct task_group { |
164 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 168 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -168,10 +172,50 @@ struct task_group { | |||
168 | struct sched_entity **se; | 172 | struct sched_entity **se; |
169 | /* runqueue "owned" by this group on each cpu */ | 173 | /* runqueue "owned" by this group on each cpu */ |
170 | struct cfs_rq **cfs_rq; | 174 | struct cfs_rq **cfs_rq; |
175 | |||
176 | struct sched_rt_entity **rt_se; | ||
177 | struct rt_rq **rt_rq; | ||
178 | |||
179 | unsigned int rt_ratio; | ||
180 | |||
181 | /* | ||
182 | * shares assigned to a task group governs how much of cpu bandwidth | ||
183 | * is allocated to the group. The more shares a group has, the more is | ||
184 | * the cpu bandwidth allocated to it. | ||
185 | * | ||
186 | * For ex, lets say that there are three task groups, A, B and C which | ||
187 | * have been assigned shares 1000, 2000 and 3000 respectively. Then, | ||
188 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | ||
189 | * should be: | ||
190 | * | ||
191 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | ||
192 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | ||
193 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | ||
194 | * | ||
195 | * The weight assigned to a task group's schedulable entities on every | ||
196 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | ||
197 | * group's shares. For ex: lets say that task group A has been | ||
198 | * assigned shares of 1000 and there are two CPUs in a system. Then, | ||
199 | * | ||
200 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | ||
201 | * | ||
202 | * Note: It's not necessary that each of a task group's schedulable | ||
203 | * entity have the same weight on all CPUs. If the group | ||
204 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | ||
205 | * better distribution of weight could be: | ||
206 | * | ||
207 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | ||
208 | * tg_A->se[1]->load.weight = 1/3 * 2000 = 667 | ||
209 | * | ||
210 | * rebalance_shares() is responsible for distributing the shares of a | ||
211 | * task groups like this among the group's schedulable entities across | ||
212 | * cpus. | ||
213 | * | ||
214 | */ | ||
171 | unsigned long shares; | 215 | unsigned long shares; |
172 | /* spinlock to serialize modification to shares */ | 216 | |
173 | spinlock_t lock; | ||
174 | struct rcu_head rcu; | 217 | struct rcu_head rcu; |
218 | struct list_head list; | ||
175 | }; | 219 | }; |
176 | 220 | ||
177 | /* Default task group's sched entity on each cpu */ | 221 | /* Default task group's sched entity on each cpu */ |
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | |||
179 | /* Default task group's cfs_rq on each cpu */ | 223 | /* Default task group's cfs_rq on each cpu */ |
180 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
181 | 225 | ||
226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | ||
228 | |||
182 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | 229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; |
183 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | 230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; |
184 | 231 | ||
232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | ||
233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | ||
234 | |||
235 | /* task_group_mutex serializes add/remove of task groups and also changes to | ||
236 | * a task group's cpu shares. | ||
237 | */ | ||
238 | static DEFINE_MUTEX(task_group_mutex); | ||
239 | |||
240 | /* doms_cur_mutex serializes access to doms_cur[] array */ | ||
241 | static DEFINE_MUTEX(doms_cur_mutex); | ||
242 | |||
243 | #ifdef CONFIG_SMP | ||
244 | /* kernel thread that runs rebalance_shares() periodically */ | ||
245 | static struct task_struct *lb_monitor_task; | ||
246 | static int load_balance_monitor(void *unused); | ||
247 | #endif | ||
248 | |||
249 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | ||
250 | |||
185 | /* Default task group. | 251 | /* Default task group. |
186 | * Every task in system belong to this group at bootup. | 252 | * Every task in system belong to this group at bootup. |
187 | */ | 253 | */ |
188 | struct task_group init_task_group = { | 254 | struct task_group init_task_group = { |
189 | .se = init_sched_entity_p, | 255 | .se = init_sched_entity_p, |
190 | .cfs_rq = init_cfs_rq_p, | 256 | .cfs_rq = init_cfs_rq_p, |
257 | |||
258 | .rt_se = init_sched_rt_entity_p, | ||
259 | .rt_rq = init_rt_rq_p, | ||
191 | }; | 260 | }; |
192 | 261 | ||
193 | #ifdef CONFIG_FAIR_USER_SCHED | 262 | #ifdef CONFIG_FAIR_USER_SCHED |
194 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | 263 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
195 | #else | 264 | #else |
196 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | 265 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
197 | #endif | 266 | #endif |
198 | 267 | ||
199 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | 268 | #define MIN_GROUP_SHARES 2 |
269 | |||
270 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | ||
200 | 271 | ||
201 | /* return group to which a task belongs */ | 272 | /* return group to which a task belongs */ |
202 | static inline struct task_group *task_group(struct task_struct *p) | 273 | static inline struct task_group *task_group(struct task_struct *p) |
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
215 | } | 286 | } |
216 | 287 | ||
217 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
218 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) | 289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
219 | { | 290 | { |
220 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
221 | p->se.parent = task_group(p)->se[cpu]; | 292 | p->se.parent = task_group(p)->se[cpu]; |
293 | |||
294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
295 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
296 | } | ||
297 | |||
298 | static inline void lock_task_group_list(void) | ||
299 | { | ||
300 | mutex_lock(&task_group_mutex); | ||
301 | } | ||
302 | |||
303 | static inline void unlock_task_group_list(void) | ||
304 | { | ||
305 | mutex_unlock(&task_group_mutex); | ||
306 | } | ||
307 | |||
308 | static inline void lock_doms_cur(void) | ||
309 | { | ||
310 | mutex_lock(&doms_cur_mutex); | ||
311 | } | ||
312 | |||
313 | static inline void unlock_doms_cur(void) | ||
314 | { | ||
315 | mutex_unlock(&doms_cur_mutex); | ||
222 | } | 316 | } |
223 | 317 | ||
224 | #else | 318 | #else |
225 | 319 | ||
226 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } | 320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
321 | static inline void lock_task_group_list(void) { } | ||
322 | static inline void unlock_task_group_list(void) { } | ||
323 | static inline void lock_doms_cur(void) { } | ||
324 | static inline void unlock_doms_cur(void) { } | ||
227 | 325 | ||
228 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 326 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
229 | 327 | ||
@@ -264,10 +362,56 @@ struct cfs_rq { | |||
264 | /* Real-Time classes' related field in a runqueue: */ | 362 | /* Real-Time classes' related field in a runqueue: */ |
265 | struct rt_rq { | 363 | struct rt_rq { |
266 | struct rt_prio_array active; | 364 | struct rt_prio_array active; |
267 | int rt_load_balance_idx; | 365 | unsigned long rt_nr_running; |
268 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; | 366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED |
367 | int highest_prio; /* highest queued rt task prio */ | ||
368 | #endif | ||
369 | #ifdef CONFIG_SMP | ||
370 | unsigned long rt_nr_migratory; | ||
371 | int overloaded; | ||
372 | #endif | ||
373 | int rt_throttled; | ||
374 | u64 rt_time; | ||
375 | |||
376 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
377 | struct rq *rq; | ||
378 | struct list_head leaf_rt_rq_list; | ||
379 | struct task_group *tg; | ||
380 | struct sched_rt_entity *rt_se; | ||
381 | #endif | ||
269 | }; | 382 | }; |
270 | 383 | ||
384 | #ifdef CONFIG_SMP | ||
385 | |||
386 | /* | ||
387 | * We add the notion of a root-domain which will be used to define per-domain | ||
388 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
389 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
390 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
391 | * object. | ||
392 | * | ||
393 | */ | ||
394 | struct root_domain { | ||
395 | atomic_t refcount; | ||
396 | cpumask_t span; | ||
397 | cpumask_t online; | ||
398 | |||
399 | /* | ||
400 | * The "RT overload" flag: it gets set if a CPU has more than | ||
401 | * one runnable RT task. | ||
402 | */ | ||
403 | cpumask_t rto_mask; | ||
404 | atomic_t rto_count; | ||
405 | }; | ||
406 | |||
407 | /* | ||
408 | * By default the system creates a single root-domain with all cpus as | ||
409 | * members (mimicking the global state we have today). | ||
410 | */ | ||
411 | static struct root_domain def_root_domain; | ||
412 | |||
413 | #endif | ||
414 | |||
271 | /* | 415 | /* |
272 | * This is the main, per-CPU runqueue data structure. | 416 | * This is the main, per-CPU runqueue data structure. |
273 | * | 417 | * |
@@ -296,11 +440,15 @@ struct rq { | |||
296 | u64 nr_switches; | 440 | u64 nr_switches; |
297 | 441 | ||
298 | struct cfs_rq cfs; | 442 | struct cfs_rq cfs; |
443 | struct rt_rq rt; | ||
444 | u64 rt_period_expire; | ||
445 | int rt_throttled; | ||
446 | |||
299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 447 | #ifdef CONFIG_FAIR_GROUP_SCHED |
300 | /* list of leaf cfs_rq on this cpu: */ | 448 | /* list of leaf cfs_rq on this cpu: */ |
301 | struct list_head leaf_cfs_rq_list; | 449 | struct list_head leaf_cfs_rq_list; |
450 | struct list_head leaf_rt_rq_list; | ||
302 | #endif | 451 | #endif |
303 | struct rt_rq rt; | ||
304 | 452 | ||
305 | /* | 453 | /* |
306 | * This is part of a global counter where only the total sum | 454 | * This is part of a global counter where only the total sum |
@@ -317,7 +465,7 @@ struct rq { | |||
317 | u64 clock, prev_clock_raw; | 465 | u64 clock, prev_clock_raw; |
318 | s64 clock_max_delta; | 466 | s64 clock_max_delta; |
319 | 467 | ||
320 | unsigned int clock_warps, clock_overflows; | 468 | unsigned int clock_warps, clock_overflows, clock_underflows; |
321 | u64 idle_clock; | 469 | u64 idle_clock; |
322 | unsigned int clock_deep_idle_events; | 470 | unsigned int clock_deep_idle_events; |
323 | u64 tick_timestamp; | 471 | u64 tick_timestamp; |
@@ -325,6 +473,7 @@ struct rq { | |||
325 | atomic_t nr_iowait; | 473 | atomic_t nr_iowait; |
326 | 474 | ||
327 | #ifdef CONFIG_SMP | 475 | #ifdef CONFIG_SMP |
476 | struct root_domain *rd; | ||
328 | struct sched_domain *sd; | 477 | struct sched_domain *sd; |
329 | 478 | ||
330 | /* For active balancing */ | 479 | /* For active balancing */ |
@@ -337,6 +486,12 @@ struct rq { | |||
337 | struct list_head migration_queue; | 486 | struct list_head migration_queue; |
338 | #endif | 487 | #endif |
339 | 488 | ||
489 | #ifdef CONFIG_SCHED_HRTICK | ||
490 | unsigned long hrtick_flags; | ||
491 | ktime_t hrtick_expire; | ||
492 | struct hrtimer hrtick_timer; | ||
493 | #endif | ||
494 | |||
340 | #ifdef CONFIG_SCHEDSTATS | 495 | #ifdef CONFIG_SCHEDSTATS |
341 | /* latency stats */ | 496 | /* latency stats */ |
342 | struct sched_info rq_sched_info; | 497 | struct sched_info rq_sched_info; |
@@ -363,7 +518,6 @@ struct rq { | |||
363 | }; | 518 | }; |
364 | 519 | ||
365 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 520 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
366 | static DEFINE_MUTEX(sched_hotcpu_mutex); | ||
367 | 521 | ||
368 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 522 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
369 | { | 523 | { |
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq) | |||
441 | #define task_rq(p) cpu_rq(task_cpu(p)) | 595 | #define task_rq(p) cpu_rq(task_cpu(p)) |
442 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 596 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
443 | 597 | ||
598 | unsigned long rt_needs_cpu(int cpu) | ||
599 | { | ||
600 | struct rq *rq = cpu_rq(cpu); | ||
601 | u64 delta; | ||
602 | |||
603 | if (!rq->rt_throttled) | ||
604 | return 0; | ||
605 | |||
606 | if (rq->clock > rq->rt_period_expire) | ||
607 | return 1; | ||
608 | |||
609 | delta = rq->rt_period_expire - rq->clock; | ||
610 | do_div(delta, NSEC_PER_SEC / HZ); | ||
611 | |||
612 | return (unsigned long)delta; | ||
613 | } | ||
614 | |||
444 | /* | 615 | /* |
445 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 616 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
446 | */ | 617 | */ |
@@ -459,6 +630,8 @@ enum { | |||
459 | SCHED_FEAT_START_DEBIT = 4, | 630 | SCHED_FEAT_START_DEBIT = 4, |
460 | SCHED_FEAT_TREE_AVG = 8, | 631 | SCHED_FEAT_TREE_AVG = 8, |
461 | SCHED_FEAT_APPROX_AVG = 16, | 632 | SCHED_FEAT_APPROX_AVG = 16, |
633 | SCHED_FEAT_HRTICK = 32, | ||
634 | SCHED_FEAT_DOUBLE_TICK = 64, | ||
462 | }; | 635 | }; |
463 | 636 | ||
464 | const_debug unsigned int sysctl_sched_features = | 637 | const_debug unsigned int sysctl_sched_features = |
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features = | |||
466 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 639 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | |
467 | SCHED_FEAT_START_DEBIT * 1 | | 640 | SCHED_FEAT_START_DEBIT * 1 | |
468 | SCHED_FEAT_TREE_AVG * 0 | | 641 | SCHED_FEAT_TREE_AVG * 0 | |
469 | SCHED_FEAT_APPROX_AVG * 0; | 642 | SCHED_FEAT_APPROX_AVG * 0 | |
643 | SCHED_FEAT_HRTICK * 1 | | ||
644 | SCHED_FEAT_DOUBLE_TICK * 0; | ||
470 | 645 | ||
471 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 646 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) |
472 | 647 | ||
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features = | |||
477 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 652 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
478 | 653 | ||
479 | /* | 654 | /* |
655 | * period over which we measure -rt task cpu usage in ms. | ||
656 | * default: 1s | ||
657 | */ | ||
658 | const_debug unsigned int sysctl_sched_rt_period = 1000; | ||
659 | |||
660 | #define SCHED_RT_FRAC_SHIFT 16 | ||
661 | #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) | ||
662 | |||
663 | /* | ||
664 | * ratio of time -rt tasks may consume. | ||
665 | * default: 95% | ||
666 | */ | ||
667 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; | ||
668 | |||
669 | /* | ||
480 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 670 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
481 | * clock constructed from sched_clock(): | 671 | * clock constructed from sched_clock(): |
482 | */ | 672 | */ |
@@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
668 | struct rq *rq = cpu_rq(smp_processor_id()); | 858 | struct rq *rq = cpu_rq(smp_processor_id()); |
669 | u64 now = sched_clock(); | 859 | u64 now = sched_clock(); |
670 | 860 | ||
671 | touch_softlockup_watchdog(); | ||
672 | rq->idle_clock += delta_ns; | 861 | rq->idle_clock += delta_ns; |
673 | /* | 862 | /* |
674 | * Override the previous timestamp and ignore all | 863 | * Override the previous timestamp and ignore all |
@@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
680 | rq->prev_clock_raw = now; | 869 | rq->prev_clock_raw = now; |
681 | rq->clock += delta_ns; | 870 | rq->clock += delta_ns; |
682 | spin_unlock(&rq->lock); | 871 | spin_unlock(&rq->lock); |
872 | touch_softlockup_watchdog(); | ||
683 | } | 873 | } |
684 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 874 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
685 | 875 | ||
876 | static void __resched_task(struct task_struct *p, int tif_bit); | ||
877 | |||
878 | static inline void resched_task(struct task_struct *p) | ||
879 | { | ||
880 | __resched_task(p, TIF_NEED_RESCHED); | ||
881 | } | ||
882 | |||
883 | #ifdef CONFIG_SCHED_HRTICK | ||
884 | /* | ||
885 | * Use HR-timers to deliver accurate preemption points. | ||
886 | * | ||
887 | * It's all a bit involved since we cannot program an hrt while holding the | ||
888 | * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a | ||
889 | * reschedule event. | ||
890 | * | ||
891 | * When we get rescheduled we reprogram the hrtick_timer outside of the | ||
892 | * rq->lock. | ||
893 | */ | ||
894 | static inline void resched_hrt(struct task_struct *p) | ||
895 | { | ||
896 | __resched_task(p, TIF_HRTICK_RESCHED); | ||
897 | } | ||
898 | |||
899 | static inline void resched_rq(struct rq *rq) | ||
900 | { | ||
901 | unsigned long flags; | ||
902 | |||
903 | spin_lock_irqsave(&rq->lock, flags); | ||
904 | resched_task(rq->curr); | ||
905 | spin_unlock_irqrestore(&rq->lock, flags); | ||
906 | } | ||
907 | |||
908 | enum { | ||
909 | HRTICK_SET, /* re-program hrtick_timer */ | ||
910 | HRTICK_RESET, /* not a new slice */ | ||
911 | }; | ||
912 | |||
913 | /* | ||
914 | * Use hrtick when: | ||
915 | * - enabled by features | ||
916 | * - hrtimer is actually high res | ||
917 | */ | ||
918 | static inline int hrtick_enabled(struct rq *rq) | ||
919 | { | ||
920 | if (!sched_feat(HRTICK)) | ||
921 | return 0; | ||
922 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * Called to set the hrtick timer state. | ||
927 | * | ||
928 | * called with rq->lock held and irqs disabled | ||
929 | */ | ||
930 | static void hrtick_start(struct rq *rq, u64 delay, int reset) | ||
931 | { | ||
932 | assert_spin_locked(&rq->lock); | ||
933 | |||
934 | /* | ||
935 | * preempt at: now + delay | ||
936 | */ | ||
937 | rq->hrtick_expire = | ||
938 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); | ||
939 | /* | ||
940 | * indicate we need to program the timer | ||
941 | */ | ||
942 | __set_bit(HRTICK_SET, &rq->hrtick_flags); | ||
943 | if (reset) | ||
944 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
945 | |||
946 | /* | ||
947 | * New slices are called from the schedule path and don't need a | ||
948 | * forced reschedule. | ||
949 | */ | ||
950 | if (reset) | ||
951 | resched_hrt(rq->curr); | ||
952 | } | ||
953 | |||
954 | static void hrtick_clear(struct rq *rq) | ||
955 | { | ||
956 | if (hrtimer_active(&rq->hrtick_timer)) | ||
957 | hrtimer_cancel(&rq->hrtick_timer); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Update the timer from the possible pending state. | ||
962 | */ | ||
963 | static void hrtick_set(struct rq *rq) | ||
964 | { | ||
965 | ktime_t time; | ||
966 | int set, reset; | ||
967 | unsigned long flags; | ||
968 | |||
969 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
970 | |||
971 | spin_lock_irqsave(&rq->lock, flags); | ||
972 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); | ||
973 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
974 | time = rq->hrtick_expire; | ||
975 | clear_thread_flag(TIF_HRTICK_RESCHED); | ||
976 | spin_unlock_irqrestore(&rq->lock, flags); | ||
977 | |||
978 | if (set) { | ||
979 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); | ||
980 | if (reset && !hrtimer_active(&rq->hrtick_timer)) | ||
981 | resched_rq(rq); | ||
982 | } else | ||
983 | hrtick_clear(rq); | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * High-resolution timer tick. | ||
988 | * Runs from hardirq context with interrupts disabled. | ||
989 | */ | ||
990 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | ||
991 | { | ||
992 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | ||
993 | |||
994 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
995 | |||
996 | spin_lock(&rq->lock); | ||
997 | __update_rq_clock(rq); | ||
998 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | ||
999 | spin_unlock(&rq->lock); | ||
1000 | |||
1001 | return HRTIMER_NORESTART; | ||
1002 | } | ||
1003 | |||
1004 | static inline void init_rq_hrtick(struct rq *rq) | ||
1005 | { | ||
1006 | rq->hrtick_flags = 0; | ||
1007 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1008 | rq->hrtick_timer.function = hrtick; | ||
1009 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
1010 | } | ||
1011 | |||
1012 | void hrtick_resched(void) | ||
1013 | { | ||
1014 | struct rq *rq; | ||
1015 | unsigned long flags; | ||
1016 | |||
1017 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) | ||
1018 | return; | ||
1019 | |||
1020 | local_irq_save(flags); | ||
1021 | rq = cpu_rq(smp_processor_id()); | ||
1022 | hrtick_set(rq); | ||
1023 | local_irq_restore(flags); | ||
1024 | } | ||
1025 | #else | ||
1026 | static inline void hrtick_clear(struct rq *rq) | ||
1027 | { | ||
1028 | } | ||
1029 | |||
1030 | static inline void hrtick_set(struct rq *rq) | ||
1031 | { | ||
1032 | } | ||
1033 | |||
1034 | static inline void init_rq_hrtick(struct rq *rq) | ||
1035 | { | ||
1036 | } | ||
1037 | |||
1038 | void hrtick_resched(void) | ||
1039 | { | ||
1040 | } | ||
1041 | #endif | ||
1042 | |||
686 | /* | 1043 | /* |
687 | * resched_task - mark a task 'to be rescheduled now'. | 1044 | * resched_task - mark a task 'to be rescheduled now'. |
688 | * | 1045 | * |
@@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
696 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1053 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
697 | #endif | 1054 | #endif |
698 | 1055 | ||
699 | static void resched_task(struct task_struct *p) | 1056 | static void __resched_task(struct task_struct *p, int tif_bit) |
700 | { | 1057 | { |
701 | int cpu; | 1058 | int cpu; |
702 | 1059 | ||
703 | assert_spin_locked(&task_rq(p)->lock); | 1060 | assert_spin_locked(&task_rq(p)->lock); |
704 | 1061 | ||
705 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 1062 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) |
706 | return; | 1063 | return; |
707 | 1064 | ||
708 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 1065 | set_tsk_thread_flag(p, tif_bit); |
709 | 1066 | ||
710 | cpu = task_cpu(p); | 1067 | cpu = task_cpu(p); |
711 | if (cpu == smp_processor_id()) | 1068 | if (cpu == smp_processor_id()) |
@@ -728,10 +1085,10 @@ static void resched_cpu(int cpu) | |||
728 | spin_unlock_irqrestore(&rq->lock, flags); | 1085 | spin_unlock_irqrestore(&rq->lock, flags); |
729 | } | 1086 | } |
730 | #else | 1087 | #else |
731 | static inline void resched_task(struct task_struct *p) | 1088 | static void __resched_task(struct task_struct *p, int tif_bit) |
732 | { | 1089 | { |
733 | assert_spin_locked(&task_rq(p)->lock); | 1090 | assert_spin_locked(&task_rq(p)->lock); |
734 | set_tsk_need_resched(p); | 1091 | set_tsk_thread_flag(p, tif_bit); |
735 | } | 1092 | } |
736 | #endif | 1093 | #endif |
737 | 1094 | ||
@@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
871 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1228 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
872 | #endif | 1229 | #endif |
873 | 1230 | ||
1231 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1232 | { | ||
1233 | update_load_add(&rq->load, load); | ||
1234 | } | ||
1235 | |||
1236 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1237 | { | ||
1238 | update_load_sub(&rq->load, load); | ||
1239 | } | ||
1240 | |||
1241 | #ifdef CONFIG_SMP | ||
1242 | static unsigned long source_load(int cpu, int type); | ||
1243 | static unsigned long target_load(int cpu, int type); | ||
1244 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1245 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1246 | #endif /* CONFIG_SMP */ | ||
1247 | |||
874 | #include "sched_stats.h" | 1248 | #include "sched_stats.h" |
875 | #include "sched_idletask.c" | 1249 | #include "sched_idletask.c" |
876 | #include "sched_fair.c" | 1250 | #include "sched_fair.c" |
@@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | |||
881 | 1255 | ||
882 | #define sched_class_highest (&rt_sched_class) | 1256 | #define sched_class_highest (&rt_sched_class) |
883 | 1257 | ||
884 | /* | ||
885 | * Update delta_exec, delta_fair fields for rq. | ||
886 | * | ||
887 | * delta_fair clock advances at a rate inversely proportional to | ||
888 | * total load (rq->load.weight) on the runqueue, while | ||
889 | * delta_exec advances at the same rate as wall-clock (provided | ||
890 | * cpu is not idle). | ||
891 | * | ||
892 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | ||
893 | * runqueue over any given interval. This (smoothened) load is used | ||
894 | * during load balance. | ||
895 | * | ||
896 | * This function is called /before/ updating rq->load | ||
897 | * and when switching tasks. | ||
898 | */ | ||
899 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | ||
900 | { | ||
901 | update_load_add(&rq->load, p->se.load.weight); | ||
902 | } | ||
903 | |||
904 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
905 | { | ||
906 | update_load_sub(&rq->load, p->se.load.weight); | ||
907 | } | ||
908 | |||
909 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 1258 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
910 | { | 1259 | { |
911 | rq->nr_running++; | 1260 | rq->nr_running++; |
912 | inc_load(rq, p); | ||
913 | } | 1261 | } |
914 | 1262 | ||
915 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1263 | static void dec_nr_running(struct task_struct *p, struct rq *rq) |
916 | { | 1264 | { |
917 | rq->nr_running--; | 1265 | rq->nr_running--; |
918 | dec_load(rq, p); | ||
919 | } | 1266 | } |
920 | 1267 | ||
921 | static void set_load_weight(struct task_struct *p) | 1268 | static void set_load_weight(struct task_struct *p) |
@@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu) | |||
1039 | 1386 | ||
1040 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1387 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1041 | { | 1388 | { |
1042 | set_task_cfs_rq(p, cpu); | 1389 | set_task_rq(p, cpu); |
1043 | #ifdef CONFIG_SMP | 1390 | #ifdef CONFIG_SMP |
1044 | /* | 1391 | /* |
1045 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1392 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
@@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1051 | #endif | 1398 | #endif |
1052 | } | 1399 | } |
1053 | 1400 | ||
1401 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | ||
1402 | const struct sched_class *prev_class, | ||
1403 | int oldprio, int running) | ||
1404 | { | ||
1405 | if (prev_class != p->sched_class) { | ||
1406 | if (prev_class->switched_from) | ||
1407 | prev_class->switched_from(rq, p, running); | ||
1408 | p->sched_class->switched_to(rq, p, running); | ||
1409 | } else | ||
1410 | p->sched_class->prio_changed(rq, p, oldprio, running); | ||
1411 | } | ||
1412 | |||
1054 | #ifdef CONFIG_SMP | 1413 | #ifdef CONFIG_SMP |
1055 | 1414 | ||
1056 | /* | 1415 | /* |
1057 | * Is this task likely cache-hot: | 1416 | * Is this task likely cache-hot: |
1058 | */ | 1417 | */ |
1059 | static inline int | 1418 | static int |
1060 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 1419 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
1061 | { | 1420 | { |
1062 | s64 delta; | 1421 | s64 delta; |
@@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type) | |||
1281 | /* | 1640 | /* |
1282 | * Return the average load per task on the cpu's run queue | 1641 | * Return the average load per task on the cpu's run queue |
1283 | */ | 1642 | */ |
1284 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1643 | static unsigned long cpu_avg_load_per_task(int cpu) |
1285 | { | 1644 | { |
1286 | struct rq *rq = cpu_rq(cpu); | 1645 | struct rq *rq = cpu_rq(cpu); |
1287 | unsigned long total = weighted_cpuload(cpu); | 1646 | unsigned long total = weighted_cpuload(cpu); |
@@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag) | |||
1438 | 1797 | ||
1439 | #endif /* CONFIG_SMP */ | 1798 | #endif /* CONFIG_SMP */ |
1440 | 1799 | ||
1441 | /* | ||
1442 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
1443 | * not idle and an idle cpu is available. The span of cpus to | ||
1444 | * search starts with cpus closest then further out as needed, | ||
1445 | * so we always favor a closer, idle cpu. | ||
1446 | * | ||
1447 | * Returns the CPU we should wake onto. | ||
1448 | */ | ||
1449 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
1450 | static int wake_idle(int cpu, struct task_struct *p) | ||
1451 | { | ||
1452 | cpumask_t tmp; | ||
1453 | struct sched_domain *sd; | ||
1454 | int i; | ||
1455 | |||
1456 | /* | ||
1457 | * If it is idle, then it is the best cpu to run this task. | ||
1458 | * | ||
1459 | * This cpu is also the best, if it has more than one task already. | ||
1460 | * Siblings must be also busy(in most cases) as they didn't already | ||
1461 | * pickup the extra load from this cpu and hence we need not check | ||
1462 | * sibling runqueue info. This will avoid the checks and cache miss | ||
1463 | * penalities associated with that. | ||
1464 | */ | ||
1465 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
1466 | return cpu; | ||
1467 | |||
1468 | for_each_domain(cpu, sd) { | ||
1469 | if (sd->flags & SD_WAKE_IDLE) { | ||
1470 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
1471 | for_each_cpu_mask(i, tmp) { | ||
1472 | if (idle_cpu(i)) { | ||
1473 | if (i != task_cpu(p)) { | ||
1474 | schedstat_inc(p, | ||
1475 | se.nr_wakeups_idle); | ||
1476 | } | ||
1477 | return i; | ||
1478 | } | ||
1479 | } | ||
1480 | } else { | ||
1481 | break; | ||
1482 | } | ||
1483 | } | ||
1484 | return cpu; | ||
1485 | } | ||
1486 | #else | ||
1487 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
1488 | { | ||
1489 | return cpu; | ||
1490 | } | ||
1491 | #endif | ||
1492 | |||
1493 | /*** | 1800 | /*** |
1494 | * try_to_wake_up - wake up a thread | 1801 | * try_to_wake_up - wake up a thread |
1495 | * @p: the to-be-woken-up thread | 1802 | * @p: the to-be-woken-up thread |
@@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1510 | unsigned long flags; | 1817 | unsigned long flags; |
1511 | long old_state; | 1818 | long old_state; |
1512 | struct rq *rq; | 1819 | struct rq *rq; |
1513 | #ifdef CONFIG_SMP | ||
1514 | struct sched_domain *sd, *this_sd = NULL; | ||
1515 | unsigned long load, this_load; | ||
1516 | int new_cpu; | ||
1517 | #endif | ||
1518 | 1820 | ||
1519 | rq = task_rq_lock(p, &flags); | 1821 | rq = task_rq_lock(p, &flags); |
1520 | old_state = p->state; | 1822 | old_state = p->state; |
@@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1532 | if (unlikely(task_running(rq, p))) | 1834 | if (unlikely(task_running(rq, p))) |
1533 | goto out_activate; | 1835 | goto out_activate; |
1534 | 1836 | ||
1535 | new_cpu = cpu; | 1837 | cpu = p->sched_class->select_task_rq(p, sync); |
1536 | 1838 | if (cpu != orig_cpu) { | |
1537 | schedstat_inc(rq, ttwu_count); | 1839 | set_task_cpu(p, cpu); |
1538 | if (cpu == this_cpu) { | ||
1539 | schedstat_inc(rq, ttwu_local); | ||
1540 | goto out_set_cpu; | ||
1541 | } | ||
1542 | |||
1543 | for_each_domain(this_cpu, sd) { | ||
1544 | if (cpu_isset(cpu, sd->span)) { | ||
1545 | schedstat_inc(sd, ttwu_wake_remote); | ||
1546 | this_sd = sd; | ||
1547 | break; | ||
1548 | } | ||
1549 | } | ||
1550 | |||
1551 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1552 | goto out_set_cpu; | ||
1553 | |||
1554 | /* | ||
1555 | * Check for affine wakeup and passive balancing possibilities. | ||
1556 | */ | ||
1557 | if (this_sd) { | ||
1558 | int idx = this_sd->wake_idx; | ||
1559 | unsigned int imbalance; | ||
1560 | |||
1561 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1562 | |||
1563 | load = source_load(cpu, idx); | ||
1564 | this_load = target_load(this_cpu, idx); | ||
1565 | |||
1566 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1567 | |||
1568 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1569 | unsigned long tl = this_load; | ||
1570 | unsigned long tl_per_task; | ||
1571 | |||
1572 | /* | ||
1573 | * Attract cache-cold tasks on sync wakeups: | ||
1574 | */ | ||
1575 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1576 | goto out_set_cpu; | ||
1577 | |||
1578 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1579 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1580 | |||
1581 | /* | ||
1582 | * If sync wakeup then subtract the (maximum possible) | ||
1583 | * effect of the currently running task from the load | ||
1584 | * of the current CPU: | ||
1585 | */ | ||
1586 | if (sync) | ||
1587 | tl -= current->se.load.weight; | ||
1588 | |||
1589 | if ((tl <= load && | ||
1590 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1591 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1592 | /* | ||
1593 | * This domain has SD_WAKE_AFFINE and | ||
1594 | * p is cache cold in this domain, and | ||
1595 | * there is no bad imbalance. | ||
1596 | */ | ||
1597 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1598 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1599 | goto out_set_cpu; | ||
1600 | } | ||
1601 | } | ||
1602 | |||
1603 | /* | ||
1604 | * Start passive balancing when half the imbalance_pct | ||
1605 | * limit is reached. | ||
1606 | */ | ||
1607 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1608 | if (imbalance*this_load <= 100*load) { | ||
1609 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1610 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1611 | goto out_set_cpu; | ||
1612 | } | ||
1613 | } | ||
1614 | } | ||
1615 | |||
1616 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1617 | out_set_cpu: | ||
1618 | new_cpu = wake_idle(new_cpu, p); | ||
1619 | if (new_cpu != cpu) { | ||
1620 | set_task_cpu(p, new_cpu); | ||
1621 | task_rq_unlock(rq, &flags); | 1840 | task_rq_unlock(rq, &flags); |
1622 | /* might preempt at this point */ | 1841 | /* might preempt at this point */ |
1623 | rq = task_rq_lock(p, &flags); | 1842 | rq = task_rq_lock(p, &flags); |
@@ -1631,6 +1850,21 @@ out_set_cpu: | |||
1631 | cpu = task_cpu(p); | 1850 | cpu = task_cpu(p); |
1632 | } | 1851 | } |
1633 | 1852 | ||
1853 | #ifdef CONFIG_SCHEDSTATS | ||
1854 | schedstat_inc(rq, ttwu_count); | ||
1855 | if (cpu == this_cpu) | ||
1856 | schedstat_inc(rq, ttwu_local); | ||
1857 | else { | ||
1858 | struct sched_domain *sd; | ||
1859 | for_each_domain(this_cpu, sd) { | ||
1860 | if (cpu_isset(cpu, sd->span)) { | ||
1861 | schedstat_inc(sd, ttwu_wake_remote); | ||
1862 | break; | ||
1863 | } | ||
1864 | } | ||
1865 | } | ||
1866 | #endif | ||
1867 | |||
1634 | out_activate: | 1868 | out_activate: |
1635 | #endif /* CONFIG_SMP */ | 1869 | #endif /* CONFIG_SMP */ |
1636 | schedstat_inc(p, se.nr_wakeups); | 1870 | schedstat_inc(p, se.nr_wakeups); |
@@ -1649,6 +1883,10 @@ out_activate: | |||
1649 | 1883 | ||
1650 | out_running: | 1884 | out_running: |
1651 | p->state = TASK_RUNNING; | 1885 | p->state = TASK_RUNNING; |
1886 | #ifdef CONFIG_SMP | ||
1887 | if (p->sched_class->task_wake_up) | ||
1888 | p->sched_class->task_wake_up(rq, p); | ||
1889 | #endif | ||
1652 | out: | 1890 | out: |
1653 | task_rq_unlock(rq, &flags); | 1891 | task_rq_unlock(rq, &flags); |
1654 | 1892 | ||
@@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p) | |||
1691 | p->se.wait_max = 0; | 1929 | p->se.wait_max = 0; |
1692 | #endif | 1930 | #endif |
1693 | 1931 | ||
1694 | INIT_LIST_HEAD(&p->run_list); | 1932 | INIT_LIST_HEAD(&p->rt.run_list); |
1695 | p->se.on_rq = 0; | 1933 | p->se.on_rq = 0; |
1696 | 1934 | ||
1697 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1935 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1771 | inc_nr_running(p, rq); | 2009 | inc_nr_running(p, rq); |
1772 | } | 2010 | } |
1773 | check_preempt_curr(rq, p); | 2011 | check_preempt_curr(rq, p); |
2012 | #ifdef CONFIG_SMP | ||
2013 | if (p->sched_class->task_wake_up) | ||
2014 | p->sched_class->task_wake_up(rq, p); | ||
2015 | #endif | ||
1774 | task_rq_unlock(rq, &flags); | 2016 | task_rq_unlock(rq, &flags); |
1775 | } | 2017 | } |
1776 | 2018 | ||
@@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1891 | prev_state = prev->state; | 2133 | prev_state = prev->state; |
1892 | finish_arch_switch(prev); | 2134 | finish_arch_switch(prev); |
1893 | finish_lock_switch(rq, prev); | 2135 | finish_lock_switch(rq, prev); |
2136 | #ifdef CONFIG_SMP | ||
2137 | if (current->sched_class->post_schedule) | ||
2138 | current->sched_class->post_schedule(rq); | ||
2139 | #endif | ||
2140 | |||
1894 | fire_sched_in_preempt_notifiers(current); | 2141 | fire_sched_in_preempt_notifiers(current); |
1895 | if (mm) | 2142 | if (mm) |
1896 | mmdrop(mm); | 2143 | mmdrop(mm); |
@@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
2124 | /* | 2371 | /* |
2125 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2372 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
2126 | */ | 2373 | */ |
2127 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2374 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
2128 | __releases(this_rq->lock) | 2375 | __releases(this_rq->lock) |
2129 | __acquires(busiest->lock) | 2376 | __acquires(busiest->lock) |
2130 | __acquires(this_rq->lock) | 2377 | __acquires(this_rq->lock) |
2131 | { | 2378 | { |
2379 | int ret = 0; | ||
2380 | |||
2132 | if (unlikely(!irqs_disabled())) { | 2381 | if (unlikely(!irqs_disabled())) { |
2133 | /* printk() doesn't work good under rq->lock */ | 2382 | /* printk() doesn't work good under rq->lock */ |
2134 | spin_unlock(&this_rq->lock); | 2383 | spin_unlock(&this_rq->lock); |
@@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
2139 | spin_unlock(&this_rq->lock); | 2388 | spin_unlock(&this_rq->lock); |
2140 | spin_lock(&busiest->lock); | 2389 | spin_lock(&busiest->lock); |
2141 | spin_lock(&this_rq->lock); | 2390 | spin_lock(&this_rq->lock); |
2391 | ret = 1; | ||
2142 | } else | 2392 | } else |
2143 | spin_lock(&busiest->lock); | 2393 | spin_lock(&busiest->lock); |
2144 | } | 2394 | } |
2395 | return ret; | ||
2145 | } | 2396 | } |
2146 | 2397 | ||
2147 | /* | 2398 | /* |
@@ -3485,12 +3736,14 @@ void scheduler_tick(void) | |||
3485 | /* | 3736 | /* |
3486 | * Let rq->clock advance by at least TICK_NSEC: | 3737 | * Let rq->clock advance by at least TICK_NSEC: |
3487 | */ | 3738 | */ |
3488 | if (unlikely(rq->clock < next_tick)) | 3739 | if (unlikely(rq->clock < next_tick)) { |
3489 | rq->clock = next_tick; | 3740 | rq->clock = next_tick; |
3741 | rq->clock_underflows++; | ||
3742 | } | ||
3490 | rq->tick_timestamp = rq->clock; | 3743 | rq->tick_timestamp = rq->clock; |
3491 | update_cpu_load(rq); | 3744 | update_cpu_load(rq); |
3492 | if (curr != rq->idle) /* FIXME: needed? */ | 3745 | curr->sched_class->task_tick(rq, curr, 0); |
3493 | curr->sched_class->task_tick(rq, curr); | 3746 | update_sched_rt_period(rq); |
3494 | spin_unlock(&rq->lock); | 3747 | spin_unlock(&rq->lock); |
3495 | 3748 | ||
3496 | #ifdef CONFIG_SMP | 3749 | #ifdef CONFIG_SMP |
@@ -3636,6 +3889,8 @@ need_resched_nonpreemptible: | |||
3636 | 3889 | ||
3637 | schedule_debug(prev); | 3890 | schedule_debug(prev); |
3638 | 3891 | ||
3892 | hrtick_clear(rq); | ||
3893 | |||
3639 | /* | 3894 | /* |
3640 | * Do the rq-clock update outside the rq lock: | 3895 | * Do the rq-clock update outside the rq lock: |
3641 | */ | 3896 | */ |
@@ -3654,6 +3909,11 @@ need_resched_nonpreemptible: | |||
3654 | switch_count = &prev->nvcsw; | 3909 | switch_count = &prev->nvcsw; |
3655 | } | 3910 | } |
3656 | 3911 | ||
3912 | #ifdef CONFIG_SMP | ||
3913 | if (prev->sched_class->pre_schedule) | ||
3914 | prev->sched_class->pre_schedule(rq, prev); | ||
3915 | #endif | ||
3916 | |||
3657 | if (unlikely(!rq->nr_running)) | 3917 | if (unlikely(!rq->nr_running)) |
3658 | idle_balance(cpu, rq); | 3918 | idle_balance(cpu, rq); |
3659 | 3919 | ||
@@ -3668,14 +3928,20 @@ need_resched_nonpreemptible: | |||
3668 | ++*switch_count; | 3928 | ++*switch_count; |
3669 | 3929 | ||
3670 | context_switch(rq, prev, next); /* unlocks the rq */ | 3930 | context_switch(rq, prev, next); /* unlocks the rq */ |
3931 | /* | ||
3932 | * the context switch might have flipped the stack from under | ||
3933 | * us, hence refresh the local variables. | ||
3934 | */ | ||
3935 | cpu = smp_processor_id(); | ||
3936 | rq = cpu_rq(cpu); | ||
3671 | } else | 3937 | } else |
3672 | spin_unlock_irq(&rq->lock); | 3938 | spin_unlock_irq(&rq->lock); |
3673 | 3939 | ||
3674 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3940 | hrtick_set(rq); |
3675 | cpu = smp_processor_id(); | 3941 | |
3676 | rq = cpu_rq(cpu); | 3942 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
3677 | goto need_resched_nonpreemptible; | 3943 | goto need_resched_nonpreemptible; |
3678 | } | 3944 | |
3679 | preempt_enable_no_resched(); | 3945 | preempt_enable_no_resched(); |
3680 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3946 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3681 | goto need_resched; | 3947 | goto need_resched; |
@@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule); | |||
3691 | asmlinkage void __sched preempt_schedule(void) | 3957 | asmlinkage void __sched preempt_schedule(void) |
3692 | { | 3958 | { |
3693 | struct thread_info *ti = current_thread_info(); | 3959 | struct thread_info *ti = current_thread_info(); |
3694 | #ifdef CONFIG_PREEMPT_BKL | ||
3695 | struct task_struct *task = current; | 3960 | struct task_struct *task = current; |
3696 | int saved_lock_depth; | 3961 | int saved_lock_depth; |
3697 | #endif | 3962 | |
3698 | /* | 3963 | /* |
3699 | * If there is a non-zero preempt_count or interrupts are disabled, | 3964 | * If there is a non-zero preempt_count or interrupts are disabled, |
3700 | * we do not want to preempt the current task. Just return.. | 3965 | * we do not want to preempt the current task. Just return.. |
@@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void) | |||
3710 | * clear ->lock_depth so that schedule() doesnt | 3975 | * clear ->lock_depth so that schedule() doesnt |
3711 | * auto-release the semaphore: | 3976 | * auto-release the semaphore: |
3712 | */ | 3977 | */ |
3713 | #ifdef CONFIG_PREEMPT_BKL | ||
3714 | saved_lock_depth = task->lock_depth; | 3978 | saved_lock_depth = task->lock_depth; |
3715 | task->lock_depth = -1; | 3979 | task->lock_depth = -1; |
3716 | #endif | ||
3717 | schedule(); | 3980 | schedule(); |
3718 | #ifdef CONFIG_PREEMPT_BKL | ||
3719 | task->lock_depth = saved_lock_depth; | 3981 | task->lock_depth = saved_lock_depth; |
3720 | #endif | ||
3721 | sub_preempt_count(PREEMPT_ACTIVE); | 3982 | sub_preempt_count(PREEMPT_ACTIVE); |
3722 | 3983 | ||
3723 | /* | 3984 | /* |
@@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule); | |||
3738 | asmlinkage void __sched preempt_schedule_irq(void) | 3999 | asmlinkage void __sched preempt_schedule_irq(void) |
3739 | { | 4000 | { |
3740 | struct thread_info *ti = current_thread_info(); | 4001 | struct thread_info *ti = current_thread_info(); |
3741 | #ifdef CONFIG_PREEMPT_BKL | ||
3742 | struct task_struct *task = current; | 4002 | struct task_struct *task = current; |
3743 | int saved_lock_depth; | 4003 | int saved_lock_depth; |
3744 | #endif | 4004 | |
3745 | /* Catch callers which need to be fixed */ | 4005 | /* Catch callers which need to be fixed */ |
3746 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 4006 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3747 | 4007 | ||
@@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3753 | * clear ->lock_depth so that schedule() doesnt | 4013 | * clear ->lock_depth so that schedule() doesnt |
3754 | * auto-release the semaphore: | 4014 | * auto-release the semaphore: |
3755 | */ | 4015 | */ |
3756 | #ifdef CONFIG_PREEMPT_BKL | ||
3757 | saved_lock_depth = task->lock_depth; | 4016 | saved_lock_depth = task->lock_depth; |
3758 | task->lock_depth = -1; | 4017 | task->lock_depth = -1; |
3759 | #endif | ||
3760 | local_irq_enable(); | 4018 | local_irq_enable(); |
3761 | schedule(); | 4019 | schedule(); |
3762 | local_irq_disable(); | 4020 | local_irq_disable(); |
3763 | #ifdef CONFIG_PREEMPT_BKL | ||
3764 | task->lock_depth = saved_lock_depth; | 4021 | task->lock_depth = saved_lock_depth; |
3765 | #endif | ||
3766 | sub_preempt_count(PREEMPT_ACTIVE); | 4022 | sub_preempt_count(PREEMPT_ACTIVE); |
3767 | 4023 | ||
3768 | /* | 4024 | /* |
@@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4019 | unsigned long flags; | 4275 | unsigned long flags; |
4020 | int oldprio, on_rq, running; | 4276 | int oldprio, on_rq, running; |
4021 | struct rq *rq; | 4277 | struct rq *rq; |
4278 | const struct sched_class *prev_class = p->sched_class; | ||
4022 | 4279 | ||
4023 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4280 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4024 | 4281 | ||
@@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4044 | if (on_rq) { | 4301 | if (on_rq) { |
4045 | if (running) | 4302 | if (running) |
4046 | p->sched_class->set_curr_task(rq); | 4303 | p->sched_class->set_curr_task(rq); |
4304 | |||
4047 | enqueue_task(rq, p, 0); | 4305 | enqueue_task(rq, p, 0); |
4048 | /* | 4306 | |
4049 | * Reschedule if we are currently running on this runqueue and | 4307 | check_class_changed(rq, p, prev_class, oldprio, running); |
4050 | * our priority decreased, or if we are not currently running on | ||
4051 | * this runqueue and our priority is higher than the current's | ||
4052 | */ | ||
4053 | if (running) { | ||
4054 | if (p->prio > oldprio) | ||
4055 | resched_task(rq->curr); | ||
4056 | } else { | ||
4057 | check_preempt_curr(rq, p); | ||
4058 | } | ||
4059 | } | 4308 | } |
4060 | task_rq_unlock(rq, &flags); | 4309 | task_rq_unlock(rq, &flags); |
4061 | } | 4310 | } |
@@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4087 | goto out_unlock; | 4336 | goto out_unlock; |
4088 | } | 4337 | } |
4089 | on_rq = p->se.on_rq; | 4338 | on_rq = p->se.on_rq; |
4090 | if (on_rq) { | 4339 | if (on_rq) |
4091 | dequeue_task(rq, p, 0); | 4340 | dequeue_task(rq, p, 0); |
4092 | dec_load(rq, p); | ||
4093 | } | ||
4094 | 4341 | ||
4095 | p->static_prio = NICE_TO_PRIO(nice); | 4342 | p->static_prio = NICE_TO_PRIO(nice); |
4096 | set_load_weight(p); | 4343 | set_load_weight(p); |
@@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4100 | 4347 | ||
4101 | if (on_rq) { | 4348 | if (on_rq) { |
4102 | enqueue_task(rq, p, 0); | 4349 | enqueue_task(rq, p, 0); |
4103 | inc_load(rq, p); | ||
4104 | /* | 4350 | /* |
4105 | * If the task increased its priority or is running and | 4351 | * If the task increased its priority or is running and |
4106 | * lowered its priority, then reschedule its CPU: | 4352 | * lowered its priority, then reschedule its CPU: |
@@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
4258 | { | 4504 | { |
4259 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4505 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4260 | unsigned long flags; | 4506 | unsigned long flags; |
4507 | const struct sched_class *prev_class = p->sched_class; | ||
4261 | struct rq *rq; | 4508 | struct rq *rq; |
4262 | 4509 | ||
4263 | /* may grab non-irq protected spin_locks */ | 4510 | /* may grab non-irq protected spin_locks */ |
@@ -4351,18 +4598,10 @@ recheck: | |||
4351 | if (on_rq) { | 4598 | if (on_rq) { |
4352 | if (running) | 4599 | if (running) |
4353 | p->sched_class->set_curr_task(rq); | 4600 | p->sched_class->set_curr_task(rq); |
4601 | |||
4354 | activate_task(rq, p, 0); | 4602 | activate_task(rq, p, 0); |
4355 | /* | 4603 | |
4356 | * Reschedule if we are currently running on this runqueue and | 4604 | check_class_changed(rq, p, prev_class, oldprio, running); |
4357 | * our priority decreased, or if we are not currently running on | ||
4358 | * this runqueue and our priority is higher than the current's | ||
4359 | */ | ||
4360 | if (running) { | ||
4361 | if (p->prio > oldprio) | ||
4362 | resched_task(rq->curr); | ||
4363 | } else { | ||
4364 | check_preempt_curr(rq, p); | ||
4365 | } | ||
4366 | } | 4605 | } |
4367 | __task_rq_unlock(rq); | 4606 | __task_rq_unlock(rq); |
4368 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4607 | spin_unlock_irqrestore(&p->pi_lock, flags); |
@@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4490 | struct task_struct *p; | 4729 | struct task_struct *p; |
4491 | int retval; | 4730 | int retval; |
4492 | 4731 | ||
4493 | mutex_lock(&sched_hotcpu_mutex); | 4732 | get_online_cpus(); |
4494 | read_lock(&tasklist_lock); | 4733 | read_lock(&tasklist_lock); |
4495 | 4734 | ||
4496 | p = find_process_by_pid(pid); | 4735 | p = find_process_by_pid(pid); |
4497 | if (!p) { | 4736 | if (!p) { |
4498 | read_unlock(&tasklist_lock); | 4737 | read_unlock(&tasklist_lock); |
4499 | mutex_unlock(&sched_hotcpu_mutex); | 4738 | put_online_cpus(); |
4500 | return -ESRCH; | 4739 | return -ESRCH; |
4501 | } | 4740 | } |
4502 | 4741 | ||
@@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4536 | } | 4775 | } |
4537 | out_unlock: | 4776 | out_unlock: |
4538 | put_task_struct(p); | 4777 | put_task_struct(p); |
4539 | mutex_unlock(&sched_hotcpu_mutex); | 4778 | put_online_cpus(); |
4540 | return retval; | 4779 | return retval; |
4541 | } | 4780 | } |
4542 | 4781 | ||
@@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4593 | struct task_struct *p; | 4832 | struct task_struct *p; |
4594 | int retval; | 4833 | int retval; |
4595 | 4834 | ||
4596 | mutex_lock(&sched_hotcpu_mutex); | 4835 | get_online_cpus(); |
4597 | read_lock(&tasklist_lock); | 4836 | read_lock(&tasklist_lock); |
4598 | 4837 | ||
4599 | retval = -ESRCH; | 4838 | retval = -ESRCH; |
@@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4609 | 4848 | ||
4610 | out_unlock: | 4849 | out_unlock: |
4611 | read_unlock(&tasklist_lock); | 4850 | read_unlock(&tasklist_lock); |
4612 | mutex_unlock(&sched_hotcpu_mutex); | 4851 | put_online_cpus(); |
4613 | 4852 | ||
4614 | return retval; | 4853 | return retval; |
4615 | } | 4854 | } |
@@ -4683,7 +4922,8 @@ static void __cond_resched(void) | |||
4683 | } while (need_resched()); | 4922 | } while (need_resched()); |
4684 | } | 4923 | } |
4685 | 4924 | ||
4686 | int __sched cond_resched(void) | 4925 | #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) |
4926 | int __sched _cond_resched(void) | ||
4687 | { | 4927 | { |
4688 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4928 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
4689 | system_state == SYSTEM_RUNNING) { | 4929 | system_state == SYSTEM_RUNNING) { |
@@ -4692,7 +4932,8 @@ int __sched cond_resched(void) | |||
4692 | } | 4932 | } |
4693 | return 0; | 4933 | return 0; |
4694 | } | 4934 | } |
4695 | EXPORT_SYMBOL(cond_resched); | 4935 | EXPORT_SYMBOL(_cond_resched); |
4936 | #endif | ||
4696 | 4937 | ||
4697 | /* | 4938 | /* |
4698 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4939 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
@@ -4890,7 +5131,7 @@ out_unlock: | |||
4890 | 5131 | ||
4891 | static const char stat_nam[] = "RSDTtZX"; | 5132 | static const char stat_nam[] = "RSDTtZX"; |
4892 | 5133 | ||
4893 | static void show_task(struct task_struct *p) | 5134 | void sched_show_task(struct task_struct *p) |
4894 | { | 5135 | { |
4895 | unsigned long free = 0; | 5136 | unsigned long free = 0; |
4896 | unsigned state; | 5137 | unsigned state; |
@@ -4920,8 +5161,7 @@ static void show_task(struct task_struct *p) | |||
4920 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 5161 | printk(KERN_CONT "%5lu %5d %6d\n", free, |
4921 | task_pid_nr(p), task_pid_nr(p->real_parent)); | 5162 | task_pid_nr(p), task_pid_nr(p->real_parent)); |
4922 | 5163 | ||
4923 | if (state != TASK_RUNNING) | 5164 | show_stack(p, NULL); |
4924 | show_stack(p, NULL); | ||
4925 | } | 5165 | } |
4926 | 5166 | ||
4927 | void show_state_filter(unsigned long state_filter) | 5167 | void show_state_filter(unsigned long state_filter) |
@@ -4943,7 +5183,7 @@ void show_state_filter(unsigned long state_filter) | |||
4943 | */ | 5183 | */ |
4944 | touch_nmi_watchdog(); | 5184 | touch_nmi_watchdog(); |
4945 | if (!state_filter || (p->state & state_filter)) | 5185 | if (!state_filter || (p->state & state_filter)) |
4946 | show_task(p); | 5186 | sched_show_task(p); |
4947 | } while_each_thread(g, p); | 5187 | } while_each_thread(g, p); |
4948 | 5188 | ||
4949 | touch_all_softlockup_watchdogs(); | 5189 | touch_all_softlockup_watchdogs(); |
@@ -4992,11 +5232,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4992 | spin_unlock_irqrestore(&rq->lock, flags); | 5232 | spin_unlock_irqrestore(&rq->lock, flags); |
4993 | 5233 | ||
4994 | /* Set the preempt count _outside_ the spinlocks! */ | 5234 | /* Set the preempt count _outside_ the spinlocks! */ |
4995 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | ||
4996 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
4997 | #else | ||
4998 | task_thread_info(idle)->preempt_count = 0; | 5235 | task_thread_info(idle)->preempt_count = 0; |
4999 | #endif | 5236 | |
5000 | /* | 5237 | /* |
5001 | * The idle tasks have their own, simple scheduling class: | 5238 | * The idle tasks have their own, simple scheduling class: |
5002 | */ | 5239 | */ |
@@ -5077,7 +5314,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
5077 | goto out; | 5314 | goto out; |
5078 | } | 5315 | } |
5079 | 5316 | ||
5080 | p->cpus_allowed = new_mask; | 5317 | if (p->sched_class->set_cpus_allowed) |
5318 | p->sched_class->set_cpus_allowed(p, &new_mask); | ||
5319 | else { | ||
5320 | p->cpus_allowed = new_mask; | ||
5321 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | ||
5322 | } | ||
5323 | |||
5081 | /* Can the task run on the task's current CPU? If so, we're done */ | 5324 | /* Can the task run on the task's current CPU? If so, we're done */ |
5082 | if (cpu_isset(task_cpu(p), new_mask)) | 5325 | if (cpu_isset(task_cpu(p), new_mask)) |
5083 | goto out; | 5326 | goto out; |
@@ -5569,9 +5812,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5569 | struct rq *rq; | 5812 | struct rq *rq; |
5570 | 5813 | ||
5571 | switch (action) { | 5814 | switch (action) { |
5572 | case CPU_LOCK_ACQUIRE: | ||
5573 | mutex_lock(&sched_hotcpu_mutex); | ||
5574 | break; | ||
5575 | 5815 | ||
5576 | case CPU_UP_PREPARE: | 5816 | case CPU_UP_PREPARE: |
5577 | case CPU_UP_PREPARE_FROZEN: | 5817 | case CPU_UP_PREPARE_FROZEN: |
@@ -5590,6 +5830,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5590 | case CPU_ONLINE_FROZEN: | 5830 | case CPU_ONLINE_FROZEN: |
5591 | /* Strictly unnecessary, as first user will wake it. */ | 5831 | /* Strictly unnecessary, as first user will wake it. */ |
5592 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5832 | wake_up_process(cpu_rq(cpu)->migration_thread); |
5833 | |||
5834 | /* Update our root-domain */ | ||
5835 | rq = cpu_rq(cpu); | ||
5836 | spin_lock_irqsave(&rq->lock, flags); | ||
5837 | if (rq->rd) { | ||
5838 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5839 | cpu_set(cpu, rq->rd->online); | ||
5840 | } | ||
5841 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5593 | break; | 5842 | break; |
5594 | 5843 | ||
5595 | #ifdef CONFIG_HOTPLUG_CPU | 5844 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -5640,10 +5889,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5640 | } | 5889 | } |
5641 | spin_unlock_irq(&rq->lock); | 5890 | spin_unlock_irq(&rq->lock); |
5642 | break; | 5891 | break; |
5643 | #endif | 5892 | |
5644 | case CPU_LOCK_RELEASE: | 5893 | case CPU_DOWN_PREPARE: |
5645 | mutex_unlock(&sched_hotcpu_mutex); | 5894 | /* Update our root-domain */ |
5895 | rq = cpu_rq(cpu); | ||
5896 | spin_lock_irqsave(&rq->lock, flags); | ||
5897 | if (rq->rd) { | ||
5898 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5899 | cpu_clear(cpu, rq->rd->online); | ||
5900 | } | ||
5901 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5646 | break; | 5902 | break; |
5903 | #endif | ||
5647 | } | 5904 | } |
5648 | return NOTIFY_OK; | 5905 | return NOTIFY_OK; |
5649 | } | 5906 | } |
@@ -5831,11 +6088,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5831 | return 1; | 6088 | return 1; |
5832 | } | 6089 | } |
5833 | 6090 | ||
6091 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
6092 | { | ||
6093 | unsigned long flags; | ||
6094 | const struct sched_class *class; | ||
6095 | |||
6096 | spin_lock_irqsave(&rq->lock, flags); | ||
6097 | |||
6098 | if (rq->rd) { | ||
6099 | struct root_domain *old_rd = rq->rd; | ||
6100 | |||
6101 | for (class = sched_class_highest; class; class = class->next) { | ||
6102 | if (class->leave_domain) | ||
6103 | class->leave_domain(rq); | ||
6104 | } | ||
6105 | |||
6106 | cpu_clear(rq->cpu, old_rd->span); | ||
6107 | cpu_clear(rq->cpu, old_rd->online); | ||
6108 | |||
6109 | if (atomic_dec_and_test(&old_rd->refcount)) | ||
6110 | kfree(old_rd); | ||
6111 | } | ||
6112 | |||
6113 | atomic_inc(&rd->refcount); | ||
6114 | rq->rd = rd; | ||
6115 | |||
6116 | cpu_set(rq->cpu, rd->span); | ||
6117 | if (cpu_isset(rq->cpu, cpu_online_map)) | ||
6118 | cpu_set(rq->cpu, rd->online); | ||
6119 | |||
6120 | for (class = sched_class_highest; class; class = class->next) { | ||
6121 | if (class->join_domain) | ||
6122 | class->join_domain(rq); | ||
6123 | } | ||
6124 | |||
6125 | spin_unlock_irqrestore(&rq->lock, flags); | ||
6126 | } | ||
6127 | |||
6128 | static void init_rootdomain(struct root_domain *rd) | ||
6129 | { | ||
6130 | memset(rd, 0, sizeof(*rd)); | ||
6131 | |||
6132 | cpus_clear(rd->span); | ||
6133 | cpus_clear(rd->online); | ||
6134 | } | ||
6135 | |||
6136 | static void init_defrootdomain(void) | ||
6137 | { | ||
6138 | init_rootdomain(&def_root_domain); | ||
6139 | atomic_set(&def_root_domain.refcount, 1); | ||
6140 | } | ||
6141 | |||
6142 | static struct root_domain *alloc_rootdomain(void) | ||
6143 | { | ||
6144 | struct root_domain *rd; | ||
6145 | |||
6146 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
6147 | if (!rd) | ||
6148 | return NULL; | ||
6149 | |||
6150 | init_rootdomain(rd); | ||
6151 | |||
6152 | return rd; | ||
6153 | } | ||
6154 | |||
5834 | /* | 6155 | /* |
5835 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6156 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
5836 | * hold the hotplug lock. | 6157 | * hold the hotplug lock. |
5837 | */ | 6158 | */ |
5838 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 6159 | static void |
6160 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
5839 | { | 6161 | { |
5840 | struct rq *rq = cpu_rq(cpu); | 6162 | struct rq *rq = cpu_rq(cpu); |
5841 | struct sched_domain *tmp; | 6163 | struct sched_domain *tmp; |
@@ -5860,6 +6182,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
5860 | 6182 | ||
5861 | sched_domain_debug(sd, cpu); | 6183 | sched_domain_debug(sd, cpu); |
5862 | 6184 | ||
6185 | rq_attach_root(rq, rd); | ||
5863 | rcu_assign_pointer(rq->sd, sd); | 6186 | rcu_assign_pointer(rq->sd, sd); |
5864 | } | 6187 | } |
5865 | 6188 | ||
@@ -6228,6 +6551,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6228 | static int build_sched_domains(const cpumask_t *cpu_map) | 6551 | static int build_sched_domains(const cpumask_t *cpu_map) |
6229 | { | 6552 | { |
6230 | int i; | 6553 | int i; |
6554 | struct root_domain *rd; | ||
6231 | #ifdef CONFIG_NUMA | 6555 | #ifdef CONFIG_NUMA |
6232 | struct sched_group **sched_group_nodes = NULL; | 6556 | struct sched_group **sched_group_nodes = NULL; |
6233 | int sd_allnodes = 0; | 6557 | int sd_allnodes = 0; |
@@ -6244,6 +6568,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6244 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6568 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
6245 | #endif | 6569 | #endif |
6246 | 6570 | ||
6571 | rd = alloc_rootdomain(); | ||
6572 | if (!rd) { | ||
6573 | printk(KERN_WARNING "Cannot alloc root domain\n"); | ||
6574 | return -ENOMEM; | ||
6575 | } | ||
6576 | |||
6247 | /* | 6577 | /* |
6248 | * Set up domains for cpus specified by the cpu_map. | 6578 | * Set up domains for cpus specified by the cpu_map. |
6249 | */ | 6579 | */ |
@@ -6460,7 +6790,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6460 | #else | 6790 | #else |
6461 | sd = &per_cpu(phys_domains, i); | 6791 | sd = &per_cpu(phys_domains, i); |
6462 | #endif | 6792 | #endif |
6463 | cpu_attach_domain(sd, i); | 6793 | cpu_attach_domain(sd, rd, i); |
6464 | } | 6794 | } |
6465 | 6795 | ||
6466 | return 0; | 6796 | return 0; |
@@ -6518,7 +6848,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6518 | unregister_sched_domain_sysctl(); | 6848 | unregister_sched_domain_sysctl(); |
6519 | 6849 | ||
6520 | for_each_cpu_mask(i, *cpu_map) | 6850 | for_each_cpu_mask(i, *cpu_map) |
6521 | cpu_attach_domain(NULL, i); | 6851 | cpu_attach_domain(NULL, &def_root_domain, i); |
6522 | synchronize_sched(); | 6852 | synchronize_sched(); |
6523 | arch_destroy_sched_domains(cpu_map); | 6853 | arch_destroy_sched_domains(cpu_map); |
6524 | } | 6854 | } |
@@ -6548,6 +6878,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
6548 | { | 6878 | { |
6549 | int i, j; | 6879 | int i, j; |
6550 | 6880 | ||
6881 | lock_doms_cur(); | ||
6882 | |||
6551 | /* always unregister in case we don't destroy any domains */ | 6883 | /* always unregister in case we don't destroy any domains */ |
6552 | unregister_sched_domain_sysctl(); | 6884 | unregister_sched_domain_sysctl(); |
6553 | 6885 | ||
@@ -6588,6 +6920,8 @@ match2: | |||
6588 | ndoms_cur = ndoms_new; | 6920 | ndoms_cur = ndoms_new; |
6589 | 6921 | ||
6590 | register_sched_domain_sysctl(); | 6922 | register_sched_domain_sysctl(); |
6923 | |||
6924 | unlock_doms_cur(); | ||
6591 | } | 6925 | } |
6592 | 6926 | ||
6593 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6927 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
@@ -6595,10 +6929,10 @@ static int arch_reinit_sched_domains(void) | |||
6595 | { | 6929 | { |
6596 | int err; | 6930 | int err; |
6597 | 6931 | ||
6598 | mutex_lock(&sched_hotcpu_mutex); | 6932 | get_online_cpus(); |
6599 | detach_destroy_domains(&cpu_online_map); | 6933 | detach_destroy_domains(&cpu_online_map); |
6600 | err = arch_init_sched_domains(&cpu_online_map); | 6934 | err = arch_init_sched_domains(&cpu_online_map); |
6601 | mutex_unlock(&sched_hotcpu_mutex); | 6935 | put_online_cpus(); |
6602 | 6936 | ||
6603 | return err; | 6937 | return err; |
6604 | } | 6938 | } |
@@ -6709,12 +7043,12 @@ void __init sched_init_smp(void) | |||
6709 | { | 7043 | { |
6710 | cpumask_t non_isolated_cpus; | 7044 | cpumask_t non_isolated_cpus; |
6711 | 7045 | ||
6712 | mutex_lock(&sched_hotcpu_mutex); | 7046 | get_online_cpus(); |
6713 | arch_init_sched_domains(&cpu_online_map); | 7047 | arch_init_sched_domains(&cpu_online_map); |
6714 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7048 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
6715 | if (cpus_empty(non_isolated_cpus)) | 7049 | if (cpus_empty(non_isolated_cpus)) |
6716 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7050 | cpu_set(smp_processor_id(), non_isolated_cpus); |
6717 | mutex_unlock(&sched_hotcpu_mutex); | 7051 | put_online_cpus(); |
6718 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7052 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6719 | hotcpu_notifier(update_sched_domains, 0); | 7053 | hotcpu_notifier(update_sched_domains, 0); |
6720 | 7054 | ||
@@ -6722,6 +7056,21 @@ void __init sched_init_smp(void) | |||
6722 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7056 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6723 | BUG(); | 7057 | BUG(); |
6724 | sched_init_granularity(); | 7058 | sched_init_granularity(); |
7059 | |||
7060 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7061 | if (nr_cpu_ids == 1) | ||
7062 | return; | ||
7063 | |||
7064 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | ||
7065 | "group_balance"); | ||
7066 | if (!IS_ERR(lb_monitor_task)) { | ||
7067 | lb_monitor_task->flags |= PF_NOFREEZE; | ||
7068 | wake_up_process(lb_monitor_task); | ||
7069 | } else { | ||
7070 | printk(KERN_ERR "Could not create load balance monitor thread" | ||
7071 | "(error = %ld) \n", PTR_ERR(lb_monitor_task)); | ||
7072 | } | ||
7073 | #endif | ||
6725 | } | 7074 | } |
6726 | #else | 7075 | #else |
6727 | void __init sched_init_smp(void) | 7076 | void __init sched_init_smp(void) |
@@ -6746,13 +7095,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
6746 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7095 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
6747 | } | 7096 | } |
6748 | 7097 | ||
7098 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
7099 | { | ||
7100 | struct rt_prio_array *array; | ||
7101 | int i; | ||
7102 | |||
7103 | array = &rt_rq->active; | ||
7104 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
7105 | INIT_LIST_HEAD(array->queue + i); | ||
7106 | __clear_bit(i, array->bitmap); | ||
7107 | } | ||
7108 | /* delimiter for bitsearch: */ | ||
7109 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
7110 | |||
7111 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
7112 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
7113 | #endif | ||
7114 | #ifdef CONFIG_SMP | ||
7115 | rt_rq->rt_nr_migratory = 0; | ||
7116 | rt_rq->overloaded = 0; | ||
7117 | #endif | ||
7118 | |||
7119 | rt_rq->rt_time = 0; | ||
7120 | rt_rq->rt_throttled = 0; | ||
7121 | |||
7122 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7123 | rt_rq->rq = rq; | ||
7124 | #endif | ||
7125 | } | ||
7126 | |||
7127 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7128 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | ||
7129 | struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
7130 | int cpu, int add) | ||
7131 | { | ||
7132 | tg->cfs_rq[cpu] = cfs_rq; | ||
7133 | init_cfs_rq(cfs_rq, rq); | ||
7134 | cfs_rq->tg = tg; | ||
7135 | if (add) | ||
7136 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7137 | |||
7138 | tg->se[cpu] = se; | ||
7139 | se->cfs_rq = &rq->cfs; | ||
7140 | se->my_q = cfs_rq; | ||
7141 | se->load.weight = tg->shares; | ||
7142 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | ||
7143 | se->parent = NULL; | ||
7144 | } | ||
7145 | |||
7146 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | ||
7147 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | ||
7148 | int cpu, int add) | ||
7149 | { | ||
7150 | tg->rt_rq[cpu] = rt_rq; | ||
7151 | init_rt_rq(rt_rq, rq); | ||
7152 | rt_rq->tg = tg; | ||
7153 | rt_rq->rt_se = rt_se; | ||
7154 | if (add) | ||
7155 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7156 | |||
7157 | tg->rt_se[cpu] = rt_se; | ||
7158 | rt_se->rt_rq = &rq->rt; | ||
7159 | rt_se->my_q = rt_rq; | ||
7160 | rt_se->parent = NULL; | ||
7161 | INIT_LIST_HEAD(&rt_se->run_list); | ||
7162 | } | ||
7163 | #endif | ||
7164 | |||
6749 | void __init sched_init(void) | 7165 | void __init sched_init(void) |
6750 | { | 7166 | { |
6751 | int highest_cpu = 0; | 7167 | int highest_cpu = 0; |
6752 | int i, j; | 7168 | int i, j; |
6753 | 7169 | ||
7170 | #ifdef CONFIG_SMP | ||
7171 | init_defrootdomain(); | ||
7172 | #endif | ||
7173 | |||
7174 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7175 | list_add(&init_task_group.list, &task_groups); | ||
7176 | #endif | ||
7177 | |||
6754 | for_each_possible_cpu(i) { | 7178 | for_each_possible_cpu(i) { |
6755 | struct rt_prio_array *array; | ||
6756 | struct rq *rq; | 7179 | struct rq *rq; |
6757 | 7180 | ||
6758 | rq = cpu_rq(i); | 7181 | rq = cpu_rq(i); |
@@ -6761,52 +7184,39 @@ void __init sched_init(void) | |||
6761 | rq->nr_running = 0; | 7184 | rq->nr_running = 0; |
6762 | rq->clock = 1; | 7185 | rq->clock = 1; |
6763 | init_cfs_rq(&rq->cfs, rq); | 7186 | init_cfs_rq(&rq->cfs, rq); |
7187 | init_rt_rq(&rq->rt, rq); | ||
6764 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7188 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6765 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
6766 | { | ||
6767 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
6768 | struct sched_entity *se = | ||
6769 | &per_cpu(init_sched_entity, i); | ||
6770 | |||
6771 | init_cfs_rq_p[i] = cfs_rq; | ||
6772 | init_cfs_rq(cfs_rq, rq); | ||
6773 | cfs_rq->tg = &init_task_group; | ||
6774 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
6775 | &rq->leaf_cfs_rq_list); | ||
6776 | |||
6777 | init_sched_entity_p[i] = se; | ||
6778 | se->cfs_rq = &rq->cfs; | ||
6779 | se->my_q = cfs_rq; | ||
6780 | se->load.weight = init_task_group_load; | ||
6781 | se->load.inv_weight = | ||
6782 | div64_64(1ULL<<32, init_task_group_load); | ||
6783 | se->parent = NULL; | ||
6784 | } | ||
6785 | init_task_group.shares = init_task_group_load; | 7189 | init_task_group.shares = init_task_group_load; |
6786 | spin_lock_init(&init_task_group.lock); | 7190 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7191 | init_tg_cfs_entry(rq, &init_task_group, | ||
7192 | &per_cpu(init_cfs_rq, i), | ||
7193 | &per_cpu(init_sched_entity, i), i, 1); | ||
7194 | |||
7195 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ | ||
7196 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | ||
7197 | init_tg_rt_entry(rq, &init_task_group, | ||
7198 | &per_cpu(init_rt_rq, i), | ||
7199 | &per_cpu(init_sched_rt_entity, i), i, 1); | ||
6787 | #endif | 7200 | #endif |
7201 | rq->rt_period_expire = 0; | ||
7202 | rq->rt_throttled = 0; | ||
6788 | 7203 | ||
6789 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7204 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
6790 | rq->cpu_load[j] = 0; | 7205 | rq->cpu_load[j] = 0; |
6791 | #ifdef CONFIG_SMP | 7206 | #ifdef CONFIG_SMP |
6792 | rq->sd = NULL; | 7207 | rq->sd = NULL; |
7208 | rq->rd = NULL; | ||
6793 | rq->active_balance = 0; | 7209 | rq->active_balance = 0; |
6794 | rq->next_balance = jiffies; | 7210 | rq->next_balance = jiffies; |
6795 | rq->push_cpu = 0; | 7211 | rq->push_cpu = 0; |
6796 | rq->cpu = i; | 7212 | rq->cpu = i; |
6797 | rq->migration_thread = NULL; | 7213 | rq->migration_thread = NULL; |
6798 | INIT_LIST_HEAD(&rq->migration_queue); | 7214 | INIT_LIST_HEAD(&rq->migration_queue); |
7215 | rq_attach_root(rq, &def_root_domain); | ||
6799 | #endif | 7216 | #endif |
7217 | init_rq_hrtick(rq); | ||
6800 | atomic_set(&rq->nr_iowait, 0); | 7218 | atomic_set(&rq->nr_iowait, 0); |
6801 | |||
6802 | array = &rq->rt.active; | ||
6803 | for (j = 0; j < MAX_RT_PRIO; j++) { | ||
6804 | INIT_LIST_HEAD(array->queue + j); | ||
6805 | __clear_bit(j, array->bitmap); | ||
6806 | } | ||
6807 | highest_cpu = i; | 7219 | highest_cpu = i; |
6808 | /* delimiter for bitsearch: */ | ||
6809 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
6810 | } | 7220 | } |
6811 | 7221 | ||
6812 | set_load_weight(&init_task); | 7222 | set_load_weight(&init_task); |
@@ -6975,12 +7385,187 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
6975 | 7385 | ||
6976 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7386 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6977 | 7387 | ||
7388 | #ifdef CONFIG_SMP | ||
7389 | /* | ||
7390 | * distribute shares of all task groups among their schedulable entities, | ||
7391 | * to reflect load distribution across cpus. | ||
7392 | */ | ||
7393 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | ||
7394 | { | ||
7395 | struct cfs_rq *cfs_rq; | ||
7396 | struct rq *rq = cpu_rq(this_cpu); | ||
7397 | cpumask_t sdspan = sd->span; | ||
7398 | int balanced = 1; | ||
7399 | |||
7400 | /* Walk thr' all the task groups that we have */ | ||
7401 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
7402 | int i; | ||
7403 | unsigned long total_load = 0, total_shares; | ||
7404 | struct task_group *tg = cfs_rq->tg; | ||
7405 | |||
7406 | /* Gather total task load of this group across cpus */ | ||
7407 | for_each_cpu_mask(i, sdspan) | ||
7408 | total_load += tg->cfs_rq[i]->load.weight; | ||
7409 | |||
7410 | /* Nothing to do if this group has no load */ | ||
7411 | if (!total_load) | ||
7412 | continue; | ||
7413 | |||
7414 | /* | ||
7415 | * tg->shares represents the number of cpu shares the task group | ||
7416 | * is eligible to hold on a single cpu. On N cpus, it is | ||
7417 | * eligible to hold (N * tg->shares) number of cpu shares. | ||
7418 | */ | ||
7419 | total_shares = tg->shares * cpus_weight(sdspan); | ||
7420 | |||
7421 | /* | ||
7422 | * redistribute total_shares across cpus as per the task load | ||
7423 | * distribution. | ||
7424 | */ | ||
7425 | for_each_cpu_mask(i, sdspan) { | ||
7426 | unsigned long local_load, local_shares; | ||
7427 | |||
7428 | local_load = tg->cfs_rq[i]->load.weight; | ||
7429 | local_shares = (local_load * total_shares) / total_load; | ||
7430 | if (!local_shares) | ||
7431 | local_shares = MIN_GROUP_SHARES; | ||
7432 | if (local_shares == tg->se[i]->load.weight) | ||
7433 | continue; | ||
7434 | |||
7435 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7436 | set_se_shares(tg->se[i], local_shares); | ||
7437 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7438 | balanced = 0; | ||
7439 | } | ||
7440 | } | ||
7441 | |||
7442 | return balanced; | ||
7443 | } | ||
7444 | |||
7445 | /* | ||
7446 | * How frequently should we rebalance_shares() across cpus? | ||
7447 | * | ||
7448 | * The more frequently we rebalance shares, the more accurate is the fairness | ||
7449 | * of cpu bandwidth distribution between task groups. However higher frequency | ||
7450 | * also implies increased scheduling overhead. | ||
7451 | * | ||
7452 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | ||
7453 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7454 | * | ||
7455 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | ||
7456 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7457 | * | ||
7458 | * These settings allows for the appropriate trade-off between accuracy of | ||
7459 | * fairness and the associated overhead. | ||
7460 | * | ||
7461 | */ | ||
7462 | |||
7463 | /* default: 8ms, units: milliseconds */ | ||
7464 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | ||
7465 | |||
7466 | /* default: 128ms, units: milliseconds */ | ||
7467 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | ||
7468 | |||
7469 | /* kernel thread that runs rebalance_shares() periodically */ | ||
7470 | static int load_balance_monitor(void *unused) | ||
7471 | { | ||
7472 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | ||
7473 | struct sched_param schedparm; | ||
7474 | int ret; | ||
7475 | |||
7476 | /* | ||
7477 | * We don't want this thread's execution to be limited by the shares | ||
7478 | * assigned to default group (init_task_group). Hence make it run | ||
7479 | * as a SCHED_RR RT task at the lowest priority. | ||
7480 | */ | ||
7481 | schedparm.sched_priority = 1; | ||
7482 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | ||
7483 | if (ret) | ||
7484 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | ||
7485 | " monitor thread (error = %d) \n", ret); | ||
7486 | |||
7487 | while (!kthread_should_stop()) { | ||
7488 | int i, cpu, balanced = 1; | ||
7489 | |||
7490 | /* Prevent cpus going down or coming up */ | ||
7491 | get_online_cpus(); | ||
7492 | /* lockout changes to doms_cur[] array */ | ||
7493 | lock_doms_cur(); | ||
7494 | /* | ||
7495 | * Enter a rcu read-side critical section to safely walk rq->sd | ||
7496 | * chain on various cpus and to walk task group list | ||
7497 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | ||
7498 | */ | ||
7499 | rcu_read_lock(); | ||
7500 | |||
7501 | for (i = 0; i < ndoms_cur; i++) { | ||
7502 | cpumask_t cpumap = doms_cur[i]; | ||
7503 | struct sched_domain *sd = NULL, *sd_prev = NULL; | ||
7504 | |||
7505 | cpu = first_cpu(cpumap); | ||
7506 | |||
7507 | /* Find the highest domain at which to balance shares */ | ||
7508 | for_each_domain(cpu, sd) { | ||
7509 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
7510 | continue; | ||
7511 | sd_prev = sd; | ||
7512 | } | ||
7513 | |||
7514 | sd = sd_prev; | ||
7515 | /* sd == NULL? No load balance reqd in this domain */ | ||
7516 | if (!sd) | ||
7517 | continue; | ||
7518 | |||
7519 | balanced &= rebalance_shares(sd, cpu); | ||
7520 | } | ||
7521 | |||
7522 | rcu_read_unlock(); | ||
7523 | |||
7524 | unlock_doms_cur(); | ||
7525 | put_online_cpus(); | ||
7526 | |||
7527 | if (!balanced) | ||
7528 | timeout = sysctl_sched_min_bal_int_shares; | ||
7529 | else if (timeout < sysctl_sched_max_bal_int_shares) | ||
7530 | timeout *= 2; | ||
7531 | |||
7532 | msleep_interruptible(timeout); | ||
7533 | } | ||
7534 | |||
7535 | return 0; | ||
7536 | } | ||
7537 | #endif /* CONFIG_SMP */ | ||
7538 | |||
7539 | static void free_sched_group(struct task_group *tg) | ||
7540 | { | ||
7541 | int i; | ||
7542 | |||
7543 | for_each_possible_cpu(i) { | ||
7544 | if (tg->cfs_rq) | ||
7545 | kfree(tg->cfs_rq[i]); | ||
7546 | if (tg->se) | ||
7547 | kfree(tg->se[i]); | ||
7548 | if (tg->rt_rq) | ||
7549 | kfree(tg->rt_rq[i]); | ||
7550 | if (tg->rt_se) | ||
7551 | kfree(tg->rt_se[i]); | ||
7552 | } | ||
7553 | |||
7554 | kfree(tg->cfs_rq); | ||
7555 | kfree(tg->se); | ||
7556 | kfree(tg->rt_rq); | ||
7557 | kfree(tg->rt_se); | ||
7558 | kfree(tg); | ||
7559 | } | ||
7560 | |||
6978 | /* allocate runqueue etc for a new task group */ | 7561 | /* allocate runqueue etc for a new task group */ |
6979 | struct task_group *sched_create_group(void) | 7562 | struct task_group *sched_create_group(void) |
6980 | { | 7563 | { |
6981 | struct task_group *tg; | 7564 | struct task_group *tg; |
6982 | struct cfs_rq *cfs_rq; | 7565 | struct cfs_rq *cfs_rq; |
6983 | struct sched_entity *se; | 7566 | struct sched_entity *se; |
7567 | struct rt_rq *rt_rq; | ||
7568 | struct sched_rt_entity *rt_se; | ||
6984 | struct rq *rq; | 7569 | struct rq *rq; |
6985 | int i; | 7570 | int i; |
6986 | 7571 | ||
@@ -6994,97 +7579,89 @@ struct task_group *sched_create_group(void) | |||
6994 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 7579 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); |
6995 | if (!tg->se) | 7580 | if (!tg->se) |
6996 | goto err; | 7581 | goto err; |
7582 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | ||
7583 | if (!tg->rt_rq) | ||
7584 | goto err; | ||
7585 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | ||
7586 | if (!tg->rt_se) | ||
7587 | goto err; | ||
7588 | |||
7589 | tg->shares = NICE_0_LOAD; | ||
7590 | tg->rt_ratio = 0; /* XXX */ | ||
6997 | 7591 | ||
6998 | for_each_possible_cpu(i) { | 7592 | for_each_possible_cpu(i) { |
6999 | rq = cpu_rq(i); | 7593 | rq = cpu_rq(i); |
7000 | 7594 | ||
7001 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | 7595 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), |
7002 | cpu_to_node(i)); | 7596 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7003 | if (!cfs_rq) | 7597 | if (!cfs_rq) |
7004 | goto err; | 7598 | goto err; |
7005 | 7599 | ||
7006 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | 7600 | se = kmalloc_node(sizeof(struct sched_entity), |
7007 | cpu_to_node(i)); | 7601 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7008 | if (!se) | 7602 | if (!se) |
7009 | goto err; | 7603 | goto err; |
7010 | 7604 | ||
7011 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | 7605 | rt_rq = kmalloc_node(sizeof(struct rt_rq), |
7012 | memset(se, 0, sizeof(struct sched_entity)); | 7606 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7607 | if (!rt_rq) | ||
7608 | goto err; | ||
7013 | 7609 | ||
7014 | tg->cfs_rq[i] = cfs_rq; | 7610 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), |
7015 | init_cfs_rq(cfs_rq, rq); | 7611 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7016 | cfs_rq->tg = tg; | 7612 | if (!rt_se) |
7613 | goto err; | ||
7017 | 7614 | ||
7018 | tg->se[i] = se; | 7615 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); |
7019 | se->cfs_rq = &rq->cfs; | 7616 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); |
7020 | se->my_q = cfs_rq; | ||
7021 | se->load.weight = NICE_0_LOAD; | ||
7022 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
7023 | se->parent = NULL; | ||
7024 | } | 7617 | } |
7025 | 7618 | ||
7619 | lock_task_group_list(); | ||
7026 | for_each_possible_cpu(i) { | 7620 | for_each_possible_cpu(i) { |
7027 | rq = cpu_rq(i); | 7621 | rq = cpu_rq(i); |
7028 | cfs_rq = tg->cfs_rq[i]; | 7622 | cfs_rq = tg->cfs_rq[i]; |
7029 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7623 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7624 | rt_rq = tg->rt_rq[i]; | ||
7625 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7030 | } | 7626 | } |
7031 | 7627 | list_add_rcu(&tg->list, &task_groups); | |
7032 | tg->shares = NICE_0_LOAD; | 7628 | unlock_task_group_list(); |
7033 | spin_lock_init(&tg->lock); | ||
7034 | 7629 | ||
7035 | return tg; | 7630 | return tg; |
7036 | 7631 | ||
7037 | err: | 7632 | err: |
7038 | for_each_possible_cpu(i) { | 7633 | free_sched_group(tg); |
7039 | if (tg->cfs_rq) | ||
7040 | kfree(tg->cfs_rq[i]); | ||
7041 | if (tg->se) | ||
7042 | kfree(tg->se[i]); | ||
7043 | } | ||
7044 | kfree(tg->cfs_rq); | ||
7045 | kfree(tg->se); | ||
7046 | kfree(tg); | ||
7047 | |||
7048 | return ERR_PTR(-ENOMEM); | 7634 | return ERR_PTR(-ENOMEM); |
7049 | } | 7635 | } |
7050 | 7636 | ||
7051 | /* rcu callback to free various structures associated with a task group */ | 7637 | /* rcu callback to free various structures associated with a task group */ |
7052 | static void free_sched_group(struct rcu_head *rhp) | 7638 | static void free_sched_group_rcu(struct rcu_head *rhp) |
7053 | { | 7639 | { |
7054 | struct task_group *tg = container_of(rhp, struct task_group, rcu); | ||
7055 | struct cfs_rq *cfs_rq; | ||
7056 | struct sched_entity *se; | ||
7057 | int i; | ||
7058 | |||
7059 | /* now it should be safe to free those cfs_rqs */ | 7640 | /* now it should be safe to free those cfs_rqs */ |
7060 | for_each_possible_cpu(i) { | 7641 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
7061 | cfs_rq = tg->cfs_rq[i]; | ||
7062 | kfree(cfs_rq); | ||
7063 | |||
7064 | se = tg->se[i]; | ||
7065 | kfree(se); | ||
7066 | } | ||
7067 | |||
7068 | kfree(tg->cfs_rq); | ||
7069 | kfree(tg->se); | ||
7070 | kfree(tg); | ||
7071 | } | 7642 | } |
7072 | 7643 | ||
7073 | /* Destroy runqueue etc associated with a task group */ | 7644 | /* Destroy runqueue etc associated with a task group */ |
7074 | void sched_destroy_group(struct task_group *tg) | 7645 | void sched_destroy_group(struct task_group *tg) |
7075 | { | 7646 | { |
7076 | struct cfs_rq *cfs_rq = NULL; | 7647 | struct cfs_rq *cfs_rq = NULL; |
7648 | struct rt_rq *rt_rq = NULL; | ||
7077 | int i; | 7649 | int i; |
7078 | 7650 | ||
7651 | lock_task_group_list(); | ||
7079 | for_each_possible_cpu(i) { | 7652 | for_each_possible_cpu(i) { |
7080 | cfs_rq = tg->cfs_rq[i]; | 7653 | cfs_rq = tg->cfs_rq[i]; |
7081 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7654 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
7655 | rt_rq = tg->rt_rq[i]; | ||
7656 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
7082 | } | 7657 | } |
7658 | list_del_rcu(&tg->list); | ||
7659 | unlock_task_group_list(); | ||
7083 | 7660 | ||
7084 | BUG_ON(!cfs_rq); | 7661 | BUG_ON(!cfs_rq); |
7085 | 7662 | ||
7086 | /* wait for possible concurrent references to cfs_rqs complete */ | 7663 | /* wait for possible concurrent references to cfs_rqs complete */ |
7087 | call_rcu(&tg->rcu, free_sched_group); | 7664 | call_rcu(&tg->rcu, free_sched_group_rcu); |
7088 | } | 7665 | } |
7089 | 7666 | ||
7090 | /* change task's runqueue when it moves between groups. | 7667 | /* change task's runqueue when it moves between groups. |
@@ -7100,11 +7677,6 @@ void sched_move_task(struct task_struct *tsk) | |||
7100 | 7677 | ||
7101 | rq = task_rq_lock(tsk, &flags); | 7678 | rq = task_rq_lock(tsk, &flags); |
7102 | 7679 | ||
7103 | if (tsk->sched_class != &fair_sched_class) { | ||
7104 | set_task_cfs_rq(tsk, task_cpu(tsk)); | ||
7105 | goto done; | ||
7106 | } | ||
7107 | |||
7108 | update_rq_clock(rq); | 7680 | update_rq_clock(rq); |
7109 | 7681 | ||
7110 | running = task_current(rq, tsk); | 7682 | running = task_current(rq, tsk); |
@@ -7116,7 +7688,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7116 | tsk->sched_class->put_prev_task(rq, tsk); | 7688 | tsk->sched_class->put_prev_task(rq, tsk); |
7117 | } | 7689 | } |
7118 | 7690 | ||
7119 | set_task_cfs_rq(tsk, task_cpu(tsk)); | 7691 | set_task_rq(tsk, task_cpu(tsk)); |
7120 | 7692 | ||
7121 | if (on_rq) { | 7693 | if (on_rq) { |
7122 | if (unlikely(running)) | 7694 | if (unlikely(running)) |
@@ -7124,53 +7696,82 @@ void sched_move_task(struct task_struct *tsk) | |||
7124 | enqueue_task(rq, tsk, 0); | 7696 | enqueue_task(rq, tsk, 0); |
7125 | } | 7697 | } |
7126 | 7698 | ||
7127 | done: | ||
7128 | task_rq_unlock(rq, &flags); | 7699 | task_rq_unlock(rq, &flags); |
7129 | } | 7700 | } |
7130 | 7701 | ||
7702 | /* rq->lock to be locked by caller */ | ||
7131 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7703 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
7132 | { | 7704 | { |
7133 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7705 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7134 | struct rq *rq = cfs_rq->rq; | 7706 | struct rq *rq = cfs_rq->rq; |
7135 | int on_rq; | 7707 | int on_rq; |
7136 | 7708 | ||
7137 | spin_lock_irq(&rq->lock); | 7709 | if (!shares) |
7710 | shares = MIN_GROUP_SHARES; | ||
7138 | 7711 | ||
7139 | on_rq = se->on_rq; | 7712 | on_rq = se->on_rq; |
7140 | if (on_rq) | 7713 | if (on_rq) { |
7141 | dequeue_entity(cfs_rq, se, 0); | 7714 | dequeue_entity(cfs_rq, se, 0); |
7715 | dec_cpu_load(rq, se->load.weight); | ||
7716 | } | ||
7142 | 7717 | ||
7143 | se->load.weight = shares; | 7718 | se->load.weight = shares; |
7144 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7719 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
7145 | 7720 | ||
7146 | if (on_rq) | 7721 | if (on_rq) { |
7147 | enqueue_entity(cfs_rq, se, 0); | 7722 | enqueue_entity(cfs_rq, se, 0); |
7148 | 7723 | inc_cpu_load(rq, se->load.weight); | |
7149 | spin_unlock_irq(&rq->lock); | 7724 | } |
7150 | } | 7725 | } |
7151 | 7726 | ||
7152 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 7727 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
7153 | { | 7728 | { |
7154 | int i; | 7729 | int i; |
7730 | struct cfs_rq *cfs_rq; | ||
7731 | struct rq *rq; | ||
7732 | |||
7733 | lock_task_group_list(); | ||
7734 | if (tg->shares == shares) | ||
7735 | goto done; | ||
7736 | |||
7737 | if (shares < MIN_GROUP_SHARES) | ||
7738 | shares = MIN_GROUP_SHARES; | ||
7155 | 7739 | ||
7156 | /* | 7740 | /* |
7157 | * A weight of 0 or 1 can cause arithmetics problems. | 7741 | * Prevent any load balance activity (rebalance_shares, |
7158 | * (The default weight is 1024 - so there's no practical | 7742 | * load_balance_fair) from referring to this group first, |
7159 | * limitation from this.) | 7743 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. |
7160 | */ | 7744 | */ |
7161 | if (shares < 2) | 7745 | for_each_possible_cpu(i) { |
7162 | shares = 2; | 7746 | cfs_rq = tg->cfs_rq[i]; |
7747 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
7748 | } | ||
7163 | 7749 | ||
7164 | spin_lock(&tg->lock); | 7750 | /* wait for any ongoing reference to this group to finish */ |
7165 | if (tg->shares == shares) | 7751 | synchronize_sched(); |
7166 | goto done; | ||
7167 | 7752 | ||
7753 | /* | ||
7754 | * Now we are free to modify the group's share on each cpu | ||
7755 | * w/o tripping rebalance_share or load_balance_fair. | ||
7756 | */ | ||
7168 | tg->shares = shares; | 7757 | tg->shares = shares; |
7169 | for_each_possible_cpu(i) | 7758 | for_each_possible_cpu(i) { |
7759 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7170 | set_se_shares(tg->se[i], shares); | 7760 | set_se_shares(tg->se[i], shares); |
7761 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7762 | } | ||
7171 | 7763 | ||
7764 | /* | ||
7765 | * Enable load balance activity on this group, by inserting it back on | ||
7766 | * each cpu's rq->leaf_cfs_rq_list. | ||
7767 | */ | ||
7768 | for_each_possible_cpu(i) { | ||
7769 | rq = cpu_rq(i); | ||
7770 | cfs_rq = tg->cfs_rq[i]; | ||
7771 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7772 | } | ||
7172 | done: | 7773 | done: |
7173 | spin_unlock(&tg->lock); | 7774 | unlock_task_group_list(); |
7174 | return 0; | 7775 | return 0; |
7175 | } | 7776 | } |
7176 | 7777 | ||
@@ -7179,6 +7780,31 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
7179 | return tg->shares; | 7780 | return tg->shares; |
7180 | } | 7781 | } |
7181 | 7782 | ||
7783 | /* | ||
7784 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio | ||
7785 | */ | ||
7786 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) | ||
7787 | { | ||
7788 | struct task_group *tgi; | ||
7789 | unsigned long total = 0; | ||
7790 | |||
7791 | rcu_read_lock(); | ||
7792 | list_for_each_entry_rcu(tgi, &task_groups, list) | ||
7793 | total += tgi->rt_ratio; | ||
7794 | rcu_read_unlock(); | ||
7795 | |||
7796 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) | ||
7797 | return -EINVAL; | ||
7798 | |||
7799 | tg->rt_ratio = rt_ratio; | ||
7800 | return 0; | ||
7801 | } | ||
7802 | |||
7803 | unsigned long sched_group_rt_ratio(struct task_group *tg) | ||
7804 | { | ||
7805 | return tg->rt_ratio; | ||
7806 | } | ||
7807 | |||
7182 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7808 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7183 | 7809 | ||
7184 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 7810 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -7254,12 +7880,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
7254 | return (u64) tg->shares; | 7880 | return (u64) tg->shares; |
7255 | } | 7881 | } |
7256 | 7882 | ||
7883 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
7884 | u64 rt_ratio_val) | ||
7885 | { | ||
7886 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); | ||
7887 | } | ||
7888 | |||
7889 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
7890 | { | ||
7891 | struct task_group *tg = cgroup_tg(cgrp); | ||
7892 | |||
7893 | return (u64) tg->rt_ratio; | ||
7894 | } | ||
7895 | |||
7257 | static struct cftype cpu_files[] = { | 7896 | static struct cftype cpu_files[] = { |
7258 | { | 7897 | { |
7259 | .name = "shares", | 7898 | .name = "shares", |
7260 | .read_uint = cpu_shares_read_uint, | 7899 | .read_uint = cpu_shares_read_uint, |
7261 | .write_uint = cpu_shares_write_uint, | 7900 | .write_uint = cpu_shares_write_uint, |
7262 | }, | 7901 | }, |
7902 | { | ||
7903 | .name = "rt_ratio", | ||
7904 | .read_uint = cpu_rt_ratio_read_uint, | ||
7905 | .write_uint = cpu_rt_ratio_write_uint, | ||
7906 | }, | ||
7263 | }; | 7907 | }; |
7264 | 7908 | ||
7265 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 7909 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 80fbbfc04290..4b5e24cf2f4a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
179 | PN(prev_clock_raw); | 179 | PN(prev_clock_raw); |
180 | P(clock_warps); | 180 | P(clock_warps); |
181 | P(clock_overflows); | 181 | P(clock_overflows); |
182 | P(clock_underflows); | ||
182 | P(clock_deep_idle_events); | 183 | P(clock_deep_idle_events); |
183 | PN(clock_max_delta); | 184 | PN(clock_max_delta); |
184 | P(cpu_load[0]); | 185 | P(cpu_load[0]); |
@@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
299 | PN(se.exec_max); | 300 | PN(se.exec_max); |
300 | PN(se.slice_max); | 301 | PN(se.slice_max); |
301 | PN(se.wait_max); | 302 | PN(se.wait_max); |
303 | PN(se.wait_sum); | ||
304 | P(se.wait_count); | ||
302 | P(sched_info.bkl_count); | 305 | P(sched_info.bkl_count); |
303 | P(se.nr_migrations); | 306 | P(se.nr_migrations); |
304 | P(se.nr_migrations_cold); | 307 | P(se.nr_migrations_cold); |
@@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p) | |||
366 | { | 369 | { |
367 | #ifdef CONFIG_SCHEDSTATS | 370 | #ifdef CONFIG_SCHEDSTATS |
368 | p->se.wait_max = 0; | 371 | p->se.wait_max = 0; |
372 | p->se.wait_sum = 0; | ||
373 | p->se.wait_count = 0; | ||
369 | p->se.sleep_max = 0; | 374 | p->se.sleep_max = 0; |
370 | p->se.sum_sleep_runtime = 0; | 375 | p->se.sum_sleep_runtime = 0; |
371 | p->se.block_max = 0; | 376 | p->se.block_max = 0; |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index da7c061e7206..72e25c7a3a18 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -20,6 +20,8 @@ | |||
20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/latencytop.h> | ||
24 | |||
23 | /* | 25 | /* |
24 | * Targeted preemption latency for CPU-bound tasks: | 26 | * Targeted preemption latency for CPU-bound tasks: |
25 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) | 27 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) |
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running) | |||
248 | unsigned long nr_latency = sched_nr_latency; | 250 | unsigned long nr_latency = sched_nr_latency; |
249 | 251 | ||
250 | if (unlikely(nr_running > nr_latency)) { | 252 | if (unlikely(nr_running > nr_latency)) { |
253 | period = sysctl_sched_min_granularity; | ||
251 | period *= nr_running; | 254 | period *= nr_running; |
252 | do_div(period, nr_latency); | ||
253 | } | 255 | } |
254 | 256 | ||
255 | return period; | 257 | return period; |
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
383 | { | 385 | { |
384 | schedstat_set(se->wait_max, max(se->wait_max, | 386 | schedstat_set(se->wait_max, max(se->wait_max, |
385 | rq_of(cfs_rq)->clock - se->wait_start)); | 387 | rq_of(cfs_rq)->clock - se->wait_start)); |
388 | schedstat_set(se->wait_count, se->wait_count + 1); | ||
389 | schedstat_set(se->wait_sum, se->wait_sum + | ||
390 | rq_of(cfs_rq)->clock - se->wait_start); | ||
386 | schedstat_set(se->wait_start, 0); | 391 | schedstat_set(se->wait_start, 0); |
387 | } | 392 | } |
388 | 393 | ||
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
434 | #ifdef CONFIG_SCHEDSTATS | 439 | #ifdef CONFIG_SCHEDSTATS |
435 | if (se->sleep_start) { | 440 | if (se->sleep_start) { |
436 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 441 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
442 | struct task_struct *tsk = task_of(se); | ||
437 | 443 | ||
438 | if ((s64)delta < 0) | 444 | if ((s64)delta < 0) |
439 | delta = 0; | 445 | delta = 0; |
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
443 | 449 | ||
444 | se->sleep_start = 0; | 450 | se->sleep_start = 0; |
445 | se->sum_sleep_runtime += delta; | 451 | se->sum_sleep_runtime += delta; |
452 | |||
453 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
446 | } | 454 | } |
447 | if (se->block_start) { | 455 | if (se->block_start) { |
448 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; | 456 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; |
457 | struct task_struct *tsk = task_of(se); | ||
449 | 458 | ||
450 | if ((s64)delta < 0) | 459 | if ((s64)delta < 0) |
451 | delta = 0; | 460 | delta = 0; |
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
462 | * time that the task spent sleeping: | 471 | * time that the task spent sleeping: |
463 | */ | 472 | */ |
464 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 473 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
465 | struct task_struct *tsk = task_of(se); | ||
466 | 474 | ||
467 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 475 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
468 | delta >> 20); | 476 | delta >> 20); |
469 | } | 477 | } |
478 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
470 | } | 479 | } |
471 | #endif | 480 | #endif |
472 | } | 481 | } |
@@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
642 | cfs_rq->curr = NULL; | 651 | cfs_rq->curr = NULL; |
643 | } | 652 | } |
644 | 653 | ||
645 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 654 | static void |
655 | entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | ||
646 | { | 656 | { |
647 | /* | 657 | /* |
648 | * Update run-time statistics of the 'current'. | 658 | * Update run-time statistics of the 'current'. |
649 | */ | 659 | */ |
650 | update_curr(cfs_rq); | 660 | update_curr(cfs_rq); |
651 | 661 | ||
662 | #ifdef CONFIG_SCHED_HRTICK | ||
663 | /* | ||
664 | * queued ticks are scheduled to match the slice, so don't bother | ||
665 | * validating it and just reschedule. | ||
666 | */ | ||
667 | if (queued) | ||
668 | return resched_task(rq_of(cfs_rq)->curr); | ||
669 | /* | ||
670 | * don't let the period tick interfere with the hrtick preemption | ||
671 | */ | ||
672 | if (!sched_feat(DOUBLE_TICK) && | ||
673 | hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) | ||
674 | return; | ||
675 | #endif | ||
676 | |||
652 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 677 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
653 | check_preempt_tick(cfs_rq, curr); | 678 | check_preempt_tick(cfs_rq, curr); |
654 | } | 679 | } |
@@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
690 | 715 | ||
691 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 716 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
692 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 717 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
693 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 718 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
694 | 719 | ||
695 | /* Do the two (enqueued) entities belong to the same group ? */ | 720 | /* Do the two (enqueued) entities belong to the same group ? */ |
696 | static inline int | 721 | static inline int |
@@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
707 | return se->parent; | 732 | return se->parent; |
708 | } | 733 | } |
709 | 734 | ||
735 | #define GROUP_IMBALANCE_PCT 20 | ||
736 | |||
710 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 737 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
711 | 738 | ||
712 | #define for_each_sched_entity(se) \ | 739 | #define for_each_sched_entity(se) \ |
@@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
752 | 779 | ||
753 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 780 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
754 | 781 | ||
782 | #ifdef CONFIG_SCHED_HRTICK | ||
783 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
784 | { | ||
785 | int requeue = rq->curr == p; | ||
786 | struct sched_entity *se = &p->se; | ||
787 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
788 | |||
789 | WARN_ON(task_rq(p) != rq); | ||
790 | |||
791 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | ||
792 | u64 slice = sched_slice(cfs_rq, se); | ||
793 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | ||
794 | s64 delta = slice - ran; | ||
795 | |||
796 | if (delta < 0) { | ||
797 | if (rq->curr == p) | ||
798 | resched_task(p); | ||
799 | return; | ||
800 | } | ||
801 | |||
802 | /* | ||
803 | * Don't schedule slices shorter than 10000ns, that just | ||
804 | * doesn't make sense. Rely on vruntime for fairness. | ||
805 | */ | ||
806 | if (!requeue) | ||
807 | delta = max(10000LL, delta); | ||
808 | |||
809 | hrtick_start(rq, delta, requeue); | ||
810 | } | ||
811 | } | ||
812 | #else | ||
813 | static inline void | ||
814 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
815 | { | ||
816 | } | ||
817 | #endif | ||
818 | |||
755 | /* | 819 | /* |
756 | * The enqueue_task method is called before nr_running is | 820 | * The enqueue_task method is called before nr_running is |
757 | * increased. Here we update the fair scheduling stats and | 821 | * increased. Here we update the fair scheduling stats and |
@@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
760 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 824 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) |
761 | { | 825 | { |
762 | struct cfs_rq *cfs_rq; | 826 | struct cfs_rq *cfs_rq; |
763 | struct sched_entity *se = &p->se; | 827 | struct sched_entity *se = &p->se, |
828 | *topse = NULL; /* Highest schedulable entity */ | ||
829 | int incload = 1; | ||
764 | 830 | ||
765 | for_each_sched_entity(se) { | 831 | for_each_sched_entity(se) { |
766 | if (se->on_rq) | 832 | topse = se; |
833 | if (se->on_rq) { | ||
834 | incload = 0; | ||
767 | break; | 835 | break; |
836 | } | ||
768 | cfs_rq = cfs_rq_of(se); | 837 | cfs_rq = cfs_rq_of(se); |
769 | enqueue_entity(cfs_rq, se, wakeup); | 838 | enqueue_entity(cfs_rq, se, wakeup); |
770 | wakeup = 1; | 839 | wakeup = 1; |
771 | } | 840 | } |
841 | /* Increment cpu load if we just enqueued the first task of a group on | ||
842 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
843 | * at the highest grouping level. | ||
844 | */ | ||
845 | if (incload) | ||
846 | inc_cpu_load(rq, topse->load.weight); | ||
847 | |||
848 | hrtick_start_fair(rq, rq->curr); | ||
772 | } | 849 | } |
773 | 850 | ||
774 | /* | 851 | /* |
@@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
779 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | 856 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) |
780 | { | 857 | { |
781 | struct cfs_rq *cfs_rq; | 858 | struct cfs_rq *cfs_rq; |
782 | struct sched_entity *se = &p->se; | 859 | struct sched_entity *se = &p->se, |
860 | *topse = NULL; /* Highest schedulable entity */ | ||
861 | int decload = 1; | ||
783 | 862 | ||
784 | for_each_sched_entity(se) { | 863 | for_each_sched_entity(se) { |
864 | topse = se; | ||
785 | cfs_rq = cfs_rq_of(se); | 865 | cfs_rq = cfs_rq_of(se); |
786 | dequeue_entity(cfs_rq, se, sleep); | 866 | dequeue_entity(cfs_rq, se, sleep); |
787 | /* Don't dequeue parent if it has other entities besides us */ | 867 | /* Don't dequeue parent if it has other entities besides us */ |
788 | if (cfs_rq->load.weight) | 868 | if (cfs_rq->load.weight) { |
869 | if (parent_entity(se)) | ||
870 | decload = 0; | ||
789 | break; | 871 | break; |
872 | } | ||
790 | sleep = 1; | 873 | sleep = 1; |
791 | } | 874 | } |
875 | /* Decrement cpu load if we just dequeued the last task of a group on | ||
876 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
877 | * at the highest grouping level. | ||
878 | */ | ||
879 | if (decload) | ||
880 | dec_cpu_load(rq, topse->load.weight); | ||
881 | |||
882 | hrtick_start_fair(rq, rq->curr); | ||
792 | } | 883 | } |
793 | 884 | ||
794 | /* | 885 | /* |
@@ -836,6 +927,154 @@ static void yield_task_fair(struct rq *rq) | |||
836 | } | 927 | } |
837 | 928 | ||
838 | /* | 929 | /* |
930 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
931 | * not idle and an idle cpu is available. The span of cpus to | ||
932 | * search starts with cpus closest then further out as needed, | ||
933 | * so we always favor a closer, idle cpu. | ||
934 | * | ||
935 | * Returns the CPU we should wake onto. | ||
936 | */ | ||
937 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
938 | static int wake_idle(int cpu, struct task_struct *p) | ||
939 | { | ||
940 | cpumask_t tmp; | ||
941 | struct sched_domain *sd; | ||
942 | int i; | ||
943 | |||
944 | /* | ||
945 | * If it is idle, then it is the best cpu to run this task. | ||
946 | * | ||
947 | * This cpu is also the best, if it has more than one task already. | ||
948 | * Siblings must be also busy(in most cases) as they didn't already | ||
949 | * pickup the extra load from this cpu and hence we need not check | ||
950 | * sibling runqueue info. This will avoid the checks and cache miss | ||
951 | * penalities associated with that. | ||
952 | */ | ||
953 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
954 | return cpu; | ||
955 | |||
956 | for_each_domain(cpu, sd) { | ||
957 | if (sd->flags & SD_WAKE_IDLE) { | ||
958 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
959 | for_each_cpu_mask(i, tmp) { | ||
960 | if (idle_cpu(i)) { | ||
961 | if (i != task_cpu(p)) { | ||
962 | schedstat_inc(p, | ||
963 | se.nr_wakeups_idle); | ||
964 | } | ||
965 | return i; | ||
966 | } | ||
967 | } | ||
968 | } else { | ||
969 | break; | ||
970 | } | ||
971 | } | ||
972 | return cpu; | ||
973 | } | ||
974 | #else | ||
975 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
976 | { | ||
977 | return cpu; | ||
978 | } | ||
979 | #endif | ||
980 | |||
981 | #ifdef CONFIG_SMP | ||
982 | static int select_task_rq_fair(struct task_struct *p, int sync) | ||
983 | { | ||
984 | int cpu, this_cpu; | ||
985 | struct rq *rq; | ||
986 | struct sched_domain *sd, *this_sd = NULL; | ||
987 | int new_cpu; | ||
988 | |||
989 | cpu = task_cpu(p); | ||
990 | rq = task_rq(p); | ||
991 | this_cpu = smp_processor_id(); | ||
992 | new_cpu = cpu; | ||
993 | |||
994 | if (cpu == this_cpu) | ||
995 | goto out_set_cpu; | ||
996 | |||
997 | for_each_domain(this_cpu, sd) { | ||
998 | if (cpu_isset(cpu, sd->span)) { | ||
999 | this_sd = sd; | ||
1000 | break; | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1005 | goto out_set_cpu; | ||
1006 | |||
1007 | /* | ||
1008 | * Check for affine wakeup and passive balancing possibilities. | ||
1009 | */ | ||
1010 | if (this_sd) { | ||
1011 | int idx = this_sd->wake_idx; | ||
1012 | unsigned int imbalance; | ||
1013 | unsigned long load, this_load; | ||
1014 | |||
1015 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1016 | |||
1017 | load = source_load(cpu, idx); | ||
1018 | this_load = target_load(this_cpu, idx); | ||
1019 | |||
1020 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1021 | |||
1022 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1023 | unsigned long tl = this_load; | ||
1024 | unsigned long tl_per_task; | ||
1025 | |||
1026 | /* | ||
1027 | * Attract cache-cold tasks on sync wakeups: | ||
1028 | */ | ||
1029 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1030 | goto out_set_cpu; | ||
1031 | |||
1032 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1033 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1034 | |||
1035 | /* | ||
1036 | * If sync wakeup then subtract the (maximum possible) | ||
1037 | * effect of the currently running task from the load | ||
1038 | * of the current CPU: | ||
1039 | */ | ||
1040 | if (sync) | ||
1041 | tl -= current->se.load.weight; | ||
1042 | |||
1043 | if ((tl <= load && | ||
1044 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1045 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1046 | /* | ||
1047 | * This domain has SD_WAKE_AFFINE and | ||
1048 | * p is cache cold in this domain, and | ||
1049 | * there is no bad imbalance. | ||
1050 | */ | ||
1051 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1052 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1053 | goto out_set_cpu; | ||
1054 | } | ||
1055 | } | ||
1056 | |||
1057 | /* | ||
1058 | * Start passive balancing when half the imbalance_pct | ||
1059 | * limit is reached. | ||
1060 | */ | ||
1061 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1062 | if (imbalance*this_load <= 100*load) { | ||
1063 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1064 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1065 | goto out_set_cpu; | ||
1066 | } | ||
1067 | } | ||
1068 | } | ||
1069 | |||
1070 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1071 | out_set_cpu: | ||
1072 | return wake_idle(new_cpu, p); | ||
1073 | } | ||
1074 | #endif /* CONFIG_SMP */ | ||
1075 | |||
1076 | |||
1077 | /* | ||
839 | * Preempt the current task with a newly woken task if needed: | 1078 | * Preempt the current task with a newly woken task if needed: |
840 | */ | 1079 | */ |
841 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | 1080 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
@@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
876 | 1115 | ||
877 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 1116 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
878 | { | 1117 | { |
1118 | struct task_struct *p; | ||
879 | struct cfs_rq *cfs_rq = &rq->cfs; | 1119 | struct cfs_rq *cfs_rq = &rq->cfs; |
880 | struct sched_entity *se; | 1120 | struct sched_entity *se; |
881 | 1121 | ||
@@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
887 | cfs_rq = group_cfs_rq(se); | 1127 | cfs_rq = group_cfs_rq(se); |
888 | } while (cfs_rq); | 1128 | } while (cfs_rq); |
889 | 1129 | ||
890 | return task_of(se); | 1130 | p = task_of(se); |
1131 | hrtick_start_fair(rq, p); | ||
1132 | |||
1133 | return p; | ||
891 | } | 1134 | } |
892 | 1135 | ||
893 | /* | 1136 | /* |
@@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
944 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1187 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); |
945 | } | 1188 | } |
946 | 1189 | ||
947 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
948 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | ||
949 | { | ||
950 | struct sched_entity *curr; | ||
951 | struct task_struct *p; | ||
952 | |||
953 | if (!cfs_rq->nr_running) | ||
954 | return MAX_PRIO; | ||
955 | |||
956 | curr = cfs_rq->curr; | ||
957 | if (!curr) | ||
958 | curr = __pick_next_entity(cfs_rq); | ||
959 | |||
960 | p = task_of(curr); | ||
961 | |||
962 | return p->prio; | ||
963 | } | ||
964 | #endif | ||
965 | |||
966 | static unsigned long | 1190 | static unsigned long |
967 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1191 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
968 | unsigned long max_load_move, | 1192 | unsigned long max_load_move, |
@@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
972 | struct cfs_rq *busy_cfs_rq; | 1196 | struct cfs_rq *busy_cfs_rq; |
973 | long rem_load_move = max_load_move; | 1197 | long rem_load_move = max_load_move; |
974 | struct rq_iterator cfs_rq_iterator; | 1198 | struct rq_iterator cfs_rq_iterator; |
1199 | unsigned long load_moved; | ||
975 | 1200 | ||
976 | cfs_rq_iterator.start = load_balance_start_fair; | 1201 | cfs_rq_iterator.start = load_balance_start_fair; |
977 | cfs_rq_iterator.next = load_balance_next_fair; | 1202 | cfs_rq_iterator.next = load_balance_next_fair; |
978 | 1203 | ||
979 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1204 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
980 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1205 | #ifdef CONFIG_FAIR_GROUP_SCHED |
981 | struct cfs_rq *this_cfs_rq; | 1206 | struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; |
982 | long imbalance; | 1207 | unsigned long maxload, task_load, group_weight; |
983 | unsigned long maxload; | 1208 | unsigned long thisload, per_task_load; |
1209 | struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; | ||
1210 | |||
1211 | task_load = busy_cfs_rq->load.weight; | ||
1212 | group_weight = se->load.weight; | ||
984 | 1213 | ||
985 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1214 | /* |
1215 | * 'group_weight' is contributed by tasks of total weight | ||
1216 | * 'task_load'. To move 'rem_load_move' worth of weight only, | ||
1217 | * we need to move a maximum task load of: | ||
1218 | * | ||
1219 | * maxload = (rem_load_move / group_weight) * task_load; | ||
1220 | */ | ||
1221 | maxload = (rem_load_move * task_load) / group_weight; | ||
986 | 1222 | ||
987 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1223 | if (!maxload || !task_load) |
988 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | ||
989 | if (imbalance <= 0) | ||
990 | continue; | 1224 | continue; |
991 | 1225 | ||
992 | /* Don't pull more than imbalance/2 */ | 1226 | per_task_load = task_load / busy_cfs_rq->nr_running; |
993 | imbalance /= 2; | 1227 | /* |
994 | maxload = min(rem_load_move, imbalance); | 1228 | * balance_tasks will try to forcibly move at least one task if |
1229 | * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if | ||
1230 | * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load. | ||
1231 | */ | ||
1232 | if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) | ||
1233 | continue; | ||
995 | 1234 | ||
996 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1235 | /* Disable priority-based load balance */ |
1236 | *this_best_prio = 0; | ||
1237 | thisload = this_cfs_rq->load.weight; | ||
997 | #else | 1238 | #else |
998 | # define maxload rem_load_move | 1239 | # define maxload rem_load_move |
999 | #endif | 1240 | #endif |
@@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1002 | * load_balance_[start|next]_fair iterators | 1243 | * load_balance_[start|next]_fair iterators |
1003 | */ | 1244 | */ |
1004 | cfs_rq_iterator.arg = busy_cfs_rq; | 1245 | cfs_rq_iterator.arg = busy_cfs_rq; |
1005 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | 1246 | load_moved = balance_tasks(this_rq, this_cpu, busiest, |
1006 | maxload, sd, idle, all_pinned, | 1247 | maxload, sd, idle, all_pinned, |
1007 | this_best_prio, | 1248 | this_best_prio, |
1008 | &cfs_rq_iterator); | 1249 | &cfs_rq_iterator); |
1009 | 1250 | ||
1251 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1252 | /* | ||
1253 | * load_moved holds the task load that was moved. The | ||
1254 | * effective (group) weight moved would be: | ||
1255 | * load_moved_eff = load_moved/task_load * group_weight; | ||
1256 | */ | ||
1257 | load_moved = (group_weight * load_moved) / task_load; | ||
1258 | |||
1259 | /* Adjust shares on both cpus to reflect load_moved */ | ||
1260 | group_weight -= load_moved; | ||
1261 | set_se_shares(se, group_weight); | ||
1262 | |||
1263 | se = busy_cfs_rq->tg->se[this_cpu]; | ||
1264 | if (!thisload) | ||
1265 | group_weight = load_moved; | ||
1266 | else | ||
1267 | group_weight = se->load.weight + load_moved; | ||
1268 | set_se_shares(se, group_weight); | ||
1269 | #endif | ||
1270 | |||
1271 | rem_load_move -= load_moved; | ||
1272 | |||
1010 | if (rem_load_move <= 0) | 1273 | if (rem_load_move <= 0) |
1011 | break; | 1274 | break; |
1012 | } | 1275 | } |
@@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1042 | /* | 1305 | /* |
1043 | * scheduler tick hitting a task of our scheduling class: | 1306 | * scheduler tick hitting a task of our scheduling class: |
1044 | */ | 1307 | */ |
1045 | static void task_tick_fair(struct rq *rq, struct task_struct *curr) | 1308 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
1046 | { | 1309 | { |
1047 | struct cfs_rq *cfs_rq; | 1310 | struct cfs_rq *cfs_rq; |
1048 | struct sched_entity *se = &curr->se; | 1311 | struct sched_entity *se = &curr->se; |
1049 | 1312 | ||
1050 | for_each_sched_entity(se) { | 1313 | for_each_sched_entity(se) { |
1051 | cfs_rq = cfs_rq_of(se); | 1314 | cfs_rq = cfs_rq_of(se); |
1052 | entity_tick(cfs_rq, se); | 1315 | entity_tick(cfs_rq, se, queued); |
1053 | } | 1316 | } |
1054 | } | 1317 | } |
1055 | 1318 | ||
@@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1087 | resched_task(rq->curr); | 1350 | resched_task(rq->curr); |
1088 | } | 1351 | } |
1089 | 1352 | ||
1353 | /* | ||
1354 | * Priority of the task has changed. Check to see if we preempt | ||
1355 | * the current task. | ||
1356 | */ | ||
1357 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | ||
1358 | int oldprio, int running) | ||
1359 | { | ||
1360 | /* | ||
1361 | * Reschedule if we are currently running on this runqueue and | ||
1362 | * our priority decreased, or if we are not currently running on | ||
1363 | * this runqueue and our priority is higher than the current's | ||
1364 | */ | ||
1365 | if (running) { | ||
1366 | if (p->prio > oldprio) | ||
1367 | resched_task(rq->curr); | ||
1368 | } else | ||
1369 | check_preempt_curr(rq, p); | ||
1370 | } | ||
1371 | |||
1372 | /* | ||
1373 | * We switched to the sched_fair class. | ||
1374 | */ | ||
1375 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | ||
1376 | int running) | ||
1377 | { | ||
1378 | /* | ||
1379 | * We were most likely switched from sched_rt, so | ||
1380 | * kick off the schedule if running, otherwise just see | ||
1381 | * if we can still preempt the current task. | ||
1382 | */ | ||
1383 | if (running) | ||
1384 | resched_task(rq->curr); | ||
1385 | else | ||
1386 | check_preempt_curr(rq, p); | ||
1387 | } | ||
1388 | |||
1090 | /* Account for a task changing its policy or group. | 1389 | /* Account for a task changing its policy or group. |
1091 | * | 1390 | * |
1092 | * This routine is mostly called to set cfs_rq->curr field when a task | 1391 | * This routine is mostly called to set cfs_rq->curr field when a task |
@@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = { | |||
1108 | .enqueue_task = enqueue_task_fair, | 1407 | .enqueue_task = enqueue_task_fair, |
1109 | .dequeue_task = dequeue_task_fair, | 1408 | .dequeue_task = dequeue_task_fair, |
1110 | .yield_task = yield_task_fair, | 1409 | .yield_task = yield_task_fair, |
1410 | #ifdef CONFIG_SMP | ||
1411 | .select_task_rq = select_task_rq_fair, | ||
1412 | #endif /* CONFIG_SMP */ | ||
1111 | 1413 | ||
1112 | .check_preempt_curr = check_preempt_wakeup, | 1414 | .check_preempt_curr = check_preempt_wakeup, |
1113 | 1415 | ||
@@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = { | |||
1122 | .set_curr_task = set_curr_task_fair, | 1424 | .set_curr_task = set_curr_task_fair, |
1123 | .task_tick = task_tick_fair, | 1425 | .task_tick = task_tick_fair, |
1124 | .task_new = task_new_fair, | 1426 | .task_new = task_new_fair, |
1427 | |||
1428 | .prio_changed = prio_changed_fair, | ||
1429 | .switched_to = switched_to_fair, | ||
1125 | }; | 1430 | }; |
1126 | 1431 | ||
1127 | #ifdef CONFIG_SCHED_DEBUG | 1432 | #ifdef CONFIG_SCHED_DEBUG |
@@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
1132 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1437 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1133 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | 1438 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); |
1134 | #endif | 1439 | #endif |
1440 | rcu_read_lock(); | ||
1135 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1441 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1136 | print_cfs_rq(m, cpu, cfs_rq); | 1442 | print_cfs_rq(m, cpu, cfs_rq); |
1443 | rcu_read_unlock(); | ||
1137 | } | 1444 | } |
1138 | #endif | 1445 | #endif |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index bf9c25c15b8b..2bcafa375633 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -5,6 +5,12 @@ | |||
5 | * handled in sched_fair.c) | 5 | * handled in sched_fair.c) |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #ifdef CONFIG_SMP | ||
9 | static int select_task_rq_idle(struct task_struct *p, int sync) | ||
10 | { | ||
11 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
12 | } | ||
13 | #endif /* CONFIG_SMP */ | ||
8 | /* | 14 | /* |
9 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
10 | */ | 16 | */ |
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
55 | } | 61 | } |
56 | #endif | 62 | #endif |
57 | 63 | ||
58 | static void task_tick_idle(struct rq *rq, struct task_struct *curr) | 64 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
59 | { | 65 | { |
60 | } | 66 | } |
61 | 67 | ||
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq) | |||
63 | { | 69 | { |
64 | } | 70 | } |
65 | 71 | ||
72 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | ||
73 | int running) | ||
74 | { | ||
75 | /* Can this actually happen?? */ | ||
76 | if (running) | ||
77 | resched_task(rq->curr); | ||
78 | else | ||
79 | check_preempt_curr(rq, p); | ||
80 | } | ||
81 | |||
82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | ||
83 | int oldprio, int running) | ||
84 | { | ||
85 | /* This can happen for hot plug CPUS */ | ||
86 | |||
87 | /* | ||
88 | * Reschedule if we are currently running on this runqueue and | ||
89 | * our priority decreased, or if we are not currently running on | ||
90 | * this runqueue and our priority is higher than the current's | ||
91 | */ | ||
92 | if (running) { | ||
93 | if (p->prio > oldprio) | ||
94 | resched_task(rq->curr); | ||
95 | } else | ||
96 | check_preempt_curr(rq, p); | ||
97 | } | ||
98 | |||
66 | /* | 99 | /* |
67 | * Simple, special scheduling class for the per-CPU idle tasks: | 100 | * Simple, special scheduling class for the per-CPU idle tasks: |
68 | */ | 101 | */ |
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = { | |||
72 | 105 | ||
73 | /* dequeue is not valid, we print a debug message there: */ | 106 | /* dequeue is not valid, we print a debug message there: */ |
74 | .dequeue_task = dequeue_task_idle, | 107 | .dequeue_task = dequeue_task_idle, |
108 | #ifdef CONFIG_SMP | ||
109 | .select_task_rq = select_task_rq_idle, | ||
110 | #endif /* CONFIG_SMP */ | ||
75 | 111 | ||
76 | .check_preempt_curr = check_preempt_curr_idle, | 112 | .check_preempt_curr = check_preempt_curr_idle, |
77 | 113 | ||
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = { | |||
85 | 121 | ||
86 | .set_curr_task = set_curr_task_idle, | 122 | .set_curr_task = set_curr_task_idle, |
87 | .task_tick = task_tick_idle, | 123 | .task_tick = task_tick_idle, |
124 | |||
125 | .prio_changed = prio_changed_idle, | ||
126 | .switched_to = switched_to_idle, | ||
127 | |||
88 | /* no .task_new for idle tasks */ | 128 | /* no .task_new for idle tasks */ |
89 | }; | 129 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9ba3daa03475..274b40d7bef2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -3,6 +3,217 @@ | |||
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #ifdef CONFIG_SMP | ||
7 | |||
8 | static inline int rt_overloaded(struct rq *rq) | ||
9 | { | ||
10 | return atomic_read(&rq->rd->rto_count); | ||
11 | } | ||
12 | |||
13 | static inline void rt_set_overload(struct rq *rq) | ||
14 | { | ||
15 | cpu_set(rq->cpu, rq->rd->rto_mask); | ||
16 | /* | ||
17 | * Make sure the mask is visible before we set | ||
18 | * the overload count. That is checked to determine | ||
19 | * if we should look at the mask. It would be a shame | ||
20 | * if we looked at the mask, but the mask was not | ||
21 | * updated yet. | ||
22 | */ | ||
23 | wmb(); | ||
24 | atomic_inc(&rq->rd->rto_count); | ||
25 | } | ||
26 | |||
27 | static inline void rt_clear_overload(struct rq *rq) | ||
28 | { | ||
29 | /* the order here really doesn't matter */ | ||
30 | atomic_dec(&rq->rd->rto_count); | ||
31 | cpu_clear(rq->cpu, rq->rd->rto_mask); | ||
32 | } | ||
33 | |||
34 | static void update_rt_migration(struct rq *rq) | ||
35 | { | ||
36 | if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { | ||
37 | if (!rq->rt.overloaded) { | ||
38 | rt_set_overload(rq); | ||
39 | rq->rt.overloaded = 1; | ||
40 | } | ||
41 | } else if (rq->rt.overloaded) { | ||
42 | rt_clear_overload(rq); | ||
43 | rq->rt.overloaded = 0; | ||
44 | } | ||
45 | } | ||
46 | #endif /* CONFIG_SMP */ | ||
47 | |||
48 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | ||
49 | { | ||
50 | return container_of(rt_se, struct task_struct, rt); | ||
51 | } | ||
52 | |||
53 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | ||
54 | { | ||
55 | return !list_empty(&rt_se->run_list); | ||
56 | } | ||
57 | |||
58 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
59 | |||
60 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
61 | { | ||
62 | if (!rt_rq->tg) | ||
63 | return SCHED_RT_FRAC; | ||
64 | |||
65 | return rt_rq->tg->rt_ratio; | ||
66 | } | ||
67 | |||
68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
69 | list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | ||
70 | |||
71 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
72 | { | ||
73 | return rt_rq->rq; | ||
74 | } | ||
75 | |||
76 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
77 | { | ||
78 | return rt_se->rt_rq; | ||
79 | } | ||
80 | |||
81 | #define for_each_sched_rt_entity(rt_se) \ | ||
82 | for (; rt_se; rt_se = rt_se->parent) | ||
83 | |||
84 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
85 | { | ||
86 | return rt_se->my_q; | ||
87 | } | ||
88 | |||
89 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se); | ||
90 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | ||
91 | |||
92 | static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
93 | { | ||
94 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
95 | |||
96 | if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { | ||
97 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | ||
98 | |||
99 | enqueue_rt_entity(rt_se); | ||
100 | if (rt_rq->highest_prio < curr->prio) | ||
101 | resched_task(curr); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
106 | { | ||
107 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
108 | |||
109 | if (rt_se && on_rt_rq(rt_se)) | ||
110 | dequeue_rt_entity(rt_se); | ||
111 | } | ||
112 | |||
113 | #else | ||
114 | |||
115 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
116 | { | ||
117 | return sysctl_sched_rt_ratio; | ||
118 | } | ||
119 | |||
120 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
121 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
122 | |||
123 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
124 | { | ||
125 | return container_of(rt_rq, struct rq, rt); | ||
126 | } | ||
127 | |||
128 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
129 | { | ||
130 | struct task_struct *p = rt_task_of(rt_se); | ||
131 | struct rq *rq = task_rq(p); | ||
132 | |||
133 | return &rq->rt; | ||
134 | } | ||
135 | |||
136 | #define for_each_sched_rt_entity(rt_se) \ | ||
137 | for (; rt_se; rt_se = NULL) | ||
138 | |||
139 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
140 | { | ||
141 | return NULL; | ||
142 | } | ||
143 | |||
144 | static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
145 | { | ||
146 | } | ||
147 | |||
148 | static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
149 | { | ||
150 | } | ||
151 | |||
152 | #endif | ||
153 | |||
154 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | ||
155 | { | ||
156 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
157 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | ||
158 | |||
159 | if (rt_rq) | ||
160 | return rt_rq->highest_prio; | ||
161 | #endif | ||
162 | |||
163 | return rt_task_of(rt_se)->prio; | ||
164 | } | ||
165 | |||
166 | static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) | ||
167 | { | ||
168 | unsigned int rt_ratio = sched_rt_ratio(rt_rq); | ||
169 | u64 period, ratio; | ||
170 | |||
171 | if (rt_ratio == SCHED_RT_FRAC) | ||
172 | return 0; | ||
173 | |||
174 | if (rt_rq->rt_throttled) | ||
175 | return 1; | ||
176 | |||
177 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
178 | ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
179 | |||
180 | if (rt_rq->rt_time > ratio) { | ||
181 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
182 | |||
183 | rq->rt_throttled = 1; | ||
184 | rt_rq->rt_throttled = 1; | ||
185 | |||
186 | sched_rt_ratio_dequeue(rt_rq); | ||
187 | return 1; | ||
188 | } | ||
189 | |||
190 | return 0; | ||
191 | } | ||
192 | |||
193 | static void update_sched_rt_period(struct rq *rq) | ||
194 | { | ||
195 | struct rt_rq *rt_rq; | ||
196 | u64 period; | ||
197 | |||
198 | while (rq->clock > rq->rt_period_expire) { | ||
199 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
200 | rq->rt_period_expire += period; | ||
201 | |||
202 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
203 | unsigned long rt_ratio = sched_rt_ratio(rt_rq); | ||
204 | u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
205 | |||
206 | rt_rq->rt_time -= min(rt_rq->rt_time, ratio); | ||
207 | if (rt_rq->rt_throttled) { | ||
208 | rt_rq->rt_throttled = 0; | ||
209 | sched_rt_ratio_enqueue(rt_rq); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | rq->rt_throttled = 0; | ||
214 | } | ||
215 | } | ||
216 | |||
6 | /* | 217 | /* |
7 | * Update the current task's runtime statistics. Skip current tasks that | 218 | * Update the current task's runtime statistics. Skip current tasks that |
8 | * are not in our scheduling class. | 219 | * are not in our scheduling class. |
@@ -10,6 +221,8 @@ | |||
10 | static void update_curr_rt(struct rq *rq) | 221 | static void update_curr_rt(struct rq *rq) |
11 | { | 222 | { |
12 | struct task_struct *curr = rq->curr; | 223 | struct task_struct *curr = rq->curr; |
224 | struct sched_rt_entity *rt_se = &curr->rt; | ||
225 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
13 | u64 delta_exec; | 226 | u64 delta_exec; |
14 | 227 | ||
15 | if (!task_has_rt_policy(curr)) | 228 | if (!task_has_rt_policy(curr)) |
@@ -24,47 +237,228 @@ static void update_curr_rt(struct rq *rq) | |||
24 | curr->se.sum_exec_runtime += delta_exec; | 237 | curr->se.sum_exec_runtime += delta_exec; |
25 | curr->se.exec_start = rq->clock; | 238 | curr->se.exec_start = rq->clock; |
26 | cpuacct_charge(curr, delta_exec); | 239 | cpuacct_charge(curr, delta_exec); |
240 | |||
241 | rt_rq->rt_time += delta_exec; | ||
242 | /* | ||
243 | * might make it a tad more accurate: | ||
244 | * | ||
245 | * update_sched_rt_period(rq); | ||
246 | */ | ||
247 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
248 | resched_task(curr); | ||
27 | } | 249 | } |
28 | 250 | ||
29 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | 251 | static inline |
252 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
253 | { | ||
254 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
255 | rt_rq->rt_nr_running++; | ||
256 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
257 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) | ||
258 | rt_rq->highest_prio = rt_se_prio(rt_se); | ||
259 | #endif | ||
260 | #ifdef CONFIG_SMP | ||
261 | if (rt_se->nr_cpus_allowed > 1) { | ||
262 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
263 | rq->rt.rt_nr_migratory++; | ||
264 | } | ||
265 | |||
266 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
267 | #endif | ||
268 | } | ||
269 | |||
270 | static inline | ||
271 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
272 | { | ||
273 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
274 | WARN_ON(!rt_rq->rt_nr_running); | ||
275 | rt_rq->rt_nr_running--; | ||
276 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
277 | if (rt_rq->rt_nr_running) { | ||
278 | struct rt_prio_array *array; | ||
279 | |||
280 | WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); | ||
281 | if (rt_se_prio(rt_se) == rt_rq->highest_prio) { | ||
282 | /* recalculate */ | ||
283 | array = &rt_rq->active; | ||
284 | rt_rq->highest_prio = | ||
285 | sched_find_first_bit(array->bitmap); | ||
286 | } /* otherwise leave rt_rq->highest_prio alone */ | ||
287 | } else | ||
288 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
289 | #endif | ||
290 | #ifdef CONFIG_SMP | ||
291 | if (rt_se->nr_cpus_allowed > 1) { | ||
292 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
293 | rq->rt.rt_nr_migratory--; | ||
294 | } | ||
295 | |||
296 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
297 | #endif /* CONFIG_SMP */ | ||
298 | } | ||
299 | |||
300 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se) | ||
301 | { | ||
302 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
303 | struct rt_prio_array *array = &rt_rq->active; | ||
304 | struct rt_rq *group_rq = group_rt_rq(rt_se); | ||
305 | |||
306 | if (group_rq && group_rq->rt_throttled) | ||
307 | return; | ||
308 | |||
309 | list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
310 | __set_bit(rt_se_prio(rt_se), array->bitmap); | ||
311 | |||
312 | inc_rt_tasks(rt_se, rt_rq); | ||
313 | } | ||
314 | |||
315 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | ||
30 | { | 316 | { |
31 | struct rt_prio_array *array = &rq->rt.active; | 317 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
318 | struct rt_prio_array *array = &rt_rq->active; | ||
319 | |||
320 | list_del_init(&rt_se->run_list); | ||
321 | if (list_empty(array->queue + rt_se_prio(rt_se))) | ||
322 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | ||
32 | 323 | ||
33 | list_add_tail(&p->run_list, array->queue + p->prio); | 324 | dec_rt_tasks(rt_se, rt_rq); |
34 | __set_bit(p->prio, array->bitmap); | 325 | } |
326 | |||
327 | /* | ||
328 | * Because the prio of an upper entry depends on the lower | ||
329 | * entries, we must remove entries top - down. | ||
330 | * | ||
331 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
332 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
333 | */ | ||
334 | static void dequeue_rt_stack(struct task_struct *p) | ||
335 | { | ||
336 | struct sched_rt_entity *rt_se, *top_se; | ||
337 | |||
338 | /* | ||
339 | * dequeue all, top - down. | ||
340 | */ | ||
341 | do { | ||
342 | rt_se = &p->rt; | ||
343 | top_se = NULL; | ||
344 | for_each_sched_rt_entity(rt_se) { | ||
345 | if (on_rt_rq(rt_se)) | ||
346 | top_se = rt_se; | ||
347 | } | ||
348 | if (top_se) | ||
349 | dequeue_rt_entity(top_se); | ||
350 | } while (top_se); | ||
35 | } | 351 | } |
36 | 352 | ||
37 | /* | 353 | /* |
38 | * Adding/removing a task to/from a priority array: | 354 | * Adding/removing a task to/from a priority array: |
39 | */ | 355 | */ |
356 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | ||
357 | { | ||
358 | struct sched_rt_entity *rt_se = &p->rt; | ||
359 | |||
360 | if (wakeup) | ||
361 | rt_se->timeout = 0; | ||
362 | |||
363 | dequeue_rt_stack(p); | ||
364 | |||
365 | /* | ||
366 | * enqueue everybody, bottom - up. | ||
367 | */ | ||
368 | for_each_sched_rt_entity(rt_se) | ||
369 | enqueue_rt_entity(rt_se); | ||
370 | |||
371 | inc_cpu_load(rq, p->se.load.weight); | ||
372 | } | ||
373 | |||
40 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 374 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
41 | { | 375 | { |
42 | struct rt_prio_array *array = &rq->rt.active; | 376 | struct sched_rt_entity *rt_se = &p->rt; |
377 | struct rt_rq *rt_rq; | ||
43 | 378 | ||
44 | update_curr_rt(rq); | 379 | update_curr_rt(rq); |
45 | 380 | ||
46 | list_del(&p->run_list); | 381 | dequeue_rt_stack(p); |
47 | if (list_empty(array->queue + p->prio)) | 382 | |
48 | __clear_bit(p->prio, array->bitmap); | 383 | /* |
384 | * re-enqueue all non-empty rt_rq entities. | ||
385 | */ | ||
386 | for_each_sched_rt_entity(rt_se) { | ||
387 | rt_rq = group_rt_rq(rt_se); | ||
388 | if (rt_rq && rt_rq->rt_nr_running) | ||
389 | enqueue_rt_entity(rt_se); | ||
390 | } | ||
391 | |||
392 | dec_cpu_load(rq, p->se.load.weight); | ||
49 | } | 393 | } |
50 | 394 | ||
51 | /* | 395 | /* |
52 | * Put task to the end of the run list without the overhead of dequeue | 396 | * Put task to the end of the run list without the overhead of dequeue |
53 | * followed by enqueue. | 397 | * followed by enqueue. |
54 | */ | 398 | */ |
399 | static | ||
400 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) | ||
401 | { | ||
402 | struct rt_prio_array *array = &rt_rq->active; | ||
403 | |||
404 | list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
405 | } | ||
406 | |||
55 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | 407 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) |
56 | { | 408 | { |
57 | struct rt_prio_array *array = &rq->rt.active; | 409 | struct sched_rt_entity *rt_se = &p->rt; |
410 | struct rt_rq *rt_rq; | ||
58 | 411 | ||
59 | list_move_tail(&p->run_list, array->queue + p->prio); | 412 | for_each_sched_rt_entity(rt_se) { |
413 | rt_rq = rt_rq_of_se(rt_se); | ||
414 | requeue_rt_entity(rt_rq, rt_se); | ||
415 | } | ||
60 | } | 416 | } |
61 | 417 | ||
62 | static void | 418 | static void yield_task_rt(struct rq *rq) |
63 | yield_task_rt(struct rq *rq) | ||
64 | { | 419 | { |
65 | requeue_task_rt(rq, rq->curr); | 420 | requeue_task_rt(rq, rq->curr); |
66 | } | 421 | } |
67 | 422 | ||
423 | #ifdef CONFIG_SMP | ||
424 | static int find_lowest_rq(struct task_struct *task); | ||
425 | |||
426 | static int select_task_rq_rt(struct task_struct *p, int sync) | ||
427 | { | ||
428 | struct rq *rq = task_rq(p); | ||
429 | |||
430 | /* | ||
431 | * If the current task is an RT task, then | ||
432 | * try to see if we can wake this RT task up on another | ||
433 | * runqueue. Otherwise simply start this RT task | ||
434 | * on its current runqueue. | ||
435 | * | ||
436 | * We want to avoid overloading runqueues. Even if | ||
437 | * the RT task is of higher priority than the current RT task. | ||
438 | * RT tasks behave differently than other tasks. If | ||
439 | * one gets preempted, we try to push it off to another queue. | ||
440 | * So trying to keep a preempting RT task on the same | ||
441 | * cache hot CPU will force the running RT task to | ||
442 | * a cold CPU. So we waste all the cache for the lower | ||
443 | * RT task in hopes of saving some of a RT task | ||
444 | * that is just being woken and probably will have | ||
445 | * cold cache anyway. | ||
446 | */ | ||
447 | if (unlikely(rt_task(rq->curr)) && | ||
448 | (p->rt.nr_cpus_allowed > 1)) { | ||
449 | int cpu = find_lowest_rq(p); | ||
450 | |||
451 | return (cpu == -1) ? task_cpu(p) : cpu; | ||
452 | } | ||
453 | |||
454 | /* | ||
455 | * Otherwise, just let it ride on the affined RQ and the | ||
456 | * post-schedule router will push the preempted task away | ||
457 | */ | ||
458 | return task_cpu(p); | ||
459 | } | ||
460 | #endif /* CONFIG_SMP */ | ||
461 | |||
68 | /* | 462 | /* |
69 | * Preempt the current task with a newly woken task if needed: | 463 | * Preempt the current task with a newly woken task if needed: |
70 | */ | 464 | */ |
@@ -74,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | |||
74 | resched_task(rq->curr); | 468 | resched_task(rq->curr); |
75 | } | 469 | } |
76 | 470 | ||
77 | static struct task_struct *pick_next_task_rt(struct rq *rq) | 471 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, |
472 | struct rt_rq *rt_rq) | ||
78 | { | 473 | { |
79 | struct rt_prio_array *array = &rq->rt.active; | 474 | struct rt_prio_array *array = &rt_rq->active; |
80 | struct task_struct *next; | 475 | struct sched_rt_entity *next = NULL; |
81 | struct list_head *queue; | 476 | struct list_head *queue; |
82 | int idx; | 477 | int idx; |
83 | 478 | ||
84 | idx = sched_find_first_bit(array->bitmap); | 479 | idx = sched_find_first_bit(array->bitmap); |
85 | if (idx >= MAX_RT_PRIO) | 480 | BUG_ON(idx >= MAX_RT_PRIO); |
86 | return NULL; | ||
87 | 481 | ||
88 | queue = array->queue + idx; | 482 | queue = array->queue + idx; |
89 | next = list_entry(queue->next, struct task_struct, run_list); | 483 | next = list_entry(queue->next, struct sched_rt_entity, run_list); |
90 | |||
91 | next->se.exec_start = rq->clock; | ||
92 | 484 | ||
93 | return next; | 485 | return next; |
94 | } | 486 | } |
95 | 487 | ||
488 | static struct task_struct *pick_next_task_rt(struct rq *rq) | ||
489 | { | ||
490 | struct sched_rt_entity *rt_se; | ||
491 | struct task_struct *p; | ||
492 | struct rt_rq *rt_rq; | ||
493 | |||
494 | rt_rq = &rq->rt; | ||
495 | |||
496 | if (unlikely(!rt_rq->rt_nr_running)) | ||
497 | return NULL; | ||
498 | |||
499 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
500 | return NULL; | ||
501 | |||
502 | do { | ||
503 | rt_se = pick_next_rt_entity(rq, rt_rq); | ||
504 | BUG_ON(!rt_se); | ||
505 | rt_rq = group_rt_rq(rt_se); | ||
506 | } while (rt_rq); | ||
507 | |||
508 | p = rt_task_of(rt_se); | ||
509 | p->se.exec_start = rq->clock; | ||
510 | return p; | ||
511 | } | ||
512 | |||
96 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 513 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
97 | { | 514 | { |
98 | update_curr_rt(rq); | 515 | update_curr_rt(rq); |
@@ -100,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
100 | } | 517 | } |
101 | 518 | ||
102 | #ifdef CONFIG_SMP | 519 | #ifdef CONFIG_SMP |
103 | /* | 520 | |
104 | * Load-balancing iterator. Note: while the runqueue stays locked | 521 | /* Only try algorithms three times */ |
105 | * during the whole iteration, the current task might be | 522 | #define RT_MAX_TRIES 3 |
106 | * dequeued so the iterator has to be dequeue-safe. Here we | 523 | |
107 | * achieve that by always pre-iterating before returning | 524 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest); |
108 | * the current task: | 525 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); |
109 | */ | 526 | |
110 | static struct task_struct *load_balance_start_rt(void *arg) | 527 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
111 | { | 528 | { |
112 | struct rq *rq = arg; | 529 | if (!task_running(rq, p) && |
113 | struct rt_prio_array *array = &rq->rt.active; | 530 | (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && |
114 | struct list_head *head, *curr; | 531 | (p->rt.nr_cpus_allowed > 1)) |
115 | struct task_struct *p; | 532 | return 1; |
533 | return 0; | ||
534 | } | ||
535 | |||
536 | /* Return the second highest RT task, NULL otherwise */ | ||
537 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | ||
538 | { | ||
539 | struct task_struct *next = NULL; | ||
540 | struct sched_rt_entity *rt_se; | ||
541 | struct rt_prio_array *array; | ||
542 | struct rt_rq *rt_rq; | ||
116 | int idx; | 543 | int idx; |
117 | 544 | ||
118 | idx = sched_find_first_bit(array->bitmap); | 545 | for_each_leaf_rt_rq(rt_rq, rq) { |
119 | if (idx >= MAX_RT_PRIO) | 546 | array = &rt_rq->active; |
120 | return NULL; | 547 | idx = sched_find_first_bit(array->bitmap); |
548 | next_idx: | ||
549 | if (idx >= MAX_RT_PRIO) | ||
550 | continue; | ||
551 | if (next && next->prio < idx) | ||
552 | continue; | ||
553 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | ||
554 | struct task_struct *p = rt_task_of(rt_se); | ||
555 | if (pick_rt_task(rq, p, cpu)) { | ||
556 | next = p; | ||
557 | break; | ||
558 | } | ||
559 | } | ||
560 | if (!next) { | ||
561 | idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | ||
562 | goto next_idx; | ||
563 | } | ||
564 | } | ||
121 | 565 | ||
122 | head = array->queue + idx; | 566 | return next; |
123 | curr = head->prev; | 567 | } |
124 | 568 | ||
125 | p = list_entry(curr, struct task_struct, run_list); | 569 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); |
126 | 570 | ||
127 | curr = curr->prev; | 571 | static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) |
572 | { | ||
573 | int lowest_prio = -1; | ||
574 | int lowest_cpu = -1; | ||
575 | int count = 0; | ||
576 | int cpu; | ||
128 | 577 | ||
129 | rq->rt.rt_load_balance_idx = idx; | 578 | cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); |
130 | rq->rt.rt_load_balance_head = head; | ||
131 | rq->rt.rt_load_balance_curr = curr; | ||
132 | 579 | ||
133 | return p; | 580 | /* |
581 | * Scan each rq for the lowest prio. | ||
582 | */ | ||
583 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
584 | struct rq *rq = cpu_rq(cpu); | ||
585 | |||
586 | /* We look for lowest RT prio or non-rt CPU */ | ||
587 | if (rq->rt.highest_prio >= MAX_RT_PRIO) { | ||
588 | /* | ||
589 | * if we already found a low RT queue | ||
590 | * and now we found this non-rt queue | ||
591 | * clear the mask and set our bit. | ||
592 | * Otherwise just return the queue as is | ||
593 | * and the count==1 will cause the algorithm | ||
594 | * to use the first bit found. | ||
595 | */ | ||
596 | if (lowest_cpu != -1) { | ||
597 | cpus_clear(*lowest_mask); | ||
598 | cpu_set(rq->cpu, *lowest_mask); | ||
599 | } | ||
600 | return 1; | ||
601 | } | ||
602 | |||
603 | /* no locking for now */ | ||
604 | if ((rq->rt.highest_prio > task->prio) | ||
605 | && (rq->rt.highest_prio >= lowest_prio)) { | ||
606 | if (rq->rt.highest_prio > lowest_prio) { | ||
607 | /* new low - clear old data */ | ||
608 | lowest_prio = rq->rt.highest_prio; | ||
609 | lowest_cpu = cpu; | ||
610 | count = 0; | ||
611 | } | ||
612 | count++; | ||
613 | } else | ||
614 | cpu_clear(cpu, *lowest_mask); | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * Clear out all the set bits that represent | ||
619 | * runqueues that were of higher prio than | ||
620 | * the lowest_prio. | ||
621 | */ | ||
622 | if (lowest_cpu > 0) { | ||
623 | /* | ||
624 | * Perhaps we could add another cpumask op to | ||
625 | * zero out bits. Like cpu_zero_bits(cpumask, nrbits); | ||
626 | * Then that could be optimized to use memset and such. | ||
627 | */ | ||
628 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
629 | if (cpu >= lowest_cpu) | ||
630 | break; | ||
631 | cpu_clear(cpu, *lowest_mask); | ||
632 | } | ||
633 | } | ||
634 | |||
635 | return count; | ||
134 | } | 636 | } |
135 | 637 | ||
136 | static struct task_struct *load_balance_next_rt(void *arg) | 638 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) |
137 | { | 639 | { |
138 | struct rq *rq = arg; | 640 | int first; |
139 | struct rt_prio_array *array = &rq->rt.active; | 641 | |
140 | struct list_head *head, *curr; | 642 | /* "this_cpu" is cheaper to preempt than a remote processor */ |
141 | struct task_struct *p; | 643 | if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) |
142 | int idx; | 644 | return this_cpu; |
645 | |||
646 | first = first_cpu(*mask); | ||
647 | if (first != NR_CPUS) | ||
648 | return first; | ||
649 | |||
650 | return -1; | ||
651 | } | ||
652 | |||
653 | static int find_lowest_rq(struct task_struct *task) | ||
654 | { | ||
655 | struct sched_domain *sd; | ||
656 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); | ||
657 | int this_cpu = smp_processor_id(); | ||
658 | int cpu = task_cpu(task); | ||
659 | int count = find_lowest_cpus(task, lowest_mask); | ||
143 | 660 | ||
144 | idx = rq->rt.rt_load_balance_idx; | 661 | if (!count) |
145 | head = rq->rt.rt_load_balance_head; | 662 | return -1; /* No targets found */ |
146 | curr = rq->rt.rt_load_balance_curr; | ||
147 | 663 | ||
148 | /* | 664 | /* |
149 | * If we arrived back to the head again then | 665 | * There is no sense in performing an optimal search if only one |
150 | * iterate to the next queue (if any): | 666 | * target is found. |
151 | */ | 667 | */ |
152 | if (unlikely(head == curr)) { | 668 | if (count == 1) |
153 | int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | 669 | return first_cpu(*lowest_mask); |
154 | 670 | ||
155 | if (next_idx >= MAX_RT_PRIO) | 671 | /* |
156 | return NULL; | 672 | * At this point we have built a mask of cpus representing the |
673 | * lowest priority tasks in the system. Now we want to elect | ||
674 | * the best one based on our affinity and topology. | ||
675 | * | ||
676 | * We prioritize the last cpu that the task executed on since | ||
677 | * it is most likely cache-hot in that location. | ||
678 | */ | ||
679 | if (cpu_isset(cpu, *lowest_mask)) | ||
680 | return cpu; | ||
681 | |||
682 | /* | ||
683 | * Otherwise, we consult the sched_domains span maps to figure | ||
684 | * out which cpu is logically closest to our hot cache data. | ||
685 | */ | ||
686 | if (this_cpu == cpu) | ||
687 | this_cpu = -1; /* Skip this_cpu opt if the same */ | ||
688 | |||
689 | for_each_domain(cpu, sd) { | ||
690 | if (sd->flags & SD_WAKE_AFFINE) { | ||
691 | cpumask_t domain_mask; | ||
692 | int best_cpu; | ||
157 | 693 | ||
158 | idx = next_idx; | 694 | cpus_and(domain_mask, sd->span, *lowest_mask); |
159 | head = array->queue + idx; | ||
160 | curr = head->prev; | ||
161 | 695 | ||
162 | rq->rt.rt_load_balance_idx = idx; | 696 | best_cpu = pick_optimal_cpu(this_cpu, |
163 | rq->rt.rt_load_balance_head = head; | 697 | &domain_mask); |
698 | if (best_cpu != -1) | ||
699 | return best_cpu; | ||
700 | } | ||
164 | } | 701 | } |
165 | 702 | ||
166 | p = list_entry(curr, struct task_struct, run_list); | 703 | /* |
704 | * And finally, if there were no matches within the domains | ||
705 | * just give the caller *something* to work with from the compatible | ||
706 | * locations. | ||
707 | */ | ||
708 | return pick_optimal_cpu(this_cpu, lowest_mask); | ||
709 | } | ||
167 | 710 | ||
168 | curr = curr->prev; | 711 | /* Will lock the rq it finds */ |
712 | static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | ||
713 | { | ||
714 | struct rq *lowest_rq = NULL; | ||
715 | int tries; | ||
716 | int cpu; | ||
169 | 717 | ||
170 | rq->rt.rt_load_balance_curr = curr; | 718 | for (tries = 0; tries < RT_MAX_TRIES; tries++) { |
719 | cpu = find_lowest_rq(task); | ||
171 | 720 | ||
172 | return p; | 721 | if ((cpu == -1) || (cpu == rq->cpu)) |
722 | break; | ||
723 | |||
724 | lowest_rq = cpu_rq(cpu); | ||
725 | |||
726 | /* if the prio of this runqueue changed, try again */ | ||
727 | if (double_lock_balance(rq, lowest_rq)) { | ||
728 | /* | ||
729 | * We had to unlock the run queue. In | ||
730 | * the mean time, task could have | ||
731 | * migrated already or had its affinity changed. | ||
732 | * Also make sure that it wasn't scheduled on its rq. | ||
733 | */ | ||
734 | if (unlikely(task_rq(task) != rq || | ||
735 | !cpu_isset(lowest_rq->cpu, | ||
736 | task->cpus_allowed) || | ||
737 | task_running(rq, task) || | ||
738 | !task->se.on_rq)) { | ||
739 | |||
740 | spin_unlock(&lowest_rq->lock); | ||
741 | lowest_rq = NULL; | ||
742 | break; | ||
743 | } | ||
744 | } | ||
745 | |||
746 | /* If this rq is still suitable use it. */ | ||
747 | if (lowest_rq->rt.highest_prio > task->prio) | ||
748 | break; | ||
749 | |||
750 | /* try again */ | ||
751 | spin_unlock(&lowest_rq->lock); | ||
752 | lowest_rq = NULL; | ||
753 | } | ||
754 | |||
755 | return lowest_rq; | ||
756 | } | ||
757 | |||
758 | /* | ||
759 | * If the current CPU has more than one RT task, see if the non | ||
760 | * running task can migrate over to a CPU that is running a task | ||
761 | * of lesser priority. | ||
762 | */ | ||
763 | static int push_rt_task(struct rq *rq) | ||
764 | { | ||
765 | struct task_struct *next_task; | ||
766 | struct rq *lowest_rq; | ||
767 | int ret = 0; | ||
768 | int paranoid = RT_MAX_TRIES; | ||
769 | |||
770 | if (!rq->rt.overloaded) | ||
771 | return 0; | ||
772 | |||
773 | next_task = pick_next_highest_task_rt(rq, -1); | ||
774 | if (!next_task) | ||
775 | return 0; | ||
776 | |||
777 | retry: | ||
778 | if (unlikely(next_task == rq->curr)) { | ||
779 | WARN_ON(1); | ||
780 | return 0; | ||
781 | } | ||
782 | |||
783 | /* | ||
784 | * It's possible that the next_task slipped in of | ||
785 | * higher priority than current. If that's the case | ||
786 | * just reschedule current. | ||
787 | */ | ||
788 | if (unlikely(next_task->prio < rq->curr->prio)) { | ||
789 | resched_task(rq->curr); | ||
790 | return 0; | ||
791 | } | ||
792 | |||
793 | /* We might release rq lock */ | ||
794 | get_task_struct(next_task); | ||
795 | |||
796 | /* find_lock_lowest_rq locks the rq if found */ | ||
797 | lowest_rq = find_lock_lowest_rq(next_task, rq); | ||
798 | if (!lowest_rq) { | ||
799 | struct task_struct *task; | ||
800 | /* | ||
801 | * find_lock_lowest_rq releases rq->lock | ||
802 | * so it is possible that next_task has changed. | ||
803 | * If it has, then try again. | ||
804 | */ | ||
805 | task = pick_next_highest_task_rt(rq, -1); | ||
806 | if (unlikely(task != next_task) && task && paranoid--) { | ||
807 | put_task_struct(next_task); | ||
808 | next_task = task; | ||
809 | goto retry; | ||
810 | } | ||
811 | goto out; | ||
812 | } | ||
813 | |||
814 | deactivate_task(rq, next_task, 0); | ||
815 | set_task_cpu(next_task, lowest_rq->cpu); | ||
816 | activate_task(lowest_rq, next_task, 0); | ||
817 | |||
818 | resched_task(lowest_rq->curr); | ||
819 | |||
820 | spin_unlock(&lowest_rq->lock); | ||
821 | |||
822 | ret = 1; | ||
823 | out: | ||
824 | put_task_struct(next_task); | ||
825 | |||
826 | return ret; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * TODO: Currently we just use the second highest prio task on | ||
831 | * the queue, and stop when it can't migrate (or there's | ||
832 | * no more RT tasks). There may be a case where a lower | ||
833 | * priority RT task has a different affinity than the | ||
834 | * higher RT task. In this case the lower RT task could | ||
835 | * possibly be able to migrate where as the higher priority | ||
836 | * RT task could not. We currently ignore this issue. | ||
837 | * Enhancements are welcome! | ||
838 | */ | ||
839 | static void push_rt_tasks(struct rq *rq) | ||
840 | { | ||
841 | /* push_rt_task will return true if it moved an RT */ | ||
842 | while (push_rt_task(rq)) | ||
843 | ; | ||
844 | } | ||
845 | |||
846 | static int pull_rt_task(struct rq *this_rq) | ||
847 | { | ||
848 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
849 | struct task_struct *p, *next; | ||
850 | struct rq *src_rq; | ||
851 | |||
852 | if (likely(!rt_overloaded(this_rq))) | ||
853 | return 0; | ||
854 | |||
855 | next = pick_next_task_rt(this_rq); | ||
856 | |||
857 | for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { | ||
858 | if (this_cpu == cpu) | ||
859 | continue; | ||
860 | |||
861 | src_rq = cpu_rq(cpu); | ||
862 | /* | ||
863 | * We can potentially drop this_rq's lock in | ||
864 | * double_lock_balance, and another CPU could | ||
865 | * steal our next task - hence we must cause | ||
866 | * the caller to recalculate the next task | ||
867 | * in that case: | ||
868 | */ | ||
869 | if (double_lock_balance(this_rq, src_rq)) { | ||
870 | struct task_struct *old_next = next; | ||
871 | |||
872 | next = pick_next_task_rt(this_rq); | ||
873 | if (next != old_next) | ||
874 | ret = 1; | ||
875 | } | ||
876 | |||
877 | /* | ||
878 | * Are there still pullable RT tasks? | ||
879 | */ | ||
880 | if (src_rq->rt.rt_nr_running <= 1) | ||
881 | goto skip; | ||
882 | |||
883 | p = pick_next_highest_task_rt(src_rq, this_cpu); | ||
884 | |||
885 | /* | ||
886 | * Do we have an RT task that preempts | ||
887 | * the to-be-scheduled task? | ||
888 | */ | ||
889 | if (p && (!next || (p->prio < next->prio))) { | ||
890 | WARN_ON(p == src_rq->curr); | ||
891 | WARN_ON(!p->se.on_rq); | ||
892 | |||
893 | /* | ||
894 | * There's a chance that p is higher in priority | ||
895 | * than what's currently running on its cpu. | ||
896 | * This is just that p is waking up and hasn't | ||
897 | * had a chance to schedule. We only pull | ||
898 | * p if it is lower in priority than the | ||
899 | * current task on the run queue or | ||
900 | * this_rq next task is lower in prio than | ||
901 | * the current task on that rq. | ||
902 | */ | ||
903 | if (p->prio < src_rq->curr->prio || | ||
904 | (next && next->prio < src_rq->curr->prio)) | ||
905 | goto skip; | ||
906 | |||
907 | ret = 1; | ||
908 | |||
909 | deactivate_task(src_rq, p, 0); | ||
910 | set_task_cpu(p, this_cpu); | ||
911 | activate_task(this_rq, p, 0); | ||
912 | /* | ||
913 | * We continue with the search, just in | ||
914 | * case there's an even higher prio task | ||
915 | * in another runqueue. (low likelihood | ||
916 | * but possible) | ||
917 | * | ||
918 | * Update next so that we won't pick a task | ||
919 | * on another cpu with a priority lower (or equal) | ||
920 | * than the one we just picked. | ||
921 | */ | ||
922 | next = p; | ||
923 | |||
924 | } | ||
925 | skip: | ||
926 | spin_unlock(&src_rq->lock); | ||
927 | } | ||
928 | |||
929 | return ret; | ||
930 | } | ||
931 | |||
932 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | ||
933 | { | ||
934 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
935 | if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) | ||
936 | pull_rt_task(rq); | ||
937 | } | ||
938 | |||
939 | static void post_schedule_rt(struct rq *rq) | ||
940 | { | ||
941 | /* | ||
942 | * If we have more than one rt_task queued, then | ||
943 | * see if we can push the other rt_tasks off to other CPUS. | ||
944 | * Note we may release the rq lock, and since | ||
945 | * the lock was owned by prev, we need to release it | ||
946 | * first via finish_lock_switch and then reacquire it here. | ||
947 | */ | ||
948 | if (unlikely(rq->rt.overloaded)) { | ||
949 | spin_lock_irq(&rq->lock); | ||
950 | push_rt_tasks(rq); | ||
951 | spin_unlock_irq(&rq->lock); | ||
952 | } | ||
953 | } | ||
954 | |||
955 | |||
956 | static void task_wake_up_rt(struct rq *rq, struct task_struct *p) | ||
957 | { | ||
958 | if (!task_running(rq, p) && | ||
959 | (p->prio >= rq->rt.highest_prio) && | ||
960 | rq->rt.overloaded) | ||
961 | push_rt_tasks(rq); | ||
173 | } | 962 | } |
174 | 963 | ||
175 | static unsigned long | 964 | static unsigned long |
@@ -178,38 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
178 | struct sched_domain *sd, enum cpu_idle_type idle, | 967 | struct sched_domain *sd, enum cpu_idle_type idle, |
179 | int *all_pinned, int *this_best_prio) | 968 | int *all_pinned, int *this_best_prio) |
180 | { | 969 | { |
181 | struct rq_iterator rt_rq_iterator; | 970 | /* don't touch RT tasks */ |
182 | 971 | return 0; | |
183 | rt_rq_iterator.start = load_balance_start_rt; | ||
184 | rt_rq_iterator.next = load_balance_next_rt; | ||
185 | /* pass 'busiest' rq argument into | ||
186 | * load_balance_[start|next]_rt iterators | ||
187 | */ | ||
188 | rt_rq_iterator.arg = busiest; | ||
189 | |||
190 | return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, | ||
191 | idle, all_pinned, this_best_prio, &rt_rq_iterator); | ||
192 | } | 972 | } |
193 | 973 | ||
194 | static int | 974 | static int |
195 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | 975 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, |
196 | struct sched_domain *sd, enum cpu_idle_type idle) | 976 | struct sched_domain *sd, enum cpu_idle_type idle) |
197 | { | 977 | { |
198 | struct rq_iterator rt_rq_iterator; | 978 | /* don't touch RT tasks */ |
979 | return 0; | ||
980 | } | ||
981 | |||
982 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | ||
983 | { | ||
984 | int weight = cpus_weight(*new_mask); | ||
985 | |||
986 | BUG_ON(!rt_task(p)); | ||
199 | 987 | ||
200 | rt_rq_iterator.start = load_balance_start_rt; | 988 | /* |
201 | rt_rq_iterator.next = load_balance_next_rt; | 989 | * Update the migration status of the RQ if we have an RT task |
202 | rt_rq_iterator.arg = busiest; | 990 | * which is running AND changing its weight value. |
991 | */ | ||
992 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
993 | struct rq *rq = task_rq(p); | ||
994 | |||
995 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | ||
996 | rq->rt.rt_nr_migratory++; | ||
997 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
998 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
999 | rq->rt.rt_nr_migratory--; | ||
1000 | } | ||
1001 | |||
1002 | update_rt_migration(rq); | ||
1003 | } | ||
203 | 1004 | ||
204 | return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, | 1005 | p->cpus_allowed = *new_mask; |
205 | &rt_rq_iterator); | 1006 | p->rt.nr_cpus_allowed = weight; |
206 | } | 1007 | } |
207 | #endif | ||
208 | 1008 | ||
209 | static void task_tick_rt(struct rq *rq, struct task_struct *p) | 1009 | /* Assumes rq->lock is held */ |
1010 | static void join_domain_rt(struct rq *rq) | ||
1011 | { | ||
1012 | if (rq->rt.overloaded) | ||
1013 | rt_set_overload(rq); | ||
1014 | } | ||
1015 | |||
1016 | /* Assumes rq->lock is held */ | ||
1017 | static void leave_domain_rt(struct rq *rq) | ||
1018 | { | ||
1019 | if (rq->rt.overloaded) | ||
1020 | rt_clear_overload(rq); | ||
1021 | } | ||
1022 | |||
1023 | /* | ||
1024 | * When switch from the rt queue, we bring ourselves to a position | ||
1025 | * that we might want to pull RT tasks from other runqueues. | ||
1026 | */ | ||
1027 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | ||
1028 | int running) | ||
1029 | { | ||
1030 | /* | ||
1031 | * If there are other RT tasks then we will reschedule | ||
1032 | * and the scheduling of the other RT tasks will handle | ||
1033 | * the balancing. But if we are the last RT task | ||
1034 | * we may need to handle the pulling of RT tasks | ||
1035 | * now. | ||
1036 | */ | ||
1037 | if (!rq->rt.rt_nr_running) | ||
1038 | pull_rt_task(rq); | ||
1039 | } | ||
1040 | #endif /* CONFIG_SMP */ | ||
1041 | |||
1042 | /* | ||
1043 | * When switching a task to RT, we may overload the runqueue | ||
1044 | * with RT tasks. In this case we try to push them off to | ||
1045 | * other runqueues. | ||
1046 | */ | ||
1047 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | ||
1048 | int running) | ||
1049 | { | ||
1050 | int check_resched = 1; | ||
1051 | |||
1052 | /* | ||
1053 | * If we are already running, then there's nothing | ||
1054 | * that needs to be done. But if we are not running | ||
1055 | * we may need to preempt the current running task. | ||
1056 | * If that current running task is also an RT task | ||
1057 | * then see if we can move to another run queue. | ||
1058 | */ | ||
1059 | if (!running) { | ||
1060 | #ifdef CONFIG_SMP | ||
1061 | if (rq->rt.overloaded && push_rt_task(rq) && | ||
1062 | /* Don't resched if we changed runqueues */ | ||
1063 | rq != task_rq(p)) | ||
1064 | check_resched = 0; | ||
1065 | #endif /* CONFIG_SMP */ | ||
1066 | if (check_resched && p->prio < rq->curr->prio) | ||
1067 | resched_task(rq->curr); | ||
1068 | } | ||
1069 | } | ||
1070 | |||
1071 | /* | ||
1072 | * Priority of the task has changed. This may cause | ||
1073 | * us to initiate a push or pull. | ||
1074 | */ | ||
1075 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | ||
1076 | int oldprio, int running) | ||
1077 | { | ||
1078 | if (running) { | ||
1079 | #ifdef CONFIG_SMP | ||
1080 | /* | ||
1081 | * If our priority decreases while running, we | ||
1082 | * may need to pull tasks to this runqueue. | ||
1083 | */ | ||
1084 | if (oldprio < p->prio) | ||
1085 | pull_rt_task(rq); | ||
1086 | /* | ||
1087 | * If there's a higher priority task waiting to run | ||
1088 | * then reschedule. | ||
1089 | */ | ||
1090 | if (p->prio > rq->rt.highest_prio) | ||
1091 | resched_task(p); | ||
1092 | #else | ||
1093 | /* For UP simply resched on drop of prio */ | ||
1094 | if (oldprio < p->prio) | ||
1095 | resched_task(p); | ||
1096 | #endif /* CONFIG_SMP */ | ||
1097 | } else { | ||
1098 | /* | ||
1099 | * This task is not running, but if it is | ||
1100 | * greater than the current running task | ||
1101 | * then reschedule. | ||
1102 | */ | ||
1103 | if (p->prio < rq->curr->prio) | ||
1104 | resched_task(rq->curr); | ||
1105 | } | ||
1106 | } | ||
1107 | |||
1108 | static void watchdog(struct rq *rq, struct task_struct *p) | ||
1109 | { | ||
1110 | unsigned long soft, hard; | ||
1111 | |||
1112 | if (!p->signal) | ||
1113 | return; | ||
1114 | |||
1115 | soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; | ||
1116 | hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; | ||
1117 | |||
1118 | if (soft != RLIM_INFINITY) { | ||
1119 | unsigned long next; | ||
1120 | |||
1121 | p->rt.timeout++; | ||
1122 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); | ||
1123 | if (p->rt.timeout > next) | ||
1124 | p->it_sched_expires = p->se.sum_exec_runtime; | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | ||
210 | { | 1129 | { |
211 | update_curr_rt(rq); | 1130 | update_curr_rt(rq); |
212 | 1131 | ||
1132 | watchdog(rq, p); | ||
1133 | |||
213 | /* | 1134 | /* |
214 | * RR tasks need a special form of timeslice management. | 1135 | * RR tasks need a special form of timeslice management. |
215 | * FIFO tasks have no timeslices. | 1136 | * FIFO tasks have no timeslices. |
@@ -217,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
217 | if (p->policy != SCHED_RR) | 1138 | if (p->policy != SCHED_RR) |
218 | return; | 1139 | return; |
219 | 1140 | ||
220 | if (--p->time_slice) | 1141 | if (--p->rt.time_slice) |
221 | return; | 1142 | return; |
222 | 1143 | ||
223 | p->time_slice = DEF_TIMESLICE; | 1144 | p->rt.time_slice = DEF_TIMESLICE; |
224 | 1145 | ||
225 | /* | 1146 | /* |
226 | * Requeue to the end of queue if we are not the only element | 1147 | * Requeue to the end of queue if we are not the only element |
227 | * on the queue: | 1148 | * on the queue: |
228 | */ | 1149 | */ |
229 | if (p->run_list.prev != p->run_list.next) { | 1150 | if (p->rt.run_list.prev != p->rt.run_list.next) { |
230 | requeue_task_rt(rq, p); | 1151 | requeue_task_rt(rq, p); |
231 | set_tsk_need_resched(p); | 1152 | set_tsk_need_resched(p); |
232 | } | 1153 | } |
@@ -244,6 +1165,9 @@ const struct sched_class rt_sched_class = { | |||
244 | .enqueue_task = enqueue_task_rt, | 1165 | .enqueue_task = enqueue_task_rt, |
245 | .dequeue_task = dequeue_task_rt, | 1166 | .dequeue_task = dequeue_task_rt, |
246 | .yield_task = yield_task_rt, | 1167 | .yield_task = yield_task_rt, |
1168 | #ifdef CONFIG_SMP | ||
1169 | .select_task_rq = select_task_rq_rt, | ||
1170 | #endif /* CONFIG_SMP */ | ||
247 | 1171 | ||
248 | .check_preempt_curr = check_preempt_curr_rt, | 1172 | .check_preempt_curr = check_preempt_curr_rt, |
249 | 1173 | ||
@@ -253,8 +1177,18 @@ const struct sched_class rt_sched_class = { | |||
253 | #ifdef CONFIG_SMP | 1177 | #ifdef CONFIG_SMP |
254 | .load_balance = load_balance_rt, | 1178 | .load_balance = load_balance_rt, |
255 | .move_one_task = move_one_task_rt, | 1179 | .move_one_task = move_one_task_rt, |
1180 | .set_cpus_allowed = set_cpus_allowed_rt, | ||
1181 | .join_domain = join_domain_rt, | ||
1182 | .leave_domain = leave_domain_rt, | ||
1183 | .pre_schedule = pre_schedule_rt, | ||
1184 | .post_schedule = post_schedule_rt, | ||
1185 | .task_wake_up = task_wake_up_rt, | ||
1186 | .switched_from = switched_from_rt, | ||
256 | #endif | 1187 | #endif |
257 | 1188 | ||
258 | .set_curr_task = set_curr_task_rt, | 1189 | .set_curr_task = set_curr_task_rt, |
259 | .task_tick = task_tick_rt, | 1190 | .task_tick = task_tick_rt, |
1191 | |||
1192 | .prio_changed = prio_changed_rt, | ||
1193 | .switched_to = switched_to_rt, | ||
260 | }; | 1194 | }; |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 11df812263c8..c1d76552446e 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/cpu.h> | 10 | #include <linux/cpu.h> |
11 | #include <linux/nmi.h> | ||
11 | #include <linux/init.h> | 12 | #include <linux/init.h> |
12 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
13 | #include <linux/freezer.h> | 14 | #include <linux/freezer.h> |
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp); | |||
23 | static DEFINE_PER_CPU(unsigned long, print_timestamp); | 24 | static DEFINE_PER_CPU(unsigned long, print_timestamp); |
24 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | 25 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); |
25 | 26 | ||
26 | static int did_panic; | 27 | static int __read_mostly did_panic; |
27 | int softlockup_thresh = 10; | 28 | unsigned long __read_mostly softlockup_thresh = 60; |
28 | 29 | ||
29 | static int | 30 | static int |
30 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | 31 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) |
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = { | |||
45 | */ | 46 | */ |
46 | static unsigned long get_timestamp(int this_cpu) | 47 | static unsigned long get_timestamp(int this_cpu) |
47 | { | 48 | { |
48 | return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ | 49 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ |
49 | } | 50 | } |
50 | 51 | ||
51 | void touch_softlockup_watchdog(void) | 52 | void touch_softlockup_watchdog(void) |
@@ -100,11 +101,7 @@ void softlockup_tick(void) | |||
100 | 101 | ||
101 | now = get_timestamp(this_cpu); | 102 | now = get_timestamp(this_cpu); |
102 | 103 | ||
103 | /* Wake up the high-prio watchdog task every second: */ | 104 | /* Warn about unreasonable delays: */ |
104 | if (now > (touch_timestamp + 1)) | ||
105 | wake_up_process(per_cpu(watchdog_task, this_cpu)); | ||
106 | |||
107 | /* Warn about unreasonable 10+ seconds delays: */ | ||
108 | if (now <= (touch_timestamp + softlockup_thresh)) | 105 | if (now <= (touch_timestamp + softlockup_thresh)) |
109 | return; | 106 | return; |
110 | 107 | ||
@@ -122,11 +119,93 @@ void softlockup_tick(void) | |||
122 | } | 119 | } |
123 | 120 | ||
124 | /* | 121 | /* |
122 | * Have a reasonable limit on the number of tasks checked: | ||
123 | */ | ||
124 | unsigned long __read_mostly sysctl_hung_task_check_count = 1024; | ||
125 | |||
126 | /* | ||
127 | * Zero means infinite timeout - no checking done: | ||
128 | */ | ||
129 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | ||
130 | |||
131 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | ||
132 | |||
133 | /* | ||
134 | * Only do the hung-tasks check on one CPU: | ||
135 | */ | ||
136 | static int check_cpu __read_mostly = -1; | ||
137 | |||
138 | static void check_hung_task(struct task_struct *t, unsigned long now) | ||
139 | { | ||
140 | unsigned long switch_count = t->nvcsw + t->nivcsw; | ||
141 | |||
142 | if (t->flags & PF_FROZEN) | ||
143 | return; | ||
144 | |||
145 | if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { | ||
146 | t->last_switch_count = switch_count; | ||
147 | t->last_switch_timestamp = now; | ||
148 | return; | ||
149 | } | ||
150 | if ((long)(now - t->last_switch_timestamp) < | ||
151 | sysctl_hung_task_timeout_secs) | ||
152 | return; | ||
153 | if (sysctl_hung_task_warnings < 0) | ||
154 | return; | ||
155 | sysctl_hung_task_warnings--; | ||
156 | |||
157 | /* | ||
158 | * Ok, the task did not get scheduled for more than 2 minutes, | ||
159 | * complain: | ||
160 | */ | ||
161 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | ||
162 | "%ld seconds.\n", t->comm, t->pid, | ||
163 | sysctl_hung_task_timeout_secs); | ||
164 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
165 | " disables this message.\n"); | ||
166 | sched_show_task(t); | ||
167 | __debug_show_held_locks(t); | ||
168 | |||
169 | t->last_switch_timestamp = now; | ||
170 | touch_nmi_watchdog(); | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for | ||
175 | * a really long time (120 seconds). If that happens, print out | ||
176 | * a warning. | ||
177 | */ | ||
178 | static void check_hung_uninterruptible_tasks(int this_cpu) | ||
179 | { | ||
180 | int max_count = sysctl_hung_task_check_count; | ||
181 | unsigned long now = get_timestamp(this_cpu); | ||
182 | struct task_struct *g, *t; | ||
183 | |||
184 | /* | ||
185 | * If the system crashed already then all bets are off, | ||
186 | * do not report extra hung tasks: | ||
187 | */ | ||
188 | if ((tainted & TAINT_DIE) || did_panic) | ||
189 | return; | ||
190 | |||
191 | read_lock(&tasklist_lock); | ||
192 | do_each_thread(g, t) { | ||
193 | if (!--max_count) | ||
194 | break; | ||
195 | if (t->state & TASK_UNINTERRUPTIBLE) | ||
196 | check_hung_task(t, now); | ||
197 | } while_each_thread(g, t); | ||
198 | |||
199 | read_unlock(&tasklist_lock); | ||
200 | } | ||
201 | |||
202 | /* | ||
125 | * The watchdog thread - runs every second and touches the timestamp. | 203 | * The watchdog thread - runs every second and touches the timestamp. |
126 | */ | 204 | */ |
127 | static int watchdog(void *__bind_cpu) | 205 | static int watchdog(void *__bind_cpu) |
128 | { | 206 | { |
129 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 207 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
208 | int this_cpu = (long)__bind_cpu; | ||
130 | 209 | ||
131 | sched_setscheduler(current, SCHED_FIFO, &param); | 210 | sched_setscheduler(current, SCHED_FIFO, &param); |
132 | 211 | ||
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu) | |||
135 | 214 | ||
136 | /* | 215 | /* |
137 | * Run briefly once per second to reset the softlockup timestamp. | 216 | * Run briefly once per second to reset the softlockup timestamp. |
138 | * If this gets delayed for more than 10 seconds then the | 217 | * If this gets delayed for more than 60 seconds then the |
139 | * debug-printout triggers in softlockup_tick(). | 218 | * debug-printout triggers in softlockup_tick(). |
140 | */ | 219 | */ |
141 | while (!kthread_should_stop()) { | 220 | while (!kthread_should_stop()) { |
142 | set_current_state(TASK_INTERRUPTIBLE); | ||
143 | touch_softlockup_watchdog(); | 221 | touch_softlockup_watchdog(); |
144 | schedule(); | 222 | msleep_interruptible(10000); |
223 | |||
224 | if (this_cpu != check_cpu) | ||
225 | continue; | ||
226 | |||
227 | if (sysctl_hung_task_timeout_secs) | ||
228 | check_hung_uninterruptible_tasks(this_cpu); | ||
145 | } | 229 | } |
146 | 230 | ||
147 | return 0; | 231 | return 0; |
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
171 | break; | 255 | break; |
172 | case CPU_ONLINE: | 256 | case CPU_ONLINE: |
173 | case CPU_ONLINE_FROZEN: | 257 | case CPU_ONLINE_FROZEN: |
258 | check_cpu = any_online_cpu(cpu_online_map); | ||
174 | wake_up_process(per_cpu(watchdog_task, hotcpu)); | 259 | wake_up_process(per_cpu(watchdog_task, hotcpu)); |
175 | break; | 260 | break; |
176 | #ifdef CONFIG_HOTPLUG_CPU | 261 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
181 | /* Unbind so it can run. Fall thru. */ | 266 | /* Unbind so it can run. Fall thru. */ |
182 | kthread_bind(per_cpu(watchdog_task, hotcpu), | 267 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
183 | any_online_cpu(cpu_online_map)); | 268 | any_online_cpu(cpu_online_map)); |
269 | case CPU_DOWN_PREPARE: | ||
270 | case CPU_DOWN_PREPARE_FROZEN: | ||
271 | if (hotcpu == check_cpu) { | ||
272 | cpumask_t temp_cpu_online_map = cpu_online_map; | ||
273 | |||
274 | cpu_clear(hotcpu, temp_cpu_online_map); | ||
275 | check_cpu = any_online_cpu(temp_cpu_online_map); | ||
276 | } | ||
277 | break; | ||
184 | case CPU_DEAD: | 278 | case CPU_DEAD: |
185 | case CPU_DEAD_FROZEN: | 279 | case CPU_DEAD_FROZEN: |
186 | p = per_cpu(watchdog_task, hotcpu); | 280 | p = per_cpu(watchdog_task, hotcpu); |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 319821ef78af..51b5ee53571a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) | |||
203 | int ret; | 203 | int ret; |
204 | 204 | ||
205 | /* No CPUs can come up or down during this. */ | 205 | /* No CPUs can come up or down during this. */ |
206 | lock_cpu_hotplug(); | 206 | get_online_cpus(); |
207 | p = __stop_machine_run(fn, data, cpu); | 207 | p = __stop_machine_run(fn, data, cpu); |
208 | if (!IS_ERR(p)) | 208 | if (!IS_ERR(p)) |
209 | ret = kthread_stop(p); | 209 | ret = kthread_stop(p); |
210 | else | 210 | else |
211 | ret = PTR_ERR(p); | 211 | ret = PTR_ERR(p); |
212 | unlock_cpu_hotplug(); | 212 | put_online_cpus(); |
213 | 213 | ||
214 | return ret; | 214 | return ret; |
215 | } | 215 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c68f68dcc605..8e96558cb8f3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -81,6 +81,7 @@ extern int compat_log; | |||
81 | extern int maps_protect; | 81 | extern int maps_protect; |
82 | extern int sysctl_stat_interval; | 82 | extern int sysctl_stat_interval; |
83 | extern int audit_argv_kb; | 83 | extern int audit_argv_kb; |
84 | extern int latencytop_enabled; | ||
84 | 85 | ||
85 | /* Constants used for minimum and maximum */ | 86 | /* Constants used for minimum and maximum */ |
86 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 87 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
@@ -306,9 +307,43 @@ static struct ctl_table kern_table[] = { | |||
306 | .procname = "sched_nr_migrate", | 307 | .procname = "sched_nr_migrate", |
307 | .data = &sysctl_sched_nr_migrate, | 308 | .data = &sysctl_sched_nr_migrate, |
308 | .maxlen = sizeof(unsigned int), | 309 | .maxlen = sizeof(unsigned int), |
309 | .mode = 644, | 310 | .mode = 0644, |
311 | .proc_handler = &proc_dointvec, | ||
312 | }, | ||
313 | { | ||
314 | .ctl_name = CTL_UNNUMBERED, | ||
315 | .procname = "sched_rt_period_ms", | ||
316 | .data = &sysctl_sched_rt_period, | ||
317 | .maxlen = sizeof(unsigned int), | ||
318 | .mode = 0644, | ||
310 | .proc_handler = &proc_dointvec, | 319 | .proc_handler = &proc_dointvec, |
311 | }, | 320 | }, |
321 | { | ||
322 | .ctl_name = CTL_UNNUMBERED, | ||
323 | .procname = "sched_rt_ratio", | ||
324 | .data = &sysctl_sched_rt_ratio, | ||
325 | .maxlen = sizeof(unsigned int), | ||
326 | .mode = 0644, | ||
327 | .proc_handler = &proc_dointvec, | ||
328 | }, | ||
329 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
330 | { | ||
331 | .ctl_name = CTL_UNNUMBERED, | ||
332 | .procname = "sched_min_bal_int_shares", | ||
333 | .data = &sysctl_sched_min_bal_int_shares, | ||
334 | .maxlen = sizeof(unsigned int), | ||
335 | .mode = 0644, | ||
336 | .proc_handler = &proc_dointvec, | ||
337 | }, | ||
338 | { | ||
339 | .ctl_name = CTL_UNNUMBERED, | ||
340 | .procname = "sched_max_bal_int_shares", | ||
341 | .data = &sysctl_sched_max_bal_int_shares, | ||
342 | .maxlen = sizeof(unsigned int), | ||
343 | .mode = 0644, | ||
344 | .proc_handler = &proc_dointvec, | ||
345 | }, | ||
346 | #endif | ||
312 | #endif | 347 | #endif |
313 | { | 348 | { |
314 | .ctl_name = CTL_UNNUMBERED, | 349 | .ctl_name = CTL_UNNUMBERED, |
@@ -382,6 +417,15 @@ static struct ctl_table kern_table[] = { | |||
382 | .proc_handler = &proc_dointvec_taint, | 417 | .proc_handler = &proc_dointvec_taint, |
383 | }, | 418 | }, |
384 | #endif | 419 | #endif |
420 | #ifdef CONFIG_LATENCYTOP | ||
421 | { | ||
422 | .procname = "latencytop", | ||
423 | .data = &latencytop_enabled, | ||
424 | .maxlen = sizeof(int), | ||
425 | .mode = 0644, | ||
426 | .proc_handler = &proc_dointvec, | ||
427 | }, | ||
428 | #endif | ||
385 | #ifdef CONFIG_SECURITY_CAPABILITIES | 429 | #ifdef CONFIG_SECURITY_CAPABILITIES |
386 | { | 430 | { |
387 | .procname = "cap-bound", | 431 | .procname = "cap-bound", |
@@ -728,13 +772,40 @@ static struct ctl_table kern_table[] = { | |||
728 | .ctl_name = CTL_UNNUMBERED, | 772 | .ctl_name = CTL_UNNUMBERED, |
729 | .procname = "softlockup_thresh", | 773 | .procname = "softlockup_thresh", |
730 | .data = &softlockup_thresh, | 774 | .data = &softlockup_thresh, |
731 | .maxlen = sizeof(int), | 775 | .maxlen = sizeof(unsigned long), |
732 | .mode = 0644, | 776 | .mode = 0644, |
733 | .proc_handler = &proc_dointvec_minmax, | 777 | .proc_handler = &proc_doulongvec_minmax, |
734 | .strategy = &sysctl_intvec, | 778 | .strategy = &sysctl_intvec, |
735 | .extra1 = &one, | 779 | .extra1 = &one, |
736 | .extra2 = &sixty, | 780 | .extra2 = &sixty, |
737 | }, | 781 | }, |
782 | { | ||
783 | .ctl_name = CTL_UNNUMBERED, | ||
784 | .procname = "hung_task_check_count", | ||
785 | .data = &sysctl_hung_task_check_count, | ||
786 | .maxlen = sizeof(unsigned long), | ||
787 | .mode = 0644, | ||
788 | .proc_handler = &proc_doulongvec_minmax, | ||
789 | .strategy = &sysctl_intvec, | ||
790 | }, | ||
791 | { | ||
792 | .ctl_name = CTL_UNNUMBERED, | ||
793 | .procname = "hung_task_timeout_secs", | ||
794 | .data = &sysctl_hung_task_timeout_secs, | ||
795 | .maxlen = sizeof(unsigned long), | ||
796 | .mode = 0644, | ||
797 | .proc_handler = &proc_doulongvec_minmax, | ||
798 | .strategy = &sysctl_intvec, | ||
799 | }, | ||
800 | { | ||
801 | .ctl_name = CTL_UNNUMBERED, | ||
802 | .procname = "hung_task_warnings", | ||
803 | .data = &sysctl_hung_task_warnings, | ||
804 | .maxlen = sizeof(unsigned long), | ||
805 | .mode = 0644, | ||
806 | .proc_handler = &proc_doulongvec_minmax, | ||
807 | .strategy = &sysctl_intvec, | ||
808 | }, | ||
738 | #endif | 809 | #endif |
739 | #ifdef CONFIG_COMPAT | 810 | #ifdef CONFIG_COMPAT |
740 | { | 811 | { |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb89fa8db110..1a21b6fdb674 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void) | |||
153 | void tick_nohz_stop_sched_tick(void) | 153 | void tick_nohz_stop_sched_tick(void) |
154 | { | 154 | { |
155 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 155 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
156 | unsigned long rt_jiffies; | ||
156 | struct tick_sched *ts; | 157 | struct tick_sched *ts; |
157 | ktime_t last_update, expires, now, delta; | 158 | ktime_t last_update, expires, now, delta; |
158 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 159 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void) | |||
216 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 217 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
217 | delta_jiffies = next_jiffies - last_jiffies; | 218 | delta_jiffies = next_jiffies - last_jiffies; |
218 | 219 | ||
220 | rt_jiffies = rt_needs_cpu(cpu); | ||
221 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
222 | delta_jiffies = rt_jiffies; | ||
223 | |||
219 | if (rcu_needs_cpu(cpu)) | 224 | if (rcu_needs_cpu(cpu)) |
220 | delta_jiffies = 1; | 225 | delta_jiffies = 1; |
221 | /* | 226 | /* |
@@ -509,7 +514,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
509 | { | 514 | { |
510 | struct tick_sched *ts = | 515 | struct tick_sched *ts = |
511 | container_of(timer, struct tick_sched, sched_timer); | 516 | container_of(timer, struct tick_sched, sched_timer); |
512 | struct hrtimer_cpu_base *base = timer->base->cpu_base; | ||
513 | struct pt_regs *regs = get_irq_regs(); | 517 | struct pt_regs *regs = get_irq_regs(); |
514 | ktime_t now = ktime_get(); | 518 | ktime_t now = ktime_get(); |
515 | int cpu = smp_processor_id(); | 519 | int cpu = smp_processor_id(); |
@@ -547,15 +551,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
547 | touch_softlockup_watchdog(); | 551 | touch_softlockup_watchdog(); |
548 | ts->idle_jiffies++; | 552 | ts->idle_jiffies++; |
549 | } | 553 | } |
550 | /* | ||
551 | * update_process_times() might take tasklist_lock, hence | ||
552 | * drop the base lock. sched-tick hrtimers are per-CPU and | ||
553 | * never accessible by userspace APIs, so this is safe to do. | ||
554 | */ | ||
555 | spin_unlock(&base->lock); | ||
556 | update_process_times(user_mode(regs)); | 554 | update_process_times(user_mode(regs)); |
557 | profile_tick(CPU_PROFILING); | 555 | profile_tick(CPU_PROFILING); |
558 | spin_lock(&base->lock); | ||
559 | } | 556 | } |
560 | 557 | ||
561 | /* Do not restart, when we are in the idle loop */ | 558 | /* Do not restart, when we are in the idle loop */ |
diff --git a/kernel/timer.c b/kernel/timer.c index 2a00c22203f3..f739dfb539ce 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -896,7 +896,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
896 | { | 896 | { |
897 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 897 | tvec_base_t *base = __get_cpu_var(tvec_bases); |
898 | 898 | ||
899 | hrtimer_run_queues(); | 899 | hrtimer_run_pending(); |
900 | 900 | ||
901 | if (time_after_eq(jiffies, base->timer_jiffies)) | 901 | if (time_after_eq(jiffies, base->timer_jiffies)) |
902 | __run_timers(base); | 902 | __run_timers(base); |
@@ -907,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
907 | */ | 907 | */ |
908 | void run_local_timers(void) | 908 | void run_local_timers(void) |
909 | { | 909 | { |
910 | hrtimer_run_queues(); | ||
910 | raise_softirq(TIMER_SOFTIRQ); | 911 | raise_softirq(TIMER_SOFTIRQ); |
911 | softlockup_tick(); | 912 | softlockup_tick(); |
912 | } | 913 | } |
diff --git a/kernel/user.c b/kernel/user.c index ab4fd706993b..bc1c48d35cb3 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -319,7 +319,7 @@ void free_uid(struct user_struct *up) | |||
319 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 319 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
320 | { | 320 | { |
321 | struct hlist_head *hashent = uidhashentry(ns, uid); | 321 | struct hlist_head *hashent = uidhashentry(ns, uid); |
322 | struct user_struct *up; | 322 | struct user_struct *up, *new; |
323 | 323 | ||
324 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() | 324 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() |
325 | * atomic. | 325 | * atomic. |
@@ -331,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
331 | spin_unlock_irq(&uidhash_lock); | 331 | spin_unlock_irq(&uidhash_lock); |
332 | 332 | ||
333 | if (!up) { | 333 | if (!up) { |
334 | struct user_struct *new; | ||
335 | |||
336 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); | 334 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); |
337 | if (!new) { | 335 | if (!new) |
338 | uids_mutex_unlock(); | 336 | goto out_unlock; |
339 | return NULL; | ||
340 | } | ||
341 | 337 | ||
342 | new->uid = uid; | 338 | new->uid = uid; |
343 | atomic_set(&new->__count, 1); | 339 | atomic_set(&new->__count, 1); |
@@ -353,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
353 | #endif | 349 | #endif |
354 | new->locked_shm = 0; | 350 | new->locked_shm = 0; |
355 | 351 | ||
356 | if (alloc_uid_keyring(new, current) < 0) { | 352 | if (alloc_uid_keyring(new, current) < 0) |
357 | kmem_cache_free(uid_cachep, new); | 353 | goto out_free_user; |
358 | uids_mutex_unlock(); | ||
359 | return NULL; | ||
360 | } | ||
361 | 354 | ||
362 | if (sched_create_user(new) < 0) { | 355 | if (sched_create_user(new) < 0) |
363 | key_put(new->uid_keyring); | 356 | goto out_put_keys; |
364 | key_put(new->session_keyring); | ||
365 | kmem_cache_free(uid_cachep, new); | ||
366 | uids_mutex_unlock(); | ||
367 | return NULL; | ||
368 | } | ||
369 | 357 | ||
370 | if (uids_user_create(new)) { | 358 | if (uids_user_create(new)) |
371 | sched_destroy_user(new); | 359 | goto out_destoy_sched; |
372 | key_put(new->uid_keyring); | ||
373 | key_put(new->session_keyring); | ||
374 | kmem_cache_free(uid_cachep, new); | ||
375 | uids_mutex_unlock(); | ||
376 | return NULL; | ||
377 | } | ||
378 | 360 | ||
379 | /* | 361 | /* |
380 | * Before adding this, check whether we raced | 362 | * Before adding this, check whether we raced |
@@ -402,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
402 | uids_mutex_unlock(); | 384 | uids_mutex_unlock(); |
403 | 385 | ||
404 | return up; | 386 | return up; |
387 | |||
388 | out_destoy_sched: | ||
389 | sched_destroy_user(new); | ||
390 | out_put_keys: | ||
391 | key_put(new->uid_keyring); | ||
392 | key_put(new->session_keyring); | ||
393 | out_free_user: | ||
394 | kmem_cache_free(uid_cachep, new); | ||
395 | out_unlock: | ||
396 | uids_mutex_unlock(); | ||
397 | return NULL; | ||
405 | } | 398 | } |
406 | 399 | ||
407 | void switch_uid(struct user_struct *new_user) | 400 | void switch_uid(struct user_struct *new_user) |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8db0b597509e..52db48e7f6e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -67,9 +67,8 @@ struct workqueue_struct { | |||
67 | #endif | 67 | #endif |
68 | }; | 68 | }; |
69 | 69 | ||
70 | /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove | 70 | /* Serializes the accesses to the list of workqueues. */ |
71 | threads to each one as cpus come/go. */ | 71 | static DEFINE_SPINLOCK(workqueue_lock); |
72 | static DEFINE_MUTEX(workqueue_mutex); | ||
73 | static LIST_HEAD(workqueues); | 72 | static LIST_HEAD(workqueues); |
74 | 73 | ||
75 | static int singlethread_cpu __read_mostly; | 74 | static int singlethread_cpu __read_mostly; |
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
592 | * Returns zero on success. | 591 | * Returns zero on success. |
593 | * Returns -ve errno on failure. | 592 | * Returns -ve errno on failure. |
594 | * | 593 | * |
595 | * Appears to be racy against CPU hotplug. | ||
596 | * | ||
597 | * schedule_on_each_cpu() is very slow. | 594 | * schedule_on_each_cpu() is very slow. |
598 | */ | 595 | */ |
599 | int schedule_on_each_cpu(work_func_t func) | 596 | int schedule_on_each_cpu(work_func_t func) |
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func) | |||
605 | if (!works) | 602 | if (!works) |
606 | return -ENOMEM; | 603 | return -ENOMEM; |
607 | 604 | ||
608 | preempt_disable(); /* CPU hotplug */ | 605 | get_online_cpus(); |
609 | for_each_online_cpu(cpu) { | 606 | for_each_online_cpu(cpu) { |
610 | struct work_struct *work = per_cpu_ptr(works, cpu); | 607 | struct work_struct *work = per_cpu_ptr(works, cpu); |
611 | 608 | ||
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func) | |||
613 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); | 610 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); |
614 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); | 611 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); |
615 | } | 612 | } |
616 | preempt_enable(); | ||
617 | flush_workqueue(keventd_wq); | 613 | flush_workqueue(keventd_wq); |
614 | put_online_cpus(); | ||
618 | free_percpu(works); | 615 | free_percpu(works); |
619 | return 0; | 616 | return 0; |
620 | } | 617 | } |
@@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
750 | err = create_workqueue_thread(cwq, singlethread_cpu); | 747 | err = create_workqueue_thread(cwq, singlethread_cpu); |
751 | start_workqueue_thread(cwq, -1); | 748 | start_workqueue_thread(cwq, -1); |
752 | } else { | 749 | } else { |
753 | mutex_lock(&workqueue_mutex); | 750 | get_online_cpus(); |
751 | spin_lock(&workqueue_lock); | ||
754 | list_add(&wq->list, &workqueues); | 752 | list_add(&wq->list, &workqueues); |
753 | spin_unlock(&workqueue_lock); | ||
755 | 754 | ||
756 | for_each_possible_cpu(cpu) { | 755 | for_each_possible_cpu(cpu) { |
757 | cwq = init_cpu_workqueue(wq, cpu); | 756 | cwq = init_cpu_workqueue(wq, cpu); |
@@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
760 | err = create_workqueue_thread(cwq, cpu); | 759 | err = create_workqueue_thread(cwq, cpu); |
761 | start_workqueue_thread(cwq, cpu); | 760 | start_workqueue_thread(cwq, cpu); |
762 | } | 761 | } |
763 | mutex_unlock(&workqueue_mutex); | 762 | put_online_cpus(); |
764 | } | 763 | } |
765 | 764 | ||
766 | if (err) { | 765 | if (err) { |
@@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
775 | { | 774 | { |
776 | /* | 775 | /* |
777 | * Our caller is either destroy_workqueue() or CPU_DEAD, | 776 | * Our caller is either destroy_workqueue() or CPU_DEAD, |
778 | * workqueue_mutex protects cwq->thread | 777 | * get_online_cpus() protects cwq->thread. |
779 | */ | 778 | */ |
780 | if (cwq->thread == NULL) | 779 | if (cwq->thread == NULL) |
781 | return; | 780 | return; |
@@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
810 | struct cpu_workqueue_struct *cwq; | 809 | struct cpu_workqueue_struct *cwq; |
811 | int cpu; | 810 | int cpu; |
812 | 811 | ||
813 | mutex_lock(&workqueue_mutex); | 812 | get_online_cpus(); |
813 | spin_lock(&workqueue_lock); | ||
814 | list_del(&wq->list); | 814 | list_del(&wq->list); |
815 | mutex_unlock(&workqueue_mutex); | 815 | spin_unlock(&workqueue_lock); |
816 | put_online_cpus(); | ||
816 | 817 | ||
817 | for_each_cpu_mask(cpu, *cpu_map) { | 818 | for_each_cpu_mask(cpu, *cpu_map) { |
818 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 819 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
@@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
835 | action &= ~CPU_TASKS_FROZEN; | 836 | action &= ~CPU_TASKS_FROZEN; |
836 | 837 | ||
837 | switch (action) { | 838 | switch (action) { |
838 | case CPU_LOCK_ACQUIRE: | ||
839 | mutex_lock(&workqueue_mutex); | ||
840 | return NOTIFY_OK; | ||
841 | |||
842 | case CPU_LOCK_RELEASE: | ||
843 | mutex_unlock(&workqueue_mutex); | ||
844 | return NOTIFY_OK; | ||
845 | 839 | ||
846 | case CPU_UP_PREPARE: | 840 | case CPU_UP_PREPARE: |
847 | cpu_set(cpu, cpu_populated_map); | 841 | cpu_set(cpu, cpu_populated_map); |
@@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
854 | case CPU_UP_PREPARE: | 848 | case CPU_UP_PREPARE: |
855 | if (!create_workqueue_thread(cwq, cpu)) | 849 | if (!create_workqueue_thread(cwq, cpu)) |
856 | break; | 850 | break; |
857 | printk(KERN_ERR "workqueue for %i failed\n", cpu); | 851 | printk(KERN_ERR "workqueue [%s] for %i failed\n", |
852 | wq->name, cpu); | ||
858 | return NOTIFY_BAD; | 853 | return NOTIFY_BAD; |
859 | 854 | ||
860 | case CPU_ONLINE: | 855 | case CPU_ONLINE: |