diff options
author | Ingo Molnar <mingo@elte.hu> | 2006-12-22 04:11:56 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.osdl.org> | 2006-12-22 11:55:51 -0500 |
commit | 0888f06ac99f993df2bb4c479f5b9306dafe154f (patch) | |
tree | 8ad58be06a5e0613b781a0d5402cc356e9ef767d | |
parent | 6f5a9da1af5a8c286575c30c2706dc1fbef9164b (diff) |
[PATCH] sched: fix bad missed wakeups in the i386, x86_64, ia64, ACPI and APM idle code
Fernando Lopez-Lezcano reported frequent scheduling latencies and audio
xruns starting at the 2.6.18-rt kernel, and those problems persisted all
the way until current -rt kernels. The latencies were serious and unjustified by
system load, often in the milliseconds range.
After a patient and heroic multi-month effort of Fernando, where he
tested dozens of kernels, tried various configs, boot options,
test-patches of mine and provided latency traces of those incidents, the
following 'smoking gun' trace was captured by him:
                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup (try_to_wake_up)
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup <<...>-5856> (37 0)
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup (c01262ba 0 0)
IRQ_19-1479 1D..1 0us : resched_task (try_to_wake_up)
IRQ_19-1479 1D..1 0us : __spin_unlock_irqrestore (try_to_wake_up)
...
<idle>-0 1...1 11us!: default_idle (cpu_idle)
...
<idle>-0 0Dn.1 602us : smp_apic_timer_interrupt (c0103baf 1 0)
...
<...>-5856 0D..2 618us : __switch_to (__schedule)
<...>-5856 0D..2 618us : __schedule <<idle>-0> (20 162)
<...>-5856 0D..2 619us : __spin_unlock_irq (__schedule)
<...>-5856 0...1 619us : trace_stop_sched_switched (__schedule)
<...>-5856 0D..1 619us : trace_stop_sched_switched <<...>-5856> (37 0)
what is visible in this trace is that CPU#1 ran try_to_wake_up() for
PID:5856, it placed PID:5856 on CPU#0's runqueue and ran resched_task()
for CPU#0. But it decided not to send an IPI to that CPU - due to
TS_POLLING. But CPU#0 never woke up after its NEED_RESCHED bit was set,
and only rescheduled to PID:5856 upon the next lapic timer IRQ. The
result was a 600+ usecs latency and a missed wakeup!
the bug turned out to be an idle-wakeup bug introduced into the mainline
kernel this summer via an optimization in the x86_64 tree:
commit 495ab9c045e1b0e5c82951b762257fe1c9d81564
Author: Andi Kleen <ak@suse.de>
Date: Mon Jun 26 13:59:11 2006 +0200
[PATCH] i386/x86-64/ia64: Move polling flag into thread_info_status
During some profiling I noticed that default_idle causes a lot of
memory traffic. I think that is caused by the atomic operations
to clear/set the polling flag in thread_info. There is actually
no reason to make this atomic - only the idle thread does it
to itself, other CPUs only read it. So I moved it into ti->status.
the problem is this type of change:
if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
while (!need_resched()) {
local_irq_disable();
this changes clear_thread_flag() to an explicit clearing of TS_POLLING.
clear_thread_flag() is defined as:
clear_bit(flag, &ti->flags);
and clear_bit() is a LOCK-ed atomic instruction on all x86 platforms:
static inline void clear_bit(int nr, volatile unsigned long * addr)
{
__asm__ __volatile__( LOCK_PREFIX
"btrl %1,%0"
hence smp_mb__after_clear_bit() is defined as a simple compiler barrier:
#define smp_mb__after_clear_bit() barrier()
but the explicit TS_POLLING clearing introduced by the patch:
+ current_thread_info()->status &= ~TS_POLLING;
is not an atomic op! So the clearing of the TS_POLLING bit is freely
reorderable with the reading of the NEED_RESCHED bit - and both now
reside in different memory addresses.
CPU idle wakeup very much depends on ordered memory ops, the clearing of
the TS_POLLING flag must always be done before we test need_resched()
and hit the idle instruction(s). [Symmetrically, the wakeup code needs
to set NEED_RESCHED before it tests the TS_POLLING flag, so memory
ordering is paramount.]
Fernando's dual-core Athlon64 system has a sufficiently advanced memory
ordering model so that it triggered this scenario very often.
( And it also turned out that the reason why these latencies never
triggered on my testsystems is that i routinely use idle=poll, which
was the only idle variant not affected by this bug. )
The fix is to change the smp_mb__after_clear_bit() to an smp_mb(), to
act as an absolute barrier between the TS_POLLING write and the
NEED_RESCHED read. This affects almost all idling methods (default,
ACPI, APM), on all 3 x86 architectures: i386, x86_64, ia64.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Fernando Lopez-Lezcano <nando@ccrma.Stanford.EDU>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | arch/i386/kernel/apm.c | 6 | ||||
-rw-r--r-- | arch/i386/kernel/process.c | 7 | ||||
-rw-r--r-- | arch/ia64/kernel/process.c | 10 | ||||
-rw-r--r-- | arch/x86_64/kernel/process.c | 6 | ||||
-rw-r--r-- | drivers/acpi/processor_idle.c | 12 |
5 files changed, 34 insertions, 7 deletions
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index b75cff25de4b..199016927541 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c | |||
@@ -785,7 +785,11 @@ static int apm_do_idle(void) | |||
785 | polling = !!(current_thread_info()->status & TS_POLLING); | 785 | polling = !!(current_thread_info()->status & TS_POLLING); |
786 | if (polling) { | 786 | if (polling) { |
787 | current_thread_info()->status &= ~TS_POLLING; | 787 | current_thread_info()->status &= ~TS_POLLING; |
788 | smp_mb__after_clear_bit(); | 788 | /* |
789 | * TS_POLLING-cleared state must be visible before we | ||
790 | * test NEED_RESCHED: | ||
791 | */ | ||
792 | smp_mb(); | ||
789 | } | 793 | } |
790 | if (!need_resched()) { | 794 | if (!need_resched()) { |
791 | idled = 1; | 795 | idled = 1; |
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 99308510a17c..c641056233a6 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c | |||
@@ -102,7 +102,12 @@ void default_idle(void) | |||
102 | { | 102 | { |
103 | if (!hlt_counter && boot_cpu_data.hlt_works_ok) { | 103 | if (!hlt_counter && boot_cpu_data.hlt_works_ok) { |
104 | current_thread_info()->status &= ~TS_POLLING; | 104 | current_thread_info()->status &= ~TS_POLLING; |
105 | smp_mb__after_clear_bit(); | 105 | /* |
106 | * TS_POLLING-cleared state must be visible before we | ||
107 | * test NEED_RESCHED: | ||
108 | */ | ||
109 | smp_mb(); | ||
110 | |||
106 | local_irq_disable(); | 111 | local_irq_disable(); |
107 | if (!need_resched()) | 112 | if (!need_resched()) |
108 | safe_halt(); /* enables interrupts racelessly */ | 113 | safe_halt(); /* enables interrupts racelessly */ |
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 51922b98086a..17685abaf496 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c | |||
@@ -268,10 +268,16 @@ cpu_idle (void) | |||
268 | 268 | ||
269 | /* endless idle loop with no priority at all */ | 269 | /* endless idle loop with no priority at all */ |
270 | while (1) { | 270 | while (1) { |
271 | if (can_do_pal_halt) | 271 | if (can_do_pal_halt) { |
272 | current_thread_info()->status &= ~TS_POLLING; | 272 | current_thread_info()->status &= ~TS_POLLING; |
273 | else | 273 | /* |
274 | * TS_POLLING-cleared state must be visible before we | ||
275 | * test NEED_RESCHED: | ||
276 | */ | ||
277 | smp_mb(); | ||
278 | } else { | ||
274 | current_thread_info()->status |= TS_POLLING; | 279 | current_thread_info()->status |= TS_POLLING; |
280 | } | ||
275 | 281 | ||
276 | if (!need_resched()) { | 282 | if (!need_resched()) { |
277 | void (*idle)(void); | 283 | void (*idle)(void); |
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index a418ee4c8c62..cbbc6adc1a92 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c | |||
@@ -109,7 +109,11 @@ void exit_idle(void) | |||
109 | static void default_idle(void) | 109 | static void default_idle(void) |
110 | { | 110 | { |
111 | current_thread_info()->status &= ~TS_POLLING; | 111 | current_thread_info()->status &= ~TS_POLLING; |
112 | smp_mb__after_clear_bit(); | 112 | /* |
113 | * TS_POLLING-cleared state must be visible before we | ||
114 | * test NEED_RESCHED: | ||
115 | */ | ||
116 | smp_mb(); | ||
113 | local_irq_disable(); | 117 | local_irq_disable(); |
114 | if (!need_resched()) { | 118 | if (!need_resched()) { |
115 | /* Enables interrupts one instruction before HLT. | 119 | /* Enables interrupts one instruction before HLT. |
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 65b3f056ad89..6dac6050bb5a 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c | |||
@@ -211,7 +211,11 @@ acpi_processor_power_activate(struct acpi_processor *pr, | |||
211 | static void acpi_safe_halt(void) | 211 | static void acpi_safe_halt(void) |
212 | { | 212 | { |
213 | current_thread_info()->status &= ~TS_POLLING; | 213 | current_thread_info()->status &= ~TS_POLLING; |
214 | smp_mb__after_clear_bit(); | 214 | /* |
215 | * TS_POLLING-cleared state must be visible before we | ||
216 | * test NEED_RESCHED: | ||
217 | */ | ||
218 | smp_mb(); | ||
215 | if (!need_resched()) | 219 | if (!need_resched()) |
216 | safe_halt(); | 220 | safe_halt(); |
217 | current_thread_info()->status |= TS_POLLING; | 221 | current_thread_info()->status |= TS_POLLING; |
@@ -345,7 +349,11 @@ static void acpi_processor_idle(void) | |||
345 | */ | 349 | */ |
346 | if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) { | 350 | if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) { |
347 | current_thread_info()->status &= ~TS_POLLING; | 351 | current_thread_info()->status &= ~TS_POLLING; |
348 | smp_mb__after_clear_bit(); | 352 | /* |
353 | * TS_POLLING-cleared state must be visible before we | ||
354 | * test NEED_RESCHED: | ||
355 | */ | ||
356 | smp_mb(); | ||
349 | if (need_resched()) { | 357 | if (need_resched()) { |
350 | current_thread_info()->status |= TS_POLLING; | 358 | current_thread_info()->status |= TS_POLLING; |
351 | local_irq_enable(); | 359 | local_irq_enable(); |