aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
authorEric W. Biederman <ebiederm@xmission.com>2007-07-21 11:10:45 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-21 21:37:10 -0400
commitef3e28c5b956cbb3b17531c85b698a27e83d5cf2 (patch)
tree5ffb0424d5388120e486c036f029e032b63c09f9 /arch/x86_64
parent22293e5806f58a9682267139678a5cc117fd3dcf (diff)
x86_64: check remote IRR bit before migrating level triggered irq
On x86_64 kernel, level triggered irq migration gets initiated in the context of that interrupt(after executing the irq handler) and following steps are followed to do the irq migration. 1. mask IOAPIC RTE entry; // write to IOAPIC RTE 2. EOI; // processor EOI write 3. reprogram IOAPIC RTE entry // write to IOAPIC RTE with new destination and // and interrupt vector due to per cpu vector // allocation. 4. unmask IOAPIC RTE entry; // write to IOAPIC RTE Because of the per cpu vector allocation in x86_64 kernels, when the irq migrates to a different cpu, new vector(corresponding to the new cpu) will get allocated. An EOI write to local APIC has a side effect of generating an EOI write for level trigger interrupts (normally this is a broadcast to all IOAPICs). The EOI broadcast generated as a side effect of EOI write to processor may be delayed while the other IOAPIC writes (step 3 and 4) can go through. Normally, the EOI generated by local APIC for level trigger interrupt contains vector number. The IOAPIC will take this vector number and search the IOAPIC RTE entries for an entry with matching vector number and clear the remote IRR bit (indicate EOI). However, if the vector number is changed (as in step 3) the IOAPIC will not find the RTE entry when the EOI is received later. This will cause the remote IRR to get stuck causing the interrupt hang (no more interrupt from this RTE). Current x86_64 kernel assumes that remote IRR bit is cleared by the time IOAPIC RTE is reprogrammed. Fix this assumption by checking for remote IRR bit and if it still set, delay the irq migration to the next interrupt arrival event(hopefully, next time remote IRR bit will get cleared before the IOAPIC RTE is reprogrammed). Initial analysis and patch from Nanhai. Clean up patch from Suresh. Rewritten to be less intrusive, and to contain a big fat comment by Eric. [akpm@linux-foundation.org: fix comments] Acked-by: Ingo Molnar <mingo@elte.hu> Cc: Nanhai Zou <nanhai.zou@intel.com> Acked-by: Suresh Siddha <suresh.b.siddha@intel.com> Cc: Asit Mallick <asit.k.mallick@intel.com> Cc: Keith Packard <keith.packard@intel.com> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/kernel/io_apic.c58
1 files changed, 56 insertions, 2 deletions
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 1c6c6f724573..050141c0602b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -152,6 +152,32 @@ static inline void io_apic_modify(unsigned int apic, unsigned int value)
152 writel(value, &io_apic->data); 152 writel(value, &io_apic->data);
153} 153}
154 154
155static int io_apic_level_ack_pending(unsigned int irq)
156{
157 struct irq_pin_list *entry;
158 unsigned long flags;
159 int pending = 0;
160
161 spin_lock_irqsave(&ioapic_lock, flags);
162 entry = irq_2_pin + irq;
163 for (;;) {
164 unsigned int reg;
165 int pin;
166
167 pin = entry->pin;
168 if (pin == -1)
169 break;
170 reg = io_apic_read(entry->apic, 0x10 + pin*2);
171 /* Is the remote IRR bit set? */
172 pending |= (reg >> 14) & 1;
173 if (!entry->next)
174 break;
175 entry = irq_2_pin + entry->next;
176 }
177 spin_unlock_irqrestore(&ioapic_lock, flags);
178 return pending;
179}
180
155/* 181/*
156 * Synchronize the IO-APIC and the CPU by doing 182 * Synchronize the IO-APIC and the CPU by doing
157 * a dummy read from the IO-APIC 183 * a dummy read from the IO-APIC
@@ -1418,9 +1444,37 @@ static void ack_apic_level(unsigned int irq)
1418 ack_APIC_irq(); 1444 ack_APIC_irq();
1419 1445
1420 /* Now we can move and renable the irq */ 1446 /* Now we can move and renable the irq */
1421 move_masked_irq(irq); 1447 if (unlikely(do_unmask_irq)) {
1422 if (unlikely(do_unmask_irq)) 1448 /* Only migrate the irq if the ack has been received.
1449 *
1450 * On rare occasions the broadcast level triggered ack gets
1451 * delayed going to ioapics, and if we reprogram the
1452 * vector while Remote IRR is still set the irq will never
1453 * fire again.
1454 *
1455 * To prevent this scenario we read the Remote IRR bit
1456 * of the ioapic. This has two effects.
1457 * - On any sane system the read of the ioapic will
1458 * flush writes (and acks) going to the ioapic from
1459 * this cpu.
1460 * - We get to see if the ACK has actually been delivered.
1461 *
1462 * Based on failed experiments of reprogramming the
1463 * ioapic entry from outside of irq context starting
1464 * with masking the ioapic entry and then polling until
1465 * Remote IRR was clear before reprogramming the
1466 * ioapic I don't trust the Remote IRR bit to be
1467 * completey accurate.
1468 *
1469 * However there appears to be no other way to plug
1470 * this race, so if the Remote IRR bit is not
1471 * accurate and is causing problems then it is a hardware bug
1472 * and you can go talk to the chipset vendor about it.
1473 */
1474 if (!io_apic_level_ack_pending(irq))
1475 move_masked_irq(irq);
1423 unmask_IO_APIC_irq(irq); 1476 unmask_IO_APIC_irq(irq);
1477 }
1424} 1478}
1425 1479
1426static struct irq_chip ioapic_chip __read_mostly = { 1480static struct irq_chip ioapic_chip __read_mostly = {