aboutsummaryrefslogtreecommitdiffstats
path: root/arch/ppc64
diff options
context:
space:
mode:
authorLinas Vepstas <linas@linas.org>2005-11-03 19:49:31 -0500
committerPaul Mackerras <paulus@samba.org>2005-11-09 19:33:33 -0500
commit5c1344e988c716470b5609708215dd7f135f0e3a (patch)
treeb7f736919ffa6b6e07e5b6bc950a4e36cd0d5547 /arch/ppc64
parentfd761fd876e4d1c0d07b6d93bc45c999fa596cb0 (diff)
[PATCH] ppc64: escape hatch for spinning interrupt deadlocks
08-eeh-spin-counter.patch Once an EEH event is triggered, all further I/O to a device is blocked (until reset). Bad device drivers may end up spinning in their interrupt handlers, trying to read an interrupt status register that will never change state. This patch moves that spin counter to a per-device structure, and adds some diagnostic prints to help locate the bad driver. Signed-off-by: Linas Vepstas <linas@linas.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/ppc64')
-rw-r--r--arch/ppc64/kernel/eeh.c21
1 files changed, 13 insertions, 8 deletions
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
index e7522f6da69d..0c52c2de92e0 100644
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -78,14 +78,12 @@ DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL);
78 78
79static struct notifier_block *eeh_notifier_chain; 79static struct notifier_block *eeh_notifier_chain;
80 80
81/* 81/* If a device driver keeps reading an MMIO register in an interrupt
82 * If a device driver keeps reading an MMIO register in an interrupt
83 * handler after a slot isolation event has occurred, we assume it 82 * handler after a slot isolation event has occurred, we assume it
84 * is broken and panic. This sets the threshold for how many read 83 * is broken and panic. This sets the threshold for how many read
85 * attempts we allow before panicking. 84 * attempts we allow before panicking.
86 */ 85 */
87#define EEH_MAX_FAILS 1000 86#define EEH_MAX_FAILS 100000
88static atomic_t eeh_fail_count;
89 87
90/* RTAS tokens */ 88/* RTAS tokens */
91static int ibm_set_eeh_option; 89static int ibm_set_eeh_option;
@@ -521,7 +519,6 @@ static void eeh_event_handler(void *dummy)
521 "%s\n", event->reset_state, 519 "%s\n", event->reset_state,
522 pci_name(event->dev)); 520 pci_name(event->dev));
523 521
524 atomic_set(&eeh_fail_count, 0);
525 notifier_call_chain (&eeh_notifier_chain, 522 notifier_call_chain (&eeh_notifier_chain,
526 EEH_NOTIFY_FREEZE, event); 523 EEH_NOTIFY_FREEZE, event);
527 524
@@ -657,12 +654,18 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
657 spin_lock_irqsave(&confirm_error_lock, flags); 654 spin_lock_irqsave(&confirm_error_lock, flags);
658 rc = 1; 655 rc = 1;
659 if (pdn->eeh_mode & EEH_MODE_ISOLATED) { 656 if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
660 atomic_inc(&eeh_fail_count); 657 pdn->eeh_check_count ++;
661 if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) { 658 if (pdn->eeh_check_count >= EEH_MAX_FAILS) {
659 printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n",
660 pdn->eeh_check_count);
661 dump_stack();
662
662 /* re-read the slot reset state */ 663 /* re-read the slot reset state */
663 if (read_slot_reset_state(pdn, rets) != 0) 664 if (read_slot_reset_state(pdn, rets) != 0)
664 rets[0] = -1; /* reset state unknown */ 665 rets[0] = -1; /* reset state unknown */
665 eeh_panic(dev, rets[0]); 666
667 /* If we are here, then we hit an infinite loop. Stop. */
668 panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev));
666 } 669 }
667 goto dn_unlock; 670 goto dn_unlock;
668 } 671 }
@@ -808,6 +811,8 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
808 struct pci_dn *pdn = PCI_DN(dn); 811 struct pci_dn *pdn = PCI_DN(dn);
809 812
810 pdn->eeh_mode = 0; 813 pdn->eeh_mode = 0;
814 pdn->eeh_check_count = 0;
815 pdn->eeh_freeze_count = 0;
811 816
812 if (status && strcmp(status, "ok") != 0) 817 if (status && strcmp(status, "ok") != 0)
813 return NULL; /* ignore devices with bad status */ 818 return NULL; /* ignore devices with bad status */