diff options
author | Mike Mason <mmlnx@us.ibm.com> | 2008-07-21 12:40:17 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2008-07-21 20:39:37 -0400 |
commit | f36c5227cd88b6340c40d62b05859e8213740a97 (patch) | |
tree | 4a858eb42fea4a1984bd37e0854c4d96c8c800a2 /arch | |
parent | 57a20d8fb0d2a05abe40abd6bb29e3f923721f1b (diff) |
powerpc/eeh: Don't panic when EEH_MAX_FAILS is exceeded
This patch changes the EEH_MAX_FAILS action from panic to printing an
error message. Panicking under this condition is too harsh.
Although performance will be affected and the device may not recover,
the system is still running, which at the very least should allow for a
more graceful shutdown. The patch also removes the msleep() within a
spinlock, which can lead to a deadlock and is not recommended.
Signed-off-by: Mike Mason <mmlnx@us.ibm.com>
Acked-by: Linas Vepstas <linasvepstas@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh.c | 26 |
1 files changed, 12 insertions, 14 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index c027f0a70a04..54816d75b578 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c | |||
@@ -75,9 +75,9 @@ | |||
75 | */ | 75 | */ |
76 | 76 | ||
77 | /* If a device driver keeps reading an MMIO register in an interrupt | 77 | /* If a device driver keeps reading an MMIO register in an interrupt |
78 | * handler after a slot isolation event has occurred, we assume it | 78 | * handler after a slot isolation event, it might be broken. |
79 | * is broken and panic. This sets the threshold for how many read | 79 | * This sets the threshold for how many read attempts we allow |
80 | * attempts we allow before panicking. | 80 | * before printing an error message. |
81 | */ | 81 | */ |
82 | #define EEH_MAX_FAILS 2100000 | 82 | #define EEH_MAX_FAILS 2100000 |
83 | 83 | ||
@@ -470,6 +470,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
470 | unsigned long flags; | 470 | unsigned long flags; |
471 | struct pci_dn *pdn; | 471 | struct pci_dn *pdn; |
472 | int rc = 0; | 472 | int rc = 0; |
473 | const char *location; | ||
473 | 474 | ||
474 | total_mmio_ffs++; | 475 | total_mmio_ffs++; |
475 | 476 | ||
@@ -509,18 +510,15 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
509 | rc = 1; | 510 | rc = 1; |
510 | if (pdn->eeh_mode & EEH_MODE_ISOLATED) { | 511 | if (pdn->eeh_mode & EEH_MODE_ISOLATED) { |
511 | pdn->eeh_check_count ++; | 512 | pdn->eeh_check_count ++; |
512 | if (pdn->eeh_check_count >= EEH_MAX_FAILS) { | 513 | if (pdn->eeh_check_count % EEH_MAX_FAILS == 0) { |
513 | printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n", | 514 | location = of_get_property(dn, "ibm,loc-code", NULL); |
514 | pdn->eeh_check_count); | 515 | printk (KERN_ERR "EEH: %d reads ignored for recovering device at " |
516 | "location=%s driver=%s pci addr=%s\n", | ||
517 | pdn->eeh_check_count, location, | ||
518 | dev->driver->name, pci_name(dev)); | ||
519 | printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n", | ||
520 | dev->driver->name); | ||
515 | dump_stack(); | 521 | dump_stack(); |
516 | msleep(5000); | ||
517 | |||
518 | /* re-read the slot reset state */ | ||
519 | if (read_slot_reset_state(pdn, rets) != 0) | ||
520 | rets[0] = -1; /* reset state unknown */ | ||
521 | |||
522 | /* If we are here, then we hit an infinite loop. Stop. */ | ||
523 | panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev)); | ||
524 | } | 522 | } |
525 | goto dn_unlock; | 523 | goto dn_unlock; |
526 | } | 524 | } |