[PATCH] powerpc/pseries: clear PCI failure counter if no new failures

The current PCI error recovery system keeps track of the number of PCI card resets, and refuses to bring a card back up if this number is too large. The goal of doing this was to avoid an infinite loop of resets if a card is obviously dead. However, if the failures are rare, but the machine has a high uptime, this mechanism might still be triggered; this is too harsh. This patch avoids this problem by decrementing the fail count after an hour. Thus, as long as a PCI card BSODs fewer than 6 times an hour, it will continue to be reset indefinitely. If its failure rate is greater than that, it will be taken off-line permanently. This patch is larger than it might otherwise be because it changes indentation by removing a pointless while-loop. The while loop is not needed, as the handler is invoked once for each event (by schedule_work()); the loop is leftover cruft from an earlier implementation. Signed-off-by: Linas Vepstas <linas@austin.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Linas Vepstas <linas@austin.ibm.com> 2006-04-19 00:05:21 -0400
committer: Paul Mackerras <paulus@samba.org> 2006-04-22 04:46:13 -0400
commit: ac325acd50013fa8f4953208cbb96504dec9b12a (patch)
tree: 6c08470d68be38504c6aadae168b873efb39e8db /arch/powerpc/platforms/pseries
parent: 4bd174fe1cca738f53cf8bb9ac3cb327b1f516ed (diff)
2 files changed, 35 insertions, 28 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 1fba695e32e8..2a9eb2630730 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -23,9 +23,8 @@
 *
 */
 #include <linux/delay.h>
-#include <linux/irq.h>
 #include <linux/interrupt.h>
-#include <linux/notifier.h>
+#include <linux/irq.h>
 #include <linux/pci.h>
 #include <asm/eeh.h>
 #include <asm/eeh_event.h>
@@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
 */
 #define MAX_WAIT_FOR_RECOVERY 15
-void handle_eeh_events (struct eeh_event *event)
+struct pci_dn * handle_eeh_events (struct eeh_event *event)
 {
        struct device_node *frozen_dn;
        struct pci_dn *frozen_pdn;
@@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event *event)
        if (!frozen_dn) {
                printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n",
                        pci_name(event->dev));
-                return;
+                return NULL;
        }
        /* There are two different styles for coming up with the PE.
@@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event *event)
        if (!frozen_bus) {
                printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n",
                        frozen_dn->full_name);
-                return;
+                return NULL;
        }
 #if 0
@@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event *event)
        /* Tell all device drivers that they can resume operations */
        pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
-        return;
+        return frozen_pdn;
        
 excess_failures:
        /*
@@ -384,6 +383,8 @@ perm_error:
        /* Shut down the device drivers for good. */
        pcibios_remove_pci_devices(frozen_bus);
+        return NULL;
 }
 /* ---------- end of file ---------- */
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index a1bda6f96fd1..a0b39640a00f 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -18,6 +18,7 @@
 * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
 */
+#include <linux/delay.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/pci.h>
@@ -56,38 +57,43 @@ static int eeh_event_handler(void * dummy)
 {
        unsigned long flags;
        struct eeh_event        *event;
+        struct pci_dn *pdn;
        daemonize ("eehd");
+        set_current_state(TASK_INTERRUPTIBLE);
-        while (1) {
+        spin_lock_irqsave(&eeh_eventlist_lock, flags);
-                set_current_state(TASK_INTERRUPTIBLE);
+        event = NULL;
-                spin_lock_irqsave(&eeh_eventlist_lock, flags);
+        /* Unqueue the event, get ready to process. */
-                event = NULL;
+        if (!list_empty(&eeh_eventlist)) {
+                event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+                list_del(&event->list);
+        }
+        spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-                /* Unqueue the event, get ready to process. */
+        if (event == NULL)
-                if (!list_empty(&eeh_eventlist)) {
+                return 0;
-                        event = list_entry(eeh_eventlist.next, struct eeh_event, list);
-                        list_del(&event->list);
-                }
-                spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-                if (event == NULL)
+        /* Serialize processing of EEH events */
-                        break;
+        mutex_lock(&eeh_event_mutex);
+        eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
-                /* Serialize processing of EEH events */
+        printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
-                mutex_lock(&eeh_event_mutex);
+               pci_name(event->dev));
-                eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
-                printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
+        pdn = handle_eeh_events(event);
-                       pci_name(event->dev));
-                handle_eeh_events(event);
+        eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
+        pci_dev_put(event->dev);
+        kfree(event);
+        mutex_unlock(&eeh_event_mutex);
-                eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
+        /* If there are no new errors after an hour, clear the counter. */
-                pci_dev_put(event->dev);
+        if (pdn && pdn->eeh_freeze_count>0) {
-                kfree(event);
+                msleep_interruptible (3600*1000);
-                mutex_unlock(&eeh_event_mutex);
+                if (pdn->eeh_freeze_count>0)
+                        pdn->eeh_freeze_count--;
        }
        return 0;
author	Linas Vepstas <linas@austin.ibm.com>	2006-04-19 00:05:21 -0400
committer	Paul Mackerras <paulus@samba.org>	2006-04-22 04:46:13 -0400
commit	ac325acd50013fa8f4953208cbb96504dec9b12a (patch)
tree	6c08470d68be38504c6aadae168b873efb39e8db /arch/powerpc/platforms/pseries
parent	4bd174fe1cca738f53cf8bb9ac3cb327b1f516ed (diff)