diff options
author | Linas Vepstas <linas@austin.ibm.com> | 2006-04-19 00:05:21 -0400 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2006-04-22 04:46:13 -0400 |
commit | ac325acd50013fa8f4953208cbb96504dec9b12a (patch) | |
tree | 6c08470d68be38504c6aadae168b873efb39e8db /arch/powerpc/platforms/pseries | |
parent | 4bd174fe1cca738f53cf8bb9ac3cb327b1f516ed (diff) |
[PATCH] powerpc/pseries: clear PCI failure counter if no new failures
The current PCI error recovery system keeps track of the number of PCI card
resets, and refuses to bring a card back up if this number is too large.
The goal of doing this was to avoid an infinite loop of resets if a card is
obviously dead. However, if the failures are rare, but the machine has a
high uptime, this mechanism might still be triggered; this is too harsh.
This patch avoids this problem by decrementing the fail count after an
hour. Thus, as long as a PCI card BSOD's fewer than 6 times an hour, it
will continue to be reset indefinitely. If its failure rate is greater
than that, it will be taken off-line permanently.
This patch is larger than it might otherwise be because it changes
indentation by removing a pointless while-loop. The while loop is not
needed, as the handler is invoked once for each event (by schedule_work());
the loop is leftover cruft from an earlier implementation.
Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_driver.c | 13 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_event.c | 50 |
2 files changed, 35 insertions, 28 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 1fba695e32e8..2a9eb2630730 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c | |||
@@ -23,9 +23,8 @@ | |||
23 | * | 23 | * |
24 | */ | 24 | */ |
25 | #include <linux/delay.h> | 25 | #include <linux/delay.h> |
26 | #include <linux/irq.h> | ||
27 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> |
28 | #include <linux/notifier.h> | 27 | #include <linux/irq.h> |
29 | #include <linux/pci.h> | 28 | #include <linux/pci.h> |
30 | #include <asm/eeh.h> | 29 | #include <asm/eeh.h> |
31 | #include <asm/eeh_event.h> | 30 | #include <asm/eeh_event.h> |
@@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) | |||
250 | */ | 249 | */ |
251 | #define MAX_WAIT_FOR_RECOVERY 15 | 250 | #define MAX_WAIT_FOR_RECOVERY 15 |
252 | 251 | ||
253 | void handle_eeh_events (struct eeh_event *event) | 252 | struct pci_dn * handle_eeh_events (struct eeh_event *event) |
254 | { | 253 | { |
255 | struct device_node *frozen_dn; | 254 | struct device_node *frozen_dn; |
256 | struct pci_dn *frozen_pdn; | 255 | struct pci_dn *frozen_pdn; |
@@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
265 | if (!frozen_dn) { | 264 | if (!frozen_dn) { |
266 | printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n", | 265 | printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n", |
267 | pci_name(event->dev)); | 266 | pci_name(event->dev)); |
268 | return; | 267 | return NULL; |
269 | } | 268 | } |
270 | 269 | ||
271 | /* There are two different styles for coming up with the PE. | 270 | /* There are two different styles for coming up with the PE. |
@@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
280 | if (!frozen_bus) { | 279 | if (!frozen_bus) { |
281 | printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n", | 280 | printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n", |
282 | frozen_dn->full_name); | 281 | frozen_dn->full_name); |
283 | return; | 282 | return NULL; |
284 | } | 283 | } |
285 | 284 | ||
286 | #if 0 | 285 | #if 0 |
@@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
355 | /* Tell all device drivers that they can resume operations */ | 354 | /* Tell all device drivers that they can resume operations */ |
356 | pci_walk_bus(frozen_bus, eeh_report_resume, NULL); | 355 | pci_walk_bus(frozen_bus, eeh_report_resume, NULL); |
357 | 356 | ||
358 | return; | 357 | return frozen_pdn; |
359 | 358 | ||
360 | excess_failures: | 359 | excess_failures: |
361 | /* | 360 | /* |
@@ -384,6 +383,8 @@ perm_error: | |||
384 | 383 | ||
385 | /* Shut down the device drivers for good. */ | 384 | /* Shut down the device drivers for good. */ |
386 | pcibios_remove_pci_devices(frozen_bus); | 385 | pcibios_remove_pci_devices(frozen_bus); |
386 | |||
387 | return NULL; | ||
387 | } | 388 | } |
388 | 389 | ||
389 | /* ---------- end of file ---------- */ | 390 | /* ---------- end of file ---------- */ |
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c index a1bda6f96fd1..a0b39640a00f 100644 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ b/arch/powerpc/platforms/pseries/eeh_event.c | |||
@@ -18,6 +18,7 @@ | |||
18 | * Copyright (c) 2005 Linas Vepstas <linas@linas.org> | 18 | * Copyright (c) 2005 Linas Vepstas <linas@linas.org> |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/delay.h> | ||
21 | #include <linux/list.h> | 22 | #include <linux/list.h> |
22 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
23 | #include <linux/pci.h> | 24 | #include <linux/pci.h> |
@@ -56,38 +57,43 @@ static int eeh_event_handler(void * dummy) | |||
56 | { | 57 | { |
57 | unsigned long flags; | 58 | unsigned long flags; |
58 | struct eeh_event *event; | 59 | struct eeh_event *event; |
60 | struct pci_dn *pdn; | ||
59 | 61 | ||
60 | daemonize ("eehd"); | 62 | daemonize ("eehd"); |
63 | set_current_state(TASK_INTERRUPTIBLE); | ||
61 | 64 | ||
62 | while (1) { | 65 | spin_lock_irqsave(&eeh_eventlist_lock, flags); |
63 | set_current_state(TASK_INTERRUPTIBLE); | 66 | event = NULL; |
64 | 67 | ||
65 | spin_lock_irqsave(&eeh_eventlist_lock, flags); | 68 | /* Unqueue the event, get ready to process. */ |
66 | event = NULL; | 69 | if (!list_empty(&eeh_eventlist)) { |
70 | event = list_entry(eeh_eventlist.next, struct eeh_event, list); | ||
71 | list_del(&event->list); | ||
72 | } | ||
73 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | ||
67 | 74 | ||
68 | /* Unqueue the event, get ready to process. */ | 75 | if (event == NULL) |
69 | if (!list_empty(&eeh_eventlist)) { | 76 | return 0; |
70 | event = list_entry(eeh_eventlist.next, struct eeh_event, list); | ||
71 | list_del(&event->list); | ||
72 | } | ||
73 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | ||
74 | 77 | ||
75 | if (event == NULL) | 78 | /* Serialize processing of EEH events */ |
76 | break; | 79 | mutex_lock(&eeh_event_mutex); |
80 | eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); | ||
77 | 81 | ||
78 | /* Serialize processing of EEH events */ | 82 | printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", |
79 | mutex_lock(&eeh_event_mutex); | 83 | pci_name(event->dev)); |
80 | eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); | ||
81 | 84 | ||
82 | printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", | 85 | pdn = handle_eeh_events(event); |
83 | pci_name(event->dev)); | ||
84 | 86 | ||
85 | handle_eeh_events(event); | 87 | eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); |
88 | pci_dev_put(event->dev); | ||
89 | kfree(event); | ||
90 | mutex_unlock(&eeh_event_mutex); | ||
86 | 91 | ||
87 | eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); | 92 | /* If there are no new errors after an hour, clear the counter. */ |
88 | pci_dev_put(event->dev); | 93 | if (pdn && pdn->eeh_freeze_count>0) { |
89 | kfree(event); | 94 | msleep_interruptible (3600*1000); |
90 | mutex_unlock(&eeh_event_mutex); | 95 | if (pdn->eeh_freeze_count>0) |
96 | pdn->eeh_freeze_count--; | ||
91 | } | 97 | } |
92 | 98 | ||
93 | return 0; | 99 | return 0; |