diff options
author | Linas Vepstas <linas@linas.org> | 2005-11-03 19:52:49 -0500 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2006-01-09 23:28:32 -0500 |
commit | 77bd741561016134d1761d6101c4f0361025062f (patch) | |
tree | 5e3389b6941add4b24a2be64c730b7a9087c1f2f | |
parent | 977127174a7dff52d17faeeb4c4949a54221881f (diff) |
[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines
Various PCI bus errors can be signaled by newer PCI controllers. The
core error recovery routines are architecture dependent. This patch adds
a recovery infrastructure for the PPC64 pSeries systems.
Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
(cherry picked from e8ca11b460c4c9c7fa6b529be221529ebd770e38 commit)
-rw-r--r-- | arch/powerpc/platforms/pseries/Makefile | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh.c | 17 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_driver.c | 366 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_event.c | 39 | ||||
-rw-r--r-- | include/asm-powerpc/eeh.h | 8 | ||||
-rw-r--r-- | include/asm-powerpc/eeh_event.h | 7 | ||||
-rw-r--r-- | include/asm-powerpc/ppc-pci.h | 9 |
7 files changed, 413 insertions, 35 deletions
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 6accdd155505..0b7d5cb64177 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile | |||
@@ -4,7 +4,7 @@ obj-$(CONFIG_SMP) += smp.o | |||
4 | obj-$(CONFIG_IBMVIO) += vio.o | 4 | obj-$(CONFIG_IBMVIO) += vio.o |
5 | obj-$(CONFIG_XICS) += xics.o | 5 | obj-$(CONFIG_XICS) += xics.o |
6 | obj-$(CONFIG_SCANLOG) += scanlog.o | 6 | obj-$(CONFIG_SCANLOG) += scanlog.o |
7 | obj-$(CONFIG_EEH) += eeh.o eeh_event.o | 7 | obj-$(CONFIG_EEH) += eeh.o eeh_driver.o eeh_event.o |
8 | 8 | ||
9 | obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o | 9 | obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o |
10 | obj-$(CONFIG_HVCS) += hvcserver.o | 10 | obj-$(CONFIG_HVCS) += hvcserver.o |
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 7fbfd16d72b7..d6560c45637b 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c | |||
@@ -485,6 +485,11 @@ static void __eeh_mark_slot (struct device_node *dn, int mode_flag) | |||
485 | if (PCI_DN(dn)) { | 485 | if (PCI_DN(dn)) { |
486 | PCI_DN(dn)->eeh_mode |= mode_flag; | 486 | PCI_DN(dn)->eeh_mode |= mode_flag; |
487 | 487 | ||
488 | /* Mark the pci device driver too */ | ||
489 | struct pci_dev *dev = PCI_DN(dn)->pcidev; | ||
490 | if (dev && dev->driver) | ||
491 | dev->error_state = pci_channel_io_frozen; | ||
492 | |||
488 | if (dn->child) | 493 | if (dn->child) |
489 | __eeh_mark_slot (dn->child, mode_flag); | 494 | __eeh_mark_slot (dn->child, mode_flag); |
490 | } | 495 | } |
@@ -544,6 +549,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
544 | int rets[3]; | 549 | int rets[3]; |
545 | unsigned long flags; | 550 | unsigned long flags; |
546 | struct pci_dn *pdn; | 551 | struct pci_dn *pdn; |
552 | enum pci_channel_state state; | ||
547 | int rc = 0; | 553 | int rc = 0; |
548 | 554 | ||
549 | __get_cpu_var(total_mmio_ffs)++; | 555 | __get_cpu_var(total_mmio_ffs)++; |
@@ -648,8 +654,13 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
648 | eeh_mark_slot (dn, EEH_MODE_ISOLATED); | 654 | eeh_mark_slot (dn, EEH_MODE_ISOLATED); |
649 | spin_unlock_irqrestore(&confirm_error_lock, flags); | 655 | spin_unlock_irqrestore(&confirm_error_lock, flags); |
650 | 656 | ||
651 | eeh_send_failure_event (dn, dev, rets[0], rets[2]); | 657 | state = pci_channel_io_normal; |
652 | 658 | if ((rets[0] == 2) || (rets[0] == 4)) | |
659 | state = pci_channel_io_frozen; | ||
660 | if (rets[0] == 5) | ||
661 | state = pci_channel_io_perm_failure; | ||
662 | eeh_send_failure_event (dn, dev, state, rets[2]); | ||
663 | |||
653 | /* Most EEH events are due to device driver bugs. Having | 664 | /* Most EEH events are due to device driver bugs. Having |
654 | * a stack trace will help the device-driver authors figure | 665 | * a stack trace will help the device-driver authors figure |
655 | * out what happened. So print that out. */ | 666 | * out what happened. So print that out. */ |
@@ -953,8 +964,10 @@ static void *early_enable_eeh(struct device_node *dn, void *data) | |||
953 | * But there are a few cases like display devices that make sense. | 964 | * But there are a few cases like display devices that make sense. |
954 | */ | 965 | */ |
955 | enable = 1; /* i.e. we will do checking */ | 966 | enable = 1; /* i.e. we will do checking */ |
967 | #if 0 | ||
956 | if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY) | 968 | if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY) |
957 | enable = 0; | 969 | enable = 0; |
970 | #endif | ||
958 | 971 | ||
959 | if (!enable) | 972 | if (!enable) |
960 | pdn->eeh_mode |= EEH_MODE_NOCHECK; | 973 | pdn->eeh_mode |= EEH_MODE_NOCHECK; |
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c new file mode 100644 index 000000000000..f6cff065d89d --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_driver.c | |||
@@ -0,0 +1,366 @@ | |||
1 | /* | ||
2 | * PCI Error Recovery Driver for RPA-compliant PPC64 platform. | ||
3 | * Copyright (C) 2004, 2005 Linas Vepstas <linas@linas.org> | ||
4 | * | ||
5 | * All rights reserved. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or (at | ||
10 | * your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, but | ||
13 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
15 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
16 | * details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | * | ||
22 | * Send feedback to <linas@us.ibm.com> | ||
23 | * | ||
24 | */ | ||
25 | #include <linux/delay.h> | ||
26 | #include <linux/irq.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/notifier.h> | ||
29 | #include <linux/pci.h> | ||
30 | #include <asm/eeh.h> | ||
31 | #include <asm/eeh_event.h> | ||
32 | #include <asm/ppc-pci.h> | ||
33 | #include <asm/pci-bridge.h> | ||
34 | #include <asm/prom.h> | ||
35 | #include <asm/rtas.h> | ||
36 | |||
37 | |||
38 | static inline const char * pcid_name (struct pci_dev *pdev) | ||
39 | { | ||
40 | if (pdev->dev.driver) | ||
41 | return pdev->dev.driver->name; | ||
42 | return ""; | ||
43 | } | ||
44 | |||
45 | /** | ||
46 | * Return the "partitionable endpoint" (pe) under which this device lies | ||
47 | */ | ||
48 | static struct device_node * find_device_pe(struct device_node *dn) | ||
49 | { | ||
50 | while ((dn->parent) && PCI_DN(dn->parent) && | ||
51 | (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { | ||
52 | dn = dn->parent; | ||
53 | } | ||
54 | return dn; | ||
55 | } | ||
56 | |||
57 | |||
58 | #ifdef DEBUG | ||
59 | static void print_device_node_tree (struct pci_dn *pdn, int dent) | ||
60 | { | ||
61 | int i; | ||
62 | if (!pdn) return; | ||
63 | for (i=0;i<dent; i++) | ||
64 | printk(" "); | ||
65 | printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n", | ||
66 | pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr, | ||
67 | pdn->eeh_pe_config_addr, pdn->node->full_name); | ||
68 | dent += 3; | ||
69 | struct device_node *pc = pdn->node->child; | ||
70 | while (pc) { | ||
71 | print_device_node_tree(PCI_DN(pc), dent); | ||
72 | pc = pc->sibling; | ||
73 | } | ||
74 | } | ||
75 | #endif | ||
76 | |||
77 | /** | ||
78 | * irq_in_use - return true if this irq is being used | ||
79 | */ | ||
80 | static int irq_in_use(unsigned int irq) | ||
81 | { | ||
82 | int rc = 0; | ||
83 | unsigned long flags; | ||
84 | struct irq_desc *desc = irq_desc + irq; | ||
85 | |||
86 | spin_lock_irqsave(&desc->lock, flags); | ||
87 | if (desc->action) | ||
88 | rc = 1; | ||
89 | spin_unlock_irqrestore(&desc->lock, flags); | ||
90 | return rc; | ||
91 | } | ||
92 | |||
93 | /* ------------------------------------------------------- */ | ||
94 | /** eeh_report_error - report an EEH error to each device, | ||
95 | * collect up and merge the device responses. | ||
96 | */ | ||
97 | |||
98 | static void eeh_report_error(struct pci_dev *dev, void *userdata) | ||
99 | { | ||
100 | enum pcierr_result rc, *res = userdata; | ||
101 | struct pci_driver *driver = dev->driver; | ||
102 | |||
103 | dev->error_state = pci_channel_io_frozen; | ||
104 | |||
105 | if (!driver) | ||
106 | return; | ||
107 | |||
108 | if (irq_in_use (dev->irq)) { | ||
109 | struct device_node *dn = pci_device_to_OF_node(dev); | ||
110 | PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; | ||
111 | disable_irq_nosync(dev->irq); | ||
112 | } | ||
113 | if (!driver->err_handler) | ||
114 | return; | ||
115 | if (!driver->err_handler->error_detected) | ||
116 | return; | ||
117 | |||
118 | rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen); | ||
119 | if (*res == PCIERR_RESULT_NONE) *res = rc; | ||
120 | if (*res == PCIERR_RESULT_NEED_RESET) return; | ||
121 | if (*res == PCIERR_RESULT_DISCONNECT && | ||
122 | rc == PCIERR_RESULT_NEED_RESET) *res = rc; | ||
123 | } | ||
124 | |||
125 | /** eeh_report_reset -- tell this device that the pci slot | ||
126 | * has been reset. | ||
127 | */ | ||
128 | |||
129 | static void eeh_report_reset(struct pci_dev *dev, void *userdata) | ||
130 | { | ||
131 | struct pci_driver *driver = dev->driver; | ||
132 | struct device_node *dn = pci_device_to_OF_node(dev); | ||
133 | |||
134 | if (!driver) | ||
135 | return; | ||
136 | |||
137 | if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { | ||
138 | PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; | ||
139 | enable_irq(dev->irq); | ||
140 | } | ||
141 | if (!driver->err_handler) | ||
142 | return; | ||
143 | if (!driver->err_handler->slot_reset) | ||
144 | return; | ||
145 | |||
146 | driver->err_handler->slot_reset(dev); | ||
147 | } | ||
148 | |||
149 | static void eeh_report_resume(struct pci_dev *dev, void *userdata) | ||
150 | { | ||
151 | struct pci_driver *driver = dev->driver; | ||
152 | |||
153 | dev->error_state = pci_channel_io_normal; | ||
154 | |||
155 | if (!driver) | ||
156 | return; | ||
157 | if (!driver->err_handler) | ||
158 | return; | ||
159 | if (!driver->err_handler->resume) | ||
160 | return; | ||
161 | |||
162 | driver->err_handler->resume(dev); | ||
163 | } | ||
164 | |||
165 | static void eeh_report_failure(struct pci_dev *dev, void *userdata) | ||
166 | { | ||
167 | struct pci_driver *driver = dev->driver; | ||
168 | |||
169 | dev->error_state = pci_channel_io_perm_failure; | ||
170 | |||
171 | if (!driver) | ||
172 | return; | ||
173 | |||
174 | if (irq_in_use (dev->irq)) { | ||
175 | struct device_node *dn = pci_device_to_OF_node(dev); | ||
176 | PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; | ||
177 | disable_irq_nosync(dev->irq); | ||
178 | } | ||
179 | if (!driver->err_handler) | ||
180 | return; | ||
181 | if (!driver->err_handler->error_detected) | ||
182 | return; | ||
183 | driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); | ||
184 | } | ||
185 | |||
186 | /* ------------------------------------------------------- */ | ||
187 | /** | ||
188 | * handle_eeh_events -- reset a PCI device after hard lockup. | ||
189 | * | ||
190 | * pSeries systems will isolate a PCI slot if the PCI-Host | ||
191 | * bridge detects address or data parity errors, DMA's | ||
192 | * occurring to wild addresses (which usually happen due to | |
193 | * bugs in device drivers or in PCI adapter firmware). | ||
194 | * Slot isolations also occur if #SERR, #PERR or other misc | ||
195 | * PCI-related errors are detected. | ||
196 | * | ||
197 | * Recovery process consists of unplugging the device driver | ||
198 | * (which generates hotplug events to userspace), then issuing | |
199 | * a PCI #RST to the device, then reconfiguring the PCI config | ||
200 | * space for all bridges & devices under this slot, and then | ||
201 | * finally restarting the device drivers (which cause a second | ||
202 | * set of hotplug events to go out to userspace). | ||
203 | */ | ||
204 | |||
205 | /** | ||
206 | * eeh_reset_device() -- perform actual reset of a pci slot | ||
207 | * Args: bus: pointer to the pci bus structure corresponding | ||
208 | * to the isolated slot. A non-null value will | ||
209 | * cause all devices under the bus to be removed | ||
210 | * and then re-added. | ||
211 | * pe_dn: pointer to a "Partitionable Endpoint" device node. | |
212 | * This is the top-level structure on which pci | ||
213 | * bus resets can be performed. | ||
214 | */ | ||
215 | |||
216 | static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) | ||
217 | { | ||
218 | if (bus) | ||
219 | pcibios_remove_pci_devices(bus); | ||
220 | |||
221 | /* Reset the pci controller. (Asserts RST#; resets config space). | ||
222 | * Reconfigure bridges and devices */ | ||
223 | rtas_set_slot_reset(pe_dn); | ||
224 | |||
225 | /* Walk over all functions on this device */ | ||
226 | rtas_configure_bridge(pe_dn); | ||
227 | eeh_restore_bars(pe_dn); | ||
228 | |||
229 | /* Give the system 5 seconds to finish running the user-space | ||
230 | * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, | ||
231 | * this is a hack, but if we don't do this, and try to bring | ||
232 | * the device up before the scripts have taken it down, | ||
233 | * potentially weird things happen. | ||
234 | */ | ||
235 | if (bus) { | ||
236 | ssleep (5); | ||
237 | pcibios_add_pci_devices(bus); | ||
238 | } | ||
239 | } | ||
240 | |||
241 | /* The longest amount of time to wait for a pci device | ||
242 | * to come back on line, in seconds. | ||
243 | */ | ||
244 | #define MAX_WAIT_FOR_RECOVERY 15 | ||
245 | |||
246 | void handle_eeh_events (struct eeh_event *event) | ||
247 | { | ||
248 | struct device_node *frozen_dn; | ||
249 | struct pci_dn *frozen_pdn; | ||
250 | struct pci_bus *frozen_bus; | ||
251 | int perm_failure = 0; | ||
252 | |||
253 | frozen_dn = find_device_pe(event->dn); | ||
254 | frozen_bus = pcibios_find_pci_bus(frozen_dn); | ||
255 | |||
256 | if (!frozen_dn) { | ||
257 | printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n", | ||
258 | pci_name(event->dev)); | ||
259 | return; | ||
260 | } | ||
261 | |||
262 | /* There are two different styles for coming up with the PE. | ||
263 | * In the old style, it was the highest EEH-capable device | ||
264 | * which was always an EADS pci bridge. In the new style, | ||
265 | * there might not be any EADS bridges, and even when there are, | ||
266 | * the firmware marks them as "EEH incapable". So another | ||
267 | * two-step is needed to find the pci bus.. */ | ||
268 | if (!frozen_bus) | ||
269 | frozen_bus = pcibios_find_pci_bus (frozen_dn->parent); | ||
270 | |||
271 | if (!frozen_bus) { | ||
272 | printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n", | ||
273 | frozen_dn->full_name); | ||
274 | return; | ||
275 | } | ||
276 | |||
277 | #if 0 | ||
278 | /* We may get "permanent failure" messages on empty slots. | ||
279 | * These are false alarms. Empty slots have no child dn. */ | ||
280 | if ((event->state == pci_channel_io_perm_failure) && (frozen_device == NULL)) | ||
281 | return; | ||
282 | #endif | ||
283 | |||
284 | frozen_pdn = PCI_DN(frozen_dn); | ||
285 | frozen_pdn->eeh_freeze_count++; | ||
286 | |||
287 | if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) | ||
288 | perm_failure = 1; | ||
289 | |||
290 | /* If the reset state is a '5' and the time to reset is 0 (infinity) | ||
291 | * or is more than 15 seconds, then mark this as a permanent failure. | |
292 | */ | ||
293 | if ((event->state == pci_channel_io_perm_failure) && | ||
294 | ((event->time_unavail <= 0) || | ||
295 | (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) | ||
296 | { | ||
297 | perm_failure = 1; | ||
298 | } | ||
299 | |||
300 | /* Log the error with the rtas logger. */ | ||
301 | if (perm_failure) { | ||
302 | /* | ||
303 | * About 90% of all real-life EEH failures in the field | ||
304 | * are due to poorly seated PCI cards. Only 10% or so are | ||
305 | * due to actual, failed cards. | ||
306 | */ | ||
307 | printk(KERN_ERR | ||
308 | "EEH: PCI device %s - %s has failed %d times \n" | ||
309 | "and has been permanently disabled. Please try reseating\n" | ||
310 | "this device or replacing it.\n", | ||
311 | pci_name (frozen_pdn->pcidev), | ||
312 | pcid_name(frozen_pdn->pcidev), | ||
313 | frozen_pdn->eeh_freeze_count); | ||
314 | |||
315 | eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); | ||
316 | |||
317 | /* Notify all devices that they're about to go down. */ | ||
318 | pci_walk_bus(frozen_bus, eeh_report_failure, 0); | ||
319 | |||
320 | /* Shut down the device drivers for good. */ | ||
321 | pcibios_remove_pci_devices(frozen_bus); | ||
322 | return; | ||
323 | } | ||
324 | |||
325 | eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); | ||
326 | printk(KERN_WARNING | ||
327 | "EEH: This PCI device has failed %d times since last reboot: %s - %s\n", | ||
328 | frozen_pdn->eeh_freeze_count, | ||
329 | pci_name (frozen_pdn->pcidev), | ||
330 | pcid_name(frozen_pdn->pcidev)); | ||
331 | |||
332 | /* Walk the various device drivers attached to this slot through | ||
333 | * a reset sequence, giving each an opportunity to do what it needs | ||
334 | * to accomplish the reset. Each child gets a report of the | ||
335 | * status ... if any child can't handle the reset, then the entire | ||
336 | * slot is dlpar removed and added. | ||
337 | */ | ||
338 | enum pcierr_result result = PCIERR_RESULT_NONE; | ||
339 | pci_walk_bus(frozen_bus, eeh_report_error, &result); | ||
340 | |||
341 | /* If all device drivers were EEH-unaware, then shut | ||
342 | * down all of the device drivers, and hope they | ||
343 | * go down willingly, without panicking the system. | |
344 | */ | ||
345 | if (result == PCIERR_RESULT_NONE) { | ||
346 | eeh_reset_device(frozen_pdn, frozen_bus); | ||
347 | } | ||
348 | |||
349 | /* If any device called out for a reset, then reset the slot */ | ||
350 | if (result == PCIERR_RESULT_NEED_RESET) { | ||
351 | eeh_reset_device(frozen_pdn, NULL); | ||
352 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); | ||
353 | } | ||
354 | |||
355 | /* If all devices reported they can proceed, then re-enable PIO */ | |
356 | if (result == PCIERR_RESULT_CAN_RECOVER) { | ||
357 | /* XXX Not supported; we brute-force reset the device */ | ||
358 | eeh_reset_device(frozen_pdn, NULL); | ||
359 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); | ||
360 | } | ||
361 | |||
362 | /* Tell all device drivers that they can resume operations */ | ||
363 | pci_walk_bus(frozen_bus, eeh_report_resume, 0); | ||
364 | } | ||
365 | |||
366 | /* ---------- end of file ---------- */ | ||
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c index 92497333c2b6..9a9961f27480 100644 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ b/arch/powerpc/platforms/pseries/eeh_event.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/list.h> | 21 | #include <linux/list.h> |
22 | #include <linux/pci.h> | 22 | #include <linux/pci.h> |
23 | #include <asm/eeh_event.h> | 23 | #include <asm/eeh_event.h> |
24 | #include <asm/ppc-pci.h> | ||
24 | 25 | ||
25 | /** Overview: | 26 | /** Overview: |
26 | * EEH error states may be detected within exception handlers; | 27 | * EEH error states may be detected within exception handlers; |
@@ -37,31 +38,6 @@ static void eeh_thread_launcher(void *); | |||
37 | DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); | 38 | DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); |
38 | 39 | ||
39 | /** | 40 | /** |
40 | * eeh_panic - call panic() for an eeh event that cannot be handled. | ||
41 | * The philosophy of this routine is that it is better to panic and | ||
42 | * halt the OS than it is to risk possible data corruption by | ||
43 | * oblivious device drivers that don't know better. | ||
44 | * | ||
45 | * @dev pci device that had an eeh event | ||
46 | * @reset_state current reset state of the device slot | ||
47 | */ | ||
48 | static void eeh_panic(struct pci_dev *dev, int reset_state) | ||
49 | { | ||
50 | /* | ||
51 | * Since the panic_on_oops sysctl is used to halt the system | ||
52 | * in light of potential corruption, we can use it here. | ||
53 | */ | ||
54 | if (panic_on_oops) { | ||
55 | panic("EEH: MMIO failure (%d) on device:%s\n", reset_state, | ||
56 | pci_name(dev)); | ||
57 | } | ||
58 | else { | ||
59 | printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n", | ||
60 | reset_state, pci_name(dev)); | ||
61 | } | ||
62 | } | ||
63 | |||
64 | /** | ||
65 | * eeh_event_handler - dispatch EEH events. The detection of a frozen | 41 | * eeh_event_handler - dispatch EEH events. The detection of a frozen |
66 | * slot can occur inside an interrupt, where it can be hard to do | 42 | * slot can occur inside an interrupt, where it can be hard to do |
67 | * anything about it. The goal of this routine is to pull these | 43 | * anything about it. The goal of this routine is to pull these |
@@ -82,10 +58,16 @@ static int eeh_event_handler(void * dummy) | |||
82 | 58 | ||
83 | spin_lock_irqsave(&eeh_eventlist_lock, flags); | 59 | spin_lock_irqsave(&eeh_eventlist_lock, flags); |
84 | event = NULL; | 60 | event = NULL; |
61 | |||
62 | /* Unqueue the event, get ready to process. */ | ||
85 | if (!list_empty(&eeh_eventlist)) { | 63 | if (!list_empty(&eeh_eventlist)) { |
86 | event = list_entry(eeh_eventlist.next, struct eeh_event, list); | 64 | event = list_entry(eeh_eventlist.next, struct eeh_event, list); |
87 | list_del(&event->list); | 65 | list_del(&event->list); |
88 | } | 66 | } |
67 | |||
68 | if (event) | ||
69 | eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); | ||
70 | |||
89 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | 71 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); |
90 | if (event == NULL) | 72 | if (event == NULL) |
91 | break; | 73 | break; |
@@ -93,8 +75,11 @@ static int eeh_event_handler(void * dummy) | |||
93 | printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", | 75 | printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", |
94 | pci_name(event->dev)); | 76 | pci_name(event->dev)); |
95 | 77 | ||
96 | eeh_panic (event->dev, event->state); | 78 | handle_eeh_events(event); |
79 | |||
80 | eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); | ||
97 | 81 | ||
82 | pci_dev_put(event->dev); | ||
98 | kfree(event); | 83 | kfree(event); |
99 | } | 84 | } |
100 | 85 | ||
@@ -122,7 +107,7 @@ static void eeh_thread_launcher(void *dummy) | |||
122 | */ | 107 | */ |
123 | int eeh_send_failure_event (struct device_node *dn, | 108 | int eeh_send_failure_event (struct device_node *dn, |
124 | struct pci_dev *dev, | 109 | struct pci_dev *dev, |
125 | int state, | 110 | enum pci_channel_state state, |
126 | int time_unavail) | 111 | int time_unavail) |
127 | { | 112 | { |
128 | unsigned long flags; | 113 | unsigned long flags; |
diff --git a/include/asm-powerpc/eeh.h b/include/asm-powerpc/eeh.h index 4395b7bc1ed4..b263fb2fa6e4 100644 --- a/include/asm-powerpc/eeh.h +++ b/include/asm-powerpc/eeh.h | |||
@@ -34,9 +34,11 @@ struct device_node; | |||
34 | extern int eeh_subsystem_enabled; | 34 | extern int eeh_subsystem_enabled; |
35 | 35 | ||
36 | /* Values for eeh_mode bits in device_node */ | 36 | /* Values for eeh_mode bits in device_node */ |
37 | #define EEH_MODE_SUPPORTED (1<<0) | 37 | #define EEH_MODE_SUPPORTED (1<<0) |
38 | #define EEH_MODE_NOCHECK (1<<1) | 38 | #define EEH_MODE_NOCHECK (1<<1) |
39 | #define EEH_MODE_ISOLATED (1<<2) | 39 | #define EEH_MODE_ISOLATED (1<<2) |
40 | #define EEH_MODE_RECOVERING (1<<3) | ||
41 | #define EEH_MODE_IRQ_DISABLED (1<<4) | ||
40 | 42 | ||
41 | /* Max number of EEH freezes allowed before we consider the device | 43 | /* Max number of EEH freezes allowed before we consider the device |
42 | * to be permanently disabled. */ | 44 | * to be permanently disabled. */ |
diff --git a/include/asm-powerpc/eeh_event.h b/include/asm-powerpc/eeh_event.h index 5e11a00b6fa0..93d55a2bebfd 100644 --- a/include/asm-powerpc/eeh_event.h +++ b/include/asm-powerpc/eeh_event.h | |||
@@ -30,7 +30,7 @@ struct eeh_event { | |||
30 | struct list_head list; | 30 | struct list_head list; |
31 | struct device_node *dn; /* struct device node */ | 31 | struct device_node *dn; /* struct device node */ |
32 | struct pci_dev *dev; /* affected device */ | 32 | struct pci_dev *dev; /* affected device */ |
33 | int state; | 33 | enum pci_channel_state state; /* PCI bus state for the affected device */ |
34 | int time_unavail; /* milliseconds until device might be available */ | 34 | int time_unavail; /* milliseconds until device might be available */ |
35 | }; | 35 | }; |
36 | 36 | ||
@@ -47,8 +47,11 @@ struct eeh_event { | |||
47 | */ | 47 | */ |
48 | int eeh_send_failure_event (struct device_node *dn, | 48 | int eeh_send_failure_event (struct device_node *dn, |
49 | struct pci_dev *dev, | 49 | struct pci_dev *dev, |
50 | int reset_state, | 50 | enum pci_channel_state state, |
51 | int time_unavail); | 51 | int time_unavail); |
52 | 52 | ||
53 | /* Main recovery function */ | ||
54 | void handle_eeh_events (struct eeh_event *); | ||
55 | |||
53 | #endif /* __KERNEL__ */ | 56 | #endif /* __KERNEL__ */ |
54 | #endif /* ASM_PPC64_EEH_EVENT_H */ | 57 | #endif /* ASM_PPC64_EEH_EVENT_H */ |
diff --git a/include/asm-powerpc/ppc-pci.h b/include/asm-powerpc/ppc-pci.h index bdef312900a1..caf67a3bcb78 100644 --- a/include/asm-powerpc/ppc-pci.h +++ b/include/asm-powerpc/ppc-pci.h | |||
@@ -53,6 +53,15 @@ extern unsigned long pci_probe_only; | |||
53 | /* ---- EEH internal-use-only related routines ---- */ | 53 | /* ---- EEH internal-use-only related routines ---- */ |
54 | #ifdef CONFIG_EEH | 54 | #ifdef CONFIG_EEH |
55 | /** | 55 | /** |
56 | * eeh_slot_error_detail -- record an EEH error condition to the log | |
57 | * @severity: 1 if temporary, 2 if permanent failure. | ||
58 | * | ||
59 | * Obtains the EEH error details from the RTAS subsystem, | |
60 | * and then logs these details with the RTAS error log system. | ||
61 | */ | ||
62 | void eeh_slot_error_detail (struct pci_dn *pdn, int severity); | ||
63 | |||
64 | /** | ||
56 | * rtas_set_slot_reset -- unfreeze a frozen slot | 65 | * rtas_set_slot_reset -- unfreeze a frozen slot |
57 | * | 66 | * |
58 | * Clear the EEH-frozen condition on a slot. This routine | 67 | * Clear the EEH-frozen condition on a slot. This routine |