diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries/eeh_driver.c')
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_driver.c | 366 |
1 files changed, 366 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c new file mode 100644 index 000000000000..f6cff065d89d --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_driver.c | |||
@@ -0,0 +1,366 @@ | |||
1 | /* | ||
2 | * PCI Error Recovery Driver for RPA-compliant PPC64 platform. | ||
3 | * Copyright (C) 2004, 2005 Linas Vepstas <linas@linas.org> | ||
4 | * | ||
5 | * All rights reserved. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or (at | ||
10 | * your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, but | ||
13 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
15 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
16 | * details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | * | ||
22 | * Send feedback to <linas@us.ibm.com> | ||
23 | * | ||
24 | */ | ||
25 | #include <linux/delay.h> | ||
26 | #include <linux/irq.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/notifier.h> | ||
29 | #include <linux/pci.h> | ||
30 | #include <asm/eeh.h> | ||
31 | #include <asm/eeh_event.h> | ||
32 | #include <asm/ppc-pci.h> | ||
33 | #include <asm/pci-bridge.h> | ||
34 | #include <asm/prom.h> | ||
35 | #include <asm/rtas.h> | ||
36 | |||
37 | |||
38 | static inline const char * pcid_name (struct pci_dev *pdev) | ||
39 | { | ||
40 | if (pdev->dev.driver) | ||
41 | return pdev->dev.driver->name; | ||
42 | return ""; | ||
43 | } | ||
44 | |||
45 | /** | ||
46 | * Return the "partitionable endpoint" (pe) under which this device lies | ||
47 | */ | ||
48 | static struct device_node * find_device_pe(struct device_node *dn) | ||
49 | { | ||
50 | while ((dn->parent) && PCI_DN(dn->parent) && | ||
51 | (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { | ||
52 | dn = dn->parent; | ||
53 | } | ||
54 | return dn; | ||
55 | } | ||
56 | |||
57 | |||
#ifdef DEBUG
/*
 * Debug helper: print one line describing @pdn (name, EEH mode, config
 * addresses, full name), prefixed by @dent padding strings, then recurse
 * into each child of the underlying device node with the indent
 * increased by 3.  A NULL @pdn prints nothing.
 */
static void print_device_node_tree(struct pci_dn *pdn, int dent)
{
	struct device_node *child;
	int col;

	if (!pdn)
		return;

	for (col = 0; col < dent; col++)
		printk(" ");

	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
	       pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
	       pdn->eeh_pe_config_addr, pdn->node->full_name);

	for (child = pdn->node->child; child; child = child->sibling)
		print_device_node_tree(PCI_DN(child), dent + 3);
}
#endif
76 | |||
77 | /** | ||
78 | * irq_in_use - return true if this irq is being used | ||
79 | */ | ||
80 | static int irq_in_use(unsigned int irq) | ||
81 | { | ||
82 | int rc = 0; | ||
83 | unsigned long flags; | ||
84 | struct irq_desc *desc = irq_desc + irq; | ||
85 | |||
86 | spin_lock_irqsave(&desc->lock, flags); | ||
87 | if (desc->action) | ||
88 | rc = 1; | ||
89 | spin_unlock_irqrestore(&desc->lock, flags); | ||
90 | return rc; | ||
91 | } | ||
92 | |||
93 | /* ------------------------------------------------------- */ | ||
94 | /** eeh_report_error - report an EEH error to each device, | ||
95 | * collect up and merge the device responses. | ||
96 | */ | ||
97 | |||
98 | static void eeh_report_error(struct pci_dev *dev, void *userdata) | ||
99 | { | ||
100 | enum pcierr_result rc, *res = userdata; | ||
101 | struct pci_driver *driver = dev->driver; | ||
102 | |||
103 | dev->error_state = pci_channel_io_frozen; | ||
104 | |||
105 | if (!driver) | ||
106 | return; | ||
107 | |||
108 | if (irq_in_use (dev->irq)) { | ||
109 | struct device_node *dn = pci_device_to_OF_node(dev); | ||
110 | PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; | ||
111 | disable_irq_nosync(dev->irq); | ||
112 | } | ||
113 | if (!driver->err_handler) | ||
114 | return; | ||
115 | if (!driver->err_handler->error_detected) | ||
116 | return; | ||
117 | |||
118 | rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen); | ||
119 | if (*res == PCIERR_RESULT_NONE) *res = rc; | ||
120 | if (*res == PCIERR_RESULT_NEED_RESET) return; | ||
121 | if (*res == PCIERR_RESULT_DISCONNECT && | ||
122 | rc == PCIERR_RESULT_NEED_RESET) *res = rc; | ||
123 | } | ||
124 | |||
125 | /** eeh_report_reset -- tell this device that the pci slot | ||
126 | * has been reset. | ||
127 | */ | ||
128 | |||
129 | static void eeh_report_reset(struct pci_dev *dev, void *userdata) | ||
130 | { | ||
131 | struct pci_driver *driver = dev->driver; | ||
132 | struct device_node *dn = pci_device_to_OF_node(dev); | ||
133 | |||
134 | if (!driver) | ||
135 | return; | ||
136 | |||
137 | if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { | ||
138 | PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; | ||
139 | enable_irq(dev->irq); | ||
140 | } | ||
141 | if (!driver->err_handler) | ||
142 | return; | ||
143 | if (!driver->err_handler->slot_reset) | ||
144 | return; | ||
145 | |||
146 | driver->err_handler->slot_reset(dev); | ||
147 | } | ||
148 | |||
149 | static void eeh_report_resume(struct pci_dev *dev, void *userdata) | ||
150 | { | ||
151 | struct pci_driver *driver = dev->driver; | ||
152 | |||
153 | dev->error_state = pci_channel_io_normal; | ||
154 | |||
155 | if (!driver) | ||
156 | return; | ||
157 | if (!driver->err_handler) | ||
158 | return; | ||
159 | if (!driver->err_handler->resume) | ||
160 | return; | ||
161 | |||
162 | driver->err_handler->resume(dev); | ||
163 | } | ||
164 | |||
165 | static void eeh_report_failure(struct pci_dev *dev, void *userdata) | ||
166 | { | ||
167 | struct pci_driver *driver = dev->driver; | ||
168 | |||
169 | dev->error_state = pci_channel_io_perm_failure; | ||
170 | |||
171 | if (!driver) | ||
172 | return; | ||
173 | |||
174 | if (irq_in_use (dev->irq)) { | ||
175 | struct device_node *dn = pci_device_to_OF_node(dev); | ||
176 | PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; | ||
177 | disable_irq_nosync(dev->irq); | ||
178 | } | ||
179 | if (!driver->err_handler) | ||
180 | return; | ||
181 | if (!driver->err_handler->error_detected) | ||
182 | return; | ||
183 | driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); | ||
184 | } | ||
185 | |||
186 | /* ------------------------------------------------------- */ | ||
187 | /** | ||
188 | * handle_eeh_events -- reset a PCI device after hard lockup. | ||
189 | * | ||
190 | * pSeries systems will isolate a PCI slot if the PCI-Host | ||
191 | * bridge detects address or data parity errors, DMA's | ||
192 | * occurring to wild addresses (which usually happen due to | ||
193 | * bugs in device drivers or in PCI adapter firmware). | ||
194 | * Slot isolations also occur if #SERR, #PERR or other misc | ||
195 | * PCI-related errors are detected. | ||
196 | * | ||
197 | * Recovery process consists of unplugging the device driver | ||
198 | * (which generated hotplug events to userspace), then issuing | ||
199 | * a PCI #RST to the device, then reconfiguring the PCI config | ||
200 | * space for all bridges & devices under this slot, and then | ||
201 | * finally restarting the device drivers (which cause a second | ||
202 | * set of hotplug events to go out to userspace). | ||
203 | */ | ||
204 | |||
/**
 * eeh_reset_device - perform the actual reset of a pci slot
 * @pe_dn: the "Partitionable Endpoint" node; this is the top-level
 *         structure on which pci bus resets can be performed
 * @bus:   pci bus corresponding to the isolated slot.  When non-NULL,
 *         every device under the bus is removed before the reset and
 *         re-added afterwards; when NULL the devices are left in place.
 */
static void eeh_reset_device(struct pci_dn *pe_dn, struct pci_bus *bus)
{
	if (bus)
		pcibios_remove_pci_devices(bus);

	/* Reset the slot (asserts RST#, which also resets config space),
	 * then reconfigure the bridges and restore the device BARs.
	 */
	rtas_set_slot_reset(pe_dn);
	rtas_configure_bridge(pe_dn);
	eeh_restore_bars(pe_dn);

	if (!bus)
		return;

	/* Hack: allow 5 seconds for the user-space hotplug shutdown
	 * scripts (e.g. ifdown for ethernet) to finish.  Bringing the
	 * device back up before the scripts have taken it down can
	 * lead to potentially weird behaviour.
	 */
	ssleep(5);
	pcibios_add_pci_devices(bus);
}
240 | |||
241 | /* The longest amount of time to wait for a pci device | ||
242 | * to come back on line, in seconds. | ||
243 | */ | ||
244 | #define MAX_WAIT_FOR_RECOVERY 15 | ||
245 | |||
246 | void handle_eeh_events (struct eeh_event *event) | ||
247 | { | ||
248 | struct device_node *frozen_dn; | ||
249 | struct pci_dn *frozen_pdn; | ||
250 | struct pci_bus *frozen_bus; | ||
251 | int perm_failure = 0; | ||
252 | |||
253 | frozen_dn = find_device_pe(event->dn); | ||
254 | frozen_bus = pcibios_find_pci_bus(frozen_dn); | ||
255 | |||
256 | if (!frozen_dn) { | ||
257 | printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n", | ||
258 | pci_name(event->dev)); | ||
259 | return; | ||
260 | } | ||
261 | |||
262 | /* There are two different styles for coming up with the PE. | ||
263 | * In the old style, it was the highest EEH-capable device | ||
264 | * which was always an EADS pci bridge. In the new style, | ||
265 | * there might not be any EADS bridges, and even when there are, | ||
266 | * the firmware marks them as "EEH incapable". So another | ||
267 | * two-step is needed to find the pci bus.. */ | ||
268 | if (!frozen_bus) | ||
269 | frozen_bus = pcibios_find_pci_bus (frozen_dn->parent); | ||
270 | |||
271 | if (!frozen_bus) { | ||
272 | printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n", | ||
273 | frozen_dn->full_name); | ||
274 | return; | ||
275 | } | ||
276 | |||
277 | #if 0 | ||
278 | /* We may get "permanent failure" messages on empty slots. | ||
279 | * These are false alarms. Empty slots have no child dn. */ | ||
280 | if ((event->state == pci_channel_io_perm_failure) && (frozen_device == NULL)) | ||
281 | return; | ||
282 | #endif | ||
283 | |||
284 | frozen_pdn = PCI_DN(frozen_dn); | ||
285 | frozen_pdn->eeh_freeze_count++; | ||
286 | |||
287 | if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) | ||
288 | perm_failure = 1; | ||
289 | |||
290 | /* If the reset state is a '5' and the time to reset is 0 (infinity) | ||
291 | * or is more then 15 seconds, then mark this as a permanent failure. | ||
292 | */ | ||
293 | if ((event->state == pci_channel_io_perm_failure) && | ||
294 | ((event->time_unavail <= 0) || | ||
295 | (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) | ||
296 | { | ||
297 | perm_failure = 1; | ||
298 | } | ||
299 | |||
300 | /* Log the error with the rtas logger. */ | ||
301 | if (perm_failure) { | ||
302 | /* | ||
303 | * About 90% of all real-life EEH failures in the field | ||
304 | * are due to poorly seated PCI cards. Only 10% or so are | ||
305 | * due to actual, failed cards. | ||
306 | */ | ||
307 | printk(KERN_ERR | ||
308 | "EEH: PCI device %s - %s has failed %d times \n" | ||
309 | "and has been permanently disabled. Please try reseating\n" | ||
310 | "this device or replacing it.\n", | ||
311 | pci_name (frozen_pdn->pcidev), | ||
312 | pcid_name(frozen_pdn->pcidev), | ||
313 | frozen_pdn->eeh_freeze_count); | ||
314 | |||
315 | eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); | ||
316 | |||
317 | /* Notify all devices that they're about to go down. */ | ||
318 | pci_walk_bus(frozen_bus, eeh_report_failure, 0); | ||
319 | |||
320 | /* Shut down the device drivers for good. */ | ||
321 | pcibios_remove_pci_devices(frozen_bus); | ||
322 | return; | ||
323 | } | ||
324 | |||
325 | eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); | ||
326 | printk(KERN_WARNING | ||
327 | "EEH: This PCI device has failed %d times since last reboot: %s - %s\n", | ||
328 | frozen_pdn->eeh_freeze_count, | ||
329 | pci_name (frozen_pdn->pcidev), | ||
330 | pcid_name(frozen_pdn->pcidev)); | ||
331 | |||
332 | /* Walk the various device drivers attached to this slot through | ||
333 | * a reset sequence, giving each an opportunity to do what it needs | ||
334 | * to accomplish the reset. Each child gets a report of the | ||
335 | * status ... if any child can't handle the reset, then the entire | ||
336 | * slot is dlpar removed and added. | ||
337 | */ | ||
338 | enum pcierr_result result = PCIERR_RESULT_NONE; | ||
339 | pci_walk_bus(frozen_bus, eeh_report_error, &result); | ||
340 | |||
341 | /* If all device drivers were EEH-unaware, then shut | ||
342 | * down all of the device drivers, and hope they | ||
343 | * go down willingly, without panicing the system. | ||
344 | */ | ||
345 | if (result == PCIERR_RESULT_NONE) { | ||
346 | eeh_reset_device(frozen_pdn, frozen_bus); | ||
347 | } | ||
348 | |||
349 | /* If any device called out for a reset, then reset the slot */ | ||
350 | if (result == PCIERR_RESULT_NEED_RESET) { | ||
351 | eeh_reset_device(frozen_pdn, NULL); | ||
352 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); | ||
353 | } | ||
354 | |||
355 | /* If all devices reported they can proceed, the re-enable PIO */ | ||
356 | if (result == PCIERR_RESULT_CAN_RECOVER) { | ||
357 | /* XXX Not supported; we brute-force reset the device */ | ||
358 | eeh_reset_device(frozen_pdn, NULL); | ||
359 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); | ||
360 | } | ||
361 | |||
362 | /* Tell all device drivers that they can resume operations */ | ||
363 | pci_walk_bus(frozen_bus, eeh_report_resume, 0); | ||
364 | } | ||
365 | |||
366 | /* ---------- end of file ---------- */ | ||