diff options
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh.c | 24 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_driver.c | 81 | ||||
-rw-r--r-- | include/asm-powerpc/ppc-pci.h | 4 |
3 files changed, 69 insertions, 40 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index d8d24502970f..b0fa76d0c78a 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c | |||
@@ -450,11 +450,16 @@ eeh_slot_availability(struct pci_dn *pdn) | |||
450 | if (rc) return rc; | 450 | if (rc) return rc; |
451 | 451 | ||
452 | if (rets[1] == 0) return -1; /* EEH is not supported */ | 452 | if (rets[1] == 0) return -1; /* EEH is not supported */ |
453 | if (rets[0] == 0) return 0; /* Oll Korrect */ | 453 | if (rets[0] == 0) return 0; /* Oll Korrect */ |
454 | if (rets[0] == 5) { | 454 | if (rets[0] == 5) { |
455 | if (rets[2] == 0) return -1; /* permanently unavailable */ | 455 | if (rets[2] == 0) return -1; /* permanently unavailable */ |
456 | return rets[2]; /* number of millisecs to wait */ | 456 | return rets[2]; /* number of millisecs to wait */ |
457 | } | 457 | } |
458 | if (rets[0] == 1) | ||
459 | return 250; | ||
460 | |||
461 | printk (KERN_ERR "EEH: Slot unavailable: rc=%d, rets=%d %d %d\n", | ||
462 | rc, rets[0], rets[1], rets[2]); | ||
458 | return -1; | 463 | return -1; |
459 | } | 464 | } |
460 | 465 | ||
@@ -501,9 +506,11 @@ rtas_pci_slot_reset(struct pci_dn *pdn, int state) | |||
501 | 506 | ||
502 | /** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second | 507 | /** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second |
503 | * dn -- device node to be reset. | 508 | * dn -- device node to be reset. |
509 | * | ||
510 | * Return 0 if success, else a non-zero value. | ||
504 | */ | 511 | */ |
505 | 512 | ||
506 | void | 513 | int |
507 | rtas_set_slot_reset(struct pci_dn *pdn) | 514 | rtas_set_slot_reset(struct pci_dn *pdn) |
508 | { | 515 | { |
509 | int i, rc; | 516 | int i, rc; |
@@ -533,10 +540,21 @@ rtas_set_slot_reset(struct pci_dn *pdn) | |||
533 | * ready to be used; if not, wait for recovery. */ | 540 | * ready to be used; if not, wait for recovery. */ |
534 | for (i=0; i<10; i++) { | 541 | for (i=0; i<10; i++) { |
535 | rc = eeh_slot_availability (pdn); | 542 | rc = eeh_slot_availability (pdn); |
536 | if (rc <= 0) break; | 543 | if (rc < 0) |
544 | printk (KERN_ERR "EEH: failed (%d) to reset slot %s\n", rc, pdn->node->full_name); | ||
545 | if (rc == 0) | ||
546 | return 0; | ||
547 | if (rc < 0) | ||
548 | return -1; | ||
537 | 549 | ||
538 | msleep (rc+100); | 550 | msleep (rc+100); |
539 | } | 551 | } |
552 | |||
553 | rc = eeh_slot_availability (pdn); | ||
554 | if (rc) | ||
555 | printk (KERN_ERR "EEH: timeout resetting slot %s\n", pdn->node->full_name); | ||
556 | |||
557 | return rc; | ||
540 | } | 558 | } |
541 | 559 | ||
542 | /* ------------------------------------------------------- */ | 560 | /* ------------------------------------------------------- */ |
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 199a3ce54786..242b2923360d 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c | |||
@@ -200,14 +200,18 @@ static void eeh_report_failure(struct pci_dev *dev, void *userdata) | |||
200 | * bus resets can be performed. | 200 | * bus resets can be performed. |
201 | */ | 201 | */ |
202 | 202 | ||
203 | static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) | 203 | static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) |
204 | { | 204 | { |
205 | int rc; | ||
205 | if (bus) | 206 | if (bus) |
206 | pcibios_remove_pci_devices(bus); | 207 | pcibios_remove_pci_devices(bus); |
207 | 208 | ||
208 | /* Reset the pci controller. (Asserts RST#; resets config space). | 209 | /* Reset the pci controller. (Asserts RST#; resets config space). |
209 | * Reconfigure bridges and devices */ | 210 | * Reconfigure bridges and devices. Don't try to bring the system |
210 | rtas_set_slot_reset(pe_dn); | 211 | * up if the reset failed for some reason. */ |
212 | rc = rtas_set_slot_reset(pe_dn); | ||
213 | if (rc) | ||
214 | return rc; | ||
211 | 215 | ||
212 | /* Walk over all functions on this device */ | 216 | /* Walk over all functions on this device */ |
213 | rtas_configure_bridge(pe_dn); | 217 | rtas_configure_bridge(pe_dn); |
@@ -223,6 +227,8 @@ static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) | |||
223 | ssleep (5); | 227 | ssleep (5); |
224 | pcibios_add_pci_devices(bus); | 228 | pcibios_add_pci_devices(bus); |
225 | } | 229 | } |
230 | |||
231 | return 0; | ||
226 | } | 232 | } |
227 | 233 | ||
228 | /* The longest amount of time to wait for a pci device | 234 | /* The longest amount of time to wait for a pci device |
@@ -235,7 +241,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
235 | struct device_node *frozen_dn; | 241 | struct device_node *frozen_dn; |
236 | struct pci_dn *frozen_pdn; | 242 | struct pci_dn *frozen_pdn; |
237 | struct pci_bus *frozen_bus; | 243 | struct pci_bus *frozen_bus; |
238 | int perm_failure = 0; | 244 | int rc = 0; |
239 | 245 | ||
240 | frozen_dn = find_device_pe(event->dn); | 246 | frozen_dn = find_device_pe(event->dn); |
241 | frozen_bus = pcibios_find_pci_bus(frozen_dn); | 247 | frozen_bus = pcibios_find_pci_bus(frozen_dn); |
@@ -272,7 +278,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
272 | frozen_pdn->eeh_freeze_count++; | 278 | frozen_pdn->eeh_freeze_count++; |
273 | 279 | ||
274 | if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) | 280 | if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) |
275 | perm_failure = 1; | 281 | goto hard_fail; |
276 | 282 | ||
277 | /* If the reset state is a '5' and the time to reset is 0 (infinity) | 283 | /* If the reset state is a '5' and the time to reset is 0 (infinity) |
277 | * or is more than 15 seconds, then mark this as a permanent failure. | 284 | * or is more than 15 seconds, then mark this as a permanent failure. |
@@ -280,34 +286,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
280 | if ((event->state == pci_channel_io_perm_failure) && | 286 | if ((event->state == pci_channel_io_perm_failure) && |
281 | ((event->time_unavail <= 0) || | 287 | ((event->time_unavail <= 0) || |
282 | (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) | 288 | (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) |
283 | { | 289 | goto hard_fail; |
284 | perm_failure = 1; | ||
285 | } | ||
286 | |||
287 | /* Log the error with the rtas logger. */ | ||
288 | if (perm_failure) { | ||
289 | /* | ||
290 | * About 90% of all real-life EEH failures in the field | ||
291 | * are due to poorly seated PCI cards. Only 10% or so are | ||
292 | * due to actual, failed cards. | ||
293 | */ | ||
294 | printk(KERN_ERR | ||
295 | "EEH: PCI device %s - %s has failed %d times \n" | ||
296 | "and has been permanently disabled. Please try reseating\n" | ||
297 | "this device or replacing it.\n", | ||
298 | pci_name (frozen_pdn->pcidev), | ||
299 | pcid_name(frozen_pdn->pcidev), | ||
300 | frozen_pdn->eeh_freeze_count); | ||
301 | |||
302 | eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); | ||
303 | |||
304 | /* Notify all devices that they're about to go down. */ | ||
305 | pci_walk_bus(frozen_bus, eeh_report_failure, 0); | ||
306 | |||
307 | /* Shut down the device drivers for good. */ | ||
308 | pcibios_remove_pci_devices(frozen_bus); | ||
309 | return; | ||
310 | } | ||
311 | 290 | ||
312 | eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); | 291 | eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); |
313 | printk(KERN_WARNING | 292 | printk(KERN_WARNING |
@@ -330,24 +309,54 @@ void handle_eeh_events (struct eeh_event *event) | |||
330 | * go down willingly, without panicking the system. | 309 | * go down willingly, without panicking the system. |
331 | */ | 310 | */ |
332 | if (result == PCIERR_RESULT_NONE) { | 311 | if (result == PCIERR_RESULT_NONE) { |
333 | eeh_reset_device(frozen_pdn, frozen_bus); | 312 | rc = eeh_reset_device(frozen_pdn, frozen_bus); |
313 | if (rc) | ||
314 | goto hard_fail; | ||
334 | } | 315 | } |
335 | 316 | ||
336 | /* If any device called out for a reset, then reset the slot */ | 317 | /* If any device called out for a reset, then reset the slot */ |
337 | if (result == PCIERR_RESULT_NEED_RESET) { | 318 | if (result == PCIERR_RESULT_NEED_RESET) { |
338 | eeh_reset_device(frozen_pdn, NULL); | 319 | rc = eeh_reset_device(frozen_pdn, NULL); |
320 | if (rc) | ||
321 | goto hard_fail; | ||
339 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); | 322 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); |
340 | } | 323 | } |
341 | 324 | ||
342 | /* If all devices reported they can proceed, then re-enable PIO */ | 325 | /* If all devices reported they can proceed, then re-enable PIO */ |
343 | if (result == PCIERR_RESULT_CAN_RECOVER) { | 326 | if (result == PCIERR_RESULT_CAN_RECOVER) { |
344 | /* XXX Not supported; we brute-force reset the device */ | 327 | /* XXX Not supported; we brute-force reset the device */ |
345 | eeh_reset_device(frozen_pdn, NULL); | 328 | rc = eeh_reset_device(frozen_pdn, NULL); |
329 | if (rc) | ||
330 | goto hard_fail; | ||
346 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); | 331 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); |
347 | } | 332 | } |
348 | 333 | ||
349 | /* Tell all device drivers that they can resume operations */ | 334 | /* Tell all device drivers that they can resume operations */ |
350 | pci_walk_bus(frozen_bus, eeh_report_resume, 0); | 335 | pci_walk_bus(frozen_bus, eeh_report_resume, 0); |
336 | |||
337 | return; | ||
338 | |||
339 | hard_fail: | ||
340 | /* | ||
341 | * About 90% of all real-life EEH failures in the field | ||
342 | * are due to poorly seated PCI cards. Only 10% or so are | ||
343 | * due to actual, failed cards. | ||
344 | */ | ||
345 | printk(KERN_ERR | ||
346 | "EEH: PCI device %s - %s has failed %d times \n" | ||
347 | "and has been permanently disabled. Please try reseating\n" | ||
348 | "this device or replacing it.\n", | ||
349 | pci_name (frozen_pdn->pcidev), | ||
350 | pcid_name(frozen_pdn->pcidev), | ||
351 | frozen_pdn->eeh_freeze_count); | ||
352 | |||
353 | eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); | ||
354 | |||
355 | /* Notify all devices that they're about to go down. */ | ||
356 | pci_walk_bus(frozen_bus, eeh_report_failure, 0); | ||
357 | |||
358 | /* Shut down the device drivers for good. */ | ||
359 | pcibios_remove_pci_devices(frozen_bus); | ||
351 | } | 360 | } |
352 | 361 | ||
353 | /* ---------- end of file ---------- */ | 362 | /* ---------- end of file ---------- */ |
diff --git a/include/asm-powerpc/ppc-pci.h b/include/asm-powerpc/ppc-pci.h index 4820b368bf15..1a2db61694f2 100644 --- a/include/asm-powerpc/ppc-pci.h +++ b/include/asm-powerpc/ppc-pci.h | |||
@@ -76,8 +76,10 @@ void eeh_slot_error_detail (struct pci_dn *pdn, int severity); | |||
76 | * does this by asserting the PCI #RST line for 1/8th of | 76 | * does this by asserting the PCI #RST line for 1/8th of |
77 | * a second; this routine will sleep while the adapter is | 77 | * a second; this routine will sleep while the adapter is |
78 | * being reset. | 78 | * being reset. |
79 | * | ||
80 | * Returns a non-zero value if the reset failed. | ||
79 | */ | 81 | */ |
80 | void rtas_set_slot_reset (struct pci_dn *); | 82 | int rtas_set_slot_reset (struct pci_dn *); |
81 | 83 | ||
82 | /** | 84 | /** |
83 | * eeh_restore_bars - Restore device configuration info. | 85 | * eeh_restore_bars - Restore device configuration info. |