aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/powerpc/platforms/pseries/eeh.c24
-rw-r--r--arch/powerpc/platforms/pseries/eeh_driver.c81
-rw-r--r--include/asm-powerpc/ppc-pci.h4
3 files changed, 69 insertions, 40 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index d8d24502970f..b0fa76d0c78a 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -450,11 +450,16 @@ eeh_slot_availability(struct pci_dn *pdn)
450 if (rc) return rc; 450 if (rc) return rc;
451 451
452 if (rets[1] == 0) return -1; /* EEH is not supported */ 452 if (rets[1] == 0) return -1; /* EEH is not supported */
453 if (rets[0] == 0) return 0; /* Oll Korrect */ 453 if (rets[0] == 0) return 0; /* Oll Korrect */
454 if (rets[0] == 5) { 454 if (rets[0] == 5) {
455 if (rets[2] == 0) return -1; /* permanently unavailable */ 455 if (rets[2] == 0) return -1; /* permanently unavailable */
456 return rets[2]; /* number of millisecs to wait */ 456 return rets[2]; /* number of millisecs to wait */
457 } 457 }
458 if (rets[0] == 1)
459 return 250;
460
461 printk (KERN_ERR "EEH: Slot unavailable: rc=%d, rets=%d %d %d\n",
462 rc, rets[0], rets[1], rets[2]);
458 return -1; 463 return -1;
459} 464}
460 465
@@ -501,9 +506,11 @@ rtas_pci_slot_reset(struct pci_dn *pdn, int state)
501 506
502/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second 507/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
503 * dn -- device node to be reset. 508 * dn -- device node to be reset.
509 *
510 * Return 0 if success, else a non-zero value.
504 */ 511 */
505 512
506void 513int
507rtas_set_slot_reset(struct pci_dn *pdn) 514rtas_set_slot_reset(struct pci_dn *pdn)
508{ 515{
509 int i, rc; 516 int i, rc;
@@ -533,10 +540,21 @@ rtas_set_slot_reset(struct pci_dn *pdn)
533 * ready to be used; if not, wait for recovery. */ 540 * ready to be used; if not, wait for recovery. */
534 for (i=0; i<10; i++) { 541 for (i=0; i<10; i++) {
535 rc = eeh_slot_availability (pdn); 542 rc = eeh_slot_availability (pdn);
536 if (rc <= 0) break; 543 if (rc < 0)
544 printk (KERN_ERR "EEH: failed (%d) to reset slot %s\n", rc, pdn->node->full_name);
545 if (rc == 0)
546 return 0;
547 if (rc < 0)
548 return -1;
537 549
538 msleep (rc+100); 550 msleep (rc+100);
539 } 551 }
552
553 rc = eeh_slot_availability (pdn);
554 if (rc)
555 printk (KERN_ERR "EEH: timeout resetting slot %s\n", pdn->node->full_name);
556
557 return rc;
540} 558}
541 559
542/* ------------------------------------------------------- */ 560/* ------------------------------------------------------- */
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 199a3ce54786..242b2923360d 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -200,14 +200,18 @@ static void eeh_report_failure(struct pci_dev *dev, void *userdata)
200 * bus resets can be performed. 200 * bus resets can be performed.
201 */ 201 */
202 202
203static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) 203static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
204{ 204{
205 int rc;
205 if (bus) 206 if (bus)
206 pcibios_remove_pci_devices(bus); 207 pcibios_remove_pci_devices(bus);
207 208
208 /* Reset the pci controller. (Asserts RST#; resets config space). 209 /* Reset the pci controller. (Asserts RST#; resets config space).
209 * Reconfigure bridges and devices */ 210 * Reconfigure bridges and devices. Don't try to bring the system
210 rtas_set_slot_reset(pe_dn); 211 * up if the reset failed for some reason. */
212 rc = rtas_set_slot_reset(pe_dn);
213 if (rc)
214 return rc;
211 215
212 /* Walk over all functions on this device */ 216 /* Walk over all functions on this device */
213 rtas_configure_bridge(pe_dn); 217 rtas_configure_bridge(pe_dn);
@@ -223,6 +227,8 @@ static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
223 ssleep (5); 227 ssleep (5);
224 pcibios_add_pci_devices(bus); 228 pcibios_add_pci_devices(bus);
225 } 229 }
230
231 return 0;
226} 232}
227 233
228/* The longest amount of time to wait for a pci device 234/* The longest amount of time to wait for a pci device
@@ -235,7 +241,7 @@ void handle_eeh_events (struct eeh_event *event)
235 struct device_node *frozen_dn; 241 struct device_node *frozen_dn;
236 struct pci_dn *frozen_pdn; 242 struct pci_dn *frozen_pdn;
237 struct pci_bus *frozen_bus; 243 struct pci_bus *frozen_bus;
238 int perm_failure = 0; 244 int rc = 0;
239 245
240 frozen_dn = find_device_pe(event->dn); 246 frozen_dn = find_device_pe(event->dn);
241 frozen_bus = pcibios_find_pci_bus(frozen_dn); 247 frozen_bus = pcibios_find_pci_bus(frozen_dn);
@@ -272,7 +278,7 @@ void handle_eeh_events (struct eeh_event *event)
272 frozen_pdn->eeh_freeze_count++; 278 frozen_pdn->eeh_freeze_count++;
273 279
274 if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) 280 if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES)
275 perm_failure = 1; 281 goto hard_fail;
276 282
277 /* If the reset state is a '5' and the time to reset is 0 (infinity) 283 /* If the reset state is a '5' and the time to reset is 0 (infinity)
278 * or is more then 15 seconds, then mark this as a permanent failure. 284 * or is more then 15 seconds, then mark this as a permanent failure.
@@ -280,34 +286,7 @@ void handle_eeh_events (struct eeh_event *event)
280 if ((event->state == pci_channel_io_perm_failure) && 286 if ((event->state == pci_channel_io_perm_failure) &&
281 ((event->time_unavail <= 0) || 287 ((event->time_unavail <= 0) ||
282 (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) 288 (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000)))
283 { 289 goto hard_fail;
284 perm_failure = 1;
285 }
286
287 /* Log the error with the rtas logger. */
288 if (perm_failure) {
289 /*
290 * About 90% of all real-life EEH failures in the field
291 * are due to poorly seated PCI cards. Only 10% or so are
292 * due to actual, failed cards.
293 */
294 printk(KERN_ERR
295 "EEH: PCI device %s - %s has failed %d times \n"
296 "and has been permanently disabled. Please try reseating\n"
297 "this device or replacing it.\n",
298 pci_name (frozen_pdn->pcidev),
299 pcid_name(frozen_pdn->pcidev),
300 frozen_pdn->eeh_freeze_count);
301
302 eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */);
303
304 /* Notify all devices that they're about to go down. */
305 pci_walk_bus(frozen_bus, eeh_report_failure, 0);
306
307 /* Shut down the device drivers for good. */
308 pcibios_remove_pci_devices(frozen_bus);
309 return;
310 }
311 290
312 eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); 291 eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */);
313 printk(KERN_WARNING 292 printk(KERN_WARNING
@@ -330,24 +309,54 @@ void handle_eeh_events (struct eeh_event *event)
330 * go down willingly, without panicing the system. 309 * go down willingly, without panicing the system.
331 */ 310 */
332 if (result == PCIERR_RESULT_NONE) { 311 if (result == PCIERR_RESULT_NONE) {
333 eeh_reset_device(frozen_pdn, frozen_bus); 312 rc = eeh_reset_device(frozen_pdn, frozen_bus);
313 if (rc)
314 goto hard_fail;
334 } 315 }
335 316
336 /* If any device called out for a reset, then reset the slot */ 317 /* If any device called out for a reset, then reset the slot */
337 if (result == PCIERR_RESULT_NEED_RESET) { 318 if (result == PCIERR_RESULT_NEED_RESET) {
338 eeh_reset_device(frozen_pdn, NULL); 319 rc = eeh_reset_device(frozen_pdn, NULL);
320 if (rc)
321 goto hard_fail;
339 pci_walk_bus(frozen_bus, eeh_report_reset, 0); 322 pci_walk_bus(frozen_bus, eeh_report_reset, 0);
340 } 323 }
341 324
342 /* If all devices reported they can proceed, the re-enable PIO */ 325 /* If all devices reported they can proceed, the re-enable PIO */
343 if (result == PCIERR_RESULT_CAN_RECOVER) { 326 if (result == PCIERR_RESULT_CAN_RECOVER) {
344 /* XXX Not supported; we brute-force reset the device */ 327 /* XXX Not supported; we brute-force reset the device */
345 eeh_reset_device(frozen_pdn, NULL); 328 rc = eeh_reset_device(frozen_pdn, NULL);
329 if (rc)
330 goto hard_fail;
346 pci_walk_bus(frozen_bus, eeh_report_reset, 0); 331 pci_walk_bus(frozen_bus, eeh_report_reset, 0);
347 } 332 }
348 333
349 /* Tell all device drivers that they can resume operations */ 334 /* Tell all device drivers that they can resume operations */
350 pci_walk_bus(frozen_bus, eeh_report_resume, 0); 335 pci_walk_bus(frozen_bus, eeh_report_resume, 0);
336
337 return;
338
339hard_fail:
340 /*
341 * About 90% of all real-life EEH failures in the field
342 * are due to poorly seated PCI cards. Only 10% or so are
343 * due to actual, failed cards.
344 */
345 printk(KERN_ERR
346 "EEH: PCI device %s - %s has failed %d times \n"
347 "and has been permanently disabled. Please try reseating\n"
348 "this device or replacing it.\n",
349 pci_name (frozen_pdn->pcidev),
350 pcid_name(frozen_pdn->pcidev),
351 frozen_pdn->eeh_freeze_count);
352
353 eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */);
354
355 /* Notify all devices that they're about to go down. */
356 pci_walk_bus(frozen_bus, eeh_report_failure, 0);
357
358 /* Shut down the device drivers for good. */
359 pcibios_remove_pci_devices(frozen_bus);
351} 360}
352 361
353/* ---------- end of file ---------- */ 362/* ---------- end of file ---------- */
diff --git a/include/asm-powerpc/ppc-pci.h b/include/asm-powerpc/ppc-pci.h
index 4820b368bf15..1a2db61694f2 100644
--- a/include/asm-powerpc/ppc-pci.h
+++ b/include/asm-powerpc/ppc-pci.h
@@ -76,8 +76,10 @@ void eeh_slot_error_detail (struct pci_dn *pdn, int severity);
76 * does this by asserting the PCI #RST line for 1/8th of 76 * does this by asserting the PCI #RST line for 1/8th of
77 * a second; this routine will sleep while the adapter is 77 * a second; this routine will sleep while the adapter is
78 * being reset. 78 * being reset.
79 *
80 * Returns a non-zero value if the reset failed.
79 */ 81 */
80void rtas_set_slot_reset (struct pci_dn *); 82int rtas_set_slot_reset (struct pci_dn *);
81 83
82/** 84/**
83 * eeh_restore_bars - Restore device configuration info. 85 * eeh_restore_bars - Restore device configuration info.