aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
author: Linas Vepstas <linas@linas.org> 2005-11-03 19:54:54 -0500
committer: Paul Mackerras <paulus@samba.org> 2006-01-09 23:30:14 -0500
commit: b6495c0c8f100b882d85774f44529519befefba9 (patch)
tree: ec38027b7e7e50ffbe843a5333fbf95456bc1cf5 /arch
parent: 21e464dd7c943c984dcccd9aff8c9f6a5ea920d7 (diff)
[PATCH] powerpc: Don't continue with PCI Error recovery if slot reset failed.
238-eeh-stop-if-reset_failed.patch

If the firmware is unable to reset the PCI slot for some reason, then don't attempt any further recovery steps after that point. Instead, mark the device as permanently failed.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
(cherry picked from e06b942521eb2cdaf232726f45a820d5837acb12 commit)
Diffstat (limited to 'arch')
-rw-r--r-- arch/powerpc/platforms/pseries/eeh.c | 24
-rw-r--r-- arch/powerpc/platforms/pseries/eeh_driver.c | 81
2 files changed, 66 insertions, 39 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index d8d24502970f..b0fa76d0c78a 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -450,11 +450,16 @@ eeh_slot_availability(struct pci_dn *pdn)
450 if (rc) return rc; 450 if (rc) return rc;
451 451
452 if (rets[1] == 0) return -1; /* EEH is not supported */ 452 if (rets[1] == 0) return -1; /* EEH is not supported */
453 if (rets[0] == 0) return 0; /* Oll Korrect */ 453 if (rets[0] == 0) return 0; /* Oll Korrect */
454 if (rets[0] == 5) { 454 if (rets[0] == 5) {
455 if (rets[2] == 0) return -1; /* permanently unavailable */ 455 if (rets[2] == 0) return -1; /* permanently unavailable */
456 return rets[2]; /* number of millisecs to wait */ 456 return rets[2]; /* number of millisecs to wait */
457 } 457 }
458 if (rets[0] == 1)
459 return 250;
460
461 printk (KERN_ERR "EEH: Slot unavailable: rc=%d, rets=%d %d %d\n",
462 rc, rets[0], rets[1], rets[2]);
458 return -1; 463 return -1;
459} 464}
460 465
@@ -501,9 +506,11 @@ rtas_pci_slot_reset(struct pci_dn *pdn, int state)
501 506
502/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second 507/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
503 * dn -- device node to be reset. 508 * dn -- device node to be reset.
509 *
510 * Return 0 if success, else a non-zero value.
504 */ 511 */
505 512
506void 513int
507rtas_set_slot_reset(struct pci_dn *pdn) 514rtas_set_slot_reset(struct pci_dn *pdn)
508{ 515{
509 int i, rc; 516 int i, rc;
@@ -533,10 +540,21 @@ rtas_set_slot_reset(struct pci_dn *pdn)
533 * ready to be used; if not, wait for recovery. */ 540 * ready to be used; if not, wait for recovery. */
534 for (i=0; i<10; i++) { 541 for (i=0; i<10; i++) {
535 rc = eeh_slot_availability (pdn); 542 rc = eeh_slot_availability (pdn);
536 if (rc <= 0) break; 543 if (rc < 0)
544 printk (KERN_ERR "EEH: failed (%d) to reset slot %s\n", rc, pdn->node->full_name);
545 if (rc == 0)
546 return 0;
547 if (rc < 0)
548 return -1;
537 549
538 msleep (rc+100); 550 msleep (rc+100);
539 } 551 }
552
553 rc = eeh_slot_availability (pdn);
554 if (rc)
555 printk (KERN_ERR "EEH: timeout resetting slot %s\n", pdn->node->full_name);
556
557 return rc;
540} 558}
541 559
542/* ------------------------------------------------------- */ 560/* ------------------------------------------------------- */
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 199a3ce54786..242b2923360d 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -200,14 +200,18 @@ static void eeh_report_failure(struct pci_dev *dev, void *userdata)
200 * bus resets can be performed. 200 * bus resets can be performed.
201 */ 201 */
202 202
203static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) 203static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
204{ 204{
205 int rc;
205 if (bus) 206 if (bus)
206 pcibios_remove_pci_devices(bus); 207 pcibios_remove_pci_devices(bus);
207 208
208 /* Reset the pci controller. (Asserts RST#; resets config space). 209 /* Reset the pci controller. (Asserts RST#; resets config space).
209 * Reconfigure bridges and devices */ 210 * Reconfigure bridges and devices. Don't try to bring the system
210 rtas_set_slot_reset(pe_dn); 211 * up if the reset failed for some reason. */
212 rc = rtas_set_slot_reset(pe_dn);
213 if (rc)
214 return rc;
211 215
212 /* Walk over all functions on this device */ 216 /* Walk over all functions on this device */
213 rtas_configure_bridge(pe_dn); 217 rtas_configure_bridge(pe_dn);
@@ -223,6 +227,8 @@ static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
223 ssleep (5); 227 ssleep (5);
224 pcibios_add_pci_devices(bus); 228 pcibios_add_pci_devices(bus);
225 } 229 }
230
231 return 0;
226} 232}
227 233
228/* The longest amount of time to wait for a pci device 234/* The longest amount of time to wait for a pci device
@@ -235,7 +241,7 @@ void handle_eeh_events (struct eeh_event *event)
235 struct device_node *frozen_dn; 241 struct device_node *frozen_dn;
236 struct pci_dn *frozen_pdn; 242 struct pci_dn *frozen_pdn;
237 struct pci_bus *frozen_bus; 243 struct pci_bus *frozen_bus;
238 int perm_failure = 0; 244 int rc = 0;
239 245
240 frozen_dn = find_device_pe(event->dn); 246 frozen_dn = find_device_pe(event->dn);
241 frozen_bus = pcibios_find_pci_bus(frozen_dn); 247 frozen_bus = pcibios_find_pci_bus(frozen_dn);
@@ -272,7 +278,7 @@ void handle_eeh_events (struct eeh_event *event)
272 frozen_pdn->eeh_freeze_count++; 278 frozen_pdn->eeh_freeze_count++;
273 279
274 if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) 280 if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES)
275 perm_failure = 1; 281 goto hard_fail;
276 282
277 /* If the reset state is a '5' and the time to reset is 0 (infinity) 283 /* If the reset state is a '5' and the time to reset is 0 (infinity)
278 * or is more than 15 seconds, then mark this as a permanent failure. 284 * or is more than 15 seconds, then mark this as a permanent failure.
@@ -280,34 +286,7 @@ void handle_eeh_events (struct eeh_event *event)
280 if ((event->state == pci_channel_io_perm_failure) && 286 if ((event->state == pci_channel_io_perm_failure) &&
281 ((event->time_unavail <= 0) || 287 ((event->time_unavail <= 0) ||
282 (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) 288 (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000)))
283 { 289 goto hard_fail;
284 perm_failure = 1;
285 }
286
287 /* Log the error with the rtas logger. */
288 if (perm_failure) {
289 /*
290 * About 90% of all real-life EEH failures in the field
291 * are due to poorly seated PCI cards. Only 10% or so are
292 * due to actual, failed cards.
293 */
294 printk(KERN_ERR
295 "EEH: PCI device %s - %s has failed %d times \n"
296 "and has been permanently disabled. Please try reseating\n"
297 "this device or replacing it.\n",
298 pci_name (frozen_pdn->pcidev),
299 pcid_name(frozen_pdn->pcidev),
300 frozen_pdn->eeh_freeze_count);
301
302 eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */);
303
304 /* Notify all devices that they're about to go down. */
305 pci_walk_bus(frozen_bus, eeh_report_failure, 0);
306
307 /* Shut down the device drivers for good. */
308 pcibios_remove_pci_devices(frozen_bus);
309 return;
310 }
311 290
312 eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); 291 eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */);
313 printk(KERN_WARNING 292 printk(KERN_WARNING
@@ -330,24 +309,54 @@ void handle_eeh_events (struct eeh_event *event)
330 * go down willingly, without panicking the system. 309 * go down willingly, without panicking the system.
331 */ 310 */
332 if (result == PCIERR_RESULT_NONE) { 311 if (result == PCIERR_RESULT_NONE) {
333 eeh_reset_device(frozen_pdn, frozen_bus); 312 rc = eeh_reset_device(frozen_pdn, frozen_bus);
313 if (rc)
314 goto hard_fail;
334 } 315 }
335 316
336 /* If any device called out for a reset, then reset the slot */ 317 /* If any device called out for a reset, then reset the slot */
337 if (result == PCIERR_RESULT_NEED_RESET) { 318 if (result == PCIERR_RESULT_NEED_RESET) {
338 eeh_reset_device(frozen_pdn, NULL); 319 rc = eeh_reset_device(frozen_pdn, NULL);
320 if (rc)
321 goto hard_fail;
339 pci_walk_bus(frozen_bus, eeh_report_reset, 0); 322 pci_walk_bus(frozen_bus, eeh_report_reset, 0);
340 } 323 }
341 324
342 /* If all devices reported they can proceed, then re-enable PIO */ 325 /* If all devices reported they can proceed, then re-enable PIO */
343 if (result == PCIERR_RESULT_CAN_RECOVER) { 326 if (result == PCIERR_RESULT_CAN_RECOVER) {
344 /* XXX Not supported; we brute-force reset the device */ 327 /* XXX Not supported; we brute-force reset the device */
345 eeh_reset_device(frozen_pdn, NULL); 328 rc = eeh_reset_device(frozen_pdn, NULL);
329 if (rc)
330 goto hard_fail;
346 pci_walk_bus(frozen_bus, eeh_report_reset, 0); 331 pci_walk_bus(frozen_bus, eeh_report_reset, 0);
347 } 332 }
348 333
349 /* Tell all device drivers that they can resume operations */ 334 /* Tell all device drivers that they can resume operations */
350 pci_walk_bus(frozen_bus, eeh_report_resume, 0); 335 pci_walk_bus(frozen_bus, eeh_report_resume, 0);
336
337 return;
338
339hard_fail:
340 /*
341 * About 90% of all real-life EEH failures in the field
342 * are due to poorly seated PCI cards. Only 10% or so are
343 * due to actual, failed cards.
344 */
345 printk(KERN_ERR
346 "EEH: PCI device %s - %s has failed %d times \n"
347 "and has been permanently disabled. Please try reseating\n"
348 "this device or replacing it.\n",
349 pci_name (frozen_pdn->pcidev),
350 pcid_name(frozen_pdn->pcidev),
351 frozen_pdn->eeh_freeze_count);
352
353 eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */);
354
355 /* Notify all devices that they're about to go down. */
356 pci_walk_bus(frozen_bus, eeh_report_failure, 0);
357
358 /* Shut down the device drivers for good. */
359 pcibios_remove_pci_devices(frozen_bus);
351} 360}
352 361
353/* ---------- end of file ---------- */ 362/* ---------- end of file ---------- */