diff options
author | Linas Vepstas <linas@linas.org> | 2005-11-03 19:54:54 -0500 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2006-01-09 23:30:14 -0500 |
commit | b6495c0c8f100b882d85774f44529519befefba9 (patch) | |
tree | ec38027b7e7e50ffbe843a5333fbf95456bc1cf5 /arch | |
parent | 21e464dd7c943c984dcccd9aff8c9f6a5ea920d7 (diff) |
[PATCH] powerpc: Don't continue with PCI Error recovery if slot reset failed.
238-eeh-stop-if-reset_failed.patch
If the firmware is unable to reset the PCI slot for some reason, then
don't attempt any further recovery steps after that point. Instead,
mark the device as permanently failed.
Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
(cherry picked from e06b942521eb2cdaf232726f45a820d5837acb12 commit)
Diffstat (limited to 'arch')
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh.c | 24 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_driver.c | 81 |
2 files changed, 66 insertions, 39 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index d8d24502970f..b0fa76d0c78a 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c | |||
@@ -450,11 +450,16 @@ eeh_slot_availability(struct pci_dn *pdn) | |||
450 | if (rc) return rc; | 450 | if (rc) return rc; |
451 | 451 | ||
452 | if (rets[1] == 0) return -1; /* EEH is not supported */ | 452 | if (rets[1] == 0) return -1; /* EEH is not supported */ |
453 | if (rets[0] == 0) return 0; /* Oll Korrect */ | 453 | if (rets[0] == 0) return 0; /* Oll Korrect */ |
454 | if (rets[0] == 5) { | 454 | if (rets[0] == 5) { |
455 | if (rets[2] == 0) return -1; /* permanently unavailable */ | 455 | if (rets[2] == 0) return -1; /* permanently unavailable */ |
456 | return rets[2]; /* number of millisecs to wait */ | 456 | return rets[2]; /* number of millisecs to wait */ |
457 | } | 457 | } |
458 | if (rets[0] == 1) | ||
459 | return 250; | ||
460 | |||
461 | printk (KERN_ERR "EEH: Slot unavailable: rc=%d, rets=%d %d %d\n", | ||
462 | rc, rets[0], rets[1], rets[2]); | ||
458 | return -1; | 463 | return -1; |
459 | } | 464 | } |
460 | 465 | ||
@@ -501,9 +506,11 @@ rtas_pci_slot_reset(struct pci_dn *pdn, int state) | |||
501 | 506 | ||
502 | /** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second | 507 | /** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second |
503 | * dn -- device node to be reset. | 508 | * dn -- device node to be reset. |
509 | * | ||
510 | * Return 0 if success, else a non-zero value. | ||
504 | */ | 511 | */ |
505 | 512 | ||
506 | void | 513 | int |
507 | rtas_set_slot_reset(struct pci_dn *pdn) | 514 | rtas_set_slot_reset(struct pci_dn *pdn) |
508 | { | 515 | { |
509 | int i, rc; | 516 | int i, rc; |
@@ -533,10 +540,21 @@ rtas_set_slot_reset(struct pci_dn *pdn) | |||
533 | * ready to be used; if not, wait for recovery. */ | 540 | * ready to be used; if not, wait for recovery. */ |
534 | for (i=0; i<10; i++) { | 541 | for (i=0; i<10; i++) { |
535 | rc = eeh_slot_availability (pdn); | 542 | rc = eeh_slot_availability (pdn); |
536 | if (rc <= 0) break; | 543 | if (rc < 0) |
544 | printk (KERN_ERR "EEH: failed (%d) to reset slot %s\n", rc, pdn->node->full_name); | ||
545 | if (rc == 0) | ||
546 | return 0; | ||
547 | if (rc < 0) | ||
548 | return -1; | ||
537 | 549 | ||
538 | msleep (rc+100); | 550 | msleep (rc+100); |
539 | } | 551 | } |
552 | |||
553 | rc = eeh_slot_availability (pdn); | ||
554 | if (rc) | ||
555 | printk (KERN_ERR "EEH: timeout resetting slot %s\n", pdn->node->full_name); | ||
556 | |||
557 | return rc; | ||
540 | } | 558 | } |
541 | 559 | ||
542 | /* ------------------------------------------------------- */ | 560 | /* ------------------------------------------------------- */ |
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 199a3ce54786..242b2923360d 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c | |||
@@ -200,14 +200,18 @@ static void eeh_report_failure(struct pci_dev *dev, void *userdata) | |||
200 | * bus resets can be performed. | 200 | * bus resets can be performed. |
201 | */ | 201 | */ |
202 | 202 | ||
203 | static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) | 203 | static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) |
204 | { | 204 | { |
205 | int rc; | ||
205 | if (bus) | 206 | if (bus) |
206 | pcibios_remove_pci_devices(bus); | 207 | pcibios_remove_pci_devices(bus); |
207 | 208 | ||
208 | /* Reset the pci controller. (Asserts RST#; resets config space). | 209 | /* Reset the pci controller. (Asserts RST#; resets config space). |
209 | * Reconfigure bridges and devices */ | 210 | * Reconfigure bridges and devices. Don't try to bring the system |
210 | rtas_set_slot_reset(pe_dn); | 211 | * up if the reset failed for some reason. */ |
212 | rc = rtas_set_slot_reset(pe_dn); | ||
213 | if (rc) | ||
214 | return rc; | ||
211 | 215 | ||
212 | /* Walk over all functions on this device */ | 216 | /* Walk over all functions on this device */ |
213 | rtas_configure_bridge(pe_dn); | 217 | rtas_configure_bridge(pe_dn); |
@@ -223,6 +227,8 @@ static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) | |||
223 | ssleep (5); | 227 | ssleep (5); |
224 | pcibios_add_pci_devices(bus); | 228 | pcibios_add_pci_devices(bus); |
225 | } | 229 | } |
230 | |||
231 | return 0; | ||
226 | } | 232 | } |
227 | 233 | ||
228 | /* The longest amount of time to wait for a pci device | 234 | /* The longest amount of time to wait for a pci device |
@@ -235,7 +241,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
235 | struct device_node *frozen_dn; | 241 | struct device_node *frozen_dn; |
236 | struct pci_dn *frozen_pdn; | 242 | struct pci_dn *frozen_pdn; |
237 | struct pci_bus *frozen_bus; | 243 | struct pci_bus *frozen_bus; |
238 | int perm_failure = 0; | 244 | int rc = 0; |
239 | 245 | ||
240 | frozen_dn = find_device_pe(event->dn); | 246 | frozen_dn = find_device_pe(event->dn); |
241 | frozen_bus = pcibios_find_pci_bus(frozen_dn); | 247 | frozen_bus = pcibios_find_pci_bus(frozen_dn); |
@@ -272,7 +278,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
272 | frozen_pdn->eeh_freeze_count++; | 278 | frozen_pdn->eeh_freeze_count++; |
273 | 279 | ||
274 | if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) | 280 | if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) |
275 | perm_failure = 1; | 281 | goto hard_fail; |
276 | 282 | ||
277 | /* If the reset state is a '5' and the time to reset is 0 (infinity) | 283 | /* If the reset state is a '5' and the time to reset is 0 (infinity) |
278 | * or is more than 15 seconds, then mark this as a permanent failure. | 284 | * or is more than 15 seconds, then mark this as a permanent failure. |
@@ -280,34 +286,7 @@ void handle_eeh_events (struct eeh_event *event) | |||
280 | if ((event->state == pci_channel_io_perm_failure) && | 286 | if ((event->state == pci_channel_io_perm_failure) && |
281 | ((event->time_unavail <= 0) || | 287 | ((event->time_unavail <= 0) || |
282 | (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) | 288 | (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) |
283 | { | 289 | goto hard_fail; |
284 | perm_failure = 1; | ||
285 | } | ||
286 | |||
287 | /* Log the error with the rtas logger. */ | ||
288 | if (perm_failure) { | ||
289 | /* | ||
290 | * About 90% of all real-life EEH failures in the field | ||
291 | * are due to poorly seated PCI cards. Only 10% or so are | ||
292 | * due to actual, failed cards. | ||
293 | */ | ||
294 | printk(KERN_ERR | ||
295 | "EEH: PCI device %s - %s has failed %d times \n" | ||
296 | "and has been permanently disabled. Please try reseating\n" | ||
297 | "this device or replacing it.\n", | ||
298 | pci_name (frozen_pdn->pcidev), | ||
299 | pcid_name(frozen_pdn->pcidev), | ||
300 | frozen_pdn->eeh_freeze_count); | ||
301 | |||
302 | eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); | ||
303 | |||
304 | /* Notify all devices that they're about to go down. */ | ||
305 | pci_walk_bus(frozen_bus, eeh_report_failure, 0); | ||
306 | |||
307 | /* Shut down the device drivers for good. */ | ||
308 | pcibios_remove_pci_devices(frozen_bus); | ||
309 | return; | ||
310 | } | ||
311 | 290 | ||
312 | eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); | 291 | eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); |
313 | printk(KERN_WARNING | 292 | printk(KERN_WARNING |
@@ -330,24 +309,54 @@ void handle_eeh_events (struct eeh_event *event) | |||
330 | * go down willingly, without panicking the system. | 309 | * go down willingly, without panicking the system. |
331 | */ | 310 | */ |
332 | if (result == PCIERR_RESULT_NONE) { | 311 | if (result == PCIERR_RESULT_NONE) { |
333 | eeh_reset_device(frozen_pdn, frozen_bus); | 312 | rc = eeh_reset_device(frozen_pdn, frozen_bus); |
313 | if (rc) | ||
314 | goto hard_fail; | ||
334 | } | 315 | } |
335 | 316 | ||
336 | /* If any device called out for a reset, then reset the slot */ | 317 | /* If any device called out for a reset, then reset the slot */ |
337 | if (result == PCIERR_RESULT_NEED_RESET) { | 318 | if (result == PCIERR_RESULT_NEED_RESET) { |
338 | eeh_reset_device(frozen_pdn, NULL); | 319 | rc = eeh_reset_device(frozen_pdn, NULL); |
320 | if (rc) | ||
321 | goto hard_fail; | ||
339 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); | 322 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); |
340 | } | 323 | } |
341 | 324 | ||
342 | /* If all devices reported they can proceed, then re-enable PIO */ | 325 | /* If all devices reported they can proceed, then re-enable PIO */ |
343 | if (result == PCIERR_RESULT_CAN_RECOVER) { | 326 | if (result == PCIERR_RESULT_CAN_RECOVER) { |
344 | /* XXX Not supported; we brute-force reset the device */ | 327 | /* XXX Not supported; we brute-force reset the device */ |
345 | eeh_reset_device(frozen_pdn, NULL); | 328 | rc = eeh_reset_device(frozen_pdn, NULL); |
329 | if (rc) | ||
330 | goto hard_fail; | ||
346 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); | 331 | pci_walk_bus(frozen_bus, eeh_report_reset, 0); |
347 | } | 332 | } |
348 | 333 | ||
349 | /* Tell all device drivers that they can resume operations */ | 334 | /* Tell all device drivers that they can resume operations */ |
350 | pci_walk_bus(frozen_bus, eeh_report_resume, 0); | 335 | pci_walk_bus(frozen_bus, eeh_report_resume, 0); |
336 | |||
337 | return; | ||
338 | |||
339 | hard_fail: | ||
340 | /* | ||
341 | * About 90% of all real-life EEH failures in the field | ||
342 | * are due to poorly seated PCI cards. Only 10% or so are | ||
343 | * due to actual, failed cards. | ||
344 | */ | ||
345 | printk(KERN_ERR | ||
346 | "EEH: PCI device %s - %s has failed %d times \n" | ||
347 | "and has been permanently disabled. Please try reseating\n" | ||
348 | "this device or replacing it.\n", | ||
349 | pci_name (frozen_pdn->pcidev), | ||
350 | pcid_name(frozen_pdn->pcidev), | ||
351 | frozen_pdn->eeh_freeze_count); | ||
352 | |||
353 | eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); | ||
354 | |||
355 | /* Notify all devices that they're about to go down. */ | ||
356 | pci_walk_bus(frozen_bus, eeh_report_failure, 0); | ||
357 | |||
358 | /* Shut down the device drivers for good. */ | ||
359 | pcibios_remove_pci_devices(frozen_bus); | ||
351 | } | 360 | } |
352 | 361 | ||
353 | /* ---------- end of file ---------- */ | 362 | /* ---------- end of file ---------- */ |