diff options
author | Keith Busch <keith.busch@intel.com> | 2018-09-20 12:27:12 -0400 |
---|---|---|
committer | Bjorn Helgaas <bhelgaas@google.com> | 2018-09-26 15:23:14 -0400 |
commit | bdb5ac85777de67c909c9ad4327f03f7648b543f (patch) | |
tree | 06ffd1c0d73efa4807579a2a5b9bc04da8fcca1b | |
parent | c4eed62a214330908eec11b0dc170d34fa50b412 (diff) |
PCI/ERR: Handle fatal error recovery
We don't need to be paranoid about the topology changing while handling an
error. If the device has changed in a hotplug-capable slot, we can rely on
the presence detection handling to react to a changing topology.
Restore the fatal error handling behavior that existed before DPC was merged
with AER by commit 7e9084b36740 ("PCI/AER: Handle ERR_FATAL with removal and
re-enumeration of devices").
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Sinan Kaya <okaya@kernel.org>
-rw-r--r-- | Documentation/PCI/pci-error-recovery.txt | 35 | ||||
-rw-r--r-- | drivers/pci/pci.h | 4 | ||||
-rw-r--r-- | drivers/pci/pcie/aer.c | 12 | ||||
-rw-r--r-- | drivers/pci/pcie/dpc.c | 4 | ||||
-rw-r--r-- | drivers/pci/pcie/err.c | 75 |
5 files changed, 28 insertions, 102 deletions
diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt index 688b69121e82..0b6bb3ef449e 100644 --- a/Documentation/PCI/pci-error-recovery.txt +++ b/Documentation/PCI/pci-error-recovery.txt | |||
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI error | |||
110 | event will be platform-dependent, but will follow the general | 110 | event will be platform-dependent, but will follow the general |
111 | sequence described below. | 111 | sequence described below. |
112 | 112 | ||
113 | STEP 0: Error Event: ERR_NONFATAL | 113 | STEP 0: Error Event |
114 | ------------------- | 114 | ------------------- |
115 | A PCI bus error is detected by the PCI hardware. On powerpc, the slot | 115 | A PCI bus error is detected by the PCI hardware. On powerpc, the slot |
116 | is isolated, in that all I/O is blocked: all reads return 0xffffffff, | 116 | is isolated, in that all I/O is blocked: all reads return 0xffffffff, |
@@ -228,7 +228,13 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations). | |||
228 | If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform | 228 | If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform |
229 | proceeds to STEP 4 (Slot Reset) | 229 | proceeds to STEP 4 (Slot Reset) |
230 | 230 | ||
231 | STEP 3: Slot Reset | 231 | STEP 3: Link Reset |
232 | ------------------ | ||
233 | The platform resets the link. This is a PCI-Express specific step | ||
234 | and is done whenever a fatal error has been detected that can be | ||
235 | "solved" by resetting the link. | ||
236 | |||
237 | STEP 4: Slot Reset | ||
232 | ------------------ | 238 | ------------------ |
233 | 239 | ||
234 | In response to a return value of PCI_ERS_RESULT_NEED_RESET, the | 240 | In response to a return value of PCI_ERS_RESULT_NEED_RESET, the |
@@ -314,7 +320,7 @@ Failure). | |||
314 | >>> However, it probably should. | 320 | >>> However, it probably should. |
315 | 321 | ||
316 | 322 | ||
317 | STEP 4: Resume Operations | 323 | STEP 5: Resume Operations |
318 | ------------------------- | 324 | ------------------------- |
319 | The platform will call the resume() callback on all affected device | 325 | The platform will call the resume() callback on all affected device |
320 | drivers if all drivers on the segment have returned | 326 | drivers if all drivers on the segment have returned |
@@ -326,7 +332,7 @@ a result code. | |||
326 | At this point, if a new error happens, the platform will restart | 332 | At this point, if a new error happens, the platform will restart |
327 | a new error recovery sequence. | 333 | a new error recovery sequence. |
328 | 334 | ||
329 | STEP 5: Permanent Failure | 335 | STEP 6: Permanent Failure |
330 | ------------------------- | 336 | ------------------------- |
331 | A "permanent failure" has occurred, and the platform cannot recover | 337 | A "permanent failure" has occurred, and the platform cannot recover |
332 | the device. The platform will call error_detected() with a | 338 | the device. The platform will call error_detected() with a |
@@ -349,27 +355,6 @@ errors. See the discussion in powerpc/eeh-pci-error-recovery.txt | |||
349 | for additional detail on real-life experience of the causes of | 355 | for additional detail on real-life experience of the causes of |
350 | software errors. | 356 | software errors. |
351 | 357 | ||
352 | STEP 0: Error Event: ERR_FATAL | ||
353 | ------------------- | ||
354 | PCI bus error is detected by the PCI hardware. On powerpc, the slot is | ||
355 | isolated, in that all I/O is blocked: all reads return 0xffffffff, all | ||
356 | writes are ignored. | ||
357 | |||
358 | STEP 1: Remove devices | ||
359 | -------------------- | ||
360 | Platform removes the devices depending on the error agent, it could be | ||
361 | this port for all subordinates or upstream component (likely downstream | ||
362 | port) | ||
363 | |||
364 | STEP 2: Reset link | ||
365 | -------------------- | ||
366 | The platform resets the link. This is a PCI-Express specific step and is | ||
367 | done whenever a fatal error has been detected that can be "solved" by | ||
368 | resetting the link. | ||
369 | |||
370 | STEP 3: Re-enumerate the devices | ||
371 | -------------------- | ||
372 | Initiates the re-enumeration. | ||
373 | 358 | ||
374 | Conclusion; General Remarks | 359 | Conclusion; General Remarks |
375 | --------------------------- | 360 | --------------------------- |
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b4ada8c383a8..9b279805489f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h | |||
@@ -433,8 +433,8 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) | |||
433 | #endif | 433 | #endif |
434 | 434 | ||
435 | /* PCI error reporting and recovery */ | 435 | /* PCI error reporting and recovery */ |
436 | void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service); | 436 | void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, |
437 | void pcie_do_nonfatal_recovery(struct pci_dev *dev); | 437 | u32 service); |
438 | 438 | ||
439 | bool pcie_wait_for_link(struct pci_dev *pdev, bool active); | 439 | bool pcie_wait_for_link(struct pci_dev *pdev, bool active); |
440 | #ifdef CONFIG_PCIEASPM | 440 | #ifdef CONFIG_PCIEASPM |
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 1563e22600ec..0619ec5d7bb5 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c | |||
@@ -1010,9 +1010,11 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) | |||
1010 | info->status); | 1010 | info->status); |
1011 | pci_aer_clear_device_status(dev); | 1011 | pci_aer_clear_device_status(dev); |
1012 | } else if (info->severity == AER_NONFATAL) | 1012 | } else if (info->severity == AER_NONFATAL) |
1013 | pcie_do_nonfatal_recovery(dev); | 1013 | pcie_do_recovery(dev, pci_channel_io_normal, |
1014 | PCIE_PORT_SERVICE_AER); | ||
1014 | else if (info->severity == AER_FATAL) | 1015 | else if (info->severity == AER_FATAL) |
1015 | pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER); | 1016 | pcie_do_recovery(dev, pci_channel_io_frozen, |
1017 | PCIE_PORT_SERVICE_AER); | ||
1016 | pci_dev_put(dev); | 1018 | pci_dev_put(dev); |
1017 | } | 1019 | } |
1018 | 1020 | ||
@@ -1048,9 +1050,11 @@ static void aer_recover_work_func(struct work_struct *work) | |||
1048 | } | 1050 | } |
1049 | cper_print_aer(pdev, entry.severity, entry.regs); | 1051 | cper_print_aer(pdev, entry.severity, entry.regs); |
1050 | if (entry.severity == AER_NONFATAL) | 1052 | if (entry.severity == AER_NONFATAL) |
1051 | pcie_do_nonfatal_recovery(pdev); | 1053 | pcie_do_recovery(pdev, pci_channel_io_normal, |
1054 | PCIE_PORT_SERVICE_AER); | ||
1052 | else if (entry.severity == AER_FATAL) | 1055 | else if (entry.severity == AER_FATAL) |
1053 | pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER); | 1056 | pcie_do_recovery(pdev, pci_channel_io_frozen, |
1057 | PCIE_PORT_SERVICE_AER); | ||
1054 | pci_dev_put(pdev); | 1058 | pci_dev_put(pdev); |
1055 | } | 1059 | } |
1056 | } | 1060 | } |
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index ed815a28512e..23e063aefddf 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c | |||
@@ -216,7 +216,7 @@ static irqreturn_t dpc_handler(int irq, void *context) | |||
216 | 216 | ||
217 | reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1; | 217 | reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1; |
218 | ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5; | 218 | ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5; |
219 | dev_warn(dev, "DPC %s detected, remove downstream devices\n", | 219 | dev_warn(dev, "DPC %s detected\n", |
220 | (reason == 0) ? "unmasked uncorrectable error" : | 220 | (reason == 0) ? "unmasked uncorrectable error" : |
221 | (reason == 1) ? "ERR_NONFATAL" : | 221 | (reason == 1) ? "ERR_NONFATAL" : |
222 | (reason == 2) ? "ERR_FATAL" : | 222 | (reason == 2) ? "ERR_FATAL" : |
@@ -233,7 +233,7 @@ static irqreturn_t dpc_handler(int irq, void *context) | |||
233 | } | 233 | } |
234 | 234 | ||
235 | /* We configure DPC so it only triggers on ERR_FATAL */ | 235 | /* We configure DPC so it only triggers on ERR_FATAL */ |
236 | pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC); | 236 | pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC); |
237 | 237 | ||
238 | return IRQ_HANDLED; | 238 | return IRQ_HANDLED; |
239 | } | 239 | } |
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 62ab665f0f03..644f3f725ef0 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c | |||
@@ -271,83 +271,20 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, | |||
271 | return result_data.result; | 271 | return result_data.result; |
272 | } | 272 | } |
273 | 273 | ||
274 | /** | 274 | void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, |
275 | * pcie_do_fatal_recovery - handle fatal error recovery process | 275 | u32 service) |
276 | * @dev: pointer to a pci_dev data structure of agent detecting an error | ||
277 | * | ||
278 | * Invoked when an error is fatal. Once being invoked, removes the devices | ||
279 | * beneath this AER agent, followed by reset link e.g. secondary bus reset | ||
280 | * followed by re-enumeration of devices. | ||
281 | */ | ||
282 | void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service) | ||
283 | { | ||
284 | struct pci_dev *udev; | ||
285 | struct pci_bus *parent; | ||
286 | struct pci_dev *pdev, *temp; | ||
287 | pci_ers_result_t result; | ||
288 | |||
289 | if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) | ||
290 | udev = dev; | ||
291 | else | ||
292 | udev = dev->bus->self; | ||
293 | |||
294 | parent = udev->subordinate; | ||
295 | pci_walk_bus(parent, pci_dev_set_disconnected, NULL); | ||
296 | |||
297 | pci_lock_rescan_remove(); | ||
298 | pci_dev_get(dev); | ||
299 | list_for_each_entry_safe_reverse(pdev, temp, &parent->devices, | ||
300 | bus_list) { | ||
301 | pci_stop_and_remove_bus_device(pdev); | ||
302 | } | ||
303 | |||
304 | result = reset_link(udev, service); | ||
305 | |||
306 | if ((service == PCIE_PORT_SERVICE_AER) && | ||
307 | (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) { | ||
308 | /* | ||
309 | * If the error is reported by a bridge, we think this error | ||
310 | * is related to the downstream link of the bridge, so we | ||
311 | * do error recovery on all subordinates of the bridge instead | ||
312 | * of the bridge and clear the error status of the bridge. | ||
313 | */ | ||
314 | pci_aer_clear_fatal_status(dev); | ||
315 | pci_aer_clear_device_status(dev); | ||
316 | } | ||
317 | |||
318 | if (result == PCI_ERS_RESULT_RECOVERED) { | ||
319 | if (pcie_wait_for_link(udev, true)) | ||
320 | pci_rescan_bus(udev->bus); | ||
321 | pci_info(dev, "Device recovery from fatal error successful\n"); | ||
322 | } else { | ||
323 | pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); | ||
324 | pci_info(dev, "Device recovery from fatal error failed\n"); | ||
325 | } | ||
326 | |||
327 | pci_dev_put(dev); | ||
328 | pci_unlock_rescan_remove(); | ||
329 | } | ||
330 | |||
331 | /** | ||
332 | * pcie_do_nonfatal_recovery - handle nonfatal error recovery process | ||
333 | * @dev: pointer to a pci_dev data structure of agent detecting an error | ||
334 | * | ||
335 | * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast | ||
336 | * error detected message to all downstream drivers within a hierarchy in | ||
337 | * question and return the returned code. | ||
338 | */ | ||
339 | void pcie_do_nonfatal_recovery(struct pci_dev *dev) | ||
340 | { | 276 | { |
341 | pci_ers_result_t status; | 277 | pci_ers_result_t status; |
342 | enum pci_channel_state state; | ||
343 | |||
344 | state = pci_channel_io_normal; | ||
345 | 278 | ||
346 | status = broadcast_error_message(dev, | 279 | status = broadcast_error_message(dev, |
347 | state, | 280 | state, |
348 | "error_detected", | 281 | "error_detected", |
349 | report_error_detected); | 282 | report_error_detected); |
350 | 283 | ||
284 | if (state == pci_channel_io_frozen && | ||
285 | reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED) | ||
286 | goto failed; | ||
287 | |||
351 | if (status == PCI_ERS_RESULT_CAN_RECOVER) | 288 | if (status == PCI_ERS_RESULT_CAN_RECOVER) |
352 | status = broadcast_error_message(dev, | 289 | status = broadcast_error_message(dev, |
353 | state, | 290 | state, |