aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Neuling <mikey@neuling.org>2018-03-26 00:17:07 -0400
committerMichael Ellerman <mpe@ellerman.id.au>2018-03-31 09:47:45 -0400
commitf0295e047fcf52ccb42561fb7de6942f5201b676 (patch)
tree31ec8e1b732ea74c747f569b5d870c1553377aed
parentbf8a1abc3ddbd6e9a8312ea7d96e5dd89c140f18 (diff)
powerpc/eeh: Fix race with driver un/bind
The current EEH callbacks can race with a driver unbind. This can result in a backtraces like this: EEH: Frozen PHB#0-PE#1fc detected EEH: PE location: S000009, PHB location: N/A CPU: 2 PID: 2312 Comm: kworker/u258:3 Not tainted 4.15.6-openpower1 #2 Workqueue: nvme-wq nvme_reset_work [nvme] Call Trace: dump_stack+0x9c/0xd0 (unreliable) eeh_dev_check_failure+0x420/0x470 eeh_check_failure+0xa0/0xa4 nvme_reset_work+0x138/0x1414 [nvme] process_one_work+0x1ec/0x328 worker_thread+0x2e4/0x3a8 kthread+0x14c/0x154 ret_from_kernel_thread+0x5c/0xc8 nvme nvme1: Removing after probe failure status: -19 <snip> cpu 0x23: Vector: 300 (Data Access) at [c000000ff50f3800] pc: c0080000089a0eb0: nvme_error_detected+0x4c/0x90 [nvme] lr: c000000000026564: eeh_report_error+0xe0/0x110 sp: c000000ff50f3a80 msr: 9000000000009033 dar: 400 dsisr: 40000000 current = 0xc000000ff507c000 paca = 0xc00000000fdc9d80 softe: 0 irq_happened: 0x01 pid = 782, comm = eehd Linux version 4.15.6-openpower1 (smc@smc-desktop) (gcc version 6.4.0 (Buildroot 2017.11.2-00008-g4b6188e)) #2 SM P Tue Feb 27 12:33:27 PST 2018 enter ? for help eeh_report_error+0xe0/0x110 eeh_pe_dev_traverse+0xc0/0xdc eeh_handle_normal_event+0x184/0x4c4 eeh_handle_event+0x30/0x288 eeh_event_handler+0x124/0x170 kthread+0x14c/0x154 ret_from_kernel_thread+0x5c/0xc8 The first part is an EEH (on boot), the second half is the resulting crash. nvme probe starts the nvme_reset_work() worker thread. This worker thread starts touching the device which see a device error (EEH) and hence queues up an event in the powerpc EEH worker thread. nvme_reset_work() then continues and runs nvme_remove_dead_ctrl_work() which results in unbinding the driver from the device and hence releases all resources. At the same time, the EEH worker thread starts doing the EEH .error_detected() driver callback, which no longer works since the resources have been freed. This fixes the problem in the same way the generic PCIe AER code (in drivers/pci/pcie/aer/aerdrv_core.c) does. It makes the EEH code hold the device_lock() while performing the driver EEH callbacks and associated code. This ensures either the callbacks are no longer register, or if they are registered the driver will not be removed from underneath us. This has been broken forever. The EEH call backs were first introduced in 2005 (in 77bd7415610) but it's not clear if a lock was needed back then. Fixes: 77bd74156101 ("[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines") Cc: stable@vger.kernel.org # v2.6.16+ Signed-off-by: Michael Neuling <mikey@neuling.org> Reviewed-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-rw-r--r--arch/powerpc/kernel/eeh_driver.c68
1 files changed, 42 insertions, 26 deletions
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 43ceb6263cd8..b8a329f04814 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -207,18 +207,18 @@ static void *eeh_report_error(void *data, void *userdata)
207 207
208 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) 208 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
209 return NULL; 209 return NULL;
210
211 device_lock(&dev->dev);
210 dev->error_state = pci_channel_io_frozen; 212 dev->error_state = pci_channel_io_frozen;
211 213
212 driver = eeh_pcid_get(dev); 214 driver = eeh_pcid_get(dev);
213 if (!driver) return NULL; 215 if (!driver) goto out_no_dev;
214 216
215 eeh_disable_irq(dev); 217 eeh_disable_irq(dev);
216 218
217 if (!driver->err_handler || 219 if (!driver->err_handler ||
218 !driver->err_handler->error_detected) { 220 !driver->err_handler->error_detected)
219 eeh_pcid_put(dev); 221 goto out;
220 return NULL;
221 }
222 222
223 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); 223 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
224 224
@@ -227,8 +227,12 @@ static void *eeh_report_error(void *data, void *userdata)
227 if (*res == PCI_ERS_RESULT_NONE) *res = rc; 227 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
228 228
229 edev->in_error = true; 229 edev->in_error = true;
230 eeh_pcid_put(dev);
231 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); 230 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
231
232out:
233 eeh_pcid_put(dev);
234out_no_dev:
235 device_unlock(&dev->dev);
232 return NULL; 236 return NULL;
233} 237}
234 238
@@ -251,15 +255,14 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
251 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) 255 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
252 return NULL; 256 return NULL;
253 257
258 device_lock(&dev->dev);
254 driver = eeh_pcid_get(dev); 259 driver = eeh_pcid_get(dev);
255 if (!driver) return NULL; 260 if (!driver) goto out_no_dev;
256 261
257 if (!driver->err_handler || 262 if (!driver->err_handler ||
258 !driver->err_handler->mmio_enabled || 263 !driver->err_handler->mmio_enabled ||
259 (edev->mode & EEH_DEV_NO_HANDLER)) { 264 (edev->mode & EEH_DEV_NO_HANDLER))
260 eeh_pcid_put(dev); 265 goto out;
261 return NULL;
262 }
263 266
264 rc = driver->err_handler->mmio_enabled(dev); 267 rc = driver->err_handler->mmio_enabled(dev);
265 268
@@ -267,7 +270,10 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
267 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; 270 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
268 if (*res == PCI_ERS_RESULT_NONE) *res = rc; 271 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
269 272
273out:
270 eeh_pcid_put(dev); 274 eeh_pcid_put(dev);
275out_no_dev:
276 device_unlock(&dev->dev);
271 return NULL; 277 return NULL;
272} 278}
273 279
@@ -290,20 +296,20 @@ static void *eeh_report_reset(void *data, void *userdata)
290 296
291 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) 297 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
292 return NULL; 298 return NULL;
299
300 device_lock(&dev->dev);
293 dev->error_state = pci_channel_io_normal; 301 dev->error_state = pci_channel_io_normal;
294 302
295 driver = eeh_pcid_get(dev); 303 driver = eeh_pcid_get(dev);
296 if (!driver) return NULL; 304 if (!driver) goto out_no_dev;
297 305
298 eeh_enable_irq(dev); 306 eeh_enable_irq(dev);
299 307
300 if (!driver->err_handler || 308 if (!driver->err_handler ||
301 !driver->err_handler->slot_reset || 309 !driver->err_handler->slot_reset ||
302 (edev->mode & EEH_DEV_NO_HANDLER) || 310 (edev->mode & EEH_DEV_NO_HANDLER) ||
303 (!edev->in_error)) { 311 (!edev->in_error))
304 eeh_pcid_put(dev); 312 goto out;
305 return NULL;
306 }
307 313
308 rc = driver->err_handler->slot_reset(dev); 314 rc = driver->err_handler->slot_reset(dev);
309 if ((*res == PCI_ERS_RESULT_NONE) || 315 if ((*res == PCI_ERS_RESULT_NONE) ||
@@ -311,7 +317,10 @@ static void *eeh_report_reset(void *data, void *userdata)
311 if (*res == PCI_ERS_RESULT_DISCONNECT && 317 if (*res == PCI_ERS_RESULT_DISCONNECT &&
312 rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; 318 rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
313 319
320out:
314 eeh_pcid_put(dev); 321 eeh_pcid_put(dev);
322out_no_dev:
323 device_unlock(&dev->dev);
315 return NULL; 324 return NULL;
316} 325}
317 326
@@ -362,10 +371,12 @@ static void *eeh_report_resume(void *data, void *userdata)
362 371
363 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) 372 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
364 return NULL; 373 return NULL;
374
375 device_lock(&dev->dev);
365 dev->error_state = pci_channel_io_normal; 376 dev->error_state = pci_channel_io_normal;
366 377
367 driver = eeh_pcid_get(dev); 378 driver = eeh_pcid_get(dev);
368 if (!driver) return NULL; 379 if (!driver) goto out_no_dev;
369 380
370 was_in_error = edev->in_error; 381 was_in_error = edev->in_error;
371 edev->in_error = false; 382 edev->in_error = false;
@@ -375,18 +386,20 @@ static void *eeh_report_resume(void *data, void *userdata)
375 !driver->err_handler->resume || 386 !driver->err_handler->resume ||
376 (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) { 387 (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
377 edev->mode &= ~EEH_DEV_NO_HANDLER; 388 edev->mode &= ~EEH_DEV_NO_HANDLER;
378 eeh_pcid_put(dev); 389 goto out;
379 return NULL;
380 } 390 }
381 391
382 driver->err_handler->resume(dev); 392 driver->err_handler->resume(dev);
383 393
384 eeh_pcid_put(dev);
385 pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED); 394 pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
395out:
396 eeh_pcid_put(dev);
386#ifdef CONFIG_PCI_IOV 397#ifdef CONFIG_PCI_IOV
387 if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) 398 if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
388 eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); 399 eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
389#endif 400#endif
401out_no_dev:
402 device_unlock(&dev->dev);
390 return NULL; 403 return NULL;
391} 404}
392 405
@@ -406,23 +419,26 @@ static void *eeh_report_failure(void *data, void *userdata)
406 419
407 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) 420 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
408 return NULL; 421 return NULL;
422
423 device_lock(&dev->dev);
409 dev->error_state = pci_channel_io_perm_failure; 424 dev->error_state = pci_channel_io_perm_failure;
410 425
411 driver = eeh_pcid_get(dev); 426 driver = eeh_pcid_get(dev);
412 if (!driver) return NULL; 427 if (!driver) goto out_no_dev;
413 428
414 eeh_disable_irq(dev); 429 eeh_disable_irq(dev);
415 430
416 if (!driver->err_handler || 431 if (!driver->err_handler ||
417 !driver->err_handler->error_detected) { 432 !driver->err_handler->error_detected)
418 eeh_pcid_put(dev); 433 goto out;
419 return NULL;
420 }
421 434
422 driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); 435 driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
423 436
424 eeh_pcid_put(dev);
425 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); 437 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
438out:
439 eeh_pcid_put(dev);
440out_no_dev:
441 device_unlock(&dev->dev);
426 return NULL; 442 return NULL;
427} 443}
428 444