aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bart Van Assche <bvanassche@acm.org> 2013-12-11 11:06:14 -0500
committer Roland Dreier <roland@purestorage.com> 2014-01-21 13:46:17 -0500
commit 93079162bf0ed2934c7b0c3ee93ba894df8fb3cd (patch)
tree 3250899980a35531d4a38dcf2d899df8bef5ad37
parent 18cc4e02508e3f1fadf81f697837567431ee5a9c (diff)
scsi_transport_srp: Fix a race condition
The rport timers must be stopped before the SRP initiator destroys the
resources associated with the SCSI host. This is necessary because
otherwise the callback functions invoked from the SRP transport layer
could trigger a use-after-free. Stopping the rport timers before
invoking scsi_remove_host() can trigger long delays in the SCSI error
handler if a transport layer failure occurs while scsi_remove_host()
is in progress. Hence move the code for stopping the rport timers from
srp_rport_release() into a new function and invoke that function after
scsi_remove_host() has finished. This patch fixes the following
sporadic kernel crash:

    kernel BUG at include/asm-generic/dma-mapping-common.h:64!
    invalid opcode: 0000 [#1] SMP
    RIP: 0010:[<ffffffffa03b20b1>] [<ffffffffa03b20b1>] srp_unmap_data+0x121/0x130 [ib_srp]
    Call Trace:
    [<ffffffffa03b20fc>] srp_free_req+0x3c/0x80 [ib_srp]
    [<ffffffffa03b2188>] srp_finish_req+0x48/0x70 [ib_srp]
    [<ffffffffa03b21fb>] srp_terminate_io+0x4b/0x60 [ib_srp]
    [<ffffffffa03a6fb5>] __rport_fail_io_fast+0x75/0x80 [scsi_transport_srp]
    [<ffffffffa03a7438>] rport_fast_io_fail_timedout+0x88/0xc0 [scsi_transport_srp]
    [<ffffffff8108b370>] worker_thread+0x170/0x2a0
    [<ffffffff81090876>] kthread+0x96/0xa0
    [<ffffffff8100c0ca>] child_rip+0xa/0x20

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Roland Dreier <roland@purestorage.com>
-rw-r--r-- drivers/infiniband/ulp/srp/ib_srp.c 1
-rw-r--r-- drivers/scsi/scsi_transport_srp.c 83
-rw-r--r-- include/scsi/scsi_transport_srp.h 4
3 files changed, 46 insertions, 42 deletions
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index a88631918e85..529b6bcdca7a 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -660,6 +660,7 @@ static void srp_remove_target(struct srp_target_port *target)
660 srp_rport_get(target->rport); 660 srp_rport_get(target->rport);
661 srp_remove_host(target->scsi_host); 661 srp_remove_host(target->scsi_host);
662 scsi_remove_host(target->scsi_host); 662 scsi_remove_host(target->scsi_host);
663 srp_stop_rport_timers(target->rport);
663 srp_disconnect_target(target); 664 srp_disconnect_target(target);
664 ib_destroy_cm_id(target->cm_id); 665 ib_destroy_cm_id(target->cm_id);
665 srp_free_target_ib(target); 666 srp_free_target_ib(target);
diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c
index 8b9cb22be963..a349d44c4c36 100644
--- a/drivers/scsi/scsi_transport_srp.c
+++ b/drivers/scsi/scsi_transport_srp.c
@@ -456,37 +456,29 @@ static void __srp_start_tl_fail_timers(struct srp_rport *rport)
456 456
457 lockdep_assert_held(&rport->mutex); 457 lockdep_assert_held(&rport->mutex);
458 458
459 if (!rport->deleted) { 459 delay = rport->reconnect_delay;
460 delay = rport->reconnect_delay; 460 fast_io_fail_tmo = rport->fast_io_fail_tmo;
461 fast_io_fail_tmo = rport->fast_io_fail_tmo; 461 dev_loss_tmo = rport->dev_loss_tmo;
462 dev_loss_tmo = rport->dev_loss_tmo; 462 pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev),
463 pr_debug("%s current state: %d\n", 463 rport->state);
464 dev_name(&shost->shost_gendev), rport->state);
465 464
466 if (delay > 0) 465 if (rport->state == SRP_RPORT_LOST)
466 return;
467 if (delay > 0)
468 queue_delayed_work(system_long_wq, &rport->reconnect_work,
469 1UL * delay * HZ);
470 if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
471 pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev),
472 rport->state);
473 scsi_target_block(&shost->shost_gendev);
474 if (fast_io_fail_tmo >= 0)
467 queue_delayed_work(system_long_wq, 475 queue_delayed_work(system_long_wq,
468 &rport->reconnect_work, 476 &rport->fast_io_fail_work,
469 1UL * delay * HZ); 477 1UL * fast_io_fail_tmo * HZ);
470 if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { 478 if (dev_loss_tmo >= 0)
471 pr_debug("%s new state: %d\n", 479 queue_delayed_work(system_long_wq,
472 dev_name(&shost->shost_gendev), 480 &rport->dev_loss_work,
473 rport->state); 481 1UL * dev_loss_tmo * HZ);
474 scsi_target_block(&shost->shost_gendev);
475 if (fast_io_fail_tmo >= 0)
476 queue_delayed_work(system_long_wq,
477 &rport->fast_io_fail_work,
478 1UL * fast_io_fail_tmo * HZ);
479 if (dev_loss_tmo >= 0)
480 queue_delayed_work(system_long_wq,
481 &rport->dev_loss_work,
482 1UL * dev_loss_tmo * HZ);
483 }
484 } else {
485 pr_debug("%s has already been deleted\n",
486 dev_name(&shost->shost_gendev));
487 srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST);
488 scsi_target_unblock(&shost->shost_gendev,
489 SDEV_TRANSPORT_OFFLINE);
490 } 482 }
491} 483}
492 484
@@ -560,7 +552,7 @@ int srp_reconnect_rport(struct srp_rport *rport)
560 scsi_target_block(&shost->shost_gendev); 552 scsi_target_block(&shost->shost_gendev);
561 while (scsi_request_fn_active(shost)) 553 while (scsi_request_fn_active(shost))
562 msleep(20); 554 msleep(20);
563 res = i->f->reconnect(rport); 555 res = rport->state != SRP_RPORT_LOST ? i->f->reconnect(rport) : -ENODEV;
564 pr_debug("%s (state %d): transport.reconnect() returned %d\n", 556 pr_debug("%s (state %d): transport.reconnect() returned %d\n",
565 dev_name(&shost->shost_gendev), rport->state, res); 557 dev_name(&shost->shost_gendev), rport->state, res);
566 if (res == 0) { 558 if (res == 0) {
@@ -626,10 +618,6 @@ static void srp_rport_release(struct device *dev)
626{ 618{
627 struct srp_rport *rport = dev_to_rport(dev); 619 struct srp_rport *rport = dev_to_rport(dev);
628 620
629 cancel_delayed_work_sync(&rport->reconnect_work);
630 cancel_delayed_work_sync(&rport->fast_io_fail_work);
631 cancel_delayed_work_sync(&rport->dev_loss_work);
632
633 put_device(dev->parent); 621 put_device(dev->parent);
634 kfree(rport); 622 kfree(rport);
635} 623}
@@ -784,12 +772,6 @@ void srp_rport_del(struct srp_rport *rport)
784 device_del(dev); 772 device_del(dev);
785 transport_destroy_device(dev); 773 transport_destroy_device(dev);
786 774
787 mutex_lock(&rport->mutex);
788 if (rport->state == SRP_RPORT_BLOCKED)
789 __rport_fail_io_fast(rport);
790 rport->deleted = true;
791 mutex_unlock(&rport->mutex);
792
793 put_device(dev); 775 put_device(dev);
794} 776}
795EXPORT_SYMBOL_GPL(srp_rport_del); 777EXPORT_SYMBOL_GPL(srp_rport_del);
@@ -814,6 +796,27 @@ void srp_remove_host(struct Scsi_Host *shost)
814} 796}
815EXPORT_SYMBOL_GPL(srp_remove_host); 797EXPORT_SYMBOL_GPL(srp_remove_host);
816 798
799/**
800 * srp_stop_rport_timers - stop the transport layer recovery timers
801 *
802 * Must be called after srp_remove_host() and scsi_remove_host(). The caller
803 * must hold a reference on the rport (rport->dev) and on the SCSI host
804 * (rport->dev.parent).
805 */
806void srp_stop_rport_timers(struct srp_rport *rport)
807{
808 mutex_lock(&rport->mutex);
809 if (rport->state == SRP_RPORT_BLOCKED)
810 __rport_fail_io_fast(rport);
811 srp_rport_set_state(rport, SRP_RPORT_LOST);
812 mutex_unlock(&rport->mutex);
813
814 cancel_delayed_work_sync(&rport->reconnect_work);
815 cancel_delayed_work_sync(&rport->fast_io_fail_work);
816 cancel_delayed_work_sync(&rport->dev_loss_work);
817}
818EXPORT_SYMBOL_GPL(srp_stop_rport_timers);
819
817static int srp_tsk_mgmt_response(struct Scsi_Host *shost, u64 nexus, u64 tm_id, 820static int srp_tsk_mgmt_response(struct Scsi_Host *shost, u64 nexus, u64 tm_id,
818 int result) 821 int result)
819{ 822{
diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h
index 4ebf6913b7b2..f24e763fa430 100644
--- a/include/scsi/scsi_transport_srp.h
+++ b/include/scsi/scsi_transport_srp.h
@@ -19,7 +19,7 @@ struct srp_rport_identifiers {
19 * @SRP_RPORT_BLOCKED: Transport layer not operational; fast I/O fail timer 19 * @SRP_RPORT_BLOCKED: Transport layer not operational; fast I/O fail timer
20 * is running and I/O has been blocked. 20 * is running and I/O has been blocked.
21 * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast. 21 * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast.
22 * @SRP_RPORT_LOST: Device loss timer has expired; port is being removed. 22 * @SRP_RPORT_LOST: Port is being removed.
23 */ 23 */
24enum srp_rport_state { 24enum srp_rport_state {
25 SRP_RPORT_RUNNING, 25 SRP_RPORT_RUNNING,
@@ -48,7 +48,6 @@ struct srp_rport {
48 48
49 struct mutex mutex; 49 struct mutex mutex;
50 enum srp_rport_state state; 50 enum srp_rport_state state;
51 bool deleted;
52 int reconnect_delay; 51 int reconnect_delay;
53 int failed_reconnects; 52 int failed_reconnects;
54 struct delayed_work reconnect_work; 53 struct delayed_work reconnect_work;
@@ -101,6 +100,7 @@ extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo,
101extern int srp_reconnect_rport(struct srp_rport *rport); 100extern int srp_reconnect_rport(struct srp_rport *rport);
102extern void srp_start_tl_fail_timers(struct srp_rport *rport); 101extern void srp_start_tl_fail_timers(struct srp_rport *rport);
103extern void srp_remove_host(struct Scsi_Host *); 102extern void srp_remove_host(struct Scsi_Host *);
103extern void srp_stop_rport_timers(struct srp_rport *rport);
104 104
105/** 105/**
106 * srp_chkready() - evaluate the transport layer state before I/O 106 * srp_chkready() - evaluate the transport layer state before I/O