about summary refs log tree commit diff stats
path: root/drivers/infiniband/ulp
diff options
context:
space:
mode:
authorBart Van Assche <bvanassche@acm.org>2013-10-26 08:34:27 -0400
committerRoland Dreier <roland@purestorage.com>2013-11-08 17:43:15 -0500
commited9b2264fb393327a6c8a4229d8df55df596188e (patch)
tree585689ca11664e8048dddb59e3f19b7f2c8313e1 /drivers/infiniband/ulp
parent29c17324803c8a3bb5b2b69309e43571164cc4de (diff)
IB/srp: Use SRP transport layer error recovery
Enable fast_io_fail_tmo and dev_loss_tmo functionality for the IB SRP initiator. Add kernel module parameters that allow specifying default values for these parameters. Signed-off-by: Bart Van Assche <bvanassche@acm.org> Acked-by: David Dillow <dillowda@ornl.gov> Signed-off-by: Roland Dreier <roland@purestorage.com>
Diffstat (limited to 'drivers/infiniband/ulp')
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c141
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.h1
2 files changed, 101 insertions(+), 41 deletions(-)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 6edab7855f5e..15b4d2ce4989 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -86,6 +86,27 @@ module_param(topspin_workarounds, int, 0444);
86MODULE_PARM_DESC(topspin_workarounds, 86MODULE_PARM_DESC(topspin_workarounds,
87 "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); 87 "Enable workarounds for Topspin/Cisco SRP target bugs if != 0");
88 88
89static struct kernel_param_ops srp_tmo_ops;
90
91static int srp_fast_io_fail_tmo = 15;
92module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
93 S_IRUGO | S_IWUSR);
94MODULE_PARM_DESC(fast_io_fail_tmo,
95 "Number of seconds between the observation of a transport"
96 " layer error and failing all I/O. \"off\" means that this"
97 " functionality is disabled.");
98
99static int srp_dev_loss_tmo = 60;
100module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
101 S_IRUGO | S_IWUSR);
102MODULE_PARM_DESC(dev_loss_tmo,
103 "Maximum number of seconds that the SRP transport should"
104 " insulate transport layer errors. After this time has been"
105 " exceeded the SCSI host is removed. Should be"
106 " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
107 " if fast_io_fail_tmo has not been set. \"off\" means that"
108 " this functionality is disabled.");
109
89static void srp_add_one(struct ib_device *device); 110static void srp_add_one(struct ib_device *device);
90static void srp_remove_one(struct ib_device *device); 111static void srp_remove_one(struct ib_device *device);
91static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); 112static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
@@ -102,6 +123,44 @@ static struct ib_client srp_client = {
102 123
103static struct ib_sa_client srp_sa_client; 124static struct ib_sa_client srp_sa_client;
104 125
126static int srp_tmo_get(char *buffer, const struct kernel_param *kp)
127{
128 int tmo = *(int *)kp->arg;
129
130 if (tmo >= 0)
131 return sprintf(buffer, "%d", tmo);
132 else
133 return sprintf(buffer, "off");
134}
135
136static int srp_tmo_set(const char *val, const struct kernel_param *kp)
137{
138 int tmo, res;
139
140 if (strncmp(val, "off", 3) != 0) {
141 res = kstrtoint(val, 0, &tmo);
142 if (res)
143 goto out;
144 } else {
145 tmo = -1;
146 }
147 if (kp->arg == &srp_fast_io_fail_tmo)
148 res = srp_tmo_valid(tmo, srp_dev_loss_tmo);
149 else
150 res = srp_tmo_valid(srp_fast_io_fail_tmo, tmo);
151 if (res)
152 goto out;
153 *(int *)kp->arg = tmo;
154
155out:
156 return res;
157}
158
159static struct kernel_param_ops srp_tmo_ops = {
160 .get = srp_tmo_get,
161 .set = srp_tmo_set,
162};
163
105static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) 164static inline struct srp_target_port *host_to_target(struct Scsi_Host *host)
106{ 165{
107 return (struct srp_target_port *) host->hostdata; 166 return (struct srp_target_port *) host->hostdata;
@@ -688,23 +747,42 @@ static void srp_free_req(struct srp_target_port *target,
688 spin_unlock_irqrestore(&target->lock, flags); 747 spin_unlock_irqrestore(&target->lock, flags);
689} 748}
690 749
691static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) 750static void srp_finish_req(struct srp_target_port *target,
751 struct srp_request *req, int result)
692{ 752{
693 struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL); 753 struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL);
694 754
695 if (scmnd) { 755 if (scmnd) {
696 srp_free_req(target, req, scmnd, 0); 756 srp_free_req(target, req, scmnd, 0);
697 scmnd->result = DID_RESET << 16; 757 scmnd->result = result;
698 scmnd->scsi_done(scmnd); 758 scmnd->scsi_done(scmnd);
699 } 759 }
700} 760}
701 761
702static int srp_reconnect_target(struct srp_target_port *target) 762static void srp_terminate_io(struct srp_rport *rport)
703{ 763{
704 struct Scsi_Host *shost = target->scsi_host; 764 struct srp_target_port *target = rport->lld_data;
705 int i, ret; 765 int i;
766
767 for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
768 struct srp_request *req = &target->req_ring[i];
769 srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16);
770 }
771}
706 772
707 scsi_target_block(&shost->shost_gendev); 773/*
774 * It is up to the caller to ensure that srp_rport_reconnect() calls are
775 * serialized and that no concurrent srp_queuecommand(), srp_abort(),
776 * srp_reset_device() or srp_reset_host() calls will occur while this function
777 * is in progress. One way to realize that is not to call this function
778 * directly but to call srp_reconnect_rport() instead since that last function
779 * serializes calls of this function via rport->mutex and also blocks
780 * srp_queuecommand() calls before invoking this function.
781 */
782static int srp_rport_reconnect(struct srp_rport *rport)
783{
784 struct srp_target_port *target = rport->lld_data;
785 int i, ret;
708 786
709 srp_disconnect_target(target); 787 srp_disconnect_target(target);
710 /* 788 /*
@@ -725,8 +803,7 @@ static int srp_reconnect_target(struct srp_target_port *target)
725 803
726 for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { 804 for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
727 struct srp_request *req = &target->req_ring[i]; 805 struct srp_request *req = &target->req_ring[i];
728 if (req->scmnd) 806 srp_finish_req(target, req, DID_RESET << 16);
729 srp_reset_req(target, req);
730 } 807 }
731 808
732 INIT_LIST_HEAD(&target->free_tx); 809 INIT_LIST_HEAD(&target->free_tx);
@@ -736,28 +813,9 @@ static int srp_reconnect_target(struct srp_target_port *target)
736 if (ret == 0) 813 if (ret == 0)
737 ret = srp_connect_target(target); 814 ret = srp_connect_target(target);
738 815
739 scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING : 816 if (ret == 0)
740 SDEV_TRANSPORT_OFFLINE); 817 shost_printk(KERN_INFO, target->scsi_host,
741 target->transport_offline = !!ret; 818 PFX "reconnect succeeded\n");
742
743 if (ret)
744 goto err;
745
746 shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n");
747
748 return ret;
749
750err:
751 shost_printk(KERN_ERR, target->scsi_host,
752 PFX "reconnect failed (%d), removing target port.\n", ret);
753
754 /*
755 * We couldn't reconnect, so kill our target port off.
756 * However, we have to defer the real removal because we
757 * are in the context of the SCSI error handler now, which
758 * will deadlock if we call scsi_remove_host().
759 */
760 srp_queue_remove_work(target);
761 819
762 return ret; 820 return ret;
763} 821}
@@ -1356,10 +1414,11 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
1356 struct srp_cmd *cmd; 1414 struct srp_cmd *cmd;
1357 struct ib_device *dev; 1415 struct ib_device *dev;
1358 unsigned long flags; 1416 unsigned long flags;
1359 int len; 1417 int len, result;
1360 1418
1361 if (unlikely(target->transport_offline)) { 1419 result = srp_chkready(target->rport);
1362 scmnd->result = DID_NO_CONNECT << 16; 1420 if (unlikely(result)) {
1421 scmnd->result = result;
1363 scmnd->scsi_done(scmnd); 1422 scmnd->scsi_done(scmnd);
1364 return 0; 1423 return 0;
1365 } 1424 }
@@ -1757,7 +1816,7 @@ static int srp_abort(struct scsi_cmnd *scmnd)
1757 if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, 1816 if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun,
1758 SRP_TSK_ABORT_TASK) == 0) 1817 SRP_TSK_ABORT_TASK) == 0)
1759 ret = SUCCESS; 1818 ret = SUCCESS;
1760 else if (target->transport_offline) 1819 else if (target->rport->state == SRP_RPORT_LOST)
1761 ret = FAST_IO_FAIL; 1820 ret = FAST_IO_FAIL;
1762 else 1821 else
1763 ret = FAILED; 1822 ret = FAILED;
@@ -1784,7 +1843,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
1784 for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { 1843 for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
1785 struct srp_request *req = &target->req_ring[i]; 1844 struct srp_request *req = &target->req_ring[i];
1786 if (req->scmnd && req->scmnd->device == scmnd->device) 1845 if (req->scmnd && req->scmnd->device == scmnd->device)
1787 srp_reset_req(target, req); 1846 srp_finish_req(target, req, DID_RESET << 16);
1788 } 1847 }
1789 1848
1790 return SUCCESS; 1849 return SUCCESS;
@@ -1793,14 +1852,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
1793static int srp_reset_host(struct scsi_cmnd *scmnd) 1852static int srp_reset_host(struct scsi_cmnd *scmnd)
1794{ 1853{
1795 struct srp_target_port *target = host_to_target(scmnd->device->host); 1854 struct srp_target_port *target = host_to_target(scmnd->device->host);
1796 int ret = FAILED;
1797 1855
1798 shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); 1856 shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
1799 1857
1800 if (!srp_reconnect_target(target)) 1858 return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
1801 ret = SUCCESS;
1802
1803 return ret;
1804} 1859}
1805 1860
1806static int srp_slave_configure(struct scsi_device *sdev) 1861static int srp_slave_configure(struct scsi_device *sdev)
@@ -2637,7 +2692,13 @@ static void srp_remove_one(struct ib_device *device)
2637} 2692}
2638 2693
2639static struct srp_function_template ib_srp_transport_functions = { 2694static struct srp_function_template ib_srp_transport_functions = {
2695 .has_rport_state = true,
2696 .reset_timer_if_blocked = true,
2697 .fast_io_fail_tmo = &srp_fast_io_fail_tmo,
2698 .dev_loss_tmo = &srp_dev_loss_tmo,
2699 .reconnect = srp_rport_reconnect,
2640 .rport_delete = srp_rport_delete, 2700 .rport_delete = srp_rport_delete,
2701 .terminate_rport_io = srp_terminate_io,
2641}; 2702};
2642 2703
2643static int __init srp_init_module(void) 2704static int __init srp_init_module(void)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 2a1768fcc57d..fd1817e48ad4 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -140,7 +140,6 @@ struct srp_target_port {
140 unsigned int cmd_sg_cnt; 140 unsigned int cmd_sg_cnt;
141 unsigned int indirect_size; 141 unsigned int indirect_size;
142 bool allow_ext_sg; 142 bool allow_ext_sg;
143 bool transport_offline;
144 143
145 /* Everything above this point is used in the hot path of 144 /* Everything above this point is used in the hot path of
146 * command processing. Try to keep them packed into cachelines. 145 * command processing. Try to keep them packed into cachelines.