aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/ulp
diff options
context:
space:
mode:
author David Dillow <dillowda@ornl.gov> 2008-01-08 17:08:52 -0500
committer Roland Dreier <rolandd@cisco.com> 2008-02-04 23:20:43 -0500
commit 9fe4bcf45ece0b0081031edaaa41581c85ef7049 (patch)
tree b4d7cd3aff19fe8802d71d031d79340962164434 /drivers/infiniband/ulp
parent 893da75956ab48545e8732b46e1cf4350bd25f9c (diff)
IB/srp: Retry stale connections
When a host just goes away (crash, power loss, etc.) without tearing down its IB connections, it can get stale connection errors when it tries to reconnect to targets upon rebooting. Retrying the connection a few times will prevent sysadmins from playing the "which disk(s) went missing?" game.

This would have made things slightly quicker when tracking down some of the recent bugs, but it also helps quite a bit when you've got a large number of targets hanging off a wedged server.

Signed-off-by: David Dillow <dillowda@ornl.gov>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/ulp')
-rw-r--r-- drivers/infiniband/ulp/srp/ib_srp.c 53
-rw-r--r-- drivers/infiniband/ulp/srp/ib_srp.h 1
2 files changed, 42 insertions, 12 deletions
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 195ce7c1231..fd4a49fc477 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -204,6 +204,22 @@ out:
204 return ret; 204 return ret;
205} 205}
206 206
207static int srp_new_cm_id(struct srp_target_port *target)
208{
209 struct ib_cm_id *new_cm_id;
210
211 new_cm_id = ib_create_cm_id(target->srp_host->dev->dev,
212 srp_cm_handler, target);
213 if (IS_ERR(new_cm_id))
214 return PTR_ERR(new_cm_id);
215
216 if (target->cm_id)
217 ib_destroy_cm_id(target->cm_id);
218 target->cm_id = new_cm_id;
219
220 return 0;
221}
222
207static int srp_create_target_ib(struct srp_target_port *target) 223static int srp_create_target_ib(struct srp_target_port *target)
208{ 224{
209 struct ib_qp_init_attr *init_attr; 225 struct ib_qp_init_attr *init_attr;
@@ -436,6 +452,7 @@ static void srp_remove_work(struct work_struct *work)
436 452
437static int srp_connect_target(struct srp_target_port *target) 453static int srp_connect_target(struct srp_target_port *target)
438{ 454{
455 int retries = 3;
439 int ret; 456 int ret;
440 457
441 ret = srp_lookup_path(target); 458 ret = srp_lookup_path(target);
@@ -468,6 +485,21 @@ static int srp_connect_target(struct srp_target_port *target)
468 case SRP_DLID_REDIRECT: 485 case SRP_DLID_REDIRECT:
469 break; 486 break;
470 487
488 case SRP_STALE_CONN:
489 /* Our current CM id was stale, and is now in timewait.
490 * Try to reconnect with a new one.
491 */
492 if (!retries-- || srp_new_cm_id(target)) {
493 shost_printk(KERN_ERR, target->scsi_host, PFX
494 "giving up on stale connection\n");
495 target->status = -ECONNRESET;
496 return target->status;
497 }
498
499 shost_printk(KERN_ERR, target->scsi_host, PFX
500 "retrying stale connection\n");
501 break;
502
471 default: 503 default:
472 return target->status; 504 return target->status;
473 } 505 }
@@ -507,7 +539,6 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re
507 539
508static int srp_reconnect_target(struct srp_target_port *target) 540static int srp_reconnect_target(struct srp_target_port *target)
509{ 541{
510 struct ib_cm_id *new_cm_id;
511 struct ib_qp_attr qp_attr; 542 struct ib_qp_attr qp_attr;
512 struct srp_request *req, *tmp; 543 struct srp_request *req, *tmp;
513 struct ib_wc wc; 544 struct ib_wc wc;
@@ -526,14 +557,9 @@ static int srp_reconnect_target(struct srp_target_port *target)
526 * Now get a new local CM ID so that we avoid confusing the 557 * Now get a new local CM ID so that we avoid confusing the
527 * target in case things are really fouled up. 558 * target in case things are really fouled up.
528 */ 559 */
529 new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, 560 ret = srp_new_cm_id(target);
530 srp_cm_handler, target); 561 if (ret)
531 if (IS_ERR(new_cm_id)) {
532 ret = PTR_ERR(new_cm_id);
533 goto err; 562 goto err;
534 }
535 ib_destroy_cm_id(target->cm_id);
536 target->cm_id = new_cm_id;
537 563
538 qp_attr.qp_state = IB_QPS_RESET; 564 qp_attr.qp_state = IB_QPS_RESET;
539 ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); 565 ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE);
@@ -1171,6 +1197,11 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
1171 target->status = -ECONNRESET; 1197 target->status = -ECONNRESET;
1172 break; 1198 break;
1173 1199
1200 case IB_CM_REJ_STALE_CONN:
1201 shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n");
1202 target->status = SRP_STALE_CONN;
1203 break;
1204
1174 default: 1205 default:
1175 shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", 1206 shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n",
1176 event->param.rej_rcvd.reason); 1207 event->param.rej_rcvd.reason);
@@ -1862,11 +1893,9 @@ static ssize_t srp_create_target(struct class_device *class_dev,
1862 if (ret) 1893 if (ret)
1863 goto err; 1894 goto err;
1864 1895
1865 target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target); 1896 ret = srp_new_cm_id(target);
1866 if (IS_ERR(target->cm_id)) { 1897 if (ret)
1867 ret = PTR_ERR(target->cm_id);
1868 goto err_free; 1898 goto err_free;
1869 }
1870 1899
1871 target->qp_in_error = 0; 1900 target->qp_in_error = 0;
1872 ret = srp_connect_target(target); 1901 ret = srp_connect_target(target);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 4a3c1f37e4c..cb6eb816024 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -54,6 +54,7 @@ enum {
54 54
55 SRP_PORT_REDIRECT = 1, 55 SRP_PORT_REDIRECT = 1,
56 SRP_DLID_REDIRECT = 2, 56 SRP_DLID_REDIRECT = 2,
57 SRP_STALE_CONN = 3,
57 58
58 SRP_MAX_LUN = 512, 59 SRP_MAX_LUN = 512,
59 SRP_DEF_SG_TABLESIZE = 12, 60 SRP_DEF_SG_TABLESIZE = 12,