diff options
author | David Dillow <dillowda@ornl.gov> | 2008-01-08 17:08:52 -0500 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2008-02-04 23:20:43 -0500 |
commit | 9fe4bcf45ece0b0081031edaaa41581c85ef7049 (patch) | |
tree | b4d7cd3aff19fe8802d71d031d79340962164434 /drivers/infiniband | |
parent | 893da75956ab48545e8732b46e1cf4350bd25f9c (diff) |
IB/srp: Retry stale connections
When a host goes away abruptly (crash, power loss, etc.) without tearing
down its IB connections, its connection requests can be rejected as stale
when it tries to reconnect to targets after rebooting. Retrying the
connection a few times, each time with a new CM ID, will prevent sysadmins
from playing the "which disk(s) went missing?" game.
This would have made tracking down some of the recent bugs slightly
quicker, and it also helps quite a bit when you've got a large number
of targets hanging off a wedged server.
Signed-off-by: David Dillow <dillowda@ornl.gov>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- | drivers/infiniband/ulp/srp/ib_srp.c | 53 | ||||
-rw-r--r-- | drivers/infiniband/ulp/srp/ib_srp.h | 1 |
2 files changed, 42 insertions, 12 deletions
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 195ce7c12319..fd4a49fc4773 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c | |||
@@ -204,6 +204,22 @@ out: | |||
204 | return ret; | 204 | return ret; |
205 | } | 205 | } |
206 | 206 | ||
207 | static int srp_new_cm_id(struct srp_target_port *target) | ||
208 | { | ||
209 | struct ib_cm_id *new_cm_id; | ||
210 | |||
211 | new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, | ||
212 | srp_cm_handler, target); | ||
213 | if (IS_ERR(new_cm_id)) | ||
214 | return PTR_ERR(new_cm_id); | ||
215 | |||
216 | if (target->cm_id) | ||
217 | ib_destroy_cm_id(target->cm_id); | ||
218 | target->cm_id = new_cm_id; | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
207 | static int srp_create_target_ib(struct srp_target_port *target) | 223 | static int srp_create_target_ib(struct srp_target_port *target) |
208 | { | 224 | { |
209 | struct ib_qp_init_attr *init_attr; | 225 | struct ib_qp_init_attr *init_attr; |
@@ -436,6 +452,7 @@ static void srp_remove_work(struct work_struct *work) | |||
436 | 452 | ||
437 | static int srp_connect_target(struct srp_target_port *target) | 453 | static int srp_connect_target(struct srp_target_port *target) |
438 | { | 454 | { |
455 | int retries = 3; | ||
439 | int ret; | 456 | int ret; |
440 | 457 | ||
441 | ret = srp_lookup_path(target); | 458 | ret = srp_lookup_path(target); |
@@ -468,6 +485,21 @@ static int srp_connect_target(struct srp_target_port *target) | |||
468 | case SRP_DLID_REDIRECT: | 485 | case SRP_DLID_REDIRECT: |
469 | break; | 486 | break; |
470 | 487 | ||
488 | case SRP_STALE_CONN: | ||
489 | /* Our current CM id was stale, and is now in timewait. | ||
490 | * Try to reconnect with a new one. | ||
491 | */ | ||
492 | if (!retries-- || srp_new_cm_id(target)) { | ||
493 | shost_printk(KERN_ERR, target->scsi_host, PFX | ||
494 | "giving up on stale connection\n"); | ||
495 | target->status = -ECONNRESET; | ||
496 | return target->status; | ||
497 | } | ||
498 | |||
499 | shost_printk(KERN_ERR, target->scsi_host, PFX | ||
500 | "retrying stale connection\n"); | ||
501 | break; | ||
502 | |||
471 | default: | 503 | default: |
472 | return target->status; | 504 | return target->status; |
473 | } | 505 | } |
@@ -507,7 +539,6 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re | |||
507 | 539 | ||
508 | static int srp_reconnect_target(struct srp_target_port *target) | 540 | static int srp_reconnect_target(struct srp_target_port *target) |
509 | { | 541 | { |
510 | struct ib_cm_id *new_cm_id; | ||
511 | struct ib_qp_attr qp_attr; | 542 | struct ib_qp_attr qp_attr; |
512 | struct srp_request *req, *tmp; | 543 | struct srp_request *req, *tmp; |
513 | struct ib_wc wc; | 544 | struct ib_wc wc; |
@@ -526,14 +557,9 @@ static int srp_reconnect_target(struct srp_target_port *target) | |||
526 | * Now get a new local CM ID so that we avoid confusing the | 557 | * Now get a new local CM ID so that we avoid confusing the |
527 | * target in case things are really fouled up. | 558 | * target in case things are really fouled up. |
528 | */ | 559 | */ |
529 | new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, | 560 | ret = srp_new_cm_id(target); |
530 | srp_cm_handler, target); | 561 | if (ret) |
531 | if (IS_ERR(new_cm_id)) { | ||
532 | ret = PTR_ERR(new_cm_id); | ||
533 | goto err; | 562 | goto err; |
534 | } | ||
535 | ib_destroy_cm_id(target->cm_id); | ||
536 | target->cm_id = new_cm_id; | ||
537 | 563 | ||
538 | qp_attr.qp_state = IB_QPS_RESET; | 564 | qp_attr.qp_state = IB_QPS_RESET; |
539 | ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); | 565 | ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); |
@@ -1171,6 +1197,11 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, | |||
1171 | target->status = -ECONNRESET; | 1197 | target->status = -ECONNRESET; |
1172 | break; | 1198 | break; |
1173 | 1199 | ||
1200 | case IB_CM_REJ_STALE_CONN: | ||
1201 | shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); | ||
1202 | target->status = SRP_STALE_CONN; | ||
1203 | break; | ||
1204 | |||
1174 | default: | 1205 | default: |
1175 | shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", | 1206 | shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", |
1176 | event->param.rej_rcvd.reason); | 1207 | event->param.rej_rcvd.reason); |
@@ -1862,11 +1893,9 @@ static ssize_t srp_create_target(struct class_device *class_dev, | |||
1862 | if (ret) | 1893 | if (ret) |
1863 | goto err; | 1894 | goto err; |
1864 | 1895 | ||
1865 | target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target); | 1896 | ret = srp_new_cm_id(target); |
1866 | if (IS_ERR(target->cm_id)) { | 1897 | if (ret) |
1867 | ret = PTR_ERR(target->cm_id); | ||
1868 | goto err_free; | 1898 | goto err_free; |
1869 | } | ||
1870 | 1899 | ||
1871 | target->qp_in_error = 0; | 1900 | target->qp_in_error = 0; |
1872 | ret = srp_connect_target(target); | 1901 | ret = srp_connect_target(target); |
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 4a3c1f37e4c2..cb6eb816024a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h | |||
@@ -54,6 +54,7 @@ enum { | |||
54 | 54 | ||
55 | SRP_PORT_REDIRECT = 1, | 55 | SRP_PORT_REDIRECT = 1, |
56 | SRP_DLID_REDIRECT = 2, | 56 | SRP_DLID_REDIRECT = 2, |
57 | SRP_STALE_CONN = 3, | ||
57 | 58 | ||
58 | SRP_MAX_LUN = 512, | 59 | SRP_MAX_LUN = 512, |
59 | SRP_DEF_SG_TABLESIZE = 12, | 60 | SRP_DEF_SG_TABLESIZE = 12, |