aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband
diff options
context:
space:
mode:
author Or Gerlitz <ogerlitz@voltaire.com> 2010-05-05 10:31:44 -0400
committer Roland Dreier <rolandd@cisco.com> 2010-05-12 12:30:44 -0400
commit 39ff05dbbbdb082bbabf06206c56b3cd4ef73904 (patch)
tree 85466e1e75d632b33a294dea436fad2f3233fe52 /drivers/infiniband
parent d265b9808272c9f25e1c36d3fb5ddb466efd90e9 (diff)
IB/iser: Enhance disconnection logic for multi-pathing
The iser connection teardown flow isn't over until the underlying Connection Manager (e.g., the IB CM) delivers a disconnected or timeout event through the RDMA-CM. When the remote (target) side isn't reachable, e.g., when some HW (port/HCA/switch) isn't functioning or has been taken down administratively, the CM timeout flow is used and the event may be generated only after a relatively long time -- on the order of tens of seconds.

The current iser code exposes this possibly long delay to higher layers, specifically to the iscsid daemon and the iscsi kernel stack. As a result, the iscsi stack doesn't respond well: this low-level CM delay is added to the fail-over time under HA schemes such as the one provided by DM multipath through the multipathd(8) service.

This patch enhances the reference counting scheme on iser's IB connections so that the disconnect flow initiated by iscsid from user space (ep_disconnect) doesn't wait for the CM to deliver the disconnect/timeout event. (From iser's point of view, the connection teardown isn't done until the event is delivered.)

The iser ib (rdma) connection object is destroyed when its reference count reaches zero. When this happens in the RDMA-CM callback context, extra care is taken so that the RDMA-CM does the actual destroying of the associated ID, since doing it in the callback is prohibited.

The reference count of an iser ib connection normally reaches three, where the <ref, deref> relations are:
1. conn <init, terminate>
2. conn <bind, stop/destroy>
3. cma id <create, disconnect/error/timeout callbacks>

With this patch, multipath fail-over time is about 30 seconds, while without this patch, multipath fail-over time is about 130 seconds.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- drivers/infiniband/ulp/iser/iscsi_iser.c 9
-rw-r--r-- drivers/infiniband/ulp/iser/iscsi_iser.h 3
-rw-r--r-- drivers/infiniband/ulp/iser/iser_verbs.c 72
3 files changed, 46 insertions, 38 deletions
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 93399dff0c6f..7b2fc98e2f2b 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -325,7 +325,7 @@ iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn)
325 */ 325 */
326 if (ib_conn) { 326 if (ib_conn) {
327 ib_conn->iser_conn = NULL; 327 ib_conn->iser_conn = NULL;
328 iser_conn_put(ib_conn); 328 iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */
329 } 329 }
330} 330}
331 331
@@ -357,11 +357,12 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
357 /* binds the iSER connection retrieved from the previously 357 /* binds the iSER connection retrieved from the previously
358 * connected ep_handle to the iSCSI layer connection. exchanges 358 * connected ep_handle to the iSCSI layer connection. exchanges
359 * connection pointers */ 359 * connection pointers */
360 iser_err("binding iscsi conn %p to iser_conn %p\n",conn,ib_conn); 360 iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n",
361 conn, conn->dd_data, ib_conn);
361 iser_conn = conn->dd_data; 362 iser_conn = conn->dd_data;
362 ib_conn->iser_conn = iser_conn; 363 ib_conn->iser_conn = iser_conn;
363 iser_conn->ib_conn = ib_conn; 364 iser_conn->ib_conn = ib_conn;
364 iser_conn_get(ib_conn); 365 iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */
365 return 0; 366 return 0;
366} 367}
367 368
@@ -382,7 +383,7 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
382 * There is no unbind event so the stop callback 383 * There is no unbind event so the stop callback
383 * must release the ref from the bind. 384 * must release the ref from the bind.
384 */ 385 */
385 iser_conn_put(ib_conn); 386 iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */
386 } 387 }
387 iser_conn->ib_conn = NULL; 388 iser_conn->ib_conn = NULL;
388} 389}
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 53da74b45c75..f1df01567bb6 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -247,7 +247,6 @@ struct iser_conn {
247 struct rdma_cm_id *cma_id; /* CMA ID */ 247 struct rdma_cm_id *cma_id; /* CMA ID */
248 struct ib_qp *qp; /* QP */ 248 struct ib_qp *qp; /* QP */
249 struct ib_fmr_pool *fmr_pool; /* pool of IB FMRs */ 249 struct ib_fmr_pool *fmr_pool; /* pool of IB FMRs */
250 int disc_evt_flag; /* disconn event delivered */
251 wait_queue_head_t wait; /* waitq for conn/disconn */ 250 wait_queue_head_t wait; /* waitq for conn/disconn */
252 int post_recv_buf_count; /* posted rx count */ 251 int post_recv_buf_count; /* posted rx count */
253 atomic_t post_send_buf_count; /* posted tx count */ 252 atomic_t post_send_buf_count; /* posted tx count */
@@ -321,7 +320,7 @@ void iser_conn_init(struct iser_conn *ib_conn);
321 320
322void iser_conn_get(struct iser_conn *ib_conn); 321void iser_conn_get(struct iser_conn *ib_conn);
323 322
324void iser_conn_put(struct iser_conn *ib_conn); 323int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed);
325 324
326void iser_conn_terminate(struct iser_conn *ib_conn); 325void iser_conn_terminate(struct iser_conn *ib_conn);
327 326
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index ed7c90135412..78fdecacea35 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -238,7 +238,7 @@ alloc_err:
238 * releases the FMR pool, QP and CMA ID objects, returns 0 on success, 238 * releases the FMR pool, QP and CMA ID objects, returns 0 on success,
239 * -1 on failure 239 * -1 on failure
240 */ 240 */
241static int iser_free_ib_conn_res(struct iser_conn *ib_conn) 241static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id)
242{ 242{
243 BUG_ON(ib_conn == NULL); 243 BUG_ON(ib_conn == NULL);
244 244
@@ -253,7 +253,8 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
253 if (ib_conn->qp != NULL) 253 if (ib_conn->qp != NULL)
254 rdma_destroy_qp(ib_conn->cma_id); 254 rdma_destroy_qp(ib_conn->cma_id);
255 255
256 if (ib_conn->cma_id != NULL) 256 /* if cma handler context, the caller acts s.t the cma destroy the id */
257 if (ib_conn->cma_id != NULL && can_destroy_id)
257 rdma_destroy_id(ib_conn->cma_id); 258 rdma_destroy_id(ib_conn->cma_id);
258 259
259 ib_conn->fmr_pool = NULL; 260 ib_conn->fmr_pool = NULL;
@@ -331,7 +332,7 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
331/** 332/**
332 * Frees all conn objects and deallocs conn descriptor 333 * Frees all conn objects and deallocs conn descriptor
333 */ 334 */
334static void iser_conn_release(struct iser_conn *ib_conn) 335static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id)
335{ 336{
336 struct iser_device *device = ib_conn->device; 337 struct iser_device *device = ib_conn->device;
337 338
@@ -341,7 +342,7 @@ static void iser_conn_release(struct iser_conn *ib_conn)
341 list_del(&ib_conn->conn_list); 342 list_del(&ib_conn->conn_list);
342 mutex_unlock(&ig.connlist_mutex); 343 mutex_unlock(&ig.connlist_mutex);
343 iser_free_rx_descriptors(ib_conn); 344 iser_free_rx_descriptors(ib_conn);
344 iser_free_ib_conn_res(ib_conn); 345 iser_free_ib_conn_res(ib_conn, can_destroy_id);
345 ib_conn->device = NULL; 346 ib_conn->device = NULL;
346 /* on EVENT_ADDR_ERROR there's no device yet for this conn */ 347 /* on EVENT_ADDR_ERROR there's no device yet for this conn */
347 if (device != NULL) 348 if (device != NULL)
@@ -354,10 +355,13 @@ void iser_conn_get(struct iser_conn *ib_conn)
354 atomic_inc(&ib_conn->refcount); 355 atomic_inc(&ib_conn->refcount);
355} 356}
356 357
357void iser_conn_put(struct iser_conn *ib_conn) 358int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id)
358{ 359{
359 if (atomic_dec_and_test(&ib_conn->refcount)) 360 if (atomic_dec_and_test(&ib_conn->refcount)) {
360 iser_conn_release(ib_conn); 361 iser_conn_release(ib_conn, can_destroy_id);
362 return 1;
363 }
364 return 0;
361} 365}
362 366
363/** 367/**
@@ -381,19 +385,20 @@ void iser_conn_terminate(struct iser_conn *ib_conn)
381 wait_event_interruptible(ib_conn->wait, 385 wait_event_interruptible(ib_conn->wait,
382 ib_conn->state == ISER_CONN_DOWN); 386 ib_conn->state == ISER_CONN_DOWN);
383 387
384 iser_conn_put(ib_conn); 388 iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */
385} 389}
386 390
387static void iser_connect_error(struct rdma_cm_id *cma_id) 391static int iser_connect_error(struct rdma_cm_id *cma_id)
388{ 392{
389 struct iser_conn *ib_conn; 393 struct iser_conn *ib_conn;
390 ib_conn = (struct iser_conn *)cma_id->context; 394 ib_conn = (struct iser_conn *)cma_id->context;
391 395
392 ib_conn->state = ISER_CONN_DOWN; 396 ib_conn->state = ISER_CONN_DOWN;
393 wake_up_interruptible(&ib_conn->wait); 397 wake_up_interruptible(&ib_conn->wait);
398 return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
394} 399}
395 400
396static void iser_addr_handler(struct rdma_cm_id *cma_id) 401static int iser_addr_handler(struct rdma_cm_id *cma_id)
397{ 402{
398 struct iser_device *device; 403 struct iser_device *device;
399 struct iser_conn *ib_conn; 404 struct iser_conn *ib_conn;
@@ -402,8 +407,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
402 device = iser_device_find_by_ib_device(cma_id); 407 device = iser_device_find_by_ib_device(cma_id);
403 if (!device) { 408 if (!device) {
404 iser_err("device lookup/creation failed\n"); 409 iser_err("device lookup/creation failed\n");
405 iser_connect_error(cma_id); 410 return iser_connect_error(cma_id);
406 return;
407 } 411 }
408 412
409 ib_conn = (struct iser_conn *)cma_id->context; 413 ib_conn = (struct iser_conn *)cma_id->context;
@@ -412,11 +416,13 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
412 ret = rdma_resolve_route(cma_id, 1000); 416 ret = rdma_resolve_route(cma_id, 1000);
413 if (ret) { 417 if (ret) {
414 iser_err("resolve route failed: %d\n", ret); 418 iser_err("resolve route failed: %d\n", ret);
415 iser_connect_error(cma_id); 419 return iser_connect_error(cma_id);
416 } 420 }
421
422 return 0;
417} 423}
418 424
419static void iser_route_handler(struct rdma_cm_id *cma_id) 425static int iser_route_handler(struct rdma_cm_id *cma_id)
420{ 426{
421 struct rdma_conn_param conn_param; 427 struct rdma_conn_param conn_param;
422 int ret; 428 int ret;
@@ -437,9 +443,9 @@ static void iser_route_handler(struct rdma_cm_id *cma_id)
437 goto failure; 443 goto failure;
438 } 444 }
439 445
440 return; 446 return 0;
441failure: 447failure:
442 iser_connect_error(cma_id); 448 return iser_connect_error(cma_id);
443} 449}
444 450
445static void iser_connected_handler(struct rdma_cm_id *cma_id) 451static void iser_connected_handler(struct rdma_cm_id *cma_id)
@@ -451,12 +457,12 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id)
451 wake_up_interruptible(&ib_conn->wait); 457 wake_up_interruptible(&ib_conn->wait);
452} 458}
453 459
454static void iser_disconnected_handler(struct rdma_cm_id *cma_id) 460static int iser_disconnected_handler(struct rdma_cm_id *cma_id)
455{ 461{
456 struct iser_conn *ib_conn; 462 struct iser_conn *ib_conn;
463 int ret;
457 464
458 ib_conn = (struct iser_conn *)cma_id->context; 465 ib_conn = (struct iser_conn *)cma_id->context;
459 ib_conn->disc_evt_flag = 1;
460 466
461 /* getting here when the state is UP means that the conn is being * 467 /* getting here when the state is UP means that the conn is being *
462 * terminated asynchronously from the iSCSI layer's perspective. */ 468 * terminated asynchronously from the iSCSI layer's perspective. */
@@ -471,20 +477,24 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
471 ib_conn->state = ISER_CONN_DOWN; 477 ib_conn->state = ISER_CONN_DOWN;
472 wake_up_interruptible(&ib_conn->wait); 478 wake_up_interruptible(&ib_conn->wait);
473 } 479 }
480
481 ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
482 return ret;
474} 483}
475 484
476static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) 485static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
477{ 486{
478 int ret = 0; 487 int ret = 0;
479 488
480 iser_err("event %d conn %p id %p\n",event->event,cma_id->context,cma_id); 489 iser_err("event %d status %d conn %p id %p\n",
490 event->event, event->status, cma_id->context, cma_id);
481 491
482 switch (event->event) { 492 switch (event->event) {
483 case RDMA_CM_EVENT_ADDR_RESOLVED: 493 case RDMA_CM_EVENT_ADDR_RESOLVED:
484 iser_addr_handler(cma_id); 494 ret = iser_addr_handler(cma_id);
485 break; 495 break;
486 case RDMA_CM_EVENT_ROUTE_RESOLVED: 496 case RDMA_CM_EVENT_ROUTE_RESOLVED:
487 iser_route_handler(cma_id); 497 ret = iser_route_handler(cma_id);
488 break; 498 break;
489 case RDMA_CM_EVENT_ESTABLISHED: 499 case RDMA_CM_EVENT_ESTABLISHED:
490 iser_connected_handler(cma_id); 500 iser_connected_handler(cma_id);
@@ -494,13 +504,12 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
494 case RDMA_CM_EVENT_CONNECT_ERROR: 504 case RDMA_CM_EVENT_CONNECT_ERROR:
495 case RDMA_CM_EVENT_UNREACHABLE: 505 case RDMA_CM_EVENT_UNREACHABLE:
496 case RDMA_CM_EVENT_REJECTED: 506 case RDMA_CM_EVENT_REJECTED:
497 iser_err("event: %d, error: %d\n", event->event, event->status); 507 ret = iser_connect_error(cma_id);
498 iser_connect_error(cma_id);
499 break; 508 break;
500 case RDMA_CM_EVENT_DISCONNECTED: 509 case RDMA_CM_EVENT_DISCONNECTED:
501 case RDMA_CM_EVENT_DEVICE_REMOVAL: 510 case RDMA_CM_EVENT_DEVICE_REMOVAL:
502 case RDMA_CM_EVENT_ADDR_CHANGE: 511 case RDMA_CM_EVENT_ADDR_CHANGE:
503 iser_disconnected_handler(cma_id); 512 ret = iser_disconnected_handler(cma_id);
504 break; 513 break;
505 default: 514 default:
506 iser_err("Unexpected RDMA CM event (%d)\n", event->event); 515 iser_err("Unexpected RDMA CM event (%d)\n", event->event);
@@ -515,7 +524,7 @@ void iser_conn_init(struct iser_conn *ib_conn)
515 init_waitqueue_head(&ib_conn->wait); 524 init_waitqueue_head(&ib_conn->wait);
516 ib_conn->post_recv_buf_count = 0; 525 ib_conn->post_recv_buf_count = 0;
517 atomic_set(&ib_conn->post_send_buf_count, 0); 526 atomic_set(&ib_conn->post_send_buf_count, 0);
518 atomic_set(&ib_conn->refcount, 1); 527 atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */
519 INIT_LIST_HEAD(&ib_conn->conn_list); 528 INIT_LIST_HEAD(&ib_conn->conn_list);
520 spin_lock_init(&ib_conn->lock); 529 spin_lock_init(&ib_conn->lock);
521} 530}
@@ -543,6 +552,7 @@ int iser_connect(struct iser_conn *ib_conn,
543 552
544 ib_conn->state = ISER_CONN_PENDING; 553 ib_conn->state = ISER_CONN_PENDING;
545 554
555 iser_conn_get(ib_conn); /* ref ib conn's cma id */
546 ib_conn->cma_id = rdma_create_id(iser_cma_handler, 556 ib_conn->cma_id = rdma_create_id(iser_cma_handler,
547 (void *)ib_conn, 557 (void *)ib_conn,
548 RDMA_PS_TCP); 558 RDMA_PS_TCP);
@@ -580,7 +590,7 @@ id_failure:
580addr_failure: 590addr_failure:
581 ib_conn->state = ISER_CONN_DOWN; 591 ib_conn->state = ISER_CONN_DOWN;
582connect_failure: 592connect_failure:
583 iser_conn_release(ib_conn); 593 iser_conn_release(ib_conn, 1);
584 return err; 594 return err;
585} 595}
586 596
@@ -749,12 +759,10 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc,
749 iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, 759 iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn,
750 ISCSI_ERR_CONN_FAILED); 760 ISCSI_ERR_CONN_FAILED);
751 761
752 /* complete the termination process if disconnect event was delivered * 762 /* no more non completed posts to the QP, complete the
753 * note there are no more non completed posts to the QP */ 763 * termination process w.o worrying on disconnect event */
754 if (ib_conn->disc_evt_flag) { 764 ib_conn->state = ISER_CONN_DOWN;
755 ib_conn->state = ISER_CONN_DOWN; 765 wake_up_interruptible(&ib_conn->wait);
756 wake_up_interruptible(&ib_conn->wait);
757 }
758 } 766 }
759} 767}
760 768