author     Sagi Grimberg <sagig@mellanox.com>      2014-10-01 07:02:01 -0400
committer  Roland Dreier <roland@purestorage.com>  2014-10-09 03:06:06 -0400
commit     c47a3c9ed5be167f49a6fd3f696dac03536282eb (patch)
tree       e8d8ecdf2ddb5d7f661fa87c8172e042d6ebcebc
parent     96f15198c1457df29b51ed151b1e5b2a223d1346 (diff)
IB/iser: Fix DEVICE REMOVAL handling in the absence of iscsi daemon
The iscsi daemon lives in user-space, so we can't rely on it being invoked at
connection teardown (it may not be running or may not get CPU time). This patch
addresses the issue by re-structuring the iSER connection teardown logic and the
CM event handling. The CM events dictate the destruction of the RDMA resources
(ib_conn), while iser_conn is kept around as long as iscsi_conn is kept around,
allowing iscsi/iser callbacks to continue after the RDMA transport was destroyed.

This patch introduces a separation in logic when handling CM events:

- DISCONNECTED, ADDR_CHANGE
  These events indicate the start of the teardown process.
  Actions:
  1. Terminate the connection: rdma_disconnect (send DREQ/DREP)
  2. Notify iSCSI of the connection failure
  3. Change state to TERMINATING
  4. Poll until all flush errors are consumed

- TIMEWAIT_EXIT, DEVICE_REMOVAL
  These events indicate the final stage of the termination process, at which
  point the RDMA related resources can be freed.
  Actions:
  1. Call the disconnected handler (we are not guaranteed that a DISCONNECTED
     event was invoked in the past)
  2. Clean up the RDMA related resources
  3. For DEVICE_REMOVAL, return a non-zero rc from the cma_handler to
     implicitly destroy the cm_id (can't rely on user-space, make sure we
     have forward progress)

flush_completion (which indicated that all flushes were consumed) is replaced
by ib_completion (which indicates that the RDMA resources were cleaned up).

iser_release_work waits for the teardown completions:
- stop_completion: conn_stop has completed (tasks were cleaned up)
- ib_completion:   RDMA resources were destroyed

and only then continues to free the iser connection representation (iser_conn).

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Ariel Nahum <arieln@mellanox.com>
Signed-off-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
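In short, the release work now blocks on two independent completions before
freeing the connection. Below is a minimal sketch of that ordering only; the
struct and function names (example_conn, example_release_work) are illustrative
placeholders and not part of the driver, which additionally flushes work,
destroys the cm_id and frees iscsi resources (see the iser_release_work hunk in
the diff below).

#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>

struct example_conn {
        struct work_struct release_work;
        struct completion  stop_completion;     /* iscsi conn_stop finished */
        struct completion  ib_completion;       /* RDMA resources destroyed */
};

static void example_release_work(struct work_struct *work)
{
        struct example_conn *conn =
                container_of(work, struct example_conn, release_work);
        unsigned long rc;

        /* 1. wait for conn_stop to finish cleaning up the iscsi tasks */
        rc = wait_for_completion_timeout(&conn->stop_completion, 30 * HZ);
        WARN_ON(rc == 0);

        /* 2. wait for the CM events to tear down the RDMA resources */
        rc = wait_for_completion_timeout(&conn->ib_completion, 30 * HZ);
        if (rc == 0)
                pr_warn("IB cleanup didn't complete, releasing anyway\n");

        /* 3. only now free the connection representation */
        kfree(conn);
}

ib_completion is signaled from the cleanup path driven by the CM events
(TIMEWAIT_EXIT / DEVICE_REMOVAL) once the RDMA resources have been released.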
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.h    6
-rw-r--r--  drivers/infiniband/ulp/iser/iser_verbs.c  163
2 files changed, 108 insertions(+), 61 deletions(-)
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index ec238b3bd278..95c484d0f881 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -370,9 +370,9 @@ struct iser_conn {
         unsigned                  min_posted_rx; /* qp_max_recv_dtos >> 2 */
         char                      name[ISER_OBJECT_NAME_SIZE];
         struct work_struct        release_work;
-        struct completion         stop_completion;
         struct mutex              state_mutex;
-        struct completion         flush_completion;
+        struct completion         stop_completion;
+        struct completion         ib_completion;
         struct completion         up_completion;
         struct list_head          conn_list; /* entry in ig conn list */
 
@@ -442,7 +442,7 @@ void iser_conn_init(struct iser_conn *iser_conn);
 
 void iser_conn_release(struct iser_conn *iser_conn);
 
-void iser_conn_terminate(struct iser_conn *iser_conn);
+int iser_conn_terminate(struct iser_conn *iser_conn);
 
 void iser_release_work(struct work_struct *work);
 
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index e4299743c459..6170d06a8acc 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -44,6 +44,7 @@
 
 static void iser_cq_tasklet_fn(unsigned long data);
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
+static int iser_drain_tx_cq(struct iser_device *device, int cq_index);
 
 static void iser_cq_event_callback(struct ib_event *cause, void *context)
 {
@@ -573,11 +574,10 @@ void iser_release_work(struct work_struct *work)
         rc = wait_for_completion_timeout(&iser_conn->stop_completion, 30 * HZ);
         WARN_ON(rc == 0);
 
-        /* wait for the qp`s post send and post receive buffers to empty */
-        rc = wait_for_completion_timeout(&iser_conn->flush_completion, 30 * HZ);
-        WARN_ON(rc == 0);
-
-        iser_conn->state = ISER_CONN_DOWN;
+        rc = wait_for_completion_timeout(&iser_conn->ib_completion, 30 * HZ);
+        if (rc == 0)
+                iser_warn("conn %p, IB cleanup didn't complete in 30 "
+                          "seconds, continue with release\n", iser_conn);
 
         mutex_lock(&iser_conn->state_mutex);
         iser_conn->state = ISER_CONN_DOWN;
@@ -589,12 +589,16 @@ void iser_release_work(struct work_struct *work)
 /**
  * iser_free_ib_conn_res - release IB related resources
  * @iser_conn: iser connection struct
+ * @destroy_device: indicator if we need to try to release
+ *      the iser device (only iscsi shutdown and DEVICE_REMOVAL
+ *      will use this.
  *
  * This routine is called with the iser state mutex held
  * so the cm_id removal is out of here. It is Safe to
  * be invoked multiple times.
  */
-static void iser_free_ib_conn_res(struct iser_conn *iser_conn)
+static void iser_free_ib_conn_res(struct iser_conn *iser_conn,
+                                  bool destroy_device)
 {
         struct ib_conn *ib_conn = &iser_conn->ib_conn;
         struct iser_device *device = ib_conn->device;
@@ -610,7 +614,7 @@ static void iser_free_ib_conn_res(struct iser_conn *iser_conn)
                 ib_conn->qp = NULL;
         }
 
-        if (device != NULL) {
+        if (destroy_device && device != NULL) {
                 iser_device_try_release(device);
                 ib_conn->device = NULL;
         }
@@ -629,7 +633,11 @@ void iser_conn_release(struct iser_conn *iser_conn)
 
         mutex_lock(&iser_conn->state_mutex);
         BUG_ON(iser_conn->state != ISER_CONN_DOWN);
-        iser_free_ib_conn_res(iser_conn);
+        /*
+         * In case we never got to bind stage, we still need to
+         * release IB resources (which is safe to call more than once).
+         */
+        iser_free_ib_conn_res(iser_conn, true);
         mutex_unlock(&iser_conn->state_mutex);
 
         if (ib_conn->cma_id != NULL) {
@@ -641,23 +649,68 @@ void iser_conn_release(struct iser_conn *iser_conn)
 }
 
 /**
+ * iser_poll_for_flush_errors - Don't settle for less than all.
+ * @struct ib_conn: IB context of the connection
+ *
+ * This routine is called when the QP is in error state
+ * It polls the send CQ until all flush errors are consumed and
+ * returns when all flush errors were processed.
+ */
+static void iser_poll_for_flush_errors(struct ib_conn *ib_conn)
+{
+        struct iser_device *device = ib_conn->device;
+        int count = 0;
+
+        while (ib_conn->post_recv_buf_count > 0 ||
+               atomic_read(&ib_conn->post_send_buf_count) > 0) {
+                msleep(100);
+                if (atomic_read(&ib_conn->post_send_buf_count) > 0)
+                        iser_drain_tx_cq(device, ib_conn->cq_index);
+
+                count++;
+                /* Don't flood with prints */
+                if (count % 30 == 0)
+                        iser_dbg("post_recv %d post_send %d",
+                                 ib_conn->post_recv_buf_count,
+                                 atomic_read(&ib_conn->post_send_buf_count));
+        }
+}
+
+/**
  * triggers start of the disconnect procedures and wait for them to be done
+ * Called with state mutex held
  */
-void iser_conn_terminate(struct iser_conn *iser_conn)
+int iser_conn_terminate(struct iser_conn *iser_conn)
 {
         struct ib_conn *ib_conn = &iser_conn->ib_conn;
         int err = 0;
 
-        /* change the ib conn state only if the conn is UP, however always call
-         * rdma_disconnect since this is the only way to cause the CMA to change
-         * the QP state to ERROR
+        /* terminate the iser conn only if the conn state is UP */
+        if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
+                                       ISER_CONN_TERMINATING))
+                return 0;
+
+        iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state);
+
+        /* suspend queuing of new iscsi commands */
+        if (iser_conn->iscsi_conn)
+                iscsi_suspend_queue(iser_conn->iscsi_conn);
+
+        /*
+         * In case we didn't already clean up the cma_id (peer initiated
+         * a disconnection), we need to Cause the CMA to change the QP
+         * state to ERROR.
          */
+        if (ib_conn->cma_id) {
+                err = rdma_disconnect(ib_conn->cma_id);
+                if (err)
+                        iser_err("Failed to disconnect, conn: 0x%p err %d\n",
+                                 iser_conn, err);
+
+                iser_poll_for_flush_errors(ib_conn);
+        }
 
-        iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, ISER_CONN_TERMINATING);
-        err = rdma_disconnect(ib_conn->cma_id);
-        if (err)
-                iser_err("Failed to disconnect, conn: 0x%p err %d\n",
-                         iser_conn, err);
+        return 1;
 }
 
 /**
@@ -780,34 +833,36 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id)
 
 static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
 {
-        struct iser_conn *iser_conn;
-        struct ib_conn *ib_conn = &iser_conn->ib_conn;
-
-        iser_conn = (struct iser_conn *)cma_id->context;
+        struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
 
-        /* getting here when the state is UP means that the conn is being *
-         * terminated asynchronously from the iSCSI layer's perspective. */
-        if (iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
-                                      ISER_CONN_TERMINATING)){
+        if (iser_conn_terminate(iser_conn)) {
                 if (iser_conn->iscsi_conn)
-                        iscsi_conn_failure(iser_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED);
+                        iscsi_conn_failure(iser_conn->iscsi_conn,
+                                           ISCSI_ERR_CONN_FAILED);
                 else
                         iser_err("iscsi_iser connection isn't bound\n");
         }
+}
+
+static void iser_cleanup_handler(struct rdma_cm_id *cma_id,
+                                 bool destroy_device)
+{
+        struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
 
-        /* Complete the termination process if no posts are pending. This code
-         * block also exists in iser_handle_comp_error(), but it is needed here
-         * for cases of no flushes at all, e.g. discovery over rdma.
+        /*
+         * We are not guaranteed that we visited disconnected_handler
+         * by now, call it here to be safe that we handle CM drep
+         * and flush errors.
          */
-        if (ib_conn->post_recv_buf_count == 0 &&
-            (atomic_read(&ib_conn->post_send_buf_count) == 0)) {
-                complete(&iser_conn->flush_completion);
-        }
-}
+        iser_disconnected_handler(cma_id);
+        iser_free_ib_conn_res(iser_conn, destroy_device);
+        complete(&iser_conn->ib_completion);
+};
 
 static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 {
         struct iser_conn *iser_conn;
+        int ret = 0;
 
         iser_conn = (struct iser_conn *)cma_id->context;
         iser_info("event %d status %d conn %p id %p\n",
@@ -832,17 +887,29 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
                 iser_connect_error(cma_id);
                 break;
         case RDMA_CM_EVENT_DISCONNECTED:
-        case RDMA_CM_EVENT_DEVICE_REMOVAL:
         case RDMA_CM_EVENT_ADDR_CHANGE:
-        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                 iser_disconnected_handler(cma_id);
                 break;
+        case RDMA_CM_EVENT_DEVICE_REMOVAL:
+                /*
+                 * we *must* destroy the device as we cannot rely
+                 * on iscsid to be around to initiate error handling.
+                 * also implicitly destroy the cma_id.
+                 */
+                iser_cleanup_handler(cma_id, true);
+                iser_conn->ib_conn.cma_id = NULL;
+                ret = 1;
+                break;
+        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+                iser_cleanup_handler(cma_id, false);
+                break;
         default:
                 iser_err("Unexpected RDMA CM event (%d)\n", event->event);
                 break;
         }
         mutex_unlock(&iser_conn->state_mutex);
-        return 0;
+
+        return ret;
 }
 
 void iser_conn_init(struct iser_conn *iser_conn)
@@ -851,7 +918,7 @@ void iser_conn_init(struct iser_conn *iser_conn)
         iser_conn->ib_conn.post_recv_buf_count = 0;
         atomic_set(&iser_conn->ib_conn.post_send_buf_count, 0);
         init_completion(&iser_conn->stop_completion);
-        init_completion(&iser_conn->flush_completion);
+        init_completion(&iser_conn->ib_completion);
         init_completion(&iser_conn->up_completion);
         INIT_LIST_HEAD(&iser_conn->conn_list);
         spin_lock_init(&iser_conn->ib_conn.lock);
@@ -1100,28 +1167,8 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc)
 static void iser_handle_comp_error(struct iser_tx_desc *desc,
                                    struct ib_conn *ib_conn)
 {
-        struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
-                                                   ib_conn);
-
         if (desc && desc->type == ISCSI_TX_DATAOUT)
                 kmem_cache_free(ig.desc_cache, desc);
-
-        if (ib_conn->post_recv_buf_count == 0 &&
-            atomic_read(&ib_conn->post_send_buf_count) == 0) {
-                /**
-                 * getting here when the state is UP means that the conn is
-                 * being terminated asynchronously from the iSCSI layer's
-                 * perspective. It is safe to peek at the connection state
-                 * since iscsi_conn_failure is allowed to be called twice.
-                 **/
-                if (iser_conn->state == ISER_CONN_UP)
-                        iscsi_conn_failure(iser_conn->iscsi_conn,
-                                           ISCSI_ERR_CONN_FAILED);
-
-                /* no more non completed posts to the QP, complete the
-                 * termination process w.o worrying on disconnect event */
-                complete(&iser_conn->flush_completion);
-        }
 }
 
 static int iser_drain_tx_cq(struct iser_device *device, int cq_index)